Index: user/alc/PQ_LAUNDRY/lib/libc/locale/ascii.c
===================================================================
--- user/alc/PQ_LAUNDRY/lib/libc/locale/ascii.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/lib/libc/locale/ascii.c	(revision 306283)

Property changes on: user/alc/PQ_LAUNDRY/lib/libc/locale/ascii.c
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/lib/libc/locale/ascii.c:r303053-306282
Index: user/alc/PQ_LAUNDRY/lib/libc/sys/cap_enter.2
===================================================================
--- user/alc/PQ_LAUNDRY/lib/libc/sys/cap_enter.2	(revision 306282)
+++ user/alc/PQ_LAUNDRY/lib/libc/sys/cap_enter.2	(revision 306283)
@@ -1,130 +1,160 @@
 .\"
 .\" Copyright (c) 2008-2009 Robert N. M. Watson
 .\" All rights reserved.
 .\"
 .\" This software was developed at the University of Cambridge Computer
 .\" Laboratory with support from a grant from Google, Inc.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd September 10, 2016
+.Dd September 22, 2016
 .Dt CAP_ENTER 2
 .Os
 .Sh NAME
 .Nm cap_enter ,
 .Nm cap_getmode
 .Nd Capability mode system calls
 .Sh LIBRARY
 .Lb libc
 .Sh SYNOPSIS
 .In sys/capsicum.h
 .Ft int
 .Fn cap_enter "void"
 .Ft int
 .Fn cap_getmode "u_int *modep"
 .Sh DESCRIPTION
 .Fn cap_enter
 places the current process into capability mode, a mode of execution in which
 processes may only issue system calls operating on file descriptors or
 reading limited global system state.
 Access to global name spaces, such as file system or IPC name spaces, is
 prevented.
 If the process is already in a capability mode sandbox, the system call is a
 no-op.
 Future process descendants created with
 .Xr fork 2
 or
 .Xr pdfork 2
 will be placed in capability mode from inception.
 .Pp
 When combined with
 .Xr cap_rights_limit 2 ,
 .Xr cap_ioctls_limit 2 ,
 .Xr cap_fcntls_limit 2 ,
 .Fn cap_enter
 may be used to create kernel-enforced sandboxes in which
 appropriately-crafted applications or application components may be run.
 .Pp
 .Fn cap_getmode
 returns a flag indicating whether or not the process is in a capability mode
 sandbox.
+.Sh RUN-TIME SETTINGS
+If the
+.Dv kern.trap_enocap
+sysctl MIB is set to non-zero value, then for any process executing in a
+capability mode sandbox, any syscall which results in either
+.Er ENOTCAPABLE
+or
+.Er ECAPMODE
+error, also generates the synchronous
+.Dv SIGTRAP
+signal to the thread on the syscall return.
+On the signal delivery, the
+.Va si_errno
+member of the
+.Fa siginfo
+signal handler parameter is set to the syscall error value,
+and the
+.Va si_code
+member is set to
+.Dv TRAP_CAP .
+.Pp
+See also the
+.Dv PROC_TRAPCAP_CTL
+and
+.Dv PROC_TRAPCAP_STATUS
+operations of the
+.Xr procctl 2
+function for similar per-process functionality.
 .Sh CAVEAT
 Creating effective process sandboxes is a tricky process that involves
 identifying the least possible rights required by the process and then
 passing those rights into the process in a safe manner.
 Consumers of
 .Fn cap_enter
 should also be aware of other inherited rights, such as access to VM
 resources, memory contents, and other process properties that should be
 considered.
 It is advisable to use
 .Xr fexecve 2
 to create a runtime environment inside the sandbox that has as few implicitly
 acquired rights as possible.
 .Sh RETURN VALUES
 .Rv -std cap_enter cap_getmode
 .Pp
 When the process is in capability mode,
 .Fn cap_getmode
 sets the flag to a non-zero value.
 A zero value means the process is not in capability mode.
 .Sh ERRORS
 The
 .Fn cap_enter
 and
 .Fn cap_getmode
 system calls
 will fail if:
 .Bl -tag -width Er
 .It Bq Er ENOSYS
 The kernel is compiled without:
 .Pp
 .Cd "options CAPABILITY_MODE"
 .El
 .Pp
 The
 .Fn cap_getmode
 system call may also return the following error:
 .Bl -tag -width Er
 .It Bq Er EFAULT
 Pointer
 .Fa modep
 points outside the process's allocated address space.
 .El
 .Sh SEE ALSO
 .Xr cap_fcntls_limit 2 ,
 .Xr cap_ioctls_limit 2 ,
 .Xr cap_rights_limit 2 ,
+.Xr procctl 2 ,
+.Xr sysctl 2 ,
 .Xr fexecve 2 ,
 .Xr cap_sandboxed 3 ,
 .Xr capsicum 4
 .Sh HISTORY
 Support for capabilities and capabilities mode was developed as part of the
 .Tn TrustedBSD
 Project.
 .Sh AUTHORS
 These functions and the capability facility were created by
 .An "Robert N. M. Watson"
 at the University of Cambridge Computer Laboratory with support from a grant
 from Google, Inc.
Index: user/alc/PQ_LAUNDRY/lib/libc/sys/procctl.2
===================================================================
--- user/alc/PQ_LAUNDRY/lib/libc/sys/procctl.2	(revision 306282)
+++ user/alc/PQ_LAUNDRY/lib/libc/sys/procctl.2	(revision 306283)
@@ -1,440 +1,501 @@
 .\" Copyright (c) 2013 Hudson River Trading LLC
 .\" Written by: John H. Baldwin <jhb@FreeBSD.org>
 .\" All rights reserved.
 .\"
 .\" Copyright (c) 2014 The FreeBSD Foundation
 .\" Portions of this documentation were written by Konstantin Belousov
 .\" under sponsorship from the FreeBSD Foundation.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd August 21, 2015
+.Dd September 22, 2016
 .Dt PROCCTL 2
 .Os
 .Sh NAME
 .Nm procctl
 .Nd control processes
 .Sh LIBRARY
 .Lb libc
 .Sh SYNOPSIS
 .In sys/procctl.h
 .Ft int
 .Fn procctl "idtype_t idtype" "id_t id" "int cmd" "void *arg"
 .Sh DESCRIPTION
 The
 .Fn procctl
 system call provides for control over processes.
 The
 .Fa idtype
 and
 .Fa id
 arguments specify the set of processes to control.
 If multiple processes match the identifier,
 .Nm
 will make a
 .Dq best effort
 to control as many of the selected processes as possible.
 An error is only returned if no selected processes successfully complete
 the request.
 The following identifier types are supported:
 .Bl -tag -width "Dv P_PGID"
 .It Dv P_PID
 Control the process with the process ID
 .Fa id .
 .It Dv P_PGID
 Control processes belonging to the process group with the ID
 .Fa id .
 .El
 .Pp
 The control request to perform is specified by the
 .Fa cmd
 argument.
 The following commands are supported:
-.Bl -tag -width "PROC_REAP_GETPIDS"
+.Bl -tag -width "Dv PROC_TRAPCAP_STATUS"
 .It Dv PROC_SPROTECT
 Set process protection state.
 This is used to mark a process as protected from being killed if the system
 exhausts the available memory and swap.
 The
 .Fa arg
 parameter must point to an integer containing an operation and zero or more
 optional flags.
 The following operations are supported:
 .Bl -tag -width "Dv PPROT_CLEAR"
 .It Dv PPROT_SET
 Mark the selected processes as protected.
 .It Dv PPROT_CLEAR
 Clear the protected state of selected processes.
 .El
 .Pp
 The following optional flags are supported:
 .Bl -tag -width "Dv PPROT_DESCEND"
 .It Dv PPROT_DESCEND
 Apply the requested operation to all child processes of each selected process
 in addition to each selected process.
 .It Dv PPROT_INHERIT
 When used with
 .Dv PPROT_SET ,
 mark all future child processes of each selected process as protected.
 Future child processes will also mark all of their future child processes.
 .El
 .It Dv PROC_REAP_ACQUIRE
 Acquires the reaper status for the current process.
 Reaper status means that children orphaned by the reaper's descendants
 that were forked after the acquisition of reaper status are reparented to the
 reaper process.
 After system initialization,
 .Xr init 8
 is the default reaper.
 .It Dv PROC_REAP_RELEASE
 Release the reaper state for the current process.
 The reaper of the current process becomes the new reaper of the
 current process's descendants.
 .It Dv PROC_REAP_STATUS
 Provides information about the reaper of the specified process,
 or the process itself when it is a reaper.
 The
 .Fa data
 argument must point to a
 .Vt procctl_reaper_status
 structure which is filled in by the syscall on successful return.
 .Bd -literal
 struct procctl_reaper_status {
 	u_int	rs_flags;
 	u_int	rs_children;
 	u_int	rs_descendants;
 	pid_t	rs_reaper;
 	pid_t	rs_pid;
 };
 .Ed
 The
 .Fa rs_flags
 may have the following flags returned:
 .Bl -tag -width "Dv REAPER_STATUS_REALINIT"
 .It Dv REAPER_STATUS_OWNED
 The specified process has acquired reaper status and has not
 released it.
 When the flag is returned, the specified process
 .Fa id ,
 pid, identifies the reaper, otherwise the
 .Fa rs_reaper
 field of the structure is set to the pid of the reaper
 for the specified process id.
 .It Dv REAPER_STATUS_REALINIT
 The specified process is the root of the reaper tree, i.e.,
 .Xr init 8 .
 .El
 .Pp
 The
 .Fa rs_children
 field returns the number of children of the reaper among the descendants.
 It is possible to have a child whose reaper is not the specified process,
 since the reaper for any existing children is not reset on the
 .Dv PROC_REAP_ACQUIRE
 operation.
 The
 .Fa rs_descendants
 field returns the total number of descendants of the reaper(s),
 not counting descendants of the reaper in the subtree.
 The
 .Fa rs_reaper
 field returns the reaper pid.
 The
 .Fa rs_pid
 returns the pid of one reaper child if there are any descendants.
 .It Dv PROC_REAP_GETPIDS
 Queries the list of descendants of the reaper of the specified process.
 The request takes a pointer to a
 .Vt procctl_reaper_pids
 structure in the
 .Fa data
 parameter.
 .Bd -literal
 struct procctl_reaper_pids {
 	u_int	rp_count;
 	struct procctl_reaper_pidinfo *rp_pids;
 };
 .Ed
 When called, the
 .Fa rp_pids
 field must point to an array of
 .Vt procctl_reaper_pidinfo
 structures, to be filled in on return,
 and the
 .Fa rp_count
 field must specify the size of the array,
 into which no more than
 .Fa rp_count
 elements will be filled in by the kernel.
 .Pp
 The
 .Vt "struct procctl_reaper_pidinfo"
 structure provides some information about one of the reaper's descendants.
 Note that for a descendant that is not a child, it may be incorrectly
 identified because of a race in which the original child process exited
 and the exited process's pid was reused for an unrelated process.
 .Bd -literal
 struct procctl_reaper_pidinfo {
 	pid_t	pi_pid;
 	pid_t	pi_subtree;
 	u_int	pi_flags;
 };
 .Ed
 The
 .Fa pi_pid
 field is the process id of the descendant.
 The
 .Fa pi_subtree
 field provides the pid of the child of the reaper, which is the (grand-)parent
 of the process.
 The
 .Fa pi_flags
 field returns the following flags, further describing the descendant:
 .Bl -tag -width "Dv REAPER_PIDINFO_VALID"
 .It Dv REAPER_PIDINFO_VALID
 Set to indicate that the
 .Vt procctl_reaper_pidinfo
 structure was filled in by the kernel.
 Zero-filling the
 .Fa rp_pids
 array and testing the
 .Dv REAPER_PIDINFO_VALID
 flag allows the caller to detect the end
 of the returned array.
 .It Dv REAPER_PIDINFO_CHILD
 The
 .Fa pi_pid
 field identifies the direct child of the reaper.
 .El
 .It Dv PROC_REAP_KILL
 Request to deliver a signal to some subset of the descendants of the reaper.
 The
 .Fa data
 parameter must point to a
 .Vt procctl_reaper_kill
 structure, which is used both for parameters and status return.
 .Bd -literal
 struct procctl_reaper_kill {
 	int	rk_sig;
 	u_int	rk_flags;
 	pid_t	rk_subtree;
 	u_int	rk_killed;
 	pid_t	rk_fpid;
 };
 .Ed
 The
 .Fa rk_sig
 field specifies the signal to be delivered.
 Zero is not a valid signal number, unlike for
 .Xr kill 2 .
 The
 .Fa rk_flags
 field further directs the operation.
 It is or-ed from the following flags:
 .Bl -tag -width "Dv REAPER_KILL_CHILDREN"
 .It Dv REAPER_KILL_CHILDREN
 Deliver the specified signal only to direct children of the reaper.
 .It Dv REAPER_KILL_SUBTREE
 Deliver the specified signal only to descendants that were forked by
 the direct child with pid specified in the
 .Fa rk_subtree
 field.
 .El
 If neither the
 .Dv REAPER_KILL_CHILDREN
 nor the
 .Dv REAPER_KILL_SUBTREE
 flags are specified, all current descendants of the reaper are signalled.
 .Pp
 If a signal was delivered to any process, the return value from the request
 is zero.
 In this case, the
 .Fa rk_killed
 field identifies the number of processes signalled.
 The
 .Fa rk_fpid
 field is set to the pid of the first process for which signal
 delivery failed, e.g., due to permission problems.
 If no such process exists, the
 .Fa rk_fpid
 field is set to -1.
 .It Dv PROC_TRACE_CTL
 Enable or disable tracing of the specified process(es), according to the
 value of the integer argument.
 Tracing includes attachment to the process using the
 .Xr ptrace 2
 and
 .Xr ktrace 2 ,
 debugging sysctls,
 .Xr hwpmc 4 ,
 .Xr dtrace 1 ,
 and core dumping.
 Possible values for the
 .Fa data
 argument are:
 .Bl -tag -width "Dv PROC_TRACE_CTL_DISABLE_EXEC"
 .It Dv PROC_TRACE_CTL_ENABLE
 Enable tracing, after it was disabled by
 .Dv PROC_TRACE_CTL_DISABLE .
 Only allowed for self.
 .It Dv PROC_TRACE_CTL_DISABLE
 Disable tracing for the specified process.
 Tracing is re-enabled when the process changes the executing
 program with the
 .Xr execve 2
 syscall.
 A child inherits the trace settings from the parent on
 .Xr fork 2 .
 .It Dv PROC_TRACE_CTL_DISABLE_EXEC
 Same as
 .Dv PROC_TRACE_CTL_DISABLE ,
 but the setting persists for the process even after
 .Xr execve 2 .
 .El
 .It Dv PROC_TRACE_STATUS
 Returns the current tracing status for the specified process in
 the integer variable pointed to by
 .Fa data .
 If tracing is disabled,
 .Fa data
 is set to -1.
 If tracing is enabled, but no debugger is attached by the
 .Xr ptrace 2
 syscall,
 .Fa data
 is set to 0.
 If a debugger is attached,
 .Fa data
 is set to the pid of the debugger process.
+.It Dv PROC_TRAPCAP_CTL
+Enable or disable, for the specified processes which are executing in a
+capability mode sandbox, the synchronous
+.Dv SIGTRAP
+signal on return from any syscall which gives either
+.Er ENOTCAPABLE
+or
+.Er ECAPMODE
+error.
+.Pp
+Possible values for the
+.Fa data
+argument are:
+.Bl -tag -width "Dv PROC_TRAPCAP_CTL_DISABLE"
+.It Dv PROC_TRAPCAP_CTL_ENABLE
+Enable the
+.Dv SIGTRAP
+signal delivery on capability mode access violations.
+The enabled mode is inherited by the children of the process,
+and is kept after
+.Xr fexecve 2
+calls.
+.It Dv PROC_TRAPCAP_CTL_DISABLE
+Disable the signal delivery on capability mode access violations.
+Note that the global sysctl
+.Dv kern.trap_enocap
+might still cause the signal to be delivered; see
+.Xr capsicum 4 .
 .El
+.Pp
+On signal delivery, the
+.Va si_errno
+member of the
+.Fa siginfo
+signal handler parameter is set to the syscall error value,
+and the
+.Va si_code
+member is set to
+.Dv TRAP_CAP .
+.Pp
+See
+.Xr capsicum 4
+for more information about the capability mode.
+.It Dv PROC_TRAPCAP_STATUS
+Returns the current status of signalling capability mode access
+violations for the specified process.
+The integer value pointed to by the
+.Fa data
+argument is set to the
+.Dv PROC_TRAPCAP_CTL_ENABLE
+value if the process control enables signal delivery, and to
+.Dv PROC_TRAPCAP_CTL_DISABLE
+otherwise.
+.Pp
+See the note about sysctl
+.Dv kern.trap_enocap
+above, which gives independent global control of signal delivery.
+.El
 .Sh NOTES
 Disabling tracing on a process should not be considered a security
 feature, as it is bypassable both by the kernel and privileged processes,
 and via other system mechanisms.
 As such, it should not be utilized to reliably protect cryptographic
 keying material or other confidential data.
 .Sh RETURN VALUES
 If an error occurs, a value of -1 is returned and
 .Va errno
 is set to indicate the error.
 .Sh ERRORS
 The
 .Fn procctl
 system call
 will fail if:
 .Bl -tag -width Er
 .It Bq Er EFAULT
 The
 .Fa arg
 parameter points outside the process's allocated address space.
 .It Bq Er EINVAL
 The
 .Fa cmd
 argument specifies an unsupported command.
 .Pp
 The
 .Fa idtype
 argument specifies an unsupported identifier type.
 .It Bq Er EPERM
 The calling process does not have permission to perform the requested
 operation on any of the selected processes.
 .It Bq Er ESRCH
 No processes matched the requested
 .Fa idtype
 and
 .Fa id .
 .It Bq Er EINVAL
 An invalid operation or flag was passed in
 .Fa arg
 for a
 .Dv PROC_SPROTECT
 command.
 .It Bq Er EPERM
 The
 .Fa idtype
 argument is not equal to
 .Dv P_PID ,
 or
 .Fa id
 is not equal to the pid of the calling process, for
 .Dv PROC_REAP_ACQUIRE
 or
 .Dv PROC_REAP_RELEASE
 requests.
 .It Bq Er EINVAL
 Invalid or undefined flags were passed to a
 .Dv PROC_REAP_KILL
 request.
 .It Bq Er EINVAL
 An invalid or zero signal number was requested for a
 .Dv PROC_REAP_KILL
 request.
 .It Bq Er EINVAL
 The
 .Dv PROC_REAP_RELEASE
 request was issued by the
 .Xr init 8
 process.
 .It Bq Er EBUSY
 The
 .Dv PROC_REAP_ACQUIRE
 request was issued by a process that had already acquired reaper status
 and has not yet released it.
 .It Bq Er EBUSY
 The
 .Dv PROC_TRACE_CTL
 request was issued for a process already being traced.
 .It Bq Er EPERM
 The
 .Dv PROC_TRACE_CTL
 request to re-enable tracing of the process
 .Po Dv PROC_TRACE_CTL_ENABLE Pc ,
 or to disable persistence of
 .Dv PROC_TRACE_CTL_DISABLE
 on
 .Xr execve 2
 was issued for a non-current process.
 .It Bq Er EINVAL
 The value of the integer
 .Fa data
 parameter for the
 .Dv PROC_TRACE_CTL
+or
+.Dv PROC_TRAPCAP_CTL
 request is invalid.
 .El
 .Sh SEE ALSO
 .Xr dtrace 1 ,
+.Xr cap_enter 2,
 .Xr kill 2 ,
 .Xr ktrace 2 ,
 .Xr ptrace 2 ,
 .Xr wait 2 ,
+.Xr capsicum 4 ,
 .Xr hwpmc 4 ,
 .Xr init 8
 .Sh HISTORY
 The
 .Fn procctl
 function appeared in
 .Fx 10.0 .
 The reaper facility is based on a similar feature of Linux and
 DragonflyBSD, and first appeared in
 .Fx 10.2 .
Index: user/alc/PQ_LAUNDRY/lib/libcompiler_rt/Makefile
===================================================================
--- user/alc/PQ_LAUNDRY/lib/libcompiler_rt/Makefile	(revision 306282)
+++ user/alc/PQ_LAUNDRY/lib/libcompiler_rt/Makefile	(revision 306283)
@@ -1,245 +1,245 @@
 # $FreeBSD$
 
 .include <src.opts.mk>
 
 PACKAGE=lib${LIB}
 LIB=	compiler_rt
 NO_PIC=
 WARNS?=	2
 
 CFLAGS+=${PICFLAG} -fvisibility=hidden -DVISIBILITY_HIDDEN
-CFLAGS+=-I${.CURDIR}/../../contrib/libcxxrt
+CFLAGS+=-I${SRCTOP}/contrib/libcxxrt
 
 .if ${MACHINE_CPUARCH} == "amd64"
 CRTARCH=x86_64
 .else
 CRTARCH=${MACHINE_CPUARCH}
 .endif
 
-CRTSRC=${.CURDIR}/../../contrib/compiler-rt/lib/builtins
+CRTSRC=${SRCTOP}/contrib/compiler-rt/lib/builtins
 
 .PATH: ${CRTSRC}/${CRTARCH} ${CRTSRC}
 
 SRCF=	absvdi2 \
 	absvsi2 \
 	absvti2 \
 	addvdi3 \
 	addvsi3 \
 	addvti3 \
 	apple_versioning \
 	ashldi3 \
 	ashlti3 \
 	ashrdi3 \
 	ashrti3 \
 	clear_cache \
 	clzdi2 \
 	clzsi2 \
 	clzti2 \
 	cmpdi2 \
 	cmpti2 \
 	ctzdi2 \
 	ctzsi2 \
 	ctzti2 \
 	divdc3 \
 	divdi3 \
 	divmoddi4 \
 	divmodsi4 \
 	divsc3 \
 	divtc3 \
 	divti3 \
 	divxc3 \
 	enable_execute_stack \
 	eprintf \
 	extendhfsf2 \
 	ffsdi2 \
 	ffsti2 \
 	fixdfdi \
 	fixdfti \
 	fixsfdi \
 	fixsfti \
 	fixunsdfdi \
 	fixunsdfsi \
 	fixunsdfti \
 	fixunssfdi \
 	fixunssfsi \
 	fixunssfti \
 	fixunsxfdi \
 	fixunsxfsi \
 	fixunsxfti \
 	fixxfdi \
 	fixxfti \
 	floatdidf \
 	floatdisf \
 	floatditf \
 	floatdixf \
 	floatsitf \
 	floattidf \
 	floattisf \
 	floattixf \
 	floatundidf \
 	floatundisf \
 	floatunditf \
 	floatundixf \
 	floatunsidf \
 	floatunsisf \
 	floatuntidf \
 	floatuntisf \
 	floatuntixf \
 	gcc_personality_v0 \
 	int_util \
 	lshrdi3 \
 	lshrti3 \
 	moddi3 \
 	modti3 \
 	muldc3 \
 	muldi3 \
 	mulodi4 \
 	mulosi4 \
 	muloti4 \
 	mulsc3 \
 	multi3 \
 	mulvdi3 \
 	mulvsi3 \
 	mulvti3 \
 	multc3 \
 	mulxc3 \
 	negdf2 \
 	negdi2 \
 	negsf2 \
 	negti2 \
 	negvdi2 \
 	negvsi2 \
 	negvti2 \
 	paritydi2 \
 	paritysi2 \
 	parityti2 \
 	popcountdi2 \
 	popcountsi2 \
 	popcountti2 \
 	powidf2 \
 	powisf2 \
 	powitf2 \
 	powixf2 \
 	subvdi3 \
 	subvsi3 \
 	subvti3 \
 	trampoline_setup \
 	truncdfhf2 \
 	truncsfhf2 \
 	ucmpdi2 \
 	ucmpti2 \
 	udivdi3 \
 	udivmoddi4 \
 	udivmodsi4 \
 	udivmodti4 \
 	udivti3 \
 	umoddi3 \
 	umodti3
 
 # 128-bit quad precision long double support, only used on arm64
 .if ${MACHINE_CPUARCH} == "aarch64"
 SRCF+=	addtf3 \
 	comparetf2 \
 	divtf3 \
 	extenddftf2 \
 	extendsftf2 \
 	fixtfdi \
 	fixtfsi \
 	fixtfti \
 	fixunstfdi \
 	fixunstfsi \
 	fixunstfti \
 	floatunsitf \
 	multf3 \
 	subtf3 \
 	trunctfdf2 \
 	trunctfsf2
 .endif
 
 # These are already shipped by libc.a on arm and mips
 .if ${MACHINE_CPUARCH} != "arm" && ${MACHINE_CPUARCH} != "mips"
 SRCF+=	adddf3 \
 	addsf3 \
 	divdf3 \
 	divsf3 \
 	extendsfdf2 \
 	fixdfsi \
 	fixsfsi \
 	floatsidf \
 	floatsisf \
 	muldf3 \
 	mulsf3 \
 	subdf3 \
 	subsf3 \
 	truncdfsf2
 .endif
 
 .if ${MACHINE_CPUARCH} != "arm"
 SRCF+=	comparedf2 \
 	comparesf2
 .endif
 
 .if ${MACHINE_CPUARCH} != "mips"
 SRCF+=	divsi3 \
 	modsi3 \
 	udivsi3 \
 	umodsi3
 .endif
 
 # FreeBSD-specific atomic intrinsics.
 .if ${MACHINE_CPUARCH} == "arm" || ${MACHINE_CPUARCH} == "armv6"
 .PATH: ${.CURDIR}/../../sys/arm/arm
 
 SRCF+=	stdatomic
 CFLAGS+=	-DEMIT_SYNC_ATOMICS
 .elif ${MACHINE_CPUARCH} == "mips"
 .PATH: ${.CURDIR}/../../sys/mips/mips
 
 SRCF+=	stdatomic
 .endif
 
 .for file in ${SRCF}
 .if ${MACHINE_ARCH:Marmv6*} && (!defined(CPUTYPE) || ${CPUTYPE:M*soft*} == "") && \
     exists(${CRTSRC}/${CRTARCH}/${file}vfp.S)
 SRCS+= ${file}vfp.S
 . elif exists(${CRTSRC}/${CRTARCH}/${file}.S)
 SRCS+=	${file}.S
 . else
 SRCS+=	${file}.c
 . endif
 .endfor
 
 .if ${MACHINE_CPUARCH} == "arm"
 SRCS+=	aeabi_div0.c \
 	aeabi_idivmod.S \
 	aeabi_ldivmod.S \
 	aeabi_memcmp.S \
 	aeabi_memcpy.S \
 	aeabi_memmove.S \
 	aeabi_memset.S \
 	aeabi_uidivmod.S \
 	aeabi_uldivmod.S \
 	bswapdi2.S \
 	bswapsi2.S \
 	switch16.S \
 	switch32.S \
 	switch8.S \
 	switchu8.S \
 	sync_synchronize.S
 .endif
 
 .if ${MK_INSTALLLIB} != "no"
 SYMLINKS+=libcompiler_rt.a ${LIBDIR}/libgcc.a
 .endif
 .if ${MK_PROFILE} != "no"
 SYMLINKS+=libcompiler_rt_p.a ${LIBDIR}/libgcc_p.a
 .endif
 
 .if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "i386" || \
     ${MACHINE_CPUARCH} == "powerpc" || ${MACHINE_ARCH:Marmv6*}
 AFLAGS+=--noexecstack
 ACFLAGS+=-Wa,--noexecstack
 .endif
 
 
 .include <bsd.lib.mk>
Index: user/alc/PQ_LAUNDRY/sbin/mount_msdosfs/mount_msdosfs.8
===================================================================
--- user/alc/PQ_LAUNDRY/sbin/mount_msdosfs/mount_msdosfs.8	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sbin/mount_msdosfs/mount_msdosfs.8	(revision 306283)
@@ -1,223 +1,216 @@
 .\"	$NetBSD: mount_msdos.8,v 1.13 1998/02/06 05:57:00 perry Exp $
 .\"
 .\" Copyright (c) 1993,1994 Christopher G. Demetriou
 .\" All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. All advertising materials mentioning features or use of this software
 .\"    must display the following acknowledgment:
 .\"      This product includes software developed by Christopher G. Demetriou.
 .\" 3. The name of the author may not be used to endorse or promote products
 .\"    derived from this software without specific prior written permission
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 .\" IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 .\" OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 .\" IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 .\" INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 .\" NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 .\" DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 .\" THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 .\" THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
 .Dd October 1, 2013
 .Dt MOUNT_MSDOSFS 8
 .Os
 .Sh NAME
 .Nm mount_msdosfs
 .Nd mount an MS-DOS file system
 .Sh SYNOPSIS
 .Nm
 .Op Fl 9ls
 .Op Fl D Ar DOS_codepage
 .Op Fl g Ar gid
 .Op Fl L Ar locale
 .Op Fl M Ar mask
 .Op Fl m Ar mask
 .Op Fl o Ar options
 .Op Fl u Ar uid
 .Op Fl W Ar table
 .Ar special node
 .Sh DESCRIPTION
 The
 .Nm
 utility attaches the MS-DOS file system residing on
 the device
 .Pa special
 to the global file system namespace at the location
 indicated by
 .Pa node .
 This command is normally executed by
 .Xr mount 8
 at boot time, but can be used by any user to mount an
 MS-DOS file system on any directory that they own (provided,
 of course, that they have appropriate access to the device that
 contains the file system).
 .Pp
 The options are as follows:
 .Bl -tag -width Ds
 .It Fl o Ar options
 Use the specified mount
 .Ar options ,
 as described in
 .Xr mount 8 .
 The following MSDOS file system-specific options are available:
 .Bl -tag -width indent
 .It Cm large
 Support file systems larger than 128 gigabytes at the expense
 of 32 bytes of kernel memory for each file on disk.
 This memory will not be reclaimed until the file system has
 been unmounted.
 .It Cm longnames
 Force Windows 95 long filenames to be visible.
 .It Cm shortnames
 Force only the old MS-DOS 8.3 style filenames to be visible.
 .It Cm nowin95
 Completely ignore Windows 95 extended file information.
 .El
 .It Fl u Ar uid
 Set the owner of the files in the file system to
 .Ar uid .
 The default owner is the owner of the directory
 on which the file system is being mounted.
 .It Fl g Ar gid
 Set the group of the files in the file system to
 .Ar gid .
 The default group is the group of the directory
 on which the file system is being mounted.
 .It Fl m Ar mask
 Specify the maximum file permissions for files
 in the file system.
 (For example, a
 .Ar mask
 of
 .Li 755
 specifies that, by default, the owner should have
 read, write, and execute permissions for files, but
 others should only have read and execute permissions.
 See
 .Xr chmod 1
 for more information about octal file modes.
 Only the nine low-order bits of
 .Ar mask
 are used.
 The value of
 .Ar -M
 is used if it is supplied and
 .Ar -m
 is omitted.
 The default
 .Ar mask
 is taken from the
 directory on which the file system is being mounted.
 .It Fl M Ar mask
 Specify the maximum file permissions for directories
 in the file system.
 The value of
 .Ar -m
 is used if it is supplied and
 .Ar -M
 is omitted.
 See the previous option's description for details.
 .It Fl s
 Force behaviour to
 ignore and not generate Win'95 long filenames.
 .It Fl l
 Force listing and generation of
 Win'95 long filenames
 and separate creation/modification/access dates.
 .Pp
 If neither
 .Fl s
 nor
 .Fl l
 are given,
-.Nm
-searches the root directory of the file system to
-be mounted for any existing Win'95 long filenames.
-If no such entries are found, but short DOS filenames are found,
-.Fl s
-is the default.
-Otherwise
 .Fl l
-is assumed.
+is the default.
 .It Fl 9
 Ignore the special Win'95 directory entries even
 if deleting or renaming a file.
 This forces
 .Fl s .
 .\".It Fl G
 .\"This option causes the file system to be interpreted as an Atari-Gemdos
 .\"file system.
 .\"The differences to the MS-DOS file system are minimal and
 .\"limited to the boot block.
 .\"This option enforces
 .\".Fl s .
 .It Fl L Ar locale
 Specify locale name used for file name conversions
 for DOS and Win'95 names.
 By default ISO 8859-1 assumed as local character set.
 .It Fl D Ar DOS_codepage
 Specify the MS-DOS code page (aka IBM/OEM code page) name used for
 file name conversions for DOS names.
 .It Fl W Ar table
 .Bf Em
 This option is preserved for backward compatibility purpose only,
 and will be removed in the future.
 Please avoid using this option.
 .Ef
 .Pp
 Specify text file name with conversion table:
 .Pa iso22dos , iso72dos , koi2dos , koi8u2dos .
 .El
 .Sh EXAMPLES
 To mount a Russian MS-DOS file system located in
 .Pa /dev/ada1s1 :
 .Pp
 .Dl "mount_msdosfs -L ru_RU.KOI8-R -D CP866 /dev/ada1s1 /mnt"
 .Pp
 To mount a Japanese MS-DOS file system located in
 .Pa /dev/ada1s1 :
 .Pp
 .Dl "mount_msdosfs -L ja_JP.eucJP -D CP932 /dev/ada1s1 /mnt"
 .Sh SEE ALSO
 .Xr mount 2 ,
 .Xr unmount 2 ,
 .Xr fstab 5 ,
 .Xr msdosfs 5 ,
 .Xr mount 8
 .Pp
 List of Localized MS Operating Systems:
 .Pa http://www.microsoft.com/globaldev/reference/oslocversion.mspx .
 .Sh HISTORY
 The
 .Nm
 utility first appeared in
 .Fx 2.0 .
 Its predecessor, the
 .Nm mount_pcfs
 utility appeared in
 .Fx 1.0 ,
 and was abandoned in favor
 of the more aptly-named
 .Nm .
 .Pp
 The character code conversion routine was added by
 .An Ryuichiro Imura Aq Mt imura@ryu16.org
 in 2003.
 .Sh CAVEATS
 The use of the
 .Fl 9
 flag could result in damaged file systems,
 albeit the damage is in part taken care of by
 procedures similar to the ones used in Win'95.
Index: user/alc/PQ_LAUNDRY/share/mk/src.opts.mk
===================================================================
--- user/alc/PQ_LAUNDRY/share/mk/src.opts.mk	(revision 306282)
+++ user/alc/PQ_LAUNDRY/share/mk/src.opts.mk	(revision 306283)
@@ -1,430 +1,434 @@
 # $FreeBSD$
 #
 # Option file for FreeBSD /usr/src builds.
 #
 # Users define WITH_FOO and WITHOUT_FOO on the command line or in /etc/src.conf
 # and /etc/make.conf files. These translate in the build system to MK_FOO={yes,no}
 # with sensible (usually) defaults.
 #
 # Makefiles must include bsd.opts.mk after defining specific MK_FOO options that
 # are applicable for that Makefile (typically there are none, but sometimes there
 # are exceptions). Recursive makes usually add MK_FOO=no for options that they wish
 # to omit from that make.
 #
 # Makefiles must include bsd.mkopt.mk before they test the value of any MK_FOO
 # variable.
 #
 # Makefiles may also assume that this file is included by src.opts.mk should it
 # need variables defined there prior to the end of the Makefile where
 # bsd.{subdir,lib.bin}.mk is traditionally included.
 #
 # The old-style YES_FOO and NO_FOO are being phased out. No new instances of them
 # should be added. Old instances should be removed since they were just to
 # bridge the gap between FreeBSD 4 and FreeBSD 5.
 #
 # Makefiles should never test WITH_FOO or WITHOUT_FOO directly (although an
 # exception is made for _WITHOUT_SRCONF which turns off this mechanism
 # completely inside bsd.*.mk files).
 #
 
 .if !target(__<src.opts.mk>__)
 __<src.opts.mk>__:
 
 .include <bsd.own.mk>
 
 #
 # Define MK_* variables (which are either "yes" or "no") for users
 # to set via WITH_*/WITHOUT_* in /etc/src.conf and override in the
 # make(1) environment.
 # These should be tested with `== "no"' or `!= "no"' in makefiles.
 # The NO_* variables should only be set by makefiles for variables
 # that haven't been converted over.
 #
 
 # These options are used by src the builds
 
 __DEFAULT_YES_OPTIONS = \
     ACCT \
     ACPI \
     AMD \
     APM \
     AT \
     ATM \
     AUDIT \
     AUTHPF \
     AUTOFS \
     BHYVE \
     BINUTILS \
     BINUTILS_BOOTSTRAP \
     BLACKLIST \
     BLUETOOTH \
     BOOT \
     BOOTPARAMD \
     BOOTPD \
     BSD_CPIO \
     BSDINSTALL \
     BSNMP \
     BZIP2 \
     CALENDAR \
     CAPSICUM \
     CASPER \
     CCD \
     CDDL \
     CPP \
     CROSS_COMPILER \
     CRYPT \
     CTM \
     CUSE \
     CXX \
     DICT \
     DMAGENT \
     DYNAMICROOT \
     ED_CRYPTO \
     EE \
     ELFCOPY_AS_OBJCOPY \
     ELFTOOLCHAIN_BOOTSTRAP \
     EXAMPLES \
     FDT \
     FILE \
     FINGER \
     FLOPPY \
     FMTREE \
     FORTH \
     FP_LIBC \
     FREEBSD_UPDATE \
     FTP \
     GAMES \
     GCOV \
     GDB \
     GNU \
     GNU_GREP_COMPAT \
     GPIO \
     GPL_DTC \
     GROFF \
     HAST \
     HTML \
     HYPERV \
     ICONV \
     INET \
     INET6 \
     INETD \
     IPFILTER \
     IPFW \
     ISCSI \
     JAIL \
     KDUMP \
     KVM \
     LDNS \
     LDNS_UTILS \
     LEGACY_CONSOLE \
     LIB32 \
     LIBPTHREAD \
     LIBTHR \
     LOCALES \
     LOCATE \
     LPR \
     LS_COLORS \
     LZMA_SUPPORT \
     MAIL \
     MAILWRAPPER \
     MAKE \
     MANDOCDB \
     NDIS \
     NETCAT \
     NETGRAPH \
     NLS_CATALOGS \
     NS_CACHING \
     NTP \
     OPENSSL \
     PAM \
     PC_SYSINSTALL \
     PF \
     PKGBOOTSTRAP \
     PMC \
     PORTSNAP \
     PPP \
     QUOTAS \
     RADIUS_SUPPORT \
     RCMDS \
     RBOOTD \
     RESCUE \
     ROUTED \
     SENDMAIL \
     SETUID_LOGIN \
     SHAREDOCS \
     SOURCELESS \
     SOURCELESS_HOST \
     SOURCELESS_UCODE \
     SVNLITE \
     SYSCONS \
     SYSTEM_COMPILER \
     TALK \
     TCP_WRAPPERS \
     TCSH \
     TELNET \
     TESTS \
     TEXTPROC \
     TFTP \
     TIMED \
     UNBOUND \
     USB \
     UTMPX \
     VI \
     VT \
     WIRELESS \
     WPA_SUPPLICANT_EAPOL \
     ZFS \
     ZONEINFO
 
 __DEFAULT_NO_OPTIONS = \
     BSD_GREP \
     CLANG_EXTRAS \
     DTRACE_TESTS \
     EISA \
     HESIOD \
     LIBSOFT \
     NAND \
     OFED \
     OPENLDAP \
     RCS \
     SHARED_TOOLCHAIN \
     SORT_THREADS \
     SVN \
 
 
 #
 # Default behaviour of some options depends on the architecture.  Unfortunately
 # this means that we have to test TARGET_ARCH (the buildworld case) as well
 # as MACHINE_ARCH (the non-buildworld case).  Normally TARGET_ARCH is not
 # used at all in bsd.*.mk, but we have to make an exception here if we want
 # to allow defaults for some things like clang to vary by target architecture.
 # Additional, per-target behavior should be rarely added only after much
 # gnashing of teeth and grinding of gears.
 #
 .if defined(TARGET_ARCH)
 __T=${TARGET_ARCH}
 .else
 __T=${MACHINE_ARCH}
 .endif
 .if defined(TARGET)
 __TT=${TARGET}
 .else
 __TT=${MACHINE}
 .endif
 
 .include <bsd.compiler.mk>
 # If the compiler is not C++11 capable, disable Clang and use GCC instead.
 # This means that architectures that have GCC 4.2 as default can not
 # build Clang without using an external compiler.
 
 .if ${COMPILER_FEATURES:Mc++11} && (${__T} == "aarch64" || \
     ${__T} == "amd64" || ${__TT} == "arm" || ${__T} == "i386")
 # Clang is enabled, and will be installed as the default /usr/bin/cc.
 __DEFAULT_YES_OPTIONS+=CLANG CLANG_BOOTSTRAP CLANG_FULL CLANG_IS_CC
 __DEFAULT_NO_OPTIONS+=GCC GCC_BOOTSTRAP GNUCXX
 .elif ${COMPILER_FEATURES:Mc++11} && ${__T:Mpowerpc*}
 # On powerpc, if an external compiler that supports C++11 is used as ${CC},
 # then Clang is enabled, but GCC is installed as the default /usr/bin/cc.
 __DEFAULT_YES_OPTIONS+=CLANG CLANG_FULL GCC GCC_BOOTSTRAP GNUCXX
 __DEFAULT_NO_OPTIONS+=CLANG_BOOTSTRAP CLANG_IS_CC
 .else
 # Everything else disables Clang, and uses GCC instead.
 __DEFAULT_YES_OPTIONS+=GCC GCC_BOOTSTRAP GNUCXX
 __DEFAULT_NO_OPTIONS+=CLANG CLANG_BOOTSTRAP CLANG_FULL CLANG_IS_CC
 .endif
 # In-tree binutils/gcc are older versions without modern architecture support.
 .if ${__T} == "aarch64" || ${__T} == "riscv64"
 BROKEN_OPTIONS+=BINUTILS BINUTILS_BOOTSTRAP GCC GCC_BOOTSTRAP GDB
 .endif
 .if ${__T} == "riscv64"
 BROKEN_OPTIONS+=PROFILE # "sorry, unimplemented: profiler support for RISC-V"
 BROKEN_OPTIONS+=TESTS   # "undefined reference to `_Unwind_Resume'"
 BROKEN_OPTIONS+=CXX     # "libcxxrt.so: undefined reference to `_Unwind_Resume_or_Rethrow'"
 .endif
 .if ${__T} == "aarch64" || ${__T} == "amd64" || ${__T} == "i386" || \
     ${__T} == "riscv64"
 __DEFAULT_YES_OPTIONS+=LLVM_LIBUNWIND
 .else
 __DEFAULT_NO_OPTIONS+=LLVM_LIBUNWIND
 .endif
 .if ${__T} == "aarch64" || ${__T} == "amd64"
 __DEFAULT_YES_OPTIONS+=LLDB
 .else
 __DEFAULT_NO_OPTIONS+=LLDB
 .endif
 # LLVM lacks support for FreeBSD 64-bit atomic operations for ARMv4/ARMv5
 .if ${__T} == "arm" || ${__T} == "armeb"
 BROKEN_OPTIONS+=LLDB
 .endif
 # Only doing soft float API stuff on armv6
 .if ${__T} != "armv6"
 BROKEN_OPTIONS+=LIBSOFT
 .endif
 
 .include <bsd.mkopt.mk>
 
 #
 # MK_* options that default to "yes" if the compiler is a C++11 compiler.
 #
 .for var in \
     LIBCPLUSPLUS
 .if !defined(MK_${var})
 .if ${COMPILER_FEATURES:Mc++11}
 .if defined(WITHOUT_${var})
 MK_${var}:=	no
 .else
 MK_${var}:=	yes
 .endif
 .else
 .if defined(WITH_${var})
 MK_${var}:=	yes
 .else
 MK_${var}:=	no
 .endif
 .endif
 .endif
 .endfor
 
 #
 # Force some options off if their dependencies are off.
 # Order is somewhat important.
 #
+.if !${COMPILER_FEATURES:Mc++11}
+MK_LLVM_LIBUNWIND:=	no
+.endif
+
 .if ${MK_LIBPTHREAD} == "no"
 MK_LIBTHR:=	no
 .endif
 
 .if ${MK_LDNS} == "no"
 MK_LDNS_UTILS:=	no
 MK_UNBOUND:= no
 .endif
 
 .if ${MK_SOURCELESS} == "no"
 MK_SOURCELESS_HOST:=	no
 MK_SOURCELESS_UCODE:= no
 .endif
 
 .if ${MK_CDDL} == "no"
 MK_ZFS:=	no
 MK_CTF:=	no
 .endif
 
 .if ${MK_CRYPT} == "no"
 MK_OPENSSL:=	no
 MK_OPENSSH:=	no
 MK_KERBEROS:=	no
 .endif
 
 .if ${MK_CXX} == "no"
 MK_CLANG:=	no
 MK_GROFF:=	no
 MK_GNUCXX:=	no
 MK_TESTS:=	no
 .endif
 
 .if ${MK_MAIL} == "no"
 MK_MAILWRAPPER:= no
 MK_SENDMAIL:=	no
 MK_DMAGENT:=	no
 .endif
 
 .if ${MK_NETGRAPH} == "no"
 MK_ATM:=	no
 MK_BLUETOOTH:=	no
 .endif
 
 .if ${MK_OPENSSL} == "no"
 MK_OPENSSH:=	no
 MK_KERBEROS:=	no
 .endif
 
 .if ${MK_PF} == "no"
 MK_AUTHPF:=	no
 .endif
 
 .if ${MK_TESTS} == "no"
 MK_DTRACE_TESTS:= no
 .endif
 
 .if ${MK_TEXTPROC} == "no"
 MK_GROFF:=	no
 .endif
 
 .if ${MK_CROSS_COMPILER} == "no"
 MK_BINUTILS_BOOTSTRAP:= no
 MK_CLANG_BOOTSTRAP:= no
 MK_ELFTOOLCHAIN_BOOTSTRAP:= no
 MK_GCC_BOOTSTRAP:= no
 .endif
 
 .if ${MK_TOOLCHAIN} == "no"
 MK_BINUTILS:=	no
 MK_CLANG:=	no
 MK_GCC:=	no
 MK_GDB:=	no
 MK_INCLUDES:=	no
 MK_LLDB:=	no
 .endif
 
 .if ${MK_CLANG} == "no"
 MK_CLANG_EXTRAS:= no
 MK_CLANG_FULL:= no
 .endif
 
 #
 # Set defaults for the MK_*_SUPPORT variables.
 #
 
 #
 # MK_*_SUPPORT options which default to "yes" unless their corresponding
 # MK_* variable is set to "no".
 #
 .for var in \
     BLACKLIST \
     BZIP2 \
     GNU \
     INET \
     INET6 \
     KERBEROS \
     KVM \
     NETGRAPH \
     PAM \
     TESTS \
     WIRELESS
 .if defined(WITHOUT_${var}_SUPPORT) || ${MK_${var}} == "no"
 MK_${var}_SUPPORT:= no
 .else
 MK_${var}_SUPPORT:= yes
 .endif
 .endfor
 
 #
 # MK_* options whose default value depends on another option.
 #
 .for vv in \
     GSSAPI/KERBEROS \
     MAN_UTILS/MAN
 .if defined(WITH_${vv:H})
 MK_${vv:H}:=	yes
 .elif defined(WITHOUT_${vv:H})
 MK_${vv:H}:=	no
 .else
 MK_${vv:H}:=	${MK_${vv:T}}
 .endif
 .endfor
 
 .if !${COMPILER_FEATURES:Mc++11}
 MK_LLDB:=	no
 .endif
 
 # gcc 4.8 and newer supports libc++, so suppress gnuc++ in that case.
 # while in theory we could build it with that, we don't want to do
 # that since it creates too much confusion for too little gain.
 # XXX: This is incomplete and needs X_COMPILER_TYPE/VERSION checks too
 #      to prevent Makefile.inc1 from bootstrapping unneeded dependencies
 #      and to support 'make delete-old' when supplying an external toolchain.
 .if ${COMPILER_TYPE} == "gcc" && ${COMPILER_VERSION} >= 40800
 MK_GNUCXX:=no
 MK_GCC:=no
 .endif
 
 .endif #  !target(__<src.opts.mk>__)
Index: user/alc/PQ_LAUNDRY/sys/arm/allwinner/a10_common.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/allwinner/a10_common.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/allwinner/a10_common.c	(revision 306283)
@@ -1,72 +1,68 @@
 /*-
  * Copyright (c) 2012 Ganbold Tsagaankhuu <ganbold@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <machine/bus.h>
 #include <machine/vmparam.h>
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 
 static int
 fdt_aintc_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 	int offset;
 
 	if (fdt_is_compatible(node, "allwinner,sun4i-a10-ic"))
 		offset = 0;
 	else if (fdt_is_compatible(node, "arm,gic"))
 		offset = 32;
 	else
 		return (ENXIO);
 
 	*interrupt = fdt32_to_cpu(intr[0]) + offset;
 	*trig = INTR_TRIGGER_CONFORM;
 	*pol = INTR_POLARITY_CONFORM;
 
 	return (0);
 }
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&fdt_aintc_decode_ic,
 	NULL
 };
 
 #endif /* INTRNG */
Index: user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/allwinner/aw_machdep.c	(revision 306283)
@@ -1,277 +1,271 @@
 /*-
  * Copyright (c) 2012 Ganbold Tsagaankhuu <ganbold@freebsd.org>
  * Copyright (c) 2015-2016 Emmanuel Vadot <manu@freebsd.org>
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
 
  * from: FreeBSD: //depot/projects/arm/src/sys/arm/ti/ti_machdep.c
  */
 
 #include "opt_ddb.h"
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/bus.h>
 #include <machine/machdep.h>
 #include <machine/platformvar.h>
 
 #include <dev/fdt/fdt_common.h>
 
 #include <arm/allwinner/aw_mp.h>
 #include <arm/allwinner/aw_wdog.h>
 #include <arm/allwinner/aw_machdep.h>
 
 #include "platform_if.h"
 
 static u_int soc_type;
 static u_int soc_family;
 
 static int
 a10_attach(platform_t plat)
 {
 	soc_type = ALLWINNERSOC_A10;
 	soc_family = ALLWINNERSOC_SUN4I;
 	return (0);
 }
 
 static int
 a13_attach(platform_t plat)
 {
 	soc_type = ALLWINNERSOC_A13;
 	soc_family = ALLWINNERSOC_SUN5I;
 	return (0);
 }
 
 static int
 a20_attach(platform_t plat)
 {
 	soc_type = ALLWINNERSOC_A20;
 	soc_family = ALLWINNERSOC_SUN7I;
 
 	return (0);
 }
 
 static int
 a31_attach(platform_t plat)
 {
 	soc_type = ALLWINNERSOC_A31;
 	soc_family = ALLWINNERSOC_SUN6I;
 
 	return (0);
 }
 
 static int
 a31s_attach(platform_t plat)
 {
 	soc_type = ALLWINNERSOC_A31S;
 	soc_family = ALLWINNERSOC_SUN6I;
 
 	return (0);
 }
 
 static int
 a83t_attach(platform_t plat)
 {
 	soc_type = ALLWINNERSOC_A83T;
 	soc_family = ALLWINNERSOC_SUN8I;
 
 	return (0);
 }
 
 static int
 h3_attach(platform_t plat)
 {
 	soc_type = ALLWINNERSOC_H3;
 	soc_family = ALLWINNERSOC_SUN8I;
 
 	return (0);
 }
 
 static vm_offset_t
 allwinner_lastaddr(platform_t plat)
 {
 
 	return (devmap_lastaddr());
 }
 
 /*
  * Set up static device mappings.
  *
  * This covers all the on-chip device with 1MB section mappings, which is good
  * for performance (uses fewer TLB entries for device access).
  *
  * XXX It also covers a block of SRAM and some GPU (mali400) stuff that maybe
  * shouldn't be device-mapped.  The original code mapped a 4MB block, but
  * perhaps a 1MB block would be more appropriate.
  */
 static int
 allwinner_devmap_init(platform_t plat)
 {
 
 	devmap_add_entry(0x01C00000, 0x00400000); /* 4MB */
 
 	return (0);
 }
 
-struct arm32_dma_range *
-bus_dma_get_range(void)
+static void
+allwinner_cpu_reset(platform_t plat)
 {
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
-	return (0);
-}
-
-void
-cpu_reset()
-{
 	aw_wdog_watchdog_reset();
 	printf("Reset failed!\n");
 	while (1);
 }
 
 #if defined(SOC_ALLWINNER_A10)
 static platform_method_t a10_methods[] = {
 	PLATFORMMETHOD(platform_attach,         a10_attach),
 	PLATFORMMETHOD(platform_lastaddr,       allwinner_lastaddr),
 	PLATFORMMETHOD(platform_devmap_init,    allwinner_devmap_init),
+	PLATFORMMETHOD(platform_cpu_reset,	allwinner_cpu_reset),
 
 	PLATFORMMETHOD_END,
 };
 FDT_PLATFORM_DEF(a10, "a10", 0, "allwinner,sun4i-a10", 200);
 #endif
 
 #if defined(SOC_ALLWINNER_A13)
 static platform_method_t a13_methods[] = {
 	PLATFORMMETHOD(platform_attach,         a13_attach),
 	PLATFORMMETHOD(platform_lastaddr,       allwinner_lastaddr),
 	PLATFORMMETHOD(platform_devmap_init,    allwinner_devmap_init),
+	PLATFORMMETHOD(platform_cpu_reset,	allwinner_cpu_reset),
 
 	PLATFORMMETHOD_END,
 };
 FDT_PLATFORM_DEF(a13, "a13", 0, "allwinner,sun5i-a13", 200);
 #endif
 
 #if defined(SOC_ALLWINNER_A20)
 static platform_method_t a20_methods[] = {
 	PLATFORMMETHOD(platform_attach,         a20_attach),
 	PLATFORMMETHOD(platform_lastaddr,       allwinner_lastaddr),
 	PLATFORMMETHOD(platform_devmap_init,    allwinner_devmap_init),
+	PLATFORMMETHOD(platform_cpu_reset,	allwinner_cpu_reset),
 
 #ifdef SMP
 	PLATFORMMETHOD(platform_mp_start_ap,	aw_mp_start_ap),
 	PLATFORMMETHOD(platform_mp_setmaxid,	aw_mp_setmaxid),
 #endif
 	PLATFORMMETHOD_END,
 };
 FDT_PLATFORM_DEF(a20, "a20", 0, "allwinner,sun7i-a20", 200);
 #endif
 
 #if defined(SOC_ALLWINNER_A31)
 static platform_method_t a31_methods[] = {
 	PLATFORMMETHOD(platform_attach,         a31_attach),
 	PLATFORMMETHOD(platform_lastaddr,       allwinner_lastaddr),
 	PLATFORMMETHOD(platform_devmap_init,    allwinner_devmap_init),
+	PLATFORMMETHOD(platform_cpu_reset,	allwinner_cpu_reset),
 
 #ifdef SMP
 	PLATFORMMETHOD(platform_mp_start_ap,	aw_mp_start_ap),
 	PLATFORMMETHOD(platform_mp_setmaxid,	aw_mp_setmaxid),
 #endif
 	PLATFORMMETHOD_END,
 };
 FDT_PLATFORM_DEF(a31, "a31", 0, "allwinner,sun6i-a31", 200);
 #endif
 
 #if defined(SOC_ALLWINNER_A31S)
 static platform_method_t a31s_methods[] = {
 	PLATFORMMETHOD(platform_attach,         a31s_attach),
 	PLATFORMMETHOD(platform_lastaddr,       allwinner_lastaddr),
 	PLATFORMMETHOD(platform_devmap_init,    allwinner_devmap_init),
+	PLATFORMMETHOD(platform_cpu_reset,	allwinner_cpu_reset),
 
 #ifdef SMP
 	PLATFORMMETHOD(platform_mp_start_ap,	aw_mp_start_ap),
 	PLATFORMMETHOD(platform_mp_setmaxid,	aw_mp_setmaxid),
 #endif
 	PLATFORMMETHOD_END,
 };
 FDT_PLATFORM_DEF(a31s, "a31s", 0, "allwinner,sun6i-a31s", 200);
 #endif
 
 #if defined(SOC_ALLWINNER_A83T)
 static platform_method_t a83t_methods[] = {
 	PLATFORMMETHOD(platform_attach,         a83t_attach),
 	PLATFORMMETHOD(platform_lastaddr,       allwinner_lastaddr),
 	PLATFORMMETHOD(platform_devmap_init,    allwinner_devmap_init),
+	PLATFORMMETHOD(platform_cpu_reset,	allwinner_cpu_reset),
 
 #ifdef SMP
 	PLATFORMMETHOD(platform_mp_start_ap,	a83t_mp_start_ap),
 	PLATFORMMETHOD(platform_mp_setmaxid,	aw_mp_setmaxid),
 #endif
 	PLATFORMMETHOD_END,
 };
 FDT_PLATFORM_DEF(a83t, "a83t", 0, "allwinner,sun8i-a83t", 200);
 #endif
 
 #if defined(SOC_ALLWINNER_H3)
 static platform_method_t h3_methods[] = {
 	PLATFORMMETHOD(platform_attach,         h3_attach),
 	PLATFORMMETHOD(platform_lastaddr,       allwinner_lastaddr),
 	PLATFORMMETHOD(platform_devmap_init,    allwinner_devmap_init),
+	PLATFORMMETHOD(platform_cpu_reset,	allwinner_cpu_reset),
 
 #ifdef SMP
 	PLATFORMMETHOD(platform_mp_start_ap,	aw_mp_start_ap),
 	PLATFORMMETHOD(platform_mp_setmaxid,	aw_mp_setmaxid),
 #endif
 	PLATFORMMETHOD_END,
 };
 FDT_PLATFORM_DEF(h3, "h3", 0, "allwinner,sun8i-h3", 200);
 #endif
 
 u_int
 allwinner_soc_type(void)
 {
 	return (soc_type);
 }
 
 u_int
 allwinner_soc_family(void)
 {
 	return (soc_family);
 }
Index: user/alc/PQ_LAUNDRY/sys/arm/altera/socfpga/socfpga_common.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/altera/socfpga/socfpga_common.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/altera/socfpga/socfpga_common.c	(revision 306283)
@@ -1,96 +1,92 @@
 /*-
  * Copyright (c) 2014 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * This software was developed by SRI International and the University of
  * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
  * ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <machine/bus.h>
 #include <machine/fdt.h>
 
 #include <arm/altera/socfpga/socfpga_rstmgr.h>
 
 void
 cpu_reset(void)
 {
 	uint32_t addr, paddr;
 	bus_addr_t vaddr;
 	phandle_t node;
 
 	if (rstmgr_warmreset() == 0)
 		goto end;
 
 	node = OF_finddevice("rstmgr");
 	if (node == -1)
 		goto end;
 
 	if ((OF_getprop(node, "reg", &paddr, sizeof(paddr))) > 0) {
 		addr = fdt32_to_cpu(paddr);
 		if (bus_space_map(fdtbus_bs_tag, addr, 0x8, 0, &vaddr) == 0) {
 			bus_space_write_4(fdtbus_bs_tag, vaddr,
 			    RSTMGR_CTRL, CTRL_SWWARMRSTREQ);
 		}
 	}
 
 end:
 	while (1);
 }
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 static int
 fdt_pic_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 
 	if (!fdt_is_compatible(node, "arm,gic"))
 		return (ENXIO);
 
 	*interrupt = fdt32_to_cpu(intr[0]);
 	*trig = INTR_TRIGGER_CONFORM;
 	*pol = INTR_POLARITY_CONFORM;
 	return (0);
 }
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&fdt_pic_decode_ic,
 	NULL
 };
 #endif
Index: user/alc/PQ_LAUNDRY/sys/arm/altera/socfpga/socfpga_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/altera/socfpga/socfpga_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/altera/socfpga/socfpga_machdep.c	(revision 306283)
@@ -1,116 +1,101 @@
 /*-
  * Copyright (c) 2014 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * This software was developed by SRI International and the University of
  * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
  * ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_ddb.h"
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define	_ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 
 #include <machine/armreg.h>
 #include <machine/bus.h>
 #include <machine/machdep.h>
 #include <machine/platform.h>
 
 vm_offset_t
 platform_lastaddr(void)
 {
 
 	return (devmap_lastaddr());
 }
 
 void
 platform_probe_and_attach(void)
 {
 
 }
 
 void
 platform_gpio_init(void)
 {
 
 }
 
 void
 platform_late_init(void)
 {
 
 }
 
 int
 platform_devmap_init(void)
 {
 
 	/* UART */
 	devmap_add_entry(0xffc00000, 0x100000);
 
 	/*
 	 * USB OTG
 	 *
 	 * We use static device map for USB due to some bug in the Altera
 	 * which throws Translation Fault (P) exception on high load.
 	 * It might be caused due to some power save options being turned
 	 * on or something else.
 	 */
 	devmap_add_entry(0xffb00000, 0x100000);
 
 	/* dwmmc */
 	devmap_add_entry(0xff700000, 0x100000);
 
 	/* scu */
 	devmap_add_entry(0xfff00000, 0x100000);
 
 	/* FPGA memory window, 256MB */
 	devmap_add_entry(0xd0000000, 0x10000000);
-
-	return (0);
-}
-
-struct arm32_dma_range *
-bus_dma_get_range(void)
-{
-
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
 
 	return (0);
 }
Index: user/alc/PQ_LAUNDRY/sys/arm/amlogic/aml8726/aml8726_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/amlogic/aml8726/aml8726_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/amlogic/aml8726/aml8726_machdep.c	(revision 306283)
@@ -1,216 +1,197 @@
 /*-
  * Copyright 2013-2015 John Wehle <john@feith.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_global.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_platform.h"
 
-#define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/bus.h>
 #include <machine/cpufunc.h>
 #include <machine/intr.h>
 #include <machine/machdep.h>
 #include <machine/platform.h>
 
 #include <dev/fdt/fdt_common.h>
 
 #include <arm/amlogic/aml8726/aml8726_soc.h>
 #include <arm/amlogic/aml8726/aml8726_clkmsr.h>
 
 #if defined(SOCDEV_PA) && defined(SOCDEV_VA)
 vm_offset_t aml8726_aobus_kva_base = SOCDEV_VA;
 #else
 vm_offset_t aml8726_aobus_kva_base;
 #endif
 
 static void
 aml8726_fixup_busfreq()
 {
 	phandle_t node;
 	pcell_t freq, prop;
 	ssize_t len;
 
 	/*
 	 * Set the bus-frequency for the SoC simple-bus if it
 	 * needs updating (meaning the current frequency is zero).
 	 */
 
 	if ((freq = aml8726_clkmsr_bus_frequency()) == 0 ||
 	    (node = OF_finddevice("/soc")) == 0 ||
 	    fdt_is_compatible_strict(node, "simple-bus") == 0)
 		while (1);
 
 	freq = cpu_to_fdt32(freq);
 
 	len = OF_getencprop(node, "bus-frequency", &prop, sizeof(prop));
 	if ((len / sizeof(prop)) == 1 && prop == 0)
 		OF_setprop(node, "bus-frequency", (void *)&freq, sizeof(freq));
 }
 
 vm_offset_t
 platform_lastaddr(void)
 {
 
 	return (devmap_lastaddr());
 }
 
 void
 platform_probe_and_attach(void)
 {
 }
 
 void
 platform_gpio_init(void)
 {
 
 	/*
 	 * The UART console driver used for debugging early boot code
 	 * needs to know the virtual base address of the aobus.  It's
 	 * expected to equal SOCDEV_VA prior to initarm calling setttb
 	 * ... afterwards it needs to be updated due to the new page
 	 * tables.
 	 *
 	 * This means there's a deadzone in initarm between setttb
 	 * and platform_gpio_init during which printf can't be used.
 	 */
 	aml8726_aobus_kva_base =
 	    (vm_offset_t)devmap_ptov(0xc8100000, 0x100000);
 
 	/*
 	 * The hardware mux used by clkmsr is unique to the SoC (though
 	 * currently clk81 is at a fixed location, however that might
 	 * change in the future).
 	 */
 	aml8726_identify_soc();
 
 	/*
 	 * My aml8726-m3 development box which identifies the CPU as
 	 * a Cortex A9-r2 rev 4 randomly locks up during boot when WFI
 	 * is used.
 	 */
 	switch (aml8726_soc_hw_rev) {
 	case AML_SOC_HW_REV_M3:
 		cpufuncs.cf_sleep = (void *)cpufunc_nullop;
 		break;
 	default:
 		break;
 	}
 
 	/*
 	 * This FDT fixup should arguably be called through fdt_fixup_table,
 	 * however currently there's no mechanism to specify a fixup which
 	 * should always be invoked.
 	 *
 	 * It needs to be called prior to the console being initialized which
 	 * is why it's called here, rather than from platform_late_init.
 	 */
 	aml8726_fixup_busfreq();
 }
 
 void
 platform_late_init(void)
 {
 }
 
 /*
  * Construct static devmap entries to map out the core
  * peripherals using 1mb section mappings.
  */
 int
 platform_devmap_init(void)
 {
 
 	devmap_add_entry(0xc1100000, 0x200000); /* cbus */
 	devmap_add_entry(0xc4200000, 0x100000); /* pl310 */
 	devmap_add_entry(0xc4300000, 0x100000); /* periph */
 	devmap_add_entry(0xc8000000, 0x100000); /* apbbus */
 	devmap_add_entry(0xc8100000, 0x100000); /* aobus */
 	devmap_add_entry(0xc9000000, 0x800000); /* ahbbus */
 	devmap_add_entry(0xd9000000, 0x100000); /* ahb */
 	devmap_add_entry(0xda000000, 0x100000); /* secbus */
 
 	return (0);
 }
-
-struct arm32_dma_range *
-bus_dma_get_range(void)
-{
-
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
-
-	return (0);
-}
-
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
 
 #ifndef INTRNG
 #ifndef DEV_GIC
 static int
 fdt_pic_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 
 	/*
 	 * The single core chips have just an Amlogic PIC.
 	 */
 	if (!fdt_is_compatible_strict(node, "amlogic,aml8726-pic"))
 		return (ENXIO);
 
 	*interrupt = fdt32_to_cpu(intr[1]);
 	*trig = INTR_TRIGGER_EDGE;
 	*pol = INTR_POLARITY_HIGH;
 
 	return (0);
 }
 #endif
 
 fdt_pic_decode_t fdt_pic_table[] = {
 #ifdef DEV_GIC
 	&gic_decode_fdt,
 #else
 	&fdt_pic_decode_ic,
 #endif
 	NULL
 };
 #endif /* INTRNG */
Index: user/alc/PQ_LAUNDRY/sys/arm/annapurna/alpine/alpine_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/annapurna/alpine/alpine_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/annapurna/alpine/alpine_machdep.c	(revision 306283)
@@ -1,108 +1,93 @@
 /*-
  * Copyright (c) 2013 Ruslan Bukin <br@bsdpad.com>
  * Copyright (c) 2015 Semihalf
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/bus.h>
 #include <machine/frame.h> /* For trapframe_t, used in <machine/machdep.h> */
 #include <machine/machdep.h>
 #include <machine/platform.h>
 #include <machine/fdt.h>
 
 #include <dev/fdt/fdt_common.h>
 
 #include "opt_ddb.h"
 #include "opt_platform.h"
 
 #define	DEVMAP_MAX_VA_ADDRESS		0xF0000000
 bus_addr_t al_devmap_pa;
 bus_addr_t al_devmap_size;
 
 int alpine_get_devmap_base(bus_addr_t *pa, bus_addr_t *size);
 
 vm_offset_t
 platform_lastaddr(void)
 {
 
 	return (DEVMAP_MAX_VA_ADDRESS);
 }
 
 void
 platform_probe_and_attach(void)
 {
 
 }
 
 void
 platform_gpio_init(void)
 {
 
 }
 
 void
 platform_late_init(void)
 {
 
 }
 
 /*
  * Construct devmap table with DT-derived config data.
  */
 int
 platform_devmap_init(void)
 {
 	alpine_get_devmap_base(&al_devmap_pa, &al_devmap_size);
 	devmap_add_entry(al_devmap_pa, al_devmap_size);
-	return (0);
-}
-
-struct arm32_dma_range *
-bus_dma_get_range(void)
-{
-
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
-
 	return (0);
 }
Index: user/alc/PQ_LAUNDRY/sys/arm/annapurna/alpine/common.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/annapurna/alpine/common.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/annapurna/alpine/common.c	(revision 306283)
@@ -1,162 +1,159 @@
 /*-
  * Copyright (c) 2013 Ruslan Bukin <br@bsdpad.com>
  * Copyright (c) 2015 Semihalf.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_platform.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <machine/bus.h>
 #include <machine/fdt.h>
 #include <machine/intr.h>
 
 #define WDTLOAD		0x000
 #define LOAD_MIN	0x00000001
 #define LOAD_MAX	0xFFFFFFFF
 #define WDTVALUE	0x004
 #define WDTCONTROL	0x008
 /* control register masks */
 #define INT_ENABLE	(1 << 0)
 #define RESET_ENABLE	(1 << 1)
 #define WDTLOCK		0xC00
 #define UNLOCK		0x1ACCE551
 #define LOCK		0x00000001
 
 extern bus_addr_t  al_devmap_pa;
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
 
 static int alpine_get_wdt_base(uint32_t *pbase, uint32_t *psize);
 static int alpine_pic_decode_fdt(uint32_t iparent, uint32_t *intr,
     int *interrupt, int *trig, int *pol);
 
 int alpine_get_devmap_base(bus_addr_t *pa, bus_addr_t *size);
 
 int alpine_get_devmap_base(bus_addr_t *pa, bus_addr_t *size)
 {
 	phandle_t node;
 
 	if ((node = OF_finddevice("/")) == 0)
 		return (ENXIO);
 
 	if ((node = fdt_find_compatible(node, "simple-bus", 1)) == 0)
 		return (ENXIO);
 
 	return fdt_get_range(node, 0, pa, size);
 }
 
 static int
 alpine_get_wdt_base(uint32_t *pbase, uint32_t *psize)
 {
 	phandle_t node;
 	u_long base = 0;
 	u_long size = 0;
 
 	if (pbase == NULL || psize == NULL)
 		return (EINVAL);
 
 	if ((node = OF_finddevice("/")) == -1)
 		return (EFAULT);
 
 	if ((node = fdt_find_compatible(node, "simple-bus", 1)) == 0)
 		return (EFAULT);
 
 	if ((node =
 	    fdt_find_compatible(node, "arm,sp805", 1)) == 0)
 		return (EFAULT);
 
 	if (fdt_regsize(node, &base, &size))
 		return (EFAULT);
 
 	*pbase = base;
 	*psize = size;
 
 	return (0);
 }
 
 void
 cpu_reset(void)
 {
 	uint32_t wdbase, wdsize;
 	bus_addr_t wdbaddr;
 	int ret;
 
 	ret = alpine_get_wdt_base(&wdbase, &wdsize);
 	if (ret) {
 		printf("Unable to get WDT base, do power down manually...");
 		goto infinite;
 	}
 
 	ret = bus_space_map(fdtbus_bs_tag, al_devmap_pa + wdbase,
 	    wdsize, 0, &wdbaddr);
 	if (ret) {
 		printf("Unable to map WDT base, do power down manually...");
 		goto infinite;
 	}
 
 	bus_space_write_4(fdtbus_bs_tag, wdbaddr, WDTLOCK, UNLOCK);
 	bus_space_write_4(fdtbus_bs_tag, wdbaddr, WDTLOAD, LOAD_MIN);
 	bus_space_write_4(fdtbus_bs_tag, wdbaddr, WDTCONTROL, INT_ENABLE | RESET_ENABLE);
 
 infinite:
 	while (1) {}
 }
 
 #ifndef INTRNG
 static int
 alpine_pic_decode_fdt(uint32_t iparent, uint32_t *intr, int *interrupt,
     int *trig, int *pol)
 {
 	int rv = 0;
 
 	rv = gic_decode_fdt(iparent, intr, interrupt, trig, pol);
 	if (rv == 0) {
 		/* This was recognized as our PIC and decoded. */
 		interrupt = FDT_MAP_IRQ(iparent, interrupt);
 
 		/* Configure the interrupt if callback provided */
 		if (arm_config_irq)
 			(*arm_config_irq)(*interrupt, *trig, *pol);
 	}
 	return (rv);
 }
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&alpine_pic_decode_fdt,
 	NULL
 };
 #endif
Index: user/alc/PQ_LAUNDRY/sys/arm/arm/busdma_machdep-v6.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/arm/busdma_machdep-v6.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/arm/busdma_machdep-v6.c	(revision 306283)
@@ -1,1741 +1,1698 @@
 /*-
  * Copyright (c) 2012-2015 Ian Lepore
  * Copyright (c) 2010 Mark Tinguely
  * Copyright (c) 2004 Olivier Houchard
  * Copyright (c) 2002 Peter Grehan
  * Copyright (c) 1997, 1998 Justin T. Gibbs.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *  From i386/busdma_machdep.c 191438 2009-04-23 20:24:19Z jhb
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/bus.h>
 #include <sys/busdma_bufalloc.h>
 #include <sys/counter.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/memdesc.h>
 #include <sys/proc.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 
 #include <machine/atomic.h>
 #include <machine/bus.h>
 #include <machine/cpu-v6.h>
 #include <machine/md_var.h>
 
 #if __ARM_ARCH < 6
 #define	BUSDMA_DCACHE_ALIGN	arm_dcache_align
 #define	BUSDMA_DCACHE_MASK	arm_dcache_align_mask
 #else
 #define	BUSDMA_DCACHE_ALIGN	cpuinfo.dcache_line_size
 #define	BUSDMA_DCACHE_MASK	cpuinfo.dcache_line_mask
 #endif
 
 #define	MAX_BPAGES		64
 #define	MAX_DMA_SEGMENTS	4096
 #define	BUS_DMA_EXCL_BOUNCE	BUS_DMA_BUS2
 #define	BUS_DMA_ALIGN_BOUNCE	BUS_DMA_BUS3
 #define	BUS_DMA_COULD_BOUNCE	(BUS_DMA_EXCL_BOUNCE | BUS_DMA_ALIGN_BOUNCE)
 #define	BUS_DMA_MIN_ALLOC_COMP	BUS_DMA_BUS4
 
 struct bounce_zone;
 
 struct bus_dma_tag {
 	bus_dma_tag_t		parent;
 	bus_size_t		alignment;
 	bus_addr_t		boundary;
 	bus_addr_t		lowaddr;
 	bus_addr_t		highaddr;
 	bus_dma_filter_t	*filter;
 	void			*filterarg;
 	bus_size_t		maxsize;
 	u_int			nsegments;
 	bus_size_t		maxsegsz;
 	int			flags;
 	int			ref_count;
 	int			map_count;
 	bus_dma_lock_t		*lockfunc;
 	void			*lockfuncarg;
 	struct bounce_zone	*bounce_zone;
-	/*
-	 * DMA range for this tag.  If the page doesn't fall within
-	 * one of these ranges, an error is returned.  The caller
-	 * may then decide what to do with the transfer.  If the
-	 * range pointer is NULL, it is ignored.
-	 */
-	struct arm32_dma_range	*ranges;
-	int			_nranges;
 };
 
 struct bounce_page {
 	vm_offset_t	vaddr;		/* kva of bounce buffer */
 	bus_addr_t	busaddr;	/* Physical address */
 	vm_offset_t	datavaddr;	/* kva of client data */
 	vm_page_t	datapage;	/* physical page of client data */
 	vm_offset_t	dataoffs;	/* page offset of client data */
 	bus_size_t	datacount;	/* client data count */
 	STAILQ_ENTRY(bounce_page) links;
 };
 
 struct sync_list {
 	vm_offset_t	vaddr;		/* kva of client data */
 	bus_addr_t	paddr;		/* physical address */
 	vm_page_t	pages;		/* starting page of client data */
 	bus_size_t	datacount;	/* client data count */
 };
 
 int busdma_swi_pending;
 
 struct bounce_zone {
 	STAILQ_ENTRY(bounce_zone) links;
 	STAILQ_HEAD(bp_list, bounce_page) bounce_page_list;
 	int		total_bpages;
 	int		free_bpages;
 	int		reserved_bpages;
 	int		active_bpages;
 	int		total_bounced;
 	int		total_deferred;
 	int		map_count;
 	bus_size_t	alignment;
 	bus_addr_t	lowaddr;
 	char		zoneid[8];
 	char		lowaddrid[20];
 	struct sysctl_ctx_list sysctl_tree;
 	struct sysctl_oid *sysctl_tree_top;
 };
 
 static struct mtx bounce_lock;
 static int total_bpages;
 static int busdma_zonecount;
 static uint32_t tags_total;
 static uint32_t maps_total;
 static uint32_t maps_dmamem;
 static uint32_t maps_coherent;
 static counter_u64_t maploads_total;
 static counter_u64_t maploads_bounced;
 static counter_u64_t maploads_coherent;
 static counter_u64_t maploads_dmamem;
 static counter_u64_t maploads_mbuf;
 static counter_u64_t maploads_physmem;
 
 static STAILQ_HEAD(, bounce_zone) bounce_zone_list;
 
 SYSCTL_NODE(_hw, OID_AUTO, busdma, CTLFLAG_RD, 0, "Busdma parameters");
 SYSCTL_UINT(_hw_busdma, OID_AUTO, tags_total, CTLFLAG_RD, &tags_total, 0,
    "Number of active tags");
 SYSCTL_UINT(_hw_busdma, OID_AUTO, maps_total, CTLFLAG_RD, &maps_total, 0,
    "Number of active maps");
 SYSCTL_UINT(_hw_busdma, OID_AUTO, maps_dmamem, CTLFLAG_RD, &maps_dmamem, 0,
    "Number of active maps for bus_dmamem_alloc buffers");
 SYSCTL_UINT(_hw_busdma, OID_AUTO, maps_coherent, CTLFLAG_RD, &maps_coherent, 0,
    "Number of active maps with BUS_DMA_COHERENT flag set");
 SYSCTL_COUNTER_U64(_hw_busdma, OID_AUTO, maploads_total, CTLFLAG_RD,
     &maploads_total, "Number of load operations performed");
 SYSCTL_COUNTER_U64(_hw_busdma, OID_AUTO, maploads_bounced, CTLFLAG_RD,
     &maploads_bounced, "Number of load operations that used bounce buffers");
 SYSCTL_COUNTER_U64(_hw_busdma, OID_AUTO, maploads_coherent, CTLFLAG_RD,
     &maploads_dmamem, "Number of load operations on BUS_DMA_COHERENT memory");
 SYSCTL_COUNTER_U64(_hw_busdma, OID_AUTO, maploads_dmamem, CTLFLAG_RD,
     &maploads_dmamem, "Number of load operations on bus_dmamem_alloc buffers");
 SYSCTL_COUNTER_U64(_hw_busdma, OID_AUTO, maploads_mbuf, CTLFLAG_RD,
     &maploads_mbuf, "Number of load operations for mbufs");
 SYSCTL_COUNTER_U64(_hw_busdma, OID_AUTO, maploads_physmem, CTLFLAG_RD,
     &maploads_physmem, "Number of load operations on physical buffers");
 SYSCTL_INT(_hw_busdma, OID_AUTO, total_bpages, CTLFLAG_RD, &total_bpages, 0,
    "Total bounce pages");
 
 struct bus_dmamap {
 	struct bp_list		bpages;
 	int			pagesneeded;
 	int			pagesreserved;
 	bus_dma_tag_t		dmat;
 	struct memdesc		mem;
 	bus_dmamap_callback_t	*callback;
 	void			*callback_arg;
 	int			flags;
 #define	DMAMAP_COHERENT		(1 << 0)
 #define	DMAMAP_DMAMEM_ALLOC	(1 << 1)
 #define	DMAMAP_MBUF		(1 << 2)
 	STAILQ_ENTRY(bus_dmamap) links;
 	bus_dma_segment_t	*segments;
 	int			sync_count;
 	struct sync_list	slist[];
 };
 
 static STAILQ_HEAD(, bus_dmamap) bounce_map_waitinglist;
 static STAILQ_HEAD(, bus_dmamap) bounce_map_callbacklist;
 
 static void init_bounce_pages(void *dummy);
 static int alloc_bounce_zone(bus_dma_tag_t dmat);
 static int alloc_bounce_pages(bus_dma_tag_t dmat, u_int numpages);
 static int reserve_bounce_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
     int commit);
 static bus_addr_t add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map,
     vm_offset_t vaddr, bus_addr_t addr, bus_size_t size);
 static void free_bounce_page(bus_dma_tag_t dmat, struct bounce_page *bpage);
 static void _bus_dmamap_count_pages(bus_dma_tag_t dmat, pmap_t pmap,
     bus_dmamap_t map, void *buf, bus_size_t buflen, int flags);
 static void _bus_dmamap_count_phys(bus_dma_tag_t dmat, bus_dmamap_t map,
     vm_paddr_t buf, bus_size_t buflen, int flags);
 static int _bus_dmamap_reserve_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
     int flags);
 static void dma_preread_safe(vm_offset_t va, vm_paddr_t pa, vm_size_t size);
 static void dma_dcache_sync(struct sync_list *sl, bus_dmasync_op_t op);
 
 static busdma_bufalloc_t coherent_allocator;	/* Cache of coherent buffers */
 static busdma_bufalloc_t standard_allocator;	/* Cache of standard buffers */
 
 MALLOC_DEFINE(M_BUSDMA, "busdma", "busdma metadata");
 MALLOC_DEFINE(M_BOUNCE, "bounce", "busdma bounce pages");
 
 static void
 busdma_init(void *dummy)
 {
 	int uma_flags;
 
 	maploads_total    = counter_u64_alloc(M_WAITOK);
 	maploads_bounced  = counter_u64_alloc(M_WAITOK);
 	maploads_coherent = counter_u64_alloc(M_WAITOK);
 	maploads_dmamem   = counter_u64_alloc(M_WAITOK);
 	maploads_mbuf     = counter_u64_alloc(M_WAITOK);
 	maploads_physmem  = counter_u64_alloc(M_WAITOK);
 
 	uma_flags = 0;
 
 	/* Create a cache of buffers in standard (cacheable) memory. */
 	standard_allocator = busdma_bufalloc_create("buffer",
 	    BUSDMA_DCACHE_ALIGN,/* minimum_alignment */
 	    NULL,		/* uma_alloc func */
 	    NULL,		/* uma_free func */
 	    uma_flags);		/* uma_zcreate_flags */
 
 #ifdef INVARIANTS
 	/*
 	 * Force UMA zone to allocate service structures like
 	 * slabs using own allocator. uma_debug code performs
 	 * atomic ops on uma_slab_t fields and safety of this
 	 * operation is not guaranteed for write-back caches
 	 */
 	uma_flags = UMA_ZONE_OFFPAGE;
 #endif
 	/*
 	 * Create a cache of buffers in uncacheable memory, to implement the
 	 * BUS_DMA_COHERENT (and potentially BUS_DMA_NOCACHE) flag.
 	 */
 	coherent_allocator = busdma_bufalloc_create("coherent",
 	    BUSDMA_DCACHE_ALIGN,/* minimum_alignment */
 	    busdma_bufalloc_alloc_uncacheable,
 	    busdma_bufalloc_free_uncacheable,
 	    uma_flags);	/* uma_zcreate_flags */
 }
 
 /*
  * This init historically used SI_SUB_VM, but now the init code requires
  * malloc(9) using M_BUSDMA memory and the pcpu zones for counter(9), which get
  * set up by SI_SUB_KMEM and SI_ORDER_LAST, so we'll go right after that by
  * using SI_SUB_KMEM+1.
  */
 SYSINIT(busdma, SI_SUB_KMEM+1, SI_ORDER_FIRST, busdma_init, NULL);
 
 /*
  * This routine checks the exclusion zone constraints from a tag against the
  * physical RAM available on the machine.  If a tag specifies an exclusion zone
  * but there's no RAM in that zone, then we avoid allocating resources to bounce
  * a request, and we can use any memory allocator (as opposed to needing
  * kmem_alloc_contig() just because it can allocate pages in an address range).
  *
  * Most tags have BUS_SPACE_MAXADDR or BUS_SPACE_MAXADDR_32BIT (they are the
  * same value on 32-bit architectures) as their lowaddr constraint, and we can't
  * possibly have RAM at an address higher than the highest address we can
  * express, so we take a fast out.
  */
 static int
 exclusion_bounce_check(vm_offset_t lowaddr, vm_offset_t highaddr)
 {
 	int i;
 
 	if (lowaddr >= BUS_SPACE_MAXADDR)
 		return (0);
 
 	for (i = 0; phys_avail[i] && phys_avail[i + 1]; i += 2) {
 		if ((lowaddr >= phys_avail[i] && lowaddr < phys_avail[i + 1]) ||
 		    (lowaddr < phys_avail[i] && highaddr >= phys_avail[i]))
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Return true if the tag has an exclusion zone that could lead to bouncing.
  */
 static __inline int
 exclusion_bounce(bus_dma_tag_t dmat)
 {
 
 	return (dmat->flags & BUS_DMA_EXCL_BOUNCE);
 }
 
 /*
  * Return true if the given address does not fall on the alignment boundary.
  */
 static __inline int
 alignment_bounce(bus_dma_tag_t dmat, bus_addr_t addr)
 {
 
 	return (addr & (dmat->alignment - 1));
 }
 
 /*
  * Return true if the DMA should bounce because the start or end does not fall
  * on a cacheline boundary (which would require a partial cacheline flush).
  * COHERENT memory doesn't trigger cacheline flushes.  Memory allocated by
  * bus_dmamem_alloc() is always aligned to cacheline boundaries, and there's a
  * strict rule that such memory cannot be accessed by the CPU while DMA is in
  * progress (or by multiple DMA engines at once), so that it's always safe to do
  * full cacheline flushes even if that affects memory outside the range of a
  * given DMA operation that doesn't involve the full allocated buffer.  If we're
  * mapping an mbuf, that follows the same rules as a buffer we allocated.
  */
 static __inline int
 cacheline_bounce(bus_dmamap_t map, bus_addr_t addr, bus_size_t size)
 {
 
 	if (map->flags & (DMAMAP_DMAMEM_ALLOC | DMAMAP_COHERENT | DMAMAP_MBUF))
 		return (0);
 	return ((addr | size) & arm_dcache_align_mask);
 }
 
 /*
  * Return true if we might need to bounce the DMA described by addr and size.
  *
  * This is used to quick-check whether we need to do the more expensive work of
  * checking the DMA page-by-page looking for alignment and exclusion bounces.
  *
  * Note that the addr argument might be either virtual or physical.  It doesn't
  * matter because we only look at the low-order bits, which are the same in both
  * address spaces.
  */
 static __inline int
 might_bounce(bus_dma_tag_t dmat, bus_dmamap_t map, bus_addr_t addr,
     bus_size_t size)
 {
 
 	return ((dmat->flags & BUS_DMA_EXCL_BOUNCE) ||
 	    alignment_bounce(dmat, addr) ||
 	    cacheline_bounce(map, addr, size));
 }
 
 /*
  * Return true if we must bounce the DMA described by paddr and size.
  *
  * Bouncing can be triggered by DMA that doesn't begin and end on cacheline
  * boundaries, or doesn't begin on an alignment boundary, or falls within the
  * exclusion zone of any tag in the ancestry chain.
  *
  * For exclusions, walk the chain of tags comparing paddr to the exclusion zone
  * within each tag.  If the tag has a filter function, use it to decide whether
  * the DMA needs to bounce, otherwise any DMA within the zone bounces.
  */
 static int
 must_bounce(bus_dma_tag_t dmat, bus_dmamap_t map, bus_addr_t paddr,
     bus_size_t size)
 {
 
 	if (cacheline_bounce(map, paddr, size))
 		return (1);
 
 	/*
 	 *  The tag already contains ancestors' alignment restrictions so this
 	 *  check doesn't need to be inside the loop.
 	 */
 	if (alignment_bounce(dmat, paddr))
 		return (1);
 
 	/*
 	 * Even though each tag has an exclusion zone that is a superset of its
 	 * own and all its ancestors' exclusions, the exclusion zone of each tag
 	 * up the chain must be checked within the loop, because the busdma
 	 * rules say the filter function is called only when the address lies
 	 * within the low-highaddr range of the tag that filterfunc belongs to.
 	 */
 	while (dmat != NULL && exclusion_bounce(dmat)) {
 		if ((paddr >= dmat->lowaddr && paddr <= dmat->highaddr) &&
 		    (dmat->filter == NULL ||
 		    dmat->filter(dmat->filterarg, paddr) != 0))
 			return (1);
 		dmat = dmat->parent;
 	}
 
 	return (0);
 }
 
-static __inline struct arm32_dma_range *
-_bus_dma_inrange(struct arm32_dma_range *ranges, int nranges,
-    bus_addr_t curaddr)
-{
-	struct arm32_dma_range *dr;
-	int i;
-
-	for (i = 0, dr = ranges; i < nranges; i++, dr++) {
-		if (curaddr >= dr->dr_sysbase &&
-		    round_page(curaddr) <= (dr->dr_sysbase + dr->dr_len))
-			return (dr);
-	}
-
-	return (NULL);
-}
-
 /*
  * Convenience function for manipulating driver locks from busdma (during
  * busdma_swi, for example).  Drivers that don't provide their own locks
  * should specify &Giant to dmat->lockfuncarg.  Drivers that use their own
  * non-mutex locking scheme don't have to use this at all.
  */
 void
 busdma_lock_mutex(void *arg, bus_dma_lock_op_t op)
 {
 	struct mtx *dmtx;
 
 	dmtx = (struct mtx *)arg;
 	switch (op) {
 	case BUS_DMA_LOCK:
 		mtx_lock(dmtx);
 		break;
 	case BUS_DMA_UNLOCK:
 		mtx_unlock(dmtx);
 		break;
 	default:
 		panic("Unknown operation 0x%x for busdma_lock_mutex!", op);
 	}
 }
 
 /*
  * dflt_lock should never get called.  It gets put into the dma tag when
  * lockfunc == NULL, which is only valid if the maps that are associated
  * with the tag are meant to never be defered.
  * XXX Should have a way to identify which driver is responsible here.
  */
 static void
 dflt_lock(void *arg, bus_dma_lock_op_t op)
 {
 
 	panic("driver error: busdma dflt_lock called");
 }
 
 /*
  * Allocate a device specific dma_tag.
  */
 int
 bus_dma_tag_create(bus_dma_tag_t parent, bus_size_t alignment,
     bus_addr_t boundary, bus_addr_t lowaddr, bus_addr_t highaddr,
     bus_dma_filter_t *filter, void *filterarg, bus_size_t maxsize,
     int nsegments, bus_size_t maxsegsz, int flags, bus_dma_lock_t *lockfunc,
     void *lockfuncarg, bus_dma_tag_t *dmat)
 {
 	bus_dma_tag_t newtag;
 	int error = 0;
 
 	/* Basic sanity checking. */
 	KASSERT(boundary == 0 || powerof2(boundary),
 	    ("dma tag boundary %lu, must be a power of 2", boundary));
 	KASSERT(boundary == 0 || boundary >= maxsegsz,
 	    ("dma tag boundary %lu is < maxsegsz %lu\n", boundary, maxsegsz));
 	KASSERT(alignment != 0 && powerof2(alignment),
 	    ("dma tag alignment %lu, must be non-zero power of 2", alignment));
 	KASSERT(maxsegsz != 0, ("dma tag maxsegsz must not be zero"));
 
 	/* Return a NULL tag on failure */
 	*dmat = NULL;
 
 	newtag = (bus_dma_tag_t)malloc(sizeof(*newtag), M_BUSDMA,
 	    M_ZERO | M_NOWAIT);
 	if (newtag == NULL) {
 		CTR4(KTR_BUSDMA, "%s returned tag %p tag flags 0x%x error %d",
 		    __func__, newtag, 0, error);
 		return (ENOMEM);
 	}
 
 	newtag->parent = parent;
 	newtag->alignment = alignment;
 	newtag->boundary = boundary;
 	newtag->lowaddr = trunc_page((vm_paddr_t)lowaddr) + (PAGE_SIZE - 1);
 	newtag->highaddr = trunc_page((vm_paddr_t)highaddr) +
 	    (PAGE_SIZE - 1);
 	newtag->filter = filter;
 	newtag->filterarg = filterarg;
 	newtag->maxsize = maxsize;
 	newtag->nsegments = nsegments;
 	newtag->maxsegsz = maxsegsz;
 	newtag->flags = flags;
 	newtag->ref_count = 1; /* Count ourself */
 	newtag->map_count = 0;
-	newtag->ranges = bus_dma_get_range();
-	newtag->_nranges = bus_dma_get_range_nb();
 	if (lockfunc != NULL) {
 		newtag->lockfunc = lockfunc;
 		newtag->lockfuncarg = lockfuncarg;
 	} else {
 		newtag->lockfunc = dflt_lock;
 		newtag->lockfuncarg = NULL;
 	}
 
 	/* Take into account any restrictions imposed by our parent tag */
 	if (parent != NULL) {
 		newtag->lowaddr = MIN(parent->lowaddr, newtag->lowaddr);
 		newtag->highaddr = MAX(parent->highaddr, newtag->highaddr);
 		newtag->alignment = MAX(parent->alignment, newtag->alignment);
 		newtag->flags |= parent->flags & BUS_DMA_COULD_BOUNCE;
 		if (newtag->boundary == 0)
 			newtag->boundary = parent->boundary;
 		else if (parent->boundary != 0)
 			newtag->boundary = MIN(parent->boundary,
 					       newtag->boundary);
 		if (newtag->filter == NULL) {
 			/*
 			 * Short circuit to looking at our parent directly
 			 * since we have encapsulated all of its information
 			 */
 			newtag->filter = parent->filter;
 			newtag->filterarg = parent->filterarg;
 			newtag->parent = parent->parent;
 		}
 		if (newtag->parent != NULL)
 			atomic_add_int(&parent->ref_count, 1);
 	}
 
 	if (exclusion_bounce_check(newtag->lowaddr, newtag->highaddr))
 		newtag->flags |= BUS_DMA_EXCL_BOUNCE;
 	if (alignment_bounce(newtag, 1))
 		newtag->flags |= BUS_DMA_ALIGN_BOUNCE;
 
 	/*
 	 * Any request can auto-bounce due to cacheline alignment, in addition
 	 * to any alignment or boundary specifications in the tag, so if the
 	 * ALLOCNOW flag is set, there's always work to do.
 	 */
 	if ((flags & BUS_DMA_ALLOCNOW) != 0) {
 		struct bounce_zone *bz;
 		/*
 		 * Round size up to a full page, and add one more page because
 		 * there can always be one more boundary crossing than the
 		 * number of pages in a transfer.
 		 */
 		maxsize = roundup2(maxsize, PAGE_SIZE) + PAGE_SIZE;
 
 		if ((error = alloc_bounce_zone(newtag)) != 0) {
 			free(newtag, M_BUSDMA);
 			return (error);
 		}
 		bz = newtag->bounce_zone;
 
 		if (ptoa(bz->total_bpages) < maxsize) {
 			int pages;
 
 			pages = atop(maxsize) - bz->total_bpages;
 
 			/* Add pages to our bounce pool */
 			if (alloc_bounce_pages(newtag, pages) < pages)
 				error = ENOMEM;
 		}
 		/* Performed initial allocation */
 		newtag->flags |= BUS_DMA_MIN_ALLOC_COMP;
 	} else
 		newtag->bounce_zone = NULL;
 
 	if (error != 0) {
 		free(newtag, M_BUSDMA);
 	} else {
 		atomic_add_32(&tags_total, 1);
 		*dmat = newtag;
 	}
 	CTR4(KTR_BUSDMA, "%s returned tag %p tag flags 0x%x error %d",
 	    __func__, newtag, (newtag != NULL ? newtag->flags : 0), error);
 	return (error);
 }
 
 int
 bus_dma_tag_destroy(bus_dma_tag_t dmat)
 {
 	bus_dma_tag_t dmat_copy;
 	int error;
 
 	error = 0;
 	dmat_copy = dmat;
 
 	if (dmat != NULL) {
 
 		if (dmat->map_count != 0) {
 			error = EBUSY;
 			goto out;
 		}
 
 		while (dmat != NULL) {
 			bus_dma_tag_t parent;
 
 			parent = dmat->parent;
 			atomic_subtract_int(&dmat->ref_count, 1);
 			if (dmat->ref_count == 0) {
 				atomic_subtract_32(&tags_total, 1);
 				free(dmat, M_BUSDMA);
 				/*
 				 * Last reference count, so
 				 * release our reference
 				 * count on our parent.
 				 */
 				dmat = parent;
 			} else
 				dmat = NULL;
 		}
 	}
 out:
 	CTR3(KTR_BUSDMA, "%s tag %p error %d", __func__, dmat_copy, error);
 	return (error);
 }
 
 static int
 allocate_bz_and_pages(bus_dma_tag_t dmat, bus_dmamap_t mapp)
 {
 	struct bounce_zone *bz;
 	int maxpages;
 	int error;
 
 	if (dmat->bounce_zone == NULL)
 		if ((error = alloc_bounce_zone(dmat)) != 0)
 			return (error);
 	bz = dmat->bounce_zone;
 	/* Initialize the new map */
 	STAILQ_INIT(&(mapp->bpages));
 
 	/*
 	 * Attempt to add pages to our pool on a per-instance basis up to a sane
 	 * limit.  Even if the tag isn't flagged as COULD_BOUNCE due to
 	 * alignment and boundary constraints, it could still auto-bounce due to
 	 * cacheline alignment, which requires at most two bounce pages.
 	 */
 	if (dmat->flags & BUS_DMA_COULD_BOUNCE)
 		maxpages = MAX_BPAGES;
 	else
 		maxpages = 2 * bz->map_count;
 	if ((dmat->flags & BUS_DMA_MIN_ALLOC_COMP) == 0 ||
 	    (bz->map_count > 0 && bz->total_bpages < maxpages)) {
 		int pages;
 
 		pages = atop(roundup2(dmat->maxsize, PAGE_SIZE)) + 1;
 		pages = MIN(maxpages - bz->total_bpages, pages);
 		pages = MAX(pages, 2);
 		if (alloc_bounce_pages(dmat, pages) < pages)
 			return (ENOMEM);
 
 		if ((dmat->flags & BUS_DMA_MIN_ALLOC_COMP) == 0)
 			dmat->flags |= BUS_DMA_MIN_ALLOC_COMP;
 	}
 	bz->map_count++;
 	return (0);
 }
 
 static bus_dmamap_t
 allocate_map(bus_dma_tag_t dmat, int mflags)
 {
 	int mapsize, segsize;
 	bus_dmamap_t map;
 
 	/*
 	 * Allocate the map.  The map structure ends with an embedded
 	 * variable-sized array of sync_list structures.  Following that
 	 * we allocate enough extra space to hold the array of bus_dma_segments.
 	 */
 	KASSERT(dmat->nsegments <= MAX_DMA_SEGMENTS,
 	   ("cannot allocate %u dma segments (max is %u)",
 	    dmat->nsegments, MAX_DMA_SEGMENTS));
 	segsize = sizeof(struct bus_dma_segment) * dmat->nsegments;
 	mapsize = sizeof(*map) + sizeof(struct sync_list) * dmat->nsegments;
 	map = malloc(mapsize + segsize, M_BUSDMA, mflags | M_ZERO);
 	if (map == NULL) {
 		CTR3(KTR_BUSDMA, "%s: tag %p error %d", __func__, dmat, ENOMEM);
 		return (NULL);
 	}
 	map->segments = (bus_dma_segment_t *)((uintptr_t)map + mapsize);
 	return (map);
 }
 
 /*
  * Allocate a handle for mapping from kva/uva/physical
  * address space into bus device space.
  */
 int
 bus_dmamap_create(bus_dma_tag_t dmat, int flags, bus_dmamap_t *mapp)
 {
 	bus_dmamap_t map;
 	int error = 0;
 
 	*mapp = map = allocate_map(dmat, M_NOWAIT);
 	if (map == NULL) {
 		CTR3(KTR_BUSDMA, "%s: tag %p error %d", __func__, dmat, ENOMEM);
 		return (ENOMEM);
 	}
 
 	/*
 	 * Bouncing might be required if the driver asks for an exclusion
 	 * region, a data alignment that is stricter than 1, or DMA that begins
 	 * or ends with a partial cacheline.  Whether bouncing will actually
 	 * happen can't be known until mapping time, but we need to pre-allocate
 	 * resources now because we might not be allowed to at mapping time.
 	 */
 	error = allocate_bz_and_pages(dmat, map);
 	if (error != 0) {
 		free(map, M_BUSDMA);
 		*mapp = NULL;
 		return (error);
 	}
 	if (map->flags & DMAMAP_COHERENT)
 		atomic_add_32(&maps_coherent, 1);
 	atomic_add_32(&maps_total, 1);
 	dmat->map_count++;
 
 	return (0);
 }
 
 /*
  * Destroy a handle for mapping from kva/uva/physical
  * address space into bus device space.
  */
 int
 bus_dmamap_destroy(bus_dma_tag_t dmat, bus_dmamap_t map)
 {
 
 	if (STAILQ_FIRST(&map->bpages) != NULL || map->sync_count != 0) {
 		CTR3(KTR_BUSDMA, "%s: tag %p error %d",
 		    __func__, dmat, EBUSY);
 		return (EBUSY);
 	}
 	if (dmat->bounce_zone)
 		dmat->bounce_zone->map_count--;
 	if (map->flags & DMAMAP_COHERENT)
 		atomic_subtract_32(&maps_coherent, 1);
 	atomic_subtract_32(&maps_total, 1);
 	free(map, M_BUSDMA);
 	dmat->map_count--;
 	CTR2(KTR_BUSDMA, "%s: tag %p error 0", __func__, dmat);
 	return (0);
 }
 
 /*
  * Allocate a piece of memory that can be efficiently mapped into bus device
  * space based on the constraints listed in the dma tag.  Returns a pointer to
  * the allocated memory, and a pointer to an associated bus_dmamap.
  */
 int
 bus_dmamem_alloc(bus_dma_tag_t dmat, void **vaddr, int flags,
     bus_dmamap_t *mapp)
 {
 	busdma_bufalloc_t ba;
 	struct busdma_bufzone *bufzone;
 	bus_dmamap_t map;
 	vm_memattr_t memattr;
 	int mflags;
 
 	if (flags & BUS_DMA_NOWAIT)
 		mflags = M_NOWAIT;
 	else
 		mflags = M_WAITOK;
 	if (flags & BUS_DMA_ZERO)
 		mflags |= M_ZERO;
 
 	*mapp = map = allocate_map(dmat, mflags);
 	if (map == NULL) {
 		CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
 		    __func__, dmat, dmat->flags, ENOMEM);
 		return (ENOMEM);
 	}
 	map->flags = DMAMAP_DMAMEM_ALLOC;
 
 	/* Choose a busdma buffer allocator based on memory type flags. */
 	if (flags & BUS_DMA_COHERENT) {
 		memattr = VM_MEMATTR_UNCACHEABLE;
 		ba = coherent_allocator;
 		map->flags |= DMAMAP_COHERENT;
 	} else {
 		memattr = VM_MEMATTR_DEFAULT;
 		ba = standard_allocator;
 	}
 
 	/*
 	 * Try to find a bufzone in the allocator that holds a cache of buffers
 	 * of the right size for this request.  If the buffer is too big to be
 	 * held in the allocator cache, this returns NULL.
 	 */
 	bufzone = busdma_bufalloc_findzone(ba, dmat->maxsize);
 
 	/*
 	 * Allocate the buffer from the uma(9) allocator if...
 	 *  - It's small enough to be in the allocator (bufzone not NULL).
 	 *  - The alignment constraint isn't larger than the allocation size
 	 *    (the allocator aligns buffers to their size boundaries).
 	 *  - There's no need to handle lowaddr/highaddr exclusion zones.
 	 * else allocate non-contiguous pages if...
 	 *  - The page count that could get allocated doesn't exceed nsegments.
 	 *  - The alignment constraint isn't larger than a page boundary.
 	 *  - There are no boundary-crossing constraints.
 	 * else allocate a block of contiguous pages because one or more of the
 	 * constraints is something that only the contig allocator can fulfill.
 	 */
 	if (bufzone != NULL && dmat->alignment <= bufzone->size &&
 	    !exclusion_bounce(dmat)) {
 		*vaddr = uma_zalloc(bufzone->umazone, mflags);
 	} else if (dmat->nsegments >= btoc(dmat->maxsize) &&
 	    dmat->alignment <= PAGE_SIZE && dmat->boundary == 0) {
 		*vaddr = (void *)kmem_alloc_attr(kernel_arena, dmat->maxsize,
 		    mflags, 0, dmat->lowaddr, memattr);
 	} else {
 		*vaddr = (void *)kmem_alloc_contig(kernel_arena, dmat->maxsize,
 		    mflags, 0, dmat->lowaddr, dmat->alignment, dmat->boundary,
 		    memattr);
 	}
 	if (*vaddr == NULL) {
 		CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
 		    __func__, dmat, dmat->flags, ENOMEM);
 		free(map, M_BUSDMA);
 		*mapp = NULL;
 		return (ENOMEM);
 	}
 	if (map->flags & DMAMAP_COHERENT)
 		atomic_add_32(&maps_coherent, 1);
 	atomic_add_32(&maps_dmamem, 1);
 	atomic_add_32(&maps_total, 1);
 	dmat->map_count++;
 
 	CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
 	    __func__, dmat, dmat->flags, 0);
 	return (0);
 }
 
 /*
  * Free a piece of memory that was allocated via bus_dmamem_alloc, along with
  * its associated map.
  */
 void
 bus_dmamem_free(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map)
 {
 	struct busdma_bufzone *bufzone;
 	busdma_bufalloc_t ba;
 
 	if (map->flags & DMAMAP_COHERENT)
 		ba = coherent_allocator;
 	else
 		ba = standard_allocator;
 
 	bufzone = busdma_bufalloc_findzone(ba, dmat->maxsize);
 
 	if (bufzone != NULL && dmat->alignment <= bufzone->size &&
 	    !exclusion_bounce(dmat))
 		uma_zfree(bufzone->umazone, vaddr);
 	else
 		kmem_free(kernel_arena, (vm_offset_t)vaddr, dmat->maxsize);
 
 	dmat->map_count--;
 	if (map->flags & DMAMAP_COHERENT)
 		atomic_subtract_32(&maps_coherent, 1);
 	atomic_subtract_32(&maps_total, 1);
 	atomic_subtract_32(&maps_dmamem, 1);
 	free(map, M_BUSDMA);
 	CTR3(KTR_BUSDMA, "%s: tag %p flags 0x%x", __func__, dmat, dmat->flags);
 }
 
 static void
 _bus_dmamap_count_phys(bus_dma_tag_t dmat, bus_dmamap_t map, vm_paddr_t buf,
     bus_size_t buflen, int flags)
 {
 	bus_addr_t curaddr;
 	bus_size_t sgsize;
 
 	if (map->pagesneeded == 0) {
 		CTR5(KTR_BUSDMA, "lowaddr= %d, boundary= %d, alignment= %d"
 		    " map= %p, pagesneeded= %d",
 		    dmat->lowaddr, dmat->boundary, dmat->alignment,
 		    map, map->pagesneeded);
 		/*
 		 * Count the number of bounce pages
 		 * needed in order to complete this transfer
 		 */
 		curaddr = buf;
 		while (buflen != 0) {
 			sgsize = MIN(buflen, dmat->maxsegsz);
 			if (must_bounce(dmat, map, curaddr, sgsize) != 0) {
 				sgsize = MIN(sgsize,
 				    PAGE_SIZE - (curaddr & PAGE_MASK));
 				map->pagesneeded++;
 			}
 			curaddr += sgsize;
 			buflen -= sgsize;
 		}
 		CTR1(KTR_BUSDMA, "pagesneeded= %d", map->pagesneeded);
 	}
 }
 
 static void
 _bus_dmamap_count_pages(bus_dma_tag_t dmat, pmap_t pmap, bus_dmamap_t map,
     void *buf, bus_size_t buflen, int flags)
 {
 	vm_offset_t vaddr;
 	vm_offset_t vendaddr;
 	bus_addr_t paddr;
 
 	if (map->pagesneeded == 0) {
 		CTR5(KTR_BUSDMA, "lowaddr= %d, boundary= %d, alignment= %d"
 		    " map= %p, pagesneeded= %d",
 		    dmat->lowaddr, dmat->boundary, dmat->alignment,
 		    map, map->pagesneeded);
 		/*
 		 * Count the number of bounce pages
 		 * needed in order to complete this transfer
 		 */
 		vaddr = (vm_offset_t)buf;
 		vendaddr = (vm_offset_t)buf + buflen;
 
 		while (vaddr < vendaddr) {
 			if (__predict_true(pmap == kernel_pmap))
 				paddr = pmap_kextract(vaddr);
 			else
 				paddr = pmap_extract(pmap, vaddr);
 			if (must_bounce(dmat, map, paddr,
 			    min(vendaddr - vaddr, (PAGE_SIZE - ((vm_offset_t)vaddr &
 			    PAGE_MASK)))) != 0) {
 				map->pagesneeded++;
 			}
 			vaddr += (PAGE_SIZE - ((vm_offset_t)vaddr & PAGE_MASK));
 
 		}
 		CTR1(KTR_BUSDMA, "pagesneeded= %d", map->pagesneeded);
 	}
 }
 
 static int
 _bus_dmamap_reserve_pages(bus_dma_tag_t dmat, bus_dmamap_t map, int flags)
 {
 
 	/* Reserve Necessary Bounce Pages */
 	mtx_lock(&bounce_lock);
 	if (flags & BUS_DMA_NOWAIT) {
 		if (reserve_bounce_pages(dmat, map, 0) != 0) {
 			map->pagesneeded = 0;
 			mtx_unlock(&bounce_lock);
 			return (ENOMEM);
 		}
 	} else {
 		if (reserve_bounce_pages(dmat, map, 1) != 0) {
 			/* Queue us for resources */
 			STAILQ_INSERT_TAIL(&bounce_map_waitinglist, map, links);
 			mtx_unlock(&bounce_lock);
 			return (EINPROGRESS);
 		}
 	}
 	mtx_unlock(&bounce_lock);
 
 	return (0);
 }
 
 /*
  * Add a single contiguous physical range to the segment list.
  */
 static int
 _bus_dmamap_addseg(bus_dma_tag_t dmat, bus_dmamap_t map, bus_addr_t curaddr,
     bus_size_t sgsize, bus_dma_segment_t *segs, int *segp)
 {
 	bus_addr_t baddr, bmask;
 	int seg;
 
 	/*
 	 * Make sure we don't cross any boundaries.
 	 */
 	bmask = ~(dmat->boundary - 1);
 	if (dmat->boundary > 0) {
 		baddr = (curaddr + dmat->boundary) & bmask;
 		if (sgsize > (baddr - curaddr))
 			sgsize = (baddr - curaddr);
-	}
-
-	if (dmat->ranges) {
-		struct arm32_dma_range *dr;
-
-		dr = _bus_dma_inrange(dmat->ranges, dmat->_nranges,
-		    curaddr);
-		if (dr == NULL) {
-			_bus_dmamap_unload(dmat, map);
-			return (0);
-		}
-		/*
-		 * In a valid DMA range.  Translate the physical
-		 * memory address to an address in the DMA window.
-		 */
-		curaddr = (curaddr - dr->dr_sysbase) + dr->dr_busbase;
 	}
 
 	/*
 	 * Insert chunk into a segment, coalescing with
 	 * previous segment if possible.
 	 */
 	seg = *segp;
 	if (seg == -1) {
 		seg = 0;
 		segs[seg].ds_addr = curaddr;
 		segs[seg].ds_len = sgsize;
 	} else {
 		if (curaddr == segs[seg].ds_addr + segs[seg].ds_len &&
 		    (segs[seg].ds_len + sgsize) <= dmat->maxsegsz &&
 		    (dmat->boundary == 0 ||
 		    (segs[seg].ds_addr & bmask) == (curaddr & bmask)))
 			segs[seg].ds_len += sgsize;
 		else {
 			if (++seg >= dmat->nsegments)
 				return (0);
 			segs[seg].ds_addr = curaddr;
 			segs[seg].ds_len = sgsize;
 		}
 	}
 	*segp = seg;
 	return (sgsize);
 }
 
 /*
  * Utility function to load a physical buffer.  segp contains
  * the starting segment on entrace, and the ending segment on exit.
  */
 int
 _bus_dmamap_load_phys(bus_dma_tag_t dmat, bus_dmamap_t map, vm_paddr_t buf,
     bus_size_t buflen, int flags, bus_dma_segment_t *segs, int *segp)
 {
 	bus_addr_t curaddr;
 	bus_addr_t sl_end = 0;
 	bus_size_t sgsize;
 	struct sync_list *sl;
 	int error;
 
 	if (segs == NULL)
 		segs = map->segments;
 
 	counter_u64_add(maploads_total, 1);
 	counter_u64_add(maploads_physmem, 1);
 
 	if (might_bounce(dmat, map, (bus_addr_t)buf, buflen)) {
 		_bus_dmamap_count_phys(dmat, map, buf, buflen, flags);
 		if (map->pagesneeded != 0) {
 			counter_u64_add(maploads_bounced, 1);
 			error = _bus_dmamap_reserve_pages(dmat, map, flags);
 			if (error)
 				return (error);
 		}
 	}
 
 	sl = map->slist + map->sync_count - 1;
 
 	while (buflen > 0) {
 		curaddr = buf;
 		sgsize = MIN(buflen, dmat->maxsegsz);
 		if (map->pagesneeded != 0 && must_bounce(dmat, map, curaddr,
 		    sgsize)) {
 			sgsize = MIN(sgsize, PAGE_SIZE - (curaddr & PAGE_MASK));
 			curaddr = add_bounce_page(dmat, map, 0, curaddr,
 			    sgsize);
 		} else {
 			if (map->sync_count > 0)
 				sl_end = sl->paddr + sl->datacount;
 
 			if (map->sync_count == 0 || curaddr != sl_end) {
 				if (++map->sync_count > dmat->nsegments)
 					break;
 				sl++;
 				sl->vaddr = 0;
 				sl->paddr = curaddr;
 				sl->datacount = sgsize;
 				sl->pages = PHYS_TO_VM_PAGE(curaddr);
 				KASSERT(sl->pages != NULL,
 				    ("%s: page at PA:0x%08lx is not in "
 				    "vm_page_array", __func__, curaddr));
 			} else
 				sl->datacount += sgsize;
 		}
 		sgsize = _bus_dmamap_addseg(dmat, map, curaddr, sgsize, segs,
 		    segp);
 		if (sgsize == 0)
 			break;
 		buf += sgsize;
 		buflen -= sgsize;
 	}
 
 	/*
 	 * Did we fit?
 	 */
 	if (buflen != 0) {
 		_bus_dmamap_unload(dmat, map);
 		return (EFBIG); /* XXX better return value here? */
 	}
 	return (0);
 }
 
 int
 _bus_dmamap_load_ma(bus_dma_tag_t dmat, bus_dmamap_t map,
     struct vm_page **ma, bus_size_t tlen, int ma_offs, int flags,
     bus_dma_segment_t *segs, int *segp)
 {
 
 	return (bus_dmamap_load_ma_triv(dmat, map, ma, tlen, ma_offs, flags,
 	    segs, segp));
 }
 
 /*
  * Utility function to load a linear buffer.  segp contains
  * the starting segment on entrance, and the ending segment on exit.
  */
 int
 _bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
     bus_size_t buflen, pmap_t pmap, int flags, bus_dma_segment_t *segs,
     int *segp)
 {
 	bus_size_t sgsize;
 	bus_addr_t curaddr;
 	bus_addr_t sl_pend = 0;
 	vm_offset_t kvaddr, vaddr, sl_vend = 0;
 	struct sync_list *sl;
 	int error;
 
 	counter_u64_add(maploads_total, 1);
 	if (map->flags & DMAMAP_COHERENT)
 		counter_u64_add(maploads_coherent, 1);
 	if (map->flags & DMAMAP_DMAMEM_ALLOC)
 		counter_u64_add(maploads_dmamem, 1);
 
 	if (segs == NULL)
 		segs = map->segments;
 
 	if (flags & BUS_DMA_LOAD_MBUF) {
 		counter_u64_add(maploads_mbuf, 1);
 		map->flags |= DMAMAP_MBUF;
 	}
 
 	if (might_bounce(dmat, map, (bus_addr_t)buf, buflen)) {
 		_bus_dmamap_count_pages(dmat, pmap, map, buf, buflen, flags);
 		if (map->pagesneeded != 0) {
 			counter_u64_add(maploads_bounced, 1);
 			error = _bus_dmamap_reserve_pages(dmat, map, flags);
 			if (error)
 				return (error);
 		}
 	}
 
 	sl = map->slist + map->sync_count - 1;
 	vaddr = (vm_offset_t)buf;
 
 	while (buflen > 0) {
 		/*
 		 * Get the physical address for this segment.
 		 */
 		if (__predict_true(pmap == kernel_pmap)) {
 			curaddr = pmap_kextract(vaddr);
 			kvaddr = vaddr;
 		} else {
 			curaddr = pmap_extract(pmap, vaddr);
 			kvaddr = 0;
 		}
 
 		/*
 		 * Compute the segment size, and adjust counts.
 		 */
 		sgsize = PAGE_SIZE - (curaddr & PAGE_MASK);
 		if (sgsize > dmat->maxsegsz)
 			sgsize = dmat->maxsegsz;
 		if (buflen < sgsize)
 			sgsize = buflen;
 
 		if (map->pagesneeded != 0 && must_bounce(dmat, map, curaddr,
 		    sgsize)) {
 			curaddr = add_bounce_page(dmat, map, kvaddr, curaddr,
 			    sgsize);
 		} else {
 			if (map->sync_count > 0) {
 				sl_pend = sl->paddr + sl->datacount;
 				sl_vend = sl->vaddr + sl->datacount;
 			}
 
 			if (map->sync_count == 0 ||
 			    (kvaddr != 0 && kvaddr != sl_vend) ||
 			    (curaddr != sl_pend)) {
 
 				if (++map->sync_count > dmat->nsegments)
 					goto cleanup;
 				sl++;
 				sl->vaddr = kvaddr;
 				sl->paddr = curaddr;
 				if (kvaddr != 0) {
 					sl->pages = NULL;
 				} else {
 					sl->pages = PHYS_TO_VM_PAGE(curaddr);
 					KASSERT(sl->pages != NULL,
 					    ("%s: page at PA:0x%08lx is not "
 					    "in vm_page_array", __func__,
 					    curaddr));
 				}
 				sl->datacount = sgsize;
 			} else
 				sl->datacount += sgsize;
 		}
 		sgsize = _bus_dmamap_addseg(dmat, map, curaddr, sgsize, segs,
 		    segp);
 		if (sgsize == 0)
 			break;
 		vaddr += sgsize;
 		buflen -= sgsize;
 	}
 
 cleanup:
 	/*
 	 * Did we fit?
 	 */
 	if (buflen != 0) {
 		_bus_dmamap_unload(dmat, map);
 		return (EFBIG); /* XXX better return value here? */
 	}
 	return (0);
 }
 
 void
 __bus_dmamap_waitok(bus_dma_tag_t dmat, bus_dmamap_t map, struct memdesc *mem,
     bus_dmamap_callback_t *callback, void *callback_arg)
 {
 
 	map->mem = *mem;
 	map->dmat = dmat;
 	map->callback = callback;
 	map->callback_arg = callback_arg;
 }
 
 bus_dma_segment_t *
 _bus_dmamap_complete(bus_dma_tag_t dmat, bus_dmamap_t map,
     bus_dma_segment_t *segs, int nsegs, int error)
 {
 
 	if (segs == NULL)
 		segs = map->segments;
 	return (segs);
 }
 
 /*
  * Release the mapping held by map.
  */
 void
 _bus_dmamap_unload(bus_dma_tag_t dmat, bus_dmamap_t map)
 {
 	struct bounce_page *bpage;
 	struct bounce_zone *bz;
 
 	if ((bz = dmat->bounce_zone) != NULL) {
 		while ((bpage = STAILQ_FIRST(&map->bpages)) != NULL) {
 			STAILQ_REMOVE_HEAD(&map->bpages, links);
 			free_bounce_page(dmat, bpage);
 		}
 
 		bz = dmat->bounce_zone;
 		bz->free_bpages += map->pagesreserved;
 		bz->reserved_bpages -= map->pagesreserved;
 		map->pagesreserved = 0;
 		map->pagesneeded = 0;
 	}
 	map->sync_count = 0;
 	map->flags &= ~DMAMAP_MBUF;
 }
 
 static void
 dma_preread_safe(vm_offset_t va, vm_paddr_t pa, vm_size_t size)
 {
 	/*
 	 * Write back any partial cachelines immediately before and
 	 * after the DMA region.  We don't need to round the address
 	 * down to the nearest cacheline or specify the exact size,
 	 * as dcache_wb_poc() will do the rounding for us and works
 	 * at cacheline granularity.
 	 */
 	if (va & BUSDMA_DCACHE_MASK)
 		dcache_wb_poc(va, pa, 1);
 	if ((va + size) & BUSDMA_DCACHE_MASK)
 		dcache_wb_poc(va + size, pa + size, 1);
 
 	dcache_inv_poc_dma(va, pa, size);
 }
 
 static void
 dma_dcache_sync(struct sync_list *sl, bus_dmasync_op_t op)
 {
 	uint32_t len, offset;
 	vm_page_t m;
 	vm_paddr_t pa;
 	vm_offset_t va, tempva;
 	bus_size_t size;
 
 	offset = sl->paddr & PAGE_MASK;
 	m = sl->pages;
 	size = sl->datacount;
 	pa = sl->paddr;
 
 	for ( ; size != 0; size -= len, pa += len, offset = 0, ++m) {
 		tempva = 0;
 		if (sl->vaddr == 0) {
 			len = min(PAGE_SIZE - offset, size);
 			tempva = pmap_quick_enter_page(m);
 			va = tempva | offset;
 			KASSERT(pa == (VM_PAGE_TO_PHYS(m) | offset),
 			    ("unexpected vm_page_t phys: 0x%08x != 0x%08x",
 			    VM_PAGE_TO_PHYS(m) | offset, pa));
 		} else {
 			len = sl->datacount;
 			va = sl->vaddr;
 		}
 
 		switch (op) {
 		case BUS_DMASYNC_PREWRITE:
 		case BUS_DMASYNC_PREWRITE | BUS_DMASYNC_PREREAD:
 			dcache_wb_poc(va, pa, len);
 			break;
 		case BUS_DMASYNC_PREREAD:
 			/*
 			 * An mbuf may start in the middle of a cacheline. There
 			 * will be no cpu writes to the beginning of that line
 			 * (which contains the mbuf header) while dma is in
 			 * progress.  Handle that case by doing a writeback of
 			 * just the first cacheline before invalidating the
 			 * overall buffer.  Any mbuf in a chain may have this
 			 * misalignment.  Buffers which are not mbufs bounce if
 			 * they are not aligned to a cacheline.
 			 */
 			dma_preread_safe(va, pa, len);
 			break;
 		case BUS_DMASYNC_POSTREAD:
 		case BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE:
 			dcache_inv_poc(va, pa, len);
 			break;
 		default:
 			panic("unsupported combination of sync operations: "
                               "0x%08x\n", op);
 		}
 
 		if (tempva != 0)
 			pmap_quick_remove_page(tempva);
 	}
 }
 
 void
 _bus_dmamap_sync(bus_dma_tag_t dmat, bus_dmamap_t map, bus_dmasync_op_t op)
 {
 	struct bounce_page *bpage;
 	struct sync_list *sl, *end;
 	vm_offset_t datavaddr, tempvaddr;
 
 	if (op == BUS_DMASYNC_POSTWRITE)
 		return;
 
 	/*
 	 * If the buffer was from user space, it is possible that this is not
 	 * the same vm map, especially on a POST operation.  It's not clear that
 	 * dma on userland buffers can work at all right now.  To be safe, until
 	 * we're able to test direct userland dma, panic on a map mismatch.
 	 */
 	if ((bpage = STAILQ_FIRST(&map->bpages)) != NULL) {
 
 		CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x op 0x%x "
 		    "performing bounce", __func__, dmat, dmat->flags, op);
 
 		/*
 		 * For PREWRITE do a writeback.  Clean the caches from the
 		 * innermost to the outermost levels.
 		 */
 		if (op & BUS_DMASYNC_PREWRITE) {
 			while (bpage != NULL) {
 				tempvaddr = 0;
 				datavaddr = bpage->datavaddr;
 				if (datavaddr == 0) {
 					tempvaddr = pmap_quick_enter_page(
 					    bpage->datapage);
 					datavaddr = tempvaddr | bpage->dataoffs;
 				}
 				bcopy((void *)datavaddr, (void *)bpage->vaddr,
 				    bpage->datacount);
 				if (tempvaddr != 0)
 					pmap_quick_remove_page(tempvaddr);
 				dcache_wb_poc(bpage->vaddr, bpage->busaddr,
 				    bpage->datacount);
 				bpage = STAILQ_NEXT(bpage, links);
 			}
 			dmat->bounce_zone->total_bounced++;
 		}
 
 		/*
 		 * Do an invalidate for PREREAD unless a writeback was already
 		 * done above due to PREWRITE also being set.  The reason for a
 		 * PREREAD invalidate is to prevent dirty lines currently in the
 		 * cache from being evicted during the DMA.  If a writeback was
 		 * done due to PREWRITE also being set there will be no dirty
 		 * lines and the POSTREAD invalidate handles the rest. The
 		 * invalidate is done from the innermost to outermost level. If
 		 * L2 were done first, a dirty cacheline could be automatically
 		 * evicted from L1 before we invalidated it, re-dirtying the L2.
 		 */
 		if ((op & BUS_DMASYNC_PREREAD) && !(op & BUS_DMASYNC_PREWRITE)) {
 			bpage = STAILQ_FIRST(&map->bpages);
 			while (bpage != NULL) {
 				dcache_inv_poc_dma(bpage->vaddr, bpage->busaddr,
 				    bpage->datacount);
 				bpage = STAILQ_NEXT(bpage, links);
 			}
 		}
 
 		/*
 		 * Re-invalidate the caches on a POSTREAD, even though they were
 		 * already invalidated at PREREAD time.  Aggressive prefetching
 		 * due to accesses to other data near the dma buffer could have
 		 * brought buffer data into the caches which is now stale.  The
 		 * caches are invalidated from the outermost to innermost; the
 		 * prefetches could be happening right now, and if L1 were
 		 * invalidated first, stale L2 data could be prefetched into L1.
 		 */
 		if (op & BUS_DMASYNC_POSTREAD) {
 			while (bpage != NULL) {
 				dcache_inv_poc(bpage->vaddr, bpage->busaddr,
 				    bpage->datacount);
 				tempvaddr = 0;
 				datavaddr = bpage->datavaddr;
 				if (datavaddr == 0) {
 					tempvaddr = pmap_quick_enter_page(
 					    bpage->datapage);
 					datavaddr = tempvaddr | bpage->dataoffs;
 				}
 				bcopy((void *)bpage->vaddr, (void *)datavaddr,
 				    bpage->datacount);
 				if (tempvaddr != 0)
 					pmap_quick_remove_page(tempvaddr);
 				bpage = STAILQ_NEXT(bpage, links);
 			}
 			dmat->bounce_zone->total_bounced++;
 		}
 	}
 
 	/*
 	 * For COHERENT memory no cache maintenance is necessary, but ensure all
 	 * writes have reached memory for the PREWRITE case.  No action is
 	 * needed for a PREREAD without PREWRITE also set, because that would
 	 * imply that the cpu had written to the COHERENT buffer and expected
 	 * the dma device to see that change, and by definition a PREWRITE sync
 	 * is required to make that happen.
 	 */
 	if (map->flags & DMAMAP_COHERENT) {
 		if (op & BUS_DMASYNC_PREWRITE) {
 			dsb();
 			cpu_l2cache_drain_writebuf();
 		}
 		return;
 	}
 
 	/*
 	 * Cache maintenance for normal (non-COHERENT non-bounce) buffers.  All
 	 * the comments about the sequences for flushing cache levels in the
 	 * bounce buffer code above apply here as well.  In particular, the fact
 	 * that the sequence is inner-to-outer for PREREAD invalidation and
 	 * outer-to-inner for POSTREAD invalidation is not a mistake.
 	 */
 	if (map->sync_count != 0) {
 		sl = &map->slist[0];
 		end = &map->slist[map->sync_count];
 		CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x op 0x%x "
 		    "performing sync", __func__, dmat, dmat->flags, op);
 
 		for ( ; sl != end; ++sl)
 			dma_dcache_sync(sl, op);
 	}
 }
 
 static void
 init_bounce_pages(void *dummy __unused)
 {
 
 	total_bpages = 0;
 	STAILQ_INIT(&bounce_zone_list);
 	STAILQ_INIT(&bounce_map_waitinglist);
 	STAILQ_INIT(&bounce_map_callbacklist);
 	mtx_init(&bounce_lock, "bounce pages lock", NULL, MTX_DEF);
 }
 SYSINIT(bpages, SI_SUB_LOCK, SI_ORDER_ANY, init_bounce_pages, NULL);
 
 static struct sysctl_ctx_list *
 busdma_sysctl_tree(struct bounce_zone *bz)
 {
 
 	return (&bz->sysctl_tree);
 }
 
 static struct sysctl_oid *
 busdma_sysctl_tree_top(struct bounce_zone *bz)
 {
 
 	return (bz->sysctl_tree_top);
 }
 
 static int
 alloc_bounce_zone(bus_dma_tag_t dmat)
 {
 	struct bounce_zone *bz;
 
 	/* Check to see if we already have a suitable zone */
 	STAILQ_FOREACH(bz, &bounce_zone_list, links) {
 		if ((dmat->alignment <= bz->alignment) &&
 		    (dmat->lowaddr >= bz->lowaddr)) {
 			dmat->bounce_zone = bz;
 			return (0);
 		}
 	}
 
 	if ((bz = (struct bounce_zone *)malloc(sizeof(*bz), M_BUSDMA,
 	    M_NOWAIT | M_ZERO)) == NULL)
 		return (ENOMEM);
 
 	STAILQ_INIT(&bz->bounce_page_list);
 	bz->free_bpages = 0;
 	bz->reserved_bpages = 0;
 	bz->active_bpages = 0;
 	bz->lowaddr = dmat->lowaddr;
 	bz->alignment = MAX(dmat->alignment, PAGE_SIZE);
 	bz->map_count = 0;
 	snprintf(bz->zoneid, 8, "zone%d", busdma_zonecount);
 	busdma_zonecount++;
 	snprintf(bz->lowaddrid, 18, "%#jx", (uintmax_t)bz->lowaddr);
 	STAILQ_INSERT_TAIL(&bounce_zone_list, bz, links);
 	dmat->bounce_zone = bz;
 
 	sysctl_ctx_init(&bz->sysctl_tree);
 	bz->sysctl_tree_top = SYSCTL_ADD_NODE(&bz->sysctl_tree,
 	    SYSCTL_STATIC_CHILDREN(_hw_busdma), OID_AUTO, bz->zoneid,
 	    CTLFLAG_RD, 0, "");
 	if (bz->sysctl_tree_top == NULL) {
 		sysctl_ctx_free(&bz->sysctl_tree);
 		return (0);	/* XXX error code? */
 	}
 
 	SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
 	    SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
 	    "total_bpages", CTLFLAG_RD, &bz->total_bpages, 0,
 	    "Total bounce pages");
 	SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
 	    SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
 	    "free_bpages", CTLFLAG_RD, &bz->free_bpages, 0,
 	    "Free bounce pages");
 	SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
 	    SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
 	    "reserved_bpages", CTLFLAG_RD, &bz->reserved_bpages, 0,
 	    "Reserved bounce pages");
 	SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
 	    SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
 	    "active_bpages", CTLFLAG_RD, &bz->active_bpages, 0,
 	    "Active bounce pages");
 	SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
 	    SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
 	    "total_bounced", CTLFLAG_RD, &bz->total_bounced, 0,
 	    "Total bounce requests (pages bounced)");
 	SYSCTL_ADD_INT(busdma_sysctl_tree(bz),
 	    SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
 	    "total_deferred", CTLFLAG_RD, &bz->total_deferred, 0,
 	    "Total bounce requests that were deferred");
 	SYSCTL_ADD_STRING(busdma_sysctl_tree(bz),
 	    SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
 	    "lowaddr", CTLFLAG_RD, bz->lowaddrid, 0, "");
 	SYSCTL_ADD_ULONG(busdma_sysctl_tree(bz),
 	    SYSCTL_CHILDREN(busdma_sysctl_tree_top(bz)), OID_AUTO,
 	    "alignment", CTLFLAG_RD, &bz->alignment, "");
 
 	return (0);
 }
 
 static int
 alloc_bounce_pages(bus_dma_tag_t dmat, u_int numpages)
 {
 	struct bounce_zone *bz;
 	int count;
 
 	bz = dmat->bounce_zone;
 	count = 0;
 	while (numpages > 0) {
 		struct bounce_page *bpage;
 
 		bpage = (struct bounce_page *)malloc(sizeof(*bpage), M_BUSDMA,
 		    M_NOWAIT | M_ZERO);
 
 		if (bpage == NULL)
 			break;
 		bpage->vaddr = (vm_offset_t)contigmalloc(PAGE_SIZE, M_BOUNCE,
 		    M_NOWAIT, 0ul, bz->lowaddr, PAGE_SIZE, 0);
 		if (bpage->vaddr == 0) {
 			free(bpage, M_BUSDMA);
 			break;
 		}
 		bpage->busaddr = pmap_kextract(bpage->vaddr);
 		mtx_lock(&bounce_lock);
 		STAILQ_INSERT_TAIL(&bz->bounce_page_list, bpage, links);
 		total_bpages++;
 		bz->total_bpages++;
 		bz->free_bpages++;
 		mtx_unlock(&bounce_lock);
 		count++;
 		numpages--;
 	}
 	return (count);
 }
 
 static int
 reserve_bounce_pages(bus_dma_tag_t dmat, bus_dmamap_t map, int commit)
 {
 	struct bounce_zone *bz;
 	int pages;
 
 	mtx_assert(&bounce_lock, MA_OWNED);
 	bz = dmat->bounce_zone;
 	pages = MIN(bz->free_bpages, map->pagesneeded - map->pagesreserved);
 	if (commit == 0 && map->pagesneeded > (map->pagesreserved + pages))
 		return (map->pagesneeded - (map->pagesreserved + pages));
 	bz->free_bpages -= pages;
 	bz->reserved_bpages += pages;
 	map->pagesreserved += pages;
 	pages = map->pagesneeded - map->pagesreserved;
 
 	return (pages);
 }
 
 static bus_addr_t
 add_bounce_page(bus_dma_tag_t dmat, bus_dmamap_t map, vm_offset_t vaddr,
     bus_addr_t addr, bus_size_t size)
 {
 	struct bounce_zone *bz;
 	struct bounce_page *bpage;
 
 	KASSERT(dmat->bounce_zone != NULL, ("no bounce zone in dma tag"));
 	KASSERT(map != NULL, ("add_bounce_page: bad map %p", map));
 
 	bz = dmat->bounce_zone;
 	if (map->pagesneeded == 0)
 		panic("add_bounce_page: map doesn't need any pages");
 	map->pagesneeded--;
 
 	if (map->pagesreserved == 0)
 		panic("add_bounce_page: map doesn't need any pages");
 	map->pagesreserved--;
 
 	mtx_lock(&bounce_lock);
 	bpage = STAILQ_FIRST(&bz->bounce_page_list);
 	if (bpage == NULL)
 		panic("add_bounce_page: free page list is empty");
 
 	STAILQ_REMOVE_HEAD(&bz->bounce_page_list, links);
 	bz->reserved_bpages--;
 	bz->active_bpages++;
 	mtx_unlock(&bounce_lock);
 
 	if (dmat->flags & BUS_DMA_KEEP_PG_OFFSET) {
 		/* Page offset needs to be preserved. */
 		bpage->vaddr |= addr & PAGE_MASK;
 		bpage->busaddr |= addr & PAGE_MASK;
 	}
 	bpage->datavaddr = vaddr;
 	bpage->datapage = PHYS_TO_VM_PAGE(addr);
 	bpage->dataoffs = addr & PAGE_MASK;
 	bpage->datacount = size;
 	STAILQ_INSERT_TAIL(&(map->bpages), bpage, links);
 	return (bpage->busaddr);
 }
 
 static void
 free_bounce_page(bus_dma_tag_t dmat, struct bounce_page *bpage)
 {
 	struct bus_dmamap *map;
 	struct bounce_zone *bz;
 
 	bz = dmat->bounce_zone;
 	bpage->datavaddr = 0;
 	bpage->datacount = 0;
 	if (dmat->flags & BUS_DMA_KEEP_PG_OFFSET) {
 		/*
 		 * Reset the bounce page to start at offset 0.  Other uses
 		 * of this bounce page may need to store a full page of
 		 * data and/or assume it starts on a page boundary.
 		 */
 		bpage->vaddr &= ~PAGE_MASK;
 		bpage->busaddr &= ~PAGE_MASK;
 	}
 
 	mtx_lock(&bounce_lock);
 	STAILQ_INSERT_HEAD(&bz->bounce_page_list, bpage, links);
 	bz->free_bpages++;
 	bz->active_bpages--;
 	if ((map = STAILQ_FIRST(&bounce_map_waitinglist)) != NULL) {
 		if (reserve_bounce_pages(map->dmat, map, 1) == 0) {
 			STAILQ_REMOVE_HEAD(&bounce_map_waitinglist, links);
 			STAILQ_INSERT_TAIL(&bounce_map_callbacklist,
 			    map, links);
 			busdma_swi_pending = 1;
 			bz->total_deferred++;
 			swi_sched(vm_ih, 0);
 		}
 	}
 	mtx_unlock(&bounce_lock);
 }
 
 void
 busdma_swi(void)
 {
 	bus_dma_tag_t dmat;
 	struct bus_dmamap *map;
 
 	mtx_lock(&bounce_lock);
 	while ((map = STAILQ_FIRST(&bounce_map_callbacklist)) != NULL) {
 		STAILQ_REMOVE_HEAD(&bounce_map_callbacklist, links);
 		mtx_unlock(&bounce_lock);
 		dmat = map->dmat;
 		dmat->lockfunc(dmat->lockfuncarg, BUS_DMA_LOCK);
 		bus_dmamap_load_mem(map->dmat, map, &map->mem, map->callback,
 		    map->callback_arg, BUS_DMA_WAITOK);
 		dmat->lockfunc(dmat->lockfuncarg, BUS_DMA_UNLOCK);
 		mtx_lock(&bounce_lock);
 	}
 	mtx_unlock(&bounce_lock);
 }
Index: user/alc/PQ_LAUNDRY/sys/arm/arm/platform.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/arm/platform.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/arm/platform.c	(revision 306283)
@@ -1,230 +1,243 @@
 /*-
  * Copyright (c) 2005 Peter Grehan
  * Copyright (c) 2009 Nathan Whitehorn
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Dispatch platform calls to the appropriate platform implementation
  * through a previously registered kernel object.
  */
 
-#define	_ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/ktr.h>
 #include <sys/mutex.h>
 #include <sys/rman.h>
 #include <sys/systm.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/types.h>
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 
 #include <machine/bus_dma.h>
 #include <machine/cpu.h>
 #include <machine/intr.h>
 #include <machine/machdep.h>
 #include <machine/md_var.h>
 #include <machine/platform.h>
 #include <machine/platformvar.h>
 #include <machine/smp.h>
 
 #include "platform_if.h"
 
 static platform_def_t	*plat_def_impl;
 static platform_t	plat_obj;
 static struct kobj_ops	plat_kernel_kops;
 static struct platform_kobj	plat_kernel_obj;
 
 static char plat_name[64];
 SYSCTL_STRING(_hw, OID_AUTO, platform, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, plat_name, 0,
     "Platform currently in use");
 
 /*
  * Platform install routines. Highest priority wins, using the same
  * algorithm as bus attachment.
  */
 SET_DECLARE(platform_set, platform_def_t);
 
 #ifdef MULTIDELAY
 static delay_func platform_delay;
 #endif
 
 void
 platform_probe_and_attach(void)
 {
 	platform_def_t	**platpp, *platp;
 	int		prio, best_prio;
 
 	plat_obj = &plat_kernel_obj;
 	best_prio = 0;
 
 	/*
 	 * We are unable to use TUNABLE_STR as the read will happen
 	 * well after this function has returned.
 	 */
 	TUNABLE_STR_FETCH("hw.platform", plat_name, sizeof(plat_name));
 
 	/*
 	 * Try to locate the best platform kobj
 	 */
 	SET_FOREACH(platpp, platform_set) {
 		platp = *platpp;
 
 		/*
 		 * Take care of compiling the selected class, and
 		 * then statically initialise the MMU object
 		 */
 		kobj_class_compile_static((kobj_class_t)platp,
 		    &plat_kernel_kops);
 		kobj_init_static((kobj_t)plat_obj, (kobj_class_t)platp);
 
 		plat_obj->cls = platp;
 
 		prio = PLATFORM_PROBE(plat_obj);
 
 		/* Check for errors */
 		if (prio > 0)
 			continue;
 
 		/*
 		 * Check if this module was specifically requested through
 		 * the loader tunable we provide.
 		 */
 		if (strcmp(platp->name,plat_name) == 0) {
 			plat_def_impl = platp;
 			break;
 		}
 
 		/* Otherwise, see if it is better than our current best */
 		if (plat_def_impl == NULL || prio > best_prio) {
 			best_prio = prio;
 			plat_def_impl = platp;
 		}
 
 		/*
 		 * We can't free the KOBJ, since it is static. Reset the ops
 		 * member of this class so that we can come back later.
 		 */
 		platp->ops = NULL;
 	}
 
 	if (plat_def_impl == NULL)
 		panic("No platform module found!");
 
 	/*
 	 * Recompile to make sure we ended with the
 	 * correct one, and then attach.
 	 */
 
 	kobj_class_compile_static((kobj_class_t)plat_def_impl,
 	    &plat_kernel_kops);
 	kobj_init_static((kobj_t)plat_obj, (kobj_class_t)plat_def_impl);
 
 	strlcpy(plat_name, plat_def_impl->name, sizeof(plat_name));
 
 #ifdef MULTIDELAY
 	/* Set a default delay function */
 	arm_set_delay(platform_delay, NULL);
 #endif
 
 	PLATFORM_ATTACH(plat_obj);
 }
 
 int
 platform_devmap_init(void)
 {
 
 	return PLATFORM_DEVMAP_INIT(plat_obj);
 }
 
 vm_offset_t
 platform_lastaddr(void)
 {
 
 	return PLATFORM_LASTADDR(plat_obj);
 }
 
 void
 platform_gpio_init(void)
 {
 
 	PLATFORM_GPIO_INIT(plat_obj);
 }
 
 void
 platform_late_init(void)
 {
 
 	PLATFORM_LATE_INIT(plat_obj);
+}
+
+void
+cpu_reset(void)
+{
+
+	PLATFORM_CPU_RESET(plat_obj);
+
+	printf("cpu_reset failed");
+
+	intr_disable();
+	while(1) {
+		cpu_sleep(0);
+	}
 }
 
 #ifdef MULTIDELAY
 static void
 platform_delay(int usec, void *arg __unused)
 {
 	int counts;
 
 	for (; usec > 0; usec--)
 		for (counts = plat_obj->cls->delay_count; counts > 0; counts--)
 			/*
 			 * Prevent the compiler from optimizing
 			 * out the loop
 			 */
 			cpufunc_nullop();
 }
 #endif
 
 #if defined(SMP) && defined(PLATFORM_SMP)
 void
 platform_mp_setmaxid(void)
 {
 	int ncpu;
 
 	PLATFORM_MP_SETMAXID(plat_obj);
 
 	if (TUNABLE_INT_FETCH("hw.ncpu", &ncpu)) {
 		if (ncpu >= 1 && ncpu <= mp_ncpus) {
 			mp_ncpus = ncpu;
 			mp_maxid = ncpu - 1;
 		}
 	}
 }
 
 void
 platform_mp_start_ap(void)
 {
 
 	PLATFORM_MP_START_AP(plat_obj);
 }
 #endif
Index: user/alc/PQ_LAUNDRY/sys/arm/arm/platform_if.m
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/arm/platform_if.m	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/arm/platform_if.m	(revision 306283)
@@ -1,135 +1,142 @@
 #-
 # Copyright (c) 2009 Nathan Whitehorn
 # All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
 # are met:
 # 1. Redistributions of source code must retain the above copyright
 #    notice, this list of conditions and the following disclaimer.
 # 2. Redistributions in binary form must reproduce the above copyright
 #    notice, this list of conditions and the following disclaimer in the
 #    documentation and/or other materials provided with the distribution.
 #
 # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 # ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #
 # $FreeBSD$
 #
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/systm.h>
 #include <sys/smp.h>
 
 #include <machine/machdep.h>
 #include <machine/platform.h>
 #include <machine/platformvar.h>
 #include <machine/smp.h>
 #include <machine/vmparam.h>
 
 /**
  * @defgroup PLATFORM platform - KObj methods for ARM platform
  * implementations
  * @brief A set of methods required by all platform implementations.
  * These are used to bring up secondary CPUs, supply the physical memory
  * map, etc.
  *@{
  */
 
 INTERFACE platform;
 
 #
 # Default implementations
 #
 CODE {
 	static void platform_null_attach(platform_t plat)
 	{
 		return;
 	}
 
 	static void platform_default_mp_setmaxid(platform_t plat)
 	{
 		mp_ncpus = 1;
 		mp_maxid = 0;
 	}
 };
 
 /**
  * @brief Probe for whether we are on this platform, returning the standard
  * newbus probe codes. If we have Open Firmware or a flattened device tree,
  * it is guaranteed to be available at this point.
  */
 METHOD int probe {
 	platform_t	_plat;
 };
 
 /**
  * @brief Attach this platform module. This happens before the MMU is online,
  * so the platform module can install its own high-priority MMU module at
  * this point.
  */
 METHOD int attach {
 	platform_t	_plat;
 } DEFAULT platform_null_attach;
 
 /**
  * @brief Called as one of the last steps of early virtual memory
  * initialization, shortly before the new page tables are installed.
  */
 METHOD int devmap_init {
 	platform_t	_plat;
 };
 
 /**
  * @brief Called after devmap_init(), and must return the address of the
  * first byte of unusable KVA space.  This allows a platform to carve out
  * of the top of the KVA space whatever reserves it needs for things like
  * static device mapping, and this is called to get the value before
  * calling pmap_bootstrap() which uses the value to size the available KVA.
  */
 METHOD vm_offset_t lastaddr {
 	platform_t	_plat;
 };
 
 /**
  * @brief Called after the static device mappings are established and just
  * before cninit(). The intention is that the routine can do any hardware
  * setup (such as gpio or pinmux) necessary to make the console functional.
  */
 METHOD void gpio_init {
 	platform_t	_plat;
 };
 
 /**
  * @brief Called just after cninit(). This is the first of the init
  * routines that can use printf() and expect the output to appear on
  * a standard console.
  */
 METHOD void late_init {
 	platform_t	_plat;
 };
 
 /**
  * @brief Called by cpu_mp_setmaxid() to set mp_maxid and mp_ncpus.
  */
 METHOD void mp_setmaxid {
 	platform_t	_plat;
 } DEFAULT platform_default_mp_setmaxid;
 
 /**
  * @brief Called by cpu_mp_start to start the secondary processors.
  */
 METHOD void mp_start_ap {
 	platform_t	_plat;
 };
+
+/**
+ * @brief Called by cpu_reset to reboot.
+ */
+METHOD void cpu_reset {
+	platform_t	_plat;
+};
Index: user/alc/PQ_LAUNDRY/sys/arm/at91/at91_common.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/at91/at91_common.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/at91/at91_common.c	(revision 306283)
@@ -1,124 +1,120 @@
 /*-
  * Copyright (c) 2014 M. Warner Losh. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 
 #include <machine/intr.h>
 #include <machine/machdep.h>
 #include <machine/platform.h> 
 
 #include <arm/at91/at91var.h>
 #include <arm/at91/at91soc.h>
 #include <arm/at91/at91_aicreg.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <machine/fdt.h>
 
 extern const struct devmap_entry at91_devmap[];
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 static int
 fdt_aic_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 	int offset;
 
 	if (fdt_is_compatible(node, "atmel,at91rm9200-aic"))
 		offset = 0;
 	else
 		return (ENXIO);
 
 	*interrupt = fdt32_to_cpu(intr[0]) + offset;
 	*trig = INTR_TRIGGER_CONFORM;
 	*pol = INTR_POLARITY_CONFORM;
 
 	return (0);
 }
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&fdt_aic_decode_ic,
 	NULL
 };
 #endif
 
 static void
 at91_eoi(void *unused)
 {
 	uint32_t *eoicr = (uint32_t *)(0xdffff000 + IC_EOICR);
 
 	*eoicr = 0;
 }
 
 
 vm_offset_t
 platform_lastaddr(void)
 {
 
 	return (devmap_lastaddr());
 }
 
 void
 platform_probe_and_attach(void)
 {
 
 	arm_post_filter = at91_eoi;
 	at91_soc_id();
 }
 
 int
 platform_devmap_init(void)
 {
 
 //	devmap_add_entry(0xfff00000, 0x00100000); /* 1MB - uart, aic and timers*/
 
 	devmap_register_table(at91_devmap);
 
 	return (0);
 }
 
 void
 platform_gpio_init(void)
 {
 }
 
 void
 platform_late_init(void)
 {
 }
Index: user/alc/PQ_LAUNDRY/sys/arm/broadcom/bcm2835/bcm2835_common.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/broadcom/bcm2835/bcm2835_common.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/broadcom/bcm2835/bcm2835_common.c	(revision 306283)
@@ -1,81 +1,77 @@
 /*-
  * Copyright (C) 2008-2011 MARVELL INTERNATIONAL LTD.
  * All rights reserved.
  *
  * Developed by Semihalf.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of MARVELL nor the names of contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/kdb.h>
 #include <sys/reboot.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <machine/bus.h>
 #include <machine/vmparam.h>
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 static int
 fdt_intc_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 
 	if (fdt_is_compatible(node, "broadcom,bcm2835-armctrl-ic")) {
 		*interrupt = fdt32_to_cpu(intr[0]);
 		*trig = INTR_TRIGGER_CONFORM;
 		*pol = INTR_POLARITY_CONFORM;
 		return (0);
 	}
 #ifdef SOC_BCM2836
 	if (fdt_is_compatible(node, "brcm,bcm2836-l1-intc")) {
 		*interrupt = fdt32_to_cpu(intr[0]) + 72;
 		*trig = INTR_TRIGGER_CONFORM;
 		*pol = INTR_POLARITY_CONFORM;
 		return (0);
 	}
 #endif
 	return (ENXIO);
 }
 
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&fdt_intc_decode_ic,
 	NULL
 };
 #endif /* INTRNG */
Index: user/alc/PQ_LAUNDRY/sys/arm/broadcom/bcm2835/bcm2835_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/broadcom/bcm2835/bcm2835_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/broadcom/bcm2835/bcm2835_machdep.c	(revision 306283)
@@ -1,157 +1,145 @@
 /*-
  * Copyright (c) 2012 Oleksandr Tymoshenko.
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Brini.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: FreeBSD: //depot/projects/arm/src/sys/arm/at91/kb920x_machdep.c, rev 45
  */
 
 #include "opt_ddb.h"
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/bus.h>
 #include <machine/machdep.h>
 #include <machine/platform.h>
 #include <machine/platformvar.h>
 
 #include <dev/fdt/fdt_common.h>
 
 #include <arm/broadcom/bcm2835/bcm2835_wdog.h>
 
 #include "platform_if.h"
 
 static vm_offset_t
 bcm2835_lastaddr(platform_t plat)
 {
 
 	return (devmap_lastaddr());
 }
 
 static void
 bcm2835_late_init(platform_t plat)
 {
 	phandle_t system;
 	pcell_t cells[2];
 	int len;
 
 	system = OF_finddevice("/system");
 	if (system != 0) {
 		len = OF_getprop(system, "linux,serial", &cells, sizeof(cells));
 		if (len > 0)
 			board_set_serial(fdt64_to_cpu(*((uint64_t *)cells)));
 
 		len = OF_getprop(system, "linux,revision", &cells, sizeof(cells));
 		if (len > 0)
 			board_set_revision(fdt32_to_cpu(*((uint32_t *)cells)));
 	}
 }
 
 #ifdef SOC_BCM2835
 /*
  * Set up static device mappings.
  * All on-chip peripherals exist in a 16MB range starting at 0x20000000.
  * Map the entire range using 1MB section mappings.
  */
 static int
 bcm2835_devmap_init(platform_t plat)
 {
 
 	devmap_add_entry(0x20000000, 0x01000000);
 	return (0);
 }
 #endif
 
 #ifdef SOC_BCM2836
 static int
 bcm2836_devmap_init(platform_t plat)
 {
 
 	devmap_add_entry(0x3f000000, 0x01000000);
 	return (0);
 }
 #endif
 
-struct arm32_dma_range *
-bus_dma_get_range(void)
-{
 
-	return (NULL);
-}
 
-int
-bus_dma_get_range_nb(void)
+static void
+bcm2835_cpu_reset(platform_t plat)
 {
-
-	return (0);
-}
-
-void
-cpu_reset()
-{
 	bcmwd_watchdog_reset();
-	while (1);
 }
 
 #ifdef SOC_BCM2835
 static platform_method_t bcm2835_methods[] = {
 	PLATFORMMETHOD(platform_devmap_init,	bcm2835_devmap_init),
 	PLATFORMMETHOD(platform_lastaddr,	bcm2835_lastaddr),
 	PLATFORMMETHOD(platform_late_init,	bcm2835_late_init),
+	PLATFORMMETHOD(platform_cpu_reset,	bcm2835_cpu_reset),
 
 	PLATFORMMETHOD_END,
 };
 FDT_PLATFORM_DEF(bcm2835, "bcm2835", 0, "raspberrypi,model-b", 0);
 #endif
 
 #ifdef SOC_BCM2836
 static platform_method_t bcm2836_methods[] = {
 	PLATFORMMETHOD(platform_devmap_init,	bcm2836_devmap_init),
 	PLATFORMMETHOD(platform_lastaddr,	bcm2835_lastaddr),
 	PLATFORMMETHOD(platform_late_init,	bcm2835_late_init),
+	PLATFORMMETHOD(platform_cpu_reset,	bcm2835_cpu_reset),
 
 	PLATFORMMETHOD_END,
 };
 FDT_PLATFORM_DEF(bcm2836, "bcm2836", 0, "brcm,bcm2709", 0);
 #endif
Index: user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx51_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx51_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx51_machdep.c	(revision 306283)
@@ -1,104 +1,105 @@
 /*-
  * Copyright (c) 2013 Ian Lepore <ian@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/reboot.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 
 #include <machine/bus.h>
 #include <machine/machdep.h>
 #include <machine/platformvar.h> 
 
 #include <arm/freescale/imx/imx_machdep.h>
 
 #include "platform_if.h"
 
 static vm_offset_t
 imx51_lastaddr(platform_t plat)
 {
 
 	return (devmap_lastaddr());
 }
 
 static int
 imx51_attach(platform_t plat)
 {
 
 	/* XXX - Get rid of this stuff soon. */
 	boothowto |= RB_VERBOSE|RB_MULTIPLE;
 	bootverbose = 1;
 
 	return (0);
 }
 
 /*
  * Set up static device mappings.  This is hand-optimized platform-specific
  * config data which covers most of the common on-chip devices with a few 1MB
  * section mappings.
  *
  * Notably missing are entries for GPU, IPU, in general anything video related.
  */
 static int
 imx51_devmap_init(platform_t plat)
 {
 
 	devmap_add_entry(0x70000000, 0x00100000);
 	devmap_add_entry(0x73f00000, 0x00100000);
 	devmap_add_entry(0x83f00000, 0x00100000);
 
 	return (0);
 }
 
-void
-cpu_reset(void)
+static void
+imx51_cpu_reset(platform_t plat)
 {
 
 	imx_wdog_cpu_reset(0x73F98000);
 }
 
 u_int imx_soc_type()
 {
 	return (IMXSOC_51);
 }
 
 static platform_method_t imx51_methods[] = {
 	PLATFORMMETHOD(platform_attach,		imx51_attach),
 	PLATFORMMETHOD(platform_devmap_init,	imx51_devmap_init),
 	PLATFORMMETHOD(platform_lastaddr,	imx51_lastaddr),
+	PLATFORMMETHOD(platform_cpu_reset,	imx51_cpu_reset),
 
 	PLATFORMMETHOD_END,
 };
 
 FDT_PLATFORM_DEF(imx51, "i.MX51", 0, "fsl,imx51", 0);
Index: user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx53_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx53_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx53_machdep.c	(revision 306283)
@@ -1,105 +1,106 @@
 /*-
  * Copyright (c) 2013 Ian Lepore <ian@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/reboot.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 
 #include <machine/bus.h>
 #include <machine/machdep.h>
 #include <machine/platformvar.h> 
 
 #include <arm/freescale/imx/imx_machdep.h>
 
 #include "platform_if.h"
 
 static vm_offset_t
 imx53_lastaddr(platform_t plat)
 {
 
 	return (devmap_lastaddr());
 }
 
 static int
 imx53_attach(platform_t plat)
 {
 
 	/* XXX - Get rid of this stuff soon. */
 	boothowto |= RB_VERBOSE|RB_MULTIPLE;
 	bootverbose = 1;
 
 	return (0);
 }
 
 /*
  * Set up static device mappings.  This is hand-optimized platform-specific
  * config data which covers most of the common on-chip devices with a few 1MB
  * section mappings.
  *
  * Notably missing are entries for GPU, IPU, in general anything video related.
  */
 static int
 imx53_devmap_init(platform_t plat)
 {
 
 	devmap_add_entry(0x50000000, 0x00100000);
 	devmap_add_entry(0x53f00000, 0x00100000);
 	devmap_add_entry(0x63f00000, 0x00100000);
 
 	return (0);
 }
 
-void
-cpu_reset(void)
+static void
+imx53_cpu_reset(platform_t plat)
 {
 
 	imx_wdog_cpu_reset(0x53F98000);
 }
 
 u_int imx_soc_type()
 {
 	return (IMXSOC_53);
 }
 
 static platform_method_t imx53_methods[] = {
 	PLATFORMMETHOD(platform_attach,		imx53_attach),
 	PLATFORMMETHOD(platform_devmap_init,	imx53_devmap_init),
 	PLATFORMMETHOD(platform_lastaddr,	imx53_lastaddr),
+	PLATFORMMETHOD(platform_cpu_reset,	imx53_cpu_reset),
 
 	PLATFORMMETHOD_END,
 };
 
 FDT_PLATFORM_DEF(imx53, "i.MX53", 0, "fsl,imx53", 0);
 
Index: user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx6_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx6_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx6_machdep.c	(revision 306283)
@@ -1,359 +1,356 @@
 /*-
  * Copyright (c) 2013 Ian Lepore <ian@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/reboot.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 
 #include <machine/bus.h>
 #include <machine/intr.h>
 #include <machine/machdep.h>
 #include <machine/platformvar.h>
 
 #include <arm/arm/mpcore_timervar.h>
 #include <arm/freescale/imx/imx6_anatopreg.h>
 #include <arm/freescale/imx/imx6_anatopvar.h>
 #include <arm/freescale/imx/imx_machdep.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include "platform_if.h"
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 static uint32_t gpio1_node;
 
 #ifndef INTRNG
 /*
  * Work around the linux workaround for imx6 erratum 006687, in which some
  * ethernet interrupts don't go to the GPC and thus won't wake the system from
  * Wait mode. We don't use Wait mode (which halts the GIC, leaving only GPC
  * interrupts able to wake the system), so we don't experience the bug at all.
  * The linux workaround is to reconfigure GPIO1_6 as the ENET interrupt by
  * writing magic values to an undocumented IOMUX register, then letting the gpio
  * interrupt driver notify the ethernet driver.  We'll be able to do all that
  * (even though we don't need to) once the INTRNG project is committed and the
  * imx_gpio driver becomes an interrupt driver.  Until then, this crazy little
  * workaround watches for requests to map an interrupt 6 with the interrupt
  * controller node referring to gpio1, and it substitutes the proper ffec
  * interrupt number.
  */
 static int
 imx6_decode_fdt(uint32_t iparent, uint32_t *intr, int *interrupt,
     int *trig, int *pol)
 {
 
 	if (fdt32_to_cpu(intr[0]) == 6 && 
 	    OF_node_from_xref(iparent) == gpio1_node) {
 		*interrupt = 150;
 		*trig = INTR_TRIGGER_CONFORM;
 		*pol  = INTR_POLARITY_CONFORM;
 		return (0);
 	}
 	return (gic_decode_fdt(iparent, intr, interrupt, trig, pol));
 }
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&imx6_decode_fdt,
 	NULL
 };
 #endif
 
 /*
  * Fix FDT data related to interrupts.
  *
  * Driven by the needs of linux and its drivers (as always), the published FDT
  * data for imx6 now sets the interrupt parent for most devices to the GPC
  * interrupt controller, which is for use when the chip is in deep-sleep mode.
  * We don't support deep sleep or have a GPC-PIC driver; we need all interrupts
  * to be handled by the GIC.
  *
  * Luckily, the change to the FDT data was to assign the GPC as the interrupt
  * parent for the soc node and letting that get inherited by all other devices
  * (except a few that directly name GIC as their interrupt parent).  So we can
  * set the world right by just changing the interrupt-parent property of the soc
  * node to refer to GIC instead of GPC.  This will get us by until we write our
  * own GPC driver (or until linux changes its mind and the FDT data again).
  *
  * We validate that we have data that looks like we expect before changing it:
  *  - SOC node exists and has GPC as its interrupt parent.
  *  - GPC node exists and has GIC as its interrupt parent.
  *  - GIC node exists and is its own interrupt parent.
  *
  * This applies to all models of imx6.  Luckily all of them have the devices
  * involved at the same addresses on the same busses, so we don't need any
  * per-soc logic.  We handle this at platform attach time rather than via the
  * fdt_fixup_table, because the latter requires matching on the FDT "model"
  * property, and this applies to all boards including those not yet invented.
  */
 static void
 fix_fdt_interrupt_data(void)
 {
 	phandle_t gicipar, gicnode, gicxref;
 	phandle_t gpcipar, gpcnode, gpcxref;
 	phandle_t socipar, socnode;
 	int result;
 
 	socnode = OF_finddevice("/soc");
 	if (socnode == -1)
 	    return;
 	result = OF_getencprop(socnode, "interrupt-parent", &socipar,
 	    sizeof(socipar));
 	if (result <= 0)
 		return;
 
 	gicnode = OF_finddevice("/soc/interrupt-controller@00a01000");
 	if (gicnode == -1)
 		return;
 	result = OF_getencprop(gicnode, "interrupt-parent", &gicipar,
 	    sizeof(gicipar));
 	if (result <= 0)
 		return;
 	gicxref = OF_xref_from_node(gicnode);
 
 	gpcnode = OF_finddevice("/soc/aips-bus@02000000/gpc@020dc000");
 	if (gpcnode == -1)
 		return;
 	result = OF_getencprop(gpcnode, "interrupt-parent", &gpcipar,
 	    sizeof(gpcipar));
 	if (result <= 0)
 		return;
 	gpcxref = OF_xref_from_node(gpcnode);
 
 	if (socipar != gpcxref || gpcipar != gicxref || gicipar != gicxref)
 		return;
 
 	gicxref = cpu_to_fdt32(gicxref);
 	OF_setprop(socnode, "interrupt-parent", &gicxref, sizeof(gicxref));
 }
 
 static vm_offset_t
 imx6_lastaddr(platform_t plat)
 {
 
 	return (devmap_lastaddr());
 }
 
 static int
 imx6_attach(platform_t plat)
 {
 
 	/* Fix soc interrupt-parent property. */
 	fix_fdt_interrupt_data();
 
 	/* Inform the MPCore timer driver that its clock is variable. */
 	arm_tmr_change_frequency(ARM_TMR_FREQUENCY_VARIES);
 
 	return (0);
 }
 
 static void
 imx6_late_init(platform_t plat)
 {
 	const uint32_t IMX6_WDOG_SR_PHYS = 0x020bc004;
 
 	imx_wdog_init_last_reset(IMX6_WDOG_SR_PHYS);
 
 	/* Cache the gpio1 node handle for imx6_decode_fdt() workaround code. */
 	gpio1_node = OF_node_from_xref(
 	    OF_finddevice("/soc/aips-bus@02000000/gpio@0209c000"));
 }
 
 /*
  * Set up static device mappings.
  *
  * This attempts to cover the most-used devices with 1MB section mappings, which
  * is good for performance (uses fewer TLB entries for device access).
  *
  * ARMMP covers the interrupt controller, MPCore timers, global timer, and the
  * L2 cache controller.  Most of the 1MB range is unused reserved space.
  *
  * AIPS1/AIPS2 cover most of the on-chip devices such as uart, spi, i2c, etc.
  *
  * Notably not mapped right now are HDMI, GPU, and other devices below ARMMP in
  * the memory map.  When we get support for graphics it might make sense to
  * static map some of that area.  Be careful with other things in that area such
  * as OCRAM that probably shouldn't be mapped as VM_MEMATTR_DEVICE memory.
  */
 static int
 imx6_devmap_init(platform_t plat)
 {
 	const uint32_t IMX6_ARMMP_PHYS = 0x00a00000;
 	const uint32_t IMX6_ARMMP_SIZE = 0x00100000;
 	const uint32_t IMX6_AIPS1_PHYS = 0x02000000;
 	const uint32_t IMX6_AIPS1_SIZE = 0x00100000;
 	const uint32_t IMX6_AIPS2_PHYS = 0x02100000;
 	const uint32_t IMX6_AIPS2_SIZE = 0x00100000;
 
 	devmap_add_entry(IMX6_ARMMP_PHYS, IMX6_ARMMP_SIZE);
 	devmap_add_entry(IMX6_AIPS1_PHYS, IMX6_AIPS1_SIZE);
 	devmap_add_entry(IMX6_AIPS2_PHYS, IMX6_AIPS2_SIZE);
 
 	return (0);
 }
 
-void
-cpu_reset(void)
+static void
+imx6_cpu_reset(platform_t plat)
 {
 	const uint32_t IMX6_WDOG_CR_PHYS = 0x020bc000;
 
 	imx_wdog_cpu_reset(IMX6_WDOG_CR_PHYS);
 }
 
 /*
  * Determine what flavor of imx6 we're running on.
  *
  * This code is based on the way u-boot does it.  Information found on the web
  * indicates that Freescale themselves were the original source of this logic,
  * including the strange check for number of CPUs in the SCU configuration
  * register, which is apparently needed on some revisions of the SOLO.
  *
  * According to the documentation, there is such a thing as an i.MX6 Dual
  * (non-lite flavor).  However, Freescale doesn't seem to have assigned it a
  * number or provided any logic to handle it in their detection code.
  *
  * Note that the ANALOG_DIGPROG and SCU configuration registers are not
  * documented in the chip reference manual.  (SCU configuration is mentioned,
  * but not mapped out in detail.)  I think the bottom two bits of the scu config
  * register may be ncpu-1.
  *
  * This hasn't been tested yet on a dual[-lite].
  *
  * On a solo:
  *      digprog    = 0x00610001
  *      hwsoc      = 0x00000062
  *      scu config = 0x00000500
  * On a quad:
  *      digprog    = 0x00630002
  *      hwsoc      = 0x00000063
  *      scu config = 0x00005503
  */
 u_int imx_soc_type()
 {
 	uint32_t digprog, hwsoc;
 	uint32_t *pcr;
 	static u_int soctype;
 	const vm_offset_t SCU_CONFIG_PHYSADDR = 0x00a00004;
 #define	HWSOC_MX6SL	0x60
 #define	HWSOC_MX6DL	0x61
 #define	HWSOC_MX6SOLO	0x62
 #define	HWSOC_MX6Q	0x63
 
 	if (soctype != 0)
 		return (soctype);
 
 	digprog = imx6_anatop_read_4(IMX6_ANALOG_DIGPROG_SL);
 	hwsoc = (digprog >> IMX6_ANALOG_DIGPROG_SOCTYPE_SHIFT) & 
 	    IMX6_ANALOG_DIGPROG_SOCTYPE_MASK;
 
 	if (hwsoc != HWSOC_MX6SL) {
 		digprog = imx6_anatop_read_4(IMX6_ANALOG_DIGPROG);
 		hwsoc = (digprog & IMX6_ANALOG_DIGPROG_SOCTYPE_MASK) >>
 		    IMX6_ANALOG_DIGPROG_SOCTYPE_SHIFT;
 		/*printf("digprog = 0x%08x\n", digprog);*/
 		if (hwsoc == HWSOC_MX6DL) {
 			pcr = devmap_ptov(SCU_CONFIG_PHYSADDR, 4);
 			if (pcr != NULL) {
 				/*printf("scu config = 0x%08x\n", *pcr);*/
 				if ((*pcr & 0x03) == 0) {
 					hwsoc = HWSOC_MX6SOLO;
 				}
 			}
 		}
 	}
 	/* printf("hwsoc 0x%08x\n", hwsoc); */
 
 	switch (hwsoc) {
 	case HWSOC_MX6SL:
 		soctype = IMXSOC_6SL;
 		break;
 	case HWSOC_MX6SOLO:
 		soctype = IMXSOC_6S;
 		break;
 	case HWSOC_MX6DL:
 		soctype = IMXSOC_6DL;
 		break;
 	case HWSOC_MX6Q :
 		soctype = IMXSOC_6Q;
 		break;
 	default:
 		printf("imx_soc_type: Don't understand hwsoc 0x%02x, "
 		    "digprog 0x%08x; assuming IMXSOC_6Q\n", hwsoc, digprog);
 		soctype = IMXSOC_6Q;
 		break;
 	}
 
 	return (soctype);
 }
 
 /*
  * Early putc routine for EARLY_PRINTF support.  To use, add to kernel config:
  *   option SOCDEV_PA=0x02000000
  *   option SOCDEV_VA=0x02000000
  *   option EARLY_PRINTF
  * Resist the temptation to change the #if 0 to #ifdef EARLY_PRINTF here. It
  * makes sense now, but if multiple SOCs do that it will make early_putc another
  * duplicate symbol to be eliminated on the path to a generic kernel.
  */
 #if 0 
 static void 
 imx6_early_putc(int c)
 {
 	volatile uint32_t * UART_STAT_REG = (uint32_t *)0x02020098;
 	volatile uint32_t * UART_TX_REG   = (uint32_t *)0x02020040;
 	const uint32_t      UART_TXRDY    = (1 << 3);
 
 	while ((*UART_STAT_REG & UART_TXRDY) == 0)
 		continue;
 	*UART_TX_REG = c;
 }
 early_putc_t *early_putc = imx6_early_putc;
 #endif
 
 static platform_method_t imx6_methods[] = {
 	PLATFORMMETHOD(platform_attach,		imx6_attach),
 	PLATFORMMETHOD(platform_lastaddr,	imx6_lastaddr),
 	PLATFORMMETHOD(platform_devmap_init,	imx6_devmap_init),
 	PLATFORMMETHOD(platform_late_init,	imx6_late_init),
+	PLATFORMMETHOD(platform_cpu_reset,	imx6_cpu_reset),
 
 	PLATFORMMETHOD_END,
 };
 
 FDT_PLATFORM_DEF2(imx6, imx6s, "i.MX6 Solo", 0, "fsl,imx6s", 0);
 FDT_PLATFORM_DEF2(imx6, imx6d, "i.MX6 Dual", 0, "fsl,imx6d", 0);
 FDT_PLATFORM_DEF2(imx6, imx6q, "i.MX6 Quad", 0, "fsl,imx6q", 0);
Index: user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx_common.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx_common.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx_common.c	(revision 306283)
@@ -1,74 +1,70 @@
 /*-
  * Copyright (C) 2008-2011 MARVELL INTERNATIONAL LTD.
  * Copyright (c) 2012, 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * Developed by Semihalf.
  *
  * Portions of this software were developed by Oleksandr Rybalko
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of MARVELL nor the names of contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/kdb.h>
 #include <sys/reboot.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <machine/bus.h>
 #include <machine/vmparam.h>
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 static int
 fdt_intc_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 
 	*interrupt = fdt32_to_cpu(intr[0]);
 	*trig = INTR_TRIGGER_CONFORM;
 	*pol = INTR_POLARITY_CONFORM;
 
 	return (0);
 }
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&fdt_intc_decode_ic,
 	NULL
 };
 #endif /* INTRNG */
Index: user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/freescale/imx/imx_machdep.c	(revision 306283)
@@ -1,120 +1,105 @@
 /*-
  * Copyright (c) 2013 Ian Lepore <ian@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/reboot.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/armreg.h>
 #include <machine/bus.h>
 #include <machine/machdep.h>
 
 #include <arm/freescale/imx/imx_machdep.h>
 #include <arm/freescale/imx/imx_wdogreg.h>
 
 SYSCTL_NODE(_hw, OID_AUTO, imx, CTLFLAG_RW, NULL, "i.MX container");
 
 static int last_reset_status;
 SYSCTL_UINT(_hw_imx, OID_AUTO, last_reset_status, CTLFLAG_RD, 
     &last_reset_status, 0, "Last reset status register");
 
 SYSCTL_STRING(_hw_imx, OID_AUTO, last_reset_reason, CTLFLAG_RD, 
     "unknown", 0, "Last reset reason");
-
-struct arm32_dma_range *
-bus_dma_get_range(void)
-{
-
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
-
-	return (0);
-}
 
 /*
  * This code which manipulates the watchdog hardware is here to implement
  * cpu_reset() because the watchdog is the only way for software to reset the
  * chip.  Why here and not in imx_wdog.c?  Because there's no requirement that
  * the watchdog driver be compiled in, but it's nice to be able to reboot even
  * if it's not.
  */
 void
 imx_wdog_cpu_reset(vm_offset_t wdcr_physaddr)
 {
 	volatile uint16_t * pcr;
 
 	/*
 	 * Trigger an immediate reset by clearing the SRS bit in the watchdog
 	 * control register.  The reset happens on the next cycle of the wdog
 	 * 32KHz clock, so hang out in a spin loop until the reset takes effect.
 	 */
 	if ((pcr = devmap_ptov(wdcr_physaddr, sizeof(*pcr))) == NULL) {
 		printf("cpu_reset() can't find its control register... locking up now.");
 	} else {
 		*pcr &= ~WDOG_CR_SRS;
 	}
 	for (;;)
 		continue;
 }
 
 void
 imx_wdog_init_last_reset(vm_offset_t wdsr_phys)
 {
 	volatile uint16_t * psr;
 
 	if ((psr = devmap_ptov(wdsr_phys, sizeof(*psr))) == NULL)
 		return;
 	last_reset_status = *psr;
 	if (last_reset_status & WDOG_RSR_SFTW) {
 		sysctl___hw_imx_last_reset_reason.oid_arg1 = "SoftwareReset";
 	} else if (last_reset_status & WDOG_RSR_TOUT) {
 		sysctl___hw_imx_last_reset_reason.oid_arg1 = "WatchdogTimeout";
 	} else if (last_reset_status & WDOG_RSR_POR) {
 		sysctl___hw_imx_last_reset_reason.oid_arg1 = "PowerOnReset";
 	}
 }
 
 u_int
 imx_soc_family(void)
 {
 	return (imx_soc_type() >> IMXSOC_FAMSHIFT);
 }
 
 
Index: user/alc/PQ_LAUNDRY/sys/arm/freescale/vybrid/vf_common.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/freescale/vybrid/vf_common.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/freescale/vybrid/vf_common.c	(revision 306283)
@@ -1,88 +1,84 @@
 /*-
  * Copyright (c) 2013 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <machine/bus.h>
 #include <machine/fdt.h>
 
 #include <arm/freescale/vybrid/vf_src.h>
 
 void
 cpu_reset(void)
 {
 	phandle_t src;
 	uint32_t addr, paddr;
 	bus_addr_t vaddr;
 
 	if (src_swreset() == 0)
 		goto end;
 
 	src = OF_finddevice("src");
 	if ((src != 0) && (OF_getprop(src, "reg", &paddr, sizeof(paddr))) > 0) {
 		addr = fdt32_to_cpu(paddr);
 		if (bus_space_map(fdtbus_bs_tag, addr, 0x10, 0, &vaddr) == 0) {
 			bus_space_write_4(fdtbus_bs_tag, vaddr, 0x00, SW_RST);
 		}
 	}
 
 end:
 	while (1);
 }
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 static int
 fdt_pic_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 
 	if (!fdt_is_compatible(node, "arm,gic"))
 		return (ENXIO);
 
 	*interrupt = fdt32_to_cpu(intr[0]);
 	*trig = INTR_TRIGGER_CONFORM;
 	*pol = INTR_POLARITY_CONFORM;
 	return (0);
 }
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&fdt_pic_decode_ic,
 	NULL
 };
 #endif
Index: user/alc/PQ_LAUNDRY/sys/arm/freescale/vybrid/vf_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/freescale/vybrid/vf_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/freescale/vybrid/vf_machdep.c	(revision 306283)
@@ -1,94 +1,79 @@
 /*-
  * Copyright (c) 2013 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_ddb.h"
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define	_ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 
 #include <machine/armreg.h>
 #include <machine/bus.h>
 #include <machine/machdep.h>
 #include <machine/platform.h> 
 
 #include <dev/fdt/fdt_common.h>
 
 vm_offset_t
 platform_lastaddr(void)
 {
 
 	return (devmap_lastaddr());
 }
 
 void
 platform_probe_and_attach(void)
 {
 
 }
 
 void
 platform_gpio_init(void)
 {
 
 }
 
 void
 platform_late_init(void)
 {
 
 }
 
 int
 platform_devmap_init(void)
 {
 
 	devmap_add_entry(0x40000000, 0x100000);
-
-	return (0);
-}
-
-struct arm32_dma_range *
-bus_dma_get_range(void)
-{
-
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
 
 	return (0);
 }
Index: user/alc/PQ_LAUNDRY/sys/arm/include/bus_dma.h
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/include/bus_dma.h	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/include/bus_dma.h	(revision 306283)
@@ -1,99 +1,99 @@
 /*	$NetBSD: bus.h,v 1.11 2003/07/28 17:35:54 thorpej Exp $	*/
 
 /*-
  * Copyright (c) 1996, 1997, 1998, 2001 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
  * NASA Ames Research Center.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 1996 Charles M. Hannum.  All rights reserved.
  * Copyright (c) 1996 Christopher G. Demetriou.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Christopher G. Demetriou
  *	for the NetBSD Project.
  * 4. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _ARM_BUS_DMA_H
 #define _ARM_BUS_DMA_H
 
 #include <sys/bus_dma.h>
 
 /* Bus Space DMA macros */
 
 #define BUS_DMA_TAG_VALID(t)    ((t) != (bus_dma_tag_t)0)
 
-#ifdef _ARM32_BUS_DMA_PRIVATE
+#if defined(_ARM32_BUS_DMA_PRIVATE) && __ARM_ARCH < 6
 /*
  *	arm32_dma_range
  *
  *	This structure describes a valid DMA range.
  */
 struct arm32_dma_range {
 	bus_addr_t	dr_sysbase;	/* system base address */
 	bus_addr_t	dr_busbase;	/* appears here on bus */
 	bus_size_t	dr_len;		/* length of range */
 };
 
 /* _dm_buftype */
 #define	ARM32_BUFTYPE_INVALID		0
 #define	ARM32_BUFTYPE_LINEAR		1
 #define	ARM32_BUFTYPE_MBUF		2
 #define	ARM32_BUFTYPE_UIO		3
 #define	ARM32_BUFTYPE_RAW		4
 
 struct arm32_dma_range	*bus_dma_get_range(void);
 int	bus_dma_get_range_nb(void);
 
 #endif /* _ARM32_BUS_DMA_PRIVATE */
 
 #endif /* _ARM_BUS_DMA_H */
Index: user/alc/PQ_LAUNDRY/sys/arm/lpc/lpc_intc.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/lpc/lpc_intc.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/lpc/lpc_intc.c	(revision 306283)
@@ -1,252 +1,248 @@
 /*-
  * Copyright (c) 2010 Jakub Wojciech Klama <jceel@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/rman.h>
 #include <sys/timetc.h>
 #include <machine/bus.h>
 #include <machine/intr.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <dev/ofw/ofw_bus.h>
 #include <dev/ofw/ofw_bus_subr.h>
 
 #include <arm/lpc/lpcreg.h>
 
 struct lpc_intc_softc {
 	struct resource *	li_res;
 	bus_space_tag_t		li_bst;
 	bus_space_handle_t	li_bsh;
 };
 
 static int lpc_intc_probe(device_t);
 static int lpc_intc_attach(device_t);
 static void lpc_intc_eoi(void *);
 
 static struct lpc_intc_softc *intc_softc = NULL;
 
 #define	intc_read_4(_sc, _reg)		\
     bus_space_read_4((_sc)->li_bst, (_sc)->li_bsh, (_reg))
 #define	intc_write_4(_sc, _reg, _val)		\
     bus_space_write_4((_sc)->li_bst, (_sc)->li_bsh, (_reg), (_val))
 
 static int
 lpc_intc_probe(device_t dev)
 {
 
 	if (!ofw_bus_status_okay(dev))
 		return (ENXIO);
 
 	if (!ofw_bus_is_compatible(dev, "lpc,pic"))
 		return (ENXIO);
 
 	device_set_desc(dev, "LPC32x0 Interrupt Controller");
 	return (BUS_PROBE_DEFAULT);
 }
 
 static int
 lpc_intc_attach(device_t dev)
 {
 	struct lpc_intc_softc *sc = device_get_softc(dev);
 	int rid = 0;
 
 	if (intc_softc)
 		return (ENXIO);
 
 	sc->li_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, 
 	    RF_ACTIVE);
 	if (!sc->li_res) {
 		device_printf(dev, "could not alloc resources\n");
 		return (ENXIO);
 	}
 
 	sc->li_bst = rman_get_bustag(sc->li_res);
 	sc->li_bsh = rman_get_bushandle(sc->li_res);
 	intc_softc = sc;
 	arm_post_filter = lpc_intc_eoi;
 
 	/* Clear interrupt status registers and disable all interrupts */
 	intc_write_4(sc, LPC_INTC_MIC_ER, 0);
 	intc_write_4(sc, LPC_INTC_SIC1_ER, 0);
 	intc_write_4(sc, LPC_INTC_SIC2_ER, 0);
 	intc_write_4(sc, LPC_INTC_MIC_RSR, ~0);
 	intc_write_4(sc, LPC_INTC_SIC1_RSR, ~0);
 	intc_write_4(sc, LPC_INTC_SIC2_RSR, ~0);
 	return (0);
 }
 
 static device_method_t lpc_intc_methods[] = {
 	DEVMETHOD(device_probe,		lpc_intc_probe),
 	DEVMETHOD(device_attach,	lpc_intc_attach),
 	{ 0, 0 }
 };
 
 static driver_t lpc_intc_driver = {
 	"pic",
 	lpc_intc_methods,
 	sizeof(struct lpc_intc_softc),
 };
 
 static devclass_t lpc_intc_devclass;
 
 DRIVER_MODULE(pic, simplebus, lpc_intc_driver, lpc_intc_devclass, 0, 0);
 
 int
 arm_get_next_irq(int last)
 {
 	struct lpc_intc_softc *sc = intc_softc;
 	uint32_t value;
 	int i;
 
 	/* IRQs 0-31 are mapped to LPC_INTC_MIC_SR */
 	value = intc_read_4(sc, LPC_INTC_MIC_SR);
 	for (i = 0; i < 32; i++) {
 		if (value & (1 << i))
 			return (i);
 	}
 
 	/* IRQs 32-63 are mapped to LPC_INTC_SIC1_SR */
 	value = intc_read_4(sc, LPC_INTC_SIC1_SR);
 	for (i = 0; i < 32; i++) {
 		if (value & (1 << i))
 			return (i + 32);
 	}
 
 	/* IRQs 64-95 are mapped to LPC_INTC_SIC2_SR */
 	value = intc_read_4(sc, LPC_INTC_SIC2_SR);
 	for (i = 0; i < 32; i++) {
 		if (value & (1 << i))
 			return (i + 64);
 	}
 
 	return (-1);
 }
 
 void
 arm_mask_irq(uintptr_t nb)
 {
 	struct lpc_intc_softc *sc = intc_softc;
 	int reg;
 	uint32_t value;
 
 	/* Make sure that interrupt isn't active already */
 	lpc_intc_eoi((void *)nb);
 
 	if (nb > 63) {
 		nb -= 64;
 		reg = LPC_INTC_SIC2_ER;
 	} else if (nb > 31) {
 		nb -= 32;
 		reg = LPC_INTC_SIC1_ER;
 	} else
 		reg = LPC_INTC_MIC_ER;
 
 	/* Clear bit in ER register */
 	value = intc_read_4(sc, reg);
 	value &= ~(1 << nb);
 	intc_write_4(sc, reg, value);
 }
 
 void
 arm_unmask_irq(uintptr_t nb)
 {
 	struct lpc_intc_softc *sc = intc_softc;
 	int reg;
 	uint32_t value;
 
 	if (nb > 63) {
 		nb -= 64;
 		reg = LPC_INTC_SIC2_ER;
 	} else if (nb > 31) {
 		nb -= 32;
 		reg = LPC_INTC_SIC1_ER;
 	} else
 		reg = LPC_INTC_MIC_ER;
 
 	/* Set bit in ER register */
 	value = intc_read_4(sc, reg);
 	value |= (1 << nb);
 	intc_write_4(sc, reg, value);
 }
 
 static void
 lpc_intc_eoi(void *data)
 {
 	struct lpc_intc_softc *sc = intc_softc;
 	int reg;
 	int nb = (int)data;
 	uint32_t value;
 
 	if (nb > 63) {
 		nb -= 64;
 		reg = LPC_INTC_SIC2_RSR;
 	} else if (nb > 31) {
 		nb -= 32;
 		reg = LPC_INTC_SIC1_RSR;
 	} else
 		reg = LPC_INTC_MIC_RSR;
 
 	/* Set bit in RSR register */
 	value = intc_read_4(sc, reg);
 	value |= (1 << nb);
 	intc_write_4(sc, reg, value);
 
 }
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 static int
 fdt_pic_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 	if (!fdt_is_compatible(node, "lpc,pic"))
 		return (ENXIO);
 
 	*interrupt = fdt32_to_cpu(intr[0]);
 	*trig = INTR_TRIGGER_CONFORM;
 	*pol = INTR_POLARITY_CONFORM;
 	return (0);
 }
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&fdt_pic_decode_ic,
 	NULL
 };
 #endif
Index: user/alc/PQ_LAUNDRY/sys/arm/mv/mv_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/mv/mv_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/mv/mv_machdep.c	(revision 306283)
@@ -1,506 +1,508 @@
 /*-
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Brini.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: FreeBSD: //depot/projects/arm/src/sys/arm/at91/kb920x_machdep.c, rev 45
  */
 
 #include "opt_ddb.h"
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/bus.h>
 #include <machine/fdt.h>
 #include <machine/machdep.h>
 #include <machine/platform.h> 
 
 #if __ARM_ARCH < 6
 #include <machine/cpu-v4.h>
 #else
 #include <machine/cpu-v6.h>
 #endif
 
 #include <arm/mv/mvreg.h>	/* XXX */
 #include <arm/mv/mvvar.h>	/* XXX eventually this should be eliminated */
 #include <arm/mv/mvwin.h>
 
 #include <dev/fdt/fdt_common.h>
 
 static int platform_mpp_init(void);
 #if defined(SOC_MV_ARMADAXP)
 void armadaxp_init_coher_fabric(void);
 void armadaxp_l2_init(void);
 #endif
 #if defined(SOC_MV_ARMADA38X)
 int armada38x_win_set_iosync_barrier(void);
 int armada38x_scu_enable(void);
 int armada38x_open_bootrom_win(void);
 #endif
 
 #define MPP_PIN_MAX		68
 #define MPP_PIN_CELLS		2
 #define MPP_PINS_PER_REG	8
 #define MPP_SEL(pin,func)	(((func) & 0xf) <<		\
     (((pin) % MPP_PINS_PER_REG) * 4))
 
 static int
 platform_mpp_init(void)
 {
 	pcell_t pinmap[MPP_PIN_MAX * MPP_PIN_CELLS];
 	int mpp[MPP_PIN_MAX];
 	uint32_t ctrl_val, ctrl_offset;
 	pcell_t reg[4];
 	u_long start, size;
 	phandle_t node;
 	pcell_t pin_cells, *pinmap_ptr, pin_count;
 	ssize_t len;
 	int par_addr_cells, par_size_cells;
 	int tuple_size, tuples, rv, pins, i, j;
 	int mpp_pin, mpp_function;
 
 	/*
 	 * Try to access the MPP node directly i.e. through /aliases/mpp.
 	 */
 	if ((node = OF_finddevice("mpp")) != -1)
 		if (fdt_is_compatible(node, "mrvl,mpp"))
 			goto moveon;
 	/*
 	 * Find the node the long way.
 	 */
 	if ((node = OF_finddevice("/")) == -1)
 		return (ENXIO);
 
 	if ((node = fdt_find_compatible(node, "simple-bus", 0)) == 0)
 		return (ENXIO);
 
 	if ((node = fdt_find_compatible(node, "mrvl,mpp", 0)) == 0)
 		/*
 		 * No MPP node. Fall back to how MPP got set by the
 		 * first-stage loader and try to continue booting.
 		 */
 		return (0);
 moveon:
 	/*
 	 * Process 'reg' prop.
 	 */
 	if ((rv = fdt_addrsize_cells(OF_parent(node), &par_addr_cells,
 	    &par_size_cells)) != 0)
 		return(ENXIO);
 
 	tuple_size = sizeof(pcell_t) * (par_addr_cells + par_size_cells);
 	len = OF_getprop(node, "reg", reg, sizeof(reg));
 	tuples = len / tuple_size;
 	if (tuple_size <= 0)
 		return (EINVAL);
 
 	/*
 	 * Get address/size. XXX we assume only the first 'reg' tuple is used.
 	 */
 	rv = fdt_data_to_res(reg, par_addr_cells, par_size_cells,
 	    &start, &size);
 	if (rv != 0)
 		return (rv);
 	start += fdt_immr_va;
 
 	/*
 	 * Process 'pin-count' and 'pin-map' props.
 	 */
 	if (OF_getprop(node, "pin-count", &pin_count, sizeof(pin_count)) <= 0)
 		return (ENXIO);
 	pin_count = fdt32_to_cpu(pin_count);
 	if (pin_count > MPP_PIN_MAX)
 		return (ERANGE);
 
 	if (OF_getprop(node, "#pin-cells", &pin_cells, sizeof(pin_cells)) <= 0)
 		pin_cells = MPP_PIN_CELLS;
 	pin_cells = fdt32_to_cpu(pin_cells);
 	if (pin_cells > MPP_PIN_CELLS)
 		return (ERANGE);
 	tuple_size = sizeof(pcell_t) * pin_cells;
 
 	bzero(pinmap, sizeof(pinmap));
 	len = OF_getprop(node, "pin-map", pinmap, sizeof(pinmap));
 	if (len <= 0)
 		return (ERANGE);
 	if (len % tuple_size)
 		return (ERANGE);
 	pins = len / tuple_size;
 	if (pins > pin_count)
 		return (ERANGE);
 	/*
 	 * Fill out a "mpp[pin] => function" table. All pins unspecified in
 	 * the 'pin-map' property are defaulted to 0 function i.e. GPIO.
 	 */
 	bzero(mpp, sizeof(mpp));
 	pinmap_ptr = pinmap;
 	for (i = 0; i < pins; i++) {
 		mpp_pin = fdt32_to_cpu(*pinmap_ptr);
 		mpp_function = fdt32_to_cpu(*(pinmap_ptr + 1));
 		mpp[mpp_pin] = mpp_function;
 		pinmap_ptr += pin_cells;
 	}
 
 	/*
 	 * Prepare and program MPP control register values.
 	 */
 	ctrl_offset = 0;
 	for (i = 0; i < pin_count;) {
 		ctrl_val = 0;
 
 		for (j = 0; j < MPP_PINS_PER_REG; j++) {
 			if (i + j == pin_count - 1)
 				break;
 			ctrl_val |= MPP_SEL(i + j, mpp[i + j]);
 		}
 		i += MPP_PINS_PER_REG;
 		bus_space_write_4(fdtbus_bs_tag, start, ctrl_offset,
 		    ctrl_val);
 
 #if defined(SOC_MV_ORION)
 		/*
 		 * Third MPP reg on Orion SoC is placed
 		 * non-linearly (with different offset).
 		 */
 		if (i ==  (2 * MPP_PINS_PER_REG))
 			ctrl_offset = 0x50;
 		else
 #endif
 			ctrl_offset += 4;
 	}
 
 	return (0);
 }
 
 vm_offset_t
 platform_lastaddr(void)
 {
 
 	return (fdt_immr_va);
 }
 
 void
 platform_probe_and_attach(void)
 {
 
 	if (fdt_immr_addr(MV_BASE) != 0)
 		while (1);
 }
 
 void
 platform_gpio_init(void)
 {
 
 	/*
 	 * Re-initialise MPP. It is important to call this prior to using
 	 * console as the physical connection can be routed via MPP.
 	 */
 	if (platform_mpp_init() != 0)
 		while (1);
 }
 
 void
 platform_late_init(void)
 {
 	/*
 	 * Re-initialise decode windows
 	 */
 #if !defined(SOC_MV_FREY)
 	if (soc_decode_win() != 0)
 		printf("WARNING: could not re-initialise decode windows! "
 		    "Running with existing settings...\n");
 #else
 	/* Disable watchdog and timers */
 	write_cpu_ctrl(CPU_TIMERS_BASE + CPU_TIMER_CONTROL, 0);
 #endif
 #if defined(SOC_MV_ARMADAXP)
 #if !defined(SMP)
 	/* For SMP case it should be initialized after APs are booted */
 	armadaxp_init_coher_fabric();
 #endif
 	armadaxp_l2_init();
 #endif
 
 #if defined(SOC_MV_ARMADA38X)
 	/* Set IO Sync Barrier bit for all Mbus devices */
 	if (armada38x_win_set_iosync_barrier() != 0)
 		printf("WARNING: could not map CPU Subsystem registers\n");
 	if (armada38x_scu_enable() != 0)
 		printf("WARNING: could not enable SCU\n");
 #ifdef SMP
 	/* Open window to bootROM memory - needed for SMP */
 	if (armada38x_open_bootrom_win() != 0)
 		printf("WARNING: could not open window to bootROM\n");
 #endif
 #endif
 }
 
 #define FDT_DEVMAP_MAX	(MV_WIN_CPU_MAX + 2)
 static struct devmap_entry fdt_devmap[FDT_DEVMAP_MAX] = {
 	{ 0, 0, 0, }
 };
 
 static int
 platform_sram_devmap(struct devmap_entry *map)
 {
 #if !defined(SOC_MV_ARMADAXP) && !defined(SOC_MV_ARMADA38X)
 	phandle_t child, root;
 	u_long base, size;
 	/*
 	 * SRAM range.
 	 */
 	if ((child = OF_finddevice("/sram")) != 0)
 		if (fdt_is_compatible(child, "mrvl,cesa-sram") ||
 		    fdt_is_compatible(child, "mrvl,scratchpad"))
 			goto moveon;
 
 	if ((root = OF_finddevice("/")) == 0)
 		return (ENXIO);
 
 	if ((child = fdt_find_compatible(root, "mrvl,cesa-sram", 0)) == 0 &&
 	    (child = fdt_find_compatible(root, "mrvl,scratchpad", 0)) == 0)
 			goto out;
 
 moveon:
 	if (fdt_regsize(child, &base, &size) != 0)
 		return (EINVAL);
 
 	map->pd_va = MV_CESA_SRAM_BASE; /* XXX */
 	map->pd_pa = base;
 	map->pd_size = size;
 
 	return (0);
 out:
 #endif
 	return (ENOENT);
 
 }
 
 /*
  * Supply a default do-nothing implementation of mv_pci_devmap() via a weak
  * alias.  Many Marvell platforms don't support a PCI interface, but to support
  * those that do, we end up with a reference to this function below, in
  * platform_devmap_init().  If "device pci" appears in the kernel config, the
  * real implementation of this function in arm/mv/mv_pci.c overrides the weak
  * alias defined here.
  */
 int mv_default_fdt_pci_devmap(phandle_t node, struct devmap_entry *devmap,
     vm_offset_t io_va, vm_offset_t mem_va);
 int
 mv_default_fdt_pci_devmap(phandle_t node, struct devmap_entry *devmap,
     vm_offset_t io_va, vm_offset_t mem_va)
 {
 
 	return (0);
 }
 __weak_reference(mv_default_fdt_pci_devmap, mv_pci_devmap);
 
 /*
  * XXX: When device entry in devmap has pd_size smaller than section size,
  * system will freeze during initialization
  */
 
 /*
  * Construct devmap table with DT-derived config data.
  */
 int
 platform_devmap_init(void)
 {
 	phandle_t root, child;
 	pcell_t bank_count;
 	int i, num_mapped;
 
 	i = 0;
 	devmap_register_table(&fdt_devmap[0]);
 
 #ifdef SOC_MV_ARMADAXP
 	vm_paddr_t cur_immr_pa;
 
 	/*
 	 * Acquire SoC registers' base passed by u-boot and fill devmap
 	 * accordingly. DTB is going to be modified basing on this data
 	 * later.
 	 */
 	__asm __volatile("mrc p15, 4, %0, c15, c0, 0" : "=r" (cur_immr_pa));
 	cur_immr_pa = (cur_immr_pa << 13) & 0xff000000;
 	if (cur_immr_pa != 0)
 		fdt_immr_pa = cur_immr_pa;
 #endif
 	/*
 	 * IMMR range.
 	 */
 	fdt_devmap[i].pd_va = fdt_immr_va;
 	fdt_devmap[i].pd_pa = fdt_immr_pa;
 	fdt_devmap[i].pd_size = fdt_immr_size;
 	i++;
 
 	/*
 	 * SRAM range.
 	 */
 	if (i < FDT_DEVMAP_MAX)
 		if (platform_sram_devmap(&fdt_devmap[i]) == 0)
 			i++;
 
 	/*
 	 * PCI range(s).
 	 * PCI range(s) and localbus.
 	 */
 	if ((root = OF_finddevice("/")) == -1)
 		return (ENXIO);
 	for (child = OF_child(root); child != 0; child = OF_peer(child)) {
 		if (fdt_is_type(child, "pci") || fdt_is_type(child, "pciep")) {
 			/*
 			 * Check space: each PCI node will consume 2 devmap
 			 * entries.
 			 */
 			if (i + 1 >= FDT_DEVMAP_MAX)
 				return (ENOMEM);
 
 			/*
 			 * XXX this should account for PCI and multiple ranges
 			 * of a given kind.
 			 */
 			if (mv_pci_devmap(child, &fdt_devmap[i], MV_PCI_VA_IO_BASE,
 				    MV_PCI_VA_MEM_BASE) != 0)
 				return (ENXIO);
 			i += 2;
 		}
 
 		if (fdt_is_compatible(child, "mrvl,lbc")) {
 			/* Check available space */
 			if (OF_getprop(child, "bank-count", (void *)&bank_count,
 			    sizeof(bank_count)) <= 0)
 				/* If no property, use default value */
 				bank_count = 1;
 			else
 				bank_count = fdt32_to_cpu(bank_count);
 
 			if ((i + bank_count) >= FDT_DEVMAP_MAX)
 				return (ENOMEM);
 
 			/* Add all localbus ranges to device map */
 			num_mapped = 0;
 
 			if (fdt_localbus_devmap(child, &fdt_devmap[i],
 			    (int)bank_count, &num_mapped) != 0)
 				return (ENXIO);
 
 			i += num_mapped;
 		}
 	}
 
 	return (0);
 }
 
+#if __ARM_ARCH < 6
 struct arm32_dma_range *
 bus_dma_get_range(void)
 {
 
 	return (NULL);
 }
 
 int
 bus_dma_get_range_nb(void)
 {
 
 	return (0);
 }
+#endif
 
 #if defined(CPU_MV_PJ4B)
 #ifdef DDB
 #include <ddb/ddb.h>
 
 DB_SHOW_COMMAND(cp15, db_show_cp15)
 {
 	u_int reg;
 
 	__asm __volatile("mrc p15, 0, %0, c0, c0, 0" : "=r" (reg));
 	db_printf("Cpu ID: 0x%08x\n", reg);
 	__asm __volatile("mrc p15, 0, %0, c0, c0, 1" : "=r" (reg));
 	db_printf("Current Cache Lvl ID: 0x%08x\n",reg);
 
 	reg = cp15_sctlr_get();
 	db_printf("Ctrl: 0x%08x\n",reg);
 	reg = cp15_actlr_get();
 	db_printf("Aux Ctrl: 0x%08x\n",reg);
 
 	__asm __volatile("mrc p15, 0, %0, c0, c1, 0" : "=r" (reg));
 	db_printf("Processor Feat 0: 0x%08x\n", reg);
 	__asm __volatile("mrc p15, 0, %0, c0, c1, 1" : "=r" (reg));
 	db_printf("Processor Feat 1: 0x%08x\n", reg);
 	__asm __volatile("mrc p15, 0, %0, c0, c1, 2" : "=r" (reg));
 	db_printf("Debug Feat 0: 0x%08x\n", reg);
 	__asm __volatile("mrc p15, 0, %0, c0, c1, 3" : "=r" (reg));
 	db_printf("Auxiliary Feat 0: 0x%08x\n", reg);
 	__asm __volatile("mrc p15, 0, %0, c0, c1, 4" : "=r" (reg));
 	db_printf("Memory Model Feat 0: 0x%08x\n", reg);
 	__asm __volatile("mrc p15, 0, %0, c0, c1, 5" : "=r" (reg));
 	db_printf("Memory Model Feat 1: 0x%08x\n", reg);
 	__asm __volatile("mrc p15, 0, %0, c0, c1, 6" : "=r" (reg));
 	db_printf("Memory Model Feat 2: 0x%08x\n", reg);
 	__asm __volatile("mrc p15, 0, %0, c0, c1, 7" : "=r" (reg));
 	db_printf("Memory Model Feat 3: 0x%08x\n", reg);
 
 	__asm __volatile("mrc p15, 1, %0, c15, c2, 0" : "=r" (reg));
 	db_printf("Aux Func Modes Ctrl 0: 0x%08x\n",reg);
 	__asm __volatile("mrc p15, 1, %0, c15, c2, 1" : "=r" (reg));
 	db_printf("Aux Func Modes Ctrl 1: 0x%08x\n",reg);
 
 	__asm __volatile("mrc p15, 1, %0, c15, c12, 0" : "=r" (reg));
 	db_printf("CPU ID code extension: 0x%08x\n",reg);
 }
 
 DB_SHOW_COMMAND(vtop, db_show_vtop)
 {
 	u_int reg;
 
 	if (have_addr) {
 		__asm __volatile("mcr p15, 0, %0, c7, c8, 0" : : "r" (addr));
 		__asm __volatile("mrc p15, 0, %0, c7, c4, 0" : "=r" (reg));
 		db_printf("Physical address reg: 0x%08x\n",reg);
 	} else
 		db_printf("show vtop <virt_addr>\n");
 }
 #endif /* DDB */
 #endif /* CPU_MV_PJ4B */
 
Index: user/alc/PQ_LAUNDRY/sys/arm/nvidia/tegra124/tegra124_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/nvidia/tegra124/tegra124_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/nvidia/tegra124/tegra124_machdep.c	(revision 306283)
@@ -1,173 +1,156 @@
 /*-
  * Copyright (c) 2016 Michal Meloun <mmel@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
-#define	_ARM32_BUS_DMA_PRIVATE
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/reboot.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 
 #include <machine/bus.h>
 #include <machine/fdt.h>
 #include <machine/intr.h>
 #include <machine/machdep.h>
 #include <machine/platformvar.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <arm/nvidia/tegra124/tegra124_mp.h>
 
 #include "platform_if.h"
 
 #define	PMC_PHYSBASE		0x7000e400
 #define	PMC_SIZE		0x400
 #define	PMC_CONTROL_REG		0x0
 #define	PMC_SCRATCH0		0x50
 #define	 PMC_SCRATCH0_MODE_RECOVERY	(1 << 31)
 #define	 PMC_SCRATCH0_MODE_BOOTLOADER	(1 << 30)
 #define	 PMC_SCRATCH0_MODE_RCM		(1 << 1)
 #define	 PMC_SCRATCH0_MODE_MASK		(PMC_SCRATCH0_MODE_RECOVERY | \
 					PMC_SCRATCH0_MODE_BOOTLOADER | \
 					PMC_SCRATCH0_MODE_RCM)
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
-struct arm32_dma_range *
-bus_dma_get_range(void)
-{
-
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
-
-	return (0);
-}
-
 static vm_offset_t
 tegra124_lastaddr(platform_t plat)
 {
 
 	return (devmap_lastaddr());
 }
 
 static int
 tegra124_attach(platform_t plat)
 {
 
 	return (0);
 }
 
 static void
 tegra124_late_init(platform_t plat)
 {
 
 }
 
 /*
  * Set up static device mappings.
  *
  */
 static int
 tegra124_devmap_init(platform_t plat)
 {
 
 	devmap_add_entry(0x70000000, 0x01000000);
 	return (0);
 }
 
-void
-cpu_reset(void)
+static void
+tegra124_cpu_reset(platform_t plat)
 {
 	bus_space_handle_t pmc;
 	uint32_t reg;
 
 	printf("Resetting...\n");
 	bus_space_map(fdtbus_bs_tag, PMC_PHYSBASE, PMC_SIZE, 0, &pmc);
 
 	reg = bus_space_read_4(fdtbus_bs_tag, pmc, PMC_SCRATCH0);
 	reg &= PMC_SCRATCH0_MODE_MASK;
 	bus_space_write_4(fdtbus_bs_tag, pmc, PMC_SCRATCH0,
 	   reg | PMC_SCRATCH0_MODE_BOOTLOADER); 	/* boot to bootloader */
 	bus_space_read_4(fdtbus_bs_tag, pmc, PMC_SCRATCH0);
 
 	reg = bus_space_read_4(fdtbus_bs_tag, pmc, PMC_CONTROL_REG);
 	spinlock_enter();
 	dsb();
 	bus_space_write_4(fdtbus_bs_tag, pmc, PMC_CONTROL_REG, reg | 0x10);
 	bus_space_read_4(fdtbus_bs_tag, pmc, PMC_CONTROL_REG);
 	while(1)
 		;
 
 }
 
 /*
  * Early putc routine for EARLY_PRINTF support.  To use, add to kernel config:
  *   option SOCDEV_PA=0x02000000
  *   option SOCDEV_VA=0x02000000
  *   option EARLY_PRINTF
  */
 #if 0
 static void
 tegra124_early_putc(int c)
 {
 	volatile uint32_t * UART_STAT_REG = (uint32_t *)0x02020098;
 	volatile uint32_t * UART_TX_REG   = (uint32_t *)0x02020040;
 	const uint32_t      UART_TXRDY    = (1 << 3);
 
 	while ((*UART_STAT_REG & UART_TXRDY) == 0)
 		continue;
 	*UART_TX_REG = c;
 }
 early_putc_t *early_putc = tegra124_early_putc;
 #endif
 
 static platform_method_t tegra124_methods[] = {
 	PLATFORMMETHOD(platform_attach,		tegra124_attach),
 	PLATFORMMETHOD(platform_lastaddr,	tegra124_lastaddr),
 	PLATFORMMETHOD(platform_devmap_init,	tegra124_devmap_init),
 	PLATFORMMETHOD(platform_late_init,	tegra124_late_init),
+	PLATFORMMETHOD(platform_cpu_reset,	tegra124_cpu_reset),
+
 #ifdef SMP
 	PLATFORMMETHOD(platform_mp_start_ap,	tegra124_mp_start_ap),
 	PLATFORMMETHOD(platform_mp_setmaxid,	tegra124_mp_setmaxid),
 #endif
 	PLATFORMMETHOD_END,
 };
 
 FDT_PLATFORM_DEF(tegra124, "Nvidia Jetson-TK1", 0, "nvidia,jetson-tk1", 0);
Index: user/alc/PQ_LAUNDRY/sys/arm/qemu/virt_common.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/qemu/virt_common.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/qemu/virt_common.c	(revision 306283)
@@ -1,49 +1,45 @@
 /*-
  * Copyright (c) 2015 Andrew Turner
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_platform.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <dev/fdt/fdt_common.h>
 
 #include <machine/intr.h>
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 fdt_pic_decode_t fdt_pic_table[] = {
 	&gic_decode_fdt,
 	NULL
 };
 #endif
Index: user/alc/PQ_LAUNDRY/sys/arm/qemu/virt_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/qemu/virt_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/qemu/virt_machdep.c	(revision 306283)
@@ -1,116 +1,94 @@
 /*-
  * Copyright (c) 2015 Andrew Turner
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 
 #include <machine/bus.h>
 #include <machine/platform.h>
 #include <machine/platformvar.h>
 
 #include <arm/qemu/virt_mp.h>
 
 #include "platform_if.h"
-
-struct arm32_dma_range *
-bus_dma_get_range(void)
-{
-
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
-
-	return (0);
-}
-
-void
-cpu_reset(void)
-{
-
-	while (1);
-}
 
 static vm_offset_t
 virt_lastaddr(platform_t plat)
 {
 
 	return (devmap_lastaddr());
 }
 
 /*
  * Set up static device mappings.
  */
 static int
 virt_devmap_init(platform_t plat)
 {
 
 	devmap_add_entry(0x09000000, 0x100000); /* Uart */
 	return (0);
 }
 
 static platform_method_t virt_methods[] = {
 	PLATFORMMETHOD(platform_devmap_init,	virt_devmap_init),
 	PLATFORMMETHOD(platform_lastaddr,	virt_lastaddr),
 
 #ifdef SMP
 	PLATFORMMETHOD(platform_mp_start_ap,	virt_mp_start_ap),
 	PLATFORMMETHOD(platform_mp_setmaxid,	virt_mp_setmaxid),
 #endif
 
 	PLATFORMMETHOD_END,
 };
 
 FDT_PLATFORM_DEF(virt, "virt", 0, "linux,dummy-virt", 1);
 
 static int
 gem5_devmap_init(platform_t plat)
 {
 
 	devmap_add_entry(0x1c090000, 0x100000); /* Uart */
 	return (0);
 }
 
 static platform_method_t gem5_methods[] = {
 	PLATFORMMETHOD(platform_devmap_init,	gem5_devmap_init),
 	PLATFORMMETHOD(platform_lastaddr,	virt_lastaddr),
 
 	PLATFORMMETHOD_END,
 };
 
 FDT_PLATFORM_DEF(gem5, "gem5", 0, "arm,vexpress", 1);
Index: user/alc/PQ_LAUNDRY/sys/arm/rockchip/rk30xx_common.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/rockchip/rk30xx_common.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/rockchip/rk30xx_common.c	(revision 306283)
@@ -1,65 +1,61 @@
 /*-
  * Copyright (c) 2013 Ganbold Tsagaankhuu <ganbold@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <machine/bus.h>
 #include <machine/vmparam.h>
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 static int
 fdt_aintc_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 
 	if (!fdt_is_compatible(node, "arm,gic"))
 		return (ENXIO);
 
 	*interrupt = fdt32_to_cpu(intr[0]);
 	*trig = INTR_TRIGGER_CONFORM;
 	*pol = INTR_POLARITY_CONFORM;
 
 	return (0);
 }
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&fdt_aintc_decode_ic,
 	NULL
 };
 #endif
Index: user/alc/PQ_LAUNDRY/sys/arm/rockchip/rk30xx_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/rockchip/rk30xx_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/rockchip/rk30xx_machdep.c	(revision 306283)
@@ -1,116 +1,101 @@
 /*-
  * Copyright (c) 2013 Ganbold Tsagaankhuu <ganbold@freebsd.org>
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: FreeBSD: //depot/projects/arm/src/sys/arm/ti/ti_machdep.c
  */
 
 #include "opt_ddb.h"
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/armreg.h>
 #include <machine/bus.h>
 #include <machine/machdep.h>
 #include <machine/platform.h> 
 
 #include <dev/fdt/fdt_common.h>
 
 #include <arm/rockchip/rk30xx_wdog.h>
 
 vm_offset_t
 platform_lastaddr(void)
 {
 
 	return (devmap_lastaddr());
 }
 
 void
 platform_probe_and_attach(void)
 {
 
 }
 
 void
 platform_gpio_init(void)
 {
 }
 
 void
 platform_late_init(void)
 {
 
 	/* Enable cache */
 	cpufunc_control(CPU_CONTROL_DC_ENABLE|CPU_CONTROL_IC_ENABLE,
 	    CPU_CONTROL_DC_ENABLE|CPU_CONTROL_IC_ENABLE);
 }
 
 /*
  * Set up static device mappings.
  */
 int
 platform_devmap_init(void)
 {
 
 	devmap_add_entry(0x10000000, 0x00200000);
 	devmap_add_entry(0x20000000, 0x00100000);
 	
-	return (0);
-}
-
-struct arm32_dma_range *
-bus_dma_get_range(void)
-{
-
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
-
 	return (0);
 }
 
 void
 cpu_reset()
 {
 
 	rk30_wd_watchdog_reset();
 	printf("Reset failed!\n");
 	while (1);
 }
Index: user/alc/PQ_LAUNDRY/sys/arm/samsung/exynos/exynos5_common.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/samsung/exynos/exynos5_common.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/samsung/exynos/exynos5_common.c	(revision 306283)
@@ -1,75 +1,71 @@
 /*-
  * Copyright (c) 2013 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <machine/bus.h>
 #include <machine/fdt.h>
 
 void
 cpu_reset(void)
 {
 	bus_space_handle_t bsh;
 
 	bus_space_map(fdtbus_bs_tag, 0x10040400, 0x1000, 0, &bsh);
 	bus_space_write_4(fdtbus_bs_tag, bsh, 0, 1);
 
 	while (1);
 }
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 static int
 fdt_pic_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 
 	if (!fdt_is_compatible(node, "arm,gic"))
 		return (ENXIO);
 
 	*interrupt = fdt32_to_cpu(intr[0]);
 	*trig = INTR_TRIGGER_CONFORM;
 	*pol = INTR_POLARITY_CONFORM;
 	return (0);
 }
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&fdt_pic_decode_ic,
 	NULL
 };
 #endif
Index: user/alc/PQ_LAUNDRY/sys/arm/samsung/exynos/exynos5_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/samsung/exynos/exynos5_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/samsung/exynos/exynos5_machdep.c	(revision 306283)
@@ -1,99 +1,84 @@
 /*-
  * Copyright (c) 2013 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_ddb.h"
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define	_ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 
 #include <machine/armreg.h>
 #include <machine/bus.h>
 #include <machine/machdep.h>
 #include <machine/platform.h> 
 
 vm_offset_t
 platform_lastaddr(void)
 {
 
 	return (devmap_lastaddr());
 }
 
 void
 platform_probe_and_attach(void)
 {
 
 }
 
 void
 platform_gpio_init(void)
 {
 
 }
 
 void
 platform_late_init(void)
 {
 
 }
 
 int
 platform_devmap_init(void)
 {
 
 	/* CHIP ID */
 	devmap_add_entry(0x10000000, 0x100000);
 
 	/* UART */
 	devmap_add_entry(0x12C00000, 0x100000);
 
 	/* DWMMC */
 	devmap_add_entry(0x12200000, 0x100000);
-
-	return (0);
-}
-
-struct arm32_dma_range *
-bus_dma_get_range(void)
-{
-
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
 
 	return (0);
 }
Index: user/alc/PQ_LAUNDRY/sys/arm/ti/ti_common.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/ti/ti_common.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/ti/ti_common.c	(revision 306283)
@@ -1,84 +1,80 @@
 /*-
  * Copyright (C) 2008-2011 MARVELL INTERNATIONAL LTD.
  * All rights reserved.
  *
  * Developed by Semihalf.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of MARVELL nor the names of contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/kdb.h>
 #include <sys/reboot.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <machine/bus.h>
 #include <machine/intr.h>
 #include <machine/vmparam.h>
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 #ifdef SOC_TI_AM335X
 static int
 fdt_aintc_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 
 	if (!fdt_is_compatible(node, "ti,aintc") &&
 	    !fdt_is_compatible(node, "ti,am33xx-intc"))
 		return (ENXIO);
 
 	*interrupt = fdt32_to_cpu(intr[0]);
 	*trig = INTR_TRIGGER_CONFORM;
 	*pol = INTR_POLARITY_CONFORM;
 
 	return (0);
 }
 #endif
 
 fdt_pic_decode_t fdt_pic_table[] = {
 #if defined(SOC_OMAP4)
 	&gic_decode_fdt,
 #endif
 #ifdef SOC_TI_AM335X
 	&fdt_aintc_decode_ic,
 #endif
 	NULL
 };
 #endif /* !INTRNG */
Index: user/alc/PQ_LAUNDRY/sys/arm/ti/ti_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/ti/ti_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/ti/ti_machdep.c	(revision 306283)
@@ -1,143 +1,128 @@
 /*-
  * Copyright (c) 1994-1998 Mark Brinicombe.
  * Copyright (c) 1994 Brini.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Brini.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * from: FreeBSD: //depot/projects/arm/src/sys/arm/at91/kb920x_machdep.c, rev 45
  */
 
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/bus.h>
 #include <machine/machdep.h>
 #include <machine/platformvar.h>
 
 #include <arm/ti/omap4/omap4_reg.h>
 
 #include "platform_if.h"
 
 void (*ti_cpu_reset)(void) = NULL;
 
 static vm_offset_t
 ti_lastaddr(platform_t plat)
 {
 
 	return (devmap_lastaddr());
 }
 
 /*
  * Construct static devmap entries to map out the most frequently used
  * peripherals using 1mb section mappings.
  */
 #if defined(SOC_OMAP4)
 static int
 ti_omap4_devmap_init(platform_t plat)
 {
 	devmap_add_entry(0x48000000, 0x01000000); /*16mb L4_PER devices */
 	devmap_add_entry(0x4A000000, 0x01000000); /*16mb L4_CFG devices */
 	return (0);
 }
 #endif
 
 #if defined(SOC_TI_AM335X)
 static int
 ti_am335x_devmap_init(platform_t plat)
 {
 
 	devmap_add_entry(0x44C00000, 0x00400000); /* 4mb L4_WKUP devices*/
 	devmap_add_entry(0x47400000, 0x00100000); /* 1mb USB            */
 	devmap_add_entry(0x47800000, 0x00100000); /* 1mb mmchs2         */
 	devmap_add_entry(0x48000000, 0x01000000); /*16mb L4_PER devices */
 	devmap_add_entry(0x49000000, 0x00100000); /* 1mb edma3          */
 	devmap_add_entry(0x49800000, 0x00300000); /* 3mb edma3          */
 	devmap_add_entry(0x4A000000, 0x01000000); /*16mb L4_FAST devices*/
 	return (0);
 }
 #endif
 
-struct arm32_dma_range *
-bus_dma_get_range(void)
+static void
+ti_plat_cpu_reset(platform_t plat)
 {
-
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
-
-	return (0);
-}
-
-void
-cpu_reset()
-{
 	if (ti_cpu_reset)
 		(*ti_cpu_reset)();
 	else
 		printf("no cpu_reset implementation\n");
-	printf("Reset failed!\n");
-	while (1);
 }
 
 #if defined(SOC_OMAP4)
 static platform_method_t omap4_methods[] = {
 	PLATFORMMETHOD(platform_devmap_init,	ti_omap4_devmap_init),
 	PLATFORMMETHOD(platform_lastaddr,	ti_lastaddr),
+	PLATFORMMETHOD(platform_cpu_reset,	ti_plat_cpu_reset),
 
 	PLATFORMMETHOD_END,
 };
 FDT_PLATFORM_DEF(omap4, "omap4", 0, "ti,omap4430", 0);
 #endif
 
 #if defined(SOC_TI_AM335X)
 static platform_method_t am335x_methods[] = {
 	PLATFORMMETHOD(platform_devmap_init,	ti_am335x_devmap_init),
 	PLATFORMMETHOD(platform_lastaddr,	ti_lastaddr),
+	PLATFORMMETHOD(platform_cpu_reset,	ti_plat_cpu_reset),
 
 	PLATFORMMETHOD_END,
 };
 
 FDT_PLATFORM_DEF(am335x, "am335x", 0, "ti,am335x", 0);
 #endif
Index: user/alc/PQ_LAUNDRY/sys/arm/versatile/versatile_common.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/versatile/versatile_common.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/versatile/versatile_common.c	(revision 306283)
@@ -1,74 +1,70 @@
 /*-
  * Copyright (C) 2008-2011 MARVELL INTERNATIONAL LTD.
  * All rights reserved.
  *
  * Developed by Semihalf.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of MARVELL nor the names of contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/kdb.h>
 #include <sys/reboot.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 
 #include <machine/bus.h>
 #include <machine/vmparam.h>
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 static int
 fdt_intc_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 
 	if (!fdt_is_compatible(node, "arm,versatile-vic"))
 		return (ENXIO);
 
 	*interrupt = fdt32_to_cpu(intr[0]);
 	*trig = INTR_TRIGGER_CONFORM;
 	*pol = INTR_POLARITY_CONFORM;
 
 	return (0);
 }
 
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&fdt_intc_decode_ic,
 	NULL
 };
 #endif
Index: user/alc/PQ_LAUNDRY/sys/arm/versatile/versatile_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/versatile/versatile_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/versatile/versatile_machdep.c	(revision 306283)
@@ -1,125 +1,110 @@
 /*-
  * Copyright (c) 2012 Oleksandr Tymoshenko.
  * All rights reserved.
  *
  * This code is derived from software written for Brini by Mark Brinicombe
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *      This product includes software developed by Brini.
  * 4. The name of the company nor the name of the author may be used to
  *    endorse or promote products derived from this software without specific
  *    prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
  * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include "opt_ddb.h"
 #include "opt_platform.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <machine/bus.h>
 #include <machine/machdep.h>
 #include <machine/platform.h> 
 
 #include <dev/fdt/fdt_common.h>
 
 /* Start of address space used for bootstrap map */
 #define DEVMAP_BOOTSTRAP_MAP_START	0xE0000000
 
 vm_offset_t
 platform_lastaddr(void)
 {
 
 	return (DEVMAP_BOOTSTRAP_MAP_START);
 }
 
 void
 platform_probe_and_attach(void)
 {
 
 }
 
 void
 platform_gpio_init(void)
 {
 }
 
 void
 platform_late_init(void)
 {
 }
 
 #define FDT_DEVMAP_MAX	(2)		/* FIXME */
 static struct devmap_entry fdt_devmap[FDT_DEVMAP_MAX] = {
 	{ 0, 0, 0, },
 	{ 0, 0, 0, }
 };
 
 
 /*
  * Construct devmap table with DT-derived config data.
  */
 int
 platform_devmap_init(void)
 {
 	int i = 0;
 	fdt_devmap[i].pd_va = 0xf0100000;
 	fdt_devmap[i].pd_pa = 0x10100000;
 	fdt_devmap[i].pd_size = 0x01000000;       /* 1 MB */
 
 	devmap_register_table(&fdt_devmap[0]);
-	return (0);
-}
-
-struct arm32_dma_range *
-bus_dma_get_range(void)
-{
-
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
-
 	return (0);
 }
 
 void
 cpu_reset()
 {
 	printf("cpu_reset\n");
 	while (1);
 }
 
Index: user/alc/PQ_LAUNDRY/sys/arm/xilinx/zy7_machdep.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/xilinx/zy7_machdep.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/arm/xilinx/zy7_machdep.c	(revision 306283)
@@ -1,146 +1,127 @@
 /*-
  * Copyright (c) 2013 Thomas Skibo
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 /*
  * Machine dependent code for Xilinx Zynq-7000 Soc.
  *
  * Reference: Zynq-7000 All Programmable SoC Technical Reference Manual.
  * (v1.4) November 16, 2012.  Xilinx doc UG585.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/devmap.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <dev/fdt/fdt_common.h>
 
 #include <machine/bus.h>
 #include <machine/machdep.h>
 #include <machine/platform.h> 
 
 #include <arm/xilinx/zy7_reg.h>
 
 void (*zynq7_cpu_reset)(void);
 
 vm_offset_t
 platform_lastaddr(void)
 {
 
 	return (devmap_lastaddr());
 }
 
 void
 platform_probe_and_attach(void)
 {
 
 }
 
 void
 platform_gpio_init(void)
 {
 }
 
 void
 platform_late_init(void)
 {
 }
 
 /*
  * Set up static device mappings.  Not strictly necessary -- simplebus will
  * dynamically establish mappings as needed -- but doing it this way gets us
  * nice efficient 1MB section mappings.
  */
 int
 platform_devmap_init(void)
 {
 
 	devmap_add_entry(ZYNQ7_PSIO_HWBASE, ZYNQ7_PSIO_SIZE);
 	devmap_add_entry(ZYNQ7_PSCTL_HWBASE, ZYNQ7_PSCTL_SIZE);
 
 	return (0);
 }
 
 
-struct fdt_fixup_entry fdt_fixup_table[] = {
-	{ NULL, NULL }
-};
-
 #ifndef INTRNG
 static int
 fdt_gic_decode_ic(phandle_t node, pcell_t *intr, int *interrupt, int *trig,
     int *pol)
 {
 
 	if (!fdt_is_compatible(node, "arm,gic"))
 		return (ENXIO);
 
 	*interrupt = fdt32_to_cpu(intr[0]);
 	*trig = INTR_TRIGGER_CONFORM;
 	*pol = INTR_POLARITY_CONFORM;
 
 	return (0);
 }
 
 fdt_pic_decode_t fdt_pic_table[] = {
 	&fdt_gic_decode_ic,
 	NULL
 };
 #endif
-
-struct arm32_dma_range *
-bus_dma_get_range(void)
-{
-
-	return (NULL);
-}
-
-int
-bus_dma_get_range_nb(void)
-{
-
-	return (0);
-}
 
 void
 cpu_reset()
 {
 	if (zynq7_cpu_reset != NULL)
 		(*zynq7_cpu_reset)();
 
 	printf("cpu_reset: no platform cpu_reset.  hanging.\n");
 	for (;;)
 		;
 }
Index: user/alc/PQ_LAUNDRY/sys/boot/efi/loader/arch/amd64/ldscript.amd64
===================================================================
--- user/alc/PQ_LAUNDRY/sys/boot/efi/loader/arch/amd64/ldscript.amd64	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/boot/efi/loader/arch/amd64/ldscript.amd64	(revision 306283)
@@ -1,67 +1,67 @@
 /* $FreeBSD$ */
 OUTPUT_FORMAT("elf64-x86-64-freebsd", "elf64-x86-64-freebsd", "elf64-x86-64-freebsd")
 OUTPUT_ARCH(i386:x86-64)
 ENTRY(_start)
 SECTIONS
 {
   /* Read-only sections, merged into text segment: */
   . = 0;
   ImageBase = .;
   .hash : { *(.hash) }  /* this MUST come first! */
   . = ALIGN(4096);
   .eh_frame :
   {
     *(.eh_frame)
   }
   . = ALIGN(4096);
   .text		: {
     *(.text .stub .text.* .gnu.linkonce.t.*)
     /* .gnu.warning sections are handled specially by elf32.em. */
     *(.gnu.warning)
     *(.plt)
-  } =0xCC
+  } =0xCCCCCCCC
   . = ALIGN(4096);
   .data		: {
     *(.rodata .rodata.* .gnu.linkonce.r.*)
     *(.rodata1)
     *(.sdata2 .sdata2.* .gnu.linkonce.s2.*)
     *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*)
     *(.opd)
     *(.data .data.* .gnu.linkonce.d.*)
     *(.data1)
     *(.plabel)
     *(.dynbss)
     *(.bss .bss.* .gnu.linkonce.b.*)
     *(COMMON)
   }
   . = ALIGN(4096);
   set_Xcommand_set	: {
     __start_set_Xcommand_set = .;
     *(set_Xcommand_set)
     __stop_set_Xcommand_set = .;
   }
   . = ALIGN(4096);
   __gp = .;
   .sdata	: {
     *(.got.plt .got)
     *(.sdata .sdata.* .gnu.linkonce.s.*)
     *(dynsbss)
     *(.sbss .sbss.* .gnu.linkonce.sb.*)
     *(.scommon)
   }
   . = ALIGN(4096);
   .dynamic	: { *(.dynamic) }
   . = ALIGN(4096);
   .rela.dyn	: {
     *(.rela.data*)
     *(.rela.got)
     *(.rela.stab)
     *(.relaset_*)
   }
   . = ALIGN(4096);
   .reloc	: { *(.reloc) }
   . = ALIGN(4096);
   .dynsym	: { *(.dynsym) }
   . = ALIGN(4096);
   .dynstr	: { *(.dynstr) }
 }
Index: user/alc/PQ_LAUNDRY/sys/boot/efi/loader/arch/i386/ldscript.i386
===================================================================
--- user/alc/PQ_LAUNDRY/sys/boot/efi/loader/arch/i386/ldscript.i386	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/boot/efi/loader/arch/i386/ldscript.i386	(revision 306283)
@@ -1,72 +1,72 @@
 /* $FreeBSD$ */
 OUTPUT_FORMAT("elf32-i386-freebsd", "elf32-i386-freebsd", "elf32-i386-freebsd")
 OUTPUT_ARCH(i386)
 ENTRY(_start)
 SECTIONS
 {
   /* Read-only sections, merged into text segment: */
   . = 0;
   ImageBase = .;
   . = SIZEOF_HEADERS;
   . = ALIGN(4096);
   .text		: {
     *(.text .stub .text.* .gnu.linkonce.t.*)
     /* .gnu.warning sections are handled specially by elf32.em. */
     *(.gnu.warning)
     *(.plt)
-  } =0xCC
+  } =0xCCCCCCCC
   . = ALIGN(4096);
   .data		: {
     *(.rodata .rodata.* .gnu.linkonce.r.*)
     *(.rodata1)
     *(.sdata2 .sdata2.* .gnu.linkonce.s2.*)
     *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*)
     *(.opd)
     *(.data .data.* .gnu.linkonce.d.*)
     *(.data1)
     *(.plabel)
     *(.dynbss)
     *(.bss .bss.* .gnu.linkonce.b.*)
     *(COMMON)
   }
   . = ALIGN(4096);
   set_Xcommand_set	: {
     __start_set_Xcommand_set = .;
     *(set_Xcommand_set)
     __stop_set_Xcommand_set = .;
   }
   . = ALIGN(4096);
   __gp = .;
   .sdata	: {
     *(.got.plt .got)
     *(.sdata .sdata.* .gnu.linkonce.s.*)
     *(dynsbss)
     *(.sbss .sbss.* .gnu.linkonce.sb.*)
     *(.scommon)
   }
   . = ALIGN(4096);
   .dynamic	: { *(.dynamic) }
   . = ALIGN(4096);
   .rel.dyn	: {
     *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*)
     *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*)
     *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*)
     *(.rel.got)
     *(.rel.sdata .rel.sdata.* .rel.gnu.linkonce.s.*)
     *(.rel.sbss .rel.sbss.* .rel.gnu.linkonce.sb.*)
     *(.rel.sdata2 .rel.sdata2.* .rel.gnu.linkonce.s2.*)
     *(.rel.sbss2 .rel.sbss2.* .rel.gnu.linkonce.sb2.*)
     *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*)
     *(.rel.plt)
     *(.relset_*)
     *(.rel.dyn .rel.dyn.*)
   }
   . = ALIGN(4096);
   .reloc	: { *(.reloc) }
   . = ALIGN(4096);
   .hash		: { *(.hash) }
   . = ALIGN(4096);
   .dynsym	: { *(.dynsym) }
   . = ALIGN(4096);
   .dynstr	: { *(.dynstr) }
 }
Index: user/alc/PQ_LAUNDRY/sys/compat/cloudabi/cloudabi_fd.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/compat/cloudabi/cloudabi_fd.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/compat/cloudabi/cloudabi_fd.c	(revision 306283)
@@ -1,558 +1,546 @@
 /*-
  * Copyright (c) 2015 Nuxi, https://nuxi.nl/
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/filedesc.h>
 #include <sys/proc.h>
 #include <sys/mman.h>
 #include <sys/socketvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 #include <contrib/cloudabi/cloudabi_types_common.h>
 
 #include <compat/cloudabi/cloudabi_proto.h>
 #include <compat/cloudabi/cloudabi_util.h>
 
 /* Translation between CloudABI and Capsicum rights. */
 #define RIGHTS_MAPPINGS \
 	MAPPING(CLOUDABI_RIGHT_FD_DATASYNC, CAP_FSYNC)			\
 	MAPPING(CLOUDABI_RIGHT_FD_READ, CAP_READ)			\
 	MAPPING(CLOUDABI_RIGHT_FD_SEEK, CAP_SEEK)			\
 	MAPPING(CLOUDABI_RIGHT_FD_STAT_PUT_FLAGS, CAP_FCNTL)		\
 	MAPPING(CLOUDABI_RIGHT_FD_SYNC, CAP_FSYNC)			\
 	MAPPING(CLOUDABI_RIGHT_FD_TELL, CAP_SEEK_TELL)			\
 	MAPPING(CLOUDABI_RIGHT_FD_WRITE, CAP_WRITE)			\
 	MAPPING(CLOUDABI_RIGHT_FILE_ADVISE)				\
 	MAPPING(CLOUDABI_RIGHT_FILE_ALLOCATE, CAP_WRITE)		\
 	MAPPING(CLOUDABI_RIGHT_FILE_CREATE_DIRECTORY, CAP_MKDIRAT)	\
 	MAPPING(CLOUDABI_RIGHT_FILE_CREATE_FILE, CAP_CREATE)		\
 	MAPPING(CLOUDABI_RIGHT_FILE_CREATE_FIFO, CAP_MKFIFOAT)		\
 	MAPPING(CLOUDABI_RIGHT_FILE_LINK_SOURCE, CAP_LINKAT_SOURCE)	\
 	MAPPING(CLOUDABI_RIGHT_FILE_LINK_TARGET, CAP_LINKAT_TARGET)	\
 	MAPPING(CLOUDABI_RIGHT_FILE_OPEN, CAP_LOOKUP)			\
 	MAPPING(CLOUDABI_RIGHT_FILE_READDIR, CAP_READ)			\
 	MAPPING(CLOUDABI_RIGHT_FILE_READLINK, CAP_LOOKUP)		\
 	MAPPING(CLOUDABI_RIGHT_FILE_RENAME_SOURCE, CAP_RENAMEAT_SOURCE)	\
 	MAPPING(CLOUDABI_RIGHT_FILE_RENAME_TARGET, CAP_RENAMEAT_TARGET)	\
 	MAPPING(CLOUDABI_RIGHT_FILE_STAT_FGET, CAP_FSTAT)		\
 	MAPPING(CLOUDABI_RIGHT_FILE_STAT_FPUT_SIZE, CAP_FTRUNCATE)	\
 	MAPPING(CLOUDABI_RIGHT_FILE_STAT_FPUT_TIMES, CAP_FUTIMES)	\
 	MAPPING(CLOUDABI_RIGHT_FILE_STAT_GET, CAP_FSTATAT)		\
 	MAPPING(CLOUDABI_RIGHT_FILE_STAT_PUT_TIMES, CAP_FUTIMESAT)	\
 	MAPPING(CLOUDABI_RIGHT_FILE_SYMLINK, CAP_SYMLINKAT)		\
 	MAPPING(CLOUDABI_RIGHT_FILE_UNLINK, CAP_UNLINKAT)		\
 	MAPPING(CLOUDABI_RIGHT_MEM_MAP, CAP_MMAP)			\
 	MAPPING(CLOUDABI_RIGHT_MEM_MAP_EXEC, CAP_MMAP_X)		\
 	MAPPING(CLOUDABI_RIGHT_POLL_FD_READWRITE, CAP_EVENT)		\
 	MAPPING(CLOUDABI_RIGHT_POLL_MODIFY, CAP_KQUEUE_CHANGE)		\
 	MAPPING(CLOUDABI_RIGHT_POLL_PROC_TERMINATE, CAP_EVENT)		\
 	MAPPING(CLOUDABI_RIGHT_POLL_WAIT, CAP_KQUEUE_EVENT)		\
 	MAPPING(CLOUDABI_RIGHT_PROC_EXEC, CAP_FEXECVE)			\
 	MAPPING(CLOUDABI_RIGHT_SOCK_ACCEPT, CAP_ACCEPT)			\
 	MAPPING(CLOUDABI_RIGHT_SOCK_BIND_DIRECTORY, CAP_BINDAT)		\
 	MAPPING(CLOUDABI_RIGHT_SOCK_BIND_SOCKET, CAP_BIND)		\
 	MAPPING(CLOUDABI_RIGHT_SOCK_CONNECT_DIRECTORY, CAP_CONNECTAT)	\
 	MAPPING(CLOUDABI_RIGHT_SOCK_CONNECT_SOCKET, CAP_CONNECT)	\
 	MAPPING(CLOUDABI_RIGHT_SOCK_LISTEN, CAP_LISTEN)			\
 	MAPPING(CLOUDABI_RIGHT_SOCK_SHUTDOWN, CAP_SHUTDOWN)		\
 	MAPPING(CLOUDABI_RIGHT_SOCK_STAT_GET, CAP_GETPEERNAME,		\
 	    CAP_GETSOCKNAME, CAP_GETSOCKOPT)
 
 int
 cloudabi_sys_fd_close(struct thread *td, struct cloudabi_sys_fd_close_args *uap)
 {
 
 	return (kern_close(td, uap->fd));
 }
 
 int
 cloudabi_sys_fd_create1(struct thread *td,
     struct cloudabi_sys_fd_create1_args *uap)
 {
 	struct filecaps fcaps = {};
 	struct socket_args socket_args = {
 		.domain = AF_UNIX,
 	};
 
 	switch (uap->type) {
 	case CLOUDABI_FILETYPE_POLL:
 		cap_rights_init(&fcaps.fc_rights, CAP_FSTAT, CAP_KQUEUE);
 		return (kern_kqueue(td, 0, &fcaps));
 	case CLOUDABI_FILETYPE_SHARED_MEMORY:
 		cap_rights_init(&fcaps.fc_rights, CAP_FSTAT, CAP_FTRUNCATE,
 		    CAP_MMAP_RWX);
 		return (kern_shm_open(td, SHM_ANON, O_RDWR, 0, &fcaps));
 	case CLOUDABI_FILETYPE_SOCKET_DGRAM:
 		socket_args.type = SOCK_DGRAM;
 		return (sys_socket(td, &socket_args));
 	case CLOUDABI_FILETYPE_SOCKET_SEQPACKET:
 		socket_args.type = SOCK_SEQPACKET;
 		return (sys_socket(td, &socket_args));
 	case CLOUDABI_FILETYPE_SOCKET_STREAM:
 		socket_args.type = SOCK_STREAM;
 		return (sys_socket(td, &socket_args));
 	default:
 		return (EINVAL);
 	}
 }
 
 int
 cloudabi_sys_fd_create2(struct thread *td,
     struct cloudabi_sys_fd_create2_args *uap)
 {
 	struct filecaps fcaps1 = {}, fcaps2 = {};
 	int fds[2];
 	int error;
 
 	switch (uap->type) {
 	case CLOUDABI_FILETYPE_FIFO:
 		/*
 		 * CloudABI pipes are unidirectional. Restrict rights on
 		 * the pipe to simulate this.
 		 */
 		cap_rights_init(&fcaps1.fc_rights, CAP_EVENT, CAP_FCNTL,
 		    CAP_FSTAT, CAP_READ);
 		fcaps1.fc_fcntls = CAP_FCNTL_SETFL;
 		cap_rights_init(&fcaps2.fc_rights, CAP_EVENT, CAP_FCNTL,
 		    CAP_FSTAT, CAP_WRITE);
 		fcaps2.fc_fcntls = CAP_FCNTL_SETFL;
 		error = kern_pipe(td, fds, 0, &fcaps1, &fcaps2);
 		break;
 	case CLOUDABI_FILETYPE_SOCKET_DGRAM:
 		error = kern_socketpair(td, AF_UNIX, SOCK_DGRAM, 0, fds);
 		break;
 	case CLOUDABI_FILETYPE_SOCKET_SEQPACKET:
 		error = kern_socketpair(td, AF_UNIX, SOCK_SEQPACKET, 0, fds);
 		break;
 	case CLOUDABI_FILETYPE_SOCKET_STREAM:
 		error = kern_socketpair(td, AF_UNIX, SOCK_STREAM, 0, fds);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	if (error == 0) {
 		td->td_retval[0] = fds[0];
 		td->td_retval[1] = fds[1];
 	}
 	return (0);
 }
 
 int
 cloudabi_sys_fd_datasync(struct thread *td,
     struct cloudabi_sys_fd_datasync_args *uap)
 {
 
 	return (kern_fsync(td, uap->fd, false));
 }
 
 int
 cloudabi_sys_fd_dup(struct thread *td, struct cloudabi_sys_fd_dup_args *uap)
 {
 
 	return (kern_dup(td, FDDUP_NORMAL, 0, uap->from, 0));
 }
 
 int
 cloudabi_sys_fd_replace(struct thread *td,
     struct cloudabi_sys_fd_replace_args *uap)
 {
 	int error;
 
 	/*
 	 * CloudABI's equivalent to dup2(). CloudABI processes should
 	 * not depend on hardcoded file descriptor layouts, but simply
 	 * use the file descriptor numbers that are allocated by the
 	 * kernel. Duplicating file descriptors to arbitrary numbers
 	 * should not be done.
 	 *
 	 * Invoke kern_dup() with FDDUP_MUSTREPLACE, so that we return
 	 * EBADF when duplicating to a nonexistent file descriptor. Also
 	 * clear the return value, as this system call yields no return
 	 * value.
 	 */
 	error = kern_dup(td, FDDUP_MUSTREPLACE, 0, uap->from, uap->to);
 	td->td_retval[0] = 0;
 	return (error);
 }
 
 int
 cloudabi_sys_fd_seek(struct thread *td, struct cloudabi_sys_fd_seek_args *uap)
 {
 	struct lseek_args lseek_args = {
 		.fd	= uap->fd,
 		.offset	= uap->offset
 	};
 
 	switch (uap->whence) {
 	case CLOUDABI_WHENCE_CUR:
 		lseek_args.whence = SEEK_CUR;
 		break;
 	case CLOUDABI_WHENCE_END:
 		lseek_args.whence = SEEK_END;
 		break;
 	case CLOUDABI_WHENCE_SET:
 		lseek_args.whence = SEEK_SET;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	return (sys_lseek(td, &lseek_args));
 }
 
 /* Converts a file descriptor to a CloudABI file descriptor type. */
 cloudabi_filetype_t
 cloudabi_convert_filetype(const struct file *fp)
 {
 	struct socket *so;
 	struct vnode *vp;
 
 	switch (fp->f_type) {
 	case DTYPE_FIFO:
 		return (CLOUDABI_FILETYPE_FIFO);
 	case DTYPE_KQUEUE:
 		return (CLOUDABI_FILETYPE_POLL);
 	case DTYPE_PIPE:
 		return (CLOUDABI_FILETYPE_FIFO);
 	case DTYPE_PROCDESC:
 		return (CLOUDABI_FILETYPE_PROCESS);
 	case DTYPE_SHM:
 		return (CLOUDABI_FILETYPE_SHARED_MEMORY);
 	case DTYPE_SOCKET:
 		so = fp->f_data;
 		switch (so->so_type) {
 		case SOCK_DGRAM:
 			return (CLOUDABI_FILETYPE_SOCKET_DGRAM);
 		case SOCK_SEQPACKET:
 			return (CLOUDABI_FILETYPE_SOCKET_SEQPACKET);
 		case SOCK_STREAM:
 			return (CLOUDABI_FILETYPE_SOCKET_STREAM);
 		default:
 			return (CLOUDABI_FILETYPE_UNKNOWN);
 		}
 	case DTYPE_VNODE:
 		vp = fp->f_vnode;
 		switch (vp->v_type) {
 		case VBLK:
 			return (CLOUDABI_FILETYPE_BLOCK_DEVICE);
 		case VCHR:
 			return (CLOUDABI_FILETYPE_CHARACTER_DEVICE);
 		case VDIR:
 			return (CLOUDABI_FILETYPE_DIRECTORY);
 		case VFIFO:
 			return (CLOUDABI_FILETYPE_FIFO);
 		case VLNK:
 			return (CLOUDABI_FILETYPE_SYMBOLIC_LINK);
 		case VREG:
 			return (CLOUDABI_FILETYPE_REGULAR_FILE);
 		case VSOCK:
 			return (CLOUDABI_FILETYPE_SOCKET_STREAM);
 		default:
 			return (CLOUDABI_FILETYPE_UNKNOWN);
 		}
 	default:
 		return (CLOUDABI_FILETYPE_UNKNOWN);
 	}
 }
 
 /* Removes rights that conflict with the file descriptor type. */
 void
 cloudabi_remove_conflicting_rights(cloudabi_filetype_t filetype,
     cloudabi_rights_t *base, cloudabi_rights_t *inheriting)
 {
 
 	/*
 	 * CloudABI has a small number of additional rights bits to
 	 * disambiguate between multiple purposes. Remove the bits that
 	 * don't apply to the type of the file descriptor.
 	 *
 	 * As file descriptor access modes (O_ACCMODE) has been fully
 	 * replaced by rights bits, CloudABI distinguishes between
 	 * rights that apply to the file descriptor itself (base) versus
 	 * rights of new file descriptors derived from them
 	 * (inheriting). The code below approximates the pair by
 	 * decomposing depending on the file descriptor type.
 	 *
 	 * We need to be somewhat accurate about which actions can
 	 * actually be performed on the file descriptor, as functions
 	 * like fcntl(fd, F_GETFL) are emulated on top of this.
 	 */
 	switch (filetype) {
 	case CLOUDABI_FILETYPE_DIRECTORY:
 		*base &= CLOUDABI_RIGHT_FD_STAT_PUT_FLAGS |
 		    CLOUDABI_RIGHT_FD_SYNC | CLOUDABI_RIGHT_FILE_ADVISE |
 		    CLOUDABI_RIGHT_FILE_CREATE_DIRECTORY |
 		    CLOUDABI_RIGHT_FILE_CREATE_FILE |
 		    CLOUDABI_RIGHT_FILE_CREATE_FIFO |
 		    CLOUDABI_RIGHT_FILE_LINK_SOURCE |
 		    CLOUDABI_RIGHT_FILE_LINK_TARGET |
 		    CLOUDABI_RIGHT_FILE_OPEN |
 		    CLOUDABI_RIGHT_FILE_READDIR |
 		    CLOUDABI_RIGHT_FILE_READLINK |
 		    CLOUDABI_RIGHT_FILE_RENAME_SOURCE |
 		    CLOUDABI_RIGHT_FILE_RENAME_TARGET |
 		    CLOUDABI_RIGHT_FILE_STAT_FGET |
 		    CLOUDABI_RIGHT_FILE_STAT_FPUT_TIMES |
 		    CLOUDABI_RIGHT_FILE_STAT_GET |
 		    CLOUDABI_RIGHT_FILE_STAT_PUT_TIMES |
 		    CLOUDABI_RIGHT_FILE_SYMLINK |
 		    CLOUDABI_RIGHT_FILE_UNLINK |
 		    CLOUDABI_RIGHT_POLL_FD_READWRITE |
 		    CLOUDABI_RIGHT_SOCK_BIND_DIRECTORY |
 		    CLOUDABI_RIGHT_SOCK_CONNECT_DIRECTORY;
 		*inheriting &= CLOUDABI_RIGHT_FD_DATASYNC |
 		    CLOUDABI_RIGHT_FD_READ |
 		    CLOUDABI_RIGHT_FD_SEEK |
 		    CLOUDABI_RIGHT_FD_STAT_PUT_FLAGS |
 		    CLOUDABI_RIGHT_FD_SYNC |
 		    CLOUDABI_RIGHT_FD_TELL |
 		    CLOUDABI_RIGHT_FD_WRITE |
 		    CLOUDABI_RIGHT_FILE_ADVISE |
 		    CLOUDABI_RIGHT_FILE_ALLOCATE |
 		    CLOUDABI_RIGHT_FILE_CREATE_DIRECTORY |
 		    CLOUDABI_RIGHT_FILE_CREATE_FILE |
 		    CLOUDABI_RIGHT_FILE_CREATE_FIFO |
 		    CLOUDABI_RIGHT_FILE_LINK_SOURCE |
 		    CLOUDABI_RIGHT_FILE_LINK_TARGET |
 		    CLOUDABI_RIGHT_FILE_OPEN |
 		    CLOUDABI_RIGHT_FILE_READDIR |
 		    CLOUDABI_RIGHT_FILE_READLINK |
 		    CLOUDABI_RIGHT_FILE_RENAME_SOURCE |
 		    CLOUDABI_RIGHT_FILE_RENAME_TARGET |
 		    CLOUDABI_RIGHT_FILE_STAT_FGET |
 		    CLOUDABI_RIGHT_FILE_STAT_FPUT_SIZE |
 		    CLOUDABI_RIGHT_FILE_STAT_FPUT_TIMES |
 		    CLOUDABI_RIGHT_FILE_STAT_GET |
 		    CLOUDABI_RIGHT_FILE_STAT_PUT_TIMES |
 		    CLOUDABI_RIGHT_FILE_SYMLINK |
 		    CLOUDABI_RIGHT_FILE_UNLINK |
 		    CLOUDABI_RIGHT_MEM_MAP |
 		    CLOUDABI_RIGHT_MEM_MAP_EXEC |
 		    CLOUDABI_RIGHT_POLL_FD_READWRITE |
 		    CLOUDABI_RIGHT_PROC_EXEC |
 		    CLOUDABI_RIGHT_SOCK_BIND_DIRECTORY |
 		    CLOUDABI_RIGHT_SOCK_CONNECT_DIRECTORY;
 		break;
 	case CLOUDABI_FILETYPE_FIFO:
 		*base &= CLOUDABI_RIGHT_FD_READ |
 		    CLOUDABI_RIGHT_FD_STAT_PUT_FLAGS |
 		    CLOUDABI_RIGHT_FD_WRITE |
 		    CLOUDABI_RIGHT_FILE_STAT_FGET |
 		    CLOUDABI_RIGHT_POLL_FD_READWRITE;
 		*inheriting = 0;
 		break;
 	case CLOUDABI_FILETYPE_POLL:
 		*base &= ~CLOUDABI_RIGHT_FILE_ADVISE;
 		*inheriting = 0;
 		break;
 	case CLOUDABI_FILETYPE_PROCESS:
 		*base &= ~(CLOUDABI_RIGHT_FILE_ADVISE |
 		    CLOUDABI_RIGHT_POLL_FD_READWRITE);
 		*inheriting = 0;
 		break;
 	case CLOUDABI_FILETYPE_REGULAR_FILE:
 		*base &= CLOUDABI_RIGHT_FD_DATASYNC |
 		    CLOUDABI_RIGHT_FD_READ |
 		    CLOUDABI_RIGHT_FD_SEEK |
 		    CLOUDABI_RIGHT_FD_STAT_PUT_FLAGS |
 		    CLOUDABI_RIGHT_FD_SYNC |
 		    CLOUDABI_RIGHT_FD_TELL |
 		    CLOUDABI_RIGHT_FD_WRITE |
 		    CLOUDABI_RIGHT_FILE_ADVISE |
 		    CLOUDABI_RIGHT_FILE_ALLOCATE |
 		    CLOUDABI_RIGHT_FILE_STAT_FGET |
 		    CLOUDABI_RIGHT_FILE_STAT_FPUT_SIZE |
 		    CLOUDABI_RIGHT_FILE_STAT_FPUT_TIMES |
 		    CLOUDABI_RIGHT_MEM_MAP |
 		    CLOUDABI_RIGHT_MEM_MAP_EXEC |
 		    CLOUDABI_RIGHT_POLL_FD_READWRITE |
 		    CLOUDABI_RIGHT_PROC_EXEC;
 		*inheriting = 0;
 		break;
 	case CLOUDABI_FILETYPE_SHARED_MEMORY:
 		*base &= ~(CLOUDABI_RIGHT_FD_SEEK |
 		    CLOUDABI_RIGHT_FD_TELL |
 		    CLOUDABI_RIGHT_FILE_ADVISE |
 		    CLOUDABI_RIGHT_FILE_ALLOCATE |
 		    CLOUDABI_RIGHT_FILE_READDIR);
 		*inheriting = 0;
 		break;
 	case CLOUDABI_FILETYPE_SOCKET_DGRAM:
 	case CLOUDABI_FILETYPE_SOCKET_SEQPACKET:
 	case CLOUDABI_FILETYPE_SOCKET_STREAM:
 		*base &= CLOUDABI_RIGHT_FD_READ |
 		    CLOUDABI_RIGHT_FD_STAT_PUT_FLAGS |
 		    CLOUDABI_RIGHT_FD_WRITE |
 		    CLOUDABI_RIGHT_FILE_STAT_FGET |
 		    CLOUDABI_RIGHT_POLL_FD_READWRITE |
 		    CLOUDABI_RIGHT_SOCK_ACCEPT |
 		    CLOUDABI_RIGHT_SOCK_BIND_SOCKET |
 		    CLOUDABI_RIGHT_SOCK_CONNECT_SOCKET |
 		    CLOUDABI_RIGHT_SOCK_LISTEN |
 		    CLOUDABI_RIGHT_SOCK_SHUTDOWN |
 		    CLOUDABI_RIGHT_SOCK_STAT_GET;
 		break;
 	default:
 		*inheriting = 0;
 		break;
 	}
 }
 
 /* Converts FreeBSD's Capsicum rights to CloudABI's set of rights. */
 static void
 convert_capabilities(const cap_rights_t *capabilities,
     cloudabi_filetype_t filetype, cloudabi_rights_t *base,
     cloudabi_rights_t *inheriting)
 {
 	cloudabi_rights_t rights;
 
 	/* Convert FreeBSD bits to CloudABI bits. */
 	rights = 0;
 #define MAPPING(cloudabi, ...) do {				\
 	if (cap_rights_is_set(capabilities, ##__VA_ARGS__))	\
 		rights |= (cloudabi);				\
 } while (0);
 	RIGHTS_MAPPINGS
 #undef MAPPING
 
 	*base = rights;
 	*inheriting = rights;
 	cloudabi_remove_conflicting_rights(filetype, base, inheriting);
 }
 
 int
 cloudabi_sys_fd_stat_get(struct thread *td,
     struct cloudabi_sys_fd_stat_get_args *uap)
 {
 	cloudabi_fdstat_t fsb = {};
-	struct filedesc *fdp;
 	struct file *fp;
-	seq_t seq;
 	cap_rights_t rights;
+	struct filecaps fcaps;
 	int error, oflags;
-	bool modified;
 
 	/* Obtain file descriptor properties. */
-	fdp = td->td_proc->p_fd;
-	do {
-		error = fget_unlocked(fdp, uap->fd, cap_rights_init(&rights),
-		    &fp, &seq);
-		if (error != 0)
-			return (error);
-		if (fp->f_ops == &badfileops) {
-			fdrop(fp, td);
-			return (EBADF);
-		}
+	error = fget_cap(td, uap->fd, cap_rights_init(&rights), &fp,
+	    &fcaps);
+	if (error != 0)
+		return (error);
+	oflags = OFLAGS(fp->f_flag);
+	fsb.fs_filetype = cloudabi_convert_filetype(fp);
+	fdrop(fp, td);
 
-		rights = *cap_rights(fdp, uap->fd);
-		oflags = OFLAGS(fp->f_flag);
-		fsb.fs_filetype = cloudabi_convert_filetype(fp);
-
-		modified = fd_modified(fdp, uap->fd, seq);
-		fdrop(fp, td);
-	} while (modified);
-
 	/* Convert file descriptor flags. */
 	if (oflags & O_APPEND)
 		fsb.fs_flags |= CLOUDABI_FDFLAG_APPEND;
 	if (oflags & O_NONBLOCK)
 		fsb.fs_flags |= CLOUDABI_FDFLAG_NONBLOCK;
 	if (oflags & O_SYNC)
 		fsb.fs_flags |= CLOUDABI_FDFLAG_SYNC;
 
 	/* Convert capabilities to CloudABI rights. */
-	convert_capabilities(&rights, fsb.fs_filetype,
+	convert_capabilities(&fcaps.fc_rights, fsb.fs_filetype,
 	    &fsb.fs_rights_base, &fsb.fs_rights_inheriting);
+	filecaps_free(&fcaps);
 	return (copyout(&fsb, (void *)uap->buf, sizeof(fsb)));
 }
 
 /* Converts CloudABI rights to a set of Capsicum capabilities. */
 int
 cloudabi_convert_rights(cloudabi_rights_t in, cap_rights_t *out)
 {
 
 	cap_rights_init(out);
 #define MAPPING(cloudabi, ...) do {			\
 	if (in & (cloudabi)) {				\
 		cap_rights_set(out, ##__VA_ARGS__);	\
 		in &= ~(cloudabi);			\
 	}						\
 } while (0);
 	RIGHTS_MAPPINGS
 #undef MAPPING
 	if (in != 0)
 		return (ENOTCAPABLE);
 	return (0);
 }
 
 int
 cloudabi_sys_fd_stat_put(struct thread *td,
     struct cloudabi_sys_fd_stat_put_args *uap)
 {
 	cloudabi_fdstat_t fsb;
 	cap_rights_t rights;
 	int error, oflags;
 
 	error = copyin(uap->buf, &fsb, sizeof(fsb));
 	if (error != 0)
 		return (error);
 
 	if (uap->flags == CLOUDABI_FDSTAT_FLAGS) {
 		/* Convert flags. */
 		oflags = 0;
 		if (fsb.fs_flags & CLOUDABI_FDFLAG_APPEND)
 			oflags |= O_APPEND;
 		if (fsb.fs_flags & CLOUDABI_FDFLAG_NONBLOCK)
 			oflags |= O_NONBLOCK;
 		if (fsb.fs_flags & (CLOUDABI_FDFLAG_SYNC |
 		    CLOUDABI_FDFLAG_DSYNC | CLOUDABI_FDFLAG_RSYNC))
 			oflags |= O_SYNC;
 		return (kern_fcntl(td, uap->fd, F_SETFL, oflags));
 	} else if (uap->flags == CLOUDABI_FDSTAT_RIGHTS) {
 		/* Convert rights. */
 		error = cloudabi_convert_rights(
 		    fsb.fs_rights_base | fsb.fs_rights_inheriting, &rights);
 		if (error != 0)
 			return (error);
 		return (kern_cap_rights_limit(td, uap->fd, &rights));
 	}
 	return (EINVAL);
 }
 
 int
 cloudabi_sys_fd_sync(struct thread *td, struct cloudabi_sys_fd_sync_args *uap)
 {
 
 	return (kern_fsync(td, uap->fd, true));
 }
Index: user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_netmap.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_netmap.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_netmap.c	(revision 306283)
@@ -1,1013 +1,1021 @@
 /*-
  * Copyright (c) 2014 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #ifdef DEV_NETMAP
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/selinfo.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <machine/bus.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_media.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 
 extern int fl_pad;	/* XXXNM */
 
 SYSCTL_NODE(_hw, OID_AUTO, cxgbe, CTLFLAG_RD, 0, "cxgbe netmap parameters");
 
 /*
  * 0 = normal netmap rx
  * 1 = black hole
  * 2 = supermassive black hole (buffer packing enabled)
  */
 int black_hole = 0;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_black_hole, CTLFLAG_RDTUN, &black_hole, 0,
     "Sink incoming packets.");
 
 int rx_ndesc = 256;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_rx_ndesc, CTLFLAG_RWTUN,
     &rx_ndesc, 0, "# of rx descriptors after which the hw cidx is updated.");
 
 int holdoff_tmr_idx = 2;
 SYSCTL_INT(_hw_cxgbe, OID_AUTO, nm_holdoff_tmr_idx, CTLFLAG_RWTUN,
     &holdoff_tmr_idx, 0, "Holdoff timer index for netmap rx queues.");
 
 /*
  * Congestion drops.
  * -1: no congestion feedback (not recommended).
  *  0: backpressure the channel instead of dropping packets right away.
  *  1: no backpressure, drop packets for the congested queue immediately.
  */
 static int nm_cong_drop = 1;
 TUNABLE_INT("hw.cxgbe.nm_cong_drop", &nm_cong_drop);
 
 static int
 alloc_nm_rxq_hwq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int cong)
 {
 	int rc, cntxt_id, i;
 	__be32 v;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_params *sp = &sc->params.sge;
 	struct netmap_adapter *na = NA(vi->ifp);
 	struct fw_iq_cmd c;
 
 	MPASS(na != NULL);
 	MPASS(nm_rxq->iq_desc != NULL);
 	MPASS(nm_rxq->fl_desc != NULL);
 
 	bzero(nm_rxq->iq_desc, vi->qsize_rxq * IQ_ESIZE);
 	bzero(nm_rxq->fl_desc, na->num_rx_desc * EQ_ESIZE + sp->spg_len);
 
 	bzero(&c, sizeof(c));
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
 	    V_FW_IQ_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
 	    FW_LEN16(c));
 	if (vi->flags & INTR_RXQ) {
 		KASSERT(nm_rxq->intr_idx < sc->intr_count,
 		    ("%s: invalid direct intr_idx %d", __func__,
 		    nm_rxq->intr_idx));
 		v = V_FW_IQ_CMD_IQANDSTINDEX(nm_rxq->intr_idx);
 	} else {
 		CXGBE_UNIMPLEMENTED(__func__);	/* XXXNM: needs review */
 		v = V_FW_IQ_CMD_IQANDSTINDEX(nm_rxq->intr_idx) |
 		    F_FW_IQ_CMD_IQANDST;
 	}
 	c.type_to_iqandstindex = htobe32(v |
 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
 	    V_FW_IQ_CMD_VIID(vi->viid) |
 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
 	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(vi->pi->tx_chan) |
 	    F_FW_IQ_CMD_IQGTSMODE |
 	    V_FW_IQ_CMD_IQINTCNTTHRESH(0) |
 	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
 	c.iqsize = htobe16(vi->qsize_rxq);
 	c.iqaddr = htobe64(nm_rxq->iq_ba);
 	if (cong >= 0) {
 		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN |
 		    V_FW_IQ_CMD_FL0CNGCHMAP(cong) | F_FW_IQ_CMD_FL0CONGCIF |
 		    F_FW_IQ_CMD_FL0CONGEN);
 	}
 	c.iqns_to_fl0congen |=
 	    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
 		(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
 		(black_hole == 2 ? F_FW_IQ_CMD_FL0PACKEN : 0));
 	c.fl0dcaen_to_fl0cidxfthresh =
 	    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 		X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B) |
 		V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
 		X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
 	c.fl0size = htobe16(na->num_rx_desc / 8 + sp->spg_len / EQ_ESIZE);
 	c.fl0addr = htobe64(nm_rxq->fl_ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create netmap ingress queue: %d\n", rc);
 		return (rc);
 	}
 
 	nm_rxq->iq_cidx = 0;
 	MPASS(nm_rxq->iq_sidx == vi->qsize_rxq - sp->spg_len / IQ_ESIZE);
 	nm_rxq->iq_gen = F_RSPD_GEN;
 	nm_rxq->iq_cntxt_id = be16toh(c.iqid);
 	nm_rxq->iq_abs_id = be16toh(c.physiqid);
 	cntxt_id = nm_rxq->iq_cntxt_id - sc->sge.iq_start;
 	if (cntxt_id >= sc->sge.niq) {
 		panic ("%s: nm_rxq->iq_cntxt_id (%d) more than the max (%d)",
 		    __func__, cntxt_id, sc->sge.niq - 1);
 	}
 	sc->sge.iqmap[cntxt_id] = (void *)nm_rxq;
 
 	nm_rxq->fl_cntxt_id = be16toh(c.fl0id);
 	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
 	MPASS(nm_rxq->fl_sidx == na->num_rx_desc);
 	cntxt_id = nm_rxq->fl_cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq) {
 		panic("%s: nm_rxq->fl_cntxt_id (%d) more than the max (%d)",
 		    __func__, cntxt_id, sc->sge.neq - 1);
 	}
 	sc->sge.eqmap[cntxt_id] = (void *)nm_rxq;
 
 	nm_rxq->fl_db_val = V_QID(nm_rxq->fl_cntxt_id) |
 	    sc->chip_params->sge_fl_db;
 
 	if (chip_id(sc) >= CHELSIO_T5 && cong >= 0) {
 		uint32_t param, val;
 
 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
 		    V_FW_PARAMS_PARAM_YZ(nm_rxq->iq_cntxt_id);
 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
 		    V_FW_PARAMS_PARAM_YZ(nm_rxq->iq_cntxt_id);
 		if (cong == 0)
 			val = 1 << 19;
 		else {
 			val = 2 << 19;
 			for (i = 0; i < 4; i++) {
 				if (cong & (1 << i))
 					val |= 1 << (i << 2);
 			}
 		}
 
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* report error but carry on */
 			device_printf(sc->dev,
 			    "failed to set congestion manager context for "
 			    "ingress queue %d: %d\n", nm_rxq->iq_cntxt_id, rc);
 		}
 	}
 
 	t4_write_reg(sc, sc->sge_gts_reg,
 	    V_INGRESSQID(nm_rxq->iq_cntxt_id) |
 	    V_SEINTARM(V_QINTR_TIMER_IDX(holdoff_tmr_idx)));
 
 	return (rc);
 }
 
 static int
 free_nm_rxq_hwq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq)
 {
 	struct adapter *sc = vi->pi->adapter;
 	int rc;
 
 	rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0, FW_IQ_TYPE_FL_INT_CAP,
 	    nm_rxq->iq_cntxt_id, nm_rxq->fl_cntxt_id, 0xffff);
 	if (rc != 0)
 		device_printf(sc->dev, "%s: failed for iq %d, fl %d: %d\n",
 		    __func__, nm_rxq->iq_cntxt_id, nm_rxq->fl_cntxt_id, rc);
 	return (rc);
 }
 
 static int
 alloc_nm_txq_hwq(struct vi_info *vi, struct sge_nm_txq *nm_txq)
 {
 	int rc, cntxt_id;
 	size_t len;
 	struct adapter *sc = vi->pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 	struct fw_eq_eth_cmd c;
 
 	MPASS(na != NULL);
 	MPASS(nm_txq->desc != NULL);
 
 	len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	bzero(nm_txq->desc, len);
 
 	bzero(&c, sizeof(c));
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
 	    V_FW_EQ_ETH_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
 	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
 	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		V_FW_EQ_ETH_CMD_PCIECHN(vi->pi->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
 		V_FW_EQ_ETH_CMD_IQID(sc->sge.nm_rxq[nm_txq->iqidx].iq_cntxt_id));
 	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 		      V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		      V_FW_EQ_ETH_CMD_EQSIZE(len / EQ_ESIZE));
 	c.eqaddr = htobe64(nm_txq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create netmap egress queue: %d\n", rc);
 		return (rc);
 	}
 
 	nm_txq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
 	cntxt_id = nm_txq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: nm_txq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = (void *)nm_txq;
 
 	nm_txq->pidx = nm_txq->cidx = 0;
 	MPASS(nm_txq->sidx == na->num_tx_desc);
 	nm_txq->equiqidx = nm_txq->equeqidx = nm_txq->dbidx = 0;
 
 	nm_txq->doorbells = sc->doorbells;
 	if (isset(&nm_txq->doorbells, DOORBELL_UDB) ||
 	    isset(&nm_txq->doorbells, DOORBELL_UDBWC) ||
 	    isset(&nm_txq->doorbells, DOORBELL_WCWR)) {
 		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 		uint32_t mask = (1 << s_qpp) - 1;
 		volatile uint8_t *udb;
 
 		udb = sc->udbs_base + UDBS_DB_OFFSET;
 		udb += (nm_txq->cntxt_id >> s_qpp) << PAGE_SHIFT;
 		nm_txq->udb_qid = nm_txq->cntxt_id & mask;
 		if (nm_txq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
 	    		clrbit(&nm_txq->doorbells, DOORBELL_WCWR);
 		else {
 			udb += nm_txq->udb_qid << UDBS_SEG_SHIFT;
 			nm_txq->udb_qid = 0;
 		}
 		nm_txq->udb = (volatile void *)udb;
 	}
 
 	return (rc);
 }
 
 static int
 free_nm_txq_hwq(struct vi_info *vi, struct sge_nm_txq *nm_txq)
 {
 	struct adapter *sc = vi->pi->adapter;
 	int rc;
 
 	rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0, nm_txq->cntxt_id);
 	if (rc != 0)
 		device_printf(sc->dev, "%s: failed for eq %d: %d\n", __func__,
 		    nm_txq->cntxt_id, rc);
 	return (rc);
 }
 
 static int
 cxgbe_netmap_on(struct adapter *sc, struct vi_info *vi, struct ifnet *ifp,
     struct netmap_adapter *na)
 {
 	struct netmap_slot *slot;
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 	int rc, i, j, hwidx;
 	struct hw_buf_info *hwb;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if ((vi->flags & VI_INIT_DONE) == 0 ||
 	    (ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return (EAGAIN);
 
 	hwb = &sc->sge.hw_buf_info[0];
 	for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) {
 		if (hwb->size == NETMAP_BUF_SIZE(na))
 			break;
 	}
 	if (i >= SGE_FLBUF_SIZES) {
 		if_printf(ifp, "no hwidx for netmap buffer size %d.\n",
 		    NETMAP_BUF_SIZE(na));
 		return (ENXIO);
 	}
 	hwidx = i;
 
 	/* Must set caps before calling netmap_reset */
 	nm_set_native_flags(na);
 
 	for_each_nm_rxq(vi, i, nm_rxq) {
 		struct irq *irq = &sc->irq[vi->first_intr + i];
 
 		alloc_nm_rxq_hwq(vi, nm_rxq, tnl_cong(vi->pi, nm_cong_drop));
 		nm_rxq->fl_hwidx = hwidx;
 		slot = netmap_reset(na, NR_RX, i, 0);
 		MPASS(slot != NULL);	/* XXXNM: error check, not assert */
 
 		/* We deal with 8 bufs at a time */
 		MPASS((na->num_rx_desc & 7) == 0);
 		MPASS(na->num_rx_desc == nm_rxq->fl_sidx);
 		for (j = 0; j < nm_rxq->fl_sidx; j++) {
 			uint64_t ba;
 
 			PNMB(na, &slot[j], &ba);
 			MPASS(ba != 0);
 			nm_rxq->fl_desc[j] = htobe64(ba | hwidx);
 		}
 		j = nm_rxq->fl_pidx = nm_rxq->fl_sidx - 8;
 		MPASS((j & 7) == 0);
 		j /= 8;	/* driver pidx to hardware pidx */
 		wmb();
 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
 		    nm_rxq->fl_db_val | V_PIDX(j));
 
 		atomic_cmpset_int(&irq->nm_state, NM_OFF, NM_ON);
 	}
 
 	for_each_nm_txq(vi, i, nm_txq) {
 		alloc_nm_txq_hwq(vi, nm_txq);
 		slot = netmap_reset(na, NR_TX, i, 0);
 		MPASS(slot != NULL);	/* XXXNM: error check, not assert */
 	}
 
 	if (vi->nm_rss == NULL) {
 		vi->nm_rss = malloc(vi->rss_size * sizeof(uint16_t), M_CXGBE,
 		    M_ZERO | M_WAITOK);
 	}
 	for (i = 0; i < vi->rss_size;) {
 		for_each_nm_rxq(vi, j, nm_rxq) {
 			vi->nm_rss[i++] = nm_rxq->iq_abs_id;
 			if (i == vi->rss_size)
 				break;
 		}
 	}
 	rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size,
 	    vi->nm_rss, vi->rss_size);
 	if (rc != 0)
 		if_printf(ifp, "netmap rss_config failed: %d\n", rc);
 
 	return (rc);
 }
 
 static int
 cxgbe_netmap_off(struct adapter *sc, struct vi_info *vi, struct ifnet *ifp,
     struct netmap_adapter *na)
 {
 	int rc, i;
 	struct sge_nm_txq *nm_txq;
 	struct sge_nm_rxq *nm_rxq;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if ((vi->flags & VI_INIT_DONE) == 0)
 		return (0);
 
 	rc = -t4_config_rss_range(sc, sc->mbox, vi->viid, 0, vi->rss_size,
 	    vi->rss, vi->rss_size);
 	if (rc != 0)
 		if_printf(ifp, "failed to restore RSS config: %d\n", rc);
 	nm_clear_native_flags(na);
 
 	for_each_nm_txq(vi, i, nm_txq) {
 		struct sge_qstat *spg = (void *)&nm_txq->desc[nm_txq->sidx];
 
 		/* Wait for hw pidx to catch up ... */
 		while (be16toh(nm_txq->pidx) != spg->pidx)
 			pause("nmpidx", 1);
 
 		/* ... and then for the cidx. */
 		while (spg->pidx != spg->cidx)
 			pause("nmcidx", 1);
 
 		free_nm_txq_hwq(vi, nm_txq);
 	}
 	for_each_nm_rxq(vi, i, nm_rxq) {
 		struct irq *irq = &sc->irq[vi->first_intr + i];
 
 		while (!atomic_cmpset_int(&irq->nm_state, NM_ON, NM_OFF))
 			pause("nmst", 1);
 
 		free_nm_rxq_hwq(vi, nm_rxq);
 	}
 
 	return (rc);
 }
 
 static int
 cxgbe_netmap_reg(struct netmap_adapter *na, int on)
 {
 	struct ifnet *ifp = na->ifp;
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->pi->adapter;
 	int rc;
 
 	rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4nmreg");
 	if (rc != 0)
 		return (rc);
 	if (on)
 		rc = cxgbe_netmap_on(sc, vi, ifp, na);
 	else
 		rc = cxgbe_netmap_off(sc, vi, ifp, na);
 	end_synchronized_op(sc, 0);
 
 	return (rc);
 }
 
 /* How many packets can a single type1 WR carry in n descriptors */
 static inline int
 ndesc_to_npkt(const int n)
 {
 
 	MPASS(n > 0 && n <= SGE_MAX_WR_NDESC);
 
 	return (n * 2 - 1);
 }
 #define MAX_NPKT_IN_TYPE1_WR	(ndesc_to_npkt(SGE_MAX_WR_NDESC))
 
 /* Space (in descriptors) needed for a type1 WR that carries n packets */
 static inline int
 npkt_to_ndesc(const int n)
 {
 
 	MPASS(n > 0 && n <= MAX_NPKT_IN_TYPE1_WR);
 
 	return ((n + 2) / 2);
 }
 
 /* Space (in 16B units) needed for a type1 WR that carries n packets */
 static inline int
 npkt_to_len16(const int n)
 {
 
 	MPASS(n > 0 && n <= MAX_NPKT_IN_TYPE1_WR);
 
 	return (n * 2 + 1);
 }
 
 #define NMIDXDIFF(q, idx) IDXDIFF((q)->pidx, (q)->idx, (q)->sidx)
 
 static void
 ring_nm_txq_db(struct adapter *sc, struct sge_nm_txq *nm_txq)
 {
 	int n;
 	u_int db = nm_txq->doorbells;
 
 	MPASS(nm_txq->pidx != nm_txq->dbidx);
 
 	n = NMIDXDIFF(nm_txq, dbidx);
 	if (n > 1)
 		clrbit(&db, DOORBELL_WCWR);
 	wmb();
 
 	switch (ffs(db) - 1) {
 	case DOORBELL_UDB:
 		*nm_txq->udb = htole32(V_QID(nm_txq->udb_qid) | V_PIDX(n));
 		break;
 
 	case DOORBELL_WCWR: {
 		volatile uint64_t *dst, *src;
 
 		/*
 		 * Queues whose 128B doorbell segment fits in the page do not
 		 * use relative qid (udb_qid is always 0).  Only queues with
 		 * doorbell segments can do WCWR.
 		 */
 		KASSERT(nm_txq->udb_qid == 0 && n == 1,
 		    ("%s: inappropriate doorbell (0x%x, %d, %d) for nm_txq %p",
 		    __func__, nm_txq->doorbells, n, nm_txq->pidx, nm_txq));
 
 		dst = (volatile void *)((uintptr_t)nm_txq->udb +
 		    UDBS_WR_OFFSET - UDBS_DB_OFFSET);
 		src = (void *)&nm_txq->desc[nm_txq->dbidx];
 		while (src != (void *)&nm_txq->desc[nm_txq->dbidx + 1])
 			*dst++ = *src++;
 		wmb();
 		break;
 	}
 
 	case DOORBELL_UDBWC:
 		*nm_txq->udb = htole32(V_QID(nm_txq->udb_qid) | V_PIDX(n));
 		wmb();
 		break;
 
 	case DOORBELL_KDB:
 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
 		    V_QID(nm_txq->cntxt_id) | V_PIDX(n));
 		break;
 	}
 	nm_txq->dbidx = nm_txq->pidx;
 }
 
 int lazy_tx_credit_flush = 1;
 
 /*
  * Write work requests to send 'npkt' frames and ring the doorbell to send them
  * on their way.  No need to check for wraparound.
  */
 static void
 cxgbe_nm_tx(struct adapter *sc, struct sge_nm_txq *nm_txq,
     struct netmap_kring *kring, int npkt, int npkt_remaining, int txcsum)
 {
 	struct netmap_ring *ring = kring->ring;
 	struct netmap_slot *slot;
 	const u_int lim = kring->nkr_num_slots - 1;
 	struct fw_eth_tx_pkts_wr *wr = (void *)&nm_txq->desc[nm_txq->pidx];
 	uint16_t len;
 	uint64_t ba;
 	struct cpl_tx_pkt_core *cpl;
 	struct ulptx_sgl *usgl;
 	int i, n;
 
 	while (npkt) {
 		n = min(npkt, MAX_NPKT_IN_TYPE1_WR);
 		len = 0;
 
 		wr = (void *)&nm_txq->desc[nm_txq->pidx];
 		wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
 		wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(npkt_to_len16(n)));
 		wr->npkt = n;
 		wr->r3 = 0;
 		wr->type = 1;
 		cpl = (void *)(wr + 1);
 
 		for (i = 0; i < n; i++) {
 			slot = &ring->slot[kring->nr_hwcur];
 			PNMB(kring->na, slot, &ba);
 			MPASS(ba != 0);
 
 			cpl->ctrl0 = nm_txq->cpl_ctrl0;
 			cpl->pack = 0;
 			cpl->len = htobe16(slot->len);
 			/*
 			 * netmap(4) says "netmap does not use features such as
 			 * checksum offloading, TCP segmentation offloading,
 			 * encryption, VLAN encapsulation/decapsulation, etc."
 			 *
 			 * So the ncxl interfaces have tx hardware checksumming
 			 * disabled by default.  But you can override netmap by
 			 * enabling IFCAP_TXCSUM on the interface manully.
 			 */
 			cpl->ctrl1 = txcsum ? 0 :
 			    htobe64(F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS);
 
 			usgl = (void *)(cpl + 1);
 			usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 			    V_ULPTX_NSGE(1));
 			usgl->len0 = htobe32(slot->len);
 			usgl->addr0 = htobe64(ba);
 
 			slot->flags &= ~(NS_REPORT | NS_BUF_CHANGED);
 			cpl = (void *)(usgl + 1);
 			MPASS(slot->len + len <= UINT16_MAX);
 			len += slot->len;
 			kring->nr_hwcur = nm_next(kring->nr_hwcur, lim);
 		}
 		wr->plen = htobe16(len);
 
 		npkt -= n;
 		nm_txq->pidx += npkt_to_ndesc(n);
 		MPASS(nm_txq->pidx <= nm_txq->sidx);
 		if (__predict_false(nm_txq->pidx == nm_txq->sidx)) {
 			/*
 			 * This routine doesn't know how to write WRs that wrap
 			 * around.  Make sure it wasn't asked to.
 			 */
 			MPASS(npkt == 0);
 			nm_txq->pidx = 0;
 		}
 
 		if (npkt == 0 && npkt_remaining == 0) {
 			/* All done. */
 			if (lazy_tx_credit_flush == 0) {
 				wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ |
 				    F_FW_WR_EQUIQ);
 				nm_txq->equeqidx = nm_txq->pidx;
 				nm_txq->equiqidx = nm_txq->pidx;
 			}
 			ring_nm_txq_db(sc, nm_txq);
 			return;
 		}
 
 		if (NMIDXDIFF(nm_txq, equiqidx) >= nm_txq->sidx / 2) {
 			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ |
 			    F_FW_WR_EQUIQ);
 			nm_txq->equeqidx = nm_txq->pidx;
 			nm_txq->equiqidx = nm_txq->pidx;
 		} else if (NMIDXDIFF(nm_txq, equeqidx) >= 64) {
 			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
 			nm_txq->equeqidx = nm_txq->pidx;
 		}
 		if (NMIDXDIFF(nm_txq, dbidx) >= 2 * SGE_MAX_WR_NDESC)
 			ring_nm_txq_db(sc, nm_txq);
 	}
 
 	/* Will get called again. */
 	MPASS(npkt_remaining);
 }
 
 /* How many contiguous free descriptors starting at pidx */
 static inline int
 contiguous_ndesc_available(struct sge_nm_txq *nm_txq)
 {
 
 	if (nm_txq->cidx > nm_txq->pidx)
 		return (nm_txq->cidx - nm_txq->pidx - 1);
 	else if (nm_txq->cidx > 0)
 		return (nm_txq->sidx - nm_txq->pidx);
 	else
 		return (nm_txq->sidx - nm_txq->pidx - 1);
 }
 
 static int
 reclaim_nm_tx_desc(struct sge_nm_txq *nm_txq)
 {
 	struct sge_qstat *spg = (void *)&nm_txq->desc[nm_txq->sidx];
 	uint16_t hw_cidx = spg->cidx;	/* snapshot */
 	struct fw_eth_tx_pkts_wr *wr;
 	int n = 0;
 
 	hw_cidx = be16toh(hw_cidx);
 
 	while (nm_txq->cidx != hw_cidx) {
 		wr = (void *)&nm_txq->desc[nm_txq->cidx];
 
 		MPASS(wr->op_pkd == htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR)));
 		MPASS(wr->type == 1);
 		MPASS(wr->npkt > 0 && wr->npkt <= MAX_NPKT_IN_TYPE1_WR);
 
 		n += wr->npkt;
 		nm_txq->cidx += npkt_to_ndesc(wr->npkt);
 
 		/*
 		 * We never sent a WR that wrapped around so the credits coming
 		 * back, WR by WR, should never cause the cidx to wrap around
 		 * either.
 		 */
 		MPASS(nm_txq->cidx <= nm_txq->sidx);
 		if (__predict_false(nm_txq->cidx == nm_txq->sidx))
 			nm_txq->cidx = 0;
 	}
 
 	return (n);
 }
 
 static int
 cxgbe_netmap_txsync(struct netmap_kring *kring, int flags)
 {
 	struct netmap_adapter *na = kring->na;
 	struct ifnet *ifp = na->ifp;
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_nm_txq *nm_txq = &sc->sge.nm_txq[vi->first_nm_txq + kring->ring_id];
 	const u_int head = kring->rhead;
 	u_int reclaimed = 0;
 	int n, d, npkt_remaining, ndesc_remaining, txcsum;
 
 	/*
 	 * Tx was at kring->nr_hwcur last time around and now we need to advance
 	 * to kring->rhead.  Note that the driver's pidx moves independent of
 	 * netmap's kring->nr_hwcur (pidx counts descriptors and the relation
 	 * between descriptors and frames isn't 1:1).
 	 */
 
 	npkt_remaining = head >= kring->nr_hwcur ? head - kring->nr_hwcur :
 	    kring->nkr_num_slots - kring->nr_hwcur + head;
 	txcsum = ifp->if_capenable & (IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6);
 	while (npkt_remaining) {
 		reclaimed += reclaim_nm_tx_desc(nm_txq);
 		ndesc_remaining = contiguous_ndesc_available(nm_txq);
 		/* Can't run out of descriptors with packets still remaining */
 		MPASS(ndesc_remaining > 0);
 
 		/* # of desc needed to tx all remaining packets */
 		d = (npkt_remaining / MAX_NPKT_IN_TYPE1_WR) * SGE_MAX_WR_NDESC;
 		if (npkt_remaining % MAX_NPKT_IN_TYPE1_WR)
 			d += npkt_to_ndesc(npkt_remaining % MAX_NPKT_IN_TYPE1_WR);
 
 		if (d <= ndesc_remaining)
 			n = npkt_remaining;
 		else {
 			/* Can't send all, calculate how many can be sent */
 			n = (ndesc_remaining / SGE_MAX_WR_NDESC) *
 			    MAX_NPKT_IN_TYPE1_WR;
 			if (ndesc_remaining % SGE_MAX_WR_NDESC)
 				n += ndesc_to_npkt(ndesc_remaining % SGE_MAX_WR_NDESC);
 		}
 
 		/* Send n packets and update nm_txq->pidx and kring->nr_hwcur */
 		npkt_remaining -= n;
 		cxgbe_nm_tx(sc, nm_txq, kring, n, npkt_remaining, txcsum);
 	}
 	MPASS(npkt_remaining == 0);
 	MPASS(kring->nr_hwcur == head);
 	MPASS(nm_txq->dbidx == nm_txq->pidx);
 
 	/*
 	 * Second part: reclaim buffers for completed transmissions.
 	 */
 	if (reclaimed || flags & NAF_FORCE_RECLAIM || nm_kr_txempty(kring)) {
 		reclaimed += reclaim_nm_tx_desc(nm_txq);
 		kring->nr_hwtail += reclaimed;
 		if (kring->nr_hwtail >= kring->nkr_num_slots)
 			kring->nr_hwtail -= kring->nkr_num_slots;
 	}
 
 	return (0);
 }
 
 static int
 cxgbe_netmap_rxsync(struct netmap_kring *kring, int flags)
 {
 	struct netmap_adapter *na = kring->na;
 	struct netmap_ring *ring = kring->ring;
 	struct ifnet *ifp = na->ifp;
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_nm_rxq *nm_rxq = &sc->sge.nm_rxq[vi->first_nm_rxq + kring->ring_id];
 	u_int const head = kring->rhead;
 	u_int n;
 	int force_update = (flags & NAF_FORCE_READ) || kring->nr_kflags & NKR_PENDINTR;
 
 	if (black_hole)
 		return (0);	/* No updates ever. */
 
 	if (netmap_no_pendintr || force_update) {
 		kring->nr_hwtail = atomic_load_acq_32(&nm_rxq->fl_cidx);
 		kring->nr_kflags &= ~NKR_PENDINTR;
 	}
 
 	/* Userspace done with buffers from kring->nr_hwcur to head */
 	n = head >= kring->nr_hwcur ? head - kring->nr_hwcur :
 	    kring->nkr_num_slots - kring->nr_hwcur + head;
 	n &= ~7U;
 	if (n > 0) {
 		u_int fl_pidx = nm_rxq->fl_pidx;
 		struct netmap_slot *slot = &ring->slot[fl_pidx];
 		uint64_t ba;
 		int i, dbinc = 0, hwidx = nm_rxq->fl_hwidx;
 
 		/*
 		 * We always deal with 8 buffers at a time.  We must have
 		 * stopped at an 8B boundary (fl_pidx) last time around and we
 		 * must have a multiple of 8B buffers to give to the freelist.
 		 */
 		MPASS((fl_pidx & 7) == 0);
 		MPASS((n & 7) == 0);
 
 		IDXINCR(kring->nr_hwcur, n, kring->nkr_num_slots);
 		IDXINCR(nm_rxq->fl_pidx, n, nm_rxq->fl_sidx);
 
 		while (n > 0) {
 			for (i = 0; i < 8; i++, fl_pidx++, slot++) {
 				PNMB(na, slot, &ba);
 				MPASS(ba != 0);
 				nm_rxq->fl_desc[fl_pidx] = htobe64(ba | hwidx);
 				slot->flags &= ~NS_BUF_CHANGED;
 				MPASS(fl_pidx <= nm_rxq->fl_sidx);
 			}
 			n -= 8;
 			if (fl_pidx == nm_rxq->fl_sidx) {
 				fl_pidx = 0;
 				slot = &ring->slot[0];
 			}
 			if (++dbinc == 8 && n >= 32) {
 				wmb();
 				t4_write_reg(sc, sc->sge_kdoorbell_reg,
 				    nm_rxq->fl_db_val | V_PIDX(dbinc));
 				dbinc = 0;
 			}
 		}
 		MPASS(nm_rxq->fl_pidx == fl_pidx);
 
 		if (dbinc > 0) {
 			wmb();
 			t4_write_reg(sc, sc->sge_kdoorbell_reg,
 			    nm_rxq->fl_db_val | V_PIDX(dbinc));
 		}
 	}
 
 	return (0);
 }
 
 void
 cxgbe_nm_attach(struct vi_info *vi)
 {
 	struct port_info *pi;
 	struct adapter *sc;
 	struct netmap_adapter na;
 
 	MPASS(vi->nnmrxq > 0);
 	MPASS(vi->ifp != NULL);
 
 	pi = vi->pi;
 	sc = pi->adapter;
 
 	bzero(&na, sizeof(na));
 
 	na.ifp = vi->ifp;
 	na.na_flags = NAF_BDG_MAYSLEEP;
 
 	/* Netmap doesn't know about the space reserved for the status page. */
 	na.num_tx_desc = vi->qsize_txq - sc->params.sge.spg_len / EQ_ESIZE;
 
 	/*
 	 * The freelist's cidx/pidx drives netmap's rx cidx/pidx.  So
 	 * num_rx_desc is based on the number of buffers that can be held in the
 	 * freelist, and not the number of entries in the iq.  (These two are
 	 * not exactly the same due to the space taken up by the status page).
 	 */
 	na.num_rx_desc = rounddown(vi->qsize_rxq, 8);
 	na.nm_txsync = cxgbe_netmap_txsync;
 	na.nm_rxsync = cxgbe_netmap_rxsync;
 	na.nm_register = cxgbe_netmap_reg;
 	na.num_tx_rings = vi->nnmtxq;
 	na.num_rx_rings = vi->nnmrxq;
 	netmap_attach(&na);	/* This adds IFCAP_NETMAP to if_capabilities */
 }
 
 void
 cxgbe_nm_detach(struct vi_info *vi)
 {
 
 	MPASS(vi->nnmrxq > 0);
 	MPASS(vi->ifp != NULL);
 
 	netmap_detach(vi->ifp);
 }
 
+static inline const void *
+unwrap_nm_fw6_msg(const struct cpl_fw6_msg *cpl)
+{
+
+	MPASS(cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL);
+
+	/* data[0] is RSS header */
+	return (&cpl->data[1]);
+}
+
 static void
-handle_nm_fw6_msg(struct adapter *sc, struct ifnet *ifp,
-    const struct cpl_fw6_msg *cpl)
+handle_nm_sge_egr_update(struct adapter *sc, struct ifnet *ifp,
+    const struct cpl_sge_egr_update *egr)
 {
-	const struct cpl_sge_egr_update *egr;
 	uint32_t oq;
 	struct sge_nm_txq *nm_txq;
 
-	if (cpl->type != FW_TYPE_RSSCPL && cpl->type != FW6_TYPE_RSSCPL)
-		panic("%s: FW_TYPE 0x%x on nm_rxq.", __func__, cpl->type);
-
-	/* data[0] is RSS header */
-	egr = (const void *)&cpl->data[1];
 	oq = be32toh(egr->opcode_qid);
 	MPASS(G_CPL_OPCODE(oq) == CPL_SGE_EGR_UPDATE);
 	nm_txq = (void *)sc->sge.eqmap[G_EGR_QID(oq) - sc->sge.eq_start];
 
 	netmap_tx_irq(ifp, nm_txq->nid);
 }
 
 void
 t4_nm_intr(void *arg)
 {
 	struct sge_nm_rxq *nm_rxq = arg;
 	struct vi_info *vi = nm_rxq->vi;
 	struct adapter *sc = vi->pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	struct netmap_adapter *na = NA(ifp);
 	struct netmap_kring *kring = &na->rx_rings[nm_rxq->nid];
 	struct netmap_ring *ring = kring->ring;
 	struct iq_desc *d = &nm_rxq->iq_desc[nm_rxq->iq_cidx];
+	const void *cpl;
 	uint32_t lq;
 	u_int n = 0, work = 0;
 	uint8_t opcode;
 	uint32_t fl_cidx = atomic_load_acq_32(&nm_rxq->fl_cidx);
 	u_int fl_credits = fl_cidx & 7;
 
 	while ((d->rsp.u.type_gen & F_RSPD_GEN) == nm_rxq->iq_gen) {
 
 		rmb();
 
 		lq = be32toh(d->rsp.pldbuflen_qid);
 		opcode = d->rss.opcode;
+		cpl = &d->cpl[0];
 
 		switch (G_RSPD_TYPE(d->rsp.u.type_gen)) {
 		case X_RSPD_TYPE_FLBUF:
 			if (black_hole != 2) {
 				/* No buffer packing so new buf every time */
 				MPASS(lq & F_RSPD_NEWBUF);
 			}
 
 			/* fall through */
 
 		case X_RSPD_TYPE_CPL:
 			MPASS(opcode < NUM_CPL_CMDS);
 
 			switch (opcode) {
 			case CPL_FW4_MSG:
 			case CPL_FW6_MSG:
-				handle_nm_fw6_msg(sc, ifp,
-				    (const void *)&d->cpl[0]);
+				cpl = unwrap_nm_fw6_msg(cpl);
+				/* fall through */
+			case CPL_SGE_EGR_UPDATE:
+				handle_nm_sge_egr_update(sc, ifp, cpl);
 				break;
 			case CPL_RX_PKT:
 				ring->slot[fl_cidx].len = G_RSPD_LEN(lq) -
 				    sc->params.sge.fl_pktshift;
 				ring->slot[fl_cidx].flags = kring->nkr_slot_flags;
 				fl_cidx += (lq & F_RSPD_NEWBUF) ? 1 : 0;
 				fl_credits += (lq & F_RSPD_NEWBUF) ? 1 : 0;
 				if (__predict_false(fl_cidx == nm_rxq->fl_sidx))
 					fl_cidx = 0;
 				break;
 			default:
 				panic("%s: unexpected opcode 0x%x on nm_rxq %p",
 				    __func__, opcode, nm_rxq);
 			}
 			break;
 
 		case X_RSPD_TYPE_INTR:
 			/* Not equipped to handle forwarded interrupts. */
 			panic("%s: netmap queue received interrupt for iq %u\n",
 			    __func__, lq);
 
 		default:
 			panic("%s: illegal response type %d on nm_rxq %p",
 			    __func__, G_RSPD_TYPE(d->rsp.u.type_gen), nm_rxq);
 		}
 
 		d++;
 		if (__predict_false(++nm_rxq->iq_cidx == nm_rxq->iq_sidx)) {
 			nm_rxq->iq_cidx = 0;
 			d = &nm_rxq->iq_desc[0];
 			nm_rxq->iq_gen ^= F_RSPD_GEN;
 		}
 
 		if (__predict_false(++n == rx_ndesc)) {
 			atomic_store_rel_32(&nm_rxq->fl_cidx, fl_cidx);
 			if (black_hole && fl_credits >= 8) {
 				fl_credits /= 8;
 				IDXINCR(nm_rxq->fl_pidx, fl_credits * 8,
 				    nm_rxq->fl_sidx);
 				t4_write_reg(sc, sc->sge_kdoorbell_reg,
 				    nm_rxq->fl_db_val | V_PIDX(fl_credits));
 				fl_credits = fl_cidx & 7;
 			} else if (!black_hole) {
 				netmap_rx_irq(ifp, nm_rxq->nid, &work);
 				MPASS(work != 0);
 			}
 			t4_write_reg(sc, sc->sge_gts_reg,
 			    V_CIDXINC(n) | V_INGRESSQID(nm_rxq->iq_cntxt_id) |
 			    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 			n = 0;
 		}
 	}
 
 	atomic_store_rel_32(&nm_rxq->fl_cidx, fl_cidx);
 	if (black_hole) {
 		fl_credits /= 8;
 		IDXINCR(nm_rxq->fl_pidx, fl_credits * 8, nm_rxq->fl_sidx);
 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
 		    nm_rxq->fl_db_val | V_PIDX(fl_credits));
 	} else
 		netmap_rx_irq(ifp, nm_rxq->nid, &work);
 
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(n) |
 	    V_INGRESSQID((u32)nm_rxq->iq_cntxt_id) |
 	    V_SEINTARM(V_QINTR_TIMER_IDX(holdoff_tmr_idx)));
 }
 #endif
Index: user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_sge.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_sge.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/dev/cxgbe/t4_sge.c	(revision 306283)
@@ -1,5227 +1,5259 @@
 /*-
  * Copyright (c) 2011 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/types.h>
 #include <sys/eventhandler.h>
 #include <sys/mbuf.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/time.h>
 #include <sys/sglist.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #include <sys/counter.h>
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_vlan_var.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/tcp.h>
 #include <machine/in_cksum.h>
 #include <machine/md_var.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #ifdef DEV_NETMAP
 #include <machine/bus.h>
 #include <sys/selinfo.h>
 #include <net/if_var.h>
 #include <net/netmap.h>
 #include <dev/netmap/netmap_kern.h>
 #endif
 
 #include "common/common.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_msg.h"
 #include "t4_l2t.h"
 #include "t4_mp_ring.h"
 
 #ifdef T4_PKT_TIMESTAMP
 #define RX_COPY_THRESHOLD (MINCLSIZE - 8)
 #else
 #define RX_COPY_THRESHOLD MINCLSIZE
 #endif
 
 /*
  * Ethernet frames are DMA'd at this byte offset into the freelist buffer.
  * 0-7 are valid values.
  */
 static int fl_pktshift = 2;
 TUNABLE_INT("hw.cxgbe.fl_pktshift", &fl_pktshift);
 
 /*
  * Pad ethernet payload up to this boundary.
  * -1: driver should figure out a good value.
  *  0: disable padding.
  *  Any power of 2 from 32 to 4096 (both inclusive) is also a valid value.
  */
 int fl_pad = -1;
 TUNABLE_INT("hw.cxgbe.fl_pad", &fl_pad);
 
 /*
  * Status page length.
  * -1: driver should figure out a good value.
  *  64 or 128 are the only other valid values.
  */
 static int spg_len = -1;
 TUNABLE_INT("hw.cxgbe.spg_len", &spg_len);
 
 /*
  * Congestion drops.
  * -1: no congestion feedback (not recommended).
  *  0: backpressure the channel instead of dropping packets right away.
  *  1: no backpressure, drop packets for the congested queue immediately.
  */
 static int cong_drop = 0;
 TUNABLE_INT("hw.cxgbe.cong_drop", &cong_drop);
 
 /*
  * Deliver multiple frames in the same free list buffer if they fit.
  * -1: let the driver decide whether to enable buffer packing or not.
  *  0: disable buffer packing.
  *  1: enable buffer packing.
  */
 static int buffer_packing = -1;
 TUNABLE_INT("hw.cxgbe.buffer_packing", &buffer_packing);
 
 /*
  * Start next frame in a packed buffer at this boundary.
  * -1: driver should figure out a good value.
  * T4: driver will ignore this and use the same value as fl_pad above.
  * T5: 16, or a power of 2 from 64 to 4096 (both inclusive) is a valid value.
  */
 static int fl_pack = -1;
 TUNABLE_INT("hw.cxgbe.fl_pack", &fl_pack);
 
 /*
  * Allow the driver to create mbuf(s) in a cluster allocated for rx.
  * 0: never; always allocate mbufs from the zone_mbuf UMA zone.
  * 1: ok to create mbuf(s) within a cluster if there is room.
  */
 static int allow_mbufs_in_cluster = 1;
 TUNABLE_INT("hw.cxgbe.allow_mbufs_in_cluster", &allow_mbufs_in_cluster);
 
 /*
  * Largest rx cluster size that the driver is allowed to allocate.
  */
 static int largest_rx_cluster = MJUM16BYTES;
 TUNABLE_INT("hw.cxgbe.largest_rx_cluster", &largest_rx_cluster);
 
 /*
  * Size of cluster allocation that's most likely to succeed.  The driver will
  * fall back to this size if it fails to allocate clusters larger than this.
  */
 static int safest_rx_cluster = PAGE_SIZE;
 TUNABLE_INT("hw.cxgbe.safest_rx_cluster", &safest_rx_cluster);
 
 struct txpkts {
 	u_int wr_type;		/* type 0 or type 1 */
 	u_int npkt;		/* # of packets in this work request */
 	u_int plen;		/* total payload (sum of all packets) */
 	u_int len16;		/* # of 16B pieces used by this work request */
 };
 
 /* A packet's SGL.  This + m_pkthdr has all info needed for tx */
 struct sgl {
 	struct sglist sg;
 	struct sglist_seg seg[TX_SGL_SEGS];
 };
 
 static int service_iq(struct sge_iq *, int);
 static struct mbuf *get_fl_payload(struct adapter *, struct sge_fl *, uint32_t);
 static int t4_eth_rx(struct sge_iq *, const struct rss_header *, struct mbuf *);
 static inline void init_iq(struct sge_iq *, struct adapter *, int, int, int);
 static inline void init_fl(struct adapter *, struct sge_fl *, int, int, char *);
 static inline void init_eq(struct adapter *, struct sge_eq *, int, int, uint8_t,
     uint16_t, char *);
 static int alloc_ring(struct adapter *, size_t, bus_dma_tag_t *, bus_dmamap_t *,
     bus_addr_t *, void **);
 static int free_ring(struct adapter *, bus_dma_tag_t, bus_dmamap_t, bus_addr_t,
     void *);
 static int alloc_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *,
     int, int);
 static int free_iq_fl(struct vi_info *, struct sge_iq *, struct sge_fl *);
-static void add_fl_sysctls(struct sysctl_ctx_list *, struct sysctl_oid *,
-    struct sge_fl *);
+static void add_fl_sysctls(struct adapter *, struct sysctl_ctx_list *,
+    struct sysctl_oid *, struct sge_fl *);
 static int alloc_fwq(struct adapter *);
 static int free_fwq(struct adapter *);
 static int alloc_mgmtq(struct adapter *);
 static int free_mgmtq(struct adapter *);
 static int alloc_rxq(struct vi_info *, struct sge_rxq *, int, int,
     struct sysctl_oid *);
 static int free_rxq(struct vi_info *, struct sge_rxq *);
 #ifdef TCP_OFFLOAD
 static int alloc_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *, int, int,
     struct sysctl_oid *);
 static int free_ofld_rxq(struct vi_info *, struct sge_ofld_rxq *);
 #endif
 #ifdef DEV_NETMAP
 static int alloc_nm_rxq(struct vi_info *, struct sge_nm_rxq *, int, int,
     struct sysctl_oid *);
 static int free_nm_rxq(struct vi_info *, struct sge_nm_rxq *);
 static int alloc_nm_txq(struct vi_info *, struct sge_nm_txq *, int, int,
     struct sysctl_oid *);
 static int free_nm_txq(struct vi_info *, struct sge_nm_txq *);
 #endif
 static int ctrl_eq_alloc(struct adapter *, struct sge_eq *);
 static int eth_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #ifdef TCP_OFFLOAD
 static int ofld_eq_alloc(struct adapter *, struct vi_info *, struct sge_eq *);
 #endif
 static int alloc_eq(struct adapter *, struct vi_info *, struct sge_eq *);
 static int free_eq(struct adapter *, struct sge_eq *);
 static int alloc_wrq(struct adapter *, struct vi_info *, struct sge_wrq *,
     struct sysctl_oid *);
 static int free_wrq(struct adapter *, struct sge_wrq *);
 static int alloc_txq(struct vi_info *, struct sge_txq *, int,
     struct sysctl_oid *);
 static int free_txq(struct vi_info *, struct sge_txq *);
 static void oneseg_dma_callback(void *, bus_dma_segment_t *, int, int);
 static inline void ring_fl_db(struct adapter *, struct sge_fl *);
 static int refill_fl(struct adapter *, struct sge_fl *, int);
 static void refill_sfl(void *);
 static int alloc_fl_sdesc(struct sge_fl *);
 static void free_fl_sdesc(struct adapter *, struct sge_fl *);
 static void find_best_refill_source(struct adapter *, struct sge_fl *, int);
 static void find_safe_refill_source(struct adapter *, struct sge_fl *);
 static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
 
 static inline void get_pkt_gl(struct mbuf *, struct sglist *);
 static inline u_int txpkt_len16(u_int, u_int);
 static inline u_int txpkt_vm_len16(u_int, u_int);
 static inline u_int txpkts0_len16(u_int);
 static inline u_int txpkts1_len16(void);
 static u_int write_txpkt_wr(struct sge_txq *, struct fw_eth_tx_pkt_wr *,
     struct mbuf *, u_int);
 static u_int write_txpkt_vm_wr(struct adapter *, struct sge_txq *,
     struct fw_eth_tx_pkt_vm_wr *, struct mbuf *, u_int);
 static int try_txpkts(struct mbuf *, struct mbuf *, struct txpkts *, u_int);
 static int add_to_txpkts(struct mbuf *, struct txpkts *, u_int);
 static u_int write_txpkts_wr(struct sge_txq *, struct fw_eth_tx_pkts_wr *,
     struct mbuf *, const struct txpkts *, u_int);
 static void write_gl_to_txd(struct sge_txq *, struct mbuf *, caddr_t *, int);
 static inline void copy_to_txd(struct sge_eq *, caddr_t, caddr_t *, int);
 static inline void ring_eq_db(struct adapter *, struct sge_eq *, u_int);
 static inline uint16_t read_hw_cidx(struct sge_eq *);
 static inline u_int reclaimable_tx_desc(struct sge_eq *);
 static inline u_int total_available_tx_desc(struct sge_eq *);
 static u_int reclaim_tx_descs(struct sge_txq *, u_int);
 static void tx_reclaim(void *, int);
 static __be64 get_flit(struct sglist_seg *, int, int);
 static int handle_sge_egr_update(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int handle_fw_msg(struct sge_iq *, const struct rss_header *,
     struct mbuf *);
 static int t4_handle_wrerr_rpl(struct adapter *, const __be64 *);
 static void wrq_tx_drain(void *, int);
 static void drain_wrq_wr_list(struct adapter *, struct sge_wrq *);
 
 static int sysctl_uint16(SYSCTL_HANDLER_ARGS);
 static int sysctl_bufsizes(SYSCTL_HANDLER_ARGS);
 static int sysctl_tc(SYSCTL_HANDLER_ARGS);
 
 static counter_u64_t extfree_refs;
 static counter_u64_t extfree_rels;
 
 an_handler_t t4_an_handler;
 fw_msg_handler_t t4_fw_msg_handler[NUM_FW6_TYPES];
 cpl_handler_t t4_cpl_handler[NUM_CPL_CMDS];
 
 
 static int
 an_not_handled(struct sge_iq *iq, const struct rsp_ctrl *ctrl)
 {
 
 #ifdef INVARIANTS
 	panic("%s: async notification on iq %p (ctrl %p)", __func__, iq, ctrl);
 #else
 	log(LOG_ERR, "%s: async notification on iq %p (ctrl %p)\n",
 	    __func__, iq, ctrl);
 #endif
 	return (EDOOFUS);
 }
 
 int
 t4_register_an_handler(an_handler_t h)
 {
 	uintptr_t *loc, new;
 
 	new = h ? (uintptr_t)h : (uintptr_t)an_not_handled;
 	loc = (uintptr_t *) &t4_an_handler;
 	atomic_store_rel_ptr(loc, new);
 
 	return (0);
 }
 
 static int
 fw_msg_not_handled(struct adapter *sc, const __be64 *rpl)
 {
 	const struct cpl_fw6_msg *cpl =
 	    __containerof(rpl, struct cpl_fw6_msg, data[0]);
 
 #ifdef INVARIANTS
 	panic("%s: fw_msg type %d", __func__, cpl->type);
 #else
 	log(LOG_ERR, "%s: fw_msg type %d\n", __func__, cpl->type);
 #endif
 	return (EDOOFUS);
 }
 
 int
 t4_register_fw_msg_handler(int type, fw_msg_handler_t h)
 {
 	uintptr_t *loc, new;
 
 	if (type >= nitems(t4_fw_msg_handler))
 		return (EINVAL);
 
 	/*
 	 * These are dispatched by the handler for FW{4|6}_CPL_MSG using the CPL
 	 * handler dispatch table.  Reject any attempt to install a handler for
 	 * this subtype.
 	 */
 	if (type == FW_TYPE_RSSCPL || type == FW6_TYPE_RSSCPL)
 		return (EINVAL);
 
 	new = h ? (uintptr_t)h : (uintptr_t)fw_msg_not_handled;
 	loc = (uintptr_t *) &t4_fw_msg_handler[type];
 	atomic_store_rel_ptr(loc, new);
 
 	return (0);
 }
 
 static int
 cpl_not_handled(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 
 #ifdef INVARIANTS
 	panic("%s: opcode 0x%02x on iq %p with payload %p",
 	    __func__, rss->opcode, iq, m);
 #else
 	log(LOG_ERR, "%s: opcode 0x%02x on iq %p with payload %p\n",
 	    __func__, rss->opcode, iq, m);
 	m_freem(m);
 #endif
 	return (EDOOFUS);
 }
 
 int
 t4_register_cpl_handler(int opcode, cpl_handler_t h)
 {
 	uintptr_t *loc, new;
 
 	if (opcode >= nitems(t4_cpl_handler))
 		return (EINVAL);
 
 	new = h ? (uintptr_t)h : (uintptr_t)cpl_not_handled;
 	loc = (uintptr_t *) &t4_cpl_handler[opcode];
 	atomic_store_rel_ptr(loc, new);
 
 	return (0);
 }
 
 /*
  * Called on MOD_LOAD.  Validates and calculates the SGE tunables.
  */
 void
 t4_sge_modload(void)
 {
 	int i;
 
 	if (fl_pktshift < 0 || fl_pktshift > 7) {
 		printf("Invalid hw.cxgbe.fl_pktshift value (%d),"
 		    " using 2 instead.\n", fl_pktshift);
 		fl_pktshift = 2;
 	}
 
 	if (spg_len != 64 && spg_len != 128) {
 		int len;
 
 #if defined(__i386__) || defined(__amd64__)
 		len = cpu_clflush_line_size > 64 ? 128 : 64;
 #else
 		len = 64;
 #endif
 		if (spg_len != -1) {
 			printf("Invalid hw.cxgbe.spg_len value (%d),"
 			    " using %d instead.\n", spg_len, len);
 		}
 		spg_len = len;
 	}
 
 	if (cong_drop < -1 || cong_drop > 1) {
 		printf("Invalid hw.cxgbe.cong_drop value (%d),"
 		    " using 0 instead.\n", cong_drop);
 		cong_drop = 0;
 	}
 
 	extfree_refs = counter_u64_alloc(M_WAITOK);
 	extfree_rels = counter_u64_alloc(M_WAITOK);
 	counter_u64_zero(extfree_refs);
 	counter_u64_zero(extfree_rels);
 
 	t4_an_handler = an_not_handled;
 	for (i = 0; i < nitems(t4_fw_msg_handler); i++)
 		t4_fw_msg_handler[i] = fw_msg_not_handled;
 	for (i = 0; i < nitems(t4_cpl_handler); i++)
 		t4_cpl_handler[i] = cpl_not_handled;
 
 	t4_register_cpl_handler(CPL_FW4_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_FW6_MSG, handle_fw_msg);
 	t4_register_cpl_handler(CPL_SGE_EGR_UPDATE, handle_sge_egr_update);
 	t4_register_cpl_handler(CPL_RX_PKT, t4_eth_rx);
 	t4_register_fw_msg_handler(FW6_TYPE_CMD_RPL, t4_handle_fw_rpl);
 	t4_register_fw_msg_handler(FW6_TYPE_WRERR_RPL, t4_handle_wrerr_rpl);
 }
 
 void
 t4_sge_modunload(void)
 {
 
 	counter_u64_free(extfree_refs);
 	counter_u64_free(extfree_rels);
 }
 
 uint64_t
 t4_sge_extfree_refs(void)
 {
 	uint64_t refs, rels;
 
 	rels = counter_u64_fetch(extfree_rels);
 	refs = counter_u64_fetch(extfree_refs);
 
 	return (refs - rels);
 }
 
 static inline void
 setup_pad_and_pack_boundaries(struct adapter *sc)
 {
 	uint32_t v, m;
 	int pad, pack, pad_shift;
 
 	pad_shift = chip_id(sc) > CHELSIO_T5 ? X_T6_INGPADBOUNDARY_SHIFT :
 	    X_INGPADBOUNDARY_SHIFT;
 	pad = fl_pad;
 	if (fl_pad < (1 << pad_shift) ||
 	    fl_pad > (1 << (pad_shift + M_INGPADBOUNDARY)) ||
 	    !powerof2(fl_pad)) {
 		/*
 		 * If there is any chance that we might use buffer packing and
 		 * the chip is a T4, then pick 64 as the pad/pack boundary.  Set
 		 * it to the minimum allowed in all other cases.
 		 */
 		pad = is_t4(sc) && buffer_packing ? 64 : 1 << pad_shift;
 
 		/*
 		 * For fl_pad = 0 we'll still write a reasonable value to the
 		 * register but all the freelists will opt out of padding.
 		 * We'll complain here only if the user tried to set it to a
 		 * value greater than 0 that was invalid.
 		 */
 		if (fl_pad > 0) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pad value"
 			    " (%d), using %d instead.\n", fl_pad, pad);
 		}
 	}
 	m = V_INGPADBOUNDARY(M_INGPADBOUNDARY);
 	v = V_INGPADBOUNDARY(ilog2(pad) - pad_shift);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	if (is_t4(sc)) {
 		if (fl_pack != -1 && fl_pack != pad) {
 			/* Complain but carry on. */
 			device_printf(sc->dev, "hw.cxgbe.fl_pack (%d) ignored,"
 			    " using %d instead.\n", fl_pack, pad);
 		}
 		return;
 	}
 
 	pack = fl_pack;
 	if (fl_pack < 16 || fl_pack == 32 || fl_pack > 4096 ||
 	    !powerof2(fl_pack)) {
 		pack = max(sc->params.pci.mps, CACHE_LINE_SIZE);
 		MPASS(powerof2(pack));
 		if (pack < 16)
 			pack = 16;
 		if (pack == 32)
 			pack = 64;
 		if (pack > 4096)
 			pack = 4096;
 		if (fl_pack != -1) {
 			device_printf(sc->dev, "Invalid hw.cxgbe.fl_pack value"
 			    " (%d), using %d instead.\n", fl_pack, pack);
 		}
 	}
 	m = V_INGPACKBOUNDARY(M_INGPACKBOUNDARY);
 	if (pack == 16)
 		v = V_INGPACKBOUNDARY(0);
 	else
 		v = V_INGPACKBOUNDARY(ilog2(pack) - 5);
 
 	MPASS(!is_t4(sc));	/* T4 doesn't have SGE_CONTROL2 */
 	t4_set_reg_field(sc, A_SGE_CONTROL2, m, v);
 }
 
 /*
  * adap->params.vpd.cclk must be set up before this is called.
  */
 void
 t4_tweak_chip_settings(struct adapter *sc)
 {
 	int i;
 	uint32_t v, m;
 	int intr_timer[SGE_NTIMERS] = {1, 5, 10, 50, 100, 200};
 	int timer_max = M_TIMERVALUE0 * 1000 / sc->params.vpd.cclk;
 	int intr_pktcount[SGE_NCOUNTERS] = {1, 8, 16, 32}; /* 63 max */
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	static int sge_flbuf_sizes[] = {
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 		MJUMPAGESIZE - CL_METADATA_SIZE,
 		MJUMPAGESIZE - 2 * MSIZE - CL_METADATA_SIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES,
 		MCLBYTES - MSIZE - CL_METADATA_SIZE,
 		MJUM9BYTES - CL_METADATA_SIZE,
 		MJUM16BYTES - CL_METADATA_SIZE,
 	};
 
 	KASSERT(sc->flags & MASTER_PF,
 	    ("%s: trying to change chip settings when not master.", __func__));
 
 	m = V_PKTSHIFT(M_PKTSHIFT) | F_RXPKTCPLMODE | F_EGRSTATUSPAGESIZE;
 	v = V_PKTSHIFT(fl_pktshift) | F_RXPKTCPLMODE |
 	    V_EGRSTATUSPAGESIZE(spg_len == 128);
 	t4_set_reg_field(sc, A_SGE_CONTROL, m, v);
 
 	setup_pad_and_pack_boundaries(sc);
 
 	v = V_HOSTPAGESIZEPF0(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF1(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF2(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF3(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF4(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF5(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF6(PAGE_SHIFT - 10) |
 	    V_HOSTPAGESIZEPF7(PAGE_SHIFT - 10);
 	t4_write_reg(sc, A_SGE_HOST_PAGE_SIZE, v);
 
 	KASSERT(nitems(sge_flbuf_sizes) <= SGE_FLBUF_SIZES,
 	    ("%s: hw buffer size table too big", __func__));
 	for (i = 0; i < min(nitems(sge_flbuf_sizes), SGE_FLBUF_SIZES); i++) {
 		t4_write_reg(sc, A_SGE_FL_BUFFER_SIZE0 + (4 * i),
 		    sge_flbuf_sizes[i]);
 	}
 
 	v = V_THRESHOLD_0(intr_pktcount[0]) | V_THRESHOLD_1(intr_pktcount[1]) |
 	    V_THRESHOLD_2(intr_pktcount[2]) | V_THRESHOLD_3(intr_pktcount[3]);
 	t4_write_reg(sc, A_SGE_INGRESS_RX_THRESHOLD, v);
 
 	KASSERT(intr_timer[0] <= timer_max,
 	    ("%s: not a single usable timer (%d, %d)", __func__, intr_timer[0],
 	    timer_max));
 	for (i = 1; i < nitems(intr_timer); i++) {
 		KASSERT(intr_timer[i] >= intr_timer[i - 1],
 		    ("%s: timers not listed in increasing order (%d)",
 		    __func__, i));
 
 		while (intr_timer[i] > timer_max) {
 			if (i == nitems(intr_timer) - 1) {
 				intr_timer[i] = timer_max;
 				break;
 			}
 			intr_timer[i] += intr_timer[i - 1];
 			intr_timer[i] /= 2;
 		}
 	}
 
 	v = V_TIMERVALUE0(us_to_core_ticks(sc, intr_timer[0])) |
 	    V_TIMERVALUE1(us_to_core_ticks(sc, intr_timer[1]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_0_AND_1, v);
 	v = V_TIMERVALUE2(us_to_core_ticks(sc, intr_timer[2])) |
 	    V_TIMERVALUE3(us_to_core_ticks(sc, intr_timer[3]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_2_AND_3, v);
 	v = V_TIMERVALUE4(us_to_core_ticks(sc, intr_timer[4])) |
 	    V_TIMERVALUE5(us_to_core_ticks(sc, intr_timer[5]));
 	t4_write_reg(sc, A_SGE_TIMER_VALUE_4_AND_5, v);
 
 	/* 4K, 16K, 64K, 256K DDP "page sizes" for TDDP */
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	t4_write_reg(sc, A_ULP_RX_TDDP_PSZ, v);
 
 	/*
 	 * 4K, 8K, 16K, 64K DDP "page sizes" for iSCSI DDP.  These have been
 	 * chosen with MAXPHYS = 128K in mind.  The largest DDP buffer that we
 	 * may have to deal with is MAXPHYS + 1 page.
 	 */
 	v = V_HPZ0(0) | V_HPZ1(1) | V_HPZ2(2) | V_HPZ3(4);
 	t4_write_reg(sc, A_ULP_RX_ISCSI_PSZ, v);
 
 	/* We use multiple DDP page sizes both in plain-TOE and ISCSI modes. */
 	m = v = F_TDDPTAGTCB | F_ISCSITAGTCB;
 	t4_set_reg_field(sc, A_ULP_RX_CTL, m, v);
 
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	t4_set_reg_field(sc, A_TP_PARA_REG5, m, v);
 }
 
 /*
  * SGE wants the buffer to be at least 64B and then a multiple of 16.  If
  * padding is in use, the buffer's start and end need to be aligned to the pad
  * boundary as well.  We'll just make sure that the size is a multiple of the
  * boundary here, it is up to the buffer allocation code to make sure the start
  * of the buffer is aligned as well.
  */
 static inline int
 hwsz_ok(struct adapter *sc, int hwsz)
 {
 	int mask = fl_pad ? sc->params.sge.pad_boundary - 1 : 16 - 1;
 
 	return (hwsz >= 64 && (hwsz & mask) == 0);
 }
 
 /*
  * XXX: driver really should be able to deal with unexpected settings.
  */
 int
 t4_read_chip_settings(struct adapter *sc)
 {
 	struct sge *s = &sc->sge;
 	struct sge_params *sp = &sc->params.sge;
 	int i, j, n, rc = 0;
 	uint32_t m, v, r;
 	uint16_t indsz = min(RX_COPY_THRESHOLD - 1, M_INDICATESIZE);
 	static int sw_buf_sizes[] = {	/* Sorted by size */
 		MCLBYTES,
 #if MJUMPAGESIZE != MCLBYTES
 		MJUMPAGESIZE,
 #endif
 		MJUM9BYTES,
 		MJUM16BYTES
 	};
 	struct sw_zone_info *swz, *safe_swz;
 	struct hw_buf_info *hwb;
 
 	m = F_RXPKTCPLMODE;
 	v = F_RXPKTCPLMODE;
 	r = sc->params.sge.sge_control;
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid SGE_CONTROL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	/*
 	 * If this changes then every single use of PAGE_SHIFT in the driver
 	 * needs to be carefully reviewed for PAGE_SHIFT vs sp->page_shift.
 	 */
 	if (sp->page_shift != PAGE_SHIFT) {
 		device_printf(sc->dev, "invalid SGE_HOST_PAGE_SIZE(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	/* Filter out unusable hw buffer sizes entirely (mark with -2). */
 	hwb = &s->hw_buf_info[0];
 	for (i = 0; i < nitems(s->hw_buf_info); i++, hwb++) {
 		r = sc->params.sge.sge_fl_buffer_size[i];
 		hwb->size = r;
 		hwb->zidx = hwsz_ok(sc, r) ? -1 : -2;
 		hwb->next = -1;
 	}
 
 	/*
 	 * Create a sorted list in decreasing order of hw buffer sizes (and so
 	 * increasing order of spare area) for each software zone.
 	 *
 	 * If padding is enabled then the start and end of the buffer must align
 	 * to the pad boundary; if packing is enabled then they must align with
 	 * the pack boundary as well.  Allocations from the cluster zones are
 	 * aligned to min(size, 4K), so the buffer starts at that alignment and
 	 * ends at hwb->size alignment.  If mbuf inlining is allowed the
 	 * starting alignment will be reduced to MSIZE and the driver will
 	 * exercise appropriate caution when deciding on the best buffer layout
 	 * to use.
 	 */
 	n = 0;	/* no usable buffer size to begin with */
 	swz = &s->sw_zone_info[0];
 	safe_swz = NULL;
 	for (i = 0; i < SW_ZONE_SIZES; i++, swz++) {
 		int8_t head = -1, tail = -1;
 
 		swz->size = sw_buf_sizes[i];
 		swz->zone = m_getzone(swz->size);
 		swz->type = m_gettype(swz->size);
 
 		if (swz->size < PAGE_SIZE) {
 			MPASS(powerof2(swz->size));
 			if (fl_pad && (swz->size % sp->pad_boundary != 0))
 				continue;
 		}
 
 		if (swz->size == safest_rx_cluster)
 			safe_swz = swz;
 
 		hwb = &s->hw_buf_info[0];
 		for (j = 0; j < SGE_FLBUF_SIZES; j++, hwb++) {
 			if (hwb->zidx != -1 || hwb->size > swz->size)
 				continue;
 #ifdef INVARIANTS
 			if (fl_pad)
 				MPASS(hwb->size % sp->pad_boundary == 0);
 #endif
 			hwb->zidx = i;
 			if (head == -1)
 				head = tail = j;
 			else if (hwb->size < s->hw_buf_info[tail].size) {
 				s->hw_buf_info[tail].next = j;
 				tail = j;
 			} else {
 				int8_t *cur;
 				struct hw_buf_info *t;
 
 				for (cur = &head; *cur != -1; cur = &t->next) {
 					t = &s->hw_buf_info[*cur];
 					if (hwb->size == t->size) {
 						hwb->zidx = -2;
 						break;
 					}
 					if (hwb->size > t->size) {
 						hwb->next = *cur;
 						*cur = j;
 						break;
 					}
 				}
 			}
 		}
 		swz->head_hwidx = head;
 		swz->tail_hwidx = tail;
 
 		if (tail != -1) {
 			n++;
 			if (swz->size - s->hw_buf_info[tail].size >=
 			    CL_METADATA_SIZE)
 				sc->flags |= BUF_PACKING_OK;
 		}
 	}
 	if (n == 0) {
 		device_printf(sc->dev, "no usable SGE FL buffer size.\n");
 		rc = EINVAL;
 	}
 
 	s->safe_hwidx1 = -1;
 	s->safe_hwidx2 = -1;
 	if (safe_swz != NULL) {
 		s->safe_hwidx1 = safe_swz->head_hwidx;
 		for (i = safe_swz->head_hwidx; i != -1; i = hwb->next) {
 			int spare;
 
 			hwb = &s->hw_buf_info[i];
 #ifdef INVARIANTS
 			if (fl_pad)
 				MPASS(hwb->size % sp->pad_boundary == 0);
 #endif
 			spare = safe_swz->size - hwb->size;
 			if (spare >= CL_METADATA_SIZE) {
 				s->safe_hwidx2 = i;
 				break;
 			}
 		}
 	}
 
 	if (sc->flags & IS_VF)
 		return (0);
 
 	v = V_HPZ0(0) | V_HPZ1(2) | V_HPZ2(4) | V_HPZ3(6);
 	r = t4_read_reg(sc, A_ULP_RX_TDDP_PSZ);
 	if (r != v) {
 		device_printf(sc->dev, "invalid ULP_RX_TDDP_PSZ(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	m = v = F_TDDPTAGTCB;
 	r = t4_read_reg(sc, A_ULP_RX_CTL);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid ULP_RX_CTL(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	m = V_INDICATESIZE(M_INDICATESIZE) | F_REARMDDPOFFSET |
 	    F_RESETDDPOFFSET;
 	v = V_INDICATESIZE(indsz) | F_REARMDDPOFFSET | F_RESETDDPOFFSET;
 	r = t4_read_reg(sc, A_TP_PARA_REG5);
 	if ((r & m) != v) {
 		device_printf(sc->dev, "invalid TP_PARA_REG5(0x%x)\n", r);
 		rc = EINVAL;
 	}
 
 	t4_init_tp_params(sc);
 
 	t4_read_mtu_tbl(sc, sc->params.mtus, NULL);
 	t4_load_mtus(sc, sc->params.mtus, sc->params.a_wnd, sc->params.b_wnd);
 
 	return (rc);
 }
 
 int
 t4_create_dma_tag(struct adapter *sc)
 {
 	int rc;
 
 	rc = bus_dma_tag_create(bus_get_dma_tag(sc->dev), 1, 0,
 	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, BUS_SPACE_MAXSIZE,
 	    BUS_SPACE_UNRESTRICTED, BUS_SPACE_MAXSIZE, BUS_DMA_ALLOCNOW, NULL,
 	    NULL, &sc->dmat);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create main DMA tag: %d\n", rc);
 	}
 
 	return (rc);
 }
 
 void
 t4_sge_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
     struct sysctl_oid_list *children)
 {
 	struct sge_params *sp = &sc->params.sge;
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "buffer_sizes",
 	    CTLTYPE_STRING | CTLFLAG_RD, &sc->sge, 0, sysctl_bufsizes, "A",
 	    "freelist buffer sizes");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pktshift", CTLFLAG_RD,
 	    NULL, sp->fl_pktshift, "payload DMA offset in rx buffer (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pad", CTLFLAG_RD,
 	    NULL, sp->pad_boundary, "payload pad boundary (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "spg_len", CTLFLAG_RD,
 	    NULL, sp->spg_len, "status page size (bytes)");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "cong_drop", CTLFLAG_RD,
 	    NULL, cong_drop, "congestion drop setting");
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "fl_pack", CTLFLAG_RD,
 	    NULL, sp->pack_boundary, "payload pack boundary (bytes)");
 }
 
 int
 t4_destroy_dma_tag(struct adapter *sc)
 {
 	if (sc->dmat)
 		bus_dma_tag_destroy(sc->dmat);
 
 	return (0);
 }
 
 /*
  * Allocate and initialize the firmware event queue and the management queue.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  */
 int
 t4_setup_adapter_queues(struct adapter *sc)
 {
 	int rc;
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	sysctl_ctx_init(&sc->ctx);
 	sc->flags |= ADAP_SYSCTL_CTX;
 
 	/*
 	 * Firmware event queue
 	 */
 	rc = alloc_fwq(sc);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * Management queue.  This is just a control queue that uses the fwq as
 	 * its associated iq.
 	 */
 	if (!(sc->flags & IS_VF))
 		rc = alloc_mgmtq(sc);
 
 	return (rc);
 }
 
 /*
  * Idempotent
  */
 int
 t4_teardown_adapter_queues(struct adapter *sc)
 {
 
 	ADAPTER_LOCK_ASSERT_NOTOWNED(sc);
 
 	/* Do this before freeing the queue */
 	if (sc->flags & ADAP_SYSCTL_CTX) {
 		sysctl_ctx_free(&sc->ctx);
 		sc->flags &= ~ADAP_SYSCTL_CTX;
 	}
 
 	free_mgmtq(sc);
 	free_fwq(sc);
 
 	return (0);
 }
 
 static inline int
 first_vector(struct vi_info *vi)
 {
 	struct adapter *sc = vi->pi->adapter;
 
 	if (sc->intr_count == 1)
 		return (0);
 
 	return (vi->first_intr);
 }
 
 /*
  * Given an arbitrary "index," come up with an iq that can be used by other
  * queues (of this VI) for interrupt forwarding, SGE egress updates, etc.
  * The iq returned is guaranteed to be something that takes direct interrupts.
  */
 static struct sge_iq *
 vi_intr_iq(struct vi_info *vi, int idx)
 {
 	struct adapter *sc = vi->pi->adapter;
 	struct sge *s = &sc->sge;
 	struct sge_iq *iq = NULL;
 	int nintr, i;
 
 	if (sc->intr_count == 1)
 		return (&sc->sge.fwq);
 
 	nintr = vi->nintr;
 	KASSERT(nintr != 0,
 	    ("%s: vi %p has no exclusive interrupts, total interrupts = %d",
 	    __func__, vi, sc->intr_count));
 	i = idx % nintr;
 
 	if (vi->flags & INTR_RXQ) {
 	       	if (i < vi->nrxq) {
 			iq = &s->rxq[vi->first_rxq + i].iq;
 			goto done;
 		}
 		i -= vi->nrxq;
 	}
 #ifdef TCP_OFFLOAD
 	if (vi->flags & INTR_OFLD_RXQ) {
 	       	if (i < vi->nofldrxq) {
 			iq = &s->ofld_rxq[vi->first_ofld_rxq + i].iq;
 			goto done;
 		}
 		i -= vi->nofldrxq;
 	}
 #endif
 	panic("%s: vi %p, intr_flags 0x%lx, idx %d, total intr %d\n", __func__,
 	    vi, vi->flags & INTR_ALL, idx, nintr);
 done:
 	MPASS(iq != NULL);
 	KASSERT(iq->flags & IQ_INTR,
 	    ("%s: iq %p (vi %p, intr_flags 0x%lx, idx %d)", __func__, iq, vi,
 	    vi->flags & INTR_ALL, idx));
 	return (iq);
 }
 
 /* Maximum payload that can be delivered with a single iq descriptor */
 static inline int
 mtu_to_max_payload(struct adapter *sc, int mtu, const int toe)
 {
 	int payload;
 
 #ifdef TCP_OFFLOAD
 	if (toe) {
 		payload = sc->tt.rx_coalesce ?
 		    G_RXCOALESCESIZE(t4_read_reg(sc, A_TP_PARA_REG2)) : mtu;
 	} else {
 #endif
 		/* large enough even when hw VLAN extraction is disabled */
 		payload = sc->params.sge.fl_pktshift + ETHER_HDR_LEN +
 		    ETHER_VLAN_ENCAP_LEN + mtu;
 #ifdef TCP_OFFLOAD
 	}
 #endif
 
 	return (payload);
 }
 
 int
 t4_setup_vi_queues(struct vi_info *vi)
 {
 	int rc = 0, i, j, intr_idx, iqid;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 	struct sge_wrq *ctrlq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 	struct sge_wrq *ofld_txq;
 #endif
 #ifdef DEV_NETMAP
 	int saved_idx;
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 	char name[16];
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct ifnet *ifp = vi->ifp;
 	struct sysctl_oid *oid = device_get_sysctl_tree(vi->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 	int maxp, mtu = ifp->if_mtu;
 
 	/* Interrupt vector to start from (when using multiple vectors) */
 	intr_idx = first_vector(vi);
 
 #ifdef DEV_NETMAP
 	saved_idx = intr_idx;
 	if (ifp->if_capabilities & IFCAP_NETMAP) {
 
 		/* netmap is supported with direct interrupts only. */
 		MPASS(vi->flags & INTR_RXQ);
 
 		/*
 		 * We don't have buffers to back the netmap rx queues
 		 * right now so we create the queues in a way that
 		 * doesn't set off any congestion signal in the chip.
 		 */
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_rxq",
 		    CTLFLAG_RD, NULL, "rx queues");
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			rc = alloc_nm_rxq(vi, nm_rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			intr_idx++;
 		}
 
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "nm_txq",
 		    CTLFLAG_RD, NULL, "tx queues");
 		for_each_nm_txq(vi, i, nm_txq) {
 			iqid = vi->first_nm_rxq + (i % vi->nnmrxq);
 			rc = alloc_nm_txq(vi, nm_txq, iqid, i, oid);
 			if (rc != 0)
 				goto done;
 		}
 	}
 
 	/* Normal rx queues and netmap rx queues share the same interrupts. */
 	intr_idx = saved_idx;
 #endif
 
 	/*
 	 * First pass over all NIC and TOE rx queues:
 	 * a) initialize iq and fl
 	 * b) allocate queue iff it will take direct interrupts.
 	 */
 	maxp = mtu_to_max_payload(sc, mtu, 0);
 	if (vi->flags & INTR_RXQ) {
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
 		    CTLFLAG_RD, NULL, "rx queues");
 	}
 	for_each_rxq(vi, i, rxq) {
 
 		init_iq(&rxq->iq, sc, vi->tmr_idx, vi->pktc_idx, vi->qsize_rxq);
 
 		snprintf(name, sizeof(name), "%s rxq%d-fl",
 		    device_get_nameunit(vi->dev), i);
 		init_fl(sc, &rxq->fl, vi->qsize_rxq / 8, maxp, name);
 
 		if (vi->flags & INTR_RXQ) {
 			rxq->iq.flags |= IQ_INTR;
 			rc = alloc_rxq(vi, rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			intr_idx++;
 		}
 	}
 #ifdef DEV_NETMAP
 	if (ifp->if_capabilities & IFCAP_NETMAP)
 		intr_idx = saved_idx + max(vi->nrxq, vi->nnmrxq);
 #endif
 #ifdef TCP_OFFLOAD
 	maxp = mtu_to_max_payload(sc, mtu, 1);
 	if (vi->flags & INTR_OFLD_RXQ) {
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
 		    CTLFLAG_RD, NULL,
 		    "rx queues for offloaded TCP connections");
 	}
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 
 		init_iq(&ofld_rxq->iq, sc, vi->tmr_idx, vi->pktc_idx,
 		    vi->qsize_rxq);
 
 		snprintf(name, sizeof(name), "%s ofld_rxq%d-fl",
 		    device_get_nameunit(vi->dev), i);
 		init_fl(sc, &ofld_rxq->fl, vi->qsize_rxq / 8, maxp, name);
 
 		if (vi->flags & INTR_OFLD_RXQ) {
 			ofld_rxq->iq.flags |= IQ_INTR;
 			rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			intr_idx++;
 		}
 	}
 #endif
 
 	/*
 	 * Second pass over all NIC and TOE rx queues.  The queues forwarding
 	 * their interrupts are allocated now.
 	 */
 	j = 0;
 	if (!(vi->flags & INTR_RXQ)) {
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "rxq",
 		    CTLFLAG_RD, NULL, "rx queues");
 		for_each_rxq(vi, i, rxq) {
 			MPASS(!(rxq->iq.flags & IQ_INTR));
 
 			intr_idx = vi_intr_iq(vi, j)->abs_id;
 
 			rc = alloc_rxq(vi, rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			j++;
 		}
 	}
 #ifdef TCP_OFFLOAD
 	if (vi->nofldrxq != 0 && !(vi->flags & INTR_OFLD_RXQ)) {
 		oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_rxq",
 		    CTLFLAG_RD, NULL,
 		    "rx queues for offloaded TCP connections");
 		for_each_ofld_rxq(vi, i, ofld_rxq) {
 			MPASS(!(ofld_rxq->iq.flags & IQ_INTR));
 
 			intr_idx = vi_intr_iq(vi, j)->abs_id;
 
 			rc = alloc_ofld_rxq(vi, ofld_rxq, intr_idx, i, oid);
 			if (rc != 0)
 				goto done;
 			j++;
 		}
 	}
 #endif
 
 	/*
 	 * Now the tx queues.  Only one pass needed.
 	 */
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "txq", CTLFLAG_RD,
 	    NULL, "tx queues");
 	j = 0;
 	for_each_txq(vi, i, txq) {
 		iqid = vi_intr_iq(vi, j)->cntxt_id;
 		snprintf(name, sizeof(name), "%s txq%d",
 		    device_get_nameunit(vi->dev), i);
 		init_eq(sc, &txq->eq, EQ_ETH, vi->qsize_txq, pi->tx_chan, iqid,
 		    name);
 
 		rc = alloc_txq(vi, txq, i, oid);
 		if (rc != 0)
 			goto done;
 		j++;
 	}
 #ifdef TCP_OFFLOAD
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ofld_txq",
 	    CTLFLAG_RD, NULL, "tx queues for offloaded TCP connections");
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		struct sysctl_oid *oid2;
 
 		iqid = vi_intr_iq(vi, j)->cntxt_id;
 		snprintf(name, sizeof(name), "%s ofld_txq%d",
 		    device_get_nameunit(vi->dev), i);
 		init_eq(sc, &ofld_txq->eq, EQ_OFLD, vi->qsize_txq, pi->tx_chan,
 		    iqid, name);
 
 		snprintf(name, sizeof(name), "%d", i);
 		oid2 = SYSCTL_ADD_NODE(&vi->ctx, SYSCTL_CHILDREN(oid), OID_AUTO,
 		    name, CTLFLAG_RD, NULL, "offload tx queue");
 
 		rc = alloc_wrq(sc, vi, ofld_txq, oid2);
 		if (rc != 0)
 			goto done;
 		j++;
 	}
 #endif
 
 	/*
 	 * Finally, the control queue.
 	 */
 	if (!IS_MAIN_VI(vi) || sc->flags & IS_VF)
 		goto done;
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, "ctrlq", CTLFLAG_RD,
 	    NULL, "ctrl queue");
 	ctrlq = &sc->sge.ctrlq[pi->port_id];
 	iqid = vi_intr_iq(vi, 0)->cntxt_id;
 	snprintf(name, sizeof(name), "%s ctrlq", device_get_nameunit(vi->dev));
 	init_eq(sc, &ctrlq->eq, EQ_CTRL, CTRL_EQ_QSIZE, pi->tx_chan, iqid,
 	    name);
 	rc = alloc_wrq(sc, vi, ctrlq, oid);
 
 done:
 	if (rc)
 		t4_teardown_vi_queues(vi);
 
 	return (rc);
 }
 
 /*
  * Idempotent
  */
 int
 t4_teardown_vi_queues(struct vi_info *vi)
 {
 	int i;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_rxq *rxq;
 	struct sge_txq *txq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 	struct sge_wrq *ofld_txq;
 #endif
 #ifdef DEV_NETMAP
 	struct sge_nm_rxq *nm_rxq;
 	struct sge_nm_txq *nm_txq;
 #endif
 
 	/* Do this before freeing the queues */
 	if (vi->flags & VI_SYSCTL_CTX) {
 		sysctl_ctx_free(&vi->ctx);
 		vi->flags &= ~VI_SYSCTL_CTX;
 	}
 
 #ifdef DEV_NETMAP
 	if (vi->ifp->if_capabilities & IFCAP_NETMAP) {
 		for_each_nm_txq(vi, i, nm_txq) {
 			free_nm_txq(vi, nm_txq);
 		}
 
 		for_each_nm_rxq(vi, i, nm_rxq) {
 			free_nm_rxq(vi, nm_rxq);
 		}
 	}
 #endif
 
 	/*
 	 * Take down all the tx queues first, as they reference the rx queues
 	 * (for egress updates, etc.).
 	 */
 
 	if (IS_MAIN_VI(vi) && !(sc->flags & IS_VF))
 		free_wrq(sc, &sc->sge.ctrlq[pi->port_id]);
 
 	for_each_txq(vi, i, txq) {
 		free_txq(vi, txq);
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_txq(vi, i, ofld_txq) {
 		free_wrq(sc, ofld_txq);
 	}
 #endif
 
 	/*
 	 * Then take down the rx queues that forward their interrupts, as they
 	 * reference other rx queues.
 	 */
 
 	for_each_rxq(vi, i, rxq) {
 		if ((rxq->iq.flags & IQ_INTR) == 0)
 			free_rxq(vi, rxq);
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		if ((ofld_rxq->iq.flags & IQ_INTR) == 0)
 			free_ofld_rxq(vi, ofld_rxq);
 	}
 #endif
 
 	/*
 	 * Then take down the rx queues that take direct interrupts.
 	 */
 
 	for_each_rxq(vi, i, rxq) {
 		if (rxq->iq.flags & IQ_INTR)
 			free_rxq(vi, rxq);
 	}
 #ifdef TCP_OFFLOAD
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		if (ofld_rxq->iq.flags & IQ_INTR)
 			free_ofld_rxq(vi, ofld_rxq);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Deals with errors and the firmware event queue.  All data rx queues forward
  * their interrupt to the firmware event queue.
  */
 void
 t4_intr_all(void *arg)
 {
 	struct adapter *sc = arg;
 	struct sge_iq *fwq = &sc->sge.fwq;
 
 	t4_intr_err(arg);
 	if (atomic_cmpset_int(&fwq->state, IQS_IDLE, IQS_BUSY)) {
 		service_iq(fwq, 0);
 		atomic_cmpset_int(&fwq->state, IQS_BUSY, IQS_IDLE);
 	}
 }
 
 /* Deals with error interrupts */
 void
 t4_intr_err(void *arg)
 {
 	struct adapter *sc = arg;
 
 	t4_write_reg(sc, MYPF_REG(A_PCIE_PF_CLI), 0);
 	t4_slow_intr_handler(sc);
 }
 
 void
 t4_intr_evt(void *arg)
 {
 	struct sge_iq *iq = arg;
 
 	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
 		service_iq(iq, 0);
 		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
 	}
 }
 
 void
 t4_intr(void *arg)
 {
 	struct sge_iq *iq = arg;
 
 	if (atomic_cmpset_int(&iq->state, IQS_IDLE, IQS_BUSY)) {
 		service_iq(iq, 0);
 		atomic_cmpset_int(&iq->state, IQS_BUSY, IQS_IDLE);
 	}
 }
 
 void
 t4_vi_intr(void *arg)
 {
 	struct irq *irq = arg;
 
 #ifdef DEV_NETMAP
 	if (atomic_cmpset_int(&irq->nm_state, NM_ON, NM_BUSY)) {
 		t4_nm_intr(irq->nm_rxq);
 		atomic_cmpset_int(&irq->nm_state, NM_BUSY, NM_ON);
 	}
 #endif
 	if (irq->rxq != NULL)
 		t4_intr(irq->rxq);
 }
 
 /*
  * Deals with anything and everything on the given ingress queue.
  */
 static int
 service_iq(struct sge_iq *iq, int budget)
 {
 	struct sge_iq *q;
 	struct sge_rxq *rxq = iq_to_rxq(iq);	/* Use iff iq is part of rxq */
 	struct sge_fl *fl;			/* Use iff IQ_HAS_FL */
 	struct adapter *sc = iq->adapter;
 	struct iq_desc *d = &iq->desc[iq->cidx];
 	int ndescs = 0, limit;
 	int rsp_type, refill;
 	uint32_t lq;
 	uint16_t fl_hw_cidx;
 	struct mbuf *m0;
 	STAILQ_HEAD(, sge_iq) iql = STAILQ_HEAD_INITIALIZER(iql);
 #if defined(INET) || defined(INET6)
 	const struct timeval lro_timeout = {0, sc->lro_timeout};
 #endif
 
 	KASSERT(iq->state == IQS_BUSY, ("%s: iq %p not BUSY", __func__, iq));
 
 	limit = budget ? budget : iq->qsize / 16;
 
 	if (iq->flags & IQ_HAS_FL) {
 		fl = &rxq->fl;
 		fl_hw_cidx = fl->hw_cidx;	/* stable snapshot */
 	} else {
 		fl = NULL;
 		fl_hw_cidx = 0;			/* to silence gcc warning */
 	}
 
 	/*
 	 * We always come back and check the descriptor ring for new indirect
 	 * interrupts and other responses after running a single handler.
 	 */
 	for (;;) {
 		while ((d->rsp.u.type_gen & F_RSPD_GEN) == iq->gen) {
 
 			rmb();
 
 			refill = 0;
 			m0 = NULL;
 			rsp_type = G_RSPD_TYPE(d->rsp.u.type_gen);
 			lq = be32toh(d->rsp.pldbuflen_qid);
 
 			switch (rsp_type) {
 			case X_RSPD_TYPE_FLBUF:
 
 				KASSERT(iq->flags & IQ_HAS_FL,
 				    ("%s: data for an iq (%p) with no freelist",
 				    __func__, iq));
 
 				m0 = get_fl_payload(sc, fl, lq);
 				if (__predict_false(m0 == NULL))
 					goto process_iql;
 				refill = IDXDIFF(fl->hw_cidx, fl_hw_cidx, fl->sidx) > 2;
 #ifdef T4_PKT_TIMESTAMP
 				/*
 				 * 60 bit timestamp for the payload is
 				 * *(uint64_t *)m0->m_pktdat.  Note that it is
 				 * in the leading free-space in the mbuf.  The
 				 * kernel can clobber it during a pullup,
 				 * m_copymdata, etc.  You need to make sure that
 				 * the mbuf reaches you unmolested if you care
 				 * about the timestamp.
 				 */
 				*(uint64_t *)m0->m_pktdat =
 				    be64toh(ctrl->u.last_flit) &
 				    0xfffffffffffffff;
 #endif
 
 				/* fall through */
 
 			case X_RSPD_TYPE_CPL:
 				KASSERT(d->rss.opcode < NUM_CPL_CMDS,
 				    ("%s: bad opcode %02x.", __func__,
 				    d->rss.opcode));
 				t4_cpl_handler[d->rss.opcode](iq, &d->rss, m0);
 				break;
 
 			case X_RSPD_TYPE_INTR:
 
 				/*
 				 * Interrupts should be forwarded only to queues
 				 * that are not forwarding their interrupts.
 				 * This means service_iq can recurse but only 1
 				 * level deep.
 				 */
 				KASSERT(budget == 0,
 				    ("%s: budget %u, rsp_type %u", __func__,
 				    budget, rsp_type));
 
 				/*
 				 * There are 1K interrupt-capable queues (qids 0
 				 * through 1023).  A response type indicating a
 				 * forwarded interrupt with a qid >= 1K is an
 				 * iWARP async notification.
 				 */
 				if (lq >= 1024) {
                                         t4_an_handler(iq, &d->rsp);
                                         break;
                                 }
 
 				q = sc->sge.iqmap[lq - sc->sge.iq_start -
 				    sc->sge.iq_base];
 				if (atomic_cmpset_int(&q->state, IQS_IDLE,
 				    IQS_BUSY)) {
 					if (service_iq(q, q->qsize / 16) == 0) {
 						atomic_cmpset_int(&q->state,
 						    IQS_BUSY, IQS_IDLE);
 					} else {
 						STAILQ_INSERT_TAIL(&iql, q,
 						    link);
 					}
 				}
 				break;
 
 			default:
 				KASSERT(0,
 				    ("%s: illegal response type %d on iq %p",
 				    __func__, rsp_type, iq));
 				log(LOG_ERR,
 				    "%s: illegal response type %d on iq %p",
 				    device_get_nameunit(sc->dev), rsp_type, iq);
 				break;
 			}
 
 			d++;
 			if (__predict_false(++iq->cidx == iq->sidx)) {
 				iq->cidx = 0;
 				iq->gen ^= F_RSPD_GEN;
 				d = &iq->desc[0];
 			}
 			if (__predict_false(++ndescs == limit)) {
 				t4_write_reg(sc, sc->sge_gts_reg,
 				    V_CIDXINC(ndescs) |
 				    V_INGRESSQID(iq->cntxt_id) |
 				    V_SEINTARM(V_QINTR_TIMER_IDX(X_TIMERREG_UPDATE_CIDX)));
 				ndescs = 0;
 
 #if defined(INET) || defined(INET6)
 				if (iq->flags & IQ_LRO_ENABLED &&
 				    sc->lro_timeout != 0) {
 					tcp_lro_flush_inactive(&rxq->lro,
 					    &lro_timeout);
 				}
 #endif
 
 				if (budget) {
 					if (iq->flags & IQ_HAS_FL) {
 						FL_LOCK(fl);
 						refill_fl(sc, fl, 32);
 						FL_UNLOCK(fl);
 					}
 					return (EINPROGRESS);
 				}
 			}
 			if (refill) {
 				FL_LOCK(fl);
 				refill_fl(sc, fl, 32);
 				FL_UNLOCK(fl);
 				fl_hw_cidx = fl->hw_cidx;
 			}
 		}
 
 process_iql:
 		if (STAILQ_EMPTY(&iql))
 			break;
 
 		/*
 		 * Process the head only, and send it to the back of the list if
 		 * it's still not done.
 		 */
 		q = STAILQ_FIRST(&iql);
 		STAILQ_REMOVE_HEAD(&iql, link);
 		if (service_iq(q, q->qsize / 8) == 0)
 			atomic_cmpset_int(&q->state, IQS_BUSY, IQS_IDLE);
 		else
 			STAILQ_INSERT_TAIL(&iql, q, link);
 	}
 
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_LRO_ENABLED) {
 		struct lro_ctrl *lro = &rxq->lro;
 
 		tcp_lro_flush_all(lro);
 	}
 #endif
 
 	t4_write_reg(sc, sc->sge_gts_reg, V_CIDXINC(ndescs) |
 	    V_INGRESSQID((u32)iq->cntxt_id) | V_SEINTARM(iq->intr_params));
 
 	if (iq->flags & IQ_HAS_FL) {
 		int starved;
 
 		FL_LOCK(fl);
 		starved = refill_fl(sc, fl, 64);
 		FL_UNLOCK(fl);
 		if (__predict_false(starved != 0))
 			add_fl_to_sfl(sc, fl);
 	}
 
 	return (0);
 }
 
 static inline int
 cl_has_metadata(struct sge_fl *fl, struct cluster_layout *cll)
 {
 	int rc = fl->flags & FL_BUF_PACKING || cll->region1 > 0;
 
 	if (rc)
 		MPASS(cll->region3 >= CL_METADATA_SIZE);
 
 	return (rc);
 }
 
 static inline struct cluster_metadata *
 cl_metadata(struct adapter *sc, struct sge_fl *fl, struct cluster_layout *cll,
     caddr_t cl)
 {
 
 	if (cl_has_metadata(fl, cll)) {
 		struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
 
 		return ((struct cluster_metadata *)(cl + swz->size) - 1);
 	}
 	return (NULL);
 }
 
 static void
 rxb_free(struct mbuf *m, void *arg1, void *arg2)
 {
 	uma_zone_t zone = arg1;
 	caddr_t cl = arg2;
 
 	uma_zfree(zone, cl);
 	counter_u64_add(extfree_rels, 1);
 }
 
 /*
  * The mbuf returned by this function could be allocated from zone_mbuf or
  * constructed in spare room in the cluster.
  *
  * The mbuf carries the payload in one of these ways
  * a) frame inside the mbuf (mbuf from zone_mbuf)
  * b) m_cljset (for clusters without metadata) zone_mbuf
  * c) m_extaddref (cluster with metadata) inline mbuf
  * d) m_extaddref (cluster with metadata) zone_mbuf
  */
 static struct mbuf *
 get_scatter_segment(struct adapter *sc, struct sge_fl *fl, int fr_offset,
     int remaining)
 {
 	struct mbuf *m;
 	struct fl_sdesc *sd = &fl->sdesc[fl->cidx];
 	struct cluster_layout *cll = &sd->cll;
 	struct sw_zone_info *swz = &sc->sge.sw_zone_info[cll->zidx];
 	struct hw_buf_info *hwb = &sc->sge.hw_buf_info[cll->hwidx];
 	struct cluster_metadata *clm = cl_metadata(sc, fl, cll, sd->cl);
 	int len, blen;
 	caddr_t payload;
 
 	blen = hwb->size - fl->rx_offset;	/* max possible in this buf */
 	len = min(remaining, blen);
 	payload = sd->cl + cll->region1 + fl->rx_offset;
 	if (fl->flags & FL_BUF_PACKING) {
 		const u_int l = fr_offset + len;
 		const u_int pad = roundup2(l, fl->buf_boundary) - l;
 
 		if (fl->rx_offset + len + pad < hwb->size)
 			blen = len + pad;
 		MPASS(fl->rx_offset + blen <= hwb->size);
 	} else {
 		MPASS(fl->rx_offset == 0);	/* not packing */
 	}
 
 
 	if (sc->sc_do_rxcopy && len < RX_COPY_THRESHOLD) {
 
 		/*
 		 * Copy payload into a freshly allocated mbuf.
 		 */
 
 		m = fr_offset == 0 ?
 		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (NULL);
 		fl->mbuf_allocated++;
 #ifdef T4_PKT_TIMESTAMP
 		/* Leave room for a timestamp */
 		m->m_data += 8;
 #endif
 		/* copy data to mbuf */
 		bcopy(payload, mtod(m, caddr_t), len);
 
 	} else if (sd->nmbuf * MSIZE < cll->region1) {
 
 		/*
 		 * There's spare room in the cluster for an mbuf.  Create one
 		 * and associate it with the payload that's in the cluster.
 		 */
 
 		MPASS(clm != NULL);
 		m = (struct mbuf *)(sd->cl + sd->nmbuf * MSIZE);
 		/* No bzero required */
 		if (m_init(m, M_NOWAIT, MT_DATA,
 		    fr_offset == 0 ? M_PKTHDR | M_NOFREE : M_NOFREE))
 			return (NULL);
 		fl->mbuf_inlined++;
 		m_extaddref(m, payload, blen, &clm->refcount, rxb_free,
 		    swz->zone, sd->cl);
 		if (sd->nmbuf++ == 0)
 			counter_u64_add(extfree_refs, 1);
 
 	} else {
 
 		/*
 		 * Grab an mbuf from zone_mbuf and associate it with the
 		 * payload in the cluster.
 		 */
 
 		m = fr_offset == 0 ?
 		    m_gethdr(M_NOWAIT, MT_DATA) : m_get(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return (NULL);
 		fl->mbuf_allocated++;
 		if (clm != NULL) {
 			m_extaddref(m, payload, blen, &clm->refcount,
 			    rxb_free, swz->zone, sd->cl);
 			if (sd->nmbuf++ == 0)
 				counter_u64_add(extfree_refs, 1);
 		} else {
 			m_cljset(m, sd->cl, swz->type);
 			sd->cl = NULL;	/* consumed, not a recycle candidate */
 		}
 	}
 	if (fr_offset == 0)
 		m->m_pkthdr.len = remaining;
 	m->m_len = len;
 
 	if (fl->flags & FL_BUF_PACKING) {
 		fl->rx_offset += blen;
 		MPASS(fl->rx_offset <= hwb->size);
 		if (fl->rx_offset < hwb->size)
 			return (m);	/* without advancing the cidx */
 	}
 
 	if (__predict_false(++fl->cidx % 8 == 0)) {
 		uint16_t cidx = fl->cidx / 8;
 
 		if (__predict_false(cidx == fl->sidx))
 			fl->cidx = cidx = 0;
 		fl->hw_cidx = cidx;
 	}
 	fl->rx_offset = 0;
 
 	return (m);
 }
 
 static struct mbuf *
 get_fl_payload(struct adapter *sc, struct sge_fl *fl, uint32_t len_newbuf)
 {
 	struct mbuf *m0, *m, **pnext;
 	u_int remaining;
 	const u_int total = G_RSPD_LEN(len_newbuf);
 
 	if (__predict_false(fl->flags & FL_BUF_RESUME)) {
 		M_ASSERTPKTHDR(fl->m0);
 		MPASS(fl->m0->m_pkthdr.len == total);
 		MPASS(fl->remaining < total);
 
 		m0 = fl->m0;
 		pnext = fl->pnext;
 		remaining = fl->remaining;
 		fl->flags &= ~FL_BUF_RESUME;
 		goto get_segment;
 	}
 
 	if (fl->rx_offset > 0 && len_newbuf & F_RSPD_NEWBUF) {
 		fl->rx_offset = 0;
 		if (__predict_false(++fl->cidx % 8 == 0)) {
 			uint16_t cidx = fl->cidx / 8;
 
 			if (__predict_false(cidx == fl->sidx))
 				fl->cidx = cidx = 0;
 			fl->hw_cidx = cidx;
 		}
 	}
 
 	/*
 	 * Payload starts at rx_offset in the current hw buffer.  Its length is
 	 * 'len' and it may span multiple hw buffers.
 	 */
 
 	m0 = get_scatter_segment(sc, fl, 0, total);
 	if (m0 == NULL)
 		return (NULL);
 	remaining = total - m0->m_len;
 	pnext = &m0->m_next;
 	while (remaining > 0) {
 get_segment:
 		MPASS(fl->rx_offset == 0);
 		m = get_scatter_segment(sc, fl, total - remaining, remaining);
 		if (__predict_false(m == NULL)) {
 			fl->m0 = m0;
 			fl->pnext = pnext;
 			fl->remaining = remaining;
 			fl->flags |= FL_BUF_RESUME;
 			return (NULL);
 		}
 		*pnext = m;
 		pnext = &m->m_next;
 		remaining -= m->m_len;
 	}
 	*pnext = NULL;
 
 	M_ASSERTPKTHDR(m0);
 	return (m0);
 }
 
 static int
 t4_eth_rx(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m0)
 {
 	struct sge_rxq *rxq = iq_to_rxq(iq);
 	struct ifnet *ifp = rxq->ifp;
 	struct adapter *sc = iq->adapter;
 	const struct cpl_rx_pkt *cpl = (const void *)(rss + 1);
 #if defined(INET) || defined(INET6)
 	struct lro_ctrl *lro = &rxq->lro;
 #endif
 	static const int sw_hashtype[4][2] = {
 		{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
 		{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
 		{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
 		{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
 	};
 
 	KASSERT(m0 != NULL, ("%s: no payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	m0->m_pkthdr.len -= sc->params.sge.fl_pktshift;
 	m0->m_len -= sc->params.sge.fl_pktshift;
 	m0->m_data += sc->params.sge.fl_pktshift;
 
 	m0->m_pkthdr.rcvif = ifp;
 	M_HASHTYPE_SET(m0, sw_hashtype[rss->hash_type][rss->ipv6]);
 	m0->m_pkthdr.flowid = be32toh(rss->hash_val);
 
 	if (cpl->csum_calc && !cpl->err_vec) {
 		if (ifp->if_capenable & IFCAP_RXCSUM &&
 		    cpl->l2info & htobe32(F_RXF_IP)) {
 			m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
 			    CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 			rxq->rxcsum++;
 		} else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
 		    cpl->l2info & htobe32(F_RXF_IP6)) {
 			m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
 			    CSUM_PSEUDO_HDR);
 			rxq->rxcsum++;
 		}
 
 		if (__predict_false(cpl->ip_frag))
 			m0->m_pkthdr.csum_data = be16toh(cpl->csum);
 		else
 			m0->m_pkthdr.csum_data = 0xffff;
 	}
 
 	if (cpl->vlan_ex) {
 		m0->m_pkthdr.ether_vtag = be16toh(cpl->vlan);
 		m0->m_flags |= M_VLANTAG;
 		rxq->vlan_extraction++;
 	}
 
 #if defined(INET) || defined(INET6)
 	if (iq->flags & IQ_LRO_ENABLED &&
 	    tcp_lro_rx(lro, m0, 0) == 0) {
 		/* queued for LRO */
 	} else
 #endif
 	ifp->if_input(ifp, m0);
 
 	return (0);
 }
 
 /*
  * Must drain the wrq or make sure that someone else will.
  */
 static void
 wrq_tx_drain(void *arg, int n)
 {
 	struct sge_wrq *wrq = arg;
 	struct sge_eq *eq = &wrq->eq;
 
 	EQ_LOCK(eq);
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(wrq->adapter, wrq);
 	EQ_UNLOCK(eq);
 }
 
 static void
 drain_wrq_wr_list(struct adapter *sc, struct sge_wrq *wrq)
 {
 	struct sge_eq *eq = &wrq->eq;
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n;
 	struct wrqe *wr;
 	struct fw_eth_tx_pkt_wr *dst;	/* any fw WR struct will do */
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(TAILQ_EMPTY(&wrq->incomplete_wrs));
 	wr = STAILQ_FIRST(&wrq->wr_list);
 	MPASS(wr != NULL);	/* Must be called with something useful to do */
 	MPASS(eq->pidx == eq->dbidx);
 	dbdiff = 0;
 
 	do {
 		eq->cidx = read_hw_cidx(eq);
 		if (eq->pidx == eq->cidx)
 			available = eq->sidx - 1;
 		else
 			available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 
 		MPASS(wr->wrq == wrq);
 		n = howmany(wr->wr_len, EQ_ESIZE);
 		if (available < n)
 			break;
 
 		dst = (void *)&eq->desc[eq->pidx];
 		if (__predict_true(eq->sidx - eq->pidx > n)) {
 			/* Won't wrap, won't end exactly at the status page. */
 			bcopy(&wr->wr[0], dst, wr->wr_len);
 			eq->pidx += n;
 		} else {
 			int first_portion = (eq->sidx - eq->pidx) * EQ_ESIZE;
 
 			bcopy(&wr->wr[0], dst, first_portion);
 			if (wr->wr_len > first_portion) {
 				bcopy(&wr->wr[first_portion], &eq->desc[0],
 				    wr->wr_len - first_portion);
 			}
 			eq->pidx = n - (eq->sidx - eq->pidx);
 		}
 		wrq->tx_wrs_copied++;
 
 		if (available < eq->sidx / 4 &&
 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 			    F_FW_WR_EQUEQ);
 			eq->equeqidx = eq->pidx;
 		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
 			dst->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
 			eq->equeqidx = eq->pidx;
 		}
 
 		dbdiff += n;
 		if (dbdiff >= 16) {
 			ring_eq_db(sc, eq, dbdiff);
 			dbdiff = 0;
 		}
 
 		STAILQ_REMOVE_HEAD(&wrq->wr_list, link);
 		free_wrqe(wr);
 		MPASS(wrq->nwr_pending > 0);
 		wrq->nwr_pending--;
 		MPASS(wrq->ndesc_needed >= n);
 		wrq->ndesc_needed -= n;
 	} while ((wr = STAILQ_FIRST(&wrq->wr_list)) != NULL);
 
 	if (dbdiff)
 		ring_eq_db(sc, eq, dbdiff);
 }
 
 /*
  * Doesn't fail.  Holds on to work requests it can't send right away.
  */
 void
 t4_wrq_tx_locked(struct adapter *sc, struct sge_wrq *wrq, struct wrqe *wr)
 {
 #ifdef INVARIANTS
 	struct sge_eq *eq = &wrq->eq;
 #endif
 
 	EQ_LOCK_ASSERT_OWNED(eq);
 	MPASS(wr != NULL);
 	MPASS(wr->wr_len > 0 && wr->wr_len <= SGE_MAX_WR_LEN);
 	MPASS((wr->wr_len & 0x7) == 0);
 
 	STAILQ_INSERT_TAIL(&wrq->wr_list, wr, link);
 	wrq->nwr_pending++;
 	wrq->ndesc_needed += howmany(wr->wr_len, EQ_ESIZE);
 
 	if (!TAILQ_EMPTY(&wrq->incomplete_wrs))
 		return;	/* commit_wrq_wr will drain wr_list as well. */
 
 	drain_wrq_wr_list(sc, wrq);
 
 	/* Doorbell must have caught up to the pidx. */
 	MPASS(eq->pidx == eq->dbidx);
 }
 
 void
 t4_update_fl_bufsize(struct ifnet *ifp)
 {
 	struct vi_info *vi = ifp->if_softc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_rxq *rxq;
 #ifdef TCP_OFFLOAD
 	struct sge_ofld_rxq *ofld_rxq;
 #endif
 	struct sge_fl *fl;
 	int i, maxp, mtu = ifp->if_mtu;
 
 	maxp = mtu_to_max_payload(sc, mtu, 0);
 	for_each_rxq(vi, i, rxq) {
 		fl = &rxq->fl;
 
 		FL_LOCK(fl);
 		find_best_refill_source(sc, fl, maxp);
 		FL_UNLOCK(fl);
 	}
 #ifdef TCP_OFFLOAD
 	maxp = mtu_to_max_payload(sc, mtu, 1);
 	for_each_ofld_rxq(vi, i, ofld_rxq) {
 		fl = &ofld_rxq->fl;
 
 		FL_LOCK(fl);
 		find_best_refill_source(sc, fl, maxp);
 		FL_UNLOCK(fl);
 	}
 #endif
 }
 
 static inline int
 mbuf_nsegs(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 	KASSERT(m->m_pkthdr.l5hlen > 0,
 	    ("%s: mbuf %p missing information on # of segments.", __func__, m));
 
 	return (m->m_pkthdr.l5hlen);
 }
 
 static inline void
 set_mbuf_nsegs(struct mbuf *m, uint8_t nsegs)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.l5hlen = nsegs;
 }
 
 static inline int
 mbuf_len16(struct mbuf *m)
 {
 	int n;
 
 	M_ASSERTPKTHDR(m);
 	n = m->m_pkthdr.PH_loc.eight[0];
 	MPASS(n > 0 && n <= SGE_MAX_WR_LEN / 16);
 
 	return (n);
 }
 
 static inline void
 set_mbuf_len16(struct mbuf *m, uint8_t len16)
 {
 
 	M_ASSERTPKTHDR(m);
 	m->m_pkthdr.PH_loc.eight[0] = len16;
 }
 
 static inline int
 needs_tso(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	if (m->m_pkthdr.csum_flags & CSUM_TSO) {
 		KASSERT(m->m_pkthdr.tso_segsz > 0,
 		    ("%s: TSO requested in mbuf %p but MSS not provided",
 		    __func__, m));
 		return (1);
 	}
 
 	return (0);
 }
 
 static inline int
 needs_l3_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))
 		return (1);
 	return (0);
 }
 
 static inline int
 needs_l4_csum(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	if (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
 	    CSUM_TCP_IPV6 | CSUM_TSO))
 		return (1);
 	return (0);
 }
 
 static inline int
 needs_vlan_insertion(struct mbuf *m)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	if (m->m_flags & M_VLANTAG) {
 		KASSERT(m->m_pkthdr.ether_vtag != 0,
 		    ("%s: HWVLAN requested in mbuf %p but tag not provided",
 		    __func__, m));
 		return (1);
 	}
 	return (0);
 }
 
 static void *
 m_advance(struct mbuf **pm, int *poffset, int len)
 {
 	struct mbuf *m = *pm;
 	int offset = *poffset;
 	uintptr_t p = 0;
 
 	MPASS(len > 0);
 
 	for (;;) {
 		if (offset + len < m->m_len) {
 			offset += len;
 			p = mtod(m, uintptr_t) + offset;
 			break;
 		}
 		len -= m->m_len - offset;
 		m = m->m_next;
 		offset = 0;
 		MPASS(m != NULL);
 	}
 	*poffset = offset;
 	*pm = m;
 	return ((void *)p);
 }
 
 static inline int
 same_paddr(char *a, char *b)
 {
 
 	if (a == b)
 		return (1);
 	else if (a != NULL && b != NULL) {
 		vm_offset_t x = (vm_offset_t)a;
 		vm_offset_t y = (vm_offset_t)b;
 
 		if ((x & PAGE_MASK) == (y & PAGE_MASK) &&
 		    pmap_kextract(x) == pmap_kextract(y))
 			return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Can deal with empty mbufs in the chain that have m_len = 0, but the chain
  * must have at least one mbuf that's not empty.
  */
 static inline int
 count_mbuf_nsegs(struct mbuf *m)
 {
 	char *prev_end, *start;
 	int len, nsegs;
 
 	MPASS(m != NULL);
 
 	nsegs = 0;
 	prev_end = NULL;
 	for (; m; m = m->m_next) {
 
 		len = m->m_len;
 		if (__predict_false(len == 0))
 			continue;
 		start = mtod(m, char *);
 
 		nsegs += sglist_count(start, len);
 		if (same_paddr(prev_end, start))
 			nsegs--;
 		prev_end = start + len;
 	}
 
 	MPASS(nsegs > 0);
 	return (nsegs);
 }
 
 /*
  * Analyze the mbuf to determine its tx needs.  The mbuf passed in may change:
  * a) caller can assume it's been freed if this function returns with an error.
  * b) it may get defragged up if the gather list is too long for the hardware.
  */
 int
 parse_pkt(struct adapter *sc, struct mbuf **mp)
 {
 	struct mbuf *m0 = *mp, *m;
 	int rc, nsegs, defragged = 0, offset;
 	struct ether_header *eh;
 	void *l3hdr;
 #if defined(INET) || defined(INET6)
 	struct tcphdr *tcp;
 #endif
 	uint16_t eh_type;
 
 	M_ASSERTPKTHDR(m0);
 	if (__predict_false(m0->m_pkthdr.len < ETHER_HDR_LEN)) {
 		rc = EINVAL;
 fail:
 		m_freem(m0);
 		*mp = NULL;
 		return (rc);
 	}
 restart:
 	/*
 	 * First count the number of gather list segments in the payload.
 	 * Defrag the mbuf if nsegs exceeds the hardware limit.
 	 */
 	M_ASSERTPKTHDR(m0);
 	MPASS(m0->m_pkthdr.len > 0);
 	nsegs = count_mbuf_nsegs(m0);
 	if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
 		if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0 = m;	/* update caller's copy after defrag */
 		goto restart;
 	}
 
 	if (__predict_false(nsegs > 2 && m0->m_pkthdr.len <= MHLEN)) {
 		m0 = m_pullup(m0, m0->m_pkthdr.len);
 		if (m0 == NULL) {
 			/* Should have left well enough alone. */
 			rc = EFBIG;
 			goto fail;
 		}
 		*mp = m0;	/* update caller's copy after pullup */
 		goto restart;
 	}
 	set_mbuf_nsegs(m0, nsegs);
 	if (sc->flags & IS_VF)
 		set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
 	else
 		set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
 
 	if (!needs_tso(m0) &&
 	    !(sc->flags & IS_VF && (needs_l3_csum(m0) || needs_l4_csum(m0))))
 		return (0);
 
 	m = m0;
 	eh = mtod(m, struct ether_header *);
 	eh_type = ntohs(eh->ether_type);
 	if (eh_type == ETHERTYPE_VLAN) {
 		struct ether_vlan_header *evh = (void *)eh;
 
 		eh_type = ntohs(evh->evl_proto);
 		m0->m_pkthdr.l2hlen = sizeof(*evh);
 	} else
 		m0->m_pkthdr.l2hlen = sizeof(*eh);
 
 	offset = 0;
 	l3hdr = m_advance(&m, &offset, m0->m_pkthdr.l2hlen);
 
 	switch (eh_type) {
 #ifdef INET6
 	case ETHERTYPE_IPV6:
 	{
 		struct ip6_hdr *ip6 = l3hdr;
 
 		MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
 
 		m0->m_pkthdr.l3hlen = sizeof(*ip6);
 		break;
 	}
 #endif
 #ifdef INET
 	case ETHERTYPE_IP:
 	{
 		struct ip *ip = l3hdr;
 
 		m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
 		break;
 	}
 #endif
 	default:
 		panic("%s: ethertype 0x%04x unknown.  if_cxgbe must be compiled"
 		    " with the same INET/INET6 options as the kernel.",
 		    __func__, eh_type);
 	}
 
 #if defined(INET) || defined(INET6)
 	if (needs_tso(m0)) {
 		tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
 		m0->m_pkthdr.l4hlen = tcp->th_off * 4;
 	}
 #endif
 	MPASS(m0 == *mp);
 	return (0);
 }
 
 void *
 start_wrq_wr(struct sge_wrq *wrq, int len16, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, available;
 	struct wrqe *wr;
 	void *w;
 
 	MPASS(len16 > 0);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc > 0 && ndesc <= SGE_MAX_WR_NDESC);
 
 	EQ_LOCK(eq);
 
 	if (!STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 	if (!STAILQ_EMPTY(&wrq->wr_list)) {
 slowpath:
 		EQ_UNLOCK(eq);
 		wr = alloc_wrqe(len16 * 16, wrq);
 		if (__predict_false(wr == NULL))
 			return (NULL);
 		cookie->pidx = -1;
 		cookie->ndesc = ndesc;
 		return (&wr->wr);
 	}
 
 	eq->cidx = read_hw_cidx(eq);
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	if (available < ndesc)
 		goto slowpath;
 
 	cookie->pidx = eq->pidx;
 	cookie->ndesc = ndesc;
 	TAILQ_INSERT_TAIL(&wrq->incomplete_wrs, cookie, link);
 
 	w = &eq->desc[eq->pidx];
 	IDXINCR(eq->pidx, ndesc, eq->sidx);
 	if (__predict_false(eq->pidx < ndesc - 1)) {
 		w = &wrq->ss[0];
 		wrq->ss_pidx = cookie->pidx;
 		wrq->ss_len = len16 * 16;
 	}
 
 	EQ_UNLOCK(eq);
 
 	return (w);
 }
 
 void
 commit_wrq_wr(struct sge_wrq *wrq, void *w, struct wrq_cookie *cookie)
 {
 	struct sge_eq *eq = &wrq->eq;
 	struct adapter *sc = wrq->adapter;
 	int ndesc, pidx;
 	struct wrq_cookie *prev, *next;
 
 	if (cookie->pidx == -1) {
 		struct wrqe *wr = __containerof(w, struct wrqe, wr);
 
 		t4_wrq_tx(sc, wr);
 		return;
 	}
 
 	ndesc = cookie->ndesc;	/* Can be more than SGE_MAX_WR_NDESC here. */
 	pidx = cookie->pidx;
 	MPASS(pidx >= 0 && pidx < eq->sidx);
 	if (__predict_false(w == &wrq->ss[0])) {
 		int n = (eq->sidx - wrq->ss_pidx) * EQ_ESIZE;
 
 		MPASS(wrq->ss_len > n);	/* WR had better wrap around. */
 		bcopy(&wrq->ss[0], &eq->desc[wrq->ss_pidx], n);
 		bcopy(&wrq->ss[n], &eq->desc[0], wrq->ss_len - n);
 		wrq->tx_wrs_ss++;
 	} else
 		wrq->tx_wrs_direct++;
 
 	EQ_LOCK(eq);
 	prev = TAILQ_PREV(cookie, wrq_incomplete_wrs, link);
 	next = TAILQ_NEXT(cookie, link);
 	if (prev == NULL) {
 		MPASS(pidx == eq->dbidx);
 		if (next == NULL || ndesc >= 16)
 			ring_eq_db(wrq->adapter, eq, ndesc);
 		else {
 			MPASS(IDXDIFF(next->pidx, pidx, eq->sidx) == ndesc);
 			next->pidx = pidx;
 			next->ndesc += ndesc;
 		}
 	} else {
 		MPASS(IDXDIFF(pidx, prev->pidx, eq->sidx) == prev->ndesc);
 		prev->ndesc += ndesc;
 	}
 	TAILQ_REMOVE(&wrq->incomplete_wrs, cookie, link);
 
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs) && !STAILQ_EMPTY(&wrq->wr_list))
 		drain_wrq_wr_list(sc, wrq);
 
 #ifdef INVARIANTS
 	if (TAILQ_EMPTY(&wrq->incomplete_wrs)) {
 		/* Doorbell must have caught up to the pidx. */
 		MPASS(wrq->eq.pidx == wrq->eq.dbidx);
 	}
 #endif
 	EQ_UNLOCK(eq);
 }
 
 static u_int
 can_resume_eth_tx(struct mp_ring *r)
 {
 	struct sge_eq *eq = r->cookie;
 
 	return (total_available_tx_desc(eq) > eq->sidx / 8);
 }
 
 static inline int
 cannot_use_txpkts(struct mbuf *m)
 {
 	/* maybe put a GL limit too, to avoid silliness? */
 
 	return (needs_tso(m));
 }
 
 /*
  * r->items[cidx] to r->items[pidx], with a wraparound at r->size, are ready to
  * be consumed.  Return the actual number consumed.  0 indicates a stall.
  */
 static u_int
 eth_tx(struct mp_ring *r, u_int cidx, u_int pidx)
 {
 	struct sge_txq *txq = r->cookie;
 	struct sge_eq *eq = &txq->eq;
 	struct ifnet *ifp = txq->ifp;
 	struct vi_info *vi = ifp->if_softc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	u_int total, remaining;		/* # of packets */
 	u_int available, dbdiff;	/* # of hardware descriptors */
 	u_int n, next_cidx;
 	struct mbuf *m0, *tail;
 	struct txpkts txp;
 	struct fw_eth_tx_pkts_wr *wr;	/* any fw WR struct will do */
 
 	remaining = IDXDIFF(pidx, cidx, r->size);
 	MPASS(remaining > 0);	/* Must not be called without work to do. */
 	total = 0;
 
 	TXQ_LOCK(txq);
 	if (__predict_false((eq->flags & EQ_ENABLED) == 0)) {
 		while (cidx != pidx) {
 			m0 = r->items[cidx];
 			m_freem(m0);
 			if (++cidx == r->size)
 				cidx = 0;
 		}
 		reclaim_tx_descs(txq, 2048);
 		total = remaining;
 		goto done;
 	}
 
 	/* How many hardware descriptors do we have readily available. */
 	if (eq->pidx == eq->cidx)
 		available = eq->sidx - 1;
 	else
 		available = IDXDIFF(eq->cidx, eq->pidx, eq->sidx) - 1;
 	dbdiff = IDXDIFF(eq->pidx, eq->dbidx, eq->sidx);
 
 	while (remaining > 0) {
 
 		m0 = r->items[cidx];
 		M_ASSERTPKTHDR(m0);
 		MPASS(m0->m_nextpkt == NULL);
 
 		if (available < SGE_MAX_WR_NDESC) {
 			available += reclaim_tx_descs(txq, 64);
 			if (available < howmany(mbuf_len16(m0), EQ_ESIZE / 16))
 				break;	/* out of descriptors */
 		}
 
 		next_cidx = cidx + 1;
 		if (__predict_false(next_cidx == r->size))
 			next_cidx = 0;
 
 		wr = (void *)&eq->desc[eq->pidx];
 		if (sc->flags & IS_VF) {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = write_txpkt_vm_wr(sc, txq, (void *)wr, m0,
 			    available);
 		} else if (remaining > 1 &&
 		    try_txpkts(m0, r->items[next_cidx], &txp, available) == 0) {
 
 			/* pkts at cidx, next_cidx should both be in txp. */
 			MPASS(txp.npkt == 2);
 			tail = r->items[next_cidx];
 			MPASS(tail->m_nextpkt == NULL);
 			ETHER_BPF_MTAP(ifp, m0);
 			ETHER_BPF_MTAP(ifp, tail);
 			m0->m_nextpkt = tail;
 
 			if (__predict_false(++next_cidx == r->size))
 				next_cidx = 0;
 
 			while (next_cidx != pidx) {
 				if (add_to_txpkts(r->items[next_cidx], &txp,
 				    available) != 0)
 					break;
 				tail->m_nextpkt = r->items[next_cidx];
 				tail = tail->m_nextpkt;
 				ETHER_BPF_MTAP(ifp, tail);
 				if (__predict_false(++next_cidx == r->size))
 					next_cidx = 0;
 			}
 
 			n = write_txpkts_wr(txq, wr, m0, &txp, available);
 			total += txp.npkt;
 			remaining -= txp.npkt;
 		} else {
 			total++;
 			remaining--;
 			ETHER_BPF_MTAP(ifp, m0);
 			n = write_txpkt_wr(txq, (void *)wr, m0, available);
 		}
 		MPASS(n >= 1 && n <= available && n <= SGE_MAX_WR_NDESC);
 
 		available -= n;
 		dbdiff += n;
 		IDXINCR(eq->pidx, n, eq->sidx);
 
 		if (total_available_tx_desc(eq) < eq->sidx / 4 &&
 		    atomic_cmpset_int(&eq->equiq, 0, 1)) {
 			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUIQ |
 			    F_FW_WR_EQUEQ);
 			eq->equeqidx = eq->pidx;
 		} else if (IDXDIFF(eq->pidx, eq->equeqidx, eq->sidx) >= 32) {
 			wr->equiq_to_len16 |= htobe32(F_FW_WR_EQUEQ);
 			eq->equeqidx = eq->pidx;
 		}
 
 		if (dbdiff >= 16 && remaining >= 4) {
 			ring_eq_db(sc, eq, dbdiff);
 			available += reclaim_tx_descs(txq, 4 * dbdiff);
 			dbdiff = 0;
 		}
 
 		cidx = next_cidx;
 	}
 	if (dbdiff != 0) {
 		ring_eq_db(sc, eq, dbdiff);
 		reclaim_tx_descs(txq, 32);
 	}
 done:
 	TXQ_UNLOCK(txq);
 
 	return (total);
 }
 
 static inline void
 init_iq(struct sge_iq *iq, struct adapter *sc, int tmr_idx, int pktc_idx,
     int qsize)
 {
 
 	KASSERT(tmr_idx >= 0 && tmr_idx < SGE_NTIMERS,
 	    ("%s: bad tmr_idx %d", __func__, tmr_idx));
 	KASSERT(pktc_idx < SGE_NCOUNTERS,	/* -ve is ok, means don't use */
 	    ("%s: bad pktc_idx %d", __func__, pktc_idx));
 
 	iq->flags = 0;
 	iq->adapter = sc;
 	iq->intr_params = V_QINTR_TIMER_IDX(tmr_idx);
 	iq->intr_pktc_idx = SGE_NCOUNTERS - 1;
 	if (pktc_idx >= 0) {
 		iq->intr_params |= F_QINTR_CNT_EN;
 		iq->intr_pktc_idx = pktc_idx;
 	}
 	iq->qsize = roundup2(qsize, 16);	/* See FW_IQ_CMD/iqsize */
 	iq->sidx = iq->qsize - sc->params.sge.spg_len / IQ_ESIZE;
 }
 
 static inline void
 init_fl(struct adapter *sc, struct sge_fl *fl, int qsize, int maxp, char *name)
 {
 
 	fl->qsize = qsize;
 	fl->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 	strlcpy(fl->lockname, name, sizeof(fl->lockname));
 	if (sc->flags & BUF_PACKING_OK &&
 	    ((!is_t4(sc) && buffer_packing) ||	/* T5+: enabled unless 0 */
 	    (is_t4(sc) && buffer_packing == 1)))/* T4: disabled unless 1 */
 		fl->flags |= FL_BUF_PACKING;
 	find_best_refill_source(sc, fl, maxp);
 	find_safe_refill_source(sc, fl);
 }
 
 static inline void
 init_eq(struct adapter *sc, struct sge_eq *eq, int eqtype, int qsize,
     uint8_t tx_chan, uint16_t iqid, char *name)
 {
 	KASSERT(eqtype <= EQ_TYPEMASK, ("%s: bad qtype %d", __func__, eqtype));
 
 	eq->flags = eqtype & EQ_TYPEMASK;
 	eq->tx_chan = tx_chan;
 	eq->iqid = iqid;
 	eq->sidx = qsize - sc->params.sge.spg_len / EQ_ESIZE;
 	strlcpy(eq->lockname, name, sizeof(eq->lockname));
 }
 
 static int
 alloc_ring(struct adapter *sc, size_t len, bus_dma_tag_t *tag,
     bus_dmamap_t *map, bus_addr_t *pa, void **va)
 {
 	int rc;
 
 	rc = bus_dma_tag_create(sc->dmat, 512, 0, BUS_SPACE_MAXADDR,
 	    BUS_SPACE_MAXADDR, NULL, NULL, len, 1, len, 0, NULL, NULL, tag);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot allocate DMA tag: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_dmamem_alloc(*tag, va,
 	    BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, map);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot allocate DMA memory: %d\n", rc);
 		goto done;
 	}
 
 	rc = bus_dmamap_load(*tag, *map, *va, len, oneseg_dma_callback, pa, 0);
 	if (rc != 0) {
 		device_printf(sc->dev, "cannot load DMA map: %d\n", rc);
 		goto done;
 	}
 done:
 	if (rc)
 		free_ring(sc, *tag, *map, *pa, *va);
 
 	return (rc);
 }
 
 static int
 free_ring(struct adapter *sc, bus_dma_tag_t tag, bus_dmamap_t map,
     bus_addr_t pa, void *va)
 {
 	if (pa)
 		bus_dmamap_unload(tag, map);
 	if (va)
 		bus_dmamem_free(tag, va, map);
 	if (tag)
 		bus_dma_tag_destroy(tag);
 
 	return (0);
 }
 
 /*
  * Allocates the ring for an ingress queue and an optional freelist.  If the
  * freelist is specified it will be allocated and then associated with the
  * ingress queue.
  *
  * Returns errno on failure.  Resources allocated up to that point may still be
  * allocated.  Caller is responsible for cleanup in case this function fails.
  *
  * If the ingress queue will take interrupts directly (iq->flags & IQ_INTR) then
  * the intr_idx specifies the vector, starting from 0.  Otherwise it specifies
  * the abs_id of the ingress queue to which its interrupts should be forwarded.
  */
 static int
 alloc_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl,
     int intr_idx, int cong)
 {
 	int rc, i, cntxt_id;
 	size_t len;
 	struct fw_iq_cmd c;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = iq->adapter;
 	struct sge_params *sp = &sc->params.sge;
 	__be32 v = 0;
 
 	len = iq->qsize * IQ_ESIZE;
 	rc = alloc_ring(sc, len, &iq->desc_tag, &iq->desc_map, &iq->ba,
 	    (void **)&iq->desc);
 	if (rc != 0)
 		return (rc);
 
 	bzero(&c, sizeof(c));
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_IQ_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_IQ_CMD_PFN(sc->pf) |
 	    V_FW_IQ_CMD_VFN(0));
 
 	c.alloc_to_len16 = htobe32(F_FW_IQ_CMD_ALLOC | F_FW_IQ_CMD_IQSTART |
 	    FW_LEN16(c));
 
 	/* Special handling for firmware event queue */
 	if (iq == &sc->sge.fwq)
 		v |= F_FW_IQ_CMD_IQASYNCH;
 
 	if (iq->flags & IQ_INTR) {
 		KASSERT(intr_idx < sc->intr_count,
 		    ("%s: invalid direct intr_idx %d", __func__, intr_idx));
 	} else
 		v |= F_FW_IQ_CMD_IQANDST;
 	v |= V_FW_IQ_CMD_IQANDSTINDEX(intr_idx);
 
 	c.type_to_iqandstindex = htobe32(v |
 	    V_FW_IQ_CMD_TYPE(FW_IQ_TYPE_FL_INT_CAP) |
 	    V_FW_IQ_CMD_VIID(vi->viid) |
 	    V_FW_IQ_CMD_IQANUD(X_UPDATEDELIVERY_INTERRUPT));
 	c.iqdroprss_to_iqesize = htobe16(V_FW_IQ_CMD_IQPCIECH(pi->tx_chan) |
 	    F_FW_IQ_CMD_IQGTSMODE |
 	    V_FW_IQ_CMD_IQINTCNTTHRESH(iq->intr_pktc_idx) |
 	    V_FW_IQ_CMD_IQESIZE(ilog2(IQ_ESIZE) - 4));
 	c.iqsize = htobe16(iq->qsize);
 	c.iqaddr = htobe64(iq->ba);
 	if (cong >= 0)
 		c.iqns_to_fl0congen = htobe32(F_FW_IQ_CMD_IQFLINTCONGEN);
 
 	if (fl) {
 		mtx_init(&fl->fl_lock, fl->lockname, NULL, MTX_DEF);
 
 		len = fl->qsize * EQ_ESIZE;
 		rc = alloc_ring(sc, len, &fl->desc_tag, &fl->desc_map,
 		    &fl->ba, (void **)&fl->desc);
 		if (rc)
 			return (rc);
 
 		/* Allocate space for one software descriptor per buffer. */
 		rc = alloc_fl_sdesc(fl);
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to setup fl software descriptors: %d\n",
 			    rc);
 			return (rc);
 		}
 
 		if (fl->flags & FL_BUF_PACKING) {
 			fl->lowat = roundup2(sp->fl_starve_threshold2, 8);
 			fl->buf_boundary = sp->pack_boundary;
 		} else {
 			fl->lowat = roundup2(sp->fl_starve_threshold, 8);
 			fl->buf_boundary = 16;
 		}
 		if (fl_pad && fl->buf_boundary < sp->pad_boundary)
 			fl->buf_boundary = sp->pad_boundary;
 
 		c.iqns_to_fl0congen |=
 		    htobe32(V_FW_IQ_CMD_FL0HOSTFCMODE(X_HOSTFCMODE_NONE) |
 			F_FW_IQ_CMD_FL0FETCHRO | F_FW_IQ_CMD_FL0DATARO |
 			(fl_pad ? F_FW_IQ_CMD_FL0PADEN : 0) |
 			(fl->flags & FL_BUF_PACKING ? F_FW_IQ_CMD_FL0PACKEN :
 			    0));
 		if (cong >= 0) {
 			c.iqns_to_fl0congen |=
 				htobe32(V_FW_IQ_CMD_FL0CNGCHMAP(cong) |
 				    F_FW_IQ_CMD_FL0CONGCIF |
 				    F_FW_IQ_CMD_FL0CONGEN);
 		}
 		c.fl0dcaen_to_fl0cidxfthresh =
 		    htobe16(V_FW_IQ_CMD_FL0FBMIN(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMIN_128B : X_FETCHBURSTMIN_64B) |
 			V_FW_IQ_CMD_FL0FBMAX(chip_id(sc) <= CHELSIO_T5 ?
 			X_FETCHBURSTMAX_512B : X_FETCHBURSTMAX_256B));
 		c.fl0size = htobe16(fl->qsize);
 		c.fl0addr = htobe64(fl->ba);
 	}
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create ingress queue: %d\n", rc);
 		return (rc);
 	}
 
 	iq->cidx = 0;
 	iq->gen = F_RSPD_GEN;
 	iq->intr_next = iq->intr_params;
 	iq->cntxt_id = be16toh(c.iqid);
 	iq->abs_id = be16toh(c.physiqid);
 	iq->flags |= IQ_ALLOCATED;
 
 	cntxt_id = iq->cntxt_id - sc->sge.iq_start;
 	if (cntxt_id >= sc->sge.niq) {
 		panic ("%s: iq->cntxt_id (%d) more than the max (%d)", __func__,
 		    cntxt_id, sc->sge.niq - 1);
 	}
 	sc->sge.iqmap[cntxt_id] = iq;
 
 	if (fl) {
 		u_int qid;
 
 		iq->flags |= IQ_HAS_FL;
 		fl->cntxt_id = be16toh(c.fl0id);
 		fl->pidx = fl->cidx = 0;
 
 		cntxt_id = fl->cntxt_id - sc->sge.eq_start;
 		if (cntxt_id >= sc->sge.neq) {
 			panic("%s: fl->cntxt_id (%d) more than the max (%d)",
 			    __func__, cntxt_id, sc->sge.neq - 1);
 		}
 		sc->sge.eqmap[cntxt_id] = (void *)fl;
 
 		qid = fl->cntxt_id;
 		if (isset(&sc->doorbells, DOORBELL_UDB)) {
 			uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 			uint32_t mask = (1 << s_qpp) - 1;
 			volatile uint8_t *udb;
 
 			udb = sc->udbs_base + UDBS_DB_OFFSET;
 			udb += (qid >> s_qpp) << PAGE_SHIFT;
 			qid &= mask;
 			if (qid < PAGE_SIZE / UDBS_SEG_SIZE) {
 				udb += qid << UDBS_SEG_SHIFT;
 				qid = 0;
 			}
 			fl->udb = (volatile void *)udb;
 		}
 		fl->dbval = V_QID(qid) | sc->chip_params->sge_fl_db;
 
 		FL_LOCK(fl);
 		/* Enough to make sure the SGE doesn't think it's starved */
 		refill_fl(sc, fl, fl->lowat);
 		FL_UNLOCK(fl);
 	}
 
 	if (chip_id(sc) >= CHELSIO_T5 && !(sc->flags & IS_VF) && cong >= 0) {
 		uint32_t param, val;
 
 		param = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 		    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_CONM_CTXT) |
 		    V_FW_PARAMS_PARAM_YZ(iq->cntxt_id);
 		if (cong == 0)
 			val = 1 << 19;
 		else {
 			val = 2 << 19;
 			for (i = 0; i < 4; i++) {
 				if (cong & (1 << i))
 					val |= 1 << (i << 2);
 			}
 		}
 
 		rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &param, &val);
 		if (rc != 0) {
 			/* report error but carry on */
 			device_printf(sc->dev,
 			    "failed to set congestion manager context for "
 			    "ingress queue %d: %d\n", iq->cntxt_id, rc);
 		}
 	}
 
 	/* Enable IQ interrupts */
 	atomic_store_rel_int(&iq->state, IQS_IDLE);
 	t4_write_reg(sc, sc->sge_gts_reg, V_SEINTARM(iq->intr_params) |
 	    V_INGRESSQID(iq->cntxt_id));
 
 	return (0);
 }
 
 static int
 free_iq_fl(struct vi_info *vi, struct sge_iq *iq, struct sge_fl *fl)
 {
 	int rc;
 	struct adapter *sc = iq->adapter;
 	device_t dev;
 
 	if (sc == NULL)
 		return (0);	/* nothing to do */
 
 	dev = vi ? vi->dev : sc->dev;
 
 	if (iq->flags & IQ_ALLOCATED) {
 		rc = -t4_iq_free(sc, sc->mbox, sc->pf, 0,
 		    FW_IQ_TYPE_FL_INT_CAP, iq->cntxt_id,
 		    fl ? fl->cntxt_id : 0xffff, 0xffff);
 		if (rc != 0) {
 			device_printf(dev,
 			    "failed to free queue %p: %d\n", iq, rc);
 			return (rc);
 		}
 		iq->flags &= ~IQ_ALLOCATED;
 	}
 
 	free_ring(sc, iq->desc_tag, iq->desc_map, iq->ba, iq->desc);
 
 	bzero(iq, sizeof(*iq));
 
 	if (fl) {
 		free_ring(sc, fl->desc_tag, fl->desc_map, fl->ba,
 		    fl->desc);
 
 		if (fl->sdesc)
 			free_fl_sdesc(sc, fl);
 
 		if (mtx_initialized(&fl->fl_lock))
 			mtx_destroy(&fl->fl_lock);
 
 		bzero(fl, sizeof(*fl));
 	}
 
 	return (0);
 }
 
 static void
-add_fl_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *oid,
-    struct sge_fl *fl)
+add_fl_sysctls(struct adapter *sc, struct sysctl_ctx_list *ctx,
+    struct sysctl_oid *oid, struct sge_fl *fl)
 {
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
 	    "freelist");
 	children = SYSCTL_CHILDREN(oid);
 
+	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
+	    &fl->ba, "bus address of descriptor ring");
+	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
+	    fl->sidx * EQ_ESIZE + sc->params.sge.spg_len,
+	    "desc ring size in bytes");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &fl->cntxt_id, 0, sysctl_uint16, "I",
 	    "SGE context id of the freelist");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "padding", CTLFLAG_RD, NULL,
 	    fl_pad ? 1 : 0, "padding enabled");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "packing", CTLFLAG_RD, NULL,
 	    fl->flags & FL_BUF_PACKING ? 1 : 0, "packing enabled");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD, &fl->cidx,
 	    0, "consumer index");
 	if (fl->flags & FL_BUF_PACKING) {
 		SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_offset",
 		    CTLFLAG_RD, &fl->rx_offset, 0, "packing rx offset");
 	}
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD, &fl->pidx,
 	    0, "producer index");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_allocated",
 	    CTLFLAG_RD, &fl->mbuf_allocated, "# of mbuf allocated");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "mbuf_inlined",
 	    CTLFLAG_RD, &fl->mbuf_inlined, "# of mbuf inlined in clusters");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_allocated",
 	    CTLFLAG_RD, &fl->cl_allocated, "# of clusters allocated");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_recycled",
 	    CTLFLAG_RD, &fl->cl_recycled, "# of clusters recycled");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "cluster_fast_recycled",
 	    CTLFLAG_RD, &fl->cl_fast_recycled, "# of clusters recycled (fast)");
 }
 
 static int
 alloc_fwq(struct adapter *sc)
 {
 	int rc, intr_idx;
 	struct sge_iq *fwq = &sc->sge.fwq;
 	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	init_iq(fwq, sc, 0, 0, FW_IQ_QSIZE);
 	fwq->flags |= IQ_INTR;	/* always */
 	if (sc->flags & IS_VF)
 		intr_idx = 0;
 	else {
 		intr_idx = sc->intr_count > 1 ? 1 : 0;
 		fwq->set_tcb_rpl = t4_filter_rpl;
 		fwq->l2t_write_rpl = do_l2t_write_rpl;
 	}
 	rc = alloc_iq_fl(&sc->port[0]->vi[0], fwq, NULL, intr_idx, -1);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create firmware event queue: %d\n", rc);
 		return (rc);
 	}
 
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "fwq", CTLFLAG_RD,
 	    NULL, "firmware event queue");
 	children = SYSCTL_CHILDREN(oid);
 
+	SYSCTL_ADD_UAUTO(&sc->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
+	    &fwq->ba, "bus address of descriptor ring");
+	SYSCTL_ADD_INT(&sc->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
+	    fwq->qsize * IQ_ESIZE, "descriptor ring size in bytes");
 	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &fwq->abs_id, 0, sysctl_uint16, "I",
 	    "absolute id of the queue");
 	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cntxt_id, 0, sysctl_uint16, "I",
 	    "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&sc->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &fwq->cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 
 	return (0);
 }
 
 static int
 free_fwq(struct adapter *sc)
 {
 	return free_iq_fl(NULL, &sc->sge.fwq, NULL);
 }
 
 static int
 alloc_mgmtq(struct adapter *sc)
 {
 	int rc;
 	struct sge_wrq *mgmtq = &sc->sge.mgmtq;
 	char name[16];
 	struct sysctl_oid *oid = device_get_sysctl_tree(sc->dev);
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	oid = SYSCTL_ADD_NODE(&sc->ctx, children, OID_AUTO, "mgmtq", CTLFLAG_RD,
 	    NULL, "management queue");
 
 	snprintf(name, sizeof(name), "%s mgmtq", device_get_nameunit(sc->dev));
 	init_eq(sc, &mgmtq->eq, EQ_CTRL, CTRL_EQ_QSIZE, sc->port[0]->tx_chan,
 	    sc->sge.fwq.cntxt_id, name);
 	rc = alloc_wrq(sc, NULL, mgmtq, oid);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create management queue: %d\n", rc);
 		return (rc);
 	}
 
 	return (0);
 }
 
 static int
 free_mgmtq(struct adapter *sc)
 {
 
 	return free_wrq(sc, &sc->sge.mgmtq);
 }
 
 int
 tnl_cong(struct port_info *pi, int drop)
 {
 
 	if (drop == -1)
 		return (-1);
 	else if (drop == 1)
 		return (0);
 	else
 		return (pi->rx_chan_map);
 }
 
 static int
 alloc_rxq(struct vi_info *vi, struct sge_rxq *rxq, int intr_idx, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sysctl_oid_list *children;
 	char name[16];
 
 	rc = alloc_iq_fl(vi, &rxq->iq, &rxq->fl, intr_idx,
 	    tnl_cong(vi->pi, cong_drop));
 	if (rc != 0)
 		return (rc);
 
 	if (idx == 0)
 		sc->sge.iq_base = rxq->iq.abs_id - rxq->iq.cntxt_id;
 	else
 		KASSERT(rxq->iq.cntxt_id + sc->sge.iq_base == rxq->iq.abs_id,
 		    ("iq_base mismatch"));
 	KASSERT(sc->sge.iq_base == 0 || sc->flags & IS_VF,
 	    ("PF with non-zero iq_base"));
 
 	/*
 	 * The freelist is just barely above the starvation threshold right now,
 	 * fill it up a bit more.
 	 */
 	FL_LOCK(&rxq->fl);
 	refill_fl(sc, &rxq->fl, 128);
 	FL_UNLOCK(&rxq->fl);
 
 #if defined(INET) || defined(INET6)
 	rc = tcp_lro_init(&rxq->lro);
 	if (rc != 0)
 		return (rc);
 	rxq->lro.ifp = vi->ifp; /* also indicates LRO init'ed */
 
 	if (vi->ifp->if_capenable & IFCAP_LRO)
 		rxq->iq.flags |= IQ_LRO_ENABLED;
 #endif
 	rxq->ifp = vi->ifp;
 
 	children = SYSCTL_CHILDREN(oid);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "rx queue");
 	children = SYSCTL_CHILDREN(oid);
 
+	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
+	    &rxq->iq.ba, "bus address of descriptor ring");
+	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
+	    rxq->iq.qsize * IQ_ESIZE, "descriptor ring size in bytes");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.abs_id, 0, sysctl_uint16, "I",
 	    "absolute id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cntxt_id, 0, sysctl_uint16, "I",
 	    "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &rxq->iq.cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 #if defined(INET) || defined(INET6)
 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_queued", CTLFLAG_RD,
 	    &rxq->lro.lro_queued, 0, NULL);
 	SYSCTL_ADD_U64(&vi->ctx, children, OID_AUTO, "lro_flushed", CTLFLAG_RD,
 	    &rxq->lro.lro_flushed, 0, NULL);
 #endif
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "rxcsum", CTLFLAG_RD,
 	    &rxq->rxcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
 	    CTLFLAG_RD, &rxq->vlan_extraction,
 	    "# of times hardware extracted 802.1Q tag");
 
-	add_fl_sysctls(&vi->ctx, oid, &rxq->fl);
+	add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl);
 
 	return (rc);
 }
 
 static int
 free_rxq(struct vi_info *vi, struct sge_rxq *rxq)
 {
 	int rc;
 
 #if defined(INET) || defined(INET6)
 	if (rxq->lro.ifp) {
 		tcp_lro_free(&rxq->lro);
 		rxq->lro.ifp = NULL;
 	}
 #endif
 
 	rc = free_iq_fl(vi, &rxq->iq, &rxq->fl);
 	if (rc == 0)
 		bzero(rxq, sizeof(*rxq));
 
 	return (rc);
 }
 
 #ifdef TCP_OFFLOAD
 static int
 alloc_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq,
     int intr_idx, int idx, struct sysctl_oid *oid)
 {
+	struct port_info *pi = vi->pi;
 	int rc;
 	struct sysctl_oid_list *children;
 	char name[16];
 
 	rc = alloc_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl, intr_idx,
-	    vi->pi->rx_chan_map);
+	    pi->rx_chan_map);
 	if (rc != 0)
 		return (rc);
 
 	children = SYSCTL_CHILDREN(oid);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "rx queue");
 	children = SYSCTL_CHILDREN(oid);
 
+	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
+	    &ofld_rxq->iq.ba, "bus address of descriptor ring");
+	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
+	    ofld_rxq->iq.qsize * IQ_ESIZE, "descriptor ring size in bytes");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.abs_id, 0, sysctl_uint16,
 	    "I", "absolute id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cntxt_id, 0, sysctl_uint16,
 	    "I", "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &ofld_rxq->iq.cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 
-	add_fl_sysctls(&vi->ctx, oid, &ofld_rxq->fl);
+	add_fl_sysctls(pi->adapter, &vi->ctx, oid, &ofld_rxq->fl);
 
 	return (rc);
 }
 
 static int
 free_ofld_rxq(struct vi_info *vi, struct sge_ofld_rxq *ofld_rxq)
 {
 	int rc;
 
 	rc = free_iq_fl(vi, &ofld_rxq->iq, &ofld_rxq->fl);
 	if (rc == 0)
 		bzero(ofld_rxq, sizeof(*ofld_rxq));
 
 	return (rc);
 }
 #endif
 
 #ifdef DEV_NETMAP
 static int
 alloc_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq, int intr_idx,
     int idx, struct sysctl_oid *oid)
 {
 	int rc;
 	struct sysctl_oid_list *children;
 	struct sysctl_ctx_list *ctx;
 	char name[16];
 	size_t len;
 	struct adapter *sc = vi->pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 
 	MPASS(na != NULL);
 
 	len = vi->qsize_rxq * IQ_ESIZE;
 	rc = alloc_ring(sc, len, &nm_rxq->iq_desc_tag, &nm_rxq->iq_desc_map,
 	    &nm_rxq->iq_ba, (void **)&nm_rxq->iq_desc);
 	if (rc != 0)
 		return (rc);
 
 	len = na->num_rx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	rc = alloc_ring(sc, len, &nm_rxq->fl_desc_tag, &nm_rxq->fl_desc_map,
 	    &nm_rxq->fl_ba, (void **)&nm_rxq->fl_desc);
 	if (rc != 0)
 		return (rc);
 
 	nm_rxq->vi = vi;
 	nm_rxq->nid = idx;
 	nm_rxq->iq_cidx = 0;
 	nm_rxq->iq_sidx = vi->qsize_rxq - sc->params.sge.spg_len / IQ_ESIZE;
 	nm_rxq->iq_gen = F_RSPD_GEN;
 	nm_rxq->fl_pidx = nm_rxq->fl_cidx = 0;
 	nm_rxq->fl_sidx = na->num_rx_desc;
 	nm_rxq->intr_idx = intr_idx;
 
 	ctx = &vi->ctx;
 	children = SYSCTL_CHILDREN(oid);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name, CTLFLAG_RD, NULL,
 	    "rx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "abs_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_abs_id, 0, sysctl_uint16,
 	    "I", "absolute id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cntxt_id, 0, sysctl_uint16,
 	    "I", "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->iq_cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 
 	children = SYSCTL_CHILDREN(oid);
 	oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fl", CTLFLAG_RD, NULL,
 	    "freelist");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cntxt_id",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_rxq->fl_cntxt_id, 0, sysctl_uint16,
 	    "I", "SGE context id of the freelist");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cidx", CTLFLAG_RD,
 	    &nm_rxq->fl_cidx, 0, "consumer index");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "pidx", CTLFLAG_RD,
 	    &nm_rxq->fl_pidx, 0, "producer index");
 
 	return (rc);
 }
 
 
 static int
 free_nm_rxq(struct vi_info *vi, struct sge_nm_rxq *nm_rxq)
 {
 	struct adapter *sc = vi->pi->adapter;
 
 	free_ring(sc, nm_rxq->iq_desc_tag, nm_rxq->iq_desc_map, nm_rxq->iq_ba,
 	    nm_rxq->iq_desc);
 	free_ring(sc, nm_rxq->fl_desc_tag, nm_rxq->fl_desc_map, nm_rxq->fl_ba,
 	    nm_rxq->fl_desc);
 
 	return (0);
 }
 
 static int
 alloc_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq, int iqidx, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	size_t len;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct netmap_adapter *na = NA(vi->ifp);
 	char name[16];
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	len = na->num_tx_desc * EQ_ESIZE + sc->params.sge.spg_len;
 	rc = alloc_ring(sc, len, &nm_txq->desc_tag, &nm_txq->desc_map,
 	    &nm_txq->ba, (void **)&nm_txq->desc);
 	if (rc)
 		return (rc);
 
 	nm_txq->pidx = nm_txq->cidx = 0;
 	nm_txq->sidx = na->num_tx_desc;
 	nm_txq->nid = idx;
 	nm_txq->iqidx = iqidx;
 	nm_txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
 	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) |
 	    V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) |
 	    V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid)));
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "netmap tx queue");
 	children = SYSCTL_CHILDREN(oid);
 
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &nm_txq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &nm_txq->pidx, 0, sysctl_uint16, "I",
 	    "producer index");
 
 	return (rc);
 }
 
 static int
 free_nm_txq(struct vi_info *vi, struct sge_nm_txq *nm_txq)
 {
 	struct adapter *sc = vi->pi->adapter;
 
 	free_ring(sc, nm_txq->desc_tag, nm_txq->desc_map, nm_txq->ba,
 	    nm_txq->desc);
 
 	return (0);
 }
 #endif
 
 static int
 ctrl_eq_alloc(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ctrl_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_CTRL_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_CTRL_CMD_PFN(sc->pf) |
 	    V_FW_EQ_CTRL_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_CTRL_CMD_ALLOC |
 	    F_FW_EQ_CTRL_CMD_EQSTART | FW_LEN16(c));
 	c.cmpliqid_eqid = htonl(V_FW_EQ_CTRL_CMD_CMPLIQID(eq->iqid));
 	c.physeqid_pkd = htobe32(0);
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_CTRL_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		V_FW_EQ_CTRL_CMD_PCIECHN(eq->tx_chan) |
 		F_FW_EQ_CTRL_CMD_FETCHRO | V_FW_EQ_CTRL_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_CTRL_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 		V_FW_EQ_CTRL_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_CTRL_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to create control queue %d: %d\n", eq->tx_chan, rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_CTRL_CMD_EQID(be32toh(c.cmpliqid_eqid));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 static int
 eth_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_eth_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htobe32(V_FW_CMD_OP(FW_EQ_ETH_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_ETH_CMD_PFN(sc->pf) |
 	    V_FW_EQ_ETH_CMD_VFN(0));
 	c.alloc_to_len16 = htobe32(F_FW_EQ_ETH_CMD_ALLOC |
 	    F_FW_EQ_ETH_CMD_EQSTART | FW_LEN16(c));
 	c.autoequiqe_to_viid = htobe32(F_FW_EQ_ETH_CMD_AUTOEQUIQE |
 	    F_FW_EQ_ETH_CMD_AUTOEQUEQE | V_FW_EQ_ETH_CMD_VIID(vi->viid));
 	c.fetchszm_to_iqid =
 	    htobe32(V_FW_EQ_ETH_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		V_FW_EQ_ETH_CMD_PCIECHN(eq->tx_chan) | F_FW_EQ_ETH_CMD_FETCHRO |
 		V_FW_EQ_ETH_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize = htobe32(V_FW_EQ_ETH_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 	    V_FW_EQ_ETH_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 	    V_FW_EQ_ETH_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create Ethernet egress queue: %d\n", rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_ETH_CMD_EQID(be32toh(c.eqid_pkd));
 	eq->abs_id = G_FW_EQ_ETH_CMD_PHYSEQID(be32toh(c.physeqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 
 #ifdef TCP_OFFLOAD
 static int
 ofld_eq_alloc(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, cntxt_id;
 	struct fw_eq_ofld_cmd c;
 	int qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 
 	bzero(&c, sizeof(c));
 
 	c.op_to_vfn = htonl(V_FW_CMD_OP(FW_EQ_OFLD_CMD) | F_FW_CMD_REQUEST |
 	    F_FW_CMD_WRITE | F_FW_CMD_EXEC | V_FW_EQ_OFLD_CMD_PFN(sc->pf) |
 	    V_FW_EQ_OFLD_CMD_VFN(0));
 	c.alloc_to_len16 = htonl(F_FW_EQ_OFLD_CMD_ALLOC |
 	    F_FW_EQ_OFLD_CMD_EQSTART | FW_LEN16(c));
 	c.fetchszm_to_iqid =
 		htonl(V_FW_EQ_OFLD_CMD_HOSTFCMODE(X_HOSTFCMODE_NONE) |
 		    V_FW_EQ_OFLD_CMD_PCIECHN(eq->tx_chan) |
 		    F_FW_EQ_OFLD_CMD_FETCHRO | V_FW_EQ_OFLD_CMD_IQID(eq->iqid));
 	c.dcaen_to_eqsize =
 	    htobe32(V_FW_EQ_OFLD_CMD_FBMIN(X_FETCHBURSTMIN_64B) |
 		V_FW_EQ_OFLD_CMD_FBMAX(X_FETCHBURSTMAX_512B) |
 		V_FW_EQ_OFLD_CMD_EQSIZE(qsize));
 	c.eqaddr = htobe64(eq->ba);
 
 	rc = -t4_wr_mbox(sc, sc->mbox, &c, sizeof(c), &c);
 	if (rc != 0) {
 		device_printf(vi->dev,
 		    "failed to create egress queue for TCP offload: %d\n", rc);
 		return (rc);
 	}
 	eq->flags |= EQ_ALLOCATED;
 
 	eq->cntxt_id = G_FW_EQ_OFLD_CMD_EQID(be32toh(c.eqid_pkd));
 	cntxt_id = eq->cntxt_id - sc->sge.eq_start;
 	if (cntxt_id >= sc->sge.neq)
 	    panic("%s: eq->cntxt_id (%d) more than the max (%d)", __func__,
 		cntxt_id, sc->sge.neq - 1);
 	sc->sge.eqmap[cntxt_id] = eq;
 
 	return (rc);
 }
 #endif
 
 static int
 alloc_eq(struct adapter *sc, struct vi_info *vi, struct sge_eq *eq)
 {
 	int rc, qsize;
 	size_t len;
 
 	mtx_init(&eq->eq_lock, eq->lockname, NULL, MTX_DEF);
 
 	qsize = eq->sidx + sc->params.sge.spg_len / EQ_ESIZE;
 	len = qsize * EQ_ESIZE;
 	rc = alloc_ring(sc, len, &eq->desc_tag, &eq->desc_map,
 	    &eq->ba, (void **)&eq->desc);
 	if (rc)
 		return (rc);
 
 	eq->pidx = eq->cidx = 0;
 	eq->equeqidx = eq->dbidx = 0;
 	eq->doorbells = sc->doorbells;
 
 	switch (eq->flags & EQ_TYPEMASK) {
 	case EQ_CTRL:
 		rc = ctrl_eq_alloc(sc, eq);
 		break;
 
 	case EQ_ETH:
 		rc = eth_eq_alloc(sc, vi, eq);
 		break;
 
 #ifdef TCP_OFFLOAD
 	case EQ_OFLD:
 		rc = ofld_eq_alloc(sc, vi, eq);
 		break;
 #endif
 
 	default:
 		panic("%s: invalid eq type %d.", __func__,
 		    eq->flags & EQ_TYPEMASK);
 	}
 	if (rc != 0) {
 		device_printf(sc->dev,
 		    "failed to allocate egress queue(%d): %d\n",
 		    eq->flags & EQ_TYPEMASK, rc);
 	}
 
 	if (isset(&eq->doorbells, DOORBELL_UDB) ||
 	    isset(&eq->doorbells, DOORBELL_UDBWC) ||
 	    isset(&eq->doorbells, DOORBELL_WCWR)) {
 		uint32_t s_qpp = sc->params.sge.eq_s_qpp;
 		uint32_t mask = (1 << s_qpp) - 1;
 		volatile uint8_t *udb;
 
 		udb = sc->udbs_base + UDBS_DB_OFFSET;
 		udb += (eq->cntxt_id >> s_qpp) << PAGE_SHIFT;	/* pg offset */
 		eq->udb_qid = eq->cntxt_id & mask;		/* id in page */
 		if (eq->udb_qid >= PAGE_SIZE / UDBS_SEG_SIZE)
 	    		clrbit(&eq->doorbells, DOORBELL_WCWR);
 		else {
 			udb += eq->udb_qid << UDBS_SEG_SHIFT;	/* seg offset */
 			eq->udb_qid = 0;
 		}
 		eq->udb = (volatile void *)udb;
 	}
 
 	return (rc);
 }
 
 static int
 free_eq(struct adapter *sc, struct sge_eq *eq)
 {
 	int rc;
 
 	if (eq->flags & EQ_ALLOCATED) {
 		switch (eq->flags & EQ_TYPEMASK) {
 		case EQ_CTRL:
 			rc = -t4_ctrl_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 
 		case EQ_ETH:
 			rc = -t4_eth_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 
 #ifdef TCP_OFFLOAD
 		case EQ_OFLD:
 			rc = -t4_ofld_eq_free(sc, sc->mbox, sc->pf, 0,
 			    eq->cntxt_id);
 			break;
 #endif
 
 		default:
 			panic("%s: invalid eq type %d.", __func__,
 			    eq->flags & EQ_TYPEMASK);
 		}
 		if (rc != 0) {
 			device_printf(sc->dev,
 			    "failed to free egress queue (%d): %d\n",
 			    eq->flags & EQ_TYPEMASK, rc);
 			return (rc);
 		}
 		eq->flags &= ~EQ_ALLOCATED;
 	}
 
 	free_ring(sc, eq->desc_tag, eq->desc_map, eq->ba, eq->desc);
 
 	if (mtx_initialized(&eq->eq_lock))
 		mtx_destroy(&eq->eq_lock);
 
 	bzero(eq, sizeof(*eq));
 	return (0);
 }
 
 static int
 alloc_wrq(struct adapter *sc, struct vi_info *vi, struct sge_wrq *wrq,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct sysctl_ctx_list *ctx = vi ? &vi->ctx : &sc->ctx;
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	rc = alloc_eq(sc, vi, &wrq->eq);
 	if (rc)
 		return (rc);
 
 	wrq->adapter = sc;
 	TASK_INIT(&wrq->wrq_tx_task, 0, wrq_tx_drain, wrq);
 	TAILQ_INIT(&wrq->incomplete_wrs);
 	STAILQ_INIT(&wrq->wr_list);
 	wrq->nwr_pending = 0;
 	wrq->ndesc_needed = 0;
 
+	SYSCTL_ADD_UAUTO(ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
+	    &wrq->eq.ba, "bus address of descriptor ring");
+	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
+	    wrq->eq.sidx * EQ_ESIZE + sc->params.sge.spg_len,
+	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &wrq->eq.cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 	SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &wrq->eq.pidx, 0, sysctl_uint16, "I",
 	    "producer index");
+	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
+	    wrq->eq.sidx, "status page index");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_direct", CTLFLAG_RD,
 	    &wrq->tx_wrs_direct, "# of work requests (direct)");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_copied", CTLFLAG_RD,
 	    &wrq->tx_wrs_copied, "# of work requests (copied)");
 	SYSCTL_ADD_UQUAD(ctx, children, OID_AUTO, "tx_wrs_sspace", CTLFLAG_RD,
 	    &wrq->tx_wrs_ss, "# of work requests (copied from scratch space)");
 
 	return (rc);
 }
 
 static int
 free_wrq(struct adapter *sc, struct sge_wrq *wrq)
 {
 	int rc;
 
 	rc = free_eq(sc, &wrq->eq);
 	if (rc)
 		return (rc);
 
 	bzero(wrq, sizeof(*wrq));
 	return (0);
 }
 
 static int
 alloc_txq(struct vi_info *vi, struct sge_txq *txq, int idx,
     struct sysctl_oid *oid)
 {
 	int rc;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct sge_eq *eq = &txq->eq;
 	char name[16];
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(oid);
 
 	rc = mp_ring_alloc(&txq->r, eq->sidx, txq, eth_tx, can_resume_eth_tx,
 	    M_CXGBE, M_WAITOK);
 	if (rc != 0) {
 		device_printf(sc->dev, "failed to allocate mp_ring: %d\n", rc);
 		return (rc);
 	}
 
 	rc = alloc_eq(sc, vi, eq);
 	if (rc != 0) {
 		mp_ring_free(txq->r);
 		txq->r = NULL;
 		return (rc);
 	}
 
 	/* Can't fail after this point. */
 
 	if (idx == 0)
 		sc->sge.eq_base = eq->abs_id - eq->cntxt_id;
 	else
 		KASSERT(eq->cntxt_id + sc->sge.eq_base == eq->abs_id,
 		    ("eq_base mismatch"));
 	KASSERT(sc->sge.eq_base == 0 || sc->flags & IS_VF,
 	    ("PF with non-zero eq_base"));
 
 	TASK_INIT(&txq->tx_reclaim_task, 0, tx_reclaim, eq);
 	txq->ifp = vi->ifp;
 	txq->gl = sglist_alloc(TX_SGL_SEGS, M_WAITOK);
 	if (sc->flags & IS_VF)
 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
 		    V_TXPKT_INTF(pi->tx_chan));
 	else
 		txq->cpl_ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT) |
 		    V_TXPKT_INTF(pi->tx_chan) |
 		    V_TXPKT_PF(G_FW_VIID_PFN(vi->viid)) |
 		    V_TXPKT_VF(G_FW_VIID_VIN(vi->viid)) |
 		    V_TXPKT_VF_VLD(G_FW_VIID_VIVLD(vi->viid)));
 	txq->tc_idx = -1;
 	txq->sdesc = malloc(eq->sidx * sizeof(struct tx_sdesc), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	snprintf(name, sizeof(name), "%d", idx);
 	oid = SYSCTL_ADD_NODE(&vi->ctx, children, OID_AUTO, name, CTLFLAG_RD,
 	    NULL, "tx queue");
 	children = SYSCTL_CHILDREN(oid);
 
+	SYSCTL_ADD_UAUTO(&vi->ctx, children, OID_AUTO, "ba", CTLFLAG_RD,
+	    &eq->ba, "bus address of descriptor ring");
+	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "dmalen", CTLFLAG_RD, NULL,
+	    eq->sidx * EQ_ESIZE + sc->params.sge.spg_len,
+	    "desc ring size in bytes");
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "abs_id", CTLFLAG_RD,
 	    &eq->abs_id, 0, "absolute id of the queue");
 	SYSCTL_ADD_UINT(&vi->ctx, children, OID_AUTO, "cntxt_id", CTLFLAG_RD,
 	    &eq->cntxt_id, 0, "SGE context id of the queue");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "cidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &eq->cidx, 0, sysctl_uint16, "I",
 	    "consumer index");
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "pidx",
 	    CTLTYPE_INT | CTLFLAG_RD, &eq->pidx, 0, sysctl_uint16, "I",
 	    "producer index");
+	SYSCTL_ADD_INT(&vi->ctx, children, OID_AUTO, "sidx", CTLFLAG_RD, NULL,
+	    eq->sidx, "status page index");
 
 	SYSCTL_ADD_PROC(&vi->ctx, children, OID_AUTO, "tc",
 	    CTLTYPE_INT | CTLFLAG_RW, vi, idx, sysctl_tc, "I",
 	    "traffic class (-1 means none)");
 
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txcsum", CTLFLAG_RD,
 	    &txq->txcsum, "# of times hardware assisted with checksum");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_insertion",
 	    CTLFLAG_RD, &txq->vlan_insertion,
 	    "# of times hardware inserted 802.1Q tag");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "tso_wrs", CTLFLAG_RD,
 	    &txq->tso_wrs, "# of TSO work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "imm_wrs", CTLFLAG_RD,
 	    &txq->imm_wrs, "# of work requests with immediate data");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "sgl_wrs", CTLFLAG_RD,
 	    &txq->sgl_wrs, "# of work requests with direct SGL");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkt_wrs", CTLFLAG_RD,
 	    &txq->txpkt_wrs, "# of txpkt work requests (one pkt/WR)");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_wrs",
 	    CTLFLAG_RD, &txq->txpkts0_wrs,
 	    "# of txpkts (type 0) work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_wrs",
 	    CTLFLAG_RD, &txq->txpkts1_wrs,
 	    "# of txpkts (type 1) work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts0_pkts",
 	    CTLFLAG_RD, &txq->txpkts0_pkts,
 	    "# of frames tx'd using type0 txpkts work requests");
 	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "txpkts1_pkts",
 	    CTLFLAG_RD, &txq->txpkts1_pkts,
 	    "# of frames tx'd using type1 txpkts work requests");
 
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_enqueues",
 	    CTLFLAG_RD, &txq->r->enqueues,
 	    "# of enqueues to the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_drops",
 	    CTLFLAG_RD, &txq->r->drops,
 	    "# of drops in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_starts",
 	    CTLFLAG_RD, &txq->r->starts,
 	    "# of normal consumer starts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_stalls",
 	    CTLFLAG_RD, &txq->r->stalls,
 	    "# of consumer stalls in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_restarts",
 	    CTLFLAG_RD, &txq->r->restarts,
 	    "# of consumer restarts in the mp_ring for this queue");
 	SYSCTL_ADD_COUNTER_U64(&vi->ctx, children, OID_AUTO, "r_abdications",
 	    CTLFLAG_RD, &txq->r->abdications,
 	    "# of consumer abdications in the mp_ring for this queue");
 
 	return (0);
 }
 
 static int
 free_txq(struct vi_info *vi, struct sge_txq *txq)
 {
 	int rc;
 	struct adapter *sc = vi->pi->adapter;
 	struct sge_eq *eq = &txq->eq;
 
 	rc = free_eq(sc, eq);
 	if (rc)
 		return (rc);
 
 	sglist_free(txq->gl);
 	free(txq->sdesc, M_CXGBE);
 	mp_ring_free(txq->r);
 
 	bzero(txq, sizeof(*txq));
 	return (0);
 }
 
 static void
 oneseg_dma_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error)
 {
 	bus_addr_t *ba = arg;
 
 	KASSERT(nseg == 1,
 	    ("%s meant for single segment mappings only.", __func__));
 
 	*ba = error ? 0 : segs->ds_addr;
 }
 
 static inline void
 ring_fl_db(struct adapter *sc, struct sge_fl *fl)
 {
 	uint32_t n, v;
 
 	n = IDXDIFF(fl->pidx / 8, fl->dbidx, fl->sidx);
 	MPASS(n > 0);
 
 	wmb();
 	v = fl->dbval | V_PIDX(n);
 	if (fl->udb)
 		*fl->udb = htole32(v);
 	else
 		t4_write_reg(sc, sc->sge_kdoorbell_reg, v);
 	IDXINCR(fl->dbidx, n, fl->sidx);
 }
 
 /*
  * Fills up the freelist by allocating up to 'n' buffers.  Buffers that are
  * recycled do not count towards this allocation budget.
  *
  * Returns non-zero to indicate that this freelist should be added to the list
  * of starving freelists.
  */
 static int
 refill_fl(struct adapter *sc, struct sge_fl *fl, int n)
 {
 	__be64 *d;
 	struct fl_sdesc *sd;
 	uintptr_t pa;
 	caddr_t cl;
 	struct cluster_layout *cll;
 	struct sw_zone_info *swz;
 	struct cluster_metadata *clm;
 	uint16_t max_pidx;
 	uint16_t hw_cidx = fl->hw_cidx;		/* stable snapshot */
 
 	FL_LOCK_ASSERT_OWNED(fl);
 
 	/*
 	 * We always stop at the beginning of the hardware descriptor that's just
 	 * before the one with the hw cidx.  This is to avoid hw pidx = hw cidx,
 	 * which would mean an empty freelist to the chip.
 	 */
 	max_pidx = __predict_false(hw_cidx == 0) ? fl->sidx - 1 : hw_cidx - 1;
 	if (fl->pidx == max_pidx * 8)
 		return (0);
 
 	d = &fl->desc[fl->pidx];
 	sd = &fl->sdesc[fl->pidx];
 	cll = &fl->cll_def;	/* default layout */
 	swz = &sc->sge.sw_zone_info[cll->zidx];
 
 	while (n > 0) {
 
 		if (sd->cl != NULL) {
 
 			if (sd->nmbuf == 0) {
 				/*
 				 * Fast recycle without involving any atomics on
 				 * the cluster's metadata (if the cluster has
 				 * metadata).  This happens when all frames
 				 * received in the cluster were small enough to
 				 * fit within a single mbuf each.
 				 */
 				fl->cl_fast_recycled++;
 #ifdef INVARIANTS
 				clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
 				if (clm != NULL)
 					MPASS(clm->refcount == 1);
 #endif
 				goto recycled_fast;
 			}
 
 			/*
 			 * Cluster is guaranteed to have metadata.  Clusters
 			 * without metadata always take the fast recycle path
 			 * when they're recycled.
 			 */
 			clm = cl_metadata(sc, fl, &sd->cll, sd->cl);
 			MPASS(clm != NULL);
 
 			if (atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 				fl->cl_recycled++;
 				counter_u64_add(extfree_rels, 1);
 				goto recycled;
 			}
 			sd->cl = NULL;	/* gave up my reference */
 		}
 		MPASS(sd->cl == NULL);
 alloc:
 		cl = uma_zalloc(swz->zone, M_NOWAIT);
 		if (__predict_false(cl == NULL)) {
 			if (cll == &fl->cll_alt || fl->cll_alt.zidx == -1 ||
 			    fl->cll_def.zidx == fl->cll_alt.zidx)
 				break;
 
 			/* fall back to the safe zone */
 			cll = &fl->cll_alt;
 			swz = &sc->sge.sw_zone_info[cll->zidx];
 			goto alloc;
 		}
 		fl->cl_allocated++;
 		n--;
 
 		pa = pmap_kextract((vm_offset_t)cl);
 		pa += cll->region1;
 		sd->cl = cl;
 		sd->cll = *cll;
 		*d = htobe64(pa | cll->hwidx);
 		clm = cl_metadata(sc, fl, cll, cl);
 		if (clm != NULL) {
 recycled:
 #ifdef INVARIANTS
 			clm->sd = sd;
 #endif
 			clm->refcount = 1;
 		}
 		sd->nmbuf = 0;
 recycled_fast:
 		d++;
 		sd++;
 		if (__predict_false(++fl->pidx % 8 == 0)) {
 			uint16_t pidx = fl->pidx / 8;
 
 			if (__predict_false(pidx == fl->sidx)) {
 				fl->pidx = 0;
 				pidx = 0;
 				sd = fl->sdesc;
 				d = fl->desc;
 			}
 			if (pidx == max_pidx)
 				break;
 
 			if (IDXDIFF(pidx, fl->dbidx, fl->sidx) >= 4)
 				ring_fl_db(sc, fl);
 		}
 	}
 
 	if (fl->pidx / 8 != fl->dbidx)
 		ring_fl_db(sc, fl);
 
 	return (FL_RUNNING_LOW(fl) && !(fl->flags & FL_STARVING));
 }
 
 /*
  * Attempt to refill all starving freelists.
  */
 static void
 refill_sfl(void *arg)
 {
 	struct adapter *sc = arg;
 	struct sge_fl *fl, *fl_temp;
 
 	mtx_assert(&sc->sfl_lock, MA_OWNED);
 	TAILQ_FOREACH_SAFE(fl, &sc->sfl, link, fl_temp) {
 		FL_LOCK(fl);
 		refill_fl(sc, fl, 64);
 		if (FL_NOT_RUNNING_LOW(fl) || fl->flags & FL_DOOMED) {
 			TAILQ_REMOVE(&sc->sfl, fl, link);
 			fl->flags &= ~FL_STARVING;
 		}
 		FL_UNLOCK(fl);
 	}
 
 	if (!TAILQ_EMPTY(&sc->sfl))
 		callout_schedule(&sc->sfl_callout, hz / 5);
 }
 
 static int
 alloc_fl_sdesc(struct sge_fl *fl)
 {
 
 	fl->sdesc = malloc(fl->sidx * 8 * sizeof(struct fl_sdesc), M_CXGBE,
 	    M_ZERO | M_WAITOK);
 
 	return (0);
 }
 
 static void
 free_fl_sdesc(struct adapter *sc, struct sge_fl *fl)
 {
 	struct fl_sdesc *sd;
 	struct cluster_metadata *clm;
 	struct cluster_layout *cll;
 	int i;
 
 	sd = fl->sdesc;
 	for (i = 0; i < fl->sidx * 8; i++, sd++) {
 		if (sd->cl == NULL)
 			continue;
 
 		cll = &sd->cll;
 		clm = cl_metadata(sc, fl, cll, sd->cl);
 		if (sd->nmbuf == 0)
 			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
 		else if (clm && atomic_fetchadd_int(&clm->refcount, -1) == 1) {
 			uma_zfree(sc->sge.sw_zone_info[cll->zidx].zone, sd->cl);
 			counter_u64_add(extfree_rels, 1);
 		}
 		sd->cl = NULL;
 	}
 
 	free(fl->sdesc, M_CXGBE);
 	fl->sdesc = NULL;
 }
 
 static inline void
 get_pkt_gl(struct mbuf *m, struct sglist *gl)
 {
 	int rc;
 
 	M_ASSERTPKTHDR(m);
 
 	sglist_reset(gl);
 	rc = sglist_append_mbuf(gl, m);
 	if (__predict_false(rc != 0)) {
 		panic("%s: mbuf %p (%d segs) was vetted earlier but now fails "
 		    "with %d.", __func__, m, mbuf_nsegs(m), rc);
 	}
 
 	KASSERT(gl->sg_nseg == mbuf_nsegs(m),
 	    ("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
 	    mbuf_nsegs(m), gl->sg_nseg));
 	KASSERT(gl->sg_nseg > 0 &&
 	    gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
 	    ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
 		gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
 }
 
 /*
  * len16 for a txpkt WR with a GL.  Includes the firmware work request header.
  */
 static inline u_int
 txpkt_len16(u_int nsegs, u_int tso)
 {
 	u_int n;
 
 	MPASS(nsegs > 0);
 
 	nsegs--; /* first segment is part of ulptx_sgl */
 	n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
 	if (tso)
 		n += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(n, 16));
 }
 
 /*
  * len16 for a txpkt_vm WR with a GL.  Includes the firmware work
  * request header.
  */
 static inline u_int
 txpkt_vm_len16(u_int nsegs, u_int tso)
 {
 	u_int n;
 
 	MPASS(nsegs > 0);
 
 	nsegs--; /* first segment is part of ulptx_sgl */
 	n = sizeof(struct fw_eth_tx_pkt_vm_wr) +
 	    sizeof(struct cpl_tx_pkt_core) +
 	    sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
 	if (tso)
 		n += sizeof(struct cpl_tx_pkt_lso_core);
 
 	return (howmany(n, 16));
 }
 
 /*
  * len16 for a txpkts type 0 WR with a GL.  Does not include the firmware work
  * request header.
  */
 static inline u_int
 txpkts0_len16(u_int nsegs)
 {
 	u_int n;
 
 	MPASS(nsegs > 0);
 
 	nsegs--; /* first segment is part of ulptx_sgl */
 	n = sizeof(struct ulp_txpkt) + sizeof(struct ulptx_idata) +
 	    sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) +
 	    8 * ((3 * nsegs) / 2 + (nsegs & 1));
 
 	return (howmany(n, 16));
 }
 
 /*
  * len16 for a txpkts type 1 WR with a GL.  Does not include the firmware work
  * request header.
  */
 static inline u_int
 txpkts1_len16(void)
 {
 	u_int n;
 
 	n = sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl);
 
 	return (howmany(n, 16));
 }
 
 static inline u_int
 imm_payload(u_int ndesc)
 {
 	u_int n;
 
 	n = ndesc * EQ_ESIZE - sizeof(struct fw_eth_tx_pkt_wr) -
 	    sizeof(struct cpl_tx_pkt_core);
 
 	return (n);
 }
 
 /*
  * Write a VM txpkt WR for this packet to the hardware descriptors, update the
  * software descriptor, and advance the pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkt_vm_wr(struct adapter *sc, struct sge_txq *txq,
     struct fw_eth_tx_pkt_vm_wr *wr, struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int csum_type, len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 	MPASS(available > 0 && available < eq->sidx);
 
 	len16 = mbuf_len16(m0);
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_VM_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3[0] = 0;
 	wr->r3[1] = 0;
 	
 	/*
 	 * Copy over ethmacdst, ethmacsrc, ethtype, and vlantci.
 	 * vlantci is ignored unless the ethtype is 0x8100, so it's
 	 * simpler to always copy it rather than making it
 	 * conditional.  Also, it seems that we do not have to set
 	 * vlantci or fake the ethtype when doing VLAN tag insertion.
 	 */
 	m_copydata(m0, 0, sizeof(struct ether_header) + 2, wr->ethmacdst);
 
 	csum_type = -1;
 	if (needs_tso(m0)) {
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 		    m0->m_pkthdr.l4hlen > 0,
 		    ("%s: mbuf %p needs TSO but missing header lengths",
 			__func__, m0));
 
 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
 			ctrl |= V_LSO_ETHHDR_LEN(1);
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			csum_type = TX_CSUM_TCPIP6;
 		else
 			csum_type = TX_CSUM_TCPIP;
 
 		cpl = (void *)(lso + 1);
 
 		txq->tso_wrs++;
 	} else {
 		if (m0->m_pkthdr.csum_flags & CSUM_IP_TCP)
 			csum_type = TX_CSUM_TCPIP;
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP_UDP)
 			csum_type = TX_CSUM_UDPIP;
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_TCP)
 			csum_type = TX_CSUM_TCPIP6;
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP6_UDP)
 			csum_type = TX_CSUM_UDPIP6;
 #if defined(INET)
 		else if (m0->m_pkthdr.csum_flags & CSUM_IP) {
 			/*
 			 * XXX: The firmware appears to stomp on the
 			 * fragment/flags field of the IP header when
 			 * using TX_CSUM_IP.  Fall back to doing
 			 * software checksums.
 			 */
 			u_short *sump;
 			struct mbuf *m;
 			int offset;
 
 			m = m0;
 			offset = 0;
 			sump = m_advance(&m, &offset, m0->m_pkthdr.l2hlen +
 			    offsetof(struct ip, ip_sum));
 			*sump = in_cksum_skip(m0, m0->m_pkthdr.l2hlen +
 			    m0->m_pkthdr.l3hlen, m0->m_pkthdr.l2hlen);
 			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 #endif
 
 		cpl = (void *)(wr + 1);
 	}
 
 	/* Checksum offload */
 	ctrl1 = 0;
 	if (needs_l3_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
 	if (csum_type >= 0) {
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0,
 	    ("%s: mbuf %p needs checksum offload but missing header lengths",
 			__func__, m0));
 
 		if (chip_id(sc) <= CHELSIO_T5) {
 			ctrl1 |= V_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
 			    ETHER_HDR_LEN);
 		} else {
 			ctrl1 |= V_T6_TXPKT_ETHHDR_LEN(m0->m_pkthdr.l2hlen -
 			    ETHER_HDR_LEN);
 		}
 		ctrl1 |= V_TXPKT_IPHDR_LEN(m0->m_pkthdr.l3hlen);
 		ctrl1 |= V_TXPKT_CSUM_TYPE(csum_type);
 	} else
 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
 	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD |
 		    V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 
 	/*
 	 * A packet using TSO will use up an entire descriptor for the
 	 * firmware work request header, LSO CPL, and TX_PKT_XT CPL.
 	 * If this descriptor is the last descriptor in the ring, wrap
 	 * around to the front of the ring explicitly for the start of
 	 * the sgl.
 	 */
 	if (dst == (void *)&eq->desc[eq->sidx]) {
 		dst = (void *)&eq->desc[0];
 		write_gl_to_txd(txq, m0, &dst, 0);
 	} else
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 	txq->sgl_wrs++;
 
 	txq->txpkt_wrs++;
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * Write a txpkt WR for this packet to the hardware descriptors, update the
  * software descriptor, and advance the pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkt_wr(struct sge_txq *txq, struct fw_eth_tx_pkt_wr *wr,
     struct mbuf *m0, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;	/* used in many unrelated places */
 	uint64_t ctrl1;
 	int len16, ndesc, pktlen, nsegs;
 	caddr_t dst;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	M_ASSERTPKTHDR(m0);
 	MPASS(available > 0 && available < eq->sidx);
 
 	len16 = mbuf_len16(m0);
 	nsegs = mbuf_nsegs(m0);
 	pktlen = m0->m_pkthdr.len;
 	ctrl = sizeof(struct cpl_tx_pkt_core);
 	if (needs_tso(m0))
 		ctrl += sizeof(struct cpl_tx_pkt_lso_core);
 	else if (pktlen <= imm_payload(2) && available >= 2) {
 		/* Immediate data.  Recalculate len16 and set nsegs to 0. */
 		ctrl += pktlen;
 		len16 = howmany(sizeof(struct fw_eth_tx_pkt_wr) +
 		    sizeof(struct cpl_tx_pkt_core) + pktlen, 16);
 		nsegs = 0;
 	}
 	ndesc = howmany(len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	/* Firmware work request header */
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_immdlen = htobe32(V_FW_WR_OP(FW_ETH_TX_PKT_WR) |
 	    V_FW_ETH_TX_PKT_WR_IMMDLEN(ctrl));
 
 	ctrl = V_FW_WR_LEN16(len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->r3 = 0;
 
 	if (needs_tso(m0)) {
 		struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
 
 		KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
 		    m0->m_pkthdr.l4hlen > 0,
 		    ("%s: mbuf %p needs TSO but missing header lengths",
 			__func__, m0));
 
 		ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
 		    F_LSO_LAST_SLICE | V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2)
 		    | V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
 		if (m0->m_pkthdr.l2hlen == sizeof(struct ether_vlan_header))
 			ctrl |= V_LSO_ETHHDR_LEN(1);
 		if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
 			ctrl |= F_LSO_IPV6;
 
 		lso->lso_ctrl = htobe32(ctrl);
 		lso->ipid_ofst = htobe16(0);
 		lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
 		lso->seqno_offset = htobe32(0);
 		lso->len = htobe32(pktlen);
 
 		cpl = (void *)(lso + 1);
 
 		txq->tso_wrs++;
 	} else
 		cpl = (void *)(wr + 1);
 
 	/* Checksum offload */
 	ctrl1 = 0;
 	if (needs_l3_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_IPCSUM_DIS;
 	if (needs_l4_csum(m0) == 0)
 		ctrl1 |= F_TXPKT_L4CSUM_DIS;
 	if (m0->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 		txq->txcsum++;	/* some hardware assistance provided */
 
 	/* VLAN tag insertion */
 	if (needs_vlan_insertion(m0)) {
 		ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
 		txq->vlan_insertion++;
 	}
 
 	/* CPL header */
 	cpl->ctrl0 = txq->cpl_ctrl0;
 	cpl->pack = 0;
 	cpl->len = htobe16(pktlen);
 	cpl->ctrl1 = htobe64(ctrl1);
 
 	/* SGL */
 	dst = (void *)(cpl + 1);
 	if (nsegs > 0) {
 
 		write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
 		txq->sgl_wrs++;
 	} else {
 		struct mbuf *m;
 
 		for (m = m0; m != NULL; m = m->m_next) {
 			copy_to_txd(eq, mtod(m, caddr_t), &dst, m->m_len);
 #ifdef INVARIANTS
 			pktlen -= m->m_len;
 #endif
 		}
 #ifdef INVARIANTS
 		KASSERT(pktlen == 0, ("%s: %d bytes left.", __func__, pktlen));
 #endif
 		txq->imm_wrs++;
 	}
 
 	txq->txpkt_wrs++;
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 static int
 try_txpkts(struct mbuf *m, struct mbuf *n, struct txpkts *txp, u_int available)
 {
 	u_int needed, nsegs1, nsegs2, l1, l2;
 
 	if (cannot_use_txpkts(m) || cannot_use_txpkts(n))
 		return (1);
 
 	nsegs1 = mbuf_nsegs(m);
 	nsegs2 = mbuf_nsegs(n);
 	if (nsegs1 + nsegs2 == 2) {
 		txp->wr_type = 1;
 		l1 = l2 = txpkts1_len16();
 	} else {
 		txp->wr_type = 0;
 		l1 = txpkts0_len16(nsegs1);
 		l2 = txpkts0_len16(nsegs2);
 	}
 	txp->len16 = howmany(sizeof(struct fw_eth_tx_pkts_wr), 16) + l1 + l2;
 	needed = howmany(txp->len16, EQ_ESIZE / 16);
 	if (needed > SGE_MAX_WR_NDESC || needed > available)
 		return (1);
 
 	txp->plen = m->m_pkthdr.len + n->m_pkthdr.len;
 	if (txp->plen > 65535)
 		return (1);
 
 	txp->npkt = 2;
 	set_mbuf_len16(m, l1);
 	set_mbuf_len16(n, l2);
 
 	return (0);
 }
 
 static int
 add_to_txpkts(struct mbuf *m, struct txpkts *txp, u_int available)
 {
 	u_int plen, len16, needed, nsegs;
 
 	MPASS(txp->wr_type == 0 || txp->wr_type == 1);
 
 	nsegs = mbuf_nsegs(m);
 	if (needs_tso(m) || (txp->wr_type == 1 && nsegs != 1))
 		return (1);
 
 	plen = txp->plen + m->m_pkthdr.len;
 	if (plen > 65535)
 		return (1);
 
 	if (txp->wr_type == 0)
 		len16 = txpkts0_len16(nsegs);
 	else
 		len16 = txpkts1_len16();
 	needed = howmany(txp->len16 + len16, EQ_ESIZE / 16);
 	if (needed > SGE_MAX_WR_NDESC || needed > available)
 		return (1);
 
 	txp->npkt++;
 	txp->plen = plen;
 	txp->len16 += len16;
 	set_mbuf_len16(m, len16);
 
 	return (0);
 }
 
 /*
  * Write a txpkts WR for the packets in txp to the hardware descriptors, update
  * the software descriptor, and advance the pidx.  It is guaranteed that enough
  * descriptors are available.
  *
  * The return value is the # of hardware descriptors used.
  */
 static u_int
 write_txpkts_wr(struct sge_txq *txq, struct fw_eth_tx_pkts_wr *wr,
     struct mbuf *m0, const struct txpkts *txp, u_int available)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct tx_sdesc *txsd;
 	struct cpl_tx_pkt_core *cpl;
 	uint32_t ctrl;
 	uint64_t ctrl1;
 	int ndesc, checkwrap;
 	struct mbuf *m;
 	void *flitp;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(txp->npkt > 0);
 	MPASS(txp->plen < 65536);
 	MPASS(m0 != NULL);
 	MPASS(m0->m_nextpkt != NULL);
 	MPASS(txp->len16 <= howmany(SGE_MAX_WR_LEN, 16));
 	MPASS(available > 0 && available < eq->sidx);
 
 	ndesc = howmany(txp->len16, EQ_ESIZE / 16);
 	MPASS(ndesc <= available);
 
 	MPASS(wr == (void *)&eq->desc[eq->pidx]);
 	wr->op_pkd = htobe32(V_FW_WR_OP(FW_ETH_TX_PKTS_WR));
 	ctrl = V_FW_WR_LEN16(txp->len16);
 	wr->equiq_to_len16 = htobe32(ctrl);
 	wr->plen = htobe16(txp->plen);
 	wr->npkt = txp->npkt;
 	wr->r3 = 0;
 	wr->type = txp->wr_type;
 	flitp = wr + 1;
 
 	/*
 	 * At this point we are 16B into a hardware descriptor.  If checkwrap is
 	 * set then we know the WR is going to wrap around somewhere.  We'll
 	 * check for that at appropriate points.
 	 */
 	checkwrap = eq->sidx - ndesc < eq->pidx;
 	for (m = m0; m != NULL; m = m->m_nextpkt) {
 		if (txp->wr_type == 0) {
 			struct ulp_txpkt *ulpmc;
 			struct ulptx_idata *ulpsc;
 
 			/* ULP master command */
 			ulpmc = flitp;
 			ulpmc->cmd_dest = htobe32(V_ULPTX_CMD(ULP_TX_PKT) |
 			    V_ULP_TXPKT_DEST(0) | V_ULP_TXPKT_FID(eq->iqid));
 			ulpmc->len = htobe32(mbuf_len16(m));
 
 			/* ULP subcommand */
 			ulpsc = (void *)(ulpmc + 1);
 			ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM) |
 			    F_ULP_TX_SC_MORE);
 			ulpsc->len = htobe32(sizeof(struct cpl_tx_pkt_core));
 
 			cpl = (void *)(ulpsc + 1);
 			if (checkwrap &&
 			    (uintptr_t)cpl == (uintptr_t)&eq->desc[eq->sidx])
 				cpl = (void *)&eq->desc[0];
 			txq->txpkts0_pkts += txp->npkt;
 			txq->txpkts0_wrs++;
 		} else {
 			cpl = flitp;
 			txq->txpkts1_pkts += txp->npkt;
 			txq->txpkts1_wrs++;
 		}
 
 		/* Checksum offload */
 		ctrl1 = 0;
 		if (needs_l3_csum(m) == 0)
 			ctrl1 |= F_TXPKT_IPCSUM_DIS;
 		if (needs_l4_csum(m) == 0)
 			ctrl1 |= F_TXPKT_L4CSUM_DIS;
 		if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TCP | CSUM_UDP |
 		    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
 			txq->txcsum++;	/* some hardware assistance provided */
 
 		/* VLAN tag insertion */
 		if (needs_vlan_insertion(m)) {
 			ctrl1 |= F_TXPKT_VLAN_VLD |
 			    V_TXPKT_VLAN(m->m_pkthdr.ether_vtag);
 			txq->vlan_insertion++;
 		}
 
 		/* CPL header */
 		cpl->ctrl0 = txq->cpl_ctrl0;
 		cpl->pack = 0;
 		cpl->len = htobe16(m->m_pkthdr.len);
 		cpl->ctrl1 = htobe64(ctrl1);
 
 		flitp = cpl + 1;
 		if (checkwrap &&
 		    (uintptr_t)flitp == (uintptr_t)&eq->desc[eq->sidx])
 			flitp = (void *)&eq->desc[0];
 
 		write_gl_to_txd(txq, m, (caddr_t *)(&flitp), checkwrap);
 
 	}
 
 	txsd = &txq->sdesc[eq->pidx];
 	txsd->m = m0;
 	txsd->desc_used = ndesc;
 
 	return (ndesc);
 }
 
 /*
  * If the SGL ends on an address that is not 16 byte aligned, this function will
  * add a 0 filled flit at the end.
  */
 static void
 write_gl_to_txd(struct sge_txq *txq, struct mbuf *m, caddr_t *to, int checkwrap)
 {
 	struct sge_eq *eq = &txq->eq;
 	struct sglist *gl = txq->gl;
 	struct sglist_seg *seg;
 	__be64 *flitp, *wrap;
 	struct ulptx_sgl *usgl;
 	int i, nflits, nsegs;
 
 	KASSERT(((uintptr_t)(*to) & 0xf) == 0,
 	    ("%s: SGL must start at a 16 byte boundary: %p", __func__, *to));
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	get_pkt_gl(m, gl);
 	nsegs = gl->sg_nseg;
 	MPASS(nsegs > 0);
 
 	nflits = (3 * (nsegs - 1)) / 2 + ((nsegs - 1) & 1) + 2;
 	flitp = (__be64 *)(*to);
 	wrap = (__be64 *)(&eq->desc[eq->sidx]);
 	seg = &gl->sg_segs[0];
 	usgl = (void *)flitp;
 
 	/*
 	 * We start at a 16 byte boundary somewhere inside the tx descriptor
 	 * ring, so we're at least 16 bytes away from the status page.  There is
 	 * no chance of a wrap around in the middle of usgl (which is 16 bytes).
 	 */
 
 	usgl->cmd_nsge = htobe32(V_ULPTX_CMD(ULP_TX_SC_DSGL) |
 	    V_ULPTX_NSGE(nsegs));
 	usgl->len0 = htobe32(seg->ss_len);
 	usgl->addr0 = htobe64(seg->ss_paddr);
 	seg++;
 
 	if (checkwrap == 0 || (uintptr_t)(flitp + nflits) <= (uintptr_t)wrap) {
 
 		/* Won't wrap around at all */
 
 		for (i = 0; i < nsegs - 1; i++, seg++) {
 			usgl->sge[i / 2].len[i & 1] = htobe32(seg->ss_len);
 			usgl->sge[i / 2].addr[i & 1] = htobe64(seg->ss_paddr);
 		}
 		if (i & 1)
 			usgl->sge[i / 2].len[1] = htobe32(0);
 		flitp += nflits;
 	} else {
 
 		/* Will wrap somewhere in the rest of the SGL */
 
 		/* 2 flits already written, write the rest flit by flit */
 		flitp = (void *)(usgl + 1);
 		for (i = 0; i < nflits - 2; i++) {
 			if (flitp == wrap)
 				flitp = (void *)eq->desc;
 			*flitp++ = get_flit(seg, nsegs - 1, i);
 		}
 	}
 
 	if (nflits & 1) {
 		MPASS(((uintptr_t)flitp) & 0xf);
 		*flitp++ = 0;
 	}
 
 	MPASS((((uintptr_t)flitp) & 0xf) == 0);
 	if (__predict_false(flitp == wrap))
 		*to = (void *)eq->desc;
 	else
 		*to = (void *)flitp;
 }
 
 static inline void
 copy_to_txd(struct sge_eq *eq, caddr_t from, caddr_t *to, int len)
 {
 
 	MPASS((uintptr_t)(*to) >= (uintptr_t)&eq->desc[0]);
 	MPASS((uintptr_t)(*to) < (uintptr_t)&eq->desc[eq->sidx]);
 
 	if (__predict_true((uintptr_t)(*to) + len <=
 	    (uintptr_t)&eq->desc[eq->sidx])) {
 		bcopy(from, *to, len);
 		(*to) += len;
 	} else {
 		int portion = (uintptr_t)&eq->desc[eq->sidx] - (uintptr_t)(*to);
 
 		bcopy(from, *to, portion);
 		from += portion;
 		portion = len - portion;	/* remaining */
 		bcopy(from, (void *)eq->desc, portion);
 		(*to) = (caddr_t)eq->desc + portion;
 	}
 }
 
 static inline void
 ring_eq_db(struct adapter *sc, struct sge_eq *eq, u_int n)
 {
 	u_int db;
 
 	MPASS(n > 0);
 
 	db = eq->doorbells;
 	if (n > 1)
 		clrbit(&db, DOORBELL_WCWR);
 	wmb();
 
 	switch (ffs(db) - 1) {
 	case DOORBELL_UDB:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		break;
 
 	case DOORBELL_WCWR: {
 		volatile uint64_t *dst, *src;
 		int i;
 
 		/*
 		 * Queues whose 128B doorbell segment fits in the page do not
 		 * use relative qid (udb_qid is always 0).  Only queues with
 		 * doorbell segments can do WCWR.
 		 */
 		KASSERT(eq->udb_qid == 0 && n == 1,
 		    ("%s: inappropriate doorbell (0x%x, %d, %d) for eq %p",
 		    __func__, eq->doorbells, n, eq->dbidx, eq));
 
 		dst = (volatile void *)((uintptr_t)eq->udb + UDBS_WR_OFFSET -
 		    UDBS_DB_OFFSET);
 		i = eq->dbidx;
 		src = (void *)&eq->desc[i];
 		while (src != (void *)&eq->desc[i + 1])
 			*dst++ = *src++;
 		wmb();
 		break;
 	}
 
 	case DOORBELL_UDBWC:
 		*eq->udb = htole32(V_QID(eq->udb_qid) | V_PIDX(n));
 		wmb();
 		break;
 
 	case DOORBELL_KDB:
 		t4_write_reg(sc, sc->sge_kdoorbell_reg,
 		    V_QID(eq->cntxt_id) | V_PIDX(n));
 		break;
 	}
 
 	IDXINCR(eq->dbidx, n, eq->sidx);
 }
 
 static inline u_int
 reclaimable_tx_desc(struct sge_eq *eq)
 {
 	uint16_t hw_cidx;
 
 	hw_cidx = read_hw_cidx(eq);
 	return (IDXDIFF(hw_cidx, eq->cidx, eq->sidx));
 }
 
 static inline u_int
 total_available_tx_desc(struct sge_eq *eq)
 {
 	uint16_t hw_cidx, pidx;
 
 	hw_cidx = read_hw_cidx(eq);
 	pidx = eq->pidx;
 
 	if (pidx == hw_cidx)
 		return (eq->sidx - 1);
 	else
 		return (IDXDIFF(hw_cidx, pidx, eq->sidx) - 1);
 }
 
 static inline uint16_t
 read_hw_cidx(struct sge_eq *eq)
 {
 	struct sge_qstat *spg = (void *)&eq->desc[eq->sidx];
 	uint16_t cidx = spg->cidx;	/* stable snapshot */
 
 	return (be16toh(cidx));
 }
 
 /*
  * Reclaim 'n' descriptors approximately.
  */
 static u_int
 reclaim_tx_descs(struct sge_txq *txq, u_int n)
 {
 	struct tx_sdesc *txsd;
 	struct sge_eq *eq = &txq->eq;
 	u_int can_reclaim, reclaimed;
 
 	TXQ_LOCK_ASSERT_OWNED(txq);
 	MPASS(n > 0);
 
 	reclaimed = 0;
 	can_reclaim = reclaimable_tx_desc(eq);
 	while (can_reclaim && reclaimed < n) {
 		int ndesc;
 		struct mbuf *m, *nextpkt;
 
 		txsd = &txq->sdesc[eq->cidx];
 		ndesc = txsd->desc_used;
 
 		/* Firmware doesn't return "partial" credits. */
 		KASSERT(can_reclaim >= ndesc,
 		    ("%s: unexpected number of credits: %d, %d",
 		    __func__, can_reclaim, ndesc));
 
 		for (m = txsd->m; m != NULL; m = nextpkt) {
 			nextpkt = m->m_nextpkt;
 			m->m_nextpkt = NULL;
 			m_freem(m);
 		}
 		reclaimed += ndesc;
 		can_reclaim -= ndesc;
 		IDXINCR(eq->cidx, ndesc, eq->sidx);
 	}
 
 	return (reclaimed);
 }
 
 static void
 tx_reclaim(void *arg, int n)
 {
 	struct sge_txq *txq = arg;
 	struct sge_eq *eq = &txq->eq;
 
 	do {
 		if (TXQ_TRYLOCK(txq) == 0)
 			break;
 		n = reclaim_tx_descs(txq, 32);
 		if (eq->cidx == eq->pidx)
 			eq->equeqidx = eq->pidx;
 		TXQ_UNLOCK(txq);
 	} while (n > 0);
 }
 
 static __be64
 get_flit(struct sglist_seg *segs, int nsegs, int idx)
 {
 	int i = (idx / 3) * 2;
 
 	switch (idx % 3) {
 	case 0: {
 		__be64 rc;
 
 		rc = htobe32(segs[i].ss_len);
 		if (i + 1 < nsegs)
 			rc |= (uint64_t)htobe32(segs[i + 1].ss_len) << 32;
 
 		return (rc);
 	}
 	case 1:
 		return (htobe64(segs[i].ss_paddr));
 	case 2:
 		return (htobe64(segs[i + 1].ss_paddr));
 	}
 
 	return (0);
 }
 
 static void
 find_best_refill_source(struct adapter *sc, struct sge_fl *fl, int maxp)
 {
 	int8_t zidx, hwidx, idx;
 	uint16_t region1, region3;
 	int spare, spare_needed, n;
 	struct sw_zone_info *swz;
 	struct hw_buf_info *hwb, *hwb_list = &sc->sge.hw_buf_info[0];
 
 	/*
 	 * Buffer Packing: Look for PAGE_SIZE or larger zone which has a bufsize
 	 * large enough for the max payload and cluster metadata.  Otherwise
 	 * settle for the largest bufsize that leaves enough room in the cluster
 	 * for metadata.
 	 *
 	 * Without buffer packing: Look for the smallest zone which has a
 	 * bufsize large enough for the max payload.  Settle for the largest
 	 * bufsize available if there's nothing big enough for max payload.
 	 */
 	spare_needed = fl->flags & FL_BUF_PACKING ? CL_METADATA_SIZE : 0;
 	swz = &sc->sge.sw_zone_info[0];
 	hwidx = -1;
 	for (zidx = 0; zidx < SW_ZONE_SIZES; zidx++, swz++) {
 		if (swz->size > largest_rx_cluster) {
 			if (__predict_true(hwidx != -1))
 				break;
 
 			/*
 			 * This is a misconfiguration.  largest_rx_cluster is
 			 * preventing us from finding a refill source.  See
 			 * dev.t5nex.<n>.buffer_sizes to figure out why.
 			 */
 			device_printf(sc->dev, "largest_rx_cluster=%u leaves no"
 			    " refill source for fl %p (dma %u).  Ignored.\n",
 			    largest_rx_cluster, fl, maxp);
 		}
 		for (idx = swz->head_hwidx; idx != -1; idx = hwb->next) {
 			hwb = &hwb_list[idx];
 			spare = swz->size - hwb->size;
 			if (spare < spare_needed)
 				continue;
 
 			hwidx = idx;		/* best option so far */
 			if (hwb->size >= maxp) {
 
 				if ((fl->flags & FL_BUF_PACKING) == 0)
 					goto done; /* stop looking (not packing) */
 
 				if (swz->size >= safest_rx_cluster)
 					goto done; /* stop looking (packing) */
 			}
 			break;		/* keep looking, next zone */
 		}
 	}
 done:
 	/* A usable hwidx has been located. */
 	MPASS(hwidx != -1);
 	hwb = &hwb_list[hwidx];
 	zidx = hwb->zidx;
 	swz = &sc->sge.sw_zone_info[zidx];
 	region1 = 0;
 	region3 = swz->size - hwb->size;
 
 	/*
 	 * Stay within this zone and see if there is a better match when mbuf
 	 * inlining is allowed.  Remember that the hwidx's are sorted in
 	 * decreasing order of size (so in increasing order of spare area).
 	 */
 	for (idx = hwidx; idx != -1; idx = hwb->next) {
 		hwb = &hwb_list[idx];
 		spare = swz->size - hwb->size;
 
 		if (allow_mbufs_in_cluster == 0 || hwb->size < maxp)
 			break;
 
 		/*
 		 * Do not inline mbufs if doing so would violate the pad/pack
 		 * boundary alignment requirement.
 		 */
 		if (fl_pad && (MSIZE % sc->params.sge.pad_boundary) != 0)
 			continue;
 		if (fl->flags & FL_BUF_PACKING &&
 		    (MSIZE % sc->params.sge.pack_boundary) != 0)
 			continue;
 
 		if (spare < CL_METADATA_SIZE + MSIZE)
 			continue;
 		n = (spare - CL_METADATA_SIZE) / MSIZE;
 		if (n > howmany(hwb->size, maxp))
 			break;
 
 		hwidx = idx;
 		if (fl->flags & FL_BUF_PACKING) {
 			region1 = n * MSIZE;
 			region3 = spare - region1;
 		} else {
 			region1 = MSIZE;
 			region3 = spare - region1;
 			break;
 		}
 	}
 
 	KASSERT(zidx >= 0 && zidx < SW_ZONE_SIZES,
 	    ("%s: bad zone %d for fl %p, maxp %d", __func__, zidx, fl, maxp));
 	KASSERT(hwidx >= 0 && hwidx <= SGE_FLBUF_SIZES,
 	    ("%s: bad hwidx %d for fl %p, maxp %d", __func__, hwidx, fl, maxp));
 	KASSERT(region1 + sc->sge.hw_buf_info[hwidx].size + region3 ==
 	    sc->sge.sw_zone_info[zidx].size,
 	    ("%s: bad buffer layout for fl %p, maxp %d. "
 		"cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
 		sc->sge.sw_zone_info[zidx].size, region1,
 		sc->sge.hw_buf_info[hwidx].size, region3));
 	if (fl->flags & FL_BUF_PACKING || region1 > 0) {
 		KASSERT(region3 >= CL_METADATA_SIZE,
 		    ("%s: no room for metadata.  fl %p, maxp %d; "
 		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
 		    sc->sge.sw_zone_info[zidx].size, region1,
 		    sc->sge.hw_buf_info[hwidx].size, region3));
 		KASSERT(region1 % MSIZE == 0,
 		    ("%s: bad mbuf region for fl %p, maxp %d. "
 		    "cl %d; r1 %d, payload %d, r3 %d", __func__, fl, maxp,
 		    sc->sge.sw_zone_info[zidx].size, region1,
 		    sc->sge.hw_buf_info[hwidx].size, region3));
 	}
 
 	fl->cll_def.zidx = zidx;
 	fl->cll_def.hwidx = hwidx;
 	fl->cll_def.region1 = region1;
 	fl->cll_def.region3 = region3;
 }
 
 static void
 find_safe_refill_source(struct adapter *sc, struct sge_fl *fl)
 {
 	struct sge *s = &sc->sge;
 	struct hw_buf_info *hwb;
 	struct sw_zone_info *swz;
 	int spare;
 	int8_t hwidx;
 
 	if (fl->flags & FL_BUF_PACKING)
 		hwidx = s->safe_hwidx2;	/* with room for metadata */
 	else if (allow_mbufs_in_cluster && s->safe_hwidx2 != -1) {
 		hwidx = s->safe_hwidx2;
 		hwb = &s->hw_buf_info[hwidx];
 		swz = &s->sw_zone_info[hwb->zidx];
 		spare = swz->size - hwb->size;
 
 		/* no good if there isn't room for an mbuf as well */
 		if (spare < CL_METADATA_SIZE + MSIZE)
 			hwidx = s->safe_hwidx1;
 	} else
 		hwidx = s->safe_hwidx1;
 
 	if (hwidx == -1) {
 		/* No fallback source */
 		fl->cll_alt.hwidx = -1;
 		fl->cll_alt.zidx = -1;
 
 		return;
 	}
 
 	hwb = &s->hw_buf_info[hwidx];
 	swz = &s->sw_zone_info[hwb->zidx];
 	spare = swz->size - hwb->size;
 	fl->cll_alt.hwidx = hwidx;
 	fl->cll_alt.zidx = hwb->zidx;
 	if (allow_mbufs_in_cluster &&
 	    (fl_pad == 0 || (MSIZE % sc->params.sge.pad_boundary) == 0))
 		fl->cll_alt.region1 = ((spare - CL_METADATA_SIZE) / MSIZE) * MSIZE;
 	else
 		fl->cll_alt.region1 = 0;
 	fl->cll_alt.region3 = spare - fl->cll_alt.region1;
 }
 
 static void
 add_fl_to_sfl(struct adapter *sc, struct sge_fl *fl)
 {
 	mtx_lock(&sc->sfl_lock);
 	FL_LOCK(fl);
 	if ((fl->flags & FL_DOOMED) == 0) {
 		fl->flags |= FL_STARVING;
 		TAILQ_INSERT_TAIL(&sc->sfl, fl, link);
 		callout_reset(&sc->sfl_callout, hz / 5, refill_sfl, sc);
 	}
 	FL_UNLOCK(fl);
 	mtx_unlock(&sc->sfl_lock);
 }
 
 static void
 handle_wrq_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_wrq *wrq = (void *)eq;
 
 	atomic_readandclear_int(&eq->equiq);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &wrq->wrq_tx_task);
 }
 
 static void
 handle_eth_egr_update(struct adapter *sc, struct sge_eq *eq)
 {
 	struct sge_txq *txq = (void *)eq;
 
 	MPASS((eq->flags & EQ_TYPEMASK) == EQ_ETH);
 
 	atomic_readandclear_int(&eq->equiq);
 	mp_ring_check_drainage(txq->r, 0);
 	taskqueue_enqueue(sc->tq[eq->tx_chan], &txq->tx_reclaim_task);
 }
 
 static int
 handle_sge_egr_update(struct sge_iq *iq, const struct rss_header *rss,
     struct mbuf *m)
 {
 	const struct cpl_sge_egr_update *cpl = (const void *)(rss + 1);
 	unsigned int qid = G_EGR_QID(ntohl(cpl->opcode_qid));
 	struct adapter *sc = iq->adapter;
 	struct sge *s = &sc->sge;
 	struct sge_eq *eq;
 	static void (*h[])(struct adapter *, struct sge_eq *) = {NULL,
 		&handle_wrq_egr_update, &handle_eth_egr_update,
 		&handle_wrq_egr_update};
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	eq = s->eqmap[qid - s->eq_start - s->eq_base];
 	(*h[eq->flags & EQ_TYPEMASK])(sc, eq);
 
 	return (0);
 }
 
 /* handle_fw_msg works for both fw4_msg and fw6_msg because this is valid */
 CTASSERT(offsetof(struct cpl_fw4_msg, data) == \
     offsetof(struct cpl_fw6_msg, data));
 
 static int
 handle_fw_msg(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_fw6_msg *cpl = (const void *)(rss + 1);
 
 	KASSERT(m == NULL, ("%s: payload with opcode %02x", __func__,
 	    rss->opcode));
 
 	if (cpl->type == FW_TYPE_RSSCPL || cpl->type == FW6_TYPE_RSSCPL) {
 		const struct rss_header *rss2;
 
 		rss2 = (const struct rss_header *)&cpl->data[0];
 		return (t4_cpl_handler[rss2->opcode](iq, rss2, m));
 	}
 
 	return (t4_fw_msg_handler[cpl->type](sc, &cpl->data[0]));
 }
 
 /**
  *	t4_handle_wrerr_rpl - process a FW work request error message
  *	@adap: the adapter
  *	@rpl: start of the FW message
  */
 static int
 t4_handle_wrerr_rpl(struct adapter *adap, const __be64 *rpl)
 {
 	u8 opcode = *(const u8 *)rpl;
 	const struct fw_error_cmd *e = (const void *)rpl;
 	unsigned int i;
 
 	if (opcode != FW_ERROR_CMD) {
 		log(LOG_ERR,
 		    "%s: Received WRERR_RPL message with opcode %#x\n",
 		    device_get_nameunit(adap->dev), opcode);
 		return (EINVAL);
 	}
 	log(LOG_ERR, "%s: FW_ERROR (%s) ", device_get_nameunit(adap->dev),
 	    G_FW_ERROR_CMD_FATAL(be32toh(e->op_to_type)) ? "fatal" :
 	    "non-fatal");
 	switch (G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type))) {
 	case FW_ERROR_TYPE_EXCEPTION:
 		log(LOG_ERR, "exception info:\n");
 		for (i = 0; i < nitems(e->u.exception.info); i++)
 			log(LOG_ERR, "%s%08x", i == 0 ? "\t" : " ",
 			    be32toh(e->u.exception.info[i]));
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_HWMODULE:
 		log(LOG_ERR, "HW module regaddr %08x regval %08x\n",
 		    be32toh(e->u.hwmodule.regaddr),
 		    be32toh(e->u.hwmodule.regval));
 		break;
 	case FW_ERROR_TYPE_WR:
 		log(LOG_ERR, "WR cidx %d PF %d VF %d eqid %d hdr:\n",
 		    be16toh(e->u.wr.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.wr.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.wr.pfn_vfn)),
 		    be32toh(e->u.wr.eqid));
 		for (i = 0; i < nitems(e->u.wr.wrhdr); i++)
 			log(LOG_ERR, "%s%02x", i == 0 ? "\t" : " ",
 			    e->u.wr.wrhdr[i]);
 		log(LOG_ERR, "\n");
 		break;
 	case FW_ERROR_TYPE_ACL:
 		log(LOG_ERR, "ACL cidx %d PF %d VF %d eqid %d %s",
 		    be16toh(e->u.acl.cidx),
 		    G_FW_ERROR_CMD_PFN(be16toh(e->u.acl.pfn_vfn)),
 		    G_FW_ERROR_CMD_VFN(be16toh(e->u.acl.pfn_vfn)),
 		    be32toh(e->u.acl.eqid),
 		    G_FW_ERROR_CMD_MV(be16toh(e->u.acl.mv_pkd)) ? "vlanid" :
 		    "MAC");
 		for (i = 0; i < nitems(e->u.acl.val); i++)
 			log(LOG_ERR, " %02x", e->u.acl.val[i]);
 		log(LOG_ERR, "\n");
 		break;
 	default:
 		log(LOG_ERR, "type %#x\n",
 		    G_FW_ERROR_CMD_TYPE(be32toh(e->op_to_type)));
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static int
 sysctl_uint16(SYSCTL_HANDLER_ARGS)
 {
 	uint16_t *id = arg1;
 	int i = *id;
 
 	return sysctl_handle_int(oidp, &i, 0, req);
 }
 
 static int
 sysctl_bufsizes(SYSCTL_HANDLER_ARGS)
 {
 	struct sge *s = arg1;
 	struct hw_buf_info *hwb = &s->hw_buf_info[0];
 	struct sw_zone_info *swz = &s->sw_zone_info[0];
 	int i, rc;
 	struct sbuf sb;
 	char c;
 
 	sbuf_new(&sb, NULL, 32, SBUF_AUTOEXTEND);
 	for (i = 0; i < SGE_FLBUF_SIZES; i++, hwb++) {
 		if (hwb->zidx >= 0 && swz[hwb->zidx].size <= largest_rx_cluster)
 			c = '*';
 		else
 			c = '\0';
 
 		sbuf_printf(&sb, "%u%c ", hwb->size, c);
 	}
 	sbuf_trim(&sb);
 	sbuf_finish(&sb);
 	rc = sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
 	sbuf_delete(&sb);
 	return (rc);
 }
 
 static int
 sysctl_tc(SYSCTL_HANDLER_ARGS)
 {
 	struct vi_info *vi = arg1;
 	struct port_info *pi;
 	struct adapter *sc;
 	struct sge_txq *txq;
 	struct tx_sched_class *tc;
 	int qidx = arg2, rc, tc_idx;
 	uint32_t fw_queue, fw_class;
 
 	MPASS(qidx >= 0 && qidx < vi->ntxq);
 	pi = vi->pi;
 	sc = pi->adapter;
 	txq = &sc->sge.txq[vi->first_txq + qidx];
 
 	tc_idx = txq->tc_idx;
 	rc = sysctl_handle_int(oidp, &tc_idx, 0, req);
 	if (rc != 0 || req->newptr == NULL)
 		return (rc);
 
 	/* Note that -1 is legitimate input (it means unbind). */
 	if (tc_idx < -1 || tc_idx >= sc->chip_params->nsched_cls)
 		return (EINVAL);
 
 	rc = begin_synchronized_op(sc, vi, SLEEP_OK | INTR_OK, "t4stc");
 	if (rc)
 		return (rc);
 
 	if (tc_idx == txq->tc_idx) {
 		rc = 0;		/* No change, nothing to do. */
 		goto done;
 	}
 
 	fw_queue = V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
 	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) |
 	    V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id);
 
 	if (tc_idx == -1)
 		fw_class = 0xffffffff;	/* Unbind. */
 	else {
 		/*
 		 * Bind to a different class.  Ethernet txq's are only allowed
 		 * to bind to cl-rl mode-class for now.  XXX: too restrictive.
 		 */
 		tc = &pi->tc[tc_idx];
 		if (tc->flags & TX_SC_OK &&
 		    tc->params.level == SCHED_CLASS_LEVEL_CL_RL &&
 		    tc->params.mode == SCHED_CLASS_MODE_CLASS) {
 			/* Ok to proceed. */
 			fw_class = tc_idx;
 		} else {
 			rc = tc->flags & TX_SC_OK ? EBUSY : ENXIO;
 			goto done;
 		}
 	}
 
 	rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_queue, &fw_class);
 	if (rc == 0) {
 		if (txq->tc_idx != -1) {
 			tc = &pi->tc[txq->tc_idx];
 			MPASS(tc->refcount > 0);
 			tc->refcount--;
 		}
 		if (tc_idx != -1) {
 			tc = &pi->tc[tc_idx];
 			tc->refcount++;
 		}
 		txq->tc_idx = tc_idx;
 	}
 done:
 	end_synchronized_op(sc, 0);
 	return (rc);
 }
Index: user/alc/PQ_LAUNDRY/sys/dev/evdev/evdev.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/dev/evdev/evdev.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/dev/evdev/evdev.c	(revision 306283)
@@ -1,917 +1,918 @@
 /*-
  * Copyright (c) 2014 Jakub Wojciech Klama <jceel@FreeBSD.org>
  * Copyright (c) 2015-2016 Vladimir Kondratyev <wulf@cicgroup.ru>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include "opt_evdev.h"
 
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/malloc.h>
 #include <sys/bitstring.h>
 #include <sys/sysctl.h>
 
 #include <dev/evdev/input.h>
 #include <dev/evdev/evdev.h>
 #include <dev/evdev/evdev_private.h>
 
 #ifdef EVDEV_DEBUG
 #define	debugf(evdev, fmt, args...)	printf("evdev: " fmt "\n", ##args)
 #else
 #define	debugf(evdev, fmt, args...)
 #endif
 
 #ifdef FEATURE
 FEATURE(evdev, "Input event devices support");
 #endif
 
 enum evdev_sparse_result
 {
 	EV_SKIP_EVENT,		/* Event value not changed */
 	EV_REPORT_EVENT,	/* Event value changed */
 	EV_REPORT_MT_SLOT,	/* Event value and MT slot number changed */
 };
 
 MALLOC_DEFINE(M_EVDEV, "evdev", "evdev memory");
 
 int evdev_rcpt_mask = EVDEV_RCPT_SYSMOUSE | EVDEV_RCPT_KBDMUX;
 
 SYSCTL_NODE(_kern, OID_AUTO, evdev, CTLFLAG_RW, 0, "Evdev args");
 SYSCTL_INT(_kern_evdev, OID_AUTO, rcpt_mask, CTLFLAG_RW, &evdev_rcpt_mask, 0,
     "Who is receiving events: bit0 - sysmouse, bit1 - kbdmux, "
     "bit2 - mouse hardware, bit3 - keyboard hardware");
 
 static void evdev_start_repeat(struct evdev_dev *, uint16_t);
 static void evdev_stop_repeat(struct evdev_dev *);
 static int evdev_check_event(struct evdev_dev *, uint16_t, uint16_t, int32_t);
 
 static inline void
 bit_change(bitstr_t *bitstr, int bit, int value)
 {
 	if (value)
 		bit_set(bitstr, bit);
 	else
 		bit_clear(bitstr, bit);
 }
 
 struct evdev_dev *
 evdev_alloc(void)
 {
 
 	return malloc(sizeof(struct evdev_dev), M_EVDEV, M_WAITOK | M_ZERO);
 }
 
 void
 evdev_free(struct evdev_dev *evdev)
 {
 
-	if (evdev->ev_cdev != NULL && evdev->ev_cdev->si_drv1 != NULL)
+	if (evdev != NULL && evdev->ev_cdev != NULL &&
+	    evdev->ev_cdev->si_drv1 != NULL)
 		evdev_unregister(evdev);
 
 	free(evdev, M_EVDEV);
 }
 
 static struct input_absinfo *
 evdev_alloc_absinfo(void)
 {
 
 	return (malloc(sizeof(struct input_absinfo) * ABS_CNT, M_EVDEV,
 	    M_WAITOK | M_ZERO));
 }
 
 static void
 evdev_free_absinfo(struct input_absinfo *absinfo)
 {
 
 	free(absinfo, M_EVDEV);
 }
 
 int
 evdev_set_report_size(struct evdev_dev *evdev, size_t report_size)
 {
 	if (report_size > KEY_CNT + REL_CNT + ABS_CNT + MAX_MT_SLOTS * MT_CNT +
 	    MSC_CNT + LED_CNT + SND_CNT + SW_CNT + FF_CNT)
 		return (EINVAL);
 
 	evdev->ev_report_size = report_size;
 	return (0);
 }
 
 static size_t
 evdev_estimate_report_size(struct evdev_dev *evdev)
 {
 	size_t size = 0;
 	int res;
 
 	/*
 	 * Keyboards generate one event per report but other devices with
 	 * buttons like mouses can report events simultaneously
 	 */
 	bit_ffs_at(evdev->ev_key_flags, KEY_OK, KEY_CNT - KEY_OK, &res);
 	if (res == -1)
 		bit_ffs(evdev->ev_key_flags, BTN_MISC, &res);
 	size += (res != -1);
 	bit_count(evdev->ev_key_flags, BTN_MISC, KEY_OK - BTN_MISC, &res);
 	size += res;
 
 	/* All relative axes can be reported simultaneously */
 	bit_count(evdev->ev_rel_flags, 0, REL_CNT, &res);
 	size += res;
 
 	/*
 	 * All absolute axes can be reported simultaneously.
 	 * Multitouch axes can be reported ABS_MT_SLOT times
 	 */
 	if (evdev->ev_absinfo != NULL) {
 		bit_count(evdev->ev_abs_flags, 0, ABS_CNT, &res);
 		size += res;
 		bit_count(evdev->ev_abs_flags, ABS_MT_FIRST, MT_CNT, &res);
 		if (res > 0) {
 			res++;	/* ABS_MT_SLOT or SYN_MT_REPORT */
 			if (bit_test(evdev->ev_abs_flags, ABS_MT_SLOT))
 				/* MT type B */
 				size += res * MAXIMAL_MT_SLOT(evdev);
 			else
 				/* MT type A */
 				size += res * (MAX_MT_REPORTS - 1);
 		}
 	}
 
 	/* All misc events can be reported simultaneously */
 	bit_count(evdev->ev_msc_flags, 0, MSC_CNT, &res);
 	size += res;
 
 	/* All leds can be reported simultaneously */
 	bit_count(evdev->ev_led_flags, 0, LED_CNT, &res);
 	size += res;
 
 	/* Assume other events are generated once per report */
 	bit_ffs(evdev->ev_snd_flags, SND_CNT, &res);
 	size += (res != -1);
 
 	bit_ffs(evdev->ev_sw_flags, SW_CNT, &res);
 	size += (res != -1);
 
 	/* XXX: FF part is not implemented yet */
 
 	size++;		/* SYN_REPORT */
 	return (size);
 }
 
 int
 evdev_register(struct evdev_dev *evdev)
 {
 	int ret;
 
 	debugf(evdev, "%s: registered evdev provider: %s <%s>\n",
 	    evdev->ev_shortname, evdev->ev_name, evdev->ev_serial);
 
 	/* Initialize internal structures */
 	mtx_init(&evdev->ev_mtx, "evmtx", NULL, MTX_DEF);
 	LIST_INIT(&evdev->ev_clients);
 
 	if (evdev_event_supported(evdev, EV_REP) &&
 	    bit_test(evdev->ev_flags, EVDEV_FLAG_SOFTREPEAT)) {
 		/* Initialize callout */
 		callout_init_mtx(&evdev->ev_rep_callout, &evdev->ev_mtx, 0);
 
 		if (evdev->ev_rep[REP_DELAY] == 0 &&
 		    evdev->ev_rep[REP_PERIOD] == 0) {
 			/* Supply default values */
 			evdev->ev_rep[REP_DELAY] = 250;
 			evdev->ev_rep[REP_PERIOD] = 33;
 		}
 	}
 
 	/* Initialize multitouch protocol type B states */
 	if (bit_test(evdev->ev_abs_flags, ABS_MT_SLOT) &&
 	    evdev->ev_absinfo != NULL && MAXIMAL_MT_SLOT(evdev) > 0)
 		evdev_mt_init(evdev);
 
 	/* Estimate maximum report size */
 	if (evdev->ev_report_size == 0) {
 		ret = evdev_set_report_size(evdev,
 		    evdev_estimate_report_size(evdev));
 		if (ret != 0)
 			goto bail_out;
 	}
 
 	/* Create char device node */
 	ret = evdev_cdev_create(evdev);
 bail_out:
 	if (ret != 0)
 		mtx_destroy(&evdev->ev_mtx);
 
 	return (ret);
 }
 
 int
 evdev_unregister(struct evdev_dev *evdev)
 {
 	struct evdev_client *client;
 	int ret;
 	debugf(evdev, "%s: unregistered evdev provider: %s\n",
 	    evdev->ev_shortname, evdev->ev_name);
 
 	EVDEV_LOCK(evdev);
 	evdev->ev_cdev->si_drv1 = NULL;
 	/* Wake up sleepers */
 	LIST_FOREACH(client, &evdev->ev_clients, ec_link) {
 		evdev_revoke_client(client);
 		evdev_dispose_client(evdev, client);
 		EVDEV_CLIENT_LOCKQ(client);
 		evdev_notify_event(client);
 		EVDEV_CLIENT_UNLOCKQ(client);
 	}
 	EVDEV_UNLOCK(evdev);
 
 	/* destroy_dev can sleep so release lock */
 	ret = evdev_cdev_destroy(evdev);
 	evdev->ev_cdev = NULL;
 	if (ret == 0)
 		mtx_destroy(&evdev->ev_mtx);
 
 	evdev_free_absinfo(evdev->ev_absinfo);
 	evdev_mt_free(evdev);
 
 	return (ret);
 }
 
 inline void
 evdev_set_name(struct evdev_dev *evdev, const char *name)
 {
 
 	snprintf(evdev->ev_name, NAMELEN, "%s", name);
 }
 
 inline void
 evdev_set_id(struct evdev_dev *evdev, uint16_t bustype, uint16_t vendor,
     uint16_t product, uint16_t version)
 {
 
 	evdev->ev_id = (struct input_id) {
 		.bustype = bustype,
 		.vendor = vendor,
 		.product = product,
 		.version = version
 	};
 }
 
 inline void
 evdev_set_phys(struct evdev_dev *evdev, const char *name)
 {
 
 	snprintf(evdev->ev_shortname, NAMELEN, "%s", name);
 }
 
 inline void
 evdev_set_serial(struct evdev_dev *evdev, const char *serial)
 {
 
 	snprintf(evdev->ev_serial, NAMELEN, "%s", serial);
 }
 
 inline void
 evdev_set_methods(struct evdev_dev *evdev, void *softc,
     struct evdev_methods *methods)
 {
 
 	evdev->ev_methods = methods;
 	evdev->ev_softc = softc;
 }
 
 inline void
 evdev_support_prop(struct evdev_dev *evdev, uint16_t prop)
 {
 
 	KASSERT(prop < INPUT_PROP_CNT, ("invalid evdev input property"));
 	bit_set(evdev->ev_prop_flags, prop);
 }
 
 inline void
 evdev_support_event(struct evdev_dev *evdev, uint16_t type)
 {
 
 	KASSERT(type < EV_CNT, ("invalid evdev event property"));
 	bit_set(evdev->ev_type_flags, type);
 }
 
 inline void
 evdev_support_key(struct evdev_dev *evdev, uint16_t code)
 {
 
 	KASSERT(code < KEY_CNT, ("invalid evdev key property"));
 	bit_set(evdev->ev_key_flags, code);
 }
 
 inline void
 evdev_support_rel(struct evdev_dev *evdev, uint16_t code)
 {
 
 	KASSERT(code < REL_CNT, ("invalid evdev rel property"));
 	bit_set(evdev->ev_rel_flags, code);
 }
 
 inline void
 evdev_support_abs(struct evdev_dev *evdev, uint16_t code, int32_t value,
     int32_t minimum, int32_t maximum, int32_t fuzz, int32_t flat,
     int32_t resolution)
 {
 	struct input_absinfo absinfo;
 
 	KASSERT(code < ABS_CNT, ("invalid evdev abs property"));
 
 	absinfo = (struct input_absinfo) {
 		.value = value,
 		.minimum = minimum,
 		.maximum = maximum,
 		.fuzz = fuzz,
 		.flat = flat,
 		.resolution = resolution,
 	};
 	evdev_set_abs_bit(evdev, code);
 	evdev_set_absinfo(evdev, code, &absinfo);
 }
 
 inline void
 evdev_set_abs_bit(struct evdev_dev *evdev, uint16_t code)
 {
 
 	KASSERT(code < ABS_CNT, ("invalid evdev abs property"));
 	if (evdev->ev_absinfo == NULL)
 		evdev->ev_absinfo = evdev_alloc_absinfo();
 	bit_set(evdev->ev_abs_flags, code);
 }
 
 inline void
 evdev_support_msc(struct evdev_dev *evdev, uint16_t code)
 {
 
 	KASSERT(code < MSC_CNT, ("invalid evdev msc property"));
 	bit_set(evdev->ev_msc_flags, code);
 }
 
 
 inline void
 evdev_support_led(struct evdev_dev *evdev, uint16_t code)
 {
 
 	KASSERT(code < LED_CNT, ("invalid evdev led property"));
 	bit_set(evdev->ev_led_flags, code);
 }
 
 inline void
 evdev_support_snd(struct evdev_dev *evdev, uint16_t code)
 {
 
 	KASSERT(code < SND_CNT, ("invalid evdev snd property"));
 	bit_set(evdev->ev_snd_flags, code);
 }
 
 inline void
 evdev_support_sw(struct evdev_dev *evdev, uint16_t code)
 {
 
 	KASSERT(code < SW_CNT, ("invalid evdev sw property"));
 	bit_set(evdev->ev_sw_flags, code);
 }
 
 bool
 evdev_event_supported(struct evdev_dev *evdev, uint16_t type)
 {
 
 	KASSERT(type < EV_CNT, ("invalid evdev event property"));
 	return (bit_test(evdev->ev_type_flags, type));
 }
 
 inline void
 evdev_set_absinfo(struct evdev_dev *evdev, uint16_t axis,
     struct input_absinfo *absinfo)
 {
 
 	KASSERT(axis < ABS_CNT, ("invalid evdev abs property"));
 
 	if (axis == ABS_MT_SLOT &&
 	    (absinfo->maximum < 1 || absinfo->maximum >= MAX_MT_SLOTS))
 		return;
 
 	if (evdev->ev_absinfo == NULL)
 		evdev->ev_absinfo = evdev_alloc_absinfo();
 
 	if (axis == ABS_MT_SLOT)
 		evdev->ev_absinfo[ABS_MT_SLOT].maximum = absinfo->maximum;
 	else
 		memcpy(&evdev->ev_absinfo[axis], absinfo,
 		    sizeof(struct input_absinfo));
 }
 
 inline void
 evdev_set_repeat_params(struct evdev_dev *evdev, uint16_t property, int value)
 {
 
 	KASSERT(property < REP_CNT, ("invalid evdev repeat property"));
 	evdev->ev_rep[property] = value;
 }
 
 inline void
 evdev_set_flag(struct evdev_dev *evdev, uint16_t flag)
 {
 
 	KASSERT(flag < EVDEV_FLAG_CNT, ("invalid evdev flag property"));
 	bit_set(evdev->ev_flags, flag);
 }
 
 static int
 evdev_check_event(struct evdev_dev *evdev, uint16_t type, uint16_t code,
     int32_t value)
 {
 
 	if (type >= EV_CNT)
 		return (EINVAL);
 
 	/* Allow SYN events implicitly */
 	if (type != EV_SYN && !evdev_event_supported(evdev, type))
 		return (EINVAL);
 
 	switch (type) {
 	case EV_SYN:
 		if (code >= SYN_CNT)
 			return (EINVAL);
 		break;
 
 	case EV_KEY:
 		if (code >= KEY_CNT)
 			return (EINVAL);
 		if (!bit_test(evdev->ev_key_flags, code))
 			return (EINVAL);
 		break;
 
 	case EV_REL:
 		if (code >= REL_CNT)
 			return (EINVAL);
 		if (!bit_test(evdev->ev_rel_flags, code))
 			return (EINVAL);
 		break;
 
 	case EV_ABS:
 		if (code >= ABS_CNT)
 			return (EINVAL);
 		if (!bit_test(evdev->ev_abs_flags, code))
 			return (EINVAL);
 		if (code == ABS_MT_SLOT &&
 		    (value < 0 || value > MAXIMAL_MT_SLOT(evdev)))
 			return (EINVAL);
 		if (ABS_IS_MT(code) && evdev->ev_mt == NULL &&
 		    bit_test(evdev->ev_abs_flags, ABS_MT_SLOT))
 			return (EINVAL);
 		break;
 
 	case EV_MSC:
 		if (code >= MSC_CNT)
 			return (EINVAL);
 		if (!bit_test(evdev->ev_msc_flags, code))
 			return (EINVAL);
 		break;
 
 	case EV_LED:
 		if (code >= LED_CNT)
 			return (EINVAL);
 		if (!bit_test(evdev->ev_led_flags, code))
 			return (EINVAL);
 		break;
 
 	case EV_SND:
 		if (code >= SND_CNT)
 			return (EINVAL);
 		if (!bit_test(evdev->ev_snd_flags, code))
 			return (EINVAL);
 		break;
 
 	case EV_SW:
 		if (code >= SW_CNT)
 			return (EINVAL);
 		if (!bit_test(evdev->ev_sw_flags, code))
 			return (EINVAL);
 		break;
 
 	case EV_REP:
 		if (code >= REP_CNT)
 			return (EINVAL);
 		break;
 
 	default:
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static void
 evdev_modify_event(struct evdev_dev *evdev, uint16_t type, uint16_t code,
     int32_t *value)
 {
 
 	EVDEV_LOCK_ASSERT(evdev);
 
 	switch (type) {
 	case EV_KEY:
 		if (!evdev_event_supported(evdev, EV_REP))
 			break;
 
 		if (!bit_test(evdev->ev_flags, EVDEV_FLAG_SOFTREPEAT)) {
 			/* Detect driver key repeats. */
 			if (bit_test(evdev->ev_key_states, code) &&
 			    *value == KEY_EVENT_DOWN)
 				*value = KEY_EVENT_REPEAT;
 		} else {
 			/* Start/stop callout for evdev repeats */
 			if (bit_test(evdev->ev_key_states, code) == !*value) {
 				if (*value == KEY_EVENT_DOWN)
 					evdev_start_repeat(evdev, code);
 				else
 					evdev_stop_repeat(evdev);
 			}
 		}
 		break;
 
 	case EV_ABS:
 		/* TBD: implement fuzz */
 		break;
 	}
 }
 
 static enum evdev_sparse_result
 evdev_sparse_event(struct evdev_dev *evdev, uint16_t type, uint16_t code,
     int32_t value)
 {
 	int32_t last_mt_slot;
 
 	EVDEV_LOCK_ASSERT(evdev);
 
 	/*
 	 * For certain event types, update device state bits
 	 * and convert level reporting to edge reporting
 	 */
 	switch (type) {
 	case EV_KEY:
 		switch (value) {
 		case KEY_EVENT_UP:
 		case KEY_EVENT_DOWN:
 			if (bit_test(evdev->ev_key_states, code) == value)
 				return (EV_SKIP_EVENT);
 			bit_change(evdev->ev_key_states, code, value);
 			break;
 
 		case KEY_EVENT_REPEAT:
 			if (bit_test(evdev->ev_key_states, code) == 0 ||
 			    !evdev_event_supported(evdev, EV_REP))
 				return (EV_SKIP_EVENT);
 			break;
 
 		default:
 			 return (EV_SKIP_EVENT);
 		}
 		break;
 
 	case EV_LED:
 		if (bit_test(evdev->ev_led_states, code) == value)
 			return (EV_SKIP_EVENT);
 		bit_change(evdev->ev_led_states, code, value);
 		break;
 
 	case EV_SND:
 		if (bit_test(evdev->ev_snd_states, code) == value)
 			return (EV_SKIP_EVENT);
 		bit_change(evdev->ev_snd_states, code, value);
 		break;
 
 	case EV_SW:
 		if (bit_test(evdev->ev_sw_states, code) == value)
 			return (EV_SKIP_EVENT);
 		bit_change(evdev->ev_sw_states, code, value);
 		break;
 
 	case EV_REP:
 		if (evdev->ev_rep[code] == value)
 			return (EV_SKIP_EVENT);
 		evdev_set_repeat_params(evdev, code, value);
 		break;
 
 	case EV_REL:
 		if (value == 0)
 			return (EV_SKIP_EVENT);
 		break;
 
 	/* For EV_ABS, save last value in absinfo and ev_mt_states */
 	case EV_ABS:
 		switch (code) {
 		case ABS_MT_SLOT:
 			/* Postpone ABS_MT_SLOT till next event */
 			evdev_set_last_mt_slot(evdev, value);
 			return (EV_SKIP_EVENT);
 
 		case ABS_MT_FIRST ... ABS_MT_LAST:
 			/* Pass MT protocol type A events as is */
 			if (!bit_test(evdev->ev_abs_flags, ABS_MT_SLOT))
 				break;
 			/* Don`t repeat MT protocol type B events */
 			last_mt_slot = evdev_get_last_mt_slot(evdev);
 			if (evdev_get_mt_value(evdev, last_mt_slot, code)
 			     == value)
 				return (EV_SKIP_EVENT);
 			evdev_set_mt_value(evdev, last_mt_slot, code, value);
 			if (last_mt_slot != CURRENT_MT_SLOT(evdev)) {
 				CURRENT_MT_SLOT(evdev) = last_mt_slot;
 				evdev->ev_report_opened = true;
 				return (EV_REPORT_MT_SLOT);
 			}
 			break;
 
 		default:
 			if (evdev->ev_absinfo[code].value == value)
 				return (EV_SKIP_EVENT);
 			evdev->ev_absinfo[code].value = value;
 		}
 		break;
 
 	case EV_SYN:
 		if (code == SYN_REPORT) {
 			/* Skip empty reports */
 			if (!evdev->ev_report_opened)
 				return (EV_SKIP_EVENT);
 			evdev->ev_report_opened = false;
 			return (EV_REPORT_EVENT);
 		}
 		break;
 	}
 
 	evdev->ev_report_opened = true;
 	return (EV_REPORT_EVENT);
 }
 
 static void
 evdev_propagate_event(struct evdev_dev *evdev, uint16_t type, uint16_t code,
     int32_t value)
 {
 	struct evdev_client *client;
 
 	debugf(evdev, "%s pushed event %d/%d/%d",
 	    evdev->ev_shortname, type, code, value);
 
 	EVDEV_LOCK_ASSERT(evdev);
 
 	/* Propagate event through all clients */
 	LIST_FOREACH(client, &evdev->ev_clients, ec_link) {
 		if (evdev->ev_grabber != NULL && evdev->ev_grabber != client)
 			continue;
 
 		EVDEV_CLIENT_LOCKQ(client);
 		evdev_client_push(client, type, code, value);
 		if (type == EV_SYN && code == SYN_REPORT)
 			evdev_notify_event(client);
 		EVDEV_CLIENT_UNLOCKQ(client);
 	}
 
 	/* Update counters */
 	evdev->ev_event_count++;
 	if (type == EV_SYN && code == SYN_REPORT)
 		evdev->ev_report_count++;
 }
 
 void
 evdev_send_event(struct evdev_dev *evdev, uint16_t type, uint16_t code,
     int32_t value)
 {
 	enum evdev_sparse_result sparse;
 
 	EVDEV_LOCK_ASSERT(evdev);
 
 	sparse =  evdev_sparse_event(evdev, type, code, value);
 	switch (sparse) {
 	case EV_REPORT_MT_SLOT:
 		/* report postponed ABS_MT_SLOT */
 		evdev_propagate_event(evdev, EV_ABS, ABS_MT_SLOT,
 		    CURRENT_MT_SLOT(evdev));
 		/* FALLTHROUGH */
 	case EV_REPORT_EVENT:
 		evdev_propagate_event(evdev, type, code, value);
 		/* FALLTHROUGH */
 	case EV_SKIP_EVENT:
 		break;
 	}
 }
 
 int
 evdev_push_event(struct evdev_dev *evdev, uint16_t type, uint16_t code,
     int32_t value)
 {
 
 	if (evdev_check_event(evdev, type, code, value) != 0)
 		return (EINVAL);
 
 	EVDEV_LOCK(evdev);
 	evdev_modify_event(evdev, type, code, &value);
 	if (type == EV_SYN && code == SYN_REPORT && evdev->ev_report_opened &&
 	    bit_test(evdev->ev_flags, EVDEV_FLAG_MT_STCOMPAT))
 		evdev_send_mt_compat(evdev);
 	evdev_send_event(evdev, type, code, value);
 	EVDEV_UNLOCK(evdev);
 
 	return (0);
 }
 
 int
 evdev_inject_event(struct evdev_dev *evdev, uint16_t type, uint16_t code,
     int32_t value)
 {
 	int ret = 0;
 
 	switch (type) {
 	case EV_REP:
 		/* evdev repeats should not be processed by hardware driver */
 		if (bit_test(evdev->ev_flags, EVDEV_FLAG_SOFTREPEAT))
 			goto push;
 		/* FALLTHROUGH */
 	case EV_LED:
 	case EV_MSC:
 	case EV_SND:
 	case EV_FF:
 		if (evdev->ev_methods != NULL &&
 		    evdev->ev_methods->ev_event != NULL)
 			evdev->ev_methods->ev_event(evdev, evdev->ev_softc,
 			    type, code, value);
 		/*
 		 * Leds and driver repeats should be reported in ev_event
 		 * method body to interoperate with kbdmux states and rates
 		 * propagation so both ways (ioctl and evdev) of changing it
 		 * will produce only one evdev event report to client.
 		 */
 		if (type == EV_LED || type == EV_REP)
 			break;
 		/* FALLTHROUGH */
 	case EV_SYN:
 	case EV_KEY:
 	case EV_REL:
 	case EV_ABS:
 	case EV_SW:
 push:
 		ret = evdev_push_event(evdev, type,  code, value);
 		break;
 
 	default:
 		ret = EINVAL;
 	}
 
 	return (ret);
 }
 
 inline int
 evdev_sync(struct evdev_dev *evdev)
 {
 
 	return (evdev_push_event(evdev, EV_SYN, SYN_REPORT, 1));
 }
 
 
 inline int
 evdev_mt_sync(struct evdev_dev *evdev)
 {
 
 	return (evdev_push_event(evdev, EV_SYN, SYN_MT_REPORT, 1));
 }
 
 int
 evdev_register_client(struct evdev_dev *evdev, struct evdev_client *client)
 {
 	int ret = 0;
 
 	debugf(evdev, "adding new client for device %s", evdev->ev_shortname);
 
 	EVDEV_LOCK_ASSERT(evdev);
 
 	if (LIST_EMPTY(&evdev->ev_clients) && evdev->ev_methods != NULL &&
 	    evdev->ev_methods->ev_open != NULL) {
 		debugf(evdev, "calling ev_open() on device %s",
 		    evdev->ev_shortname);
 		ret = evdev->ev_methods->ev_open(evdev, evdev->ev_softc);
 	}
 	if (ret == 0)
 		LIST_INSERT_HEAD(&evdev->ev_clients, client, ec_link);
 	return (ret);
 }
 
 void
 evdev_dispose_client(struct evdev_dev *evdev, struct evdev_client *client)
 {
 	debugf(evdev, "removing client for device %s", evdev->ev_shortname);
 
 	EVDEV_LOCK_ASSERT(evdev);
 
 	LIST_REMOVE(client, ec_link);
 	if (LIST_EMPTY(&evdev->ev_clients)) {
 		if (evdev->ev_methods != NULL &&
 		    evdev->ev_methods->ev_close != NULL)
 			evdev->ev_methods->ev_close(evdev, evdev->ev_softc);
 		if (evdev_event_supported(evdev, EV_REP) &&
 		    bit_test(evdev->ev_flags, EVDEV_FLAG_SOFTREPEAT))
 			evdev_stop_repeat(evdev);
 	}
 	evdev_release_client(evdev, client);
 }
 
 int
 evdev_grab_client(struct evdev_dev *evdev, struct evdev_client *client)
 {
 
 	EVDEV_LOCK_ASSERT(evdev);
 
 	if (evdev->ev_grabber != NULL)
 		return (EBUSY);
 
 	evdev->ev_grabber = client;
 
 	return (0);
 }
 
 int
 evdev_release_client(struct evdev_dev *evdev, struct evdev_client *client)
 {
 
 	EVDEV_LOCK_ASSERT(evdev);
 
 	if (evdev->ev_grabber != client)
 		return (EINVAL);
 
 	evdev->ev_grabber = NULL;
 
 	return (0);
 }
 
 static void
 evdev_repeat_callout(void *arg)
 {
 	struct evdev_dev *evdev = (struct evdev_dev *)arg;
 
 	evdev_send_event(evdev, EV_KEY, evdev->ev_rep_key, KEY_EVENT_REPEAT);
 	evdev_send_event(evdev, EV_SYN, SYN_REPORT, 1);
 
 	if (evdev->ev_rep[REP_PERIOD])
 		callout_reset(&evdev->ev_rep_callout,
 		    evdev->ev_rep[REP_PERIOD] * hz / 1000,
 		    evdev_repeat_callout, evdev);
 	else
 		evdev->ev_rep_key = KEY_RESERVED;
 }
 
 static void
 evdev_start_repeat(struct evdev_dev *evdev, uint16_t key)
 {
 
 	EVDEV_LOCK_ASSERT(evdev);
 
 	if (evdev->ev_rep[REP_DELAY]) {
 		evdev->ev_rep_key = key;
 		callout_reset(&evdev->ev_rep_callout,
 		    evdev->ev_rep[REP_DELAY] * hz / 1000,
 		    evdev_repeat_callout, evdev);
 	}
 }
 
 static void
 evdev_stop_repeat(struct evdev_dev *evdev)
 {
 
 	EVDEV_LOCK_ASSERT(evdev);
 
 	if (evdev->ev_rep_key != KEY_RESERVED) {
 		callout_stop(&evdev->ev_rep_callout);
 		evdev->ev_rep_key = KEY_RESERVED;
 	}
 }
Index: user/alc/PQ_LAUNDRY/sys/dev/ofw/ofw_fdt.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/dev/ofw/ofw_fdt.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/dev/ofw/ofw_fdt.c	(revision 306283)
@@ -1,479 +1,488 @@
 /*-
  * Copyright (c) 2009-2010 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Semihalf under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
 
 #include <contrib/libfdt/libfdt.h>
 
 #include <machine/stdarg.h>
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/ofwvar.h>
 #include <dev/ofw/openfirm.h>
 
 #include "ofw_if.h"
 
 #ifdef DEBUG
 #define debugf(fmt, args...) do { printf("%s(): ", __func__);	\
     printf(fmt,##args); } while (0)
 #else
 #define debugf(fmt, args...)
 #endif
 
+#if defined(__arm__)
+#if defined(SOC_MV_ARMADAXP) || defined(SOC_MV_ARMADA38X) || \
+    defined(SOC_MV_DISCOVERY) || defined(SOC_MV_DOVE) || \
+    defined(SOC_MV_FREY) || defined(SOC_MV_KIRKWOOD) || \
+    defined(SOC_MV_LOKIPLUS) || defined(SOC_MV_ORION)
+#define FDT_MARVELL
+#endif
+#endif
+
 static int ofw_fdt_init(ofw_t, void *);
 static phandle_t ofw_fdt_peer(ofw_t, phandle_t);
 static phandle_t ofw_fdt_child(ofw_t, phandle_t);
 static phandle_t ofw_fdt_parent(ofw_t, phandle_t);
 static phandle_t ofw_fdt_instance_to_package(ofw_t, ihandle_t);
 static ssize_t ofw_fdt_getproplen(ofw_t, phandle_t, const char *);
 static ssize_t ofw_fdt_getprop(ofw_t, phandle_t, const char *, void *, size_t);
 static int ofw_fdt_nextprop(ofw_t, phandle_t, const char *, char *, size_t);
 static int ofw_fdt_setprop(ofw_t, phandle_t, const char *, const void *,
     size_t);
 static ssize_t ofw_fdt_canon(ofw_t, const char *, char *, size_t);
 static phandle_t ofw_fdt_finddevice(ofw_t, const char *);
 static ssize_t ofw_fdt_instance_to_path(ofw_t, ihandle_t, char *, size_t);
 static ssize_t ofw_fdt_package_to_path(ofw_t, phandle_t, char *, size_t);
 static int ofw_fdt_interpret(ofw_t, const char *, int, cell_t *);
 
 static ofw_method_t ofw_fdt_methods[] = {
 	OFWMETHOD(ofw_init,			ofw_fdt_init),
 	OFWMETHOD(ofw_peer,			ofw_fdt_peer),
 	OFWMETHOD(ofw_child,			ofw_fdt_child),
 	OFWMETHOD(ofw_parent,			ofw_fdt_parent),
 	OFWMETHOD(ofw_instance_to_package,	ofw_fdt_instance_to_package),
 	OFWMETHOD(ofw_getproplen,		ofw_fdt_getproplen),
 	OFWMETHOD(ofw_getprop,			ofw_fdt_getprop),
 	OFWMETHOD(ofw_nextprop,			ofw_fdt_nextprop),
 	OFWMETHOD(ofw_setprop,			ofw_fdt_setprop),
 	OFWMETHOD(ofw_canon,			ofw_fdt_canon),
 	OFWMETHOD(ofw_finddevice,		ofw_fdt_finddevice),
 	OFWMETHOD(ofw_instance_to_path,		ofw_fdt_instance_to_path),
 	OFWMETHOD(ofw_package_to_path,		ofw_fdt_package_to_path),
 	OFWMETHOD(ofw_interpret,		ofw_fdt_interpret),
 	{ 0, 0 }
 };
 
 static ofw_def_t ofw_fdt = {
 	OFW_FDT,
 	ofw_fdt_methods,
 	0
 };
 OFW_DEF(ofw_fdt);
 
 static void *fdtp = NULL;
 
 static int
 sysctl_handle_dtb(SYSCTL_HANDLER_ARGS)
 {
 
         return (sysctl_handle_opaque(oidp, fdtp, fdt_totalsize(fdtp), req));
 }
 
 static void
 sysctl_register_fdt_oid(void *arg)
 {
 
 	/* If there is no FDT registered, skip adding the sysctl */
 	if (fdtp == NULL)
 		return;
 
 	SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_fdt), OID_AUTO, "dtb",
 	    CTLTYPE_OPAQUE | CTLFLAG_RD, NULL, 0, sysctl_handle_dtb, "",
 	    "Device Tree Blob");
 }
 SYSINIT(dtb_oid, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_register_fdt_oid, 0);
 
 static int
 ofw_fdt_init(ofw_t ofw, void *data)
 {
 	int err;
 
 	/* Check FDT blob integrity */
 	if ((err = fdt_check_header(data)) != 0)
 		return (err);
 
 	fdtp = data;
 	return (0);
 }
 
 /*
  * Device tree functions.
  *
  * We use the offset from fdtp to the node as the 'phandle' in OF interface.
  *
  * phandle is a u32 value, therefore we cannot use the pointer to node as
  * phandle in 64 bit. We also do not use the usual fdt offset as phandle,
  * as it can be 0, and the OF interface has special meaning for phandle 0.
  */
 
 static phandle_t
 fdt_offset_phandle(int offset)
 {
 	if (offset < 0)
 		return (0);
 	return ((phandle_t)offset + fdt_off_dt_struct(fdtp));
 }
 
 static int
 fdt_phandle_offset(phandle_t p)
 {
 	int pint = (int)p;
 	int dtoff = fdt_off_dt_struct(fdtp);
 
 	if (pint < dtoff)
 		return (-1);
 	return (pint - dtoff);
 }
 
 /* Return the next sibling of this node or 0. */
 static phandle_t
 ofw_fdt_peer(ofw_t ofw, phandle_t node)
 {
 	int depth, offset;
 
 	if (node == 0) {
 		/* Find root node */
 		offset = fdt_path_offset(fdtp, "/");
 
 		return (fdt_offset_phandle(offset));
 	}
 
 	offset = fdt_phandle_offset(node);
 	if (offset < 0)
 		return (0);
 
 	for (depth = 1, offset = fdt_next_node(fdtp, offset, &depth);
 	    offset >= 0;
 	    offset = fdt_next_node(fdtp, offset, &depth)) {
 		if (depth < 0)
 			return (0);
 		if (depth == 1)
 			return (fdt_offset_phandle(offset));
 	}
 
 	return (0);
 }
 
 /* Return the first child of this node or 0. */
 static phandle_t
 ofw_fdt_child(ofw_t ofw, phandle_t node)
 {
 	int depth, offset;
 
 	offset = fdt_phandle_offset(node);
 	if (offset < 0)
 		return (0);
 
 	for (depth = 0, offset = fdt_next_node(fdtp, offset, &depth);
 	    (offset >= 0) && (depth > 0);
 	    offset = fdt_next_node(fdtp, offset, &depth)) {
 		if (depth < 0)
 			return (0);
 		if (depth == 1)
 			return (fdt_offset_phandle(offset));
 	}
 
 	return (0);
 }
 
 /* Return the parent of this node or 0. */
 static phandle_t
 ofw_fdt_parent(ofw_t ofw, phandle_t node)
 {
 	int offset, paroffset;
 
 	offset = fdt_phandle_offset(node);
 	if (offset < 0)
 		return (0);
 
 	paroffset = fdt_parent_offset(fdtp, offset);
 	return (fdt_offset_phandle(paroffset));
 }
 
 /* Return the package handle that corresponds to an instance handle. */
 static phandle_t
 ofw_fdt_instance_to_package(ofw_t ofw, ihandle_t instance)
 {
 
 	/* Where real OF uses ihandles in the tree, FDT uses xref phandles */
 	return (OF_node_from_xref(instance));
 }
 
 /* Get the length of a property of a package. */
 static ssize_t
 ofw_fdt_getproplen(ofw_t ofw, phandle_t package, const char *propname)
 {
 	const struct fdt_property *prop;
 	int offset, len;
 
 	offset = fdt_phandle_offset(package);
 	if (offset < 0)
 		return (-1);
 
 	len = -1;
 	prop = fdt_get_property(fdtp, offset, propname, &len);
 
 	if (prop == NULL && strcmp(propname, "name") == 0) {
 		/* Emulate the 'name' property */
 		fdt_get_name(fdtp, offset, &len);
 		return (len + 1);
 	}
 
 	if (prop == NULL && offset == fdt_path_offset(fdtp, "/chosen")) {
 		if (strcmp(propname, "fdtbootcpu") == 0)
 			return (sizeof(cell_t));
 		if (strcmp(propname, "fdtmemreserv") == 0)
 			return (sizeof(uint64_t)*2*fdt_num_mem_rsv(fdtp));
 	}
 
 	if (prop == NULL)
 		return (-1);
 
 	return (len);
 }
 
 /* Get the value of a property of a package. */
 static ssize_t
 ofw_fdt_getprop(ofw_t ofw, phandle_t package, const char *propname, void *buf,
     size_t buflen)
 {
 	const void *prop;
 	const char *name;
 	int len, offset;
 	uint32_t cpuid;
 
 	offset = fdt_phandle_offset(package);
 	if (offset < 0)
 		return (-1);
 
 	prop = fdt_getprop(fdtp, offset, propname, &len);
 
 	if (prop == NULL && strcmp(propname, "name") == 0) {
 		/* Emulate the 'name' property */
 		name = fdt_get_name(fdtp, offset, &len);
 		strncpy(buf, name, buflen);
 		if (len + 1 > buflen)
 			len = buflen;
 		return (len + 1);
 	}
 
 	if (prop == NULL && offset == fdt_path_offset(fdtp, "/chosen")) {
 		if (strcmp(propname, "fdtbootcpu") == 0) {
 			cpuid = cpu_to_fdt32(fdt_boot_cpuid_phys(fdtp));
 			len = sizeof(cpuid);
 			prop = &cpuid;
 		}
 		if (strcmp(propname, "fdtmemreserv") == 0) {
 			prop = (char *)fdtp + fdt_off_mem_rsvmap(fdtp);
 			len = sizeof(uint64_t)*2*fdt_num_mem_rsv(fdtp);
 		}
 	}
 
 	if (prop == NULL)
 		return (-1);
 
 	if (len > buflen)
 		len = buflen;
 	bcopy(prop, buf, len);
 	return (len);
 }
 
 /*
  * Get the next property of a package. Return values:
  *  -1: package or previous property does not exist
  *   0: no more properties
  *   1: success
  */
 static int
 ofw_fdt_nextprop(ofw_t ofw, phandle_t package, const char *previous, char *buf,
     size_t size)
 {
 	const struct fdt_property *prop;
 	const char *name;
 	int offset;
 
 	offset = fdt_phandle_offset(package);
 	if (offset < 0)
 		return (-1);
 
 	/* Find the first prop in the node */
 	offset = fdt_first_property_offset(fdtp, offset);
 	if (offset < 0)
 		return (0); /* No properties */
 
 	if (previous != NULL) {
 		while (offset >= 0) {
 			prop = fdt_get_property_by_offset(fdtp, offset, NULL);
 			if (prop == NULL)
 				return (-1); /* Internal error */
 
 			offset = fdt_next_property_offset(fdtp, offset);
 			if (offset < 0)
 				return (0); /* No more properties */
 
 			/* Check if the last one was the one we wanted */
 			name = fdt_string(fdtp, fdt32_to_cpu(prop->nameoff));
 			if (strcmp(name, previous) == 0)
 				break;
 		}
 	}
 
 	prop = fdt_get_property_by_offset(fdtp, offset, &offset);
 	if (prop == NULL)
 		return (-1); /* Internal error */
 
 	strncpy(buf, fdt_string(fdtp, fdt32_to_cpu(prop->nameoff)), size);
 
 	return (1);
 }
 
 /* Set the value of a property of a package. */
 static int
 ofw_fdt_setprop(ofw_t ofw, phandle_t package, const char *propname,
     const void *buf, size_t len)
 {
 	int offset;
 
 	offset = fdt_phandle_offset(package);
 	if (offset < 0)
 		return (-1);
 
 	return (fdt_setprop_inplace(fdtp, offset, propname, buf, len));
 }
 
 /* Convert a device specifier to a fully qualified pathname. */
 static ssize_t
 ofw_fdt_canon(ofw_t ofw, const char *device, char *buf, size_t len)
 {
 
 	return (-1);
 }
 
 /* Return a package handle for the specified device. */
 static phandle_t
 ofw_fdt_finddevice(ofw_t ofw, const char *device)
 {
 	int offset;
 
 	offset = fdt_path_offset(fdtp, device);
 	if (offset < 0)
 		return (-1);
 	return (fdt_offset_phandle(offset));
 }
 
 /* Return the fully qualified pathname corresponding to an instance. */
 static ssize_t
 ofw_fdt_instance_to_path(ofw_t ofw, ihandle_t instance, char *buf, size_t len)
 {
 	phandle_t phandle;
 
 	phandle = OF_instance_to_package(instance);
 	if (phandle == -1)
 		return (-1);
 
 	return (OF_package_to_path(phandle, buf, len));
 }
 
 /* Return the fully qualified pathname corresponding to a package. */
 static ssize_t
 ofw_fdt_package_to_path(ofw_t ofw, phandle_t package, char *buf, size_t len)
 {
 
 	return (-1);
 }
 
-#if defined(__arm__) || defined(__powerpc__)
+#if defined(FDT_MARVELL) || defined(__powerpc__)
 static int
 ofw_fdt_fixup(ofw_t ofw)
 {
 #define FDT_MODEL_LEN	80
 	char model[FDT_MODEL_LEN];
 	phandle_t root;
 	ssize_t len;
 	int i;
 
 	if ((root = ofw_fdt_finddevice(ofw, "/")) == -1)
 		return (ENODEV);
 
 	if ((len = ofw_fdt_getproplen(ofw, root, "model")) <= 0)
 		return (0);
 
 	bzero(model, FDT_MODEL_LEN);
 	if (ofw_fdt_getprop(ofw, root, "model", model, FDT_MODEL_LEN) <= 0)
 		return (0);
 
 	/*
 	 * Search fixup table and call handler if appropriate.
 	 */
 	for (i = 0; fdt_fixup_table[i].model != NULL; i++) {
 		if (strncmp(model, fdt_fixup_table[i].model,
 		    FDT_MODEL_LEN) != 0)
 			continue;
 
 		if (fdt_fixup_table[i].handler != NULL)
 			(*fdt_fixup_table[i].handler)(root);
 	}
 
 	return (0);
 }
 #endif
 
 static int
 ofw_fdt_interpret(ofw_t ofw, const char *cmd, int nret, cell_t *retvals)
 {
-#if defined(__arm__) || defined(__powerpc__)
+#if defined(FDT_MARVELL) || defined(__powerpc__)
 	int rv;
 
 	/*
 	 * Note: FDT does not have the possibility to 'interpret' commands,
 	 * but we abuse the interface a bit to use it for doing non-standard
 	 * operations on the device tree blob.
 	 *
 	 * Currently the only supported 'command' is to trigger performing
 	 * fixups.
 	 */
 	if (strncmp("perform-fixup", cmd, 13) != 0)
 		return (0);
 
 	rv = ofw_fdt_fixup(ofw);
 	if (nret > 0)
 		retvals[0] = rv;
 
 	return (rv);
 #else
 	return (0);
 #endif
 }
Index: user/alc/PQ_LAUNDRY/sys/dev/usb/input/ukbd.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/dev/usb/input/ukbd.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/dev/usb/input/ukbd.c	(revision 306283)
@@ -1,2305 +1,2304 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 
 /*-
  * Copyright (c) 1998 The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Lennart Augustsson (lennart@augustsson.net) at
  * Carlstedt Research & Technology.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  */
 
 /*
  * HID spec: http://www.usb.org/developers/devclass_docs/HID1_11.pdf
  */
 
 #include "opt_compat.h"
 #include "opt_kbd.h"
 #include "opt_ukbd.h"
 #include "opt_evdev.h"
 
 #include <sys/stdint.h>
 #include <sys/stddef.h>
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
 #include <sys/unistd.h>
 #include <sys/callout.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 
 #include <dev/usb/usb.h>
 #include <dev/usb/usbdi.h>
 #include <dev/usb/usbdi_util.h>
 #include <dev/usb/usbhid.h>
 
 #define	USB_DEBUG_VAR ukbd_debug
 #include <dev/usb/usb_debug.h>
 
 #include <dev/usb/quirk/usb_quirk.h>
 
 #ifdef EVDEV
 #include <dev/evdev/input.h>
 #include <dev/evdev/evdev.h>
 #endif
 
 #include <sys/ioccom.h>
 #include <sys/filio.h>
 #include <sys/tty.h>
 #include <sys/kbio.h>
 
 #include <dev/kbd/kbdreg.h>
 
 /* the initial key map, accent map and fkey strings */
 #if defined(UKBD_DFLT_KEYMAP) && !defined(KLD_MODULE)
 #define	KBD_DFLT_KEYMAP
 #include "ukbdmap.h"
 #endif
 
 /* the following file must be included after "ukbdmap.h" */
 #include <dev/kbd/kbdtables.h>
 
 #ifdef USB_DEBUG
 static int ukbd_debug = 0;
 static int ukbd_no_leds = 0;
 static int ukbd_pollrate = 0;
 
 static SYSCTL_NODE(_hw_usb, OID_AUTO, ukbd, CTLFLAG_RW, 0, "USB keyboard");
 SYSCTL_INT(_hw_usb_ukbd, OID_AUTO, debug, CTLFLAG_RWTUN,
     &ukbd_debug, 0, "Debug level");
 SYSCTL_INT(_hw_usb_ukbd, OID_AUTO, no_leds, CTLFLAG_RWTUN,
     &ukbd_no_leds, 0, "Disables setting of keyboard leds");
 SYSCTL_INT(_hw_usb_ukbd, OID_AUTO, pollrate, CTLFLAG_RWTUN,
     &ukbd_pollrate, 0, "Force this polling rate, 1-1000Hz");
 #endif
 
 #define	UKBD_EMULATE_ATSCANCODE	       1
 #define	UKBD_DRIVER_NAME          "ukbd"
 #define	UKBD_NMOD                     8	/* units */
 #define	UKBD_NKEYCODE                 6	/* units */
 #define	UKBD_IN_BUF_SIZE  (2*(UKBD_NMOD + (2*UKBD_NKEYCODE)))	/* bytes */
 #define	UKBD_IN_BUF_FULL  ((UKBD_IN_BUF_SIZE / 2) - 1)	/* bytes */
 #define	UKBD_NFKEY        (sizeof(fkey_tab)/sizeof(fkey_tab[0]))	/* units */
 #define	UKBD_BUFFER_SIZE	      64	/* bytes */
 
 struct ukbd_data {
 	uint16_t	modifiers;
 #define	MOD_CONTROL_L	0x01
 #define	MOD_CONTROL_R	0x10
 #define	MOD_SHIFT_L	0x02
 #define	MOD_SHIFT_R	0x20
 #define	MOD_ALT_L	0x04
 #define	MOD_ALT_R	0x40
 #define	MOD_WIN_L	0x08
 #define	MOD_WIN_R	0x80
 /* internal */
 #define	MOD_EJECT	0x0100
 #define	MOD_FN		0x0200
 	uint8_t	keycode[UKBD_NKEYCODE];
 };
 
 enum {
 	UKBD_INTR_DT_0,
 	UKBD_INTR_DT_1,
 	UKBD_CTRL_LED,
 	UKBD_N_TRANSFER,
 };
 
 struct ukbd_softc {
 	keyboard_t sc_kbd;
 	keymap_t sc_keymap;
 	accentmap_t sc_accmap;
 	fkeytab_t sc_fkeymap[UKBD_NFKEY];
 	struct hid_location sc_loc_apple_eject;
 	struct hid_location sc_loc_apple_fn;
 	struct hid_location sc_loc_ctrl_l;
 	struct hid_location sc_loc_ctrl_r;
 	struct hid_location sc_loc_shift_l;
 	struct hid_location sc_loc_shift_r;
 	struct hid_location sc_loc_alt_l;
 	struct hid_location sc_loc_alt_r;
 	struct hid_location sc_loc_win_l;
 	struct hid_location sc_loc_win_r;
 	struct hid_location sc_loc_events;
 	struct hid_location sc_loc_numlock;
 	struct hid_location sc_loc_capslock;
 	struct hid_location sc_loc_scrolllock;
 	struct usb_callout sc_callout;
 	struct ukbd_data sc_ndata;
 	struct ukbd_data sc_odata;
 
 	struct thread *sc_poll_thread;
 	struct usb_device *sc_udev;
 	struct usb_interface *sc_iface;
 	struct usb_xfer *sc_xfer[UKBD_N_TRANSFER];
 #ifdef EVDEV
 	struct evdev_dev *sc_evdev;
 #endif
 
 	sbintime_t sc_co_basetime;
 	int	sc_delay;
 	uint32_t sc_ntime[UKBD_NKEYCODE];
 	uint32_t sc_otime[UKBD_NKEYCODE];
 	uint32_t sc_input[UKBD_IN_BUF_SIZE];	/* input buffer */
 	uint32_t sc_time_ms;
 	uint32_t sc_composed_char;	/* composed char code, if non-zero */
 #ifdef UKBD_EMULATE_ATSCANCODE
 	uint32_t sc_buffered_char[2];
 #endif
 	uint32_t sc_flags;		/* flags */
 #define	UKBD_FLAG_COMPOSE	0x00000001
 #define	UKBD_FLAG_POLLING	0x00000002
 #define	UKBD_FLAG_SET_LEDS	0x00000004
 #define	UKBD_FLAG_ATTACHED	0x00000010
 #define	UKBD_FLAG_GONE		0x00000020
 
 #define	UKBD_FLAG_HID_MASK	0x003fffc0
 #define	UKBD_FLAG_APPLE_EJECT	0x00000040
 #define	UKBD_FLAG_APPLE_FN	0x00000080
 #define	UKBD_FLAG_APPLE_SWAP	0x00000100
 #define	UKBD_FLAG_CTRL_L	0x00000400
 #define	UKBD_FLAG_CTRL_R	0x00000800
 #define	UKBD_FLAG_SHIFT_L	0x00001000
 #define	UKBD_FLAG_SHIFT_R	0x00002000
 #define	UKBD_FLAG_ALT_L		0x00004000
 #define	UKBD_FLAG_ALT_R		0x00008000
 #define	UKBD_FLAG_WIN_L		0x00010000
 #define	UKBD_FLAG_WIN_R		0x00020000
 #define	UKBD_FLAG_EVENTS	0x00040000
 #define	UKBD_FLAG_NUMLOCK	0x00080000
 #define	UKBD_FLAG_CAPSLOCK	0x00100000
 #define	UKBD_FLAG_SCROLLLOCK 	0x00200000
 
 	int	sc_mode;		/* input mode (K_XLATE,K_RAW,K_CODE) */
 	int	sc_state;		/* shift/lock key state */
 	int	sc_accents;		/* accent key index (> 0) */
 	int	sc_polling;		/* polling recursion count */
 	int	sc_led_size;
 	int	sc_kbd_size;
 
 	uint16_t sc_inputs;
 	uint16_t sc_inputhead;
 	uint16_t sc_inputtail;
 	uint16_t sc_modifiers;
 
 	uint8_t	sc_leds;		/* store for async led requests */
 	uint8_t	sc_iface_index;
 	uint8_t	sc_iface_no;
 	uint8_t sc_id_apple_eject;
 	uint8_t sc_id_apple_fn;
 	uint8_t sc_id_ctrl_l;
 	uint8_t sc_id_ctrl_r;
 	uint8_t sc_id_shift_l;
 	uint8_t sc_id_shift_r;
 	uint8_t sc_id_alt_l;
 	uint8_t sc_id_alt_r;
 	uint8_t sc_id_win_l;
 	uint8_t sc_id_win_r;
 	uint8_t sc_id_event;
 	uint8_t sc_id_numlock;
 	uint8_t sc_id_capslock;
 	uint8_t sc_id_scrolllock;
 	uint8_t sc_id_events;
 	uint8_t sc_kbd_id;
 
 	uint8_t sc_buffer[UKBD_BUFFER_SIZE];
 };
 
 #define	KEY_ERROR	  0x01
 
 #define	KEY_PRESS	  0
 #define	KEY_RELEASE	  0x400
 #define	KEY_INDEX(c)	  ((c) & 0xFF)
 
 #define	SCAN_PRESS	  0
 #define	SCAN_RELEASE	  0x80
 #define	SCAN_PREFIX_E0	  0x100
 #define	SCAN_PREFIX_E1	  0x200
 #define	SCAN_PREFIX_CTL	  0x400
 #define	SCAN_PREFIX_SHIFT 0x800
 #define	SCAN_PREFIX	(SCAN_PREFIX_E0  | SCAN_PREFIX_E1 | \
 			 SCAN_PREFIX_CTL | SCAN_PREFIX_SHIFT)
 #define	SCAN_CHAR(c)	((c) & 0x7f)
 
 #define	UKBD_LOCK()	USB_MTX_LOCK(&Giant)
 #define	UKBD_UNLOCK()	USB_MTX_UNLOCK(&Giant)
 #define	UKBD_LOCK_ASSERT()	USB_MTX_ASSERT(&Giant, MA_OWNED)
 
 struct ukbd_mods {
 	uint32_t mask, key;
 };
 
 static const struct ukbd_mods ukbd_mods[UKBD_NMOD] = {
 	{MOD_CONTROL_L, 0xe0},
 	{MOD_CONTROL_R, 0xe4},
 	{MOD_SHIFT_L, 0xe1},
 	{MOD_SHIFT_R, 0xe5},
 	{MOD_ALT_L, 0xe2},
 	{MOD_ALT_R, 0xe6},
 	{MOD_WIN_L, 0xe3},
 	{MOD_WIN_R, 0xe7},
 };
 
 #define	NN 0				/* no translation */
 /*
  * Translate USB keycodes to AT keyboard scancodes.
  */
 /*
  * FIXME: Mac USB keyboard generates:
  * 0x53: keypad NumLock/Clear
  * 0x66: Power
  * 0x67: keypad =
  * 0x68: F13
  * 0x69: F14
  * 0x6a: F15
  * 
  * USB Apple Keyboard JIS generates:
  * 0x90: Kana
  * 0x91: Eisu
  */
 static const uint8_t ukbd_trtab[256] = {
 	0, 0, 0, 0, 30, 48, 46, 32,	/* 00 - 07 */
 	18, 33, 34, 35, 23, 36, 37, 38,	/* 08 - 0F */
 	50, 49, 24, 25, 16, 19, 31, 20,	/* 10 - 17 */
 	22, 47, 17, 45, 21, 44, 2, 3,	/* 18 - 1F */
 	4, 5, 6, 7, 8, 9, 10, 11,	/* 20 - 27 */
 	28, 1, 14, 15, 57, 12, 13, 26,	/* 28 - 2F */
 	27, 43, 43, 39, 40, 41, 51, 52,	/* 30 - 37 */
 	53, 58, 59, 60, 61, 62, 63, 64,	/* 38 - 3F */
 	65, 66, 67, 68, 87, 88, 92, 70,	/* 40 - 47 */
 	104, 102, 94, 96, 103, 99, 101, 98,	/* 48 - 4F */
 	97, 100, 95, 69, 91, 55, 74, 78,/* 50 - 57 */
 	89, 79, 80, 81, 75, 76, 77, 71,	/* 58 - 5F */
 	72, 73, 82, 83, 86, 107, 122, NN,	/* 60 - 67 */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* 68 - 6F */
 	NN, NN, NN, NN, 115, 108, 111, 113,	/* 70 - 77 */
 	109, 110, 112, 118, 114, 116, 117, 119,	/* 78 - 7F */
 	121, 120, NN, NN, NN, NN, NN, 123,	/* 80 - 87 */
 	124, 125, 126, 127, 128, NN, NN, NN,	/* 88 - 8F */
 	129, 130, NN, NN, NN, NN, NN, NN,	/* 90 - 97 */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* 98 - 9F */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* A0 - A7 */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* A8 - AF */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* B0 - B7 */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* B8 - BF */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* C0 - C7 */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* C8 - CF */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* D0 - D7 */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* D8 - DF */
 	29, 42, 56, 105, 90, 54, 93, 106,	/* E0 - E7 */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* E8 - EF */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* F0 - F7 */
 	NN, NN, NN, NN, NN, NN, NN, NN,	/* F8 - FF */
 };
 
 static const uint8_t ukbd_boot_desc[] = {
 	0x05, 0x01, 0x09, 0x06, 0xa1,
 	0x01, 0x05, 0x07, 0x19, 0xe0,
 	0x29, 0xe7, 0x15, 0x00, 0x25,
 	0x01, 0x75, 0x01, 0x95, 0x08,
 	0x81, 0x02, 0x95, 0x01, 0x75,
 	0x08, 0x81, 0x01, 0x95, 0x03,
 	0x75, 0x01, 0x05, 0x08, 0x19,
 	0x01, 0x29, 0x03, 0x91, 0x02,
 	0x95, 0x05, 0x75, 0x01, 0x91,
 	0x01, 0x95, 0x06, 0x75, 0x08,
 	0x15, 0x00, 0x26, 0xff, 0x00,
 	0x05, 0x07, 0x19, 0x00, 0x2a,
 	0xff, 0x00, 0x81, 0x00, 0xc0
 };
 
 /* prototypes */
 static void	ukbd_timeout(void *);
 static void	ukbd_set_leds(struct ukbd_softc *, uint8_t);
 static int	ukbd_set_typematic(keyboard_t *, int);
 #ifdef UKBD_EMULATE_ATSCANCODE
 static uint32_t	ukbd_atkeycode(int, int);
 static int	ukbd_key2scan(struct ukbd_softc *, int, int, int);
 #endif
 static uint32_t	ukbd_read_char(keyboard_t *, int);
 static void	ukbd_clear_state(keyboard_t *);
 static int	ukbd_ioctl(keyboard_t *, u_long, caddr_t);
 static int	ukbd_enable(keyboard_t *);
 static int	ukbd_disable(keyboard_t *);
 static void	ukbd_interrupt(struct ukbd_softc *);
 static void	ukbd_event_keyinput(struct ukbd_softc *);
 
 static device_probe_t ukbd_probe;
 static device_attach_t ukbd_attach;
 static device_detach_t ukbd_detach;
 static device_resume_t ukbd_resume;
 
 #ifdef EVDEV
 static struct evdev_methods ukbd_evdev_methods = {
 	.ev_event = evdev_ev_kbd_event,
 };
 #endif
 
 static uint8_t
 ukbd_any_key_pressed(struct ukbd_softc *sc)
 {
 	uint8_t i;
 	uint8_t j;
 
 	for (j = i = 0; i < UKBD_NKEYCODE; i++)
 		j |= sc->sc_odata.keycode[i];
 
 	return (j ? 1 : 0);
 }
 
 static void
 ukbd_start_timer(struct ukbd_softc *sc)
 {
 	sbintime_t delay, prec;
 
 	delay = SBT_1MS * sc->sc_delay;
 	sc->sc_co_basetime += delay;
 	/* This is rarely called, so prefer precision to efficiency. */
 	prec = qmin(delay >> 7, SBT_1MS * 10);
 	usb_callout_reset_sbt(&sc->sc_callout, sc->sc_co_basetime, prec,
 	    ukbd_timeout, sc, C_ABSOLUTE);
 }
 
 static void
 ukbd_put_key(struct ukbd_softc *sc, uint32_t key)
 {
 
 	UKBD_LOCK_ASSERT();
 
 	DPRINTF("0x%02x (%d) %s\n", key, key,
 	    (key & KEY_RELEASE) ? "released" : "pressed");
 
 #ifdef EVDEV
 	if (evdev_rcpt_mask & EVDEV_RCPT_HW_KBD && sc->sc_evdev != NULL) {
 		evdev_push_event(sc->sc_evdev, EV_KEY,
 		    evdev_hid2key(KEY_INDEX(key)), !(key & KEY_RELEASE));
 		evdev_sync(sc->sc_evdev);
 	}
 #endif
 
 	if (sc->sc_inputs < UKBD_IN_BUF_SIZE) {
 		sc->sc_input[sc->sc_inputtail] = key;
 		++(sc->sc_inputs);
 		++(sc->sc_inputtail);
 		if (sc->sc_inputtail >= UKBD_IN_BUF_SIZE) {
 			sc->sc_inputtail = 0;
 		}
 	} else {
 		DPRINTF("input buffer is full\n");
 	}
 }
 
 static void
 ukbd_do_poll(struct ukbd_softc *sc, uint8_t wait)
 {
 
 	UKBD_LOCK_ASSERT();
 	KASSERT((sc->sc_flags & UKBD_FLAG_POLLING) != 0,
 	    ("ukbd_do_poll called when not polling\n"));
 	DPRINTFN(2, "polling\n");
 
 	if (USB_IN_POLLING_MODE_FUNC() == 0) {
 		/*
 		 * In this context the kernel is polling for input,
 		 * but the USB subsystem works in normal interrupt-driven
 		 * mode, so we just wait on the USB threads to do the job.
 		 * Note that we currently hold the Giant, but it's also used
 		 * as the transfer mtx, so we must release it while waiting.
 		 */
 		while (sc->sc_inputs == 0) {
 			/*
 			 * Give USB threads a chance to run.  Note that
 			 * kern_yield performs DROP_GIANT + PICKUP_GIANT.
 			 */
 			kern_yield(PRI_UNCHANGED);
 			if (!wait)
 				break;
 		}
 		return;
 	}
 
 	while (sc->sc_inputs == 0) {
 
 		usbd_transfer_poll(sc->sc_xfer, UKBD_N_TRANSFER);
 
 		/* Delay-optimised support for repetition of keys */
 		if (ukbd_any_key_pressed(sc)) {
 			/* a key is pressed - need timekeeping */
 			DELAY(1000);
 
 			/* 1 millisecond has passed */
 			sc->sc_time_ms += 1;
 		}
 
 		ukbd_interrupt(sc);
 
 		if (!wait)
 			break;
 	}
 }
 
 static int32_t
 ukbd_get_key(struct ukbd_softc *sc, uint8_t wait)
 {
 	int32_t c;
 
 	UKBD_LOCK_ASSERT();
 	KASSERT((USB_IN_POLLING_MODE_FUNC() == 0) ||
 	    (sc->sc_flags & UKBD_FLAG_POLLING) != 0,
 	    ("not polling in kdb or panic\n"));
 
 	if (sc->sc_inputs == 0 &&
 	    (sc->sc_flags & UKBD_FLAG_GONE) == 0) {
 		/* start transfer, if not already started */
 		usbd_transfer_start(sc->sc_xfer[UKBD_INTR_DT_0]);
 		usbd_transfer_start(sc->sc_xfer[UKBD_INTR_DT_1]);
 	}
 
 	if (sc->sc_flags & UKBD_FLAG_POLLING)
 		ukbd_do_poll(sc, wait);
 
 	if (sc->sc_inputs == 0) {
 		c = -1;
 	} else {
 		c = sc->sc_input[sc->sc_inputhead];
 		--(sc->sc_inputs);
 		++(sc->sc_inputhead);
 		if (sc->sc_inputhead >= UKBD_IN_BUF_SIZE) {
 			sc->sc_inputhead = 0;
 		}
 	}
 	return (c);
 }
 
 static void
 ukbd_interrupt(struct ukbd_softc *sc)
 {
 	struct timeval ctv;
 	uint32_t n_mod;
 	uint32_t o_mod;
 	uint32_t now = sc->sc_time_ms;
 	int32_t dtime;
 	uint8_t key;
 	uint8_t i;
 	uint8_t j;
 
 	UKBD_LOCK_ASSERT();
 
 	if (sc->sc_ndata.keycode[0] == KEY_ERROR)
 		return;
 
 	n_mod = sc->sc_ndata.modifiers;
 	o_mod = sc->sc_odata.modifiers;
 	if (n_mod != o_mod) {
 		for (i = 0; i < UKBD_NMOD; i++) {
 			if ((n_mod & ukbd_mods[i].mask) !=
 			    (o_mod & ukbd_mods[i].mask)) {
 				ukbd_put_key(sc, ukbd_mods[i].key |
 				    ((n_mod & ukbd_mods[i].mask) ?
 				    KEY_PRESS : KEY_RELEASE));
 			}
 		}
 	}
 	/* Check for released keys. */
 	for (i = 0; i < UKBD_NKEYCODE; i++) {
 		key = sc->sc_odata.keycode[i];
 		if (key == 0) {
 			continue;
 		}
 		for (j = 0; j < UKBD_NKEYCODE; j++) {
 			if (sc->sc_ndata.keycode[j] == 0) {
 				continue;
 			}
 			if (key == sc->sc_ndata.keycode[j]) {
 				goto rfound;
 			}
 		}
 		ukbd_put_key(sc, key | KEY_RELEASE);
 rfound:	;
 	}
 
 	/* Check for pressed keys. */
 	for (i = 0; i < UKBD_NKEYCODE; i++) {
 		key = sc->sc_ndata.keycode[i];
 		if (key == 0) {
 			continue;
 		}
 		sc->sc_ntime[i] = now + sc->sc_kbd.kb_delay1;
 		for (j = 0; j < UKBD_NKEYCODE; j++) {
 			if (sc->sc_odata.keycode[j] == 0) {
 				continue;
 			}
 			if (key == sc->sc_odata.keycode[j]) {
 
 				/* key is still pressed */
 
 				sc->sc_ntime[i] = sc->sc_otime[j];
 				dtime = (sc->sc_otime[j] - now);
 
 				if (dtime > 0) {
 					/* time has not elapsed */
 					goto pfound;
 				}
 				sc->sc_ntime[i] = now + sc->sc_kbd.kb_delay2;
 				break;
 			}
 		}
 		if (j < UKBD_NKEYCODE) {
 			/* Old key repeating. */
 			sc->sc_delay = sc->sc_kbd.kb_delay2;
 		} else {
 			/* New key. */
 			microuptime(&ctv);
 			sc->sc_co_basetime = tvtosbt(ctv);
 			sc->sc_delay = sc->sc_kbd.kb_delay1;
 		}
 		ukbd_put_key(sc, key | KEY_PRESS);
 
 		/*
                  * If any other key is presently down, force its repeat to be
                  * well in the future (100s).  This makes the last key to be
                  * pressed do the autorepeat.
                  */
 		for (j = 0; j != UKBD_NKEYCODE; j++) {
 			if (j != i)
 				sc->sc_ntime[j] = now + (100 * 1000);
 		}
 pfound:	;
 	}
 
 	sc->sc_odata = sc->sc_ndata;
 
 	memcpy(sc->sc_otime, sc->sc_ntime, sizeof(sc->sc_otime));
 
 	ukbd_event_keyinput(sc);
 }
 
 static void
 ukbd_event_keyinput(struct ukbd_softc *sc)
 {
 	int c;
 
 	UKBD_LOCK_ASSERT();
 
 	if ((sc->sc_flags & UKBD_FLAG_POLLING) != 0)
 		return;
 
 	if (sc->sc_inputs == 0)
 		return;
 
 	if (KBD_IS_ACTIVE(&sc->sc_kbd) &&
 	    KBD_IS_BUSY(&sc->sc_kbd)) {
 		/* let the callback function process the input */
 		(sc->sc_kbd.kb_callback.kc_func) (&sc->sc_kbd, KBDIO_KEYINPUT,
 		    sc->sc_kbd.kb_callback.kc_arg);
 	} else {
 		/* read and discard the input, no one is waiting for it */
 		do {
 			c = ukbd_read_char(&sc->sc_kbd, 0);
 		} while (c != NOKEY);
 	}
 }
 
 static void
 ukbd_timeout(void *arg)
 {
 	struct ukbd_softc *sc = arg;
 
 	UKBD_LOCK_ASSERT();
 
 	sc->sc_time_ms += sc->sc_delay;
 	sc->sc_delay = 0;
 
 	ukbd_interrupt(sc);
 
 	/* Make sure any leftover key events gets read out */
 	ukbd_event_keyinput(sc);
 
 	if (ukbd_any_key_pressed(sc) || (sc->sc_inputs != 0)) {
 		ukbd_start_timer(sc);
 	}
 }
 
 static uint8_t
 ukbd_apple_fn(uint8_t keycode) {
 	switch (keycode) {
 	case 0x28: return 0x49; /* RETURN -> INSERT */
 	case 0x2a: return 0x4c; /* BACKSPACE -> DEL */
 	case 0x50: return 0x4a; /* LEFT ARROW -> HOME */
 	case 0x4f: return 0x4d; /* RIGHT ARROW -> END */
 	case 0x52: return 0x4b; /* UP ARROW -> PGUP */
 	case 0x51: return 0x4e; /* DOWN ARROW -> PGDN */
 	default: return keycode;
 	}
 }
 
 static uint8_t
 ukbd_apple_swap(uint8_t keycode) {
 	switch (keycode) {
 	case 0x35: return 0x64;
 	case 0x64: return 0x35;
 	default: return keycode;
 	}
 }
 
 static void
 ukbd_intr_callback(struct usb_xfer *xfer, usb_error_t error)
 {
 	struct ukbd_softc *sc = usbd_xfer_softc(xfer);
 	struct usb_page_cache *pc;
 	uint8_t i;
 	uint8_t offset;
 	uint8_t id;
 	int len;
 
 	UKBD_LOCK_ASSERT();
 
 	usbd_xfer_status(xfer, &len, NULL, NULL, NULL);
 	pc = usbd_xfer_get_frame(xfer, 0);
 
 	switch (USB_GET_STATE(xfer)) {
 	case USB_ST_TRANSFERRED:
 		DPRINTF("actlen=%d bytes\n", len);
 
 		if (len == 0) {
 			DPRINTF("zero length data\n");
 			goto tr_setup;
 		}
 
 		if (sc->sc_kbd_id != 0) {
 			/* check and remove HID ID byte */
 			usbd_copy_out(pc, 0, &id, 1);
 			offset = 1;
 			len--;
 			if (len == 0) {
 				DPRINTF("zero length data\n");
 				goto tr_setup;
 			}
 		} else {
 			offset = 0;
 			id = 0;
 		}
 
 		if (len > UKBD_BUFFER_SIZE)
 			len = UKBD_BUFFER_SIZE;
 
 		/* get data */
 		usbd_copy_out(pc, offset, sc->sc_buffer, len);
 
 		/* clear temporary storage */
 		memset(&sc->sc_ndata, 0, sizeof(sc->sc_ndata));
 
 		/* scan through HID data */
 		if ((sc->sc_flags & UKBD_FLAG_APPLE_EJECT) &&
 		    (id == sc->sc_id_apple_eject)) {
 			if (hid_get_data(sc->sc_buffer, len, &sc->sc_loc_apple_eject))
 				sc->sc_modifiers |= MOD_EJECT;
 			else
 				sc->sc_modifiers &= ~MOD_EJECT;
 		}
 		if ((sc->sc_flags & UKBD_FLAG_APPLE_FN) &&
 		    (id == sc->sc_id_apple_fn)) {
 			if (hid_get_data(sc->sc_buffer, len, &sc->sc_loc_apple_fn))
 				sc->sc_modifiers |= MOD_FN;
 			else
 				sc->sc_modifiers &= ~MOD_FN;
 		}
 		if ((sc->sc_flags & UKBD_FLAG_CTRL_L) &&
 		    (id == sc->sc_id_ctrl_l)) {
 			if (hid_get_data(sc->sc_buffer, len, &sc->sc_loc_ctrl_l))
 			  sc->	sc_modifiers |= MOD_CONTROL_L;
 			else
 			  sc->	sc_modifiers &= ~MOD_CONTROL_L;
 		}
 		if ((sc->sc_flags & UKBD_FLAG_CTRL_R) &&
 		    (id == sc->sc_id_ctrl_r)) {
 			if (hid_get_data(sc->sc_buffer, len, &sc->sc_loc_ctrl_r))
 				sc->sc_modifiers |= MOD_CONTROL_R;
 			else
 				sc->sc_modifiers &= ~MOD_CONTROL_R;
 		}
 		if ((sc->sc_flags & UKBD_FLAG_SHIFT_L) &&
 		    (id == sc->sc_id_shift_l)) {
 			if (hid_get_data(sc->sc_buffer, len, &sc->sc_loc_shift_l))
 				sc->sc_modifiers |= MOD_SHIFT_L;
 			else
 				sc->sc_modifiers &= ~MOD_SHIFT_L;
 		}
 		if ((sc->sc_flags & UKBD_FLAG_SHIFT_R) &&
 		    (id == sc->sc_id_shift_r)) {
 			if (hid_get_data(sc->sc_buffer, len, &sc->sc_loc_shift_r))
 				sc->sc_modifiers |= MOD_SHIFT_R;
 			else
 				sc->sc_modifiers &= ~MOD_SHIFT_R;
 		}
 		if ((sc->sc_flags & UKBD_FLAG_ALT_L) &&
 		    (id == sc->sc_id_alt_l)) {
 			if (hid_get_data(sc->sc_buffer, len, &sc->sc_loc_alt_l))
 				sc->sc_modifiers |= MOD_ALT_L;
 			else
 				sc->sc_modifiers &= ~MOD_ALT_L;
 		}
 		if ((sc->sc_flags & UKBD_FLAG_ALT_R) &&
 		    (id == sc->sc_id_alt_r)) {
 			if (hid_get_data(sc->sc_buffer, len, &sc->sc_loc_alt_r))
 				sc->sc_modifiers |= MOD_ALT_R;
 			else
 				sc->sc_modifiers &= ~MOD_ALT_R;
 		}
 		if ((sc->sc_flags & UKBD_FLAG_WIN_L) &&
 		    (id == sc->sc_id_win_l)) {
 			if (hid_get_data(sc->sc_buffer, len, &sc->sc_loc_win_l))
 				sc->sc_modifiers |= MOD_WIN_L;
 			else
 				sc->sc_modifiers &= ~MOD_WIN_L;
 		}
 		if ((sc->sc_flags & UKBD_FLAG_WIN_R) &&
 		    (id == sc->sc_id_win_r)) {
 			if (hid_get_data(sc->sc_buffer, len, &sc->sc_loc_win_r))
 				sc->sc_modifiers |= MOD_WIN_R;
 			else
 				sc->sc_modifiers &= ~MOD_WIN_R;
 		}
 
 		sc->sc_ndata.modifiers = sc->sc_modifiers;
 
 		if ((sc->sc_flags & UKBD_FLAG_EVENTS) &&
 		    (id == sc->sc_id_events)) {
 			i = sc->sc_loc_events.count;
 			if (i > UKBD_NKEYCODE)
 				i = UKBD_NKEYCODE;
 			if (i > len)
 				i = len;
 			while (i--) {
 				sc->sc_ndata.keycode[i] =
 				    hid_get_data(sc->sc_buffer + i, len - i,
 				    &sc->sc_loc_events);
 			}
 		}
 
 #ifdef USB_DEBUG
 		DPRINTF("modifiers = 0x%04x\n", (int)sc->sc_modifiers);
 		for (i = 0; i < UKBD_NKEYCODE; i++) {
 			if (sc->sc_ndata.keycode[i]) {
 				DPRINTF("[%d] = 0x%02x\n",
 				    (int)i, (int)sc->sc_ndata.keycode[i]);
 			}
 		}
 #endif
 		if (sc->sc_modifiers & MOD_FN) {
 			for (i = 0; i < UKBD_NKEYCODE; i++) {
 				sc->sc_ndata.keycode[i] = 
 				    ukbd_apple_fn(sc->sc_ndata.keycode[i]);
 			}
 		}
 
 		if (sc->sc_flags & UKBD_FLAG_APPLE_SWAP) {
 			for (i = 0; i < UKBD_NKEYCODE; i++) {
 				sc->sc_ndata.keycode[i] = 
 				    ukbd_apple_swap(sc->sc_ndata.keycode[i]);
 			}
 		}
 
 		ukbd_interrupt(sc);
 
 		if (ukbd_any_key_pressed(sc) != 0) {
 			ukbd_start_timer(sc);
 		}
 
 	case USB_ST_SETUP:
 tr_setup:
 		if (sc->sc_inputs < UKBD_IN_BUF_FULL) {
 			usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer));
 			usbd_transfer_submit(xfer);
 		} else {
 			DPRINTF("input queue is full!\n");
 		}
 		break;
 
 	default:			/* Error */
 		DPRINTF("error=%s\n", usbd_errstr(error));
 
 		if (error != USB_ERR_CANCELLED) {
 			/* try to clear stall first */
 			usbd_xfer_set_stall(xfer);
 			goto tr_setup;
 		}
 		break;
 	}
 }
 
 static void
 ukbd_set_leds_callback(struct usb_xfer *xfer, usb_error_t error)
 {
 	struct ukbd_softc *sc = usbd_xfer_softc(xfer);
 	struct usb_device_request req;
 	struct usb_page_cache *pc;
 	uint8_t id;
 	uint8_t any;
 	int len;
 
 	UKBD_LOCK_ASSERT();
 
 #ifdef USB_DEBUG
 	if (ukbd_no_leds)
 		return;
 #endif
 
 	switch (USB_GET_STATE(xfer)) {
 	case USB_ST_TRANSFERRED:
 	case USB_ST_SETUP:
 		if (!(sc->sc_flags & UKBD_FLAG_SET_LEDS))
 			break;
 		sc->sc_flags &= ~UKBD_FLAG_SET_LEDS;
 
 		req.bmRequestType = UT_WRITE_CLASS_INTERFACE;
 		req.bRequest = UR_SET_REPORT;
 		USETW2(req.wValue, UHID_OUTPUT_REPORT, 0);
 		req.wIndex[0] = sc->sc_iface_no;
 		req.wIndex[1] = 0;
 		req.wLength[1] = 0;
 
 		memset(sc->sc_buffer, 0, UKBD_BUFFER_SIZE);
 
 		id = 0;
 		any = 0;
 
 		/* Assumption: All led bits must be in the same ID. */
 
 		if (sc->sc_flags & UKBD_FLAG_NUMLOCK) {
 			if (sc->sc_leds & NLKED) {
 				hid_put_data_unsigned(sc->sc_buffer + 1, UKBD_BUFFER_SIZE - 1,
 				    &sc->sc_loc_numlock, 1);
 			}
 			id = sc->sc_id_numlock;
 			any = 1;
 		}
 
 		if (sc->sc_flags & UKBD_FLAG_SCROLLLOCK) {
 			if (sc->sc_leds & SLKED) {
 				hid_put_data_unsigned(sc->sc_buffer + 1, UKBD_BUFFER_SIZE - 1,
 				    &sc->sc_loc_scrolllock, 1);
 			}
 			id = sc->sc_id_scrolllock;
 			any = 1;
 		}
 
 		if (sc->sc_flags & UKBD_FLAG_CAPSLOCK) {
 			if (sc->sc_leds & CLKED) {
 				hid_put_data_unsigned(sc->sc_buffer + 1, UKBD_BUFFER_SIZE - 1,
 				    &sc->sc_loc_capslock, 1);
 			}
 			id = sc->sc_id_capslock;
 			any = 1;
 		}
 
 		/* if no leds, nothing to do */
 		if (!any)
 			break;
 
 #ifdef EVDEV
 		if (sc->sc_evdev != NULL)
 			evdev_push_leds(sc->sc_evdev, sc->sc_leds);
 #endif
 
 		/* range check output report length */
 		len = sc->sc_led_size;
 		if (len > (UKBD_BUFFER_SIZE - 1))
 			len = (UKBD_BUFFER_SIZE - 1);
 
 		/* check if we need to prefix an ID byte */
 		sc->sc_buffer[0] = id;
 
 		pc = usbd_xfer_get_frame(xfer, 1);
 		if (id != 0) {
 			len++;
 			usbd_copy_in(pc, 0, sc->sc_buffer, len);
 		} else {
 			usbd_copy_in(pc, 0, sc->sc_buffer + 1, len);
 		}
 		req.wLength[0] = len;
 		usbd_xfer_set_frame_len(xfer, 1, len);
 
 		DPRINTF("len=%d, id=%d\n", len, id);
 
 		/* setup control request last */
 		pc = usbd_xfer_get_frame(xfer, 0);
 		usbd_copy_in(pc, 0, &req, sizeof(req));
 		usbd_xfer_set_frame_len(xfer, 0, sizeof(req));
 
 		/* start data transfer */
 		usbd_xfer_set_frames(xfer, 2);
 		usbd_transfer_submit(xfer);
 		break;
 
 	default:			/* Error */
 		DPRINTFN(1, "error=%s\n", usbd_errstr(error));
 		break;
 	}
 }
 
 static const struct usb_config ukbd_config[UKBD_N_TRANSFER] = {
 
 	[UKBD_INTR_DT_0] = {
 		.type = UE_INTERRUPT,
 		.endpoint = UE_ADDR_ANY,
 		.direction = UE_DIR_IN,
 		.flags = {.pipe_bof = 1,.short_xfer_ok = 1,},
 		.bufsize = 0,	/* use wMaxPacketSize */
 		.callback = &ukbd_intr_callback,
 	},
 
 	[UKBD_INTR_DT_1] = {
 		.type = UE_INTERRUPT,
 		.endpoint = UE_ADDR_ANY,
 		.direction = UE_DIR_IN,
 		.flags = {.pipe_bof = 1,.short_xfer_ok = 1,},
 		.bufsize = 0,	/* use wMaxPacketSize */
 		.callback = &ukbd_intr_callback,
 	},
 
 	[UKBD_CTRL_LED] = {
 		.type = UE_CONTROL,
 		.endpoint = 0x00,	/* Control pipe */
 		.direction = UE_DIR_ANY,
 		.bufsize = sizeof(struct usb_device_request) + UKBD_BUFFER_SIZE,
 		.callback = &ukbd_set_leds_callback,
 		.timeout = 1000,	/* 1 second */
 	},
 };
 
 /* A match on these entries will load ukbd */
 static const STRUCT_USB_HOST_ID __used ukbd_devs[] = {
 	{USB_IFACE_CLASS(UICLASS_HID),
 	 USB_IFACE_SUBCLASS(UISUBCLASS_BOOT),
 	 USB_IFACE_PROTOCOL(UIPROTO_BOOT_KEYBOARD),},
 };
 
 static int
 ukbd_probe(device_t dev)
 {
 	keyboard_switch_t *sw = kbd_get_switch(UKBD_DRIVER_NAME);
 	struct usb_attach_arg *uaa = device_get_ivars(dev);
 	void *d_ptr;
 	int error;
 	uint16_t d_len;
 
 	UKBD_LOCK_ASSERT();
 	DPRINTFN(11, "\n");
 
 	if (sw == NULL) {
 		return (ENXIO);
 	}
 	if (uaa->usb_mode != USB_MODE_HOST) {
 		return (ENXIO);
 	}
 
 	if (uaa->info.bInterfaceClass != UICLASS_HID)
 		return (ENXIO);
 
 	if (usb_test_quirk(uaa, UQ_KBD_IGNORE))
 		return (ENXIO);
 
 	if ((uaa->info.bInterfaceSubClass == UISUBCLASS_BOOT) &&
 	    (uaa->info.bInterfaceProtocol == UIPROTO_BOOT_KEYBOARD))
 		return (BUS_PROBE_DEFAULT);
 
 	error = usbd_req_get_hid_desc(uaa->device, NULL,
 	    &d_ptr, &d_len, M_TEMP, uaa->info.bIfaceIndex);
 
 	if (error)
 		return (ENXIO);
 
 	if (hid_is_keyboard(d_ptr, d_len)) {
 		if (hid_is_mouse(d_ptr, d_len)) {
 			/*
 			 * NOTE: We currently don't support USB mouse
 			 * and USB keyboard on the same USB endpoint.
 			 * Let "ums" driver win.
 			 */
 			error = ENXIO;
 		} else {
 			error = BUS_PROBE_DEFAULT;
 		}
 	} else {
 		error = ENXIO;
 	}
 	free(d_ptr, M_TEMP);
 	return (error);
 }
 
 static void
 ukbd_parse_hid(struct ukbd_softc *sc, const uint8_t *ptr, uint32_t len)
 {
 	uint32_t flags;
 
 	/* reset detected bits */
 	sc->sc_flags &= ~UKBD_FLAG_HID_MASK;
 
 	/* check if there is an ID byte */
 	sc->sc_kbd_size = hid_report_size(ptr, len,
 	    hid_input, &sc->sc_kbd_id);
 
 	/* investigate if this is an Apple Keyboard */
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_CONSUMER, HUG_APPLE_EJECT),
 	    hid_input, 0, &sc->sc_loc_apple_eject, &flags,
 	    &sc->sc_id_apple_eject)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_APPLE_EJECT | 
 			    UKBD_FLAG_APPLE_SWAP;
 		DPRINTFN(1, "Found Apple eject-key\n");
 	}
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(0xFFFF, 0x0003),
 	    hid_input, 0, &sc->sc_loc_apple_fn, &flags,
 	    &sc->sc_id_apple_fn)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_APPLE_FN;
 		DPRINTFN(1, "Found Apple FN-key\n");
 	}
 	/* figure out some keys */
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_KEYBOARD, 0xE0),
 	    hid_input, 0, &sc->sc_loc_ctrl_l, &flags,
 	    &sc->sc_id_ctrl_l)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_CTRL_L;
 		DPRINTFN(1, "Found left control\n");
 	}
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_KEYBOARD, 0xE4),
 	    hid_input, 0, &sc->sc_loc_ctrl_r, &flags,
 	    &sc->sc_id_ctrl_r)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_CTRL_R;
 		DPRINTFN(1, "Found right control\n");
 	}
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_KEYBOARD, 0xE1),
 	    hid_input, 0, &sc->sc_loc_shift_l, &flags,
 	    &sc->sc_id_shift_l)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_SHIFT_L;
 		DPRINTFN(1, "Found left shift\n");
 	}
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_KEYBOARD, 0xE5),
 	    hid_input, 0, &sc->sc_loc_shift_r, &flags,
 	    &sc->sc_id_shift_r)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_SHIFT_R;
 		DPRINTFN(1, "Found right shift\n");
 	}
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_KEYBOARD, 0xE2),
 	    hid_input, 0, &sc->sc_loc_alt_l, &flags,
 	    &sc->sc_id_alt_l)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_ALT_L;
 		DPRINTFN(1, "Found left alt\n");
 	}
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_KEYBOARD, 0xE6),
 	    hid_input, 0, &sc->sc_loc_alt_r, &flags,
 	    &sc->sc_id_alt_r)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_ALT_R;
 		DPRINTFN(1, "Found right alt\n");
 	}
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_KEYBOARD, 0xE3),
 	    hid_input, 0, &sc->sc_loc_win_l, &flags,
 	    &sc->sc_id_win_l)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_WIN_L;
 		DPRINTFN(1, "Found left GUI\n");
 	}
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_KEYBOARD, 0xE7),
 	    hid_input, 0, &sc->sc_loc_win_r, &flags,
 	    &sc->sc_id_win_r)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_WIN_R;
 		DPRINTFN(1, "Found right GUI\n");
 	}
 	/* figure out event buffer */
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_KEYBOARD, 0x00),
 	    hid_input, 0, &sc->sc_loc_events, &flags,
 	    &sc->sc_id_events)) {
 		if (flags & HIO_VARIABLE) {
 			DPRINTFN(1, "Ignoring keyboard event control\n");
 		} else {
 			sc->sc_flags |= UKBD_FLAG_EVENTS;
 			DPRINTFN(1, "Found keyboard event array\n");
 		}
 	}
 
 	/* figure out leds on keyboard */
 	sc->sc_led_size = hid_report_size(ptr, len,
 	    hid_output, NULL);
 
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_LEDS, 0x01),
 	    hid_output, 0, &sc->sc_loc_numlock, &flags,
 	    &sc->sc_id_numlock)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_NUMLOCK;
 		DPRINTFN(1, "Found keyboard numlock\n");
 	}
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_LEDS, 0x02),
 	    hid_output, 0, &sc->sc_loc_capslock, &flags,
 	    &sc->sc_id_capslock)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_CAPSLOCK;
 		DPRINTFN(1, "Found keyboard capslock\n");
 	}
 	if (hid_locate(ptr, len,
 	    HID_USAGE2(HUP_LEDS, 0x03),
 	    hid_output, 0, &sc->sc_loc_scrolllock, &flags,
 	    &sc->sc_id_scrolllock)) {
 		if (flags & HIO_VARIABLE)
 			sc->sc_flags |= UKBD_FLAG_SCROLLLOCK;
 		DPRINTFN(1, "Found keyboard scrolllock\n");
 	}
 }
 
 static int
 ukbd_attach(device_t dev)
 {
 	struct ukbd_softc *sc = device_get_softc(dev);
 	struct usb_attach_arg *uaa = device_get_ivars(dev);
 	int unit = device_get_unit(dev);
 	keyboard_t *kbd = &sc->sc_kbd;
 	void *hid_ptr = NULL;
 	usb_error_t err;
 	uint16_t n;
 	uint16_t hid_len;
 #ifdef EVDEV
 	struct evdev_dev *evdev;
 	int i;
 #endif
 #ifdef USB_DEBUG
 	int rate;
 #endif
 	UKBD_LOCK_ASSERT();
 
 	kbd_init_struct(kbd, UKBD_DRIVER_NAME, KB_OTHER, unit, 0, 0, 0);
 
 	kbd->kb_data = (void *)sc;
 
 	device_set_usb_desc(dev);
 
 	sc->sc_udev = uaa->device;
 	sc->sc_iface = uaa->iface;
 	sc->sc_iface_index = uaa->info.bIfaceIndex;
 	sc->sc_iface_no = uaa->info.bIfaceNum;
 	sc->sc_mode = K_XLATE;
 
 	usb_callout_init_mtx(&sc->sc_callout, &Giant, 0);
 
 #ifdef UKBD_NO_POLLING
 	err = usbd_transfer_setup(uaa->device,
 	    &uaa->info.bIfaceIndex, sc->sc_xfer, ukbd_config,
 	    UKBD_N_TRANSFER, sc, &Giant);
 #else
 	/*
 	 * Setup the UKBD USB transfers one by one, so they are memory
 	 * independent which allows for handling panics triggered by
 	 * the keyboard driver itself, typically via CTRL+ALT+ESC
 	 * sequences. Or if the USB keyboard driver was processing a
 	 * key at the moment of panic.
 	 */
 	for (n = 0; n != UKBD_N_TRANSFER; n++) {
 		err = usbd_transfer_setup(uaa->device,
 		    &uaa->info.bIfaceIndex, sc->sc_xfer + n, ukbd_config + n,
 		    1, sc, &Giant);
 		if (err)
 			break;
 	}
 #endif
 
 	if (err) {
 		DPRINTF("error=%s\n", usbd_errstr(err));
 		goto detach;
 	}
 	/* setup default keyboard maps */
 
 	sc->sc_keymap = key_map;
 	sc->sc_accmap = accent_map;
 	for (n = 0; n < UKBD_NFKEY; n++) {
 		sc->sc_fkeymap[n] = fkey_tab[n];
 	}
 
 	kbd_set_maps(kbd, &sc->sc_keymap, &sc->sc_accmap,
 	    sc->sc_fkeymap, UKBD_NFKEY);
 
 	KBD_FOUND_DEVICE(kbd);
 
 	ukbd_clear_state(kbd);
 
 	/*
 	 * FIXME: set the initial value for lock keys in "sc_state"
 	 * according to the BIOS data?
 	 */
 	KBD_PROBE_DONE(kbd);
 
 	/* get HID descriptor */
 	err = usbd_req_get_hid_desc(uaa->device, NULL, &hid_ptr,
 	    &hid_len, M_TEMP, uaa->info.bIfaceIndex);
 
 	if (err == 0) {
 		DPRINTF("Parsing HID descriptor of %d bytes\n",
 		    (int)hid_len);
 
 		ukbd_parse_hid(sc, hid_ptr, hid_len);
 
 		free(hid_ptr, M_TEMP);
 	}
 
 	/* check if we should use the boot protocol */
 	if (usb_test_quirk(uaa, UQ_KBD_BOOTPROTO) ||
 	    (err != 0) || (!(sc->sc_flags & UKBD_FLAG_EVENTS))) {
 
 		DPRINTF("Forcing boot protocol\n");
 
 		err = usbd_req_set_protocol(sc->sc_udev, NULL, 
 			sc->sc_iface_index, 0);
 
 		if (err != 0) {
 			DPRINTF("Set protocol error=%s (ignored)\n",
 			    usbd_errstr(err));
 		}
 
 		ukbd_parse_hid(sc, ukbd_boot_desc, sizeof(ukbd_boot_desc));
 	}
 
 	/* ignore if SETIDLE fails, hence it is not crucial */
 	usbd_req_set_idle(sc->sc_udev, NULL, sc->sc_iface_index, 0, 0);
 
 	ukbd_ioctl(kbd, KDSETLED, (caddr_t)&sc->sc_state);
 
 	KBD_INIT_DONE(kbd);
 
 	if (kbd_register(kbd) < 0) {
 		goto detach;
 	}
 	KBD_CONFIG_DONE(kbd);
 
 	ukbd_enable(kbd);
 
 #ifdef KBD_INSTALL_CDEV
 	if (kbd_attach(kbd)) {
 		goto detach;
 	}
 #endif
 
 #ifdef EVDEV
 	evdev = evdev_alloc();
 	evdev_set_name(evdev, device_get_desc(dev));
 	evdev_set_phys(evdev, device_get_nameunit(dev));
 	evdev_set_id(evdev, BUS_USB, uaa->info.idVendor,
 	   uaa->info.idProduct, 0);
 	evdev_set_serial(evdev, usb_get_serial(uaa->device));
 	evdev_set_methods(evdev, kbd, &ukbd_evdev_methods);
 	evdev_support_event(evdev, EV_SYN);
 	evdev_support_event(evdev, EV_KEY);
 	if (sc->sc_flags & (UKBD_FLAG_NUMLOCK | UKBD_FLAG_CAPSLOCK |
 			    UKBD_FLAG_SCROLLLOCK))
 		evdev_support_event(evdev, EV_LED);
 	evdev_support_event(evdev, EV_REP);
 
 	for (i = 0x00; i <= 0xFF; i++)
 		evdev_support_key(evdev, evdev_hid2key(i));
 	if (sc->sc_flags & UKBD_FLAG_NUMLOCK)
 		evdev_support_led(evdev, LED_NUML);
 	if (sc->sc_flags & UKBD_FLAG_CAPSLOCK)
 		evdev_support_led(evdev, LED_CAPSL);
 	if (sc->sc_flags & UKBD_FLAG_SCROLLLOCK)
 		evdev_support_led(evdev, LED_SCROLLL);
 
 	if (evdev_register(evdev))
 		evdev_free(evdev);
 	else
 		sc->sc_evdev = evdev;
 #endif
 
 	sc->sc_flags |= UKBD_FLAG_ATTACHED;
 
 	if (bootverbose) {
 		genkbd_diag(kbd, bootverbose);
 	}
 
 #ifdef USB_DEBUG
 	/* check for polling rate override */
 	rate = ukbd_pollrate;
 	if (rate > 0) {
 		if (rate > 1000)
 			rate = 1;
 		else
 			rate = 1000 / rate;
 
 		/* set new polling interval in ms */
 		usbd_xfer_set_interval(sc->sc_xfer[UKBD_INTR_DT_0], rate);
 		usbd_xfer_set_interval(sc->sc_xfer[UKBD_INTR_DT_1], rate);
 	}
 #endif
 	/* start the keyboard */
 	usbd_transfer_start(sc->sc_xfer[UKBD_INTR_DT_0]);
 	usbd_transfer_start(sc->sc_xfer[UKBD_INTR_DT_1]);
 
 	return (0);			/* success */
 
 detach:
 	ukbd_detach(dev);
 	return (ENXIO);			/* error */
 }
 
 static int
 ukbd_detach(device_t dev)
 {
 	struct ukbd_softc *sc = device_get_softc(dev);
 	int error;
 
 	UKBD_LOCK_ASSERT();
 
 	DPRINTF("\n");
 
 	sc->sc_flags |= UKBD_FLAG_GONE;
 
 	usb_callout_stop(&sc->sc_callout);
 
 	/* kill any stuck keys */
 	if (sc->sc_flags & UKBD_FLAG_ATTACHED) {
 		/* stop receiving events from the USB keyboard */
 		usbd_transfer_stop(sc->sc_xfer[UKBD_INTR_DT_0]);
 		usbd_transfer_stop(sc->sc_xfer[UKBD_INTR_DT_1]);
 
 		/* release all leftover keys, if any */
 		memset(&sc->sc_ndata, 0, sizeof(sc->sc_ndata));
 
 		/* process releasing of all keys */
 		ukbd_interrupt(sc);
 	}
 
 	ukbd_disable(&sc->sc_kbd);
 
 #ifdef KBD_INSTALL_CDEV
 	if (sc->sc_flags & UKBD_FLAG_ATTACHED) {
 		error = kbd_detach(&sc->sc_kbd);
 		if (error) {
 			/* usb attach cannot return an error */
 			device_printf(dev, "WARNING: kbd_detach() "
 			    "returned non-zero! (ignored)\n");
 		}
 	}
 #endif
 
 #ifdef EVDEV
-	if (sc->sc_evdev != NULL)
-		evdev_free(sc->sc_evdev);
+	evdev_free(sc->sc_evdev);
 #endif
 
 	if (KBD_IS_CONFIGURED(&sc->sc_kbd)) {
 		error = kbd_unregister(&sc->sc_kbd);
 		if (error) {
 			/* usb attach cannot return an error */
 			device_printf(dev, "WARNING: kbd_unregister() "
 			    "returned non-zero! (ignored)\n");
 		}
 	}
 	sc->sc_kbd.kb_flags = 0;
 
 	usbd_transfer_unsetup(sc->sc_xfer, UKBD_N_TRANSFER);
 
 	usb_callout_drain(&sc->sc_callout);
 
 	DPRINTF("%s: disconnected\n",
 	    device_get_nameunit(dev));
 
 	return (0);
 }
 
 static int
 ukbd_resume(device_t dev)
 {
 	struct ukbd_softc *sc = device_get_softc(dev);
 
 	UKBD_LOCK_ASSERT();
 
 	ukbd_clear_state(&sc->sc_kbd);
 
 	return (0);
 }
 
 /* early keyboard probe, not supported */
 static int
 ukbd_configure(int flags)
 {
 	return (0);
 }
 
 /* detect a keyboard, not used */
 static int
 ukbd__probe(int unit, void *arg, int flags)
 {
 	return (ENXIO);
 }
 
 /* reset and initialize the device, not used */
 static int
 ukbd_init(int unit, keyboard_t **kbdp, void *arg, int flags)
 {
 	return (ENXIO);
 }
 
 /* test the interface to the device, not used */
 static int
 ukbd_test_if(keyboard_t *kbd)
 {
 	return (0);
 }
 
 /* finish using this keyboard, not used */
 static int
 ukbd_term(keyboard_t *kbd)
 {
 	return (ENXIO);
 }
 
 /* keyboard interrupt routine, not used */
 static int
 ukbd_intr(keyboard_t *kbd, void *arg)
 {
 	return (0);
 }
 
 /* lock the access to the keyboard, not used */
 static int
 ukbd_lock(keyboard_t *kbd, int lock)
 {
 	return (1);
 }
 
 /*
  * Enable the access to the device; until this function is called,
  * the client cannot read from the keyboard.
  */
 static int
 ukbd_enable(keyboard_t *kbd)
 {
 
 	UKBD_LOCK();
 	KBD_ACTIVATE(kbd);
 	UKBD_UNLOCK();
 
 	return (0);
 }
 
 /* disallow the access to the device */
 static int
 ukbd_disable(keyboard_t *kbd)
 {
 
 	UKBD_LOCK();
 	KBD_DEACTIVATE(kbd);
 	UKBD_UNLOCK();
 
 	return (0);
 }
 
 /* check if data is waiting */
 /* Currently unused. */
 static int
 ukbd_check(keyboard_t *kbd)
 {
 	struct ukbd_softc *sc = kbd->kb_data;
 
 	UKBD_LOCK_ASSERT();
 
 	if (!KBD_IS_ACTIVE(kbd))
 		return (0);
 
 	if (sc->sc_flags & UKBD_FLAG_POLLING)
 		ukbd_do_poll(sc, 0);
 
 #ifdef UKBD_EMULATE_ATSCANCODE
 	if (sc->sc_buffered_char[0]) {
 		return (1);
 	}
 #endif
 	if (sc->sc_inputs > 0) {
 		return (1);
 	}
 	return (0);
 }
 
 /* check if char is waiting */
 static int
 ukbd_check_char_locked(keyboard_t *kbd)
 {
 	struct ukbd_softc *sc = kbd->kb_data;
 
 	UKBD_LOCK_ASSERT();
 
 	if (!KBD_IS_ACTIVE(kbd))
 		return (0);
 
 	if ((sc->sc_composed_char > 0) &&
 	    (!(sc->sc_flags & UKBD_FLAG_COMPOSE))) {
 		return (1);
 	}
 	return (ukbd_check(kbd));
 }
 
 static int
 ukbd_check_char(keyboard_t *kbd)
 {
 	int result;
 
 	UKBD_LOCK();
 	result = ukbd_check_char_locked(kbd);
 	UKBD_UNLOCK();
 
 	return (result);
 }
 
 /* read one byte from the keyboard if it's allowed */
 /* Currently unused. */
 static int
 ukbd_read(keyboard_t *kbd, int wait)
 {
 	struct ukbd_softc *sc = kbd->kb_data;
 	int32_t usbcode;
 #ifdef UKBD_EMULATE_ATSCANCODE
 	uint32_t keycode;
 	uint32_t scancode;
 
 #endif
 
 	UKBD_LOCK_ASSERT();
 
 	if (!KBD_IS_ACTIVE(kbd))
 		return (-1);
 
 #ifdef UKBD_EMULATE_ATSCANCODE
 	if (sc->sc_buffered_char[0]) {
 		scancode = sc->sc_buffered_char[0];
 		if (scancode & SCAN_PREFIX) {
 			sc->sc_buffered_char[0] &= ~SCAN_PREFIX;
 			return ((scancode & SCAN_PREFIX_E0) ? 0xe0 : 0xe1);
 		}
 		sc->sc_buffered_char[0] = sc->sc_buffered_char[1];
 		sc->sc_buffered_char[1] = 0;
 		return (scancode);
 	}
 #endif					/* UKBD_EMULATE_ATSCANCODE */
 
 	/* XXX */
 	usbcode = ukbd_get_key(sc, (wait == FALSE) ? 0 : 1);
 	if (!KBD_IS_ACTIVE(kbd) || (usbcode == -1))
 		return (-1);
 
 	++(kbd->kb_count);
 
 #ifdef UKBD_EMULATE_ATSCANCODE
 	keycode = ukbd_atkeycode(usbcode, sc->sc_ndata.modifiers);
 	if (keycode == NN) {
 		return -1;
 	}
 	return (ukbd_key2scan(sc, keycode, sc->sc_ndata.modifiers,
 	    (usbcode & KEY_RELEASE)));
 #else					/* !UKBD_EMULATE_ATSCANCODE */
 	return (usbcode);
 #endif					/* UKBD_EMULATE_ATSCANCODE */
 }
 
 /* read char from the keyboard */
 static uint32_t
 ukbd_read_char_locked(keyboard_t *kbd, int wait)
 {
 	struct ukbd_softc *sc = kbd->kb_data;
 	uint32_t action;
 	uint32_t keycode;
 	int32_t usbcode;
 #ifdef UKBD_EMULATE_ATSCANCODE
 	uint32_t scancode;
 #endif
 
 	UKBD_LOCK_ASSERT();
 
 	if (!KBD_IS_ACTIVE(kbd))
 		return (NOKEY);
 
 next_code:
 
 	/* do we have a composed char to return ? */
 
 	if ((sc->sc_composed_char > 0) &&
 	    (!(sc->sc_flags & UKBD_FLAG_COMPOSE))) {
 
 		action = sc->sc_composed_char;
 		sc->sc_composed_char = 0;
 
 		if (action > 0xFF) {
 			goto errkey;
 		}
 		goto done;
 	}
 #ifdef UKBD_EMULATE_ATSCANCODE
 
 	/* do we have a pending raw scan code? */
 
 	if (sc->sc_mode == K_RAW) {
 		scancode = sc->sc_buffered_char[0];
 		if (scancode) {
 			if (scancode & SCAN_PREFIX) {
 				sc->sc_buffered_char[0] = (scancode & ~SCAN_PREFIX);
 				return ((scancode & SCAN_PREFIX_E0) ? 0xe0 : 0xe1);
 			}
 			sc->sc_buffered_char[0] = sc->sc_buffered_char[1];
 			sc->sc_buffered_char[1] = 0;
 			return (scancode);
 		}
 	}
 #endif					/* UKBD_EMULATE_ATSCANCODE */
 
 	/* see if there is something in the keyboard port */
 	/* XXX */
 	usbcode = ukbd_get_key(sc, (wait == FALSE) ? 0 : 1);
 	if (usbcode == -1) {
 		return (NOKEY);
 	}
 	++kbd->kb_count;
 
 #ifdef UKBD_EMULATE_ATSCANCODE
 	/* USB key index -> key code -> AT scan code */
 	keycode = ukbd_atkeycode(usbcode, sc->sc_ndata.modifiers);
 	if (keycode == NN) {
 		return (NOKEY);
 	}
 	/* return an AT scan code for the K_RAW mode */
 	if (sc->sc_mode == K_RAW) {
 		return (ukbd_key2scan(sc, keycode, sc->sc_ndata.modifiers,
 		    (usbcode & KEY_RELEASE)));
 	}
 #else					/* !UKBD_EMULATE_ATSCANCODE */
 
 	/* return the byte as is for the K_RAW mode */
 	if (sc->sc_mode == K_RAW) {
 		return (usbcode);
 	}
 	/* USB key index -> key code */
 	keycode = ukbd_trtab[KEY_INDEX(usbcode)];
 	if (keycode == NN) {
 		return (NOKEY);
 	}
 #endif					/* UKBD_EMULATE_ATSCANCODE */
 
 	switch (keycode) {
 	case 0x38:			/* left alt (compose key) */
 		if (usbcode & KEY_RELEASE) {
 			if (sc->sc_flags & UKBD_FLAG_COMPOSE) {
 				sc->sc_flags &= ~UKBD_FLAG_COMPOSE;
 
 				if (sc->sc_composed_char > 0xFF) {
 					sc->sc_composed_char = 0;
 				}
 			}
 		} else {
 			if (!(sc->sc_flags & UKBD_FLAG_COMPOSE)) {
 				sc->sc_flags |= UKBD_FLAG_COMPOSE;
 				sc->sc_composed_char = 0;
 			}
 		}
 		break;
 	}
 
 	/* return the key code in the K_CODE mode */
 	if (usbcode & KEY_RELEASE) {
 		keycode |= SCAN_RELEASE;
 	}
 	if (sc->sc_mode == K_CODE) {
 		return (keycode);
 	}
 	/* compose a character code */
 	if (sc->sc_flags & UKBD_FLAG_COMPOSE) {
 		switch (keycode) {
 			/* key pressed, process it */
 		case 0x47:
 		case 0x48:
 		case 0x49:		/* keypad 7,8,9 */
 			sc->sc_composed_char *= 10;
 			sc->sc_composed_char += keycode - 0x40;
 			goto check_composed;
 
 		case 0x4B:
 		case 0x4C:
 		case 0x4D:		/* keypad 4,5,6 */
 			sc->sc_composed_char *= 10;
 			sc->sc_composed_char += keycode - 0x47;
 			goto check_composed;
 
 		case 0x4F:
 		case 0x50:
 		case 0x51:		/* keypad 1,2,3 */
 			sc->sc_composed_char *= 10;
 			sc->sc_composed_char += keycode - 0x4E;
 			goto check_composed;
 
 		case 0x52:		/* keypad 0 */
 			sc->sc_composed_char *= 10;
 			goto check_composed;
 
 			/* key released, no interest here */
 		case SCAN_RELEASE | 0x47:
 		case SCAN_RELEASE | 0x48:
 		case SCAN_RELEASE | 0x49:	/* keypad 7,8,9 */
 		case SCAN_RELEASE | 0x4B:
 		case SCAN_RELEASE | 0x4C:
 		case SCAN_RELEASE | 0x4D:	/* keypad 4,5,6 */
 		case SCAN_RELEASE | 0x4F:
 		case SCAN_RELEASE | 0x50:
 		case SCAN_RELEASE | 0x51:	/* keypad 1,2,3 */
 		case SCAN_RELEASE | 0x52:	/* keypad 0 */
 			goto next_code;
 
 		case 0x38:		/* left alt key */
 			break;
 
 		default:
 			if (sc->sc_composed_char > 0) {
 				sc->sc_flags &= ~UKBD_FLAG_COMPOSE;
 				sc->sc_composed_char = 0;
 				goto errkey;
 			}
 			break;
 		}
 	}
 	/* keycode to key action */
 	action = genkbd_keyaction(kbd, SCAN_CHAR(keycode),
 	    (keycode & SCAN_RELEASE),
 	    &sc->sc_state, &sc->sc_accents);
 	if (action == NOKEY) {
 		goto next_code;
 	}
 done:
 	return (action);
 
 check_composed:
 	if (sc->sc_composed_char <= 0xFF) {
 		goto next_code;
 	}
 errkey:
 	return (ERRKEY);
 }
 
 /* Currently wait is always false. */
 static uint32_t
 ukbd_read_char(keyboard_t *kbd, int wait)
 {
 	uint32_t keycode;
 
 	UKBD_LOCK();
 	keycode = ukbd_read_char_locked(kbd, wait);
 	UKBD_UNLOCK();
 
 	return (keycode);
 }
 
 /* some useful control functions */
 static int
 ukbd_ioctl_locked(keyboard_t *kbd, u_long cmd, caddr_t arg)
 {
 	struct ukbd_softc *sc = kbd->kb_data;
 	int i;
 #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
 	int ival;
 
 #endif
 
 	UKBD_LOCK_ASSERT();
 
 	switch (cmd) {
 	case KDGKBMODE:		/* get keyboard mode */
 		*(int *)arg = sc->sc_mode;
 		break;
 #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
 	case _IO('K', 7):
 		ival = IOCPARM_IVAL(arg);
 		arg = (caddr_t)&ival;
 		/* FALLTHROUGH */
 #endif
 	case KDSKBMODE:		/* set keyboard mode */
 		switch (*(int *)arg) {
 		case K_XLATE:
 			if (sc->sc_mode != K_XLATE) {
 				/* make lock key state and LED state match */
 				sc->sc_state &= ~LOCK_MASK;
 				sc->sc_state |= KBD_LED_VAL(kbd);
 			}
 			/* FALLTHROUGH */
 		case K_RAW:
 		case K_CODE:
 			if (sc->sc_mode != *(int *)arg) {
 				if ((sc->sc_flags & UKBD_FLAG_POLLING) == 0)
 					ukbd_clear_state(kbd);
 				sc->sc_mode = *(int *)arg;
 			}
 			break;
 		default:
 			return (EINVAL);
 		}
 		break;
 
 	case KDGETLED:			/* get keyboard LED */
 		*(int *)arg = KBD_LED_VAL(kbd);
 		break;
 #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
 	case _IO('K', 66):
 		ival = IOCPARM_IVAL(arg);
 		arg = (caddr_t)&ival;
 		/* FALLTHROUGH */
 #endif
 	case KDSETLED:			/* set keyboard LED */
 		/* NOTE: lock key state in "sc_state" won't be changed */
 		if (*(int *)arg & ~LOCK_MASK)
 			return (EINVAL);
 
 		i = *(int *)arg;
 
 		/* replace CAPS LED with ALTGR LED for ALTGR keyboards */
 		if (sc->sc_mode == K_XLATE &&
 		    kbd->kb_keymap->n_keys > ALTGR_OFFSET) {
 			if (i & ALKED)
 				i |= CLKED;
 			else
 				i &= ~CLKED;
 		}
 		if (KBD_HAS_DEVICE(kbd))
 			ukbd_set_leds(sc, i);
 
 		KBD_LED_VAL(kbd) = *(int *)arg;
 		break;
 	case KDGKBSTATE:		/* get lock key state */
 		*(int *)arg = sc->sc_state & LOCK_MASK;
 		break;
 #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
 	case _IO('K', 20):
 		ival = IOCPARM_IVAL(arg);
 		arg = (caddr_t)&ival;
 		/* FALLTHROUGH */
 #endif
 	case KDSKBSTATE:		/* set lock key state */
 		if (*(int *)arg & ~LOCK_MASK) {
 			return (EINVAL);
 		}
 		sc->sc_state &= ~LOCK_MASK;
 		sc->sc_state |= *(int *)arg;
 
 		/* set LEDs and quit */
 		return (ukbd_ioctl(kbd, KDSETLED, arg));
 
 	case KDSETREPEAT:		/* set keyboard repeat rate (new
 					 * interface) */
 		if (!KBD_HAS_DEVICE(kbd)) {
 			return (0);
 		}
 		/*
 		 * Convert negative, zero and tiny args to the same limits
 		 * as atkbd.  We could support delays of 1 msec, but
 		 * anything much shorter than the shortest atkbd value
 		 * of 250.34 is almost unusable as well as incompatible.
 		 */
 		kbd->kb_delay1 = imax(((int *)arg)[0], 250);
 		kbd->kb_delay2 = imax(((int *)arg)[1], 34);
 #ifdef EVDEV
 		if (sc->sc_evdev != NULL)
 			evdev_push_repeats(sc->sc_evdev, kbd);
 #endif
 		return (0);
 
 #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
 	case _IO('K', 67):
 		ival = IOCPARM_IVAL(arg);
 		arg = (caddr_t)&ival;
 		/* FALLTHROUGH */
 #endif
 	case KDSETRAD:			/* set keyboard repeat rate (old
 					 * interface) */
 		return (ukbd_set_typematic(kbd, *(int *)arg));
 
 	case PIO_KEYMAP:		/* set keyboard translation table */
 	case OPIO_KEYMAP:		/* set keyboard translation table
 					 * (compat) */
 	case PIO_KEYMAPENT:		/* set keyboard translation table
 					 * entry */
 	case PIO_DEADKEYMAP:		/* set accent key translation table */
 		sc->sc_accents = 0;
 		/* FALLTHROUGH */
 	default:
 		return (genkbd_commonioctl(kbd, cmd, arg));
 	}
 
 	return (0);
 }
 
 static int
 ukbd_ioctl(keyboard_t *kbd, u_long cmd, caddr_t arg)
 {
 	int result;
 
 	/*
 	 * XXX Check if someone is calling us from a critical section:
 	 */
 	if (curthread->td_critnest != 0)
 		return (EDEADLK);
 
 	/*
 	 * XXX KDGKBSTATE, KDSKBSTATE and KDSETLED can be called from any
 	 * context where printf(9) can be called, which among other things
 	 * includes interrupt filters and threads with any kinds of locks
 	 * already held.  For this reason it would be dangerous to acquire
 	 * the Giant here unconditionally.  On the other hand we have to
 	 * have it to handle the ioctl.
 	 * So we make our best effort to auto-detect whether we can grab
 	 * the Giant or not.  Blame syscons(4) for this.
 	 */
 	switch (cmd) {
 	case KDGKBSTATE:
 	case KDSKBSTATE:
 	case KDSETLED:
 		if (!mtx_owned(&Giant) && !USB_IN_POLLING_MODE_FUNC())
 			return (EDEADLK);	/* best I could come up with */
 		/* FALLTHROUGH */
 	default:
 		UKBD_LOCK();
 		result = ukbd_ioctl_locked(kbd, cmd, arg);
 		UKBD_UNLOCK();
 		return (result);
 	}
 }
 
 
 /* clear the internal state of the keyboard */
 static void
 ukbd_clear_state(keyboard_t *kbd)
 {
 	struct ukbd_softc *sc = kbd->kb_data;
 
 	UKBD_LOCK_ASSERT();
 
 	sc->sc_flags &= ~(UKBD_FLAG_COMPOSE | UKBD_FLAG_POLLING);
 	sc->sc_state &= LOCK_MASK;	/* preserve locking key state */
 	sc->sc_accents = 0;
 	sc->sc_composed_char = 0;
 #ifdef UKBD_EMULATE_ATSCANCODE
 	sc->sc_buffered_char[0] = 0;
 	sc->sc_buffered_char[1] = 0;
 #endif
 	memset(&sc->sc_ndata, 0, sizeof(sc->sc_ndata));
 	memset(&sc->sc_odata, 0, sizeof(sc->sc_odata));
 	memset(&sc->sc_ntime, 0, sizeof(sc->sc_ntime));
 	memset(&sc->sc_otime, 0, sizeof(sc->sc_otime));
 }
 
 /* save the internal state, not used */
 static int
 ukbd_get_state(keyboard_t *kbd, void *buf, size_t len)
 {
 	return (len == 0) ? 1 : -1;
 }
 
 /* set the internal state, not used */
 static int
 ukbd_set_state(keyboard_t *kbd, void *buf, size_t len)
 {
 	return (EINVAL);
 }
 
 static int
 ukbd_poll(keyboard_t *kbd, int on)
 {
 	struct ukbd_softc *sc = kbd->kb_data;
 
 	UKBD_LOCK();
 	/*
 	 * Keep a reference count on polling to allow recursive
 	 * cngrab() during a panic for example.
 	 */
 	if (on)
 		sc->sc_polling++;
 	else if (sc->sc_polling > 0)
 		sc->sc_polling--;
 
 	if (sc->sc_polling != 0) {
 		sc->sc_flags |= UKBD_FLAG_POLLING;
 		sc->sc_poll_thread = curthread;
 	} else {
 		sc->sc_flags &= ~UKBD_FLAG_POLLING;
 		sc->sc_delay = 0;
 	}
 	UKBD_UNLOCK();
 
 	return (0);
 }
 
 /* local functions */
 
 static void
 ukbd_set_leds(struct ukbd_softc *sc, uint8_t leds)
 {
 
 	UKBD_LOCK_ASSERT();
 	DPRINTF("leds=0x%02x\n", leds);
 
 	sc->sc_leds = leds;
 	sc->sc_flags |= UKBD_FLAG_SET_LEDS;
 
 	/* start transfer, if not already started */
 
 	usbd_transfer_start(sc->sc_xfer[UKBD_CTRL_LED]);
 }
 
 static int
 ukbd_set_typematic(keyboard_t *kbd, int code)
 {
 #ifdef EVDEV
 	struct ukbd_softc *sc = kbd->kb_data;
 #endif
 	static const int delays[] = {250, 500, 750, 1000};
 	static const int rates[] = {34, 38, 42, 46, 50, 55, 59, 63,
 		68, 76, 84, 92, 100, 110, 118, 126,
 		136, 152, 168, 184, 200, 220, 236, 252,
 	272, 304, 336, 368, 400, 440, 472, 504};
 
 	if (code & ~0x7f) {
 		return (EINVAL);
 	}
 	kbd->kb_delay1 = delays[(code >> 5) & 3];
 	kbd->kb_delay2 = rates[code & 0x1f];
 #ifdef EVDEV
 	if (sc->sc_evdev != NULL)
 		evdev_push_repeats(sc->sc_evdev, kbd);
 #endif
 	return (0);
 }
 
 #ifdef UKBD_EMULATE_ATSCANCODE
 static uint32_t
 ukbd_atkeycode(int usbcode, int shift)
 {
 	uint32_t keycode;
 
 	keycode = ukbd_trtab[KEY_INDEX(usbcode)];
 	/*
 	 * Translate Alt-PrintScreen to SysRq.
 	 *
 	 * Some or all AT keyboards connected through USB have already
 	 * mapped Alted PrintScreens to an unusual usbcode (0x8a).
 	 * ukbd_trtab translates this to 0x7e, and key2scan() would
 	 * translate that to 0x79 (Intl' 4).  Assume that if we have
 	 * an Alted 0x7e here then it actually is an Alted PrintScreen.
 	 *
 	 * The usual usbcode for all PrintScreens is 0x46.  ukbd_trtab
 	 * translates this to 0x5c, so the Alt check to classify 0x5c
 	 * is routine.
 	 */
 	if ((keycode == 0x5c || keycode == 0x7e) &&
 	    shift & (MOD_ALT_L | MOD_ALT_R))
 		return (0x54);
 	return (keycode);
 }
 
 static int
 ukbd_key2scan(struct ukbd_softc *sc, int code, int shift, int up)
 {
 	static const int scan[] = {
 		/* 89 */
 		0x11c,	/* Enter */
 		/* 90-99 */
 		0x11d,	/* Ctrl-R */
 		0x135,	/* Divide */
 		0x137,	/* PrintScreen */
 		0x138,	/* Alt-R */
 		0x147,	/* Home */
 		0x148,	/* Up */
 		0x149,	/* PageUp */
 		0x14b,	/* Left */
 		0x14d,	/* Right */
 		0x14f,	/* End */
 		/* 100-109 */
 		0x150,	/* Down */
 		0x151,	/* PageDown */
 		0x152,	/* Insert */
 		0x153,	/* Delete */
 		0x146,	/* Pause/Break */
 		0x15b,	/* Win_L(Super_L) */
 		0x15c,	/* Win_R(Super_R) */
 		0x15d,	/* Application(Menu) */
 
 		/* SUN TYPE 6 USB KEYBOARD */
 		0x168,	/* Sun Type 6 Help */
 		0x15e,	/* Sun Type 6 Stop */
 		/* 110 - 119 */
 		0x15f,	/* Sun Type 6 Again */
 		0x160,	/* Sun Type 6 Props */
 		0x161,	/* Sun Type 6 Undo */
 		0x162,	/* Sun Type 6 Front */
 		0x163,	/* Sun Type 6 Copy */
 		0x164,	/* Sun Type 6 Open */
 		0x165,	/* Sun Type 6 Paste */
 		0x166,	/* Sun Type 6 Find */
 		0x167,	/* Sun Type 6 Cut */
 		0x125,	/* Sun Type 6 Mute */
 		/* 120 - 130 */
 		0x11f,	/* Sun Type 6 VolumeDown */
 		0x11e,	/* Sun Type 6 VolumeUp */
 		0x120,	/* Sun Type 6 PowerDown */
 
 		/* Japanese 106/109 keyboard */
 		0x73,	/* Keyboard Intl' 1 (backslash / underscore) */
 		0x70,	/* Keyboard Intl' 2 (Katakana / Hiragana) */
 		0x7d,	/* Keyboard Intl' 3 (Yen sign) (Not using in jp106/109) */
 		0x79,	/* Keyboard Intl' 4 (Henkan) */
 		0x7b,	/* Keyboard Intl' 5 (Muhenkan) */
 		0x5c,	/* Keyboard Intl' 6 (Keypad ,) (For PC-9821 layout) */
 		0x71,   /* Apple Keyboard JIS (Kana) */
 		0x72,   /* Apple Keyboard JIS (Eisu) */
 	};
 
 	if ((code >= 89) && (code < (int)(89 + nitems(scan)))) {
 		code = scan[code - 89];
 	}
 	/* PrintScreen */
 	if (code == 0x137 && (!(shift & (MOD_CONTROL_L | MOD_CONTROL_R |
 	    MOD_SHIFT_L | MOD_SHIFT_R)))) {
 		code |= SCAN_PREFIX_SHIFT;
 	}
 	/* Pause/Break */
 	if ((code == 0x146) && (!(shift & (MOD_CONTROL_L | MOD_CONTROL_R)))) {
 		code = (0x45 | SCAN_PREFIX_E1 | SCAN_PREFIX_CTL);
 	}
 	code |= (up ? SCAN_RELEASE : SCAN_PRESS);
 
 	if (code & SCAN_PREFIX) {
 		if (code & SCAN_PREFIX_CTL) {
 			/* Ctrl */
 			sc->sc_buffered_char[0] = (0x1d | (code & SCAN_RELEASE));
 			sc->sc_buffered_char[1] = (code & ~SCAN_PREFIX);
 		} else if (code & SCAN_PREFIX_SHIFT) {
 			/* Shift */
 			sc->sc_buffered_char[0] = (0x2a | (code & SCAN_RELEASE));
 			sc->sc_buffered_char[1] = (code & ~SCAN_PREFIX_SHIFT);
 		} else {
 			sc->sc_buffered_char[0] = (code & ~SCAN_PREFIX);
 			sc->sc_buffered_char[1] = 0;
 		}
 		return ((code & SCAN_PREFIX_E0) ? 0xe0 : 0xe1);
 	}
 	return (code);
 
 }
 
 #endif					/* UKBD_EMULATE_ATSCANCODE */
 
 static keyboard_switch_t ukbdsw = {
 	.probe = &ukbd__probe,
 	.init = &ukbd_init,
 	.term = &ukbd_term,
 	.intr = &ukbd_intr,
 	.test_if = &ukbd_test_if,
 	.enable = &ukbd_enable,
 	.disable = &ukbd_disable,
 	.read = &ukbd_read,
 	.check = &ukbd_check,
 	.read_char = &ukbd_read_char,
 	.check_char = &ukbd_check_char,
 	.ioctl = &ukbd_ioctl,
 	.lock = &ukbd_lock,
 	.clear_state = &ukbd_clear_state,
 	.get_state = &ukbd_get_state,
 	.set_state = &ukbd_set_state,
 	.get_fkeystr = &genkbd_get_fkeystr,
 	.poll = &ukbd_poll,
 	.diag = &genkbd_diag,
 };
 
 KEYBOARD_DRIVER(ukbd, ukbdsw, ukbd_configure);
 
 static int
 ukbd_driver_load(module_t mod, int what, void *arg)
 {
 	switch (what) {
 	case MOD_LOAD:
 		kbd_add_driver(&ukbd_kbd_driver);
 		break;
 	case MOD_UNLOAD:
 		kbd_delete_driver(&ukbd_kbd_driver);
 		break;
 	}
 	return (0);
 }
 
 static devclass_t ukbd_devclass;
 
 static device_method_t ukbd_methods[] = {
 	DEVMETHOD(device_probe, ukbd_probe),
 	DEVMETHOD(device_attach, ukbd_attach),
 	DEVMETHOD(device_detach, ukbd_detach),
 	DEVMETHOD(device_resume, ukbd_resume),
 
 	DEVMETHOD_END
 };
 
 static driver_t ukbd_driver = {
 	.name = "ukbd",
 	.methods = ukbd_methods,
 	.size = sizeof(struct ukbd_softc),
 };
 
 DRIVER_MODULE(ukbd, uhub, ukbd_driver, ukbd_devclass, ukbd_driver_load, 0);
 MODULE_DEPEND(ukbd, usb, 1, 1, 1);
 MODULE_VERSION(ukbd, 1);
 USB_PNP_HOST_INFO(ukbd_devs);
Index: user/alc/PQ_LAUNDRY/sys/fs/cuse/cuse.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/fs/cuse/cuse.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/fs/cuse/cuse.c	(revision 306283)
@@ -1,1883 +1,1889 @@
 /* $FreeBSD$ */
 /*-
  * Copyright (c) 2010-2013 Hans Petter Selasky. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_compat.h"
 
 #include <sys/stdint.h>
 #include <sys/stddef.h>
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/linker_set.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
 #include <sys/sysctl.h>
 #include <sys/unistd.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 #include <sys/uio.h>
 #include <sys/poll.h>
 #include <sys/sx.h>
 #include <sys/queue.h>
 #include <sys/fcntl.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/selinfo.h>
 #include <sys/ptrace.h>
 
 #include <machine/bus.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 
 #include <fs/cuse/cuse_defs.h>
 #include <fs/cuse/cuse_ioctl.h>
 
 MODULE_VERSION(cuse, 1);
 
+/*
+ * Prevent cuse4bsd.ko and cuse.ko from loading at the same time by
+ * declaring support for the cuse4bsd interface in cuse.ko:
+ */
+MODULE_VERSION(cuse4bsd, 1);
+
 #define	NBUSY	((uint8_t *)1)
 
 #ifdef FEATURE
 FEATURE(cuse, "Userspace character devices");
 #endif
 
 struct cuse_command;
 struct cuse_server;
 struct cuse_client;
 
 struct cuse_client_command {
 	TAILQ_ENTRY(cuse_client_command) entry;
 	struct cuse_command sub;
 	struct sx sx;
 	struct cv cv;
 	struct thread *entered;
 	struct cuse_client *client;
 	struct proc *proc_curr;
 	int	proc_refs;
 	int	got_signal;
 	int	error;
 	int	command;
 };
 
 struct cuse_memory {
 	struct cuse_server *owner;
 	uint8_t *virtaddr;
 	uint32_t page_count;
 	uint32_t is_allocated;
 };
 
 struct cuse_server_dev {
 	TAILQ_ENTRY(cuse_server_dev) entry;
 	struct cuse_server *server;
 	struct cdev *kern_dev;
 	struct cuse_dev *user_dev;
 };
 
 struct cuse_server {
 	TAILQ_ENTRY(cuse_server) entry;
 	TAILQ_HEAD(, cuse_client_command) head;
 	TAILQ_HEAD(, cuse_server_dev) hdev;
 	TAILQ_HEAD(, cuse_client) hcli;
 	struct cv cv;
 	struct selinfo selinfo;
 	pid_t	pid;
 	int	is_closing;
 	int	refs;
 };
 
 struct cuse_client {
 	TAILQ_ENTRY(cuse_client) entry;
 	TAILQ_ENTRY(cuse_client) entry_ref;
 	struct cuse_client_command cmds[CUSE_CMD_MAX];
 	struct cuse_server *server;
 	struct cuse_server_dev *server_dev;
 
 	uint8_t	ioctl_buffer[CUSE_BUFFER_MAX] __aligned(4);
 
 	int	fflags;		/* file flags */
 	int	cflags;		/* client flags */
 #define	CUSE_CLI_IS_CLOSING 0x01
 #define	CUSE_CLI_KNOTE_NEED_READ 0x02
 #define	CUSE_CLI_KNOTE_NEED_WRITE 0x04
 #define	CUSE_CLI_KNOTE_HAS_READ 0x08
 #define	CUSE_CLI_KNOTE_HAS_WRITE 0x10
 };
 
 #define	CUSE_CLIENT_CLOSING(pcc) \
     ((pcc)->cflags & CUSE_CLI_IS_CLOSING)
 
 static MALLOC_DEFINE(M_CUSE, "cuse", "CUSE memory");
 
 static TAILQ_HEAD(, cuse_server) cuse_server_head;
 static struct mtx cuse_mtx;
 static struct cdev *cuse_dev;
 static struct cuse_server *cuse_alloc_unit[CUSE_DEVICES_MAX];
 static int cuse_alloc_unit_id[CUSE_DEVICES_MAX];
 static struct cuse_memory cuse_mem[CUSE_ALLOC_UNIT_MAX];
 
 static void cuse_server_wakeup_all_client_locked(struct cuse_server *pcs);
 static void cuse_client_kqfilter_read_detach(struct knote *kn);
 static void cuse_client_kqfilter_write_detach(struct knote *kn);
 static int cuse_client_kqfilter_read_event(struct knote *kn, long hint);
 static int cuse_client_kqfilter_write_event(struct knote *kn, long hint);
 
 static struct filterops cuse_client_kqfilter_read_ops = {
 	.f_isfd = 1,
 	.f_detach = cuse_client_kqfilter_read_detach,
 	.f_event = cuse_client_kqfilter_read_event,
 };
 
 static struct filterops cuse_client_kqfilter_write_ops = {
 	.f_isfd = 1,
 	.f_detach = cuse_client_kqfilter_write_detach,
 	.f_event = cuse_client_kqfilter_write_event,
 };
 
 static d_open_t cuse_client_open;
 static d_close_t cuse_client_close;
 static d_ioctl_t cuse_client_ioctl;
 static d_read_t cuse_client_read;
 static d_write_t cuse_client_write;
 static d_poll_t cuse_client_poll;
 static d_mmap_t cuse_client_mmap;
 static d_kqfilter_t cuse_client_kqfilter;
 
 static struct cdevsw cuse_client_devsw = {
 	.d_version = D_VERSION,
 	.d_open = cuse_client_open,
 	.d_close = cuse_client_close,
 	.d_ioctl = cuse_client_ioctl,
 	.d_name = "cuse_client",
 	.d_flags = D_TRACKCLOSE,
 	.d_read = cuse_client_read,
 	.d_write = cuse_client_write,
 	.d_poll = cuse_client_poll,
 	.d_mmap = cuse_client_mmap,
 	.d_kqfilter = cuse_client_kqfilter,
 };
 
 static d_open_t cuse_server_open;
 static d_close_t cuse_server_close;
 static d_ioctl_t cuse_server_ioctl;
 static d_read_t cuse_server_read;
 static d_write_t cuse_server_write;
 static d_poll_t cuse_server_poll;
 static d_mmap_t cuse_server_mmap;
 
 static struct cdevsw cuse_server_devsw = {
 	.d_version = D_VERSION,
 	.d_open = cuse_server_open,
 	.d_close = cuse_server_close,
 	.d_ioctl = cuse_server_ioctl,
 	.d_name = "cuse_server",
 	.d_flags = D_TRACKCLOSE,
 	.d_read = cuse_server_read,
 	.d_write = cuse_server_write,
 	.d_poll = cuse_server_poll,
 	.d_mmap = cuse_server_mmap,
 };
 
 static void cuse_client_is_closing(struct cuse_client *);
 static int cuse_free_unit_by_id_locked(struct cuse_server *, int);
 
 static void
 cuse_lock(void)
 {
 	mtx_lock(&cuse_mtx);
 }
 
 static void
 cuse_unlock(void)
 {
 	mtx_unlock(&cuse_mtx);
 }
 
 static void
 cuse_cmd_lock(struct cuse_client_command *pccmd)
 {
 	sx_xlock(&pccmd->sx);
 }
 
 static void
 cuse_cmd_unlock(struct cuse_client_command *pccmd)
 {
 	sx_xunlock(&pccmd->sx);
 }
 
 static void
 cuse_kern_init(void *arg)
 {
 	TAILQ_INIT(&cuse_server_head);
 
 	mtx_init(&cuse_mtx, "cuse-mtx", NULL, MTX_DEF);
 
 	cuse_dev = make_dev(&cuse_server_devsw, 0,
 	    UID_ROOT, GID_OPERATOR, 0600, "cuse");
 
 	printf("Cuse v%d.%d.%d @ /dev/cuse\n",
 	    (CUSE_VERSION >> 16) & 0xFF, (CUSE_VERSION >> 8) & 0xFF,
 	    (CUSE_VERSION >> 0) & 0xFF);
 }
 
 SYSINIT(cuse_kern_init, SI_SUB_DEVFS, SI_ORDER_ANY, cuse_kern_init, 0);
 
 static void
 cuse_kern_uninit(void *arg)
 {
 	void *ptr;
 
 	while (1) {
 
 		printf("Cuse: Please exit all /dev/cuse instances "
 		    "and processes which have used this device.\n");
 
 		pause("DRAIN", 2 * hz);
 
 		cuse_lock();
 		ptr = TAILQ_FIRST(&cuse_server_head);
 		cuse_unlock();
 
 		if (ptr == NULL)
 			break;
 	}
 
 	if (cuse_dev != NULL)
 		destroy_dev(cuse_dev);
 
 	mtx_destroy(&cuse_mtx);
 }
 
 SYSUNINIT(cuse_kern_uninit, SI_SUB_DEVFS, SI_ORDER_ANY, cuse_kern_uninit, 0);
 
 static int
 cuse_server_get(struct cuse_server **ppcs)
 {
 	struct cuse_server *pcs;
 	int error;
 
 	error = devfs_get_cdevpriv((void **)&pcs);
 	if (error != 0) {
 		*ppcs = NULL;
 		return (error);
 	}
 	/* check if closing */
 	cuse_lock();
 	if (pcs->is_closing) {
 		cuse_unlock();
 		*ppcs = NULL;
 		return (EINVAL);
 	}
 	cuse_unlock();
 	*ppcs = pcs;
 	return (0);
 }
 
 static void
 cuse_server_is_closing(struct cuse_server *pcs)
 {
 	struct cuse_client *pcc;
 
 	if (pcs->is_closing)
 		return;
 
 	pcs->is_closing = 1;
 
 	TAILQ_FOREACH(pcc, &pcs->hcli, entry) {
 		cuse_client_is_closing(pcc);
 	}
 }
 
 static struct cuse_client_command *
 cuse_server_find_command(struct cuse_server *pcs, struct thread *td)
 {
 	struct cuse_client *pcc;
 	int n;
 
 	if (pcs->is_closing)
 		goto done;
 
 	TAILQ_FOREACH(pcc, &pcs->hcli, entry) {
 		if (CUSE_CLIENT_CLOSING(pcc))
 			continue;
 		for (n = 0; n != CUSE_CMD_MAX; n++) {
 			if (pcc->cmds[n].entered == td)
 				return (&pcc->cmds[n]);
 		}
 	}
 done:
 	return (NULL);
 }
 
 static void
 cuse_str_filter(char *ptr)
 {
 	int c;
 
 	while (((c = *ptr) != 0)) {
 
 		if ((c >= 'a') && (c <= 'z')) {
 			ptr++;
 			continue;
 		}
 		if ((c >= 'A') && (c <= 'Z')) {
 			ptr++;
 			continue;
 		}
 		if ((c >= '0') && (c <= '9')) {
 			ptr++;
 			continue;
 		}
 		if ((c == '.') || (c == '_') || (c == '/')) {
 			ptr++;
 			continue;
 		}
 		*ptr = '_';
 
 		ptr++;
 	}
 }
 
 static int
 cuse_convert_error(int error)
 {
 	;				/* indent fix */
 	switch (error) {
 	case CUSE_ERR_NONE:
 		return (0);
 	case CUSE_ERR_BUSY:
 		return (EBUSY);
 	case CUSE_ERR_WOULDBLOCK:
 		return (EWOULDBLOCK);
 	case CUSE_ERR_INVALID:
 		return (EINVAL);
 	case CUSE_ERR_NO_MEMORY:
 		return (ENOMEM);
 	case CUSE_ERR_FAULT:
 		return (EFAULT);
 	case CUSE_ERR_SIGNAL:
 		return (EINTR);
 	default:
 		return (ENXIO);
 	}
 }
 
 static void
 cuse_server_free_memory(struct cuse_server *pcs)
 {
 	struct cuse_memory *mem;
 	uint32_t n;
 
 	for (n = 0; n != CUSE_ALLOC_UNIT_MAX; n++) {
 		mem = &cuse_mem[n];
 
 		/* this memory is never freed */
 		if (mem->owner == pcs) {
 			mem->owner = NULL;
 			mem->is_allocated = 0;
 		}
 	}
 }
 
 static int
 cuse_server_alloc_memory(struct cuse_server *pcs,
     struct cuse_memory *mem, uint32_t page_count)
 {
 	void *ptr;
 	int error;
 
 	cuse_lock();
 
 	if (mem->virtaddr == NBUSY) {
 		cuse_unlock();
 		return (EBUSY);
 	}
 	if (mem->virtaddr != NULL) {
 		if (mem->is_allocated != 0) {
 			cuse_unlock();
 			return (EBUSY);
 		}
 		if (mem->page_count == page_count) {
 			mem->is_allocated = 1;
 			mem->owner = pcs;
 			cuse_unlock();
 			return (0);
 		}
 		cuse_unlock();
 		return (EBUSY);
 	}
 	memset(mem, 0, sizeof(*mem));
 
 	mem->virtaddr = NBUSY;
 
 	cuse_unlock();
 
 	ptr = malloc(page_count * PAGE_SIZE, M_CUSE, M_WAITOK | M_ZERO);
 	if (ptr == NULL)
 		error = ENOMEM;
 	else
 		error = 0;
 
 	cuse_lock();
 
 	if (error) {
 		mem->virtaddr = NULL;
 		cuse_unlock();
 		return (error);
 	}
 	mem->virtaddr = ptr;
 	mem->page_count = page_count;
 	mem->is_allocated = 1;
 	mem->owner = pcs;
 	cuse_unlock();
 
 	return (0);
 }
 
 static int
 cuse_client_get(struct cuse_client **ppcc)
 {
 	struct cuse_client *pcc;
 	int error;
 
 	/* try to get private data */
 	error = devfs_get_cdevpriv((void **)&pcc);
 	if (error != 0) {
 		*ppcc = NULL;
 		return (error);
 	}
 	/* check if closing */
 	cuse_lock();
 	if (CUSE_CLIENT_CLOSING(pcc) || pcc->server->is_closing) {
 		cuse_unlock();
 		*ppcc = NULL;
 		return (EINVAL);
 	}
 	cuse_unlock();
 	*ppcc = pcc;
 	return (0);
 }
 
 static void
 cuse_client_is_closing(struct cuse_client *pcc)
 {
 	struct cuse_client_command *pccmd;
 	uint32_t n;
 
 	if (CUSE_CLIENT_CLOSING(pcc))
 		return;
 
 	pcc->cflags |= CUSE_CLI_IS_CLOSING;
 	pcc->server_dev = NULL;
 
 	for (n = 0; n != CUSE_CMD_MAX; n++) {
 
 		pccmd = &pcc->cmds[n];
 
 		if (pccmd->entry.tqe_prev != NULL) {
 			TAILQ_REMOVE(&pcc->server->head, pccmd, entry);
 			pccmd->entry.tqe_prev = NULL;
 		}
 		cv_broadcast(&pccmd->cv);
 	}
 }
 
 static void
 cuse_client_send_command_locked(struct cuse_client_command *pccmd,
     uintptr_t data_ptr, unsigned long arg, int fflags, int ioflag)
 {
 	unsigned long cuse_fflags = 0;
 	struct cuse_server *pcs;
 
 	if (fflags & FREAD)
 		cuse_fflags |= CUSE_FFLAG_READ;
 
 	if (fflags & FWRITE)
 		cuse_fflags |= CUSE_FFLAG_WRITE;
 
 	if (ioflag & IO_NDELAY)
 		cuse_fflags |= CUSE_FFLAG_NONBLOCK;
 
 	pccmd->sub.fflags = cuse_fflags;
 	pccmd->sub.data_pointer = data_ptr;
 	pccmd->sub.argument = arg;
 
 	pcs = pccmd->client->server;
 
 	if ((pccmd->entry.tqe_prev == NULL) &&
 	    (CUSE_CLIENT_CLOSING(pccmd->client) == 0) &&
 	    (pcs->is_closing == 0)) {
 		TAILQ_INSERT_TAIL(&pcs->head, pccmd, entry);
 		cv_signal(&pcs->cv);
 	}
 }
 
 static void
 cuse_client_got_signal(struct cuse_client_command *pccmd)
 {
 	struct cuse_server *pcs;
 
 	pccmd->got_signal = 1;
 
 	pccmd = &pccmd->client->cmds[CUSE_CMD_SIGNAL];
 
 	pcs = pccmd->client->server;
 
 	if ((pccmd->entry.tqe_prev == NULL) &&
 	    (CUSE_CLIENT_CLOSING(pccmd->client) == 0) &&
 	    (pcs->is_closing == 0)) {
 		TAILQ_INSERT_TAIL(&pcs->head, pccmd, entry);
 		cv_signal(&pcs->cv);
 	}
 }
 
 static int
 cuse_client_receive_command_locked(struct cuse_client_command *pccmd,
     uint8_t *arg_ptr, uint32_t arg_len)
 {
 	int error;
 
 	error = 0;
 
 	pccmd->proc_curr = curthread->td_proc;
 
 	if (CUSE_CLIENT_CLOSING(pccmd->client) ||
 	    pccmd->client->server->is_closing) {
 		error = CUSE_ERR_OTHER;
 		goto done;
 	}
 	while (pccmd->command == CUSE_CMD_NONE) {
 		if (error != 0) {
 			cv_wait(&pccmd->cv, &cuse_mtx);
 		} else {
 			error = cv_wait_sig(&pccmd->cv, &cuse_mtx);
 
 			if (error != 0)
 				cuse_client_got_signal(pccmd);
 		}
 		if (CUSE_CLIENT_CLOSING(pccmd->client) ||
 		    pccmd->client->server->is_closing) {
 			error = CUSE_ERR_OTHER;
 			goto done;
 		}
 	}
 
 	error = pccmd->error;
 	pccmd->command = CUSE_CMD_NONE;
 	cv_signal(&pccmd->cv);
 
 done:
 
 	/* wait until all process references are gone */
 
 	pccmd->proc_curr = NULL;
 
 	while (pccmd->proc_refs != 0)
 		cv_wait(&pccmd->cv, &cuse_mtx);
 
 	return (error);
 }
 
 /*------------------------------------------------------------------------*
  *	CUSE SERVER PART
  *------------------------------------------------------------------------*/
 
 static void
 cuse_server_free_dev(struct cuse_server_dev *pcsd)
 {
 	struct cuse_server *pcs;
 	struct cuse_client *pcc;
 
 	/* get server pointer */
 	pcs = pcsd->server;
 
 	/* prevent creation of more devices */
 	cuse_lock();
 	if (pcsd->kern_dev != NULL)
 		pcsd->kern_dev->si_drv1 = NULL;
 
 	TAILQ_FOREACH(pcc, &pcs->hcli, entry) {
 		if (pcc->server_dev == pcsd)
 			cuse_client_is_closing(pcc);
 	}
 	cuse_unlock();
 
 	/* destroy device, if any */
 	if (pcsd->kern_dev != NULL) {
 		/* destroy device synchronously */
 		destroy_dev(pcsd->kern_dev);
 	}
 	free(pcsd, M_CUSE);
 }
 
 static void
 cuse_server_free(void *arg)
 {
 	struct cuse_server *pcs = arg;
 	struct cuse_server_dev *pcsd;
 
 	cuse_lock();
 	pcs->refs--;
 	if (pcs->refs != 0) {
 		cuse_unlock();
 		return;
 	}
 	cuse_server_is_closing(pcs);
 	/* final client wakeup, if any */
 	cuse_server_wakeup_all_client_locked(pcs);
 
 	TAILQ_REMOVE(&cuse_server_head, pcs, entry);
 
 	cuse_free_unit_by_id_locked(pcs, -1);
 
 	while ((pcsd = TAILQ_FIRST(&pcs->hdev)) != NULL) {
 		TAILQ_REMOVE(&pcs->hdev, pcsd, entry);
 		cuse_unlock();
 		cuse_server_free_dev(pcsd);
 		cuse_lock();
 	}
 
 	cuse_server_free_memory(pcs);
 
 	knlist_clear(&pcs->selinfo.si_note, 1);
 	knlist_destroy(&pcs->selinfo.si_note);
 
 	cuse_unlock();
 
 	seldrain(&pcs->selinfo);
 
 	cv_destroy(&pcs->cv);
 
 	free(pcs, M_CUSE);
 }
 
 static int
 cuse_server_open(struct cdev *dev, int fflags, int devtype, struct thread *td)
 {
 	struct cuse_server *pcs;
 
 	pcs = malloc(sizeof(*pcs), M_CUSE, M_WAITOK | M_ZERO);
 	if (pcs == NULL)
 		return (ENOMEM);
 
 	if (devfs_set_cdevpriv(pcs, &cuse_server_free)) {
 		printf("Cuse: Cannot set cdevpriv.\n");
 		free(pcs, M_CUSE);
 		return (ENOMEM);
 	}
 
 	/* store current process ID */
 	pcs->pid = curproc->p_pid;
 
 	TAILQ_INIT(&pcs->head);
 	TAILQ_INIT(&pcs->hdev);
 	TAILQ_INIT(&pcs->hcli);
 
 	cv_init(&pcs->cv, "cuse-server-cv");
 
 	knlist_init_mtx(&pcs->selinfo.si_note, &cuse_mtx);
 
 	cuse_lock();
 	pcs->refs++;
 	TAILQ_INSERT_TAIL(&cuse_server_head, pcs, entry);
 	cuse_unlock();
 
 	return (0);
 }
 
 static int
 cuse_server_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
 {
 	struct cuse_server *pcs;
 	int error;
 
 	error = cuse_server_get(&pcs);
 	if (error != 0)
 		goto done;
 
 	cuse_lock();
 	cuse_server_is_closing(pcs);
 	/* final client wakeup, if any */
 	cuse_server_wakeup_all_client_locked(pcs);
 
 	knlist_clear(&pcs->selinfo.si_note, 1);
 	cuse_unlock();
 
 done:
 	return (0);
 }
 
 static int
 cuse_server_read(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	return (ENXIO);
 }
 
 static int
 cuse_server_write(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	return (ENXIO);
 }
 
 static int
 cuse_server_ioctl_copy_locked(struct cuse_client_command *pccmd,
     struct cuse_data_chunk *pchk, int isread)
 {
 	struct proc *p_proc;
 	uint32_t offset;
 	int error;
 
 	offset = pchk->peer_ptr - CUSE_BUF_MIN_PTR;
 
 	if (pchk->length > CUSE_BUFFER_MAX)
 		return (EFAULT);
 
 	if (offset >= CUSE_BUFFER_MAX)
 		return (EFAULT);
 
 	if ((offset + pchk->length) > CUSE_BUFFER_MAX)
 		return (EFAULT);
 
 	p_proc = pccmd->proc_curr;
 	if (p_proc == NULL)
 		return (ENXIO);
 
 	if (pccmd->proc_refs < 0)
 		return (ENOMEM);
 
 	pccmd->proc_refs++;
 
 	cuse_unlock();
 
 	if (isread == 0) {
 		error = copyin(
 		    (void *)pchk->local_ptr,
 		    pccmd->client->ioctl_buffer + offset,
 		    pchk->length);
 	} else {
 		error = copyout(
 		    pccmd->client->ioctl_buffer + offset,
 		    (void *)pchk->local_ptr,
 		    pchk->length);
 	}
 
 	cuse_lock();
 
 	pccmd->proc_refs--;
 
 	if (pccmd->proc_curr == NULL)
 		cv_signal(&pccmd->cv);
 
 	return (error);
 }
 
 static int
 cuse_proc2proc_copy(struct proc *proc_s, vm_offset_t data_s,
     struct proc *proc_d, vm_offset_t data_d, size_t len)
 {
 	struct thread *td;
 	struct proc *proc_cur;
 	int error;
 
 	td = curthread;
 	proc_cur = td->td_proc;
 
 	if (proc_cur == proc_d) {
 		struct iovec iov = {
 			.iov_base = (caddr_t)data_d,
 			.iov_len = len,
 		};
 		struct uio uio = {
 			.uio_iov = &iov,
 			.uio_iovcnt = 1,
 			.uio_offset = (off_t)data_s,
 			.uio_resid = len,
 			.uio_segflg = UIO_USERSPACE,
 			.uio_rw = UIO_READ,
 			.uio_td = td,
 		};
 
 		PHOLD(proc_s);
 		error = proc_rwmem(proc_s, &uio);
 		PRELE(proc_s);
 
 	} else if (proc_cur == proc_s) {
 		struct iovec iov = {
 			.iov_base = (caddr_t)data_s,
 			.iov_len = len,
 		};
 		struct uio uio = {
 			.uio_iov = &iov,
 			.uio_iovcnt = 1,
 			.uio_offset = (off_t)data_d,
 			.uio_resid = len,
 			.uio_segflg = UIO_USERSPACE,
 			.uio_rw = UIO_WRITE,
 			.uio_td = td,
 		};
 
 		PHOLD(proc_d);
 		error = proc_rwmem(proc_d, &uio);
 		PRELE(proc_d);
 	} else {
 		error = EINVAL;
 	}
 	return (error);
 }
 
 static int
 cuse_server_data_copy_locked(struct cuse_client_command *pccmd,
     struct cuse_data_chunk *pchk, int isread)
 {
 	struct proc *p_proc;
 	int error;
 
 	p_proc = pccmd->proc_curr;
 	if (p_proc == NULL)
 		return (ENXIO);
 
 	if (pccmd->proc_refs < 0)
 		return (ENOMEM);
 
 	pccmd->proc_refs++;
 
 	cuse_unlock();
 
 	if (isread == 0) {
 		error = cuse_proc2proc_copy(
 		    curthread->td_proc, pchk->local_ptr,
 		    p_proc, pchk->peer_ptr,
 		    pchk->length);
 	} else {
 		error = cuse_proc2proc_copy(
 		    p_proc, pchk->peer_ptr,
 		    curthread->td_proc, pchk->local_ptr,
 		    pchk->length);
 	}
 
 	cuse_lock();
 
 	pccmd->proc_refs--;
 
 	if (pccmd->proc_curr == NULL)
 		cv_signal(&pccmd->cv);
 
 	return (error);
 }
 
 static int
 cuse_alloc_unit_by_id_locked(struct cuse_server *pcs, int id)
 {
 	int n;
 	int x = 0;
 	int match;
 
 	do {
 		for (match = n = 0; n != CUSE_DEVICES_MAX; n++) {
 			if (cuse_alloc_unit[n] != NULL) {
 				if ((cuse_alloc_unit_id[n] ^ id) & CUSE_ID_MASK)
 					continue;
 				if ((cuse_alloc_unit_id[n] & ~CUSE_ID_MASK) == x) {
 					x++;
 					match = 1;
 				}
 			}
 		}
 	} while (match);
 
 	if (x < 256) {
 		for (n = 0; n != CUSE_DEVICES_MAX; n++) {
 			if (cuse_alloc_unit[n] == NULL) {
 				cuse_alloc_unit[n] = pcs;
 				cuse_alloc_unit_id[n] = id | x;
 				return (x);
 			}
 		}
 	}
 	return (-1);
 }
 
 static void
 cuse_server_wakeup_locked(struct cuse_server *pcs)
 {
 	selwakeup(&pcs->selinfo);
 	KNOTE_LOCKED(&pcs->selinfo.si_note, 0);
 }
 
 static void
 cuse_server_wakeup_all_client_locked(struct cuse_server *pcs)
 {
 	struct cuse_client *pcc;
 
 	TAILQ_FOREACH(pcc, &pcs->hcli, entry) {
 		pcc->cflags |= (CUSE_CLI_KNOTE_NEED_READ |
 		    CUSE_CLI_KNOTE_NEED_WRITE);
 	}
 	cuse_server_wakeup_locked(pcs);
 }
 
 static int
 cuse_free_unit_by_id_locked(struct cuse_server *pcs, int id)
 {
 	int n;
 	int found = 0;
 
 	for (n = 0; n != CUSE_DEVICES_MAX; n++) {
 		if (cuse_alloc_unit[n] == pcs) {
 			if (cuse_alloc_unit_id[n] == id || id == -1) {
 				cuse_alloc_unit[n] = NULL;
 				cuse_alloc_unit_id[n] = 0;
 				found = 1;
 			}
 		}
 	}
 
 	return (found ? 0 : EINVAL);
 }
 
 static int
 cuse_server_ioctl(struct cdev *dev, unsigned long cmd,
     caddr_t data, int fflag, struct thread *td)
 {
 	struct cuse_server *pcs;
 	int error;
 
 	error = cuse_server_get(&pcs);
 	if (error != 0)
 		return (error);
 
 	switch (cmd) {
 		struct cuse_client_command *pccmd;
 		struct cuse_client *pcc;
 		struct cuse_command *pcmd;
 		struct cuse_alloc_info *pai;
 		struct cuse_create_dev *pcd;
 		struct cuse_server_dev *pcsd;
 		struct cuse_data_chunk *pchk;
 		int n;
 
 	case CUSE_IOCTL_GET_COMMAND:
 		pcmd = (void *)data;
 
 		cuse_lock();
 
 		while ((pccmd = TAILQ_FIRST(&pcs->head)) == NULL) {
 			error = cv_wait_sig(&pcs->cv, &cuse_mtx);
 
 			if (pcs->is_closing)
 				error = ENXIO;
 
 			if (error) {
 				cuse_unlock();
 				return (error);
 			}
 		}
 
 		TAILQ_REMOVE(&pcs->head, pccmd, entry);
 		pccmd->entry.tqe_prev = NULL;
 
 		pccmd->entered = curthread;
 
 		*pcmd = pccmd->sub;
 
 		cuse_unlock();
 
 		break;
 
 	case CUSE_IOCTL_SYNC_COMMAND:
 
 		cuse_lock();
 		while ((pccmd = cuse_server_find_command(pcs, curthread)) != NULL) {
 
 			/* send sync command */
 			pccmd->entered = NULL;
 			pccmd->error = *(int *)data;
 			pccmd->command = CUSE_CMD_SYNC;
 
 			/* signal peer, if any */
 			cv_signal(&pccmd->cv);
 		}
 		cuse_unlock();
 
 		break;
 
 	case CUSE_IOCTL_ALLOC_UNIT:
 
 		cuse_lock();
 		n = cuse_alloc_unit_by_id_locked(pcs,
 		    CUSE_ID_DEFAULT(0));
 		cuse_unlock();
 
 		if (n < 0)
 			error = ENOMEM;
 		else
 			*(int *)data = n;
 		break;
 
 	case CUSE_IOCTL_ALLOC_UNIT_BY_ID:
 
 		n = *(int *)data;
 
 		n = (n & CUSE_ID_MASK);
 
 		cuse_lock();
 		n = cuse_alloc_unit_by_id_locked(pcs, n);
 		cuse_unlock();
 
 		if (n < 0)
 			error = ENOMEM;
 		else
 			*(int *)data = n;
 		break;
 
 	case CUSE_IOCTL_FREE_UNIT:
 
 		n = *(int *)data;
 
 		n = CUSE_ID_DEFAULT(n);
 
 		cuse_lock();
 		error = cuse_free_unit_by_id_locked(pcs, n);
 		cuse_unlock();
 		break;
 
 	case CUSE_IOCTL_FREE_UNIT_BY_ID:
 
 		n = *(int *)data;
 
 		cuse_lock();
 		error = cuse_free_unit_by_id_locked(pcs, n);
 		cuse_unlock();
 		break;
 
 	case CUSE_IOCTL_ALLOC_MEMORY:
 
 		pai = (void *)data;
 
 		if (pai->alloc_nr >= CUSE_ALLOC_UNIT_MAX) {
 			error = ENOMEM;
 			break;
 		}
 		if (pai->page_count > CUSE_ALLOC_PAGES_MAX) {
 			error = ENOMEM;
 			break;
 		}
 		error = cuse_server_alloc_memory(pcs,
 		    &cuse_mem[pai->alloc_nr], pai->page_count);
 		break;
 
 	case CUSE_IOCTL_FREE_MEMORY:
 		pai = (void *)data;
 
 		if (pai->alloc_nr >= CUSE_ALLOC_UNIT_MAX) {
 			error = ENOMEM;
 			break;
 		}
 		/* we trust the character device driver in this case */
 
 		cuse_lock();
 		if (cuse_mem[pai->alloc_nr].owner == pcs) {
 			cuse_mem[pai->alloc_nr].is_allocated = 0;
 			cuse_mem[pai->alloc_nr].owner = NULL;
 		} else {
 			error = EINVAL;
 		}
 		cuse_unlock();
 		break;
 
 	case CUSE_IOCTL_GET_SIG:
 
 		cuse_lock();
 		pccmd = cuse_server_find_command(pcs, curthread);
 
 		if (pccmd != NULL) {
 			n = pccmd->got_signal;
 			pccmd->got_signal = 0;
 		} else {
 			n = 0;
 		}
 		cuse_unlock();
 
 		*(int *)data = n;
 
 		break;
 
 	case CUSE_IOCTL_SET_PFH:
 
 		cuse_lock();
 		pccmd = cuse_server_find_command(pcs, curthread);
 
 		if (pccmd != NULL) {
 			pcc = pccmd->client;
 			for (n = 0; n != CUSE_CMD_MAX; n++) {
 				pcc->cmds[n].sub.per_file_handle = *(uintptr_t *)data;
 			}
 		} else {
 			error = ENXIO;
 		}
 		cuse_unlock();
 		break;
 
 	case CUSE_IOCTL_CREATE_DEV:
 
 		error = priv_check(curthread, PRIV_DRIVER);
 		if (error)
 			break;
 
 		pcd = (void *)data;
 
 		/* filter input */
 
 		pcd->devname[sizeof(pcd->devname) - 1] = 0;
 
 		if (pcd->devname[0] == 0) {
 			error = EINVAL;
 			break;
 		}
 		cuse_str_filter(pcd->devname);
 
 		pcd->permissions &= 0777;
 
 		/* try to allocate a character device */
 
 		pcsd = malloc(sizeof(*pcsd), M_CUSE, M_WAITOK | M_ZERO);
 
 		if (pcsd == NULL) {
 			error = ENOMEM;
 			break;
 		}
 		pcsd->server = pcs;
 
 		pcsd->user_dev = pcd->dev;
 
 		pcsd->kern_dev = make_dev_credf(MAKEDEV_CHECKNAME,
 		    &cuse_client_devsw, 0, NULL, pcd->user_id, pcd->group_id,
 		    pcd->permissions, "%s", pcd->devname);
 
 		if (pcsd->kern_dev == NULL) {
 			free(pcsd, M_CUSE);
 			error = ENOMEM;
 			break;
 		}
 		pcsd->kern_dev->si_drv1 = pcsd;
 
 		cuse_lock();
 		TAILQ_INSERT_TAIL(&pcs->hdev, pcsd, entry);
 		cuse_unlock();
 
 		break;
 
 	case CUSE_IOCTL_DESTROY_DEV:
 
 		error = priv_check(curthread, PRIV_DRIVER);
 		if (error)
 			break;
 
 		cuse_lock();
 
 		error = EINVAL;
 
 		pcsd = TAILQ_FIRST(&pcs->hdev);
 		while (pcsd != NULL) {
 			if (pcsd->user_dev == *(struct cuse_dev **)data) {
 				TAILQ_REMOVE(&pcs->hdev, pcsd, entry);
 				cuse_unlock();
 				cuse_server_free_dev(pcsd);
 				cuse_lock();
 				error = 0;
 				pcsd = TAILQ_FIRST(&pcs->hdev);
 			} else {
 				pcsd = TAILQ_NEXT(pcsd, entry);
 			}
 		}
 
 		cuse_unlock();
 		break;
 
 	case CUSE_IOCTL_WRITE_DATA:
 	case CUSE_IOCTL_READ_DATA:
 
 		cuse_lock();
 		pchk = (struct cuse_data_chunk *)data;
 
 		pccmd = cuse_server_find_command(pcs, curthread);
 
 		if (pccmd == NULL) {
 			error = ENXIO;	/* invalid request */
 		} else if (pchk->peer_ptr < CUSE_BUF_MIN_PTR) {
 			error = EFAULT;	/* NULL pointer */
 		} else if (pchk->peer_ptr < CUSE_BUF_MAX_PTR) {
 			error = cuse_server_ioctl_copy_locked(pccmd,
 			    pchk, cmd == CUSE_IOCTL_READ_DATA);
 		} else {
 			error = cuse_server_data_copy_locked(pccmd,
 			    pchk, cmd == CUSE_IOCTL_READ_DATA);
 		}
 		cuse_unlock();
 		break;
 
 	case CUSE_IOCTL_SELWAKEUP:
 		cuse_lock();
 		/*
 		 * We don't know which direction caused the event.
 		 * Wakeup both!
 		 */
 		cuse_server_wakeup_all_client_locked(pcs);
 		cuse_unlock();
 		break;
 
 	default:
 		error = ENXIO;
 		break;
 	}
 	return (error);
 }
 
 static int
 cuse_server_poll(struct cdev *dev, int events, struct thread *td)
 {
 	return (events & (POLLHUP | POLLPRI | POLLIN |
 	    POLLRDNORM | POLLOUT | POLLWRNORM));
 }
 
 static int
 cuse_server_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr)
 {
 	uint32_t page_nr = offset / PAGE_SIZE;
 	uint32_t alloc_nr = page_nr / CUSE_ALLOC_PAGES_MAX;
 	struct cuse_memory *mem;
 	struct cuse_server *pcs;
 	uint8_t *ptr;
 	int error;
 
 	if (alloc_nr >= CUSE_ALLOC_UNIT_MAX)
 		return (ENOMEM);
 
 	error = cuse_server_get(&pcs);
 	if (error != 0)
 		pcs = NULL;
 
 	cuse_lock();
 	mem = &cuse_mem[alloc_nr];
 
 	/* try to enforce slight ownership */
 	if ((pcs != NULL) && (mem->owner != pcs)) {
 		cuse_unlock();
 		return (EINVAL);
 	}
 	if (mem->virtaddr == NULL) {
 		cuse_unlock();
 		return (ENOMEM);
 	}
 	if (mem->virtaddr == NBUSY) {
 		cuse_unlock();
 		return (ENOMEM);
 	}
 	page_nr %= CUSE_ALLOC_PAGES_MAX;
 
 	if (page_nr >= mem->page_count) {
 		cuse_unlock();
 		return (ENXIO);
 	}
 	ptr = mem->virtaddr + (page_nr * PAGE_SIZE);
 	cuse_unlock();
 
 	*paddr = vtophys(ptr);
 
 	return (0);
 }
 
 /*------------------------------------------------------------------------*
  *	CUSE CLIENT PART
  *------------------------------------------------------------------------*/
 static void
 cuse_client_free(void *arg)
 {
 	struct cuse_client *pcc = arg;
 	struct cuse_client_command *pccmd;
 	struct cuse_server *pcs;
 	int n;
 
 	cuse_lock();
 	cuse_client_is_closing(pcc);
 	TAILQ_REMOVE(&pcc->server->hcli, pcc, entry);
 	cuse_unlock();
 
 	for (n = 0; n != CUSE_CMD_MAX; n++) {
 
 		pccmd = &pcc->cmds[n];
 
 		sx_destroy(&pccmd->sx);
 		cv_destroy(&pccmd->cv);
 	}
 
 	pcs = pcc->server;
 
 	free(pcc, M_CUSE);
 
 	/* drop reference on server */
 	cuse_server_free(pcs);
 }
 
 static int
 cuse_client_open(struct cdev *dev, int fflags, int devtype, struct thread *td)
 {
 	struct cuse_client_command *pccmd;
 	struct cuse_server_dev *pcsd;
 	struct cuse_client *pcc;
 	struct cuse_server *pcs;
 	struct cuse_dev *pcd;
 	int error;
 	int n;
 
 	cuse_lock();
 	pcsd = dev->si_drv1;
 	if (pcsd != NULL) {
 		pcs = pcsd->server;
 		pcd = pcsd->user_dev;
 		/*
 		 * Check that the refcount didn't wrap and that the
 		 * same process is not both client and server. This
 		 * can easily lead to deadlocks when destroying the
 		 * CUSE character device nodes:
 		 */
 		pcs->refs++;
 		if (pcs->refs < 0 || pcs->pid == curproc->p_pid) {
 			/* overflow or wrong PID */
 			pcs->refs--;
 			pcsd = NULL;
 		}
 	} else {
 		pcs = NULL;
 		pcd = NULL;
 	}
 	cuse_unlock();
 
 	if (pcsd == NULL)
 		return (EINVAL);
 
 	pcc = malloc(sizeof(*pcc), M_CUSE, M_WAITOK | M_ZERO);
 	if (pcc == NULL) {
 		/* drop reference on server */
 		cuse_server_free(pcs);
 		return (ENOMEM);
 	}
 	if (devfs_set_cdevpriv(pcc, &cuse_client_free)) {
 		printf("Cuse: Cannot set cdevpriv.\n");
 		/* drop reference on server */
 		cuse_server_free(pcs);
 		free(pcc, M_CUSE);
 		return (ENOMEM);
 	}
 	pcc->fflags = fflags;
 	pcc->server_dev = pcsd;
 	pcc->server = pcs;
 
 	for (n = 0; n != CUSE_CMD_MAX; n++) {
 
 		pccmd = &pcc->cmds[n];
 
 		pccmd->sub.dev = pcd;
 		pccmd->sub.command = n;
 		pccmd->client = pcc;
 
 		sx_init(&pccmd->sx, "cuse-client-sx");
 		cv_init(&pccmd->cv, "cuse-client-cv");
 	}
 
 	cuse_lock();
 
 	/* cuse_client_free() assumes that the client is listed somewhere! */
 	/* always enqueue */
 
 	TAILQ_INSERT_TAIL(&pcs->hcli, pcc, entry);
 
 	/* check if server is closing */
 	if ((pcs->is_closing != 0) || (dev->si_drv1 == NULL)) {
 		error = EINVAL;
 	} else {
 		error = 0;
 	}
 	cuse_unlock();
 
 	if (error) {
 		devfs_clear_cdevpriv();	/* XXX bugfix */
 		return (error);
 	}
 	pccmd = &pcc->cmds[CUSE_CMD_OPEN];
 
 	cuse_cmd_lock(pccmd);
 
 	cuse_lock();
 	cuse_client_send_command_locked(pccmd, 0, 0, pcc->fflags, 0);
 
 	error = cuse_client_receive_command_locked(pccmd, 0, 0);
 	cuse_unlock();
 
 	if (error < 0) {
 		error = cuse_convert_error(error);
 	} else {
 		error = 0;
 	}
 
 	cuse_cmd_unlock(pccmd);
 
 	if (error)
 		devfs_clear_cdevpriv();	/* XXX bugfix */
 
 	return (error);
 }
 
 static int
 cuse_client_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
 {
 	struct cuse_client_command *pccmd;
 	struct cuse_client *pcc;
 	int error;
 
 	error = cuse_client_get(&pcc);
 	if (error != 0)
 		return (0);
 
 	pccmd = &pcc->cmds[CUSE_CMD_CLOSE];
 
 	cuse_cmd_lock(pccmd);
 
 	cuse_lock();
 	cuse_client_send_command_locked(pccmd, 0, 0, pcc->fflags, 0);
 
 	error = cuse_client_receive_command_locked(pccmd, 0, 0);
 	cuse_unlock();
 
 	cuse_cmd_unlock(pccmd);
 
 	cuse_lock();
 	cuse_client_is_closing(pcc);
 	cuse_unlock();
 
 	return (0);
 }
 
 static void
 cuse_client_kqfilter_poll(struct cdev *dev, struct cuse_client *pcc)
 {
 	int temp;
 
 	cuse_lock();
 	temp = (pcc->cflags & (CUSE_CLI_KNOTE_HAS_READ |
 	    CUSE_CLI_KNOTE_HAS_WRITE));
 	pcc->cflags &= ~(CUSE_CLI_KNOTE_NEED_READ |
 	    CUSE_CLI_KNOTE_NEED_WRITE);
 	cuse_unlock();
 
 	if (temp != 0) {
 		/* get the latest polling state from the server */
 		temp = cuse_client_poll(dev, POLLIN | POLLOUT, NULL);
 
 		cuse_lock();
 		if (temp & (POLLIN | POLLOUT)) {
 			if (temp & POLLIN)
 				pcc->cflags |= CUSE_CLI_KNOTE_NEED_READ;
 			if (temp & POLLOUT)
 				pcc->cflags |= CUSE_CLI_KNOTE_NEED_WRITE;
 
 			/* make sure the "knote" gets woken up */
 			cuse_server_wakeup_locked(pcc->server);
 		}
 		cuse_unlock();
 	}
 }
 
 static int
 cuse_client_read(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct cuse_client_command *pccmd;
 	struct cuse_client *pcc;
 	int error;
 	int len;
 
 	error = cuse_client_get(&pcc);
 	if (error != 0)
 		return (error);
 
 	pccmd = &pcc->cmds[CUSE_CMD_READ];
 
 	if (uio->uio_segflg != UIO_USERSPACE) {
 		return (EINVAL);
 	}
 	uio->uio_segflg = UIO_NOCOPY;
 
 	cuse_cmd_lock(pccmd);
 
 	while (uio->uio_resid != 0) {
 
 		if (uio->uio_iov->iov_len > CUSE_LENGTH_MAX) {
 			error = ENOMEM;
 			break;
 		}
 
 		len = uio->uio_iov->iov_len;
 
 		cuse_lock();
 		cuse_client_send_command_locked(pccmd,
 		    (uintptr_t)uio->uio_iov->iov_base,
 		    (unsigned long)(unsigned int)len, pcc->fflags, ioflag);
 
 		error = cuse_client_receive_command_locked(pccmd, 0, 0);
 		cuse_unlock();
 
 		if (error < 0) {
 			error = cuse_convert_error(error);
 			break;
 		} else if (error == len) {
 			error = uiomove(NULL, error, uio);
 			if (error)
 				break;
 		} else {
 			error = uiomove(NULL, error, uio);
 			break;
 		}
 	}
 	cuse_cmd_unlock(pccmd);
 
 	uio->uio_segflg = UIO_USERSPACE;/* restore segment flag */
 
 	if (error == EWOULDBLOCK)
 		cuse_client_kqfilter_poll(dev, pcc);
 
 	return (error);
 }
 
 static int
 cuse_client_write(struct cdev *dev, struct uio *uio, int ioflag)
 {
 	struct cuse_client_command *pccmd;
 	struct cuse_client *pcc;
 	int error;
 	int len;
 
 	error = cuse_client_get(&pcc);
 	if (error != 0)
 		return (error);
 
 	pccmd = &pcc->cmds[CUSE_CMD_WRITE];
 
 	if (uio->uio_segflg != UIO_USERSPACE) {
 		return (EINVAL);
 	}
 	uio->uio_segflg = UIO_NOCOPY;
 
 	cuse_cmd_lock(pccmd);
 
 	while (uio->uio_resid != 0) {
 
 		if (uio->uio_iov->iov_len > CUSE_LENGTH_MAX) {
 			error = ENOMEM;
 			break;
 		}
 
 		len = uio->uio_iov->iov_len;
 
 		cuse_lock();
 		cuse_client_send_command_locked(pccmd,
 		    (uintptr_t)uio->uio_iov->iov_base,
 		    (unsigned long)(unsigned int)len, pcc->fflags, ioflag);
 
 		error = cuse_client_receive_command_locked(pccmd, 0, 0);
 		cuse_unlock();
 
 		if (error < 0) {
 			error = cuse_convert_error(error);
 			break;
 		} else if (error == len) {
 			error = uiomove(NULL, error, uio);
 			if (error)
 				break;
 		} else {
 			error = uiomove(NULL, error, uio);
 			break;
 		}
 	}
 	cuse_cmd_unlock(pccmd);
 
 	uio->uio_segflg = UIO_USERSPACE;/* restore segment flag */
 
 	if (error == EWOULDBLOCK)
 		cuse_client_kqfilter_poll(dev, pcc);
 
 	return (error);
 }
 
 int
 cuse_client_ioctl(struct cdev *dev, unsigned long cmd,
     caddr_t data, int fflag, struct thread *td)
 {
 	struct cuse_client_command *pccmd;
 	struct cuse_client *pcc;
 	int error;
 	int len;
 
 	error = cuse_client_get(&pcc);
 	if (error != 0)
 		return (error);
 
 	len = IOCPARM_LEN(cmd);
 	if (len > CUSE_BUFFER_MAX)
 		return (ENOMEM);
 
 	pccmd = &pcc->cmds[CUSE_CMD_IOCTL];
 
 	cuse_cmd_lock(pccmd);
 
 	if (cmd & (IOC_IN | IOC_VOID))
 		memcpy(pcc->ioctl_buffer, data, len);
 
 	/*
 	 * When the ioctl-length is zero drivers can pass information
 	 * through the data pointer of the ioctl. Make sure this information
 	 * is forwarded to the driver.
 	 */
 
 	cuse_lock();
 	cuse_client_send_command_locked(pccmd,
 	    (len == 0) ? *(long *)data : CUSE_BUF_MIN_PTR,
 	    (unsigned long)cmd, pcc->fflags,
 	    (fflag & O_NONBLOCK) ? IO_NDELAY : 0);
 
 	error = cuse_client_receive_command_locked(pccmd, data, len);
 	cuse_unlock();
 
 	if (error < 0) {
 		error = cuse_convert_error(error);
 	} else {
 		error = 0;
 	}
 
 	if (cmd & IOC_OUT)
 		memcpy(data, pcc->ioctl_buffer, len);
 
 	cuse_cmd_unlock(pccmd);
 
 	if (error == EWOULDBLOCK)
 		cuse_client_kqfilter_poll(dev, pcc);
 
 	return (error);
 }
 
 static int
 cuse_client_poll(struct cdev *dev, int events, struct thread *td)
 {
 	struct cuse_client_command *pccmd;
 	struct cuse_client *pcc;
 	unsigned long temp;
 	int error;
 	int revents;
 
 	error = cuse_client_get(&pcc);
 	if (error != 0)
 		goto pollnval;
 
 	temp = 0;
 
 	if (events & (POLLPRI | POLLIN | POLLRDNORM))
 		temp |= CUSE_POLL_READ;
 
 	if (events & (POLLOUT | POLLWRNORM))
 		temp |= CUSE_POLL_WRITE;
 
 	if (events & POLLHUP)
 		temp |= CUSE_POLL_ERROR;
 
 	pccmd = &pcc->cmds[CUSE_CMD_POLL];
 
 	cuse_cmd_lock(pccmd);
 
 	/* Need to selrecord() first to not loose any events. */
 	if (temp != 0 && td != NULL)
 		selrecord(td, &pcc->server->selinfo);
 
 	cuse_lock();
 	cuse_client_send_command_locked(pccmd,
 	    0, temp, pcc->fflags, IO_NDELAY);
 
 	error = cuse_client_receive_command_locked(pccmd, 0, 0);
 	cuse_unlock();
 
 	cuse_cmd_unlock(pccmd);
 
 	if (error < 0) {
 		goto pollnval;
 	} else {
 		revents = 0;
 		if (error & CUSE_POLL_READ)
 			revents |= (events & (POLLPRI | POLLIN | POLLRDNORM));
 		if (error & CUSE_POLL_WRITE)
 			revents |= (events & (POLLOUT | POLLWRNORM));
 		if (error & CUSE_POLL_ERROR)
 			revents |= (events & POLLHUP);
 	}
 	return (revents);
 
  pollnval:
 	/* XXX many clients don't understand POLLNVAL */
 	return (events & (POLLHUP | POLLPRI | POLLIN |
 	    POLLRDNORM | POLLOUT | POLLWRNORM));
 }
 
 static int
 cuse_client_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot, vm_memattr_t *memattr)
 {
 	uint32_t page_nr = offset / PAGE_SIZE;
 	uint32_t alloc_nr = page_nr / CUSE_ALLOC_PAGES_MAX;
 	struct cuse_memory *mem;
 	struct cuse_server *pcs;
 	struct cuse_client *pcc;
 	uint8_t *ptr;
 	int error;
 
 	if (alloc_nr >= CUSE_ALLOC_UNIT_MAX)
 		return (ENOMEM);
 
 	error = cuse_client_get(&pcc);
 	if (error != 0)
 		pcs = NULL;
 	else
 		pcs = pcc->server;
 
 	cuse_lock();
 	mem = &cuse_mem[alloc_nr];
 
 	/* try to enforce slight ownership */
 	if ((pcs != NULL) && (mem->owner != pcs)) {
 		cuse_unlock();
 		return (EINVAL);
 	}
 	if (mem->virtaddr == NULL) {
 		cuse_unlock();
 		return (ENOMEM);
 	}
 	if (mem->virtaddr == NBUSY) {
 		cuse_unlock();
 		return (ENOMEM);
 	}
 	page_nr %= CUSE_ALLOC_PAGES_MAX;
 
 	if (page_nr >= mem->page_count) {
 		cuse_unlock();
 		return (ENXIO);
 	}
 	ptr = mem->virtaddr + (page_nr * PAGE_SIZE);
 	cuse_unlock();
 
 	*paddr = vtophys(ptr);
 
 	return (0);
 }
 
 static void
 cuse_client_kqfilter_read_detach(struct knote *kn)
 {
 	struct cuse_client *pcc;
 
 	cuse_lock();
 	pcc = kn->kn_hook;
 	knlist_remove(&pcc->server->selinfo.si_note, kn, 1);
 	cuse_unlock();
 }
 
 static void
 cuse_client_kqfilter_write_detach(struct knote *kn)
 {
 	struct cuse_client *pcc;
 
 	cuse_lock();
 	pcc = kn->kn_hook;
 	knlist_remove(&pcc->server->selinfo.si_note, kn, 1);
 	cuse_unlock();
 }
 
 static int
 cuse_client_kqfilter_read_event(struct knote *kn, long hint)
 {
 	struct cuse_client *pcc;
 
 	mtx_assert(&cuse_mtx, MA_OWNED);
 
 	pcc = kn->kn_hook;
 	return ((pcc->cflags & CUSE_CLI_KNOTE_NEED_READ) ? 1 : 0);
 }
 
 static int
 cuse_client_kqfilter_write_event(struct knote *kn, long hint)
 {
 	struct cuse_client *pcc;
 
 	mtx_assert(&cuse_mtx, MA_OWNED);
 
 	pcc = kn->kn_hook;
 	return ((pcc->cflags & CUSE_CLI_KNOTE_NEED_WRITE) ? 1 : 0);
 }
 
 static int
 cuse_client_kqfilter(struct cdev *dev, struct knote *kn)
 {
 	struct cuse_client *pcc;
 	struct cuse_server *pcs;
 	int error;
 
 	error = cuse_client_get(&pcc);
 	if (error != 0)
 		return (error);
 
 	cuse_lock();
 	pcs = pcc->server;
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		pcc->cflags |= CUSE_CLI_KNOTE_HAS_READ;
 		kn->kn_hook = pcc;
 		kn->kn_fop = &cuse_client_kqfilter_read_ops;
 		knlist_add(&pcs->selinfo.si_note, kn, 1);
 		break;
 	case EVFILT_WRITE:
 		pcc->cflags |= CUSE_CLI_KNOTE_HAS_WRITE;
 		kn->kn_hook = pcc;
 		kn->kn_fop = &cuse_client_kqfilter_write_ops;
 		knlist_add(&pcs->selinfo.si_note, kn, 1);
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	cuse_unlock();
 
 	if (error == 0)
 		cuse_client_kqfilter_poll(dev, pcc);
 	return (error);
 }
Index: user/alc/PQ_LAUNDRY/sys/fs/msdosfs/denode.h
===================================================================
--- user/alc/PQ_LAUNDRY/sys/fs/msdosfs/denode.h	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/fs/msdosfs/denode.h	(revision 306283)
@@ -1,281 +1,280 @@
 /* $FreeBSD$ */
 /*	$NetBSD: denode.h,v 1.25 1997/11/17 15:36:28 ws Exp $	*/
 
 /*-
  * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
  * Copyright (C) 1994, 1995, 1997 TooLs GmbH.
  * All rights reserved.
  * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Written by Paul Popelka (paulp@uts.amdahl.com)
  *
  * You can do anything you want with this software, just don't say you wrote
  * it, and don't remove this notice.
  *
  * This software is provided "as is".
  *
  * The author supplies this software to be publicly redistributed on the
  * understanding that the author is not responsible for the correct
  * functioning of this software in any circumstances and is not liable for
  * any damages caused by this software.
  *
  * October 1992
  */
 
 /*
  * This is the pc filesystem specific portion of the vnode structure.
  *
  * To describe a file uniquely the de_dirclust, de_diroffset, and
  * de_StartCluster fields are used.
  *
  * de_dirclust contains the cluster number of the directory cluster
  *	containing the entry for a file or directory.
  * de_diroffset is the index into the cluster for the entry describing
  *	a file or directory.
  * de_StartCluster is the number of the first cluster of the file or directory.
  *
  * Now to describe the quirks of the pc filesystem.
  * - Clusters 0 and 1 are reserved.
  * - The first allocatable cluster is 2.
  * - The root directory is of fixed size and all blocks that make it up
  *   are contiguous.
  * - Cluster 0 refers to the root directory when it is found in the
  *   startcluster field of a directory entry that points to another directory.
  * - Cluster 0 implies a 0 length file when found in the start cluster field
  *   of a directory entry that points to a file.
  * - You can't use the cluster number 0 to derive the address of the root
  *   directory.
  * - Multiple directory entries can point to a directory. The entry in the
  *   parent directory points to a child directory.  Any directories in the
  *   child directory contain a ".." entry that points back to the parent.
  *   The child directory itself contains a "." entry that points to itself.
  * - The root directory does not contain a "." or ".." entry.
  * - Directory entries for directories are never changed once they are created
  *   (except when removed).  The size stays 0, and the last modification time
  *   is never changed.  This is because so many directory entries can point to
  *   the physical clusters that make up a directory.  It would lead to an
  *   update nightmare.
  * - The length field in a directory entry pointing to a directory contains 0
  *   (always).  The only way to find the end of a directory is to follow the
  *   cluster chain until the "last cluster" marker is found.
  *
  * My extensions to make this house of cards work.  These apply only to the in
  * memory copy of the directory entry.
  * - A reference count for each denode will be kept since dos doesn't keep such
  *   things.
  */
 
 /*
  * Internal pseudo-offset for (nonexistent) directory entry for the root
  * dir in the root dir
  */
 #define	MSDOSFSROOT_OFS	0x1fffffff
 
 /*
  * The fat cache structure. fc_fsrcn is the filesystem relative cluster
  * number that corresponds to the file relative cluster number in this
  * structure (fc_frcn).
  */
 struct fatcache {
 	u_long fc_frcn;		/* file relative cluster number */
 	u_long fc_fsrcn;	/* filesystem relative cluster number */
 };
 
 /*
  * The fat entry cache as it stands helps make extending files a "quick"
  * operation by avoiding having to scan the fat to discover the last
  * cluster of the file. The cache also helps sequential reads by
  * remembering the last cluster read from the file.  This also prevents us
  * from having to rescan the fat to find the next cluster to read.  This
  * cache is probably pretty worthless if a file is opened by multiple
  * processes.
  */
 #define	FC_SIZE		3	/* number of entries in the cache */
 #define	FC_LASTMAP	0	/* entry the last call to pcbmap() resolved
 				 * to */
 #define	FC_LASTFC	1	/* entry for the last cluster in the file */
 #define	FC_NEXTTOLASTFC	2	/* entry for a close to the last cluster in
 				 * the file */
 
 #define	FCE_EMPTY	0xffffffff	/* doesn't represent an actual cluster # */
 
 /*
  * Set a slot in the fat cache.
  */
 #define	fc_setcache(dep, slot, frcn, fsrcn) \
 	(dep)->de_fc[(slot)].fc_frcn = (frcn); \
 	(dep)->de_fc[(slot)].fc_fsrcn = (fsrcn);
 
 /*
  * This is the in memory variant of a dos directory entry.  It is usually
  * contained within a vnode.
  */
 struct denode {
 	struct vnode *de_vnode;	/* addr of vnode we are part of */
 	u_long de_flag;		/* flag bits */
 	u_long de_dirclust;	/* cluster of the directory file containing this entry */
 	u_long de_diroffset;	/* offset of this entry in the directory cluster */
 	u_long de_fndoffset;	/* offset of found dir entry */
 	int de_fndcnt;		/* number of slots before de_fndoffset */
 	long de_refcnt;		/* reference count */
 	struct msdosfsmount *de_pmp;	/* addr of our mount struct */
 	u_char de_Name[12];	/* name, from DOS directory entry */
 	u_char de_Attributes;	/* attributes, from directory entry */
 	u_char de_LowerCase;	/* NT VFAT lower case flags */
 	u_char de_CHun;		/* Hundredth of second of CTime*/
 	u_short de_CTime;	/* creation time */
 	u_short de_CDate;	/* creation date */
 	u_short de_ADate;	/* access date */
 	u_short de_MTime;	/* modification time */
 	u_short de_MDate;	/* modification date */
 	u_long de_StartCluster; /* starting cluster of file */
 	u_long de_FileSize;	/* size of file in bytes */
 	struct fatcache de_fc[FC_SIZE];	/* fat cache */
 	u_quad_t de_modrev;	/* Revision level for lease. */
 	u_int64_t de_inode;	/* Inode number (really byte offset of direntry) */
 };
 
 /*
  * Values for the de_flag field of the denode.
  */
 #define	DE_UPDATE	0x0004	/* Modification time update request */
 #define	DE_CREATE	0x0008	/* Creation time update */
 #define	DE_ACCESS	0x0010	/* Access time update */
 #define	DE_MODIFIED	0x0020	/* Denode has been modified */
 #define	DE_RENAME	0x0040	/* Denode is in the process of being renamed */
 
 
 /*
  * Transfer directory entries between internal and external form.
  * dep is a struct denode * (internal form),
  * dp is a struct direntry * (external form).
  */
 #define DE_INTERNALIZE32(dep, dp)			\
 	 ((dep)->de_StartCluster |= getushort((dp)->deHighClust) << 16)
 #define DE_INTERNALIZE(dep, dp)				\
 	(bcopy((dp)->deName, (dep)->de_Name, 11),	\
 	 (dep)->de_Attributes = (dp)->deAttributes,	\
 	 (dep)->de_LowerCase = (dp)->deLowerCase,	\
 	 (dep)->de_CHun = (dp)->deCHundredth,		\
 	 (dep)->de_CTime = getushort((dp)->deCTime),	\
 	 (dep)->de_CDate = getushort((dp)->deCDate),	\
 	 (dep)->de_ADate = getushort((dp)->deADate),	\
 	 (dep)->de_MTime = getushort((dp)->deMTime),	\
 	 (dep)->de_MDate = getushort((dp)->deMDate),	\
 	 (dep)->de_StartCluster = getushort((dp)->deStartCluster), \
 	 (dep)->de_FileSize = getulong((dp)->deFileSize), \
 	 (FAT32((dep)->de_pmp) ? DE_INTERNALIZE32((dep), (dp)) : 0))
 
 #define DE_EXTERNALIZE(dp, dep)				\
 	(bcopy((dep)->de_Name, (dp)->deName, 11),	\
 	 (dp)->deAttributes = (dep)->de_Attributes,	\
 	 (dp)->deLowerCase = (dep)->de_LowerCase,	\
 	 (dp)->deCHundredth = (dep)->de_CHun,		\
 	 putushort((dp)->deCTime, (dep)->de_CTime),	\
 	 putushort((dp)->deCDate, (dep)->de_CDate),	\
 	 putushort((dp)->deADate, (dep)->de_ADate),	\
 	 putushort((dp)->deMTime, (dep)->de_MTime),	\
 	 putushort((dp)->deMDate, (dep)->de_MDate),	\
 	 putushort((dp)->deStartCluster, (dep)->de_StartCluster), \
 	 putulong((dp)->deFileSize,			\
 	     ((dep)->de_Attributes & ATTR_DIRECTORY) ? 0 : (dep)->de_FileSize), \
 	 putushort((dp)->deHighClust, (dep)->de_StartCluster >> 16))
 
 #ifdef _KERNEL
 
 #define	VTODE(vp)	((struct denode *)(vp)->v_data)
 #define	DETOV(de)	((de)->de_vnode)
 
 #define	DETIMES(dep, acc, mod, cre) do {				\
 	if ((dep)->de_flag & DE_UPDATE) {				\
 		(dep)->de_flag |= DE_MODIFIED;				\
 		timespec2fattime((mod), 0, &(dep)->de_MDate,		\
 		    &(dep)->de_MTime, NULL);				\
 		(dep)->de_Attributes |= ATTR_ARCHIVE;			\
 	}								\
 	if ((dep)->de_pmp->pm_flags & MSDOSFSMNT_NOWIN95) {		\
 		(dep)->de_flag &= ~(DE_UPDATE | DE_CREATE | DE_ACCESS);	\
 		break;							\
 	}								\
 	if ((dep)->de_flag & DE_ACCESS) {				\
 		u_int16_t adate;					\
 									\
 		timespec2fattime((acc), 0, &adate, NULL, NULL);		\
 		if (adate != (dep)->de_ADate) {				\
 			(dep)->de_flag |= DE_MODIFIED;			\
 			(dep)->de_ADate = adate;			\
 		}							\
 	}								\
 	if ((dep)->de_flag & DE_CREATE) {				\
 		timespec2fattime((cre), 0, &(dep)->de_CDate,		\
 		    &(dep)->de_CTime, &(dep)->de_CHun);			\
 		(dep)->de_flag |= DE_MODIFIED;				\
 	}								\
 	(dep)->de_flag &= ~(DE_UPDATE | DE_CREATE | DE_ACCESS);		\
 } while (0)
 
 /*
  * This overlays the fid structure (see mount.h)
  */
 struct defid {
 	u_short defid_len;	/* length of structure */
 	u_short defid_pad;	/* force long alignment */
 
 	u_int32_t defid_dirclust; /* cluster this dir entry came from */
 	u_int32_t defid_dirofs;	/* offset of entry within the cluster */
 #if 0
 	u_int32_t defid_gen;	/* generation number */
 #endif
 };
 
 extern struct vop_vector msdosfs_vnodeops;
 
 int msdosfs_lookup(struct vop_cachedlookup_args *);
 int msdosfs_inactive(struct vop_inactive_args *);
 int msdosfs_reclaim(struct vop_reclaim_args *);
 
 /*
  * Internal service routine prototypes.
  */
 int deget(struct msdosfsmount *, u_long, u_long, struct denode **);
 int uniqdosname(struct denode *, struct componentname *, u_char *);
-int findwin95(struct denode *);
 
 int readep(struct msdosfsmount *pmp, u_long dirclu, u_long dirofs,  struct buf **bpp, struct direntry **epp);
 int readde(struct denode *dep, struct buf **bpp, struct direntry **epp);
 int deextend(struct denode *dep, u_long length, struct ucred *cred);
 int fillinusemap(struct msdosfsmount *pmp);
 void reinsert(struct denode *dep);
 int dosdirempty(struct denode *dep);
 int createde(struct denode *dep, struct denode *ddep, struct denode **depp, struct componentname *cnp);
 int deupdat(struct denode *dep, int waitfor);
 int removede(struct denode *pdep, struct denode *dep);
 int detrunc(struct denode *dep, u_long length, int flags, struct ucred *cred);
 int doscheckpath( struct denode *source, struct denode *target);
 #endif	/* _KERNEL */
Index: user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_lookup.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_lookup.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_lookup.c	(revision 306283)
@@ -1,1116 +1,1064 @@
 /* $FreeBSD$ */
 /*	$NetBSD: msdosfs_lookup.c,v 1.37 1997/11/17 15:36:54 ws Exp $	*/
 
 /*-
  * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
  * Copyright (C) 1994, 1995, 1997 TooLs GmbH.
  * All rights reserved.
  * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Written by Paul Popelka (paulp@uts.amdahl.com)
  *
  * You can do anything you want with this software, just don't say you wrote
  * it, and don't remove this notice.
  *
  * This software is provided "as is".
  *
  * The author supplies this software to be publicly redistributed on the
  * understanding that the author is not responsible for the correct
  * functioning of this software in any circumstances and is not liable for
  * any damages caused by this software.
  *
  * October 1992
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
 
 #include <fs/msdosfs/bpb.h>
 #include <fs/msdosfs/direntry.h>
 #include <fs/msdosfs/denode.h>
 #include <fs/msdosfs/fat.h>
 #include <fs/msdosfs/msdosfsmount.h>
 
 static int msdosfs_lookup_(struct vnode *vdp, struct vnode **vpp,
     struct componentname *cnp, u_int64_t *inum);
 
 int
 msdosfs_lookup(struct vop_cachedlookup_args *ap)
 {
 
 	return (msdosfs_lookup_(ap->a_dvp, ap->a_vpp, ap->a_cnp, NULL));
 }
 
 struct deget_dotdot {
 	u_long cluster;
 	int blkoff;
 };
 
 static int
 msdosfs_deget_dotdot(struct mount *mp, void *arg, int lkflags,
     struct vnode **rvp)
 {
 	struct deget_dotdot *dd_arg;
 	struct denode *rdp;
 	struct msdosfsmount *pmp;
 	int error;
 
 	pmp = VFSTOMSDOSFS(mp);
 	dd_arg = arg;
 	error = deget(pmp, dd_arg->cluster, dd_arg->blkoff,  &rdp);
 	if (error == 0)
 		*rvp = DETOV(rdp);
 	return (error);
 }
 
 /*
  * When we search a directory the blocks containing directory entries are
  * read and examined.  The directory entries contain information that would
  * normally be in the inode of a unix filesystem.  This means that some of
  * a directory's contents may also be in memory resident denodes (sort of
  * an inode).  This can cause problems if we are searching while some other
  * process is modifying a directory.  To prevent one process from accessing
  * incompletely modified directory information we depend upon being the
  * sole owner of a directory block.  bread/brelse provide this service.
  * This being the case, when a process modifies a directory it must first
  * acquire the disk block that contains the directory entry to be modified.
  * Then update the disk block and the denode, and then write the disk block
  * out to disk.  This way disk blocks containing directory entries and in
  * memory denode's will be in synch.
  */
 static int
 msdosfs_lookup_(struct vnode *vdp, struct vnode **vpp,
     struct componentname *cnp, u_int64_t *dd_inum)
 {
 	struct mbnambuf nb;
 	daddr_t bn;
 	int error;
 	int slotcount;
 	int slotoffset = 0;
 	int frcn;
 	u_long cluster;
 	int blkoff;
 	int diroff;
 	int blsize;
 	int isadir;		/* ~0 if found direntry is a directory	 */
 	u_long scn;		/* starting cluster number		 */
 	struct vnode *pdp;
 	struct denode *dp;
 	struct denode *tdp;
 	struct msdosfsmount *pmp;
 	struct buf *bp = NULL;
 	struct direntry *dep = NULL;
 	struct deget_dotdot dd_arg;
 	u_char dosfilename[12];
 	int flags = cnp->cn_flags;
 	int nameiop = cnp->cn_nameiop;
 	int unlen;
 	u_int64_t inode1;
 
 	int wincnt = 1;
 	int chksum = -1, chksum_ok;
 	int olddos = 1;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_lookup(): looking for %s\n", cnp->cn_nameptr);
 #endif
 	dp = VTODE(vdp);
 	pmp = dp->de_pmp;
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_lookup(): vdp %p, dp %p, Attr %02x\n",
 	    vdp, dp, dp->de_Attributes);
 #endif
 
  restart:
 	if (vpp != NULL)
 		*vpp = NULL;
 	/*
 	 * If they are going after the . or .. entry in the root directory,
 	 * they won't find it.  DOS filesystems don't have them in the root
 	 * directory.  So, we fake it. deget() is in on this scam too.
 	 */
 	if ((vdp->v_vflag & VV_ROOT) && cnp->cn_nameptr[0] == '.' &&
 	    (cnp->cn_namelen == 1 ||
 		(cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.'))) {
 		isadir = ATTR_DIRECTORY;
 		scn = MSDOSFSROOT;
 #ifdef MSDOSFS_DEBUG
 		printf("msdosfs_lookup(): looking for . or .. in root directory\n");
 #endif
 		cluster = MSDOSFSROOT;
 		blkoff = MSDOSFSROOT_OFS;
 		goto foundroot;
 	}
 
 	switch (unix2dosfn((const u_char *)cnp->cn_nameptr, dosfilename,
 	    cnp->cn_namelen, 0, pmp)) {
 	case 0:
 		return (EINVAL);
 	case 1:
 		break;
 	case 2:
 		wincnt = winSlotCnt((const u_char *)cnp->cn_nameptr,
 		    cnp->cn_namelen, pmp) + 1;
 		break;
 	case 3:
 		olddos = 0;
 		wincnt = winSlotCnt((const u_char *)cnp->cn_nameptr,
 		    cnp->cn_namelen, pmp) + 1;
 		break;
 	}
 	if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME) {
 		wincnt = 1;
 		olddos = 1;
 	}
 	unlen = winLenFixup(cnp->cn_nameptr, cnp->cn_namelen);
 
 	/*
 	 * Suppress search for slots unless creating
 	 * file and at end of pathname, in which case
 	 * we watch for a place to put the new file in
 	 * case it doesn't already exist.
 	 */
 	slotcount = wincnt;
 	if ((nameiop == CREATE || nameiop == RENAME) &&
 	    (flags & ISLASTCN))
 		slotcount = 0;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_lookup(): dos version of filename %s, length %ld\n",
 	    dosfilename, cnp->cn_namelen);
 #endif
 	/*
 	 * Search the directory pointed at by vdp for the name pointed at
 	 * by cnp->cn_nameptr.
 	 */
 	tdp = NULL;
 	mbnambuf_init(&nb);
 	/*
 	 * The outer loop ranges over the clusters that make up the
 	 * directory.  Note that the root directory is different from all
 	 * other directories.  It has a fixed number of blocks that are not
 	 * part of the pool of allocatable clusters.  So, we treat it a
 	 * little differently. The root directory starts at "cluster" 0.
 	 */
 	diroff = 0;
 	for (frcn = 0;; frcn++) {
 		error = pcbmap(dp, frcn, &bn, &cluster, &blsize);
 		if (error) {
 			if (error == E2BIG)
 				break;
 			return (error);
 		}
 		error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			return (error);
 		}
 		for (blkoff = 0; blkoff < blsize;
 		     blkoff += sizeof(struct direntry),
 		     diroff += sizeof(struct direntry)) {
 			dep = (struct direntry *)(bp->b_data + blkoff);
 			/*
 			 * If the slot is empty and we are still looking
 			 * for an empty then remember this one.  If the
 			 * slot is not empty then check to see if it
 			 * matches what we are looking for.  If the slot
 			 * has never been filled with anything, then the
 			 * remainder of the directory has never been used,
 			 * so there is no point in searching it.
 			 */
 			if (dep->deName[0] == SLOT_EMPTY ||
 			    dep->deName[0] == SLOT_DELETED) {
 				/*
 				 * Drop memory of previous long matches
 				 */
 				chksum = -1;
 				mbnambuf_init(&nb);
 
 				if (slotcount < wincnt) {
 					slotcount++;
 					slotoffset = diroff;
 				}
 				if (dep->deName[0] == SLOT_EMPTY) {
 					brelse(bp);
 					goto notfound;
 				}
 			} else {
 				/*
 				 * If there wasn't enough space for our winentries,
 				 * forget about the empty space
 				 */
 				if (slotcount < wincnt)
 					slotcount = 0;
 
 				/*
 				 * Check for Win95 long filename entry
 				 */
 				if (dep->deAttributes == ATTR_WIN95) {
 					if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME)
 						continue;
 
 					chksum = win2unixfn(&nb,
 					    (struct winentry *)dep, chksum,
 					    pmp);
 					continue;
 				}
 
 				chksum = winChkName(&nb,
 				    (const u_char *)cnp->cn_nameptr, unlen,
 				    chksum, pmp);
 				if (chksum == -2) {
 					chksum = -1;
 					continue;
 				}
 
 				/*
 				 * Ignore volume labels (anywhere, not just
 				 * the root directory).
 				 */
 				if (dep->deAttributes & ATTR_VOLUME) {
 					chksum = -1;
 					continue;
 				}
 
 				/*
 				 * Check for a checksum or name match
 				 */
 				chksum_ok = (chksum == winChksum(dep->deName));
 				if (!chksum_ok
 				    && (!olddos || bcmp(dosfilename, dep->deName, 11))) {
 					chksum = -1;
 					continue;
 				}
 #ifdef MSDOSFS_DEBUG
 				printf("msdosfs_lookup(): match blkoff %d, diroff %d\n",
 				    blkoff, diroff);
 #endif
 				/*
 				 * Remember where this directory
 				 * entry came from for whoever did
 				 * this lookup.
 				 */
 				dp->de_fndoffset = diroff;
 				if (chksum_ok && nameiop == RENAME) {
 					/*
 					 * Target had correct long name
 					 * directory entries, reuse them
 					 * as needed.
 					 */
 					dp->de_fndcnt = wincnt - 1;
 				} else {
 					/*
 					 * Long name directory entries
 					 * not present or corrupt, can only
 					 * reuse dos directory entry.
 					 */
 					dp->de_fndcnt = 0;
 				}
 
 				goto found;
 			}
 		}	/* for (blkoff = 0; .... */
 		/*
 		 * Release the buffer holding the directory cluster just
 		 * searched.
 		 */
 		brelse(bp);
 	}	/* for (frcn = 0; ; frcn++) */
 
 notfound:
 	/*
 	 * We hold no disk buffers at this point.
 	 */
 
 	/*
 	 * Fixup the slot description to point to the place where
 	 * we might put the new DOS direntry (putting the Win95
 	 * long name entries before that)
 	 */
 	if (!slotcount) {
 		slotcount = 1;
 		slotoffset = diroff;
 	}
 	if (wincnt > slotcount)
 		slotoffset += sizeof(struct direntry) * (wincnt - slotcount);
 
 	/*
 	 * If we get here we didn't find the entry we were looking for. But
 	 * that's ok if we are creating or renaming and are at the end of
 	 * the pathname and the directory hasn't been removed.
 	 */
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_lookup(): op %d, refcnt %ld\n",
 	    nameiop, dp->de_refcnt);
 	printf("               slotcount %d, slotoffset %d\n",
 	       slotcount, slotoffset);
 #endif
 	if ((nameiop == CREATE || nameiop == RENAME) &&
 	    (flags & ISLASTCN) && dp->de_refcnt != 0) {
 		/*
 		 * Access for write is interpreted as allowing
 		 * creation of files in the directory.
 		 */
 		error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			return (error);
 		/*
 		 * Return an indication of where the new directory
 		 * entry should be put.
 		 */
 		dp->de_fndoffset = slotoffset;
 		dp->de_fndcnt = wincnt - 1;
 
 		/*
 		 * We return with the directory locked, so that
 		 * the parameters we set up above will still be
 		 * valid if we actually decide to do a direnter().
 		 * We return ni_vp == NULL to indicate that the entry
 		 * does not currently exist; we leave a pointer to
 		 * the (locked) directory inode in ndp->ni_dvp.
 		 * The pathname buffer is saved so that the name
 		 * can be obtained later.
 		 *
 		 * NB - if the directory is unlocked, then this
 		 * information cannot be used.
 		 */
 		cnp->cn_flags |= SAVENAME;
 		return (EJUSTRETURN);
 	}
 #if 0
 	/*
 	 * Insert name into cache (as non-existent) if appropriate.
 	 *
 	 * XXX Negative caching is broken for msdosfs because the name
 	 * cache doesn't understand peculiarities such as case insensitivity
 	 * and 8.3 filenames.  Hence, it may not invalidate all negative
 	 * entries if a file with this name is later created.
 	 */
 	if ((cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(vdp, *vpp, cnp);
 #endif
 	return (ENOENT);
 
 found:
 	/*
 	 * NOTE:  We still have the buffer with matched directory entry at
 	 * this point.
 	 */
 	isadir = dep->deAttributes & ATTR_DIRECTORY;
 	scn = getushort(dep->deStartCluster);
 	if (FAT32(pmp)) {
 		scn |= getushort(dep->deHighClust) << 16;
 		if (scn == pmp->pm_rootdirblk) {
 			/*
 			 * There should actually be 0 here.
 			 * Just ignore the error.
 			 */
 			scn = MSDOSFSROOT;
 		}
 	}
 
 	if (isadir) {
 		cluster = scn;
 		if (cluster == MSDOSFSROOT)
 			blkoff = MSDOSFSROOT_OFS;
 		else
 			blkoff = 0;
 	} else if (cluster == MSDOSFSROOT)
 		blkoff = diroff;
 
 	/*
 	 * Now release buf to allow deget to read the entry again.
 	 * Reserving it here and giving it to deget could result
 	 * in a deadlock.
 	 */
 	brelse(bp);
 	bp = NULL;
 	
 foundroot:
 	/*
 	 * If we entered at foundroot, then we are looking for the . or ..
 	 * entry of the filesystems root directory.  isadir and scn were
 	 * setup before jumping here.  And, bp is already null.
 	 */
 	if (FAT32(pmp) && scn == MSDOSFSROOT)
 		scn = pmp->pm_rootdirblk;
 
 	if (dd_inum != NULL) {
 		*dd_inum = (uint64_t)pmp->pm_bpcluster * scn + blkoff;
 		return (0);
 	}
 
 	/*
 	 * If deleting, and at end of pathname, return
 	 * parameters which can be used to remove file.
 	 */
 	if (nameiop == DELETE && (flags & ISLASTCN)) {
 		/*
 		 * Don't allow deleting the root.
 		 */
 		if (blkoff == MSDOSFSROOT_OFS)
 			return (EBUSY);
 
 		/*
 		 * Write access to directory required to delete files.
 		 */
 		error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			return (error);
 
 		/*
 		 * Return pointer to current entry in dp->i_offset.
 		 * Save directory inode pointer in ndp->ni_dvp for dirremove().
 		 */
 		if (dp->de_StartCluster == scn && isadir) {	/* "." */
 			VREF(vdp);
 			*vpp = vdp;
 			return (0);
 		}
 		error = deget(pmp, cluster, blkoff, &tdp);
 		if (error)
 			return (error);
 		*vpp = DETOV(tdp);
 		return (0);
 	}
 
 	/*
 	 * If rewriting (RENAME), return the inode and the
 	 * information required to rewrite the present directory
 	 * Must get inode of directory entry to verify it's a
 	 * regular file, or empty directory.
 	 */
 	if (nameiop == RENAME && (flags & ISLASTCN)) {
 		if (blkoff == MSDOSFSROOT_OFS)
 			return (EBUSY);
 
 		error = VOP_ACCESS(vdp, VWRITE, cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			return (error);
 
 		/*
 		 * Careful about locking second inode.
 		 * This can only occur if the target is ".".
 		 */
 		if (dp->de_StartCluster == scn && isadir)
 			return (EISDIR);
 
 		if ((error = deget(pmp, cluster, blkoff, &tdp)) != 0)
 			return (error);
 		*vpp = DETOV(tdp);
 		cnp->cn_flags |= SAVENAME;
 		return (0);
 	}
 
 	/*
 	 * Step through the translation in the name.  We do not `vput' the
 	 * directory because we may need it again if a symbolic link
 	 * is relative to the current directory.  Instead we save it
 	 * unlocked as "pdp".  We must get the target inode before unlocking
 	 * the directory to insure that the inode will not be removed
 	 * before we get it.  We prevent deadlock by always fetching
 	 * inodes from the root, moving down the directory tree. Thus
 	 * when following backward pointers ".." we must unlock the
 	 * parent directory before getting the requested directory.
 	 */
 	pdp = vdp;
 	if (flags & ISDOTDOT) {
 		dd_arg.cluster = cluster;
 		dd_arg.blkoff = blkoff;
 		error = vn_vget_ino_gen(vdp, msdosfs_deget_dotdot,
 		    &dd_arg, cnp->cn_lkflags, vpp);
 		if (error != 0) {
 			*vpp = NULL;
 			return (error);
 		}
 		/*
 		 * Recheck that ".." still points to the inode we
 		 * looked up before pdp lock was dropped.
 		 */
 		error = msdosfs_lookup_(pdp, NULL, cnp, &inode1);
 		if (error) {
 			vput(*vpp);
 			*vpp = NULL;
 			return (error);
 		}
 		if (VTODE(*vpp)->de_inode != inode1) {
 			vput(*vpp);
 			goto restart;
 		}
 	} else if (dp->de_StartCluster == scn && isadir) {
 		VREF(vdp);	/* we want ourself, ie "." */
 		*vpp = vdp;
 	} else {
 		if ((error = deget(pmp, cluster, blkoff, &tdp)) != 0)
 			return (error);
 		*vpp = DETOV(tdp);
 	}
 
 	/*
 	 * Insert name into cache if appropriate.
 	 */
 	if (cnp->cn_flags & MAKEENTRY)
 		cache_enter(vdp, *vpp, cnp);
 	return (0);
 }
 
 /*
  * dep  - directory entry to copy into the directory
  * ddep - directory to add to
  * depp - return the address of the denode for the created directory entry
  *	  if depp != 0
  * cnp  - componentname needed for Win95 long filenames
  */
 int
 createde(struct denode *dep, struct denode *ddep, struct denode **depp,
     struct componentname *cnp)
 {
 	int error;
 	u_long dirclust, diroffset;
 	struct direntry *ndep;
 	struct msdosfsmount *pmp = ddep->de_pmp;
 	struct buf *bp;
 	daddr_t bn;
 	int blsize;
 
 #ifdef MSDOSFS_DEBUG
 	printf("createde(dep %p, ddep %p, depp %p, cnp %p)\n",
 	    dep, ddep, depp, cnp);
 #endif
 
 	/*
 	 * If no space left in the directory then allocate another cluster
 	 * and chain it onto the end of the file.  There is one exception
 	 * to this.  That is, if the root directory has no more space it
 	 * can NOT be expanded.  extendfile() checks for and fails attempts
 	 * to extend the root directory.  We just return an error in that
 	 * case.
 	 */
 	if (ddep->de_fndoffset >= ddep->de_FileSize) {
 		diroffset = ddep->de_fndoffset + sizeof(struct direntry)
 		    - ddep->de_FileSize;
 		dirclust = de_clcount(pmp, diroffset);
 		error = extendfile(ddep, dirclust, 0, 0, DE_CLEAR);
 		if (error) {
 			(void)detrunc(ddep, ddep->de_FileSize, 0, NOCRED);
 			return error;
 		}
 
 		/*
 		 * Update the size of the directory
 		 */
 		ddep->de_FileSize += de_cn2off(pmp, dirclust);
 	}
 
 	/*
 	 * We just read in the cluster with space.  Copy the new directory
 	 * entry in.  Then write it to disk. NOTE:  DOS directories
 	 * do not get smaller as clusters are emptied.
 	 */
 	error = pcbmap(ddep, de_cluster(pmp, ddep->de_fndoffset),
 		       &bn, &dirclust, &blsize);
 	if (error)
 		return error;
 	diroffset = ddep->de_fndoffset;
 	if (dirclust != MSDOSFSROOT)
 		diroffset &= pmp->pm_crbomask;
 	if ((error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp)) != 0) {
 		brelse(bp);
 		return error;
 	}
 	ndep = bptoep(pmp, bp, ddep->de_fndoffset);
 
 	DE_EXTERNALIZE(ndep, dep);
 
 	/*
 	 * Now write the Win95 long name
 	 */
 	if (ddep->de_fndcnt > 0) {
 		u_int8_t chksum = winChksum(ndep->deName);
 		const u_char *un = (const u_char *)cnp->cn_nameptr;
 		int unlen = cnp->cn_namelen;
 		int cnt = 1;
 
 		while (--ddep->de_fndcnt >= 0) {
 			if (!(ddep->de_fndoffset & pmp->pm_crbomask)) {
 				if (DOINGASYNC(DETOV(ddep)))
 					bdwrite(bp);
 				else if ((error = bwrite(bp)) != 0)
 					return error;
 
 				ddep->de_fndoffset -= sizeof(struct direntry);
 				error = pcbmap(ddep,
 					       de_cluster(pmp,
 							  ddep->de_fndoffset),
 					       &bn, 0, &blsize);
 				if (error)
 					return error;
 
 				error = bread(pmp->pm_devvp, bn, blsize,
 					      NOCRED, &bp);
 				if (error) {
 					brelse(bp);
 					return error;
 				}
 				ndep = bptoep(pmp, bp, ddep->de_fndoffset);
 			} else {
 				ndep--;
 				ddep->de_fndoffset -= sizeof(struct direntry);
 			}
 			if (!unix2winfn(un, unlen, (struct winentry *)ndep,
 					cnt++, chksum, pmp))
 				break;
 		}
 	}
 
 	if (DOINGASYNC(DETOV(ddep)))
 		bdwrite(bp);
 	else if ((error = bwrite(bp)) != 0)
 		return error;
 
 	/*
 	 * If they want us to return with the denode gotten.
 	 */
 	if (depp) {
 		if (dep->de_Attributes & ATTR_DIRECTORY) {
 			dirclust = dep->de_StartCluster;
 			if (FAT32(pmp) && dirclust == pmp->pm_rootdirblk)
 				dirclust = MSDOSFSROOT;
 			if (dirclust == MSDOSFSROOT)
 				diroffset = MSDOSFSROOT_OFS;
 			else
 				diroffset = 0;
 		}
 		return deget(pmp, dirclust, diroffset, depp);
 	}
 
 	return 0;
 }
 
 /*
  * Be sure a directory is empty except for "." and "..". Return 1 if empty,
  * return 0 if not empty or error.
  */
 int
 dosdirempty(struct denode *dep)
 {
 	int blsize;
 	int error;
 	u_long cn;
 	daddr_t bn;
 	struct buf *bp;
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct direntry *dentp;
 
 	/*
 	 * Since the filesize field in directory entries for a directory is
 	 * zero, we just have to feel our way through the directory until
 	 * we hit end of file.
 	 */
 	for (cn = 0;; cn++) {
 		if ((error = pcbmap(dep, cn, &bn, 0, &blsize)) != 0) {
 			if (error == E2BIG)
 				return (1);	/* it's empty */
 			return (0);
 		}
 		error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			return (0);
 		}
 		for (dentp = (struct direntry *)bp->b_data;
 		     (char *)dentp < bp->b_data + blsize;
 		     dentp++) {
 			if (dentp->deName[0] != SLOT_DELETED &&
 			    (dentp->deAttributes & ATTR_VOLUME) == 0) {
 				/*
 				 * In dos directories an entry whose name
 				 * starts with SLOT_EMPTY (0) starts the
 				 * beginning of the unused part of the
 				 * directory, so we can just return that it
 				 * is empty.
 				 */
 				if (dentp->deName[0] == SLOT_EMPTY) {
 					brelse(bp);
 					return (1);
 				}
 				/*
 				 * Any names other than "." and ".." in a
 				 * directory mean it is not empty.
 				 */
 				if (bcmp(dentp->deName, ".          ", 11) &&
 				    bcmp(dentp->deName, "..         ", 11)) {
 					brelse(bp);
 #ifdef MSDOSFS_DEBUG
 					printf("dosdirempty(): entry found %02x, %02x\n",
 					    dentp->deName[0], dentp->deName[1]);
 #endif
 					return (0);	/* not empty */
 				}
 			}
 		}
 		brelse(bp);
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Check to see if the directory described by target is in some
  * subdirectory of source.  This prevents something like the following from
  * succeeding and leaving a bunch or files and directories orphaned. mv
  * /a/b/c /a/b/c/d/e/f Where c and f are directories.
  *
  * source - the inode for /a/b/c
  * target - the inode for /a/b/c/d/e/f
  *
  * Returns 0 if target is NOT a subdirectory of source.
  * Otherwise returns a non-zero error number.
  * The target inode is always unlocked on return.
  */
 int
 doscheckpath(struct denode *source, struct denode *target)
 {
 	daddr_t scn;
 	struct msdosfsmount *pmp;
 	struct direntry *ep;
 	struct denode *dep;
 	struct buf *bp = NULL;
 	int error = 0;
 
 	dep = target;
 	if ((target->de_Attributes & ATTR_DIRECTORY) == 0 ||
 	    (source->de_Attributes & ATTR_DIRECTORY) == 0) {
 		error = ENOTDIR;
 		goto out;
 	}
 	if (dep->de_StartCluster == source->de_StartCluster) {
 		error = EEXIST;
 		goto out;
 	}
 	if (dep->de_StartCluster == MSDOSFSROOT)
 		goto out;
 	pmp = dep->de_pmp;
 #ifdef	DIAGNOSTIC
 	if (pmp != source->de_pmp)
 		panic("doscheckpath: source and target on different filesystems");
 #endif
 	if (FAT32(pmp) && dep->de_StartCluster == pmp->pm_rootdirblk)
 		goto out;
 
 	for (;;) {
 		if ((dep->de_Attributes & ATTR_DIRECTORY) == 0) {
 			error = ENOTDIR;
 			break;
 		}
 		scn = dep->de_StartCluster;
 		error = bread(pmp->pm_devvp, cntobn(pmp, scn),
 			      pmp->pm_bpcluster, NOCRED, &bp);
 		if (error)
 			break;
 
 		ep = (struct direntry *) bp->b_data + 1;
 		if ((ep->deAttributes & ATTR_DIRECTORY) == 0 ||
 		    bcmp(ep->deName, "..         ", 11) != 0) {
 			error = ENOTDIR;
 			break;
 		}
 		scn = getushort(ep->deStartCluster);
 		if (FAT32(pmp))
 			scn |= getushort(ep->deHighClust) << 16;
 
 		if (scn == source->de_StartCluster) {
 			error = EINVAL;
 			break;
 		}
 		if (scn == MSDOSFSROOT)
 			break;
 		if (FAT32(pmp) && scn == pmp->pm_rootdirblk) {
 			/*
 			 * scn should be 0 in this case,
 			 * but we silently ignore the error.
 			 */
 			break;
 		}
 
 		vput(DETOV(dep));
 		brelse(bp);
 		bp = NULL;
 		/* NOTE: deget() clears dep on error */
 		if ((error = deget(pmp, scn, 0, &dep)) != 0)
 			break;
 	}
 out:;
 	if (bp)
 		brelse(bp);
 #ifdef MSDOSFS_DEBUG
 	if (error == ENOTDIR)
 		printf("doscheckpath(): .. not a directory?\n");
 #endif
 	if (dep != NULL)
 		vput(DETOV(dep));
 	return (error);
 }
 
 /*
  * Read in the disk block containing the directory entry (dirclu, dirofs)
  * and return the address of the buf header, and the address of the
  * directory entry within the block.
  */
 int
 readep(struct msdosfsmount *pmp, u_long dirclust, u_long diroffset,
     struct buf **bpp, struct direntry **epp)
 {
 	int error;
 	daddr_t bn;
 	int blsize;
 
 	blsize = pmp->pm_bpcluster;
 	if (dirclust == MSDOSFSROOT
 	    && de_blk(pmp, diroffset + blsize) > pmp->pm_rootdirsize)
 		blsize = de_bn2off(pmp, pmp->pm_rootdirsize) & pmp->pm_crbomask;
 	bn = detobn(pmp, dirclust, diroffset);
 	if ((error = bread(pmp->pm_devvp, bn, blsize, NOCRED, bpp)) != 0) {
 		brelse(*bpp);
 		*bpp = NULL;
 		return (error);
 	}
 	if (epp)
 		*epp = bptoep(pmp, *bpp, diroffset);
 	return (0);
 }
 
 /*
  * Read in the disk block containing the directory entry dep came from and
  * return the address of the buf header, and the address of the directory
  * entry within the block.
  */
 int
 readde(struct denode *dep, struct buf **bpp, struct direntry **epp)
 {
 
 	return (readep(dep->de_pmp, dep->de_dirclust, dep->de_diroffset,
 	    bpp, epp));
 }
 
 /*
  * Remove a directory entry. At this point the file represented by the
  * directory entry to be removed is still full length until no one has it
  * open.  When the file no longer being used msdosfs_inactive() is called
  * and will truncate the file to 0 length.  When the vnode containing the
  * denode is needed for some other purpose by VFS it will call
  * msdosfs_reclaim() which will remove the denode from the denode cache.
  *
  * pdep	directory where the entry is removed
  * dep	file to be removed
  */
 int
 removede(struct denode *pdep, struct denode *dep)
 {
 	int error;
 	struct direntry *ep;
 	struct buf *bp;
 	daddr_t bn;
 	int blsize;
 	struct msdosfsmount *pmp = pdep->de_pmp;
 	u_long offset = pdep->de_fndoffset;
 
 #ifdef MSDOSFS_DEBUG
 	printf("removede(): filename %s, dep %p, offset %08lx\n",
 	    dep->de_Name, dep, offset);
 #endif
 
 	dep->de_refcnt--;
 	offset += sizeof(struct direntry);
 	do {
 		offset -= sizeof(struct direntry);
 		error = pcbmap(pdep, de_cluster(pmp, offset), &bn, 0, &blsize);
 		if (error)
 			return error;
 		error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			return error;
 		}
 		ep = bptoep(pmp, bp, offset);
 		/*
 		 * Check whether, if we came here the second time, i.e.
 		 * when underflowing into the previous block, the last
 		 * entry in this block is a longfilename entry, too.
 		 */
 		if (ep->deAttributes != ATTR_WIN95
 		    && offset != pdep->de_fndoffset) {
 			brelse(bp);
 			break;
 		}
 		offset += sizeof(struct direntry);
 		while (1) {
 			/*
 			 * We are a bit aggressive here in that we delete any Win95
 			 * entries preceding this entry, not just the ones we "own".
 			 * Since these presumably aren't valid anyway,
 			 * there should be no harm.
 			 */
 			offset -= sizeof(struct direntry);
 			ep--->deName[0] = SLOT_DELETED;
 			if ((pmp->pm_flags & MSDOSFSMNT_NOWIN95)
 			    || !(offset & pmp->pm_crbomask)
 			    || ep->deAttributes != ATTR_WIN95)
 				break;
 		}
 		if (DOINGASYNC(DETOV(pdep)))
 			bdwrite(bp);
 		else if ((error = bwrite(bp)) != 0)
 			return error;
 	} while (!(pmp->pm_flags & MSDOSFSMNT_NOWIN95)
 	    && !(offset & pmp->pm_crbomask)
 	    && offset);
 	return 0;
 }
 
 /*
  * Create a unique DOS name in dvp
  */
 int
 uniqdosname(struct denode *dep, struct componentname *cnp, u_char *cp)
 {
 	struct msdosfsmount *pmp = dep->de_pmp;
 	struct direntry *dentp;
 	int gen;
 	int blsize;
 	u_long cn;
 	daddr_t bn;
 	struct buf *bp;
 	int error;
 	
 	if (pmp->pm_flags & MSDOSFSMNT_SHORTNAME)
 		return (unix2dosfn((const u_char *)cnp->cn_nameptr, cp,
 		    cnp->cn_namelen, 0, pmp) ? 0 : EINVAL);
 
 	for (gen = 1;; gen++) {
 		/*
 		 * Generate DOS name with generation number
 		 */
 		if (!unix2dosfn((const u_char *)cnp->cn_nameptr, cp,
 		    cnp->cn_namelen, gen, pmp))
 			return gen == 1 ? EINVAL : EEXIST;
 
 		/*
 		 * Now look for a dir entry with this exact name
 		 */
 		for (cn = error = 0; !error; cn++) {
 			if ((error = pcbmap(dep, cn, &bn, 0, &blsize)) != 0) {
 				if (error == E2BIG)	/* EOF reached and not found */
 					return 0;
 				return error;
 			}
 			error = bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp);
 			if (error) {
 				brelse(bp);
 				return error;
 			}
 			for (dentp = (struct direntry *)bp->b_data;
 			     (char *)dentp < bp->b_data + blsize;
 			     dentp++) {
 				if (dentp->deName[0] == SLOT_EMPTY) {
 					/*
 					 * Last used entry and not found
 					 */
 					brelse(bp);
 					return 0;
 				}
 				/*
 				 * Ignore volume labels and Win95 entries
 				 */
 				if (dentp->deAttributes & ATTR_VOLUME)
 					continue;
 				if (!bcmp(dentp->deName, cp, 11)) {
 					error = EEXIST;
 					break;
 				}
 			}
 			brelse(bp);
 		}
 	}
 }
-
-/*
- * Find any Win'95 long filename entry in directory dep
- */
-int
-findwin95(struct denode *dep)
-{
-	struct msdosfsmount *pmp = dep->de_pmp;
-	struct direntry *dentp;
-	int blsize, win95;
-	u_long cn;
-	daddr_t bn;
-	struct buf *bp;
-
-	win95 = 1;
-	/*
-	 * Read through the directory looking for Win'95 entries
-	 * Note: Error currently handled just as EOF			XXX
-	 */
-	for (cn = 0;; cn++) {
-		if (pcbmap(dep, cn, &bn, 0, &blsize))
-			return (win95);
-		if (bread(pmp->pm_devvp, bn, blsize, NOCRED, &bp)) {
-			brelse(bp);
-			return (win95);
-		}
-		for (dentp = (struct direntry *)bp->b_data;
-		     (char *)dentp < bp->b_data + blsize;
-		     dentp++) {
-			if (dentp->deName[0] == SLOT_EMPTY) {
-				/*
-				 * Last used entry and not found
-				 */
-				brelse(bp);
-				return (win95);
-			}
-			if (dentp->deName[0] == SLOT_DELETED) {
-				/*
-				 * Ignore deleted files
-				 * Note: might be an indication of Win'95 anyway	XXX
-				 */
-				continue;
-			}
-			if (dentp->deAttributes == ATTR_WIN95) {
-				brelse(bp);
-				return 1;
-			}
-			win95 = 0;
-		}
-		brelse(bp);
-	}
-}
Index: user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_vfsops.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_vfsops.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/fs/msdosfs/msdosfs_vfsops.c	(revision 306283)
@@ -1,1031 +1,1015 @@
 /* $FreeBSD$ */
 /*	$NetBSD: msdosfs_vfsops.c,v 1.51 1997/11/17 15:36:58 ws Exp $	*/
 
 /*-
  * Copyright (C) 1994, 1995, 1997 Wolfgang Solfrank.
  * Copyright (C) 1994, 1995, 1997 TooLs GmbH.
  * All rights reserved.
  * Original code by Paul Popelka (paulp@uts.amdahl.com) (see below).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by TooLs GmbH.
  * 4. The name of TooLs GmbH may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 /*-
  * Written by Paul Popelka (paulp@uts.amdahl.com)
  *
  * You can do anything you want with this software, just don't say you wrote
  * it, and don't remove this notice.
  *
  * This software is provided "as is".
  *
  * The author supplies this software to be publicly redistributed on the
  * understanding that the author is not responsible for the correct
  * functioning of this software in any circumstances and is not liable for
  * any damages caused by this software.
  *
  * October 1992
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/iconv.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/vnode.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
 
 #include <fs/msdosfs/bootsect.h>
 #include <fs/msdosfs/bpb.h>
 #include <fs/msdosfs/direntry.h>
 #include <fs/msdosfs/denode.h>
 #include <fs/msdosfs/fat.h>
 #include <fs/msdosfs/msdosfsmount.h>
 
 static const char msdosfs_lock_msg[] = "fatlk";
 
 /* Mount options that we support. */
 static const char *msdosfs_opts[] = {
 	"async", "noatime", "noclusterr", "noclusterw",
 	"export", "force", "from", "sync",
 	"cs_dos", "cs_local", "cs_win", "dirmask",
 	"gid", "kiconv", "large", "longname",
 	"longnames", "mask", "shortname", "shortnames",
 	"uid", "win95", "nowin95",
 	NULL
 };
 
 #if 1 /*def PC98*/
 /*
  * XXX - The boot signature formatted by NEC PC-98 DOS looks like a
  *       garbage or a random value :-{
  *       If you want to use that broken-signatured media, define the
  *       following symbol even though PC/AT.
  *       (ex. mount PC-98 DOS formatted FD on PC/AT)
  */
 #define	MSDOSFS_NOCHECKSIG
 #endif
 
 MALLOC_DEFINE(M_MSDOSFSMNT, "msdosfs_mount", "MSDOSFS mount structure");
 static MALLOC_DEFINE(M_MSDOSFSFAT, "msdosfs_fat", "MSDOSFS file allocation table");
 
 struct iconv_functions *msdosfs_iconv;
 
 static int	update_mp(struct mount *mp, struct thread *td);
 static int	mountmsdosfs(struct vnode *devvp, struct mount *mp);
 static vfs_fhtovp_t	msdosfs_fhtovp;
 static vfs_mount_t	msdosfs_mount;
 static vfs_root_t	msdosfs_root;
 static vfs_statfs_t	msdosfs_statfs;
 static vfs_sync_t	msdosfs_sync;
 static vfs_unmount_t	msdosfs_unmount;
 
 /* Maximum length of a character set name (arbitrary). */
 #define	MAXCSLEN	64
 
 static int
 update_mp(struct mount *mp, struct thread *td)
 {
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 	void *dos, *win, *local;
 	int error, v;
 
 	if (!vfs_getopt(mp->mnt_optnew, "kiconv", NULL, NULL)) {
 		if (msdosfs_iconv != NULL) {
 			error = vfs_getopt(mp->mnt_optnew,
 			    "cs_win", &win, NULL);
 			if (!error)
 				error = vfs_getopt(mp->mnt_optnew,
 				    "cs_local", &local, NULL);
 			if (!error)
 				error = vfs_getopt(mp->mnt_optnew,
 				    "cs_dos", &dos, NULL);
 			if (!error) {
 				msdosfs_iconv->open(win, local, &pmp->pm_u2w);
 				msdosfs_iconv->open(local, win, &pmp->pm_w2u);
 				msdosfs_iconv->open(dos, local, &pmp->pm_u2d);
 				msdosfs_iconv->open(local, dos, &pmp->pm_d2u);
 			}
 			if (error != 0)
 				return (error);
 		} else {
 			pmp->pm_w2u = NULL;
 			pmp->pm_u2w = NULL;
 			pmp->pm_d2u = NULL;
 			pmp->pm_u2d = NULL;
 		}
 	}
 
 	if (vfs_scanopt(mp->mnt_optnew, "gid", "%d", &v) == 1)
 		pmp->pm_gid = v;
 	if (vfs_scanopt(mp->mnt_optnew, "uid", "%d", &v) == 1)
 		pmp->pm_uid = v;
 	if (vfs_scanopt(mp->mnt_optnew, "mask", "%d", &v) == 1)
 		pmp->pm_mask = v & ALLPERMS;
 	if (vfs_scanopt(mp->mnt_optnew, "dirmask", "%d", &v) == 1)
 		pmp->pm_dirmask = v & ALLPERMS;
 	vfs_flagopt(mp->mnt_optnew, "shortname",
 	    &pmp->pm_flags, MSDOSFSMNT_SHORTNAME);
 	vfs_flagopt(mp->mnt_optnew, "shortnames",
 	    &pmp->pm_flags, MSDOSFSMNT_SHORTNAME);
 	vfs_flagopt(mp->mnt_optnew, "longname",
 	    &pmp->pm_flags, MSDOSFSMNT_LONGNAME);
 	vfs_flagopt(mp->mnt_optnew, "longnames",
 	    &pmp->pm_flags, MSDOSFSMNT_LONGNAME);
 	vfs_flagopt(mp->mnt_optnew, "kiconv",
 	    &pmp->pm_flags, MSDOSFSMNT_KICONV);
 
 	if (vfs_getopt(mp->mnt_optnew, "nowin95", NULL, NULL) == 0)
 		pmp->pm_flags |= MSDOSFSMNT_NOWIN95;
 	else
 		pmp->pm_flags &= ~MSDOSFSMNT_NOWIN95;
 
 	if (pmp->pm_flags & MSDOSFSMNT_NOWIN95)
 		pmp->pm_flags |= MSDOSFSMNT_SHORTNAME;
-	else if (!(pmp->pm_flags &
-	    (MSDOSFSMNT_SHORTNAME | MSDOSFSMNT_LONGNAME))) {
-		struct vnode *rootvp;
-
-		/*
-		 * Try to divine whether to support Win'95 long filenames
-		 */
-		if (FAT32(pmp))
-			pmp->pm_flags |= MSDOSFSMNT_LONGNAME;
-		else {
-			if ((error =
-			    msdosfs_root(mp, LK_EXCLUSIVE, &rootvp)) != 0)
-				return error;
-			pmp->pm_flags |= findwin95(VTODE(rootvp)) ?
-			    MSDOSFSMNT_LONGNAME : MSDOSFSMNT_SHORTNAME;
-			vput(rootvp);
-		}
-	}
+	else
+		pmp->pm_flags |= MSDOSFSMNT_LONGNAME;
 	return 0;
 }
 
 static int
 msdosfs_cmount(struct mntarg *ma, void *data, uint64_t flags)
 {
 	struct msdosfs_args args;
 	struct export_args exp;
 	int error;
 
 	if (data == NULL)
 		return (EINVAL);
 	error = copyin(data, &args, sizeof args);
 	if (error)
 		return (error);
 	vfs_oexport_conv(&args.export, &exp);
 
 	ma = mount_argsu(ma, "from", args.fspec, MAXPATHLEN);
 	ma = mount_arg(ma, "export", &exp, sizeof(exp));
 	ma = mount_argf(ma, "uid", "%d", args.uid);
 	ma = mount_argf(ma, "gid", "%d", args.gid);
 	ma = mount_argf(ma, "mask", "%d", args.mask);
 	ma = mount_argf(ma, "dirmask", "%d", args.dirmask);
 
 	ma = mount_argb(ma, args.flags & MSDOSFSMNT_SHORTNAME, "noshortname");
 	ma = mount_argb(ma, args.flags & MSDOSFSMNT_LONGNAME, "nolongname");
 	ma = mount_argb(ma, !(args.flags & MSDOSFSMNT_NOWIN95), "nowin95");
 	ma = mount_argb(ma, args.flags & MSDOSFSMNT_KICONV, "nokiconv");
 
 	ma = mount_argsu(ma, "cs_win", args.cs_win, MAXCSLEN);
 	ma = mount_argsu(ma, "cs_dos", args.cs_dos, MAXCSLEN);
 	ma = mount_argsu(ma, "cs_local", args.cs_local, MAXCSLEN);
 
 	error = kernel_mount(ma, flags);
 
 	return (error);
 }
 
 /*
  * mp - path - addr in user space of mount point (ie /usr or whatever)
  * data - addr in user space of mount params including the name of the block
  * special file to treat as a filesystem.
  */
 static int
 msdosfs_mount(struct mount *mp)
 {
 	struct vnode *devvp;	  /* vnode for blk device to mount */
 	struct thread *td;
 	/* msdosfs specific mount control block */
 	struct msdosfsmount *pmp = NULL;
 	struct nameidata ndp;
 	int error, flags;
 	accmode_t accmode;
 	char *from;
 
 	td = curthread;
 	if (vfs_filteropt(mp->mnt_optnew, msdosfs_opts))
 		return (EINVAL);
 
 	/*
 	 * If updating, check whether changing from read-only to
 	 * read/write; if there is no device name, that's all we do.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
 		pmp = VFSTOMSDOSFS(mp);
 		if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0)) {
 			/*
 			 * Forbid export requests if filesystem has
 			 * MSDOSFS_LARGEFS flag set.
 			 */
 			if ((pmp->pm_flags & MSDOSFS_LARGEFS) != 0) {
 				vfs_mount_error(mp,
 				    "MSDOSFS_LARGEFS flag set, cannot export");
 				return (EOPNOTSUPP);
 			}
 		}
 		if (!(pmp->pm_flags & MSDOSFSMNT_RONLY) &&
 		    vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
 			error = VFS_SYNC(mp, MNT_WAIT);
 			if (error)
 				return (error);
 			flags = WRITECLOSE;
 			if (mp->mnt_flag & MNT_FORCE)
 				flags |= FORCECLOSE;
 			error = vflush(mp, 0, flags, td);
 			if (error)
 				return (error);
 
 			/*
 			 * Now the volume is clean.  Mark it so while the
 			 * device is still rw.
 			 */
 			error = markvoldirty(pmp, 0);
 			if (error) {
 				(void)markvoldirty(pmp, 1);
 				return (error);
 			}
 
 			/* Downgrade the device from rw to ro. */
 			g_topology_lock();
 			error = g_access(pmp->pm_cp, 0, -1, 0);
 			g_topology_unlock();
 			if (error) {
 				(void)markvoldirty(pmp, 1);
 				return (error);
 			}
 
 			/*
 			 * Backing out after an error was painful in the
 			 * above.  Now we are committed to succeeding.
 			 */
 			pmp->pm_fmod = 0;
 			pmp->pm_flags |= MSDOSFSMNT_RONLY;
 			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 		} else if ((pmp->pm_flags & MSDOSFSMNT_RONLY) &&
 		    !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) {
 			/*
 			 * If upgrade to read-write by non-root, then verify
 			 * that user has necessary permissions on the device.
 			 */
 			devvp = pmp->pm_devvp;
 			vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
 			error = VOP_ACCESS(devvp, VREAD | VWRITE,
 			    td->td_ucred, td);
 			if (error)
 				error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 			if (error) {
 				VOP_UNLOCK(devvp, 0);
 				return (error);
 			}
 			VOP_UNLOCK(devvp, 0);
 			g_topology_lock();
 			error = g_access(pmp->pm_cp, 0, 1, 0);
 			g_topology_unlock();
 			if (error)
 				return (error);
 
 			pmp->pm_fmod = 1;
 			pmp->pm_flags &= ~MSDOSFSMNT_RONLY;
 			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_RDONLY;
 			MNT_IUNLOCK(mp);
 
 			/* Now that the volume is modifiable, mark it dirty. */
 			error = markvoldirty(pmp, 1);
 			if (error)
 				return (error); 
 		}
 	}
 	/*
 	 * Not an update, or updating the name: look up the name
 	 * and verify that it refers to a sensible disk device.
 	 */
 	if (vfs_getopt(mp->mnt_optnew, "from", (void **)&from, NULL))
 		return (EINVAL);
 	NDINIT(&ndp, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, from, td);
 	error = namei(&ndp);
 	if (error)
 		return (error);
 	devvp = ndp.ni_vp;
 	NDFREE(&ndp, NDF_ONLY_PNBUF);
 
 	if (!vn_isdisk(devvp, &error)) {
 		vput(devvp);
 		return (error);
 	}
 	/*
 	 * If mount by non-root, then verify that user has necessary
 	 * permissions on the device.
 	 */
 	accmode = VREAD;
 	if ((mp->mnt_flag & MNT_RDONLY) == 0)
 		accmode |= VWRITE;
 	error = VOP_ACCESS(devvp, accmode, td->td_ucred, td);
 	if (error)
 		error = priv_check(td, PRIV_VFS_MOUNT_PERM);
 	if (error) {
 		vput(devvp);
 		return (error);
 	}
 	if ((mp->mnt_flag & MNT_UPDATE) == 0) {
 		error = mountmsdosfs(devvp, mp);
 #ifdef MSDOSFS_DEBUG		/* only needed for the printf below */
 		pmp = VFSTOMSDOSFS(mp);
 #endif
 	} else {
 		vput(devvp);
 		if (devvp != pmp->pm_devvp)
 			return (EINVAL);	/* XXX needs translation */
 	}
 	if (error) {
 		vrele(devvp);
 		return (error);
 	}
 
 	error = update_mp(mp, td);
 	if (error) {
 		if ((mp->mnt_flag & MNT_UPDATE) == 0)
 			msdosfs_unmount(mp, MNT_FORCE);
 		return error;
 	}
 
 	vfs_mountedfrom(mp, from);
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_mount(): mp %p, pmp %p, inusemap %p\n", mp, pmp, pmp->pm_inusemap);
 #endif
 	return (0);
 }
 
 static int
 mountmsdosfs(struct vnode *devvp, struct mount *mp)
 {
 	struct msdosfsmount *pmp;
 	struct buf *bp;
 	struct cdev *dev;
 	union bootsector *bsp;
 	struct byte_bpb33 *b33;
 	struct byte_bpb50 *b50;
 	struct byte_bpb710 *b710;
 	u_int8_t SecPerClust;
 	u_long clusters;
 	int ronly, error;
 	struct g_consumer *cp;
 	struct bufobj *bo;
 
 	bp = NULL;		/* This and pmp both used in error_exit. */
 	pmp = NULL;
 	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
 
 	dev = devvp->v_rdev;
 	if (atomic_cmpset_acq_ptr((uintptr_t *)&dev->si_mountpt, 0,
 	    (uintptr_t)mp) == 0) {
 		VOP_UNLOCK(devvp, 0);
 		return (EBUSY);
 	}
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "msdosfs", ronly ? 0 : 1);
 	g_topology_unlock();
 	if (error != 0) {
 		atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
 		VOP_UNLOCK(devvp, 0);
 		return (error);
 	}
 	dev_ref(dev);
 	VOP_UNLOCK(devvp, 0);
 
 	bo = &devvp->v_bufobj;
 
 	/*
 	 * Read the boot sector of the filesystem, and then check the
 	 * boot signature.  If not a dos boot sector then error out.
 	 *
 	 * NOTE: 8192 is a magic size that works for ffs.
 	 */
 	error = bread(devvp, 0, 8192, NOCRED, &bp);
 	if (error)
 		goto error_exit;
 	bp->b_flags |= B_AGE;
 	bsp = (union bootsector *)bp->b_data;
 	b33 = (struct byte_bpb33 *)bsp->bs33.bsBPB;
 	b50 = (struct byte_bpb50 *)bsp->bs50.bsBPB;
 	b710 = (struct byte_bpb710 *)bsp->bs710.bsBPB;
 
 #ifndef MSDOSFS_NOCHECKSIG
 	if (bsp->bs50.bsBootSectSig0 != BOOTSIG0
 	    || bsp->bs50.bsBootSectSig1 != BOOTSIG1) {
 		error = EINVAL;
 		goto error_exit;
 	}
 #endif
 
 	pmp = malloc(sizeof *pmp, M_MSDOSFSMNT, M_WAITOK | M_ZERO);
 	pmp->pm_mountp = mp;
 	pmp->pm_cp = cp;
 	pmp->pm_bo = bo;
 
 	lockinit(&pmp->pm_fatlock, 0, msdosfs_lock_msg, 0, 0);
 
 	/*
 	 * Initialize ownerships and permissions, since nothing else will
 	 * initialize them iff we are mounting root.
 	 */
 	pmp->pm_uid = UID_ROOT;
 	pmp->pm_gid = GID_WHEEL;
 	pmp->pm_mask = pmp->pm_dirmask = S_IXUSR | S_IXGRP | S_IXOTH |
 	    S_IRUSR | S_IRGRP | S_IROTH | S_IWUSR;
 
 	/*
 	 * Experimental support for large MS-DOS filesystems.
 	 * WARNING: This uses at least 32 bytes of kernel memory (which is not
 	 * reclaimed until the FS is unmounted) for each file on disk to map
 	 * between the 32-bit inode numbers used by VFS and the 64-bit
 	 * pseudo-inode numbers used internally by msdosfs. This is only
 	 * safe to use in certain controlled situations (e.g. read-only FS
 	 * with less than 1 million files).
 	 * Since the mappings do not persist across unmounts (or reboots), these
 	 * filesystems are not suitable for exporting through NFS, or any other
 	 * application that requires fixed inode numbers.
 	 */
 	vfs_flagopt(mp->mnt_optnew, "large", &pmp->pm_flags, MSDOSFS_LARGEFS);
 
 	/*
 	 * Compute several useful quantities from the bpb in the
 	 * bootsector.  Copy in the dos 5 variant of the bpb then fix up
 	 * the fields that are different between dos 5 and dos 3.3.
 	 */
 	SecPerClust = b50->bpbSecPerClust;
 	pmp->pm_BytesPerSec = getushort(b50->bpbBytesPerSec);
 	if (pmp->pm_BytesPerSec < DEV_BSIZE) {
 		error = EINVAL;
 		goto error_exit;
 	}
 	pmp->pm_ResSectors = getushort(b50->bpbResSectors);
 	pmp->pm_FATs = b50->bpbFATs;
 	pmp->pm_RootDirEnts = getushort(b50->bpbRootDirEnts);
 	pmp->pm_Sectors = getushort(b50->bpbSectors);
 	pmp->pm_FATsecs = getushort(b50->bpbFATsecs);
 	pmp->pm_SecPerTrack = getushort(b50->bpbSecPerTrack);
 	pmp->pm_Heads = getushort(b50->bpbHeads);
 	pmp->pm_Media = b50->bpbMedia;
 
 	/* calculate the ratio of sector size to DEV_BSIZE */
 	pmp->pm_BlkPerSec = pmp->pm_BytesPerSec / DEV_BSIZE;
 
 	/*
 	 * We don't check pm_Heads nor pm_SecPerTrack, because
 	 * these may not be set for EFI file systems. We don't
 	 * use these anyway, so we're unaffected if they are
 	 * invalid.
 	 */
 	if (!pmp->pm_BytesPerSec || !SecPerClust) {
 		error = EINVAL;
 		goto error_exit;
 	}
 
 	if (pmp->pm_Sectors == 0) {
 		pmp->pm_HiddenSects = getulong(b50->bpbHiddenSecs);
 		pmp->pm_HugeSectors = getulong(b50->bpbHugeSectors);
 	} else {
 		pmp->pm_HiddenSects = getushort(b33->bpbHiddenSecs);
 		pmp->pm_HugeSectors = pmp->pm_Sectors;
 	}
 	if (!(pmp->pm_flags & MSDOSFS_LARGEFS)) {
 		if (pmp->pm_HugeSectors > 0xffffffff /
 		    (pmp->pm_BytesPerSec / sizeof(struct direntry)) + 1) {
 			/*
 			 * We cannot deal currently with this size of disk
 			 * due to fileid limitations (see msdosfs_getattr and
 			 * msdosfs_readdir)
 			 */
 			error = EINVAL;
 			vfs_mount_error(mp,
 			    "Disk too big, try '-o large' mount option");
 			goto error_exit;
 		}
 	}
 
 	if (pmp->pm_RootDirEnts == 0) {
 		if (pmp->pm_FATsecs
 		    || getushort(b710->bpbFSVers)) {
 			error = EINVAL;
 #ifdef MSDOSFS_DEBUG
 			printf("mountmsdosfs(): bad FAT32 filesystem\n");
 #endif
 			goto error_exit;
 		}
 		pmp->pm_fatmask = FAT32_MASK;
 		pmp->pm_fatmult = 4;
 		pmp->pm_fatdiv = 1;
 		pmp->pm_FATsecs = getulong(b710->bpbBigFATsecs);
 		if (getushort(b710->bpbExtFlags) & FATMIRROR)
 			pmp->pm_curfat = getushort(b710->bpbExtFlags) & FATNUM;
 		else
 			pmp->pm_flags |= MSDOSFS_FATMIRROR;
 	} else
 		pmp->pm_flags |= MSDOSFS_FATMIRROR;
 
 	/*
 	 * Check a few values (could do some more):
 	 * - logical sector size: power of 2, >= block size
 	 * - sectors per cluster: power of 2, >= 1
 	 * - number of sectors:   >= 1, <= size of partition
 	 * - number of FAT sectors: >= 1
 	 */
 	if ( (SecPerClust == 0)
 	  || (SecPerClust & (SecPerClust - 1))
 	  || (pmp->pm_BytesPerSec < DEV_BSIZE)
 	  || (pmp->pm_BytesPerSec & (pmp->pm_BytesPerSec - 1))
 	  || (pmp->pm_HugeSectors == 0)
 	  || (pmp->pm_FATsecs == 0)
 	  || (SecPerClust * pmp->pm_BlkPerSec > MAXBSIZE / DEV_BSIZE)
 	) {
 		error = EINVAL;
 		goto error_exit;
 	}
 
 	pmp->pm_HugeSectors *= pmp->pm_BlkPerSec;
 	pmp->pm_HiddenSects *= pmp->pm_BlkPerSec;	/* XXX not used? */
 	pmp->pm_FATsecs     *= pmp->pm_BlkPerSec;
 	SecPerClust         *= pmp->pm_BlkPerSec;
 
 	pmp->pm_fatblk = pmp->pm_ResSectors * pmp->pm_BlkPerSec;
 
 	if (FAT32(pmp)) {
 		pmp->pm_rootdirblk = getulong(b710->bpbRootClust);
 		pmp->pm_firstcluster = pmp->pm_fatblk
 			+ (pmp->pm_FATs * pmp->pm_FATsecs);
 		pmp->pm_fsinfo = getushort(b710->bpbFSInfo) * pmp->pm_BlkPerSec;
 	} else {
 		pmp->pm_rootdirblk = pmp->pm_fatblk +
 			(pmp->pm_FATs * pmp->pm_FATsecs);
 		pmp->pm_rootdirsize = howmany(pmp->pm_RootDirEnts *
 			sizeof(struct direntry), DEV_BSIZE); /* in blocks */
 		pmp->pm_firstcluster = pmp->pm_rootdirblk + pmp->pm_rootdirsize;
 	}
 
 	pmp->pm_maxcluster = (pmp->pm_HugeSectors - pmp->pm_firstcluster) /
 	    SecPerClust + 1;
 	pmp->pm_fatsize = pmp->pm_FATsecs * DEV_BSIZE;	/* XXX not used? */
 
 	if (pmp->pm_fatmask == 0) {
 		if (pmp->pm_maxcluster
 		    <= ((CLUST_RSRVD - CLUST_FIRST) & FAT12_MASK)) {
 			/*
 			 * This will usually be a floppy disk. This size makes
 			 * sure that one fat entry will not be split across
 			 * multiple blocks.
 			 */
 			pmp->pm_fatmask = FAT12_MASK;
 			pmp->pm_fatmult = 3;
 			pmp->pm_fatdiv = 2;
 		} else {
 			pmp->pm_fatmask = FAT16_MASK;
 			pmp->pm_fatmult = 2;
 			pmp->pm_fatdiv = 1;
 		}
 	}
 
 	clusters = (pmp->pm_fatsize / pmp->pm_fatmult) * pmp->pm_fatdiv;
 	if (pmp->pm_maxcluster >= clusters) {
 #ifdef MSDOSFS_DEBUG
 		printf("Warning: number of clusters (%ld) exceeds FAT "
 		    "capacity (%ld)\n", pmp->pm_maxcluster + 1, clusters);
 #endif
 		pmp->pm_maxcluster = clusters - 1;
 	}
 
 	if (FAT12(pmp))
 		pmp->pm_fatblocksize = 3 * 512;
 	else
 		pmp->pm_fatblocksize = PAGE_SIZE;
 	pmp->pm_fatblocksize = roundup(pmp->pm_fatblocksize,
 	    pmp->pm_BytesPerSec);
 	pmp->pm_fatblocksec = pmp->pm_fatblocksize / DEV_BSIZE;
 	pmp->pm_bnshift = ffs(DEV_BSIZE) - 1;
 
 	/*
 	 * Compute mask and shift value for isolating cluster relative byte
 	 * offsets and cluster numbers from a file offset.
 	 */
 	pmp->pm_bpcluster = SecPerClust * DEV_BSIZE;
 	pmp->pm_crbomask = pmp->pm_bpcluster - 1;
 	pmp->pm_cnshift = ffs(pmp->pm_bpcluster) - 1;
 
 	/*
 	 * Check for valid cluster size
 	 * must be a power of 2
 	 */
 	if (pmp->pm_bpcluster ^ (1 << pmp->pm_cnshift)) {
 		error = EINVAL;
 		goto error_exit;
 	}
 
 	/*
 	 * Release the bootsector buffer.
 	 */
 	brelse(bp);
 	bp = NULL;
 
 	/*
 	 * Check the fsinfo sector if we have one.  Silently fix up our
 	 * in-core copy of fp->fsinxtfree if it is unknown (0xffffffff)
 	 * or too large.  Ignore fp->fsinfree for now, since we need to
 	 * read the entire FAT anyway to fill the inuse map.
 	 */
 	if (pmp->pm_fsinfo) {
 		struct fsinfo *fp;
 
 		if ((error = bread(devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec,
 		    NOCRED, &bp)) != 0)
 			goto error_exit;
 		fp = (struct fsinfo *)bp->b_data;
 		if (!bcmp(fp->fsisig1, "RRaA", 4)
 		    && !bcmp(fp->fsisig2, "rrAa", 4)
 		    && !bcmp(fp->fsisig3, "\0\0\125\252", 4)) {
 			pmp->pm_nxtfree = getulong(fp->fsinxtfree);
 			if (pmp->pm_nxtfree > pmp->pm_maxcluster)
 				pmp->pm_nxtfree = CLUST_FIRST;
 		} else
 			pmp->pm_fsinfo = 0;
 		brelse(bp);
 		bp = NULL;
 	}
 
 	/*
 	 * Finish initializing pmp->pm_nxtfree (just in case the first few
 	 * sectors aren't properly reserved in the FAT).  This completes
 	 * the fixup for fp->fsinxtfree, and fixes up the zero-initialized
 	 * value if there is no fsinfo.  We will use pmp->pm_nxtfree
 	 * internally even if there is no fsinfo.
 	 */
 	if (pmp->pm_nxtfree < CLUST_FIRST)
 		pmp->pm_nxtfree = CLUST_FIRST;
 
 	/*
 	 * Allocate memory for the bitmap of allocated clusters, and then
 	 * fill it in.
 	 */
 	pmp->pm_inusemap = malloc(howmany(pmp->pm_maxcluster + 1, N_INUSEBITS)
 				  * sizeof(*pmp->pm_inusemap),
 				  M_MSDOSFSFAT, M_WAITOK);
 
 	/*
 	 * fillinusemap() needs pm_devvp.
 	 */
 	pmp->pm_devvp = devvp;
 	pmp->pm_dev = dev;
 
 	/*
 	 * Have the inuse map filled in.
 	 */
 	MSDOSFS_LOCK_MP(pmp);
 	error = fillinusemap(pmp);
 	MSDOSFS_UNLOCK_MP(pmp);
 	if (error != 0)
 		goto error_exit;
 
 	/*
 	 * If they want fat updates to be synchronous then let them suffer
 	 * the performance degradation in exchange for the on disk copy of
 	 * the fat being correct just about all the time.  I suppose this
 	 * would be a good thing to turn on if the kernel is still flakey.
 	 */
 	if (mp->mnt_flag & MNT_SYNCHRONOUS)
 		pmp->pm_flags |= MSDOSFSMNT_WAITONFAT;
 
 	/*
 	 * Finish up.
 	 */
 	if (ronly)
 		pmp->pm_flags |= MSDOSFSMNT_RONLY;
 	else {
 		if ((error = markvoldirty(pmp, 1)) != 0) {
 			(void)markvoldirty(pmp, 0);
 			goto error_exit;
 		}
 		pmp->pm_fmod = 1;
 	}
 	mp->mnt_data =  pmp;
 	mp->mnt_stat.f_fsid.val[0] = dev2udev(dev);
 	mp->mnt_stat.f_fsid.val[1] = mp->mnt_vfc->vfc_typenum;
 	MNT_ILOCK(mp);
 	mp->mnt_flag |= MNT_LOCAL;
 	mp->mnt_kern_flag |= MNTK_USES_BCACHE;
 	MNT_IUNLOCK(mp);
 
 	if (pmp->pm_flags & MSDOSFS_LARGEFS)
 		msdosfs_fileno_init(mp);
 
 	return 0;
 
 error_exit:
 	if (bp)
 		brelse(bp);
 	if (cp != NULL) {
 		g_topology_lock();
 		g_vfs_close(cp);
 		g_topology_unlock();
 	}
 	if (pmp) {
 		lockdestroy(&pmp->pm_fatlock);
 		if (pmp->pm_inusemap)
 			free(pmp->pm_inusemap, M_MSDOSFSFAT);
 		free(pmp, M_MSDOSFSMNT);
 		mp->mnt_data = NULL;
 	}
 	atomic_store_rel_ptr((uintptr_t *)&dev->si_mountpt, 0);
 	dev_rel(dev);
 	return (error);
 }
 
 /*
  * Unmount the filesystem described by mp.
  */
 static int
 msdosfs_unmount(struct mount *mp, int mntflags)
 {
 	struct msdosfsmount *pmp;
 	int error, flags;
 
 	error = flags = 0;
 	pmp = VFSTOMSDOSFS(mp);
 	if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0)
 		error = msdosfs_sync(mp, MNT_WAIT);
 	if ((mntflags & MNT_FORCE) != 0)
 		flags |= FORCECLOSE;
 	else if (error != 0)
 		return (error);
 	error = vflush(mp, 0, flags, curthread);
 	if (error != 0 && error != ENXIO)
 		return (error);
 	if ((pmp->pm_flags & MSDOSFSMNT_RONLY) == 0) {
 		error = markvoldirty(pmp, 0);
 		if (error && error != ENXIO) {
 			(void)markvoldirty(pmp, 1);
 			return (error);
 		}
 	}
 	if (pmp->pm_flags & MSDOSFSMNT_KICONV && msdosfs_iconv) {
 		if (pmp->pm_w2u)
 			msdosfs_iconv->close(pmp->pm_w2u);
 		if (pmp->pm_u2w)
 			msdosfs_iconv->close(pmp->pm_u2w);
 		if (pmp->pm_d2u)
 			msdosfs_iconv->close(pmp->pm_d2u);
 		if (pmp->pm_u2d)
 			msdosfs_iconv->close(pmp->pm_u2d);
 	}
 
 #ifdef MSDOSFS_DEBUG
 	{
 		struct vnode *vp = pmp->pm_devvp;
 		struct bufobj *bo;
 
 		bo = &vp->v_bufobj;
 		BO_LOCK(bo);
 		VI_LOCK(vp);
 		vn_printf(vp,
 		    "msdosfs_umount(): just before calling VOP_CLOSE()\n");
 		printf("freef %p, freeb %p, mount %p\n",
 		    TAILQ_NEXT(vp, v_actfreelist), vp->v_actfreelist.tqe_prev,
 		    vp->v_mount);
 		printf("cleanblkhd %p, dirtyblkhd %p, numoutput %ld, type %d\n",
 		    TAILQ_FIRST(&vp->v_bufobj.bo_clean.bv_hd),
 		    TAILQ_FIRST(&vp->v_bufobj.bo_dirty.bv_hd),
 		    vp->v_bufobj.bo_numoutput, vp->v_type);
 		VI_UNLOCK(vp);
 		BO_UNLOCK(bo);
 	}
 #endif
 	g_topology_lock();
 	g_vfs_close(pmp->pm_cp);
 	g_topology_unlock();
 	atomic_store_rel_ptr((uintptr_t *)&pmp->pm_dev->si_mountpt, 0);
 	vrele(pmp->pm_devvp);
 	dev_rel(pmp->pm_dev);
 	free(pmp->pm_inusemap, M_MSDOSFSFAT);
 	if (pmp->pm_flags & MSDOSFS_LARGEFS)
 		msdosfs_fileno_free(mp);
 	lockdestroy(&pmp->pm_fatlock);
 	free(pmp, M_MSDOSFSMNT);
 	mp->mnt_data = NULL;
 	MNT_ILOCK(mp);
 	mp->mnt_flag &= ~MNT_LOCAL;
 	MNT_IUNLOCK(mp);
 	return (error);
 }
 
 static int
 msdosfs_root(struct mount *mp, int flags, struct vnode **vpp)
 {
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 	struct denode *ndep;
 	int error;
 
 #ifdef MSDOSFS_DEBUG
 	printf("msdosfs_root(); mp %p, pmp %p\n", mp, pmp);
 #endif
 	error = deget(pmp, MSDOSFSROOT, MSDOSFSROOT_OFS, &ndep);
 	if (error)
 		return (error);
 	*vpp = DETOV(ndep);
 	return (0);
 }
 
 static int
 msdosfs_statfs(struct mount *mp, struct statfs *sbp)
 {
 	struct msdosfsmount *pmp;
 
 	pmp = VFSTOMSDOSFS(mp);
 	sbp->f_bsize = pmp->pm_bpcluster;
 	sbp->f_iosize = pmp->pm_bpcluster;
 	sbp->f_blocks = pmp->pm_maxcluster + 1;
 	sbp->f_bfree = pmp->pm_freeclustercount;
 	sbp->f_bavail = pmp->pm_freeclustercount;
 	sbp->f_files = pmp->pm_RootDirEnts;	/* XXX */
 	sbp->f_ffree = 0;	/* what to put in here? */
 	return (0);
 }
 
 /*
  * If we have an FSInfo block, update it.
  */
 static int
 msdosfs_fsiflush(struct msdosfsmount *pmp, int waitfor)
 {
 	struct fsinfo *fp;
 	struct buf *bp;
 	int error;
 
 	MSDOSFS_LOCK_MP(pmp);
 	if (pmp->pm_fsinfo == 0 || (pmp->pm_flags & MSDOSFS_FSIMOD) == 0) {
 		error = 0;
 		goto unlock;
 	}
 	error = bread(pmp->pm_devvp, pmp->pm_fsinfo, pmp->pm_BytesPerSec,
 	    NOCRED, &bp);
 	if (error != 0) {
 		brelse(bp);
 		goto unlock;
 	}
 	fp = (struct fsinfo *)bp->b_data;
 	putulong(fp->fsinfree, pmp->pm_freeclustercount);
 	putulong(fp->fsinxtfree, pmp->pm_nxtfree);
 	pmp->pm_flags &= ~MSDOSFS_FSIMOD;
 	if (waitfor == MNT_WAIT)
 		error = bwrite(bp);
 	else
 		bawrite(bp);
 unlock:
 	MSDOSFS_UNLOCK_MP(pmp);
 	return (error);
 }
 
 static int
 msdosfs_sync(struct mount *mp, int waitfor)
 {
 	struct vnode *vp, *nvp;
 	struct thread *td;
 	struct denode *dep;
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 	int error, allerror = 0;
 
 	td = curthread;
 
 	/*
 	 * If we ever switch to not updating all of the fats all the time,
 	 * this would be the place to update them from the first one.
 	 */
 	if (pmp->pm_fmod != 0) {
 		if (pmp->pm_flags & MSDOSFSMNT_RONLY)
 			panic("msdosfs_sync: rofs mod");
 		else {
 			/* update fats here */
 		}
 	}
 	/*
 	 * Write back each (modified) denode.
 	 */
 loop:
 	MNT_VNODE_FOREACH_ALL(vp, mp, nvp) {
 		if (vp->v_type == VNON) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		dep = VTODE(vp);
 		if ((dep->de_flag &
 		    (DE_ACCESS | DE_CREATE | DE_UPDATE | DE_MODIFIED)) == 0 &&
 		    (vp->v_bufobj.bo_dirty.bv_cnt == 0 ||
 		    waitfor == MNT_LAZY)) {
 			VI_UNLOCK(vp);
 			continue;
 		}
 		error = vget(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, td);
 		if (error) {
 			if (error == ENOENT)
 				goto loop;
 			continue;
 		}
 		error = VOP_FSYNC(vp, waitfor, td);
 		if (error)
 			allerror = error;
 		VOP_UNLOCK(vp, 0);
 		vrele(vp);
 	}
 
 	/*
 	 * Flush filesystem control info.
 	 */
 	if (waitfor != MNT_LAZY) {
 		vn_lock(pmp->pm_devvp, LK_EXCLUSIVE | LK_RETRY);
 		error = VOP_FSYNC(pmp->pm_devvp, waitfor, td);
 		if (error)
 			allerror = error;
 		VOP_UNLOCK(pmp->pm_devvp, 0);
 	}
 
 	error = msdosfs_fsiflush(pmp, waitfor);
 	if (error != 0)
 		allerror = error;
 	return (allerror);
 }
 
 static int
 msdosfs_fhtovp(struct mount *mp, struct fid *fhp, int flags, struct vnode **vpp)
 {
 	struct msdosfsmount *pmp = VFSTOMSDOSFS(mp);
 	struct defid *defhp = (struct defid *) fhp;
 	struct denode *dep;
 	int error;
 
 	error = deget(pmp, defhp->defid_dirclust, defhp->defid_dirofs, &dep);
 	if (error) {
 		*vpp = NULLVP;
 		return (error);
 	}
 	*vpp = DETOV(dep);
 	vnode_create_vobject(*vpp, dep->de_FileSize, curthread);
 	return (0);
 }
 
 static struct vfsops msdosfs_vfsops = {
 	.vfs_fhtovp =		msdosfs_fhtovp,
 	.vfs_mount =		msdosfs_mount,
 	.vfs_cmount =		msdosfs_cmount,
 	.vfs_root =		msdosfs_root,
 	.vfs_statfs =		msdosfs_statfs,
 	.vfs_sync =		msdosfs_sync,
 	.vfs_unmount =		msdosfs_unmount,
 };
 
 VFS_SET(msdosfs_vfsops, msdosfs, 0);
 MODULE_VERSION(msdosfs, 1);
Index: user/alc/PQ_LAUNDRY/sys/geom/bde/g_bde.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/geom/bde/g_bde.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/geom/bde/g_bde.c	(revision 306283)
@@ -1,292 +1,292 @@
 /*-
  * Copyright (c) 2002 Poul-Henning Kamp
  * Copyright (c) 2002 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Poul-Henning Kamp
  * and NAI Labs, the Security Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  *
  */
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/sysctl.h>
 
 #include <crypto/rijndael/rijndael-api-fst.h>
 #include <crypto/sha2/sha512.h>
 #include <geom/geom.h>
 #include <geom/bde/g_bde.h>
 #define BDE_CLASS_NAME "BDE"
 
 FEATURE(geom_bde, "GEOM-based Disk Encryption");
 
 static void
 g_bde_start(struct bio *bp)
 {
 
 	switch (bp->bio_cmd) {
 	case BIO_DELETE:
 	case BIO_READ:
 	case BIO_WRITE:
 		g_bde_start1(bp);
 		break;
 	case BIO_GETATTR:
 		g_io_deliver(bp, EOPNOTSUPP);
 		break;
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 	return;
 }
 
 static void
 g_bde_orphan(struct g_consumer *cp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_bde_softc *sc;
 
 	g_trace(G_T_TOPOLOGY, "g_bde_orphan(%p/%s)", cp, cp->provider->name);
 	g_topology_assert();
 
 	gp = cp->geom;
 	sc = gp->softc;
 	gp->flags |= G_GEOM_WITHER;
 	LIST_FOREACH(pp, &gp->provider, provider)
-		g_orphan_provider(pp, ENXIO);
+		g_wither_provider(pp, ENXIO);
 	bzero(sc, sizeof(struct g_bde_softc));	/* destroy evidence */
 	return;
 }
 
 static int
 g_bde_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
 
 	gp = pp->geom;
 	cp = LIST_FIRST(&gp->consumer);
 	if (cp->acr == 0 && cp->acw == 0 && cp->ace == 0) {
 		de++;
 		dr++;
 	}
 	/* ... and let go of it on last close */
 	if ((cp->acr + dr) == 0 && (cp->acw + dw) == 0 && (cp->ace + de) == 1) {
 		de--;
 		dr--;
 	}
 	return (g_access(cp, dr, dw, de));
 }
 
 static void
 g_bde_create_geom(struct gctl_req *req, struct g_class *mp, struct g_provider *pp)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	struct g_bde_key *kp;
 	int error, i;
 	u_int sectorsize;
 	off_t mediasize;
 	struct g_bde_softc *sc;
 	void *pass;
 	void *key;
 
 	g_trace(G_T_TOPOLOGY, "g_bde_create_geom(%s, %s)", mp->name, pp->name);
 	g_topology_assert();
 	gp = NULL;
 
 
 	gp = g_new_geomf(mp, "%s.bde", pp->name);
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_access(cp, 1, 1, 1);
 	if (error) {
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		g_destroy_geom(gp);
 		gctl_error(req, "could not access consumer");
 		return;
 	}
 	pass = NULL;
 	key = NULL;
 	do {
 		pass = gctl_get_param(req, "pass", &i);
 		if (pass == NULL || i != SHA512_DIGEST_LENGTH) {
 			gctl_error(req, "No usable key presented");
 			break;
 		}
 		key = gctl_get_param(req, "key", &i);
 		if (key != NULL && i != 16) {
 			gctl_error(req, "Invalid key presented");
 			break;
 		}
 		sectorsize = cp->provider->sectorsize;
 		mediasize = cp->provider->mediasize;
 		sc = g_malloc(sizeof(struct g_bde_softc), M_WAITOK | M_ZERO);
 		gp->softc = sc;
 		sc->geom = gp;
 		sc->consumer = cp;
 
 		error = g_bde_decrypt_lock(sc, pass, key,
 		    mediasize, sectorsize, NULL);
 		bzero(sc->sha2, sizeof sc->sha2);
 		if (error)
 			break;
 		kp = &sc->key;
 
 		/* Initialize helper-fields */
 		kp->keys_per_sector = kp->sectorsize / G_BDE_SKEYLEN;
 		kp->zone_cont = kp->keys_per_sector * kp->sectorsize;
 		kp->zone_width = kp->zone_cont + kp->sectorsize;
 		kp->media_width = kp->sectorN - kp->sector0 -
 		    G_BDE_MAXKEYS * kp->sectorsize;
 
 		/* Our external parameters */
 		sc->zone_cont = kp->zone_cont;
 		sc->mediasize = g_bde_max_sector(kp);
 		sc->sectorsize = kp->sectorsize;
 
 		TAILQ_INIT(&sc->freelist);
 		TAILQ_INIT(&sc->worklist);
 		mtx_init(&sc->worklist_mutex, "g_bde_worklist", NULL, MTX_DEF);
 		/* XXX: error check */
 		kproc_create(g_bde_worker, gp, &sc->thread, 0, 0,
 			"g_bde %s", gp->name);
 		pp = g_new_providerf(gp, "%s", gp->name);
 		pp->stripesize = kp->zone_cont;
 		pp->stripeoffset = 0;
 		pp->mediasize = sc->mediasize;
 		pp->sectorsize = sc->sectorsize;
 		g_error_provider(pp, 0);
 		break;
 	} while (0);
 	if (pass != NULL)
 		bzero(pass, SHA512_DIGEST_LENGTH);
 	if (key != NULL)
 		bzero(key, 16);
 	if (error == 0)
 		return;
 	g_access(cp, -1, -1, -1);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	if (gp->softc != NULL)
 		g_free(gp->softc);
 	g_destroy_geom(gp);
 	switch (error) {
 	case ENOENT:
 		gctl_error(req, "Lock was destroyed");
 		break;
 	case ESRCH:
 		gctl_error(req, "Lock was nuked");
 		break;
 	case EINVAL:
 		gctl_error(req, "Could not open lock");
 		break;
 	case ENOTDIR:
 		gctl_error(req, "Lock not found");
 		break;
 	default:
 		gctl_error(req, "Could not open lock (%d)", error);
 		break;
 	}
 	return;
 }
 
 
 static int
 g_bde_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
 {
 	struct g_consumer *cp;
 	struct g_provider *pp;
 	struct g_bde_softc *sc;
 
 	g_trace(G_T_TOPOLOGY, "g_bde_destroy_geom(%s, %s)", mp->name, gp->name);
 	g_topology_assert();
 	/*
 	 * Orderly detachment.
 	 */
 	KASSERT(gp != NULL, ("NULL geom"));
 	pp = LIST_FIRST(&gp->provider);
 	KASSERT(pp != NULL, ("NULL provider"));
 	if (pp->acr > 0 || pp->acw > 0 || pp->ace > 0)
 		return (EBUSY);
 	sc = gp->softc;
 	cp = LIST_FIRST(&gp->consumer);
 	KASSERT(cp != NULL, ("NULL consumer"));
 	sc->dead = 1;
 	wakeup(sc);
 	g_access(cp, -1, -1, -1);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	while (sc->dead != 2 && !LIST_EMPTY(&pp->consumers))
 		tsleep(sc, PRIBIO, "g_bdedie", hz);
 	mtx_destroy(&sc->worklist_mutex);
 	bzero(&sc->key, sizeof sc->key);
 	g_free(sc);
 	g_wither_geom(gp, ENXIO);
 	return (0);
 }
 
 static void
 g_bde_ctlreq(struct gctl_req *req, struct g_class *mp, char const *verb)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	if (!strcmp(verb, "create geom")) {
 		pp = gctl_get_provider(req, "provider");
 		if (pp != NULL)
 			g_bde_create_geom(req, mp, pp);
 	} else if (!strcmp(verb, "destroy geom")) {
 		gp = gctl_get_geom(req, mp, "geom");
 		if (gp != NULL)
 			g_bde_destroy_geom(req, mp, gp);
 	} else {
 		gctl_error(req, "unknown verb");
 	}
 }
 
 static struct g_class g_bde_class	= {
 	.name = BDE_CLASS_NAME,
 	.version = G_VERSION,
 	.destroy_geom = g_bde_destroy_geom,
 	.ctlreq = g_bde_ctlreq,
 	.start = g_bde_start,
 	.orphan = g_bde_orphan,
 	.access = g_bde_access,
 	.spoiled = g_std_spoiled,
 };
 
 DECLARE_GEOM_CLASS(g_bde_class, g_bde);
Index: user/alc/PQ_LAUNDRY/sys/geom/concat/g_concat.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/geom/concat/g_concat.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/geom/concat/g_concat.c	(revision 306283)
@@ -1,994 +1,993 @@
 /*-
  * Copyright (c) 2004-2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <geom/geom.h>
 #include <geom/concat/g_concat.h>
 
 FEATURE(geom_concat, "GEOM concatenation support");
 
 static MALLOC_DEFINE(M_CONCAT, "concat_data", "GEOM_CONCAT Data");
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, concat, CTLFLAG_RW, 0,
     "GEOM_CONCAT stuff");
 static u_int g_concat_debug = 0;
 SYSCTL_UINT(_kern_geom_concat, OID_AUTO, debug, CTLFLAG_RWTUN, &g_concat_debug, 0,
     "Debug level");
 
 static int g_concat_destroy(struct g_concat_softc *sc, boolean_t force);
 static int g_concat_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 
 static g_taste_t g_concat_taste;
 static g_ctl_req_t g_concat_config;
 static g_dumpconf_t g_concat_dumpconf;
 
 struct g_class g_concat_class = {
 	.name = G_CONCAT_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_concat_config,
 	.taste = g_concat_taste,
 	.destroy_geom = g_concat_destroy_geom
 };
 
 
 /*
  * Greatest Common Divisor.
  */
 static u_int
 gcd(u_int a, u_int b)
 {
 	u_int c;
 
 	while (b != 0) {
 		c = a;
 		a = b;
 		b = (c % b);
 	}
 	return (a);
 }
 
 /*
  * Least Common Multiple.
  */
 static u_int
 lcm(u_int a, u_int b)
 {
 
 	return ((a * b) / gcd(a, b));
 }
 
 /*
  * Return the number of valid disks.
  */
 static u_int
 g_concat_nvalid(struct g_concat_softc *sc)
 {
 	u_int i, no;
 
 	no = 0;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		if (sc->sc_disks[i].d_consumer != NULL)
 			no++;
 	}
 
 	return (no);
 }
 
 static void
 g_concat_remove_disk(struct g_concat_disk *disk)
 {
 	struct g_consumer *cp;
 	struct g_concat_softc *sc;
 
 	g_topology_assert();
 	KASSERT(disk->d_consumer != NULL, ("Non-valid disk in %s.", __func__));
 	sc = disk->d_softc;
 	cp = disk->d_consumer;
 
 	if (!disk->d_removed) {
 		G_CONCAT_DEBUG(0, "Disk %s removed from %s.",
 		    cp->provider->name, sc->sc_name);
 		disk->d_removed = 1;
 	}
 
 	if (sc->sc_provider != NULL) {
-		sc->sc_provider->flags |= G_PF_WITHER;
 		G_CONCAT_DEBUG(0, "Device %s deactivated.",
 		    sc->sc_provider->name);
-		g_orphan_provider(sc->sc_provider, ENXIO);
+		g_wither_provider(sc->sc_provider, ENXIO);
 		sc->sc_provider = NULL;
 	}
 
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		return;
 	disk->d_consumer = NULL;
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	/* If there are no valid disks anymore, remove device. */
 	if (LIST_EMPTY(&sc->sc_geom->consumer))
 		g_concat_destroy(sc, 1);
 }
 
 static void
 g_concat_orphan(struct g_consumer *cp)
 {
 	struct g_concat_softc *sc;
 	struct g_concat_disk *disk;
 	struct g_geom *gp;
 
 	g_topology_assert();
 	gp = cp->geom;
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 
 	disk = cp->private;
 	if (disk == NULL)	/* Possible? */
 		return;
 	g_concat_remove_disk(disk);
 }
 
 static int
 g_concat_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_consumer *cp1, *cp2, *tmp;
 	struct g_concat_disk *disk;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	gp = pp->geom;
 
 	/* On first open, grab an extra "exclusive" bit */
 	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
 		de++;
 	/* ... and let go of it on last close */
 	if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0)
 		de--;
 
 	LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) {
 		error = g_access(cp1, dr, dw, de);
 		if (error != 0)
 			goto fail;
 		disk = cp1->private;
 		if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 &&
 		    disk->d_removed) {
 			g_concat_remove_disk(disk); /* May destroy geom. */
 		}
 	}
 	return (0);
 
 fail:
 	LIST_FOREACH(cp2, &gp->consumer, consumer) {
 		if (cp1 == cp2)
 			break;
 		g_access(cp2, -dr, -dw, -de);
 	}
 	return (error);
 }
 
 static void
 g_concat_kernel_dump(struct bio *bp)
 {
 	struct g_concat_softc *sc;
 	struct g_concat_disk *disk;
 	struct bio *cbp;
 	struct g_kerneldump *gkd;
 	u_int i;
 
 	sc = bp->bio_to->geom->softc;
 	gkd = (struct g_kerneldump *)bp->bio_data;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		if (sc->sc_disks[i].d_start <= gkd->offset &&
 		    sc->sc_disks[i].d_end > gkd->offset)
 			break;
 	}
 	if (i == sc->sc_ndisks)
 		g_io_deliver(bp, EOPNOTSUPP);
 	disk = &sc->sc_disks[i];
 	gkd->offset -= disk->d_start;
 	if (gkd->length > disk->d_end - disk->d_start - gkd->offset)
 		gkd->length = disk->d_end - disk->d_start - gkd->offset;
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 	cbp->bio_done = g_std_done;
 	g_io_request(cbp, disk->d_consumer);
 	G_CONCAT_DEBUG(1, "Kernel dump will go to %s.",
 	    disk->d_consumer->provider->name);
 }
 
 static void
 g_concat_done(struct bio *bp)
 {
 	struct g_concat_softc *sc;
 	struct bio *pbp;
 
 	pbp = bp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	mtx_lock(&sc->sc_lock);
 	if (pbp->bio_error == 0)
 		pbp->bio_error = bp->bio_error;
 	pbp->bio_completed += bp->bio_completed;
 	pbp->bio_inbed++;
 	if (pbp->bio_children == pbp->bio_inbed) {
 		mtx_unlock(&sc->sc_lock);
 		g_io_deliver(pbp, pbp->bio_error);
 	} else
 		mtx_unlock(&sc->sc_lock);
 	g_destroy_bio(bp);
 }
 
 static void
 g_concat_flush(struct g_concat_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	u_int no;
 
 	bioq_init(&queue);
 	for (no = 0; no < sc->sc_ndisks; no++) {
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			while ((cbp = bioq_takefirst(&queue)) != NULL)
 				g_destroy_bio(cbp);
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		cbp->bio_done = g_concat_done;
 		cbp->bio_caller1 = sc->sc_disks[no].d_consumer;
 		cbp->bio_to = sc->sc_disks[no].d_consumer->provider;
 	}
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		G_CONCAT_LOGREQ(cbp, "Sending request.");
 		cp = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_io_request(cbp, cp);
 	}
 }
 
 static void
 g_concat_start(struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_concat_softc *sc;
 	struct g_concat_disk *disk;
 	struct g_provider *pp;
 	off_t offset, end, length, off, len;
 	struct bio *cbp;
 	char *addr;
 	u_int no;
 
 	pp = bp->bio_to;
 	sc = pp->geom->softc;
 	/*
 	 * If sc == NULL, provider's error should be set and g_concat_start()
 	 * should not be called at all.
 	 */
 	KASSERT(sc != NULL,
 	    ("Provider's error should be set (error=%d)(device=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 
 	G_CONCAT_LOGREQ(bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	case BIO_FLUSH:
 		g_concat_flush(sc, bp);
 		return;
 	case BIO_GETATTR:
 		if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) {
 			g_concat_kernel_dump(bp);
 			return;
 		}
 		/* To which provider it should be delivered? */
 		/* FALLTHROUGH */
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 
 	offset = bp->bio_offset;
 	length = bp->bio_length;
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
 		addr = NULL;
 	else
 		addr = bp->bio_data;
 	end = offset + length;
 
 	bioq_init(&queue);
 	for (no = 0; no < sc->sc_ndisks; no++) {
 		disk = &sc->sc_disks[no];
 		if (disk->d_end <= offset)
 			continue;
 		if (disk->d_start >= end)
 			break;
 
 		off = offset - disk->d_start;
 		len = MIN(length, disk->d_end - offset);
 		length -= len;
 		offset += len;
 
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			while ((cbp = bioq_takefirst(&queue)) != NULL)
 				g_destroy_bio(cbp);
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		/*
 		 * Fill in the component buf structure.
 		 */
 		if (len == bp->bio_length)
 			cbp->bio_done = g_std_done;
 		else
 			cbp->bio_done = g_concat_done;
 		cbp->bio_offset = off;
 		cbp->bio_length = len;
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 			cbp->bio_ma_offset += (uintptr_t)addr;
 			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 			cbp->bio_ma_offset %= PAGE_SIZE;
 			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
 			    cbp->bio_length) / PAGE_SIZE;
 		} else
 			cbp->bio_data = addr;
 		addr += len;
 		cbp->bio_to = disk->d_consumer->provider;
 		cbp->bio_caller1 = disk;
 
 		if (length == 0)
 			break;
 	}
 	KASSERT(length == 0,
 	    ("Length is still greater than 0 (class=%s, name=%s).",
 	    bp->bio_to->geom->class->name, bp->bio_to->geom->name));
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		G_CONCAT_LOGREQ(cbp, "Sending request.");
 		disk = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		g_io_request(cbp, disk->d_consumer);
 	}
 }
 
 static void
 g_concat_check_and_run(struct g_concat_softc *sc)
 {
 	struct g_concat_disk *disk;
 	struct g_provider *dp, *pp;
 	u_int no, sectorsize = 0;
 	off_t start;
 
 	g_topology_assert();
 	if (g_concat_nvalid(sc) != sc->sc_ndisks)
 		return;
 
 	pp = g_new_providerf(sc->sc_geom, "concat/%s", sc->sc_name);
 	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE |
 	    G_PF_ACCEPT_UNMAPPED;
 	start = 0;
 	for (no = 0; no < sc->sc_ndisks; no++) {
 		disk = &sc->sc_disks[no];
 		dp = disk->d_consumer->provider;
 		disk->d_start = start;
 		disk->d_end = disk->d_start + dp->mediasize;
 		if (sc->sc_type == G_CONCAT_TYPE_AUTOMATIC)
 			disk->d_end -= dp->sectorsize;
 		start = disk->d_end;
 		if (no == 0)
 			sectorsize = dp->sectorsize;
 		else
 			sectorsize = lcm(sectorsize, dp->sectorsize);
 
 		/* A provider underneath us doesn't support unmapped */
 		if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
 			G_CONCAT_DEBUG(1, "Cancelling unmapped "
 			    "because of %s.", dp->name);
 			pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
 		}
 	}
 	pp->sectorsize = sectorsize;
 	/* We have sc->sc_disks[sc->sc_ndisks - 1].d_end in 'start'. */
 	pp->mediasize = start;
 	pp->stripesize = sc->sc_disks[0].d_consumer->provider->stripesize;
 	pp->stripeoffset = sc->sc_disks[0].d_consumer->provider->stripeoffset;
 	sc->sc_provider = pp;
 	g_error_provider(pp, 0);
 
 	G_CONCAT_DEBUG(0, "Device %s activated.", sc->sc_provider->name);
 }
 
 static int
 g_concat_read_metadata(struct g_consumer *cp, struct g_concat_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL)
 		return (error);
 
 	/* Decode metadata. */
 	concat_metadata_decode(buf, md);
 	g_free(buf);
 
 	return (0);
 }
 
 /*
  * Add disk to given device.
  */
 static int
 g_concat_add_disk(struct g_concat_softc *sc, struct g_provider *pp, u_int no)
 {
 	struct g_concat_disk *disk;
 	struct g_consumer *cp, *fcp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	/* Metadata corrupted? */
 	if (no >= sc->sc_ndisks)
 		return (EINVAL);
 
 	disk = &sc->sc_disks[no];
 	/* Check if disk is not already attached. */
 	if (disk->d_consumer != NULL)
 		return (EEXIST);
 
 	gp = sc->sc_geom;
 	fcp = LIST_FIRST(&gp->consumer);
 
 	cp = g_new_consumer(gp);
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		return (error);
 	}
 
 	if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) {
 		error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
 		if (error != 0) {
 			g_detach(cp);
 			g_destroy_consumer(cp);
 			return (error);
 		}
 	}
 	if (sc->sc_type == G_CONCAT_TYPE_AUTOMATIC) {
 		struct g_concat_metadata md;
 
 		/* Re-read metadata. */
 		error = g_concat_read_metadata(cp, &md);
 		if (error != 0)
 			goto fail;
 
 		if (strcmp(md.md_magic, G_CONCAT_MAGIC) != 0 ||
 		    strcmp(md.md_name, sc->sc_name) != 0 ||
 		    md.md_id != sc->sc_id) {
 			G_CONCAT_DEBUG(0, "Metadata on %s changed.", pp->name);
 			goto fail;
 		}
 	}
 
 	cp->private = disk;
 	disk->d_consumer = cp;
 	disk->d_softc = sc;
 	disk->d_start = 0;	/* not yet */
 	disk->d_end = 0;	/* not yet */
 	disk->d_removed = 0;
 
 	G_CONCAT_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name);
 
 	g_concat_check_and_run(sc);
 
 	return (0);
 fail:
 	if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0))
 		g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	return (error);
 }
 
 static struct g_geom *
 g_concat_create(struct g_class *mp, const struct g_concat_metadata *md,
     u_int type)
 {
 	struct g_concat_softc *sc;
 	struct g_geom *gp;
 	u_int no;
 
 	G_CONCAT_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
 	    md->md_id);
 
 	/* One disks is minimum. */
 	if (md->md_all < 1)
 		return (NULL);
 
 	/* Check for duplicate unit */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) {
 			G_CONCAT_DEBUG(0, "Device %s already configured.",
 			    gp->name);
 			return (NULL);
 		}
 	}
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_CONCAT, M_WAITOK | M_ZERO);
 	gp->start = g_concat_start;
 	gp->spoiled = g_concat_orphan;
 	gp->orphan = g_concat_orphan;
 	gp->access = g_concat_access;
 	gp->dumpconf = g_concat_dumpconf;
 
 	sc->sc_id = md->md_id;
 	sc->sc_ndisks = md->md_all;
 	sc->sc_disks = malloc(sizeof(struct g_concat_disk) * sc->sc_ndisks,
 	    M_CONCAT, M_WAITOK | M_ZERO);
 	for (no = 0; no < sc->sc_ndisks; no++)
 		sc->sc_disks[no].d_consumer = NULL;
 	sc->sc_type = type;
 	mtx_init(&sc->sc_lock, "gconcat lock", NULL, MTX_DEF);
 
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	sc->sc_provider = NULL;
 
 	G_CONCAT_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
 
 	return (gp);
 }
 
 static int
 g_concat_destroy(struct g_concat_softc *sc, boolean_t force)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp, *cp1;
 	struct g_geom *gp;
 
 	g_topology_assert();
 
 	if (sc == NULL)
 		return (ENXIO);
 
 	pp = sc->sc_provider;
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		if (force) {
 			G_CONCAT_DEBUG(0, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 		} else {
 			G_CONCAT_DEBUG(1,
 			    "Device %s is still open (r%dw%de%d).", pp->name,
 			    pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		}
 	}
 
 	gp = sc->sc_geom;
 	LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) {
 		g_concat_remove_disk(cp->private);
 		if (cp1 == NULL)
 			return (0);	/* Recursion happened. */
 	}
 	if (!LIST_EMPTY(&gp->consumer))
 		return (EINPROGRESS);
 
 	gp->softc = NULL;
 	KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)",
 	    gp->name));
 	free(sc->sc_disks, M_CONCAT);
 	mtx_destroy(&sc->sc_lock);
 	free(sc, M_CONCAT);
 
 	G_CONCAT_DEBUG(0, "Device %s destroyed.", gp->name);
 	g_wither_geom(gp, ENXIO);
 	return (0);
 }
 
 static int
 g_concat_destroy_geom(struct gctl_req *req __unused,
     struct g_class *mp __unused, struct g_geom *gp)
 {
 	struct g_concat_softc *sc;
 
 	sc = gp->softc;
 	return (g_concat_destroy(sc, 0));
 }
 
 static struct g_geom *
 g_concat_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_concat_metadata md;
 	struct g_concat_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	g_topology_assert();
 
 	/* Skip providers that are already open for writing. */
 	if (pp->acw > 0)
 		return (NULL);
 
 	G_CONCAT_DEBUG(3, "Tasting %s.", pp->name);
 
 	gp = g_new_geomf(mp, "concat:taste");
 	gp->start = g_concat_start;
 	gp->access = g_concat_access;
 	gp->orphan = g_concat_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_concat_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (strcmp(md.md_magic, G_CONCAT_MAGIC) != 0)
 		return (NULL);
 	if (md.md_version > G_CONCAT_VERSION) {
 		printf("geom_concat.ko module is too old to handle %s.\n",
 		    pp->name);
 		return (NULL);
 	}
 	/*
 	 * Backward compatibility:
 	 */
 	/* There was no md_provider field in earlier versions of metadata. */
 	if (md.md_version < 3)
 		bzero(md.md_provider, sizeof(md.md_provider));
 	/* There was no md_provsize field in earlier versions of metadata. */
 	if (md.md_version < 4)
 		md.md_provsize = pp->mediasize;
 
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != pp->mediasize)
 		return (NULL);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_type != G_CONCAT_TYPE_AUTOMATIC)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		if (md.md_id != sc->sc_id)
 			continue;
 		break;
 	}
 	if (gp != NULL) {
 		G_CONCAT_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 		error = g_concat_add_disk(sc, pp, md.md_no);
 		if (error != 0) {
 			G_CONCAT_DEBUG(0,
 			    "Cannot add disk %s to %s (error=%d).", pp->name,
 			    gp->name, error);
 			return (NULL);
 		}
 	} else {
 		gp = g_concat_create(mp, &md, G_CONCAT_TYPE_AUTOMATIC);
 		if (gp == NULL) {
 			G_CONCAT_DEBUG(0, "Cannot create device %s.",
 			    md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 		G_CONCAT_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 		error = g_concat_add_disk(sc, pp, md.md_no);
 		if (error != 0) {
 			G_CONCAT_DEBUG(0,
 			    "Cannot add disk %s to %s (error=%d).", pp->name,
 			    gp->name, error);
 			g_concat_destroy(sc, 1);
 			return (NULL);
 		}
 	}
 
 	return (gp);
 }
 
 static void
 g_concat_ctl_create(struct gctl_req *req, struct g_class *mp)
 {
 	u_int attached, no;
 	struct g_concat_metadata md;
 	struct g_provider *pp;
 	struct g_concat_softc *sc;
 	struct g_geom *gp;
 	struct sbuf *sb;
 	const char *name;
 	char param[16];
 	int *nargs;
 
 	g_topology_assert();
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs < 2) {
 		gctl_error(req, "Too few arguments.");
 		return;
 	}
 
 	strlcpy(md.md_magic, G_CONCAT_MAGIC, sizeof(md.md_magic));
 	md.md_version = G_CONCAT_VERSION;
 	name = gctl_get_asciiparam(req, "arg0");
 	if (name == NULL) {
 		gctl_error(req, "No 'arg%u' argument.", 0);
 		return;
 	}
 	strlcpy(md.md_name, name, sizeof(md.md_name));
 	md.md_id = arc4random();
 	md.md_no = 0;
 	md.md_all = *nargs - 1;
 	bzero(md.md_provider, sizeof(md.md_provider));
 	/* This field is not important here. */
 	md.md_provsize = 0;
 
 	/* Check all providers are valid */
 	for (no = 1; no < *nargs; no++) {
 		snprintf(param, sizeof(param), "arg%u", no);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%u' argument.", no);
 			return;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		pp = g_provider_by_name(name);
 		if (pp == NULL) {
 			G_CONCAT_DEBUG(1, "Disk %s is invalid.", name);
 			gctl_error(req, "Disk %s is invalid.", name);
 			return;
 		}
 	}
 
 	gp = g_concat_create(mp, &md, G_CONCAT_TYPE_MANUAL);
 	if (gp == NULL) {
 		gctl_error(req, "Can't configure %s.", md.md_name);
 		return;
 	}
 
 	sc = gp->softc;
 	sb = sbuf_new_auto();
 	sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name);
 	for (attached = 0, no = 1; no < *nargs; no++) {
 		snprintf(param, sizeof(param), "arg%u", no);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%d' argument.", no);
 			return;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		pp = g_provider_by_name(name);
 		KASSERT(pp != NULL, ("Provider %s disappear?!", name));
 		if (g_concat_add_disk(sc, pp, no - 1) != 0) {
 			G_CONCAT_DEBUG(1, "Disk %u (%s) not attached to %s.",
 			    no, pp->name, gp->name);
 			sbuf_printf(sb, " %s", pp->name);
 			continue;
 		}
 		attached++;
 	}
 	sbuf_finish(sb);
 	if (md.md_all != attached) {
 		g_concat_destroy(gp->softc, 1);
 		gctl_error(req, "%s", sbuf_data(sb));
 	}
 	sbuf_delete(sb);
 }
 
 static struct g_concat_softc *
 g_concat_find_device(struct g_class *mp, const char *name)
 {
 	struct g_concat_softc *sc;
 	struct g_geom *gp;
 
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (strcmp(sc->sc_name, name) == 0)
 			return (sc);
 	}
 	return (NULL);
 }
 
 static void
 g_concat_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_concat_softc *sc;
 	int *force, *nargs, error;
 	const char *name;
 	char param[16];
 	u_int i;
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 	force = gctl_get_paraml(req, "force", sizeof(*force));
 	if (force == NULL) {
 		gctl_error(req, "No '%s' argument.", "force");
 		return;
 	}
 
 	for (i = 0; i < (u_int)*nargs; i++) {
 		snprintf(param, sizeof(param), "arg%u", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%u' argument.", i);
 			return;
 		}
 		sc = g_concat_find_device(mp, name);
 		if (sc == NULL) {
 			gctl_error(req, "No such device: %s.", name);
 			return;
 		}
 		error = g_concat_destroy(sc, *force);
 		if (error != 0) {
 			gctl_error(req, "Cannot destroy device %s (error=%d).",
 			    sc->sc_name, error);
 			return;
 		}
 	}
 }
 
 static void
 g_concat_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	uint32_t *version;
 
 	g_topology_assert();
 
 	version = gctl_get_paraml(req, "version", sizeof(*version));
 	if (version == NULL) {
 		gctl_error(req, "No '%s' argument.", "version");
 		return;
 	}
 	if (*version != G_CONCAT_VERSION) {
 		gctl_error(req, "Userland and kernel parts are out of sync.");
 		return;
 	}
 
 	if (strcmp(verb, "create") == 0) {
 		g_concat_ctl_create(req, mp);
 		return;
 	} else if (strcmp(verb, "destroy") == 0 ||
 	    strcmp(verb, "stop") == 0) {
 		g_concat_ctl_destroy(req, mp);
 		return;
 	}
 	gctl_error(req, "Unknown verb.");
 }
 
 static void
 g_concat_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_concat_softc *sc;
 
 	g_topology_assert();
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		struct g_concat_disk *disk;
 
 		disk = cp->private;
 		if (disk == NULL)
 			return;
 		sbuf_printf(sb, "%s<End>%jd</End>\n", indent,
 		    (intmax_t)disk->d_end);
 		sbuf_printf(sb, "%s<Start>%jd</Start>\n", indent,
 		    (intmax_t)disk->d_start);
 	} else {
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<Type>", indent);
 		switch (sc->sc_type) {
 		case G_CONCAT_TYPE_AUTOMATIC:
 			sbuf_printf(sb, "AUTOMATIC");
 			break;
 		case G_CONCAT_TYPE_MANUAL:
 			sbuf_printf(sb, "MANUAL");
 			break;
 		default:
 			sbuf_printf(sb, "UNKNOWN");
 			break;
 		}
 		sbuf_printf(sb, "</Type>\n");
 		sbuf_printf(sb, "%s<Status>Total=%u, Online=%u</Status>\n",
 		    indent, sc->sc_ndisks, g_concat_nvalid(sc));
 		sbuf_printf(sb, "%s<State>", indent);
 		if (sc->sc_provider != NULL && sc->sc_provider->error == 0)
 			sbuf_printf(sb, "UP");
 		else
 			sbuf_printf(sb, "DOWN");
 		sbuf_printf(sb, "</State>\n");
 	}
 }
 
 DECLARE_GEOM_CLASS(g_concat_class, g_concat);
Index: user/alc/PQ_LAUNDRY/sys/geom/gate/g_gate.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/geom/gate/g_gate.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/geom/gate/g_gate.c	(revision 306283)
@@ -1,965 +1,964 @@
 /*-
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * Copyright (c) 2009-2010 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Pawel Jakub Dawidek
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/fcntl.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/limits.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/signalvar.h>
 #include <sys/time.h>
 #include <machine/atomic.h>
 
 #include <geom/geom.h>
 #include <geom/gate/g_gate.h>
 
 FEATURE(geom_gate, "GEOM Gate module");
 
 static MALLOC_DEFINE(M_GATE, "gg_data", "GEOM Gate Data");
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, gate, CTLFLAG_RW, 0,
     "GEOM_GATE configuration");
 static int g_gate_debug = 0;
 SYSCTL_INT(_kern_geom_gate, OID_AUTO, debug, CTLFLAG_RWTUN, &g_gate_debug, 0,
     "Debug level");
 static u_int g_gate_maxunits = 256;
 SYSCTL_UINT(_kern_geom_gate, OID_AUTO, maxunits, CTLFLAG_RDTUN,
     &g_gate_maxunits, 0, "Maximum number of ggate devices");
 
 struct g_class g_gate_class = {
 	.name = G_GATE_CLASS_NAME,
 	.version = G_VERSION,
 };
 
 static struct cdev *status_dev;
 static d_ioctl_t g_gate_ioctl;
 static struct cdevsw g_gate_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_ioctl =	g_gate_ioctl,
 	.d_name =	G_GATE_CTL_NAME
 };
 
 
 static struct g_gate_softc **g_gate_units;
 static u_int g_gate_nunits;
 static struct mtx g_gate_units_lock;
 
 static int
 g_gate_destroy(struct g_gate_softc *sc, boolean_t force)
 {
 	struct bio_queue_head queue;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct bio *bp;
 
 	g_topology_assert();
 	mtx_assert(&g_gate_units_lock, MA_OWNED);
 	pp = sc->sc_provider;
 	if (!force && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		mtx_unlock(&g_gate_units_lock);
 		return (EBUSY);
 	}
 	mtx_unlock(&g_gate_units_lock);
 	mtx_lock(&sc->sc_queue_mtx);
 	if ((sc->sc_flags & G_GATE_FLAG_DESTROY) == 0)
 		sc->sc_flags |= G_GATE_FLAG_DESTROY;
 	wakeup(sc);
 	mtx_unlock(&sc->sc_queue_mtx);
 	gp = pp->geom;
-	pp->flags |= G_PF_WITHER;
-	g_orphan_provider(pp, ENXIO);
+	g_wither_provider(pp, ENXIO);
 	callout_drain(&sc->sc_callout);
 	bioq_init(&queue);
 	mtx_lock(&sc->sc_queue_mtx);
 	while ((bp = bioq_takefirst(&sc->sc_inqueue)) != NULL) {
 		sc->sc_queue_count--;
 		bioq_insert_tail(&queue, bp);
 	}
 	while ((bp = bioq_takefirst(&sc->sc_outqueue)) != NULL) {
 		sc->sc_queue_count--;
 		bioq_insert_tail(&queue, bp);
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	g_topology_unlock();
 	while ((bp = bioq_takefirst(&queue)) != NULL) {
 		G_GATE_LOGREQ(1, bp, "Request canceled.");
 		g_io_deliver(bp, ENXIO);
 	}
 	mtx_lock(&g_gate_units_lock);
 	/* One reference is ours. */
 	sc->sc_ref--;
 	while (sc->sc_ref > 0)
 		msleep(&sc->sc_ref, &g_gate_units_lock, 0, "gg:destroy", 0);
 	g_gate_units[sc->sc_unit] = NULL;
 	KASSERT(g_gate_nunits > 0, ("negative g_gate_nunits?"));
 	g_gate_nunits--;
 	mtx_unlock(&g_gate_units_lock);
 	mtx_destroy(&sc->sc_queue_mtx);
 	g_topology_lock();
 	if ((cp = sc->sc_readcons) != NULL) {
 		sc->sc_readcons = NULL;
 		(void)g_access(cp, -1, 0, 0);
 		g_detach(cp);
 		g_destroy_consumer(cp);
 	}
 	G_GATE_DEBUG(1, "Device %s destroyed.", gp->name);
 	gp->softc = NULL;
 	g_wither_geom(gp, ENXIO);
 	sc->sc_provider = NULL;
 	free(sc, M_GATE);
 	return (0);
 }
 
 static int
 g_gate_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_gate_softc *sc;
 
 	if (dr <= 0 && dw <= 0 && de <= 0)
 		return (0);
 	sc = pp->geom->softc;
 	if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0)
 		return (ENXIO);
 	/* XXX: Hack to allow read-only mounts. */
 #if 0
 	if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0 && dw > 0)
 		return (EPERM);
 #endif
 	if ((sc->sc_flags & G_GATE_FLAG_WRITEONLY) != 0 && dr > 0)
 		return (EPERM);
 	return (0);
 }
 
 static void
 g_gate_queue_io(struct bio *bp)
 {
 	struct g_gate_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) {
 		g_io_deliver(bp, ENXIO);
 		return;
 	}
 
 	mtx_lock(&sc->sc_queue_mtx);
 
 	if (sc->sc_queue_size > 0 && sc->sc_queue_count > sc->sc_queue_size) {
 		mtx_unlock(&sc->sc_queue_mtx);
 		G_GATE_LOGREQ(1, bp, "Queue full, request canceled.");
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 
 	bp->bio_driver1 = (void *)sc->sc_seq;
 	sc->sc_seq++;
 	sc->sc_queue_count++;
 
 	bioq_insert_tail(&sc->sc_inqueue, bp);
 	wakeup(sc);
 
 	mtx_unlock(&sc->sc_queue_mtx);
 }
 
 static void
 g_gate_done(struct bio *cbp)
 {
 	struct bio *pbp;
 
 	pbp = cbp->bio_parent;
 	if (cbp->bio_error == 0) {
 		pbp->bio_completed = cbp->bio_completed;
 		g_destroy_bio(cbp);
 		pbp->bio_inbed++;
 		g_io_deliver(pbp, 0);
 	} else {
 		/* If direct read failed, pass it through userland daemon. */
 		g_destroy_bio(cbp);
 		pbp->bio_children--;
 		g_gate_queue_io(pbp);
 	}
 }
 
 static void
 g_gate_start(struct bio *pbp)
 {
 	struct g_gate_softc *sc;
 
 	sc = pbp->bio_to->geom->softc;
 	if (sc == NULL || (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) {
 		g_io_deliver(pbp, ENXIO);
 		return;
 	}
 	G_GATE_LOGREQ(2, pbp, "Request received.");
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		if (sc->sc_readcons != NULL) {
 			struct bio *cbp;
 
 			cbp = g_clone_bio(pbp);
 			if (cbp == NULL) {
 				g_io_deliver(pbp, ENOMEM);
 				return;
 			}
 			cbp->bio_done = g_gate_done;
 			cbp->bio_offset = pbp->bio_offset + sc->sc_readoffset;
 			cbp->bio_to = sc->sc_readcons->provider;
 			g_io_request(cbp, sc->sc_readcons);
 			return;
 		}
 		break;
 	case BIO_DELETE:
 	case BIO_WRITE:
 	case BIO_FLUSH:
 		/* XXX: Hack to allow read-only mounts. */
 		if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0) {
 			g_io_deliver(pbp, EPERM);
 			return;
 		}
 		break;
 	case BIO_GETATTR:
 	default:
 		G_GATE_LOGREQ(2, pbp, "Ignoring request.");
 		g_io_deliver(pbp, EOPNOTSUPP);
 		return;
 	}
 
 	g_gate_queue_io(pbp);
 }
 
 static struct g_gate_softc *
 g_gate_hold(int unit, const char *name)
 {
 	struct g_gate_softc *sc = NULL;
 
 	mtx_lock(&g_gate_units_lock);
 	if (unit >= 0 && unit < g_gate_maxunits)
 		sc = g_gate_units[unit];
 	else if (unit == G_GATE_NAME_GIVEN) {
 		KASSERT(name != NULL, ("name is NULL"));
 		for (unit = 0; unit < g_gate_maxunits; unit++) {
 			if (g_gate_units[unit] == NULL)
 				continue;
 			if (strcmp(name,
 			    g_gate_units[unit]->sc_provider->name) != 0) {
 				continue;
 			}
 			sc = g_gate_units[unit];
 			break;
 		}
 	}
 	if (sc != NULL)
 		sc->sc_ref++;
 	mtx_unlock(&g_gate_units_lock);
 	return (sc);
 }
 
 static void
 g_gate_release(struct g_gate_softc *sc)
 {
 
 	g_topology_assert_not();
 	mtx_lock(&g_gate_units_lock);
 	sc->sc_ref--;
 	KASSERT(sc->sc_ref >= 0, ("Negative sc_ref for %s.", sc->sc_name));
 	if (sc->sc_ref == 0 && (sc->sc_flags & G_GATE_FLAG_DESTROY) != 0)
 		wakeup(&sc->sc_ref);
 	mtx_unlock(&g_gate_units_lock);
 }
 
 static int
 g_gate_getunit(int unit, int *errorp)
 {
 
 	mtx_assert(&g_gate_units_lock, MA_OWNED);
 	if (unit >= 0) {
 		if (unit >= g_gate_maxunits)
 			*errorp = EINVAL;
 		else if (g_gate_units[unit] == NULL)
 			return (unit);
 		else
 			*errorp = EEXIST;
 	} else {
 		for (unit = 0; unit < g_gate_maxunits; unit++) {
 			if (g_gate_units[unit] == NULL)
 				return (unit);
 		}
 		*errorp = ENFILE;
 	}
 	return (-1);
 }
 
 static void
 g_gate_guard(void *arg)
 {
 	struct bio_queue_head queue;
 	struct g_gate_softc *sc;
 	struct bintime curtime;
 	struct bio *bp, *bp2;
 
 	sc = arg;
 	binuptime(&curtime);
 	g_gate_hold(sc->sc_unit, NULL);
 	bioq_init(&queue);
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_inqueue.queue, bio_queue, bp2) {
 		if (curtime.sec - bp->bio_t0.sec < 5)
 			continue;
 		bioq_remove(&sc->sc_inqueue, bp);
 		sc->sc_queue_count--;
 		bioq_insert_tail(&queue, bp);
 	}
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_outqueue.queue, bio_queue, bp2) {
 		if (curtime.sec - bp->bio_t0.sec < 5)
 			continue;
 		bioq_remove(&sc->sc_outqueue, bp);
 		sc->sc_queue_count--;
 		bioq_insert_tail(&queue, bp);
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	while ((bp = bioq_takefirst(&queue)) != NULL) {
 		G_GATE_LOGREQ(1, bp, "Request timeout.");
 		g_io_deliver(bp, EIO);
 	}
 	if ((sc->sc_flags & G_GATE_FLAG_DESTROY) == 0) {
 		callout_reset(&sc->sc_callout, sc->sc_timeout * hz,
 		    g_gate_guard, sc);
 	}
 	g_gate_release(sc);
 }
 
 static void
 g_gate_orphan(struct g_consumer *cp)
 {
 	struct g_gate_softc *sc;
 	struct g_geom *gp;
 
 	g_topology_assert();
 	gp = cp->geom;
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	KASSERT(cp == sc->sc_readcons, ("cp=%p sc_readcons=%p", cp,
 	    sc->sc_readcons));
 	sc->sc_readcons = NULL;
 	G_GATE_DEBUG(1, "Destroying read consumer on provider %s orphan.",
 	    cp->provider->name);
 	(void)g_access(cp, -1, 0, 0);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static void
 g_gate_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_gate_softc *sc;
 
 	sc = gp->softc;
 	if (sc == NULL || pp != NULL || cp != NULL)
 		return;
 	sc = g_gate_hold(sc->sc_unit, NULL);
 	if (sc == NULL)
 		return;
 	if ((sc->sc_flags & G_GATE_FLAG_READONLY) != 0) {
 		sbuf_printf(sb, "%s<access>%s</access>\n", indent, "read-only");
 	} else if ((sc->sc_flags & G_GATE_FLAG_WRITEONLY) != 0) {
 		sbuf_printf(sb, "%s<access>%s</access>\n", indent,
 		    "write-only");
 	} else {
 		sbuf_printf(sb, "%s<access>%s</access>\n", indent,
 		    "read-write");
 	}
 	if (sc->sc_readcons != NULL) {
 		sbuf_printf(sb, "%s<read_offset>%jd</read_offset>\n",
 		    indent, (intmax_t)sc->sc_readoffset);
 		sbuf_printf(sb, "%s<read_provider>%s</read_provider>\n",
 		    indent, sc->sc_readcons->provider->name);
 	}
 	sbuf_printf(sb, "%s<timeout>%u</timeout>\n", indent, sc->sc_timeout);
 	sbuf_printf(sb, "%s<info>%s</info>\n", indent, sc->sc_info);
 	sbuf_printf(sb, "%s<queue_count>%u</queue_count>\n", indent,
 	    sc->sc_queue_count);
 	sbuf_printf(sb, "%s<queue_size>%u</queue_size>\n", indent,
 	    sc->sc_queue_size);
 	sbuf_printf(sb, "%s<ref>%u</ref>\n", indent, sc->sc_ref);
 	sbuf_printf(sb, "%s<unit>%d</unit>\n", indent, sc->sc_unit);
 	g_topology_unlock();
 	g_gate_release(sc);
 	g_topology_lock();
 }
 
 static int
 g_gate_create(struct g_gate_ctl_create *ggio)
 {
 	struct g_gate_softc *sc;
 	struct g_geom *gp;
 	struct g_provider *pp, *ropp;
 	struct g_consumer *cp;
 	char name[NAME_MAX];
 	int error = 0, unit;
 
 	if (ggio->gctl_mediasize <= 0) {
 		G_GATE_DEBUG(1, "Invalid media size.");
 		return (EINVAL);
 	}
 	if (ggio->gctl_sectorsize <= 0) {
 		G_GATE_DEBUG(1, "Invalid sector size.");
 		return (EINVAL);
 	}
 	if (!powerof2(ggio->gctl_sectorsize)) {
 		G_GATE_DEBUG(1, "Invalid sector size.");
 		return (EINVAL);
 	}
 	if ((ggio->gctl_mediasize % ggio->gctl_sectorsize) != 0) {
 		G_GATE_DEBUG(1, "Invalid media size.");
 		return (EINVAL);
 	}
 	if ((ggio->gctl_flags & G_GATE_FLAG_READONLY) != 0 &&
 	    (ggio->gctl_flags & G_GATE_FLAG_WRITEONLY) != 0) {
 		G_GATE_DEBUG(1, "Invalid flags.");
 		return (EINVAL);
 	}
 	if (ggio->gctl_unit != G_GATE_UNIT_AUTO &&
 	    ggio->gctl_unit != G_GATE_NAME_GIVEN &&
 	    ggio->gctl_unit < 0) {
 		G_GATE_DEBUG(1, "Invalid unit number.");
 		return (EINVAL);
 	}
 	if (ggio->gctl_unit == G_GATE_NAME_GIVEN &&
 	    ggio->gctl_name[0] == '\0') {
 		G_GATE_DEBUG(1, "No device name.");
 		return (EINVAL);
 	}
 
 	sc = malloc(sizeof(*sc), M_GATE, M_WAITOK | M_ZERO);
 	sc->sc_flags = (ggio->gctl_flags & G_GATE_USERFLAGS);
 	strlcpy(sc->sc_info, ggio->gctl_info, sizeof(sc->sc_info));
 	sc->sc_seq = 1;
 	bioq_init(&sc->sc_inqueue);
 	bioq_init(&sc->sc_outqueue);
 	mtx_init(&sc->sc_queue_mtx, "gg:queue", NULL, MTX_DEF);
 	sc->sc_queue_count = 0;
 	sc->sc_queue_size = ggio->gctl_maxcount;
 	if (sc->sc_queue_size > G_GATE_MAX_QUEUE_SIZE)
 		sc->sc_queue_size = G_GATE_MAX_QUEUE_SIZE;
 	sc->sc_timeout = ggio->gctl_timeout;
 	callout_init(&sc->sc_callout, 1);
 
 	mtx_lock(&g_gate_units_lock);
 	sc->sc_unit = g_gate_getunit(ggio->gctl_unit, &error);
 	if (sc->sc_unit < 0)
 		goto fail1;
 	if (ggio->gctl_unit == G_GATE_NAME_GIVEN)
 		snprintf(name, sizeof(name), "%s", ggio->gctl_name);
 	else {
 		snprintf(name, sizeof(name), "%s%d", G_GATE_PROVIDER_NAME,
 		    sc->sc_unit);
 	}
 	/* Check for name collision. */
 	for (unit = 0; unit < g_gate_maxunits; unit++) {
 		if (g_gate_units[unit] == NULL)
 			continue;
 		if (strcmp(name, g_gate_units[unit]->sc_name) != 0)
 			continue;
 		error = EEXIST;
 		goto fail1;
 	}
 	sc->sc_name = name;
 	g_gate_units[sc->sc_unit] = sc;
 	g_gate_nunits++;
 	mtx_unlock(&g_gate_units_lock);
 
 	g_topology_lock();
 
 	if (ggio->gctl_readprov[0] == '\0') {
 		ropp = NULL;
 	} else {
 		ropp = g_provider_by_name(ggio->gctl_readprov);
 		if (ropp == NULL) {
 			G_GATE_DEBUG(1, "Provider %s doesn't exist.",
 			    ggio->gctl_readprov);
 			error = EINVAL;
 			goto fail2;
 		}
 		if ((ggio->gctl_readoffset % ggio->gctl_sectorsize) != 0) {
 			G_GATE_DEBUG(1, "Invalid read offset.");
 			error = EINVAL;
 			goto fail2;
 		}
 		if (ggio->gctl_mediasize + ggio->gctl_readoffset >
 		    ropp->mediasize) {
 			G_GATE_DEBUG(1, "Invalid read offset or media size.");
 			error = EINVAL;
 			goto fail2;
 		}
 	}
 
 	gp = g_new_geomf(&g_gate_class, "%s", name);
 	gp->start = g_gate_start;
 	gp->access = g_gate_access;
 	gp->orphan = g_gate_orphan;
 	gp->dumpconf = g_gate_dumpconf;
 	gp->softc = sc;
 
 	if (ropp != NULL) {
 		cp = g_new_consumer(gp);
 		cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 		error = g_attach(cp, ropp);
 		if (error != 0) {
 			G_GATE_DEBUG(1, "Unable to attach to %s.", ropp->name);
 			goto fail3;
 		}
 		error = g_access(cp, 1, 0, 0);
 		if (error != 0) {
 			G_GATE_DEBUG(1, "Unable to access %s.", ropp->name);
 			g_detach(cp);
 			goto fail3;
 		}
 		sc->sc_readcons = cp;
 		sc->sc_readoffset = ggio->gctl_readoffset;
 	}
 
 	ggio->gctl_unit = sc->sc_unit;
 
 	pp = g_new_providerf(gp, "%s", name);
 	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 	pp->mediasize = ggio->gctl_mediasize;
 	pp->sectorsize = ggio->gctl_sectorsize;
 	sc->sc_provider = pp;
 	g_error_provider(pp, 0);
 
 	g_topology_unlock();
 	mtx_lock(&g_gate_units_lock);
 	sc->sc_name = sc->sc_provider->name;
 	mtx_unlock(&g_gate_units_lock);
 	G_GATE_DEBUG(1, "Device %s created.", gp->name);
 
 	if (sc->sc_timeout > 0) {
 		callout_reset(&sc->sc_callout, sc->sc_timeout * hz,
 		    g_gate_guard, sc);
 	}
 	return (0);
 fail3:
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 fail2:
 	g_topology_unlock();
 	mtx_lock(&g_gate_units_lock);
 	g_gate_units[sc->sc_unit] = NULL;
 	KASSERT(g_gate_nunits > 0, ("negative g_gate_nunits?"));
 	g_gate_nunits--;
 fail1:
 	mtx_unlock(&g_gate_units_lock);
 	mtx_destroy(&sc->sc_queue_mtx);
 	free(sc, M_GATE);
 	return (error);
 }
 
 static int
 g_gate_modify(struct g_gate_softc *sc, struct g_gate_ctl_modify *ggio)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	int error;
 
 	if ((ggio->gctl_modify & GG_MODIFY_MEDIASIZE) != 0) {
 		if (ggio->gctl_mediasize <= 0) {
 			G_GATE_DEBUG(1, "Invalid media size.");
 			return (EINVAL);
 		}
 		pp = sc->sc_provider;
 		if ((ggio->gctl_mediasize % pp->sectorsize) != 0) {
 			G_GATE_DEBUG(1, "Invalid media size.");
 			return (EINVAL);
 		}
 		/* TODO */
 		return (EOPNOTSUPP);
 	}
 
 	if ((ggio->gctl_modify & GG_MODIFY_INFO) != 0)
 		(void)strlcpy(sc->sc_info, ggio->gctl_info, sizeof(sc->sc_info));
 
 	cp = NULL;
 
 	if ((ggio->gctl_modify & GG_MODIFY_READPROV) != 0) {
 		g_topology_lock();
 		if (sc->sc_readcons != NULL) {
 			cp = sc->sc_readcons;
 			sc->sc_readcons = NULL;
 			(void)g_access(cp, -1, 0, 0);
 			g_detach(cp);
 			g_destroy_consumer(cp);
 		}
 		if (ggio->gctl_readprov[0] != '\0') {
 			pp = g_provider_by_name(ggio->gctl_readprov);
 			if (pp == NULL) {
 				g_topology_unlock();
 				G_GATE_DEBUG(1, "Provider %s doesn't exist.",
 				    ggio->gctl_readprov);
 				return (EINVAL);
 			}
 			cp = g_new_consumer(sc->sc_provider->geom);
 			cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 			error = g_attach(cp, pp);
 			if (error != 0) {
 				G_GATE_DEBUG(1, "Unable to attach to %s.",
 				    pp->name);
 			} else {
 				error = g_access(cp, 1, 0, 0);
 				if (error != 0) {
 					G_GATE_DEBUG(1, "Unable to access %s.",
 					    pp->name);
 					g_detach(cp);
 				}
 			}
 			if (error != 0) {
 				g_destroy_consumer(cp);
 				g_topology_unlock();
 				return (error);
 			}
 		}
 	} else {
 		cp = sc->sc_readcons;
 	}
 
 	if ((ggio->gctl_modify & GG_MODIFY_READOFFSET) != 0) {
 		if (cp == NULL) {
 			G_GATE_DEBUG(1, "No read provider.");
 			return (EINVAL);
 		}
 		pp = sc->sc_provider;
 		if ((ggio->gctl_readoffset % pp->sectorsize) != 0) {
 			G_GATE_DEBUG(1, "Invalid read offset.");
 			return (EINVAL);
 		}
 		if (pp->mediasize + ggio->gctl_readoffset >
 		    cp->provider->mediasize) {
 			G_GATE_DEBUG(1, "Invalid read offset or media size.");
 			return (EINVAL);
 		}
 		sc->sc_readoffset = ggio->gctl_readoffset;
 	}
 
 	if ((ggio->gctl_modify & GG_MODIFY_READPROV) != 0) {
 		sc->sc_readcons = cp;
 		g_topology_unlock();
 	}
 
 	return (0);
 }
 
 #define	G_GATE_CHECK_VERSION(ggio)	do {				\
 	if ((ggio)->gctl_version != G_GATE_VERSION) {			\
 		printf("Version mismatch %d != %d.\n",			\
 		    ggio->gctl_version, G_GATE_VERSION);		\
 		return (EINVAL);					\
 	}								\
 } while (0)
 static int
 g_gate_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
 {
 	struct g_gate_softc *sc;
 	struct bio *bp;
 	int error = 0;
 
 	G_GATE_DEBUG(4, "ioctl(%s, %lx, %p, %x, %p)", devtoname(dev), cmd, addr,
 	    flags, td);
 
 	switch (cmd) {
 	case G_GATE_CMD_CREATE:
 	    {
 		struct g_gate_ctl_create *ggio = (void *)addr;
 
 		G_GATE_CHECK_VERSION(ggio);
 		error = g_gate_create(ggio);
 		/*
 		 * Reset TDP_GEOM flag.
 		 * There are pending events for sure, because we just created
 		 * new provider and other classes want to taste it, but we
 		 * cannot answer on I/O requests until we're here.
 		 */
 		td->td_pflags &= ~TDP_GEOM;
 		return (error);
 	    }
 	case G_GATE_CMD_MODIFY:
 	    {
 		struct g_gate_ctl_modify *ggio = (void *)addr;
 
 		G_GATE_CHECK_VERSION(ggio);
 		sc = g_gate_hold(ggio->gctl_unit, NULL);
 		if (sc == NULL)
 			return (ENXIO);
 		error = g_gate_modify(sc, ggio);
 		g_gate_release(sc);
 		return (error);
 	    }
 	case G_GATE_CMD_DESTROY:
 	    {
 		struct g_gate_ctl_destroy *ggio = (void *)addr;
 
 		G_GATE_CHECK_VERSION(ggio);
 		sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name);
 		if (sc == NULL)
 			return (ENXIO);
 		g_topology_lock();
 		mtx_lock(&g_gate_units_lock);
 		error = g_gate_destroy(sc, ggio->gctl_force);
 		g_topology_unlock();
 		if (error != 0)
 			g_gate_release(sc);
 		return (error);
 	    }
 	case G_GATE_CMD_CANCEL:
 	    {
 		struct g_gate_ctl_cancel *ggio = (void *)addr;
 		struct bio *tbp, *lbp;
 
 		G_GATE_CHECK_VERSION(ggio);
 		sc = g_gate_hold(ggio->gctl_unit, ggio->gctl_name);
 		if (sc == NULL)
 			return (ENXIO);
 		lbp = NULL;
 		mtx_lock(&sc->sc_queue_mtx);
 		TAILQ_FOREACH_SAFE(bp, &sc->sc_outqueue.queue, bio_queue, tbp) {
 			if (ggio->gctl_seq == 0 ||
 			    ggio->gctl_seq == (uintptr_t)bp->bio_driver1) {
 				G_GATE_LOGREQ(1, bp, "Request canceled.");
 				bioq_remove(&sc->sc_outqueue, bp);
 				/*
 				 * Be sure to put requests back onto incoming
 				 * queue in the proper order.
 				 */
 				if (lbp == NULL)
 					bioq_insert_head(&sc->sc_inqueue, bp);
 				else {
 					TAILQ_INSERT_AFTER(&sc->sc_inqueue.queue,
 					    lbp, bp, bio_queue);
 				}
 				lbp = bp;
 				/*
 				 * If only one request was canceled, leave now.
 				 */
 				if (ggio->gctl_seq != 0)
 					break;
 			}
 		}
 		if (ggio->gctl_unit == G_GATE_NAME_GIVEN)
 			ggio->gctl_unit = sc->sc_unit;
 		mtx_unlock(&sc->sc_queue_mtx);
 		g_gate_release(sc);
 		return (error);
 	    }
 	case G_GATE_CMD_START:
 	    {
 		struct g_gate_ctl_io *ggio = (void *)addr;
 
 		G_GATE_CHECK_VERSION(ggio);
 		sc = g_gate_hold(ggio->gctl_unit, NULL);
 		if (sc == NULL)
 			return (ENXIO);
 		error = 0;
 		for (;;) {
 			mtx_lock(&sc->sc_queue_mtx);
 			bp = bioq_first(&sc->sc_inqueue);
 			if (bp != NULL)
 				break;
 			if ((sc->sc_flags & G_GATE_FLAG_DESTROY) != 0) {
 				ggio->gctl_error = ECANCELED;
 				mtx_unlock(&sc->sc_queue_mtx);
 				goto start_end;
 			}
 			if (msleep(sc, &sc->sc_queue_mtx,
 			    PPAUSE | PDROP | PCATCH, "ggwait", 0) != 0) {
 				ggio->gctl_error = ECANCELED;
 				goto start_end;
 			}
 		}
 		ggio->gctl_cmd = bp->bio_cmd;
 		if (bp->bio_cmd == BIO_WRITE &&
 		    bp->bio_length > ggio->gctl_length) {
 			mtx_unlock(&sc->sc_queue_mtx);
 			ggio->gctl_length = bp->bio_length;
 			ggio->gctl_error = ENOMEM;
 			goto start_end;
 		}
 		bioq_remove(&sc->sc_inqueue, bp);
 		bioq_insert_tail(&sc->sc_outqueue, bp);
 		mtx_unlock(&sc->sc_queue_mtx);
 
 		ggio->gctl_seq = (uintptr_t)bp->bio_driver1;
 		ggio->gctl_offset = bp->bio_offset;
 		ggio->gctl_length = bp->bio_length;
 
 		switch (bp->bio_cmd) {
 		case BIO_READ:
 		case BIO_DELETE:
 		case BIO_FLUSH:
 			break;
 		case BIO_WRITE:
 			error = copyout(bp->bio_data, ggio->gctl_data,
 			    bp->bio_length);
 			if (error != 0) {
 				mtx_lock(&sc->sc_queue_mtx);
 				bioq_remove(&sc->sc_outqueue, bp);
 				bioq_insert_head(&sc->sc_inqueue, bp);
 				mtx_unlock(&sc->sc_queue_mtx);
 				goto start_end;
 			}
 			break;
 		}
 start_end:
 		g_gate_release(sc);
 		return (error);
 	    }
 	case G_GATE_CMD_DONE:
 	    {
 		struct g_gate_ctl_io *ggio = (void *)addr;
 
 		G_GATE_CHECK_VERSION(ggio);
 		sc = g_gate_hold(ggio->gctl_unit, NULL);
 		if (sc == NULL)
 			return (ENOENT);
 		error = 0;
 		mtx_lock(&sc->sc_queue_mtx);
 		TAILQ_FOREACH(bp, &sc->sc_outqueue.queue, bio_queue) {
 			if (ggio->gctl_seq == (uintptr_t)bp->bio_driver1)
 				break;
 		}
 		if (bp != NULL) {
 			bioq_remove(&sc->sc_outqueue, bp);
 			sc->sc_queue_count--;
 		}
 		mtx_unlock(&sc->sc_queue_mtx);
 		if (bp == NULL) {
 			/*
 			 * Request was probably canceled.
 			 */
 			goto done_end;
 		}
 		if (ggio->gctl_error == EAGAIN) {
 			bp->bio_error = 0;
 			G_GATE_LOGREQ(1, bp, "Request desisted.");
 			mtx_lock(&sc->sc_queue_mtx);
 			sc->sc_queue_count++;
 			bioq_insert_head(&sc->sc_inqueue, bp);
 			wakeup(sc);
 			mtx_unlock(&sc->sc_queue_mtx);
 		} else {
 			bp->bio_error = ggio->gctl_error;
 			if (bp->bio_error == 0) {
 				bp->bio_completed = bp->bio_length;
 				switch (bp->bio_cmd) {
 				case BIO_READ:
 					error = copyin(ggio->gctl_data,
 					    bp->bio_data, bp->bio_length);
 					if (error != 0)
 						bp->bio_error = error;
 					break;
 				case BIO_DELETE:
 				case BIO_WRITE:
 				case BIO_FLUSH:
 					break;
 				}
 			}
 			G_GATE_LOGREQ(2, bp, "Request done.");
 			g_io_deliver(bp, bp->bio_error);
 		}
 done_end:
 		g_gate_release(sc);
 		return (error);
 	    }
 	}
 	return (ENOIOCTL);
 }
 
 static void
 g_gate_device(void)
 {
 
 	status_dev = make_dev(&g_gate_cdevsw, 0x0, UID_ROOT, GID_WHEEL, 0600,
 	    G_GATE_CTL_NAME);
 }
 
 static int
 g_gate_modevent(module_t mod, int type, void *data)
 {
 	int error = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		mtx_init(&g_gate_units_lock, "gg_units_lock", NULL, MTX_DEF);
 		g_gate_units = malloc(g_gate_maxunits * sizeof(g_gate_units[0]),
 		    M_GATE, M_WAITOK | M_ZERO);
 		g_gate_nunits = 0;
 		g_gate_device();
 		break;
 	case MOD_UNLOAD:
 		mtx_lock(&g_gate_units_lock);
 		if (g_gate_nunits > 0) {
 			mtx_unlock(&g_gate_units_lock);
 			error = EBUSY;
 			break;
 		}
 		mtx_unlock(&g_gate_units_lock);
 		mtx_destroy(&g_gate_units_lock);
 		if (status_dev != NULL)
 			destroy_dev(status_dev);
 		free(g_gate_units, M_GATE);
 		break;
 	default:
 		return (EOPNOTSUPP);
 		break;
 	}
 
 	return (error);
 }
 static moduledata_t g_gate_module = {
 	G_GATE_MOD_NAME,
 	g_gate_modevent,
 	NULL
 };
 DECLARE_MODULE(geom_gate, g_gate_module, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
 DECLARE_GEOM_CLASS(g_gate_class, g_gate);
Index: user/alc/PQ_LAUNDRY/sys/geom/journal/g_journal.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/geom/journal/g_journal.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/geom/journal/g_journal.c	(revision 306283)
@@ -1,3038 +1,3037 @@
 /*-
  * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/eventhandler.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/sched.h>
 #include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/sbuf.h>
 #ifdef GJ_MEMDEBUG
 #include <sys/stack.h>
 #include <sys/kdb.h>
 #endif
 #include <vm/vm.h>
 #include <vm/vm_kern.h>
 #include <geom/geom.h>
 
 #include <geom/journal/g_journal.h>
 
 FEATURE(geom_journal, "GEOM journaling support");
 
 /*
  * On-disk journal format:
  *
  * JH - Journal header
  * RH - Record header
  *
  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
  * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ...
  * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
  *
  */
 
 CTASSERT(sizeof(struct g_journal_header) <= 512);
 CTASSERT(sizeof(struct g_journal_record_header) <= 512);
 
 static MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data");
 static struct mtx g_journal_cache_mtx;
 MTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF);
 
 const struct g_journal_desc *g_journal_filesystems[] = {
 	&g_journal_ufs,
 	NULL
 };
 
 SYSCTL_DECL(_kern_geom);
 
 int g_journal_debug = 0;
 static u_int g_journal_switch_time = 10;
 static u_int g_journal_force_switch = 70;
 static u_int g_journal_parallel_flushes = 16;
 static u_int g_journal_parallel_copies = 16;
 static u_int g_journal_accept_immediately = 64;
 static u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES;
 static u_int g_journal_do_optimize = 1;
 
 static SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0,
     "GEOM_JOURNAL stuff");
 SYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RWTUN, &g_journal_debug, 0,
     "Debug level");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW,
     &g_journal_switch_time, 0, "Switch journals every N seconds");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW,
     &g_journal_force_switch, 0, "Force switch when journal is N% full");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW,
     &g_journal_parallel_flushes, 0,
     "Number of flush I/O requests to send in parallel");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW,
     &g_journal_accept_immediately, 0,
     "Number of I/O requests accepted immediately");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW,
     &g_journal_parallel_copies, 0,
     "Number of copy I/O requests to send in parallel");
 static int
 g_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	u_int entries;
 	int error;
 
 	entries = g_journal_record_entries;
 	error = sysctl_handle_int(oidp, &entries, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES)
 		return (EINVAL);
 	g_journal_record_entries = entries;
 	return (0);
 }
 SYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries,
     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I",
     "Maximum number of entires in one journal record");
 SYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW,
     &g_journal_do_optimize, 0, "Try to combine bios on flush and copy");
 
 static u_int g_journal_cache_used = 0;
 static u_int g_journal_cache_limit = 64 * 1024 * 1024;
 static u_int g_journal_cache_divisor = 2;
 static u_int g_journal_cache_switch = 90;
 static u_int g_journal_cache_misses = 0;
 static u_int g_journal_cache_alloc_failures = 0;
 static u_int g_journal_cache_low = 0;
 
 static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW, 0,
     "GEOM_JOURNAL cache");
 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD,
     &g_journal_cache_used, 0, "Number of allocated bytes");
 static int
 g_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	u_int limit;
 	int error;
 
 	limit = g_journal_cache_limit;
 	error = sysctl_handle_int(oidp, &limit, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	g_journal_cache_limit = limit;
 	g_journal_cache_low = (limit / 100) * g_journal_cache_switch;
 	return (0);
 }
 SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit,
     CTLTYPE_UINT | CTLFLAG_RWTUN, NULL, 0, g_journal_cache_limit_sysctl, "I",
     "Maximum number of allocated bytes");
 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN,
     &g_journal_cache_divisor, 0,
     "(kmem_size / kern.geom.journal.cache.divisor) == cache size");
 static int
 g_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	u_int cswitch;
 	int error;
 
 	cswitch = g_journal_cache_switch;
 	error = sysctl_handle_int(oidp, &cswitch, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (cswitch > 100)
 		return (EINVAL);
 	g_journal_cache_switch = cswitch;
 	g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch;
 	return (0);
 }
 SYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch,
     CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I",
     "Force switch when we hit this percent of cache use");
 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW,
     &g_journal_cache_misses, 0, "Number of cache misses");
 SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW,
     &g_journal_cache_alloc_failures, 0, "Memory allocation failures");
 
 static u_long g_journal_stats_bytes_skipped = 0;
 static u_long g_journal_stats_combined_ios = 0;
 static u_long g_journal_stats_switches = 0;
 static u_long g_journal_stats_wait_for_copy = 0;
 static u_long g_journal_stats_journal_full = 0;
 static u_long g_journal_stats_low_mem = 0;
 
 static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0,
     "GEOM_JOURNAL statistics");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW,
     &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW,
     &g_journal_stats_combined_ios, 0, "Number of combined I/O requests");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW,
     &g_journal_stats_switches, 0, "Number of journal switches");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW,
     &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW,
     &g_journal_stats_journal_full, 0,
     "Number of times journal was almost full.");
 SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW,
     &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called.");
 
 static g_taste_t g_journal_taste;
 static g_ctl_req_t g_journal_config;
 static g_dumpconf_t g_journal_dumpconf;
 static g_init_t g_journal_init;
 static g_fini_t g_journal_fini;
 
 struct g_class g_journal_class = {
 	.name = G_JOURNAL_CLASS_NAME,
 	.version = G_VERSION,
 	.taste = g_journal_taste,
 	.ctlreq = g_journal_config,
 	.dumpconf = g_journal_dumpconf,
 	.init = g_journal_init,
 	.fini = g_journal_fini
 };
 
 static int g_journal_destroy(struct g_journal_softc *sc);
 static void g_journal_metadata_update(struct g_journal_softc *sc);
 static void g_journal_switch_wait(struct g_journal_softc *sc);
 
 #define	GJ_SWITCHER_WORKING	0
 #define	GJ_SWITCHER_DIE		1
 #define	GJ_SWITCHER_DIED	2
 static int g_journal_switcher_state = GJ_SWITCHER_WORKING;
 static int g_journal_switcher_wokenup = 0;
 static int g_journal_sync_requested = 0;
 
 #ifdef GJ_MEMDEBUG
 struct meminfo {
 	size_t		mi_size;
 	struct stack	mi_stack;
 };
 #endif
 
 /*
  * We use our own malloc/realloc/free funtions, so we can collect statistics
  * and force journal switch when we're running out of cache.
  */
 static void *
 gj_malloc(size_t size, int flags)
 {
 	void *p;
 #ifdef GJ_MEMDEBUG
 	struct meminfo *mi;
 #endif
 
 	mtx_lock(&g_journal_cache_mtx);
 	if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup &&
 	    g_journal_cache_used + size > g_journal_cache_low) {
 		GJ_DEBUG(1, "No cache, waking up the switcher.");
 		g_journal_switcher_wokenup = 1;
 		wakeup(&g_journal_switcher_state);
 	}
 	if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 &&
 	    g_journal_cache_used + size > g_journal_cache_limit) {
 		mtx_unlock(&g_journal_cache_mtx);
 		g_journal_cache_alloc_failures++;
 		return (NULL);
 	}
 	g_journal_cache_used += size;
 	mtx_unlock(&g_journal_cache_mtx);
 	flags &= ~M_NOWAIT;
 #ifndef GJ_MEMDEBUG
 	p = malloc(size, M_JOURNAL, flags | M_WAITOK);
 #else
 	mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK);
 	p = (u_char *)mi + sizeof(*mi);
 	mi->mi_size = size;
 	stack_save(&mi->mi_stack);
 #endif
 	return (p);
 }
 
 static void
 gj_free(void *p, size_t size)
 {
 #ifdef GJ_MEMDEBUG
 	struct meminfo *mi;
 #endif
 
 	KASSERT(p != NULL, ("p=NULL"));
 	KASSERT(size > 0, ("size=0"));
 	mtx_lock(&g_journal_cache_mtx);
 	KASSERT(g_journal_cache_used >= size, ("Freeing too much?"));
 	g_journal_cache_used -= size;
 	mtx_unlock(&g_journal_cache_mtx);
 #ifdef GJ_MEMDEBUG
 	mi = p = (void *)((u_char *)p - sizeof(*mi));
 	if (mi->mi_size != size) {
 		printf("GJOURNAL: Size mismatch! %zu != %zu\n", size,
 		    mi->mi_size);
 		printf("GJOURNAL: Alloc backtrace:\n");
 		stack_print(&mi->mi_stack);
 		printf("GJOURNAL: Free backtrace:\n");
 		kdb_backtrace();
 	}
 #endif
 	free(p, M_JOURNAL);
 }
 
 static void *
 gj_realloc(void *p, size_t size, size_t oldsize)
 {
 	void *np;
 
 #ifndef GJ_MEMDEBUG
 	mtx_lock(&g_journal_cache_mtx);
 	g_journal_cache_used -= oldsize;
 	g_journal_cache_used += size;
 	mtx_unlock(&g_journal_cache_mtx);
 	np = realloc(p, size, M_JOURNAL, M_WAITOK);
 #else
 	np = gj_malloc(size, M_WAITOK);
 	bcopy(p, np, MIN(oldsize, size));
 	gj_free(p, oldsize);
 #endif
 	return (np);
 }
 
 static void
 g_journal_check_overflow(struct g_journal_softc *sc)
 {
 	off_t length, used;
 
 	if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset &&
 	     sc->sc_journal_offset >= sc->sc_inactive.jj_offset) ||
 	    (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset &&
 	     sc->sc_journal_offset >= sc->sc_inactive.jj_offset &&
 	     sc->sc_journal_offset < sc->sc_active.jj_offset)) {
 		panic("Journal overflow "
 		    "(id = %u joffset=%jd active=%jd inactive=%jd)",
 		    (unsigned)sc->sc_id,
 		    (intmax_t)sc->sc_journal_offset,
 		    (intmax_t)sc->sc_active.jj_offset,
 		    (intmax_t)sc->sc_inactive.jj_offset);
 	}
 	if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) {
 		length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset;
 		used = sc->sc_journal_offset - sc->sc_active.jj_offset;
 	} else {
 		length = sc->sc_jend - sc->sc_active.jj_offset;
 		length += sc->sc_inactive.jj_offset - sc->sc_jstart;
 		if (sc->sc_journal_offset >= sc->sc_active.jj_offset)
 			used = sc->sc_journal_offset - sc->sc_active.jj_offset;
 		else {
 			used = sc->sc_jend - sc->sc_active.jj_offset;
 			used += sc->sc_journal_offset - sc->sc_jstart;
 		}
 	}
 	/* Already woken up? */
 	if (g_journal_switcher_wokenup)
 		return;
 	/*
 	 * If the active journal takes more than g_journal_force_switch precent
 	 * of free journal space, we force journal switch.
 	 */
 	KASSERT(length > 0,
 	    ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd",
 	    (intmax_t)length, (intmax_t)used,
 	    (intmax_t)sc->sc_active.jj_offset,
 	    (intmax_t)sc->sc_inactive.jj_offset,
 	    (intmax_t)sc->sc_journal_offset));
 	if ((used * 100) / length > g_journal_force_switch) {
 		g_journal_stats_journal_full++;
 		GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.",
 		    sc->sc_name, (used * 100) / length);
 		mtx_lock(&g_journal_cache_mtx);
 		g_journal_switcher_wokenup = 1;
 		wakeup(&g_journal_switcher_state);
 		mtx_unlock(&g_journal_cache_mtx);
 	}
 }
 
 static void
 g_journal_orphan(struct g_consumer *cp)
 {
 	struct g_journal_softc *sc;
 	char name[256];
 	int error;
 
 	g_topology_assert();
 	sc = cp->geom->softc;
 	strlcpy(name, cp->provider->name, sizeof(name));
 	GJ_DEBUG(0, "Lost provider %s.", name);
 	if (sc == NULL)
 		return;
 	error = g_journal_destroy(sc);
 	if (error == 0)
 		GJ_DEBUG(0, "Journal %s destroyed.", name);
 	else {
 		GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). "
 		    "Destroy it manually after last close.", sc->sc_name,
 		    error);
 	}
 }
 
 static int
 g_journal_access(struct g_provider *pp, int acr, int acw, int ace)
 {
 	struct g_journal_softc *sc;
 	int dcr, dcw, dce;
 
 	g_topology_assert();
 	GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name,
 	    acr, acw, ace);
 
 	dcr = pp->acr + acr;
 	dcw = pp->acw + acw;
 	dce = pp->ace + ace;
 
 	sc = pp->geom->softc;
 	if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) {
 		if (acr <= 0 && acw <= 0 && ace <= 0)
 			return (0);
 		else
 			return (ENXIO);
 	}
 	if (pp->acw == 0 && dcw > 0) {
 		GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name);
 		sc->sc_flags &= ~GJF_DEVICE_CLEAN;
 		g_topology_unlock();
 		g_journal_metadata_update(sc);
 		g_topology_lock();
 	} /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) {
 		GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
 		sc->sc_flags |= GJF_DEVICE_CLEAN;
 		g_topology_unlock();
 		g_journal_metadata_update(sc);
 		g_topology_lock();
 	} */
 	return (0);
 }
 
 static void
 g_journal_header_encode(struct g_journal_header *hdr, u_char *data)
 {
 
 	bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC));
 	data += sizeof(GJ_HEADER_MAGIC);
 	le32enc(data, hdr->jh_journal_id);
 	data += 4;
 	le32enc(data, hdr->jh_journal_next_id);
 }
 
 static int
 g_journal_header_decode(const u_char *data, struct g_journal_header *hdr)
 {
 
 	bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic));
 	data += sizeof(hdr->jh_magic);
 	if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0)
 		return (EINVAL);
 	hdr->jh_journal_id = le32dec(data);
 	data += 4;
 	hdr->jh_journal_next_id = le32dec(data);
 	return (0);
 }
 
 static void
 g_journal_flush_cache(struct g_journal_softc *sc)
 {
 	struct bintime bt;
 	int error;
 
 	if (sc->sc_bio_flush == 0)
 		return;
 	GJ_TIMER_START(1, &bt);
 	if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) {
 		error = g_io_flush(sc->sc_jconsumer);
 		GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
 		    sc->sc_jconsumer->provider->name, error);
 	}
 	if (sc->sc_bio_flush & GJ_FLUSH_DATA) {
 		/*
 		 * TODO: This could be called in parallel with the
 		 *       previous call.
 		 */
 		error = g_io_flush(sc->sc_dconsumer);
 		GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
 		    sc->sc_dconsumer->provider->name, error);
 	}
 	GJ_TIMER_STOP(1, &bt, "Cache flush time");
 }
 
 static int
 g_journal_write_header(struct g_journal_softc *sc)
 {
 	struct g_journal_header hdr;
 	struct g_consumer *cp;
 	u_char *buf;
 	int error;
 
 	cp = sc->sc_jconsumer;
 	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
 
 	strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic));
 	hdr.jh_journal_id = sc->sc_journal_id;
 	hdr.jh_journal_next_id = sc->sc_journal_next_id;
 	g_journal_header_encode(&hdr, buf);
 	error = g_write_data(cp, sc->sc_journal_offset, buf,
 	    cp->provider->sectorsize);
 	/* if (error == 0) */
 	sc->sc_journal_offset += cp->provider->sectorsize;
 
 	gj_free(buf, cp->provider->sectorsize);
 	return (error);
 }
 
 /*
  * Every journal record has a header and data following it.
  * Functions below are used to decode the header before storing it to
  * little endian and to encode it after reading to system endianness.
  */
 static void
 g_journal_record_header_encode(struct g_journal_record_header *hdr,
     u_char *data)
 {
 	struct g_journal_entry *ent;
 	u_int i;
 
 	bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
 	data += sizeof(GJ_RECORD_HEADER_MAGIC);
 	le32enc(data, hdr->jrh_journal_id);
 	data += 8;
 	le16enc(data, hdr->jrh_nentries);
 	data += 2;
 	bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
 	data += 8;
 	for (i = 0; i < hdr->jrh_nentries; i++) {
 		ent = &hdr->jrh_entries[i];
 		le64enc(data, ent->je_joffset);
 		data += 8;
 		le64enc(data, ent->je_offset);
 		data += 8;
 		le64enc(data, ent->je_length);
 		data += 8;
 	}
 }
 
 static int
 g_journal_record_header_decode(const u_char *data,
     struct g_journal_record_header *hdr)
 {
 	struct g_journal_entry *ent;
 	u_int i;
 
 	bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
 	data += sizeof(hdr->jrh_magic);
 	if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
 		return (EINVAL);
 	hdr->jrh_journal_id = le32dec(data);
 	data += 8;
 	hdr->jrh_nentries = le16dec(data);
 	data += 2;
 	if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
 		return (EINVAL);
 	bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
 	data += 8;
 	for (i = 0; i < hdr->jrh_nentries; i++) {
 		ent = &hdr->jrh_entries[i];
 		ent->je_joffset = le64dec(data);
 		data += 8;
 		ent->je_offset = le64dec(data);
 		data += 8;
 		ent->je_length = le64dec(data);
 		data += 8;
 	}
 	return (0);
 }
 
 /*
  * Function reads metadata from a provider (via the given consumer), decodes
  * it to system endianness and verifies its correctness.
  */
 static int
 g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	/* Metadata is stored in last sector. */
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
 		    cp->provider->name, error);
 		return (error);
 	}
 
 	/* Decode metadata. */
 	error = journal_metadata_decode(buf, md);
 	g_free(buf);
 	/* Is this is gjournal provider at all? */
 	if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
 		return (EINVAL);
 	/*
 	 * Are we able to handle this version of metadata?
 	 * We only maintain backward compatibility.
 	 */
 	if (md->md_version > G_JOURNAL_VERSION) {
 		GJ_DEBUG(0,
 		    "Kernel module is too old to handle metadata from %s.",
 		    cp->provider->name);
 		return (EINVAL);
 	}
 	/* Is checksum correct? */
 	if (error != 0) {
 		GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
 		    cp->provider->name);
 		return (error);
 	}
 	return (0);
 }
 
 /*
  * Two functions below are responsible for updating metadata.
  * Only metadata on the data provider is updated (we need to update
  * information about active journal in there).
  */
 static void
 g_journal_metadata_done(struct bio *bp)
 {
 
 	/*
 	 * There is not much we can do on error except informing about it.
 	 */
 	if (bp->bio_error != 0) {
 		GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
 		    bp->bio_error);
 	} else {
 		GJ_LOGREQ(2, bp, "Metadata updated.");
 	}
 	gj_free(bp->bio_data, bp->bio_length);
 	g_destroy_bio(bp);
 }
 
 static void
 g_journal_metadata_update(struct g_journal_softc *sc)
 {
 	struct g_journal_metadata md;
 	struct g_consumer *cp;
 	struct bio *bp;
 	u_char *sector;
 
 	cp = sc->sc_dconsumer;
 	sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
 	strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
 	md.md_version = G_JOURNAL_VERSION;
 	md.md_id = sc->sc_id;
 	md.md_type = sc->sc_orig_type;
 	md.md_jstart = sc->sc_jstart;
 	md.md_jend = sc->sc_jend;
 	md.md_joffset = sc->sc_inactive.jj_offset;
 	md.md_jid = sc->sc_journal_previous_id;
 	md.md_flags = 0;
 	if (sc->sc_flags & GJF_DEVICE_CLEAN)
 		md.md_flags |= GJ_FLAG_CLEAN;
 
 	if (sc->sc_flags & GJF_DEVICE_HARDCODED)
 		strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
 	else
 		bzero(md.md_provider, sizeof(md.md_provider));
 	md.md_provsize = cp->provider->mediasize;
 	journal_metadata_encode(&md, sector);
 
 	/*
 	 * Flush the cache, so we know all data are on disk.
 	 * We write here informations like "journal is consistent", so we need
 	 * to be sure it is. Without BIO_FLUSH here, we can end up in situation
 	 * where metadata is stored on disk, but not all data.
 	 */
 	g_journal_flush_cache(sc);
 
 	bp = g_alloc_bio();
 	bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
 	bp->bio_length = cp->provider->sectorsize;
 	bp->bio_data = sector;
 	bp->bio_cmd = BIO_WRITE;
 	if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
 		bp->bio_done = g_journal_metadata_done;
 		g_io_request(bp, cp);
 	} else {
 		bp->bio_done = NULL;
 		g_io_request(bp, cp);
 		biowait(bp, "gjmdu");
 		g_journal_metadata_done(bp);
 	}
 
 	/*
 	 * Be sure metadata reached the disk.
 	 */
 	g_journal_flush_cache(sc);
 }
 
 /*
  * This is where the I/O request comes from the GEOM.
  */
 static void
 g_journal_start(struct bio *bp)
 {
 	struct g_journal_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	GJ_LOGREQ(3, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 		mtx_lock(&sc->sc_mtx);
 		bioq_insert_tail(&sc->sc_regular_queue, bp);
 		wakeup(sc);
 		mtx_unlock(&sc->sc_mtx);
 		return;
 	case BIO_GETATTR:
 		if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) {
 			strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length);
 			bp->bio_completed = strlen(bp->bio_to->name) + 1;
 			g_io_deliver(bp, 0);
 			return;
 		}
 		/* FALLTHROUGH */
 	case BIO_DELETE:
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 }
 
 static void
 g_journal_std_done(struct bio *bp)
 {
 	struct g_journal_softc *sc;
 
 	sc = bp->bio_from->geom->softc;
 	mtx_lock(&sc->sc_mtx);
 	bioq_insert_tail(&sc->sc_back_queue, bp);
 	wakeup(sc);
 	mtx_unlock(&sc->sc_mtx);
 }
 
 static struct bio *
 g_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data,
     int flags)
 {
 	struct bio *bp;
 
 	bp = g_alloc_bio();
 	bp->bio_offset = start;
 	bp->bio_joffset = joffset;
 	bp->bio_length = end - start;
 	bp->bio_cmd = BIO_WRITE;
 	bp->bio_done = g_journal_std_done;
 	if (data == NULL)
 		bp->bio_data = NULL;
 	else {
 		bp->bio_data = gj_malloc(bp->bio_length, flags);
 		if (bp->bio_data != NULL)
 			bcopy(data, bp->bio_data, bp->bio_length);
 	}
 	return (bp);
 }
 
 #define	g_journal_insert_bio(head, bp, flags)				\
 	g_journal_insert((head), (bp)->bio_offset,			\
 		(bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset,	\
 		(bp)->bio_data, flags)
 /*
  * The function below does a lot more than just inserting bio to the queue.
  * It keeps the queue sorted by offset and ensures that there are no doubled
  * data (it combines bios where ranges overlap).
  *
  * The function returns the number of bios inserted (as bio can be splitted).
  */
 static int
 g_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset,
     u_char *data, int flags)
 {
 	struct bio *nbp, *cbp, *pbp;
 	off_t cstart, cend;
 	u_char *tmpdata;
 	int n;
 
 	GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend,
 	    joffset);
 	n = 0;
 	pbp = NULL;
 	GJQ_FOREACH(*head, cbp) {
 		cstart = cbp->bio_offset;
 		cend = cbp->bio_offset + cbp->bio_length;
 
 		if (nstart >= cend) {
 			/*
 			 *  +-------------+
 			 *  |             |
 			 *  |   current   |  +-------------+
 			 *  |     bio     |  |             |
 			 *  |             |  |     new     |
 			 *  +-------------+  |     bio     |
 			 *                   |             |
 			 *                   +-------------+
 			 */
 			GJ_DEBUG(3, "INSERT(%p): 1", *head);
 		} else if (nend <= cstart) {
 			/*
 			 *                   +-------------+
 			 *                   |             |
 			 *  +-------------+  |   current   |
 			 *  |             |  |     bio     |
 			 *  |     new     |  |             |
 			 *  |     bio     |  +-------------+
 			 *  |             |
 			 *  +-------------+
 			 */
 			nbp = g_journal_new_bio(nstart, nend, joffset, data,
 			    flags);
 			if (pbp == NULL)
 				*head = nbp;
 			else
 				pbp->bio_next = nbp;
 			nbp->bio_next = cbp;
 			n++;
 			GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp,
 			    pbp);
 			goto end;
 		} else if (nstart <= cstart && nend >= cend) {
 			/*
 			 *      +-------------+      +-------------+
 			 *      | current bio |      | current bio |
 			 *  +---+-------------+---+  +-------------+---+
 			 *  |   |             |   |  |             |   |
 			 *  |   |             |   |  |             |   |
 			 *  |   +-------------+   |  +-------------+   |
 			 *  |       new bio       |  |     new bio     |
 			 *  +---------------------+  +-----------------+
 			 *
 			 *      +-------------+  +-------------+
 			 *      | current bio |  | current bio |
 			 *  +---+-------------+  +-------------+
 			 *  |   |             |  |             |
 			 *  |   |             |  |             |
 			 *  |   +-------------+  +-------------+
 			 *  |     new bio     |  |   new bio   |
 			 *  +-----------------+  +-------------+
 			 */
 			g_journal_stats_bytes_skipped += cbp->bio_length;
 			cbp->bio_offset = nstart;
 			cbp->bio_joffset = joffset;
 			cbp->bio_length = cend - nstart;
 			if (cbp->bio_data != NULL) {
 				gj_free(cbp->bio_data, cend - cstart);
 				cbp->bio_data = NULL;
 			}
 			if (data != NULL) {
 				cbp->bio_data = gj_malloc(cbp->bio_length,
 				    flags);
 				if (cbp->bio_data != NULL) {
 					bcopy(data, cbp->bio_data,
 					    cbp->bio_length);
 				}
 				data += cend - nstart;
 			}
 			joffset += cend - nstart;
 			nstart = cend;
 			GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp);
 		} else if (nstart > cstart && nend >= cend) {
 			/*
 			 *  +-----------------+  +-------------+
 			 *  |   current bio   |  | current bio |
 			 *  |   +-------------+  |   +---------+---+
 			 *  |   |             |  |   |         |   |
 			 *  |   |             |  |   |         |   |
 			 *  +---+-------------+  +---+---------+   |
 			 *      |   new bio   |      |   new bio   |
 			 *      +-------------+      +-------------+
 			 */
 			g_journal_stats_bytes_skipped += cend - nstart;
 			nbp = g_journal_new_bio(nstart, cend, joffset, data,
 			    flags);
 			nbp->bio_next = cbp->bio_next;
 			cbp->bio_next = nbp;
 			cbp->bio_length = nstart - cstart;
 			if (cbp->bio_data != NULL) {
 				cbp->bio_data = gj_realloc(cbp->bio_data,
 				    cbp->bio_length, cend - cstart);
 			}
 			if (data != NULL)
 				data += cend - nstart;
 			joffset += cend - nstart;
 			nstart = cend;
 			n++;
 			GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp);
 		} else if (nstart > cstart && nend < cend) {
 			/*
 			 *  +---------------------+
 			 *  |     current bio     |
 			 *  |   +-------------+   |
 			 *  |   |             |   |
 			 *  |   |             |   |
 			 *  +---+-------------+---+
 			 *      |   new bio   |
 			 *      +-------------+
 			 */
 			g_journal_stats_bytes_skipped += nend - nstart;
 			nbp = g_journal_new_bio(nstart, nend, joffset, data,
 			    flags);
 			nbp->bio_next = cbp->bio_next;
 			cbp->bio_next = nbp;
 			if (cbp->bio_data == NULL)
 				tmpdata = NULL;
 			else
 				tmpdata = cbp->bio_data + nend - cstart;
 			nbp = g_journal_new_bio(nend, cend,
 			    cbp->bio_joffset + nend - cstart, tmpdata, flags);
 			nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next;
 			((struct bio *)cbp->bio_next)->bio_next = nbp;
 			cbp->bio_length = nstart - cstart;
 			if (cbp->bio_data != NULL) {
 				cbp->bio_data = gj_realloc(cbp->bio_data,
 				    cbp->bio_length, cend - cstart);
 			}
 			n += 2;
 			GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp);
 			goto end;
 		} else if (nstart <= cstart && nend < cend) {
 			/*
 			 *  +-----------------+      +-------------+
 			 *  |   current bio   |      | current bio |
 			 *  +-------------+   |  +---+---------+   |
 			 *  |             |   |  |   |         |   |
 			 *  |             |   |  |   |         |   |
 			 *  +-------------+---+  |   +---------+---+
 			 *  |   new bio   |      |   new bio   |
 			 *  +-------------+      +-------------+
 			 */
 			g_journal_stats_bytes_skipped += nend - nstart;
 			nbp = g_journal_new_bio(nstart, nend, joffset, data,
 			    flags);
 			if (pbp == NULL)
 				*head = nbp;
 			else
 				pbp->bio_next = nbp;
 			nbp->bio_next = cbp;
 			cbp->bio_offset = nend;
 			cbp->bio_length = cend - nend;
 			cbp->bio_joffset += nend - cstart;
 			tmpdata = cbp->bio_data;
 			if (tmpdata != NULL) {
 				cbp->bio_data = gj_malloc(cbp->bio_length,
 				    flags);
 				if (cbp->bio_data != NULL) {
 					bcopy(tmpdata + nend - cstart,
 					    cbp->bio_data, cbp->bio_length);
 				}
 				gj_free(tmpdata, cend - cstart);
 			}
 			n++;
 			GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp);
 			goto end;
 		}
 		if (nstart == nend)
 			goto end;
 		pbp = cbp;
 	}
 	nbp = g_journal_new_bio(nstart, nend, joffset, data, flags);
 	if (pbp == NULL)
 		*head = nbp;
 	else
 		pbp->bio_next = nbp;
 	nbp->bio_next = NULL;
 	n++;
 	GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp);
 end:
 	if (g_journal_debug >= 3) {
 		GJQ_FOREACH(*head, cbp) {
 			GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp,
 			    (intmax_t)cbp->bio_offset,
 			    (intmax_t)cbp->bio_length,
 			    (intmax_t)cbp->bio_joffset, cbp->bio_data);
 		}
 		GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n);
 	}
 	return (n);
 }
 
 /*
  * The function combines neighbour bios trying to squeeze as much data as
  * possible into one bio.
  *
  * The function returns the number of bios combined (negative value).
  */
 static int
 g_journal_optimize(struct bio *head)
 {
 	struct bio *cbp, *pbp;
 	int n;
 
 	n = 0;
 	pbp = NULL;
 	GJQ_FOREACH(head, cbp) {
 		/* Skip bios which has to be read first. */
 		if (cbp->bio_data == NULL) {
 			pbp = NULL;
 			continue;
 		}
 		/* There is no previous bio yet. */
 		if (pbp == NULL) {
 			pbp = cbp;
 			continue;
 		}
 		/* Is this a neighbour bio? */
 		if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) {
 			/* Be sure that bios queue is sorted. */
 			KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset,
 			    ("poffset=%jd plength=%jd coffset=%jd",
 			    (intmax_t)pbp->bio_offset,
 			    (intmax_t)pbp->bio_length,
 			    (intmax_t)cbp->bio_offset));
 			pbp = cbp;
 			continue;
 		}
 		/* Be sure we don't end up with too big bio. */
 		if (pbp->bio_length + cbp->bio_length > MAXPHYS) {
 			pbp = cbp;
 			continue;
 		}
 		/* Ok, we can join bios. */
 		GJ_LOGREQ(4, pbp, "Join: ");
 		GJ_LOGREQ(4, cbp, "and: ");
 		pbp->bio_data = gj_realloc(pbp->bio_data,
 		    pbp->bio_length + cbp->bio_length, pbp->bio_length);
 		bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length,
 		    cbp->bio_length);
 		gj_free(cbp->bio_data, cbp->bio_length);
 		pbp->bio_length += cbp->bio_length;
 		pbp->bio_next = cbp->bio_next;
 		g_destroy_bio(cbp);
 		cbp = pbp;
 		g_journal_stats_combined_ios++;
 		n--;
 		GJ_LOGREQ(4, pbp, "Got: ");
 	}
 	return (n);
 }
 
 /*
  * TODO: Update comment.
  * These are functions responsible for copying one portion of data from journal
  * to the destination provider.
  * The order goes like this:
  * 1. Read the header, which contains informations about data blocks
  *    following it.
  * 2. Read the data blocks from the journal.
  * 3. Write the data blocks on the data provider.
  *
  * g_journal_copy_start()
  * g_journal_copy_done() - got finished write request, logs potential errors.
  */
 
 /*
  * When there is no data in cache, this function is used to read it.
  */
 static void
 g_journal_read_first(struct g_journal_softc *sc, struct bio *bp)
 {
 	struct bio *cbp;
 
 	/*
 	 * We were short in memory, so data was freed.
 	 * In that case we need to read it back from journal.
 	 */
 	cbp = g_alloc_bio();
 	cbp->bio_cflags = bp->bio_cflags;
 	cbp->bio_parent = bp;
 	cbp->bio_offset = bp->bio_joffset;
 	cbp->bio_length = bp->bio_length;
 	cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK);
 	cbp->bio_cmd = BIO_READ;
 	cbp->bio_done = g_journal_std_done;
 	GJ_LOGREQ(4, cbp, "READ FIRST");
 	g_io_request(cbp, sc->sc_jconsumer);
 	g_journal_cache_misses++;
 }
 
 static void
 g_journal_copy_send(struct g_journal_softc *sc)
 {
 	struct bio *bioq, *bp, *lbp;
 
 	bioq = lbp = NULL;
 	mtx_lock(&sc->sc_mtx);
 	for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) {
 		bp = GJQ_FIRST(sc->sc_inactive.jj_queue);
 		if (bp == NULL)
 			break;
 		GJQ_REMOVE(sc->sc_inactive.jj_queue, bp);
 		sc->sc_copy_in_progress++;
 		GJQ_INSERT_AFTER(bioq, bp, lbp);
 		lbp = bp;
 	}
 	mtx_unlock(&sc->sc_mtx);
 	if (g_journal_do_optimize)
 		sc->sc_copy_in_progress += g_journal_optimize(bioq);
 	while ((bp = GJQ_FIRST(bioq)) != NULL) {
 		GJQ_REMOVE(bioq, bp);
 		GJQ_INSERT_HEAD(sc->sc_copy_queue, bp);
 		bp->bio_cflags = GJ_BIO_COPY;
 		if (bp->bio_data == NULL)
 			g_journal_read_first(sc, bp);
 		else {
 			bp->bio_joffset = 0;
 			GJ_LOGREQ(4, bp, "SEND");
 			g_io_request(bp, sc->sc_dconsumer);
 		}
 	}
 }
 
 static void
 g_journal_copy_start(struct g_journal_softc *sc)
 {
 
 	/*
 	 * Remember in metadata that we're starting to copy journaled data
 	 * to the data provider.
 	 * In case of power failure, we will copy these data once again on boot.
 	 */
 	if (!sc->sc_journal_copying) {
 		sc->sc_journal_copying = 1;
 		GJ_DEBUG(1, "Starting copy of journal.");
 		g_journal_metadata_update(sc);
 	}
 	g_journal_copy_send(sc);
 }
 
 /*
  * Data block has been read from the journal provider.
  */
 static int
 g_journal_copy_read_done(struct bio *bp)
 {
 	struct g_journal_softc *sc;
 	struct g_consumer *cp;
 	struct bio *pbp;
 
 	KASSERT(bp->bio_cflags == GJ_BIO_COPY,
 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
 
 	sc = bp->bio_from->geom->softc;
 	pbp = bp->bio_parent;
 
 	if (bp->bio_error != 0) {
 		GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
 		    bp->bio_to->name, bp->bio_error);
 		/*
 		 * We will not be able to deliver WRITE request as well.
 		 */
 		gj_free(bp->bio_data, bp->bio_length);
 		g_destroy_bio(pbp);
 		g_destroy_bio(bp);
 		sc->sc_copy_in_progress--;
 		return (1);
 	}
 	pbp->bio_data = bp->bio_data;
 	cp = sc->sc_dconsumer;
 	g_io_request(pbp, cp);
 	GJ_LOGREQ(4, bp, "READ DONE");
 	g_destroy_bio(bp);
 	return (0);
 }
 
 /*
  * Data block has been written to the data provider.
  */
 static void
 g_journal_copy_write_done(struct bio *bp)
 {
 	struct g_journal_softc *sc;
 
 	KASSERT(bp->bio_cflags == GJ_BIO_COPY,
 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
 
 	sc = bp->bio_from->geom->softc;
 	sc->sc_copy_in_progress--;
 
 	if (bp->bio_error != 0) {
 		GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)",
 		    bp->bio_error);
 	}
 	GJQ_REMOVE(sc->sc_copy_queue, bp);
 	gj_free(bp->bio_data, bp->bio_length);
 	GJ_LOGREQ(4, bp, "DONE");
 	g_destroy_bio(bp);
 
 	if (sc->sc_copy_in_progress == 0) {
 		/*
 		 * This was the last write request for this journal.
 		 */
 		GJ_DEBUG(1, "Data has been copied.");
 		sc->sc_journal_copying = 0;
 	}
 }
 
 static void g_journal_flush_done(struct bio *bp);
 
 /*
  * Flush one record onto active journal provider.
  */
 static void
 g_journal_flush(struct g_journal_softc *sc)
 {
 	struct g_journal_record_header hdr;
 	struct g_journal_entry *ent;
 	struct g_provider *pp;
 	struct bio **bioq;
 	struct bio *bp, *fbp, *pbp;
 	off_t joffset, size;
 	u_char *data, hash[16];
 	MD5_CTX ctx;
 	u_int i;
 
 	if (sc->sc_current_count == 0)
 		return;
 
 	size = 0;
 	pp = sc->sc_jprovider;
 	GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
 	joffset = sc->sc_journal_offset;
 
 	GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.",
 	    sc->sc_current_count, pp->name, (intmax_t)joffset);
 
 	/*
 	 * Store 'journal id', so we know to which journal this record belongs.
 	 */
 	hdr.jrh_journal_id = sc->sc_journal_id;
 	/* Could be less than g_journal_record_entries if called due timeout. */
 	hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries);
 	strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic));
 
 	bioq = &sc->sc_active.jj_queue;
 	pbp = sc->sc_flush_queue;
 
 	fbp = g_alloc_bio();
 	fbp->bio_parent = NULL;
 	fbp->bio_cflags = GJ_BIO_JOURNAL;
 	fbp->bio_offset = -1;
 	fbp->bio_joffset = joffset;
 	fbp->bio_length = pp->sectorsize;
 	fbp->bio_cmd = BIO_WRITE;
 	fbp->bio_done = g_journal_std_done;
 	GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp);
 	pbp = fbp;
 	fbp->bio_to = pp;
 	GJ_LOGREQ(4, fbp, "FLUSH_OUT");
 	joffset += pp->sectorsize;
 	sc->sc_flush_count++;
 	if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 		MD5Init(&ctx);
 
 	for (i = 0; i < hdr.jrh_nentries; i++) {
 		bp = sc->sc_current_queue;
 		KASSERT(bp != NULL, ("NULL bp"));
 		bp->bio_to = pp;
 		GJ_LOGREQ(4, bp, "FLUSHED");
 		sc->sc_current_queue = bp->bio_next;
 		bp->bio_next = NULL;
 		sc->sc_current_count--;
 
 		/* Add to the header. */
 		ent = &hdr.jrh_entries[i];
 		ent->je_offset = bp->bio_offset;
 		ent->je_joffset = joffset;
 		ent->je_length = bp->bio_length;
 		size += ent->je_length;
 
 		data = bp->bio_data;
 		if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 			MD5Update(&ctx, data, ent->je_length);
 		g_reset_bio(bp);
 		bp->bio_cflags = GJ_BIO_JOURNAL;
 		bp->bio_offset = ent->je_offset;
 		bp->bio_joffset = ent->je_joffset;
 		bp->bio_length = ent->je_length;
 		bp->bio_data = data;
 		bp->bio_cmd = BIO_WRITE;
 		bp->bio_done = g_journal_std_done;
 		GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp);
 		pbp = bp;
 		bp->bio_to = pp;
 		GJ_LOGREQ(4, bp, "FLUSH_OUT");
 		joffset += bp->bio_length;
 		sc->sc_flush_count++;
 
 		/*
 		 * Add request to the active sc_journal_queue queue.
 		 * This is our cache. After journal switch we don't have to
 		 * read the data from the inactive journal, because we keep
 		 * it in memory.
 		 */
 		g_journal_insert(bioq, ent->je_offset,
 		    ent->je_offset + ent->je_length, ent->je_joffset, data,
 		    M_NOWAIT);
 	}
 
 	/*
 	 * After all requests, store valid header.
 	 */
 	data = gj_malloc(pp->sectorsize, M_WAITOK);
 	if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 		MD5Final(hash, &ctx);
 		bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum));
 	}
 	g_journal_record_header_encode(&hdr, data);
 	fbp->bio_data = data;
 
 	sc->sc_journal_offset = joffset;
 
 	g_journal_check_overflow(sc);
 }
 
 /*
  * Flush request finished.
  */
 static void
 g_journal_flush_done(struct bio *bp)
 {
 	struct g_journal_softc *sc;
 	struct g_consumer *cp;
 
 	KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL,
 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL));
 
 	cp = bp->bio_from;
 	sc = cp->geom->softc;
 	sc->sc_flush_in_progress--;
 
 	if (bp->bio_error != 0) {
 		GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)",
 		    bp->bio_error);
 	}
 	gj_free(bp->bio_data, bp->bio_length);
 	GJ_LOGREQ(4, bp, "DONE");
 	g_destroy_bio(bp);
 }
 
 static void g_journal_release_delayed(struct g_journal_softc *sc);
 
 static void
 g_journal_flush_send(struct g_journal_softc *sc)
 {
 	struct g_consumer *cp;
 	struct bio *bioq, *bp, *lbp;
 
 	cp = sc->sc_jconsumer;
 	bioq = lbp = NULL;
 	while (sc->sc_flush_in_progress < g_journal_parallel_flushes) {
 		/* Send one flush requests to the active journal. */
 		bp = GJQ_FIRST(sc->sc_flush_queue);
 		if (bp != NULL) {
 			GJQ_REMOVE(sc->sc_flush_queue, bp);
 			sc->sc_flush_count--;
 			bp->bio_offset = bp->bio_joffset;
 			bp->bio_joffset = 0;
 			sc->sc_flush_in_progress++;
 			GJQ_INSERT_AFTER(bioq, bp, lbp);
 			lbp = bp;
 		}
 		/* Try to release delayed requests. */
 		g_journal_release_delayed(sc);
 		/* If there are no requests to flush, leave. */
 		if (GJQ_FIRST(sc->sc_flush_queue) == NULL)
 			break;
 	}
 	if (g_journal_do_optimize)
 		sc->sc_flush_in_progress += g_journal_optimize(bioq);
 	while ((bp = GJQ_FIRST(bioq)) != NULL) {
 		GJQ_REMOVE(bioq, bp);
 		GJ_LOGREQ(3, bp, "Flush request send");
 		g_io_request(bp, cp);
 	}
 }
 
 static void
 g_journal_add_current(struct g_journal_softc *sc, struct bio *bp)
 {
 	int n;
 
 	GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count);
 	n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK);
 	sc->sc_current_count += n;
 	n = g_journal_optimize(sc->sc_current_queue);
 	sc->sc_current_count += n;
 	/*
 	 * For requests which are added to the current queue we deliver
 	 * response immediately.
 	 */
 	bp->bio_completed = bp->bio_length;
 	g_io_deliver(bp, 0);
 	if (sc->sc_current_count >= g_journal_record_entries) {
 		/*
 		 * Let's flush one record onto active journal provider.
 		 */
 		g_journal_flush(sc);
 	}
 }
 
 static void
 g_journal_release_delayed(struct g_journal_softc *sc)
 {
 	struct bio *bp;
 
 	for (;;) {
 		/* The flush queue is full, exit. */
 		if (sc->sc_flush_count >= g_journal_accept_immediately)
 			return;
 		bp = bioq_takefirst(&sc->sc_delayed_queue);
 		if (bp == NULL)
 			return;
 		sc->sc_delayed_count--;
 		g_journal_add_current(sc, bp);
 	}
 }
 
 /*
  * Add I/O request to the current queue. If we have enough requests for one
  * journal record we flush them onto active journal provider.
  */
 static void
 g_journal_add_request(struct g_journal_softc *sc, struct bio *bp)
 {
 
 	/*
 	 * The flush queue is full, we need to delay the request.
 	 */
 	if (sc->sc_delayed_count > 0 ||
 	    sc->sc_flush_count >= g_journal_accept_immediately) {
 		GJ_LOGREQ(4, bp, "DELAYED");
 		bioq_insert_tail(&sc->sc_delayed_queue, bp);
 		sc->sc_delayed_count++;
 		return;
 	}
 
 	KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue),
 	    ("DELAYED queue not empty."));
 	g_journal_add_current(sc, bp);
 }
 
 static void g_journal_read_done(struct bio *bp);
 
 /*
  * Try to find requested data in cache.
  */
 static struct bio *
 g_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart,
     off_t oend)
 {
 	off_t cstart, cend;
 	struct bio *bp;
 
 	GJQ_FOREACH(head, bp) {
 		if (bp->bio_offset == -1)
 			continue;
 		cstart = MAX(ostart, bp->bio_offset);
 		cend = MIN(oend, bp->bio_offset + bp->bio_length);
 		if (cend <= ostart)
 			continue;
 		else if (cstart >= oend) {
 			if (!sorted)
 				continue;
 			else {
 				bp = NULL;
 				break;
 			}
 		}
 		if (bp->bio_data == NULL)
 			break;
 		GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
 		    bp);
 		bcopy(bp->bio_data + cstart - bp->bio_offset,
 		    pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
 		pbp->bio_completed += cend - cstart;
 		if (pbp->bio_completed == pbp->bio_length) {
 			/*
 			 * Cool, the whole request was in cache, deliver happy
 			 * message.
 			 */
 			g_io_deliver(pbp, 0);
 			return (pbp);
 		}
 		break;
 	}
 	return (bp);
 }
 
 /*
  * Try to find requested data in cache.
  */
 static struct bio *
 g_journal_read_queue_find(struct bio_queue *head, struct bio *pbp, off_t ostart,
     off_t oend)
 {
 	off_t cstart, cend;
 	struct bio *bp;
 
 	TAILQ_FOREACH(bp, head, bio_queue) {
 		cstart = MAX(ostart, bp->bio_offset);
 		cend = MIN(oend, bp->bio_offset + bp->bio_length);
 		if (cend <= ostart)
 			continue;
 		else if (cstart >= oend)
 			continue;
 		KASSERT(bp->bio_data != NULL,
 		    ("%s: bio_data == NULL", __func__));
 		GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
 		    bp);
 		bcopy(bp->bio_data + cstart - bp->bio_offset,
 		    pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
 		pbp->bio_completed += cend - cstart;
 		if (pbp->bio_completed == pbp->bio_length) {
 			/*
 			 * Cool, the whole request was in cache, deliver happy
 			 * message.
 			 */
 			g_io_deliver(pbp, 0);
 			return (pbp);
 		}
 		break;
 	}
 	return (bp);
 }
 
 /*
  * This function is used for colecting data on read.
  * The complexity is because parts of the data can be stored in four different
  * places:
  * - in delayed requests
  * - in memory - the data not yet send to the active journal provider
  * - in requests which are going to be sent to the active journal
  * - in the active journal
  * - in the inactive journal
  * - in the data provider
  */
 static void
 g_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart,
     off_t oend)
 {
 	struct bio *bp, *nbp, *head;
 	off_t cstart, cend;
 	u_int i, sorted = 0;
 
 	GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend);
 
 	cstart = cend = -1;
 	bp = NULL;
 	head = NULL;
 	for (i = 0; i <= 5; i++) {
 		switch (i) {
 		case 0:	/* Delayed requests. */
 			head = NULL;
 			sorted = 0;
 			break;
 		case 1:	/* Not-yet-send data. */
 			head = sc->sc_current_queue;
 			sorted = 1;
 			break;
 		case 2:	/* In-flight to the active journal. */
 			head = sc->sc_flush_queue;
 			sorted = 0;
 			break;
 		case 3:	/* Active journal. */
 			head = sc->sc_active.jj_queue;
 			sorted = 1;
 			break;
 		case 4:	/* Inactive journal. */
 			/*
 			 * XXX: Here could be a race with g_journal_lowmem().
 			 */
 			head = sc->sc_inactive.jj_queue;
 			sorted = 1;
 			break;
 		case 5:	/* In-flight to the data provider. */
 			head = sc->sc_copy_queue;
 			sorted = 0;
 			break;
 		default:
 			panic("gjournal %s: i=%d", __func__, i);
 		}
 		if (i == 0)
 			bp = g_journal_read_queue_find(&sc->sc_delayed_queue.queue, pbp, ostart, oend);
 		else
 			bp = g_journal_read_find(head, sorted, pbp, ostart, oend);
 		if (bp == pbp) { /* Got the whole request. */
 			GJ_DEBUG(2, "Got the whole request from %u.", i);
 			return;
 		} else if (bp != NULL) {
 			cstart = MAX(ostart, bp->bio_offset);
 			cend = MIN(oend, bp->bio_offset + bp->bio_length);
 			GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).",
 			    i, (intmax_t)cstart, (intmax_t)cend);
 			break;
 		}
 	}
 	if (bp != NULL) {
 		if (bp->bio_data == NULL) {
 			nbp = g_duplicate_bio(pbp);
 			nbp->bio_cflags = GJ_BIO_READ;
 			nbp->bio_data =
 			    pbp->bio_data + cstart - pbp->bio_offset;
 			nbp->bio_offset =
 			    bp->bio_joffset + cstart - bp->bio_offset;
 			nbp->bio_length = cend - cstart;
 			nbp->bio_done = g_journal_read_done;
 			g_io_request(nbp, sc->sc_jconsumer);
 		}
 		/*
 		 * If we don't have the whole request yet, call g_journal_read()
 		 * recursively.
 		 */
 		if (ostart < cstart)
 			g_journal_read(sc, pbp, ostart, cstart);
 		if (oend > cend)
 			g_journal_read(sc, pbp, cend, oend);
 	} else {
 		/*
 		 * No data in memory, no data in journal.
 		 * Its time for asking data provider.
 		 */
 		GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend);
 		nbp = g_duplicate_bio(pbp);
 		nbp->bio_cflags = GJ_BIO_READ;
 		nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset;
 		nbp->bio_offset = ostart;
 		nbp->bio_length = oend - ostart;
 		nbp->bio_done = g_journal_read_done;
 		g_io_request(nbp, sc->sc_dconsumer);
 		/* We have the whole request, return here. */
 		return;
 	}
 }
 
 /*
  * Function responsible for handling finished READ requests.
  * Actually, g_std_done() could be used here, the only difference is that we
  * log error.
  */
 static void
 g_journal_read_done(struct bio *bp)
 {
 	struct bio *pbp;
 
 	KASSERT(bp->bio_cflags == GJ_BIO_READ,
 	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ));
 
 	pbp = bp->bio_parent;
 	pbp->bio_inbed++;
 	pbp->bio_completed += bp->bio_length;
 
 	if (bp->bio_error != 0) {
 		if (pbp->bio_error == 0)
 			pbp->bio_error = bp->bio_error;
 		GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
 		    bp->bio_to->name, bp->bio_error);
 	}
 	g_destroy_bio(bp);
 	if (pbp->bio_children == pbp->bio_inbed &&
 	    pbp->bio_completed == pbp->bio_length) {
 		/* We're done. */
 		g_io_deliver(pbp, 0);
 	}
 }
 
 /*
  * Deactive current journal and active next one.
  */
 static void
 g_journal_switch(struct g_journal_softc *sc)
 {
 	struct g_provider *pp;
 
 	if (JEMPTY(sc)) {
 		GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
 		pp = LIST_FIRST(&sc->sc_geom->provider);
 		if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) {
 			sc->sc_flags |= GJF_DEVICE_CLEAN;
 			GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
 			g_journal_metadata_update(sc);
 		}
 	} else {
 		GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name);
 
 		pp = sc->sc_jprovider;
 
 		sc->sc_journal_previous_id = sc->sc_journal_id;
 
 		sc->sc_journal_id = sc->sc_journal_next_id;
 		sc->sc_journal_next_id = arc4random();
 
 		GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
 
 		g_journal_write_header(sc);
 
 		sc->sc_inactive.jj_offset = sc->sc_active.jj_offset;
 		sc->sc_inactive.jj_queue = sc->sc_active.jj_queue;
 
 		sc->sc_active.jj_offset =
 		    sc->sc_journal_offset - pp->sectorsize;
 		sc->sc_active.jj_queue = NULL;
 
 		/*
 		 * Switch is done, start copying data from the (now) inactive
 		 * journal to the data provider.
 		 */
 		g_journal_copy_start(sc);
 	}
 	mtx_lock(&sc->sc_mtx);
 	sc->sc_flags &= ~GJF_DEVICE_SWITCH;
 	mtx_unlock(&sc->sc_mtx);
 }
 
 static void
 g_journal_initialize(struct g_journal_softc *sc)
 {
 
 	sc->sc_journal_id = arc4random();
 	sc->sc_journal_next_id = arc4random();
 	sc->sc_journal_previous_id = sc->sc_journal_id;
 	sc->sc_journal_offset = sc->sc_jstart;
 	sc->sc_inactive.jj_offset = sc->sc_jstart;
 	g_journal_write_header(sc);
 	sc->sc_active.jj_offset = sc->sc_jstart;
 }
 
 static void
 g_journal_mark_as_dirty(struct g_journal_softc *sc)
 {
 	const struct g_journal_desc *desc;
 	int i;
 
 	GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name);
 	for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++)
 		desc->jd_dirty(sc->sc_dconsumer);
 }
 
 /*
  * Function read record header from the given journal.
  * It is very simlar to g_read_data(9), but it doesn't allocate memory for bio
  * and data on every call.
  */
 static int
 g_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
     void *data)
 {
 	int error;
 
 	g_reset_bio(bp);
 	bp->bio_cmd = BIO_READ;
 	bp->bio_done = NULL;
 	bp->bio_offset = offset;
 	bp->bio_length = cp->provider->sectorsize;
 	bp->bio_data = data;
 	g_io_request(bp, cp);
 	error = biowait(bp, "gjs_read");
 	return (error);
 }
 
 #if 0
 /*
  * Function is called when we start the journal device and we detect that
  * one of the journals was not fully copied.
  * The purpose of this function is to read all records headers from journal
  * and placed them in the inactive queue, so we can start journal
  * synchronization process and the journal provider itself.
  * Design decision was taken to not synchronize the whole journal here as it
  * can take too much time. Reading headers only and delaying synchronization
  * process until after journal provider is started should be the best choice.
  */
 #endif
 
 static void
 g_journal_sync(struct g_journal_softc *sc)
 {
 	struct g_journal_record_header rhdr;
 	struct g_journal_entry *ent;
 	struct g_journal_header jhdr;
 	struct g_consumer *cp;
 	struct bio *bp, *fbp, *tbp;
 	off_t joffset, offset;
 	u_char *buf, sum[16];
 	uint64_t id;
 	MD5_CTX ctx;
 	int error, found, i;
 
 	found = 0;
 	fbp = NULL;
 	cp = sc->sc_jconsumer;
 	bp = g_alloc_bio();
 	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
 	offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;
 
 	GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);
 
 	/*
 	 * Read and decode first journal header.
 	 */
 	error = g_journal_sync_read(cp, bp, offset, buf);
 	if (error != 0) {
 		GJ_DEBUG(0, "Error while reading journal header from %s.",
 		    cp->provider->name);
 		goto end;
 	}
 	error = g_journal_header_decode(buf, &jhdr);
 	if (error != 0) {
 		GJ_DEBUG(0, "Cannot decode journal header from %s.",
 		    cp->provider->name);
 		goto end;
 	}
 	id = sc->sc_journal_id;
 	if (jhdr.jh_journal_id != sc->sc_journal_id) {
 		GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
 		    (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
 		goto end;
 	}
 	offset += cp->provider->sectorsize;
 	id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
 
 	for (;;) {
 		/*
 		 * If the biggest record won't fit, look for a record header or
 		 * journal header from the beginning.
 		 */
 		GJ_VALIDATE_OFFSET(offset, sc);
 		error = g_journal_sync_read(cp, bp, offset, buf);
 		if (error != 0) {
 			/*
 			 * Not good. Having an error while reading header
 			 * means, that we cannot read next headers and in
 			 * consequence we cannot find termination.
 			 */
 			GJ_DEBUG(0,
 			    "Error while reading record header from %s.",
 			    cp->provider->name);
 			break;
 		}
 
 		error = g_journal_record_header_decode(buf, &rhdr);
 		if (error != 0) {
 			GJ_DEBUG(2, "Not a record header at %jd (error=%d).",
 			    (intmax_t)offset, error);
 			/*
 			 * This is not a record header.
 			 * If we are lucky, this is next journal header.
 			 */
 			error = g_journal_header_decode(buf, &jhdr);
 			if (error != 0) {
 				GJ_DEBUG(1, "Not a journal header at %jd (error=%d).",
 				    (intmax_t)offset, error);
 				/*
 				 * Nope, this is not journal header, which
 				 * bascially means that journal is not
 				 * terminated properly.
 				 */
 				error = ENOENT;
 				break;
 			}
 			/*
 			 * Ok. This is header of _some_ journal. Now we need to
 			 * verify if this is header of the _next_ journal.
 			 */
 			if (jhdr.jh_journal_id != id) {
 				GJ_DEBUG(1, "Journal ID mismatch at %jd "
 				    "(0x%08x != 0x%08x).", (intmax_t)offset,
 				    (u_int)jhdr.jh_journal_id, (u_int)id);
 				error = ENOENT;
 				break;
 			}
 
 			/* Found termination. */
 			found++;
 			GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).",
 			    (intmax_t)offset, (u_int)id);
 			sc->sc_active.jj_offset = offset;
 			sc->sc_journal_offset =
 			    offset + cp->provider->sectorsize;
 			sc->sc_journal_id = id;
 			id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
 
 			while ((tbp = fbp) != NULL) {
 				fbp = tbp->bio_next;
 				GJ_LOGREQ(3, tbp, "Adding request.");
 				g_journal_insert_bio(&sc->sc_inactive.jj_queue,
 				    tbp, M_WAITOK);
 			}
 
 			/* Skip journal's header. */
 			offset += cp->provider->sectorsize;
 			continue;
 		}
 
 		/* Skip record's header. */
 		offset += cp->provider->sectorsize;
 
 		/*
 		 * Add information about every record entry to the inactive
 		 * queue.
 		 */
 		if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
 			MD5Init(&ctx);
 		for (i = 0; i < rhdr.jrh_nentries; i++) {
 			ent = &rhdr.jrh_entries[i];
 			GJ_DEBUG(3, "Insert entry: %jd %jd.",
 			    (intmax_t)ent->je_offset, (intmax_t)ent->je_length);
 			g_journal_insert(&fbp, ent->je_offset,
 			    ent->je_offset + ent->je_length, ent->je_joffset,
 			    NULL, M_WAITOK);
 			if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 				u_char *buf2;
 
 				/*
 				 * TODO: Should use faster function (like
 				 *       g_journal_sync_read()).
 				 */
 				buf2 = g_read_data(cp, offset, ent->je_length,
 				    NULL);
 				if (buf2 == NULL)
 					GJ_DEBUG(0, "Cannot read data at %jd.",
 					    (intmax_t)offset);
 				else {
 					MD5Update(&ctx, buf2, ent->je_length);
 					g_free(buf2);
 				}
 			}
 			/* Skip entry's data. */
 			offset += ent->je_length;
 		}
 		if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
 			MD5Final(sum, &ctx);
 			if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) {
 				GJ_DEBUG(0, "MD5 hash mismatch at %jd!",
 				    (intmax_t)offset);
 			}
 		}
 	}
 end:
 	gj_free(bp->bio_data, cp->provider->sectorsize);
 	g_destroy_bio(bp);
 
 	/* Remove bios from unterminated journal. */
 	while ((tbp = fbp) != NULL) {
 		fbp = tbp->bio_next;
 		g_destroy_bio(tbp);
 	}
 
 	if (found < 1 && joffset > 0) {
 		GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.",
 		    sc->sc_name);
 		while ((tbp = sc->sc_inactive.jj_queue) != NULL) {
 			sc->sc_inactive.jj_queue = tbp->bio_next;
 			g_destroy_bio(tbp);
 		}
 		g_journal_initialize(sc);
 		g_journal_mark_as_dirty(sc);
 	} else {
 		GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name);
 		g_journal_copy_start(sc);
 	}
 }
 
 /*
  * Wait for requests.
  * If we have requests in the current queue, flush them after 3 seconds from the
  * last flush. In this way we don't wait forever (or for journal switch) with
  * storing not full records on journal.
  */
 static void
 g_journal_wait(struct g_journal_softc *sc, time_t last_write)
 {
 	int error, timeout;
 
 	GJ_DEBUG(3, "%s: enter", __func__);
 	if (sc->sc_current_count == 0) {
 		if (g_journal_debug < 2)
 			msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0);
 		else {
 			/*
 			 * If we have debug turned on, show number of elements
 			 * in various queues.
 			 */
 			for (;;) {
 				error = msleep(sc, &sc->sc_mtx, PRIBIO,
 				    "gj:work", hz * 3);
 				if (error == 0) {
 					mtx_unlock(&sc->sc_mtx);
 					break;
 				}
 				GJ_DEBUG(3, "Report: current count=%d",
 				    sc->sc_current_count);
 				GJ_DEBUG(3, "Report: flush count=%d",
 				    sc->sc_flush_count);
 				GJ_DEBUG(3, "Report: flush in progress=%d",
 				    sc->sc_flush_in_progress);
 				GJ_DEBUG(3, "Report: copy in progress=%d",
 				    sc->sc_copy_in_progress);
 				GJ_DEBUG(3, "Report: delayed=%d",
 				    sc->sc_delayed_count);
 			}
 		}
 		GJ_DEBUG(3, "%s: exit 1", __func__);
 		return;
 	}
 
 	/*
 	 * Flush even not full records every 3 seconds.
 	 */
 	timeout = (last_write + 3 - time_second) * hz;
 	if (timeout <= 0) {
 		mtx_unlock(&sc->sc_mtx);
 		g_journal_flush(sc);
 		g_journal_flush_send(sc);
 		GJ_DEBUG(3, "%s: exit 2", __func__);
 		return;
 	}
 	error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout);
 	if (error == EWOULDBLOCK)
 		g_journal_flush_send(sc);
 	GJ_DEBUG(3, "%s: exit 3", __func__);
 }
 
 /*
  * Worker thread.
  */
 static void
 g_journal_worker(void *arg)
 {
 	struct g_journal_softc *sc;
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct bio *bp;
 	time_t last_write;
 	int type;
 
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 
 	sc = arg;
 	type = 0;	/* gcc */
 
 	if (sc->sc_flags & GJF_DEVICE_CLEAN) {
 		GJ_DEBUG(0, "Journal %s clean.", sc->sc_name);
 		g_journal_initialize(sc);
 	} else {
 		g_journal_sync(sc);
 	}
 	/*
 	 * Check if we can use BIO_FLUSH.
 	 */
 	sc->sc_bio_flush = 0;
 	if (g_io_flush(sc->sc_jconsumer) == 0) {
 		sc->sc_bio_flush |= GJ_FLUSH_JOURNAL;
 		GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
 		    sc->sc_jconsumer->provider->name);
 	} else {
 		GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
 		    sc->sc_jconsumer->provider->name);
 	}
 	if (sc->sc_jconsumer != sc->sc_dconsumer) {
 		if (g_io_flush(sc->sc_dconsumer) == 0) {
 			sc->sc_bio_flush |= GJ_FLUSH_DATA;
 			GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
 			    sc->sc_dconsumer->provider->name);
 		} else {
 			GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
 			    sc->sc_dconsumer->provider->name);
 		}
 	}
 
 	gp = sc->sc_geom;
 	g_topology_lock();
 	pp = g_new_providerf(gp, "%s.journal", sc->sc_name);
 	pp->mediasize = sc->sc_mediasize;
 	/*
 	 * There could be a problem when data provider and journal providers
 	 * have different sectorsize, but such scenario is prevented on journal
 	 * creation.
 	 */
 	pp->sectorsize = sc->sc_sectorsize;
 	g_error_provider(pp, 0);
 	g_topology_unlock();
 	last_write = time_second;
 
 	if (sc->sc_rootmount != NULL) {
 		GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 
 	for (;;) {
 		/* Get first request from the queue. */
 		mtx_lock(&sc->sc_mtx);
 		bp = bioq_first(&sc->sc_back_queue);
 		if (bp != NULL)
 			type = (bp->bio_cflags & GJ_BIO_MASK);
 		if (bp == NULL) {
 			bp = bioq_first(&sc->sc_regular_queue);
 			if (bp != NULL)
 				type = GJ_BIO_REGULAR;
 		}
 		if (bp == NULL) {
 try_switch:
 			if ((sc->sc_flags & GJF_DEVICE_SWITCH) ||
 			    (sc->sc_flags & GJF_DEVICE_DESTROY)) {
 				if (sc->sc_current_count > 0) {
 					mtx_unlock(&sc->sc_mtx);
 					g_journal_flush(sc);
 					g_journal_flush_send(sc);
 					continue;
 				}
 				if (sc->sc_flush_in_progress > 0)
 					goto sleep;
 				if (sc->sc_copy_in_progress > 0)
 					goto sleep;
 			}
 			if (sc->sc_flags & GJF_DEVICE_SWITCH) {
 				mtx_unlock(&sc->sc_mtx);
 				g_journal_switch(sc);
 				wakeup(&sc->sc_journal_copying);
 				continue;
 			}
 			if (sc->sc_flags & GJF_DEVICE_DESTROY) {
 				GJ_DEBUG(1, "Shutting down worker "
 				    "thread for %s.", gp->name);
 				sc->sc_worker = NULL;
 				wakeup(&sc->sc_worker);
 				mtx_unlock(&sc->sc_mtx);
 				kproc_exit(0);
 			}
 sleep:
 			g_journal_wait(sc, last_write);
 			continue;
 		}
 		/*
 		 * If we're in switch process, we need to delay all new
 		 * write requests until its done.
 		 */
 		if ((sc->sc_flags & GJF_DEVICE_SWITCH) &&
 		    type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) {
 			GJ_LOGREQ(2, bp, "WRITE on SWITCH");
 			goto try_switch;
 		}
 		if (type == GJ_BIO_REGULAR)
 			bioq_remove(&sc->sc_regular_queue, bp);
 		else
 			bioq_remove(&sc->sc_back_queue, bp);
 		mtx_unlock(&sc->sc_mtx);
 		switch (type) {
 		case GJ_BIO_REGULAR:
 			/* Regular request. */
 			switch (bp->bio_cmd) {
 			case BIO_READ:
 				g_journal_read(sc, bp, bp->bio_offset,
 				    bp->bio_offset + bp->bio_length);
 				break;
 			case BIO_WRITE:
 				last_write = time_second;
 				g_journal_add_request(sc, bp);
 				g_journal_flush_send(sc);
 				break;
 			default:
 				panic("Invalid bio_cmd (%d).", bp->bio_cmd);
 			}
 			break;
 		case GJ_BIO_COPY:
 			switch (bp->bio_cmd) {
 			case BIO_READ:
 				if (g_journal_copy_read_done(bp))
 					g_journal_copy_send(sc);
 				break;
 			case BIO_WRITE:
 				g_journal_copy_write_done(bp);
 				g_journal_copy_send(sc);
 				break;
 			default:
 				panic("Invalid bio_cmd (%d).", bp->bio_cmd);
 			}
 			break;
 		case GJ_BIO_JOURNAL:
 			g_journal_flush_done(bp);
 			g_journal_flush_send(sc);
 			break;
 		case GJ_BIO_READ:
 		default:
 			panic("Invalid bio (%d).", type);
 		}
 	}
 }
 
 static void
 g_journal_destroy_event(void *arg, int flags __unused)
 {
 	struct g_journal_softc *sc;
 
 	g_topology_assert();
 	sc = arg;
 	g_journal_destroy(sc);
 }
 
 static void
 g_journal_timeout(void *arg)
 {
 	struct g_journal_softc *sc;
 
 	sc = arg;
 	GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.",
 	    sc->sc_geom->name);
 	g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL);
 }
 
 static struct g_geom *
 g_journal_create(struct g_class *mp, struct g_provider *pp,
     const struct g_journal_metadata *md)
 {
 	struct g_journal_softc *sc;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	int error;
 
 	sc = NULL;	/* gcc */
 
 	g_topology_assert();
 	/*
 	 * There are two possibilities:
 	 * 1. Data and both journals are on the same provider.
 	 * 2. Data and journals are all on separated providers.
 	 */
 	/* Look for journal device with the same ID. */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_id == md->md_id)
 			break;
 	}
 	if (gp == NULL)
 		sc = NULL;
 	else if (sc != NULL && (sc->sc_type & md->md_type) != 0) {
 		GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id);
 		return (NULL);
 	}
 	if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) {
 		GJ_DEBUG(0, "Invalid type on %s.", pp->name);
 		return (NULL);
 	}
 	if (md->md_type & GJ_TYPE_DATA) {
 		GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id,
 		    pp->name);
 	}
 	if (md->md_type & GJ_TYPE_JOURNAL) {
 		GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id,
 		    pp->name);
 	}
 
 	if (sc == NULL) {
 		/* Action geom. */
 		sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO);
 		sc->sc_id = md->md_id;
 		sc->sc_type = 0;
 		sc->sc_flags = 0;
 		sc->sc_worker = NULL;
 
 		gp = g_new_geomf(mp, "gjournal %u", sc->sc_id);
 		gp->start = g_journal_start;
 		gp->orphan = g_journal_orphan;
 		gp->access = g_journal_access;
 		gp->softc = sc;
 		gp->flags |= G_GEOM_VOLATILE_BIO;
 		sc->sc_geom = gp;
 
 		mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF);
 
 		bioq_init(&sc->sc_back_queue);
 		bioq_init(&sc->sc_regular_queue);
 		bioq_init(&sc->sc_delayed_queue);
 		sc->sc_delayed_count = 0;
 		sc->sc_current_queue = NULL;
 		sc->sc_current_count = 0;
 		sc->sc_flush_queue = NULL;
 		sc->sc_flush_count = 0;
 		sc->sc_flush_in_progress = 0;
 		sc->sc_copy_queue = NULL;
 		sc->sc_copy_in_progress = 0;
 		sc->sc_inactive.jj_queue = NULL;
 		sc->sc_active.jj_queue = NULL;
 
 		sc->sc_rootmount = root_mount_hold("GJOURNAL");
 		GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
 
 		callout_init(&sc->sc_callout, 1);
 		if (md->md_type != GJ_TYPE_COMPLETE) {
 			/*
 			 * Journal and data are on separate providers.
 			 * At this point we have only one of them.
 			 * We setup a timeout in case the other part will not
 			 * appear, so we won't wait forever.
 			 */
 			callout_reset(&sc->sc_callout, 5 * hz,
 			    g_journal_timeout, sc);
 		}
 	}
 
 	/* Remember type of the data provider. */
 	if (md->md_type & GJ_TYPE_DATA)
 		sc->sc_orig_type = md->md_type;
 	sc->sc_type |= md->md_type;
 	cp = NULL;
 
 	if (md->md_type & GJ_TYPE_DATA) {
 		if (md->md_flags & GJ_FLAG_CLEAN)
 			sc->sc_flags |= GJF_DEVICE_CLEAN;
 		if (md->md_flags & GJ_FLAG_CHECKSUM)
 			sc->sc_flags |= GJF_DEVICE_CHECKSUM;
 		cp = g_new_consumer(gp);
 		error = g_attach(cp, pp);
 		KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
 		    pp->name, error));
 		error = g_access(cp, 1, 1, 1);
 		if (error != 0) {
 			GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name,
 			    error);
 			g_journal_destroy(sc);
 			return (NULL);
 		}
 		sc->sc_dconsumer = cp;
 		sc->sc_mediasize = pp->mediasize - pp->sectorsize;
 		sc->sc_sectorsize = pp->sectorsize;
 		sc->sc_jstart = md->md_jstart;
 		sc->sc_jend = md->md_jend;
 		if (md->md_provider[0] != '\0')
 			sc->sc_flags |= GJF_DEVICE_HARDCODED;
 		sc->sc_journal_offset = md->md_joffset;
 		sc->sc_journal_id = md->md_jid;
 		sc->sc_journal_previous_id = md->md_jid;
 	}
 	if (md->md_type & GJ_TYPE_JOURNAL) {
 		if (cp == NULL) {
 			cp = g_new_consumer(gp);
 			error = g_attach(cp, pp);
 			KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
 			    pp->name, error));
 			error = g_access(cp, 1, 1, 1);
 			if (error != 0) {
 				GJ_DEBUG(0, "Cannot access %s (error=%d).",
 				    pp->name, error);
 				g_journal_destroy(sc);
 				return (NULL);
 			}
 		} else {
 			/*
 			 * Journal is on the same provider as data, which means
 			 * that data provider ends where journal starts.
 			 */
 			sc->sc_mediasize = md->md_jstart;
 		}
 		sc->sc_jconsumer = cp;
 	}
 
 	if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) {
 		/* Journal is not complete yet. */
 		return (gp);
 	} else {
 		/* Journal complete, cancel timeout. */
 		callout_drain(&sc->sc_callout);
 	}
 
 	error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0,
 	    "g_journal %s", sc->sc_name);
 	if (error != 0) {
 		GJ_DEBUG(0, "Cannot create worker thread for %s.journal.",
 		    sc->sc_name);
 		g_journal_destroy(sc);
 		return (NULL);
 	}
 
 	return (gp);
 }
 
 static void
 g_journal_destroy_consumer(void *arg, int flags __unused)
 {
 	struct g_consumer *cp;
 
 	g_topology_assert();
 	cp = arg;
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static int
 g_journal_destroy(struct g_journal_softc *sc)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	if (sc == NULL)
 		return (ENXIO);
 
 	gp = sc->sc_geom;
 	pp = LIST_FIRST(&gp->provider);
 	if (pp != NULL) {
 		if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
 			GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).",
 			    pp->name, pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		}
 		g_error_provider(pp, ENXIO);
 
 		g_journal_flush(sc);
 		g_journal_flush_send(sc);
 		g_journal_switch(sc);
 	}
 
 	sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN);
 
 	g_topology_unlock();
 
 	if (sc->sc_rootmount != NULL) {
 		GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 
 	callout_drain(&sc->sc_callout);
 	mtx_lock(&sc->sc_mtx);
 	wakeup(sc);
 	while (sc->sc_worker != NULL)
 		msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0);
 	mtx_unlock(&sc->sc_mtx);
 
 	if (pp != NULL) {
 		GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
 		g_journal_metadata_update(sc);
 		g_topology_lock();
-		pp->flags |= G_PF_WITHER;
-		g_orphan_provider(pp, ENXIO);
+		g_wither_provider(pp, ENXIO);
 	} else {
 		g_topology_lock();
 	}
 	mtx_destroy(&sc->sc_mtx);
 
 	if (sc->sc_current_count != 0) {
 		GJ_DEBUG(0, "Warning! Number of current requests %d.",
 		    sc->sc_current_count);
 	}
 
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		if (cp->acr + cp->acw + cp->ace > 0)
 			g_access(cp, -1, -1, -1);
 		/*
 		 * We keep all consumers open for writting, so if I'll detach
 		 * and destroy consumer here, I'll get providers for taste, so
 		 * journal will be started again.
 		 * Sending an event here, prevents this from happening.
 		 */
 		g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL);
 	}
 	gp->softc = NULL;
 	g_wither_geom(gp, ENXIO);
 	free(sc, M_JOURNAL);
 	return (0);
 }
 
 static void
 g_journal_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 static struct g_geom *
 g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_journal_metadata md;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	GJ_DEBUG(2, "Tasting %s.", pp->name);
 	if (pp->geom->class == mp)
 		return (NULL);
 
 	gp = g_new_geomf(mp, "journal:taste");
 	/* This orphan function should be never called. */
 	gp->orphan = g_journal_taste_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_journal_metadata_read(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
 		return (NULL);
 	if (g_journal_debug >= 2)
 		journal_metadata_dump(&md);
 
 	gp = g_journal_create(mp, pp, &md);
 	return (gp);
 }
 
 static struct g_journal_softc *
 g_journal_find_device(struct g_class *mp, const char *name)
 {
 	struct g_journal_softc *sc;
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	if (strncmp(name, "/dev/", 5) == 0)
 		name += 5;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_flags & GJF_DEVICE_DESTROY)
 			continue;
 		if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
 			continue;
 		pp = LIST_FIRST(&gp->provider);
 		if (strcmp(sc->sc_name, name) == 0)
 			return (sc);
 		if (pp != NULL && strcmp(pp->name, name) == 0)
 			return (sc);
 	}
 	return (NULL);
 }
 
 static void
 g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_journal_softc *sc;
 	const char *name;
 	char param[16];
 	int *nargs;
 	int error, i;
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 
 	for (i = 0; i < *nargs; i++) {
 		snprintf(param, sizeof(param), "arg%d", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%d' argument.", i);
 			return;
 		}
 		sc = g_journal_find_device(mp, name);
 		if (sc == NULL) {
 			gctl_error(req, "No such device: %s.", name);
 			return;
 		}
 		error = g_journal_destroy(sc);
 		if (error != 0) {
 			gctl_error(req, "Cannot destroy device %s (error=%d).",
 			    LIST_FIRST(&sc->sc_geom->provider)->name, error);
 			return;
 		}
 	}
 }
 
 static void
 g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused)
 {
 
 	g_topology_assert();
 	g_topology_unlock();
 	g_journal_sync_requested++;
 	wakeup(&g_journal_switcher_state);
 	while (g_journal_sync_requested > 0)
 		tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2);
 	g_topology_lock();
 }
 
 static void
 g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	uint32_t *version;
 
 	g_topology_assert();
 
 	version = gctl_get_paraml(req, "version", sizeof(*version));
 	if (version == NULL) {
 		gctl_error(req, "No '%s' argument.", "version");
 		return;
 	}
 	if (*version != G_JOURNAL_VERSION) {
 		gctl_error(req, "Userland and kernel parts are out of sync.");
 		return;
 	}
 
 	if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) {
 		g_journal_ctl_destroy(req, mp);
 		return;
 	} else if (strcmp(verb, "sync") == 0) {
 		g_journal_ctl_sync(req, mp);
 		return;
 	}
 
 	gctl_error(req, "Unknown verb.");
 }
 
 static void
 g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_journal_softc *sc;
 
 	g_topology_assert();
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		int first = 1;
 
 		sbuf_printf(sb, "%s<Role>", indent);
 		if (cp == sc->sc_dconsumer) {
 			sbuf_printf(sb, "Data");
 			first = 0;
 		}
 		if (cp == sc->sc_jconsumer) {
 			if (!first)
 				sbuf_printf(sb, ",");
 			sbuf_printf(sb, "Journal");
 		}
 		sbuf_printf(sb, "</Role>\n");
 		if (cp == sc->sc_jconsumer) {
 			sbuf_printf(sb, "<Jstart>%jd</Jstart>\n",
 			    (intmax_t)sc->sc_jstart);
 			sbuf_printf(sb, "<Jend>%jd</Jend>\n",
 			    (intmax_t)sc->sc_jend);
 		}
 	} else {
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 	}
 }
 
 static eventhandler_tag g_journal_event_shutdown = NULL;
 static eventhandler_tag g_journal_event_lowmem = NULL;
 
 static void
 g_journal_shutdown(void *arg, int howto __unused)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *gp2;
 
 	if (panicstr != NULL)
 		return;
 	mp = arg;
 	g_topology_lock();
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 		if (gp->softc == NULL)
 			continue;
 		GJ_DEBUG(0, "Shutting down geom %s.", gp->name);
 		g_journal_destroy(gp->softc);
 	}
 	g_topology_unlock();
 }
 
 /*
  * Free cached requests from inactive queue in case of low memory.
  * We free GJ_FREE_AT_ONCE elements at once.
  */
 #define	GJ_FREE_AT_ONCE	4
 static void
 g_journal_lowmem(void *arg, int howto __unused)
 {
 	struct g_journal_softc *sc;
 	struct g_class *mp;
 	struct g_geom *gp;
 	struct bio *bp;
 	u_int nfree = GJ_FREE_AT_ONCE;
 
 	g_journal_stats_low_mem++;
 	mp = arg;
 	g_topology_lock();
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY))
 			continue;
 		mtx_lock(&sc->sc_mtx);
 		for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL;
 		    nfree--, bp = bp->bio_next) {
 			/*
 			 * This is safe to free the bio_data, because:
 			 * 1. If bio_data is NULL it will be read from the
 			 *    inactive journal.
 			 * 2. If bp is sent down, it is first removed from the
 			 *    inactive queue, so it's impossible to free the
 			 *    data from under in-flight bio.
 			 * On the other hand, freeing elements from the active
 			 * queue, is not safe.
 			 */
 			if (bp->bio_data != NULL) {
 				GJ_DEBUG(2, "Freeing data from %s.",
 				    sc->sc_name);
 				gj_free(bp->bio_data, bp->bio_length);
 				bp->bio_data = NULL;
 			}
 		}
 		mtx_unlock(&sc->sc_mtx);
 		if (nfree == 0)
 			break;
 	}
 	g_topology_unlock();
 }
 
 static void g_journal_switcher(void *arg);
 
 static void
 g_journal_init(struct g_class *mp)
 {
 	int error;
 
 	/* Pick a conservative value if provided value sucks. */
 	if (g_journal_cache_divisor <= 0 ||
 	    (vm_kmem_size / g_journal_cache_divisor == 0)) {
 		g_journal_cache_divisor = 5;
 	}
 	if (g_journal_cache_limit > 0) {
 		g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor;
 		g_journal_cache_low =
 		    (g_journal_cache_limit / 100) * g_journal_cache_switch;
 	}
 	g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync,
 	    g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST);
 	if (g_journal_event_shutdown == NULL)
 		GJ_DEBUG(0, "Warning! Cannot register shutdown event.");
 	g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
 	    g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST);
 	if (g_journal_event_lowmem == NULL)
 		GJ_DEBUG(0, "Warning! Cannot register lowmem event.");
 	error = kproc_create(g_journal_switcher, mp, NULL, 0, 0,
 	    "g_journal switcher");
 	KASSERT(error == 0, ("Cannot create switcher thread."));
 }
 
 static void
 g_journal_fini(struct g_class *mp)
 {
 
 	if (g_journal_event_shutdown != NULL) {
 		EVENTHANDLER_DEREGISTER(shutdown_post_sync,
 		    g_journal_event_shutdown);
 	}
 	if (g_journal_event_lowmem != NULL)
 		EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem);
 	g_journal_switcher_state = GJ_SWITCHER_DIE;
 	wakeup(&g_journal_switcher_state);
 	while (g_journal_switcher_state != GJ_SWITCHER_DIED)
 		tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5);
 	GJ_DEBUG(1, "Switcher died.");
 }
 
 DECLARE_GEOM_CLASS(g_journal_class, g_journal);
 
 static const struct g_journal_desc *
 g_journal_find_desc(const char *fstype)
 {
 	const struct g_journal_desc *desc;
 	int i;
 
 	for (desc = g_journal_filesystems[i = 0]; desc != NULL;
 	     desc = g_journal_filesystems[++i]) {
 		if (strcmp(desc->jd_fstype, fstype) == 0)
 			break;
 	}
 	return (desc);
 }
 
 static void
 g_journal_switch_wait(struct g_journal_softc *sc)
 {
 	struct bintime bt;
 
 	mtx_assert(&sc->sc_mtx, MA_OWNED);
 	if (g_journal_debug >= 2) {
 		if (sc->sc_flush_in_progress > 0) {
 			GJ_DEBUG(2, "%d requests flushing.",
 			    sc->sc_flush_in_progress);
 		}
 		if (sc->sc_copy_in_progress > 0) {
 			GJ_DEBUG(2, "%d requests copying.",
 			    sc->sc_copy_in_progress);
 		}
 		if (sc->sc_flush_count > 0) {
 			GJ_DEBUG(2, "%d requests to flush.",
 			    sc->sc_flush_count);
 		}
 		if (sc->sc_delayed_count > 0) {
 			GJ_DEBUG(2, "%d requests delayed.",
 			    sc->sc_delayed_count);
 		}
 	}
 	g_journal_stats_switches++;
 	if (sc->sc_copy_in_progress > 0)
 		g_journal_stats_wait_for_copy++;
 	GJ_TIMER_START(1, &bt);
 	sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
 	sc->sc_flags |= GJF_DEVICE_SWITCH;
 	wakeup(sc);
 	while (sc->sc_flags & GJF_DEVICE_SWITCH) {
 		msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO,
 		    "gj:switch", 0);
 	}
 	GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name);
 }
 
 static void
 g_journal_do_switch(struct g_class *classp)
 {
 	struct g_journal_softc *sc;
 	const struct g_journal_desc *desc;
 	struct g_geom *gp;
 	struct mount *mp;
 	struct bintime bt;
 	char *mountpoint;
 	int error, save;
 
 	g_topology_lock();
 	LIST_FOREACH(gp, &classp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_flags & GJF_DEVICE_DESTROY)
 			continue;
 		if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
 			continue;
 		mtx_lock(&sc->sc_mtx);
 		sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH;
 		mtx_unlock(&sc->sc_mtx);
 	}
 	g_topology_unlock();
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (mp->mnt_gjprovider == NULL)
 			continue;
 		if (mp->mnt_flag & MNT_RDONLY)
 			continue;
 		desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
 		if (desc == NULL)
 			continue;
 		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
 			continue;
 		/* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */
 
 		g_topology_lock();
 		sc = g_journal_find_device(classp, mp->mnt_gjprovider);
 		g_topology_unlock();
 
 		if (sc == NULL) {
 			GJ_DEBUG(0, "Cannot find journal geom for %s.",
 			    mp->mnt_gjprovider);
 			goto next;
 		} else if (JEMPTY(sc)) {
 			mtx_lock(&sc->sc_mtx);
 			sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
 			mtx_unlock(&sc->sc_mtx);
 			GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
 			goto next;
 		}
 
 		mountpoint = mp->mnt_stat.f_mntonname;
 
 		error = vn_start_write(NULL, &mp, V_WAIT);
 		if (error != 0) {
 			GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).",
 			    mountpoint, error);
 			goto next;
 		}
 
 		save = curthread_pflags_set(TDP_SYNCIO);
 
 		GJ_TIMER_START(1, &bt);
 		vfs_msync(mp, MNT_NOWAIT);
 		GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);
 
 		GJ_TIMER_START(1, &bt);
 		error = VFS_SYNC(mp, MNT_NOWAIT);
 		if (error == 0)
 			GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint);
 		else {
 			GJ_DEBUG(0, "Cannot sync file system %s (error=%d).",
 			    mountpoint, error);
 		}
 
 		curthread_pflags_restore(save);
 
 		vn_finished_write(mp);
 
 		if (error != 0)
 			goto next;
 
 		/*
 		 * Send BIO_FLUSH before freezing the file system, so it can be
 		 * faster after the freeze.
 		 */
 		GJ_TIMER_START(1, &bt);
 		g_journal_flush_cache(sc);
 		GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name);
 
 		GJ_TIMER_START(1, &bt);
 		error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT);
 		GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint);
 		if (error != 0) {
 			GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).",
 			    mountpoint, error);
 			goto next;
 		}
 
 		error = desc->jd_clean(mp);
 		if (error != 0)
 			goto next;
 
 		mtx_lock(&sc->sc_mtx);
 		g_journal_switch_wait(sc);
 		mtx_unlock(&sc->sc_mtx);
 
 		vfs_write_resume(mp, 0);
 next:
 		mtx_lock(&mountlist_mtx);
 		vfs_unbusy(mp);
 	}
 	mtx_unlock(&mountlist_mtx);
 
 	sc = NULL;
 	for (;;) {
 		g_topology_lock();
 		LIST_FOREACH(gp, &g_journal_class.geom, geom) {
 			sc = gp->softc;
 			if (sc == NULL)
 				continue;
 			mtx_lock(&sc->sc_mtx);
 			if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE &&
 			    !(sc->sc_flags & GJF_DEVICE_DESTROY) &&
 			    (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) {
 				break;
 			}
 			mtx_unlock(&sc->sc_mtx);
 			sc = NULL;
 		}
 		g_topology_unlock();
 		if (sc == NULL)
 			break;
 		mtx_assert(&sc->sc_mtx, MA_OWNED);
 		g_journal_switch_wait(sc);
 		mtx_unlock(&sc->sc_mtx);
 	}
 }
 
 /*
  * TODO: Switcher thread should be started on first geom creation and killed on
  * last geom destruction.
  */
 static void
 g_journal_switcher(void *arg)
 {
 	struct g_class *mp;
 	struct bintime bt;
 	int error;
 
 	mp = arg;
 	curthread->td_pflags |= TDP_NORUNNINGBUF;
 	for (;;) {
 		g_journal_switcher_wokenup = 0;
 		error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait",
 		    g_journal_switch_time * hz);
 		if (g_journal_switcher_state == GJ_SWITCHER_DIE) {
 			g_journal_switcher_state = GJ_SWITCHER_DIED;
 			GJ_DEBUG(1, "Switcher exiting.");
 			wakeup(&g_journal_switcher_state);
 			kproc_exit(0);
 		}
 		if (error == 0 && g_journal_sync_requested == 0) {
 			GJ_DEBUG(1, "Out of cache, force switch (used=%u "
 			    "limit=%u).", g_journal_cache_used,
 			    g_journal_cache_limit);
 		}
 		GJ_TIMER_START(1, &bt);
 		g_journal_do_switch(mp);
 		GJ_TIMER_STOP(1, &bt, "Entire switch time");
 		if (g_journal_sync_requested > 0) {
 			g_journal_sync_requested = 0;
 			wakeup(&g_journal_sync_requested);
 		}
 	}
 }
Index: user/alc/PQ_LAUNDRY/sys/geom/linux_lvm/g_linux_lvm.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/geom/linux_lvm/g_linux_lvm.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/geom/linux_lvm/g_linux_lvm.c	(revision 306283)
@@ -1,1190 +1,1190 @@
 /*-
  * Copyright (c) 2008 Andrew Thompson <thompsa@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/ctype.h>
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <geom/geom.h>
 #include <sys/endian.h>
 
 #include <geom/linux_lvm/g_linux_lvm.h>
 
 FEATURE(geom_linux_lvm, "GEOM Linux LVM partitioning support");
 
 /* Declare malloc(9) label */
 static MALLOC_DEFINE(M_GLLVM, "gllvm", "GEOM_LINUX_LVM Data");
 
 /* GEOM class methods */
 static g_access_t g_llvm_access;
 static g_init_t g_llvm_init;
 static g_orphan_t g_llvm_orphan;
 static g_orphan_t g_llvm_taste_orphan;
 static g_start_t g_llvm_start;
 static g_taste_t g_llvm_taste;
 static g_ctl_destroy_geom_t g_llvm_destroy_geom;
 
 static void	g_llvm_done(struct bio *);
 static void	g_llvm_remove_disk(struct g_llvm_vg *, struct g_consumer *);
 static int	g_llvm_activate_lv(struct g_llvm_vg *, struct g_llvm_lv *);
 static int	g_llvm_add_disk(struct g_llvm_vg *, struct g_provider *, char *);
 static void	g_llvm_free_vg(struct g_llvm_vg *);
 static int	g_llvm_destroy(struct g_llvm_vg *, int);
 static int	g_llvm_read_label(struct g_consumer *, struct g_llvm_label *);
 static int	g_llvm_read_md(struct g_consumer *, struct g_llvm_metadata *,
 		    struct g_llvm_label *);
 
 static int	llvm_label_decode(const u_char *, struct g_llvm_label *, int);
 static int	llvm_md_decode(const u_char *, struct g_llvm_metadata *,
 		    struct g_llvm_label *);
 static int	llvm_textconf_decode(u_char *, int,
 		    struct g_llvm_metadata *);
 static int	llvm_textconf_decode_pv(char **, char *, struct g_llvm_vg *);
 static int	llvm_textconf_decode_lv(char **, char *, struct g_llvm_vg *);
 static int	llvm_textconf_decode_sg(char **, char *, struct g_llvm_lv *);
 
 SYSCTL_DECL(_kern_geom);
 SYSCTL_NODE(_kern_geom, OID_AUTO, linux_lvm, CTLFLAG_RW, 0,
     "GEOM_LINUX_LVM stuff");
 static u_int g_llvm_debug = 0;
 SYSCTL_UINT(_kern_geom_linux_lvm, OID_AUTO, debug, CTLFLAG_RWTUN, &g_llvm_debug, 0,
     "Debug level");
 
 LIST_HEAD(, g_llvm_vg) vg_list;
 
 /*
  * Called to notify geom when it's been opened, and for what intent
  */
 static int
 g_llvm_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_consumer *c;
 	struct g_llvm_vg *vg;
 	struct g_geom *gp;
 	int error;
 
 	KASSERT(pp != NULL, ("%s: NULL provider", __func__));
 	gp = pp->geom;
 	KASSERT(gp != NULL, ("%s: NULL geom", __func__));
 	vg = gp->softc;
 
 	if (vg == NULL) {
 		/* It seems that .access can be called with negative dr,dw,dx
 		 * in this case but I want to check for myself */
 		G_LLVM_DEBUG(0, "access(%d, %d, %d) for %s",
 		    dr, dw, de, pp->name);
 
 		/* This should only happen when geom is withered so
 		 * allow only negative requests */
 		KASSERT(dr <= 0 && dw <= 0 && de <= 0,
 		    ("%s: Positive access for %s", __func__, pp->name));
 		if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0)
 			G_LLVM_DEBUG(0,
 			    "Device %s definitely destroyed", pp->name);
 		return (0);
 	}
 
 	/* Grab an exclusive bit to propagate on our consumers on first open */
 	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
 		de++;
 	/* ... drop it on close */
 	if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0)
 		de--;
 
 	error = ENXIO;
 	LIST_FOREACH(c, &gp->consumer, consumer) {
 		KASSERT(c != NULL, ("%s: consumer is NULL", __func__));
 		error = g_access(c, dr, dw, de);
 		if (error != 0) {
 			struct g_consumer *c2;
 
 			/* Backout earlier changes */
 			LIST_FOREACH(c2, &gp->consumer, consumer) {
 				if (c2 == c) /* all eariler components fixed */
 					return (error);
 				g_access(c2, -dr, -dw, -de);
 			}
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Dismantle bio_queue and destroy its components
  */
 static void
 bioq_dismantle(struct bio_queue_head *bq)
 {
 	struct bio *b;
 
 	for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) {
 		bioq_remove(bq, b);
 		g_destroy_bio(b);
 	}
 }
 
 /*
  * GEOM .done handler
  * Can't use standard handler because one requested IO may
  * fork into additional data IOs
  */
 static void
 g_llvm_done(struct bio *b)
 {
 	struct bio *parent_b;
 
 	parent_b = b->bio_parent;
 
 	if (b->bio_error != 0) {
 		G_LLVM_DEBUG(0, "Error %d for offset=%ju, length=%ju on %s",
 		    b->bio_error, b->bio_offset, b->bio_length,
 		    b->bio_to->name);
 		if (parent_b->bio_error == 0)
 			parent_b->bio_error = b->bio_error;
 	}
 
 	parent_b->bio_inbed++;
 	parent_b->bio_completed += b->bio_completed;
 
 	if (parent_b->bio_children == parent_b->bio_inbed) {
 		parent_b->bio_completed = parent_b->bio_length;
 		g_io_deliver(parent_b, parent_b->bio_error);
 	}
 	g_destroy_bio(b);
 }
 
 static void
 g_llvm_start(struct bio *bp)
 {
 	struct g_provider *pp;
 	struct g_llvm_vg *vg;
 	struct g_llvm_pv *pv;
 	struct g_llvm_lv *lv;
 	struct g_llvm_segment *sg;
 	struct bio *cb;
 	struct bio_queue_head bq;
 	size_t chunk_size;
 	off_t offset, length;
 	char *addr;
 	u_int count;
 
 	pp = bp->bio_to;
 	lv = pp->private;
 	vg = pp->geom->softc;
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 	/* XXX BIO_GETATTR allowed? */
 		break;
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 
 	bioq_init(&bq);
 
 	chunk_size = vg->vg_extentsize;
 	addr = bp->bio_data;
 	offset = bp->bio_offset;	/* virtual offset and length */
 	length = bp->bio_length;
 
 	while (length > 0) {
 		size_t chunk_index, in_chunk_offset, in_chunk_length;
 
 		pv = NULL;
 		cb = g_clone_bio(bp);
 		if (cb == NULL) {
 			bioq_dismantle(&bq);
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 
 		/* get the segment and the pv */
 		if (lv->lv_sgcount == 1) {
 			/* skip much of the calculations for a single sg */
 			chunk_index = 0;
 			in_chunk_offset = 0;
 			in_chunk_length = length;
 			sg = lv->lv_firstsg;
 			pv = sg->sg_pv;
 			cb->bio_offset = offset + sg->sg_pvoffset;
 		} else {
 			chunk_index = offset / chunk_size; /* round downwards */
 			in_chunk_offset = offset % chunk_size;
 			in_chunk_length =
 			    min(length, chunk_size - in_chunk_offset);
 
 			/* XXX could be faster */
 			LIST_FOREACH(sg, &lv->lv_segs, sg_next) {
 				if (chunk_index >= sg->sg_start &&
 				    chunk_index <= sg->sg_end) {
 					/* adjust chunk index for sg start */
 					chunk_index -= sg->sg_start;
 					pv = sg->sg_pv;
 					break;
 				}
 			}
 			cb->bio_offset =
 			    (off_t)chunk_index * (off_t)chunk_size
 			    + in_chunk_offset + sg->sg_pvoffset;
 		}
 
 		KASSERT(pv != NULL, ("Can't find PV for chunk %zu",
 		    chunk_index));
 
 		cb->bio_to = pv->pv_gprov;
 		cb->bio_done = g_llvm_done;
 		cb->bio_length = in_chunk_length;
 		cb->bio_data = addr;
 		cb->bio_caller1 = pv;
 		bioq_disksort(&bq, cb);
 
 		G_LLVM_DEBUG(5,
 		    "Mapped %s(%ju, %ju) on %s to %zu(%zu,%zu) @ %s:%ju",
 		    bp->bio_cmd == BIO_READ ? "R" : "W",
 		    offset, length, lv->lv_name,
 		    chunk_index, in_chunk_offset, in_chunk_length,
 		    pv->pv_name, cb->bio_offset);
 
 		addr += in_chunk_length;
 		length -= in_chunk_length;
 		offset += in_chunk_length;
 	}
 
 	/* Fire off bio's here */
 	count = 0;
 	for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) {
 		bioq_remove(&bq, cb);
 		pv = cb->bio_caller1;
 		cb->bio_caller1 = NULL;
 		G_LLVM_DEBUG(6, "firing bio to %s, offset=%ju, length=%ju",
 		    cb->bio_to->name, cb->bio_offset, cb->bio_length);
 		g_io_request(cb, pv->pv_gcons);
 		count++;
 	}
 	if (count == 0) { /* We handled everything locally */
 		bp->bio_completed = bp->bio_length;
 		g_io_deliver(bp, 0);
 	}
 }
 
 static void
 g_llvm_remove_disk(struct g_llvm_vg *vg, struct g_consumer *cp)
 {
 	struct g_llvm_pv *pv;
 	struct g_llvm_lv *lv;
 	struct g_llvm_segment *sg;
 	int found;
 
 	KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__));
 	pv = (struct g_llvm_pv *)cp->private;
 
 	G_LLVM_DEBUG(0, "Disk %s removed from %s.", cp->provider->name,
 	    pv->pv_name);
 
 	LIST_FOREACH(lv, &vg->vg_lvs, lv_next) {
 		/* Find segments that map to this disk */
 		found = 0;
 		LIST_FOREACH(sg, &lv->lv_segs, sg_next) {
 			if (sg->sg_pv == pv) {
 				sg->sg_pv = NULL;
 				lv->lv_sgactive--;
 				found = 1;
 				break;
 			}
 		}
 		if (found) {
 			G_LLVM_DEBUG(0, "Device %s removed.",
 			    lv->lv_gprov->name);
-			g_orphan_provider(lv->lv_gprov, ENXIO);
+			g_wither_provider(lv->lv_gprov, ENXIO);
 			lv->lv_gprov = NULL;
 		}
 	}
 
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static void
 g_llvm_orphan(struct g_consumer *cp)
 {
 	struct g_llvm_vg *vg;
 	struct g_geom *gp;
 
 	g_topology_assert();
 	gp = cp->geom;
 	vg = gp->softc;
 	if (vg == NULL)
 		return;
 
 	g_llvm_remove_disk(vg, cp);
 	g_llvm_destroy(vg, 1);
 }
 
 static int
 g_llvm_activate_lv(struct g_llvm_vg *vg, struct g_llvm_lv *lv)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	g_topology_assert();
 
 	KASSERT(lv->lv_sgactive == lv->lv_sgcount, ("segment missing"));
 
 	gp = vg->vg_geom;
 	pp = g_new_providerf(gp, "linux_lvm/%s-%s", vg->vg_name, lv->lv_name);
 	pp->mediasize = vg->vg_extentsize * (off_t)lv->lv_extentcount;
 	pp->sectorsize = vg->vg_sectorsize;
 	g_error_provider(pp, 0);
 	lv->lv_gprov = pp;
 	pp->private = lv;
 
 	G_LLVM_DEBUG(1, "Created %s, %juM", pp->name,
 	    pp->mediasize / (1024*1024));
 
 	return (0);
 }
 
 static int
 g_llvm_add_disk(struct g_llvm_vg *vg, struct g_provider *pp, char *uuid)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp, *fcp;
 	struct g_llvm_pv *pv;
 	struct g_llvm_lv *lv;
 	struct g_llvm_segment *sg;
 	int error;
 
 	g_topology_assert();
 
 	LIST_FOREACH(pv, &vg->vg_pvs, pv_next) {
 		if (strcmp(pv->pv_uuid, uuid) == 0)
 			break;	/* found it */
 	}
 	if (pv == NULL) {
 		G_LLVM_DEBUG(3, "uuid %s not found in pv list", uuid);
 		return (ENOENT);
 	}
 	if (pv->pv_gprov != NULL) {
 		G_LLVM_DEBUG(0, "disk %s already initialised in %s",
 		    pv->pv_name, vg->vg_name);
 		return (EEXIST);
 	}
 
 	pv->pv_start *= vg->vg_sectorsize;
 	gp = vg->vg_geom;
 	fcp = LIST_FIRST(&gp->consumer);
 
 	cp = g_new_consumer(gp);
 	error = g_attach(cp, pp);
 	G_LLVM_DEBUG(1, "Attached %s to %s at offset %ju",
 	    pp->name, pv->pv_name, pv->pv_start);
 
 	if (error != 0) {
 		G_LLVM_DEBUG(0, "cannot attach %s to %s",
 		    pp->name, vg->vg_name);
 		g_destroy_consumer(cp);
 		return (error);
 	}
 
 	if (fcp != NULL) {
 		if (fcp->provider->sectorsize != pp->sectorsize) {
 			G_LLVM_DEBUG(0, "Provider %s of %s has invalid "
 			    "sector size (%d)", pp->name, vg->vg_name,
 			    pp->sectorsize);
 			return (EINVAL);
 		}
 		if (fcp->acr > 0 || fcp->acw || fcp->ace > 0) {
 			/* Replicate access permissions from first "live"
 			 * consumer to the new one */
 			error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
 			if (error != 0) {
 				g_detach(cp);
 				g_destroy_consumer(cp);
 				return (error);
 			}
 		}
 	}
 
 	cp->private = pv;
 	pv->pv_gcons = cp;
 	pv->pv_gprov = pp;
 
 	LIST_FOREACH(lv, &vg->vg_lvs, lv_next) {
 		/* Find segments that map to this disk */
 		LIST_FOREACH(sg, &lv->lv_segs, sg_next) {
 			if (strcmp(sg->sg_pvname, pv->pv_name) == 0) {
 				/* avtivate the segment */
 				KASSERT(sg->sg_pv == NULL,
 				    ("segment already mapped"));
 				sg->sg_pvoffset =
 				    (off_t)sg->sg_pvstart * vg->vg_extentsize
 				    + pv->pv_start;
 				sg->sg_pv = pv;
 				lv->lv_sgactive++;
 
 				G_LLVM_DEBUG(2, "%s: %d to %d @ %s:%d"
 				    " offset %ju sector %ju",
 				    lv->lv_name, sg->sg_start, sg->sg_end,
 				    sg->sg_pvname, sg->sg_pvstart,
 				    sg->sg_pvoffset,
 				    sg->sg_pvoffset / vg->vg_sectorsize);
 			}
 		}
 		/* Activate any lvs waiting on this disk */
 		if (lv->lv_gprov == NULL && lv->lv_sgactive == lv->lv_sgcount) {
 			error = g_llvm_activate_lv(vg, lv);
 			if (error)
 				break;
 		}
 	}
 	return (error);
 }
 
 static void
 g_llvm_init(struct g_class *mp)
 {
 	LIST_INIT(&vg_list);
 }
 
 static void
 g_llvm_free_vg(struct g_llvm_vg *vg)
 {
 	struct g_llvm_pv *pv;
 	struct g_llvm_lv *lv;
 	struct g_llvm_segment *sg;
 
 	/* Free all the structures */
 	while ((pv = LIST_FIRST(&vg->vg_pvs)) != NULL) {
 		LIST_REMOVE(pv, pv_next);
 		free(pv, M_GLLVM);
 	}
 	while ((lv = LIST_FIRST(&vg->vg_lvs)) != NULL) {
 		while ((sg = LIST_FIRST(&lv->lv_segs)) != NULL) {
 			LIST_REMOVE(sg, sg_next);
 			free(sg, M_GLLVM);
 		}
 		LIST_REMOVE(lv, lv_next);
 		free(lv, M_GLLVM);
 	}
 	LIST_REMOVE(vg, vg_next);
 	free(vg, M_GLLVM);
 }
 
 static void
 g_llvm_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 static struct g_geom *
 g_llvm_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	struct g_llvm_label ll;
 	struct g_llvm_metadata md;
 	struct g_llvm_vg *vg;
 	int error;
 
 	bzero(&md, sizeof(md));
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	gp = g_new_geomf(mp, "linux_lvm:taste");
 	/* This orphan function should be never called. */
 	gp->orphan = g_llvm_taste_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_llvm_read_label(cp, &ll);
 	if (!error)
 		error = g_llvm_read_md(cp, &md, &ll);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 
 	vg = md.md_vg;
 	if (vg->vg_geom == NULL) {
 		/* new volume group */
 		gp = g_new_geomf(mp, "%s", vg->vg_name);
 		gp->start = g_llvm_start;
 		gp->spoiled = g_llvm_orphan;
 		gp->orphan = g_llvm_orphan;
 		gp->access = g_llvm_access;
 		vg->vg_sectorsize = pp->sectorsize;
 		vg->vg_extentsize *= vg->vg_sectorsize;
 		vg->vg_geom = gp;
 		gp->softc = vg;
 		G_LLVM_DEBUG(1, "Created volume %s, extent size %zuK",
 		    vg->vg_name, vg->vg_extentsize / 1024);
 	}
 
 	/* initialise this disk in the volume group */
 	g_llvm_add_disk(vg, pp, ll.ll_uuid);
 	return (vg->vg_geom);
 }
 
 static int
 g_llvm_destroy(struct g_llvm_vg *vg, int force)
 {
 	struct g_provider *pp;
 	struct g_geom *gp;
 
 	g_topology_assert();
 	if (vg == NULL)
 		return (ENXIO);
 	gp = vg->vg_geom;
 
 	LIST_FOREACH(pp, &gp->provider, provider) {
 		if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
 			G_LLVM_DEBUG(1, "Device %s is still open (r%dw%de%d)",
 			    pp->name, pp->acr, pp->acw, pp->ace);
 			if (!force)
 				return (EBUSY);
 		}
 	}
 
 	g_llvm_free_vg(gp->softc);
 	gp->softc = NULL;
 	g_wither_geom(gp, ENXIO);
 	return (0);
 }
 
 static int
 g_llvm_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
     struct g_geom *gp)
 {
 	struct g_llvm_vg *vg;
 
 	vg = gp->softc;
 	return (g_llvm_destroy(vg, 0));
 }
 
 int
 g_llvm_read_label(struct g_consumer *cp, struct g_llvm_label *ll)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int i, error = 0;
 
 	g_topology_assert();
 
 	/* The LVM label is stored on the first four sectors */
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	buf = g_read_data(cp, 0, pp->sectorsize * 4, &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		G_LLVM_DEBUG(1, "Cannot read metadata from %s (error=%d)",
 		    pp->name, error);
 		return (error);
 	}
 
 	/* Search the four sectors for the LVM label. */
 	for (i = 0; i < 4; i++) {
 		error = llvm_label_decode(&buf[i * pp->sectorsize], ll, i);
 		if (error == 0)
 			break;	/* found it */
 	}
 	g_free(buf);
 	return (error);
 }
 
 int
 g_llvm_read_md(struct g_consumer *cp, struct g_llvm_metadata *md,
     struct g_llvm_label *ll)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 	int size;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	buf = g_read_data(cp, ll->ll_md_offset, pp->sectorsize, &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		G_LLVM_DEBUG(0, "Cannot read metadata from %s (error=%d)",
 		    cp->provider->name, error);
 		return (error);
 	}
 
 	error = llvm_md_decode(buf, md, ll);
 	g_free(buf);
 	if (error != 0) {
 		return (error);
 	}
 
 	G_LLVM_DEBUG(1, "reading LVM2 config @ %s:%ju", pp->name,
 		    ll->ll_md_offset + md->md_reloffset);
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	/* round up to the nearest sector */
 	size = md->md_relsize +
 	    (pp->sectorsize - md->md_relsize % pp->sectorsize);
 	buf = g_read_data(cp, ll->ll_md_offset + md->md_reloffset, size, &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		G_LLVM_DEBUG(0, "Cannot read LVM2 config from %s (error=%d)",
 		    pp->name, error);
 		return (error);
 	}
 	buf[md->md_relsize] = '\0';
 	G_LLVM_DEBUG(10, "LVM config:\n%s\n", buf);
 	error = llvm_textconf_decode(buf, md->md_relsize, md);
 	g_free(buf);
 
 	return (error);
 }
 
 static int
 llvm_label_decode(const u_char *data, struct g_llvm_label *ll, int sector)
 {
 	uint64_t off;
 	char *uuid;
 
 	/* Magic string */
 	if (bcmp("LABELONE", data , 8) != 0)
 		return (EINVAL);
 
 	/* We only support LVM2 text format */
 	if (bcmp("LVM2 001", data + 24, 8) != 0) {
 		G_LLVM_DEBUG(0, "Unsupported LVM format");
 		return (EINVAL);
 	}
 
 	ll->ll_sector = le64dec(data + 8);
 	ll->ll_crc = le32dec(data + 16);
 	ll->ll_offset = le32dec(data + 20);
 
 	if (ll->ll_sector != sector) {
 		G_LLVM_DEBUG(0, "Expected sector %ju, found at %d",
 		    ll->ll_sector, sector);
 		return (EINVAL);
 	}
 
 	off = ll->ll_offset;
 	/*
 	 * convert the binary uuid to string format, the format is
 	 * xxxxxx-xxxx-xxxx-xxxx-xxxx-xxxx-xxxxxx (6-4-4-4-4-4-6)
 	 */
 	uuid = ll->ll_uuid;
 	bcopy(data + off, uuid, 6);
 	off += 6;
 	uuid += 6;
 	*uuid++ = '-';
 	for (int i = 0; i < 5; i++) {
 		bcopy(data + off, uuid, 4);
 		off += 4;
 		uuid += 4;
 		*uuid++ = '-';
 	}
 	bcopy(data + off, uuid, 6);
 	off += 6;
 	uuid += 6;
 	*uuid++ = '\0';
 
 	ll->ll_size = le64dec(data + off);
 	off += 8;
 	ll->ll_pestart = le64dec(data + off);
 	off += 16;
 
 	/* Only one data section is supported */
 	if (le64dec(data + off) != 0) {
 		G_LLVM_DEBUG(0, "Only one data section supported");
 		return (EINVAL);
 	}
 
 	off += 16;
 	ll->ll_md_offset = le64dec(data + off);
 	off += 8;
 	ll->ll_md_size = le64dec(data + off);
 	off += 8;
 
 	G_LLVM_DEBUG(1, "LVM metadata: offset=%ju, size=%ju", ll->ll_md_offset,
 	    ll->ll_md_size);
 
 	/* Only one data section is supported */
 	if (le64dec(data + off) != 0) {
 		G_LLVM_DEBUG(0, "Only one metadata section supported");
 		return (EINVAL);
 	}
 
 	G_LLVM_DEBUG(2, "label uuid=%s", ll->ll_uuid);
 	G_LLVM_DEBUG(2, "sector=%ju, crc=%u, offset=%u, size=%ju, pestart=%ju",
 	    ll->ll_sector, ll->ll_crc, ll->ll_offset, ll->ll_size,
 	    ll->ll_pestart);
 
 	return (0);
 }
 
 static int
 llvm_md_decode(const u_char *data, struct g_llvm_metadata *md,
     struct g_llvm_label *ll)
 {
 	uint64_t off;
 	char magic[16];
 
 	off = 0;
 	md->md_csum = le32dec(data + off);
 	off += 4;
 	bcopy(data + off, magic, 16);
 	off += 16;
 	md->md_version = le32dec(data + off);
 	off += 4;
 	md->md_start = le64dec(data + off);
 	off += 8;
 	md->md_size = le64dec(data + off);
 	off += 8;
 
 	if (bcmp(G_LLVM_MAGIC, magic, 16) != 0) {
 		G_LLVM_DEBUG(0, "Incorrect md magic number");
 		return (EINVAL);
 	}
 	if (md->md_version != 1) {
 		G_LLVM_DEBUG(0, "Incorrect md version number (%u)",
 		    md->md_version);
 		return (EINVAL);
 	}
 	if (md->md_start != ll->ll_md_offset) {
 		G_LLVM_DEBUG(0, "Incorrect md offset (%ju)", md->md_start);
 		return (EINVAL);
 	}
 
 	/* Aparently only one is ever returned */
 	md->md_reloffset = le64dec(data + off);
 	off += 8;
 	md->md_relsize = le64dec(data + off);
 	off += 16;	/* XXX skipped checksum */
 
 	if (le64dec(data + off) != 0) {
 		G_LLVM_DEBUG(0, "Only one reloc supported");
 		return (EINVAL);
 	}
 
 	G_LLVM_DEBUG(3, "reloc: offset=%ju, size=%ju",
 	    md->md_reloffset, md->md_relsize);
 	G_LLVM_DEBUG(3, "md: version=%u, start=%ju, size=%ju",
 	    md->md_version, md->md_start, md->md_size);
 
 	return (0);
 }
 
 #define	GRAB_INT(key, tok1, tok2, v)					\
 	if (tok1 && tok2 && strncmp(tok1, key, sizeof(key)) == 0) {	\
 		v = strtol(tok2, &tok1, 10);				\
 		if (tok1 == tok2)					\
 			/* strtol did not eat any of the buffer */	\
 			goto bad;					\
 		continue;						\
 	}
 
 #define	GRAB_STR(key, tok1, tok2, v, len)				\
 	if (tok1 && tok2 && strncmp(tok1, key, sizeof(key)) == 0) {	\
 		strsep(&tok2, "\"");					\
 		if (tok2 == NULL)					\
 			continue;					\
 		tok1 = strsep(&tok2, "\"");				\
 		if (tok2 == NULL)					\
 			continue;					\
 		strncpy(v, tok1, len);					\
 		continue;						\
 	}
 
 #define	SPLIT(key, value, str)						\
 	key = strsep(&value, str);					\
 	/* strip trailing whitespace on the key */			\
 	for (char *t = key; *t != '\0'; t++)				\
 		if (isspace(*t)) {					\
 			*t = '\0';					\
 			break;						\
 		}
 
 static size_t 
 llvm_grab_name(char *name, const char *tok)
 {
 	size_t len;
 
 	len = 0;
 	if (tok == NULL)
 		return (0);
 	if (tok[0] == '-')
 		return (0);
 	if (strcmp(tok, ".") == 0 || strcmp(tok, "..") == 0)
 		return (0);
 	while (tok[len] && (isalpha(tok[len]) || isdigit(tok[len]) ||
 	    tok[len] == '.' || tok[len] == '_' || tok[len] == '-' ||
 	    tok[len] == '+') && len < G_LLVM_NAMELEN - 1)
 		len++;
 	bcopy(tok, name, len);
 	name[len] = '\0';
 	return (len);
 }
 
 static int
 llvm_textconf_decode(u_char *data, int buflen, struct g_llvm_metadata *md)
 {
 	struct g_llvm_vg	*vg;
 	char *buf = data;
 	char *tok, *v;
 	char name[G_LLVM_NAMELEN];
 	char uuid[G_LLVM_UUIDLEN];
 	size_t len;
 
 	if (buf == NULL || *buf == '\0')
 		return (EINVAL);
 
 	tok = strsep(&buf, "\n");
 	if (tok == NULL)
 		return (EINVAL);
 	len = llvm_grab_name(name, tok);
 	if (len == 0)
 		return (EINVAL);
 
 	/* check too see if the vg has already been loaded off another disk */
 	LIST_FOREACH(vg, &vg_list, vg_next) {
 		if (strcmp(vg->vg_name, name) == 0) {
 			uuid[0] = '\0';
 			/* grab the volume group uuid */
 			while ((tok = strsep(&buf, "\n")) != NULL) {
 				if (strstr(tok, "{"))
 					break;
 				if (strstr(tok, "=")) {
 					SPLIT(v, tok, "=");
 					GRAB_STR("id", v, tok, uuid,
 					    sizeof(uuid));
 				}
 			}
 			if (strcmp(vg->vg_uuid, uuid) == 0) {
 				/* existing vg */
 				md->md_vg = vg;
 				return (0);
 			}
 			/* XXX different volume group with name clash! */
 			G_LLVM_DEBUG(0,
 			    "%s already exists, volume group not loaded", name);
 			return (EINVAL);
 		}
 	}
 
 	vg = malloc(sizeof(*vg), M_GLLVM, M_NOWAIT|M_ZERO);
 	if (vg == NULL)
 		return (ENOMEM);
 
 	strncpy(vg->vg_name, name, sizeof(vg->vg_name));
 	LIST_INIT(&vg->vg_pvs);
 	LIST_INIT(&vg->vg_lvs);
 
 #define	VOL_FOREACH(func, tok, buf, p)					\
 	while ((tok = strsep(buf, "\n")) != NULL) {			\
 		if (strstr(tok, "{")) {					\
 			func(buf, tok, p);				\
 			continue;					\
 		}							\
 		if (strstr(tok, "}"))					\
 			break;						\
 	}
 
 	while ((tok = strsep(&buf, "\n")) != NULL) {
 		if (strcmp(tok, "physical_volumes {") == 0) {
 			VOL_FOREACH(llvm_textconf_decode_pv, tok, &buf, vg);
 			continue;
 		}
 		if (strcmp(tok, "logical_volumes {") == 0) {
 			VOL_FOREACH(llvm_textconf_decode_lv, tok, &buf, vg);
 			continue;
 		}
 		if (strstr(tok, "{")) {
 			G_LLVM_DEBUG(2, "unknown section %s", tok);
 			continue;
 		}
 
 		/* parse 'key = value' lines */
 		if (strstr(tok, "=")) {
 			SPLIT(v, tok, "=");
 			GRAB_STR("id", v, tok, vg->vg_uuid, sizeof(vg->vg_uuid));
 			GRAB_INT("extent_size", v, tok, vg->vg_extentsize);
 			continue;
 		}
 	}
 	/* basic checking */
 	if (vg->vg_extentsize == 0)
 		goto bad;
 
 	md->md_vg = vg;
 	LIST_INSERT_HEAD(&vg_list, vg, vg_next);
 	G_LLVM_DEBUG(3, "vg: name=%s uuid=%s", vg->vg_name, vg->vg_uuid);
 	return(0);
 
 bad:
 	g_llvm_free_vg(vg);
 	return (-1);
 }
 #undef	VOL_FOREACH
 
 static int
 llvm_textconf_decode_pv(char **buf, char *tok, struct g_llvm_vg *vg)
 {
 	struct g_llvm_pv	*pv;
 	char *v;
 	size_t len;
 
 	if (*buf == NULL || **buf == '\0')
 		return (EINVAL);
 
 	pv = malloc(sizeof(*pv), M_GLLVM, M_NOWAIT|M_ZERO);
 	if (pv == NULL)
 		return (ENOMEM);
 
 	pv->pv_vg = vg;
 	len = 0;
 	if (tok == NULL)
 		goto bad;
 	len = llvm_grab_name(pv->pv_name, tok);
 	if (len == 0)
 		goto bad;
 
 	while ((tok = strsep(buf, "\n")) != NULL) {
 		if (strstr(tok, "{"))
 			goto bad;
 
 		if (strstr(tok, "}"))
 			break;
 
 		/* parse 'key = value' lines */
 		if (strstr(tok, "=")) {
 			SPLIT(v, tok, "=");
 			GRAB_STR("id", v, tok, pv->pv_uuid, sizeof(pv->pv_uuid));
 			GRAB_INT("pe_start", v, tok, pv->pv_start);
 			GRAB_INT("pe_count", v, tok, pv->pv_count);
 			continue;
 		}
 	}
 	if (tok == NULL)
 		goto bad;
 	/* basic checking */
 	if (pv->pv_count == 0)
 		goto bad;
 
 	LIST_INSERT_HEAD(&vg->vg_pvs, pv, pv_next);
 	G_LLVM_DEBUG(3, "pv: name=%s uuid=%s", pv->pv_name, pv->pv_uuid);
 
 	return (0);
 bad:
 	free(pv, M_GLLVM);
 	return (-1);
 }
 
 static int
 llvm_textconf_decode_lv(char **buf, char *tok, struct g_llvm_vg *vg)
 {
 	struct g_llvm_lv	*lv;
 	struct g_llvm_segment *sg;
 	char *v;
 	size_t len;
 
 	if (*buf == NULL || **buf == '\0')
 		return (EINVAL);
 
 	lv = malloc(sizeof(*lv), M_GLLVM, M_NOWAIT|M_ZERO);
 	if (lv == NULL)
 		return (ENOMEM);
 
 	lv->lv_vg = vg;
 	LIST_INIT(&lv->lv_segs);
 
 	if (tok == NULL)
 		goto bad;
 	len = llvm_grab_name(lv->lv_name, tok);
 	if (len == 0)
 		goto bad;
 
 	while ((tok = strsep(buf, "\n")) != NULL) {
 		if (strstr(tok, "{")) {
 			if (strstr(tok, "segment")) {
 				llvm_textconf_decode_sg(buf, tok, lv);
 				continue;
 			} else
 				/* unexpected section */
 				goto bad;
 		}
 
 		if (strstr(tok, "}"))
 			break;
 
 		/* parse 'key = value' lines */
 		if (strstr(tok, "=")) {
 			SPLIT(v, tok, "=");
 			GRAB_STR("id", v, tok, lv->lv_uuid, sizeof(lv->lv_uuid));
 			GRAB_INT("segment_count", v, tok, lv->lv_sgcount);
 			continue;
 		}
 	}
 	if (tok == NULL)
 		goto bad;
 	if (lv->lv_sgcount == 0 || lv->lv_sgcount != lv->lv_numsegs)
 		/* zero or incomplete segment list */
 		goto bad;
 
 	/* Optimize for only one segment on the pv */
 	lv->lv_firstsg = LIST_FIRST(&lv->lv_segs);
 	LIST_INSERT_HEAD(&vg->vg_lvs, lv, lv_next);
 	G_LLVM_DEBUG(3, "lv: name=%s uuid=%s", lv->lv_name, lv->lv_uuid);
 
 	return (0);
 bad:
 	while ((sg = LIST_FIRST(&lv->lv_segs)) != NULL) {
 		LIST_REMOVE(sg, sg_next);
 		free(sg, M_GLLVM);
 	}
 	free(lv, M_GLLVM);
 	return (-1);
 }
 
 static int
 llvm_textconf_decode_sg(char **buf, char *tok, struct g_llvm_lv *lv)
 {
 	struct g_llvm_segment *sg;
 	char *v;
 	int count = 0;
 
 	if (*buf == NULL || **buf == '\0')
 		return (EINVAL);
 
 	sg = malloc(sizeof(*sg), M_GLLVM, M_NOWAIT|M_ZERO);
 	if (sg == NULL)
 		return (ENOMEM);
 
 	while ((tok = strsep(buf, "\n")) != NULL) {
 		/* only a single linear stripe is supported */
 		if (strstr(tok, "stripe_count")) {
 			SPLIT(v, tok, "=");
 			GRAB_INT("stripe_count", v, tok, count);
 			if (count != 1)
 				goto bad;
 		}
 
 		if (strstr(tok, "{"))
 			goto bad;
 
 		if (strstr(tok, "}"))
 			break;
 
 		if (strcmp(tok, "stripes = [") == 0) {
 			tok = strsep(buf, "\n");
 			if (tok == NULL)
 				goto bad;
 
 			strsep(&tok, "\"");
 			if (tok == NULL)
 				goto bad;	/* missing open quotes */
 			v = strsep(&tok, "\"");
 			if (tok == NULL)
 				goto bad;	/* missing close quotes */
 			strncpy(sg->sg_pvname, v, sizeof(sg->sg_pvname));
 			if (*tok != ',')
 				goto bad;	/* missing comma for stripe */
 			tok++;
 
 			sg->sg_pvstart = strtol(tok, &v, 10);
 			if (v == tok)
 				/* strtol did not eat any of the buffer */
 				goto bad;
 
 			continue;
 		}
 
 		/* parse 'key = value' lines */
 		if (strstr(tok, "=")) {
 			SPLIT(v, tok, "=");
 			GRAB_INT("start_extent", v, tok, sg->sg_start);
 			GRAB_INT("extent_count", v, tok, sg->sg_count);
 			continue;
 		}
 	}
 	if (tok == NULL)
 		goto bad;
 	/* basic checking */
 	if (count != 1 || sg->sg_count == 0)
 		goto bad;
 
 	sg->sg_end = sg->sg_start + sg->sg_count - 1;
 	lv->lv_numsegs++;
 	lv->lv_extentcount += sg->sg_count;
 	LIST_INSERT_HEAD(&lv->lv_segs, sg, sg_next);
 
 	return (0);
 bad:
 	free(sg, M_GLLVM);
 	return (-1);
 }
 #undef	GRAB_INT
 #undef	GRAB_STR
 #undef	SPLIT
 
 static struct g_class g_llvm_class = {
 	.name = G_LLVM_CLASS_NAME,
 	.version = G_VERSION,
 	.init = g_llvm_init,
 	.taste = g_llvm_taste,
 	.destroy_geom = g_llvm_destroy_geom
 };
 
 DECLARE_GEOM_CLASS(g_llvm_class, g_linux_lvm);
Index: user/alc/PQ_LAUNDRY/sys/geom/mirror/g_mirror.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/geom/mirror/g_mirror.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/geom/mirror/g_mirror.c	(revision 306283)
@@ -1,3384 +1,3383 @@
 /*-
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fail.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/eventhandler.h>
 #include <vm/uma.h>
 #include <geom/geom.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/sched.h>
 #include <geom/mirror/g_mirror.h>
 
 FEATURE(geom_mirror, "GEOM mirroring support");
 
 static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data");
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, mirror, CTLFLAG_RW, 0,
     "GEOM_MIRROR stuff");
 u_int g_mirror_debug = 0;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, debug, CTLFLAG_RWTUN, &g_mirror_debug, 0,
     "Debug level");
 static u_int g_mirror_timeout = 4;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_mirror_timeout,
     0, "Time to wait on all mirror components");
 static u_int g_mirror_idletime = 5;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, idletime, CTLFLAG_RWTUN,
     &g_mirror_idletime, 0, "Mark components as clean when idling");
 static u_int g_mirror_disconnect_on_failure = 1;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
     &g_mirror_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
 static u_int g_mirror_syncreqs = 2;
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
     &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests.");
 
 #define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
 	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
 	G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
 } while (0)
 
 static eventhandler_tag g_mirror_post_sync = NULL;
 static int g_mirror_shutdown = 0;
 
 static int g_mirror_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 static g_taste_t g_mirror_taste;
 static g_resize_t g_mirror_resize;
 static void g_mirror_init(struct g_class *mp);
 static void g_mirror_fini(struct g_class *mp);
 
 struct g_class g_mirror_class = {
 	.name = G_MIRROR_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_mirror_config,
 	.taste = g_mirror_taste,
 	.destroy_geom = g_mirror_destroy_geom,
 	.init = g_mirror_init,
 	.fini = g_mirror_fini,
 	.resize = g_mirror_resize
 };
 
 
 static void g_mirror_destroy_provider(struct g_mirror_softc *sc);
 static int g_mirror_update_disk(struct g_mirror_disk *disk, u_int state);
 static void g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force);
 static void g_mirror_dumpconf(struct sbuf *sb, const char *indent,
     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 static void g_mirror_sync_stop(struct g_mirror_disk *disk, int type);
 static void g_mirror_register_request(struct bio *bp);
 static void g_mirror_sync_release(struct g_mirror_softc *sc);
 
 
 static const char *
 g_mirror_disk_state2str(int state)
 {
 
 	switch (state) {
 	case G_MIRROR_DISK_STATE_NONE:
 		return ("NONE");
 	case G_MIRROR_DISK_STATE_NEW:
 		return ("NEW");
 	case G_MIRROR_DISK_STATE_ACTIVE:
 		return ("ACTIVE");
 	case G_MIRROR_DISK_STATE_STALE:
 		return ("STALE");
 	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 		return ("SYNCHRONIZING");
 	case G_MIRROR_DISK_STATE_DISCONNECTED:
 		return ("DISCONNECTED");
 	case G_MIRROR_DISK_STATE_DESTROY:
 		return ("DESTROY");
 	default:
 		return ("INVALID");
 	}
 }
 
 static const char *
 g_mirror_device_state2str(int state)
 {
 
 	switch (state) {
 	case G_MIRROR_DEVICE_STATE_STARTING:
 		return ("STARTING");
 	case G_MIRROR_DEVICE_STATE_RUNNING:
 		return ("RUNNING");
 	default:
 		return ("INVALID");
 	}
 }
 
 static const char *
 g_mirror_get_diskname(struct g_mirror_disk *disk)
 {
 
 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
 		return ("[unknown]");
 	return (disk->d_name);
 }
 
 /*
  * --- Events handling functions ---
  * Events in geom_mirror are used to maintain disks and device status
  * from one thread to simplify locking.
  */
 static void
 g_mirror_event_free(struct g_mirror_event *ep)
 {
 
 	free(ep, M_MIRROR);
 }
 
 int
 g_mirror_event_send(void *arg, int state, int flags)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	struct g_mirror_event *ep;
 	int error;
 
 	ep = malloc(sizeof(*ep), M_MIRROR, M_WAITOK);
 	G_MIRROR_DEBUG(4, "%s: Sending event %p.", __func__, ep);
 	if ((flags & G_MIRROR_EVENT_DEVICE) != 0) {
 		disk = NULL;
 		sc = arg;
 	} else {
 		disk = arg;
 		sc = disk->d_softc;
 	}
 	ep->e_disk = disk;
 	ep->e_state = state;
 	ep->e_flags = flags;
 	ep->e_error = 0;
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	mtx_unlock(&sc->sc_queue_mtx);
 	if ((flags & G_MIRROR_EVENT_DONTWAIT) != 0)
 		return (0);
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
 	sx_xunlock(&sc->sc_lock);
 	while ((ep->e_flags & G_MIRROR_EVENT_DONE) == 0) {
 		mtx_lock(&sc->sc_events_mtx);
 		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "m:event",
 		    hz * 5);
 	}
 	error = ep->e_error;
 	g_mirror_event_free(ep);
 	sx_xlock(&sc->sc_lock);
 	return (error);
 }
 
 static struct g_mirror_event *
 g_mirror_event_get(struct g_mirror_softc *sc)
 {
 	struct g_mirror_event *ep;
 
 	mtx_lock(&sc->sc_events_mtx);
 	ep = TAILQ_FIRST(&sc->sc_events);
 	mtx_unlock(&sc->sc_events_mtx);
 	return (ep);
 }
 
 static void
 g_mirror_event_remove(struct g_mirror_softc *sc, struct g_mirror_event *ep)
 {
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 static void
 g_mirror_event_cancel(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_event *ep, *tmpep;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
 		if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0)
 			continue;
 		if (ep->e_disk != disk)
 			continue;
 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 		if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
 			g_mirror_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			wakeup(ep);
 		}
 	}
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 /*
  * Return the number of disks in given state.
  * If state is equal to -1, count all connected disks.
  */
 u_int
 g_mirror_ndisks(struct g_mirror_softc *sc, int state)
 {
 	struct g_mirror_disk *disk;
 	u_int n = 0;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (state == -1 || disk->d_state == state)
 			n++;
 	}
 	return (n);
 }
 
 /*
  * Find a disk in mirror by its disk ID.
  */
 static struct g_mirror_disk *
 g_mirror_id2disk(struct g_mirror_softc *sc, uint32_t id)
 {
 	struct g_mirror_disk *disk;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_id == id)
 			return (disk);
 	}
 	return (NULL);
 }
 
 static u_int
 g_mirror_nrequests(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 	struct bio *bp;
 	u_int nreqs = 0;
 
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
 		if (bp->bio_from == cp)
 			nreqs++;
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	return (nreqs);
 }
 
 static int
 g_mirror_is_busy(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 
 	if (cp->index > 0) {
 		G_MIRROR_DEBUG(2,
 		    "I/O requests for %s exist, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	if (g_mirror_nrequests(sc, cp) > 0) {
 		G_MIRROR_DEBUG(2,
 		    "I/O requests for %s in queue, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	return (0);
 }
 
 static void
 g_mirror_destroy_consumer(void *arg, int flags __unused)
 {
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	cp = arg;
 	G_MIRROR_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static void
 g_mirror_kill_consumer(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	int retaste_wait;
 
 	g_topology_assert();
 
 	cp->private = NULL;
 	if (g_mirror_is_busy(sc, cp))
 		return;
 	pp = cp->provider;
 	retaste_wait = 0;
 	if (cp->acw == 1) {
 		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
 			retaste_wait = 1;
 	}
 	G_MIRROR_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
 	    -cp->acw, -cp->ace, 0);
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	if (retaste_wait) {
 		/*
 		 * After retaste event was send (inside g_access()), we can send
 		 * event to detach and destroy consumer.
 		 * A class, which has consumer to the given provider connected
 		 * will not receive retaste event for the provider.
 		 * This is the way how I ignore retaste events when I close
 		 * consumers opened for write: I detach and destroy consumer
 		 * after retaste event is sent.
 		 */
 		g_post_event(g_mirror_destroy_consumer, cp, M_WAITOK, NULL);
 		return;
 	}
 	G_MIRROR_DEBUG(1, "Consumer %s destroyed.", pp->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static int
 g_mirror_connect_disk(struct g_mirror_disk *disk, struct g_provider *pp)
 {
 	struct g_consumer *cp;
 	int error;
 
 	g_topology_assert_not();
 	KASSERT(disk->d_consumer == NULL,
 	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
 
 	g_topology_lock();
 	cp = g_new_consumer(disk->d_softc->sc_geom);
 	cp->flags |= G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		return (error);
 	}
 	error = g_access(cp, 1, 1, 1);
 	if (error != 0) {
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		G_MIRROR_DEBUG(0, "Cannot open consumer %s (error=%d).",
 		    pp->name, error);
 		return (error);
 	}
 	g_topology_unlock();
 	disk->d_consumer = cp;
 	disk->d_consumer->private = disk;
 	disk->d_consumer->index = 0;
 
 	G_MIRROR_DEBUG(2, "Disk %s connected.", g_mirror_get_diskname(disk));
 	return (0);
 }
 
 static void
 g_mirror_disconnect_consumer(struct g_mirror_softc *sc, struct g_consumer *cp)
 {
 
 	g_topology_assert();
 
 	if (cp == NULL)
 		return;
 	if (cp->provider != NULL)
 		g_mirror_kill_consumer(sc, cp);
 	else
 		g_destroy_consumer(cp);
 }
 
 /*
  * Initialize disk. This means allocate memory, create consumer, attach it
  * to the provider and open access (r1w1e1) to it.
  */
 static struct g_mirror_disk *
 g_mirror_init_disk(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md, int *errorp)
 {
 	struct g_mirror_disk *disk;
 	int i, error;
 
 	disk = malloc(sizeof(*disk), M_MIRROR, M_NOWAIT | M_ZERO);
 	if (disk == NULL) {
 		error = ENOMEM;
 		goto fail;
 	}
 	disk->d_softc = sc;
 	error = g_mirror_connect_disk(disk, pp);
 	if (error != 0)
 		goto fail;
 	disk->d_id = md->md_did;
 	disk->d_state = G_MIRROR_DISK_STATE_NONE;
 	disk->d_priority = md->md_priority;
 	disk->d_flags = md->md_dflags;
 	error = g_getattr("GEOM::candelete", disk->d_consumer, &i);
 	if (error == 0 && i != 0)
 		disk->d_flags |= G_MIRROR_DISK_FLAG_CANDELETE;
 	if (md->md_provider[0] != '\0')
 		disk->d_flags |= G_MIRROR_DISK_FLAG_HARDCODED;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_sync.ds_offset = md->md_sync_offset;
 	disk->d_sync.ds_offset_done = md->md_sync_offset;
 	disk->d_genid = md->md_genid;
 	disk->d_sync.ds_syncid = md->md_syncid;
 	if (errorp != NULL)
 		*errorp = 0;
 	return (disk);
 fail:
 	if (errorp != NULL)
 		*errorp = error;
 	if (disk != NULL)
 		free(disk, M_MIRROR);
 	return (NULL);
 }
 
 static void
 g_mirror_destroy_disk(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	LIST_REMOVE(disk, d_next);
 	g_mirror_event_cancel(disk);
 	if (sc->sc_hint == disk)
 		sc->sc_hint = NULL;
 	switch (disk->d_state) {
 	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 		g_mirror_sync_stop(disk, 1);
 		/* FALLTHROUGH */
 	case G_MIRROR_DISK_STATE_NEW:
 	case G_MIRROR_DISK_STATE_STALE:
 	case G_MIRROR_DISK_STATE_ACTIVE:
 		g_topology_lock();
 		g_mirror_disconnect_consumer(sc, disk->d_consumer);
 		g_topology_unlock();
 		free(disk, M_MIRROR);
 		break;
 	default:
 		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 	}
 }
 
 static void
 g_mirror_destroy_device(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 	struct g_mirror_event *ep;
 	struct g_geom *gp;
 	struct g_consumer *cp, *tmpcp;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	gp = sc->sc_geom;
 	if (sc->sc_provider != NULL)
 		g_mirror_destroy_provider(sc);
 	for (disk = LIST_FIRST(&sc->sc_disks); disk != NULL;
 	    disk = LIST_FIRST(&sc->sc_disks)) {
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		g_mirror_update_metadata(disk);
 		g_mirror_destroy_disk(disk);
 	}
 	while ((ep = g_mirror_event_get(sc)) != NULL) {
 		g_mirror_event_remove(sc, ep);
 		if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0)
 			g_mirror_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			ep->e_flags |= G_MIRROR_EVENT_DONE;
 			G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, ep);
 			mtx_lock(&sc->sc_events_mtx);
 			wakeup(ep);
 			mtx_unlock(&sc->sc_events_mtx);
 		}
 	}
 	callout_drain(&sc->sc_callout);
 
 	g_topology_lock();
 	LIST_FOREACH_SAFE(cp, &sc->sc_sync.ds_geom->consumer, consumer, tmpcp) {
 		g_mirror_disconnect_consumer(sc, cp);
 	}
 	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
 	G_MIRROR_DEBUG(0, "Device %s destroyed.", gp->name);
 	g_wither_geom(gp, ENXIO);
 	g_topology_unlock();
 	mtx_destroy(&sc->sc_queue_mtx);
 	mtx_destroy(&sc->sc_events_mtx);
 	mtx_destroy(&sc->sc_done_mtx);
 	sx_xunlock(&sc->sc_lock);
 	sx_destroy(&sc->sc_lock);
 }
 
 static void
 g_mirror_orphan(struct g_consumer *cp)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert();
 
 	disk = cp->private;
 	if (disk == NULL)
 		return;
 	disk->d_softc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 	g_mirror_event_send(disk, G_MIRROR_DISK_STATE_DISCONNECTED,
 	    G_MIRROR_EVENT_DONTWAIT);
 }
 
 /*
  * Function should return the next active disk on the list.
  * It is possible that it will be the same disk as given.
  * If there are no active disks on list, NULL is returned.
  */
 static __inline struct g_mirror_disk *
 g_mirror_find_next(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
 {
 	struct g_mirror_disk *dp;
 
 	for (dp = LIST_NEXT(disk, d_next); dp != disk;
 	    dp = LIST_NEXT(dp, d_next)) {
 		if (dp == NULL)
 			dp = LIST_FIRST(&sc->sc_disks);
 		if (dp->d_state == G_MIRROR_DISK_STATE_ACTIVE)
 			break;
 	}
 	if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 		return (NULL);
 	return (dp);
 }
 
 static struct g_mirror_disk *
 g_mirror_get_disk(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 
 	if (sc->sc_hint == NULL) {
 		sc->sc_hint = LIST_FIRST(&sc->sc_disks);
 		if (sc->sc_hint == NULL)
 			return (NULL);
 	}
 	disk = sc->sc_hint;
 	if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE) {
 		disk = g_mirror_find_next(sc, disk);
 		if (disk == NULL)
 			return (NULL);
 	}
 	sc->sc_hint = g_mirror_find_next(sc, disk);
 	return (disk);
 }
 
 static int
 g_mirror_write_metadata(struct g_mirror_disk *disk,
     struct g_mirror_metadata *md)
 {
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 	off_t offset, length;
 	u_char *sector;
 	int error = 0;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	cp = disk->d_consumer;
 	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
 	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	length = cp->provider->sectorsize;
 	offset = cp->provider->mediasize - length;
 	sector = malloc((size_t)length, M_MIRROR, M_WAITOK | M_ZERO);
 	if (md != NULL &&
 	    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0) {
 		/*
 		 * Handle the case, when the size of parent provider reduced.
 		 */
 		if (offset < md->md_mediasize)
 			error = ENOSPC;
 		else
 			mirror_metadata_encode(md, sector);
 	}
 	KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_metadata_write, error);
 	if (error == 0)
 		error = g_write_data(cp, offset, sector, length);
 	free(sector, M_MIRROR);
 	if (error != 0) {
 		if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) {
 			disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN;
 			G_MIRROR_DEBUG(0, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_mirror_get_diskname(disk), sc->sc_name, error);
 		} else {
 			G_MIRROR_DEBUG(1, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_mirror_get_diskname(disk), sc->sc_name, error);
 		}
 		if (g_mirror_disconnect_on_failure &&
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1) {
 			sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
 			g_mirror_event_send(disk,
 			    G_MIRROR_DISK_STATE_DISCONNECTED,
 			    G_MIRROR_EVENT_DONTWAIT);
 		}
 	}
 	return (error);
 }
 
 static int
 g_mirror_clear_metadata(struct g_mirror_disk *disk)
 {
 	int error;
 
 	g_topology_assert_not();
 	sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);
 
 	error = g_mirror_write_metadata(disk, NULL);
 	if (error == 0) {
 		G_MIRROR_DEBUG(2, "Metadata on %s cleared.",
 		    g_mirror_get_diskname(disk));
 	} else {
 		G_MIRROR_DEBUG(0,
 		    "Cannot clear metadata on disk %s (error=%d).",
 		    g_mirror_get_diskname(disk), error);
 	}
 	return (error);
 }
 
 void
 g_mirror_fill_metadata(struct g_mirror_softc *sc, struct g_mirror_disk *disk,
     struct g_mirror_metadata *md)
 {
 
 	strlcpy(md->md_magic, G_MIRROR_MAGIC, sizeof(md->md_magic));
 	md->md_version = G_MIRROR_VERSION;
 	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
 	md->md_mid = sc->sc_id;
 	md->md_all = sc->sc_ndisks;
 	md->md_slice = sc->sc_slice;
 	md->md_balance = sc->sc_balance;
 	md->md_genid = sc->sc_genid;
 	md->md_mediasize = sc->sc_mediasize;
 	md->md_sectorsize = sc->sc_sectorsize;
 	md->md_mflags = (sc->sc_flags & G_MIRROR_DEVICE_FLAG_MASK);
 	bzero(md->md_provider, sizeof(md->md_provider));
 	if (disk == NULL) {
 		md->md_did = arc4random();
 		md->md_priority = 0;
 		md->md_syncid = 0;
 		md->md_dflags = 0;
 		md->md_sync_offset = 0;
 		md->md_provsize = 0;
 	} else {
 		md->md_did = disk->d_id;
 		md->md_priority = disk->d_priority;
 		md->md_syncid = disk->d_sync.ds_syncid;
 		md->md_dflags = (disk->d_flags & G_MIRROR_DISK_FLAG_MASK);
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			md->md_sync_offset = disk->d_sync.ds_offset_done;
 		else
 			md->md_sync_offset = 0;
 		if ((disk->d_flags & G_MIRROR_DISK_FLAG_HARDCODED) != 0) {
 			strlcpy(md->md_provider,
 			    disk->d_consumer->provider->name,
 			    sizeof(md->md_provider));
 		}
 		md->md_provsize = disk->d_consumer->provider->mediasize;
 	}
 }
 
 void
 g_mirror_update_metadata(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_metadata md;
 	int error;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WIPE) == 0)
 		g_mirror_fill_metadata(sc, disk, &md);
 	error = g_mirror_write_metadata(disk, &md);
 	if (error == 0) {
 		G_MIRROR_DEBUG(2, "Metadata on %s updated.",
 		    g_mirror_get_diskname(disk));
 	} else {
 		G_MIRROR_DEBUG(0,
 		    "Cannot update metadata on disk %s (error=%d).",
 		    g_mirror_get_diskname(disk), error);
 	}
 }
 
 static void
 g_mirror_bump_syncid(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_syncid++;
 	G_MIRROR_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
 	    sc->sc_syncid);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
 		    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 			disk->d_sync.ds_syncid = sc->sc_syncid;
 			g_mirror_update_metadata(disk);
 		}
 	}
 }
 
 static void
 g_mirror_bump_genid(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_genid++;
 	G_MIRROR_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
 	    sc->sc_genid);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
 		    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 			disk->d_genid = sc->sc_genid;
 			g_mirror_update_metadata(disk);
 		}
 	}
 }
 
 static int
 g_mirror_idle(struct g_mirror_softc *sc, int acw)
 {
 	struct g_mirror_disk *disk;
 	int timeout;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (sc->sc_provider == NULL)
 		return (0);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return (0);
 	if (sc->sc_idle)
 		return (0);
 	if (sc->sc_writes > 0)
 		return (0);
 	if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
 		timeout = g_mirror_idletime - (time_uptime - sc->sc_last_write);
 		if (!g_mirror_shutdown && timeout > 0)
 			return (timeout);
 	}
 	sc->sc_idle = 1;
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.",
 		    g_mirror_get_diskname(disk), sc->sc_name);
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		g_mirror_update_metadata(disk);
 	}
 	return (0);
 }
 
 static void
 g_mirror_unidle(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 	sc->sc_idle = 0;
 	sc->sc_last_write = time_uptime;
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.",
 		    g_mirror_get_diskname(disk), sc->sc_name);
 		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
 		g_mirror_update_metadata(disk);
 	}
 }
 
 static void
 g_mirror_flush_done(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct bio *pbp;
 
 	pbp = bp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	mtx_lock(&sc->sc_done_mtx);
 	if (pbp->bio_error == 0)
 		pbp->bio_error = bp->bio_error;
 	pbp->bio_completed += bp->bio_completed;
 	pbp->bio_inbed++;
 	if (pbp->bio_children == pbp->bio_inbed) {
 		mtx_unlock(&sc->sc_done_mtx);
 		g_io_deliver(pbp, pbp->bio_error);
 	} else
 		mtx_unlock(&sc->sc_done_mtx);
 	g_destroy_bio(bp);
 }
 
 static void
 g_mirror_done(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags = G_MIRROR_BIO_FLAG_REGULAR;
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 }
 
 static void
 g_mirror_regular_request(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	struct bio *pbp;
 
 	g_topology_assert_not();
 
 	pbp = bp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	bp->bio_from->index--;
 	if (bp->bio_cmd == BIO_WRITE)
 		sc->sc_writes--;
 	disk = bp->bio_from->private;
 	if (disk == NULL) {
 		g_topology_lock();
 		g_mirror_kill_consumer(sc, bp->bio_from);
 		g_topology_unlock();
 	}
 
 	if (bp->bio_cmd == BIO_READ)
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_read,
 		    bp->bio_error);
 	else if (bp->bio_cmd == BIO_WRITE)
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_regular_request_write,
 		    bp->bio_error);
 
 	pbp->bio_inbed++;
 	KASSERT(pbp->bio_inbed <= pbp->bio_children,
 	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
 	    pbp->bio_children));
 	if (bp->bio_error == 0 && pbp->bio_error == 0) {
 		G_MIRROR_LOGREQ(3, bp, "Request delivered.");
 		g_destroy_bio(bp);
 		if (pbp->bio_children == pbp->bio_inbed) {
 			G_MIRROR_LOGREQ(3, pbp, "Request delivered.");
 			pbp->bio_completed = pbp->bio_length;
 			if (pbp->bio_cmd == BIO_WRITE ||
 			    pbp->bio_cmd == BIO_DELETE) {
 				bioq_remove(&sc->sc_inflight, pbp);
 				/* Release delayed sync requests if possible. */
 				g_mirror_sync_release(sc);
 			}
 			g_io_deliver(pbp, pbp->bio_error);
 		}
 		return;
 	} else if (bp->bio_error != 0) {
 		if (pbp->bio_error == 0)
 			pbp->bio_error = bp->bio_error;
 		if (disk != NULL) {
 			if ((disk->d_flags & G_MIRROR_DISK_FLAG_BROKEN) == 0) {
 				disk->d_flags |= G_MIRROR_DISK_FLAG_BROKEN;
 				G_MIRROR_LOGREQ(0, bp,
 				    "Request failed (error=%d).",
 				    bp->bio_error);
 			} else {
 				G_MIRROR_LOGREQ(1, bp,
 				    "Request failed (error=%d).",
 				    bp->bio_error);
 			}
 			if (g_mirror_disconnect_on_failure &&
 			    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) > 1)
 			{
 				sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
 				g_mirror_event_send(disk,
 				    G_MIRROR_DISK_STATE_DISCONNECTED,
 				    G_MIRROR_EVENT_DONTWAIT);
 			}
 		}
 		switch (pbp->bio_cmd) {
 		case BIO_DELETE:
 		case BIO_WRITE:
 			pbp->bio_inbed--;
 			pbp->bio_children--;
 			break;
 		}
 	}
 	g_destroy_bio(bp);
 
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		if (pbp->bio_inbed < pbp->bio_children)
 			break;
 		if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 1)
 			g_io_deliver(pbp, pbp->bio_error);
 		else {
 			pbp->bio_error = 0;
 			mtx_lock(&sc->sc_queue_mtx);
 			bioq_insert_tail(&sc->sc_queue, pbp);
 			mtx_unlock(&sc->sc_queue_mtx);
 			G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 			wakeup(sc);
 		}
 		break;
 	case BIO_DELETE:
 	case BIO_WRITE:
 		if (pbp->bio_children == 0) {
 			/*
 			 * All requests failed.
 			 */
 		} else if (pbp->bio_inbed < pbp->bio_children) {
 			/* Do nothing. */
 			break;
 		} else if (pbp->bio_children == pbp->bio_inbed) {
 			/* Some requests succeeded. */
 			pbp->bio_error = 0;
 			pbp->bio_completed = pbp->bio_length;
 		}
 		bioq_remove(&sc->sc_inflight, pbp);
 		/* Release delayed sync requests if possible. */
 		g_mirror_sync_release(sc);
 		g_io_deliver(pbp, pbp->bio_error);
 		break;
 	default:
 		KASSERT(1 == 0, ("Invalid request: %u.", pbp->bio_cmd));
 		break;
 	}
 }
 
 static void
 g_mirror_sync_done(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	G_MIRROR_LOGREQ(3, bp, "Synchronization request delivered.");
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags = G_MIRROR_BIO_FLAG_SYNC;
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 }
 
 static void
 g_mirror_candelete(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	int *val;
 
 	sc = bp->bio_to->geom->softc;
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE)
 			break;
 	}
 	val = (int *)bp->bio_data;
 	*val = (disk != NULL);
 	g_io_deliver(bp, 0);
 }
 
 static void
 g_mirror_kernel_dump(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 	struct bio *cbp;
 	struct g_kerneldump *gkd;
 
 	/*
 	 * We configure dumping to the first component, because this component
 	 * will be used for reading with 'prefer' balance algorithm.
 	 * If the component with the highest priority is currently disconnected
 	 * we will not be able to read the dump after the reboot if it will be
 	 * connected and synchronized later. Can we do something better?
 	 */
 	sc = bp->bio_to->geom->softc;
 	disk = LIST_FIRST(&sc->sc_disks);
 
 	gkd = (struct g_kerneldump *)bp->bio_data;
 	if (gkd->length > bp->bio_to->mediasize)
 		gkd->length = bp->bio_to->mediasize;
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 	cbp->bio_done = g_std_done;
 	g_io_request(cbp, disk->d_consumer);
 	G_MIRROR_DEBUG(1, "Kernel dump will go to %s.",
 	    g_mirror_get_diskname(disk));
 }
 
 static void
 g_mirror_flush(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	bioq_init(&queue);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			while ((cbp = bioq_takefirst(&queue)) != NULL)
 				g_destroy_bio(cbp);
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		cbp->bio_done = g_mirror_flush_done;
 		cbp->bio_caller1 = disk;
 		cbp->bio_to = disk->d_consumer->provider;
 	}
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 		disk = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		g_io_request(cbp, disk->d_consumer);
 	}
 }
 
 static void
 g_mirror_start(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	/*
 	 * If sc == NULL or there are no valid disks, provider's error
 	 * should be set and g_mirror_start() should not be called at all.
 	 */
 	KASSERT(sc != NULL && sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 	    ("Provider's error should be set (error=%d)(mirror=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 	G_MIRROR_LOGREQ(3, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	case BIO_FLUSH:
 		g_mirror_flush(sc, bp);
 		return;
 	case BIO_GETATTR:
 		if (!strcmp(bp->bio_attribute, "GEOM::candelete")) {
 			g_mirror_candelete(bp);
 			return;
 		} else if (strcmp("GEOM::kerneldump", bp->bio_attribute) == 0) {
 			g_mirror_kernel_dump(bp);
 			return;
 		}
 		/* FALLTHROUGH */
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	wakeup(sc);
 }
 
 /*
  * Return TRUE if the given request is colliding with a in-progress
  * synchronization request.
  */
 static int
 g_mirror_sync_collision(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk;
 	struct bio *sbp;
 	off_t rstart, rend, sstart, send;
 	u_int i;
 
 	if (sc->sc_sync.ds_ndisks == 0)
 		return (0);
 	rstart = bp->bio_offset;
 	rend = bp->bio_offset + bp->bio_length;
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			continue;
 		for (i = 0; i < g_mirror_syncreqs; i++) {
 			sbp = disk->d_sync.ds_bios[i];
 			if (sbp == NULL)
 				continue;
 			sstart = sbp->bio_offset;
 			send = sbp->bio_offset + sbp->bio_length;
 			if (rend > sstart && rstart < send)
 				return (1);
 		}
 	}
 	return (0);
 }
 
 /*
  * Return TRUE if the given sync request is colliding with a in-progress regular
  * request.
  */
 static int
 g_mirror_regular_collision(struct g_mirror_softc *sc, struct bio *sbp)
 {
 	off_t rstart, rend, sstart, send;
 	struct bio *bp;
 
 	if (sc->sc_sync.ds_ndisks == 0)
 		return (0);
 	sstart = sbp->bio_offset;
 	send = sbp->bio_offset + sbp->bio_length;
 	TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) {
 		rstart = bp->bio_offset;
 		rend = bp->bio_offset + bp->bio_length;
 		if (rend > sstart && rstart < send)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Puts request onto delayed queue.
  */
 static void
 g_mirror_regular_delay(struct g_mirror_softc *sc, struct bio *bp)
 {
 
 	G_MIRROR_LOGREQ(2, bp, "Delaying request.");
 	bioq_insert_head(&sc->sc_regular_delayed, bp);
 }
 
 /*
  * Puts synchronization request onto delayed queue.
  */
 static void
 g_mirror_sync_delay(struct g_mirror_softc *sc, struct bio *bp)
 {
 
 	G_MIRROR_LOGREQ(2, bp, "Delaying synchronization request.");
 	bioq_insert_tail(&sc->sc_sync_delayed, bp);
 }
 
 /*
  * Releases delayed regular requests which don't collide anymore with sync
  * requests.
  */
 static void
 g_mirror_regular_release(struct g_mirror_softc *sc)
 {
 	struct bio *bp, *bp2;
 
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) {
 		if (g_mirror_sync_collision(sc, bp))
 			continue;
 		bioq_remove(&sc->sc_regular_delayed, bp);
 		G_MIRROR_LOGREQ(2, bp, "Releasing delayed request (%p).", bp);
 		mtx_lock(&sc->sc_queue_mtx);
 		bioq_insert_head(&sc->sc_queue, bp);
 #if 0
 		/*
 		 * wakeup() is not needed, because this function is called from
 		 * the worker thread.
 		 */
 		wakeup(&sc->sc_queue);
 #endif
 		mtx_unlock(&sc->sc_queue_mtx);
 	}
 }
 
 /*
  * Releases delayed sync requests which don't collide anymore with regular
  * requests.
  */
 static void
 g_mirror_sync_release(struct g_mirror_softc *sc)
 {
 	struct bio *bp, *bp2;
 
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) {
 		if (g_mirror_regular_collision(sc, bp))
 			continue;
 		bioq_remove(&sc->sc_sync_delayed, bp);
 		G_MIRROR_LOGREQ(2, bp,
 		    "Releasing delayed synchronization request.");
 		g_io_request(bp, bp->bio_from);
 	}
 }
 
 /*
  * Handle synchronization requests.
  * Every synchronization request is two-steps process: first, READ request is
  * send to active provider and then WRITE request (with read data) to the provider
  * being synchronized. When WRITE is finished, new synchronization request is
  * send.
  */
 static void
 g_mirror_sync_request(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_disk *disk;
 
 	bp->bio_from->index--;
 	sc = bp->bio_from->geom->softc;
 	disk = bp->bio_from->private;
 	if (disk == NULL) {
 		sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 		g_topology_lock();
 		g_mirror_kill_consumer(sc, bp->bio_from);
 		g_topology_unlock();
 		free(bp->bio_data, M_MIRROR);
 		g_destroy_bio(bp);
 		sx_xlock(&sc->sc_lock);
 		return;
 	}
 
 	/*
 	 * Synchronization request.
 	 */
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	    {
 		struct g_consumer *cp;
 
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_read,
 		    bp->bio_error);
 
 		if (bp->bio_error != 0) {
 			G_MIRROR_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			g_destroy_bio(bp);
 			return;
 		}
 		G_MIRROR_LOGREQ(3, bp,
 		    "Synchronization request half-finished.");
 		bp->bio_cmd = BIO_WRITE;
 		bp->bio_cflags = 0;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		cp->index++;
 		g_io_request(bp, cp);
 		return;
 	    }
 	case BIO_WRITE:
 	    {
 		struct g_mirror_disk_sync *sync;
 		off_t offset;
 		void *data;
 		int i;
 
 		KFAIL_POINT_ERROR(DEBUG_FP, g_mirror_sync_request_write,
 		    bp->bio_error);
 
 		if (bp->bio_error != 0) {
 			G_MIRROR_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			g_destroy_bio(bp);
 			sc->sc_bump_id |= G_MIRROR_BUMP_GENID;
 			g_mirror_event_send(disk,
 			    G_MIRROR_DISK_STATE_DISCONNECTED,
 			    G_MIRROR_EVENT_DONTWAIT);
 			return;
 		}
 		G_MIRROR_LOGREQ(3, bp, "Synchronization request finished.");
 		sync = &disk->d_sync;
 		if (sync->ds_offset >= sc->sc_mediasize ||
 		    sync->ds_consumer == NULL ||
 		    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 			/* Don't send more synchronization requests. */
 			sync->ds_inflight--;
 			if (sync->ds_bios != NULL) {
 				i = (int)(uintptr_t)bp->bio_caller1;
 				sync->ds_bios[i] = NULL;
 			}
 			free(bp->bio_data, M_MIRROR);
 			g_destroy_bio(bp);
 			if (sync->ds_inflight > 0)
 				return;
 			if (sync->ds_consumer == NULL ||
 			    (sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 				return;
 			}
 			/* Disk up-to-date, activate it. */
 			g_mirror_event_send(disk, G_MIRROR_DISK_STATE_ACTIVE,
 			    G_MIRROR_EVENT_DONTWAIT);
 			return;
 		}
 
 		/* Send next synchronization request. */
 		data = bp->bio_data;
 		g_reset_bio(bp);
 		bp->bio_cmd = BIO_READ;
 		bp->bio_offset = sync->ds_offset;
 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
 		sync->ds_offset += bp->bio_length;
 		bp->bio_done = g_mirror_sync_done;
 		bp->bio_data = data;
 		bp->bio_from = sync->ds_consumer;
 		bp->bio_to = sc->sc_provider;
 		G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
 		sync->ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_mirror_regular_collision(sc, bp))
 			g_mirror_sync_delay(sc, bp);
 		else
 			g_io_request(bp, sync->ds_consumer);
 
 		/* Release delayed requests if possible. */
 		g_mirror_regular_release(sc);
 
 		/* Find the smallest offset */
 		offset = sc->sc_mediasize;
 		for (i = 0; i < g_mirror_syncreqs; i++) {
 			bp = sync->ds_bios[i];
 			if (bp->bio_offset < offset)
 				offset = bp->bio_offset;
 		}
 		if (sync->ds_offset_done + (MAXPHYS * 100) < offset) {
 			/* Update offset_done on every 100 blocks. */
 			sync->ds_offset_done = offset;
 			g_mirror_update_metadata(disk);
 		}
 		return;
 	    }
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
 		    bp->bio_cmd, sc->sc_name));
 		break;
 	}
 }
 
 static void
 g_mirror_request_prefer(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_ACTIVE)
 			break;
 	}
 	if (disk == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENXIO;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENOMEM;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	cp->index++;
 	g_io_request(cbp, cp);
 }
 
 static void
 g_mirror_request_round_robin(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 
 	disk = g_mirror_get_disk(sc);
 	if (disk == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENXIO;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENOMEM;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	cp->index++;
 	g_io_request(cbp, cp);
 }
 
 #define TRACK_SIZE  (1 * 1024 * 1024)
 #define LOAD_SCALE	256
 #define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
 
 static void
 g_mirror_request_load(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct g_mirror_disk *disk, *dp;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	int prio, best;
 
 	/* Find a disk with the smallest load. */
 	disk = NULL;
 	best = INT_MAX;
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		prio = dp->load;
 		/* If disk head is precisely in position - highly prefer it. */
 		if (dp->d_last_offset == bp->bio_offset)
 			prio -= 2 * LOAD_SCALE;
 		else
 		/* If disk head is close to position - prefer it. */
 		if (ABS(dp->d_last_offset - bp->bio_offset) < TRACK_SIZE)
 			prio -= 1 * LOAD_SCALE;
 		if (prio <= best) {
 			disk = dp;
 			best = prio;
 		}
 	}
 	KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name));
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		if (bp->bio_error == 0)
 			bp->bio_error = ENOMEM;
 		g_io_deliver(bp, bp->bio_error);
 		return;
 	}
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	cp->index++;
 	/* Remember last head position */
 	disk->d_last_offset = bp->bio_offset + bp->bio_length;
 	/* Update loads. */
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		dp->load = (dp->d_consumer->index * LOAD_SCALE +
 		    dp->load * 7) / 8;
 	}
 	g_io_request(cbp, cp);
 }
 
 static void
 g_mirror_request_split(struct g_mirror_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_mirror_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	off_t left, mod, offset, slice;
 	u_char *data;
 	u_int ndisks;
 
 	if (bp->bio_length <= sc->sc_slice) {
 		g_mirror_request_round_robin(sc, bp);
 		return;
 	}
 	ndisks = g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE);
 	slice = bp->bio_length / ndisks;
 	mod = slice % sc->sc_provider->sectorsize;
 	if (mod != 0)
 		slice += sc->sc_provider->sectorsize - mod;
 	/*
 	 * Allocate all bios before sending any request, so we can
 	 * return ENOMEM in nice and clean way.
 	 */
 	left = bp->bio_length;
 	offset = bp->bio_offset;
 	data = bp->bio_data;
 	bioq_init(&queue);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			while ((cbp = bioq_takefirst(&queue)) != NULL)
 				g_destroy_bio(cbp);
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		cbp->bio_done = g_mirror_done;
 		cbp->bio_caller1 = disk;
 		cbp->bio_to = disk->d_consumer->provider;
 		cbp->bio_offset = offset;
 		cbp->bio_data = data;
 		cbp->bio_length = MIN(left, slice);
 		left -= cbp->bio_length;
 		if (left == 0)
 			break;
 		offset += cbp->bio_length;
 		data += cbp->bio_length;
 	}
 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
 		G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 		disk = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		disk->d_consumer->index++;
 		g_io_request(cbp, disk->d_consumer);
 	}
 }
 
 static void
 g_mirror_register_request(struct bio *bp)
 {
 	struct g_mirror_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 		switch (sc->sc_balance) {
 		case G_MIRROR_BALANCE_LOAD:
 			g_mirror_request_load(sc, bp);
 			break;
 		case G_MIRROR_BALANCE_PREFER:
 			g_mirror_request_prefer(sc, bp);
 			break;
 		case G_MIRROR_BALANCE_ROUND_ROBIN:
 			g_mirror_request_round_robin(sc, bp);
 			break;
 		case G_MIRROR_BALANCE_SPLIT:
 			g_mirror_request_split(sc, bp);
 			break;
 		}
 		return;
 	case BIO_WRITE:
 	case BIO_DELETE:
 	    {
 		struct g_mirror_disk *disk;
 		struct g_mirror_disk_sync *sync;
 		struct bio_queue_head queue;
 		struct g_consumer *cp;
 		struct bio *cbp;
 
 		/*
 		 * Delay the request if it is colliding with a synchronization
 		 * request.
 		 */
 		if (g_mirror_sync_collision(sc, bp)) {
 			g_mirror_regular_delay(sc, bp);
 			return;
 		}
 
 		if (sc->sc_idle)
 			g_mirror_unidle(sc);
 		else
 			sc->sc_last_write = time_uptime;
 
 		/*
 		 * Allocate all bios before sending any request, so we can
 		 * return ENOMEM in nice and clean way.
 		 */
 		bioq_init(&queue);
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			sync = &disk->d_sync;
 			switch (disk->d_state) {
 			case G_MIRROR_DISK_STATE_ACTIVE:
 				break;
 			case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 				if (bp->bio_offset >= sync->ds_offset)
 					continue;
 				break;
 			default:
 				continue;
 			}
 			if (bp->bio_cmd == BIO_DELETE &&
 			    (disk->d_flags & G_MIRROR_DISK_FLAG_CANDELETE) == 0)
 				continue;
 			cbp = g_clone_bio(bp);
 			if (cbp == NULL) {
 				while ((cbp = bioq_takefirst(&queue)) != NULL)
 					g_destroy_bio(cbp);
 				if (bp->bio_error == 0)
 					bp->bio_error = ENOMEM;
 				g_io_deliver(bp, bp->bio_error);
 				return;
 			}
 			bioq_insert_tail(&queue, cbp);
 			cbp->bio_done = g_mirror_done;
 			cp = disk->d_consumer;
 			cbp->bio_caller1 = cp;
 			cbp->bio_to = cp->provider;
 			KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 			    ("Consumer %s not opened (r%dw%de%d).",
 			    cp->provider->name, cp->acr, cp->acw, cp->ace));
 		}
 		if (bioq_first(&queue) == NULL) {
 			g_io_deliver(bp, EOPNOTSUPP);
 			return;
 		}
 		while ((cbp = bioq_takefirst(&queue)) != NULL) {
 			G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 			cp = cbp->bio_caller1;
 			cbp->bio_caller1 = NULL;
 			cp->index++;
 			sc->sc_writes++;
 			g_io_request(cbp, cp);
 		}
 		/*
 		 * Put request onto inflight queue, so we can check if new
 		 * synchronization requests don't collide with it.
 		 */
 		bioq_insert_tail(&sc->sc_inflight, bp);
 		/*
 		 * Bump syncid on first write.
 		 */
 		if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0) {
 			sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID;
 			g_mirror_bump_syncid(sc);
 		}
 		return;
 	    }
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
 		    bp->bio_cmd, sc->sc_name));
 		break;
 	}
 }
 
 static int
 g_mirror_can_destroy(struct g_mirror_softc *sc)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
 
 	g_topology_assert();
 	gp = sc->sc_geom;
 	if (gp->softc == NULL)
 		return (1);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_TASTING) != 0)
 		return (0);
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		if (g_mirror_is_busy(sc, cp))
 			return (0);
 	}
 	gp = sc->sc_sync.ds_geom;
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		if (g_mirror_is_busy(sc, cp))
 			return (0);
 	}
 	G_MIRROR_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
 	    sc->sc_name);
 	return (1);
 }
 
 static int
 g_mirror_try_destroy(struct g_mirror_softc *sc)
 {
 
 	if (sc->sc_rootmount != NULL) {
 		G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 		    sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 	g_topology_lock();
 	if (!g_mirror_can_destroy(sc)) {
 		g_topology_unlock();
 		return (0);
 	}
 	sc->sc_geom->softc = NULL;
 	sc->sc_sync.ds_geom->softc = NULL;
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_WAIT) != 0) {
 		g_topology_unlock();
 		G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
 		    &sc->sc_worker);
 		/* Unlock sc_lock here, as it can be destroyed after wakeup. */
 		sx_xunlock(&sc->sc_lock);
 		wakeup(&sc->sc_worker);
 		sc->sc_worker = NULL;
 	} else {
 		g_topology_unlock();
 		g_mirror_destroy_device(sc);
 		free(sc, M_MIRROR);
 	}
 	return (1);
 }
 
 /*
  * Worker thread.
  */
 static void
 g_mirror_worker(void *arg)
 {
 	struct g_mirror_softc *sc;
 	struct g_mirror_event *ep;
 	struct bio *bp;
 	int timeout;
 
 	sc = arg;
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 
 	sx_xlock(&sc->sc_lock);
 	for (;;) {
 		G_MIRROR_DEBUG(5, "%s: Let's see...", __func__);
 		/*
 		 * First take a look at events.
 		 * This is important to handle events before any I/O requests.
 		 */
 		ep = g_mirror_event_get(sc);
 		if (ep != NULL) {
 			g_mirror_event_remove(sc, ep);
 			if ((ep->e_flags & G_MIRROR_EVENT_DEVICE) != 0) {
 				/* Update only device status. */
 				G_MIRROR_DEBUG(3,
 				    "Running event for device %s.",
 				    sc->sc_name);
 				ep->e_error = 0;
 				g_mirror_update_device(sc, 1);
 			} else {
 				/* Update disk status. */
 				G_MIRROR_DEBUG(3, "Running event for disk %s.",
 				     g_mirror_get_diskname(ep->e_disk));
 				ep->e_error = g_mirror_update_disk(ep->e_disk,
 				    ep->e_state);
 				if (ep->e_error == 0)
 					g_mirror_update_device(sc, 0);
 			}
 			if ((ep->e_flags & G_MIRROR_EVENT_DONTWAIT) != 0) {
 				KASSERT(ep->e_error == 0,
 				    ("Error cannot be handled."));
 				g_mirror_event_free(ep);
 			} else {
 				ep->e_flags |= G_MIRROR_EVENT_DONE;
 				G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__,
 				    ep);
 				mtx_lock(&sc->sc_events_mtx);
 				wakeup(ep);
 				mtx_unlock(&sc->sc_events_mtx);
 			}
 			if ((sc->sc_flags &
 			    G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 				if (g_mirror_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_MIRROR_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 			}
 			G_MIRROR_DEBUG(5, "%s: I'm here 1.", __func__);
 			continue;
 		}
 		/*
 		 * Check if we can mark array as CLEAN and if we can't take
 		 * how much seconds should we wait.
 		 */
 		timeout = g_mirror_idle(sc, -1);
 		/*
 		 * Now I/O requests.
 		 */
 		/* Get first request from the queue. */
 		mtx_lock(&sc->sc_queue_mtx);
 		bp = bioq_takefirst(&sc->sc_queue);
 		if (bp == NULL) {
 			if ((sc->sc_flags &
 			    G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 				mtx_unlock(&sc->sc_queue_mtx);
 				if (g_mirror_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_MIRROR_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 				mtx_lock(&sc->sc_queue_mtx);
 			}
 			sx_xunlock(&sc->sc_lock);
 			/*
 			 * XXX: We can miss an event here, because an event
 			 *      can be added without sx-device-lock and without
 			 *      mtx-queue-lock. Maybe I should just stop using
 			 *      dedicated mutex for events synchronization and
 			 *      stick with the queue lock?
 			 *      The event will hang here until next I/O request
 			 *      or next event is received.
 			 */
 			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:w1",
 			    timeout * hz);
 			sx_xlock(&sc->sc_lock);
 			G_MIRROR_DEBUG(5, "%s: I'm here 4.", __func__);
 			continue;
 		}
 		mtx_unlock(&sc->sc_queue_mtx);
 
 		if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
 		    (bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0) {
 			g_mirror_sync_request(bp);	/* READ */
 		} else if (bp->bio_to != sc->sc_provider) {
 			if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_REGULAR) != 0)
 				g_mirror_regular_request(bp);
 			else if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
 				g_mirror_sync_request(bp);	/* WRITE */
 			else {
 				KASSERT(0,
 				    ("Invalid request cflags=0x%hx to=%s.",
 				    bp->bio_cflags, bp->bio_to->name));
 			}
 		} else {
 			g_mirror_register_request(bp);
 		}
 		G_MIRROR_DEBUG(5, "%s: I'm here 9.", __func__);
 	}
 }
 
 static void
 g_mirror_update_idle(struct g_mirror_softc *sc, struct g_mirror_disk *disk)
 {
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 	if (!sc->sc_idle && (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) == 0) {
 		G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as dirty.",
 		    g_mirror_get_diskname(disk), sc->sc_name);
 		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
 	} else if (sc->sc_idle &&
 	    (disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) {
 		G_MIRROR_DEBUG(2, "Disk %s (device %s) marked as clean.",
 		    g_mirror_get_diskname(disk), sc->sc_name);
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 	}
 }
 
 static void
 g_mirror_sync_start(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 	struct bio *bp;
 	int error, i;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 	    ("Disk %s is not marked for synchronization.",
 	    g_mirror_get_diskname(disk)));
 	KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 	    ("Device not in RUNNING state (%s, %u).", sc->sc_name,
 	    sc->sc_state));
 
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	cp = g_new_consumer(sc->sc_sync.ds_geom);
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	error = g_attach(cp, sc->sc_provider);
 	KASSERT(error == 0,
 	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
 	error = g_access(cp, 1, 0, 0);
 	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 
 	G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
 	    g_mirror_get_diskname(disk));
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOFAILSYNC) == 0)
 		disk->d_flags |= G_MIRROR_DISK_FLAG_DIRTY;
 	KASSERT(disk->d_sync.ds_consumer == NULL,
 	    ("Sync consumer already exists (device=%s, disk=%s).",
 	    sc->sc_name, g_mirror_get_diskname(disk)));
 
 	disk->d_sync.ds_consumer = cp;
 	disk->d_sync.ds_consumer->private = disk;
 	disk->d_sync.ds_consumer->index = 0;
 
 	/*
 	 * Allocate memory for synchronization bios and initialize them.
 	 */
 	disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_mirror_syncreqs,
 	    M_MIRROR, M_WAITOK);
 	for (i = 0; i < g_mirror_syncreqs; i++) {
 		bp = g_alloc_bio();
 		disk->d_sync.ds_bios[i] = bp;
 		bp->bio_parent = NULL;
 		bp->bio_cmd = BIO_READ;
 		bp->bio_data = malloc(MAXPHYS, M_MIRROR, M_WAITOK);
 		bp->bio_cflags = 0;
 		bp->bio_offset = disk->d_sync.ds_offset;
 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
 		disk->d_sync.ds_offset += bp->bio_length;
 		bp->bio_done = g_mirror_sync_done;
 		bp->bio_from = disk->d_sync.ds_consumer;
 		bp->bio_to = sc->sc_provider;
 		bp->bio_caller1 = (void *)(uintptr_t)i;
 	}
 
 	/* Increase the number of disks in SYNCHRONIZING state. */
 	sc->sc_sync.ds_ndisks++;
 	/* Set the number of in-flight synchronization requests. */
 	disk->d_sync.ds_inflight = g_mirror_syncreqs;
 
 	/*
 	 * Fire off first synchronization requests.
 	 */
 	for (i = 0; i < g_mirror_syncreqs; i++) {
 		bp = disk->d_sync.ds_bios[i];
 		G_MIRROR_LOGREQ(3, bp, "Sending synchronization request.");
 		disk->d_sync.ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_mirror_regular_collision(sc, bp))
 			g_mirror_sync_delay(sc, bp);
 		else
 			g_io_request(bp, disk->d_sync.ds_consumer);
 	}
 }
 
 /*
  * Stop synchronization process.
  * type: 0 - synchronization finished
  *       1 - synchronization stopped
  */
 static void
 g_mirror_sync_stop(struct g_mirror_disk *disk, int type)
 {
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	KASSERT(disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 	    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 	    g_mirror_disk_state2str(disk->d_state)));
 	if (disk->d_sync.ds_consumer == NULL)
 		return;
 
 	if (type == 0) {
 		G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s finished.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 	} else /* if (type == 1) */ {
 		G_MIRROR_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 	}
 	free(disk->d_sync.ds_bios, M_MIRROR);
 	disk->d_sync.ds_bios = NULL;
 	cp = disk->d_sync.ds_consumer;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 	sc->sc_sync.ds_ndisks--;
 	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 	g_topology_lock();
 	g_mirror_kill_consumer(sc, cp);
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 }
 
 static void
 g_mirror_launch_provider(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 	struct g_provider *pp, *dp;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	g_topology_lock();
 	pp = g_new_providerf(sc->sc_geom, "mirror/%s", sc->sc_name);
 	pp->flags |= G_PF_DIRECT_RECEIVE;
 	pp->mediasize = sc->sc_mediasize;
 	pp->sectorsize = sc->sc_sectorsize;
 	pp->stripesize = 0;
 	pp->stripeoffset = 0;
 
 	/* Splitting of unmapped BIO's could work but isn't implemented now */
 	if (sc->sc_balance != G_MIRROR_BALANCE_SPLIT)
 		pp->flags |= G_PF_ACCEPT_UNMAPPED;
 
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_consumer && disk->d_consumer->provider) {
 			dp = disk->d_consumer->provider;
 			if (dp->stripesize > pp->stripesize) {
 				pp->stripesize = dp->stripesize;
 				pp->stripeoffset = dp->stripeoffset;
 			}
 			/* A provider underneath us doesn't support unmapped */
 			if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
 				G_MIRROR_DEBUG(0, "Cancelling unmapped "
 				    "because of %s.", dp->name);
 				pp->flags &= ~G_PF_ACCEPT_UNMAPPED;
 			}
 		}
 	}
 	sc->sc_provider = pp;
 	g_error_provider(pp, 0);
 	g_topology_unlock();
 	G_MIRROR_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
 	    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE), sc->sc_ndisks);
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			g_mirror_sync_start(disk);
 	}
 }
 
 static void
 g_mirror_destroy_provider(struct g_mirror_softc *sc)
 {
 	struct g_mirror_disk *disk;
 	struct bio *bp;
 
 	g_topology_assert_not();
 	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
 	    sc->sc_name));
 
 	g_topology_lock();
 	g_error_provider(sc->sc_provider, ENXIO);
 	mtx_lock(&sc->sc_queue_mtx);
 	while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) {
 		/*
 		 * Abort any pending I/O that wasn't generated by us.
 		 * Synchronization requests and requests destined for individual
 		 * mirror components can be destroyed immediately.
 		 */
 		if (bp->bio_to == sc->sc_provider &&
 		    bp->bio_from->geom != sc->sc_sync.ds_geom) {
 			g_io_deliver(bp, ENXIO);
 		} else {
 			if ((bp->bio_cflags & G_MIRROR_BIO_FLAG_SYNC) != 0)
 				free(bp->bio_data, M_MIRROR);
 			g_destroy_bio(bp);
 		}
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
 	    sc->sc_provider->name);
-	sc->sc_provider->flags |= G_PF_WITHER;
-	g_orphan_provider(sc->sc_provider, ENXIO);
-	g_topology_unlock();
+	g_wither_provider(sc->sc_provider, ENXIO);
 	sc->sc_provider = NULL;
+	g_topology_unlock();
 	LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING)
 			g_mirror_sync_stop(disk, 1);
 	}
 }
 
 static void
 g_mirror_go(void *arg)
 {
 	struct g_mirror_softc *sc;
 
 	sc = arg;
 	G_MIRROR_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
 	g_mirror_event_send(sc, 0,
 	    G_MIRROR_EVENT_DONTWAIT | G_MIRROR_EVENT_DEVICE);
 }
 
 static u_int
 g_mirror_determine_state(struct g_mirror_disk *disk)
 {
 	struct g_mirror_softc *sc;
 	u_int state;
 
 	sc = disk->d_softc;
 	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
 		if ((disk->d_flags &
 		    G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) {
 			/* Disk does not need synchronization. */
 			state = G_MIRROR_DISK_STATE_ACTIVE;
 		} else {
 			if ((sc->sc_flags &
 			     G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
 			    (disk->d_flags &
 			     G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) {
 				/*
 				 * We can start synchronization from
 				 * the stored offset.
 				 */
 				state = G_MIRROR_DISK_STATE_SYNCHRONIZING;
 			} else {
 				state = G_MIRROR_DISK_STATE_STALE;
 			}
 		}
 	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
 		/*
 		 * Reset all synchronization data for this disk,
 		 * because if it even was synchronized, it was
 		 * synchronized to disks with different syncid.
 		 */
 		disk->d_flags |= G_MIRROR_DISK_FLAG_SYNCHRONIZING;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		disk->d_sync.ds_syncid = sc->sc_syncid;
 		if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
 		    (disk->d_flags & G_MIRROR_DISK_FLAG_FORCE_SYNC) != 0) {
 			state = G_MIRROR_DISK_STATE_SYNCHRONIZING;
 		} else {
 			state = G_MIRROR_DISK_STATE_STALE;
 		}
 	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
 		/*
 		 * Not good, NOT GOOD!
 		 * It means that mirror was started on stale disks
 		 * and more fresh disk just arrive.
 		 * If there were writes, mirror is broken, sorry.
 		 * I think the best choice here is don't touch
 		 * this disk and inform the user loudly.
 		 */
 		G_MIRROR_DEBUG(0, "Device %s was started before the freshest "
 		    "disk (%s) arrives!! It will not be connected to the "
 		    "running device.", sc->sc_name,
 		    g_mirror_get_diskname(disk));
 		g_mirror_destroy_disk(disk);
 		state = G_MIRROR_DISK_STATE_NONE;
 		/* Return immediately, because disk was destroyed. */
 		return (state);
 	}
 	G_MIRROR_DEBUG(3, "State for %s disk: %s.",
 	    g_mirror_get_diskname(disk), g_mirror_disk_state2str(state));
 	return (state);
 }
 
 /*
  * Update device state.
  */
 static void
 g_mirror_update_device(struct g_mirror_softc *sc, boolean_t force)
 {
 	struct g_mirror_disk *disk;
 	u_int state;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	switch (sc->sc_state) {
 	case G_MIRROR_DEVICE_STATE_STARTING:
 	    {
 		struct g_mirror_disk *pdisk, *tdisk;
 		u_int dirty, ndisks, genid, syncid;
 
 		KASSERT(sc->sc_provider == NULL,
 		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
 		/*
 		 * Are we ready? We are, if all disks are connected or
 		 * if we have any disks and 'force' is true.
 		 */
 		ndisks = g_mirror_ndisks(sc, -1);
 		if (sc->sc_ndisks == ndisks || (force && ndisks > 0)) {
 			;
 		} else if (ndisks == 0) {
 			/*
 			 * Disks went down in starting phase, so destroy
 			 * device.
 			 */
 			callout_drain(&sc->sc_callout);
 			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 			G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 			    sc->sc_rootmount);
 			root_mount_rel(sc->sc_rootmount);
 			sc->sc_rootmount = NULL;
 			return;
 		} else {
 			return;
 		}
 
 		/*
 		 * Activate all disks with the biggest syncid.
 		 */
 		if (force) {
 			/*
 			 * If 'force' is true, we have been called due to
 			 * timeout, so don't bother canceling timeout.
 			 */
 			ndisks = 0;
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) == 0) {
 					ndisks++;
 				}
 			}
 			if (ndisks == 0) {
 				/* No valid disks found, destroy device. */
 				sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 				G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
 				    __LINE__, sc->sc_rootmount);
 				root_mount_rel(sc->sc_rootmount);
 				sc->sc_rootmount = NULL;
 				return;
 			}
 		} else {
 			/* Cancel timeout. */
 			callout_drain(&sc->sc_callout);
 		}
 
 		/*
 		 * Find the biggest genid.
 		 */
 		genid = 0;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			if (disk->d_genid > genid)
 				genid = disk->d_genid;
 		}
 		sc->sc_genid = genid;
 		/*
 		 * Remove all disks without the biggest genid.
 		 */
 		LIST_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tdisk) {
 			if (disk->d_genid < genid) {
 				G_MIRROR_DEBUG(0,
 				    "Component %s (device %s) broken, skipping.",
 				    g_mirror_get_diskname(disk), sc->sc_name);
 				g_mirror_destroy_disk(disk);
 			}
 		}
 
 		/*
 		 * Find the biggest syncid.
 		 */
 		syncid = 0;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			if (disk->d_sync.ds_syncid > syncid)
 				syncid = disk->d_sync.ds_syncid;
 		}
 
 		/*
 		 * Here we need to look for dirty disks and if all disks
 		 * with the biggest syncid are dirty, we have to choose
 		 * one with the biggest priority and rebuild the rest.
 		 */
 		/*
 		 * Find the number of dirty disks with the biggest syncid.
 		 * Find the number of disks with the biggest syncid.
 		 * While here, find a disk with the biggest priority.
 		 */
 		dirty = ndisks = 0;
 		pdisk = NULL;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			if (disk->d_sync.ds_syncid != syncid)
 				continue;
 			if ((disk->d_flags &
 			    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 				continue;
 			}
 			ndisks++;
 			if ((disk->d_flags & G_MIRROR_DISK_FLAG_DIRTY) != 0) {
 				dirty++;
 				if (pdisk == NULL ||
 				    pdisk->d_priority < disk->d_priority) {
 					pdisk = disk;
 				}
 			}
 		}
 		if (dirty == 0) {
 			/* No dirty disks at all, great. */
 		} else if (dirty == ndisks) {
 			/*
 			 * Force synchronization for all dirty disks except one
 			 * with the biggest priority.
 			 */
 			KASSERT(pdisk != NULL, ("pdisk == NULL"));
 			G_MIRROR_DEBUG(1, "Using disk %s (device %s) as a "
 			    "master disk for synchronization.",
 			    g_mirror_get_diskname(pdisk), sc->sc_name);
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if (disk->d_sync.ds_syncid != syncid)
 					continue;
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 					continue;
 				}
 				KASSERT((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_DIRTY) != 0,
 				    ("Disk %s isn't marked as dirty.",
 				    g_mirror_get_diskname(disk)));
 				/* Skip the disk with the biggest priority. */
 				if (disk == pdisk)
 					continue;
 				disk->d_sync.ds_syncid = 0;
 			}
 		} else if (dirty < ndisks) {
 			/*
 			 * Force synchronization for all dirty disks.
 			 * We have some non-dirty disks.
 			 */
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if (disk->d_sync.ds_syncid != syncid)
 					continue;
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_SYNCHRONIZING) != 0) {
 					continue;
 				}
 				if ((disk->d_flags &
 				    G_MIRROR_DISK_FLAG_DIRTY) == 0) {
 					continue;
 				}
 				disk->d_sync.ds_syncid = 0;
 			}
 		}
 
 		/* Reset hint. */
 		sc->sc_hint = NULL;
 		sc->sc_syncid = syncid;
 		if (force) {
 			/* Remember to bump syncid on first write. */
 			sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 		}
 		state = G_MIRROR_DEVICE_STATE_RUNNING;
 		G_MIRROR_DEBUG(1, "Device %s state changed from %s to %s.",
 		    sc->sc_name, g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_device_state2str(state));
 		sc->sc_state = state;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			state = g_mirror_determine_state(disk);
 			g_mirror_event_send(disk, state,
 			    G_MIRROR_EVENT_DONTWAIT);
 			if (state == G_MIRROR_DISK_STATE_STALE)
 				sc->sc_bump_id |= G_MIRROR_BUMP_SYNCID;
 		}
 		break;
 	    }
 	case G_MIRROR_DEVICE_STATE_RUNNING:
 		if (g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE) == 0 &&
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
 			/*
 			 * No active disks or no disks at all,
 			 * so destroy device.
 			 */
 			if (sc->sc_provider != NULL)
 				g_mirror_destroy_provider(sc);
 			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 			break;
 		} else if (g_mirror_ndisks(sc,
 		    G_MIRROR_DISK_STATE_ACTIVE) > 0 &&
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_NEW) == 0) {
 			/*
 			 * We have active disks, launch provider if it doesn't
 			 * exist.
 			 */
 			if (sc->sc_provider == NULL)
 				g_mirror_launch_provider(sc);
 			if (sc->sc_rootmount != NULL) {
 				G_MIRROR_DEBUG(1, "root_mount_rel[%u] %p",
 				    __LINE__, sc->sc_rootmount);
 				root_mount_rel(sc->sc_rootmount);
 				sc->sc_rootmount = NULL;
 			}
 		}
 		/*
 		 * Genid should be bumped immediately, so do it here.
 		 */
 		if ((sc->sc_bump_id & G_MIRROR_BUMP_GENID) != 0) {
 			sc->sc_bump_id &= ~G_MIRROR_BUMP_GENID;
 			g_mirror_bump_genid(sc);
 		}
 		break;
 	default:
 		KASSERT(1 == 0, ("Wrong device state (%s, %s).",
 		    sc->sc_name, g_mirror_device_state2str(sc->sc_state)));
 		break;
 	}
 }
 
 /*
  * Update disk state and device state if needed.
  */
 #define	DISK_STATE_CHANGED()	G_MIRROR_DEBUG(1,			\
 	"Disk %s state changed from %s to %s (device %s).",		\
 	g_mirror_get_diskname(disk),					\
 	g_mirror_disk_state2str(disk->d_state),				\
 	g_mirror_disk_state2str(state), sc->sc_name)
 static int
 g_mirror_update_disk(struct g_mirror_disk *disk, u_int state)
 {
 	struct g_mirror_softc *sc;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 again:
 	G_MIRROR_DEBUG(3, "Changing disk %s state from %s to %s.",
 	    g_mirror_get_diskname(disk), g_mirror_disk_state2str(disk->d_state),
 	    g_mirror_disk_state2str(state));
 	switch (state) {
 	case G_MIRROR_DISK_STATE_NEW:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk arrive.
 		 */
 		/* Previous state should be NONE. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NONE,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_state = state;
 		if (LIST_EMPTY(&sc->sc_disks))
 			LIST_INSERT_HEAD(&sc->sc_disks, disk, d_next);
 		else {
 			struct g_mirror_disk *dp;
 
 			LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 				if (disk->d_priority >= dp->d_priority) {
 					LIST_INSERT_BEFORE(dp, disk, d_next);
 					dp = NULL;
 					break;
 				}
 				if (LIST_NEXT(dp, d_next) == NULL)
 					break;
 			}
 			if (dp != NULL)
 				LIST_INSERT_AFTER(dp, disk, d_next);
 		}
 		G_MIRROR_DEBUG(1, "Device %s: provider %s detected.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 		if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
 			break;
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		state = g_mirror_determine_state(disk);
 		if (state != G_MIRROR_DISK_STATE_NONE)
 			goto again;
 		break;
 	case G_MIRROR_DISK_STATE_ACTIVE:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk does not need synchronization.
 		 * 2. Synchronization process finished successfully.
 		 */
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		/* Previous state should be NEW or SYNCHRONIZING. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW ||
 		    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_SYNCHRONIZING;
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_FORCE_SYNC;
 			g_mirror_sync_stop(disk, 0);
 		}
 		disk->d_state = state;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		g_mirror_update_idle(sc, disk);
 		g_mirror_update_metadata(disk);
 		G_MIRROR_DEBUG(1, "Device %s: provider %s activated.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 		break;
 	case G_MIRROR_DISK_STATE_STALE:
 		/*
 		 * Possible scenarios:
 		 * 1. Stale disk was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		/*
 		 * STALE state is only possible if device is marked
 		 * NOAUTOSYNC.
 		 */
 		KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_NOAUTOSYNC) != 0,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		g_mirror_update_metadata(disk);
 		G_MIRROR_DEBUG(0, "Device %s: provider %s is stale.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 		break;
 	case G_MIRROR_DISK_STATE_SYNCHRONIZING:
 		/*
 		 * Possible scenarios:
 		 * 1. Disk which needs synchronization was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_mirror_device_state2str(sc->sc_state),
 		    g_mirror_get_diskname(disk),
 		    g_mirror_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_MIRROR_DISK_STATE_NEW)
 			disk->d_flags &= ~G_MIRROR_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		if (sc->sc_provider != NULL) {
 			g_mirror_sync_start(disk);
 			g_mirror_update_metadata(disk);
 		}
 		break;
 	case G_MIRROR_DISK_STATE_DISCONNECTED:
 		/*
 		 * Possible scenarios:
 		 * 1. Device wasn't running yet, but disk disappear.
 		 * 2. Disk was active and disapppear.
 		 * 3. Disk disappear during synchronization process.
 		 */
 		if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING) {
 			/*
 			 * Previous state should be ACTIVE, STALE or
 			 * SYNCHRONIZING.
 			 */
 			KASSERT(disk->d_state == G_MIRROR_DISK_STATE_ACTIVE ||
 			    disk->d_state == G_MIRROR_DISK_STATE_STALE ||
 			    disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING,
 			    ("Wrong disk state (%s, %s).",
 			    g_mirror_get_diskname(disk),
 			    g_mirror_disk_state2str(disk->d_state)));
 		} else if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING) {
 			/* Previous state should be NEW. */
 			KASSERT(disk->d_state == G_MIRROR_DISK_STATE_NEW,
 			    ("Wrong disk state (%s, %s).",
 			    g_mirror_get_diskname(disk),
 			    g_mirror_disk_state2str(disk->d_state)));
 			/*
 			 * Reset bumping syncid if disk disappeared in STARTING
 			 * state.
 			 */
 			if ((sc->sc_bump_id & G_MIRROR_BUMP_SYNCID) != 0)
 				sc->sc_bump_id &= ~G_MIRROR_BUMP_SYNCID;
 #ifdef	INVARIANTS
 		} else {
 			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
 			    sc->sc_name,
 			    g_mirror_device_state2str(sc->sc_state),
 			    g_mirror_get_diskname(disk),
 			    g_mirror_disk_state2str(disk->d_state)));
 #endif
 		}
 		DISK_STATE_CHANGED();
 		G_MIRROR_DEBUG(0, "Device %s: provider %s disconnected.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 
 		g_mirror_destroy_disk(disk);
 		break;
 	case G_MIRROR_DISK_STATE_DESTROY:
 	    {
 		int error;
 
 		error = g_mirror_clear_metadata(disk);
 		if (error != 0) {
 			G_MIRROR_DEBUG(0,
 			    "Device %s: failed to clear metadata on %s: %d.",
 			    sc->sc_name, g_mirror_get_diskname(disk), error);
 			break;
 		}
 		DISK_STATE_CHANGED();
 		G_MIRROR_DEBUG(0, "Device %s: provider %s destroyed.",
 		    sc->sc_name, g_mirror_get_diskname(disk));
 
 		g_mirror_destroy_disk(disk);
 		sc->sc_ndisks--;
 		LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 			g_mirror_update_metadata(disk);
 		}
 		break;
 	    }
 	default:
 		KASSERT(1 == 0, ("Unknown state (%u).", state));
 		break;
 	}
 	return (0);
 }
 #undef	DISK_STATE_CHANGED
 
 int
 g_mirror_read_metadata(struct g_consumer *cp, struct g_mirror_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	/* Metadata are stored on last sector. */
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		G_MIRROR_DEBUG(1, "Cannot read metadata from %s (error=%d).",
 		    cp->provider->name, error);
 		return (error);
 	}
 
 	/* Decode metadata. */
 	error = mirror_metadata_decode(buf, md);
 	g_free(buf);
 	if (strcmp(md->md_magic, G_MIRROR_MAGIC) != 0)
 		return (EINVAL);
 	if (md->md_version > G_MIRROR_VERSION) {
 		G_MIRROR_DEBUG(0,
 		    "Kernel module is too old to handle metadata from %s.",
 		    cp->provider->name);
 		return (EINVAL);
 	}
 	if (error != 0) {
 		G_MIRROR_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
 		    cp->provider->name);
 		return (error);
 	}
 
 	return (0);
 }
 
 static int
 g_mirror_check_metadata(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md)
 {
 
 	if (g_mirror_id2disk(sc, md->md_did) != NULL) {
 		G_MIRROR_DEBUG(1, "Disk %s (id=%u) already exists, skipping.",
 		    pp->name, md->md_did);
 		return (EEXIST);
 	}
 	if (md->md_all != sc->sc_ndisks) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_all", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_slice != sc->sc_slice) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_slice", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_balance != sc->sc_balance) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_balance", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 #if 0
 	if (md->md_mediasize != sc->sc_mediasize) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_mediasize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 #endif
 	if (sc->sc_mediasize > pp->mediasize) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid size of disk %s (device %s), skipping.", pp->name,
 		    sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_sectorsize != sc->sc_sectorsize) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_sectorsize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid sector size of disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mflags & ~G_MIRROR_DEVICE_FLAG_MASK) != 0) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid device flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_dflags & ~G_MIRROR_DISK_FLAG_MASK) != 0) {
 		G_MIRROR_DEBUG(1,
 		    "Invalid disk flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	return (0);
 }
 
 int
 g_mirror_add_disk(struct g_mirror_softc *sc, struct g_provider *pp,
     struct g_mirror_metadata *md)
 {
 	struct g_mirror_disk *disk;
 	int error;
 
 	g_topology_assert_not();
 	G_MIRROR_DEBUG(2, "Adding disk %s.", pp->name);
 
 	error = g_mirror_check_metadata(sc, pp, md);
 	if (error != 0)
 		return (error);
 	if (sc->sc_state == G_MIRROR_DEVICE_STATE_RUNNING &&
 	    md->md_genid < sc->sc_genid) {
 		G_MIRROR_DEBUG(0, "Component %s (device %s) broken, skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	disk = g_mirror_init_disk(sc, pp, md, &error);
 	if (disk == NULL)
 		return (error);
 	error = g_mirror_event_send(disk, G_MIRROR_DISK_STATE_NEW,
 	    G_MIRROR_EVENT_WAIT);
 	if (error != 0)
 		return (error);
 	if (md->md_version < G_MIRROR_VERSION) {
 		G_MIRROR_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
 		    pp->name, md->md_version, G_MIRROR_VERSION);
 		g_mirror_update_metadata(disk);
 	}
 	return (0);
 }
 
 static void
 g_mirror_destroy_delayed(void *arg, int flag)
 {
 	struct g_mirror_softc *sc;
 	int error;
 
 	if (flag == EV_CANCEL) {
 		G_MIRROR_DEBUG(1, "Destroying canceled.");
 		return;
 	}
 	sc = arg;
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) == 0,
 	    ("DESTROY flag set on %s.", sc->sc_name));
 	KASSERT((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0,
 	    ("DESTROYING flag not set on %s.", sc->sc_name));
 	G_MIRROR_DEBUG(1, "Destroying %s (delayed).", sc->sc_name);
 	error = g_mirror_destroy(sc, G_MIRROR_DESTROY_SOFT);
 	if (error != 0) {
 		G_MIRROR_DEBUG(0, "Cannot destroy %s (error=%d).",
 		    sc->sc_name, error);
 		sx_xunlock(&sc->sc_lock);
 	}
 	g_topology_lock();
 }
 
 static int
 g_mirror_access(struct g_provider *pp, int acr, int acw, int ace)
 {
 	struct g_mirror_softc *sc;
 	int dcr, dcw, dce, error = 0;
 
 	g_topology_assert();
 	G_MIRROR_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
 	    acw, ace);
 
 	sc = pp->geom->softc;
 	if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0)
 		return (0);
 	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
 
 	dcr = pp->acr + acr;
 	dcw = pp->acw + acw;
 	dce = pp->ace + ace;
 
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0 ||
 	    LIST_EMPTY(&sc->sc_disks)) {
 		if (acr > 0 || acw > 0 || ace > 0)
 			error = ENXIO;
 		goto end;
 	}
 	if (dcw == 0)
 		g_mirror_idle(sc, dcw);
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROYING) != 0) {
 		if (acr > 0 || acw > 0 || ace > 0) {
 			error = ENXIO;
 			goto end;
 		}
 		if (dcr == 0 && dcw == 0 && dce == 0) {
 			g_post_event(g_mirror_destroy_delayed, sc, M_WAITOK,
 			    sc, NULL);
 		}
 	}
 end:
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (error);
 }
 
 static struct g_geom *
 g_mirror_create(struct g_class *mp, const struct g_mirror_metadata *md)
 {
 	struct g_mirror_softc *sc;
 	struct g_geom *gp;
 	int error, timeout;
 
 	g_topology_assert();
 	G_MIRROR_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
 	    md->md_mid);
 
 	/* One disk is minimum. */
 	if (md->md_all < 1)
 		return (NULL);
 	/*
 	 * Action geom.
 	 */
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_MIRROR, M_WAITOK | M_ZERO);
 	gp->start = g_mirror_start;
 	gp->orphan = g_mirror_orphan;
 	gp->access = g_mirror_access;
 	gp->dumpconf = g_mirror_dumpconf;
 
 	sc->sc_id = md->md_mid;
 	sc->sc_slice = md->md_slice;
 	sc->sc_balance = md->md_balance;
 	sc->sc_mediasize = md->md_mediasize;
 	sc->sc_sectorsize = md->md_sectorsize;
 	sc->sc_ndisks = md->md_all;
 	sc->sc_flags = md->md_mflags;
 	sc->sc_bump_id = 0;
 	sc->sc_idle = 1;
 	sc->sc_last_write = time_uptime;
 	sc->sc_writes = 0;
 	sx_init(&sc->sc_lock, "gmirror:lock");
 	bioq_init(&sc->sc_queue);
 	mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF);
 	bioq_init(&sc->sc_regular_delayed);
 	bioq_init(&sc->sc_inflight);
 	bioq_init(&sc->sc_sync_delayed);
 	LIST_INIT(&sc->sc_disks);
 	TAILQ_INIT(&sc->sc_events);
 	mtx_init(&sc->sc_events_mtx, "gmirror:events", NULL, MTX_DEF);
 	callout_init(&sc->sc_callout, 1);
 	mtx_init(&sc->sc_done_mtx, "gmirror:done", NULL, MTX_DEF);
 	sc->sc_state = G_MIRROR_DEVICE_STATE_STARTING;
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	sc->sc_provider = NULL;
 	/*
 	 * Synchronization geom.
 	 */
 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
 	gp->softc = sc;
 	gp->orphan = g_mirror_orphan;
 	sc->sc_sync.ds_geom = gp;
 	sc->sc_sync.ds_ndisks = 0;
 	error = kproc_create(g_mirror_worker, sc, &sc->sc_worker, 0, 0,
 	    "g_mirror %s", md->md_name);
 	if (error != 0) {
 		G_MIRROR_DEBUG(1, "Cannot create kernel thread for %s.",
 		    sc->sc_name);
 		g_destroy_geom(sc->sc_sync.ds_geom);
 		mtx_destroy(&sc->sc_done_mtx);
 		mtx_destroy(&sc->sc_events_mtx);
 		mtx_destroy(&sc->sc_queue_mtx);
 		sx_destroy(&sc->sc_lock);
 		g_destroy_geom(sc->sc_geom);
 		free(sc, M_MIRROR);
 		return (NULL);
 	}
 
 	G_MIRROR_DEBUG(1, "Device %s created (%u components, id=%u).",
 	    sc->sc_name, sc->sc_ndisks, sc->sc_id);
 
 	sc->sc_rootmount = root_mount_hold("GMIRROR");
 	G_MIRROR_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
 	/*
 	 * Run timeout.
 	 */
 	timeout = g_mirror_timeout * hz;
 	callout_reset(&sc->sc_callout, timeout, g_mirror_go, sc);
 	return (sc->sc_geom);
 }
 
 int
 g_mirror_destroy(struct g_mirror_softc *sc, int how)
 {
 	struct g_mirror_disk *disk;
 	struct g_provider *pp;
 
 	g_topology_assert_not();
 	if (sc == NULL)
 		return (ENXIO);
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	pp = sc->sc_provider;
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0 ||
 	    SCHEDULER_STOPPED())) {
 		switch (how) {
 		case G_MIRROR_DESTROY_SOFT:
 			G_MIRROR_DEBUG(1,
 			    "Device %s is still open (r%dw%de%d).", pp->name,
 			    pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		case G_MIRROR_DESTROY_DELAYED:
 			G_MIRROR_DEBUG(1,
 			    "Device %s will be destroyed on last close.",
 			    pp->name);
 			LIST_FOREACH(disk, &sc->sc_disks, d_next) {
 				if (disk->d_state ==
 				    G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 					g_mirror_sync_stop(disk, 1);
 				}
 			}
 			sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROYING;
 			return (EBUSY);
 		case G_MIRROR_DESTROY_HARD:
 			G_MIRROR_DEBUG(1, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 		}
 	}
 
 	g_topology_lock();
 	if (sc->sc_geom->softc == NULL) {
 		g_topology_unlock();
 		return (0);
 	}
 	sc->sc_geom->softc = NULL;
 	sc->sc_sync.ds_geom->softc = NULL;
 	g_topology_unlock();
 
 	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_DESTROY;
 	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_WAIT;
 	G_MIRROR_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	sx_xunlock(&sc->sc_lock);
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
 	while (sc->sc_worker != NULL)
 		tsleep(&sc->sc_worker, PRIBIO, "m:destroy", hz / 5);
 	G_MIRROR_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
 	sx_xlock(&sc->sc_lock);
 	g_mirror_destroy_device(sc);
 	free(sc, M_MIRROR);
 	return (0);
 }
 
 static void
 g_mirror_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 static struct g_geom *
 g_mirror_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_mirror_metadata md;
 	struct g_mirror_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	G_MIRROR_DEBUG(2, "Tasting %s.", pp->name);
 
 	gp = g_new_geomf(mp, "mirror:taste");
 	/*
 	 * This orphan function should be never called.
 	 */
 	gp->orphan = g_mirror_taste_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_mirror_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
 		return (NULL);
 	if ((md.md_dflags & G_MIRROR_DISK_FLAG_INACTIVE) != 0) {
 		G_MIRROR_DEBUG(0,
 		    "Device %s: provider %s marked as inactive, skipping.",
 		    md.md_name, pp->name);
 		return (NULL);
 	}
 	if (g_mirror_debug >= 2)
 		mirror_metadata_dump(&md);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_sync.ds_geom == gp)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		if (md.md_mid != sc->sc_id) {
 			G_MIRROR_DEBUG(0, "Device %s already configured.",
 			    sc->sc_name);
 			return (NULL);
 		}
 		break;
 	}
 	if (gp == NULL) {
 		gp = g_mirror_create(mp, &md);
 		if (gp == NULL) {
 			G_MIRROR_DEBUG(0, "Cannot create device %s.",
 			    md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 	}
 	G_MIRROR_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	sc->sc_flags |= G_MIRROR_DEVICE_FLAG_TASTING;
 	error = g_mirror_add_disk(sc, pp, &md);
 	if (error != 0) {
 		G_MIRROR_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
 		    pp->name, gp->name, error);
 		if (LIST_EMPTY(&sc->sc_disks)) {
 			g_cancel_event(sc);
 			g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
 			g_topology_lock();
 			return (NULL);
 		}
 		gp = NULL;
 	}
 	sc->sc_flags &= ~G_MIRROR_DEVICE_FLAG_TASTING;
 	if ((sc->sc_flags & G_MIRROR_DEVICE_FLAG_DESTROY) != 0) {
 		g_mirror_destroy(sc, G_MIRROR_DESTROY_HARD);
 		g_topology_lock();
 		return (NULL);
 	}
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (gp);
 }
 
 static void
 g_mirror_resize(struct g_consumer *cp)
 {
 	struct g_mirror_disk *disk;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s)", __func__, cp->provider->name);
 
 	disk = cp->private;
 	if (disk == NULL)
 		return;
 	g_topology_unlock();
 	g_mirror_update_metadata(disk);
 	g_topology_lock();
 }
 
 static int
 g_mirror_destroy_geom(struct gctl_req *req __unused,
     struct g_class *mp __unused, struct g_geom *gp)
 {
 	struct g_mirror_softc *sc;
 	int error;
 
 	g_topology_unlock();
 	sc = gp->softc;
 	sx_xlock(&sc->sc_lock);
 	g_cancel_event(sc);
 	error = g_mirror_destroy(gp->softc, G_MIRROR_DESTROY_SOFT);
 	if (error != 0)
 		sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (error);
 }
 
 static void
 g_mirror_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_mirror_softc *sc;
 
 	g_topology_assert();
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	/* Skip synchronization geom. */
 	if (gp == sc->sc_sync.ds_geom)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		struct g_mirror_disk *disk;
 
 		disk = cp->private;
 		if (disk == NULL)
 			return;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)disk->d_id);
 		if (disk->d_state == G_MIRROR_DISK_STATE_SYNCHRONIZING) {
 			sbuf_printf(sb, "%s<Synchronized>", indent);
 			if (disk->d_sync.ds_offset == 0)
 				sbuf_printf(sb, "0%%");
 			else {
 				sbuf_printf(sb, "%u%%",
 				    (u_int)((disk->d_sync.ds_offset * 100) /
 				    sc->sc_provider->mediasize));
 			}
 			sbuf_printf(sb, "</Synchronized>\n");
 			if (disk->d_sync.ds_offset > 0) {
 				sbuf_printf(sb, "%s<BytesSynced>%jd"
 				    "</BytesSynced>\n", indent,
 				    (intmax_t)disk->d_sync.ds_offset);
 			}
 		}
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
 		    disk->d_sync.ds_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent,
 		    disk->d_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (disk->d_flags == 0)
 			sbuf_printf(sb, "NONE");
 		else {
 			int first = 1;
 
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((disk->d_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_printf(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_printf(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_MIRROR_DISK_FLAG_DIRTY, "DIRTY");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_HARDCODED, "HARDCODED");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_INACTIVE, "INACTIVE");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_SYNCHRONIZING,
 			    "SYNCHRONIZING");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
 			ADD_FLAG(G_MIRROR_DISK_FLAG_BROKEN, "BROKEN");
 #undef	ADD_FLAG
 		}
 		sbuf_printf(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<Priority>%u</Priority>\n", indent,
 		    disk->d_priority);
 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 		    g_mirror_disk_state2str(disk->d_state));
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	} else {
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (sc->sc_flags == 0)
 			sbuf_printf(sb, "NONE");
 		else {
 			int first = 1;
 
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((sc->sc_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_printf(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_printf(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
 			ADD_FLAG(G_MIRROR_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
 #undef	ADD_FLAG
 		}
 		sbuf_printf(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<Slice>%u</Slice>\n", indent,
 		    (u_int)sc->sc_slice);
 		sbuf_printf(sb, "%s<Balance>%s</Balance>\n", indent,
 		    balance_name(sc->sc_balance));
 		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
 		    sc->sc_ndisks);
 		sbuf_printf(sb, "%s<State>", indent);
 		if (sc->sc_state == G_MIRROR_DEVICE_STATE_STARTING)
 			sbuf_printf(sb, "%s", "STARTING");
 		else if (sc->sc_ndisks ==
 		    g_mirror_ndisks(sc, G_MIRROR_DISK_STATE_ACTIVE))
 			sbuf_printf(sb, "%s", "COMPLETE");
 		else
 			sbuf_printf(sb, "%s", "DEGRADED");
 		sbuf_printf(sb, "</State>\n");
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 }
 
 static void
 g_mirror_shutdown_post_sync(void *arg, int howto)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *gp2;
 	struct g_mirror_softc *sc;
 	int error;
 
 	mp = arg;
 	g_topology_lock();
 	g_mirror_shutdown = 1;
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 		if ((sc = gp->softc) == NULL)
 			continue;
 		/* Skip synchronization geom. */
 		if (gp == sc->sc_sync.ds_geom)
 			continue;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		g_mirror_idle(sc, -1);
 		g_cancel_event(sc);
 		error = g_mirror_destroy(sc, G_MIRROR_DESTROY_DELAYED);
 		if (error != 0)
 			sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 	g_topology_unlock();
 }
 
 static void
 g_mirror_init(struct g_class *mp)
 {
 
 	g_mirror_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
 	    g_mirror_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
 	if (g_mirror_post_sync == NULL)
 		G_MIRROR_DEBUG(0, "Warning! Cannot register shutdown event.");
 }
 
 static void
 g_mirror_fini(struct g_class *mp)
 {
 
 	if (g_mirror_post_sync != NULL)
 		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_mirror_post_sync);
 }
 
 DECLARE_GEOM_CLASS(g_mirror_class, g_mirror);
Index: user/alc/PQ_LAUNDRY/sys/geom/mountver/g_mountver.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/geom/mountver/g_mountver.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/geom/mountver/g_mountver.c	(revision 306283)
@@ -1,638 +1,638 @@
 /*-
  * Copyright (c) 2010 Edward Tomasz Napierala <trasz@FreeBSD.org>
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/disk.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/eventhandler.h>
 #include <geom/geom.h>
 #include <geom/mountver/g_mountver.h>
 
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, mountver, CTLFLAG_RW,
     0, "GEOM_MOUNTVER stuff");
 static u_int g_mountver_debug = 0;
 static u_int g_mountver_check_ident = 1;
 SYSCTL_UINT(_kern_geom_mountver, OID_AUTO, debug, CTLFLAG_RW,
     &g_mountver_debug, 0, "Debug level");
 SYSCTL_UINT(_kern_geom_mountver, OID_AUTO, check_ident, CTLFLAG_RW,
     &g_mountver_check_ident, 0, "Check disk ident when reattaching");
 
 static eventhandler_tag g_mountver_pre_sync = NULL;
 
 static void g_mountver_queue(struct bio *bp);
 static void g_mountver_orphan(struct g_consumer *cp);
 static void g_mountver_resize(struct g_consumer *cp);
 static int g_mountver_destroy(struct g_geom *gp, boolean_t force);
 static g_taste_t g_mountver_taste;
 static int g_mountver_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 static void g_mountver_config(struct gctl_req *req, struct g_class *mp,
     const char *verb);
 static void g_mountver_dumpconf(struct sbuf *sb, const char *indent,
     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 static void g_mountver_init(struct g_class *mp);
 static void g_mountver_fini(struct g_class *mp);
 
 struct g_class g_mountver_class = {
 	.name = G_MOUNTVER_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_mountver_config,
 	.taste = g_mountver_taste,
 	.destroy_geom = g_mountver_destroy_geom,
 	.init = g_mountver_init,
 	.fini = g_mountver_fini
 };
 
 static void
 g_mountver_done(struct bio *bp)
 {
 	struct g_geom *gp;
 	struct bio *pbp;
 
 	if (bp->bio_error != ENXIO) {
 		g_std_done(bp);
 		return;
 	}
 
 	/*
 	 * When the device goes away, it's possible that few requests
 	 * will be completed with ENXIO before g_mountver_orphan()
 	 * gets called.  To work around that, we have to queue requests
 	 * that failed with ENXIO, in order to send them later.
 	 */
 	gp = bp->bio_from->geom;
 
 	pbp = bp->bio_parent;
 	KASSERT(pbp->bio_to == LIST_FIRST(&gp->provider),
 	    ("parent request was for someone else"));
 	g_destroy_bio(bp);
 	pbp->bio_inbed++;
 	g_mountver_queue(pbp);
 }
 
 static void
 g_mountver_send(struct bio *bp)
 {
 	struct g_geom *gp;
 	struct bio *cbp;
 
 	gp = bp->bio_to->geom;
 
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		g_io_deliver(bp, ENOMEM);
 		return;
 	}
 
 	cbp->bio_done = g_mountver_done;
 	g_io_request(cbp, LIST_FIRST(&gp->consumer));
 }
 
 static void
 g_mountver_queue(struct bio *bp)
 {
 	struct g_mountver_softc *sc;
 	struct g_geom *gp;
 
 	gp = bp->bio_to->geom;
 	sc = gp->softc;
 
 	mtx_lock(&sc->sc_mtx);
 	TAILQ_INSERT_TAIL(&sc->sc_queue, bp, bio_queue);
 	mtx_unlock(&sc->sc_mtx);
 }
 
 static void
 g_mountver_send_queued(struct g_geom *gp)
 {
 	struct g_mountver_softc *sc;
 	struct bio *bp;
 
 	sc = gp->softc;
 
 	mtx_lock(&sc->sc_mtx);
 	while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL) {
 		TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue);
 		G_MOUNTVER_LOGREQ(bp, "Sending queued request.");
 		g_mountver_send(bp);
 	}
 	mtx_unlock(&sc->sc_mtx);
 }
 
 static void
 g_mountver_discard_queued(struct g_geom *gp)
 {
 	struct g_mountver_softc *sc;
 	struct bio *bp;
 
 	sc = gp->softc;
 
 	mtx_lock(&sc->sc_mtx);
 	while ((bp = TAILQ_FIRST(&sc->sc_queue)) != NULL) {
 		TAILQ_REMOVE(&sc->sc_queue, bp, bio_queue);
 		G_MOUNTVER_LOGREQ(bp, "Discarding queued request.");
 		g_io_deliver(bp, ENXIO);
 	}
 	mtx_unlock(&sc->sc_mtx);
 }
 
 static void
 g_mountver_start(struct bio *bp)
 {
 	struct g_mountver_softc *sc;
 	struct g_geom *gp;
 
 	gp = bp->bio_to->geom;
 	sc = gp->softc;
 	G_MOUNTVER_LOGREQ(bp, "Request received.");
 
 	/*
 	 * It is possible that some bios were returned with ENXIO, even though
 	 * orphaning didn't happen yet.  In that case, queue all subsequent
 	 * requests in order to maintain ordering.
 	 */
 	if (sc->sc_orphaned || !TAILQ_EMPTY(&sc->sc_queue)) {
 		G_MOUNTVER_LOGREQ(bp, "Queueing request.");
 		g_mountver_queue(bp);
 		if (!sc->sc_orphaned)
 			g_mountver_send_queued(gp);
 	} else {
 		G_MOUNTVER_LOGREQ(bp, "Sending request.");
 		g_mountver_send(bp);
 	}
 }
 
 static int
 g_mountver_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_mountver_softc *sc;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	gp = pp->geom;
 	cp = LIST_FIRST(&gp->consumer);
 	sc = gp->softc;
 	if (sc == NULL && dr <= 0 && dw <= 0 && de <= 0)
 		return (0);
 	KASSERT(sc != NULL, ("Trying to access withered provider \"%s\".", pp->name));
 
 	sc->sc_access_r += dr;
 	sc->sc_access_w += dw;
 	sc->sc_access_e += de;
 
 	if (sc->sc_orphaned)
 		return (0);
 
 	return (g_access(cp, dr, dw, de));
 }
 
 static int
 g_mountver_create(struct gctl_req *req, struct g_class *mp, struct g_provider *pp)
 {
 	struct g_mountver_softc *sc;
 	struct g_geom *gp;
 	struct g_provider *newpp;
 	struct g_consumer *cp;
 	char name[64];
 	int error;
 	int identsize = DISK_IDENT_SIZE;
 
 	g_topology_assert();
 
 	gp = NULL;
 	newpp = NULL;
 	cp = NULL;
 
 	snprintf(name, sizeof(name), "%s%s", pp->name, G_MOUNTVER_SUFFIX);
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		if (strcmp(gp->name, name) == 0) {
 			gctl_error(req, "Provider %s already exists.", name);
 			return (EEXIST);
 		}
 	}
 	gp = g_new_geomf(mp, "%s", name);
 	sc = g_malloc(sizeof(*sc), M_WAITOK | M_ZERO);
 	mtx_init(&sc->sc_mtx, "gmountver", NULL, MTX_DEF);
 	TAILQ_INIT(&sc->sc_queue);
 	sc->sc_provider_name = strdup(pp->name, M_GEOM);
 	gp->softc = sc;
 	gp->start = g_mountver_start;
 	gp->orphan = g_mountver_orphan;
 	gp->resize = g_mountver_resize;
 	gp->access = g_mountver_access;
 	gp->dumpconf = g_mountver_dumpconf;
 
 	newpp = g_new_providerf(gp, "%s", gp->name);
 	newpp->mediasize = pp->mediasize;
 	newpp->sectorsize = pp->sectorsize;
 
 	cp = g_new_consumer(gp);
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		gctl_error(req, "Cannot attach to provider %s.", pp->name);
 		goto fail;
 	}
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0) {
 		gctl_error(req, "Cannot access provider %s.", pp->name);
 		goto fail;
 	}
 	error = g_io_getattr("GEOM::ident", cp, &identsize, sc->sc_ident);
 	g_access(cp, -1, 0, 0);
 	if (error != 0) {
 		if (g_mountver_check_ident) {
 			gctl_error(req, "Cannot get disk ident from %s; error = %d.", pp->name, error);
 			goto fail;
 		}
 
 		G_MOUNTVER_DEBUG(0, "Cannot get disk ident from %s; error = %d.", pp->name, error);
 		sc->sc_ident[0] = '\0';
 	}
 
 	g_error_provider(newpp, 0);
 	G_MOUNTVER_DEBUG(0, "Device %s created.", gp->name);
 	return (0);
 fail:
 	g_free(sc->sc_provider_name);
 	if (cp->provider != NULL)
 		g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_provider(newpp);
 	g_free(gp->softc);
 	g_destroy_geom(gp);
 	return (error);
 }
 
 static int
 g_mountver_destroy(struct g_geom *gp, boolean_t force)
 {
 	struct g_mountver_softc *sc;
 	struct g_provider *pp;
 
 	g_topology_assert();
 	if (gp->softc == NULL)
 		return (ENXIO);
 	sc = gp->softc;
 	pp = LIST_FIRST(&gp->provider);
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		if (force) {
 			G_MOUNTVER_DEBUG(0, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 		} else {
 			G_MOUNTVER_DEBUG(1, "Device %s is still open (r%dw%de%d).",
 			    pp->name, pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		}
 	} else {
 		G_MOUNTVER_DEBUG(0, "Device %s removed.", gp->name);
 	}
 	if (pp != NULL)
-		g_orphan_provider(pp, ENXIO);
+		g_wither_provider(pp, ENXIO);
 	g_mountver_discard_queued(gp);
 	g_free(sc->sc_provider_name);
 	g_free(gp->softc);
 	gp->softc = NULL;
 	g_wither_geom(gp, ENXIO);
 
 	return (0);
 }
 
 static int
 g_mountver_destroy_geom(struct gctl_req *req, struct g_class *mp, struct g_geom *gp)
 {
 
 	return (g_mountver_destroy(gp, 0));
 }
 
 static void
 g_mountver_ctl_create(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_provider *pp;
 	const char *name;
 	char param[16];
 	int i, *nargs;
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 	for (i = 0; i < *nargs; i++) {
 		snprintf(param, sizeof(param), "arg%d", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%d' argument", i);
 			return;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		pp = g_provider_by_name(name);
 		if (pp == NULL) {
 			G_MOUNTVER_DEBUG(1, "Provider %s is invalid.", name);
 			gctl_error(req, "Provider %s is invalid.", name);
 			return;
 		}
 		if (g_mountver_create(req, mp, pp) != 0)
 			return;
 	}
 }
 
 static struct g_geom *
 g_mountver_find_geom(struct g_class *mp, const char *name)
 {
 	struct g_geom *gp;
 
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		if (strcmp(gp->name, name) == 0)
 			return (gp);
 	}
 	return (NULL);
 }
 
 static void
 g_mountver_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 {
 	int *nargs, *force, error, i;
 	struct g_geom *gp;
 	const char *name;
 	char param[16];
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 	force = gctl_get_paraml(req, "force", sizeof(*force));
 	if (force == NULL) {
 		gctl_error(req, "No 'force' argument");
 		return;
 	}
 
 	for (i = 0; i < *nargs; i++) {
 		snprintf(param, sizeof(param), "arg%d", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%d' argument", i);
 			return;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		gp = g_mountver_find_geom(mp, name);
 		if (gp == NULL) {
 			G_MOUNTVER_DEBUG(1, "Device %s is invalid.", name);
 			gctl_error(req, "Device %s is invalid.", name);
 			return;
 		}
 		error = g_mountver_destroy(gp, *force);
 		if (error != 0) {
 			gctl_error(req, "Cannot destroy device %s (error=%d).",
 			    gp->name, error);
 			return;
 		}
 	}
 }
 
 static void
 g_mountver_orphan(struct g_consumer *cp)
 {
 	struct g_mountver_softc *sc;
 
 	g_topology_assert();
 
 	sc = cp->geom->softc;
 	sc->sc_orphaned = 1;
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	g_detach(cp);
 	G_MOUNTVER_DEBUG(0, "%s is offline.  Mount verification in progress.", sc->sc_provider_name);
 }
 
 static void
 g_mountver_resize(struct g_consumer *cp)
 {
 	struct g_geom *gp;
 	struct g_provider *pp;
 
 	gp = cp->geom;
 
 	LIST_FOREACH(pp, &gp->provider, provider)
 		g_resize_provider(pp, cp->provider->mediasize);
 }
 
 static int
 g_mountver_ident_matches(struct g_geom *gp)
 {
 	struct g_consumer *cp;
 	struct g_mountver_softc *sc;
 	char ident[DISK_IDENT_SIZE];
 	int error, identsize = DISK_IDENT_SIZE;
 
 	sc = gp->softc;
 	cp = LIST_FIRST(&gp->consumer);
 
 	if (g_mountver_check_ident == 0)
 		return (0);
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0) {
 		G_MOUNTVER_DEBUG(0, "Cannot access %s; "
 		    "not attaching; error = %d.", gp->name, error);
 		return (1);
 	}
 	error = g_io_getattr("GEOM::ident", cp, &identsize, ident);
 	g_access(cp, -1, 0, 0);
 	if (error != 0) {
 		G_MOUNTVER_DEBUG(0, "Cannot get disk ident for %s; "
 		    "not attaching; error = %d.", gp->name, error);
 		return (1);
 	}
 	if (strcmp(ident, sc->sc_ident) != 0) {
 		G_MOUNTVER_DEBUG(1, "Disk ident for %s (\"%s\") is different "
 		    "from expected \"%s\", not attaching.", gp->name, ident,
 		    sc->sc_ident);
 		return (1);
 	}
 
 	return (0);
 }
 	
 static struct g_geom *
 g_mountver_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_mountver_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	G_MOUNTVER_DEBUG(2, "Tasting %s.", pp->name);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 
 		/* Already attached? */
 		if (pp == LIST_FIRST(&gp->provider))
 			return (NULL);
 
 		if (sc->sc_orphaned && strcmp(pp->name, sc->sc_provider_name) == 0)
 			break;
 	}
 	if (gp == NULL)
 		return (NULL);
 
 	cp = LIST_FIRST(&gp->consumer);
 	g_attach(cp, pp);
 	error = g_mountver_ident_matches(gp);
 	if (error != 0) {
 		g_detach(cp);
 		return (NULL);
 	}
 	if (sc->sc_access_r > 0 || sc->sc_access_w > 0 || sc->sc_access_e > 0) {
 		error = g_access(cp, sc->sc_access_r, sc->sc_access_w, sc->sc_access_e);
 		if (error != 0) {
 			G_MOUNTVER_DEBUG(0, "Cannot access %s; error = %d.", pp->name, error);
 			g_detach(cp);
 			return (NULL);
 		}
 	}
 	g_mountver_send_queued(gp);
 	sc->sc_orphaned = 0;
 	G_MOUNTVER_DEBUG(0, "%s has completed mount verification.", sc->sc_provider_name);
 
 	return (gp);
 }
 
 static void
 g_mountver_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	uint32_t *version;
 
 	g_topology_assert();
 
 	version = gctl_get_paraml(req, "version", sizeof(*version));
 	if (version == NULL) {
 		gctl_error(req, "No '%s' argument.", "version");
 		return;
 	}
 	if (*version != G_MOUNTVER_VERSION) {
 		gctl_error(req, "Userland and kernel parts are out of sync.");
 		return;
 	}
 
 	if (strcmp(verb, "create") == 0) {
 		g_mountver_ctl_create(req, mp);
 		return;
 	} else if (strcmp(verb, "destroy") == 0) {
 		g_mountver_ctl_destroy(req, mp);
 		return;
 	}
 
 	gctl_error(req, "Unknown verb.");
 }
 
 static void
 g_mountver_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_mountver_softc *sc;
 
 	if (pp != NULL || cp != NULL)
 		return;
 
 	sc = gp->softc;
 	sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 	    sc->sc_orphaned ? "OFFLINE" : "ONLINE");
 	sbuf_printf(sb, "%s<Provider-Name>%s</Provider-Name>\n", indent, sc->sc_provider_name);
 	sbuf_printf(sb, "%s<Disk-Ident>%s</Disk-Ident>\n", indent, sc->sc_ident);
 }
 
 static void
 g_mountver_shutdown_pre_sync(void *arg, int howto)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *gp2;
 
 	mp = arg;
 	g_topology_lock();
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2)
 		g_mountver_destroy(gp, 1);
 	g_topology_unlock();
 }
 
 static void
 g_mountver_init(struct g_class *mp)
 {
 
 	g_mountver_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync,
 	    g_mountver_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST);
 	if (g_mountver_pre_sync == NULL)
 		G_MOUNTVER_DEBUG(0, "Warning! Cannot register shutdown event.");
 }
 
 static void
 g_mountver_fini(struct g_class *mp)
 {
 
 	if (g_mountver_pre_sync != NULL)
 		EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_mountver_pre_sync);
 }
 
 DECLARE_GEOM_CLASS(g_mountver_class, g_mountver);
Index: user/alc/PQ_LAUNDRY/sys/geom/raid3/g_raid3.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/geom/raid3/g_raid3.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/geom/raid3/g_raid3.c	(revision 306283)
@@ -1,3584 +1,3583 @@
 /*-
  * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/eventhandler.h>
 #include <vm/uma.h>
 #include <geom/geom.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/sched.h>
 #include <geom/raid3/g_raid3.h>
 
 FEATURE(geom_raid3, "GEOM RAID-3 functionality");
 
 static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data");
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0,
     "GEOM_RAID3 stuff");
 u_int g_raid3_debug = 0;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RWTUN, &g_raid3_debug, 0,
     "Debug level");
 static u_int g_raid3_timeout = 4;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RWTUN, &g_raid3_timeout,
     0, "Time to wait on all raid3 components");
 static u_int g_raid3_idletime = 5;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RWTUN,
     &g_raid3_idletime, 0, "Mark components as clean when idling");
 static u_int g_raid3_disconnect_on_failure = 1;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RWTUN,
     &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure.");
 static u_int g_raid3_syncreqs = 2;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
     &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests.");
 static u_int g_raid3_use_malloc = 0;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, use_malloc, CTLFLAG_RDTUN,
     &g_raid3_use_malloc, 0, "Use malloc(9) instead of uma(9).");
 
 static u_int g_raid3_n64k = 50;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RDTUN, &g_raid3_n64k, 0,
     "Maximum number of 64kB allocations");
 static u_int g_raid3_n16k = 200;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RDTUN, &g_raid3_n16k, 0,
     "Maximum number of 16kB allocations");
 static u_int g_raid3_n4k = 1200;
 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RDTUN, &g_raid3_n4k, 0,
     "Maximum number of 4kB allocations");
 
 static SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0,
     "GEOM_RAID3 statistics");
 static u_int g_raid3_parity_mismatch = 0;
 SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD,
     &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode");
 
 #define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
 	msleep((ident), (mtx), (priority), (wmesg), (timeout));		\
 	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident));	\
 } while (0)
 
 static eventhandler_tag g_raid3_post_sync = NULL;
 static int g_raid3_shutdown = 0;
 
 static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 static g_taste_t g_raid3_taste;
 static void g_raid3_init(struct g_class *mp);
 static void g_raid3_fini(struct g_class *mp);
 
 struct g_class g_raid3_class = {
 	.name = G_RAID3_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_raid3_config,
 	.taste = g_raid3_taste,
 	.destroy_geom = g_raid3_destroy_geom,
 	.init = g_raid3_init,
 	.fini = g_raid3_fini
 };
 
 
 static void g_raid3_destroy_provider(struct g_raid3_softc *sc);
 static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state);
 static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force);
 static void g_raid3_dumpconf(struct sbuf *sb, const char *indent,
     struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp);
 static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type);
 static int g_raid3_register_request(struct bio *pbp);
 static void g_raid3_sync_release(struct g_raid3_softc *sc);
 
 
 static const char *
 g_raid3_disk_state2str(int state)
 {
 
 	switch (state) {
 	case G_RAID3_DISK_STATE_NODISK:
 		return ("NODISK");
 	case G_RAID3_DISK_STATE_NONE:
 		return ("NONE");
 	case G_RAID3_DISK_STATE_NEW:
 		return ("NEW");
 	case G_RAID3_DISK_STATE_ACTIVE:
 		return ("ACTIVE");
 	case G_RAID3_DISK_STATE_STALE:
 		return ("STALE");
 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
 		return ("SYNCHRONIZING");
 	case G_RAID3_DISK_STATE_DISCONNECTED:
 		return ("DISCONNECTED");
 	default:
 		return ("INVALID");
 	}
 }
 
 static const char *
 g_raid3_device_state2str(int state)
 {
 
 	switch (state) {
 	case G_RAID3_DEVICE_STATE_STARTING:
 		return ("STARTING");
 	case G_RAID3_DEVICE_STATE_DEGRADED:
 		return ("DEGRADED");
 	case G_RAID3_DEVICE_STATE_COMPLETE:
 		return ("COMPLETE");
 	default:
 		return ("INVALID");
 	}
 }
 
 const char *
 g_raid3_get_diskname(struct g_raid3_disk *disk)
 {
 
 	if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL)
 		return ("[unknown]");
 	return (disk->d_name);
 }
 
 static void *
 g_raid3_alloc(struct g_raid3_softc *sc, size_t size, int flags)
 {
 	void *ptr;
 	enum g_raid3_zones zone;
 
 	if (g_raid3_use_malloc ||
 	    (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
 		ptr = malloc(size, M_RAID3, flags);
 	else {
 		ptr = uma_zalloc_arg(sc->sc_zones[zone].sz_zone,
 		   &sc->sc_zones[zone], flags);
 		sc->sc_zones[zone].sz_requested++;
 		if (ptr == NULL)
 			sc->sc_zones[zone].sz_failed++;
 	}
 	return (ptr);
 }
 
 static void
 g_raid3_free(struct g_raid3_softc *sc, void *ptr, size_t size)
 {
 	enum g_raid3_zones zone;
 
 	if (g_raid3_use_malloc ||
 	    (zone = g_raid3_zone(size)) == G_RAID3_NUM_ZONES)
 		free(ptr, M_RAID3);
 	else {
 		uma_zfree_arg(sc->sc_zones[zone].sz_zone,
 		    ptr, &sc->sc_zones[zone]);
 	}
 }
 
 static int
 g_raid3_uma_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct g_raid3_zone *sz = arg;
 
 	if (sz->sz_max > 0 && sz->sz_inuse == sz->sz_max)
 		return (ENOMEM);
 	sz->sz_inuse++;
 	return (0);
 }
 
 static void
 g_raid3_uma_dtor(void *mem, int size, void *arg)
 {
 	struct g_raid3_zone *sz = arg;
 
 	sz->sz_inuse--;
 }
 
 #define	g_raid3_xor(src, dst, size)					\
 	_g_raid3_xor((uint64_t *)(src),					\
 	    (uint64_t *)(dst), (size_t)size)
 static void
 _g_raid3_xor(uint64_t *src, uint64_t *dst, size_t size)
 {
 
 	KASSERT((size % 128) == 0, ("Invalid size: %zu.", size));
 	for (; size > 0; size -= 128) {
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 		*dst++ ^= (*src++);
 	}
 }
 
 static int
 g_raid3_is_zero(struct bio *bp)
 {
 	static const uint64_t zeros[] = {
 	    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 	};
 	u_char *addr;
 	ssize_t size;
 
 	size = bp->bio_length;
 	addr = (u_char *)bp->bio_data;
 	for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) {
 		if (bcmp(addr, zeros, sizeof(zeros)) != 0)
 			return (0);
 	}
 	return (1);
 }
 
 /*
  * --- Events handling functions ---
  * Events in geom_raid3 are used to maintain disks and device status
  * from one thread to simplify locking.
  */
 static void
 g_raid3_event_free(struct g_raid3_event *ep)
 {
 
 	free(ep, M_RAID3);
 }
 
 int
 g_raid3_event_send(void *arg, int state, int flags)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct g_raid3_event *ep;
 	int error;
 
 	ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK);
 	G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep);
 	if ((flags & G_RAID3_EVENT_DEVICE) != 0) {
 		disk = NULL;
 		sc = arg;
 	} else {
 		disk = arg;
 		sc = disk->d_softc;
 	}
 	ep->e_disk = disk;
 	ep->e_state = state;
 	ep->e_flags = flags;
 	ep->e_error = 0;
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	wakeup(&sc->sc_queue);
 	mtx_unlock(&sc->sc_queue_mtx);
 	if ((flags & G_RAID3_EVENT_DONTWAIT) != 0)
 		return (0);
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep);
 	sx_xunlock(&sc->sc_lock);
 	while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) {
 		mtx_lock(&sc->sc_events_mtx);
 		MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event",
 		    hz * 5);
 	}
 	error = ep->e_error;
 	g_raid3_event_free(ep);
 	sx_xlock(&sc->sc_lock);
 	return (error);
 }
 
 static struct g_raid3_event *
 g_raid3_event_get(struct g_raid3_softc *sc)
 {
 	struct g_raid3_event *ep;
 
 	mtx_lock(&sc->sc_events_mtx);
 	ep = TAILQ_FIRST(&sc->sc_events);
 	mtx_unlock(&sc->sc_events_mtx);
 	return (ep);
 }
 
 static void
 g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep)
 {
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 static void
 g_raid3_event_cancel(struct g_raid3_disk *disk)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_event *ep, *tmpep;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	mtx_lock(&sc->sc_events_mtx);
 	TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) {
 		if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0)
 			continue;
 		if (ep->e_disk != disk)
 			continue;
 		TAILQ_REMOVE(&sc->sc_events, ep, e_next);
 		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
 			g_raid3_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			wakeup(ep);
 		}
 	}
 	mtx_unlock(&sc->sc_events_mtx);
 }
 
 /*
  * Return the number of disks in the given state.
  * If state is equal to -1, count all connected disks.
  */
 u_int
 g_raid3_ndisks(struct g_raid3_softc *sc, int state)
 {
 	struct g_raid3_disk *disk;
 	u_int n, ndisks;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	for (n = ndisks = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 			continue;
 		if (state == -1 || disk->d_state == state)
 			ndisks++;
 	}
 	return (ndisks);
 }
 
 static u_int
 g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp)
 {
 	struct bio *bp;
 	u_int nreqs = 0;
 
 	mtx_lock(&sc->sc_queue_mtx);
 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
 		if (bp->bio_from == cp)
 			nreqs++;
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	return (nreqs);
 }
 
 static int
 g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp)
 {
 
 	if (cp->index > 0) {
 		G_RAID3_DEBUG(2,
 		    "I/O requests for %s exist, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	if (g_raid3_nrequests(sc, cp) > 0) {
 		G_RAID3_DEBUG(2,
 		    "I/O requests for %s in queue, can't destroy it now.",
 		    cp->provider->name);
 		return (1);
 	}
 	return (0);
 }
 
 static void
 g_raid3_destroy_consumer(void *arg, int flags __unused)
 {
 	struct g_consumer *cp;
 
 	g_topology_assert();
 
 	cp = arg;
 	G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static void
 g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
 {
 	struct g_provider *pp;
 	int retaste_wait;
 
 	g_topology_assert();
 
 	cp->private = NULL;
 	if (g_raid3_is_busy(sc, cp))
 		return;
 	G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name);
 	pp = cp->provider;
 	retaste_wait = 0;
 	if (cp->acw == 1) {
 		if ((pp->geom->flags & G_GEOM_WITHER) == 0)
 			retaste_wait = 1;
 	}
 	G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr,
 	    -cp->acw, -cp->ace, 0);
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	if (retaste_wait) {
 		/*
 		 * After retaste event was send (inside g_access()), we can send
 		 * event to detach and destroy consumer.
 		 * A class, which has consumer to the given provider connected
 		 * will not receive retaste event for the provider.
 		 * This is the way how I ignore retaste events when I close
 		 * consumers opened for write: I detach and destroy consumer
 		 * after retaste event is sent.
 		 */
 		g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL);
 		return;
 	}
 	G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static int
 g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp)
 {
 	struct g_consumer *cp;
 	int error;
 
 	g_topology_assert_not();
 	KASSERT(disk->d_consumer == NULL,
 	    ("Disk already connected (device %s).", disk->d_softc->sc_name));
 
 	g_topology_lock();
 	cp = g_new_consumer(disk->d_softc->sc_geom);
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 		return (error);
 	}
 	error = g_access(cp, 1, 1, 1);
 		g_topology_unlock();
 	if (error != 0) {
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).",
 		    pp->name, error);
 		return (error);
 	}
 	disk->d_consumer = cp;
 	disk->d_consumer->private = disk;
 	disk->d_consumer->index = 0;
 	G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk));
 	return (0);
 }
 
 static void
 g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp)
 {
 
 	g_topology_assert();
 
 	if (cp == NULL)
 		return;
 	if (cp->provider != NULL)
 		g_raid3_kill_consumer(sc, cp);
 	else
 		g_destroy_consumer(cp);
 }
 
 /*
  * Initialize disk. This means allocate memory, create consumer, attach it
  * to the provider and open access (r1w1e1) to it.
  */
 static struct g_raid3_disk *
 g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp,
     struct g_raid3_metadata *md, int *errorp)
 {
 	struct g_raid3_disk *disk;
 	int error;
 
 	disk = &sc->sc_disks[md->md_no];
 	error = g_raid3_connect_disk(disk, pp);
 	if (error != 0) {
 		if (errorp != NULL)
 			*errorp = error;
 		return (NULL);
 	}
 	disk->d_state = G_RAID3_DISK_STATE_NONE;
 	disk->d_flags = md->md_dflags;
 	if (md->md_provider[0] != '\0')
 		disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_sync.ds_offset = md->md_sync_offset;
 	disk->d_sync.ds_offset_done = md->md_sync_offset;
 	disk->d_genid = md->md_genid;
 	disk->d_sync.ds_syncid = md->md_syncid;
 	if (errorp != NULL)
 		*errorp = 0;
 	return (disk);
 }
 
 static void
 g_raid3_destroy_disk(struct g_raid3_disk *disk)
 {
 	struct g_raid3_softc *sc;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 		return;
 	g_raid3_event_cancel(disk);
 	switch (disk->d_state) {
 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
 		if (sc->sc_syncdisk != NULL)
 			g_raid3_sync_stop(sc, 1);
 		/* FALLTHROUGH */
 	case G_RAID3_DISK_STATE_NEW:
 	case G_RAID3_DISK_STATE_STALE:
 	case G_RAID3_DISK_STATE_ACTIVE:
 		g_topology_lock();
 		g_raid3_disconnect_consumer(sc, disk->d_consumer);
 		g_topology_unlock();
 		disk->d_consumer = NULL;
 		break;
 	default:
 		KASSERT(0 == 1, ("Wrong disk state (%s, %s).",
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 	}
 	disk->d_state = G_RAID3_DISK_STATE_NODISK;
 }
 
 static void
 g_raid3_destroy_device(struct g_raid3_softc *sc)
 {
 	struct g_raid3_event *ep;
 	struct g_raid3_disk *disk;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	u_int n;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	gp = sc->sc_geom;
 	if (sc->sc_provider != NULL)
 		g_raid3_destroy_provider(sc);
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_state != G_RAID3_DISK_STATE_NODISK) {
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 			g_raid3_update_metadata(disk);
 			g_raid3_destroy_disk(disk);
 		}
 	}
 	while ((ep = g_raid3_event_get(sc)) != NULL) {
 		g_raid3_event_remove(sc, ep);
 		if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0)
 			g_raid3_event_free(ep);
 		else {
 			ep->e_error = ECANCELED;
 			ep->e_flags |= G_RAID3_EVENT_DONE;
 			G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep);
 			mtx_lock(&sc->sc_events_mtx);
 			wakeup(ep);
 			mtx_unlock(&sc->sc_events_mtx);
 		}
 	}
 	callout_drain(&sc->sc_callout);
 	cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer);
 	g_topology_lock();
 	if (cp != NULL)
 		g_raid3_disconnect_consumer(sc, cp);
 	g_wither_geom(sc->sc_sync.ds_geom, ENXIO);
 	G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name);
 	g_wither_geom(gp, ENXIO);
 	g_topology_unlock();
 	if (!g_raid3_use_malloc) {
 		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
 		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
 		uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
 	}
 	mtx_destroy(&sc->sc_queue_mtx);
 	mtx_destroy(&sc->sc_events_mtx);
 	sx_xunlock(&sc->sc_lock);
 	sx_destroy(&sc->sc_lock);
 }
 
 static void
 g_raid3_orphan(struct g_consumer *cp)
 {
 	struct g_raid3_disk *disk;
 
 	g_topology_assert();
 
 	disk = cp->private;
 	if (disk == NULL)
 		return;
 	disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID;
 	g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED,
 	    G_RAID3_EVENT_DONTWAIT);
 }
 
 static int
 g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
 {
 	struct g_raid3_softc *sc;
 	struct g_consumer *cp;
 	off_t offset, length;
 	u_char *sector;
 	int error = 0;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	cp = disk->d_consumer;
 	KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name));
 	KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name));
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr,
 	    cp->acw, cp->ace));
 	length = cp->provider->sectorsize;
 	offset = cp->provider->mediasize - length;
 	sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO);
 	if (md != NULL)
 		raid3_metadata_encode(md, sector);
 	error = g_write_data(cp, offset, sector, length);
 	free(sector, M_RAID3);
 	if (error != 0) {
 		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
 			G_RAID3_DEBUG(0, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_raid3_get_diskname(disk), sc->sc_name, error);
 			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
 		} else {
 			G_RAID3_DEBUG(1, "Cannot write metadata on %s "
 			    "(device=%s, error=%d).",
 			    g_raid3_get_diskname(disk), sc->sc_name, error);
 		}
 		if (g_raid3_disconnect_on_failure &&
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
 			g_raid3_event_send(disk,
 			    G_RAID3_DISK_STATE_DISCONNECTED,
 			    G_RAID3_EVENT_DONTWAIT);
 		}
 	}
 	return (error);
 }
 
 int
 g_raid3_clear_metadata(struct g_raid3_disk *disk)
 {
 	int error;
 
 	g_topology_assert_not();
 	sx_assert(&disk->d_softc->sc_lock, SX_LOCKED);
 
 	error = g_raid3_write_metadata(disk, NULL);
 	if (error == 0) {
 		G_RAID3_DEBUG(2, "Metadata on %s cleared.",
 		    g_raid3_get_diskname(disk));
 	} else {
 		G_RAID3_DEBUG(0,
 		    "Cannot clear metadata on disk %s (error=%d).",
 		    g_raid3_get_diskname(disk), error);
 	}
 	return (error);
 }
 
 void
 g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md)
 {
 	struct g_raid3_softc *sc;
 	struct g_provider *pp;
 
 	sc = disk->d_softc;
 	strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic));
 	md->md_version = G_RAID3_VERSION;
 	strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name));
 	md->md_id = sc->sc_id;
 	md->md_all = sc->sc_ndisks;
 	md->md_genid = sc->sc_genid;
 	md->md_mediasize = sc->sc_mediasize;
 	md->md_sectorsize = sc->sc_sectorsize;
 	md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK);
 	md->md_no = disk->d_no;
 	md->md_syncid = disk->d_sync.ds_syncid;
 	md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK);
 	if (disk->d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
 		md->md_sync_offset = 0;
 	else {
 		md->md_sync_offset =
 		    disk->d_sync.ds_offset_done / (sc->sc_ndisks - 1);
 	}
 	if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL)
 		pp = disk->d_consumer->provider;
 	else
 		pp = NULL;
 	if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL)
 		strlcpy(md->md_provider, pp->name, sizeof(md->md_provider));
 	else
 		bzero(md->md_provider, sizeof(md->md_provider));
 	if (pp != NULL)
 		md->md_provsize = pp->mediasize;
 	else
 		md->md_provsize = 0;
 }
 
 void
 g_raid3_update_metadata(struct g_raid3_disk *disk)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_metadata md;
 	int error;
 
 	g_topology_assert_not();
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	g_raid3_fill_metadata(disk, &md);
 	error = g_raid3_write_metadata(disk, &md);
 	if (error == 0) {
 		G_RAID3_DEBUG(2, "Metadata on %s updated.",
 		    g_raid3_get_diskname(disk));
 	} else {
 		G_RAID3_DEBUG(0,
 		    "Cannot update metadata on disk %s (error=%d).",
 		    g_raid3_get_diskname(disk), error);
 	}
 }
 
 static void
 g_raid3_bump_syncid(struct g_raid3_softc *sc)
 {
 	struct g_raid3_disk *disk;
 	u_int n;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_syncid++;
 	G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name,
 	    sc->sc_syncid);
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 			disk->d_sync.ds_syncid = sc->sc_syncid;
 			g_raid3_update_metadata(disk);
 		}
 	}
 }
 
 static void
 g_raid3_bump_genid(struct g_raid3_softc *sc)
 {
 	struct g_raid3_disk *disk;
 	u_int n;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 	KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0,
 	    ("%s called with no active disks (device=%s).", __func__,
 	    sc->sc_name));
 
 	sc->sc_genid++;
 	G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name,
 	    sc->sc_genid);
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 			disk->d_genid = sc->sc_genid;
 			g_raid3_update_metadata(disk);
 		}
 	}
 }
 
 static int
 g_raid3_idle(struct g_raid3_softc *sc, int acw)
 {
 	struct g_raid3_disk *disk;
 	u_int i;
 	int timeout;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (sc->sc_provider == NULL)
 		return (0);
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return (0);
 	if (sc->sc_idle)
 		return (0);
 	if (sc->sc_writes > 0)
 		return (0);
 	if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) {
 		timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write);
 		if (!g_raid3_shutdown && timeout > 0)
 			return (timeout);
 	}
 	sc->sc_idle = 1;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		disk = &sc->sc_disks[i];
 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
 			continue;
 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
 		    g_raid3_get_diskname(disk), sc->sc_name);
 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 		g_raid3_update_metadata(disk);
 	}
 	return (0);
 }
 
 static void
 g_raid3_unidle(struct g_raid3_softc *sc)
 {
 	struct g_raid3_disk *disk;
 	u_int i;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 	sc->sc_idle = 0;
 	sc->sc_last_write = time_uptime;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		disk = &sc->sc_disks[i];
 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
 			continue;
 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
 		    g_raid3_get_diskname(disk), sc->sc_name);
 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
 		g_raid3_update_metadata(disk);
 	}
 }
 
 /*
  * Treat bio_driver1 field in parent bio as list head and field bio_caller1
  * in child bio as pointer to the next element on the list.
  */
 #define	G_RAID3_HEAD_BIO(pbp)	(pbp)->bio_driver1
 
 #define	G_RAID3_NEXT_BIO(cbp)	(cbp)->bio_caller1
 
 #define	G_RAID3_FOREACH_BIO(pbp, bp)					\
 	for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL;		\
 	    (bp) = G_RAID3_NEXT_BIO(bp))
 
 #define	G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp)			\
 	for ((bp) = G_RAID3_HEAD_BIO(pbp);				\
 	    (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1);	\
 	    (bp) = (tmpbp))
 
 static void
 g_raid3_init_bio(struct bio *pbp)
 {
 
 	G_RAID3_HEAD_BIO(pbp) = NULL;
 }
 
 static void
 g_raid3_remove_bio(struct bio *cbp)
 {
 	struct bio *pbp, *bp;
 
 	pbp = cbp->bio_parent;
 	if (G_RAID3_HEAD_BIO(pbp) == cbp)
 		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
 	else {
 		G_RAID3_FOREACH_BIO(pbp, bp) {
 			if (G_RAID3_NEXT_BIO(bp) == cbp) {
 				G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
 				break;
 			}
 		}
 	}
 	G_RAID3_NEXT_BIO(cbp) = NULL;
 }
 
 static void
 g_raid3_replace_bio(struct bio *sbp, struct bio *dbp)
 {
 	struct bio *pbp, *bp;
 
 	g_raid3_remove_bio(sbp);
 	pbp = dbp->bio_parent;
 	G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp);
 	if (G_RAID3_HEAD_BIO(pbp) == dbp)
 		G_RAID3_HEAD_BIO(pbp) = sbp;
 	else {
 		G_RAID3_FOREACH_BIO(pbp, bp) {
 			if (G_RAID3_NEXT_BIO(bp) == dbp) {
 				G_RAID3_NEXT_BIO(bp) = sbp;
 				break;
 			}
 		}
 	}
 	G_RAID3_NEXT_BIO(dbp) = NULL;
 }
 
 static void
 g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp)
 {
 	struct bio *bp, *pbp;
 	size_t size;
 
 	pbp = cbp->bio_parent;
 	pbp->bio_children--;
 	KASSERT(cbp->bio_data != NULL, ("NULL bio_data"));
 	size = pbp->bio_length / (sc->sc_ndisks - 1);
 	g_raid3_free(sc, cbp->bio_data, size);
 	if (G_RAID3_HEAD_BIO(pbp) == cbp) {
 		G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp);
 		G_RAID3_NEXT_BIO(cbp) = NULL;
 		g_destroy_bio(cbp);
 	} else {
 		G_RAID3_FOREACH_BIO(pbp, bp) {
 			if (G_RAID3_NEXT_BIO(bp) == cbp)
 				break;
 		}
 		if (bp != NULL) {
 			KASSERT(G_RAID3_NEXT_BIO(bp) != NULL,
 			    ("NULL bp->bio_driver1"));
 			G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp);
 			G_RAID3_NEXT_BIO(cbp) = NULL;
 		}
 		g_destroy_bio(cbp);
 	}
 }
 
 static struct bio *
 g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp)
 {
 	struct bio *bp, *cbp;
 	size_t size;
 	int memflag;
 
 	cbp = g_clone_bio(pbp);
 	if (cbp == NULL)
 		return (NULL);
 	size = pbp->bio_length / (sc->sc_ndisks - 1);
 	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
 		memflag = M_WAITOK;
 	else
 		memflag = M_NOWAIT;
 	cbp->bio_data = g_raid3_alloc(sc, size, memflag);
 	if (cbp->bio_data == NULL) {
 		pbp->bio_children--;
 		g_destroy_bio(cbp);
 		return (NULL);
 	}
 	G_RAID3_NEXT_BIO(cbp) = NULL;
 	if (G_RAID3_HEAD_BIO(pbp) == NULL)
 		G_RAID3_HEAD_BIO(pbp) = cbp;
 	else {
 		G_RAID3_FOREACH_BIO(pbp, bp) {
 			if (G_RAID3_NEXT_BIO(bp) == NULL) {
 				G_RAID3_NEXT_BIO(bp) = cbp;
 				break;
 			}
 		}
 	}
 	return (cbp);
 }
 
 static void
 g_raid3_scatter(struct bio *pbp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct bio *bp, *cbp, *tmpbp;
 	off_t atom, cadd, padd, left;
 	int first;
 
 	sc = pbp->bio_to->geom->softc;
 	bp = NULL;
 	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
 		/*
 		 * Find bio for which we should calculate data.
 		 */
 		G_RAID3_FOREACH_BIO(pbp, cbp) {
 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
 				bp = cbp;
 				break;
 			}
 		}
 		KASSERT(bp != NULL, ("NULL parity bio."));
 	}
 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
 	cadd = padd = 0;
 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
 		G_RAID3_FOREACH_BIO(pbp, cbp) {
 			if (cbp == bp)
 				continue;
 			bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom);
 			padd += atom;
 		}
 		cadd += atom;
 	}
 	if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) {
 		/*
 		 * Calculate parity.
 		 */
 		first = 1;
 		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
 			if (cbp == bp)
 				continue;
 			if (first) {
 				bcopy(cbp->bio_data, bp->bio_data,
 				    bp->bio_length);
 				first = 0;
 			} else {
 				g_raid3_xor(cbp->bio_data, bp->bio_data,
 				    bp->bio_length);
 			}
 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0)
 				g_raid3_destroy_bio(sc, cbp);
 		}
 	}
 	G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
 		struct g_consumer *cp;
 
 		disk = cbp->bio_caller2;
 		cp = disk->d_consumer;
 		cbp->bio_to = cp->provider;
 		G_RAID3_LOGREQ(3, cbp, "Sending request.");
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		cp->index++;
 		sc->sc_writes++;
 		g_io_request(cbp, cp);
 	}
 }
 
 static void
 g_raid3_gather(struct bio *pbp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct bio *xbp, *fbp, *cbp;
 	off_t atom, cadd, padd, left;
 
 	sc = pbp->bio_to->geom->softc;
 	/*
 	 * Find bio for which we have to calculate data.
 	 * While going through this path, check if all requests
 	 * succeeded, if not, deny whole request.
 	 * If we're in COMPLETE mode, we allow one request to fail,
 	 * so if we find one, we're sending it to the parity consumer.
 	 * If there are more failed requests, we deny whole request.
 	 */
 	xbp = fbp = NULL;
 	G_RAID3_FOREACH_BIO(pbp, cbp) {
 		if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) {
 			KASSERT(xbp == NULL, ("More than one parity bio."));
 			xbp = cbp;
 		}
 		if (cbp->bio_error == 0)
 			continue;
 		/*
 		 * Found failed request.
 		 */
 		if (fbp == NULL) {
 			if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) {
 				/*
 				 * We are already in degraded mode, so we can't
 				 * accept any failures.
 				 */
 				if (pbp->bio_error == 0)
 					pbp->bio_error = cbp->bio_error;
 			} else {
 				fbp = cbp;
 			}
 		} else {
 			/*
 			 * Next failed request, that's too many.
 			 */
 			if (pbp->bio_error == 0)
 				pbp->bio_error = fbp->bio_error;
 		}
 		disk = cbp->bio_caller2;
 		if (disk == NULL)
 			continue;
 		if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
 			disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
 			G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).",
 			    cbp->bio_error);
 		} else {
 			G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).",
 			    cbp->bio_error);
 		}
 		if (g_raid3_disconnect_on_failure &&
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
 			g_raid3_event_send(disk,
 			    G_RAID3_DISK_STATE_DISCONNECTED,
 			    G_RAID3_EVENT_DONTWAIT);
 		}
 	}
 	if (pbp->bio_error != 0)
 		goto finish;
 	if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY;
 		if (xbp != fbp)
 			g_raid3_replace_bio(xbp, fbp);
 		g_raid3_destroy_bio(sc, fbp);
 	} else if (fbp != NULL) {
 		struct g_consumer *cp;
 
 		/*
 		 * One request failed, so send the same request to
 		 * the parity consumer.
 		 */
 		disk = pbp->bio_driver2;
 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
 			pbp->bio_error = fbp->bio_error;
 			goto finish;
 		}
 		pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
 		pbp->bio_inbed--;
 		fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR);
 		if (disk->d_no == sc->sc_ndisks - 1)
 			fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
 		fbp->bio_error = 0;
 		fbp->bio_completed = 0;
 		fbp->bio_children = 0;
 		fbp->bio_inbed = 0;
 		cp = disk->d_consumer;
 		fbp->bio_caller2 = disk;
 		fbp->bio_to = cp->provider;
 		G_RAID3_LOGREQ(3, fbp, "Sending request (recover).");
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		cp->index++;
 		g_io_request(fbp, cp);
 		return;
 	}
 	if (xbp != NULL) {
 		/*
 		 * Calculate parity.
 		 */
 		G_RAID3_FOREACH_BIO(pbp, cbp) {
 			if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0)
 				continue;
 			g_raid3_xor(cbp->bio_data, xbp->bio_data,
 			    xbp->bio_length);
 		}
 		xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY;
 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) {
 			if (!g_raid3_is_zero(xbp)) {
 				g_raid3_parity_mismatch++;
 				pbp->bio_error = EIO;
 				goto finish;
 			}
 			g_raid3_destroy_bio(sc, xbp);
 		}
 	}
 	atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
 	cadd = padd = 0;
 	for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) {
 		G_RAID3_FOREACH_BIO(pbp, cbp) {
 			bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom);
 			pbp->bio_completed += atom;
 			padd += atom;
 		}
 		cadd += atom;
 	}
 finish:
 	if (pbp->bio_error == 0)
 		G_RAID3_LOGREQ(3, pbp, "Request finished.");
 	else {
 		if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0)
 			G_RAID3_LOGREQ(1, pbp, "Verification error.");
 		else
 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
 	}
 	pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK;
 	while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
 		g_raid3_destroy_bio(sc, cbp);
 	g_io_deliver(pbp, pbp->bio_error);
 }
 
 static void
 g_raid3_done(struct bio *bp)
 {
 	struct g_raid3_softc *sc;
 
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR;
 	G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error);
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_head(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	wakeup(&sc->sc_queue);
 }
 
 static void
 g_raid3_regular_request(struct bio *cbp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct bio *pbp;
 
 	g_topology_assert_not();
 
 	pbp = cbp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	cbp->bio_from->index--;
 	if (cbp->bio_cmd == BIO_WRITE)
 		sc->sc_writes--;
 	disk = cbp->bio_from->private;
 	if (disk == NULL) {
 		g_topology_lock();
 		g_raid3_kill_consumer(sc, cbp->bio_from);
 		g_topology_unlock();
 	}
 
 	G_RAID3_LOGREQ(3, cbp, "Request finished.");
 	pbp->bio_inbed++;
 	KASSERT(pbp->bio_inbed <= pbp->bio_children,
 	    ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed,
 	    pbp->bio_children));
 	if (pbp->bio_inbed != pbp->bio_children)
 		return;
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		g_raid3_gather(pbp);
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 	    {
 		int error = 0;
 
 		pbp->bio_completed = pbp->bio_length;
 		while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) {
 			if (cbp->bio_error == 0) {
 				g_raid3_destroy_bio(sc, cbp);
 				continue;
 			}
 
 			if (error == 0)
 				error = cbp->bio_error;
 			else if (pbp->bio_error == 0) {
 				/*
 				 * Next failed request, that's too many.
 				 */
 				pbp->bio_error = error;
 			}
 
 			disk = cbp->bio_caller2;
 			if (disk == NULL) {
 				g_raid3_destroy_bio(sc, cbp);
 				continue;
 			}
 
 			if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) {
 				disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN;
 				G_RAID3_LOGREQ(0, cbp,
 				    "Request failed (error=%d).",
 				    cbp->bio_error);
 			} else {
 				G_RAID3_LOGREQ(1, cbp,
 				    "Request failed (error=%d).",
 				    cbp->bio_error);
 			}
 			if (g_raid3_disconnect_on_failure &&
 			    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 				sc->sc_bump_id |= G_RAID3_BUMP_GENID;
 				g_raid3_event_send(disk,
 				    G_RAID3_DISK_STATE_DISCONNECTED,
 				    G_RAID3_EVENT_DONTWAIT);
 			}
 			g_raid3_destroy_bio(sc, cbp);
 		}
 		if (pbp->bio_error == 0)
 			G_RAID3_LOGREQ(3, pbp, "Request finished.");
 		else
 			G_RAID3_LOGREQ(0, pbp, "Request failed.");
 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED;
 		pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY;
 		bioq_remove(&sc->sc_inflight, pbp);
 		/* Release delayed sync requests if possible. */
 		g_raid3_sync_release(sc);
 		g_io_deliver(pbp, pbp->bio_error);
 		break;
 	    }
 	}
 }
 
 static void
 g_raid3_sync_done(struct bio *bp)
 {
 	struct g_raid3_softc *sc;
 
 	G_RAID3_LOGREQ(3, bp, "Synchronization request delivered.");
 	sc = bp->bio_from->geom->softc;
 	bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC;
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_head(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	wakeup(&sc->sc_queue);
 }
 
 static void
 g_raid3_flush(struct g_raid3_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_raid3_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	u_int i;
 
 	bioq_init(&queue);
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		disk = &sc->sc_disks[i];
 		if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE)
 			continue;
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			for (cbp = bioq_first(&queue); cbp != NULL;
 			    cbp = bioq_first(&queue)) {
 				bioq_remove(&queue, cbp);
 				g_destroy_bio(cbp);
 			}
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		cbp->bio_done = g_std_done;
 		cbp->bio_caller1 = disk;
 		cbp->bio_to = disk->d_consumer->provider;
 	}
 	for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
 		bioq_remove(&queue, cbp);
 		G_RAID3_LOGREQ(3, cbp, "Sending request.");
 		disk = cbp->bio_caller1;
 		cbp->bio_caller1 = NULL;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		g_io_request(cbp, disk->d_consumer);
 	}
 }
 
 static void
 g_raid3_start(struct bio *bp)
 {
 	struct g_raid3_softc *sc;
 
 	sc = bp->bio_to->geom->softc;
 	/*
 	 * If sc == NULL or there are no valid disks, provider's error
 	 * should be set and g_raid3_start() should not be called at all.
 	 */
 	KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 	    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE),
 	    ("Provider's error should be set (error=%d)(device=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 	G_RAID3_LOGREQ(3, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	case BIO_FLUSH:
 		g_raid3_flush(sc, bp);
 		return;
 	case BIO_GETATTR:
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 	mtx_lock(&sc->sc_queue_mtx);
 	bioq_insert_tail(&sc->sc_queue, bp);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	wakeup(sc);
 }
 
 /*
  * Return TRUE if the given request is colliding with a in-progress
  * synchronization request.
  */
 static int
 g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp)
 {
 	struct g_raid3_disk *disk;
 	struct bio *sbp;
 	off_t rstart, rend, sstart, send;
 	int i;
 
 	disk = sc->sc_syncdisk;
 	if (disk == NULL)
 		return (0);
 	rstart = bp->bio_offset;
 	rend = bp->bio_offset + bp->bio_length;
 	for (i = 0; i < g_raid3_syncreqs; i++) {
 		sbp = disk->d_sync.ds_bios[i];
 		if (sbp == NULL)
 			continue;
 		sstart = sbp->bio_offset;
 		send = sbp->bio_length;
 		if (sbp->bio_cmd == BIO_WRITE) {
 			sstart *= sc->sc_ndisks - 1;
 			send *= sc->sc_ndisks - 1;
 		}
 		send += sstart;
 		if (rend > sstart && rstart < send)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Return TRUE if the given sync request is colliding with a in-progress regular
  * request.
  */
 static int
 g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp)
 {
 	off_t rstart, rend, sstart, send;
 	struct bio *bp;
 
 	if (sc->sc_syncdisk == NULL)
 		return (0);
 	sstart = sbp->bio_offset;
 	send = sstart + sbp->bio_length;
 	TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) {
 		rstart = bp->bio_offset;
 		rend = bp->bio_offset + bp->bio_length;
 		if (rend > sstart && rstart < send)
 			return (1);
 	}
 	return (0);
 }
 
 /*
  * Puts request onto delayed queue.
  */
 static void
 g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp)
 {
 
 	G_RAID3_LOGREQ(2, bp, "Delaying request.");
 	bioq_insert_head(&sc->sc_regular_delayed, bp);
 }
 
 /*
  * Puts synchronization request onto delayed queue.
  */
 static void
 g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp)
 {
 
 	G_RAID3_LOGREQ(2, bp, "Delaying synchronization request.");
 	bioq_insert_tail(&sc->sc_sync_delayed, bp);
 }
 
 /*
  * Releases delayed regular requests which don't collide anymore with sync
  * requests.
  */
 static void
 g_raid3_regular_release(struct g_raid3_softc *sc)
 {
 	struct bio *bp, *bp2;
 
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) {
 		if (g_raid3_sync_collision(sc, bp))
 			continue;
 		bioq_remove(&sc->sc_regular_delayed, bp);
 		G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp);
 		mtx_lock(&sc->sc_queue_mtx);
 		bioq_insert_head(&sc->sc_queue, bp);
 #if 0
 		/*
 		 * wakeup() is not needed, because this function is called from
 		 * the worker thread.
 		 */
 		wakeup(&sc->sc_queue);
 #endif
 		mtx_unlock(&sc->sc_queue_mtx);
 	}
 }
 
 /*
  * Releases delayed sync requests which don't collide anymore with regular
  * requests.
  */
 static void
 g_raid3_sync_release(struct g_raid3_softc *sc)
 {
 	struct bio *bp, *bp2;
 
 	TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) {
 		if (g_raid3_regular_collision(sc, bp))
 			continue;
 		bioq_remove(&sc->sc_sync_delayed, bp);
 		G_RAID3_LOGREQ(2, bp,
 		    "Releasing delayed synchronization request.");
 		g_io_request(bp, bp->bio_from);
 	}
 }
 
 /*
  * Handle synchronization requests.
  * Every synchronization request is two-steps process: first, READ request is
  * send to active provider and then WRITE request (with read data) to the provider
  * being synchronized. When WRITE is finished, new synchronization request is
  * send.
  */
 static void
 g_raid3_sync_request(struct bio *bp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 
 	bp->bio_from->index--;
 	sc = bp->bio_from->geom->softc;
 	disk = bp->bio_from->private;
 	if (disk == NULL) {
 		sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 		g_topology_lock();
 		g_raid3_kill_consumer(sc, bp->bio_from);
 		g_topology_unlock();
 		free(bp->bio_data, M_RAID3);
 		g_destroy_bio(bp);
 		sx_xlock(&sc->sc_lock);
 		return;
 	}
 
 	/*
 	 * Synchronization request.
 	 */
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	    {
 		struct g_consumer *cp;
 		u_char *dst, *src;
 		off_t left;
 		u_int atom;
 
 		if (bp->bio_error != 0) {
 			G_RAID3_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			g_destroy_bio(bp);
 			return;
 		}
 		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
 		atom = sc->sc_sectorsize / (sc->sc_ndisks - 1);
 		dst = src = bp->bio_data;
 		if (disk->d_no == sc->sc_ndisks - 1) {
 			u_int n;
 
 			/* Parity component. */
 			for (left = bp->bio_length; left > 0;
 			    left -= sc->sc_sectorsize) {
 				bcopy(src, dst, atom);
 				src += atom;
 				for (n = 1; n < sc->sc_ndisks - 1; n++) {
 					g_raid3_xor(src, dst, atom);
 					src += atom;
 				}
 				dst += atom;
 			}
 		} else {
 			/* Regular component. */
 			src += atom * disk->d_no;
 			for (left = bp->bio_length; left > 0;
 			    left -= sc->sc_sectorsize) {
 				bcopy(src, dst, atom);
 				src += sc->sc_sectorsize;
 				dst += atom;
 			}
 		}
 		bp->bio_driver1 = bp->bio_driver2 = NULL;
 		bp->bio_pflags = 0;
 		bp->bio_offset /= sc->sc_ndisks - 1;
 		bp->bio_length /= sc->sc_ndisks - 1;
 		bp->bio_cmd = BIO_WRITE;
 		bp->bio_cflags = 0;
 		bp->bio_children = bp->bio_inbed = 0;
 		cp = disk->d_consumer;
 		KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 		    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name,
 		    cp->acr, cp->acw, cp->ace));
 		cp->index++;
 		g_io_request(bp, cp);
 		return;
 	    }
 	case BIO_WRITE:
 	    {
 		struct g_raid3_disk_sync *sync;
 		off_t boffset, moffset;
 		void *data;
 		int i;
 
 		if (bp->bio_error != 0) {
 			G_RAID3_LOGREQ(0, bp,
 			    "Synchronization request failed (error=%d).",
 			    bp->bio_error);
 			g_destroy_bio(bp);
 			sc->sc_bump_id |= G_RAID3_BUMP_GENID;
 			g_raid3_event_send(disk,
 			    G_RAID3_DISK_STATE_DISCONNECTED,
 			    G_RAID3_EVENT_DONTWAIT);
 			return;
 		}
 		G_RAID3_LOGREQ(3, bp, "Synchronization request finished.");
 		sync = &disk->d_sync;
 		if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) ||
 		    sync->ds_consumer == NULL ||
 		    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
 			/* Don't send more synchronization requests. */
 			sync->ds_inflight--;
 			if (sync->ds_bios != NULL) {
 				i = (int)(uintptr_t)bp->bio_caller1;
 				sync->ds_bios[i] = NULL;
 			}
 			free(bp->bio_data, M_RAID3);
 			g_destroy_bio(bp);
 			if (sync->ds_inflight > 0)
 				return;
 			if (sync->ds_consumer == NULL ||
 			    (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
 				return;
 			}
 			/*
 			 * Disk up-to-date, activate it.
 			 */
 			g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE,
 			    G_RAID3_EVENT_DONTWAIT);
 			return;
 		}
 
 		/* Send next synchronization request. */
 		data = bp->bio_data;
 		g_reset_bio(bp);
 		bp->bio_cmd = BIO_READ;
 		bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1);
 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
 		sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
 		bp->bio_done = g_raid3_sync_done;
 		bp->bio_data = data;
 		bp->bio_from = sync->ds_consumer;
 		bp->bio_to = sc->sc_provider;
 		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
 		sync->ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_raid3_regular_collision(sc, bp))
 			g_raid3_sync_delay(sc, bp);
 		else
 			g_io_request(bp, sync->ds_consumer);
 
 		/* Release delayed requests if possible. */
 		g_raid3_regular_release(sc);
 
 		/* Find the smallest offset. */
 		moffset = sc->sc_mediasize;
 		for (i = 0; i < g_raid3_syncreqs; i++) {
 			bp = sync->ds_bios[i];
 			boffset = bp->bio_offset;
 			if (bp->bio_cmd == BIO_WRITE)
 				boffset *= sc->sc_ndisks - 1;
 			if (boffset < moffset)
 				moffset = boffset;
 		}
 		if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) {
 			/* Update offset_done on every 100 blocks. */
 			sync->ds_offset_done = moffset;
 			g_raid3_update_metadata(disk);
 		}
 		return;
 	    }
 	default:
 		KASSERT(1 == 0, ("Invalid command here: %u (device=%s)",
 		    bp->bio_cmd, sc->sc_name));
 		break;
 	}
 }
 
 static int
 g_raid3_register_request(struct bio *pbp)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_disk *disk;
 	struct g_consumer *cp;
 	struct bio *cbp, *tmpbp;
 	off_t offset, length;
 	u_int n, ndisks;
 	int round_robin, verify;
 
 	ndisks = 0;
 	sc = pbp->bio_to->geom->softc;
 	if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 &&
 	    sc->sc_syncdisk == NULL) {
 		g_io_deliver(pbp, EIO);
 		return (0);
 	}
 	g_raid3_init_bio(pbp);
 	length = pbp->bio_length / (sc->sc_ndisks - 1);
 	offset = pbp->bio_offset / (sc->sc_ndisks - 1);
 	round_robin = verify = 0;
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY;
 			verify = 1;
 			ndisks = sc->sc_ndisks;
 		} else {
 			verify = 0;
 			ndisks = sc->sc_ndisks - 1;
 		}
 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 &&
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			round_robin = 1;
 		} else {
 			round_robin = 0;
 		}
 		KASSERT(!round_robin || !verify,
 		    ("ROUND-ROBIN and VERIFY are mutually exclusive."));
 		pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1];
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 		/*
 		 * Delay the request if it is colliding with a synchronization
 		 * request.
 		 */
 		if (g_raid3_sync_collision(sc, pbp)) {
 			g_raid3_regular_delay(sc, pbp);
 			return (0);
 		}
 
 		if (sc->sc_idle)
 			g_raid3_unidle(sc);
 		else
 			sc->sc_last_write = time_uptime;
 
 		ndisks = sc->sc_ndisks;
 		break;
 	}
 	for (n = 0; n < ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		cbp = g_raid3_clone_bio(sc, pbp);
 		if (cbp == NULL) {
 			while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL)
 				g_raid3_destroy_bio(sc, cbp);
 			/*
 			 * To prevent deadlock, we must run back up
 			 * with the ENOMEM for failed requests of any
 			 * of our consumers.  Our own sync requests
 			 * can stick around, as they are finite.
 			 */
 			if ((pbp->bio_cflags &
 			    G_RAID3_BIO_CFLAG_REGULAR) != 0) {
 				g_io_deliver(pbp, ENOMEM);
 				return (0);
 			}
 			return (ENOMEM);
 		}
 		cbp->bio_offset = offset;
 		cbp->bio_length = length;
 		cbp->bio_done = g_raid3_done;
 		switch (pbp->bio_cmd) {
 		case BIO_READ:
 			if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) {
 				/*
 				 * Replace invalid component with the parity
 				 * component.
 				 */
 				disk = &sc->sc_disks[sc->sc_ndisks - 1];
 				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
 				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
 			} else if (round_robin &&
 			    disk->d_no == sc->sc_round_robin) {
 				/*
 				 * In round-robin mode skip one data component
 				 * and use parity component when reading.
 				 */
 				pbp->bio_driver2 = disk;
 				disk = &sc->sc_disks[sc->sc_ndisks - 1];
 				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
 				sc->sc_round_robin++;
 				round_robin = 0;
 			} else if (verify && disk->d_no == sc->sc_ndisks - 1) {
 				cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY;
 			}
 			break;
 		case BIO_WRITE:
 		case BIO_DELETE:
 			if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
 			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 				if (n == ndisks - 1) {
 					/*
 					 * Active parity component, mark it as such.
 					 */
 					cbp->bio_cflags |=
 					    G_RAID3_BIO_CFLAG_PARITY;
 				}
 			} else {
 				pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED;
 				if (n == ndisks - 1) {
 					/*
 					 * Parity component is not connected,
 					 * so destroy its request.
 					 */
 					pbp->bio_pflags |=
 					    G_RAID3_BIO_PFLAG_NOPARITY;
 					g_raid3_destroy_bio(sc, cbp);
 					cbp = NULL;
 				} else {
 					cbp->bio_cflags |=
 					    G_RAID3_BIO_CFLAG_NODISK;
 					disk = NULL;
 				}
 			}
 			break;
 		}
 		if (cbp != NULL)
 			cbp->bio_caller2 = disk;
 	}
 	switch (pbp->bio_cmd) {
 	case BIO_READ:
 		if (round_robin) {
 			/*
 			 * If we are in round-robin mode and 'round_robin' is
 			 * still 1, it means, that we skipped parity component
 			 * for this read and must reset sc_round_robin field.
 			 */
 			sc->sc_round_robin = 0;
 		}
 		G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) {
 			disk = cbp->bio_caller2;
 			cp = disk->d_consumer;
 			cbp->bio_to = cp->provider;
 			G_RAID3_LOGREQ(3, cbp, "Sending request.");
 			KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 			    ("Consumer %s not opened (r%dw%de%d).",
 			    cp->provider->name, cp->acr, cp->acw, cp->ace));
 			cp->index++;
 			g_io_request(cbp, cp);
 		}
 		break;
 	case BIO_WRITE:
 	case BIO_DELETE:
 		/*
 		 * Put request onto inflight queue, so we can check if new
 		 * synchronization requests don't collide with it.
 		 */
 		bioq_insert_tail(&sc->sc_inflight, pbp);
 
 		/*
 		 * Bump syncid on first write.
 		 */
 		if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) {
 			sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
 			g_raid3_bump_syncid(sc);
 		}
 		g_raid3_scatter(pbp);
 		break;
 	}
 	return (0);
 }
 
 static int
 g_raid3_can_destroy(struct g_raid3_softc *sc)
 {
 	struct g_geom *gp;
 	struct g_consumer *cp;
 
 	g_topology_assert();
 	gp = sc->sc_geom;
 	if (gp->softc == NULL)
 		return (1);
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		if (g_raid3_is_busy(sc, cp))
 			return (0);
 	}
 	gp = sc->sc_sync.ds_geom;
 	LIST_FOREACH(cp, &gp->consumer, consumer) {
 		if (g_raid3_is_busy(sc, cp))
 			return (0);
 	}
 	G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.",
 	    sc->sc_name);
 	return (1);
 }
 
 static int
 g_raid3_try_destroy(struct g_raid3_softc *sc)
 {
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	if (sc->sc_rootmount != NULL) {
 		G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 		    sc->sc_rootmount);
 		root_mount_rel(sc->sc_rootmount);
 		sc->sc_rootmount = NULL;
 	}
 
 	g_topology_lock();
 	if (!g_raid3_can_destroy(sc)) {
 		g_topology_unlock();
 		return (0);
 	}
 	sc->sc_geom->softc = NULL;
 	sc->sc_sync.ds_geom->softc = NULL;
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) {
 		g_topology_unlock();
 		G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
 		    &sc->sc_worker);
 		/* Unlock sc_lock here, as it can be destroyed after wakeup. */
 		sx_xunlock(&sc->sc_lock);
 		wakeup(&sc->sc_worker);
 		sc->sc_worker = NULL;
 	} else {
 		g_topology_unlock();
 		g_raid3_destroy_device(sc);
 		free(sc->sc_disks, M_RAID3);
 		free(sc, M_RAID3);
 	}
 	return (1);
 }
 
 /*
  * Worker thread.
  */
 static void
 g_raid3_worker(void *arg)
 {
 	struct g_raid3_softc *sc;
 	struct g_raid3_event *ep;
 	struct bio *bp;
 	int timeout;
 
 	sc = arg;
 	thread_lock(curthread);
 	sched_prio(curthread, PRIBIO);
 	thread_unlock(curthread);
 
 	sx_xlock(&sc->sc_lock);
 	for (;;) {
 		G_RAID3_DEBUG(5, "%s: Let's see...", __func__);
 		/*
 		 * First take a look at events.
 		 * This is important to handle events before any I/O requests.
 		 */
 		ep = g_raid3_event_get(sc);
 		if (ep != NULL) {
 			g_raid3_event_remove(sc, ep);
 			if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) {
 				/* Update only device status. */
 				G_RAID3_DEBUG(3,
 				    "Running event for device %s.",
 				    sc->sc_name);
 				ep->e_error = 0;
 				g_raid3_update_device(sc, 1);
 			} else {
 				/* Update disk status. */
 				G_RAID3_DEBUG(3, "Running event for disk %s.",
 				     g_raid3_get_diskname(ep->e_disk));
 				ep->e_error = g_raid3_update_disk(ep->e_disk,
 				    ep->e_state);
 				if (ep->e_error == 0)
 					g_raid3_update_device(sc, 0);
 			}
 			if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) {
 				KASSERT(ep->e_error == 0,
 				    ("Error cannot be handled."));
 				g_raid3_event_free(ep);
 			} else {
 				ep->e_flags |= G_RAID3_EVENT_DONE;
 				G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__,
 				    ep);
 				mtx_lock(&sc->sc_events_mtx);
 				wakeup(ep);
 				mtx_unlock(&sc->sc_events_mtx);
 			}
 			if ((sc->sc_flags &
 			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
 				if (g_raid3_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_RAID3_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 			}
 			G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__);
 			continue;
 		}
 		/*
 		 * Check if we can mark array as CLEAN and if we can't take
 		 * how much seconds should we wait.
 		 */
 		timeout = g_raid3_idle(sc, -1);
 		/*
 		 * Now I/O requests.
 		 */
 		/* Get first request from the queue. */
 		mtx_lock(&sc->sc_queue_mtx);
 		bp = bioq_first(&sc->sc_queue);
 		if (bp == NULL) {
 			if ((sc->sc_flags &
 			    G_RAID3_DEVICE_FLAG_DESTROY) != 0) {
 				mtx_unlock(&sc->sc_queue_mtx);
 				if (g_raid3_try_destroy(sc)) {
 					curthread->td_pflags &= ~TDP_GEOM;
 					G_RAID3_DEBUG(1, "Thread exiting.");
 					kproc_exit(0);
 				}
 				mtx_lock(&sc->sc_queue_mtx);
 			}
 			sx_xunlock(&sc->sc_lock);
 			/*
 			 * XXX: We can miss an event here, because an event
 			 *      can be added without sx-device-lock and without
 			 *      mtx-queue-lock. Maybe I should just stop using
 			 *      dedicated mutex for events synchronization and
 			 *      stick with the queue lock?
 			 *      The event will hang here until next I/O request
 			 *      or next event is received.
 			 */
 			MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1",
 			    timeout * hz);
 			sx_xlock(&sc->sc_lock);
 			G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__);
 			continue;
 		}
 process:
 		bioq_remove(&sc->sc_queue, bp);
 		mtx_unlock(&sc->sc_queue_mtx);
 
 		if (bp->bio_from->geom == sc->sc_sync.ds_geom &&
 		    (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) {
 			g_raid3_sync_request(bp);	/* READ */
 		} else if (bp->bio_to != sc->sc_provider) {
 			if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0)
 				g_raid3_regular_request(bp);
 			else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0)
 				g_raid3_sync_request(bp);	/* WRITE */
 			else {
 				KASSERT(0,
 				    ("Invalid request cflags=0x%hx to=%s.",
 				    bp->bio_cflags, bp->bio_to->name));
 			}
 		} else if (g_raid3_register_request(bp) != 0) {
 			mtx_lock(&sc->sc_queue_mtx);
 			bioq_insert_head(&sc->sc_queue, bp);
 			/*
 			 * We are short in memory, let see if there are finished
 			 * request we can free.
 			 */
 			TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
 				if (bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR)
 					goto process;
 			}
 			/*
 			 * No finished regular request, so at least keep
 			 * synchronization running.
 			 */
 			TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
 				if (bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC)
 					goto process;
 			}
 			sx_xunlock(&sc->sc_lock);
 			MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, PRIBIO | PDROP,
 			    "r3:lowmem", hz / 10);
 			sx_xlock(&sc->sc_lock);
 		}
 		G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__);
 	}
 }
 
 static void
 g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk)
 {
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) != 0)
 		return;
 	if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) {
 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.",
 		    g_raid3_get_diskname(disk), sc->sc_name);
 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
 	} else if (sc->sc_idle &&
 	    (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) {
 		G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.",
 		    g_raid3_get_diskname(disk), sc->sc_name);
 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 	}
 }
 
 static void
 g_raid3_sync_start(struct g_raid3_softc *sc)
 {
 	struct g_raid3_disk *disk;
 	struct g_consumer *cp;
 	struct bio *bp;
 	int error;
 	u_int n;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
 	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
 	    sc->sc_state));
 	KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).",
 	    sc->sc_name, sc->sc_state));
 	disk = NULL;
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING)
 			continue;
 		disk = &sc->sc_disks[n];
 		break;
 	}
 	if (disk == NULL)
 		return;
 
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	cp = g_new_consumer(sc->sc_sync.ds_geom);
 	error = g_attach(cp, sc->sc_provider);
 	KASSERT(error == 0,
 	    ("Cannot attach to %s (error=%d).", sc->sc_name, error));
 	error = g_access(cp, 1, 0, 0);
 	KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error));
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 
 	G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name,
 	    g_raid3_get_diskname(disk));
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOFAILSYNC) == 0)
 		disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY;
 	KASSERT(disk->d_sync.ds_consumer == NULL,
 	    ("Sync consumer already exists (device=%s, disk=%s).",
 	    sc->sc_name, g_raid3_get_diskname(disk)));
 
 	disk->d_sync.ds_consumer = cp;
 	disk->d_sync.ds_consumer->private = disk;
 	disk->d_sync.ds_consumer->index = 0;
 	sc->sc_syncdisk = disk;
 
 	/*
 	 * Allocate memory for synchronization bios and initialize them.
 	 */
 	disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs,
 	    M_RAID3, M_WAITOK);
 	for (n = 0; n < g_raid3_syncreqs; n++) {
 		bp = g_alloc_bio();
 		disk->d_sync.ds_bios[n] = bp;
 		bp->bio_parent = NULL;
 		bp->bio_cmd = BIO_READ;
 		bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK);
 		bp->bio_cflags = 0;
 		bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1);
 		bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset);
 		disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1);
 		bp->bio_done = g_raid3_sync_done;
 		bp->bio_from = disk->d_sync.ds_consumer;
 		bp->bio_to = sc->sc_provider;
 		bp->bio_caller1 = (void *)(uintptr_t)n;
 	}
 
 	/* Set the number of in-flight synchronization requests. */
 	disk->d_sync.ds_inflight = g_raid3_syncreqs;
 
 	/*
 	 * Fire off first synchronization requests.
 	 */
 	for (n = 0; n < g_raid3_syncreqs; n++) {
 		bp = disk->d_sync.ds_bios[n];
 		G_RAID3_LOGREQ(3, bp, "Sending synchronization request.");
 		disk->d_sync.ds_consumer->index++;
 		/*
 		 * Delay the request if it is colliding with a regular request.
 		 */
 		if (g_raid3_regular_collision(sc, bp))
 			g_raid3_sync_delay(sc, bp);
 		else
 			g_io_request(bp, disk->d_sync.ds_consumer);
 	}
 }
 
 /*
  * Stop synchronization process.
  * type: 0 - synchronization finished
  *       1 - synchronization stopped
  */
 static void
 g_raid3_sync_stop(struct g_raid3_softc *sc, int type)
 {
 	struct g_raid3_disk *disk;
 	struct g_consumer *cp;
 
 	g_topology_assert_not();
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED,
 	    ("Device not in DEGRADED state (%s, %u).", sc->sc_name,
 	    sc->sc_state));
 	disk = sc->sc_syncdisk;
 	sc->sc_syncdisk = NULL;
 	KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name));
 	KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
 	    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 	    g_raid3_disk_state2str(disk->d_state)));
 	if (disk->d_sync.ds_consumer == NULL)
 		return;
 
 	if (type == 0) {
 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 	} else /* if (type == 1) */ {
 		G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 	}
 	free(disk->d_sync.ds_bios, M_RAID3);
 	disk->d_sync.ds_bios = NULL;
 	cp = disk->d_sync.ds_consumer;
 	disk->d_sync.ds_consumer = NULL;
 	disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 	sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */
 	g_topology_lock();
 	g_raid3_kill_consumer(sc, cp);
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 }
 
 static void
 g_raid3_launch_provider(struct g_raid3_softc *sc)
 {
 	struct g_provider *pp;
 	struct g_raid3_disk *disk;
 	int n;
 
 	sx_assert(&sc->sc_lock, SX_LOCKED);
 
 	g_topology_lock();
 	pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name);
 	pp->mediasize = sc->sc_mediasize;
 	pp->sectorsize = sc->sc_sectorsize;
 	pp->stripesize = 0;
 	pp->stripeoffset = 0;
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		disk = &sc->sc_disks[n];
 		if (disk->d_consumer && disk->d_consumer->provider &&
 		    disk->d_consumer->provider->stripesize > pp->stripesize) {
 			pp->stripesize = disk->d_consumer->provider->stripesize;
 			pp->stripeoffset = disk->d_consumer->provider->stripeoffset;
 		}
 	}
 	pp->stripesize *= sc->sc_ndisks - 1;
 	pp->stripeoffset *= sc->sc_ndisks - 1;
 	sc->sc_provider = pp;
 	g_error_provider(pp, 0);
 	g_topology_unlock();
 	G_RAID3_DEBUG(0, "Device %s launched (%u/%u).", pp->name,
 	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE), sc->sc_ndisks);
 
 	if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED)
 		g_raid3_sync_start(sc);
 }
 
 static void
 g_raid3_destroy_provider(struct g_raid3_softc *sc)
 {
 	struct bio *bp;
 
 	g_topology_assert_not();
 	KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).",
 	    sc->sc_name));
 
 	g_topology_lock();
 	g_error_provider(sc->sc_provider, ENXIO);
 	mtx_lock(&sc->sc_queue_mtx);
 	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
 		bioq_remove(&sc->sc_queue, bp);
 		g_io_deliver(bp, ENXIO);
 	}
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name,
 	    sc->sc_provider->name);
-	sc->sc_provider->flags |= G_PF_WITHER;
-	g_orphan_provider(sc->sc_provider, ENXIO);
+	g_wither_provider(sc->sc_provider, ENXIO);
 	g_topology_unlock();
 	sc->sc_provider = NULL;
 	if (sc->sc_syncdisk != NULL)
 		g_raid3_sync_stop(sc, 1);
 }
 
 static void
 g_raid3_go(void *arg)
 {
 	struct g_raid3_softc *sc;
 
 	sc = arg;
 	G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name);
 	g_raid3_event_send(sc, 0,
 	    G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE);
 }
 
 static u_int
 g_raid3_determine_state(struct g_raid3_disk *disk)
 {
 	struct g_raid3_softc *sc;
 	u_int state;
 
 	sc = disk->d_softc;
 	if (sc->sc_syncid == disk->d_sync.ds_syncid) {
 		if ((disk->d_flags &
 		    G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) {
 			/* Disk does not need synchronization. */
 			state = G_RAID3_DISK_STATE_ACTIVE;
 		} else {
 			if ((sc->sc_flags &
 			     G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
 			    (disk->d_flags &
 			     G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
 				/*
 				 * We can start synchronization from
 				 * the stored offset.
 				 */
 				state = G_RAID3_DISK_STATE_SYNCHRONIZING;
 			} else {
 				state = G_RAID3_DISK_STATE_STALE;
 			}
 		}
 	} else if (disk->d_sync.ds_syncid < sc->sc_syncid) {
 		/*
 		 * Reset all synchronization data for this disk,
 		 * because if it even was synchronized, it was
 		 * synchronized to disks with different syncid.
 		 */
 		disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		disk->d_sync.ds_syncid = sc->sc_syncid;
 		if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 ||
 		    (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) {
 			state = G_RAID3_DISK_STATE_SYNCHRONIZING;
 		} else {
 			state = G_RAID3_DISK_STATE_STALE;
 		}
 	} else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ {
 		/*
 		 * Not good, NOT GOOD!
 		 * It means that device was started on stale disks
 		 * and more fresh disk just arrive.
 		 * If there were writes, device is broken, sorry.
 		 * I think the best choice here is don't touch
 		 * this disk and inform the user loudly.
 		 */
 		G_RAID3_DEBUG(0, "Device %s was started before the freshest "
 		    "disk (%s) arrives!! It will not be connected to the "
 		    "running device.", sc->sc_name,
 		    g_raid3_get_diskname(disk));
 		g_raid3_destroy_disk(disk);
 		state = G_RAID3_DISK_STATE_NONE;
 		/* Return immediately, because disk was destroyed. */
 		return (state);
 	}
 	G_RAID3_DEBUG(3, "State for %s disk: %s.",
 	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(state));
 	return (state);
 }
 
 /*
  * Update device state.
  */
 static void
 g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force)
 {
 	struct g_raid3_disk *disk;
 	u_int state;
 
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	switch (sc->sc_state) {
 	case G_RAID3_DEVICE_STATE_STARTING:
 	    {
 		u_int n, ndirty, ndisks, genid, syncid;
 
 		KASSERT(sc->sc_provider == NULL,
 		    ("Non-NULL provider in STARTING state (%s).", sc->sc_name));
 		/*
 		 * Are we ready? We are, if all disks are connected or
 		 * one disk is missing and 'force' is true.
 		 */
 		if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) {
 			if (!force)
 				callout_drain(&sc->sc_callout);
 		} else {
 			if (force) {
 				/*
 				 * Timeout expired, so destroy device.
 				 */
 				sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
 				G_RAID3_DEBUG(1, "root_mount_rel[%u] %p",
 				    __LINE__, sc->sc_rootmount);
 				root_mount_rel(sc->sc_rootmount);
 				sc->sc_rootmount = NULL;
 			}
 			return;
 		}
 
 		/*
 		 * Find the biggest genid.
 		 */
 		genid = 0;
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			disk = &sc->sc_disks[n];
 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 				continue;
 			if (disk->d_genid > genid)
 				genid = disk->d_genid;
 		}
 		sc->sc_genid = genid;
 		/*
 		 * Remove all disks without the biggest genid.
 		 */
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			disk = &sc->sc_disks[n];
 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 				continue;
 			if (disk->d_genid < genid) {
 				G_RAID3_DEBUG(0,
 				    "Component %s (device %s) broken, skipping.",
 				    g_raid3_get_diskname(disk), sc->sc_name);
 				g_raid3_destroy_disk(disk);
 			}
 		}
 
 		/*
 		 * There must be at least 'sc->sc_ndisks - 1' components
 		 * with the same syncid and without SYNCHRONIZING flag.
 		 */
 
 		/*
 		 * Find the biggest syncid, number of valid components and
 		 * number of dirty components.
 		 */
 		ndirty = ndisks = syncid = 0;
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			disk = &sc->sc_disks[n];
 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 				continue;
 			if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0)
 				ndirty++;
 			if (disk->d_sync.ds_syncid > syncid) {
 				syncid = disk->d_sync.ds_syncid;
 				ndisks = 0;
 			} else if (disk->d_sync.ds_syncid < syncid) {
 				continue;
 			}
 			if ((disk->d_flags &
 			    G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) {
 				continue;
 			}
 			ndisks++;
 		}
 		/*
 		 * Do we have enough valid components?
 		 */
 		if (ndisks + 1 < sc->sc_ndisks) {
 			G_RAID3_DEBUG(0,
 			    "Device %s is broken, too few valid components.",
 			    sc->sc_name);
 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
 			return;
 		}
 		/*
 		 * If there is one DIRTY component and all disks are present,
 		 * mark it for synchronization. If there is more than one DIRTY
 		 * component, mark parity component for synchronization.
 		 */
 		if (ndisks == sc->sc_ndisks && ndirty == 1) {
 			for (n = 0; n < sc->sc_ndisks; n++) {
 				disk = &sc->sc_disks[n];
 				if ((disk->d_flags &
 				    G_RAID3_DISK_FLAG_DIRTY) == 0) {
 					continue;
 				}
 				disk->d_flags |=
 				    G_RAID3_DISK_FLAG_SYNCHRONIZING;
 			}
 		} else if (ndisks == sc->sc_ndisks && ndirty > 1) {
 			disk = &sc->sc_disks[sc->sc_ndisks - 1];
 			disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING;
 		}
 
 		sc->sc_syncid = syncid;
 		if (force) {
 			/* Remember to bump syncid on first write. */
 			sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
 		}
 		if (ndisks == sc->sc_ndisks)
 			state = G_RAID3_DEVICE_STATE_COMPLETE;
 		else /* if (ndisks == sc->sc_ndisks - 1) */
 			state = G_RAID3_DEVICE_STATE_DEGRADED;
 		G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.",
 		    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_device_state2str(state));
 		sc->sc_state = state;
 		for (n = 0; n < sc->sc_ndisks; n++) {
 			disk = &sc->sc_disks[n];
 			if (disk->d_state == G_RAID3_DISK_STATE_NODISK)
 				continue;
 			state = g_raid3_determine_state(disk);
 			g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT);
 			if (state == G_RAID3_DISK_STATE_STALE)
 				sc->sc_bump_id |= G_RAID3_BUMP_SYNCID;
 		}
 		break;
 	    }
 	case G_RAID3_DEVICE_STATE_DEGRADED:
 		/*
 		 * Genid need to be bumped immediately, so do it here.
 		 */
 		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
 			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
 			g_raid3_bump_genid(sc);
 		}
 
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
 			return;
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) <
 		    sc->sc_ndisks - 1) {
 			if (sc->sc_provider != NULL)
 				g_raid3_destroy_provider(sc);
 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
 			return;
 		}
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
 		    sc->sc_ndisks) {
 			state = G_RAID3_DEVICE_STATE_COMPLETE;
 			G_RAID3_DEBUG(1,
 			    "Device %s state changed from %s to %s.",
 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
 			    g_raid3_device_state2str(state));
 			sc->sc_state = state;
 		}
 		if (sc->sc_provider == NULL)
 			g_raid3_launch_provider(sc);
 		if (sc->sc_rootmount != NULL) {
 			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 			    sc->sc_rootmount);
 			root_mount_rel(sc->sc_rootmount);
 			sc->sc_rootmount = NULL;
 		}
 		break;
 	case G_RAID3_DEVICE_STATE_COMPLETE:
 		/*
 		 * Genid need to be bumped immediately, so do it here.
 		 */
 		if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) {
 			sc->sc_bump_id &= ~G_RAID3_BUMP_GENID;
 			g_raid3_bump_genid(sc);
 		}
 
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0)
 			return;
 		KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >=
 		    sc->sc_ndisks - 1,
 		    ("Too few ACTIVE components in COMPLETE state (device %s).",
 		    sc->sc_name));
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) ==
 		    sc->sc_ndisks - 1) {
 			state = G_RAID3_DEVICE_STATE_DEGRADED;
 			G_RAID3_DEBUG(1,
 			    "Device %s state changed from %s to %s.",
 			    sc->sc_name, g_raid3_device_state2str(sc->sc_state),
 			    g_raid3_device_state2str(state));
 			sc->sc_state = state;
 		}
 		if (sc->sc_provider == NULL)
 			g_raid3_launch_provider(sc);
 		if (sc->sc_rootmount != NULL) {
 			G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__,
 			    sc->sc_rootmount);
 			root_mount_rel(sc->sc_rootmount);
 			sc->sc_rootmount = NULL;
 		}
 		break;
 	default:
 		KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state)));
 		break;
 	}
 }
 
 /*
  * Update disk state and device state if needed.
  */
 #define	DISK_STATE_CHANGED()	G_RAID3_DEBUG(1,			\
 	"Disk %s state changed from %s to %s (device %s).",		\
 	g_raid3_get_diskname(disk),					\
 	g_raid3_disk_state2str(disk->d_state),				\
 	g_raid3_disk_state2str(state), sc->sc_name)
 static int
 g_raid3_update_disk(struct g_raid3_disk *disk, u_int state)
 {
 	struct g_raid3_softc *sc;
 
 	sc = disk->d_softc;
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 again:
 	G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.",
 	    g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state),
 	    g_raid3_disk_state2str(state));
 	switch (state) {
 	case G_RAID3_DISK_STATE_NEW:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk arrive.
 		 */
 		/* Previous state should be NONE. */
 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE,
 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_state = state;
 		G_RAID3_DEBUG(1, "Device %s: provider %s detected.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 		if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING)
 			break;
 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		state = g_raid3_determine_state(disk);
 		if (state != G_RAID3_DISK_STATE_NONE)
 			goto again;
 		break;
 	case G_RAID3_DISK_STATE_ACTIVE:
 		/*
 		 * Possible scenarios:
 		 * 1. New disk does not need synchronization.
 		 * 2. Synchronization process finished successfully.
 		 */
 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		/* Previous state should be NEW or SYNCHRONIZING. */
 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW ||
 		    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING;
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC;
 			g_raid3_sync_stop(sc, 0);
 		}
 		disk->d_state = state;
 		disk->d_sync.ds_offset = 0;
 		disk->d_sync.ds_offset_done = 0;
 		g_raid3_update_idle(sc, disk);
 		g_raid3_update_metadata(disk);
 		G_RAID3_DEBUG(1, "Device %s: provider %s activated.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 		break;
 	case G_RAID3_DISK_STATE_STALE:
 		/*
 		 * Possible scenarios:
 		 * 1. Stale disk was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		/*
 		 * STALE state is only possible if device is marked
 		 * NOAUTOSYNC.
 		 */
 		KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		g_raid3_update_metadata(disk);
 		G_RAID3_DEBUG(0, "Device %s: provider %s is stale.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 		break;
 	case G_RAID3_DISK_STATE_SYNCHRONIZING:
 		/*
 		 * Possible scenarios:
 		 * 1. Disk which needs synchronization was connected.
 		 */
 		/* Previous state should be NEW. */
 		KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
 		    ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE,
 		    ("Wrong device state (%s, %s, %s, %s).", sc->sc_name,
 		    g_raid3_device_state2str(sc->sc_state),
 		    g_raid3_get_diskname(disk),
 		    g_raid3_disk_state2str(disk->d_state)));
 		DISK_STATE_CHANGED();
 
 		if (disk->d_state == G_RAID3_DISK_STATE_NEW)
 			disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY;
 		disk->d_state = state;
 		if (sc->sc_provider != NULL) {
 			g_raid3_sync_start(sc);
 			g_raid3_update_metadata(disk);
 		}
 		break;
 	case G_RAID3_DISK_STATE_DISCONNECTED:
 		/*
 		 * Possible scenarios:
 		 * 1. Device wasn't running yet, but disk disappear.
 		 * 2. Disk was active and disapppear.
 		 * 3. Disk disappear during synchronization process.
 		 */
 		if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED ||
 		    sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) {
 			/*
 			 * Previous state should be ACTIVE, STALE or
 			 * SYNCHRONIZING.
 			 */
 			KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE ||
 			    disk->d_state == G_RAID3_DISK_STATE_STALE ||
 			    disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING,
 			    ("Wrong disk state (%s, %s).",
 			    g_raid3_get_diskname(disk),
 			    g_raid3_disk_state2str(disk->d_state)));
 		} else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) {
 			/* Previous state should be NEW. */
 			KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW,
 			    ("Wrong disk state (%s, %s).",
 			    g_raid3_get_diskname(disk),
 			    g_raid3_disk_state2str(disk->d_state)));
 			/*
 			 * Reset bumping syncid if disk disappeared in STARTING
 			 * state.
 			 */
 			if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0)
 				sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID;
 #ifdef	INVARIANTS
 		} else {
 			KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).",
 			    sc->sc_name,
 			    g_raid3_device_state2str(sc->sc_state),
 			    g_raid3_get_diskname(disk),
 			    g_raid3_disk_state2str(disk->d_state)));
 #endif
 		}
 		DISK_STATE_CHANGED();
 		G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.",
 		    sc->sc_name, g_raid3_get_diskname(disk));
 
 		g_raid3_destroy_disk(disk);
 		break;
 	default:
 		KASSERT(1 == 0, ("Unknown state (%u).", state));
 		break;
 	}
 	return (0);
 }
 #undef	DISK_STATE_CHANGED
 
 int
 g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	/* Metadata are stored on last sector. */
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL) {
 		G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).",
 		    cp->provider->name, error);
 		return (error);
 	}
 
 	/* Decode metadata. */
 	error = raid3_metadata_decode(buf, md);
 	g_free(buf);
 	if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0)
 		return (EINVAL);
 	if (md->md_version > G_RAID3_VERSION) {
 		G_RAID3_DEBUG(0,
 		    "Kernel module is too old to handle metadata from %s.",
 		    cp->provider->name);
 		return (EINVAL);
 	}
 	if (error != 0) {
 		G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.",
 		    cp->provider->name);
 		return (error);
 	}
 	if (md->md_sectorsize > MAXPHYS) {
 		G_RAID3_DEBUG(0, "The blocksize is too big.");
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static int
 g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp,
     struct g_raid3_metadata *md)
 {
 
 	if (md->md_no >= sc->sc_ndisks) {
 		G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.",
 		    pp->name, md->md_no);
 		return (EINVAL);
 	}
 	if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) {
 		G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.",
 		    pp->name, md->md_no);
 		return (EEXIST);
 	}
 	if (md->md_all != sc->sc_ndisks) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_all", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mediasize % md->md_sectorsize) != 0) {
 		G_RAID3_DEBUG(1, "Invalid metadata (mediasize %% sectorsize != "
 		    "0) on disk %s (device %s), skipping.", pp->name,
 		    sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_mediasize != sc->sc_mediasize) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_mediasize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_mediasize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) {
 		G_RAID3_DEBUG(1,
 		    "Invalid size of disk %s (device %s), skipping.", pp->name,
 		    sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_sectorsize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if (md->md_sectorsize != sc->sc_sectorsize) {
 		G_RAID3_DEBUG(1,
 		    "Invalid '%s' field on disk %s (device %s), skipping.",
 		    "md_sectorsize", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((sc->sc_sectorsize % pp->sectorsize) != 0) {
 		G_RAID3_DEBUG(1,
 		    "Invalid sector size of disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) {
 		G_RAID3_DEBUG(1,
 		    "Invalid device flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 &&
 	    (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) {
 		/*
 		 * VERIFY and ROUND-ROBIN options are mutally exclusive.
 		 */
 		G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on "
 		    "disk %s (device %s), skipping.", pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) {
 		G_RAID3_DEBUG(1,
 		    "Invalid disk flags on disk %s (device %s), skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	return (0);
 }
 
 int
 g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp,
     struct g_raid3_metadata *md)
 {
 	struct g_raid3_disk *disk;
 	int error;
 
 	g_topology_assert_not();
 	G_RAID3_DEBUG(2, "Adding disk %s.", pp->name);
 
 	error = g_raid3_check_metadata(sc, pp, md);
 	if (error != 0)
 		return (error);
 	if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING &&
 	    md->md_genid < sc->sc_genid) {
 		G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.",
 		    pp->name, sc->sc_name);
 		return (EINVAL);
 	}
 	disk = g_raid3_init_disk(sc, pp, md, &error);
 	if (disk == NULL)
 		return (error);
 	error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW,
 	    G_RAID3_EVENT_WAIT);
 	if (error != 0)
 		return (error);
 	if (md->md_version < G_RAID3_VERSION) {
 		G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).",
 		    pp->name, md->md_version, G_RAID3_VERSION);
 		g_raid3_update_metadata(disk);
 	}
 	return (0);
 }
 
 static void
 g_raid3_destroy_delayed(void *arg, int flag)
 {
 	struct g_raid3_softc *sc;
 	int error;
 
 	if (flag == EV_CANCEL) {
 		G_RAID3_DEBUG(1, "Destroying canceled.");
 		return;
 	}
 	sc = arg;
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) == 0,
 	    ("DESTROY flag set on %s.", sc->sc_name));
 	KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0,
 	    ("DESTROYING flag not set on %s.", sc->sc_name));
 	G_RAID3_DEBUG(0, "Destroying %s (delayed).", sc->sc_name);
 	error = g_raid3_destroy(sc, G_RAID3_DESTROY_SOFT);
 	if (error != 0) {
 		G_RAID3_DEBUG(0, "Cannot destroy %s.", sc->sc_name);
 		sx_xunlock(&sc->sc_lock);
 	}
 	g_topology_lock();
 }
 
 static int
 g_raid3_access(struct g_provider *pp, int acr, int acw, int ace)
 {
 	struct g_raid3_softc *sc;
 	int dcr, dcw, dce, error = 0;
 
 	g_topology_assert();
 	G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr,
 	    acw, ace);
 
 	sc = pp->geom->softc;
 	if (sc == NULL && acr <= 0 && acw <= 0 && ace <= 0)
 		return (0);
 	KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name));
 
 	dcr = pp->acr + acr;
 	dcw = pp->acw + acw;
 	dce = pp->ace + ace;
 
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0 ||
 	    g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) {
 		if (acr > 0 || acw > 0 || ace > 0)
 			error = ENXIO;
 		goto end;
 	}
 	if (dcw == 0)
 		g_raid3_idle(sc, dcw);
 	if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROYING) != 0) {
 		if (acr > 0 || acw > 0 || ace > 0) {
 			error = ENXIO;
 			goto end;
 		}
 		if (dcr == 0 && dcw == 0 && dce == 0) {
 			g_post_event(g_raid3_destroy_delayed, sc, M_WAITOK,
 			    sc, NULL);
 		}
 	}
 end:
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (error);
 }
 
 static struct g_geom *
 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md)
 {
 	struct g_raid3_softc *sc;
 	struct g_geom *gp;
 	int error, timeout;
 	u_int n;
 
 	g_topology_assert();
 	G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
 
 	/* One disk is minimum. */
 	if (md->md_all < 1)
 		return (NULL);
 	/*
 	 * Action geom.
 	 */
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO);
 	sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3,
 	    M_WAITOK | M_ZERO);
 	gp->start = g_raid3_start;
 	gp->orphan = g_raid3_orphan;
 	gp->access = g_raid3_access;
 	gp->dumpconf = g_raid3_dumpconf;
 
 	sc->sc_id = md->md_id;
 	sc->sc_mediasize = md->md_mediasize;
 	sc->sc_sectorsize = md->md_sectorsize;
 	sc->sc_ndisks = md->md_all;
 	sc->sc_round_robin = 0;
 	sc->sc_flags = md->md_mflags;
 	sc->sc_bump_id = 0;
 	sc->sc_idle = 1;
 	sc->sc_last_write = time_uptime;
 	sc->sc_writes = 0;
 	for (n = 0; n < sc->sc_ndisks; n++) {
 		sc->sc_disks[n].d_softc = sc;
 		sc->sc_disks[n].d_no = n;
 		sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK;
 	}
 	sx_init(&sc->sc_lock, "graid3:lock");
 	bioq_init(&sc->sc_queue);
 	mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF);
 	bioq_init(&sc->sc_regular_delayed);
 	bioq_init(&sc->sc_inflight);
 	bioq_init(&sc->sc_sync_delayed);
 	TAILQ_INIT(&sc->sc_events);
 	mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF);
 	callout_init(&sc->sc_callout, 1);
 	sc->sc_state = G_RAID3_DEVICE_STATE_STARTING;
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	sc->sc_provider = NULL;
 	/*
 	 * Synchronization geom.
 	 */
 	gp = g_new_geomf(mp, "%s.sync", md->md_name);
 	gp->softc = sc;
 	gp->orphan = g_raid3_orphan;
 	sc->sc_sync.ds_geom = gp;
 
 	if (!g_raid3_use_malloc) {
 		sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k",
 		    65536, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0;
 		sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k;
 		sc->sc_zones[G_RAID3_ZONE_64K].sz_requested =
 		    sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0;
 		sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k",
 		    16384, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0;
 		sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k;
 		sc->sc_zones[G_RAID3_ZONE_16K].sz_requested =
 		    sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0;
 		sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k",
 		    4096, g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 		sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0;
 		sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k;
 		sc->sc_zones[G_RAID3_ZONE_4K].sz_requested =
 		    sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0;
 	}
 
 	error = kproc_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0,
 	    "g_raid3 %s", md->md_name);
 	if (error != 0) {
 		G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.",
 		    sc->sc_name);
 		if (!g_raid3_use_malloc) {
 			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone);
 			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone);
 			uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone);
 		}
 		g_destroy_geom(sc->sc_sync.ds_geom);
 		mtx_destroy(&sc->sc_events_mtx);
 		mtx_destroy(&sc->sc_queue_mtx);
 		sx_destroy(&sc->sc_lock);
 		g_destroy_geom(sc->sc_geom);
 		free(sc->sc_disks, M_RAID3);
 		free(sc, M_RAID3);
 		return (NULL);
 	}
 
 	G_RAID3_DEBUG(1, "Device %s created (%u components, id=%u).",
 	    sc->sc_name, sc->sc_ndisks, sc->sc_id);
 
 	sc->sc_rootmount = root_mount_hold("GRAID3");
 	G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
 
 	/*
 	 * Run timeout.
 	 */
 	timeout = atomic_load_acq_int(&g_raid3_timeout);
 	callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc);
 	return (sc->sc_geom);
 }
 
 int
 g_raid3_destroy(struct g_raid3_softc *sc, int how)
 {
 	struct g_provider *pp;
 
 	g_topology_assert_not();
 	if (sc == NULL)
 		return (ENXIO);
 	sx_assert(&sc->sc_lock, SX_XLOCKED);
 
 	pp = sc->sc_provider;
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		switch (how) {
 		case G_RAID3_DESTROY_SOFT:
 			G_RAID3_DEBUG(1,
 			    "Device %s is still open (r%dw%de%d).", pp->name,
 			    pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		case G_RAID3_DESTROY_DELAYED:
 			G_RAID3_DEBUG(1,
 			    "Device %s will be destroyed on last close.",
 			    pp->name);
 			if (sc->sc_syncdisk != NULL)
 				g_raid3_sync_stop(sc, 1);
 			sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROYING;
 			return (EBUSY);
 		case G_RAID3_DESTROY_HARD:
 			G_RAID3_DEBUG(1, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 			break;
 		}
 	}
 
 	g_topology_lock();
 	if (sc->sc_geom->softc == NULL) {
 		g_topology_unlock();
 		return (0);
 	}
 	sc->sc_geom->softc = NULL;
 	sc->sc_sync.ds_geom->softc = NULL;
 	g_topology_unlock();
 
 	sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY;
 	sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT;
 	G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc);
 	sx_xunlock(&sc->sc_lock);
 	mtx_lock(&sc->sc_queue_mtx);
 	wakeup(sc);
 	wakeup(&sc->sc_queue);
 	mtx_unlock(&sc->sc_queue_mtx);
 	G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker);
 	while (sc->sc_worker != NULL)
 		tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5);
 	G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker);
 	sx_xlock(&sc->sc_lock);
 	g_raid3_destroy_device(sc);
 	free(sc->sc_disks, M_RAID3);
 	free(sc, M_RAID3);
 	return (0);
 }
 
 static void
 g_raid3_taste_orphan(struct g_consumer *cp)
 {
 
 	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
 	    cp->provider->name));
 }
 
 static struct g_geom *
 g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_raid3_metadata md;
 	struct g_raid3_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	G_RAID3_DEBUG(2, "Tasting %s.", pp->name);
 
 	gp = g_new_geomf(mp, "raid3:taste");
 	/* This orphan function should be never called. */
 	gp->orphan = g_raid3_taste_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_raid3_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
 		return (NULL);
 	if (g_raid3_debug >= 2)
 		raid3_metadata_dump(&md);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_sync.ds_geom == gp)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		if (md.md_id != sc->sc_id) {
 			G_RAID3_DEBUG(0, "Device %s already configured.",
 			    sc->sc_name);
 			return (NULL);
 		}
 		break;
 	}
 	if (gp == NULL) {
 		gp = g_raid3_create(mp, &md);
 		if (gp == NULL) {
 			G_RAID3_DEBUG(0, "Cannot create device %s.",
 			    md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 	}
 	G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 	g_topology_unlock();
 	sx_xlock(&sc->sc_lock);
 	error = g_raid3_add_disk(sc, pp, &md);
 	if (error != 0) {
 		G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
 		    pp->name, gp->name, error);
 		if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) ==
 		    sc->sc_ndisks) {
 			g_cancel_event(sc);
 			g_raid3_destroy(sc, G_RAID3_DESTROY_HARD);
 			g_topology_lock();
 			return (NULL);
 		}
 		gp = NULL;
 	}
 	sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (gp);
 }
 
 static int
 g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
     struct g_geom *gp)
 {
 	struct g_raid3_softc *sc;
 	int error;
 
 	g_topology_unlock();
 	sc = gp->softc;
 	sx_xlock(&sc->sc_lock);
 	g_cancel_event(sc);
 	error = g_raid3_destroy(gp->softc, G_RAID3_DESTROY_SOFT);
 	if (error != 0)
 		sx_xunlock(&sc->sc_lock);
 	g_topology_lock();
 	return (error);
 }
 
 static void
 g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_raid3_softc *sc;
 
 	g_topology_assert();
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	/* Skip synchronization geom. */
 	if (gp == sc->sc_sync.ds_geom)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		struct g_raid3_disk *disk;
 
 		disk = cp->private;
 		if (disk == NULL)
 			return;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		sbuf_printf(sb, "%s<Type>", indent);
 		if (disk->d_no == sc->sc_ndisks - 1)
 			sbuf_printf(sb, "PARITY");
 		else
 			sbuf_printf(sb, "DATA");
 		sbuf_printf(sb, "</Type>\n");
 		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
 		    (u_int)disk->d_no);
 		if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) {
 			sbuf_printf(sb, "%s<Synchronized>", indent);
 			if (disk->d_sync.ds_offset == 0)
 				sbuf_printf(sb, "0%%");
 			else {
 				sbuf_printf(sb, "%u%%",
 				    (u_int)((disk->d_sync.ds_offset * 100) /
 				    (sc->sc_mediasize / (sc->sc_ndisks - 1))));
 			}
 			sbuf_printf(sb, "</Synchronized>\n");
 			if (disk->d_sync.ds_offset > 0) {
 				sbuf_printf(sb, "%s<BytesSynced>%jd"
 				    "</BytesSynced>\n", indent,
 				    (intmax_t)disk->d_sync.ds_offset);
 			}
 		}
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent,
 		    disk->d_sync.ds_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (disk->d_flags == 0)
 			sbuf_printf(sb, "NONE");
 		else {
 			int first = 1;
 
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((disk->d_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_printf(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_printf(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY");
 			ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED");
 			ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING,
 			    "SYNCHRONIZING");
 			ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC");
 			ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN");
 #undef	ADD_FLAG
 		}
 		sbuf_printf(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 		    g_raid3_disk_state2str(disk->d_state));
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	} else {
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		if (!g_raid3_use_malloc) {
 			sbuf_printf(sb,
 			    "%s<Zone4kRequested>%u</Zone4kRequested>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_4K].sz_requested);
 			sbuf_printf(sb,
 			    "%s<Zone4kFailed>%u</Zone4kFailed>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_4K].sz_failed);
 			sbuf_printf(sb,
 			    "%s<Zone16kRequested>%u</Zone16kRequested>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_16K].sz_requested);
 			sbuf_printf(sb,
 			    "%s<Zone16kFailed>%u</Zone16kFailed>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_16K].sz_failed);
 			sbuf_printf(sb,
 			    "%s<Zone64kRequested>%u</Zone64kRequested>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_64K].sz_requested);
 			sbuf_printf(sb,
 			    "%s<Zone64kFailed>%u</Zone64kFailed>\n", indent,
 			    sc->sc_zones[G_RAID3_ZONE_64K].sz_failed);
 		}
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid);
 		sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid);
 		sbuf_printf(sb, "%s<Flags>", indent);
 		if (sc->sc_flags == 0)
 			sbuf_printf(sb, "NONE");
 		else {
 			int first = 1;
 
 #define	ADD_FLAG(flag, name)	do {					\
 	if ((sc->sc_flags & (flag)) != 0) {				\
 		if (!first)						\
 			sbuf_printf(sb, ", ");				\
 		else							\
 			first = 0;					\
 		sbuf_printf(sb, name);					\
 	}								\
 } while (0)
 			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOFAILSYNC, "NOFAILSYNC");
 			ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC");
 			ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN,
 			    "ROUND-ROBIN");
 			ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY");
 #undef	ADD_FLAG
 		}
 		sbuf_printf(sb, "</Flags>\n");
 		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
 		    sc->sc_ndisks);
 		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
 		    g_raid3_device_state2str(sc->sc_state));
 		sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 }
 
 static void
 g_raid3_shutdown_post_sync(void *arg, int howto)
 {
 	struct g_class *mp;
 	struct g_geom *gp, *gp2;
 	struct g_raid3_softc *sc;
 	int error;
 
 	mp = arg;
 	g_topology_lock();
 	g_raid3_shutdown = 1;
 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
 		if ((sc = gp->softc) == NULL)
 			continue;
 		/* Skip synchronization geom. */
 		if (gp == sc->sc_sync.ds_geom)
 			continue;
 		g_topology_unlock();
 		sx_xlock(&sc->sc_lock);
 		g_raid3_idle(sc, -1);
 		g_cancel_event(sc);
 		error = g_raid3_destroy(sc, G_RAID3_DESTROY_DELAYED);
 		if (error != 0)
 			sx_xunlock(&sc->sc_lock);
 		g_topology_lock();
 	}
 	g_topology_unlock();
 }
 
 static void
 g_raid3_init(struct g_class *mp)
 {
 
 	g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync,
 	    g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST);
 	if (g_raid3_post_sync == NULL)
 		G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event.");
 }
 
 static void
 g_raid3_fini(struct g_class *mp)
 {
 
 	if (g_raid3_post_sync != NULL)
 		EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync);
 }
 
 DECLARE_GEOM_CLASS(g_raid3_class, g_raid3);
Index: user/alc/PQ_LAUNDRY/sys/geom/shsec/g_shsec.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/geom/shsec/g_shsec.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/geom/shsec/g_shsec.c	(revision 306283)
@@ -1,836 +1,836 @@
 /*-
  * Copyright (c) 2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <vm/uma.h>
 #include <geom/geom.h>
 #include <geom/shsec/g_shsec.h>
 
 FEATURE(geom_shsec, "GEOM shared secret device support");
 
 static MALLOC_DEFINE(M_SHSEC, "shsec_data", "GEOM_SHSEC Data");
 
 static uma_zone_t g_shsec_zone;
 
 static int g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force);
 static int g_shsec_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 
 static g_taste_t g_shsec_taste;
 static g_ctl_req_t g_shsec_config;
 static g_dumpconf_t g_shsec_dumpconf;
 static g_init_t g_shsec_init;
 static g_fini_t g_shsec_fini;
 
 struct g_class g_shsec_class = {
 	.name = G_SHSEC_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_shsec_config,
 	.taste = g_shsec_taste,
 	.destroy_geom = g_shsec_destroy_geom,
 	.init = g_shsec_init,
 	.fini = g_shsec_fini
 };
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, shsec, CTLFLAG_RW, 0,
     "GEOM_SHSEC stuff");
 static u_int g_shsec_debug = 0;
 SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, debug, CTLFLAG_RWTUN, &g_shsec_debug, 0,
     "Debug level");
 static u_int g_shsec_maxmem = MAXPHYS * 100;
 SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, maxmem, CTLFLAG_RDTUN, &g_shsec_maxmem,
     0, "Maximum memory that can be allocated for I/O (in bytes)");
 static u_int g_shsec_alloc_failed = 0;
 SYSCTL_UINT(_kern_geom_shsec, OID_AUTO, alloc_failed, CTLFLAG_RD,
     &g_shsec_alloc_failed, 0, "How many times I/O allocation failed");
 
 /*
  * Greatest Common Divisor.
  */
 static u_int
 gcd(u_int a, u_int b)
 {
 	u_int c;
 
 	while (b != 0) {
 		c = a;
 		a = b;
 		b = (c % b);
 	}
 	return (a);
 }
 
 /*
  * Least Common Multiple.
  */
 static u_int
 lcm(u_int a, u_int b)
 {
 
 	return ((a * b) / gcd(a, b));
 }
 
 static void
 g_shsec_init(struct g_class *mp __unused)
 {
 
 	g_shsec_zone = uma_zcreate("g_shsec_zone", MAXPHYS, NULL, NULL, NULL,
 	    NULL, 0, 0);
 	g_shsec_maxmem -= g_shsec_maxmem % MAXPHYS;
 	uma_zone_set_max(g_shsec_zone, g_shsec_maxmem / MAXPHYS);
 }
 
 static void
 g_shsec_fini(struct g_class *mp __unused)
 {
 
 	uma_zdestroy(g_shsec_zone);
 }
 
 /*
  * Return the number of valid disks.
  */
 static u_int
 g_shsec_nvalid(struct g_shsec_softc *sc)
 {
 	u_int i, no;
 
 	no = 0;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		if (sc->sc_disks[i] != NULL)
 			no++;
 	}
 
 	return (no);
 }
 
 static void
 g_shsec_remove_disk(struct g_consumer *cp)
 {
 	struct g_shsec_softc *sc;
 	u_int no;
 
 	KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__));
 	sc = (struct g_shsec_softc *)cp->private;
 	KASSERT(sc != NULL, ("NULL sc in %s.", __func__));
 	no = cp->index;
 
 	G_SHSEC_DEBUG(0, "Disk %s removed from %s.", cp->provider->name,
 	    sc->sc_name);
 
 	sc->sc_disks[no] = NULL;
 	if (sc->sc_provider != NULL) {
-		g_orphan_provider(sc->sc_provider, ENXIO);
+		g_wither_provider(sc->sc_provider, ENXIO);
 		sc->sc_provider = NULL;
 		G_SHSEC_DEBUG(0, "Device %s removed.", sc->sc_name);
 	}
 
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 }
 
 static void
 g_shsec_orphan(struct g_consumer *cp)
 {
 	struct g_shsec_softc *sc;
 	struct g_geom *gp;
 
 	g_topology_assert();
 	gp = cp->geom;
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 
 	g_shsec_remove_disk(cp);
 	/* If there are no valid disks anymore, remove device. */
 	if (g_shsec_nvalid(sc) == 0)
 		g_shsec_destroy(sc, 1);
 }
 
 static int
 g_shsec_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_consumer *cp1, *cp2;
 	struct g_shsec_softc *sc;
 	struct g_geom *gp;
 	int error;
 
 	gp = pp->geom;
 	sc = gp->softc;
 
 	if (sc == NULL) {
 		/*
 		 * It looks like geom is being withered.
 		 * In that case we allow only negative requests.
 		 */
 		KASSERT(dr <= 0 && dw <= 0 && de <= 0,
 		    ("Positive access request (device=%s).", pp->name));
 		if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 &&
 		    (pp->ace + de) == 0) {
 			G_SHSEC_DEBUG(0, "Device %s definitely destroyed.",
 			    gp->name);
 		}
 		return (0);
 	}
 
 	/* On first open, grab an extra "exclusive" bit */
 	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
 		de++;
 	/* ... and let go of it on last close */
 	if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0)
 		de--;
 
 	error = ENXIO;
 	LIST_FOREACH(cp1, &gp->consumer, consumer) {
 		error = g_access(cp1, dr, dw, de);
 		if (error == 0)
 			continue;
 		/*
 		 * If we fail here, backout all previous changes.
 		 */
 		LIST_FOREACH(cp2, &gp->consumer, consumer) {
 			if (cp1 == cp2)
 				return (error);
 			g_access(cp2, -dr, -dw, -de);
 		}
 		/* NOTREACHED */
 	}
 
 	return (error);
 }
 
 static void
 g_shsec_xor1(uint32_t *src, uint32_t *dst, ssize_t len)
 {
 
 	for (; len > 0; len -= sizeof(uint32_t), dst++)
 		*dst = *dst ^ *src++;
 	KASSERT(len == 0, ("len != 0 (len=%zd)", len));
 }
 
 static void
 g_shsec_done(struct bio *bp)
 {
 	struct g_shsec_softc *sc;
 	struct bio *pbp;
 
 	pbp = bp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	if (bp->bio_error == 0)
 		G_SHSEC_LOGREQ(2, bp, "Request done.");
 	else {
 		G_SHSEC_LOGREQ(0, bp, "Request failed (error=%d).",
 		    bp->bio_error);
 		if (pbp->bio_error == 0)
 			pbp->bio_error = bp->bio_error;
 	}
 	if (pbp->bio_cmd == BIO_READ) {
 		if ((pbp->bio_pflags & G_SHSEC_BFLAG_FIRST) != 0) {
 			bcopy(bp->bio_data, pbp->bio_data, pbp->bio_length);
 			pbp->bio_pflags = 0;
 		} else {
 			g_shsec_xor1((uint32_t *)bp->bio_data,
 			    (uint32_t *)pbp->bio_data,
 			    (ssize_t)pbp->bio_length);
 		}
 	}
 	bzero(bp->bio_data, bp->bio_length);
 	uma_zfree(g_shsec_zone, bp->bio_data);
 	g_destroy_bio(bp);
 	pbp->bio_inbed++;
 	if (pbp->bio_children == pbp->bio_inbed) {
 		pbp->bio_completed = pbp->bio_length;
 		g_io_deliver(pbp, pbp->bio_error);
 	}
 }
 
 static void
 g_shsec_xor2(uint32_t *rand, uint32_t *dst, ssize_t len)
 {
 
 	for (; len > 0; len -= sizeof(uint32_t), dst++) {
 		*rand = arc4random();
 		*dst = *dst ^ *rand++;
 	}
 	KASSERT(len == 0, ("len != 0 (len=%zd)", len));
 }
 
 static void
 g_shsec_start(struct bio *bp)
 {
 	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
 	struct g_shsec_softc *sc;
 	struct bio *cbp;
 	uint32_t *dst;
 	ssize_t len;
 	u_int no;
 	int error;
 
 	sc = bp->bio_to->geom->softc;
 	/*
 	 * If sc == NULL, provider's error should be set and g_shsec_start()
 	 * should not be called at all.
 	 */
 	KASSERT(sc != NULL,
 	    ("Provider's error should be set (error=%d)(device=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 
 	G_SHSEC_LOGREQ(2, bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_FLUSH:
 		/*
 		 * Only those requests are supported.
 		 */
 		break;
 	case BIO_DELETE:
 	case BIO_GETATTR:
 		/* To which provider it should be delivered? */
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 
 	/*
 	 * Allocate all bios first and calculate XOR.
 	 */
 	dst = NULL;
 	len = bp->bio_length;
 	if (bp->bio_cmd == BIO_READ)
 		bp->bio_pflags = G_SHSEC_BFLAG_FIRST;
 	for (no = 0; no < sc->sc_ndisks; no++) {
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			error = ENOMEM;
 			goto failure;
 		}
 		TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 
 		/*
 		 * Fill in the component buf structure.
 		 */
 		cbp->bio_done = g_shsec_done;
 		cbp->bio_data = uma_zalloc(g_shsec_zone, M_NOWAIT);
 		if (cbp->bio_data == NULL) {
 			g_shsec_alloc_failed++;
 			error = ENOMEM;
 			goto failure;
 		}
 		cbp->bio_caller2 = sc->sc_disks[no];
 		if (bp->bio_cmd == BIO_WRITE) {
 			if (no == 0) {
 				dst = (uint32_t *)cbp->bio_data;
 				bcopy(bp->bio_data, dst, len);
 			} else {
 				g_shsec_xor2((uint32_t *)cbp->bio_data, dst,
 				    len);
 			}
 		}
 	}
 	/*
 	 * Fire off all allocated requests!
 	 */
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		struct g_consumer *cp;
 
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		cp = cbp->bio_caller2;
 		cbp->bio_caller2 = NULL;
 		cbp->bio_to = cp->provider;
 		G_SHSEC_LOGREQ(2, cbp, "Sending request.");
 		g_io_request(cbp, cp);
 	}
 	return;
 failure:
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		bp->bio_children--;
 		if (cbp->bio_data != NULL) {
 			bzero(cbp->bio_data, cbp->bio_length);
 			uma_zfree(g_shsec_zone, cbp->bio_data);
 		}
 		g_destroy_bio(cbp);
 	}
 	if (bp->bio_error == 0)
 		bp->bio_error = error;
 	g_io_deliver(bp, bp->bio_error);
 }
 
 static void
 g_shsec_check_and_run(struct g_shsec_softc *sc)
 {
 	off_t mediasize, ms;
 	u_int no, sectorsize = 0;
 
 	if (g_shsec_nvalid(sc) != sc->sc_ndisks)
 		return;
 
 	sc->sc_provider = g_new_providerf(sc->sc_geom, "shsec/%s", sc->sc_name);
 	/*
 	 * Find the smallest disk.
 	 */
 	mediasize = sc->sc_disks[0]->provider->mediasize;
 	mediasize -= sc->sc_disks[0]->provider->sectorsize;
 	sectorsize = sc->sc_disks[0]->provider->sectorsize;
 	for (no = 1; no < sc->sc_ndisks; no++) {
 		ms = sc->sc_disks[no]->provider->mediasize;
 		ms -= sc->sc_disks[no]->provider->sectorsize;
 		if (ms < mediasize)
 			mediasize = ms;
 		sectorsize = lcm(sectorsize,
 		    sc->sc_disks[no]->provider->sectorsize);
 	}
 	sc->sc_provider->sectorsize = sectorsize;
 	sc->sc_provider->mediasize = mediasize;
 	g_error_provider(sc->sc_provider, 0);
 
 	G_SHSEC_DEBUG(0, "Device %s activated.", sc->sc_name);
 }
 
 static int
 g_shsec_read_metadata(struct g_consumer *cp, struct g_shsec_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL)
 		return (error);
 
 	/* Decode metadata. */
 	shsec_metadata_decode(buf, md);
 	g_free(buf);
 
 	return (0);
 }
 
 /*
  * Add disk to given device.
  */
 static int
 g_shsec_add_disk(struct g_shsec_softc *sc, struct g_provider *pp, u_int no)
 {
 	struct g_consumer *cp, *fcp;
 	struct g_geom *gp;
 	struct g_shsec_metadata md;
 	int error;
 
 	/* Metadata corrupted? */
 	if (no >= sc->sc_ndisks)
 		return (EINVAL);
 
 	/* Check if disk is not already attached. */
 	if (sc->sc_disks[no] != NULL)
 		return (EEXIST);
 
 	gp = sc->sc_geom;
 	fcp = LIST_FIRST(&gp->consumer);
 
 	cp = g_new_consumer(gp);
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		return (error);
 	}
 
 	if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) {
 		error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
 		if (error != 0) {
 			g_detach(cp);
 			g_destroy_consumer(cp);
 			return (error);
 		}
 	}
 
 	/* Reread metadata. */
 	error = g_shsec_read_metadata(cp, &md);
 	if (error != 0)
 		goto fail;
 
 	if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0 ||
 	    strcmp(md.md_name, sc->sc_name) != 0 || md.md_id != sc->sc_id) {
 		G_SHSEC_DEBUG(0, "Metadata on %s changed.", pp->name);
 		goto fail;
 	}
 
 	cp->private = sc;
 	cp->index = no;
 	sc->sc_disks[no] = cp;
 
 	G_SHSEC_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name);
 
 	g_shsec_check_and_run(sc);
 
 	return (0);
 fail:
 	if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0))
 		g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	return (error);
 }
 
 static struct g_geom *
 g_shsec_create(struct g_class *mp, const struct g_shsec_metadata *md)
 {
 	struct g_shsec_softc *sc;
 	struct g_geom *gp;
 	u_int no;
 
 	G_SHSEC_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id);
 
 	/* Two disks is minimum. */
 	if (md->md_all < 2) {
 		G_SHSEC_DEBUG(0, "Too few disks defined for %s.", md->md_name);
 		return (NULL);
 	}
 
 	/* Check for duplicate unit */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) {
 			G_SHSEC_DEBUG(0, "Device %s already configured.",
 			    sc->sc_name);
 			return (NULL);
 		}
 	}
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_SHSEC, M_WAITOK | M_ZERO);
 	gp->start = g_shsec_start;
 	gp->spoiled = g_shsec_orphan;
 	gp->orphan = g_shsec_orphan;
 	gp->access = g_shsec_access;
 	gp->dumpconf = g_shsec_dumpconf;
 
 	sc->sc_id = md->md_id;
 	sc->sc_ndisks = md->md_all;
 	sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks,
 	    M_SHSEC, M_WAITOK | M_ZERO);
 	for (no = 0; no < sc->sc_ndisks; no++)
 		sc->sc_disks[no] = NULL;
 
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	sc->sc_provider = NULL;
 
 	G_SHSEC_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
 
 	return (gp);
 }
 
 static int
 g_shsec_destroy(struct g_shsec_softc *sc, boolean_t force)
 {
 	struct g_provider *pp;
 	struct g_geom *gp;
 	u_int no;
 
 	g_topology_assert();
 
 	if (sc == NULL)
 		return (ENXIO);
 
 	pp = sc->sc_provider;
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		if (force) {
 			G_SHSEC_DEBUG(0, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 		} else {
 			G_SHSEC_DEBUG(1,
 			    "Device %s is still open (r%dw%de%d).", pp->name,
 			    pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		}
 	}
 
 	for (no = 0; no < sc->sc_ndisks; no++) {
 		if (sc->sc_disks[no] != NULL)
 			g_shsec_remove_disk(sc->sc_disks[no]);
 	}
 
 	gp = sc->sc_geom;
 	gp->softc = NULL;
 	KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)",
 	    gp->name));
 	free(sc->sc_disks, M_SHSEC);
 	free(sc, M_SHSEC);
 
 	pp = LIST_FIRST(&gp->provider);
 	if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0))
 		G_SHSEC_DEBUG(0, "Device %s destroyed.", gp->name);
 
 	g_wither_geom(gp, ENXIO);
 
 	return (0);
 }
 
 static int
 g_shsec_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused,
     struct g_geom *gp)
 {
 	struct g_shsec_softc *sc;
 
 	sc = gp->softc;
 	return (g_shsec_destroy(sc, 0));
 }
 
 static struct g_geom *
 g_shsec_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_shsec_metadata md;
 	struct g_shsec_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	g_topology_assert();
 
 	/* Skip providers that are already open for writing. */
 	if (pp->acw > 0)
 		return (NULL);
 
 	G_SHSEC_DEBUG(3, "Tasting %s.", pp->name);
 
 	gp = g_new_geomf(mp, "shsec:taste");
 	gp->start = g_shsec_start;
 	gp->access = g_shsec_access;
 	gp->orphan = g_shsec_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_shsec_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (strcmp(md.md_magic, G_SHSEC_MAGIC) != 0)
 		return (NULL);
 	if (md.md_version > G_SHSEC_VERSION) {
 		G_SHSEC_DEBUG(0, "Kernel module is too old to handle %s.\n",
 		    pp->name);
 		return (NULL);
 	}
 	/*
 	 * Backward compatibility:
 	 */
 	/* There was no md_provsize field in earlier versions of metadata. */
 	if (md.md_version < 1)
 		md.md_provsize = pp->mediasize;
 
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != pp->mediasize)
 		return (NULL);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		if (md.md_id != sc->sc_id)
 			continue;
 		break;
 	}
 	if (gp != NULL) {
 		G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 		error = g_shsec_add_disk(sc, pp, md.md_no);
 		if (error != 0) {
 			G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
 			    pp->name, gp->name, error);
 			return (NULL);
 		}
 	} else {
 		gp = g_shsec_create(mp, &md);
 		if (gp == NULL) {
 			G_SHSEC_DEBUG(0, "Cannot create device %s.", md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 		G_SHSEC_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 		error = g_shsec_add_disk(sc, pp, md.md_no);
 		if (error != 0) {
 			G_SHSEC_DEBUG(0, "Cannot add disk %s to %s (error=%d).",
 			    pp->name, gp->name, error);
 			g_shsec_destroy(sc, 1);
 			return (NULL);
 		}
 	}
 	return (gp);
 }
 
 static struct g_shsec_softc *
 g_shsec_find_device(struct g_class *mp, const char *name)
 {
 	struct g_shsec_softc *sc;
 	struct g_geom *gp;
 
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (strcmp(sc->sc_name, name) == 0)
 			return (sc);
 	}
 	return (NULL);
 }
 
 static void
 g_shsec_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_shsec_softc *sc;
 	int *force, *nargs, error;
 	const char *name;
 	char param[16];
 	u_int i;
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 	force = gctl_get_paraml(req, "force", sizeof(*force));
 	if (force == NULL) {
 		gctl_error(req, "No '%s' argument.", "force");
 		return;
 	}
 
 	for (i = 0; i < (u_int)*nargs; i++) {
 		snprintf(param, sizeof(param), "arg%u", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%u' argument.", i);
 			return;
 		}
 		sc = g_shsec_find_device(mp, name);
 		if (sc == NULL) {
 			gctl_error(req, "No such device: %s.", name);
 			return;
 		}
 		error = g_shsec_destroy(sc, *force);
 		if (error != 0) {
 			gctl_error(req, "Cannot destroy device %s (error=%d).",
 			    sc->sc_name, error);
 			return;
 		}
 	}
 }
 
 static void
 g_shsec_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	uint32_t *version;
 
 	g_topology_assert();
 
 	version = gctl_get_paraml(req, "version", sizeof(*version));
 	if (version == NULL) {
 		gctl_error(req, "No '%s' argument.", "version");
 		return;
 	}
 	if (*version != G_SHSEC_VERSION) {
 		gctl_error(req, "Userland and kernel parts are out of sync.");
 		return;
 	}
 
 	if (strcmp(verb, "stop") == 0) {
 		g_shsec_ctl_destroy(req, mp);
 		return;
 	}
 
 	gctl_error(req, "Unknown verb.");
 }
 
 static void
 g_shsec_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_shsec_softc *sc;
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
 		    (u_int)cp->index);
 	} else {
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<Status>Total=%u, Online=%u</Status>\n",
 		    indent, sc->sc_ndisks, g_shsec_nvalid(sc));
 		sbuf_printf(sb, "%s<State>", indent);
 		if (sc->sc_provider != NULL && sc->sc_provider->error == 0)
 			sbuf_printf(sb, "UP");
 		else
 			sbuf_printf(sb, "DOWN");
 		sbuf_printf(sb, "</State>\n");
 	}
 }
 
 DECLARE_GEOM_CLASS(g_shsec_class, g_shsec);
Index: user/alc/PQ_LAUNDRY/sys/geom/stripe/g_stripe.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/geom/stripe/g_stripe.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/geom/stripe/g_stripe.c	(revision 306283)
@@ -1,1271 +1,1270 @@
 /*-
  * Copyright (c) 2004-2005 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <vm/uma.h>
 #include <geom/geom.h>
 #include <geom/stripe/g_stripe.h>
 
 FEATURE(geom_stripe, "GEOM striping support");
 
 static MALLOC_DEFINE(M_STRIPE, "stripe_data", "GEOM_STRIPE Data");
 
 static uma_zone_t g_stripe_zone;
 
 static int g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force);
 static int g_stripe_destroy_geom(struct gctl_req *req, struct g_class *mp,
     struct g_geom *gp);
 
 static g_taste_t g_stripe_taste;
 static g_ctl_req_t g_stripe_config;
 static g_dumpconf_t g_stripe_dumpconf;
 static g_init_t g_stripe_init;
 static g_fini_t g_stripe_fini;
 
 struct g_class g_stripe_class = {
 	.name = G_STRIPE_CLASS_NAME,
 	.version = G_VERSION,
 	.ctlreq = g_stripe_config,
 	.taste = g_stripe_taste,
 	.destroy_geom = g_stripe_destroy_geom,
 	.init = g_stripe_init,
 	.fini = g_stripe_fini
 };
 
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, stripe, CTLFLAG_RW, 0,
     "GEOM_STRIPE stuff");
 static u_int g_stripe_debug = 0;
 SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, debug, CTLFLAG_RWTUN, &g_stripe_debug, 0,
     "Debug level");
 static int g_stripe_fast = 0;
 static int
 g_sysctl_stripe_fast(SYSCTL_HANDLER_ARGS)
 {
 	int error, fast;
 
 	fast = g_stripe_fast;
 	error = sysctl_handle_int(oidp, &fast, 0, req);
 	if (error == 0 && req->newptr != NULL)
 		g_stripe_fast = fast;
 	return (error);
 }
 SYSCTL_PROC(_kern_geom_stripe, OID_AUTO, fast, CTLTYPE_INT | CTLFLAG_RWTUN,
     NULL, 0, g_sysctl_stripe_fast, "I", "Fast, but memory-consuming, mode");
 static u_int g_stripe_maxmem = MAXPHYS * 100;
 SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, maxmem, CTLFLAG_RDTUN, &g_stripe_maxmem,
     0, "Maximum memory that can be allocated in \"fast\" mode (in bytes)");
 static u_int g_stripe_fast_failed = 0;
 SYSCTL_UINT(_kern_geom_stripe, OID_AUTO, fast_failed, CTLFLAG_RD,
     &g_stripe_fast_failed, 0, "How many times \"fast\" mode failed");
 
 /*
  * Greatest Common Divisor.
  */
 static u_int
 gcd(u_int a, u_int b)
 {
 	u_int c;
 
 	while (b != 0) {
 		c = a;
 		a = b;
 		b = (c % b);
 	}
 	return (a);
 }
 
 /*
  * Least Common Multiple.
  */
 static u_int
 lcm(u_int a, u_int b)
 {
 
 	return ((a * b) / gcd(a, b));
 }
 
 static void
 g_stripe_init(struct g_class *mp __unused)
 {
 
 	g_stripe_zone = uma_zcreate("g_stripe_zone", MAXPHYS, NULL, NULL,
 	    NULL, NULL, 0, 0);
 	g_stripe_maxmem -= g_stripe_maxmem % MAXPHYS;
 	uma_zone_set_max(g_stripe_zone, g_stripe_maxmem / MAXPHYS);
 }
 
 static void
 g_stripe_fini(struct g_class *mp __unused)
 {
 
 	uma_zdestroy(g_stripe_zone);
 }
 
 /*
  * Return the number of valid disks.
  */
 static u_int
 g_stripe_nvalid(struct g_stripe_softc *sc)
 {
 	u_int i, no;
 
 	no = 0;
 	for (i = 0; i < sc->sc_ndisks; i++) {
 		if (sc->sc_disks[i] != NULL)
 			no++;
 	}
 
 	return (no);
 }
 
 static void
 g_stripe_remove_disk(struct g_consumer *cp)
 {
 	struct g_stripe_softc *sc;
 
 	g_topology_assert();
 	KASSERT(cp != NULL, ("Non-valid disk in %s.", __func__));
 	sc = (struct g_stripe_softc *)cp->geom->softc;
 	KASSERT(sc != NULL, ("NULL sc in %s.", __func__));
 
 	if (cp->private == NULL) {
 		G_STRIPE_DEBUG(0, "Disk %s removed from %s.",
 		    cp->provider->name, sc->sc_name);
 		cp->private = (void *)(uintptr_t)-1;
 	}
 
 	if (sc->sc_provider != NULL) {
-		sc->sc_provider->flags |= G_PF_WITHER;
 		G_STRIPE_DEBUG(0, "Device %s deactivated.",
 		    sc->sc_provider->name);
-		g_orphan_provider(sc->sc_provider, ENXIO);
+		g_wither_provider(sc->sc_provider, ENXIO);
 		sc->sc_provider = NULL;
 	}
 
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		return;
 	sc->sc_disks[cp->index] = NULL;
 	cp->index = 0;
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	/* If there are no valid disks anymore, remove device. */
 	if (LIST_EMPTY(&sc->sc_geom->consumer))
 		g_stripe_destroy(sc, 1);
 }
 
 static void
 g_stripe_orphan(struct g_consumer *cp)
 {
 	struct g_stripe_softc *sc;
 	struct g_geom *gp;
 
 	g_topology_assert();
 	gp = cp->geom;
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 
 	g_stripe_remove_disk(cp);
 }
 
 static int
 g_stripe_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_consumer *cp1, *cp2, *tmp;
 	struct g_stripe_softc *sc;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	gp = pp->geom;
 	sc = gp->softc;
 	KASSERT(sc != NULL, ("NULL sc in %s.", __func__));
 
 	/* On first open, grab an extra "exclusive" bit */
 	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
 		de++;
 	/* ... and let go of it on last close */
 	if ((pp->acr + dr) == 0 && (pp->acw + dw) == 0 && (pp->ace + de) == 0)
 		de--;
 
 	LIST_FOREACH_SAFE(cp1, &gp->consumer, consumer, tmp) {
 		error = g_access(cp1, dr, dw, de);
 		if (error != 0)
 			goto fail;
 		if (cp1->acr == 0 && cp1->acw == 0 && cp1->ace == 0 &&
 		    cp1->private != NULL) {
 			g_stripe_remove_disk(cp1); /* May destroy geom. */
 		}
 	}
 	return (0);
 
 fail:
 	LIST_FOREACH(cp2, &gp->consumer, consumer) {
 		if (cp1 == cp2)
 			break;
 		g_access(cp2, -dr, -dw, -de);
 	}
 	return (error);
 }
 
 static void
 g_stripe_copy(struct g_stripe_softc *sc, char *src, char *dst, off_t offset,
     off_t length, int mode)
 {
 	u_int stripesize;
 	size_t len;
 
 	stripesize = sc->sc_stripesize;
 	len = (size_t)(stripesize - (offset & (stripesize - 1)));
 	do {
 		bcopy(src, dst, len);
 		if (mode) {
 			dst += len + stripesize * (sc->sc_ndisks - 1);
 			src += len;
 		} else {
 			dst += len;
 			src += len + stripesize * (sc->sc_ndisks - 1);
 		}
 		length -= len;
 		KASSERT(length >= 0,
 		    ("Length < 0 (stripesize=%zu, offset=%jd, length=%jd).",
 		    (size_t)stripesize, (intmax_t)offset, (intmax_t)length));
 		if (length > stripesize)
 			len = stripesize;
 		else
 			len = length;
 	} while (length > 0);
 }
 
 static void
 g_stripe_done(struct bio *bp)
 {
 	struct g_stripe_softc *sc;
 	struct bio *pbp;
 
 	pbp = bp->bio_parent;
 	sc = pbp->bio_to->geom->softc;
 	if (bp->bio_cmd == BIO_READ && bp->bio_caller1 != NULL) {
 		g_stripe_copy(sc, bp->bio_data, bp->bio_caller1, bp->bio_offset,
 		    bp->bio_length, 1);
 		bp->bio_data = bp->bio_caller1;
 		bp->bio_caller1 = NULL;
 	}
 	mtx_lock(&sc->sc_lock);
 	if (pbp->bio_error == 0)
 		pbp->bio_error = bp->bio_error;
 	pbp->bio_completed += bp->bio_completed;
 	pbp->bio_inbed++;
 	if (pbp->bio_children == pbp->bio_inbed) {
 		mtx_unlock(&sc->sc_lock);
 		if (pbp->bio_driver1 != NULL)
 			uma_zfree(g_stripe_zone, pbp->bio_driver1);
 		g_io_deliver(pbp, pbp->bio_error);
 	} else
 		mtx_unlock(&sc->sc_lock);
 	g_destroy_bio(bp);
 }
 
 static int
 g_stripe_start_fast(struct bio *bp, u_int no, off_t offset, off_t length)
 {
 	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
 	u_int nparts = 0, stripesize;
 	struct g_stripe_softc *sc;
 	char *addr, *data = NULL;
 	struct bio *cbp;
 	int error;
 
 	sc = bp->bio_to->geom->softc;
 
 	addr = bp->bio_data;
 	stripesize = sc->sc_stripesize;
 
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		error = ENOMEM;
 		goto failure;
 	}
 	TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 	nparts++;
 	/*
 	 * Fill in the component buf structure.
 	 */
 	cbp->bio_done = g_stripe_done;
 	cbp->bio_offset = offset;
 	cbp->bio_data = addr;
 	cbp->bio_caller1 = NULL;
 	cbp->bio_length = length;
 	cbp->bio_caller2 = sc->sc_disks[no];
 
 	/* offset -= offset % stripesize; */
 	offset -= offset & (stripesize - 1);
 	addr += length;
 	length = bp->bio_length - length;
 	for (no++; length > 0; no++, length -= stripesize, addr += stripesize) {
 		if (no > sc->sc_ndisks - 1) {
 			no = 0;
 			offset += stripesize;
 		}
 		if (nparts >= sc->sc_ndisks) {
 			cbp = TAILQ_NEXT(cbp, bio_queue);
 			if (cbp == NULL)
 				cbp = TAILQ_FIRST(&queue);
 			nparts++;
 			/*
 			 * Update bio structure.
 			 */
 			/*
 			 * MIN() is in case when
 			 * (bp->bio_length % sc->sc_stripesize) != 0.
 			 */
 			cbp->bio_length += MIN(stripesize, length);
 			if (cbp->bio_caller1 == NULL) {
 				cbp->bio_caller1 = cbp->bio_data;
 				cbp->bio_data = NULL;
 				if (data == NULL) {
 					data = uma_zalloc(g_stripe_zone,
 					    M_NOWAIT);
 					if (data == NULL) {
 						error = ENOMEM;
 						goto failure;
 					}
 				}
 			}
 		} else {
 			cbp = g_clone_bio(bp);
 			if (cbp == NULL) {
 				error = ENOMEM;
 				goto failure;
 			}
 			TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 			nparts++;
 			/*
 			 * Fill in the component buf structure.
 			 */
 			cbp->bio_done = g_stripe_done;
 			cbp->bio_offset = offset;
 			cbp->bio_data = addr;
 			cbp->bio_caller1 = NULL;
 			/*
 			 * MIN() is in case when
 			 * (bp->bio_length % sc->sc_stripesize) != 0.
 			 */
 			cbp->bio_length = MIN(stripesize, length);
 			cbp->bio_caller2 = sc->sc_disks[no];
 		}
 	}
 	if (data != NULL)
 		bp->bio_driver1 = data;
 	/*
 	 * Fire off all allocated requests!
 	 */
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		struct g_consumer *cp;
 
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		cp = cbp->bio_caller2;
 		cbp->bio_caller2 = NULL;
 		cbp->bio_to = cp->provider;
 		if (cbp->bio_caller1 != NULL) {
 			cbp->bio_data = data;
 			if (bp->bio_cmd == BIO_WRITE) {
 				g_stripe_copy(sc, cbp->bio_caller1, data,
 				    cbp->bio_offset, cbp->bio_length, 0);
 			}
 			data += cbp->bio_length;
 		}
 		G_STRIPE_LOGREQ(cbp, "Sending request.");
 		g_io_request(cbp, cp);
 	}
 	return (0);
 failure:
 	if (data != NULL)
 		uma_zfree(g_stripe_zone, data);
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		if (cbp->bio_caller1 != NULL) {
 			cbp->bio_data = cbp->bio_caller1;
 			cbp->bio_caller1 = NULL;
 		}
 		bp->bio_children--;
 		g_destroy_bio(cbp);
 	}
 	return (error);
 }
 
 static int
 g_stripe_start_economic(struct bio *bp, u_int no, off_t offset, off_t length)
 {
 	TAILQ_HEAD(, bio) queue = TAILQ_HEAD_INITIALIZER(queue);
 	struct g_stripe_softc *sc;
 	uint32_t stripesize;
 	struct bio *cbp;
 	char *addr;
 	int error;
 
 	sc = bp->bio_to->geom->softc;
 
 	stripesize = sc->sc_stripesize;
 
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
 		error = ENOMEM;
 		goto failure;
 	}
 	TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 	/*
 	 * Fill in the component buf structure.
 	 */
 	if (bp->bio_length == length)
 		cbp->bio_done = g_std_done;	/* Optimized lockless case. */
 	else
 		cbp->bio_done = g_stripe_done;
 	cbp->bio_offset = offset;
 	cbp->bio_length = length;
 	if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 		bp->bio_ma_n = round_page(bp->bio_ma_offset +
 		    bp->bio_length) / PAGE_SIZE;
 		addr = NULL;
 	} else
 		addr = bp->bio_data;
 	cbp->bio_caller2 = sc->sc_disks[no];
 
 	/* offset -= offset % stripesize; */
 	offset -= offset & (stripesize - 1);
 	if (bp->bio_cmd != BIO_DELETE)
 		addr += length;
 	length = bp->bio_length - length;
 	for (no++; length > 0; no++, length -= stripesize) {
 		if (no > sc->sc_ndisks - 1) {
 			no = 0;
 			offset += stripesize;
 		}
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			error = ENOMEM;
 			goto failure;
 		}
 		TAILQ_INSERT_TAIL(&queue, cbp, bio_queue);
 
 		/*
 		 * Fill in the component buf structure.
 		 */
 		cbp->bio_done = g_stripe_done;
 		cbp->bio_offset = offset;
 		/*
 		 * MIN() is in case when
 		 * (bp->bio_length % sc->sc_stripesize) != 0.
 		 */
 		cbp->bio_length = MIN(stripesize, length);
 		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
 			cbp->bio_ma_offset += (uintptr_t)addr;
 			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
 			cbp->bio_ma_offset %= PAGE_SIZE;
 			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
 			    cbp->bio_length) / PAGE_SIZE;
 		} else
 			cbp->bio_data = addr;
 
 		cbp->bio_caller2 = sc->sc_disks[no];
 
 		if (bp->bio_cmd != BIO_DELETE)
 			addr += stripesize;
 	}
 	/*
 	 * Fire off all allocated requests!
 	 */
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		struct g_consumer *cp;
 
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		cp = cbp->bio_caller2;
 		cbp->bio_caller2 = NULL;
 		cbp->bio_to = cp->provider;
 		G_STRIPE_LOGREQ(cbp, "Sending request.");
 		g_io_request(cbp, cp);
 	}
 	return (0);
 failure:
 	while ((cbp = TAILQ_FIRST(&queue)) != NULL) {
 		TAILQ_REMOVE(&queue, cbp, bio_queue);
 		bp->bio_children--;
 		g_destroy_bio(cbp);
 	}
 	return (error);
 }
 
 static void
 g_stripe_flush(struct g_stripe_softc *sc, struct bio *bp)
 {
 	struct bio_queue_head queue;
 	struct g_consumer *cp;
 	struct bio *cbp;
 	u_int no;
 
 	bioq_init(&queue);
 	for (no = 0; no < sc->sc_ndisks; no++) {
 		cbp = g_clone_bio(bp);
 		if (cbp == NULL) {
 			for (cbp = bioq_first(&queue); cbp != NULL;
 			    cbp = bioq_first(&queue)) {
 				bioq_remove(&queue, cbp);
 				g_destroy_bio(cbp);
 			}
 			if (bp->bio_error == 0)
 				bp->bio_error = ENOMEM;
 			g_io_deliver(bp, bp->bio_error);
 			return;
 		}
 		bioq_insert_tail(&queue, cbp);
 		cbp->bio_done = g_stripe_done;
 		cbp->bio_caller2 = sc->sc_disks[no];
 		cbp->bio_to = sc->sc_disks[no]->provider;
 	}
 	for (cbp = bioq_first(&queue); cbp != NULL; cbp = bioq_first(&queue)) {
 		bioq_remove(&queue, cbp);
 		G_STRIPE_LOGREQ(cbp, "Sending request.");
 		cp = cbp->bio_caller2;
 		cbp->bio_caller2 = NULL;
 		g_io_request(cbp, cp);
 	}
 }
 
 static void
 g_stripe_start(struct bio *bp)
 {
 	off_t offset, start, length, nstripe;
 	struct g_stripe_softc *sc;
 	u_int no, stripesize;
 	int error, fast = 0;
 
 	sc = bp->bio_to->geom->softc;
 	/*
 	 * If sc == NULL, provider's error should be set and g_stripe_start()
 	 * should not be called at all.
 	 */
 	KASSERT(sc != NULL,
 	    ("Provider's error should be set (error=%d)(device=%s).",
 	    bp->bio_to->error, bp->bio_to->name));
 
 	G_STRIPE_LOGREQ(bp, "Request received.");
 
 	switch (bp->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	case BIO_FLUSH:
 		g_stripe_flush(sc, bp);
 		return;
 	case BIO_GETATTR:
 		/* To which provider it should be delivered? */
 	default:
 		g_io_deliver(bp, EOPNOTSUPP);
 		return;
 	}
 
 	stripesize = sc->sc_stripesize;
 
 	/*
 	 * Calculations are quite messy, but fast I hope.
 	 */
 
 	/* Stripe number. */
 	/* nstripe = bp->bio_offset / stripesize; */
 	nstripe = bp->bio_offset >> (off_t)sc->sc_stripebits;
 	/* Disk number. */
 	no = nstripe % sc->sc_ndisks;
 	/* Start position in stripe. */
 	/* start = bp->bio_offset % stripesize; */
 	start = bp->bio_offset & (stripesize - 1);
 	/* Start position in disk. */
 	/* offset = (nstripe / sc->sc_ndisks) * stripesize + start; */
 	offset = ((nstripe / sc->sc_ndisks) << sc->sc_stripebits) + start;
 	/* Length of data to operate. */
 	length = MIN(bp->bio_length, stripesize - start);
 
 	/*
 	 * Do use "fast" mode when:
 	 * 1. "Fast" mode is ON.
 	 * and
 	 * 2. Request size is less than or equal to MAXPHYS,
 	 *    which should always be true.
 	 * and
 	 * 3. Request size is bigger than stripesize * ndisks. If it isn't,
 	 *    there will be no need to send more than one I/O request to
 	 *    a provider, so there is nothing to optmize.
 	 * and
 	 * 4. Request is not unmapped.
 	 * and
 	 * 5. It is not a BIO_DELETE.
 	 */
 	if (g_stripe_fast && bp->bio_length <= MAXPHYS &&
 	    bp->bio_length >= stripesize * sc->sc_ndisks &&
 	    (bp->bio_flags & BIO_UNMAPPED) == 0 &&
 	    bp->bio_cmd != BIO_DELETE) {
 		fast = 1;
 	}
 	error = 0;
 	if (fast) {
 		error = g_stripe_start_fast(bp, no, offset, length);
 		if (error != 0)
 			g_stripe_fast_failed++;
 	}
 	/*
 	 * Do use "economic" when:
 	 * 1. "Economic" mode is ON.
 	 * or
 	 * 2. "Fast" mode failed. It can only fail if there is no memory.
 	 */
 	if (!fast || error != 0)
 		error = g_stripe_start_economic(bp, no, offset, length);
 	if (error != 0) {
 		if (bp->bio_error == 0)
 			bp->bio_error = error;
 		g_io_deliver(bp, bp->bio_error);
 	}
 }
 
 static void
 g_stripe_check_and_run(struct g_stripe_softc *sc)
 {
 	struct g_provider *dp;
 	off_t mediasize, ms;
 	u_int no, sectorsize = 0;
 
 	g_topology_assert();
 	if (g_stripe_nvalid(sc) != sc->sc_ndisks)
 		return;
 
 	sc->sc_provider = g_new_providerf(sc->sc_geom, "stripe/%s",
 	    sc->sc_name);
 	sc->sc_provider->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
 	if (g_stripe_fast == 0)
 		sc->sc_provider->flags |= G_PF_ACCEPT_UNMAPPED;
 	/*
 	 * Find the smallest disk.
 	 */
 	mediasize = sc->sc_disks[0]->provider->mediasize;
 	if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC)
 		mediasize -= sc->sc_disks[0]->provider->sectorsize;
 	mediasize -= mediasize % sc->sc_stripesize;
 	sectorsize = sc->sc_disks[0]->provider->sectorsize;
 	for (no = 1; no < sc->sc_ndisks; no++) {
 		dp = sc->sc_disks[no]->provider;
 		ms = dp->mediasize;
 		if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC)
 			ms -= dp->sectorsize;
 		ms -= ms % sc->sc_stripesize;
 		if (ms < mediasize)
 			mediasize = ms;
 		sectorsize = lcm(sectorsize, dp->sectorsize);
 
 		/* A provider underneath us doesn't support unmapped */
 		if ((dp->flags & G_PF_ACCEPT_UNMAPPED) == 0) {
 			G_STRIPE_DEBUG(1, "Cancelling unmapped "
 			    "because of %s.", dp->name);
 			sc->sc_provider->flags &= ~G_PF_ACCEPT_UNMAPPED;
 		}
 	}
 	sc->sc_provider->sectorsize = sectorsize;
 	sc->sc_provider->mediasize = mediasize * sc->sc_ndisks;
 	sc->sc_provider->stripesize = sc->sc_stripesize;
 	sc->sc_provider->stripeoffset = 0;
 	g_error_provider(sc->sc_provider, 0);
 
 	G_STRIPE_DEBUG(0, "Device %s activated.", sc->sc_provider->name);
 }
 
 static int
 g_stripe_read_metadata(struct g_consumer *cp, struct g_stripe_metadata *md)
 {
 	struct g_provider *pp;
 	u_char *buf;
 	int error;
 
 	g_topology_assert();
 
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL)
 		return (error);
 
 	/* Decode metadata. */
 	stripe_metadata_decode(buf, md);
 	g_free(buf);
 
 	return (0);
 }
 
 /*
  * Add disk to given device.
  */
 static int
 g_stripe_add_disk(struct g_stripe_softc *sc, struct g_provider *pp, u_int no)
 {
 	struct g_consumer *cp, *fcp;
 	struct g_geom *gp;
 	int error;
 
 	g_topology_assert();
 	/* Metadata corrupted? */
 	if (no >= sc->sc_ndisks)
 		return (EINVAL);
 
 	/* Check if disk is not already attached. */
 	if (sc->sc_disks[no] != NULL)
 		return (EEXIST);
 
 	gp = sc->sc_geom;
 	fcp = LIST_FIRST(&gp->consumer);
 
 	cp = g_new_consumer(gp);
 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	cp->private = NULL;
 	cp->index = no;
 	error = g_attach(cp, pp);
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		return (error);
 	}
 
 	if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0)) {
 		error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
 		if (error != 0) {
 			g_detach(cp);
 			g_destroy_consumer(cp);
 			return (error);
 		}
 	}
 	if (sc->sc_type == G_STRIPE_TYPE_AUTOMATIC) {
 		struct g_stripe_metadata md;
 
 		/* Reread metadata. */
 		error = g_stripe_read_metadata(cp, &md);
 		if (error != 0)
 			goto fail;
 
 		if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0 ||
 		    strcmp(md.md_name, sc->sc_name) != 0 ||
 		    md.md_id != sc->sc_id) {
 			G_STRIPE_DEBUG(0, "Metadata on %s changed.", pp->name);
 			goto fail;
 		}
 	}
 
 	sc->sc_disks[no] = cp;
 	G_STRIPE_DEBUG(0, "Disk %s attached to %s.", pp->name, sc->sc_name);
 	g_stripe_check_and_run(sc);
 
 	return (0);
 fail:
 	if (fcp != NULL && (fcp->acr > 0 || fcp->acw > 0 || fcp->ace > 0))
 		g_access(cp, -fcp->acr, -fcp->acw, -fcp->ace);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	return (error);
 }
 
 static struct g_geom *
 g_stripe_create(struct g_class *mp, const struct g_stripe_metadata *md,
     u_int type)
 {
 	struct g_stripe_softc *sc;
 	struct g_geom *gp;
 	u_int no;
 
 	g_topology_assert();
 	G_STRIPE_DEBUG(1, "Creating device %s (id=%u).", md->md_name,
 	    md->md_id);
 
 	/* Two disks is minimum. */
 	if (md->md_all < 2) {
 		G_STRIPE_DEBUG(0, "Too few disks defined for %s.", md->md_name);
 		return (NULL);
 	}
 #if 0
 	/* Stripe size have to be grater than or equal to sector size. */
 	if (md->md_stripesize < sectorsize) {
 		G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name);
 		return (NULL);
 	}
 #endif
 	/* Stripe size have to be power of 2. */
 	if (!powerof2(md->md_stripesize)) {
 		G_STRIPE_DEBUG(0, "Invalid stripe size for %s.", md->md_name);
 		return (NULL);
 	}
 
 	/* Check for duplicate unit */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc != NULL && strcmp(sc->sc_name, md->md_name) == 0) {
 			G_STRIPE_DEBUG(0, "Device %s already configured.",
 			    sc->sc_name);
 			return (NULL);
 		}
 	}
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	sc = malloc(sizeof(*sc), M_STRIPE, M_WAITOK | M_ZERO);
 	gp->start = g_stripe_start;
 	gp->spoiled = g_stripe_orphan;
 	gp->orphan = g_stripe_orphan;
 	gp->access = g_stripe_access;
 	gp->dumpconf = g_stripe_dumpconf;
 
 	sc->sc_id = md->md_id;
 	sc->sc_stripesize = md->md_stripesize;
 	sc->sc_stripebits = bitcount32(sc->sc_stripesize - 1);
 	sc->sc_ndisks = md->md_all;
 	sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks,
 	    M_STRIPE, M_WAITOK | M_ZERO);
 	for (no = 0; no < sc->sc_ndisks; no++)
 		sc->sc_disks[no] = NULL;
 	sc->sc_type = type;
 	mtx_init(&sc->sc_lock, "gstripe lock", NULL, MTX_DEF);
 
 	gp->softc = sc;
 	sc->sc_geom = gp;
 	sc->sc_provider = NULL;
 
 	G_STRIPE_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id);
 
 	return (gp);
 }
 
 static int
 g_stripe_destroy(struct g_stripe_softc *sc, boolean_t force)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp, *cp1;
 	struct g_geom *gp;
 
 	g_topology_assert();
 
 	if (sc == NULL)
 		return (ENXIO);
 
 	pp = sc->sc_provider;
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		if (force) {
 			G_STRIPE_DEBUG(0, "Device %s is still open, so it "
 			    "can't be definitely removed.", pp->name);
 		} else {
 			G_STRIPE_DEBUG(1,
 			    "Device %s is still open (r%dw%de%d).", pp->name,
 			    pp->acr, pp->acw, pp->ace);
 			return (EBUSY);
 		}
 	}
 
 	gp = sc->sc_geom;
 	LIST_FOREACH_SAFE(cp, &gp->consumer, consumer, cp1) {
 		g_stripe_remove_disk(cp);
 		if (cp1 == NULL)
 			return (0);	/* Recursion happened. */
 	}
 	if (!LIST_EMPTY(&gp->consumer))
 		return (EINPROGRESS);
 
 	gp->softc = NULL;
 	KASSERT(sc->sc_provider == NULL, ("Provider still exists? (device=%s)",
 	    gp->name));
 	free(sc->sc_disks, M_STRIPE);
 	mtx_destroy(&sc->sc_lock);
 	free(sc, M_STRIPE);
 	G_STRIPE_DEBUG(0, "Device %s destroyed.", gp->name);
 	g_wither_geom(gp, ENXIO);
 	return (0);
 }
 
 static int
 g_stripe_destroy_geom(struct gctl_req *req __unused,
     struct g_class *mp __unused, struct g_geom *gp)
 {
 	struct g_stripe_softc *sc;
 
 	sc = gp->softc;
 	return (g_stripe_destroy(sc, 0));
 }
 
 static struct g_geom *
 g_stripe_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
 {
 	struct g_stripe_metadata md;
 	struct g_stripe_softc *sc;
 	struct g_consumer *cp;
 	struct g_geom *gp;
 	int error;
 
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	g_topology_assert();
 
 	/* Skip providers that are already open for writing. */
 	if (pp->acw > 0)
 		return (NULL);
 
 	G_STRIPE_DEBUG(3, "Tasting %s.", pp->name);
 
 	gp = g_new_geomf(mp, "stripe:taste");
 	gp->start = g_stripe_start;
 	gp->access = g_stripe_access;
 	gp->orphan = g_stripe_orphan;
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = g_stripe_read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 	if (error != 0)
 		return (NULL);
 	gp = NULL;
 
 	if (strcmp(md.md_magic, G_STRIPE_MAGIC) != 0)
 		return (NULL);
 	if (md.md_version > G_STRIPE_VERSION) {
 		printf("geom_stripe.ko module is too old to handle %s.\n",
 		    pp->name);
 		return (NULL);
 	}
 	/*
 	 * Backward compatibility:
 	 */
 	/* There was no md_provider field in earlier versions of metadata. */
 	if (md.md_version < 2)
 		bzero(md.md_provider, sizeof(md.md_provider));
 	/* There was no md_provsize field in earlier versions of metadata. */
 	if (md.md_version < 3)
 		md.md_provsize = pp->mediasize;
 
 	if (md.md_provider[0] != '\0' &&
 	    !g_compare_names(md.md_provider, pp->name))
 		return (NULL);
 	if (md.md_provsize != pp->mediasize)
 		return (NULL);
 
 	/*
 	 * Let's check if device already exists.
 	 */
 	sc = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (sc->sc_type != G_STRIPE_TYPE_AUTOMATIC)
 			continue;
 		if (strcmp(md.md_name, sc->sc_name) != 0)
 			continue;
 		if (md.md_id != sc->sc_id)
 			continue;
 		break;
 	}
 	if (gp != NULL) {
 		G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 		error = g_stripe_add_disk(sc, pp, md.md_no);
 		if (error != 0) {
 			G_STRIPE_DEBUG(0,
 			    "Cannot add disk %s to %s (error=%d).", pp->name,
 			    gp->name, error);
 			return (NULL);
 		}
 	} else {
 		gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_AUTOMATIC);
 		if (gp == NULL) {
 			G_STRIPE_DEBUG(0, "Cannot create device %s.",
 			    md.md_name);
 			return (NULL);
 		}
 		sc = gp->softc;
 		G_STRIPE_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name);
 		error = g_stripe_add_disk(sc, pp, md.md_no);
 		if (error != 0) {
 			G_STRIPE_DEBUG(0,
 			    "Cannot add disk %s to %s (error=%d).", pp->name,
 			    gp->name, error);
 			g_stripe_destroy(sc, 1);
 			return (NULL);
 		}
 	}
 
 	return (gp);
 }
 
 static void
 g_stripe_ctl_create(struct gctl_req *req, struct g_class *mp)
 {
 	u_int attached, no;
 	struct g_stripe_metadata md;
 	struct g_provider *pp;
 	struct g_stripe_softc *sc;
 	struct g_geom *gp;
 	struct sbuf *sb;
 	intmax_t *stripesize;
 	const char *name;
 	char param[16];
 	int *nargs;
 
 	g_topology_assert();
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs <= 2) {
 		gctl_error(req, "Too few arguments.");
 		return;
 	}
 
 	strlcpy(md.md_magic, G_STRIPE_MAGIC, sizeof(md.md_magic));
 	md.md_version = G_STRIPE_VERSION;
 	name = gctl_get_asciiparam(req, "arg0");
 	if (name == NULL) {
 		gctl_error(req, "No 'arg%u' argument.", 0);
 		return;
 	}
 	strlcpy(md.md_name, name, sizeof(md.md_name));
 	md.md_id = arc4random();
 	md.md_no = 0;
 	md.md_all = *nargs - 1;
 	stripesize = gctl_get_paraml(req, "stripesize", sizeof(*stripesize));
 	if (stripesize == NULL) {
 		gctl_error(req, "No '%s' argument.", "stripesize");
 		return;
 	}
 	md.md_stripesize = *stripesize;
 	bzero(md.md_provider, sizeof(md.md_provider));
 	/* This field is not important here. */
 	md.md_provsize = 0;
 
 	/* Check all providers are valid */
 	for (no = 1; no < *nargs; no++) {
 		snprintf(param, sizeof(param), "arg%u", no);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%u' argument.", no);
 			return;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		pp = g_provider_by_name(name);
 		if (pp == NULL) {
 			G_STRIPE_DEBUG(1, "Disk %s is invalid.", name);
 			gctl_error(req, "Disk %s is invalid.", name);
 			return;
 		}
 	}
 
 	gp = g_stripe_create(mp, &md, G_STRIPE_TYPE_MANUAL);
 	if (gp == NULL) {
 		gctl_error(req, "Can't configure %s.", md.md_name);
 		return;
 	}
 
 	sc = gp->softc;
 	sb = sbuf_new_auto();
 	sbuf_printf(sb, "Can't attach disk(s) to %s:", gp->name);
 	for (attached = 0, no = 1; no < *nargs; no++) {
 		snprintf(param, sizeof(param), "arg%u", no);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%u' argument.", no);
 			continue;
 		}
 		if (strncmp(name, "/dev/", strlen("/dev/")) == 0)
 			name += strlen("/dev/");
 		pp = g_provider_by_name(name);
 		KASSERT(pp != NULL, ("Provider %s disappear?!", name));
 		if (g_stripe_add_disk(sc, pp, no - 1) != 0) {
 			G_STRIPE_DEBUG(1, "Disk %u (%s) not attached to %s.",
 			    no, pp->name, gp->name);
 			sbuf_printf(sb, " %s", pp->name);
 			continue;
 		}
 		attached++;
 	}
 	sbuf_finish(sb);
 	if (md.md_all != attached) {
 		g_stripe_destroy(gp->softc, 1);
 		gctl_error(req, "%s", sbuf_data(sb));
 	}
 	sbuf_delete(sb);
 }
 
 static struct g_stripe_softc *
 g_stripe_find_device(struct g_class *mp, const char *name)
 {
 	struct g_stripe_softc *sc;
 	struct g_geom *gp;
 
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (strcmp(sc->sc_name, name) == 0)
 			return (sc);
 	}
 	return (NULL);
 }
 
 static void
 g_stripe_ctl_destroy(struct gctl_req *req, struct g_class *mp)
 {
 	struct g_stripe_softc *sc;
 	int *force, *nargs, error;
 	const char *name;
 	char param[16];
 	u_int i;
 
 	g_topology_assert();
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "No '%s' argument.", "nargs");
 		return;
 	}
 	if (*nargs <= 0) {
 		gctl_error(req, "Missing device(s).");
 		return;
 	}
 	force = gctl_get_paraml(req, "force", sizeof(*force));
 	if (force == NULL) {
 		gctl_error(req, "No '%s' argument.", "force");
 		return;
 	}
 
 	for (i = 0; i < (u_int)*nargs; i++) {
 		snprintf(param, sizeof(param), "arg%u", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%u' argument.", i);
 			return;
 		}
 		sc = g_stripe_find_device(mp, name);
 		if (sc == NULL) {
 			gctl_error(req, "No such device: %s.", name);
 			return;
 		}
 		error = g_stripe_destroy(sc, *force);
 		if (error != 0) {
 			gctl_error(req, "Cannot destroy device %s (error=%d).",
 			    sc->sc_name, error);
 			return;
 		}
 	}
 }
 
 static void
 g_stripe_config(struct gctl_req *req, struct g_class *mp, const char *verb)
 {
 	uint32_t *version;
 
 	g_topology_assert();
 
 	version = gctl_get_paraml(req, "version", sizeof(*version));
 	if (version == NULL) {
 		gctl_error(req, "No '%s' argument.", "version");
 		return;
 	}
 	if (*version != G_STRIPE_VERSION) {
 		gctl_error(req, "Userland and kernel parts are out of sync.");
 		return;
 	}
 
 	if (strcmp(verb, "create") == 0) {
 		g_stripe_ctl_create(req, mp);
 		return;
 	} else if (strcmp(verb, "destroy") == 0 ||
 	    strcmp(verb, "stop") == 0) {
 		g_stripe_ctl_destroy(req, mp);
 		return;
 	}
 
 	gctl_error(req, "Unknown verb.");
 }
 
 static void
 g_stripe_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_stripe_softc *sc;
 
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 	if (pp != NULL) {
 		/* Nothing here. */
 	} else if (cp != NULL) {
 		sbuf_printf(sb, "%s<Number>%u</Number>\n", indent,
 		    (u_int)cp->index);
 	} else {
 		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
 		sbuf_printf(sb, "%s<Stripesize>%u</Stripesize>\n", indent,
 		    (u_int)sc->sc_stripesize);
 		sbuf_printf(sb, "%s<Type>", indent);
 		switch (sc->sc_type) {
 		case G_STRIPE_TYPE_AUTOMATIC:
 			sbuf_printf(sb, "AUTOMATIC");
 			break;
 		case G_STRIPE_TYPE_MANUAL:
 			sbuf_printf(sb, "MANUAL");
 			break;
 		default:
 			sbuf_printf(sb, "UNKNOWN");
 			break;
 		}
 		sbuf_printf(sb, "</Type>\n");
 		sbuf_printf(sb, "%s<Status>Total=%u, Online=%u</Status>\n",
 		    indent, sc->sc_ndisks, g_stripe_nvalid(sc));
 		sbuf_printf(sb, "%s<State>", indent);
 		if (sc->sc_provider != NULL && sc->sc_provider->error == 0)
 			sbuf_printf(sb, "UP");
 		else
 			sbuf_printf(sb, "DOWN");
 		sbuf_printf(sb, "</State>\n");
 	}
 }
 
 DECLARE_GEOM_CLASS(g_stripe_class, g_stripe);
Index: user/alc/PQ_LAUNDRY/sys/geom/vinum/geom_vinum_rm.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/geom/vinum/geom_vinum_rm.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/geom/vinum/geom_vinum_rm.c	(revision 306283)
@@ -1,388 +1,387 @@
 /*-
  *  Copyright (c) 2004, 2007 Lukas Ertl
  *  All rights reserved.
  * 
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/libkern.h>
 #include <sys/malloc.h>
 
 #include <geom/geom.h>
 #include <geom/vinum/geom_vinum_var.h>
 #include <geom/vinum/geom_vinum.h>
 
 /* General 'remove' routine. */
 void
 gv_remove(struct g_geom *gp, struct gctl_req *req)
 {
 	struct gv_softc *sc;
 	struct gv_volume *v;
 	struct gv_plex *p;
 	struct gv_sd *s;
 	struct gv_drive *d;
 	int *argc, *flags;
 	char *argv, buf[20];
 	int i, type;
 
 	argc = gctl_get_paraml(req, "argc", sizeof(*argc));
 
 	if (argc == NULL || *argc == 0) {
 		gctl_error(req, "no arguments given");
 		return;
 	}
 
 	flags = gctl_get_paraml(req, "flags", sizeof(*flags));
 	if (flags == NULL) {
 		gctl_error(req, "no flags given");
 		return;
 	}
 
 	sc = gp->softc;
 
 	/* XXX config locking */
 
 	for (i = 0; i < *argc; i++) {
 		snprintf(buf, sizeof(buf), "argv%d", i);
 		argv = gctl_get_param(req, buf, NULL);
 		if (argv == NULL)
 			continue;
 		type = gv_object_type(sc, argv);
 		switch (type) {
 		case GV_TYPE_VOL:
 			v = gv_find_vol(sc, argv);
 
 			/*
 			 * If this volume has plexes, we want a recursive
 			 * removal.
 			 */
 			if (!LIST_EMPTY(&v->plexes) && !(*flags & GV_FLAG_R)) {
 				gctl_error(req, "volume '%s' has attached "
 				    "plexes - need recursive removal", v->name);
 				return;
 			}
 
 			gv_post_event(sc, GV_EVENT_RM_VOLUME, v, NULL, 0, 0);
 			break;
 
 		case GV_TYPE_PLEX:
 			p = gv_find_plex(sc, argv);
 
 			/*
 			 * If this plex has subdisks, we want a recursive
 			 * removal.
 			 */
 			if (!LIST_EMPTY(&p->subdisks) &&
 			    !(*flags & GV_FLAG_R)) {
 				gctl_error(req, "plex '%s' has attached "
 				    "subdisks - need recursive removal",
 				    p->name);
 				return;
 			}
 
 			/* Don't allow removal of the only plex of a volume. */
 			if (p->vol_sc != NULL && p->vol_sc->plexcount == 1) {
 				gctl_error(req, "plex '%s' is still attached "
 				    "to volume '%s'", p->name, p->volume);
 				return;
 			}
 
 			gv_post_event(sc, GV_EVENT_RM_PLEX, p, NULL, 0, 0);
 			break;
 
 		case GV_TYPE_SD:
 			s = gv_find_sd(sc, argv);
 
 			/* Don't allow removal if attached to a plex. */
 			if (s->plex_sc != NULL) {
 				gctl_error(req, "subdisk '%s' is still attached"
 				    " to plex '%s'", s->name, s->plex_sc->name);
 				return;
 			}
 
 			gv_post_event(sc, GV_EVENT_RM_SD, s, NULL, 0, 0);
 			break;
 
 		case GV_TYPE_DRIVE:
 			d = gv_find_drive(sc, argv);
 			/* We don't allow to remove open drives. */
 			if (gv_consumer_is_open(d->consumer) &&
 			    !(*flags & GV_FLAG_F)) {
 				gctl_error(req, "drive '%s' is open", d->name);
 				return;
 			}
 
 			/* A drive with subdisks needs a recursive removal. */
 /*			if (!LIST_EMPTY(&d->subdisks) &&
 			    !(*flags & GV_FLAG_R)) {
 				gctl_error(req, "drive '%s' still has subdisks"
 				    " - need recursive removal", d->name);
 				return;
 			}*/
 
 			gv_post_event(sc, GV_EVENT_RM_DRIVE, d, NULL, *flags,
 			    0);
 			break;
 
 		default:
 			gctl_error(req, "unknown object '%s'", argv);
 			return;
 		}
 	}
 
 	gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
 }
 
 /* Resets configuration */
 int
 gv_resetconfig(struct gv_softc *sc)
 {
 	struct gv_drive *d, *d2;
 	struct gv_volume *v, *v2;
 	struct gv_plex *p, *p2;
 	struct gv_sd *s, *s2;
 
 	/* First make sure nothing is open. */
         LIST_FOREACH_SAFE(d, &sc->drives, drive, d2) {
 		if (gv_consumer_is_open(d->consumer)) {
 			return (GV_ERR_ISBUSY);
 		}
 	}
 
 	/* Make sure nothing is going on internally. */
 	LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2) {
 		if (p->flags & (GV_PLEX_REBUILDING | GV_PLEX_GROWING))
 			return (GV_ERR_ISBUSY);
 	}
 
 	/* Then if not, we remove everything. */
 	LIST_FOREACH_SAFE(s, &sc->subdisks, sd, s2)
 		gv_rm_sd(sc, s);
 	LIST_FOREACH_SAFE(d, &sc->drives, drive, d2)
 		gv_rm_drive(sc, d, 0);
 	LIST_FOREACH_SAFE(p, &sc->plexes, plex, p2)
 		gv_rm_plex(sc, p);
 	LIST_FOREACH_SAFE(v, &sc->volumes, volume, v2)
 		gv_rm_vol(sc, v);
 
 	gv_post_event(sc, GV_EVENT_SAVE_CONFIG, sc, NULL, 0, 0);
 
 	return (0);
 }
 
 /* Remove a volume. */
 void
 gv_rm_vol(struct gv_softc *sc, struct gv_volume *v)
 {
 	struct g_provider *pp;
 	struct gv_plex *p, *p2;
 
 	KASSERT(v != NULL, ("gv_rm_vol: NULL v"));
 	pp = v->provider;
 	KASSERT(pp != NULL, ("gv_rm_vol: NULL pp"));
 
 	/* Check if any of our consumers is open. */
 	if (gv_provider_is_open(pp)) {
 		G_VINUM_DEBUG(0, "unable to remove %s: volume still in use",
 		    v->name);
 		return;
 	}
 
 	/* Remove the plexes our volume has. */
 	LIST_FOREACH_SAFE(p, &v->plexes, in_volume, p2)
 		gv_rm_plex(sc, p);
 
 	/* Clean up. */
 	LIST_REMOVE(v, volume);
 	g_free(v);
 
 	/* Get rid of the volume's provider. */
 	if (pp != NULL) {
 		g_topology_lock();
-		pp->flags |= G_PF_WITHER;
-		g_orphan_provider(pp, ENXIO);
+		g_wither_provider(pp, ENXIO);
 		g_topology_unlock();
 	}
 }
 
 /* Remove a plex. */
 void
 gv_rm_plex(struct gv_softc *sc, struct gv_plex *p)
 {
 	struct gv_volume *v;
 	struct gv_sd *s, *s2;
 
 	KASSERT(p != NULL, ("gv_rm_plex: NULL p"));
 	v = p->vol_sc;
 
 	/* Check if any of our consumers is open. */
 	if (v != NULL && gv_provider_is_open(v->provider) && v->plexcount < 2) {
 		G_VINUM_DEBUG(0, "unable to remove %s: volume still in use",
 		    p->name);
 		return;
 	}
 
 	/* Remove the subdisks our plex has. */
 	LIST_FOREACH_SAFE(s, &p->subdisks, in_plex, s2)
 		gv_rm_sd(sc, s);
 
 	v = p->vol_sc;
 	/* Clean up and let our geom fade away. */
 	LIST_REMOVE(p, plex);
 	if (p->vol_sc != NULL) {
 		p->vol_sc->plexcount--;
 		LIST_REMOVE(p, in_volume);
 		p->vol_sc = NULL;
 		/* Correctly update the volume size. */
 		gv_update_vol_size(v, gv_vol_size(v));
 	}
 
 	g_free(p);
 }
 
 /* Remove a subdisk. */
 void
 gv_rm_sd(struct gv_softc *sc, struct gv_sd *s)
 {
 	struct gv_plex *p;
 	struct gv_volume *v;
 
 	KASSERT(s != NULL, ("gv_rm_sd: NULL s"));
 
 	p = s->plex_sc;
 	v = NULL;
 
 	/* Clean up. */
 	if (p != NULL) {
 		LIST_REMOVE(s, in_plex);
 		s->plex_sc = NULL;
 		p->sdcount--;
 		/* Update the plexsize. */
 		p->size = gv_plex_size(p);
 		v = p->vol_sc;
 		if (v != NULL) {
 			/* Update the size of our plex' volume. */
 			gv_update_vol_size(v, gv_vol_size(v));
 		}
 	}
 	if (s->drive_sc && !(s->drive_sc->flags & GV_DRIVE_REFERENCED))
 		LIST_REMOVE(s, from_drive);
 	LIST_REMOVE(s, sd);
 	gv_free_sd(s);
 	g_free(s);
 }
 
 /* Remove a drive. */
 void
 gv_rm_drive(struct gv_softc *sc, struct gv_drive *d, int flags)
 {
 	struct g_consumer *cp;
 	struct gv_freelist *fl, *fl2;
 	struct gv_plex *p;
 	struct gv_sd *s, *s2;
 	struct gv_volume *v;
 	struct gv_drive *d2;
 	int err;
 
 	KASSERT(d != NULL, ("gv_rm_drive: NULL d"));
 
 	cp = d->consumer;
 
 	if (cp != NULL) {
 		g_topology_lock();
 		err = g_access(cp, 0, 1, 0);
 		g_topology_unlock();
 
 		if (err) {
 			G_VINUM_DEBUG(0, "%s: unable to access '%s', "
 			    "errno: %d", __func__, cp->provider->name, err);
 			return;
 		}
 
 		/* Clear the Vinum Magic. */
 		d->hdr->magic = GV_NOMAGIC;
 		err = gv_write_header(cp, d->hdr);
 		if (err)
 			G_VINUM_DEBUG(0, "gv_rm_drive: error writing header to"
 			    " '%s', errno: %d", cp->provider->name, err);
 
 		g_topology_lock();
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		g_topology_unlock();
 	}
 
 	/* Remove all associated subdisks, plexes, volumes. */
 	if (flags & GV_FLAG_R) {
 		if (!LIST_EMPTY(&d->subdisks)) {
 			LIST_FOREACH_SAFE(s, &d->subdisks, from_drive, s2) {
 				p = s->plex_sc;
 				if (p != NULL) {
 					v = p->vol_sc;
 					if (v != NULL)
 						gv_rm_vol(sc, v);
 				}
 			}
 		}
 	}
 
 	/* Clean up. */
 	LIST_FOREACH_SAFE(fl, &d->freelist, freelist, fl2) {
 		LIST_REMOVE(fl, freelist);
 		g_free(fl);
 	}
 
 	LIST_REMOVE(d, drive);
 	g_free(d->hdr);
 
 	/* Put ourself into referenced state if we have subdisks. */
 	if (d->sdcount > 0) {
 		d->consumer = NULL;
 		d->hdr = NULL;
 		d->flags |= GV_DRIVE_REFERENCED;
 		snprintf(d->device, sizeof(d->device), "???");
 		d->size = 0;
 		d->avail = 0;
 		d->freelist_entries = 0;
 		LIST_FOREACH(s, &d->subdisks, from_drive) {
 			s->flags |= GV_SD_TASTED;
 			gv_set_sd_state(s, GV_SD_DOWN, GV_SETSTATE_FORCE);
 		}
 		/* Shuffle around so we keep gv_is_newer happy. */
 		LIST_REMOVE(d, drive);
 		d2 = LIST_FIRST(&sc->drives);
 		if (d2 == NULL)
 			LIST_INSERT_HEAD(&sc->drives, d, drive);
 		else
 			LIST_INSERT_AFTER(d2, d, drive);
 		return;
 	}
 	g_free(d);
 
 	gv_save_config(sc);
 }
Index: user/alc/PQ_LAUNDRY/sys/geom/virstor/g_virstor.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/geom/virstor/g_virstor.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/geom/virstor/g_virstor.c	(revision 306283)
@@ -1,1892 +1,1890 @@
 /*-
  * Copyright (c) 2006-2007 Ivan Voras <ivoras@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /* Implementation notes:
  * - "Components" are wrappers around providers that make up the
  *   virtual storage (i.e. a virstor has "physical" components)
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/bio.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/time.h>
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/mutex.h>
 #include <vm/uma.h>
 #include <geom/geom.h>
 
 #include <geom/virstor/g_virstor.h>
 #include <geom/virstor/g_virstor_md.h>
 
 FEATURE(g_virstor, "GEOM virtual storage support");
 
 /* Declare malloc(9) label */
 static MALLOC_DEFINE(M_GVIRSTOR, "gvirstor", "GEOM_VIRSTOR Data");
 
 /* GEOM class methods */
 static g_init_t g_virstor_init;
 static g_fini_t g_virstor_fini;
 static g_taste_t g_virstor_taste;
 static g_ctl_req_t g_virstor_config;
 static g_ctl_destroy_geom_t g_virstor_destroy_geom;
 
 /* Declare & initialize class structure ("geom class") */
 struct g_class g_virstor_class = {
 	.name =		G_VIRSTOR_CLASS_NAME,
 	.version =	G_VERSION,
 	.init =		g_virstor_init,
 	.fini =		g_virstor_fini,
 	.taste =	g_virstor_taste,
 	.ctlreq =	g_virstor_config,
 	.destroy_geom = g_virstor_destroy_geom
 	/* The .dumpconf and the rest are only usable for a geom instance, so
 	 * they will be set when such instance is created. */
 };
 
 /* Declare sysctl's and loader tunables */
 SYSCTL_DECL(_kern_geom);
 static SYSCTL_NODE(_kern_geom, OID_AUTO, virstor, CTLFLAG_RW, 0,
     "GEOM_GVIRSTOR information");
 
 static u_int g_virstor_debug = 2; /* XXX: lower to 2 when released to public */
 SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, debug, CTLFLAG_RWTUN, &g_virstor_debug,
     0, "Debug level (2=production, 5=normal, 15=excessive)");
 
 static u_int g_virstor_chunk_watermark = 100;
 SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, chunk_watermark, CTLFLAG_RWTUN,
     &g_virstor_chunk_watermark, 0,
     "Minimum number of free chunks before issuing administrative warning");
 
 static u_int g_virstor_component_watermark = 1;
 SYSCTL_UINT(_kern_geom_virstor, OID_AUTO, component_watermark, CTLFLAG_RWTUN,
     &g_virstor_component_watermark, 0,
     "Minimum number of free components before issuing administrative warning");
 
 static int read_metadata(struct g_consumer *, struct g_virstor_metadata *);
 static void write_metadata(struct g_consumer *, struct g_virstor_metadata *);
 static int clear_metadata(struct g_virstor_component *);
 static int add_provider_to_geom(struct g_virstor_softc *, struct g_provider *,
     struct g_virstor_metadata *);
 static struct g_geom *create_virstor_geom(struct g_class *,
     struct g_virstor_metadata *);
 static void virstor_check_and_run(struct g_virstor_softc *);
 static u_int virstor_valid_components(struct g_virstor_softc *);
 static int virstor_geom_destroy(struct g_virstor_softc *, boolean_t,
     boolean_t);
 static void remove_component(struct g_virstor_softc *,
     struct g_virstor_component *, boolean_t);
 static void bioq_dismantle(struct bio_queue_head *);
 static int allocate_chunk(struct g_virstor_softc *,
     struct g_virstor_component **, u_int *, u_int *);
 static void delay_destroy_consumer(void *, int);
 static void dump_component(struct g_virstor_component *comp);
 #if 0
 static void dump_me(struct virstor_map_entry *me, unsigned int nr);
 #endif
 
 static void virstor_ctl_stop(struct gctl_req *, struct g_class *);
 static void virstor_ctl_add(struct gctl_req *, struct g_class *);
 static void virstor_ctl_remove(struct gctl_req *, struct g_class *);
 static struct g_virstor_softc * virstor_find_geom(const struct g_class *,
     const char *);
 static void update_metadata(struct g_virstor_softc *);
 static void fill_metadata(struct g_virstor_softc *, struct g_virstor_metadata *,
     u_int, u_int);
 
 static void g_virstor_orphan(struct g_consumer *);
 static int g_virstor_access(struct g_provider *, int, int, int);
 static void g_virstor_start(struct bio *);
 static void g_virstor_dumpconf(struct sbuf *, const char *, struct g_geom *,
     struct g_consumer *, struct g_provider *);
 static void g_virstor_done(struct bio *);
 
 static void invalid_call(void);
 /*
  * Initialise GEOM class (per-class callback)
  */
 static void
 g_virstor_init(struct g_class *mp __unused)
 {
 
 	/* Catch map struct size mismatch at compile time; Map entries must
 	 * fit into MAXPHYS exactly, with no wasted space. */
 	CTASSERT(VIRSTOR_MAP_BLOCK_ENTRIES*VIRSTOR_MAP_ENTRY_SIZE == MAXPHYS);
 
 	/* Init UMA zones, TAILQ's, other global vars */
 }
 
 /*
  * Finalise GEOM class (per-class callback)
  */
 static void
 g_virstor_fini(struct g_class *mp __unused)
 {
 
 	/* Deinit UMA zones & global vars */
 }
 
 /*
  * Config (per-class callback)
  */
 static void
 g_virstor_config(struct gctl_req *req, struct g_class *cp, char const *verb)
 {
 	uint32_t *version;
 
 	g_topology_assert();
 
 	version = gctl_get_paraml(req, "version", sizeof(*version));
 	if (version == NULL) {
 		gctl_error(req, "Failed to get 'version' argument");
 		return;
 	}
 	if (*version != G_VIRSTOR_VERSION) {
 		gctl_error(req, "Userland and kernel versions out of sync");
 		return;
 	}
 
 	g_topology_unlock();
 	if (strcmp(verb, "add") == 0)
 		virstor_ctl_add(req, cp);
 	else if (strcmp(verb, "stop") == 0 || strcmp(verb, "destroy") == 0)
 		virstor_ctl_stop(req, cp);
 	else if (strcmp(verb, "remove") == 0)
 		virstor_ctl_remove(req, cp);
 	else
 		gctl_error(req, "unknown verb: '%s'", verb);
 	g_topology_lock();
 }
 
 /*
  * "stop" verb from userland
  */
 static void
 virstor_ctl_stop(struct gctl_req *req, struct g_class *cp)
 {
 	int *force, *nargs;
 	int i;
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof *nargs);
 	if (nargs == NULL) {
 		gctl_error(req, "Error fetching argument '%s'", "nargs");
 		return;
 	}
 	if (*nargs < 1) {
 		gctl_error(req, "Invalid number of arguments");
 		return;
 	}
 	force = gctl_get_paraml(req, "force", sizeof *force);
 	if (force == NULL) {
 		gctl_error(req, "Error fetching argument '%s'", "force");
 		return;
 	}
 
 	g_topology_lock();
 	for (i = 0; i < *nargs; i++) {
 		char param[8];
 		const char *name;
 		struct g_virstor_softc *sc;
 		int error;
 
 		sprintf(param, "arg%d", i);
 		name = gctl_get_asciiparam(req, param);
 		if (name == NULL) {
 			gctl_error(req, "No 'arg%d' argument", i);
 			g_topology_unlock();
 			return;
 		}
 		sc = virstor_find_geom(cp, name);
 		if (sc == NULL) {
 			gctl_error(req, "Don't know anything about '%s'", name);
 			g_topology_unlock();
 			return;
 		}
 
 		LOG_MSG(LVL_INFO, "Stopping %s by the userland command",
 		    sc->geom->name);
 		update_metadata(sc);
 		if ((error = virstor_geom_destroy(sc, TRUE, TRUE)) != 0) {
 			LOG_MSG(LVL_ERROR, "Cannot destroy %s: %d",
 			    sc->geom->name, error);
 		}
 	}
 	g_topology_unlock();
 }
 
 /*
  * "add" verb from userland - add new component(s) to the structure.
  * This will be done all at once in here, without going through the
  * .taste function for new components.
  */
 static void
 virstor_ctl_add(struct gctl_req *req, struct g_class *cp)
 {
 	/* Note: while this is going on, I/O is being done on
 	 * the g_up and g_down threads. The idea is to make changes
 	 * to softc members in a way that can atomically activate
 	 * them all at once. */
 	struct g_virstor_softc *sc;
 	int *hardcode, *nargs;
 	const char *geom_name;	/* geom to add a component to */
 	struct g_consumer *fcp;
 	struct g_virstor_bio_q *bq;
 	u_int added;
 	int error;
 	int i;
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "Error fetching argument '%s'", "nargs");
 		return;
 	}
 	if (*nargs < 2) {
 		gctl_error(req, "Invalid number of arguments");
 		return;
 	}
 	hardcode = gctl_get_paraml(req, "hardcode", sizeof(*hardcode));
 	if (hardcode == NULL) {
 		gctl_error(req, "Error fetching argument '%s'", "hardcode");
 		return;
 	}
 
 	/* Find "our" geom */
 	geom_name = gctl_get_asciiparam(req, "arg0");
 	if (geom_name == NULL) {
 		gctl_error(req, "Error fetching argument '%s'", "geom_name (arg0)");
 		return;
 	}
 	sc = virstor_find_geom(cp, geom_name);
 	if (sc == NULL) {
 		gctl_error(req, "Don't know anything about '%s'", geom_name);
 		return;
 	}
 
 	if (virstor_valid_components(sc) != sc->n_components) {
 		LOG_MSG(LVL_ERROR, "Cannot add components to incomplete "
 		    "virstor %s", sc->geom->name);
 		gctl_error(req, "Virstor %s is incomplete", sc->geom->name);
 		return;
 	}
 
 	fcp = sc->components[0].gcons;
 	added = 0;
 	g_topology_lock();
 	for (i = 1; i < *nargs; i++) {
 		struct g_virstor_metadata md;
 		char aname[8];
 		const char *prov_name;
 		struct g_provider *pp;
 		struct g_consumer *cp;
 		u_int nc;
 		u_int j;
 
 		snprintf(aname, sizeof aname, "arg%d", i);
 		prov_name = gctl_get_asciiparam(req, aname);
 		if (prov_name == NULL) {
 			gctl_error(req, "Error fetching argument '%s'", aname);
 			g_topology_unlock();
 			return;
 		}
 		if (strncmp(prov_name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
 			prov_name += sizeof(_PATH_DEV) - 1;
 
 		pp = g_provider_by_name(prov_name);
 		if (pp == NULL) {
 			/* This is the most common error so be verbose about it */
 			if (added != 0) {
 				gctl_error(req, "Invalid provider: '%s' (added"
 				    " %u components)", prov_name, added);
 				update_metadata(sc);
 			} else {
 				gctl_error(req, "Invalid provider: '%s'",
 				    prov_name);
 			}
 			g_topology_unlock();
 			return;
 		}
 		cp = g_new_consumer(sc->geom);
 		if (cp == NULL) {
 			gctl_error(req, "Cannot create consumer");
 			g_topology_unlock();
 			return;
 		}
 		error = g_attach(cp, pp);
 		if (error != 0) {
 			gctl_error(req, "Cannot attach a consumer to %s",
 			    pp->name);
 			g_destroy_consumer(cp);
 			g_topology_unlock();
 			return;
 		}
 		if (fcp->acr != 0 || fcp->acw != 0 || fcp->ace != 0) {
 			error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
 			if (error != 0) {
 				gctl_error(req, "Access request failed for %s",
 				    pp->name);
 				g_destroy_consumer(cp);
 				g_topology_unlock();
 				return;
 			}
 		}
 		if (fcp->provider->sectorsize != pp->sectorsize) {
 			gctl_error(req, "Sector size doesn't fit for %s",
 			    pp->name);
 			g_destroy_consumer(cp);
 			g_topology_unlock();
 			return;
 		}
 		for (j = 0; j < sc->n_components; j++) {
 			if (strcmp(sc->components[j].gcons->provider->name,
 			    pp->name) == 0) {
 				gctl_error(req, "Component %s already in %s",
 				    pp->name, sc->geom->name);
 				g_destroy_consumer(cp);
 				g_topology_unlock();
 				return;
 			}
 		}
 		sc->components = realloc(sc->components,
 		    sizeof(*sc->components) * (sc->n_components + 1),
 		    M_GVIRSTOR, M_WAITOK);
 
 		nc = sc->n_components;
 		sc->components[nc].gcons = cp;
 		sc->components[nc].sc = sc;
 		sc->components[nc].index = nc;
 		sc->components[nc].chunk_count = cp->provider->mediasize /
 		    sc->chunk_size;
 		sc->components[nc].chunk_next = 0;
 		sc->components[nc].chunk_reserved = 0;
 
 		if (sc->components[nc].chunk_count < 4) {
 			gctl_error(req, "Provider too small: %s",
 			    cp->provider->name);
 			g_destroy_consumer(cp);
 			g_topology_unlock();
 			return;
 		}
 		fill_metadata(sc, &md, nc, *hardcode);
 		write_metadata(cp, &md);
 		/* The new component becomes visible when n_components is
 		 * incremented */
 		sc->n_components++;
 		added++;
 
 	}
 	/* This call to update_metadata() is critical. In case there's a
 	 * power failure in the middle of it and some components are updated
 	 * while others are not, there will be trouble on next .taste() iff
 	 * a non-updated component is detected first */
 	update_metadata(sc);
 	g_topology_unlock();
 	LOG_MSG(LVL_INFO, "Added %d component(s) to %s", added,
 	    sc->geom->name);
 	/* Fire off BIOs previously queued because there wasn't any
 	 * physical space left. If the BIOs still can't be satisfied
 	 * they will again be added to the end of the queue (during
 	 * which the mutex will be recursed) */
 	bq = malloc(sizeof(*bq), M_GVIRSTOR, M_WAITOK);
 	bq->bio = NULL;
 	mtx_lock(&sc->delayed_bio_q_mtx);
 	/* First, insert a sentinel to the queue end, so we don't
 	 * end up in an infinite loop if there's still no free
 	 * space available. */
 	STAILQ_INSERT_TAIL(&sc->delayed_bio_q, bq, linkage);
 	while (!STAILQ_EMPTY(&sc->delayed_bio_q)) {
 		bq = STAILQ_FIRST(&sc->delayed_bio_q);
 		if (bq->bio != NULL) {
 			g_virstor_start(bq->bio);
 			STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage);
 			free(bq, M_GVIRSTOR);
 		} else {
 			STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage);
 			free(bq, M_GVIRSTOR);
 			break;
 		}
 	}
 	mtx_unlock(&sc->delayed_bio_q_mtx);
 
 }
 
 /*
  * Find a geom handled by the class
  */
 static struct g_virstor_softc *
 virstor_find_geom(const struct g_class *cp, const char *name)
 {
 	struct g_geom *gp;
 
 	LIST_FOREACH(gp, &cp->geom, geom) {
 		if (strcmp(name, gp->name) == 0)
 			return (gp->softc);
 	}
 	return (NULL);
 }
 
 /*
  * Update metadata on all components to reflect the current state
  * of these fields:
  *    - chunk_next
  *    - flags
  *    - md_count
  * Expects things to be set up so write_metadata() can work, i.e.
  * the topology lock must be held.
  */
 static void
 update_metadata(struct g_virstor_softc *sc)
 {
 	struct g_virstor_metadata md;
 	u_int n;
 
 	if (virstor_valid_components(sc) != sc->n_components)
 		return; /* Incomplete device */
 	LOG_MSG(LVL_DEBUG, "Updating metadata on components for %s",
 	    sc->geom->name);
 	/* Update metadata on components */
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__,
 	    sc->geom->class->name, sc->geom->name);
 	g_topology_assert();
 	for (n = 0; n < sc->n_components; n++) {
 		read_metadata(sc->components[n].gcons, &md);
 		md.chunk_next = sc->components[n].chunk_next;
 		md.flags = sc->components[n].flags;
 		md.md_count = sc->n_components;
 		write_metadata(sc->components[n].gcons, &md);
 	}
 }
 
 /*
  * Fills metadata (struct md) from information stored in softc and the nc'th
  * component of virstor
  */
 static void
 fill_metadata(struct g_virstor_softc *sc, struct g_virstor_metadata *md,
     u_int nc, u_int hardcode)
 {
 	struct g_virstor_component *c;
 
 	bzero(md, sizeof *md);
 	c = &sc->components[nc];
 
 	strncpy(md->md_magic, G_VIRSTOR_MAGIC, sizeof md->md_magic);
 	md->md_version = G_VIRSTOR_VERSION;
 	strncpy(md->md_name, sc->geom->name, sizeof md->md_name);
 	md->md_id = sc->id;
 	md->md_virsize = sc->virsize;
 	md->md_chunk_size = sc->chunk_size;
 	md->md_count = sc->n_components;
 
 	if (hardcode) {
 		strncpy(md->provider, c->gcons->provider->name,
 		    sizeof md->provider);
 	}
 	md->no = nc;
 	md->provsize = c->gcons->provider->mediasize;
 	md->chunk_count = c->chunk_count;
 	md->chunk_next = c->chunk_next;
 	md->chunk_reserved = c->chunk_reserved;
 	md->flags = c->flags;
 }
 
 /*
  * Remove a component from virstor device.
  * Can only be done if the component is unallocated.
  */
 static void
 virstor_ctl_remove(struct gctl_req *req, struct g_class *cp)
 {
 	/* As this is executed in parallel to I/O, operations on virstor
 	 * structures must be as atomic as possible. */
 	struct g_virstor_softc *sc;
 	int *nargs;
 	const char *geom_name;
 	u_int removed;
 	int i;
 
 	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
 	if (nargs == NULL) {
 		gctl_error(req, "Error fetching argument '%s'", "nargs");
 		return;
 	}
 	if (*nargs < 2) {
 		gctl_error(req, "Invalid number of arguments");
 		return;
 	}
 	/* Find "our" geom */
 	geom_name = gctl_get_asciiparam(req, "arg0");
 	if (geom_name == NULL) {
 		gctl_error(req, "Error fetching argument '%s'",
 		    "geom_name (arg0)");
 		return;
 	}
 	sc = virstor_find_geom(cp, geom_name);
 	if (sc == NULL) {
 		gctl_error(req, "Don't know anything about '%s'", geom_name);
 		return;
 	}
 
 	if (virstor_valid_components(sc) != sc->n_components) {
 		LOG_MSG(LVL_ERROR, "Cannot remove components from incomplete "
 		    "virstor %s", sc->geom->name);
 		gctl_error(req, "Virstor %s is incomplete", sc->geom->name);
 		return;
 	}
 
 	removed = 0;
 	for (i = 1; i < *nargs; i++) {
 		char param[8];
 		const char *prov_name;
 		int j, found;
 		struct g_virstor_component *newcomp, *compbak;
 
 		sprintf(param, "arg%d", i);
 		prov_name = gctl_get_asciiparam(req, param);
 		if (prov_name == NULL) {
 			gctl_error(req, "Error fetching argument '%s'", param);
 			return;
 		}
 		if (strncmp(prov_name, _PATH_DEV, sizeof(_PATH_DEV) - 1) == 0)
 			prov_name += sizeof(_PATH_DEV) - 1;
 
 		found = -1;
 		for (j = 0; j < sc->n_components; j++) {
 			if (strcmp(sc->components[j].gcons->provider->name,
 			    prov_name) == 0) {
 				found = j;
 				break;
 			}
 		}
 		if (found == -1) {
 			LOG_MSG(LVL_ERROR, "No %s component in %s",
 			    prov_name, sc->geom->name);
 			continue;
 		}
 
 		compbak = sc->components;
 		newcomp = malloc(sc->n_components * sizeof(*sc->components),
 		    M_GVIRSTOR, M_WAITOK | M_ZERO);
 		bcopy(sc->components, newcomp, found * sizeof(*sc->components));
 		bcopy(&sc->components[found + 1], newcomp + found,
 		    found * sizeof(*sc->components));
 		if ((sc->components[j].flags & VIRSTOR_PROVIDER_ALLOCATED) != 0) {
 			LOG_MSG(LVL_ERROR, "Allocated provider %s cannot be "
 			    "removed from %s",
 			    prov_name, sc->geom->name);
 			free(newcomp, M_GVIRSTOR);
 			/* We'll consider this non-fatal error */
 			continue;
 		}
 		/* Renumerate unallocated components */
 		for (j = 0; j < sc->n_components-1; j++) {
 			if ((sc->components[j].flags &
 			    VIRSTOR_PROVIDER_ALLOCATED) == 0) {
 				sc->components[j].index = j;
 			}
 		}
 		/* This is the critical section. If a component allocation
 		 * event happens while both variables are not yet set,
 		 * there will be trouble. Something will panic on encountering
 		 * NULL sc->components[x].gcomp member.
 		 * Luckily, component allocation happens very rarely and
 		 * removing components is an abnormal action in any case. */
 		sc->components = newcomp;
 		sc->n_components--;
 		/* End critical section */
 
 		g_topology_lock();
 		if (clear_metadata(&compbak[found]) != 0) {
 			LOG_MSG(LVL_WARNING, "Trouble ahead: cannot clear "
 			    "metadata on %s", prov_name);
 		}
 		g_detach(compbak[found].gcons);
 		g_destroy_consumer(compbak[found].gcons);
 		g_topology_unlock();
 
 		free(compbak, M_GVIRSTOR);
 
 		removed++;
 	}
 
 	/* This call to update_metadata() is critical. In case there's a
 	 * power failure in the middle of it and some components are updated
 	 * while others are not, there will be trouble on next .taste() iff
 	 * a non-updated component is detected first */
 	g_topology_lock();
 	update_metadata(sc);
 	g_topology_unlock();
 	LOG_MSG(LVL_INFO, "Removed %d component(s) from %s", removed,
 	    sc->geom->name);
 }
 
 /*
  * Clear metadata sector on component
  */
 static int
 clear_metadata(struct g_virstor_component *comp)
 {
 	char *buf;
 	int error;
 
 	LOG_MSG(LVL_INFO, "Clearing metadata on %s",
 	    comp->gcons->provider->name);
 	g_topology_assert();
 	error = g_access(comp->gcons, 0, 1, 0);
 	if (error != 0)
 		return (error);
 	buf = malloc(comp->gcons->provider->sectorsize, M_GVIRSTOR,
 	    M_WAITOK | M_ZERO);
 	error = g_write_data(comp->gcons,
 	    comp->gcons->provider->mediasize -
 	    comp->gcons->provider->sectorsize,
 	    buf,
 	    comp->gcons->provider->sectorsize);
 	free(buf, M_GVIRSTOR);
 	g_access(comp->gcons, 0, -1, 0);
 	return (error);
 }
 
 /*
  * Destroy geom forcibly.
  */
 static int
 g_virstor_destroy_geom(struct gctl_req *req __unused, struct g_class *mp,
     struct g_geom *gp)
 {
 	struct g_virstor_softc *sc;
 	int exitval;
 
 	sc = gp->softc;
 	KASSERT(sc != NULL, ("%s: NULL sc", __func__));
 	
 	exitval = 0;
 	LOG_MSG(LVL_DEBUG, "%s called for %s, sc=%p", __func__, gp->name,
 	    gp->softc);
 
 	if (sc != NULL) {
 #ifdef INVARIANTS
 		char *buf;
 		int error;
 		off_t off;
 		int isclean, count;
 		int n;
 
 		LOG_MSG(LVL_INFO, "INVARIANTS detected");
 		LOG_MSG(LVL_INFO, "Verifying allocation "
 		    "table for %s", sc->geom->name);
 		count = 0;
 		for (n = 0; n < sc->chunk_count; n++) {
 			if (sc->map[n].flags || VIRSTOR_MAP_ALLOCATED != 0)
 				count++;
 		}
 		LOG_MSG(LVL_INFO, "Device %s has %d allocated chunks",
 		    sc->geom->name, count);
 		n = off = count = 0;
 		isclean = 1;
 		if (virstor_valid_components(sc) != sc->n_components) {
 			/* This is a incomplete virstor device (not all
 			 * components have been found) */
 			LOG_MSG(LVL_ERROR, "Device %s is incomplete",
 			    sc->geom->name);
 			goto bailout;
 		}
 		error = g_access(sc->components[0].gcons, 1, 0, 0);
 		KASSERT(error == 0, ("%s: g_access failed (%d)", __func__,
 		    error));
 		/* Compare the whole on-disk allocation table with what's
 		 * currently in memory */
 		while (n < sc->chunk_count) {
 			buf = g_read_data(sc->components[0].gcons, off,
 			    sc->sectorsize, &error);
 			KASSERT(buf != NULL, ("g_read_data returned NULL (%d) "
 			    "for read at %jd", error, off));
 			if (bcmp(buf, &sc->map[n], sc->sectorsize) != 0) {
 				LOG_MSG(LVL_ERROR, "ERROR in allocation table, "
 				    "entry %d, offset %jd", n, off);
 				isclean = 0;
 				count++;
 			}
 			n += sc->me_per_sector;
 			off += sc->sectorsize;
 			g_free(buf);
 		}
 		error = g_access(sc->components[0].gcons, -1, 0, 0);
 		KASSERT(error == 0, ("%s: g_access failed (%d) on exit",
 		    __func__, error));
 		if (isclean != 1) {
 			LOG_MSG(LVL_ERROR, "ALLOCATION TABLE CORRUPTED FOR %s "
 			    "(%d sectors don't match, max %zu allocations)",
 			    sc->geom->name, count,
 			    count * sc->me_per_sector);
 		} else {
 			LOG_MSG(LVL_INFO, "Allocation table ok for %s",
 			    sc->geom->name);
 		}
 bailout:
 #endif
 		update_metadata(sc);
 		virstor_geom_destroy(sc, FALSE, FALSE);
 		exitval = EAGAIN;
 	} else
 		exitval = 0;
 	return (exitval);
 }
 
 /*
  * Taste event (per-class callback)
  * Examines a provider and creates geom instances if needed
  */
 static struct g_geom *
 g_virstor_taste(struct g_class *mp, struct g_provider *pp, int flags)
 {
 	struct g_virstor_metadata md;
 	struct g_geom *gp;
 	struct g_consumer *cp;
 	struct g_virstor_softc *sc;
 	int error;
 
 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
 	g_topology_assert();
 	LOG_MSG(LVL_DEBUG, "Tasting %s", pp->name);
 
 	/* We need a dummy geom to attach a consumer to the given provider */
 	gp = g_new_geomf(mp, "virstor:taste.helper");
 	gp->start = (void *)invalid_call;	/* XXX: hacked up so the        */
 	gp->access = (void *)invalid_call;	/* compiler doesn't complain.   */
 	gp->orphan = (void *)invalid_call;	/* I really want these to fail. */
 
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
 	error = read_metadata(cp, &md);
 	g_detach(cp);
 	g_destroy_consumer(cp);
 	g_destroy_geom(gp);
 
 	if (error != 0)
 		return (NULL);
 
 	if (strcmp(md.md_magic, G_VIRSTOR_MAGIC) != 0)
 		return (NULL);
 	if (md.md_version != G_VIRSTOR_VERSION) {
 		LOG_MSG(LVL_ERROR, "Kernel module version invalid "
 		    "to handle %s (%s) : %d should be %d",
 		    md.md_name, pp->name, md.md_version, G_VIRSTOR_VERSION);
 		return (NULL);
 	}
 	if (md.provsize != pp->mediasize)
 		return (NULL);
 
 	/* If the provider name is hardcoded, use the offered provider only
 	 * if it's been offered with its proper name (the one used in
 	 * the label command). */
 	if (md.provider[0] != '\0' &&
 	    !g_compare_names(md.provider, pp->name))
 		return (NULL);
 
 	/* Iterate all geoms this class already knows about to see if a new
 	 * geom instance of this class needs to be created (in case the provider
 	 * is first from a (possibly) multi-consumer geom) or it just needs
 	 * to be added to an existing instance. */
 	sc = NULL;
 	gp = NULL;
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc == NULL)
 			continue;
 		if (strcmp(md.md_name, sc->geom->name) != 0)
 			continue;
 		if (md.md_id != sc->id)
 			continue;
 		break;
 	}
 	if (gp != NULL) { /* We found an existing geom instance; add to it */
 		LOG_MSG(LVL_INFO, "Adding %s to %s", pp->name, md.md_name);
 		error = add_provider_to_geom(sc, pp, &md);
 		if (error != 0) {
 			LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)",
 			    pp->name, md.md_name, error);
 			return (NULL);
 		}
 	} else { /* New geom instance needs to be created */
 		gp = create_virstor_geom(mp, &md);
 		if (gp == NULL) {
 			LOG_MSG(LVL_ERROR, "Error creating new instance of "
 			    "class %s: %s", mp->name, md.md_name);
 			LOG_MSG(LVL_DEBUG, "Error creating %s at %s",
 			    md.md_name, pp->name);
 			return (NULL);
 		}
 		sc = gp->softc;
 		LOG_MSG(LVL_INFO, "Adding %s to %s (first found)", pp->name,
 		    md.md_name);
 		error = add_provider_to_geom(sc, pp, &md);
 		if (error != 0) {
 			LOG_MSG(LVL_ERROR, "Error adding %s to %s (error %d)",
 			    pp->name, md.md_name, error);
 			virstor_geom_destroy(sc, TRUE, FALSE);
 			return (NULL);
 		}
 	}
 
 	return (gp);
 }
 
 /*
  * Destroyes consumer passed to it in arguments. Used as a callback
  * on g_event queue.
  */
 static void
 delay_destroy_consumer(void *arg, int flags __unused)
 {
 	struct g_consumer *c = arg;
 	KASSERT(c != NULL, ("%s: invalid consumer", __func__));
 	LOG_MSG(LVL_DEBUG, "Consumer %s destroyed with delay",
 	    c->provider->name);
 	g_detach(c);
 	g_destroy_consumer(c);
 }
 
 /*
  * Remove a component (consumer) from geom instance; If it's the first
  * component being removed, orphan the provider to announce geom's being
  * dismantled
  */
 static void
 remove_component(struct g_virstor_softc *sc, struct g_virstor_component *comp,
     boolean_t delay)
 {
 	struct g_consumer *c;
 
 	KASSERT(comp->gcons != NULL, ("Component with no consumer in %s",
 	    sc->geom->name));
 	c = comp->gcons;
 
 	comp->gcons = NULL;
 	KASSERT(c->provider != NULL, ("%s: no provider", __func__));
 	LOG_MSG(LVL_DEBUG, "Component %s removed from %s", c->provider->name,
 	    sc->geom->name);
 	if (sc->provider != NULL) {
-		/* Whither, GEOM? */
-		sc->provider->flags |= G_PF_WITHER;
-		g_orphan_provider(sc->provider, ENXIO);
+		LOG_MSG(LVL_INFO, "Removing provider %s", sc->provider->name);
+		g_wither_provider(sc->provider, ENXIO);
 		sc->provider = NULL;
-		LOG_MSG(LVL_INFO, "Removing provider %s", sc->geom->name);
 	}
 
 	if (c->acr > 0 || c->acw > 0 || c->ace > 0)
 		g_access(c, -c->acr, -c->acw, -c->ace);
 	if (delay) {
 		/* Destroy consumer after it's tasted */
 		g_post_event(delay_destroy_consumer, c, M_WAITOK, NULL);
 	} else {
 		g_detach(c);
 		g_destroy_consumer(c);
 	}
 }
 
 /*
  * Destroy geom - called internally
  * See g_virstor_destroy_geom for the other one
  */
 static int
 virstor_geom_destroy(struct g_virstor_softc *sc, boolean_t force,
     boolean_t delay)
 {
 	struct g_provider *pp;
 	struct g_geom *gp;
 	u_int n;
 
 	g_topology_assert();
 
 	if (sc == NULL)
 		return (ENXIO);
 
 	pp = sc->provider;
 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
 		LOG_MSG(force ? LVL_WARNING : LVL_ERROR,
 		    "Device %s is still open.", pp->name);
 		if (!force)
 			return (EBUSY);
 	}
 
 	for (n = 0; n < sc->n_components; n++) {
 		if (sc->components[n].gcons != NULL)
 			remove_component(sc, &sc->components[n], delay);
 	}
 
 	gp = sc->geom;
 	gp->softc = NULL;
 
 	KASSERT(sc->provider == NULL, ("Provider still exists for %s",
 	    gp->name));
 
 	/* XXX: This might or might not work, since we're called with
 	 * the topology lock held. Also, it might panic the kernel if
 	 * the error'd BIO is in softupdates code. */
 	mtx_lock(&sc->delayed_bio_q_mtx);
 	while (!STAILQ_EMPTY(&sc->delayed_bio_q)) {
 		struct g_virstor_bio_q *bq;
 		bq = STAILQ_FIRST(&sc->delayed_bio_q);
 		bq->bio->bio_error = ENOSPC;
 		g_io_deliver(bq->bio, EIO);
 		STAILQ_REMOVE_HEAD(&sc->delayed_bio_q, linkage);
 		free(bq, M_GVIRSTOR);
 	}
 	mtx_unlock(&sc->delayed_bio_q_mtx);
 	mtx_destroy(&sc->delayed_bio_q_mtx);
 
 	free(sc->map, M_GVIRSTOR);
 	free(sc->components, M_GVIRSTOR);
 	bzero(sc, sizeof *sc);
 	free(sc, M_GVIRSTOR);
 
 	pp = LIST_FIRST(&gp->provider); /* We only offer one provider */
 	if (pp == NULL || (pp->acr == 0 && pp->acw == 0 && pp->ace == 0))
 		LOG_MSG(LVL_DEBUG, "Device %s destroyed", gp->name);
 
 	g_wither_geom(gp, ENXIO);
 
 	return (0);
 }
 
 /*
  * Utility function: read metadata & decode. Wants topology lock to be
  * held.
  */
 static int
 read_metadata(struct g_consumer *cp, struct g_virstor_metadata *md)
 {
 	struct g_provider *pp;
 	char *buf;
 	int error;
 
 	g_topology_assert();
 	error = g_access(cp, 1, 0, 0);
 	if (error != 0)
 		return (error);
 	pp = cp->provider;
 	g_topology_unlock();
 	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
 	    &error);
 	g_topology_lock();
 	g_access(cp, -1, 0, 0);
 	if (buf == NULL)
 		return (error);
 
 	virstor_metadata_decode(buf, md);
 	g_free(buf);
 
 	return (0);
 }
 
 /**
  * Utility function: encode & write metadata. Assumes topology lock is
  * held.
  *
  * There is no useful way of recovering from errors in this function,
  * not involving panicking the kernel. If the metadata cannot be written
  * the most we can do is notify the operator and hope he spots it and
  * replaces the broken drive.
  */
 static void
 write_metadata(struct g_consumer *cp, struct g_virstor_metadata *md)
 {
 	struct g_provider *pp;
 	char *buf;
 	int error;
 
 	KASSERT(cp != NULL && md != NULL && cp->provider != NULL,
 	    ("Something's fishy in %s", __func__));
 	LOG_MSG(LVL_DEBUG, "Writing metadata on %s", cp->provider->name);
 	g_topology_assert();
 	error = g_access(cp, 0, 1, 0);
 	if (error != 0) {
 		LOG_MSG(LVL_ERROR, "g_access(0,1,0) failed for %s: %d",
 		    cp->provider->name, error);
 		return;
 	}
 	pp = cp->provider;
 
 	buf = malloc(pp->sectorsize, M_GVIRSTOR, M_WAITOK);
 	virstor_metadata_encode(md, buf);
 	g_topology_unlock();
 	error = g_write_data(cp, pp->mediasize - pp->sectorsize, buf,
 	    pp->sectorsize);
 	g_topology_lock();
 	g_access(cp, 0, -1, 0);
 	free(buf, M_GVIRSTOR);
 
 	if (error != 0)
 		LOG_MSG(LVL_ERROR, "Error %d writing metadata to %s",
 		    error, cp->provider->name);
 }
 
 /*
  * Creates a new instance of this GEOM class, initialise softc
  */
 static struct g_geom *
 create_virstor_geom(struct g_class *mp, struct g_virstor_metadata *md)
 {
 	struct g_geom *gp;
 	struct g_virstor_softc *sc;
 
 	LOG_MSG(LVL_DEBUG, "Creating geom instance for %s (id=%u)",
 	    md->md_name, md->md_id);
 
 	if (md->md_count < 1 || md->md_chunk_size < 1 ||
 	    md->md_virsize < md->md_chunk_size) {
 		/* This is bogus configuration, and probably means data is
 		 * somehow corrupted. Panic, maybe? */
 		LOG_MSG(LVL_ERROR, "Nonsensical metadata information for %s",
 		    md->md_name);
 		return (NULL);
 	}
 
 	/* Check if it's already created */
 	LIST_FOREACH(gp, &mp->geom, geom) {
 		sc = gp->softc;
 		if (sc != NULL && strcmp(sc->geom->name, md->md_name) == 0) {
 			LOG_MSG(LVL_WARNING, "Geom %s already exists",
 			    md->md_name);
 			if (sc->id != md->md_id) {
 				LOG_MSG(LVL_ERROR,
 				    "Some stale or invalid components "
 				    "exist for virstor device named %s. "
 				    "You will need to <CLEAR> all stale "
 				    "components and maybe reconfigure "
 				    "the virstor device. Tune "
 				    "kern.geom.virstor.debug sysctl up "
 				    "for more information.",
 				    sc->geom->name);
 			}
 			return (NULL);
 		}
 	}
 	gp = g_new_geomf(mp, "%s", md->md_name);
 	gp->softc = NULL; /* to circumevent races that test softc */
 
 	gp->start = g_virstor_start;
 	gp->spoiled = g_virstor_orphan;
 	gp->orphan = g_virstor_orphan;
 	gp->access = g_virstor_access;
 	gp->dumpconf = g_virstor_dumpconf;
 
 	sc = malloc(sizeof(*sc), M_GVIRSTOR, M_WAITOK | M_ZERO);
 	sc->id = md->md_id;
 	sc->n_components = md->md_count;
 	sc->components = malloc(sizeof(struct g_virstor_component) * md->md_count,
 	    M_GVIRSTOR, M_WAITOK | M_ZERO);
 	sc->chunk_size = md->md_chunk_size;
 	sc->virsize = md->md_virsize;
 	STAILQ_INIT(&sc->delayed_bio_q);
 	mtx_init(&sc->delayed_bio_q_mtx, "gvirstor_delayed_bio_q_mtx",
 	    "gvirstor", MTX_DEF | MTX_RECURSE);
 
 	sc->geom = gp;
 	sc->provider = NULL; /* virstor_check_and_run will create it */
 	gp->softc = sc;
 
 	LOG_MSG(LVL_ANNOUNCE, "Device %s created", sc->geom->name);
 
 	return (gp);
 }
 
 /*
  * Add provider to a GEOM class instance
  */
 static int
 add_provider_to_geom(struct g_virstor_softc *sc, struct g_provider *pp,
     struct g_virstor_metadata *md)
 {
 	struct g_virstor_component *component;
 	struct g_consumer *cp, *fcp;
 	struct g_geom *gp;
 	int error;
 
 	if (md->no >= sc->n_components)
 		return (EINVAL);
 
 	/* "Current" compontent */
 	component = &(sc->components[md->no]);
 	if (component->gcons != NULL)
 		return (EEXIST);
 
 	gp = sc->geom;
 	fcp = LIST_FIRST(&gp->consumer);
 
 	cp = g_new_consumer(gp);
 	error = g_attach(cp, pp);
 
 	if (error != 0) {
 		g_destroy_consumer(cp);
 		return (error);
 	}
 
 	if (fcp != NULL) {
 		if (fcp->provider->sectorsize != pp->sectorsize) {
 			/* TODO: this can be made to work */
 			LOG_MSG(LVL_ERROR, "Provider %s of %s has invalid "
 			    "sector size (%d)", pp->name, sc->geom->name,
 			    pp->sectorsize);
 			return (EINVAL);
 		}
 		if (fcp->acr > 0 || fcp->acw || fcp->ace > 0) {
 			/* Replicate access permissions from first "live" consumer
 			 * to the new one */
 			error = g_access(cp, fcp->acr, fcp->acw, fcp->ace);
 			if (error != 0) {
 				g_detach(cp);
 				g_destroy_consumer(cp);
 				return (error);
 			}
 		}
 	}
 
 	/* Bring up a new component */
 	cp->private = component;
 	component->gcons = cp;
 	component->sc = sc;
 	component->index = md->no;
 	component->chunk_count = md->chunk_count;
 	component->chunk_next = md->chunk_next;
 	component->chunk_reserved = md->chunk_reserved;
 	component->flags = md->flags;
 
 	LOG_MSG(LVL_DEBUG, "%s attached to %s", pp->name, sc->geom->name);
 
 	virstor_check_and_run(sc);
 	return (0);
 }
 
 /*
  * Check if everything's ready to create the geom provider & device entry,
  * create and start provider.
  * Called ultimately by .taste, from g_event thread
  */
 static void
 virstor_check_and_run(struct g_virstor_softc *sc)
 {
 	off_t off;
 	size_t n, count;
 	int index;
 	int error;
 
 	if (virstor_valid_components(sc) != sc->n_components)
 		return;
 
 	if (virstor_valid_components(sc) == 0) {
 		/* This is actually a candidate for panic() */
 		LOG_MSG(LVL_ERROR, "No valid components for %s?",
 		    sc->provider->name);
 		return;
 	}
 
 	sc->sectorsize = sc->components[0].gcons->provider->sectorsize;
 
 	/* Initialise allocation map from the first consumer */
 	sc->chunk_count = sc->virsize / sc->chunk_size;
 	if (sc->chunk_count * (off_t)sc->chunk_size != sc->virsize) {
 		LOG_MSG(LVL_WARNING, "Device %s truncated to %ju bytes",
 		    sc->provider->name,
 		    sc->chunk_count * (off_t)sc->chunk_size);
 	}
 	sc->map_size = sc->chunk_count * sizeof *(sc->map);
 	/* The following allocation is in order of 4MB - 8MB */
 	sc->map = malloc(sc->map_size, M_GVIRSTOR, M_WAITOK);
 	KASSERT(sc->map != NULL, ("%s: Memory allocation error (%zu bytes) for %s",
 	    __func__, sc->map_size, sc->provider->name));
 	sc->map_sectors = sc->map_size / sc->sectorsize;
 
 	count = 0;
 	for (n = 0; n < sc->n_components; n++)
 		count += sc->components[n].chunk_count;
 	LOG_MSG(LVL_INFO, "Device %s has %zu physical chunks and %zu virtual "
 	    "(%zu KB chunks)",
 	    sc->geom->name, count, sc->chunk_count, sc->chunk_size / 1024);
 
 	error = g_access(sc->components[0].gcons, 1, 0, 0);
 	if (error != 0) {
 		LOG_MSG(LVL_ERROR, "Cannot acquire read access for %s to "
 		    "read allocation map for %s",
 		    sc->components[0].gcons->provider->name,
 		    sc->geom->name);
 		return;
 	}
 	/* Read in the allocation map */
 	LOG_MSG(LVL_DEBUG, "Reading map for %s from %s", sc->geom->name,
 	    sc->components[0].gcons->provider->name);
 	off = count = n = 0;
 	while (count < sc->map_size) {
 		struct g_virstor_map_entry *mapbuf;
 		size_t bs;
 
 		bs = MIN(MAXPHYS, sc->map_size - count);
 		if (bs % sc->sectorsize != 0) {
 			/* Check for alignment errors */
 			bs = rounddown(bs, sc->sectorsize);
 			if (bs == 0)
 				break;
 			LOG_MSG(LVL_ERROR, "Trouble: map is not sector-aligned "
 			    "for %s on %s", sc->geom->name,
 			    sc->components[0].gcons->provider->name);
 		}
 		mapbuf = g_read_data(sc->components[0].gcons, off, bs, &error);
 		if (mapbuf == NULL) {
 			free(sc->map, M_GVIRSTOR);
 			LOG_MSG(LVL_ERROR, "Error reading allocation map "
 			    "for %s from %s (offset %ju) (error %d)",
 			    sc->geom->name,
 			    sc->components[0].gcons->provider->name,
 			    off, error);
 			return;
 		}
 
 		bcopy(mapbuf, &sc->map[n], bs);
 		off += bs;
 		count += bs;
 		n += bs / sizeof *(sc->map);
 		g_free(mapbuf);
 	}
 	g_access(sc->components[0].gcons, -1, 0, 0);
 	LOG_MSG(LVL_DEBUG, "Read map for %s", sc->geom->name);
 
 	/* find first component with allocatable chunks */
 	index = -1;
 	for (n = 0; n < sc->n_components; n++) {
 		if (sc->components[n].chunk_next <
 		    sc->components[n].chunk_count) {
 			index = n;
 			break;
 		}
 	}
 	if (index == -1)
 		/* not found? set it to the last component and handle it
 		 * later */
 		index = sc->n_components - 1;
 
 	if (index >= sc->n_components - g_virstor_component_watermark - 1) {
 		LOG_MSG(LVL_WARNING, "Device %s running out of components "
 		    "(%d/%u: %s)", sc->geom->name,
 		    index+1,
 		    sc->n_components,
 		    sc->components[index].gcons->provider->name);
 	}
 	sc->curr_component = index;
 
 	if (sc->components[index].chunk_next >=
 	    sc->components[index].chunk_count - g_virstor_chunk_watermark) {
 		LOG_MSG(LVL_WARNING,
 		    "Component %s of %s is running out of free space "
 		    "(%u chunks left)",
 		    sc->components[index].gcons->provider->name,
 		    sc->geom->name, sc->components[index].chunk_count -
 		    sc->components[index].chunk_next);
 	}
 
 	sc->me_per_sector = sc->sectorsize / sizeof *(sc->map);
 	if (sc->sectorsize % sizeof *(sc->map) != 0) {
 		LOG_MSG(LVL_ERROR,
 		    "%s: Map entries don't fit exactly in a sector (%s)",
 		    __func__, sc->geom->name);
 		return;
 	}
 
 	/* Recalculate allocated chunks in components & at the same time
 	 * verify map data is sane. We could trust metadata on this, but
 	 * we want to make sure. */
 	for (n = 0; n < sc->n_components; n++)
 		sc->components[n].chunk_next = sc->components[n].chunk_reserved;
 
 	for (n = 0; n < sc->chunk_count; n++) {
 		if (sc->map[n].provider_no >= sc->n_components ||
 			sc->map[n].provider_chunk >=
 			sc->components[sc->map[n].provider_no].chunk_count) {
 			LOG_MSG(LVL_ERROR, "%s: Invalid entry %u in map for %s",
 			    __func__, (u_int)n, sc->geom->name);
 			LOG_MSG(LVL_ERROR, "%s: provider_no: %u, n_components: %u"
 			    " provider_chunk: %u, chunk_count: %u", __func__,
 			    sc->map[n].provider_no, sc->n_components,
 			    sc->map[n].provider_chunk,
 			    sc->components[sc->map[n].provider_no].chunk_count);
 			return;
 		}
 		if (sc->map[n].flags & VIRSTOR_MAP_ALLOCATED)
 			sc->components[sc->map[n].provider_no].chunk_next++;
 	}
 
 	sc->provider = g_new_providerf(sc->geom, "virstor/%s",
 	    sc->geom->name);
 
 	sc->provider->sectorsize = sc->sectorsize;
 	sc->provider->mediasize = sc->virsize;
 	g_error_provider(sc->provider, 0);
 
 	LOG_MSG(LVL_INFO, "%s activated", sc->provider->name);
 	LOG_MSG(LVL_DEBUG, "%s starting with current component %u, starting "
 	    "chunk %u", sc->provider->name, sc->curr_component,
 	    sc->components[sc->curr_component].chunk_next);
 }
 
 /*
  * Returns count of active providers in this geom instance
  */
 static u_int
 virstor_valid_components(struct g_virstor_softc *sc)
 {
 	unsigned int nc, i;
 
 	nc = 0;
 	KASSERT(sc != NULL, ("%s: softc is NULL", __func__));
 	KASSERT(sc->components != NULL, ("%s: sc->components is NULL", __func__));
 	for (i = 0; i < sc->n_components; i++)
 		if (sc->components[i].gcons != NULL)
 			nc++;
 	return (nc);
 }
 
 /*
  * Called when the consumer gets orphaned (?)
  */
 static void
 g_virstor_orphan(struct g_consumer *cp)
 {
 	struct g_virstor_softc *sc;
 	struct g_virstor_component *comp;
 	struct g_geom *gp;
 
 	g_topology_assert();
 	gp = cp->geom;
 	sc = gp->softc;
 	if (sc == NULL)
 		return;
 
 	comp = cp->private;
 	KASSERT(comp != NULL, ("%s: No component in private part of consumer",
 	    __func__));
 	remove_component(sc, comp, FALSE);
 	if (virstor_valid_components(sc) == 0)
 		virstor_geom_destroy(sc, TRUE, FALSE);
 }
 
 /*
  * Called to notify geom when it's been opened, and for what intent
  */
 static int
 g_virstor_access(struct g_provider *pp, int dr, int dw, int de)
 {
 	struct g_consumer *c;
 	struct g_virstor_softc *sc;
 	struct g_geom *gp;
 	int error;
 
 	KASSERT(pp != NULL, ("%s: NULL provider", __func__));
 	gp = pp->geom;
 	KASSERT(gp != NULL, ("%s: NULL geom", __func__));
 	sc = gp->softc;
 
 	if (sc == NULL) {
 		/* It seems that .access can be called with negative dr,dw,dx
 		 * in this case but I want to check for myself */
 		LOG_MSG(LVL_WARNING, "access(%d, %d, %d) for %s",
 		    dr, dw, de, pp->name);
 		/* This should only happen when geom is withered so
 		 * allow only negative requests */
 		KASSERT(dr <= 0 && dw <= 0 && de <= 0,
 		    ("%s: Positive access for %s", __func__, pp->name));
 		if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0)
 			LOG_MSG(LVL_DEBUG, "Device %s definitely destroyed",
 			    pp->name);
 		return (0);
 	}
 
 	/* Grab an exclusive bit to propagate on our consumers on first open */
 	if (pp->acr == 0 && pp->acw == 0 && pp->ace == 0)
 		de++;
 	/* ... drop it on close */
 	if (pp->acr + dr == 0 && pp->acw + dw == 0 && pp->ace + de == 0) {
 		de--;
 		update_metadata(sc);	/* Writes statistical information */
 	}
 
 	error = ENXIO;
 	LIST_FOREACH(c, &gp->consumer, consumer) {
 		KASSERT(c != NULL, ("%s: consumer is NULL", __func__));
 		error = g_access(c, dr, dw, de);
 		if (error != 0) {
 			struct g_consumer *c2;
 
 			/* Backout earlier changes */
 			LIST_FOREACH(c2, &gp->consumer, consumer) {
 				if (c2 == c) /* all eariler components fixed */
 					return (error);
 				g_access(c2, -dr, -dw, -de);
 			}
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Generate XML dump of current state
  */
 static void
 g_virstor_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
     struct g_consumer *cp, struct g_provider *pp)
 {
 	struct g_virstor_softc *sc;
 
 	g_topology_assert();
 	sc = gp->softc;
 
 	if (sc == NULL || pp != NULL)
 		return;
 
 	if (cp != NULL) {
 		/* For each component */
 		struct g_virstor_component *comp;
 
 		comp = cp->private;
 		if (comp == NULL)
 			return;
 		sbuf_printf(sb, "%s<ComponentIndex>%u</ComponentIndex>\n",
 		    indent, comp->index);
 		sbuf_printf(sb, "%s<ChunkCount>%u</ChunkCount>\n",
 		    indent, comp->chunk_count);
 		sbuf_printf(sb, "%s<ChunksUsed>%u</ChunksUsed>\n",
 		    indent, comp->chunk_next);
 		sbuf_printf(sb, "%s<ChunksReserved>%u</ChunksReserved>\n",
 		    indent, comp->chunk_reserved);
 		sbuf_printf(sb, "%s<StorageFree>%u%%</StorageFree>\n",
 		    indent,
 		    comp->chunk_next > 0 ? 100 -
 		    ((comp->chunk_next + comp->chunk_reserved) * 100) /
 		    comp->chunk_count : 100);
 	} else {
 		/* For the whole thing */
 		u_int count, used, i;
 		off_t size;
 
 		count = used = size = 0;
 		for (i = 0; i < sc->n_components; i++) {
 			if (sc->components[i].gcons != NULL) {
 				count += sc->components[i].chunk_count;
 				used += sc->components[i].chunk_next +
 				    sc->components[i].chunk_reserved;
 				size += sc->components[i].gcons->
 				    provider->mediasize;
 			}
 		}
 
 		sbuf_printf(sb, "%s<Status>"
 		    "Components=%u, Online=%u</Status>\n", indent,
 		    sc->n_components, virstor_valid_components(sc));
 		sbuf_printf(sb, "%s<State>%u%% physical free</State>\n",
 		    indent, 100-(used * 100) / count);
 		sbuf_printf(sb, "%s<ChunkSize>%zu</ChunkSize>\n", indent,
 		    sc->chunk_size);
 		sbuf_printf(sb, "%s<PhysicalFree>%u%%</PhysicalFree>\n",
 		    indent, used > 0 ? 100 - (used * 100) / count : 100);
 		sbuf_printf(sb, "%s<ChunkPhysicalCount>%u</ChunkPhysicalCount>\n",
 		    indent, count);
 		sbuf_printf(sb, "%s<ChunkVirtualCount>%zu</ChunkVirtualCount>\n",
 		    indent, sc->chunk_count);
 		sbuf_printf(sb, "%s<PhysicalBacking>%zu%%</PhysicalBacking>\n",
 		    indent,
 		    (count * 100) / sc->chunk_count);
 		sbuf_printf(sb, "%s<PhysicalBackingSize>%jd</PhysicalBackingSize>\n",
 		    indent, size);
 		sbuf_printf(sb, "%s<VirtualSize>%jd</VirtualSize>\n", indent,
 		    sc->virsize);
 	}
 }
 
 /*
  * GEOM .done handler
  * Can't use standard handler because one requested IO may
  * fork into additional data IOs
  */
 static void
 g_virstor_done(struct bio *b)
 {
 	struct g_virstor_softc *sc;
 	struct bio *parent_b;
 
 	parent_b = b->bio_parent;
 	sc = parent_b->bio_to->geom->softc;
 
 	if (b->bio_error != 0) {
 		LOG_MSG(LVL_ERROR, "Error %d for offset=%ju, length=%ju, %s",
 		    b->bio_error, b->bio_offset, b->bio_length,
 		    b->bio_to->name);
 		if (parent_b->bio_error == 0)
 			parent_b->bio_error = b->bio_error;
 	}
 
 	parent_b->bio_inbed++;
 	parent_b->bio_completed += b->bio_completed;
 
 	if (parent_b->bio_children == parent_b->bio_inbed) {
 		parent_b->bio_completed = parent_b->bio_length;
 		g_io_deliver(parent_b, parent_b->bio_error);
 	}
 	g_destroy_bio(b);
 }
 
 /*
  * I/O starts here
  * Called in g_down thread
  */
 static void
 g_virstor_start(struct bio *b)
 {
 	struct g_virstor_softc *sc;
 	struct g_virstor_component *comp;
 	struct bio *cb;
 	struct g_provider *pp;
 	char *addr;
 	off_t offset, length;
 	struct bio_queue_head bq;
 	size_t chunk_size;	/* cached for convenience */
 	u_int count;
 
 	pp = b->bio_to;
 	sc = pp->geom->softc;
 	KASSERT(sc != NULL, ("%s: no softc (error=%d, device=%s)", __func__,
 	    b->bio_to->error, b->bio_to->name));
 
 	LOG_REQ(LVL_MOREDEBUG, b, "%s", __func__);
 
 	switch (b->bio_cmd) {
 	case BIO_READ:
 	case BIO_WRITE:
 	case BIO_DELETE:
 		break;
 	default:
 		g_io_deliver(b, EOPNOTSUPP);
 		return;
 	}
 
 	LOG_MSG(LVL_DEBUG2, "BIO arrived, size=%ju", b->bio_length);
 	bioq_init(&bq);
 
 	chunk_size = sc->chunk_size;
 	addr = b->bio_data;
 	offset = b->bio_offset;	/* virtual offset and length */
 	length = b->bio_length;
 
 	while (length > 0) {
 		size_t chunk_index, in_chunk_offset, in_chunk_length;
 		struct virstor_map_entry *me;
 
 		chunk_index = offset / chunk_size; /* round downwards */
 		in_chunk_offset = offset % chunk_size;
 		in_chunk_length = min(length, chunk_size - in_chunk_offset);
 		LOG_MSG(LVL_DEBUG, "Mapped %s(%ju, %ju) to (%zu,%zu,%zu)",
 		    b->bio_cmd == BIO_READ ? "R" : "W",
 		    offset, length,
 		    chunk_index, in_chunk_offset, in_chunk_length);
 		me = &sc->map[chunk_index];
 
 		if (b->bio_cmd == BIO_READ || b->bio_cmd == BIO_DELETE) {
 			if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) {
 				/* Reads from unallocated chunks return zeroed
 				 * buffers */
 				if (b->bio_cmd == BIO_READ)
 					bzero(addr, in_chunk_length);
 			} else {
 				comp = &sc->components[me->provider_no];
 
 				cb = g_clone_bio(b);
 				if (cb == NULL) {
 					bioq_dismantle(&bq);
 					if (b->bio_error == 0)
 						b->bio_error = ENOMEM;
 					g_io_deliver(b, b->bio_error);
 					return;
 				}
 				cb->bio_to = comp->gcons->provider;
 				cb->bio_done = g_virstor_done;
 				cb->bio_offset =
 				    (off_t)me->provider_chunk * (off_t)chunk_size
 				    + in_chunk_offset;
 				cb->bio_length = in_chunk_length;
 				cb->bio_data = addr;
 				cb->bio_caller1 = comp;
 				bioq_disksort(&bq, cb);
 			}
 		} else { /* handle BIO_WRITE */
 			KASSERT(b->bio_cmd == BIO_WRITE,
 			    ("%s: Unknown command %d", __func__,
 			    b->bio_cmd));
 
 			if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0) {
 				/* We have a virtual chunk, represented by
 				 * the "me" entry, but it's not yet allocated
 				 * (tied to) a physical chunk. So do it now. */
 				struct virstor_map_entry *data_me;
 				u_int phys_chunk, comp_no;
 				off_t s_offset;
 				int error;
 
 				error = allocate_chunk(sc, &comp, &comp_no,
 				    &phys_chunk);
 				if (error != 0) {
 					/* We cannot allocate a physical chunk
 					 * to satisfy this request, so we'll
 					 * delay it to when we can...
 					 * XXX: this will prevent the fs from
 					 * being umounted! */
 					struct g_virstor_bio_q *biq;
 					biq = malloc(sizeof *biq, M_GVIRSTOR,
 					    M_NOWAIT);
 					if (biq == NULL) {
 						bioq_dismantle(&bq);
 						if (b->bio_error == 0)
 							b->bio_error = ENOMEM;
 						g_io_deliver(b, b->bio_error);
 						return;
 					}
 					biq->bio = b;
 					mtx_lock(&sc->delayed_bio_q_mtx);
 					STAILQ_INSERT_TAIL(&sc->delayed_bio_q,
 					    biq, linkage);
 					mtx_unlock(&sc->delayed_bio_q_mtx);
 					LOG_MSG(LVL_WARNING, "Delaying BIO "
 					    "(size=%ju) until free physical "
 					    "space can be found on %s",
 					    b->bio_length,
 					    sc->provider->name);
 					return;
 				}
 				LOG_MSG(LVL_DEBUG, "Allocated chunk %u on %s "
 				    "for %s",
 				    phys_chunk,
 				    comp->gcons->provider->name,
 				    sc->provider->name);
 
 				me->provider_no = comp_no;
 				me->provider_chunk = phys_chunk;
 				me->flags |= VIRSTOR_MAP_ALLOCATED;
 
 				cb = g_clone_bio(b);
 				if (cb == NULL) {
 					me->flags &= ~VIRSTOR_MAP_ALLOCATED;
 					me->provider_no = 0;
 					me->provider_chunk = 0;
 					bioq_dismantle(&bq);
 					if (b->bio_error == 0)
 						b->bio_error = ENOMEM;
 					g_io_deliver(b, b->bio_error);
 					return;
 				}
 
 				/* The allocation table is stored continuously
 				 * at the start of the drive. We need to
 				 * calculate the offset of the sector that holds
 				 * this map entry both on the drive and in the
 				 * map array.
 				 * sc_offset will end up pointing to the drive
 				 * sector. */
 				s_offset = chunk_index * sizeof *me;
 				s_offset = rounddown(s_offset, sc->sectorsize);
 
 				/* data_me points to map entry sector
 				 * in memory (analogous to offset) */
 				data_me = &sc->map[rounddown(chunk_index,
 				    sc->me_per_sector)];
 
 				/* Commit sector with map entry to storage */
 				cb->bio_to = sc->components[0].gcons->provider;
 				cb->bio_done = g_virstor_done;
 				cb->bio_offset = s_offset;
 				cb->bio_data = (char *)data_me;
 				cb->bio_length = sc->sectorsize;
 				cb->bio_caller1 = &sc->components[0];
 				bioq_disksort(&bq, cb);
 			}
 
 			comp = &sc->components[me->provider_no];
 			cb = g_clone_bio(b);
 			if (cb == NULL) {
 				bioq_dismantle(&bq);
 				if (b->bio_error == 0)
 					b->bio_error = ENOMEM;
 				g_io_deliver(b, b->bio_error);
 				return;
 			}
 			/* Finally, handle the data */
 			cb->bio_to = comp->gcons->provider;
 			cb->bio_done = g_virstor_done;
 			cb->bio_offset = (off_t)me->provider_chunk*(off_t)chunk_size +
 			    in_chunk_offset;
 			cb->bio_length = in_chunk_length;
 			cb->bio_data = addr;
 			cb->bio_caller1 = comp;
 			bioq_disksort(&bq, cb);
 		}
 		addr += in_chunk_length;
 		length -= in_chunk_length;
 		offset += in_chunk_length;
 	}
 
 	/* Fire off bio's here */
 	count = 0;
 	for (cb = bioq_first(&bq); cb != NULL; cb = bioq_first(&bq)) {
 		bioq_remove(&bq, cb);
 		LOG_REQ(LVL_MOREDEBUG, cb, "Firing request");
 		comp = cb->bio_caller1;
 		cb->bio_caller1 = NULL;
 		LOG_MSG(LVL_DEBUG, " firing bio, offset=%ju, length=%ju",
 		    cb->bio_offset, cb->bio_length);
 		g_io_request(cb, comp->gcons);
 		count++;
 	}
 	if (count == 0) { /* We handled everything locally */
 		b->bio_completed = b->bio_length;
 		g_io_deliver(b, 0);
 	}
 
 }
 
 /*
  * Allocate a chunk from a physical provider. Returns physical component,
  * chunk index relative to the component and the component's index.
  */
 static int
 allocate_chunk(struct g_virstor_softc *sc, struct g_virstor_component **comp,
     u_int *comp_no_p, u_int *chunk)
 {
 	u_int comp_no;
 
 	KASSERT(sc->curr_component < sc->n_components,
 	    ("%s: Invalid curr_component: %u",  __func__, sc->curr_component));
 
 	comp_no = sc->curr_component;
 	*comp = &sc->components[comp_no];
 	dump_component(*comp);
 	if ((*comp)->chunk_next >= (*comp)->chunk_count) {
 		/* This component is full. Allocate next component */
 		if (comp_no >= sc->n_components-1) {
 			LOG_MSG(LVL_ERROR, "All physical space allocated for %s",
 			    sc->geom->name);
 			return (-1);
 		}
 		(*comp)->flags &= ~VIRSTOR_PROVIDER_CURRENT;
 		sc->curr_component = ++comp_no;
 
 		*comp = &sc->components[comp_no];
 		if (comp_no >= sc->n_components - g_virstor_component_watermark-1)
 			LOG_MSG(LVL_WARNING, "Device %s running out of components "
 			    "(switching to %u/%u: %s)", sc->geom->name,
 			    comp_no+1, sc->n_components,
 			    (*comp)->gcons->provider->name);
 		/* Take care not to overwrite reserved chunks */
 		if ( (*comp)->chunk_reserved > 0 &&
 		    (*comp)->chunk_next < (*comp)->chunk_reserved)
 			(*comp)->chunk_next = (*comp)->chunk_reserved;
 
 		(*comp)->flags |=
 		    VIRSTOR_PROVIDER_ALLOCATED | VIRSTOR_PROVIDER_CURRENT;
 		dump_component(*comp);
 		*comp_no_p = comp_no;
 		*chunk = (*comp)->chunk_next++;
 	} else {
 		*comp_no_p = comp_no;
 		*chunk = (*comp)->chunk_next++;
 	}
 	return (0);
 }
 
 /* Dump a component */
 static void
 dump_component(struct g_virstor_component *comp)
 {
 
 	if (g_virstor_debug < LVL_DEBUG2)
 		return;
 	printf("Component %d: %s\n", comp->index, comp->gcons->provider->name);
 	printf("  chunk_count: %u\n", comp->chunk_count);
 	printf("   chunk_next: %u\n", comp->chunk_next);
 	printf("        flags: %u\n", comp->flags);
 }
 
 #if 0
 /* Dump a map entry */
 static void
 dump_me(struct virstor_map_entry *me, unsigned int nr)
 {
 	if (g_virstor_debug < LVL_DEBUG)
 		return;
 	printf("VIRT. CHUNK #%d: ", nr);
 	if ((me->flags & VIRSTOR_MAP_ALLOCATED) == 0)
 		printf("(unallocated)\n");
 	else
 		printf("allocated at provider %u, provider_chunk %u\n",
 		    me->provider_no, me->provider_chunk);
 }
 #endif
 
 /*
  * Dismantle bio_queue and destroy its components
  */
 static void
 bioq_dismantle(struct bio_queue_head *bq)
 {
 	struct bio *b;
 
 	for (b = bioq_first(bq); b != NULL; b = bioq_first(bq)) {
 		bioq_remove(bq, b);
 		g_destroy_bio(b);
 	}
 }
 
 /*
  * The function that shouldn't be called.
  * When this is called, the stack is already garbled because of
  * argument mismatch. There's nothing to do now but panic, which is
  * accidentally the whole purpose of this function.
  * Motivation: to guard from accidentally calling geom methods when
  * they shouldn't be called. (see g_..._taste)
  */
 static void
 invalid_call(void)
 {
 	panic("invalid_call() has just been called. Something's fishy here.");
 }
 
 DECLARE_GEOM_CLASS(g_virstor_class, g_virstor); /* Let there be light */
Index: user/alc/PQ_LAUNDRY/sys/kern/kern_descrip.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/kern_descrip.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/kern/kern_descrip.c	(revision 306283)
@@ -1,4178 +1,4183 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_descrip.c	8.6 (Berkeley) 4/19/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_compat.h"
 #include "opt_ddb.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/selinfo.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sbuf.h>
 #include <sys/signalvar.h>
 #include <sys/socketvar.h>
 #include <sys/kdb.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <net/vnet.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/uma.h>
 #include <vm/vm.h>
 
 #include <ddb/ddb.h>
 
 static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
 static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
     "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
 MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
 
 MALLOC_DECLARE(M_FADVISE);
 
 static uma_zone_t file_zone;
 static uma_zone_t filedesc0_zone;
 
 static int	closefp(struct filedesc *fdp, int fd, struct file *fp,
 		    struct thread *td, int holdleaders);
 static int	fd_first_free(struct filedesc *fdp, int low, int size);
 static int	fd_last_used(struct filedesc *fdp, int size);
 static void	fdgrowtable(struct filedesc *fdp, int nfd);
 static void	fdgrowtable_exp(struct filedesc *fdp, int nfd);
 static void	fdunused(struct filedesc *fdp, int fd);
 static void	fdused(struct filedesc *fdp, int fd);
 static int	getmaxfd(struct thread *td);
 
 /*
  * Each process has:
  *
  * - An array of open file descriptors (fd_ofiles)
  * - An array of file flags (fd_ofileflags)
  * - A bitmap recording which descriptors are in use (fd_map)
  *
  * A process starts out with NDFILE descriptors.  The value of NDFILE has
  * been selected based the historical limit of 20 open files, and an
  * assumption that the majority of processes, especially short-lived
  * processes like shells, will never need more.
  *
  * If this initial allocation is exhausted, a larger descriptor table and
  * map are allocated dynamically, and the pointers in the process's struct
  * filedesc are updated to point to those.  This is repeated every time
  * the process runs out of file descriptors (provided it hasn't hit its
  * resource limit).
  *
  * Since threads may hold references to individual descriptor table
  * entries, the tables are never freed.  Instead, they are placed on a
  * linked list and freed only when the struct filedesc is released.
  */
 #define NDFILE		20
 #define NDSLOTSIZE	sizeof(NDSLOTTYPE)
 #define	NDENTRIES	(NDSLOTSIZE * __CHAR_BIT)
 #define NDSLOT(x)	((x) / NDENTRIES)
 #define NDBIT(x)	((NDSLOTTYPE)1 << ((x) % NDENTRIES))
 #define	NDSLOTS(x)	(((x) + NDENTRIES - 1) / NDENTRIES)
 
 /*
  * SLIST entry used to keep track of ofiles which must be reclaimed when
  * the process exits.
  */
 struct freetable {
 	struct fdescenttbl *ft_table;
 	SLIST_ENTRY(freetable) ft_next;
 };
 
 /*
  * Initial allocation: a filedesc structure + the head of SLIST used to
  * keep track of old ofiles + enough space for NDFILE descriptors.
  */
 
 struct fdescenttbl0 {
 	int	fdt_nfiles;
 	struct	filedescent fdt_ofiles[NDFILE];
 };
 
 struct filedesc0 {
 	struct filedesc fd_fd;
 	SLIST_HEAD(, freetable) fd_free;
 	struct	fdescenttbl0 fd_dfiles;
 	NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
 };
 
 /*
  * Descriptor management.
  */
 volatile int openfiles;			/* actual number of open files */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
 void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /*
  * If low >= size, just return low. Otherwise find the first zero bit in the
  * given bitmap, starting at low and not exceeding size - 1. Return size if
  * not found.
  */
 static int
 fd_first_free(struct filedesc *fdp, int low, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, maxoff;
 
 	if (low >= size)
 		return (low);
 
 	off = NDSLOT(low);
 	if (low % NDENTRIES) {
 		mask = ~(~(NDSLOTTYPE)0 >> (NDENTRIES - (low % NDENTRIES)));
 		if ((mask &= ~map[off]) != 0UL)
 			return (off * NDENTRIES + ffsl(mask) - 1);
 		++off;
 	}
 	for (maxoff = NDSLOTS(size); off < maxoff; ++off)
 		if (map[off] != ~0UL)
 			return (off * NDENTRIES + ffsl(~map[off]) - 1);
 	return (size);
 }
 
 /*
  * Find the highest non-zero bit in the given bitmap, starting at 0 and
  * not exceeding size - 1. Return -1 if not found.
  */
 static int
 fd_last_used(struct filedesc *fdp, int size)
 {
 	NDSLOTTYPE *map = fdp->fd_map;
 	NDSLOTTYPE mask;
 	int off, minoff;
 
 	off = NDSLOT(size);
 	if (size % NDENTRIES) {
 		mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
 		if ((mask &= map[off]) != 0)
 			return (off * NDENTRIES + flsl(mask) - 1);
 		--off;
 	}
 	for (minoff = NDSLOT(0); off >= minoff; --off)
 		if (map[off] != 0)
 			return (off * NDENTRIES + flsl(map[off]) - 1);
 	return (-1);
 }
 
 static int
 fdisused(struct filedesc *fdp, int fd)
 {
 
 	KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
 	    ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
 
 	return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
 }
 
 /*
  * Mark a file descriptor as used.
  */
 static void
 fdused_init(struct filedesc *fdp, int fd)
 {
 
 	KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
 
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 }
 
 static void
 fdused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	fdused_init(fdp, fd);
 	if (fd > fdp->fd_lastfile)
 		fdp->fd_lastfile = fd;
 	if (fd == fdp->fd_freefile)
 		fdp->fd_freefile = fd_first_free(fdp, fd, fdp->fd_nfiles);
 }
 
 /*
  * Mark a file descriptor as unused.
  */
 static void
 fdunused(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
 	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 	    ("fd=%d is still in use", fd));
 
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
 	if (fd == fdp->fd_lastfile)
 		fdp->fd_lastfile = fd_last_used(fdp, fd);
 }
 
 /*
  * Free a file descriptor.
  *
  * Avoid some work if fdp is about to be destroyed.
  */
 static inline void
 fdefree_last(struct filedescent *fde)
 {
 
 	filecaps_free(&fde->fde_caps);
 }
 
 static inline void
 fdfree(struct filedesc *fdp, int fd)
 {
 	struct filedescent *fde;
 
 	fde = &fdp->fd_ofiles[fd];
 #ifdef CAPABILITIES
 	seq_write_begin(&fde->fde_seq);
 #endif
 	fdefree_last(fde);
 	fde->fde_file = NULL;
 	fdunused(fdp, fd);
 #ifdef CAPABILITIES
 	seq_write_end(&fde->fde_seq);
 #endif
 }
 
 void
 pwd_ensure_dirs(void)
 {
 	struct filedesc *fdp;
 
 	fdp = curproc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	if (fdp->fd_cdir == NULL) {
 		fdp->fd_cdir = rootvnode;
 		VREF(rootvnode);
 	}
 	if (fdp->fd_rdir == NULL) {
 		fdp->fd_rdir = rootvnode;
 		VREF(rootvnode);
 	}
 	FILEDESC_XUNLOCK(fdp);
 }
 
 /*
  * System calls on descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getdtablesize_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 int
 sys_getdtablesize(struct thread *td, struct getdtablesize_args *uap)
 {
 #ifdef	RACCT
 	uint64_t lim;
 #endif
 
 	td->td_retval[0] =
 	    min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc);
 #ifdef	RACCT
 	PROC_LOCK(td->td_proc);
 	lim = racct_get_limit(td->td_proc, RACCT_NOFILE);
 	PROC_UNLOCK(td->td_proc);
 	if (lim < td->td_retval[0])
 		td->td_retval[0] = lim;
 #endif
 	return (0);
 }
 
 /*
  * Duplicate a file descriptor to a particular value.
  *
  * Note: keep in mind that a potential race condition exists when closing
  * descriptors from a shared descriptor table (via rfork).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup2_args {
 	u_int	from;
 	u_int	to;
 };
 #endif
 /* ARGSUSED */
 int
 sys_dup2(struct thread *td, struct dup2_args *uap)
 {
 
 	return (kern_dup(td, FDDUP_FIXED, 0, (int)uap->from, (int)uap->to));
 }
 
 /*
  * Duplicate a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct dup_args {
 	u_int	fd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_dup(struct thread *td, struct dup_args *uap)
 {
 
 	return (kern_dup(td, FDDUP_NORMAL, 0, (int)uap->fd, 0));
 }
 
 /*
  * The file control system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fcntl_args {
 	int	fd;
 	int	cmd;
 	long	arg;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fcntl(struct thread *td, struct fcntl_args *uap)
 {
 
 	return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
 }
 
 int
 kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
 {
 	struct flock fl;
 	struct __oflock ofl;
 	intptr_t arg1;
 	int error, newcmd;
 
 	error = 0;
 	newcmd = cmd;
 	switch (cmd) {
 	case F_OGETLK:
 	case F_OSETLK:
 	case F_OSETLKW:
 		/*
 		 * Convert old flock structure to new.
 		 */
 		error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
 		fl.l_start = ofl.l_start;
 		fl.l_len = ofl.l_len;
 		fl.l_pid = ofl.l_pid;
 		fl.l_type = ofl.l_type;
 		fl.l_whence = ofl.l_whence;
 		fl.l_sysid = 0;
 
 		switch (cmd) {
 		case F_OGETLK:
 			newcmd = F_GETLK;
 			break;
 		case F_OSETLK:
 			newcmd = F_SETLK;
 			break;
 		case F_OSETLKW:
 			newcmd = F_SETLKW;
 			break;
 		}
 		arg1 = (intptr_t)&fl;
 		break;
 	case F_GETLK:
 	case F_SETLK:
 	case F_SETLKW:
 	case F_SETLK_REMOTE:
 		error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
 		arg1 = (intptr_t)&fl;
 		break;
 	default:
 		arg1 = arg;
 		break;
 	}
 	if (error)
 		return (error);
 	error = kern_fcntl(td, fd, newcmd, arg1);
 	if (error)
 		return (error);
 	if (cmd == F_OGETLK) {
 		ofl.l_start = fl.l_start;
 		ofl.l_len = fl.l_len;
 		ofl.l_pid = fl.l_pid;
 		ofl.l_type = fl.l_type;
 		ofl.l_whence = fl.l_whence;
 		error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
 	} else if (cmd == F_GETLK) {
 		error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
 	}
 	return (error);
 }
 
 int
 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 {
 	struct filedesc *fdp;
 	struct flock *flp;
 	struct file *fp, *fp2;
 	struct filedescent *fde;
 	struct proc *p;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error, flg, tmp;
 	uint64_t bsize;
 	off_t foffset;
 
 	error = 0;
 	flg = F_POSIX;
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	switch (cmd) {
 	case F_DUPFD:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FCNTL, 0, fd, tmp);
 		break;
 
 	case F_DUPFD_CLOEXEC:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FCNTL, FDDUP_FLAG_CLOEXEC, fd, tmp);
 		break;
 
 	case F_DUP2FD:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FIXED, 0, fd, tmp);
 		break;
 
 	case F_DUP2FD_CLOEXEC:
 		tmp = arg;
 		error = kern_dup(td, FDDUP_FIXED, FDDUP_FLAG_CLOEXEC, fd, tmp);
 		break;
 
 	case F_GETFD:
 		error = EBADF;
 		FILEDESC_SLOCK(fdp);
 		fde = fdeget_locked(fdp, fd);
 		if (fde != NULL) {
 			td->td_retval[0] =
 			    (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
 			error = 0;
 		}
 		FILEDESC_SUNLOCK(fdp);
 		break;
 
 	case F_SETFD:
 		error = EBADF;
 		FILEDESC_XLOCK(fdp);
 		fde = fdeget_locked(fdp, fd);
 		if (fde != NULL) {
 			fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
 			    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
 			error = 0;
 		}
 		FILEDESC_XUNLOCK(fdp);
 		break;
 
 	case F_GETFL:
 		error = fget_fcntl(td, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp);
 		if (error != 0)
 			break;
 		td->td_retval[0] = OFLAGS(fp->f_flag);
 		fdrop(fp, td);
 		break;
 
 	case F_SETFL:
 		error = fget_fcntl(td, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp);
 		if (error != 0)
 			break;
 		do {
 			tmp = flg = fp->f_flag;
 			tmp &= ~FCNTLFLAGS;
 			tmp |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
 		} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
 		tmp = fp->f_flag & FNONBLOCK;
 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		tmp = fp->f_flag & FASYNC;
 		error = fo_ioctl(fp, FIOASYNC, &tmp, td->td_ucred, td);
 		if (error == 0) {
 			fdrop(fp, td);
 			break;
 		}
 		atomic_clear_int(&fp->f_flag, FNONBLOCK);
 		tmp = 0;
 		(void)fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_GETOWN:
 		error = fget_fcntl(td, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp);
 		if (error != 0)
 			break;
 		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
 		if (error == 0)
 			td->td_retval[0] = tmp;
 		fdrop(fp, td);
 		break;
 
 	case F_SETOWN:
 		error = fget_fcntl(td, fd,
 		    cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp);
 		if (error != 0)
 			break;
 		tmp = arg;
 		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_SETLK_REMOTE:
 		error = priv_check(td, PRIV_NFS_LOCKD);
 		if (error)
 			return (error);
 		flg = F_REMOTE;
 		goto do_setlk;
 
 	case F_SETLKW:
 		flg |= F_WAIT;
 		/* FALLTHROUGH F_SETLK */
 
 	case F_SETLK:
 	do_setlk:
 		cap_rights_init(&rights, CAP_FLOCK);
 		error = fget_unlocked(fdp, fd, &rights, &fp, NULL);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
 		}
 
 		flp = (struct flock *)arg;
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
 			if (foffset < 0 ||
 			    (flp->l_start > 0 &&
 			     foffset > OFF_MAX - flp->l_start)) {
 				error = EOVERFLOW;
 				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 
 		vp = fp->f_vnode;
 		switch (flp->l_type) {
 		case F_RDLCK:
 			if ((fp->f_flag & FREAD) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_WRLCK:
 			if ((fp->f_flag & FWRITE) == 0) {
 				error = EBADF;
 				break;
 			}
 			PROC_LOCK(p->p_leader);
 			p->p_leader->p_flag |= P_ADVLOCK;
 			PROC_UNLOCK(p->p_leader);
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
 			    flp, flg);
 			break;
 		case F_UNLCK:
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
 			    flp, flg);
 			break;
 		case F_UNLCKSYS:
 			/*
 			 * Temporary api for testing remote lock
 			 * infrastructure.
 			 */
 			if (flg != F_REMOTE) {
 				error = EINVAL;
 				break;
 			}
 			error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 			    F_UNLCKSYS, flp, flg);
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		if (error != 0 || flp->l_type == F_UNLCK ||
 		    flp->l_type == F_UNLCKSYS) {
 			fdrop(fp, td);
 			break;
 		}
 
 		/*
 		 * Check for a race with close.
 		 *
 		 * The vnode is now advisory locked (or unlocked, but this case
 		 * is not really important) as the caller requested.
 		 * We had to drop the filedesc lock, so we need to recheck if
 		 * the descriptor is still valid, because if it was closed
 		 * in the meantime we need to remove advisory lock from the
 		 * vnode - close on any descriptor leading to an advisory
 		 * locked vnode, removes that lock.
 		 * We will return 0 on purpose in that case, as the result of
 		 * successful advisory lock might have been externally visible
 		 * already. This is fine - effectively we pretend to the caller
 		 * that the closing thread was a bit slower and that the
 		 * advisory lock succeeded before the close.
 		 */
 		error = fget_unlocked(fdp, fd, &rights, &fp2, NULL);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		if (fp != fp2) {
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
 			flp->l_len = 0;
 			flp->l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 			    F_UNLCK, flp, F_POSIX);
 		}
 		fdrop(fp, td);
 		fdrop(fp2, td);
 		break;
 
 	case F_GETLK:
 		error = fget_unlocked(fdp, fd,
 		    cap_rights_init(&rights, CAP_FLOCK), &fp, NULL);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			error = EBADF;
 			fdrop(fp, td);
 			break;
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
 		    flp->l_type != F_UNLCK) {
 			error = EINVAL;
 			fdrop(fp, td);
 			break;
 		}
 		if (flp->l_whence == SEEK_CUR) {
 			foffset = foffset_get(fp);
 			if ((flp->l_start > 0 &&
 			    foffset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
 			    foffset < OFF_MIN - flp->l_start)) {
 				error = EOVERFLOW;
 				fdrop(fp, td);
 				break;
 			}
 			flp->l_start += foffset;
 		}
 		vp = fp->f_vnode;
 		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
 		    F_POSIX);
 		fdrop(fp, td);
 		break;
 
 	case F_RDAHEAD:
 		arg = arg ? 128 * 1024: 0;
 		/* FALLTHROUGH */
 	case F_READAHEAD:
 		error = fget_unlocked(fdp, fd,
 		    cap_rights_init(&rights), &fp, NULL);
 		if (error != 0)
 			break;
 		if (fp->f_type != DTYPE_VNODE) {
 			fdrop(fp, td);
 			error = EBADF;
 			break;
 		}
 		vp = fp->f_vnode;
 		/*
 		 * Exclusive lock synchronizes against f_seqcount reads and
 		 * writes in sequential_heuristic().
 		 */
 		error = vn_lock(vp, LK_EXCLUSIVE);
 		if (error != 0) {
 			fdrop(fp, td);
 			break;
 		}
 		if (arg >= 0) {
 			bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
 			fp->f_seqcount = (arg + bsize - 1) / bsize;
 			atomic_set_int(&fp->f_flag, FRDAHEAD);
 		} else {
 			atomic_clear_int(&fp->f_flag, FRDAHEAD);
 		}
 		VOP_UNLOCK(vp, 0);
 		fdrop(fp, td);
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 	return (error);
 }
 
 static int
 getmaxfd(struct thread *td)
 {
 
 	return (min((int)lim_cur(td, RLIMIT_NOFILE), maxfilesperproc));
 }
 
 /*
  * Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
  */
 int
 kern_dup(struct thread *td, u_int mode, int flags, int old, int new)
 {
 	struct filedesc *fdp;
 	struct filedescent *oldfde, *newfde;
 	struct proc *p;
 	struct file *delfp;
 	int error, maxfd;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 
 	MPASS((flags & ~(FDDUP_FLAG_CLOEXEC)) == 0);
 	MPASS(mode < FDDUP_LASTMODE);
 
 	AUDIT_ARG_FD(old);
 	/* XXXRW: if (flags & FDDUP_FIXED) AUDIT_ARG_FD2(new); */
 
 	/*
 	 * Verify we have a valid descriptor to dup from and possibly to
 	 * dup to. Unlike dup() and dup2(), fcntl()'s F_DUPFD should
 	 * return EINVAL when the new descriptor is out of bounds.
 	 */
 	if (old < 0)
 		return (EBADF);
 	if (new < 0)
 		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
 	maxfd = getmaxfd(td);
 	if (new >= maxfd)
 		return (mode == FDDUP_FCNTL ? EINVAL : EBADF);
 
 	error = EBADF;
 	FILEDESC_XLOCK(fdp);
 	if (fget_locked(fdp, old) == NULL)
 		goto unlock;
 	if ((mode == FDDUP_FIXED || mode == FDDUP_MUSTREPLACE) && old == new) {
 		td->td_retval[0] = new;
 		if (flags & FDDUP_FLAG_CLOEXEC)
 			fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
 		error = 0;
 		goto unlock;
 	}
 
 	/*
 	 * If the caller specified a file descriptor, make sure the file
 	 * table is large enough to hold it, and grab it.  Otherwise, just
 	 * allocate a new descriptor the usual way.
 	 */
 	switch (mode) {
 	case FDDUP_NORMAL:
 	case FDDUP_FCNTL:
 		if ((error = fdalloc(td, new, &new)) != 0)
 			goto unlock;
 		break;
 	case FDDUP_MUSTREPLACE:
 		/* Target file descriptor must exist. */
 		if (fget_locked(fdp, new) == NULL)
 			goto unlock;
 		break;
 	case FDDUP_FIXED:
 		if (new >= fdp->fd_nfiles) {
 			/*
 			 * The resource limits are here instead of e.g.
 			 * fdalloc(), because the file descriptor table may be
 			 * shared between processes, so we can't really use
 			 * racct_add()/racct_sub().  Instead of counting the
 			 * number of actually allocated descriptors, just put
 			 * the limit on the size of the file descriptor table.
 			 */
 #ifdef RACCT
 			if (racct_enable) {
 				PROC_LOCK(p);
 				error = racct_set(p, RACCT_NOFILE, new + 1);
 				PROC_UNLOCK(p);
 				if (error != 0) {
 					error = EMFILE;
 					goto unlock;
 				}
 			}
 #endif
 			fdgrowtable_exp(fdp, new + 1);
 		}
 		if (!fdisused(fdp, new))
 			fdused(fdp, new);
 		break;
 	default:
 		KASSERT(0, ("%s unsupported mode %d", __func__, mode));
 	}
 
 	KASSERT(old != new, ("new fd is same as old"));
 
 	oldfde = &fdp->fd_ofiles[old];
 	fhold(oldfde->fde_file);
 	newfde = &fdp->fd_ofiles[new];
 	delfp = newfde->fde_file;
 
 	/*
 	 * Duplicate the source descriptor.
 	 */
 #ifdef CAPABILITIES
 	seq_write_begin(&newfde->fde_seq);
 #endif
 	filecaps_free(&newfde->fde_caps);
 	memcpy(newfde, oldfde, fde_change_size);
 	filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps, true);
 	if ((flags & FDDUP_FLAG_CLOEXEC) != 0)
 		newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
 	else
 		newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
 #ifdef CAPABILITIES
 	seq_write_end(&newfde->fde_seq);
 #endif
 	td->td_retval[0] = new;
 
 	error = 0;
 
 	if (delfp != NULL) {
 		(void) closefp(fdp, new, delfp, td, 1);
 		FILEDESC_UNLOCK_ASSERT(fdp);
 	} else {
 unlock:
 		FILEDESC_XUNLOCK(fdp);
 	}
 
 	return (error);
 }
 
 /*
  * If sigio is on the list associated with a process or process group,
  * disable signalling from the device, remove sigio from the list and
  * free sigio.
  */
 void
 funsetown(struct sigio **sigiop)
 {
 	struct sigio *sigio;
 
 	if (*sigiop == NULL)
 		return;
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 	*(sigio->sio_myref) = NULL;
 	if ((sigio)->sio_pgid < 0) {
 		struct pgrp *pg = (sigio)->sio_pgrp;
 		PGRP_LOCK(pg);
 		SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
 			    sigio, sio_pgsigio);
 		PGRP_UNLOCK(pg);
 	} else {
 		struct proc *p = (sigio)->sio_proc;
 		PROC_LOCK(p);
 		SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
 			    sigio, sio_pgsigio);
 		PROC_UNLOCK(p);
 	}
 	SIGIO_UNLOCK();
 	crfree(sigio->sio_ucred);
 	free(sigio, M_SIGIO);
 }
 
 /*
  * Free a list of sigio structures.
  * We only need to lock the SIGIO_LOCK because we have made ourselves
  * inaccessible to callers of fsetown and therefore do not need to lock
  * the proc or pgrp struct for the list manipulation.
  */
 void
 funsetownlst(struct sigiolst *sigiolst)
 {
 	struct proc *p;
 	struct pgrp *pg;
 	struct sigio *sigio;
 
 	sigio = SLIST_FIRST(sigiolst);
 	if (sigio == NULL)
 		return;
 	p = NULL;
 	pg = NULL;
 
 	/*
 	 * Every entry of the list should belong
 	 * to a single proc or pgrp.
 	 */
 	if (sigio->sio_pgid < 0) {
 		pg = sigio->sio_pgrp;
 		PGRP_LOCK_ASSERT(pg, MA_NOTOWNED);
 	} else /* if (sigio->sio_pgid > 0) */ {
 		p = sigio->sio_proc;
 		PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 	}
 
 	SIGIO_LOCK();
 	while ((sigio = SLIST_FIRST(sigiolst)) != NULL) {
 		*(sigio->sio_myref) = NULL;
 		if (pg != NULL) {
 			KASSERT(sigio->sio_pgid < 0,
 			    ("Proc sigio in pgrp sigio list"));
 			KASSERT(sigio->sio_pgrp == pg,
 			    ("Bogus pgrp in sigio list"));
 			PGRP_LOCK(pg);
 			SLIST_REMOVE(&pg->pg_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PGRP_UNLOCK(pg);
 		} else /* if (p != NULL) */ {
 			KASSERT(sigio->sio_pgid > 0,
 			    ("Pgrp sigio in proc sigio list"));
 			KASSERT(sigio->sio_proc == p,
 			    ("Bogus proc in sigio list"));
 			PROC_LOCK(p);
 			SLIST_REMOVE(&p->p_sigiolst, sigio, sigio,
 			    sio_pgsigio);
 			PROC_UNLOCK(p);
 		}
 		SIGIO_UNLOCK();
 		crfree(sigio->sio_ucred);
 		free(sigio, M_SIGIO);
 		SIGIO_LOCK();
 	}
 	SIGIO_UNLOCK();
 }
 
 /*
  * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg).
  *
  * After permission checking, add a sigio structure to the sigio list for
  * the process or process group.
  */
 int
 fsetown(pid_t pgid, struct sigio **sigiop)
 {
 	struct proc *proc;
 	struct pgrp *pgrp;
 	struct sigio *sigio;
 	int ret;
 
 	if (pgid == 0) {
 		funsetown(sigiop);
 		return (0);
 	}
 
 	ret = 0;
 
 	/* Allocate and fill in the new sigio out of locks. */
 	sigio = malloc(sizeof(struct sigio), M_SIGIO, M_WAITOK);
 	sigio->sio_pgid = pgid;
 	sigio->sio_ucred = crhold(curthread->td_ucred);
 	sigio->sio_myref = sigiop;
 
 	sx_slock(&proctree_lock);
 	if (pgid > 0) {
 		proc = pfind(pgid);
 		if (proc == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		PROC_UNLOCK(proc);
 		if (proc->p_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		pgrp = NULL;
 	} else /* if (pgid < 0) */ {
 		pgrp = pgfind(-pgid);
 		if (pgrp == NULL) {
 			ret = ESRCH;
 			goto fail;
 		}
 		PGRP_UNLOCK(pgrp);
 
 		/*
 		 * Policy - Don't allow a process to FSETOWN a process
 		 * in another session.
 		 *
 		 * Remove this test to allow maximum flexibility or
 		 * restrict FSETOWN to the current process or process
 		 * group for maximum safety.
 		 */
 		if (pgrp->pg_session != curthread->td_proc->p_session) {
 			ret = EPERM;
 			goto fail;
 		}
 
 		proc = NULL;
 	}
 	funsetown(sigiop);
 	if (pgid > 0) {
 		PROC_LOCK(proc);
 		/*
 		 * Since funsetownlst() is called without the proctree
 		 * locked, we need to check for P_WEXIT.
 		 * XXX: is ESRCH correct?
 		 */
 		if ((proc->p_flag & P_WEXIT) != 0) {
 			PROC_UNLOCK(proc);
 			ret = ESRCH;
 			goto fail;
 		}
 		SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_proc = proc;
 		PROC_UNLOCK(proc);
 	} else {
 		PGRP_LOCK(pgrp);
 		SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio);
 		sigio->sio_pgrp = pgrp;
 		PGRP_UNLOCK(pgrp);
 	}
 	sx_sunlock(&proctree_lock);
 	SIGIO_LOCK();
 	*sigiop = sigio;
 	SIGIO_UNLOCK();
 	return (0);
 
 fail:
 	sx_sunlock(&proctree_lock);
 	crfree(sigio->sio_ucred);
 	free(sigio, M_SIGIO);
 	return (ret);
 }
 
 /*
  * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg).
  */
 pid_t
 fgetown(sigiop)
 	struct sigio **sigiop;
 {
 	pid_t pgid;
 
 	SIGIO_LOCK();
 	pgid = (*sigiop != NULL) ? (*sigiop)->sio_pgid : 0;
 	SIGIO_UNLOCK();
 	return (pgid);
 }
 
 /*
  * Function drops the filedesc lock on return.
  */
 static int
 closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
     int holdleaders)
 {
 	int error;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (holdleaders) {
 		if (td->td_proc->p_fdtol != NULL) {
 			/*
 			 * Ask fdfree() to sleep to ensure that all relevant
 			 * process leaders can be traversed in closef().
 			 */
 			fdp->fd_holdleaderscount++;
 		} else {
 			holdleaders = 0;
 		}
 	}
 
 	/*
 	 * We now hold the fp reference that used to be owned by the
 	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
 	 * knote_fdclose to prevent a race of the fd getting opened, a knote
 	 * added, and deleteing a knote for the new fd.
 	 */
 	knote_fdclose(td, fd);
 
 	/*
 	 * We need to notify mqueue if the object is of type mqueue.
 	 */
 	if (fp->f_type == DTYPE_MQUEUE)
 		mq_fdclose(td, fd, fp);
 	FILEDESC_XUNLOCK(fdp);
 
 	error = closef(fp, td);
 	if (holdleaders) {
 		FILEDESC_XLOCK(fdp);
 		fdp->fd_holdleaderscount--;
 		if (fdp->fd_holdleaderscount == 0 &&
 		    fdp->fd_holdleaderswakeup != 0) {
 			fdp->fd_holdleaderswakeup = 0;
 			wakeup(&fdp->fd_holdleaderscount);
 		}
 		FILEDESC_XUNLOCK(fdp);
 	}
 	return (error);
 }
 
 /*
  * Close a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct close_args {
 	int     fd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_close(struct thread *td, struct close_args *uap)
 {
 
 	return (kern_close(td, uap->fd));
 }
 
 int
 kern_close(struct thread *td, int fd)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 
 	fdp = td->td_proc->p_fd;
 
 	AUDIT_SYSCLOSE(td, fd);
 
 	FILEDESC_XLOCK(fdp);
 	if ((fp = fget_locked(fdp, fd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 	fdfree(fdp, fd);
 
 	/* closefp() drops the FILEDESC lock for us. */
 	return (closefp(fdp, fd, fp, td, 1));
 }
 
 /*
  * Close open file descriptors.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct closefrom_args {
 	int	lowfd;
 };
 #endif
 /* ARGSUSED */
 int
 sys_closefrom(struct thread *td, struct closefrom_args *uap)
 {
 	struct filedesc *fdp;
 	int fd;
 
 	fdp = td->td_proc->p_fd;
 	AUDIT_ARG_FD(uap->lowfd);
 
 	/*
 	 * Treat negative starting file descriptor values identical to
 	 * closefrom(0) which closes all files.
 	 */
 	if (uap->lowfd < 0)
 		uap->lowfd = 0;
 	FILEDESC_SLOCK(fdp);
 	for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
 		if (fdp->fd_ofiles[fd].fde_file != NULL) {
 			FILEDESC_SUNLOCK(fdp);
 			(void)kern_close(td, fd);
 			FILEDESC_SLOCK(fdp);
 		}
 	}
 	FILEDESC_SUNLOCK(fdp);
 	return (0);
 }
 
 #if defined(COMPAT_43)
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ofstat_args {
 	int	fd;
 	struct	ostat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 ofstat(struct thread *td, struct ofstat_args *uap)
 {
 	struct ostat oub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		cvtstat(&ub, &oub);
 		error = copyout(&oub, uap->sb, sizeof(oub));
 	}
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fstat_args {
 	int	fd;
 	struct	stat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fstat(struct thread *td, struct fstat_args *uap)
 {
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0)
 		error = copyout(&ub, uap->sb, sizeof(ub));
 	return (error);
 }
 
 int
 kern_fstat(struct thread *td, int fd, struct stat *sbp)
 {
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 
 	error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
 	if (error != 0)
 		return (error);
 
 	AUDIT_ARG_FILE(td->td_proc, fp);
 
 	error = fo_stat(fp, sbp, td->td_ucred, td);
 	fdrop(fp, td);
 #ifdef KTRACE
 	if (error == 0 && KTRPOINT(td, KTR_STRUCT))
 		ktrstat(sbp);
 #endif
 	return (error);
 }
 
 /*
  * Return status information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nfstat_args {
 	int	fd;
 	struct	nstat *sb;
 };
 #endif
 /* ARGSUSED */
 int
 sys_nfstat(struct thread *td, struct nfstat_args *uap)
 {
 	struct nstat nub;
 	struct stat ub;
 	int error;
 
 	error = kern_fstat(td, uap->fd, &ub);
 	if (error == 0) {
 		cvtnstat(&ub, &nub);
 		error = copyout(&nub, uap->sb, sizeof(nub));
 	}
 	return (error);
 }
 
 /*
  * Return pathconf information about a file descriptor.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fpathconf_args {
 	int	fd;
 	int	name;
 };
 #endif
 /* ARGSUSED */
 int
 sys_fpathconf(struct thread *td, struct fpathconf_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
 	if (error != 0)
 		return (error);
 
 	if (uap->name == _PC_ASYNC_IO) {
 		td->td_retval[0] = _POSIX_ASYNCHRONOUS_IO;
 		goto out;
 	}
 	vp = fp->f_vnode;
 	if (vp != NULL) {
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		error = VOP_PATHCONF(vp, uap->name, td->td_retval);
 		VOP_UNLOCK(vp, 0);
 	} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
 		if (uap->name != _PC_PIPE_BUF) {
 			error = EINVAL;
 		} else {
 			td->td_retval[0] = PIPE_BUF;
 			error = 0;
 		}
 	} else {
 		error = EOPNOTSUPP;
 	}
 out:
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
  * Initialize filecaps structure.
  */
 void
 filecaps_init(struct filecaps *fcaps)
 {
 
 	bzero(fcaps, sizeof(*fcaps));
 	fcaps->fc_nioctls = -1;
 }
 
 /*
  * Copy filecaps structure allocating memory for ioctls array if needed.
  *
  * The last parameter indicates whether the fdtable is locked. If it is not and
  * ioctls are encountered, copying fails and the caller must lock the table.
  *
  * Note that if the table was not locked, the caller has to check the relevant
  * sequence counter to determine whether the operation was successful.
  */
 int
 filecaps_copy(const struct filecaps *src, struct filecaps *dst, bool locked)
 {
 	size_t size;
 
 	*dst = *src;
 	if (src->fc_ioctls == NULL)
 		return (0);
 	if (!locked)
 		return (1);
 
 	KASSERT(src->fc_nioctls > 0,
 	    ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
 
 	size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
 	dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
 	bcopy(src->fc_ioctls, dst->fc_ioctls, size);
 	return (0);
 }
 
 /*
  * Move filecaps structure to the new place and clear the old place.
  */
 void
 filecaps_move(struct filecaps *src, struct filecaps *dst)
 {
 
 	*dst = *src;
 	bzero(src, sizeof(*src));
 }
 
 /*
  * Fill the given filecaps structure with full rights.
  */
 static void
 filecaps_fill(struct filecaps *fcaps)
 {
 
 	CAP_ALL(&fcaps->fc_rights);
 	fcaps->fc_ioctls = NULL;
 	fcaps->fc_nioctls = -1;
 	fcaps->fc_fcntls = CAP_FCNTL_ALL;
 }
 
 /*
  * Free memory allocated within filecaps structure.
  */
 void
 filecaps_free(struct filecaps *fcaps)
 {
 
 	free(fcaps->fc_ioctls, M_FILECAPS);
 	bzero(fcaps, sizeof(*fcaps));
 }
 
 /*
  * Validate the given filecaps structure.
  */
 static void
 filecaps_validate(const struct filecaps *fcaps, const char *func)
 {
 
 	KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
 	    ("%s: invalid rights", func));
 	KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
 	    ("%s: invalid fcntls", func));
 	KASSERT(fcaps->fc_fcntls == 0 ||
 	    cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
 	    ("%s: fcntls without CAP_FCNTL", func));
 	KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
 	    (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
 	    ("%s: invalid ioctls", func));
 	KASSERT(fcaps->fc_nioctls == 0 ||
 	    cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
 	    ("%s: ioctls without CAP_IOCTL", func));
 }
 
 static void
 fdgrowtable_exp(struct filedesc *fdp, int nfd)
 {
 	int nfd1;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	nfd1 = fdp->fd_nfiles * 2;
 	if (nfd1 < nfd)
 		nfd1 = nfd;
 	fdgrowtable(fdp, nfd1);
 }
 
 /*
  * Grow the file table to accommodate (at least) nfd descriptors.
  */
 static void
 fdgrowtable(struct filedesc *fdp, int nfd)
 {
 	struct filedesc0 *fdp0;
 	struct freetable *ft;
 	struct fdescenttbl *ntable;
 	struct fdescenttbl *otable;
 	int nnfiles, onfiles;
 	NDSLOTTYPE *nmap, *omap;
 
 	/*
 	 * If lastfile is -1 this struct filedesc was just allocated and we are
 	 * growing it to accommodate for the one we are going to copy from. There
 	 * is no need to have a lock on this one as it's not visible to anyone.
 	 */
 	if (fdp->fd_lastfile != -1)
 		FILEDESC_XLOCK_ASSERT(fdp);
 
 	KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
 
 	/* save old values */
 	onfiles = fdp->fd_nfiles;
 	otable = fdp->fd_files;
 	omap = fdp->fd_map;
 
 	/* compute the size of the new table */
 	nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
 	if (nnfiles <= onfiles)
 		/* the table is already large enough */
 		return;
 
 	/*
 	 * Allocate a new table.  We need enough space for the number of
 	 * entries, file entries themselves and the struct freetable we will use
 	 * when we decommission the table and place it on the freelist.
 	 * We place the struct freetable in the middle so we don't have
 	 * to worry about padding.
 	 */
 	ntable = malloc(offsetof(struct fdescenttbl, fdt_ofiles) +
 	    nnfiles * sizeof(ntable->fdt_ofiles[0]) +
 	    sizeof(struct freetable),
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	/* copy the old data */
 	ntable->fdt_nfiles = nnfiles;
 	memcpy(ntable->fdt_ofiles, otable->fdt_ofiles,
 	    onfiles * sizeof(ntable->fdt_ofiles[0]));
 
 	/*
 	 * Allocate a new map only if the old is not large enough.  It will
 	 * grow at a slower rate than the table as it can map more
 	 * entries than the table can hold.
 	 */
 	if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
 		nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
 		    M_ZERO | M_WAITOK);
 		/* copy over the old data and update the pointer */
 		memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
 		fdp->fd_map = nmap;
 	}
 
 	/*
 	 * Make sure that ntable is correctly initialized before we replace
 	 * fd_files poiner. Otherwise fget_unlocked() may see inconsistent
 	 * data.
 	 */
 	atomic_store_rel_ptr((volatile void *)&fdp->fd_files, (uintptr_t)ntable);
 
 	/*
 	 * Do not free the old file table, as some threads may still
 	 * reference entries within it.  Instead, place it on a freelist
 	 * which will be processed when the struct filedesc is released.
 	 *
 	 * Note that if onfiles == NDFILE, we're dealing with the original
 	 * static allocation contained within (struct filedesc0 *)fdp,
 	 * which must not be freed.
 	 */
 	if (onfiles > NDFILE) {
 		ft = (struct freetable *)&otable->fdt_ofiles[onfiles];
 		fdp0 = (struct filedesc0 *)fdp;
 		ft->ft_table = otable;
 		SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
 	}
 	/*
 	 * The map does not have the same possibility of threads still
 	 * holding references to it.  So always free it as long as it
 	 * does not reference the original static allocation.
 	 */
 	if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
 		free(omap, M_FILEDESC);
 }
 
 /*
  * Allocate a file descriptor for the process.
  */
 int
 fdalloc(struct thread *td, int minfd, int *result)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int fd, maxfd, allocfd;
 #ifdef RACCT
 	int error;
 #endif
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (fdp->fd_freefile > minfd)
 		minfd = fdp->fd_freefile;
 
 	maxfd = getmaxfd(td);
 
 	/*
 	 * Search the bitmap for a free descriptor starting at minfd.
 	 * If none is found, grow the file table.
 	 */
 	fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
 	if (fd >= maxfd)
 		return (EMFILE);
 	if (fd >= fdp->fd_nfiles) {
 		allocfd = min(fd * 2, maxfd);
 #ifdef RACCT
 		if (racct_enable) {
 			PROC_LOCK(p);
 			error = racct_set(p, RACCT_NOFILE, allocfd);
 			PROC_UNLOCK(p);
 			if (error != 0)
 				return (EMFILE);
 		}
 #endif
 		/*
 		 * fd is already equal to first free descriptor >= minfd, so
 		 * we only need to grow the table and we are done.
 		 */
 		fdgrowtable_exp(fdp, allocfd);
 	}
 
 	/*
 	 * Perform some sanity checks, then mark the file descriptor as
 	 * used and return it to the caller.
 	 */
 	KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
 	    ("invalid descriptor %d", fd));
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd_first_free() returned non-free descriptor"));
 	KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
 	    ("file descriptor isn't free"));
 	fdused(fdp, fd);
 	*result = fd;
 	return (0);
 }
 
 /*
  * Allocate n file descriptors for the process.
  */
 int
 fdallocn(struct thread *td, int minfd, int *fds, int n)
 {
 	struct proc *p = td->td_proc;
 	struct filedesc *fdp = p->p_fd;
 	int i;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	for (i = 0; i < n; i++)
 		if (fdalloc(td, 0, &fds[i]) != 0)
 			break;
 
 	if (i < n) {
 		for (i--; i >= 0; i--)
 			fdunused(fdp, fds[i]);
 		return (EMFILE);
 	}
 
 	return (0);
 }
 
 /*
  * Create a new open file structure and allocate a file descriptor for the
  * process that refers to it.  We add one reference to the file for the
  * descriptor table and one reference for resultfp. This is to prevent us
  * being preempted and the entry in the descriptor table closed after we
  * release the FILEDESC lock.
  */
 int
 falloc_caps(struct thread *td, struct file **resultfp, int *resultfd, int flags,
     struct filecaps *fcaps)
 {
 	struct file *fp;
 	int error, fd;
 
 	error = falloc_noinstall(td, &fp);
 	if (error)
 		return (error);		/* no reference held on error */
 
 	error = finstall(td, fp, &fd, flags, fcaps);
 	if (error) {
 		fdrop(fp, td);		/* one reference (fp only) */
 		return (error);
 	}
 
 	if (resultfp != NULL)
 		*resultfp = fp;		/* copy out result */
 	else
 		fdrop(fp, td);		/* release local reference */
 
 	if (resultfd != NULL)
 		*resultfd = fd;
 
 	return (0);
 }
 
 /*
  * Create a new open file structure without allocating a file descriptor.
  */
 int
 falloc_noinstall(struct thread *td, struct file **resultfp)
 {
 	struct file *fp;
 	int maxuserfiles = maxfiles - (maxfiles / 20);
 	static struct timeval lastfail;
 	static int curfail;
 
 	KASSERT(resultfp != NULL, ("%s: resultfp == NULL", __func__));
 
 	if ((openfiles >= maxuserfiles &&
 	    priv_check(td, PRIV_MAXFILES) != 0) ||
 	    openfiles >= maxfiles) {
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
 			printf("kern.maxfiles limit exceeded by uid %i, "
 			    "please see tuning(7).\n", td->td_ucred->cr_ruid);
 		}
 		return (ENFILE);
 	}
 	atomic_add_int(&openfiles, 1);
 	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
 	refcount_init(&fp->f_count, 1);
 	fp->f_cred = crhold(td->td_ucred);
 	fp->f_ops = &badfileops;
 	*resultfp = fp;
 	return (0);
 }
 
 /*
  * Install a file in a file descriptor table.
  */
 void
 _finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
     struct filecaps *fcaps)
 {
 	struct filedescent *fde;
 
 	MPASS(fp != NULL);
 	if (fcaps != NULL)
 		filecaps_validate(fcaps, __func__);
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	fde = &fdp->fd_ofiles[fd];
 #ifdef CAPABILITIES
 	seq_write_begin(&fde->fde_seq);
 #endif
 	fde->fde_file = fp;
 	fde->fde_flags = (flags & O_CLOEXEC) != 0 ? UF_EXCLOSE : 0;
 	if (fcaps != NULL)
 		filecaps_move(fcaps, &fde->fde_caps);
 	else
 		filecaps_fill(&fde->fde_caps);
 #ifdef CAPABILITIES
 	seq_write_end(&fde->fde_seq);
 #endif
 }
 
 int
 finstall(struct thread *td, struct file *fp, int *fd, int flags,
     struct filecaps *fcaps)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	int error;
 
 	MPASS(fd != NULL);
 
 	FILEDESC_XLOCK(fdp);
 	if ((error = fdalloc(td, 0, fd))) {
 		FILEDESC_XUNLOCK(fdp);
 		return (error);
 	}
 	fhold(fp);
 	_finstall(fdp, fp, *fd, flags, fcaps);
 	FILEDESC_XUNLOCK(fdp);
 	return (0);
 }
 
 /*
  * Build a new filedesc structure from another.
  * Copy the current, root, and jail root vnode references.
  *
  * If fdp is not NULL, return with it shared locked.
  */
 struct filedesc *
 fdinit(struct filedesc *fdp, bool prepfiles)
 {
 	struct filedesc0 *newfdp0;
 	struct filedesc *newfdp;
 
 	newfdp0 = uma_zalloc(filedesc0_zone, M_WAITOK | M_ZERO);
 	newfdp = &newfdp0->fd_fd;
 
 	/* Create the file descriptor table. */
 	FILEDESC_LOCK_INIT(newfdp);
 	refcount_init(&newfdp->fd_refcnt, 1);
 	refcount_init(&newfdp->fd_holdcnt, 1);
 	newfdp->fd_cmask = CMASK;
 	newfdp->fd_map = newfdp0->fd_dmap;
 	newfdp->fd_lastfile = -1;
 	newfdp->fd_files = (struct fdescenttbl *)&newfdp0->fd_dfiles;
 	newfdp->fd_files->fdt_nfiles = NDFILE;
 
 	if (fdp == NULL)
 		return (newfdp);
 
 	if (prepfiles && fdp->fd_lastfile >= newfdp->fd_nfiles)
 		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 
 	FILEDESC_SLOCK(fdp);
 	newfdp->fd_cdir = fdp->fd_cdir;
 	if (newfdp->fd_cdir)
 		VREF(newfdp->fd_cdir);
 	newfdp->fd_rdir = fdp->fd_rdir;
 	if (newfdp->fd_rdir)
 		VREF(newfdp->fd_rdir);
 	newfdp->fd_jdir = fdp->fd_jdir;
 	if (newfdp->fd_jdir)
 		VREF(newfdp->fd_jdir);
 
 	if (!prepfiles) {
 		FILEDESC_SUNLOCK(fdp);
 	} else {
 		while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
 			FILEDESC_SUNLOCK(fdp);
 			fdgrowtable(newfdp, fdp->fd_lastfile + 1);
 			FILEDESC_SLOCK(fdp);
 		}
 	}
 
 	return (newfdp);
 }
 
 static struct filedesc *
 fdhold(struct proc *p)
 {
 	struct filedesc *fdp;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	fdp = p->p_fd;
 	if (fdp != NULL)
 		refcount_acquire(&fdp->fd_holdcnt);
 	return (fdp);
 }
 
 static void
 fddrop(struct filedesc *fdp)
 {
 
 	if (fdp->fd_holdcnt > 1) {
 		if (refcount_release(&fdp->fd_holdcnt) == 0)
 			return;
 	}
 
 	FILEDESC_LOCK_DESTROY(fdp);
 	uma_zfree(filedesc0_zone, fdp);
 }
 
 /*
  * Share a filedesc structure.
  */
 struct filedesc *
 fdshare(struct filedesc *fdp)
 {
 
 	refcount_acquire(&fdp->fd_refcnt);
 	return (fdp);
 }
 
 /*
  * Unshare a filedesc structure, if necessary by making a copy
  */
 void
 fdunshare(struct thread *td)
 {
 	struct filedesc *tmp;
 	struct proc *p = td->td_proc;
 
 	if (p->p_fd->fd_refcnt == 1)
 		return;
 
 	tmp = fdcopy(p->p_fd);
 	fdescfree(td);
 	p->p_fd = tmp;
 }
 
 void
 fdinstall_remapped(struct thread *td, struct filedesc *fdp)
 {
 
 	fdescfree(td);
 	td->td_proc->p_fd = fdp;
 }
 
 /*
  * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
  * this is to ease callers, not catch errors.
  */
 struct filedesc *
 fdcopy(struct filedesc *fdp)
 {
 	struct filedesc *newfdp;
 	struct filedescent *nfde, *ofde;
 	int i;
 
 	MPASS(fdp != NULL);
 
 	newfdp = fdinit(fdp, true);
 	/* copy all passable descriptors (i.e. not kqueue) */
 	newfdp->fd_freefile = -1;
 	for (i = 0; i <= fdp->fd_lastfile; ++i) {
 		ofde = &fdp->fd_ofiles[i];
 		if (ofde->fde_file == NULL ||
 		    (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
 			if (newfdp->fd_freefile == -1)
 				newfdp->fd_freefile = i;
 			continue;
 		}
 		nfde = &newfdp->fd_ofiles[i];
 		*nfde = *ofde;
 		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 		fhold(nfde->fde_file);
 		fdused_init(newfdp, i);
 		newfdp->fd_lastfile = i;
 	}
 	if (newfdp->fd_freefile == -1)
 		newfdp->fd_freefile = i;
 	newfdp->fd_cmask = fdp->fd_cmask;
 	FILEDESC_SUNLOCK(fdp);
 	return (newfdp);
 }
 
 /*
  * Copies a filedesc structure, while remapping all file descriptors
  * stored inside using a translation table.
  *
  * File descriptors are copied over to the new file descriptor table,
  * regardless of whether the close-on-exec flag is set.
  */
 int
 fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds,
     struct filedesc **ret)
 {
 	struct filedesc *newfdp;
 	struct filedescent *nfde, *ofde;
 	int error, i;
 
 	MPASS(fdp != NULL);
 
 	newfdp = fdinit(fdp, true);
 	if (nfds > fdp->fd_lastfile + 1) {
 		/* New table cannot be larger than the old one. */
 		error = E2BIG;
 		goto bad;
 	}
 	/* Copy all passable descriptors (i.e. not kqueue). */
 	newfdp->fd_freefile = nfds;
 	for (i = 0; i < nfds; ++i) {
 		if (fds[i] < 0 || fds[i] > fdp->fd_lastfile) {
 			/* File descriptor out of bounds. */
 			error = EBADF;
 			goto bad;
 		}
 		ofde = &fdp->fd_ofiles[fds[i]];
 		if (ofde->fde_file == NULL) {
 			/* Unused file descriptor. */
 			error = EBADF;
 			goto bad;
 		}
 		if ((ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) == 0) {
 			/* File descriptor cannot be passed. */
 			error = EINVAL;
 			goto bad;
 		}
 		nfde = &newfdp->fd_ofiles[i];
 		*nfde = *ofde;
 		filecaps_copy(&ofde->fde_caps, &nfde->fde_caps, true);
 		fhold(nfde->fde_file);
 		fdused_init(newfdp, i);
 		newfdp->fd_lastfile = i;
 	}
 	newfdp->fd_cmask = fdp->fd_cmask;
 	FILEDESC_SUNLOCK(fdp);
 	*ret = newfdp;
 	return (0);
 bad:
 	FILEDESC_SUNLOCK(fdp);
 	fdescfree_remapped(newfdp);
 	return (error);
 }
 
 /*
  * Clear POSIX style locks. This is only used when fdp looses a reference (i.e.
  * one of processes using it exits) and the table used to be shared.
  */
 static void
 fdclearlocks(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct filedesc_to_leader *fdtol;
 	struct flock lf;
 	struct file *fp;
 	struct proc *p;
 	struct vnode *vp;
 	int i;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	fdtol = p->p_fdtol;
 	MPASS(fdtol != NULL);
 
 	FILEDESC_XLOCK(fdp);
 	KASSERT(fdtol->fdl_refcount > 0,
 	    ("filedesc_to_refcount botch: fdl_refcount=%d",
 	    fdtol->fdl_refcount));
 	if (fdtol->fdl_refcount == 1 &&
 	    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 		for (i = 0; i <= fdp->fd_lastfile; i++) {
 			fp = fdp->fd_ofiles[i].fde_file;
 			if (fp == NULL || fp->f_type != DTYPE_VNODE)
 				continue;
 			fhold(fp);
 			FILEDESC_XUNLOCK(fdp);
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			vp = fp->f_vnode;
 			(void) VOP_ADVLOCK(vp,
 			    (caddr_t)p->p_leader, F_UNLCK,
 			    &lf, F_POSIX);
 			FILEDESC_XLOCK(fdp);
 			fdrop(fp, td);
 		}
 	}
 retry:
 	if (fdtol->fdl_refcount == 1) {
 		if (fdp->fd_holdleaderscount > 0 &&
 		    (p->p_leader->p_flag & P_ADVLOCK) != 0) {
 			/*
 			 * close() or kern_dup() has cleared a reference
 			 * in a shared file descriptor table.
 			 */
 			fdp->fd_holdleaderswakeup = 1;
 			sx_sleep(&fdp->fd_holdleaderscount,
 			    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
 			goto retry;
 		}
 		if (fdtol->fdl_holdcount > 0) {
 			/*
 			 * Ensure that fdtol->fdl_leader remains
 			 * valid in closef().
 			 */
 			fdtol->fdl_wakeup = 1;
 			sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
 			    "fdlhold", 0);
 			goto retry;
 		}
 	}
 	fdtol->fdl_refcount--;
 	if (fdtol->fdl_refcount == 0 &&
 	    fdtol->fdl_holdcount == 0) {
 		fdtol->fdl_next->fdl_prev = fdtol->fdl_prev;
 		fdtol->fdl_prev->fdl_next = fdtol->fdl_next;
 	} else
 		fdtol = NULL;
 	p->p_fdtol = NULL;
 	FILEDESC_XUNLOCK(fdp);
 	if (fdtol != NULL)
 		free(fdtol, M_FILEDESC_TO_LEADER);
 }
 
 /*
  * Release a filedesc structure.
  */
 static void
 fdescfree_fds(struct thread *td, struct filedesc *fdp, bool needclose)
 {
 	struct filedesc0 *fdp0;
 	struct freetable *ft, *tft;
 	struct filedescent *fde;
 	struct file *fp;
 	int i;
 
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		fde = &fdp->fd_ofiles[i];
 		fp = fde->fde_file;
 		if (fp != NULL) {
 			fdefree_last(fde);
 			if (needclose)
 				(void) closef(fp, td);
 			else
 				fdrop(fp, td);
 		}
 	}
 
 	if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
 		free(fdp->fd_map, M_FILEDESC);
 	if (fdp->fd_nfiles > NDFILE)
 		free(fdp->fd_files, M_FILEDESC);
 
 	fdp0 = (struct filedesc0 *)fdp;
 	SLIST_FOREACH_SAFE(ft, &fdp0->fd_free, ft_next, tft)
 		free(ft->ft_table, M_FILEDESC);
 
 	fddrop(fdp);
 }
 
 void
 fdescfree(struct thread *td)
 {
 	struct proc *p;
 	struct filedesc *fdp;
 	struct vnode *cdir, *jdir, *rdir;
 
 	p = td->td_proc;
 	fdp = p->p_fd;
 	MPASS(fdp != NULL);
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(p);
 		racct_set(p, RACCT_NOFILE, 0);
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	if (p->p_fdtol != NULL)
 		fdclearlocks(td);
 
 	PROC_LOCK(p);
 	p->p_fd = NULL;
 	PROC_UNLOCK(p);
 
 	if (refcount_release(&fdp->fd_refcnt) == 0)
 		return;
 
 	FILEDESC_XLOCK(fdp);
 	cdir = fdp->fd_cdir;
 	fdp->fd_cdir = NULL;
 	rdir = fdp->fd_rdir;
 	fdp->fd_rdir = NULL;
 	jdir = fdp->fd_jdir;
 	fdp->fd_jdir = NULL;
 	FILEDESC_XUNLOCK(fdp);
 
 	if (cdir != NULL)
 		vrele(cdir);
 	if (rdir != NULL)
 		vrele(rdir);
 	if (jdir != NULL)
 		vrele(jdir);
 
 	fdescfree_fds(td, fdp, 1);
 }
 
 void
 fdescfree_remapped(struct filedesc *fdp)
 {
 
 	if (fdp->fd_cdir != NULL)
 		vrele(fdp->fd_cdir);
 	if (fdp->fd_rdir != NULL)
 		vrele(fdp->fd_rdir);
 	if (fdp->fd_jdir != NULL)
 		vrele(fdp->fd_jdir);
 
 	fdescfree_fds(curthread, fdp, 0);
 }
 
 /*
  * For setugid programs, we don't want to people to use that setugidness
  * to generate error messages which write to a file which otherwise would
  * otherwise be off-limits to the process.  We check for filesystems where
  * the vnode can change out from under us after execve (like [lin]procfs).
  *
  * Since fdsetugidsafety calls this only for fd 0, 1 and 2, this check is
  * sufficient.  We also don't check for setugidness since we know we are.
  */
 static bool
 is_unsafe(struct file *fp)
 {
 	struct vnode *vp;
 
 	if (fp->f_type != DTYPE_VNODE)
 		return (false);
 
 	vp = fp->f_vnode;
 	return ((vp->v_vflag & VV_PROCDEP) != 0);
 }
 
 /*
  * Make this setguid thing safe, if at all possible.
  */
 void
 fdsetugidsafety(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	int i;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	MPASS(fdp->fd_nfiles >= 3);
 	for (i = 0; i <= 2; i++) {
 		fp = fdp->fd_ofiles[i].fde_file;
 		if (fp != NULL && is_unsafe(fp)) {
 			FILEDESC_XLOCK(fdp);
 			knote_fdclose(td, i);
 			/*
 			 * NULL-out descriptor prior to close to avoid
 			 * a race while close blocks.
 			 */
 			fdfree(fdp, i);
 			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
 		}
 	}
 }
 
 /*
  * If a specific file object occupies a specific file descriptor, close the
  * file descriptor entry and drop a reference on the file object.  This is a
  * convenience function to handle a subsequent error in a function that calls
  * falloc() that handles the race that another thread might have closed the
  * file descriptor out from under the thread creating the file object.
  */
 void
 fdclose(struct thread *td, struct file *fp, int idx)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 
 	FILEDESC_XLOCK(fdp);
 	if (fdp->fd_ofiles[idx].fde_file == fp) {
 		fdfree(fdp, idx);
 		FILEDESC_XUNLOCK(fdp);
 		fdrop(fp, td);
 	} else
 		FILEDESC_XUNLOCK(fdp);
 }
 
 /*
  * Close any files on exec?
  */
 void
 fdcloseexec(struct thread *td)
 {
 	struct filedesc *fdp;
 	struct filedescent *fde;
 	struct file *fp;
 	int i;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		fde = &fdp->fd_ofiles[i];
 		fp = fde->fde_file;
 		if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
 		    (fde->fde_flags & UF_EXCLOSE))) {
 			FILEDESC_XLOCK(fdp);
 			fdfree(fdp, i);
 			(void) closefp(fdp, i, fp, td, 0);
 			FILEDESC_UNLOCK_ASSERT(fdp);
 		}
 	}
 }
 
 /*
  * It is unsafe for set[ug]id processes to be started with file
  * descriptors 0..2 closed, as these descriptors are given implicit
  * significance in the Standard C library.  fdcheckstd() will create a
  * descriptor referencing /dev/null for each of stdin, stdout, and
  * stderr that is not already open.
  */
 int
 fdcheckstd(struct thread *td)
 {
 	struct filedesc *fdp;
 	register_t save;
 	int i, error, devnull;
 
 	fdp = td->td_proc->p_fd;
 	KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
 	MPASS(fdp->fd_nfiles >= 3);
 	devnull = -1;
 	for (i = 0; i <= 2; i++) {
 		if (fdp->fd_ofiles[i].fde_file != NULL)
 			continue;
 
 		save = td->td_retval[0];
 		if (devnull != -1) {
 			error = kern_dup(td, FDDUP_FIXED, 0, devnull, i);
 		} else {
 			error = kern_openat(td, AT_FDCWD, "/dev/null",
 			    UIO_SYSSPACE, O_RDWR, 0);
 			if (error == 0) {
 				devnull = td->td_retval[0];
 				KASSERT(devnull == i, ("we didn't get our fd"));
 			}
 		}
 		td->td_retval[0] = save;
 		if (error != 0)
 			return (error);
 	}
 	return (0);
 }
 
 /*
  * Internal form of close.  Decrement reference count on file structure.
  * Note: td may be NULL when closing a file that was being passed in a
  * message.
  *
  * XXXRW: Giant is not required for the caller, but often will be held; this
  * makes it moderately likely the Giant will be recursed in the VFS case.
  */
 int
 closef(struct file *fp, struct thread *td)
 {
 	struct vnode *vp;
 	struct flock lf;
 	struct filedesc_to_leader *fdtol;
 	struct filedesc *fdp;
 
 	/*
 	 * POSIX record locking dictates that any close releases ALL
 	 * locks owned by this process.  This is handled by setting
 	 * a flag in the unlock to free ONLY locks obeying POSIX
 	 * semantics, and not to free BSD-style file locks.
 	 * If the descriptor was in a message, POSIX-style locks
 	 * aren't passed with the descriptor, and the thread pointer
 	 * will be NULL.  Callers should be careful only to pass a
 	 * NULL thread pointer when there really is no owning
 	 * context that might have locks, or the locks will be
 	 * leaked.
 	 */
 	if (fp->f_type == DTYPE_VNODE && td != NULL) {
 		vp = fp->f_vnode;
 		if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
 			lf.l_whence = SEEK_SET;
 			lf.l_start = 0;
 			lf.l_len = 0;
 			lf.l_type = F_UNLCK;
 			(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
 			    F_UNLCK, &lf, F_POSIX);
 		}
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
 			/*
 			 * Handle special case where file descriptor table is
 			 * shared between multiple process leaders.
 			 */
 			fdp = td->td_proc->p_fd;
 			FILEDESC_XLOCK(fdp);
 			for (fdtol = fdtol->fdl_next;
 			    fdtol != td->td_proc->p_fdtol;
 			    fdtol = fdtol->fdl_next) {
 				if ((fdtol->fdl_leader->p_flag &
 				    P_ADVLOCK) == 0)
 					continue;
 				fdtol->fdl_holdcount++;
 				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
 				lf.l_type = F_UNLCK;
 				vp = fp->f_vnode;
 				(void) VOP_ADVLOCK(vp,
 				    (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
 				    F_POSIX);
 				FILEDESC_XLOCK(fdp);
 				fdtol->fdl_holdcount--;
 				if (fdtol->fdl_holdcount == 0 &&
 				    fdtol->fdl_wakeup != 0) {
 					fdtol->fdl_wakeup = 0;
 					wakeup(fdtol);
 				}
 			}
 			FILEDESC_XUNLOCK(fdp);
 		}
 	}
 	return (fdrop(fp, td));
 }
 
 /*
  * Initialize the file pointer with the specified properties.
  *
  * The ops are set with release semantics to be certain that the flags, type,
  * and data are visible when ops is.  This is to prevent ops methods from being
  * called with bad data.
  */
 void
 finit(struct file *fp, u_int flag, short type, void *data, struct fileops *ops)
 {
 	fp->f_data = data;
 	fp->f_flag = flag;
 	fp->f_type = type;
 	atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
 }
 
 int
 fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp, struct filecaps *havecapsp)
 {
 	struct filedescent *fde;
 	int error;
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	fde = fdeget_locked(fdp, fd);
 	if (fde == NULL) {
 		error = EBADF;
 		goto out;
 	}
 
 #ifdef CAPABILITIES
 	error = cap_check(cap_rights_fde(fde), needrightsp);
 	if (error != 0)
 		goto out;
 #endif
 
 	if (havecapsp != NULL)
 		filecaps_copy(&fde->fde_caps, havecapsp, true);
 
-	fhold(fde->fde_file);
 	*fpp = fde->fde_file;
 
 	error = 0;
 out:
 	return (error);
 }
 
 int
 fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct file **fpp, struct filecaps *havecapsp)
 {
-	struct filedesc *fdp;
-	struct file *fp;
+	struct filedesc *fdp = td->td_proc->p_fd;
 	int error;
+#ifndef CAPABILITIES
+	error = fget_unlocked(fdp, fd, needrightsp, fpp, NULL);
+	if (error == 0 && havecapsp != NULL)
+		filecaps_fill(havecapsp);
+#else
+	struct file *fp;
 	seq_t seq;
 
-	fdp = td->td_proc->p_fd;
 	for (;;) {
 		error = fget_unlocked(fdp, fd, needrightsp, &fp, &seq);
 		if (error != 0)
 			return (error);
 
 		if (havecapsp != NULL) {
 			if (!filecaps_copy(&fdp->fd_ofiles[fd].fde_caps,
 			    havecapsp, false)) {
 				fdrop(fp, td);
 				goto get_locked;
 			}
 		}
 
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(fp, td);
 	}
 
 	*fpp = fp;
 	return (0);
 
 get_locked:
 	FILEDESC_SLOCK(fdp);
 	error = fget_cap_locked(fdp, fd, needrightsp, fpp, havecapsp);
+	if (error == 0)
+		fhold(*fpp);
 	FILEDESC_SUNLOCK(fdp);
-
+#endif
 	return (error);
 }
 
 int
 fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
     struct file **fpp, seq_t *seqp)
 {
 #ifdef CAPABILITIES
 	struct filedescent *fde;
 #endif
 	struct fdescenttbl *fdt;
 	struct file *fp;
 	u_int count;
 #ifdef CAPABILITIES
 	seq_t seq;
 	cap_rights_t haverights;
 	int error;
 #endif
 
 	fdt = fdp->fd_files;
 	if ((u_int)fd >= fdt->fdt_nfiles)
 		return (EBADF);
 	/*
 	 * Fetch the descriptor locklessly.  We avoid fdrop() races by
 	 * never raising a refcount above 0.  To accomplish this we have
 	 * to use a cmpset loop rather than an atomic_add.  The descriptor
 	 * must be re-verified once we acquire a reference to be certain
 	 * that the identity is still correct and we did not lose a race
 	 * due to preemption.
 	 */
 	for (;;) {
 #ifdef CAPABILITIES
 		seq = seq_read(fd_seq(fdt, fd));
 		fde = &fdt->fdt_ofiles[fd];
 		haverights = *cap_rights_fde(fde);
 		fp = fde->fde_file;
 		if (!seq_consistent(fd_seq(fdt, fd), seq)) {
 			cpu_spinwait();
 			continue;
 		}
 #else
 		fp = fdt->fdt_ofiles[fd].fde_file;
 #endif
 		if (fp == NULL)
 			return (EBADF);
 #ifdef CAPABILITIES
 		error = cap_check(&haverights, needrightsp);
 		if (error != 0)
 			return (error);
 #endif
 	retry:
 		count = fp->f_count;
 		if (count == 0) {
 			/*
 			 * Force a reload. Other thread could reallocate the
 			 * table before this fd was closed, so it possible that
 			 * there is a stale fp pointer in cached version.
 			 */
 			fdt = *(struct fdescenttbl * volatile *)&(fdp->fd_files);
 			continue;
 		}
 		/*
 		 * Use an acquire barrier to force re-reading of fdt so it is
 		 * refreshed for verification.
 		 */
 		if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) == 0)
 			goto retry;
 		fdt = fdp->fd_files;
 #ifdef	CAPABILITIES
 		if (seq_consistent_nomb(fd_seq(fdt, fd), seq))
 #else
 		if (fp == fdt->fdt_ofiles[fd].fde_file)
 #endif
 			break;
 		fdrop(fp, curthread);
 	}
 	*fpp = fp;
 	if (seqp != NULL) {
 #ifdef CAPABILITIES
 		*seqp = seq;
 #endif
 	}
 	return (0);
 }
 
 /*
  * Extract the file pointer associated with the specified descriptor for the
  * current user process.
  *
  * If the descriptor doesn't exist or doesn't match 'flags', EBADF is
  * returned.
  *
  * File's rights will be checked against the capability rights mask.
  *
  * If an error occurred the non-zero error is returned and *fpp is set to
  * NULL.  Otherwise *fpp is held and set and zero is returned.  Caller is
  * responsible for fdrop().
  */
 static __inline int
 _fget(struct thread *td, int fd, struct file **fpp, int flags,
     cap_rights_t *needrightsp, seq_t *seqp)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	int error;
 
 	*fpp = NULL;
 	fdp = td->td_proc->p_fd;
 	error = fget_unlocked(fdp, fd, needrightsp, &fp, seqp);
 	if (error != 0)
 		return (error);
 	if (fp->f_ops == &badfileops) {
 		fdrop(fp, td);
 		return (EBADF);
 	}
 
 	/*
 	 * FREAD and FWRITE failure return EBADF as per POSIX.
 	 */
 	error = 0;
 	switch (flags) {
 	case FREAD:
 	case FWRITE:
 		if ((fp->f_flag & flags) == 0)
 			error = EBADF;
 		break;
 	case FEXEC:
 	    	if ((fp->f_flag & (FREAD | FEXEC)) == 0 ||
 		    ((fp->f_flag & FWRITE) != 0))
 			error = EBADF;
 		break;
 	case 0:
 		break;
 	default:
 		KASSERT(0, ("wrong flags"));
 	}
 
 	if (error != 0) {
 		fdrop(fp, td);
 		return (error);
 	}
 
 	*fpp = fp;
 	return (0);
 }
 
 int
 fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
 	return (_fget(td, fd, fpp, 0, rightsp, NULL));
 }
 
 int
 fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
     struct file **fpp)
 {
 	int error;
 #ifndef CAPABILITIES
 	error = _fget(td, fd, fpp, 0, rightsp, NULL);
 	if (maxprotp != NULL)
 		*maxprotp = VM_PROT_ALL;
 #else
 	struct filedesc *fdp = td->td_proc->p_fd;
 	seq_t seq;
 
 	MPASS(cap_rights_is_set(rightsp, CAP_MMAP));
 	for (;;) {
 		error = _fget(td, fd, fpp, 0, rightsp, &seq);
 		if (error != 0)
 			return (error);
 		/*
 		 * If requested, convert capability rights to access flags.
 		 */
 		if (maxprotp != NULL)
 			*maxprotp = cap_rights_to_vmprot(cap_rights(fdp, fd));
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(*fpp, td);
 	}
 #endif
 	return (error);
 }
 
 int
 fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
 	return (_fget(td, fd, fpp, FREAD, rightsp, NULL));
 }
 
 int
 fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
 {
 
 	return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
 }
 
 int
 fget_fcntl(struct thread *td, int fd, cap_rights_t *rightsp, int needfcntl,
     struct file **fpp)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 #ifndef CAPABILITIES
 	return (fget_unlocked(fdp, fd, rightsp, fpp, NULL));
 #else
 	int error;
 	seq_t seq;
 
 	MPASS(cap_rights_is_set(rightsp, CAP_FCNTL));
 	for (;;) {
 		error = fget_unlocked(fdp, fd, rightsp, fpp, &seq);
 		if (error != 0)
 			return (error);
 		error = cap_fcntl_check(fdp, fd, needfcntl);
 		if (!fd_modified(fdp, fd, seq))
 			break;
 		fdrop(*fpp, td);
 	}
 	if (error != 0) {
 		fdrop(*fpp, td);
 		*fpp = NULL;
 	}
 	return (error);
 #endif
 }
 
 /*
  * Like fget() but loads the underlying vnode, or returns an error if the
  * descriptor does not represent a vnode.  Note that pipes use vnodes but
  * never have VM objects.  The returned vnode will be vref()'d.
  *
  * XXX: what about the unused flags ?
  */
 static __inline int
 _fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
     struct vnode **vpp)
 {
 	struct file *fp;
 	int error;
 
 	*vpp = NULL;
 	error = _fget(td, fd, &fp, flags, needrightsp, NULL);
 	if (error != 0)
 		return (error);
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
 	} else {
 		*vpp = fp->f_vnode;
 		vref(*vpp);
 	}
 	fdrop(fp, td);
 
 	return (error);
 }
 
 int
 fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, 0, rightsp, vpp));
 }
 
 int
 fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct filecaps *havecaps, struct vnode **vpp)
 {
 	struct filedesc *fdp;
 	struct filecaps caps;
 	struct file *fp;
 	int error;
 
 	fdp = td->td_proc->p_fd;
 	error = fget_cap_locked(fdp, fd, needrightsp, &fp, &caps);
 	if (error != 0)
 		return (error);
 	if (fp->f_ops == &badfileops) {
 		error = EBADF;
 		goto out;
 	}
 	if (fp->f_vnode == NULL) {
 		error = EINVAL;
 		goto out;
 	}
 
 	*havecaps = caps;
 	*vpp = fp->f_vnode;
 	vref(*vpp);
 
 	return (0);
 out:
 	filecaps_free(&caps);
 	return (error);
 }
 
 int
 fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, FREAD, rightsp, vpp));
 }
 
 int
 fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
 }
 
 #ifdef notyet
 int
 fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp)
 {
 
 	return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
 }
 #endif
 
 /*
  * Like fget() but loads the underlying socket, or returns an error if the
  * descriptor does not represent a socket.
  *
  * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
  * in the future.
  *
  * Note: fgetsock() and fputsock() are deprecated, as consumers should rely
  * on their file descriptor reference to prevent the socket from being free'd
  * during use.
  */
 int
 fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
     u_int *fflagp)
 {
 	struct file *fp;
 	int error;
 
 	*spp = NULL;
 	if (fflagp != NULL)
 		*fflagp = 0;
 	if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		error = ENOTSOCK;
 	} else {
 		*spp = fp->f_data;
 		if (fflagp)
 			*fflagp = fp->f_flag;
 		SOCK_LOCK(*spp);
 		soref(*spp);
 		SOCK_UNLOCK(*spp);
 	}
 	fdrop(fp, td);
 
 	return (error);
 }
 
 /*
  * Drop the reference count on the socket and XXX release the SX lock in the
  * future.  The last reference closes the socket.
  *
  * Note: fputsock() is deprecated, see comment for fgetsock().
  */
 void
 fputsock(struct socket *so)
 {
 
 	ACCEPT_LOCK();
 	SOCK_LOCK(so);
 	CURVNET_SET(so->so_vnet);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Handle the last reference to a file being closed.
  */
 int
 _fdrop(struct file *fp, struct thread *td)
 {
 	int error;
 
 	if (fp->f_count != 0)
 		panic("fdrop: count %d", fp->f_count);
 	error = fo_close(fp, td);
 	atomic_subtract_int(&openfiles, 1);
 	crfree(fp->f_cred);
 	free(fp->f_advice, M_FADVISE);
 	uma_zfree(file_zone, fp);
 
 	return (error);
 }
 
 /*
  * Apply an advisory lock on a file descriptor.
  *
  * Just attempt to get a record lock of the requested type on the entire file
  * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct flock_args {
 	int	fd;
 	int	how;
 };
 #endif
 /* ARGSUSED */
 int
 sys_flock(struct thread *td, struct flock_args *uap)
 {
 	struct file *fp;
 	struct vnode *vp;
 	struct flock lf;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_VNODE) {
 		fdrop(fp, td);
 		return (EOPNOTSUPP);
 	}
 
 	vp = fp->f_vnode;
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	if (uap->how & LOCK_UN) {
 		lf.l_type = F_UNLCK;
 		atomic_clear_int(&fp->f_flag, FHASLOCK);
 		error = VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
 		goto done2;
 	}
 	if (uap->how & LOCK_EX)
 		lf.l_type = F_WRLCK;
 	else if (uap->how & LOCK_SH)
 		lf.l_type = F_RDLCK;
 	else {
 		error = EBADF;
 		goto done2;
 	}
 	atomic_set_int(&fp->f_flag, FHASLOCK);
 	error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
 	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 done2:
 	fdrop(fp, td);
 	return (error);
 }
 /*
  * Duplicate the specified descriptor to a free descriptor.
  */
 int
 dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
     int openerror, int *indxp)
 {
 	struct filedescent *newfde, *oldfde;
 	struct file *fp;
 	int error, indx;
 
 	KASSERT(openerror == ENODEV || openerror == ENXIO,
 	    ("unexpected error %d in %s", openerror, __func__));
 
 	/*
 	 * If the to-be-dup'd fd number is greater than the allowed number
 	 * of file descriptors, or the fd to be dup'd has already been
 	 * closed, then reject.
 	 */
 	FILEDESC_XLOCK(fdp);
 	if ((fp = fget_locked(fdp, dfd)) == NULL) {
 		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 
 	error = fdalloc(td, 0, &indx);
 	if (error != 0) {
 		FILEDESC_XUNLOCK(fdp);
 		return (error);
 	}
 
 	/*
 	 * There are two cases of interest here.
 	 *
 	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
 	 *
 	 * For ENXIO steal away the file structure from (dfd) and store it in
 	 * (indx).  (dfd) is effectively closed by this operation.
 	 */
 	switch (openerror) {
 	case ENODEV:
 		/*
 		 * Check that the mode the file is being opened for is a
 		 * subset of the mode of the existing descriptor.
 		 */
 		if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
 			fdunused(fdp, indx);
 			FILEDESC_XUNLOCK(fdp);
 			return (EACCES);
 		}
 		fhold(fp);
 		newfde = &fdp->fd_ofiles[indx];
 		oldfde = &fdp->fd_ofiles[dfd];
 #ifdef CAPABILITIES
 		seq_write_begin(&newfde->fde_seq);
 #endif
 		memcpy(newfde, oldfde, fde_change_size);
 		filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps, true);
 #ifdef CAPABILITIES
 		seq_write_end(&newfde->fde_seq);
 #endif
 		break;
 	case ENXIO:
 		/*
 		 * Steal away the file pointer from dfd and stuff it into indx.
 		 */
 		newfde = &fdp->fd_ofiles[indx];
 		oldfde = &fdp->fd_ofiles[dfd];
 #ifdef CAPABILITIES
 		seq_write_begin(&newfde->fde_seq);
 #endif
 		memcpy(newfde, oldfde, fde_change_size);
 		oldfde->fde_file = NULL;
 		fdunused(fdp, dfd);
 #ifdef CAPABILITIES
 		seq_write_end(&newfde->fde_seq);
 #endif
 		break;
 	}
 	FILEDESC_XUNLOCK(fdp);
 	*indxp = indx;
 	return (0);
 }
 
 /*
  * This sysctl determines if we will allow a process to chroot(2) if it
  * has a directory open:
  *	0: disallowed for all processes.
  *	1: allowed for processes that were not already chroot(2)'ed.
  *	2: allowed for all processes.
  */
 
 static int chroot_allow_open_directories = 1;
 
 SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
     &chroot_allow_open_directories, 0,
     "Allow a process to chroot(2) if it has a directory open");
 
 /*
  * Helper function for raised chroot(2) security function:  Refuse if
  * any filedescriptors are open directories.
  */
 static int
 chroot_refuse_vdir_fds(struct filedesc *fdp)
 {
 	struct vnode *vp;
 	struct file *fp;
 	int fd;
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
 		fp = fget_locked(fdp, fd);
 		if (fp == NULL)
 			continue;
 		if (fp->f_type == DTYPE_VNODE) {
 			vp = fp->f_vnode;
 			if (vp->v_type == VDIR)
 				return (EPERM);
 		}
 	}
 	return (0);
 }
 
 /*
  * Common routine for kern_chroot() and jail_attach().  The caller is
  * responsible for invoking priv_check() and mac_vnode_check_chroot() to
  * authorize this operation.
  */
 int
 pwd_chroot(struct thread *td, struct vnode *vp)
 {
 	struct filedesc *fdp;
 	struct vnode *oldvp;
 	int error;
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	if (chroot_allow_open_directories == 0 ||
 	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
 		error = chroot_refuse_vdir_fds(fdp);
 		if (error != 0) {
 			FILEDESC_XUNLOCK(fdp);
 			return (error);
 		}
 	}
 	oldvp = fdp->fd_rdir;
 	VREF(vp);
 	fdp->fd_rdir = vp;
 	if (fdp->fd_jdir == NULL) {
 		VREF(vp);
 		fdp->fd_jdir = vp;
 	}
 	FILEDESC_XUNLOCK(fdp);
 	vrele(oldvp);
 	return (0);
 }
 
 void
 pwd_chdir(struct thread *td, struct vnode *vp)
 {
 	struct filedesc *fdp;
 	struct vnode *oldvp;
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_XLOCK(fdp);
 	VNASSERT(vp->v_usecount > 0, vp,
 	    ("chdir to a vnode with zero usecount"));
 	oldvp = fdp->fd_cdir;
 	fdp->fd_cdir = vp;
 	FILEDESC_XUNLOCK(fdp);
 	vrele(oldvp);
 }
 
 /*
  * Scan all active processes and prisons to see if any of them have a current
  * or root directory of `olddp'. If so, replace them with the new mount point.
  */
 void
 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
 {
 	struct filedesc *fdp;
 	struct prison *pr;
 	struct proc *p;
 	int nrele;
 
 	if (vrefcnt(olddp) == 1)
 		return;
 	nrele = 0;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		fdp = fdhold(p);
 		PROC_UNLOCK(p);
 		if (fdp == NULL)
 			continue;
 		FILEDESC_XLOCK(fdp);
 		if (fdp->fd_cdir == olddp) {
 			vref(newdp);
 			fdp->fd_cdir = newdp;
 			nrele++;
 		}
 		if (fdp->fd_rdir == olddp) {
 			vref(newdp);
 			fdp->fd_rdir = newdp;
 			nrele++;
 		}
 		if (fdp->fd_jdir == olddp) {
 			vref(newdp);
 			fdp->fd_jdir = newdp;
 			nrele++;
 		}
 		FILEDESC_XUNLOCK(fdp);
 		fddrop(fdp);
 	}
 	sx_sunlock(&allproc_lock);
 	if (rootvnode == olddp) {
 		vref(newdp);
 		rootvnode = newdp;
 		nrele++;
 	}
 	mtx_lock(&prison0.pr_mtx);
 	if (prison0.pr_root == olddp) {
 		vref(newdp);
 		prison0.pr_root = newdp;
 		nrele++;
 	}
 	mtx_unlock(&prison0.pr_mtx);
 	sx_slock(&allprison_lock);
 	TAILQ_FOREACH(pr, &allprison, pr_list) {
 		mtx_lock(&pr->pr_mtx);
 		if (pr->pr_root == olddp) {
 			vref(newdp);
 			pr->pr_root = newdp;
 			nrele++;
 		}
 		mtx_unlock(&pr->pr_mtx);
 	}
 	sx_sunlock(&allprison_lock);
 	while (nrele--)
 		vrele(olddp);
 }
 
 struct filedesc_to_leader *
 filedesc_to_leader_alloc(struct filedesc_to_leader *old, struct filedesc *fdp, struct proc *leader)
 {
 	struct filedesc_to_leader *fdtol;
 
 	fdtol = malloc(sizeof(struct filedesc_to_leader),
 	    M_FILEDESC_TO_LEADER, M_WAITOK);
 	fdtol->fdl_refcount = 1;
 	fdtol->fdl_holdcount = 0;
 	fdtol->fdl_wakeup = 0;
 	fdtol->fdl_leader = leader;
 	if (old != NULL) {
 		FILEDESC_XLOCK(fdp);
 		fdtol->fdl_next = old->fdl_next;
 		fdtol->fdl_prev = old;
 		old->fdl_next = fdtol;
 		fdtol->fdl_next->fdl_prev = fdtol;
 		FILEDESC_XUNLOCK(fdp);
 	} else {
 		fdtol->fdl_next = fdtol;
 		fdtol->fdl_prev = fdtol;
 	}
 	return (fdtol);
 }
 
 static int
 sysctl_kern_proc_nfds(SYSCTL_HANDLER_ARGS)
 {
 	struct filedesc *fdp;
 	int i, count, slots;
 
 	if (*(int *)arg1 != 0)
 		return (EINVAL);
 
 	fdp = curproc->p_fd;
 	count = 0;
 	FILEDESC_SLOCK(fdp);
 	slots = NDSLOTS(fdp->fd_lastfile + 1);
 	for (i = 0; i < slots; i++)
 		count += bitcountl(fdp->fd_map[i]);
 	FILEDESC_SUNLOCK(fdp);
 
 	return (SYSCTL_OUT(req, &count, sizeof(count)));
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_NFDS, nfds,
     CTLFLAG_RD|CTLFLAG_CAPRD|CTLFLAG_MPSAFE, sysctl_kern_proc_nfds,
     "Number of open file descriptors");
 
 /*
  * Get file structures globally.
  */
 static int
 sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 {
 	struct xfile xf;
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int error, n;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	if (req->oldptr == NULL) {
 		n = 0;
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			fdp = fdhold(p);
 			PROC_UNLOCK(p);
 			if (fdp == NULL)
 				continue;
 			/* overestimates sparse tables. */
 			if (fdp->fd_lastfile > 0)
 				n += fdp->fd_lastfile;
 			fddrop(fdp);
 		}
 		sx_sunlock(&allproc_lock);
 		return (SYSCTL_OUT(req, 0, n * sizeof(xf)));
 	}
 	error = 0;
 	bzero(&xf, sizeof(xf));
 	xf.xf_size = sizeof(xf);
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_state == PRS_NEW) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		if (p_cansee(req->td, p) != 0) {
 			PROC_UNLOCK(p);
 			continue;
 		}
 		xf.xf_pid = p->p_pid;
 		xf.xf_uid = p->p_ucred->cr_uid;
 		fdp = fdhold(p);
 		PROC_UNLOCK(p);
 		if (fdp == NULL)
 			continue;
 		FILEDESC_SLOCK(fdp);
 		for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
 			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 				continue;
 			xf.xf_fd = n;
 			xf.xf_file = fp;
 			xf.xf_data = fp->f_data;
 			xf.xf_vnode = fp->f_vnode;
 			xf.xf_type = fp->f_type;
 			xf.xf_count = fp->f_count;
 			xf.xf_msgcount = 0;
 			xf.xf_offset = foffset_get(fp);
 			xf.xf_flag = fp->f_flag;
 			error = SYSCTL_OUT(req, &xf, sizeof(xf));
 			if (error)
 				break;
 		}
 		FILEDESC_SUNLOCK(fdp);
 		fddrop(fdp);
 		if (error)
 			break;
 	}
 	sx_sunlock(&allproc_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
     0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
 
 #ifdef KINFO_FILE_SIZE
 CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
 #endif
 
 static int
 xlate_fflags(int fflags)
 {
 	static const struct {
 		int	fflag;
 		int	kf_fflag;
 	} fflags_table[] = {
 		{ FAPPEND, KF_FLAG_APPEND },
 		{ FASYNC, KF_FLAG_ASYNC },
 		{ FFSYNC, KF_FLAG_FSYNC },
 		{ FHASLOCK, KF_FLAG_HASLOCK },
 		{ FNONBLOCK, KF_FLAG_NONBLOCK },
 		{ FREAD, KF_FLAG_READ },
 		{ FWRITE, KF_FLAG_WRITE },
 		{ O_CREAT, KF_FLAG_CREAT },
 		{ O_DIRECT, KF_FLAG_DIRECT },
 		{ O_EXCL, KF_FLAG_EXCL },
 		{ O_EXEC, KF_FLAG_EXEC },
 		{ O_EXLOCK, KF_FLAG_EXLOCK },
 		{ O_NOFOLLOW, KF_FLAG_NOFOLLOW },
 		{ O_SHLOCK, KF_FLAG_SHLOCK },
 		{ O_TRUNC, KF_FLAG_TRUNC }
 	};
 	unsigned int i;
 	int kflags;
 
 	kflags = 0;
 	for (i = 0; i < nitems(fflags_table); i++)
 		if (fflags & fflags_table[i].fflag)
 			kflags |=  fflags_table[i].kf_fflag;
 	return (kflags);
 }
 
 /* Trim unused data from kf_path by truncating the structure size. */
 static void
 pack_kinfo(struct kinfo_file *kif)
 {
 
 	kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
 	    strlen(kif->kf_path) + 1;
 	kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
 }
 
 static void
 export_file_to_kinfo(struct file *fp, int fd, cap_rights_t *rightsp,
     struct kinfo_file *kif, struct filedesc *fdp, int flags)
 {
 	int error;
 
 	bzero(kif, sizeof(*kif));
 
 	/* Set a default type to allow for empty fill_kinfo() methods. */
 	kif->kf_type = KF_TYPE_UNKNOWN;
 	kif->kf_flags = xlate_fflags(fp->f_flag);
 	if (rightsp != NULL)
 		kif->kf_cap_rights = *rightsp;
 	else
 		cap_rights_init(&kif->kf_cap_rights);
 	kif->kf_fd = fd;
 	kif->kf_ref_count = fp->f_count;
 	kif->kf_offset = foffset_get(fp);
 
 	/*
 	 * This may drop the filedesc lock, so the 'fp' cannot be
 	 * accessed after this call.
 	 */
 	error = fo_fill_kinfo(fp, kif, fdp);
 	if (error == 0)
 		kif->kf_status |= KF_ATTR_VALID;
 	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
 		pack_kinfo(kif);
 	else
 		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 }
 
 static void
 export_vnode_to_kinfo(struct vnode *vp, int fd, int fflags,
     struct kinfo_file *kif, int flags)
 {
 	int error;
 
 	bzero(kif, sizeof(*kif));
 
 	kif->kf_type = KF_TYPE_VNODE;
 	error = vn_fill_kinfo_vnode(vp, kif);
 	if (error == 0)
 		kif->kf_status |= KF_ATTR_VALID;
 	kif->kf_flags = xlate_fflags(fflags);
 	cap_rights_init(&kif->kf_cap_rights);
 	kif->kf_fd = fd;
 	kif->kf_ref_count = -1;
 	kif->kf_offset = -1;
 	if ((flags & KERN_FILEDESC_PACK_KINFO) != 0)
 		pack_kinfo(kif);
 	else
 		kif->kf_structsize = roundup2(sizeof(*kif), sizeof(uint64_t));
 	vrele(vp);
 }
 
 struct export_fd_buf {
 	struct filedesc		*fdp;
 	struct sbuf 		*sb;
 	ssize_t			remainder;
 	struct kinfo_file	kif;
 	int			flags;
 };
 
 static int
 export_kinfo_to_sb(struct export_fd_buf *efbuf)
 {
 	struct kinfo_file *kif;
 
 	kif = &efbuf->kif;
 	if (efbuf->remainder != -1) {
 		if (efbuf->remainder < kif->kf_structsize) {
 			/* Terminate export. */
 			efbuf->remainder = 0;
 			return (0);
 		}
 		efbuf->remainder -= kif->kf_structsize;
 	}
 	return (sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM);
 }
 
 static int
 export_file_to_sb(struct file *fp, int fd, cap_rights_t *rightsp,
     struct export_fd_buf *efbuf)
 {
 	int error;
 
 	if (efbuf->remainder == 0)
 		return (0);
 	export_file_to_kinfo(fp, fd, rightsp, &efbuf->kif, efbuf->fdp,
 	    efbuf->flags);
 	FILEDESC_SUNLOCK(efbuf->fdp);
 	error = export_kinfo_to_sb(efbuf);
 	FILEDESC_SLOCK(efbuf->fdp);
 	return (error);
 }
 
 static int
 export_vnode_to_sb(struct vnode *vp, int fd, int fflags,
     struct export_fd_buf *efbuf)
 {
 	int error;
 
 	if (efbuf->remainder == 0)
 		return (0);
 	if (efbuf->fdp != NULL)
 		FILEDESC_SUNLOCK(efbuf->fdp);
 	export_vnode_to_kinfo(vp, fd, fflags, &efbuf->kif, efbuf->flags);
 	error = export_kinfo_to_sb(efbuf);
 	if (efbuf->fdp != NULL)
 		FILEDESC_SLOCK(efbuf->fdp);
 	return (error);
 }
 
 /*
  * Store a process file descriptor information to sbuf.
  *
  * Takes a locked proc as argument, and returns with the proc unlocked.
  */
 int
 kern_proc_filedesc_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen,
     int flags)
 {
 	struct file *fp;
 	struct filedesc *fdp;
 	struct export_fd_buf *efbuf;
 	struct vnode *cttyvp, *textvp, *tracevp;
 	int error, i;
 	cap_rights_t rights;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/* ktrace vnode */
 	tracevp = p->p_tracevp;
 	if (tracevp != NULL)
 		vref(tracevp);
 	/* text vnode */
 	textvp = p->p_textvp;
 	if (textvp != NULL)
 		vref(textvp);
 	/* Controlling tty. */
 	cttyvp = NULL;
 	if (p->p_pgrp != NULL && p->p_pgrp->pg_session != NULL) {
 		cttyvp = p->p_pgrp->pg_session->s_ttyvp;
 		if (cttyvp != NULL)
 			vref(cttyvp);
 	}
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 	efbuf->fdp = NULL;
 	efbuf->sb = sb;
 	efbuf->remainder = maxlen;
 	efbuf->flags = flags;
 	if (tracevp != NULL)
 		export_vnode_to_sb(tracevp, KF_FD_TYPE_TRACE, FREAD | FWRITE,
 		    efbuf);
 	if (textvp != NULL)
 		export_vnode_to_sb(textvp, KF_FD_TYPE_TEXT, FREAD, efbuf);
 	if (cttyvp != NULL)
 		export_vnode_to_sb(cttyvp, KF_FD_TYPE_CTTY, FREAD | FWRITE,
 		    efbuf);
 	error = 0;
 	if (fdp == NULL)
 		goto fail;
 	efbuf->fdp = fdp;
 	FILEDESC_SLOCK(fdp);
 	/* working directory */
 	if (fdp->fd_cdir != NULL) {
 		vref(fdp->fd_cdir);
 		export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD, FREAD, efbuf);
 	}
 	/* root directory */
 	if (fdp->fd_rdir != NULL) {
 		vref(fdp->fd_rdir);
 		export_vnode_to_sb(fdp->fd_rdir, KF_FD_TYPE_ROOT, FREAD, efbuf);
 	}
 	/* jail directory */
 	if (fdp->fd_jdir != NULL) {
 		vref(fdp->fd_jdir);
 		export_vnode_to_sb(fdp->fd_jdir, KF_FD_TYPE_JAIL, FREAD, efbuf);
 	}
 	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
 		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 #ifdef CAPABILITIES
 		rights = *cap_rights(fdp, i);
 #else /* !CAPABILITIES */
 		cap_rights_init(&rights);
 #endif
 		/*
 		 * Create sysctl entry.  It is OK to drop the filedesc
 		 * lock inside of export_file_to_sb() as we will
 		 * re-validate and re-evaluate its properties when the
 		 * loop continues.
 		 */
 		error = export_file_to_sb(fp, i, &rights, efbuf);
 		if (error != 0 || efbuf->remainder == 0)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 fail:
 	free(efbuf, M_TEMP);
 	return (error);
 }
 
 #define FILEDESC_SBUF_SIZE	(sizeof(struct kinfo_file) * 5)
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  */
 static int
 sysctl_kern_proc_filedesc(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct proc *p;
 	ssize_t maxlen;
 	int error, error2, *name;
 
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
 	error = kern_proc_filedesc_out(p, &sb, maxlen,
 	    KERN_FILEDESC_PACK_KINFO);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 #ifdef KINFO_OFILE_SIZE
 CTASSERT(sizeof(struct kinfo_ofile) == KINFO_OFILE_SIZE);
 #endif
 
 #ifdef COMPAT_FREEBSD7
 static void
 kinfo_to_okinfo(struct kinfo_file *kif, struct kinfo_ofile *okif)
 {
 
 	okif->kf_structsize = sizeof(*okif);
 	okif->kf_type = kif->kf_type;
 	okif->kf_fd = kif->kf_fd;
 	okif->kf_ref_count = kif->kf_ref_count;
 	okif->kf_flags = kif->kf_flags & (KF_FLAG_READ | KF_FLAG_WRITE |
 	    KF_FLAG_APPEND | KF_FLAG_ASYNC | KF_FLAG_FSYNC | KF_FLAG_NONBLOCK |
 	    KF_FLAG_DIRECT | KF_FLAG_HASLOCK);
 	okif->kf_offset = kif->kf_offset;
 	okif->kf_vnode_type = kif->kf_vnode_type;
 	okif->kf_sock_domain = kif->kf_sock_domain;
 	okif->kf_sock_type = kif->kf_sock_type;
 	okif->kf_sock_protocol = kif->kf_sock_protocol;
 	strlcpy(okif->kf_path, kif->kf_path, sizeof(okif->kf_path));
 	okif->kf_sa_local = kif->kf_sa_local;
 	okif->kf_sa_peer = kif->kf_sa_peer;
 }
 
 static int
 export_vnode_for_osysctl(struct vnode *vp, int type, struct kinfo_file *kif,
     struct kinfo_ofile *okif, struct filedesc *fdp, struct sysctl_req *req)
 {
 	int error;
 
 	vref(vp);
 	FILEDESC_SUNLOCK(fdp);
 	export_vnode_to_kinfo(vp, type, 0, kif, KERN_FILEDESC_PACK_KINFO);
 	kinfo_to_okinfo(kif, okif);
 	error = SYSCTL_OUT(req, okif, sizeof(*okif));
 	FILEDESC_SLOCK(fdp);
 	return (error);
 }
 
 /*
  * Get per-process file descriptors for use by procstat(1), et al.
  */
 static int
 sysctl_kern_proc_ofiledesc(SYSCTL_HANDLER_ARGS)
 {
 	struct kinfo_ofile *okif;
 	struct kinfo_file *kif;
 	struct filedesc *fdp;
 	int error, i, *name;
 	struct file *fp;
 	struct proc *p;
 
 	name = (int *)arg1;
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0)
 		return (error);
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	if (fdp == NULL)
 		return (ENOENT);
 	kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
 	okif = malloc(sizeof(*okif), M_TEMP, M_WAITOK);
 	FILEDESC_SLOCK(fdp);
 	if (fdp->fd_cdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_cdir, KF_FD_TYPE_CWD, kif,
 		    okif, fdp, req);
 	if (fdp->fd_rdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_rdir, KF_FD_TYPE_ROOT, kif,
 		    okif, fdp, req);
 	if (fdp->fd_jdir != NULL)
 		export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
 		    okif, fdp, req);
 	for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
 		if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
 			continue;
 		export_file_to_kinfo(fp, i, NULL, kif, fdp,
 		    KERN_FILEDESC_PACK_KINFO);
 		FILEDESC_SUNLOCK(fdp);
 		kinfo_to_okinfo(kif, okif);
 		error = SYSCTL_OUT(req, okif, sizeof(*okif));
 		FILEDESC_SLOCK(fdp);
 		if (error)
 			break;
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 	free(kif, M_TEMP);
 	free(okif, M_TEMP);
 	return (0);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
     "Process ofiledesc entries");
 #endif	/* COMPAT_FREEBSD7 */
 
 int
 vntype_to_kinfo(int vtype)
 {
 	struct {
 		int	vtype;
 		int	kf_vtype;
 	} vtypes_table[] = {
 		{ VBAD, KF_VTYPE_VBAD },
 		{ VBLK, KF_VTYPE_VBLK },
 		{ VCHR, KF_VTYPE_VCHR },
 		{ VDIR, KF_VTYPE_VDIR },
 		{ VFIFO, KF_VTYPE_VFIFO },
 		{ VLNK, KF_VTYPE_VLNK },
 		{ VNON, KF_VTYPE_VNON },
 		{ VREG, KF_VTYPE_VREG },
 		{ VSOCK, KF_VTYPE_VSOCK }
 	};
 	unsigned int i;
 
 	/*
 	 * Perform vtype translation.
 	 */
 	for (i = 0; i < nitems(vtypes_table); i++)
 		if (vtypes_table[i].vtype == vtype)
 			return (vtypes_table[i].kf_vtype);
 
 	return (KF_VTYPE_UNKNOWN);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
     CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
     "Process filedesc entries");
 
 /*
  * Store a process current working directory information to sbuf.
  *
  * Takes a locked proc as argument, and returns with the proc unlocked.
  */
 int
 kern_proc_cwd_out(struct proc *p,  struct sbuf *sb, ssize_t maxlen)
 {
 	struct filedesc *fdp;
 	struct export_fd_buf *efbuf;
 	int error;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	fdp = fdhold(p);
 	PROC_UNLOCK(p);
 	if (fdp == NULL)
 		return (EINVAL);
 
 	efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
 	efbuf->fdp = fdp;
 	efbuf->sb = sb;
 	efbuf->remainder = maxlen;
 
 	FILEDESC_SLOCK(fdp);
 	if (fdp->fd_cdir == NULL)
 		error = EINVAL;
 	else {
 		vref(fdp->fd_cdir);
 		error = export_vnode_to_sb(fdp->fd_cdir, KF_FD_TYPE_CWD,
 		    FREAD, efbuf);
 	}
 	FILEDESC_SUNLOCK(fdp);
 	fddrop(fdp);
 	free(efbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Get per-process current working directory.
  */
 static int
 sysctl_kern_proc_cwd(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sb;
 	struct proc *p;
 	ssize_t maxlen;
 	int error, error2, *name;
 
 	name = (int *)arg1;
 
 	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file), req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
 	if (error != 0) {
 		sbuf_delete(&sb);
 		return (error);
 	}
 	maxlen = req->oldptr != NULL ? req->oldlen : -1;
 	error = kern_proc_cwd_out(p, &sb, maxlen);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 static SYSCTL_NODE(_kern_proc, KERN_PROC_CWD, cwd, CTLFLAG_RD|CTLFLAG_MPSAFE,
     sysctl_kern_proc_cwd, "Process current working directory");
 
 #ifdef DDB
 /*
  * For the purposes of debugging, generate a human-readable string for the
  * file type.
  */
 static const char *
 file_type_to_name(short type)
 {
 
 	switch (type) {
 	case 0:
 		return ("zero");
 	case DTYPE_VNODE:
 		return ("vnod");
 	case DTYPE_SOCKET:
 		return ("sock");
 	case DTYPE_PIPE:
 		return ("pipe");
 	case DTYPE_FIFO:
 		return ("fifo");
 	case DTYPE_KQUEUE:
 		return ("kque");
 	case DTYPE_CRYPTO:
 		return ("crpt");
 	case DTYPE_MQUEUE:
 		return ("mque");
 	case DTYPE_SHM:
 		return ("shm");
 	case DTYPE_SEM:
 		return ("ksem");
 	default:
 		return ("unkn");
 	}
 }
 
 /*
  * For the purposes of debugging, identify a process (if any, perhaps one of
  * many) that references the passed file in its file descriptor array. Return
  * NULL if none.
  */
 static struct proc *
 file_to_first_proc(struct file *fp)
 {
 	struct filedesc *fdp;
 	struct proc *p;
 	int n;
 
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		fdp = p->p_fd;
 		if (fdp == NULL)
 			continue;
 		for (n = 0; n <= fdp->fd_lastfile; n++) {
 			if (fp == fdp->fd_ofiles[n].fde_file)
 				return (p);
 		}
 	}
 	return (NULL);
 }
 
 static void
 db_print_file(struct file *fp, int header)
 {
 	struct proc *p;
 
 	if (header)
 		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
 		    "File", "Type", "Data", "Flag", "GCFl", "Count",
 		    "MCount", "Vnode", "FPID", "FCmd");
 	p = file_to_first_proc(fp);
 	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
 	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
 	    0, fp->f_count, 0, fp->f_vnode,
 	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
 }
 
 DB_SHOW_COMMAND(file, db_show_file)
 {
 	struct file *fp;
 
 	if (!have_addr) {
 		db_printf("usage: show file <addr>\n");
 		return;
 	}
 	fp = (struct file *)addr;
 	db_print_file(fp, 1);
 }
 
 DB_SHOW_COMMAND(files, db_show_files)
 {
 	struct filedesc *fdp;
 	struct file *fp;
 	struct proc *p;
 	int header;
 	int n;
 
 	header = 1;
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		if ((fdp = p->p_fd) == NULL)
 			continue;
 		for (n = 0; n <= fdp->fd_lastfile; ++n) {
 			if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
 				continue;
 			db_print_file(fp, header);
 			header = 0;
 		}
 	}
 }
 #endif
 
 SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, CTLFLAG_RW,
     &maxfilesperproc, 0, "Maximum files allowed open per process");
 
 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW,
     &maxfiles, 0, "Maximum number of files");
 
 SYSCTL_INT(_kern, OID_AUTO, openfiles, CTLFLAG_RD,
     __DEVOLATILE(int *, &openfiles), 0, "System-wide number of open files");
 
 /* ARGSUSED*/
 static void
 filelistinit(void *dummy)
 {
 
 	file_zone = uma_zcreate("Files", sizeof(struct file), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	filedesc0_zone = uma_zcreate("filedesc0", sizeof(struct filedesc0),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	mtx_init(&sigio_lock, "sigio lock", NULL, MTX_DEF);
 }
 SYSINIT(select, SI_SUB_LOCK, SI_ORDER_FIRST, filelistinit, NULL);
 
 /*-------------------------------------------------------------------*/
 
 static int
 badfo_readwrite(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 static int
 badfo_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (0);
 }
 
 static int
 badfo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_close(struct file *fp, struct thread *td)
 {
 
 	return (0);
 }
 
 static int
 badfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 
 	return (EBADF);
 }
 
 static int
 badfo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 
 	return (0);
 }
 
 struct fileops badfileops = {
 	.fo_read = badfo_readwrite,
 	.fo_write = badfo_readwrite,
 	.fo_truncate = badfo_truncate,
 	.fo_ioctl = badfo_ioctl,
 	.fo_poll = badfo_poll,
 	.fo_kqfilter = badfo_kqfilter,
 	.fo_stat = badfo_stat,
 	.fo_close = badfo_close,
 	.fo_chmod = badfo_chmod,
 	.fo_chown = badfo_chown,
 	.fo_sendfile = badfo_sendfile,
 	.fo_fill_kinfo = badfo_fill_kinfo,
 };
 
 int
 invfo_rdwr(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 
 	return (EOPNOTSUPP);
 }
 
 int
 invfo_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_ioctl(struct file *fp, u_long com, void *data,
     struct ucred *active_cred, struct thread *td)
 {
 
 	return (ENOTTY);
 }
 
 int
 invfo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (poll_no_poll(events));
 }
 
 int
 invfo_kqfilter(struct file *fp, struct knote *kn)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 int
 invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 
 	return (EINVAL);
 }
 
 /*-------------------------------------------------------------------*/
 
 /*
  * File Descriptor pseudo-device driver (/dev/fd/).
  *
  * Opening minor device N dup()s the file (if any) connected to file
  * descriptor N belonging to the calling process.  Note that this driver
  * consists of only the ``open()'' routine, because all subsequent
  * references to this file will be direct to the other driver.
  *
  * XXX: we could give this one a cloning event handler if necessary.
  */
 
 /* ARGSUSED */
 static int
 fdopen(struct cdev *dev, int mode, int type, struct thread *td)
 {
 
 	/*
 	 * XXX Kludge: set curthread->td_dupfd to contain the value of the
 	 * the file descriptor being sought for duplication. The error
 	 * return ensures that the vnode for this device will be released
 	 * by vn_open. Open will detect this special error and take the
 	 * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
 	 * will simply report the error.
 	 */
 	td->td_dupfd = dev2unit(dev);
 	return (ENODEV);
 }
 
 static struct cdevsw fildesc_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_open =	fdopen,
 	.d_name =	"FD",
 };
 
 static void
 fildesc_drvinit(void *unused)
 {
 	struct cdev *dev;
 
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 0, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/0");
 	make_dev_alias(dev, "stdin");
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 1, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/1");
 	make_dev_alias(dev, "stdout");
 	dev = make_dev_credf(MAKEDEV_ETERNAL, &fildesc_cdevsw, 2, NULL,
 	    UID_ROOT, GID_WHEEL, 0666, "fd/2");
 	make_dev_alias(dev, "stderr");
 }
 
 SYSINIT(fildescdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, fildesc_drvinit, NULL);
Index: user/alc/PQ_LAUNDRY/sys/kern/kern_procctl.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/kern_procctl.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/kern/kern_procctl.c	(revision 306283)
@@ -1,576 +1,576 @@
 /*-
  * Copyright (c) 2014 John Baldwin
- * Copyright (c) 2014 The FreeBSD Foundation
+ * Copyright (c) 2014, 2016 The FreeBSD Foundation
  *
  * Portions of this software were developed by Konstantin Belousov
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/procctl.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 #include <sys/wait.h>
 
 static int
 protect_setchild(struct thread *td, struct proc *p, int flags)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (p->p_flag & P_SYSTEM || p_cansched(td, p) != 0)
 		return (0);
 	if (flags & PPROT_SET) {
 		p->p_flag |= P_PROTECTED;
 		if (flags & PPROT_INHERIT)
 			p->p_flag2 |= P2_INHERIT_PROTECTED;
 	} else {
 		p->p_flag &= ~P_PROTECTED;
 		p->p_flag2 &= ~P2_INHERIT_PROTECTED;
 	}
 	return (1);
 }
 
 static int
 protect_setchildren(struct thread *td, struct proc *top, int flags)
 {
 	struct proc *p;
 	int ret;
 
 	p = top;
 	ret = 0;
 	sx_assert(&proctree_lock, SX_LOCKED);
 	for (;;) {
 		ret |= protect_setchild(td, p, flags);
 		PROC_UNLOCK(p);
 		/*
 		 * If this process has children, descend to them next,
 		 * otherwise do any siblings, and if done with this level,
 		 * follow back up the tree (but not past top).
 		 */
 		if (!LIST_EMPTY(&p->p_children))
 			p = LIST_FIRST(&p->p_children);
 		else for (;;) {
 			if (p == top) {
 				PROC_LOCK(p);
 				return (ret);
 			}
 			if (LIST_NEXT(p, p_sibling)) {
 				p = LIST_NEXT(p, p_sibling);
 				break;
 			}
 			p = p->p_pptr;
 		}
 		PROC_LOCK(p);
 	}
 }
 
 static int
 protect_set(struct thread *td, struct proc *p, int flags)
 {
 	int error, ret;
 
 	switch (PPROT_OP(flags)) {
 	case PPROT_SET:
 	case PPROT_CLEAR:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	if ((PPROT_FLAGS(flags) & ~(PPROT_DESCEND | PPROT_INHERIT)) != 0)
 		return (EINVAL);
 
 	error = priv_check(td, PRIV_VM_MADV_PROTECT);
 	if (error)
 		return (error);
 
 	if (flags & PPROT_DESCEND)
 		ret = protect_setchildren(td, p, flags);
 	else
 		ret = protect_setchild(td, p, flags);
 	if (ret == 0)
 		return (EPERM);
 	return (0);
 }
 
 static int
 reap_acquire(struct thread *td, struct proc *p)
 {
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	if (p != curproc)
 		return (EPERM);
 	if ((p->p_treeflag & P_TREE_REAPER) != 0)
 		return (EBUSY);
 	p->p_treeflag |= P_TREE_REAPER;
 	/*
 	 * We do not reattach existing children and the whole tree
 	 * under them to us, since p->p_reaper already seen them.
 	 */
 	return (0);
 }
 
 static int
 reap_release(struct thread *td, struct proc *p)
 {
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	if (p != curproc)
 		return (EPERM);
 	if (p == initproc)
 		return (EINVAL);
 	if ((p->p_treeflag & P_TREE_REAPER) == 0)
 		return (EINVAL);
 	reaper_abandon_children(p, false);
 	return (0);
 }
 
 static int
 reap_status(struct thread *td, struct proc *p,
     struct procctl_reaper_status *rs)
 {
 	struct proc *reap, *p2, *first_p;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	bzero(rs, sizeof(*rs));
 	if ((p->p_treeflag & P_TREE_REAPER) == 0) {
 		reap = p->p_reaper;
 	} else {
 		reap = p;
 		rs->rs_flags |= REAPER_STATUS_OWNED;
 	}
 	if (reap == initproc)
 		rs->rs_flags |= REAPER_STATUS_REALINIT;
 	rs->rs_reaper = reap->p_pid;
 	rs->rs_descendants = 0;
 	rs->rs_children = 0;
 	if (!LIST_EMPTY(&reap->p_reaplist)) {
 		first_p = LIST_FIRST(&reap->p_children);
 		if (first_p == NULL)
 			first_p = LIST_FIRST(&reap->p_reaplist);
 		rs->rs_pid = first_p->p_pid;
 		LIST_FOREACH(p2, &reap->p_reaplist, p_reapsibling) {
 			if (proc_realparent(p2) == reap)
 				rs->rs_children++;
 			rs->rs_descendants++;
 		}
 	} else {
 		rs->rs_pid = -1;
 	}
 	return (0);
 }
 
 static int
 reap_getpids(struct thread *td, struct proc *p, struct procctl_reaper_pids *rp)
 {
 	struct proc *reap, *p2;
 	struct procctl_reaper_pidinfo *pi, *pip;
 	u_int i, n;
 	int error;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	PROC_UNLOCK(p);
 	reap = (p->p_treeflag & P_TREE_REAPER) == 0 ? p->p_reaper : p;
 	n = i = 0;
 	error = 0;
 	LIST_FOREACH(p2, &reap->p_reaplist, p_reapsibling)
 		n++;
 	sx_unlock(&proctree_lock);
 	if (rp->rp_count < n)
 		n = rp->rp_count;
 	pi = malloc(n * sizeof(*pi), M_TEMP, M_WAITOK);
 	sx_slock(&proctree_lock);
 	LIST_FOREACH(p2, &reap->p_reaplist, p_reapsibling) {
 		if (i == n)
 			break;
 		pip = &pi[i];
 		bzero(pip, sizeof(*pip));
 		pip->pi_pid = p2->p_pid;
 		pip->pi_subtree = p2->p_reapsubtree;
 		pip->pi_flags = REAPER_PIDINFO_VALID;
 		if (proc_realparent(p2) == reap)
 			pip->pi_flags |= REAPER_PIDINFO_CHILD;
 		i++;
 	}
 	sx_sunlock(&proctree_lock);
 	error = copyout(pi, rp->rp_pids, i * sizeof(*pi));
 	free(pi, M_TEMP);
 	sx_slock(&proctree_lock);
 	PROC_LOCK(p);
 	return (error);
 }
 
 static int
 reap_kill(struct thread *td, struct proc *p, struct procctl_reaper_kill *rk)
 {
 	struct proc *reap, *p2;
 	ksiginfo_t ksi;
 	int error, error1;
 
 	sx_assert(&proctree_lock, SX_LOCKED);
 	if (IN_CAPABILITY_MODE(td))
 		return (ECAPMODE);
 	if (rk->rk_sig <= 0 || rk->rk_sig > _SIG_MAXSIG)
 		return (EINVAL);
 	if ((rk->rk_flags & ~REAPER_KILL_CHILDREN) != 0)
 		return (EINVAL);
 	PROC_UNLOCK(p);
 	reap = (p->p_treeflag & P_TREE_REAPER) == 0 ? p->p_reaper : p;
 	ksiginfo_init(&ksi);
 	ksi.ksi_signo = rk->rk_sig;
 	ksi.ksi_code = SI_USER;
 	ksi.ksi_pid = td->td_proc->p_pid;
 	ksi.ksi_uid = td->td_ucred->cr_ruid;
 	error = ESRCH;
 	rk->rk_killed = 0;
 	rk->rk_fpid = -1;
 	for (p2 = (rk->rk_flags & REAPER_KILL_CHILDREN) != 0 ?
 	    LIST_FIRST(&reap->p_children) : LIST_FIRST(&reap->p_reaplist);
 	    p2 != NULL;
 	    p2 = (rk->rk_flags & REAPER_KILL_CHILDREN) != 0 ?
 	    LIST_NEXT(p2, p_sibling) : LIST_NEXT(p2, p_reapsibling)) {
 		if ((rk->rk_flags & REAPER_KILL_SUBTREE) != 0 &&
 		    p2->p_reapsubtree != rk->rk_subtree)
 			continue;
 		PROC_LOCK(p2);
 		error1 = p_cansignal(td, p2, rk->rk_sig);
 		if (error1 == 0) {
 			pksignal(p2, rk->rk_sig, &ksi);
 			rk->rk_killed++;
 			error = error1;
 		} else if (error == ESRCH) {
 			error = error1;
 			rk->rk_fpid = p2->p_pid;
 		}
 		PROC_UNLOCK(p2);
 		/* Do not end the loop on error, signal everything we can. */
 	}
 	PROC_LOCK(p);
 	return (error);
 }
 
 static int
 trace_ctl(struct thread *td, struct proc *p, int state)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * Ktrace changes p_traceflag from or to zero under the
 	 * process lock, so the test does not need to acquire ktrace
 	 * mutex.
 	 */
 	if ((p->p_flag & P_TRACED) != 0 || p->p_traceflag != 0)
 		return (EBUSY);
 
 	switch (state) {
 	case PROC_TRACE_CTL_ENABLE:
 		if (td->td_proc != p)
 			return (EPERM);
 		p->p_flag2 &= ~(P2_NOTRACE | P2_NOTRACE_EXEC);
 		break;
 	case PROC_TRACE_CTL_DISABLE_EXEC:
 		p->p_flag2 |= P2_NOTRACE_EXEC | P2_NOTRACE;
 		break;
 	case PROC_TRACE_CTL_DISABLE:
 		if ((p->p_flag2 & P2_NOTRACE_EXEC) != 0) {
 			KASSERT((p->p_flag2 & P2_NOTRACE) != 0,
 			    ("dandling P2_NOTRACE_EXEC"));
 			if (td->td_proc != p)
 				return (EPERM);
 			p->p_flag2 &= ~P2_NOTRACE_EXEC;
 		} else {
 			p->p_flag2 |= P2_NOTRACE;
 		}
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static int
 trace_status(struct thread *td, struct proc *p, int *data)
 {
 
 	if ((p->p_flag2 & P2_NOTRACE) != 0) {
 		KASSERT((p->p_flag & P_TRACED) == 0,
 		    ("%d traced but tracing disabled", p->p_pid));
 		*data = -1;
 	} else if ((p->p_flag & P_TRACED) != 0) {
 		*data = p->p_pptr->p_pid;
 	} else {
 		*data = 0;
 	}
 	return (0);
 }
 
 static int
 trapcap_ctl(struct thread *td, struct proc *p, int state)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	switch (state) {
 	case PROC_TRAPCAP_CTL_ENABLE:
 		p->p_flag2 |= P2_TRAPCAP;
 		break;
 	case PROC_TRAPCAP_CTL_DISABLE:
 		p->p_flag2 &= ~P2_TRAPCAP;
 		break;
 	default:
 		return (EINVAL);
 	}
 	return (0);
 }
 
 static int
 trapcap_status(struct thread *td, struct proc *p, int *data)
 {
 
 	*data = (p->p_flag2 & P2_TRAPCAP) != 0 ? PROC_TRAPCAP_CTL_ENABLE :
 	    PROC_TRAPCAP_CTL_DISABLE;
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct procctl_args {
 	idtype_t idtype;
 	id_t	id;
 	int	com;
 	void	*data;
 };
 #endif
 /* ARGSUSED */
 int
 sys_procctl(struct thread *td, struct procctl_args *uap)
 {
 	void *data;
 	union {
 		struct procctl_reaper_status rs;
 		struct procctl_reaper_pids rp;
 		struct procctl_reaper_kill rk;
 	} x;
 	int error, error1, flags;
 
 	switch (uap->com) {
 	case PROC_SPROTECT:
 	case PROC_TRACE_CTL:
 	case PROC_TRAPCAP_CTL:
 		error = copyin(uap->data, &flags, sizeof(flags));
 		if (error != 0)
 			return (error);
 		data = &flags;
 		break;
 	case PROC_REAP_ACQUIRE:
 	case PROC_REAP_RELEASE:
 		if (uap->data != NULL)
 			return (EINVAL);
 		data = NULL;
 		break;
 	case PROC_REAP_STATUS:
 		data = &x.rs;
 		break;
 	case PROC_REAP_GETPIDS:
 		error = copyin(uap->data, &x.rp, sizeof(x.rp));
 		if (error != 0)
 			return (error);
 		data = &x.rp;
 		break;
 	case PROC_REAP_KILL:
 		error = copyin(uap->data, &x.rk, sizeof(x.rk));
 		if (error != 0)
 			return (error);
 		data = &x.rk;
 		break;
 	case PROC_TRACE_STATUS:
 	case PROC_TRAPCAP_STATUS:
 		data = &flags;
 		break;
 	default:
 		return (EINVAL);
 	}
 	error = kern_procctl(td, uap->idtype, uap->id, uap->com, data);
 	switch (uap->com) {
 	case PROC_REAP_STATUS:
 		if (error == 0)
 			error = copyout(&x.rs, uap->data, sizeof(x.rs));
 		break;
 	case PROC_REAP_KILL:
 		error1 = copyout(&x.rk, uap->data, sizeof(x.rk));
 		if (error == 0)
 			error = error1;
 		break;
 	case PROC_TRACE_STATUS:
 	case PROC_TRAPCAP_STATUS:
 		if (error == 0)
 			error = copyout(&flags, uap->data, sizeof(flags));
 		break;
 	}
 	return (error);
 }
 
 static int
 kern_procctl_single(struct thread *td, struct proc *p, int com, void *data)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	switch (com) {
 	case PROC_SPROTECT:
 		return (protect_set(td, p, *(int *)data));
 	case PROC_REAP_ACQUIRE:
 		return (reap_acquire(td, p));
 	case PROC_REAP_RELEASE:
 		return (reap_release(td, p));
 	case PROC_REAP_STATUS:
 		return (reap_status(td, p, data));
 	case PROC_REAP_GETPIDS:
 		return (reap_getpids(td, p, data));
 	case PROC_REAP_KILL:
 		return (reap_kill(td, p, data));
 	case PROC_TRACE_CTL:
 		return (trace_ctl(td, p, *(int *)data));
 	case PROC_TRACE_STATUS:
 		return (trace_status(td, p, data));
 	case PROC_TRAPCAP_CTL:
 		return (trapcap_ctl(td, p, *(int *)data));
 	case PROC_TRAPCAP_STATUS:
 		return (trapcap_status(td, p, data));
 	default:
 		return (EINVAL);
 	}
 }
 
 int
 kern_procctl(struct thread *td, idtype_t idtype, id_t id, int com, void *data)
 {
 	struct pgrp *pg;
 	struct proc *p;
 	int error, first_error, ok;
 	bool tree_locked;
 
 	switch (com) {
 	case PROC_REAP_ACQUIRE:
 	case PROC_REAP_RELEASE:
 	case PROC_REAP_STATUS:
 	case PROC_REAP_GETPIDS:
 	case PROC_REAP_KILL:
 	case PROC_TRACE_STATUS:
 	case PROC_TRAPCAP_STATUS:
 		if (idtype != P_PID)
 			return (EINVAL);
 	}
 
 	switch (com) {
 	case PROC_SPROTECT:
 	case PROC_REAP_STATUS:
 	case PROC_REAP_GETPIDS:
 	case PROC_REAP_KILL:
 	case PROC_TRACE_CTL:
 	case PROC_TRAPCAP_CTL:
 		sx_slock(&proctree_lock);
 		tree_locked = true;
 		break;
 	case PROC_REAP_ACQUIRE:
 	case PROC_REAP_RELEASE:
 		sx_xlock(&proctree_lock);
 		tree_locked = true;
 		break;
 	case PROC_TRACE_STATUS:
 	case PROC_TRAPCAP_STATUS:
 		tree_locked = false;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	switch (idtype) {
 	case P_PID:
 		p = pfind(id);
 		if (p == NULL) {
 			error = ESRCH;
 			break;
 		}
 		error = p_cansee(td, p);
 		if (error == 0)
 			error = kern_procctl_single(td, p, com, data);
 		PROC_UNLOCK(p);
 		break;
 	case P_PGID:
 		/*
 		 * Attempt to apply the operation to all members of the
 		 * group.  Ignore processes in the group that can't be
 		 * seen.  Ignore errors so long as at least one process is
 		 * able to complete the request successfully.
 		 */
 		pg = pgfind(id);
 		if (pg == NULL) {
 			error = ESRCH;
 			break;
 		}
 		PGRP_UNLOCK(pg);
 		ok = 0;
 		first_error = 0;
 		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NEW || p_cansee(td, p) != 0) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			error = kern_procctl_single(td, p, com, data);
 			PROC_UNLOCK(p);
 			if (error == 0)
 				ok = 1;
 			else if (first_error == 0)
 				first_error = error;
 		}
 		if (ok)
 			error = 0;
 		else if (first_error != 0)
 			error = first_error;
 		else
 			/*
 			 * Was not able to see any processes in the
 			 * process group.
 			 */
 			error = ESRCH;
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 	if (tree_locked)
 		sx_unlock(&proctree_lock);
 	return (error);
 }
Index: user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c	(revision 306283)
@@ -1,3025 +1,3025 @@
 /*-
  * Copyright (c) 2008 Isilon Systems, Inc.
  * Copyright (c) 2008 Ilya Maykov <ivmaykov@gmail.com>
  * Copyright (c) 1998 Berkeley Software Design, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Berkeley Software Design Inc's name may not be used to endorse or
  *    promote products derived from this software without specific prior
  *    written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
  *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
  */
 
 /*
  * Implementation of the `witness' lock verifier.  Originally implemented for
  * mutexes in BSD/OS.  Extended to handle generic lock objects and lock
  * classes in FreeBSD.
  */
 
 /*
  *	Main Entry: witness
  *	Pronunciation: 'wit-n&s
  *	Function: noun
  *	Etymology: Middle English witnesse, from Old English witnes knowledge,
  *	    testimony, witness, from 2wit
  *	Date: before 12th century
  *	1 : attestation of a fact or event : TESTIMONY
  *	2 : one that gives evidence; specifically : one who testifies in
  *	    a cause or before a judicial tribunal
  *	3 : one asked to be present at a transaction so as to be able to
  *	    testify to its having taken place
  *	4 : one who has personal knowledge of something
  *	5 a : something serving as evidence or proof : SIGN
  *	  b : public affirmation by word or example of usually
  *	      religious faith or conviction <the heroic witness to divine
  *	      life -- Pilot>
  *	6 capitalized : a member of the Jehovah's Witnesses 
  */
 
 /*
  * Special rules concerning Giant and lock orders:
  *
  * 1) Giant must be acquired before any other mutexes.  Stated another way,
  *    no other mutex may be held when Giant is acquired.
  *
  * 2) Giant must be released when blocking on a sleepable lock.
  *
  * This rule is less obvious, but is a result of Giant providing the same
  * semantics as spl().  Basically, when a thread sleeps, it must release
  * Giant.  When a thread blocks on a sleepable lock, it sleeps.  Hence rule
  * 2).
  *
  * 3) Giant may be acquired before or after sleepable locks.
  *
  * This rule is also not quite as obvious.  Giant may be acquired after
  * a sleepable lock because it is a non-sleepable lock and non-sleepable
  * locks may always be acquired while holding a sleepable lock.  The second
  * case, Giant before a sleepable lock, follows from rule 2) above.  Suppose
  * you have two threads T1 and T2 and a sleepable lock X.  Suppose that T1
  * acquires X and blocks on Giant.  Then suppose that T2 acquires Giant and
  * blocks on X.  When T2 blocks on X, T2 will release Giant allowing T1 to
  * execute.  Thus, acquiring Giant both before and after a sleepable lock
  * will not result in a lock order reversal.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_stack.h"
 #include "opt_witness.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/stack.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <machine/stdarg.h>
 
 #if !defined(DDB) && !defined(STACK)
 #error "DDB or STACK options are required for WITNESS"
 #endif
 
 /* Note that these traces do not work with KTR_ALQ. */
 #if 0
 #define	KTR_WITNESS	KTR_SUBSYS
 #else
 #define	KTR_WITNESS	0
 #endif
 
 #define	LI_RECURSEMASK	0x0000ffff	/* Recursion depth of lock instance. */
 #define	LI_EXCLUSIVE	0x00010000	/* Exclusive lock instance. */
 #define	LI_NORELEASE	0x00020000	/* Lock not allowed to be released. */
 
 /* Define this to check for blessed mutexes */
 #undef BLESSING
 
 #ifndef WITNESS_COUNT
 #define	WITNESS_COUNT 		1536
 #endif
 #define	WITNESS_HASH_SIZE	251	/* Prime, gives load factor < 2 */
 #define	WITNESS_PENDLIST	(1024 + MAXCPU)
 
 /* Allocate 256 KB of stack data space */
 #define	WITNESS_LO_DATA_COUNT	2048
 
 /* Prime, gives load factor of ~2 at full load */
 #define	WITNESS_LO_HASH_SIZE	1021
 
 /*
  * XXX: This is somewhat bogus, as we assume here that at most 2048 threads
  * will hold LOCK_NCHILDREN locks.  We handle failure ok, and we should
  * probably be safe for the most part, but it's still a SWAG.
  */
 #define	LOCK_NCHILDREN	5
 #define	LOCK_CHILDCOUNT	2048
 
 #define	MAX_W_NAME	64
 
 #define	FULLGRAPH_SBUF_SIZE	512
 
 /*
  * These flags go in the witness relationship matrix and describe the
  * relationship between any two struct witness objects.
  */
 #define	WITNESS_UNRELATED        0x00    /* No lock order relation. */
 #define	WITNESS_PARENT           0x01    /* Parent, aka direct ancestor. */
 #define	WITNESS_ANCESTOR         0x02    /* Direct or indirect ancestor. */
 #define	WITNESS_CHILD            0x04    /* Child, aka direct descendant. */
 #define	WITNESS_DESCENDANT       0x08    /* Direct or indirect descendant. */
 #define	WITNESS_ANCESTOR_MASK    (WITNESS_PARENT | WITNESS_ANCESTOR)
 #define	WITNESS_DESCENDANT_MASK  (WITNESS_CHILD | WITNESS_DESCENDANT)
 #define	WITNESS_RELATED_MASK						\
 	(WITNESS_ANCESTOR_MASK | WITNESS_DESCENDANT_MASK)
 #define	WITNESS_REVERSAL         0x10    /* A lock order reversal has been
 					  * observed. */
 #define	WITNESS_RESERVED1        0x20    /* Unused flag, reserved. */
 #define	WITNESS_RESERVED2        0x40    /* Unused flag, reserved. */
 #define	WITNESS_LOCK_ORDER_KNOWN 0x80    /* This lock order is known. */
 
 /* Descendant to ancestor flags */
 #define	WITNESS_DTOA(x)	(((x) & WITNESS_RELATED_MASK) >> 2)
 
 /* Ancestor to descendant flags */
 #define	WITNESS_ATOD(x)	(((x) & WITNESS_RELATED_MASK) << 2)
 
 #define	WITNESS_INDEX_ASSERT(i)						\
 	MPASS((i) > 0 && (i) <= w_max_used_index && (i) < witness_count)
 
 static MALLOC_DEFINE(M_WITNESS, "Witness", "Witness");
 
 /*
  * Lock instances.  A lock instance is the data associated with a lock while
  * it is held by witness.  For example, a lock instance will hold the
  * recursion count of a lock.  Lock instances are held in lists.  Spin locks
  * are held in a per-cpu list while sleep locks are held in per-thread list.
  */
 struct lock_instance {
 	struct lock_object	*li_lock;
 	const char		*li_file;
 	int			li_line;
 	u_int			li_flags;
 };
 
 /*
  * A simple list type used to build the list of locks held by a thread
  * or CPU.  We can't simply embed the list in struct lock_object since a
  * lock may be held by more than one thread if it is a shared lock.  Locks
  * are added to the head of the list, so we fill up each list entry from
  * "the back" logically.  To ease some of the arithmetic, we actually fill
  * in each list entry the normal way (children[0] then children[1], etc.) but
  * when we traverse the list we read children[count-1] as the first entry
  * down to children[0] as the final entry.
  */
 struct lock_list_entry {
 	struct lock_list_entry	*ll_next;
 	struct lock_instance	ll_children[LOCK_NCHILDREN];
 	u_int			ll_count;
 };
 
 /*
  * The main witness structure. One of these per named lock type in the system
  * (for example, "vnode interlock").
  */
 struct witness {
 	char  			w_name[MAX_W_NAME];
 	uint32_t 		w_index;  /* Index in the relationship matrix */
 	struct lock_class	*w_class;
 	STAILQ_ENTRY(witness) 	w_list;		/* List of all witnesses. */
 	STAILQ_ENTRY(witness) 	w_typelist;	/* Witnesses of a type. */
 	struct witness		*w_hash_next; /* Linked list in hash buckets. */
 	const char		*w_file; /* File where last acquired */
 	uint32_t 		w_line; /* Line where last acquired */
 	uint32_t 		w_refcount;
 	uint16_t 		w_num_ancestors; /* direct/indirect
 						  * ancestor count */
 	uint16_t 		w_num_descendants; /* direct/indirect
 						    * descendant count */
 	int16_t 		w_ddb_level;
 	unsigned		w_displayed:1;
 	unsigned		w_reversed:1;
 };
 
 STAILQ_HEAD(witness_list, witness);
 
 /*
  * The witness hash table. Keys are witness names (const char *), elements are
  * witness objects (struct witness *).
  */
 struct witness_hash {
 	struct witness	*wh_array[WITNESS_HASH_SIZE];
 	uint32_t	wh_size;
 	uint32_t	wh_count;
 };
 
 /*
  * Key type for the lock order data hash table.
  */
 struct witness_lock_order_key {
 	uint16_t	from;
 	uint16_t	to;
 };
 
 struct witness_lock_order_data {
 	struct stack			wlod_stack;
 	struct witness_lock_order_key	wlod_key;
 	struct witness_lock_order_data	*wlod_next;
 };
 
 /*
  * The witness lock order data hash table. Keys are witness index tuples
  * (struct witness_lock_order_key), elements are lock order data objects
  * (struct witness_lock_order_data). 
  */
 struct witness_lock_order_hash {
 	struct witness_lock_order_data	*wloh_array[WITNESS_LO_HASH_SIZE];
 	u_int	wloh_size;
 	u_int	wloh_count;
 };
 
 #ifdef BLESSING
 struct witness_blessed {
 	const char	*b_lock1;
 	const char	*b_lock2;
 };
 #endif
 
 struct witness_pendhelp {
 	const char		*wh_type;
 	struct lock_object	*wh_lock;
 };
 
 struct witness_order_list_entry {
 	const char		*w_name;
 	struct lock_class	*w_class;
 };
 
 /*
  * Returns 0 if one of the locks is a spin lock and the other is not.
  * Returns 1 otherwise.
  */
 static __inline int
 witness_lock_type_equal(struct witness *w1, struct witness *w2)
 {
 
 	return ((w1->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)) ==
 		(w2->w_class->lc_flags & (LC_SLEEPLOCK | LC_SPINLOCK)));
 }
 
 static __inline int
 witness_lock_order_key_equal(const struct witness_lock_order_key *a,
     const struct witness_lock_order_key *b)
 {
 
 	return (a->from == b->from && a->to == b->to);
 }
 
 static int	_isitmyx(struct witness *w1, struct witness *w2, int rmask,
 		    const char *fname);
 static void	adopt(struct witness *parent, struct witness *child);
 #ifdef BLESSING
 static int	blessed(struct witness *, struct witness *);
 #endif
 static void	depart(struct witness *w);
 static struct witness	*enroll(const char *description,
 			    struct lock_class *lock_class);
 static struct lock_instance	*find_instance(struct lock_list_entry *list,
 				    const struct lock_object *lock);
 static int	isitmychild(struct witness *parent, struct witness *child);
 static int	isitmydescendant(struct witness *parent, struct witness *child);
 static void	itismychild(struct witness *parent, struct witness *child);
 static int	sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS);
 static int	sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS);
 static void	witness_add_fullgraph(struct sbuf *sb, struct witness *parent);
 #ifdef DDB
 static void	witness_ddb_compute_levels(void);
 static void	witness_ddb_display(int(*)(const char *fmt, ...));
 static void	witness_ddb_display_descendants(int(*)(const char *fmt, ...),
 		    struct witness *, int indent);
 static void	witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
 		    struct witness_list *list);
 static void	witness_ddb_level_descendants(struct witness *parent, int l);
 static void	witness_ddb_list(struct thread *td);
 #endif
 static void	witness_debugger(int cond, const char *msg);
 static void	witness_free(struct witness *m);
 static struct witness	*witness_get(void);
 static uint32_t	witness_hash_djb2(const uint8_t *key, uint32_t size);
 static struct witness	*witness_hash_get(const char *key);
 static void	witness_hash_put(struct witness *w);
 static void	witness_init_hash_tables(void);
 static void	witness_increment_graph_generation(void);
 static void	witness_lock_list_free(struct lock_list_entry *lle);
 static struct lock_list_entry	*witness_lock_list_get(void);
 static int	witness_lock_order_add(struct witness *parent,
 		    struct witness *child);
 static int	witness_lock_order_check(struct witness *parent,
 		    struct witness *child);
 static struct witness_lock_order_data	*witness_lock_order_get(
 					    struct witness *parent,
 					    struct witness *child);
 static void	witness_list_lock(struct lock_instance *instance,
 		    int (*prnt)(const char *fmt, ...));
 static int	witness_output(const char *fmt, ...) __printflike(1, 2);
 static int	witness_voutput(const char *fmt, va_list ap) __printflike(1, 0);
 static void	witness_setflag(struct lock_object *lock, int flag, int set);
 
 static SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, NULL,
     "Witness Locking");
 
 /*
  * If set to 0, lock order checking is disabled.  If set to -1,
  * witness is completely disabled.  Otherwise witness performs full
  * lock order checking for all locks.  At runtime, lock order checking
  * may be toggled.  However, witness cannot be reenabled once it is
  * completely disabled.
  */
 static int witness_watch = 1;
 SYSCTL_PROC(_debug_witness, OID_AUTO, watch, CTLFLAG_RWTUN | CTLTYPE_INT, NULL, 0,
     sysctl_debug_witness_watch, "I", "witness is watching lock operations");
 
 #ifdef KDB
 /*
  * When KDB is enabled and witness_kdb is 1, it will cause the system
  * to drop into kdebug() when:
  *	- a lock hierarchy violation occurs
  *	- locks are held when going to sleep.
  */
 #ifdef WITNESS_KDB
 int	witness_kdb = 1;
 #else
 int	witness_kdb = 0;
 #endif
 SYSCTL_INT(_debug_witness, OID_AUTO, kdb, CTLFLAG_RWTUN, &witness_kdb, 0, "");
 #endif /* KDB */
 
 #if defined(DDB) || defined(KDB)
 /*
  * When DDB or KDB is enabled and witness_trace is 1, it will cause the system
  * to print a stack trace:
  *	- a lock hierarchy violation occurs
  *	- locks are held when going to sleep.
  */
 int	witness_trace = 1;
 SYSCTL_INT(_debug_witness, OID_AUTO, trace, CTLFLAG_RWTUN, &witness_trace, 0, "");
 #endif /* DDB || KDB */
 
 #ifdef WITNESS_SKIPSPIN
 int	witness_skipspin = 1;
 #else
 int	witness_skipspin = 0;
 #endif
 SYSCTL_INT(_debug_witness, OID_AUTO, skipspin, CTLFLAG_RDTUN, &witness_skipspin, 0, "");
 
 int badstack_sbuf_size;
 
 int witness_count = WITNESS_COUNT;
 SYSCTL_INT(_debug_witness, OID_AUTO, witness_count, CTLFLAG_RDTUN, 
     &witness_count, 0, "");
 
 /*
  * Output channel for witness messages.  By default we print to the console.
  */
 enum witness_channel {
 	WITNESS_CONSOLE,
 	WITNESS_LOG,
 	WITNESS_NONE,
 };
 
 static enum witness_channel witness_channel = WITNESS_CONSOLE;
 SYSCTL_PROC(_debug_witness, OID_AUTO, output_channel, CTLTYPE_STRING |
     CTLFLAG_RWTUN, NULL, 0, sysctl_debug_witness_channel, "A",
     "Output channel for warnings");
 
 /*
  * Call this to print out the relations between locks.
  */
 SYSCTL_PROC(_debug_witness, OID_AUTO, fullgraph, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_debug_witness_fullgraph, "A", "Show locks relation graphs");
 
 /*
  * Call this to print out the witness faulty stacks.
  */
 SYSCTL_PROC(_debug_witness, OID_AUTO, badstacks, CTLTYPE_STRING | CTLFLAG_RD,
     NULL, 0, sysctl_debug_witness_badstacks, "A", "Show bad witness stacks");
 
 static struct mtx w_mtx;
 
 /* w_list */
 static struct witness_list w_free = STAILQ_HEAD_INITIALIZER(w_free);
 static struct witness_list w_all = STAILQ_HEAD_INITIALIZER(w_all);
 
 /* w_typelist */
 static struct witness_list w_spin = STAILQ_HEAD_INITIALIZER(w_spin);
 static struct witness_list w_sleep = STAILQ_HEAD_INITIALIZER(w_sleep);
 
 /* lock list */
 static struct lock_list_entry *w_lock_list_free = NULL;
 static struct witness_pendhelp pending_locks[WITNESS_PENDLIST];
 static u_int pending_cnt;
 
 static int w_free_cnt, w_spin_cnt, w_sleep_cnt;
 SYSCTL_INT(_debug_witness, OID_AUTO, free_cnt, CTLFLAG_RD, &w_free_cnt, 0, "");
 SYSCTL_INT(_debug_witness, OID_AUTO, spin_cnt, CTLFLAG_RD, &w_spin_cnt, 0, "");
 SYSCTL_INT(_debug_witness, OID_AUTO, sleep_cnt, CTLFLAG_RD, &w_sleep_cnt, 0,
     "");
 
 static struct witness *w_data;
 static uint8_t **w_rmatrix;
 static struct lock_list_entry w_locklistdata[LOCK_CHILDCOUNT];
 static struct witness_hash w_hash;	/* The witness hash table. */
 
 /* The lock order data hash */
 static struct witness_lock_order_data w_lodata[WITNESS_LO_DATA_COUNT];
 static struct witness_lock_order_data *w_lofree = NULL;
 static struct witness_lock_order_hash w_lohash;
 static int w_max_used_index = 0;
 static unsigned int w_generation = 0;
 static const char w_notrunning[] = "Witness not running\n";
 static const char w_stillcold[] = "Witness is still cold\n";
 
 
 static struct witness_order_list_entry order_lists[] = {
 	/*
 	 * sx locks
 	 */
 	{ "proctree", &lock_class_sx },
 	{ "allproc", &lock_class_sx },
 	{ "allprison", &lock_class_sx },
 	{ NULL, NULL },
 	/*
 	 * Various mutexes
 	 */
 	{ "Giant", &lock_class_mtx_sleep },
 	{ "pipe mutex", &lock_class_mtx_sleep },
 	{ "sigio lock", &lock_class_mtx_sleep },
 	{ "process group", &lock_class_mtx_sleep },
 	{ "process lock", &lock_class_mtx_sleep },
 	{ "session", &lock_class_mtx_sleep },
 	{ "uidinfo hash", &lock_class_rw },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-sleep", &lock_class_mtx_sleep },
 #endif
 	{ "time lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * umtx
 	 */
 	{ "umtx lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * Sockets
 	 */
 	{ "accept", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ "so_rcv", &lock_class_mtx_sleep },
 	{ "sellck", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * Routing
 	 */
 	{ "so_rcv", &lock_class_mtx_sleep },
 	{ "radix node head", &lock_class_rw },
 	{ "rtentry", &lock_class_mtx_sleep },
 	{ "ifaddr", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * IPv4 multicast:
 	 * protocol locks before interface locks, after UDP locks.
 	 */
 	{ "udpinp", &lock_class_rw },
 	{ "in_multi_mtx", &lock_class_mtx_sleep },
 	{ "igmp_mtx", &lock_class_mtx_sleep },
 	{ "if_addr_lock", &lock_class_rw },
 	{ NULL, NULL },
 	/*
 	 * IPv6 multicast:
 	 * protocol locks before interface locks, after UDP locks.
 	 */
 	{ "udpinp", &lock_class_rw },
 	{ "in6_multi_mtx", &lock_class_mtx_sleep },
 	{ "mld_mtx", &lock_class_mtx_sleep },
 	{ "if_addr_lock", &lock_class_rw },
 	{ NULL, NULL },
 	/*
 	 * UNIX Domain Sockets
 	 */
 	{ "unp_link_rwlock", &lock_class_rw },
 	{ "unp_list_lock", &lock_class_mtx_sleep },
 	{ "unp", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * UDP/IP
 	 */
 	{ "udp", &lock_class_rw },
 	{ "udpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * TCP/IP
 	 */
 	{ "tcp", &lock_class_rw },
 	{ "tcpinp", &lock_class_rw },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * BPF
 	 */
 	{ "bpf global lock", &lock_class_mtx_sleep },
 	{ "bpf interface lock", &lock_class_rw },
 	{ "bpf cdev lock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * NFS server
 	 */
 	{ "nfsd_mtx", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 
 	/*
 	 * IEEE 802.11
 	 */
 	{ "802.11 com lock", &lock_class_mtx_sleep},
 	{ NULL, NULL },
 	/*
 	 * Network drivers
 	 */
 	{ "network driver", &lock_class_mtx_sleep},
 	{ NULL, NULL },
 
 	/*
 	 * Netgraph
 	 */
 	{ "ng_node", &lock_class_mtx_sleep },
 	{ "ng_worklist", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * CDEV
 	 */
 	{ "vm map (system)", &lock_class_mtx_sleep },
 	{ "vm page queue", &lock_class_mtx_sleep },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ "cdev", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * VM
 	 */
 	{ "vm map (user)", &lock_class_sx },
 	{ "vm object", &lock_class_rw },
 	{ "vm page", &lock_class_mtx_sleep },
 	{ "vm page queue", &lock_class_mtx_sleep },
 	{ "pmap pv global", &lock_class_rw },
 	{ "pmap", &lock_class_mtx_sleep },
 	{ "pmap pv list", &lock_class_rw },
 	{ "vm page free queue", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * kqueue/VFS interaction
 	 */
 	{ "kqueue", &lock_class_mtx_sleep },
 	{ "struct mount mtx", &lock_class_mtx_sleep },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * VFS namecache
 	 */
-	{ "ncglobal", &lock_class_rw },
+	{ "ncvn", &lock_class_mtx_sleep },
 	{ "ncbuc", &lock_class_rw },
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ "ncneg", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
 	 * ZFS locking
 	 */
 	{ "dn->dn_mtx", &lock_class_sx },
 	{ "dr->dt.di.dr_mtx", &lock_class_sx },
 	{ "db->db_mtx", &lock_class_sx },
 	{ NULL, NULL },
 	/*
 	 * spin locks
 	 */
 #ifdef SMP
 	{ "ap boot", &lock_class_mtx_spin },
 #endif
 	{ "rm.mutex_mtx", &lock_class_mtx_spin },
 	{ "sio", &lock_class_mtx_spin },
 #ifdef __i386__
 	{ "cy", &lock_class_mtx_spin },
 #endif
 #ifdef __sparc64__
 	{ "pcib_mtx", &lock_class_mtx_spin },
 	{ "rtc_mtx", &lock_class_mtx_spin },
 #endif
 	{ "scc_hwmtx", &lock_class_mtx_spin },
 	{ "uart_hwmtx", &lock_class_mtx_spin },
 	{ "fast_taskqueue", &lock_class_mtx_spin },
 	{ "intr table", &lock_class_mtx_spin },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-per-proc", &lock_class_mtx_spin },
 #endif
 	{ "process slock", &lock_class_mtx_spin },
 	{ "syscons video lock", &lock_class_mtx_spin },
 	{ "sleepq chain", &lock_class_mtx_spin },
 	{ "rm_spinlock", &lock_class_mtx_spin },
 	{ "turnstile chain", &lock_class_mtx_spin },
 	{ "turnstile lock", &lock_class_mtx_spin },
 	{ "sched lock", &lock_class_mtx_spin },
 	{ "td_contested", &lock_class_mtx_spin },
 	{ "callout", &lock_class_mtx_spin },
 	{ "entropy harvest mutex", &lock_class_mtx_spin },
 #ifdef SMP
 	{ "smp rendezvous", &lock_class_mtx_spin },
 #endif
 #ifdef __powerpc__
 	{ "tlb0", &lock_class_mtx_spin },
 #endif
 	/*
 	 * leaf locks
 	 */
 	{ "intrcnt", &lock_class_mtx_spin },
 	{ "icu", &lock_class_mtx_spin },
 #if defined(SMP) && defined(__sparc64__)
 	{ "ipi", &lock_class_mtx_spin },
 #endif
 #ifdef __i386__
 	{ "allpmaps", &lock_class_mtx_spin },
 	{ "descriptor tables", &lock_class_mtx_spin },
 #endif
 	{ "clk", &lock_class_mtx_spin },
 	{ "cpuset", &lock_class_mtx_spin },
 	{ "mprof lock", &lock_class_mtx_spin },
 	{ "zombie lock", &lock_class_mtx_spin },
 	{ "ALD Queue", &lock_class_mtx_spin },
 #if defined(__i386__) || defined(__amd64__)
 	{ "pcicfg", &lock_class_mtx_spin },
 	{ "NDIS thread lock", &lock_class_mtx_spin },
 #endif
 	{ "tw_osl_io_lock", &lock_class_mtx_spin },
 	{ "tw_osl_q_lock", &lock_class_mtx_spin },
 	{ "tw_cl_io_lock", &lock_class_mtx_spin },
 	{ "tw_cl_intr_lock", &lock_class_mtx_spin },
 	{ "tw_cl_gen_lock", &lock_class_mtx_spin },
 #ifdef	HWPMC_HOOKS
 	{ "pmc-leaf", &lock_class_mtx_spin },
 #endif
 	{ "blocked lock", &lock_class_mtx_spin },
 	{ NULL, NULL },
 	{ NULL, NULL }
 };
 
 #ifdef BLESSING
 /*
  * Pairs of locks which have been blessed
  * Don't complain about order problems with blessed locks
  */
 static struct witness_blessed blessed_list[] = {
 };
 #endif
 
 /*
  * This global is set to 0 once it becomes safe to use the witness code.
  */
 static int witness_cold = 1;
 
 /*
  * This global is set to 1 once the static lock orders have been enrolled
  * so that a warning can be issued for any spin locks enrolled later.
  */
 static int witness_spin_warn = 0;
 
 /* Trim useless garbage from filenames. */
 static const char *
 fixup_filename(const char *file)
 {
 
 	if (file == NULL)
 		return (NULL);
 	while (strncmp(file, "../", 3) == 0)
 		file += 3;
 	return (file);
 }
 
 /*
  * The WITNESS-enabled diagnostic code.  Note that the witness code does
  * assume that the early boot is single-threaded at least until after this
  * routine is completed.
  */
 static void
 witness_initialize(void *dummy __unused)
 {
 	struct lock_object *lock;
 	struct witness_order_list_entry *order;
 	struct witness *w, *w1;
 	int i;
 
 	w_data = malloc(sizeof (struct witness) * witness_count, M_WITNESS,
 	    M_WAITOK | M_ZERO);
 
 	w_rmatrix = malloc(sizeof(*w_rmatrix) * (witness_count + 1),
 	    M_WITNESS, M_WAITOK | M_ZERO);
 
 	for (i = 0; i < witness_count + 1; i++) {
 		w_rmatrix[i] = malloc(sizeof(*w_rmatrix[i]) *
 		    (witness_count + 1), M_WITNESS, M_WAITOK | M_ZERO);
 	}
 	badstack_sbuf_size = witness_count * 256;
 
 	/*
 	 * We have to release Giant before initializing its witness
 	 * structure so that WITNESS doesn't get confused.
 	 */
 	mtx_unlock(&Giant);
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	CTR1(KTR_WITNESS, "%s: initializing witness", __func__);
 	mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET |
 	    MTX_NOWITNESS | MTX_NOPROFILE);
 	for (i = witness_count - 1; i >= 0; i--) {
 		w = &w_data[i];
 		memset(w, 0, sizeof(*w));
 		w_data[i].w_index = i;	/* Witness index never changes. */
 		witness_free(w);
 	}
 	KASSERT(STAILQ_FIRST(&w_free)->w_index == 0,
 	    ("%s: Invalid list of free witness objects", __func__));
 
 	/* Witness with index 0 is not used to aid in debugging. */
 	STAILQ_REMOVE_HEAD(&w_free, w_list);
 	w_free_cnt--;
 
 	for (i = 0; i < witness_count; i++) {
 		memset(w_rmatrix[i], 0, sizeof(*w_rmatrix[i]) * 
 		    (witness_count + 1));
 	}
 
 	for (i = 0; i < LOCK_CHILDCOUNT; i++)
 		witness_lock_list_free(&w_locklistdata[i]);
 	witness_init_hash_tables();
 
 	/* First add in all the specified order lists. */
 	for (order = order_lists; order->w_name != NULL; order++) {
 		w = enroll(order->w_name, order->w_class);
 		if (w == NULL)
 			continue;
 		w->w_file = "order list";
 		for (order++; order->w_name != NULL; order++) {
 			w1 = enroll(order->w_name, order->w_class);
 			if (w1 == NULL)
 				continue;
 			w1->w_file = "order list";
 			itismychild(w, w1);
 			w = w1;
 		}
 	}
 	witness_spin_warn = 1;
 
 	/* Iterate through all locks and add them to witness. */
 	for (i = 0; pending_locks[i].wh_lock != NULL; i++) {
 		lock = pending_locks[i].wh_lock;
 		KASSERT(lock->lo_flags & LO_WITNESS,
 		    ("%s: lock %s is on pending list but not LO_WITNESS",
 		    __func__, lock->lo_name));
 		lock->lo_witness = enroll(pending_locks[i].wh_type,
 		    LOCK_CLASS(lock));
 	}
 
 	/* Mark the witness code as being ready for use. */
 	witness_cold = 0;
 
 	mtx_lock(&Giant);
 }
 SYSINIT(witness_init, SI_SUB_WITNESS, SI_ORDER_FIRST, witness_initialize,
     NULL);
 
 void
 witness_init(struct lock_object *lock, const char *type)
 {
 	struct lock_class *class;
 
 	/* Various sanity checks. */
 	class = LOCK_CLASS(lock);
 	if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
 	    (class->lc_flags & LC_RECURSABLE) == 0)
 		kassert_panic("%s: lock (%s) %s can not be recursable",
 		    __func__, class->lc_name, lock->lo_name);
 	if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 	    (class->lc_flags & LC_SLEEPABLE) == 0)
 		kassert_panic("%s: lock (%s) %s can not be sleepable",
 		    __func__, class->lc_name, lock->lo_name);
 	if ((lock->lo_flags & LO_UPGRADABLE) != 0 &&
 	    (class->lc_flags & LC_UPGRADABLE) == 0)
 		kassert_panic("%s: lock (%s) %s can not be upgradable",
 		    __func__, class->lc_name, lock->lo_name);
 
 	/*
 	 * If we shouldn't watch this lock, then just clear lo_witness.
 	 * Otherwise, if witness_cold is set, then it is too early to
 	 * enroll this lock, so defer it to witness_initialize() by adding
 	 * it to the pending_locks list.  If it is not too early, then enroll
 	 * the lock now.
 	 */
 	if (witness_watch < 1 || panicstr != NULL ||
 	    (lock->lo_flags & LO_WITNESS) == 0)
 		lock->lo_witness = NULL;
 	else if (witness_cold) {
 		pending_locks[pending_cnt].wh_lock = lock;
 		pending_locks[pending_cnt++].wh_type = type;
 		if (pending_cnt > WITNESS_PENDLIST)
 			panic("%s: pending locks list is too small, "
 			    "increase WITNESS_PENDLIST\n",
 			    __func__);
 	} else
 		lock->lo_witness = enroll(type, class);
 }
 
 void
 witness_destroy(struct lock_object *lock)
 {
 	struct lock_class *class;
 	struct witness *w;
 
 	class = LOCK_CLASS(lock);
 
 	if (witness_cold)
 		panic("lock (%s) %s destroyed while witness_cold",
 		    class->lc_name, lock->lo_name);
 
 	/* XXX: need to verify that no one holds the lock */
 	if ((lock->lo_flags & LO_WITNESS) == 0 || lock->lo_witness == NULL)
 		return;
 	w = lock->lo_witness;
 
 	mtx_lock_spin(&w_mtx);
 	MPASS(w->w_refcount > 0);
 	w->w_refcount--;
 
 	if (w->w_refcount == 0)
 		depart(w);
 	mtx_unlock_spin(&w_mtx);
 }
 
 #ifdef DDB
 static void
 witness_ddb_compute_levels(void)
 {
 	struct witness *w;
 
 	/*
 	 * First clear all levels.
 	 */
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_ddb_level = -1;
 
 	/*
 	 * Look for locks with no parents and level all their descendants.
 	 */
 	STAILQ_FOREACH(w, &w_all, w_list) {
 
 		/* If the witness has ancestors (is not a root), skip it. */
 		if (w->w_num_ancestors > 0)
 			continue;
 		witness_ddb_level_descendants(w, 0);
 	}
 }
 
 static void
 witness_ddb_level_descendants(struct witness *w, int l)
 {
 	int i;
 
 	if (w->w_ddb_level >= l)
 		return;
 
 	w->w_ddb_level = l;
 	l++;
 
 	for (i = 1; i <= w_max_used_index; i++) {
 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
 			witness_ddb_level_descendants(&w_data[i], l);
 	}
 }
 
 static void
 witness_ddb_display_descendants(int(*prnt)(const char *fmt, ...),
     struct witness *w, int indent)
 {
 	int i;
 
  	for (i = 0; i < indent; i++)
  		prnt(" ");
 	prnt("%s (type: %s, depth: %d, active refs: %d)",
 	     w->w_name, w->w_class->lc_name,
 	     w->w_ddb_level, w->w_refcount);
  	if (w->w_displayed) {
  		prnt(" -- (already displayed)\n");
  		return;
  	}
  	w->w_displayed = 1;
 	if (w->w_file != NULL && w->w_line != 0)
 		prnt(" -- last acquired @ %s:%d\n", fixup_filename(w->w_file),
 		    w->w_line);
 	else
 		prnt(" -- never acquired\n");
 	indent++;
 	WITNESS_INDEX_ASSERT(w->w_index);
 	for (i = 1; i <= w_max_used_index; i++) {
 		if (db_pager_quit)
 			return;
 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT)
 			witness_ddb_display_descendants(prnt, &w_data[i],
 			    indent);
 	}
 }
 
 static void
 witness_ddb_display_list(int(*prnt)(const char *fmt, ...),
     struct witness_list *list)
 {
 	struct witness *w;
 
 	STAILQ_FOREACH(w, list, w_typelist) {
 		if (w->w_file == NULL || w->w_ddb_level > 0)
 			continue;
 
 		/* This lock has no anscestors - display its descendants. */
 		witness_ddb_display_descendants(prnt, w, 0);
 		if (db_pager_quit)
 			return;
 	}
 }
 	
 static void
 witness_ddb_display(int(*prnt)(const char *fmt, ...))
 {
 	struct witness *w;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	witness_ddb_compute_levels();
 
 	/* Clear all the displayed flags. */
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_displayed = 0;
 
 	/*
 	 * First, handle sleep locks which have been acquired at least
 	 * once.
 	 */
 	prnt("Sleep locks:\n");
 	witness_ddb_display_list(prnt, &w_sleep);
 	if (db_pager_quit)
 		return;
 	
 	/*
 	 * Now do spin locks which have been acquired at least once.
 	 */
 	prnt("\nSpin locks:\n");
 	witness_ddb_display_list(prnt, &w_spin);
 	if (db_pager_quit)
 		return;
 	
 	/*
 	 * Finally, any locks which have not been acquired yet.
 	 */
 	prnt("\nLocks which were never acquired:\n");
 	STAILQ_FOREACH(w, &w_all, w_list) {
 		if (w->w_file != NULL || w->w_refcount == 0)
 			continue;
 		prnt("%s (type: %s, depth: %d)\n", w->w_name,
 		    w->w_class->lc_name, w->w_ddb_level);
 		if (db_pager_quit)
 			return;
 	}
 }
 #endif /* DDB */
 
 int
 witness_defineorder(struct lock_object *lock1, struct lock_object *lock2)
 {
 
 	if (witness_watch == -1 || panicstr != NULL)
 		return (0);
 
 	/* Require locks that witness knows about. */
 	if (lock1 == NULL || lock1->lo_witness == NULL || lock2 == NULL ||
 	    lock2->lo_witness == NULL)
 		return (EINVAL);
 
 	mtx_assert(&w_mtx, MA_NOTOWNED);
 	mtx_lock_spin(&w_mtx);
 
 	/*
 	 * If we already have either an explicit or implied lock order that
 	 * is the other way around, then return an error.
 	 */
 	if (witness_watch &&
 	    isitmydescendant(lock2->lo_witness, lock1->lo_witness)) {
 		mtx_unlock_spin(&w_mtx);
 		return (EDOOFUS);
 	}
 	
 	/* Try to add the new order. */
 	CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
 	    lock2->lo_witness->w_name, lock1->lo_witness->w_name);
 	itismychild(lock1->lo_witness, lock2->lo_witness);
 	mtx_unlock_spin(&w_mtx);
 	return (0);
 }
 
 void
 witness_checkorder(struct lock_object *lock, int flags, const char *file,
     int line, struct lock_object *interlock)
 {
 	struct lock_list_entry *lock_list, *lle;
 	struct lock_instance *lock1, *lock2, *plock;
 	struct lock_class *class, *iclass;
 	struct witness *w, *w1;
 	struct thread *td;
 	int i, j;
 
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL ||
 	    panicstr != NULL)
 		return;
 
 	w = lock->lo_witness;
 	class = LOCK_CLASS(lock);
 	td = curthread;
 
 	if (class->lc_flags & LC_SLEEPLOCK) {
 
 		/*
 		 * Since spin locks include a critical section, this check
 		 * implicitly enforces a lock order of all sleep locks before
 		 * all spin locks.
 		 */
 		if (td->td_critnest != 0 && !kdb_active)
 			kassert_panic("acquiring blockable sleep lock with "
 			    "spinlock or critical section held (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 
 		/*
 		 * If this is the first lock acquired then just return as
 		 * no order checking is needed.
 		 */
 		lock_list = td->td_sleeplocks;
 		if (lock_list == NULL || lock_list->ll_count == 0)
 			return;
 	} else {
 
 		/*
 		 * If this is the first lock, just return as no order
 		 * checking is needed.  Avoid problems with thread
 		 * migration pinning the thread while checking if
 		 * spinlocks are held.  If at least one spinlock is held
 		 * the thread is in a safe path and it is allowed to
 		 * unpin it.
 		 */
 		sched_pin();
 		lock_list = PCPU_GET(spinlocks);
 		if (lock_list == NULL || lock_list->ll_count == 0) {
 			sched_unpin();
 			return;
 		}
 		sched_unpin();
 	}
 
 	/*
 	 * Check to see if we are recursing on a lock we already own.  If
 	 * so, make sure that we don't mismatch exclusive and shared lock
 	 * acquires.
 	 */
 	lock1 = find_instance(lock_list, lock);
 	if (lock1 != NULL) {
 		if ((lock1->li_flags & LI_EXCLUSIVE) != 0 &&
 		    (flags & LOP_EXCLUSIVE) == 0) {
 			witness_output("shared lock of (%s) %s @ %s:%d\n",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 			witness_output("while exclusively locked from %s:%d\n",
 			    fixup_filename(lock1->li_file), lock1->li_line);
 			kassert_panic("excl->share");
 		}
 		if ((lock1->li_flags & LI_EXCLUSIVE) == 0 &&
 		    (flags & LOP_EXCLUSIVE) != 0) {
 			witness_output("exclusive lock of (%s) %s @ %s:%d\n",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 			witness_output("while share locked from %s:%d\n",
 			    fixup_filename(lock1->li_file), lock1->li_line);
 			kassert_panic("share->excl");
 		}
 		return;
 	}
 
 	/* Warn if the interlock is not locked exactly once. */
 	if (interlock != NULL) {
 		iclass = LOCK_CLASS(interlock);
 		lock1 = find_instance(lock_list, interlock);
 		if (lock1 == NULL)
 			kassert_panic("interlock (%s) %s not locked @ %s:%d",
 			    iclass->lc_name, interlock->lo_name,
 			    fixup_filename(file), line);
 		else if ((lock1->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic("interlock (%s) %s recursed @ %s:%d",
 			    iclass->lc_name, interlock->lo_name,
 			    fixup_filename(file), line);
 	}
 
 	/*
 	 * Find the previously acquired lock, but ignore interlocks.
 	 */
 	plock = &lock_list->ll_children[lock_list->ll_count - 1];
 	if (interlock != NULL && plock->li_lock == interlock) {
 		if (lock_list->ll_count > 1)
 			plock =
 			    &lock_list->ll_children[lock_list->ll_count - 2];
 		else {
 			lle = lock_list->ll_next;
 
 			/*
 			 * The interlock is the only lock we hold, so
 			 * simply return.
 			 */
 			if (lle == NULL)
 				return;
 			plock = &lle->ll_children[lle->ll_count - 1];
 		}
 	}
 	
 	/*
 	 * Try to perform most checks without a lock.  If this succeeds we
 	 * can skip acquiring the lock and return success.  Otherwise we redo
 	 * the check with the lock held to handle races with concurrent updates.
 	 */
 	w1 = plock->li_lock->lo_witness;
 	if (witness_lock_order_check(w1, w))
 		return;
 
 	mtx_lock_spin(&w_mtx);
 	if (witness_lock_order_check(w1, w)) {
 		mtx_unlock_spin(&w_mtx);
 		return;
 	}
 	witness_lock_order_add(w1, w);
 
 	/*
 	 * Check for duplicate locks of the same type.  Note that we only
 	 * have to check for this on the last lock we just acquired.  Any
 	 * other cases will be caught as lock order violations.
 	 */
 	if (w1 == w) {
 		i = w->w_index;
 		if (!(lock->lo_flags & LO_DUPOK) && !(flags & LOP_DUPOK) &&
 		    !(w_rmatrix[i][i] & WITNESS_REVERSAL)) {
 		    w_rmatrix[i][i] |= WITNESS_REVERSAL;
 			w->w_reversed = 1;
 			mtx_unlock_spin(&w_mtx);
 			witness_output(
 			    "acquiring duplicate lock of same type: \"%s\"\n", 
 			    w->w_name);
 			witness_output(" 1st %s @ %s:%d\n", plock->li_lock->lo_name,
 			    fixup_filename(plock->li_file), plock->li_line);
 			witness_output(" 2nd %s @ %s:%d\n", lock->lo_name,
 			    fixup_filename(file), line);
 			witness_debugger(1, __func__);
 		} else
 			mtx_unlock_spin(&w_mtx);
 		return;
 	}
 	mtx_assert(&w_mtx, MA_OWNED);
 
 	/*
 	 * If we know that the lock we are acquiring comes after
 	 * the lock we most recently acquired in the lock order tree,
 	 * then there is no need for any further checks.
 	 */
 	if (isitmychild(w1, w))
 		goto out;
 
 	for (j = 0, lle = lock_list; lle != NULL; lle = lle->ll_next) {
 		for (i = lle->ll_count - 1; i >= 0; i--, j++) {
 
 			MPASS(j < LOCK_CHILDCOUNT * LOCK_NCHILDREN);
 			lock1 = &lle->ll_children[i];
 
 			/*
 			 * Ignore the interlock.
 			 */
 			if (interlock == lock1->li_lock)
 				continue;
 
 			/*
 			 * If this lock doesn't undergo witness checking,
 			 * then skip it.
 			 */
 			w1 = lock1->li_lock->lo_witness;
 			if (w1 == NULL) {
 				KASSERT((lock1->li_lock->lo_flags & LO_WITNESS) == 0,
 				    ("lock missing witness structure"));
 				continue;
 			}
 
 			/*
 			 * If we are locking Giant and this is a sleepable
 			 * lock, then skip it.
 			 */
 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    lock == &Giant.lock_object)
 				continue;
 
 			/*
 			 * If we are locking a sleepable lock and this lock
 			 * is Giant, then skip it.
 			 */
 			if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    lock1->li_lock == &Giant.lock_object)
 				continue;
 
 			/*
 			 * If we are locking a sleepable lock and this lock
 			 * isn't sleepable, we want to treat it as a lock
 			 * order violation to enfore a general lock order of
 			 * sleepable locks before non-sleepable locks.
 			 */
 			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
 				goto reversal;
 
 			/*
 			 * If we are locking Giant and this is a non-sleepable
 			 * lock, then treat it as a reversal.
 			 */
 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
 			    lock == &Giant.lock_object)
 				goto reversal;
 
 			/*
 			 * Check the lock order hierarchy for a reveresal.
 			 */
 			if (!isitmydescendant(w, w1))
 				continue;
 		reversal:
 
 			/*
 			 * We have a lock order violation, check to see if it
 			 * is allowed or has already been yelled about.
 			 */
 #ifdef BLESSING
 
 			/*
 			 * If the lock order is blessed, just bail.  We don't
 			 * look for other lock order violations though, which
 			 * may be a bug.
 			 */
 			if (blessed(w, w1))
 				goto out;
 #endif
 
 			/* Bail if this violation is known */
 			if (w_rmatrix[w1->w_index][w->w_index] & WITNESS_REVERSAL)
 				goto out;
 
 			/* Record this as a violation */
 			w_rmatrix[w1->w_index][w->w_index] |= WITNESS_REVERSAL;
 			w_rmatrix[w->w_index][w1->w_index] |= WITNESS_REVERSAL;
 			w->w_reversed = w1->w_reversed = 1;
 			witness_increment_graph_generation();
 			mtx_unlock_spin(&w_mtx);
 
 #ifdef WITNESS_NO_VNODE
 			/*
 			 * There are known LORs between VNODE locks. They are
 			 * not an indication of a bug. VNODE locks are flagged
 			 * as such (LO_IS_VNODE) and we don't yell if the LOR
 			 * is between 2 VNODE locks.
 			 */
 			if ((lock->lo_flags & LO_IS_VNODE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_IS_VNODE) != 0)
 				return;
 #endif
 
 			/*
 			 * Ok, yell about it.
 			 */
 			if (((lock->lo_flags & LO_SLEEPABLE) != 0 &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0))
 				witness_output(
 		"lock order reversal: (sleepable after non-sleepable)\n");
 			else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
 			    && lock == &Giant.lock_object)
 				witness_output(
 		"lock order reversal: (Giant after non-sleepable)\n");
 			else
 				witness_output("lock order reversal:\n");
 
 			/*
 			 * Try to locate an earlier lock with
 			 * witness w in our list.
 			 */
 			do {
 				lock2 = &lle->ll_children[i];
 				MPASS(lock2->li_lock != NULL);
 				if (lock2->li_lock->lo_witness == w)
 					break;
 				if (i == 0 && lle->ll_next != NULL) {
 					lle = lle->ll_next;
 					i = lle->ll_count - 1;
 					MPASS(i >= 0 && i < LOCK_NCHILDREN);
 				} else
 					i--;
 			} while (i >= 0);
 			if (i < 0) {
 				witness_output(" 1st %p %s (%s) @ %s:%d\n",
 				    lock1->li_lock, lock1->li_lock->lo_name,
 				    w1->w_name, fixup_filename(lock1->li_file),
 				    lock1->li_line);
 				witness_output(" 2nd %p %s (%s) @ %s:%d\n", lock,
 				    lock->lo_name, w->w_name,
 				    fixup_filename(file), line);
 			} else {
 				witness_output(" 1st %p %s (%s) @ %s:%d\n",
 				    lock2->li_lock, lock2->li_lock->lo_name,
 				    lock2->li_lock->lo_witness->w_name,
 				    fixup_filename(lock2->li_file),
 				    lock2->li_line);
 				witness_output(" 2nd %p %s (%s) @ %s:%d\n",
 				    lock1->li_lock, lock1->li_lock->lo_name,
 				    w1->w_name, fixup_filename(lock1->li_file),
 				    lock1->li_line);
 				witness_output(" 3rd %p %s (%s) @ %s:%d\n", lock,
 				    lock->lo_name, w->w_name,
 				    fixup_filename(file), line);
 			}
 			witness_debugger(1, __func__);
 			return;
 		}
 	}
 
 	/*
 	 * If requested, build a new lock order.  However, don't build a new
 	 * relationship between a sleepable lock and Giant if it is in the
 	 * wrong direction.  The correct lock order is that sleepable locks
 	 * always come before Giant.
 	 */
 	if (flags & LOP_NEWORDER &&
 	    !(plock->li_lock == &Giant.lock_object &&
 	    (lock->lo_flags & LO_SLEEPABLE) != 0)) {
 		CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
 		    w->w_name, plock->li_lock->lo_witness->w_name);
 		itismychild(plock->li_lock->lo_witness, w);
 	}
 out:
 	mtx_unlock_spin(&w_mtx);
 }
 
 void
 witness_lock(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_list_entry **lock_list, *lle;
 	struct lock_instance *instance;
 	struct witness *w;
 	struct thread *td;
 
 	if (witness_cold || witness_watch == -1 || lock->lo_witness == NULL ||
 	    panicstr != NULL)
 		return;
 	w = lock->lo_witness;
 	td = curthread;
 
 	/* Determine lock list for this lock. */
 	if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
 		lock_list = &td->td_sleeplocks;
 	else
 		lock_list = PCPU_PTR(spinlocks);
 
 	/* Check to see if we are recursing on a lock we already own. */
 	instance = find_instance(*lock_list, lock);
 	if (instance != NULL) {
 		instance->li_flags++;
 		CTR4(KTR_WITNESS, "%s: pid %d recursed on %s r=%d", __func__,
 		    td->td_proc->p_pid, lock->lo_name,
 		    instance->li_flags & LI_RECURSEMASK);
 		instance->li_file = file;
 		instance->li_line = line;
 		return;
 	}
 
 	/* Update per-witness last file and line acquire. */
 	w->w_file = file;
 	w->w_line = line;
 
 	/* Find the next open lock instance in the list and fill it. */
 	lle = *lock_list;
 	if (lle == NULL || lle->ll_count == LOCK_NCHILDREN) {
 		lle = witness_lock_list_get();
 		if (lle == NULL)
 			return;
 		lle->ll_next = *lock_list;
 		CTR3(KTR_WITNESS, "%s: pid %d added lle %p", __func__,
 		    td->td_proc->p_pid, lle);
 		*lock_list = lle;
 	}
 	instance = &lle->ll_children[lle->ll_count++];
 	instance->li_lock = lock;
 	instance->li_line = line;
 	instance->li_file = file;
 	if ((flags & LOP_EXCLUSIVE) != 0)
 		instance->li_flags = LI_EXCLUSIVE;
 	else
 		instance->li_flags = 0;
 	CTR4(KTR_WITNESS, "%s: pid %d added %s as lle[%d]", __func__,
 	    td->td_proc->p_pid, lock->lo_name, lle->ll_count - 1);
 }
 
 void
 witness_upgrade(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (witness_watch) {
 		if ((lock->lo_flags & LO_UPGRADABLE) == 0)
 			kassert_panic(
 			    "upgrade of non-upgradable lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
 			kassert_panic(
 			    "upgrade of non-sleep lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 	}
 	instance = find_instance(curthread->td_sleeplocks, lock);
 	if (instance == NULL) {
 		kassert_panic("upgrade of unlocked lock (%s) %s @ %s:%d",
 		    class->lc_name, lock->lo_name,
 		    fixup_filename(file), line);
 		return;
 	}
 	if (witness_watch) {
 		if ((instance->li_flags & LI_EXCLUSIVE) != 0)
 			kassert_panic(
 			    "upgrade of exclusive lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((instance->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic(
 			    "upgrade of recursed lock (%s) %s r=%d @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    instance->li_flags & LI_RECURSEMASK,
 			    fixup_filename(file), line);
 	}
 	instance->li_flags |= LI_EXCLUSIVE;
 }
 
 void
 witness_downgrade(struct lock_object *lock, int flags, const char *file,
     int line)
 {
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (witness_watch) {
 		if ((lock->lo_flags & LO_UPGRADABLE) == 0)
 			kassert_panic(
 			    "downgrade of non-upgradable lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((class->lc_flags & LC_SLEEPLOCK) == 0)
 			kassert_panic(
 			    "downgrade of non-sleep lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 	}
 	instance = find_instance(curthread->td_sleeplocks, lock);
 	if (instance == NULL) {
 		kassert_panic("downgrade of unlocked lock (%s) %s @ %s:%d",
 		    class->lc_name, lock->lo_name,
 		    fixup_filename(file), line);
 		return;
 	}
 	if (witness_watch) {
 		if ((instance->li_flags & LI_EXCLUSIVE) == 0)
 			kassert_panic(
 			    "downgrade of shared lock (%s) %s @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((instance->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic(
 			    "downgrade of recursed lock (%s) %s r=%d @ %s:%d",
 			    class->lc_name, lock->lo_name,
 			    instance->li_flags & LI_RECURSEMASK,
 			    fixup_filename(file), line);
 	}
 	instance->li_flags &= ~LI_EXCLUSIVE;
 }
 
 void
 witness_unlock(struct lock_object *lock, int flags, const char *file, int line)
 {
 	struct lock_list_entry **lock_list, *lle;
 	struct lock_instance *instance;
 	struct lock_class *class;
 	struct thread *td;
 	register_t s;
 	int i, j;
 
 	if (witness_cold || lock->lo_witness == NULL || panicstr != NULL)
 		return;
 	td = curthread;
 	class = LOCK_CLASS(lock);
 
 	/* Find lock instance associated with this lock. */
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = &td->td_sleeplocks;
 	else
 		lock_list = PCPU_PTR(spinlocks);
 	lle = *lock_list;
 	for (; *lock_list != NULL; lock_list = &(*lock_list)->ll_next)
 		for (i = 0; i < (*lock_list)->ll_count; i++) {
 			instance = &(*lock_list)->ll_children[i];
 			if (instance->li_lock == lock)
 				goto found;
 		}
 
 	/*
 	 * When disabling WITNESS through witness_watch we could end up in
 	 * having registered locks in the td_sleeplocks queue.
 	 * We have to make sure we flush these queues, so just search for
 	 * eventual register locks and remove them.
 	 */
 	if (witness_watch > 0) {
 		kassert_panic("lock (%s) %s not locked @ %s:%d", class->lc_name,
 		    lock->lo_name, fixup_filename(file), line);
 		return;
 	} else {
 		return;
 	}
 found:
 
 	/* First, check for shared/exclusive mismatches. */
 	if ((instance->li_flags & LI_EXCLUSIVE) != 0 && witness_watch > 0 &&
 	    (flags & LOP_EXCLUSIVE) == 0) {
 		witness_output("shared unlock of (%s) %s @ %s:%d\n",
 		    class->lc_name, lock->lo_name, fixup_filename(file), line);
 		witness_output("while exclusively locked from %s:%d\n",
 		    fixup_filename(instance->li_file), instance->li_line);
 		kassert_panic("excl->ushare");
 	}
 	if ((instance->li_flags & LI_EXCLUSIVE) == 0 && witness_watch > 0 &&
 	    (flags & LOP_EXCLUSIVE) != 0) {
 		witness_output("exclusive unlock of (%s) %s @ %s:%d\n",
 		    class->lc_name, lock->lo_name, fixup_filename(file), line);
 		witness_output("while share locked from %s:%d\n",
 		    fixup_filename(instance->li_file),
 		    instance->li_line);
 		kassert_panic("share->uexcl");
 	}
 	/* If we are recursed, unrecurse. */
 	if ((instance->li_flags & LI_RECURSEMASK) > 0) {
 		CTR4(KTR_WITNESS, "%s: pid %d unrecursed on %s r=%d", __func__,
 		    td->td_proc->p_pid, instance->li_lock->lo_name,
 		    instance->li_flags);
 		instance->li_flags--;
 		return;
 	}
 	/* The lock is now being dropped, check for NORELEASE flag */
 	if ((instance->li_flags & LI_NORELEASE) != 0 && witness_watch > 0) {
 		witness_output("forbidden unlock of (%s) %s @ %s:%d\n",
 		    class->lc_name, lock->lo_name, fixup_filename(file), line);
 		kassert_panic("lock marked norelease");
 	}
 
 	/* Otherwise, remove this item from the list. */
 	s = intr_disable();
 	CTR4(KTR_WITNESS, "%s: pid %d removed %s from lle[%d]", __func__,
 	    td->td_proc->p_pid, instance->li_lock->lo_name,
 	    (*lock_list)->ll_count - 1);
 	for (j = i; j < (*lock_list)->ll_count - 1; j++)
 		(*lock_list)->ll_children[j] =
 		    (*lock_list)->ll_children[j + 1];
 	(*lock_list)->ll_count--;
 	intr_restore(s);
 
 	/*
 	 * In order to reduce contention on w_mtx, we want to keep always an
 	 * head object into lists so that frequent allocation from the 
 	 * free witness pool (and subsequent locking) is avoided.
 	 * In order to maintain the current code simple, when the head
 	 * object is totally unloaded it means also that we do not have
 	 * further objects in the list, so the list ownership needs to be
 	 * hand over to another object if the current head needs to be freed.
 	 */
 	if ((*lock_list)->ll_count == 0) {
 		if (*lock_list == lle) {
 			if (lle->ll_next == NULL)
 				return;
 		} else
 			lle = *lock_list;
 		*lock_list = lle->ll_next;
 		CTR3(KTR_WITNESS, "%s: pid %d removed lle %p", __func__,
 		    td->td_proc->p_pid, lle);
 		witness_lock_list_free(lle);
 	}
 }
 
 void
 witness_thread_exit(struct thread *td)
 {
 	struct lock_list_entry *lle;
 	int i, n;
 
 	lle = td->td_sleeplocks;
 	if (lle == NULL || panicstr != NULL)
 		return;
 	if (lle->ll_count != 0) {
 		for (n = 0; lle != NULL; lle = lle->ll_next)
 			for (i = lle->ll_count - 1; i >= 0; i--) {
 				if (n == 0)
 					witness_output(
 		    "Thread %p exiting with the following locks held:\n", td);
 				n++;
 				witness_list_lock(&lle->ll_children[i],
 				    witness_output);
 				
 			}
 		kassert_panic(
 		    "Thread %p cannot exit while holding sleeplocks\n", td);
 	}
 	witness_lock_list_free(lle);
 }
 
 /*
  * Warn if any locks other than 'lock' are held.  Flags can be passed in to
  * exempt Giant and sleepable locks from the checks as well.  If any
  * non-exempt locks are held, then a supplied message is printed to the
  * output channel along with a list of the offending locks.  If indicated in the
  * flags then a failure results in a panic as well.
  */
 int
 witness_warn(int flags, struct lock_object *lock, const char *fmt, ...)
 {
 	struct lock_list_entry *lock_list, *lle;
 	struct lock_instance *lock1;
 	struct thread *td;
 	va_list ap;
 	int i, n;
 
 	if (witness_cold || witness_watch < 1 || panicstr != NULL)
 		return (0);
 	n = 0;
 	td = curthread;
 	for (lle = td->td_sleeplocks; lle != NULL; lle = lle->ll_next)
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			lock1 = &lle->ll_children[i];
 			if (lock1->li_lock == lock)
 				continue;
 			if (flags & WARN_GIANTOK &&
 			    lock1->li_lock == &Giant.lock_object)
 				continue;
 			if (flags & WARN_SLEEPOK &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
 				continue;
 			if (n == 0) {
 				va_start(ap, fmt);
 				witness_voutput(fmt, ap);
 				va_end(ap);
 				witness_output(
 				    " with the following %slocks held:\n",
 				    (flags & WARN_SLEEPOK) != 0 ?
 				    "non-sleepable " : "");
 			}
 			n++;
 			witness_list_lock(lock1, witness_output);
 		}
 
 	/*
 	 * Pin the thread in order to avoid problems with thread migration.
 	 * Once that all verifies are passed about spinlocks ownership,
 	 * the thread is in a safe path and it can be unpinned.
 	 */
 	sched_pin();
 	lock_list = PCPU_GET(spinlocks);
 	if (lock_list != NULL && lock_list->ll_count != 0) {
 		sched_unpin();
 
 		/*
 		 * We should only have one spinlock and as long as
 		 * the flags cannot match for this locks class,
 		 * check if the first spinlock is the one curthread
 		 * should hold.
 		 */
 		lock1 = &lock_list->ll_children[lock_list->ll_count - 1];
 		if (lock_list->ll_count == 1 && lock_list->ll_next == NULL &&
 		    lock1->li_lock == lock && n == 0)
 			return (0);
 
 		va_start(ap, fmt);
 		witness_voutput(fmt, ap);
 		va_end(ap);
 		witness_output(" with the following %slocks held:\n",
 		    (flags & WARN_SLEEPOK) != 0 ?  "non-sleepable " : "");
 		n += witness_list_locks(&lock_list, witness_output);
 	} else
 		sched_unpin();
 	if (flags & WARN_PANIC && n)
 		kassert_panic("%s", __func__);
 	else
 		witness_debugger(n, __func__);
 	return (n);
 }
 
 const char *
 witness_file(struct lock_object *lock)
 {
 	struct witness *w;
 
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
 		return ("?");
 	w = lock->lo_witness;
 	return (w->w_file);
 }
 
 int
 witness_line(struct lock_object *lock)
 {
 	struct witness *w;
 
 	if (witness_cold || witness_watch < 1 || lock->lo_witness == NULL)
 		return (0);
 	w = lock->lo_witness;
 	return (w->w_line);
 }
 
 static struct witness *
 enroll(const char *description, struct lock_class *lock_class)
 {
 	struct witness *w;
 	struct witness_list *typelist;
 
 	MPASS(description != NULL);
 
 	if (witness_watch == -1 || panicstr != NULL)
 		return (NULL);
 	if ((lock_class->lc_flags & LC_SPINLOCK)) {
 		if (witness_skipspin)
 			return (NULL);
 		else
 			typelist = &w_spin;
 	} else if ((lock_class->lc_flags & LC_SLEEPLOCK)) {
 		typelist = &w_sleep;
 	} else {
 		kassert_panic("lock class %s is not sleep or spin",
 		    lock_class->lc_name);
 		return (NULL);
 	}
 
 	mtx_lock_spin(&w_mtx);
 	w = witness_hash_get(description);
 	if (w)
 		goto found;
 	if ((w = witness_get()) == NULL)
 		return (NULL);
 	MPASS(strlen(description) < MAX_W_NAME);
 	strcpy(w->w_name, description);
 	w->w_class = lock_class;
 	w->w_refcount = 1;
 	STAILQ_INSERT_HEAD(&w_all, w, w_list);
 	if (lock_class->lc_flags & LC_SPINLOCK) {
 		STAILQ_INSERT_HEAD(&w_spin, w, w_typelist);
 		w_spin_cnt++;
 	} else if (lock_class->lc_flags & LC_SLEEPLOCK) {
 		STAILQ_INSERT_HEAD(&w_sleep, w, w_typelist);
 		w_sleep_cnt++;
 	}
 
 	/* Insert new witness into the hash */
 	witness_hash_put(w);
 	witness_increment_graph_generation();
 	mtx_unlock_spin(&w_mtx);
 	return (w);
 found:
 	w->w_refcount++;
 	mtx_unlock_spin(&w_mtx);
 	if (lock_class != w->w_class)
 		kassert_panic(
 			"lock (%s) %s does not match earlier (%s) lock",
 			description, lock_class->lc_name,
 			w->w_class->lc_name);
 	return (w);
 }
 
 static void
 depart(struct witness *w)
 {
 	struct witness_list *list;
 
 	MPASS(w->w_refcount == 0);
 	if (w->w_class->lc_flags & LC_SLEEPLOCK) {
 		list = &w_sleep;
 		w_sleep_cnt--;
 	} else {
 		list = &w_spin;
 		w_spin_cnt--;
 	}
 	/*
 	 * Set file to NULL as it may point into a loadable module.
 	 */
 	w->w_file = NULL;
 	w->w_line = 0;
 	witness_increment_graph_generation();
 }
 
 
 static void
 adopt(struct witness *parent, struct witness *child)
 {
 	int pi, ci, i, j;
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	/* If the relationship is already known, there's no work to be done. */
 	if (isitmychild(parent, child))
 		return;
 
 	/* When the structure of the graph changes, bump up the generation. */
 	witness_increment_graph_generation();
 
 	/*
 	 * The hard part ... create the direct relationship, then propagate all
 	 * indirect relationships.
 	 */
 	pi = parent->w_index;
 	ci = child->w_index;
 	WITNESS_INDEX_ASSERT(pi);
 	WITNESS_INDEX_ASSERT(ci);
 	MPASS(pi != ci);
 	w_rmatrix[pi][ci] |= WITNESS_PARENT;
 	w_rmatrix[ci][pi] |= WITNESS_CHILD;
 
 	/*
 	 * If parent was not already an ancestor of child,
 	 * then we increment the descendant and ancestor counters.
 	 */
 	if ((w_rmatrix[pi][ci] & WITNESS_ANCESTOR) == 0) {
 		parent->w_num_descendants++;
 		child->w_num_ancestors++;
 	}
 
 	/* 
 	 * Find each ancestor of 'pi'. Note that 'pi' itself is counted as 
 	 * an ancestor of 'pi' during this loop.
 	 */
 	for (i = 1; i <= w_max_used_index; i++) {
 		if ((w_rmatrix[i][pi] & WITNESS_ANCESTOR_MASK) == 0 && 
 		    (i != pi))
 			continue;
 
 		/* Find each descendant of 'i' and mark it as a descendant. */
 		for (j = 1; j <= w_max_used_index; j++) {
 
 			/* 
 			 * Skip children that are already marked as
 			 * descendants of 'i'.
 			 */
 			if (w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK)
 				continue;
 
 			/*
 			 * We are only interested in descendants of 'ci'. Note
 			 * that 'ci' itself is counted as a descendant of 'ci'.
 			 */
 			if ((w_rmatrix[ci][j] & WITNESS_ANCESTOR_MASK) == 0 && 
 			    (j != ci))
 				continue;
 			w_rmatrix[i][j] |= WITNESS_ANCESTOR;
 			w_rmatrix[j][i] |= WITNESS_DESCENDANT;
 			w_data[i].w_num_descendants++;
 			w_data[j].w_num_ancestors++;
 
 			/* 
 			 * Make sure we aren't marking a node as both an
 			 * ancestor and descendant. We should have caught 
 			 * this as a lock order reversal earlier.
 			 */
 			if ((w_rmatrix[i][j] & WITNESS_ANCESTOR_MASK) &&
 			    (w_rmatrix[i][j] & WITNESS_DESCENDANT_MASK)) {
 				printf("witness rmatrix paradox! [%d][%d]=%d "
 				    "both ancestor and descendant\n",
 				    i, j, w_rmatrix[i][j]); 
 				kdb_backtrace();
 				printf("Witness disabled.\n");
 				witness_watch = -1;
 			}
 			if ((w_rmatrix[j][i] & WITNESS_ANCESTOR_MASK) &&
 			    (w_rmatrix[j][i] & WITNESS_DESCENDANT_MASK)) {
 				printf("witness rmatrix paradox! [%d][%d]=%d "
 				    "both ancestor and descendant\n",
 				    j, i, w_rmatrix[j][i]); 
 				kdb_backtrace();
 				printf("Witness disabled.\n");
 				witness_watch = -1;
 			}
 		}
 	}
 }
 
 static void
 itismychild(struct witness *parent, struct witness *child)
 {
 	int unlocked;
 
 	MPASS(child != NULL && parent != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	if (!witness_lock_type_equal(parent, child)) {
 		if (witness_cold == 0) {
 			unlocked = 1;
 			mtx_unlock_spin(&w_mtx);
 		} else {
 			unlocked = 0;
 		}
 		kassert_panic(
 		    "%s: parent \"%s\" (%s) and child \"%s\" (%s) are not "
 		    "the same lock type", __func__, parent->w_name,
 		    parent->w_class->lc_name, child->w_name,
 		    child->w_class->lc_name);
 		if (unlocked)
 			mtx_lock_spin(&w_mtx);
 	}
 	adopt(parent, child);
 }
 
 /*
  * Generic code for the isitmy*() functions. The rmask parameter is the
  * expected relationship of w1 to w2.
  */
 static int
 _isitmyx(struct witness *w1, struct witness *w2, int rmask, const char *fname)
 {
 	unsigned char r1, r2;
 	int i1, i2;
 
 	i1 = w1->w_index;
 	i2 = w2->w_index;
 	WITNESS_INDEX_ASSERT(i1);
 	WITNESS_INDEX_ASSERT(i2);
 	r1 = w_rmatrix[i1][i2] & WITNESS_RELATED_MASK;
 	r2 = w_rmatrix[i2][i1] & WITNESS_RELATED_MASK;
 
 	/* The flags on one better be the inverse of the flags on the other */
 	if (!((WITNESS_ATOD(r1) == r2 && WITNESS_DTOA(r2) == r1) ||
 	    (WITNESS_DTOA(r1) == r2 && WITNESS_ATOD(r2) == r1))) {
 		/* Don't squawk if we're potentially racing with an update. */
 		if (!mtx_owned(&w_mtx))
 			return (0);
 		printf("%s: rmatrix mismatch between %s (index %d) and %s "
 		    "(index %d): w_rmatrix[%d][%d] == %hhx but "
 		    "w_rmatrix[%d][%d] == %hhx\n",
 		    fname, w1->w_name, i1, w2->w_name, i2, i1, i2, r1,
 		    i2, i1, r2);
 		kdb_backtrace();
 		printf("Witness disabled.\n");
 		witness_watch = -1;
 	}
 	return (r1 & rmask);
 }
 
 /*
  * Checks if @child is a direct child of @parent.
  */
 static int
 isitmychild(struct witness *parent, struct witness *child)
 {
 
 	return (_isitmyx(parent, child, WITNESS_PARENT, __func__));
 }
 
 /*
  * Checks if @descendant is a direct or inderect descendant of @ancestor.
  */
 static int
 isitmydescendant(struct witness *ancestor, struct witness *descendant)
 {
 
 	return (_isitmyx(ancestor, descendant, WITNESS_ANCESTOR_MASK,
 	    __func__));
 }
 
 #ifdef BLESSING
 static int
 blessed(struct witness *w1, struct witness *w2)
 {
 	int i;
 	struct witness_blessed *b;
 
 	for (i = 0; i < nitems(blessed_list); i++) {
 		b = &blessed_list[i];
 		if (strcmp(w1->w_name, b->b_lock1) == 0) {
 			if (strcmp(w2->w_name, b->b_lock2) == 0)
 				return (1);
 			continue;
 		}
 		if (strcmp(w1->w_name, b->b_lock2) == 0)
 			if (strcmp(w2->w_name, b->b_lock1) == 0)
 				return (1);
 	}
 	return (0);
 }
 #endif
 
 static struct witness *
 witness_get(void)
 {
 	struct witness *w;
 	int index;
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 
 	if (witness_watch == -1) {
 		mtx_unlock_spin(&w_mtx);
 		return (NULL);
 	}
 	if (STAILQ_EMPTY(&w_free)) {
 		witness_watch = -1;
 		mtx_unlock_spin(&w_mtx);
 		printf("WITNESS: unable to allocate a new witness object\n");
 		return (NULL);
 	}
 	w = STAILQ_FIRST(&w_free);
 	STAILQ_REMOVE_HEAD(&w_free, w_list);
 	w_free_cnt--;
 	index = w->w_index;
 	MPASS(index > 0 && index == w_max_used_index+1 &&
 	    index < witness_count);
 	bzero(w, sizeof(*w));
 	w->w_index = index;
 	if (index > w_max_used_index)
 		w_max_used_index = index;
 	return (w);
 }
 
 static void
 witness_free(struct witness *w)
 {
 
 	STAILQ_INSERT_HEAD(&w_free, w, w_list);
 	w_free_cnt++;
 }
 
 static struct lock_list_entry *
 witness_lock_list_get(void)
 {
 	struct lock_list_entry *lle;
 
 	if (witness_watch == -1)
 		return (NULL);
 	mtx_lock_spin(&w_mtx);
 	lle = w_lock_list_free;
 	if (lle == NULL) {
 		witness_watch = -1;
 		mtx_unlock_spin(&w_mtx);
 		printf("%s: witness exhausted\n", __func__);
 		return (NULL);
 	}
 	w_lock_list_free = lle->ll_next;
 	mtx_unlock_spin(&w_mtx);
 	bzero(lle, sizeof(*lle));
 	return (lle);
 }
 		
 static void
 witness_lock_list_free(struct lock_list_entry *lle)
 {
 
 	mtx_lock_spin(&w_mtx);
 	lle->ll_next = w_lock_list_free;
 	w_lock_list_free = lle;
 	mtx_unlock_spin(&w_mtx);
 }
 
 static struct lock_instance *
 find_instance(struct lock_list_entry *list, const struct lock_object *lock)
 {
 	struct lock_list_entry *lle;
 	struct lock_instance *instance;
 	int i;
 
 	for (lle = list; lle != NULL; lle = lle->ll_next)
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			instance = &lle->ll_children[i];
 			if (instance->li_lock == lock)
 				return (instance);
 		}
 	return (NULL);
 }
 
 static void
 witness_list_lock(struct lock_instance *instance,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_object *lock;
 
 	lock = instance->li_lock;
 	prnt("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
 	    "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
 	if (lock->lo_witness->w_name != lock->lo_name)
 		prnt(" (%s)", lock->lo_witness->w_name);
 	prnt(" r = %d (%p) locked @ %s:%d\n",
 	    instance->li_flags & LI_RECURSEMASK, lock,
 	    fixup_filename(instance->li_file), instance->li_line);
 }
 
 static int
 witness_output(const char *fmt, ...)
 {
 	va_list ap;
 	int ret;
 
 	va_start(ap, fmt);
 	ret = witness_voutput(fmt, ap);
 	va_end(ap);
 	return (ret);
 }
 
 static int
 witness_voutput(const char *fmt, va_list ap)
 {
 	int ret;
 
 	ret = 0;
 	switch (witness_channel) {
 	case WITNESS_CONSOLE:
 		ret = vprintf(fmt, ap);
 		break;
 	case WITNESS_LOG:
 		vlog(LOG_NOTICE, fmt, ap);
 		break;
 	case WITNESS_NONE:
 		break;
 	}
 	return (ret);
 }
 
 #ifdef DDB
 static int
 witness_thread_has_locks(struct thread *td)
 {
 
 	if (td->td_sleeplocks == NULL)
 		return (0);
 	return (td->td_sleeplocks->ll_count != 0);
 }
 
 static int
 witness_proc_has_locks(struct proc *p)
 {
 	struct thread *td;
 
 	FOREACH_THREAD_IN_PROC(p, td) {
 		if (witness_thread_has_locks(td))
 			return (1);
 	}
 	return (0);
 }
 #endif
 
 int
 witness_list_locks(struct lock_list_entry **lock_list,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_list_entry *lle;
 	int i, nheld;
 
 	nheld = 0;
 	for (lle = *lock_list; lle != NULL; lle = lle->ll_next)
 		for (i = lle->ll_count - 1; i >= 0; i--) {
 			witness_list_lock(&lle->ll_children[i], prnt);
 			nheld++;
 		}
 	return (nheld);
 }
 
 /*
  * This is a bit risky at best.  We call this function when we have timed
  * out acquiring a spin lock, and we assume that the other CPU is stuck
  * with this lock held.  So, we go groveling around in the other CPU's
  * per-cpu data to try to find the lock instance for this spin lock to
  * see when it was last acquired.
  */
 void
 witness_display_spinlock(struct lock_object *lock, struct thread *owner,
     int (*prnt)(const char *fmt, ...))
 {
 	struct lock_instance *instance;
 	struct pcpu *pc;
 
 	if (owner->td_critnest == 0 || owner->td_oncpu == NOCPU)
 		return;
 	pc = pcpu_find(owner->td_oncpu);
 	instance = find_instance(pc->pc_spinlocks, lock);
 	if (instance != NULL)
 		witness_list_lock(instance, prnt);
 }
 
 void
 witness_save(struct lock_object *lock, const char **filep, int *linep)
 {
 	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	/*
 	 * This function is used independently in locking code to deal with
 	 * Giant, SCHEDULER_STOPPED() check can be removed here after Giant
 	 * is gone.
 	 */
 	if (SCHEDULER_STOPPED())
 		return;
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = curthread->td_sleeplocks;
 	else {
 		if (witness_skipspin)
 			return;
 		lock_list = PCPU_GET(spinlocks);
 	}
 	instance = find_instance(lock_list, lock);
 	if (instance == NULL) {
 		kassert_panic("%s: lock (%s) %s not locked", __func__,
 		    class->lc_name, lock->lo_name);
 		return;
 	}
 	*filep = instance->li_file;
 	*linep = instance->li_line;
 }
 
 void
 witness_restore(struct lock_object *lock, const char *file, int line)
 {
 	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	/*
 	 * This function is used independently in locking code to deal with
 	 * Giant, SCHEDULER_STOPPED() check can be removed here after Giant
 	 * is gone.
 	 */
 	if (SCHEDULER_STOPPED())
 		return;
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = curthread->td_sleeplocks;
 	else {
 		if (witness_skipspin)
 			return;
 		lock_list = PCPU_GET(spinlocks);
 	}
 	instance = find_instance(lock_list, lock);
 	if (instance == NULL)
 		kassert_panic("%s: lock (%s) %s not locked", __func__,
 		    class->lc_name, lock->lo_name);
 	lock->lo_witness->w_file = file;
 	lock->lo_witness->w_line = line;
 	if (instance == NULL)
 		return;
 	instance->li_file = file;
 	instance->li_line = line;
 }
 
 void
 witness_assert(const struct lock_object *lock, int flags, const char *file,
     int line)
 {
 #ifdef INVARIANT_SUPPORT
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	if (lock->lo_witness == NULL || witness_watch < 1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if ((class->lc_flags & LC_SLEEPLOCK) != 0)
 		instance = find_instance(curthread->td_sleeplocks, lock);
 	else if ((class->lc_flags & LC_SPINLOCK) != 0)
 		instance = find_instance(PCPU_GET(spinlocks), lock);
 	else {
 		kassert_panic("Lock (%s) %s is not sleep or spin!",
 		    class->lc_name, lock->lo_name);
 		return;
 	}
 	switch (flags) {
 	case LA_UNLOCKED:
 		if (instance != NULL)
 			kassert_panic("Lock (%s) %s locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		break;
 	case LA_LOCKED:
 	case LA_LOCKED | LA_RECURSED:
 	case LA_LOCKED | LA_NOTRECURSED:
 	case LA_SLOCKED:
 	case LA_SLOCKED | LA_RECURSED:
 	case LA_SLOCKED | LA_NOTRECURSED:
 	case LA_XLOCKED:
 	case LA_XLOCKED | LA_RECURSED:
 	case LA_XLOCKED | LA_NOTRECURSED:
 		if (instance == NULL) {
 			kassert_panic("Lock (%s) %s not locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 			break;
 		}
 		if ((flags & LA_XLOCKED) != 0 &&
 		    (instance->li_flags & LI_EXCLUSIVE) == 0)
 			kassert_panic(
 			    "Lock (%s) %s not exclusively locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((flags & LA_SLOCKED) != 0 &&
 		    (instance->li_flags & LI_EXCLUSIVE) != 0)
 			kassert_panic(
 			    "Lock (%s) %s exclusively locked @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((flags & LA_RECURSED) != 0 &&
 		    (instance->li_flags & LI_RECURSEMASK) == 0)
 			kassert_panic("Lock (%s) %s not recursed @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		if ((flags & LA_NOTRECURSED) != 0 &&
 		    (instance->li_flags & LI_RECURSEMASK) != 0)
 			kassert_panic("Lock (%s) %s recursed @ %s:%d.",
 			    class->lc_name, lock->lo_name,
 			    fixup_filename(file), line);
 		break;
 	default:
 		kassert_panic("Invalid lock assertion at %s:%d.",
 		    fixup_filename(file), line);
 
 	}
 #endif	/* INVARIANT_SUPPORT */
 }
 
 static void
 witness_setflag(struct lock_object *lock, int flag, int set)
 {
 	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
 	struct lock_class *class;
 
 	if (lock->lo_witness == NULL || witness_watch == -1 || panicstr != NULL)
 		return;
 	class = LOCK_CLASS(lock);
 	if (class->lc_flags & LC_SLEEPLOCK)
 		lock_list = curthread->td_sleeplocks;
 	else {
 		if (witness_skipspin)
 			return;
 		lock_list = PCPU_GET(spinlocks);
 	}
 	instance = find_instance(lock_list, lock);
 	if (instance == NULL) {
 		kassert_panic("%s: lock (%s) %s not locked", __func__,
 		    class->lc_name, lock->lo_name);
 		return;
 	}
 
 	if (set)
 		instance->li_flags |= flag;
 	else
 		instance->li_flags &= ~flag;
 }
 
 void
 witness_norelease(struct lock_object *lock)
 {
 
 	witness_setflag(lock, LI_NORELEASE, 1);
 }
 
 void
 witness_releaseok(struct lock_object *lock)
 {
 
 	witness_setflag(lock, LI_NORELEASE, 0);
 }
 
 #ifdef DDB
 static void
 witness_ddb_list(struct thread *td)
 {
 
 	KASSERT(witness_cold == 0, ("%s: witness_cold", __func__));
 	KASSERT(kdb_active, ("%s: not in the debugger", __func__));
 
 	if (witness_watch < 1)
 		return;
 
 	witness_list_locks(&td->td_sleeplocks, db_printf);
 
 	/*
 	 * We only handle spinlocks if td == curthread.  This is somewhat broken
 	 * if td is currently executing on some other CPU and holds spin locks
 	 * as we won't display those locks.  If we had a MI way of getting
 	 * the per-cpu data for a given cpu then we could use
 	 * td->td_oncpu to get the list of spinlocks for this thread
 	 * and "fix" this.
 	 *
 	 * That still wouldn't really fix this unless we locked the scheduler
 	 * lock or stopped the other CPU to make sure it wasn't changing the
 	 * list out from under us.  It is probably best to just not try to
 	 * handle threads on other CPU's for now.
 	 */
 	if (td == curthread && PCPU_GET(spinlocks) != NULL)
 		witness_list_locks(PCPU_PTR(spinlocks), db_printf);
 }
 
 DB_SHOW_COMMAND(locks, db_witness_list)
 {
 	struct thread *td;
 
 	if (have_addr)
 		td = db_lookup_thread(addr, true);
 	else
 		td = kdb_thread;
 	witness_ddb_list(td);
 }
 
 DB_SHOW_ALL_COMMAND(locks, db_witness_list_all)
 {
 	struct thread *td;
 	struct proc *p;
 
 	/*
 	 * It would be nice to list only threads and processes that actually
 	 * held sleep locks, but that information is currently not exported
 	 * by WITNESS.
 	 */
 	FOREACH_PROC_IN_SYSTEM(p) {
 		if (!witness_proc_has_locks(p))
 			continue;
 		FOREACH_THREAD_IN_PROC(p, td) {
 			if (!witness_thread_has_locks(td))
 				continue;
 			db_printf("Process %d (%s) thread %p (%d)\n", p->p_pid,
 			    p->p_comm, td, td->td_tid);
 			witness_ddb_list(td);
 			if (db_pager_quit)
 				return;
 		}
 	}
 }
 DB_SHOW_ALIAS(alllocks, db_witness_list_all)
 
 DB_SHOW_COMMAND(witness, db_witness_display)
 {
 
 	witness_ddb_display(db_printf);
 }
 #endif
 
 static int
 sysctl_debug_witness_badstacks(SYSCTL_HANDLER_ARGS)
 {
 	struct witness_lock_order_data *data1, *data2, *tmp_data1, *tmp_data2;
 	struct witness *tmp_w1, *tmp_w2, *w1, *w2;
 	struct sbuf *sb;
 	u_int w_rmatrix1, w_rmatrix2;
 	int error, generation, i, j;
 
 	tmp_data1 = NULL;
 	tmp_data2 = NULL;
 	tmp_w1 = NULL;
 	tmp_w2 = NULL;
 	if (witness_watch < 1) {
 		error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
 		return (error);
 	}
 	if (witness_cold) {
 		error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
 		return (error);
 	}
 	error = 0;
 	sb = sbuf_new(NULL, NULL, badstack_sbuf_size, SBUF_AUTOEXTEND);
 	if (sb == NULL)
 		return (ENOMEM);
 
 	/* Allocate and init temporary storage space. */
 	tmp_w1 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
 	tmp_w2 = malloc(sizeof(struct witness), M_TEMP, M_WAITOK | M_ZERO);
 	tmp_data1 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, 
 	    M_WAITOK | M_ZERO);
 	tmp_data2 = malloc(sizeof(struct witness_lock_order_data), M_TEMP, 
 	    M_WAITOK | M_ZERO);
 	stack_zero(&tmp_data1->wlod_stack);
 	stack_zero(&tmp_data2->wlod_stack);
 
 restart:
 	mtx_lock_spin(&w_mtx);
 	generation = w_generation;
 	mtx_unlock_spin(&w_mtx);
 	sbuf_printf(sb, "Number of known direct relationships is %d\n",
 	    w_lohash.wloh_count);
 	for (i = 1; i < w_max_used_index; i++) {
 		mtx_lock_spin(&w_mtx);
 		if (generation != w_generation) {
 			mtx_unlock_spin(&w_mtx);
 
 			/* The graph has changed, try again. */
 			req->oldidx = 0;
 			sbuf_clear(sb);
 			goto restart;
 		}
 
 		w1 = &w_data[i];
 		if (w1->w_reversed == 0) {
 			mtx_unlock_spin(&w_mtx);
 			continue;
 		}
 
 		/* Copy w1 locally so we can release the spin lock. */
 		*tmp_w1 = *w1;
 		mtx_unlock_spin(&w_mtx);
 
 		if (tmp_w1->w_reversed == 0)
 			continue;
 		for (j = 1; j < w_max_used_index; j++) {
 			if ((w_rmatrix[i][j] & WITNESS_REVERSAL) == 0 || i > j)
 				continue;
 
 			mtx_lock_spin(&w_mtx);
 			if (generation != w_generation) {
 				mtx_unlock_spin(&w_mtx);
 
 				/* The graph has changed, try again. */
 				req->oldidx = 0;
 				sbuf_clear(sb);
 				goto restart;
 			}
 
 			w2 = &w_data[j];
 			data1 = witness_lock_order_get(w1, w2);
 			data2 = witness_lock_order_get(w2, w1);
 
 			/*
 			 * Copy information locally so we can release the
 			 * spin lock.
 			 */
 			*tmp_w2 = *w2;
 			w_rmatrix1 = (unsigned int)w_rmatrix[i][j];
 			w_rmatrix2 = (unsigned int)w_rmatrix[j][i];
 
 			if (data1) {
 				stack_zero(&tmp_data1->wlod_stack);
 				stack_copy(&data1->wlod_stack,
 				    &tmp_data1->wlod_stack);
 			}
 			if (data2 && data2 != data1) {
 				stack_zero(&tmp_data2->wlod_stack);
 				stack_copy(&data2->wlod_stack,
 				    &tmp_data2->wlod_stack);
 			}
 			mtx_unlock_spin(&w_mtx);
 
 			sbuf_printf(sb,
 	    "\nLock order reversal between \"%s\"(%s) and \"%s\"(%s)!\n",
 			    tmp_w1->w_name, tmp_w1->w_class->lc_name, 
 			    tmp_w2->w_name, tmp_w2->w_class->lc_name);
 			if (data1) {
 				sbuf_printf(sb,
 			"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
 				    tmp_w1->w_name, tmp_w1->w_class->lc_name, 
 				    tmp_w2->w_name, tmp_w2->w_class->lc_name);
 				stack_sbuf_print(sb, &tmp_data1->wlod_stack);
 				sbuf_printf(sb, "\n");
 			}
 			if (data2 && data2 != data1) {
 				sbuf_printf(sb,
 			"Lock order \"%s\"(%s) -> \"%s\"(%s) first seen at:\n",
 				    tmp_w2->w_name, tmp_w2->w_class->lc_name, 
 				    tmp_w1->w_name, tmp_w1->w_class->lc_name);
 				stack_sbuf_print(sb, &tmp_data2->wlod_stack);
 				sbuf_printf(sb, "\n");
 			}
 		}
 	}
 	mtx_lock_spin(&w_mtx);
 	if (generation != w_generation) {
 		mtx_unlock_spin(&w_mtx);
 
 		/*
 		 * The graph changed while we were printing stack data,
 		 * try again.
 		 */
 		req->oldidx = 0;
 		sbuf_clear(sb);
 		goto restart;
 	}
 	mtx_unlock_spin(&w_mtx);
 
 	/* Free temporary storage space. */
 	free(tmp_data1, M_TEMP);
 	free(tmp_data2, M_TEMP);
 	free(tmp_w1, M_TEMP);
 	free(tmp_w2, M_TEMP);
 
 	sbuf_finish(sb);
 	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
 	sbuf_delete(sb);
 
 	return (error);
 }
 
 static int
 sysctl_debug_witness_channel(SYSCTL_HANDLER_ARGS)
 {
 	static const struct {
 		enum witness_channel channel;
 		const char *name;
 	} channels[] = {
 		{ WITNESS_CONSOLE, "console" },
 		{ WITNESS_LOG, "log" },
 		{ WITNESS_NONE, "none" },
 	};
 	char buf[16];
 	u_int i;
 	int error;
 
 	buf[0] = '\0';
 	for (i = 0; i < nitems(channels); i++)
 		if (witness_channel == channels[i].channel) {
 			snprintf(buf, sizeof(buf), "%s", channels[i].name);
 			break;
 		}
 
 	error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	error = EINVAL;
 	for (i = 0; i < nitems(channels); i++)
 		if (strcmp(channels[i].name, buf) == 0) {
 			witness_channel = channels[i].channel;
 			error = 0;
 			break;
 		}
 	return (error);
 }
 
 static int
 sysctl_debug_witness_fullgraph(SYSCTL_HANDLER_ARGS)
 {
 	struct witness *w;
 	struct sbuf *sb;
 	int error;
 
 	if (witness_watch < 1) {
 		error = SYSCTL_OUT(req, w_notrunning, sizeof(w_notrunning));
 		return (error);
 	}
 	if (witness_cold) {
 		error = SYSCTL_OUT(req, w_stillcold, sizeof(w_stillcold));
 		return (error);
 	}
 	error = 0;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sb = sbuf_new_for_sysctl(NULL, NULL, FULLGRAPH_SBUF_SIZE, req);
 	if (sb == NULL)
 		return (ENOMEM);
 	sbuf_printf(sb, "\n");
 
 	mtx_lock_spin(&w_mtx);
 	STAILQ_FOREACH(w, &w_all, w_list)
 		w->w_displayed = 0;
 	STAILQ_FOREACH(w, &w_all, w_list)
 		witness_add_fullgraph(sb, w);
 	mtx_unlock_spin(&w_mtx);
 
 	/*
 	 * Close the sbuf and return to userland.
 	 */
 	error = sbuf_finish(sb);
 	sbuf_delete(sb);
 
 	return (error);
 }
 
 static int
 sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS)
 {
 	int error, value;
 
 	value = witness_watch;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	if (value > 1 || value < -1 ||
 	    (witness_watch == -1 && value != witness_watch))
 		return (EINVAL);
 	witness_watch = value;
 	return (0);
 }
 
 static void
 witness_add_fullgraph(struct sbuf *sb, struct witness *w)
 {
 	int i;
 
 	if (w->w_displayed != 0 || (w->w_file == NULL && w->w_line == 0))
 		return;
 	w->w_displayed = 1;
 
 	WITNESS_INDEX_ASSERT(w->w_index);
 	for (i = 1; i <= w_max_used_index; i++) {
 		if (w_rmatrix[w->w_index][i] & WITNESS_PARENT) {
 			sbuf_printf(sb, "\"%s\",\"%s\"\n", w->w_name,
 			    w_data[i].w_name);
 			witness_add_fullgraph(sb, &w_data[i]);
 		}
 	}
 }
 
 /*
  * A simple hash function. Takes a key pointer and a key size. If size == 0,
  * interprets the key as a string and reads until the null
  * terminator. Otherwise, reads the first size bytes. Returns an unsigned 32-bit
  * hash value computed from the key.
  */
 static uint32_t
 witness_hash_djb2(const uint8_t *key, uint32_t size)
 {
 	unsigned int hash = 5381;
 	int i;
 
 	/* hash = hash * 33 + key[i] */
 	if (size)
 		for (i = 0; i < size; i++)
 			hash = ((hash << 5) + hash) + (unsigned int)key[i];
 	else
 		for (i = 0; key[i] != 0; i++)
 			hash = ((hash << 5) + hash) + (unsigned int)key[i];
 
 	return (hash);
 }
 
 
 /*
  * Initializes the two witness hash tables. Called exactly once from
  * witness_initialize().
  */
 static void
 witness_init_hash_tables(void)
 {
 	int i;
 
 	MPASS(witness_cold);
 
 	/* Initialize the hash tables. */
 	for (i = 0; i < WITNESS_HASH_SIZE; i++)
 		w_hash.wh_array[i] = NULL;
 
 	w_hash.wh_size = WITNESS_HASH_SIZE;
 	w_hash.wh_count = 0;
 
 	/* Initialize the lock order data hash. */
 	w_lofree = NULL;
 	for (i = 0; i < WITNESS_LO_DATA_COUNT; i++) {
 		memset(&w_lodata[i], 0, sizeof(w_lodata[i]));
 		w_lodata[i].wlod_next = w_lofree;
 		w_lofree = &w_lodata[i];
 	}
 	w_lohash.wloh_size = WITNESS_LO_HASH_SIZE;
 	w_lohash.wloh_count = 0;
 	for (i = 0; i < WITNESS_LO_HASH_SIZE; i++)
 		w_lohash.wloh_array[i] = NULL;
 }
 
 static struct witness *
 witness_hash_get(const char *key)
 {
 	struct witness *w;
 	uint32_t hash;
 	
 	MPASS(key != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	hash = witness_hash_djb2(key, 0) % w_hash.wh_size;
 	w = w_hash.wh_array[hash];
 	while (w != NULL) {
 		if (strcmp(w->w_name, key) == 0)
 			goto out;
 		w = w->w_hash_next;
 	}
 
 out:
 	return (w);
 }
 
 static void
 witness_hash_put(struct witness *w)
 {
 	uint32_t hash;
 
 	MPASS(w != NULL);
 	MPASS(w->w_name != NULL);
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	KASSERT(witness_hash_get(w->w_name) == NULL,
 	    ("%s: trying to add a hash entry that already exists!", __func__));
 	KASSERT(w->w_hash_next == NULL,
 	    ("%s: w->w_hash_next != NULL", __func__));
 
 	hash = witness_hash_djb2(w->w_name, 0) % w_hash.wh_size;
 	w->w_hash_next = w_hash.wh_array[hash];
 	w_hash.wh_array[hash] = w;
 	w_hash.wh_count++;
 }
 
 
 static struct witness_lock_order_data *
 witness_lock_order_get(struct witness *parent, struct witness *child)
 {
 	struct witness_lock_order_data *data = NULL;
 	struct witness_lock_order_key key;
 	unsigned int hash;
 
 	MPASS(parent != NULL && child != NULL);
 	key.from = parent->w_index;
 	key.to = child->w_index;
 	WITNESS_INDEX_ASSERT(key.from);
 	WITNESS_INDEX_ASSERT(key.to);
 	if ((w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN) == 0)
 		goto out;
 
 	hash = witness_hash_djb2((const char*)&key,
 	    sizeof(key)) % w_lohash.wloh_size;
 	data = w_lohash.wloh_array[hash];
 	while (data != NULL) {
 		if (witness_lock_order_key_equal(&data->wlod_key, &key))
 			break;
 		data = data->wlod_next;
 	}
 
 out:
 	return (data);
 }
 
 /*
  * Verify that parent and child have a known relationship, are not the same,
  * and child is actually a child of parent.  This is done without w_mtx
  * to avoid contention in the common case.
  */
 static int
 witness_lock_order_check(struct witness *parent, struct witness *child)
 {
 
 	if (parent != child &&
 	    w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN &&
 	    isitmychild(parent, child))
 		return (1);
 
 	return (0);
 }
 
 static int
 witness_lock_order_add(struct witness *parent, struct witness *child)
 {
 	struct witness_lock_order_data *data = NULL;
 	struct witness_lock_order_key key;
 	unsigned int hash;
 	
 	MPASS(parent != NULL && child != NULL);
 	key.from = parent->w_index;
 	key.to = child->w_index;
 	WITNESS_INDEX_ASSERT(key.from);
 	WITNESS_INDEX_ASSERT(key.to);
 	if (w_rmatrix[parent->w_index][child->w_index]
 	    & WITNESS_LOCK_ORDER_KNOWN)
 		return (1);
 
 	hash = witness_hash_djb2((const char*)&key,
 	    sizeof(key)) % w_lohash.wloh_size;
 	w_rmatrix[parent->w_index][child->w_index] |= WITNESS_LOCK_ORDER_KNOWN;
 	data = w_lofree;
 	if (data == NULL)
 		return (0);
 	w_lofree = data->wlod_next;
 	data->wlod_next = w_lohash.wloh_array[hash];
 	data->wlod_key = key;
 	w_lohash.wloh_array[hash] = data;
 	w_lohash.wloh_count++;
 	stack_zero(&data->wlod_stack);
 	stack_save(&data->wlod_stack);
 	return (1);
 }
 
 /* Call this whenever the structure of the witness graph changes. */
 static void
 witness_increment_graph_generation(void)
 {
 
 	if (witness_cold == 0)
 		mtx_assert(&w_mtx, MA_OWNED);
 	w_generation++;
 }
 
 static int
 witness_output_drain(void *arg __unused, const char *data, int len)
 {
 
 	witness_output("%.*s", len, data);
 	return (len);
 }
 
 static void
 witness_debugger(int cond, const char *msg)
 {
 	char buf[32];
 	struct sbuf sb;
 	struct stack st;
 
 	if (!cond)
 		return;
 
 	if (witness_trace) {
 		sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
 		sbuf_set_drain(&sb, witness_output_drain, NULL);
 
 		stack_zero(&st);
 		stack_save(&st);
 		witness_output("stack backtrace:\n");
 		stack_sbuf_print_ddb(&sb, &st);
 
 		sbuf_finish(&sb);
 	}
 
 #ifdef KDB
 	if (witness_kdb)
 		kdb_enter(KDB_WHY_WITNESS, msg);
 #endif
 }
Index: user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c
===================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c	(revision 306283)
@@ -1,1739 +1,2242 @@
 /*-
  * Copyright (c) 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Poul-Henning Kamp of the FreeBSD Project.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)vfs_cache.c	8.5 (Berkeley) 3/22/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/fcntl.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
 #include <vm/uma.h>
 
 SDT_PROVIDER_DECLARE(vfs);
 SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
     "char *", "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
 SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
     "struct vnode *", "char *");
 SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
     "struct vnode *", "char *");
 SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
     "char *");
 SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
 SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
 SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
     "struct vnode *");
 SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
     "char *");
 
 /*
  * This structure describes the elements in the cache of recent
  * names looked up by namei.
  */
 
 struct	namecache {
 	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
 	struct	vnode *nc_vp;		/* vnode the name refers to */
 	u_char	nc_flag;		/* flag bits */
 	u_char	nc_nlen;		/* length of name */
 	char	nc_name[0];		/* segment name + nul */
 };
 
 /*
  * struct namecache_ts repeats struct namecache layout up to the
  * nc_nlen member.
  * struct namecache_ts is used in place of struct namecache when time(s) need
  * to be stored.  The nc_dotdottime field is used when a cache entry is mapping
  * both a non-dotdot directory name plus dotdot for the directory's
  * parent.
  */
 struct	namecache_ts {
 	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
 	struct	vnode *nc_vp;		/* vnode the name refers to */
 	u_char	nc_flag;		/* flag bits */
 	u_char	nc_nlen;		/* length of name */
 	struct	timespec nc_time;	/* timespec provided by fs */
 	struct	timespec nc_dotdottime;	/* dotdot timespec provided by fs */
 	int	nc_ticks;		/* ticks value when entry was added */
 	char	nc_name[0];		/* segment name + nul */
 };
 
 /*
  * Flags in namecache.nc_flag
  */
 #define NCF_WHITE	0x01
 #define NCF_ISDOTDOT	0x02
 #define	NCF_TS		0x04
 #define	NCF_DTS		0x08
 #define	NCF_DVDROP	0x10
 
 /*
  * Name caching works as follows:
  *
  * Names found by directory scans are retained in a cache
  * for future reference.  It is managed LRU, so frequently
  * used names will hang around.  Cache is indexed by hash value
  * obtained from (vp, name) where vp refers to the directory
  * containing name.
  *
  * If it is a "negative" entry, (i.e. for a name that is known NOT to
  * exist) the vnode pointer will be NULL.
  *
  * Upon reaching the last segment of a path, if the reference
  * is for DELETE, or NOCACHE is set (rewrite), and the
  * name is located in the cache, it will be dropped.
  *
  * These locks are used (in the order in which they can be taken):
- * NAME         TYPE    ROLE
- * cache_lock   rwlock  global, needed for all modifications
- * bucketlock   rwlock  for access to given hash bucket
- * ncneg_mtx    mtx     negative entry LRU management
+ * NAME		TYPE	ROLE
+ * vnodelock	mtx	vnode lists and v_cache_dd field protection
+ * bucketlock	rwlock	for access to given set of hash buckets
+ * ncneg_mtx	mtx	negative entry LRU management
  *
- * A name -> vnode lookup can be safely performed by either locking cache_lock
- * or the relevant hash bucket.
+ * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
+ * shrinking the LRU list.
  *
- * ".." and vnode -> name lookups require cache_lock.
+ * It is legal to take multiple vnodelock and bucketlock locks. The locking
+ * order is lower address first. Both are recursive.
  *
- * Modifications require both cache_lock and relevant bucketlock taken for
- * writing.
+ * "." lookups are lockless.
  *
- * Negative entry LRU management requires ncneg_mtx taken on top of either
- * cache_lock or bucketlock.
+ * ".." and vnode -> name lookups require vnodelock.
+ *
+ * name -> vnode lookup requires the relevant bucketlock to be held for reading.
+ *
+ * Insertions and removals of entries require involved vnodes and bucketlocks
+ * to be write-locked to prevent other threads from seeing the entry.
+ *
+ * Some lookups result in removal of the found entry (e.g. getting rid of a
+ * negative entry with the intent to create a positive one), which poses a
+ * problem when multiple threads reach the state. Similarly, two different
+ * threads can purge two different vnodes and try to remove the same name.
+ *
+ * If the already held vnode lock is lower than the second required lock, we
+ * can just take the other lock. However, in the opposite case, this could
+ * deadlock. As such, this is resolved by trylocking and if that fails unlocking
+ * the first node, locking everything in order and revalidating the state.
  */
 
 /*
  * Structures associated with name caching.
  */
 #define NCHHASH(hash) \
 	(&nchashtbl[(hash) & nchash])
 static LIST_HEAD(nchashhead, namecache) *nchashtbl;	/* Hash Table */
 static TAILQ_HEAD(, namecache) ncneg;	/* Hash Table */
 static u_long	nchash;			/* size of hash table */
 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
     "Size of namecache hash table");
 static u_long	ncnegfactor = 16;	/* ratio of negative entries */
 SYSCTL_ULONG(_vfs, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0,
     "Ratio of negative namecache entries");
 static u_long	numneg;			/* number of negative entries allocated */
 SYSCTL_ULONG(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0,
     "Number of negative entries in namecache");
 static u_long	numcache;		/* number of cache entries allocated */
 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0,
     "Number of namecache entries");
 static u_long	numcachehv;		/* number of cache entries with vnodes held */
 SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0,
     "Number of namecache entries with vnodes held");
 u_int	ncsizefactor = 2;
 SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor, CTLFLAG_RW, &ncsizefactor, 0,
     "Size factor for namecache");
 
 struct nchstats	nchstats;		/* cache effectiveness statistics */
 
-static struct rwlock cache_lock;
-RW_SYSINIT(vfscache, &cache_lock, "ncglobal");
+static struct mtx       ncneg_shrink_lock;
+MTX_SYSINIT(vfscache_shrink_neg, &ncneg_shrink_lock, "Name Cache shrink neg",
+    MTX_DEF);
 
-#define	CACHE_TRY_WLOCK()	rw_try_wlock(&cache_lock)
-#define	CACHE_UPGRADE_LOCK()	rw_try_upgrade(&cache_lock)
-#define	CACHE_RLOCK()		rw_rlock(&cache_lock)
-#define	CACHE_RUNLOCK()		rw_runlock(&cache_lock)
-#define	CACHE_WLOCK()		rw_wlock(&cache_lock)
-#define	CACHE_WUNLOCK()		rw_wunlock(&cache_lock)
-
 static struct mtx_padalign ncneg_mtx;
 MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "ncneg", MTX_DEF);
 
 static u_int   numbucketlocks;
 static struct rwlock_padalign  *bucketlocks;
 #define	HASH2BUCKETLOCK(hash) \
 	((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)]))
 
+static u_int   numvnodelocks;
+static struct mtx *vnodelocks;
+static inline struct mtx *
+VP2VNODELOCK(struct vnode *vp)
+{
+	struct mtx *vlp;
+
+	if (vp == NULL)
+		return (NULL);
+	vlp = &vnodelocks[(((uintptr_t)(vp) >> 8) % numvnodelocks)];
+	return (vlp);
+}
+
 /*
  * UMA zones for the VFS cache.
  *
  * The small cache is used for entries with short names, which are the
  * most common.  The large cache is used for entries which are too big to
  * fit in the small cache.
  */
 static uma_zone_t cache_zone_small;
 static uma_zone_t cache_zone_small_ts;
 static uma_zone_t cache_zone_large;
 static uma_zone_t cache_zone_large_ts;
 
 #define	CACHE_PATH_CUTOFF	35
 
 static struct namecache *
 cache_alloc(int len, int ts)
 {
 
 	if (len > CACHE_PATH_CUTOFF) {
 		if (ts)
 			return (uma_zalloc(cache_zone_large_ts, M_WAITOK));
 		else
 			return (uma_zalloc(cache_zone_large, M_WAITOK));
 	}
 	if (ts)
 		return (uma_zalloc(cache_zone_small_ts, M_WAITOK));
 	else
 		return (uma_zalloc(cache_zone_small, M_WAITOK));
 }
 
 static void
 cache_free(struct namecache *ncp)
 {
 	int ts;
 
 	if (ncp == NULL)
 		return;
 	ts = ncp->nc_flag & NCF_TS;
 	if ((ncp->nc_flag & NCF_DVDROP) != 0)
 		vdrop(ncp->nc_dvp);
 	if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) {
 		if (ts)
 			uma_zfree(cache_zone_small_ts, ncp);
 		else
 			uma_zfree(cache_zone_small, ncp);
 	} else if (ts)
 		uma_zfree(cache_zone_large_ts, ncp);
 	else
 		uma_zfree(cache_zone_large, ncp);
 }
 
 static char *
 nc_get_name(struct namecache *ncp)
 {
 	struct namecache_ts *ncp_ts;
 
 	if ((ncp->nc_flag & NCF_TS) == 0)
 		return (ncp->nc_name);
 	ncp_ts = (struct namecache_ts *)ncp;
 	return (ncp_ts->nc_name);
 }
 
 static void
 cache_out_ts(struct namecache *ncp, struct timespec *tsp, int *ticksp)
 {
 
 	KASSERT((ncp->nc_flag & NCF_TS) != 0 ||
 	    (tsp == NULL && ticksp == NULL),
 	    ("No NCF_TS"));
 
 	if (tsp != NULL)
 		*tsp = ((struct namecache_ts *)ncp)->nc_time;
 	if (ticksp != NULL)
 		*ticksp = ((struct namecache_ts *)ncp)->nc_ticks;
 }
 
 static int	doingcache = 1;		/* 1 => enable the cache */
 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
     "VFS namecache enabled");
 
 /* Export size information to userland */
 SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
     sizeof(struct namecache), "sizeof(struct namecache)");
 
 /*
  * The new name cache statistics
  */
 static SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0,
     "Name cache statistics");
 #define STATNODE_ULONG(name, descr)	\
 	SYSCTL_ULONG(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, 0, descr);
 #define STATNODE_COUNTER(name, descr)	\
 	static counter_u64_t name;	\
 	SYSCTL_COUNTER_U64(_vfs_cache, OID_AUTO, name, CTLFLAG_RD, &name, descr);
 STATNODE_ULONG(numneg, "Number of negative cache entries");
 STATNODE_ULONG(numcache, "Number of cache entries");
 STATNODE_COUNTER(numcalls, "Number of cache lookups");
 STATNODE_COUNTER(dothits, "Number of '.' hits");
 STATNODE_COUNTER(dotdothits, "Number of '..' hits");
 STATNODE_COUNTER(numchecks, "Number of checks in lookup");
 STATNODE_COUNTER(nummiss, "Number of cache misses");
 STATNODE_COUNTER(nummisszap, "Number of cache misses we do not want to cache");
 STATNODE_COUNTER(numposzaps,
     "Number of cache hits (positive) we do not want to cache");
 STATNODE_COUNTER(numposhits, "Number of cache hits (positive)");
 STATNODE_COUNTER(numnegzaps,
     "Number of cache hits (negative) we do not want to cache");
 STATNODE_COUNTER(numneghits, "Number of cache hits (negative)");
 /* These count for kern___getcwd(), too. */
 STATNODE_COUNTER(numfullpathcalls, "Number of fullpath search calls");
 STATNODE_COUNTER(numfullpathfail1, "Number of fullpath search errors (ENOTDIR)");
 STATNODE_COUNTER(numfullpathfail2,
     "Number of fullpath search errors (VOP_VPTOCNP failures)");
 STATNODE_COUNTER(numfullpathfail4, "Number of fullpath search errors (ENOMEM)");
 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
-static long numupgrades; STATNODE_ULONG(numupgrades,
-    "Number of updates of the cache after lookup (write lock + retry)");
 static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
-    "Number of times bucketlocked zap_and_exit case failed to writelock");
+    "Number of times zap_and_exit failed to lock");
+static long cache_lock_vnodes_cel_3_failures;
+STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
+    "Number of times 3-way vnode locking failed");
 
-static void cache_zap(struct namecache *ncp);
-static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
-    u_int *buflen);
+static void cache_zap_locked(struct namecache *ncp, bool neg_locked);
 static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, u_int buflen);
 
 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
+static int cache_yield;
+SYSCTL_INT(_vfs_cache, OID_AUTO, yield, CTLFLAG_RD, &cache_yield, 0,
+    "Number of times cache called yield");
+
+static void
+cache_maybe_yield(void)
+{
+
+	if (should_yield()) {
+		cache_yield++;
+		kern_yield(PRI_USER);
+	}
+}
+
+static inline void
+cache_assert_vlp_locked(struct mtx *vlp)
+{
+
+	if (vlp != NULL)
+		mtx_assert(vlp, MA_OWNED);
+}
+
+static inline void
+cache_assert_vnode_locked(struct vnode *vp)
+{
+	struct mtx *vlp;
+
+	vlp = VP2VNODELOCK(vp);
+	cache_assert_vlp_locked(vlp);
+}
+
 static uint32_t
 cache_get_hash(char *name, u_char len, struct vnode *dvp)
 {
 	uint32_t hash;
 
 	hash = fnv_32_buf(name, len, FNV1_32_INIT);
 	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 	return (hash);
 }
 
+static inline struct rwlock *
+NCP2BUCKETLOCK(struct namecache *ncp)
+{
+	uint32_t hash;
+
+	hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+	return (HASH2BUCKETLOCK(hash));
+}
+
 #ifdef INVARIANTS
 static void
 cache_assert_bucket_locked(struct namecache *ncp, int mode)
 {
-	struct rwlock *bucketlock;
-	uint32_t hash;
+	struct rwlock *blp;
 
-	hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
-	bucketlock = HASH2BUCKETLOCK(hash);
-	rw_assert(bucketlock, mode);
+	blp = NCP2BUCKETLOCK(ncp);
+	rw_assert(blp, mode);
 }
 #else
 #define cache_assert_bucket_locked(x, y) do { } while (0)
 #endif
 
+#define cache_sort(x, y)	_cache_sort((void **)(x), (void **)(y))
 static void
+_cache_sort(void **p1, void **p2)
+{
+	void *tmp;
+
+	if (*p1 > *p2) {
+		tmp = *p2;
+		*p2 = *p1;
+		*p1 = tmp;
+	}
+}
+
+static void
 cache_lock_all_buckets(void)
 {
 	u_int i;
 
 	for (i = 0; i < numbucketlocks; i++)
 		rw_wlock(&bucketlocks[i]);
 }
 
 static void
 cache_unlock_all_buckets(void)
 {
 	u_int i;
 
 	for (i = 0; i < numbucketlocks; i++)
 		rw_wunlock(&bucketlocks[i]);
 }
 
+static void
+cache_lock_all_vnodes(void)
+{
+	u_int i;
+
+	for (i = 0; i < numvnodelocks; i++)
+		mtx_lock(&vnodelocks[i]);
+}
+
+static void
+cache_unlock_all_vnodes(void)
+{
+	u_int i;
+
+	for (i = 0; i < numvnodelocks; i++)
+		mtx_unlock(&vnodelocks[i]);
+}
+
 static int
+cache_trylock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
+{
+
+	cache_sort(&vlp1, &vlp2);
+	MPASS(vlp2 != NULL);
+
+	if (vlp1 != NULL) {
+		if (!mtx_trylock(vlp1))
+			return (EAGAIN);
+	}
+	if (!mtx_trylock(vlp2)) {
+		if (vlp1 != NULL)
+			mtx_unlock(vlp1);
+		return (EAGAIN);
+	}
+
+	return (0);
+}
+
+static void
+cache_unlock_vnodes(struct mtx *vlp1, struct mtx *vlp2)
+{
+
+	MPASS(vlp1 != NULL || vlp2 != NULL);
+
+	if (vlp1 != NULL)
+		mtx_unlock(vlp1);
+	if (vlp2 != NULL)
+		mtx_unlock(vlp2);
+}
+
+static int
 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 {
 	struct nchstats snap;
 
 	if (req->oldptr == NULL)
 		return (SYSCTL_OUT(req, 0, sizeof(snap)));
 
 	snap = nchstats;
 	snap.ncs_goodhits = counter_u64_fetch(numposhits);
 	snap.ncs_neghits = counter_u64_fetch(numneghits);
 	snap.ncs_badhits = counter_u64_fetch(numposzaps) +
 	    counter_u64_fetch(numnegzaps);
 	snap.ncs_miss = counter_u64_fetch(nummisszap) +
 	    counter_u64_fetch(nummiss);
 
 	return (SYSCTL_OUT(req, &snap, sizeof(snap)));
 }
 SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OPAQUE | CTLFLAG_RD |
     CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
     "VFS cache effectiveness statistics");
 
 #ifdef DIAGNOSTIC
 /*
  * Grab an atomic snapshot of the name cache hash chain lengths
  */
 static SYSCTL_NODE(_debug, OID_AUTO, hashstat, CTLFLAG_RW, NULL,
     "hash table stats");
 
 static int
 sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
 {
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int i, error, n_nchash, *cntbuf;
 
 retry:
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	if (req->oldptr == NULL)
 		return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
 	cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
-	CACHE_RLOCK();
+	cache_lock_all_buckets();
 	if (n_nchash != nchash + 1) {
-		CACHE_RUNLOCK();
+		cache_unlock_all_buckets();
 		free(cntbuf, M_TEMP);
 		goto retry;
 	}
 	/* Scan hash tables counting entries */
 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
 		LIST_FOREACH(ncp, ncpp, nc_hash)
 			cntbuf[i]++;
-	CACHE_RUNLOCK();
+	cache_unlock_all_buckets();
 	for (error = 0, i = 0; i < n_nchash; i++)
 		if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
 			break;
 	free(cntbuf, M_TEMP);
 	return (error);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
     "nchash chain lengths");
 
 static int
 sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct nchashhead *ncpp;
 	struct namecache *ncp;
 	int n_nchash;
 	int count, maxlength, used, pct;
 
 	if (!req->oldptr)
 		return SYSCTL_OUT(req, 0, 4 * sizeof(int));
 
-	CACHE_RLOCK();
+	cache_lock_all_buckets();
 	n_nchash = nchash + 1;	/* nchash is max index, not count */
 	used = 0;
 	maxlength = 0;
 
 	/* Scan hash tables for applicable entries */
 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 		count = 0;
 		LIST_FOREACH(ncp, ncpp, nc_hash) {
 			count++;
 		}
 		if (count)
 			used++;
 		if (maxlength < count)
 			maxlength = count;
 	}
 	n_nchash = nchash + 1;
-	CACHE_RUNLOCK();
+	cache_unlock_all_buckets();
 	pct = (used * 100) / (n_nchash / 100);
 	error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &used, sizeof(used));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &maxlength, sizeof(maxlength));
 	if (error)
 		return (error);
 	error = SYSCTL_OUT(req, &pct, sizeof(pct));
 	if (error)
 		return (error);
 	return (0);
 }
 SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
     CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
     "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
 #endif
 
 /*
  * Negative entries management
  */
 static void
 cache_negative_hit(struct namecache *ncp)
 {
 
+	MPASS(ncp->nc_vp == NULL);
 	mtx_lock(&ncneg_mtx);
 	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 	mtx_unlock(&ncneg_mtx);
 }
 
 static void
 cache_negative_insert(struct namecache *ncp)
 {
 
-	rw_assert(&cache_lock, RA_WLOCKED);
-	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	MPASS(ncp->nc_vp == NULL);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	mtx_lock(&ncneg_mtx);
 	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 	numneg++;
 	mtx_unlock(&ncneg_mtx);
 }
 
 static void
-cache_negative_remove(struct namecache *ncp)
+cache_negative_remove(struct namecache *ncp, bool neg_locked)
 {
 
-	rw_assert(&cache_lock, RA_WLOCKED);
-	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	MPASS(ncp->nc_vp == NULL);
-	mtx_lock(&ncneg_mtx);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
+	if (!neg_locked)
+		mtx_lock(&ncneg_mtx);
+	else
+		mtx_assert(&ncneg_mtx, MA_OWNED);
 	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 	numneg--;
-	mtx_unlock(&ncneg_mtx);
+	if (!neg_locked)
+		mtx_unlock(&ncneg_mtx);
 }
 
-static struct namecache *
+static void
 cache_negative_zap_one(void)
 {
-	struct namecache *ncp;
+	struct namecache *ncp, *ncp2;
+	struct mtx *dvlp;
+	struct rwlock *blp;
 
-	rw_assert(&cache_lock, RA_WLOCKED);
+	if (!mtx_trylock(&ncneg_shrink_lock))
+		return;
+
+	mtx_lock(&ncneg_mtx);
 	ncp = TAILQ_FIRST(&ncneg);
-	KASSERT(ncp->nc_vp == NULL, ("ncp %p vp %p on ncneg",
-	    ncp, ncp->nc_vp));
-	cache_zap(ncp);
-	return (ncp);
+	if (ncp == NULL) {
+		mtx_unlock(&ncneg_mtx);
+		goto out;
+	}
+	MPASS(ncp->nc_vp == NULL);
+	dvlp = VP2VNODELOCK(ncp->nc_dvp);
+	blp = NCP2BUCKETLOCK(ncp);
+	mtx_unlock(&ncneg_mtx);
+	mtx_lock(dvlp);
+	rw_wlock(blp);
+	mtx_lock(&ncneg_mtx);
+	ncp2 = TAILQ_FIRST(&ncneg);
+	if (ncp != ncp2 || dvlp != VP2VNODELOCK(ncp2->nc_dvp) ||
+	    blp != NCP2BUCKETLOCK(ncp2) || ncp2->nc_vp != NULL) {
+		ncp = NULL;
+		goto out_unlock_all;
+	}
+	cache_zap_locked(ncp, true);
+out_unlock_all:
+	mtx_unlock(&ncneg_mtx);
+	rw_wunlock(blp);
+	mtx_unlock(dvlp);
+out:
+	mtx_unlock(&ncneg_shrink_lock);
+	cache_free(ncp);
 }
 
 /*
- * cache_zap():
+ * cache_zap_locked():
  *
  *   Removes a namecache entry from cache, whether it contains an actual
  *   pointer to a vnode or if it is just a negative cache entry.
  */
 static void
-cache_zap_locked(struct namecache *ncp)
+cache_zap_locked(struct namecache *ncp, bool neg_locked)
 {
 
-	rw_assert(&cache_lock, RA_WLOCKED);
+	cache_assert_vnode_locked(ncp->nc_vp);
+	cache_assert_vnode_locked(ncp->nc_dvp);
 	cache_assert_bucket_locked(ncp, RA_WLOCKED);
+
 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
 	if (ncp->nc_vp != NULL) {
 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
 		    nc_get_name(ncp), ncp->nc_vp);
 	} else {
 		SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
 		    nc_get_name(ncp));
 	}
 	LIST_REMOVE(ncp, nc_hash);
 	if (ncp->nc_flag & NCF_ISDOTDOT) {
 		if (ncp == ncp->nc_dvp->v_cache_dd)
 			ncp->nc_dvp->v_cache_dd = NULL;
 	} else {
 		LIST_REMOVE(ncp, nc_src);
 		if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) {
 			ncp->nc_flag |= NCF_DVDROP;
-			numcachehv--;
+			atomic_subtract_rel_long(&numcachehv, 1);
 		}
 	}
 	if (ncp->nc_vp) {
 		TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst);
 		if (ncp == ncp->nc_vp->v_cache_dd)
 			ncp->nc_vp->v_cache_dd = NULL;
 	} else {
-		cache_negative_remove(ncp);
+		cache_negative_remove(ncp, neg_locked);
 	}
-	numcache--;
+	atomic_subtract_rel_long(&numcache, 1);
 }
 
 static void
-cache_zap(struct namecache *ncp)
+cache_zap_negative_locked_vnode_kl(struct namecache *ncp, struct vnode *vp)
 {
-	struct rwlock *bucketlock;
-	uint32_t hash;
+	struct rwlock *blp;
 
-	rw_assert(&cache_lock, RA_WLOCKED);
+	MPASS(ncp->nc_dvp == vp);
+	MPASS(ncp->nc_vp == NULL);
+	cache_assert_vnode_locked(vp);
 
-	hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
-	bucketlock = HASH2BUCKETLOCK(hash);
-	rw_wlock(bucketlock);
-	cache_zap_locked(ncp);
-	rw_wunlock(bucketlock);
+	blp = NCP2BUCKETLOCK(ncp);
+	rw_wlock(blp);
+	cache_zap_locked(ncp, false);
+	rw_wunlock(blp);
 }
 
+static bool
+cache_zap_locked_vnode_kl2(struct namecache *ncp, struct vnode *vp,
+    struct mtx **vlpp)
+{
+	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
+	struct rwlock *blp;
+
+	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
+	cache_assert_vnode_locked(vp);
+
+	if (ncp->nc_vp == NULL) {
+		if (*vlpp != NULL) {
+			mtx_unlock(*vlpp);
+			*vlpp = NULL;
+		}
+		cache_zap_negative_locked_vnode_kl(ncp, vp);
+		return (true);
+	}
+
+	pvlp = VP2VNODELOCK(vp);
+	blp = NCP2BUCKETLOCK(ncp);
+	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
+	vlp2 = VP2VNODELOCK(ncp->nc_vp);
+
+	if (*vlpp == vlp1 || *vlpp == vlp2) {
+		to_unlock = *vlpp;
+		*vlpp = NULL;
+	} else {
+		if (*vlpp != NULL) {
+			mtx_unlock(*vlpp);
+			*vlpp = NULL;
+		}
+		cache_sort(&vlp1, &vlp2);
+		if (vlp1 == pvlp) {
+			mtx_lock(vlp2);
+			to_unlock = vlp2;
+		} else {
+			if (!mtx_trylock(vlp1))
+				goto out_relock;
+			to_unlock = vlp1;
+		}
+	}
+	rw_wlock(blp);
+	cache_zap_locked(ncp, false);
+	rw_wunlock(blp);
+	if (to_unlock != NULL)
+		mtx_unlock(to_unlock);
+	return (true);
+
+out_relock:
+	mtx_unlock(vlp2);
+	mtx_lock(vlp1);
+	mtx_lock(vlp2);
+	MPASS(*vlpp == NULL);
+	*vlpp = vlp1;
+	return (false);
+}
+
+static int
+cache_zap_locked_vnode(struct namecache *ncp, struct vnode *vp)
+{
+	struct mtx *pvlp, *vlp1, *vlp2, *to_unlock;
+	struct rwlock *blp;
+	int error = 0;
+
+	MPASS(vp == ncp->nc_dvp || vp == ncp->nc_vp);
+	cache_assert_vnode_locked(vp);
+
+	pvlp = VP2VNODELOCK(vp);
+	if (ncp->nc_vp == NULL) {
+		cache_zap_negative_locked_vnode_kl(ncp, vp);
+		goto out;
+	}
+
+	blp = NCP2BUCKETLOCK(ncp);
+	vlp1 = VP2VNODELOCK(ncp->nc_dvp);
+	vlp2 = VP2VNODELOCK(ncp->nc_vp);
+	cache_sort(&vlp1, &vlp2);
+	if (vlp1 == pvlp) {
+		mtx_lock(vlp2);
+		to_unlock = vlp2;
+	} else {
+		if (!mtx_trylock(vlp1)) {
+			error = EAGAIN;
+			goto out;
+		}
+		to_unlock = vlp1;
+	}
+	rw_wlock(blp);
+	cache_zap_locked(ncp, false);
+	rw_wunlock(blp);
+	mtx_unlock(to_unlock);
+out:
+	mtx_unlock(pvlp);
+	return (error);
+}
+
+static int
+cache_zap_rlocked_bucket(struct namecache *ncp, struct rwlock *blp)
+{
+	struct mtx *dvlp, *vlp;
+
+	cache_assert_bucket_locked(ncp, RA_RLOCKED);
+
+	dvlp = VP2VNODELOCK(ncp->nc_dvp);
+	vlp = VP2VNODELOCK(ncp->nc_vp);
+	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
+		rw_runlock(blp);
+		rw_wlock(blp);
+		cache_zap_locked(ncp, false);
+		rw_wunlock(blp);
+		cache_unlock_vnodes(dvlp, vlp);
+		return (0);
+	}
+
+	rw_runlock(blp);
+	return (EAGAIN);
+}
+
+static int
+cache_zap_wlocked_bucket_kl(struct namecache *ncp, struct rwlock *blp,
+    struct mtx **vlpp1, struct mtx **vlpp2)
+{
+	struct mtx *dvlp, *vlp;
+
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
+
+	dvlp = VP2VNODELOCK(ncp->nc_dvp);
+	vlp = VP2VNODELOCK(ncp->nc_vp);
+	cache_sort(&dvlp, &vlp);
+
+	if (*vlpp1 == dvlp && *vlpp2 == vlp) {
+		cache_zap_locked(ncp, false);
+		cache_unlock_vnodes(dvlp, vlp);
+		*vlpp1 = NULL;
+		*vlpp2 = NULL;
+		return (0);
+	}
+
+	if (*vlpp1 != NULL)
+		mtx_unlock(*vlpp1);
+	if (*vlpp2 != NULL)
+		mtx_unlock(*vlpp2);
+	*vlpp1 = NULL;
+	*vlpp2 = NULL;
+
+	if (cache_trylock_vnodes(dvlp, vlp) == 0) {
+		cache_zap_locked(ncp, false);
+		cache_unlock_vnodes(dvlp, vlp);
+		return (0);
+	}
+
+	rw_wunlock(blp);
+	*vlpp1 = dvlp;
+	*vlpp2 = vlp;
+	if (*vlpp1 != NULL)
+		mtx_lock(*vlpp1);
+	mtx_lock(*vlpp2);
+	rw_wlock(blp);
+	return (EAGAIN);
+}
+
+static void
+cache_lookup_unlock(struct rwlock *blp, struct mtx *vlp)
+{
+
+	if (blp != NULL) {
+		rw_runlock(blp);
+		mtx_assert(vlp, MA_NOTOWNED);
+	} else {
+		mtx_unlock(vlp);
+	}
+}
+
 /*
  * Lookup an entry in the cache
  *
  * Lookup is called with dvp pointing to the directory to search,
  * cnp pointing to the name of the entry being sought. If the lookup
  * succeeds, the vnode is returned in *vpp, and a status of -1 is
  * returned. If the lookup determines that the name does not exist
  * (negative caching), a status of ENOENT is returned. If the lookup
  * fails, a status of zero is returned.  If the directory vnode is
  * recycled out from under us due to a forced unmount, a status of
  * ENOENT is returned.
  *
  * vpp is locked and ref'd on return.  If we're looking up DOTDOT, dvp is
  * unlocked.  If we're looking up . an extra ref is taken, but the lock is
  * not recursively acquired.
  */
 
-enum { UNLOCKED, WLOCKED, RLOCKED };
-
-static void
-cache_unlock(int cache_locked)
-{
-
-	switch (cache_locked) {
-	case UNLOCKED:
-		break;
-	case WLOCKED:
-		CACHE_WUNLOCK();
-		break;
-	case RLOCKED:
-		CACHE_RUNLOCK();
-		break;
-	}
-}
-
 int
 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct timespec *tsp, int *ticksp)
 {
-	struct rwlock *bucketlock;
 	struct namecache *ncp;
+	struct rwlock *blp;
+	struct mtx *dvlp, *dvlp2;
 	uint32_t hash;
-	int error, ltype, cache_locked;
+	int error, ltype;
 
 	if (!doingcache) {
 		cnp->cn_flags &= ~MAKEENTRY;
 		return (0);
 	}
 retry:
-	bucketlock = NULL;
-	cache_locked = UNLOCKED;
+	blp = NULL;
+	dvlp = VP2VNODELOCK(dvp);
 	error = 0;
 	counter_u64_add(numcalls, 1);
 
-retry_wlocked:
 	if (cnp->cn_nameptr[0] == '.') {
 		if (cnp->cn_namelen == 1) {
 			*vpp = dvp;
 			CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
 			    dvp, cnp->cn_nameptr);
 			counter_u64_add(dothits, 1);
 			SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
 			if (tsp != NULL)
 				timespecclear(tsp);
 			if (ticksp != NULL)
 				*ticksp = ticks;
 			VREF(*vpp);
 			/*
 			 * When we lookup "." we still can be asked to lock it
 			 * differently...
 			 */
 			ltype = cnp->cn_lkflags & LK_TYPE_MASK;
 			if (ltype != VOP_ISLOCKED(*vpp)) {
 				if (ltype == LK_EXCLUSIVE) {
 					vn_lock(*vpp, LK_UPGRADE | LK_RETRY);
 					if ((*vpp)->v_iflag & VI_DOOMED) {
 						/* forced unmount */
 						vrele(*vpp);
 						*vpp = NULL;
 						return (ENOENT);
 					}
 				} else
 					vn_lock(*vpp, LK_DOWNGRADE | LK_RETRY);
 			}
 			return (-1);
 		}
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			counter_u64_add(dotdothits, 1);
-			if (cache_locked == UNLOCKED) {
-				CACHE_RLOCK();
-				cache_locked = RLOCKED;
-			}
-
-			if (dvp->v_cache_dd == NULL) {
+			dvlp2 = NULL;
+			mtx_lock(dvlp);
+retry_dotdot:
+			ncp = dvp->v_cache_dd;
+			if (ncp == NULL) {
 				SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
 				    "..", NULL);
-				goto unlock;
+				mtx_unlock(dvlp);
+				return (0);
 			}
 			if ((cnp->cn_flags & MAKEENTRY) == 0) {
-				if (cache_locked != WLOCKED &&
-				    !CACHE_UPGRADE_LOCK())
-					goto wlock;
-				ncp = NULL;
-				if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) {
-					ncp = dvp->v_cache_dd;
-					cache_zap(ncp);
+				if ((ncp->nc_flag & NCF_ISDOTDOT) != 0) {
+					if (ncp->nc_dvp != dvp)
+						panic("dvp %p v_cache_dd %p\n", dvp, ncp);
+					if (!cache_zap_locked_vnode_kl2(ncp,
+					    dvp, &dvlp2))
+						goto retry_dotdot;
+					MPASS(dvp->v_cache_dd == NULL);
+					mtx_unlock(dvlp);
+					if (dvlp2 != NULL)
+						mtx_unlock(dvlp2);
+					cache_free(ncp);
+				} else {
+					dvp->v_cache_dd = NULL;
+					mtx_unlock(dvlp);
+					if (dvlp2 != NULL)
+						mtx_unlock(dvlp2);
 				}
-				dvp->v_cache_dd = NULL;
-				CACHE_WUNLOCK();
-				cache_free(ncp);
 				return (0);
 			}
-			ncp = dvp->v_cache_dd;
-			if (ncp->nc_flag & NCF_ISDOTDOT)
+			if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 				*vpp = ncp->nc_vp;
 			else
 				*vpp = ncp->nc_dvp;
 			/* Return failure if negative entry was found. */
 			if (*vpp == NULL)
 				goto negative_success;
 			CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
 			    dvp, cnp->cn_nameptr, *vpp);
 			SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
 			    *vpp);
 			cache_out_ts(ncp, tsp, ticksp);
 			if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
 			    NCF_DTS && tsp != NULL)
 				*tsp = ((struct namecache_ts *)ncp)->
 				    nc_dotdottime;
 			goto success;
 		}
 	}
 
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
-	if (cache_locked == UNLOCKED) {
-		bucketlock = HASH2BUCKETLOCK(hash);
-		rw_rlock(bucketlock);
-	}
+	blp = HASH2BUCKETLOCK(hash);
+	rw_rlock(blp);
 
 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		counter_u64_add(numchecks, 1);
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
 	/* We failed to find an entry */
 	if (ncp == NULL) {
 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
 		    NULL);
 		if ((cnp->cn_flags & MAKEENTRY) == 0) {
 			counter_u64_add(nummisszap, 1);
 		} else {
 			counter_u64_add(nummiss, 1);
 		}
 		goto unlock;
 	}
 
 	/* We don't want to have an entry, so dump it */
 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
 		counter_u64_add(numposzaps, 1);
 		goto zap_and_exit;
 	}
 
 	/* We found a "positive" match, return the vnode */
 	if (ncp->nc_vp) {
 		counter_u64_add(numposhits, 1);
 		*vpp = ncp->nc_vp;
 		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
 		    dvp, cnp->cn_nameptr, *vpp, ncp);
 		SDT_PROBE3(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
 		    *vpp);
 		cache_out_ts(ncp, tsp, ticksp);
 		goto success;
 	}
 
 negative_success:
 	/* We found a negative match, and want to create it, so purge */
 	if (cnp->cn_nameiop == CREATE) {
 		counter_u64_add(numnegzaps, 1);
 		goto zap_and_exit;
 	}
 
 	counter_u64_add(numneghits, 1);
 	cache_negative_hit(ncp);
 	if (ncp->nc_flag & NCF_WHITE)
 		cnp->cn_flags |= ISWHITEOUT;
 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
 	    nc_get_name(ncp));
 	cache_out_ts(ncp, tsp, ticksp);
-	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
-	if (bucketlock != NULL)
-		rw_runlock(bucketlock);
-	cache_unlock(cache_locked);
+	cache_lookup_unlock(blp, dvlp);
 	return (ENOENT);
 
-wlock:
-	/*
-	 * We need to update the cache after our lookup, so upgrade to
-	 * a write lock and retry the operation.
-	 */
-	CACHE_RUNLOCK();
-wlock_unlocked:
-	CACHE_WLOCK();
-	numupgrades++;
-	cache_locked = WLOCKED;
-	goto retry_wlocked;
-
 success:
 	/*
 	 * On success we return a locked and ref'd vnode as per the lookup
 	 * protocol.
 	 */
 	MPASS(dvp != *vpp);
 	ltype = 0;	/* silence gcc warning */
 	if (cnp->cn_flags & ISDOTDOT) {
 		ltype = VOP_ISLOCKED(dvp);
 		VOP_UNLOCK(dvp, 0);
 	}
 	vhold(*vpp);
-	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
-	if (bucketlock != NULL)
-		rw_runlock(bucketlock);
-	cache_unlock(cache_locked);
+	cache_lookup_unlock(blp, dvlp);
 	error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
 	if (cnp->cn_flags & ISDOTDOT) {
 		vn_lock(dvp, ltype | LK_RETRY);
 		if (dvp->v_iflag & VI_DOOMED) {
 			if (error == 0)
 				vput(*vpp);
 			*vpp = NULL;
 			return (ENOENT);
 		}
 	}
 	if (error) {
 		*vpp = NULL;
 		goto retry;
 	}
 	if ((cnp->cn_flags & ISLASTCN) &&
 	    (cnp->cn_lkflags & LK_TYPE_MASK) == LK_EXCLUSIVE) {
 		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
 	}
 	return (-1);
 
 unlock:
-	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
-	if (bucketlock != NULL)
-		rw_runlock(bucketlock);
-	cache_unlock(cache_locked);
+	cache_lookup_unlock(blp, dvlp);
 	return (0);
 
 zap_and_exit:
-	if (bucketlock != NULL) {
-		rw_assert(&cache_lock, RA_UNLOCKED);
-		if (!CACHE_TRY_WLOCK()) {
-			rw_runlock(bucketlock);
-			bucketlock = NULL;
-			zap_and_exit_bucket_fail++;
-			goto wlock_unlocked;
-		}
-		cache_locked = WLOCKED;
-		rw_runlock(bucketlock);
-		bucketlock = NULL;
-	} else if (cache_locked != WLOCKED && !CACHE_UPGRADE_LOCK())
-		goto wlock;
-	cache_zap(ncp);
-	CACHE_WUNLOCK();
+	if (blp != NULL)
+		error = cache_zap_rlocked_bucket(ncp, blp);
+	else
+		error = cache_zap_locked_vnode(ncp, dvp);
+	if (error != 0) {
+		zap_and_exit_bucket_fail++;
+		cache_maybe_yield();
+		goto retry;
+	}
 	cache_free(ncp);
 	return (0);
 }
 
+struct celockstate {
+	struct mtx *vlp[3];
+	struct rwlock *blp[2];
+};
+CTASSERT((nitems(((struct celockstate *)0)->vlp) == 3));
+CTASSERT((nitems(((struct celockstate *)0)->blp) == 2));
+
+static inline void
+cache_celockstate_init(struct celockstate *cel)
+{
+
+	bzero(cel, sizeof(*cel));
+}
+
+static void
+cache_lock_vnodes_cel(struct celockstate *cel, struct vnode *vp,
+    struct vnode *dvp)
+{
+	struct mtx *vlp1, *vlp2;
+
+	MPASS(cel->vlp[0] == NULL);
+	MPASS(cel->vlp[1] == NULL);
+	MPASS(cel->vlp[2] == NULL);
+
+	MPASS(vp != NULL || dvp != NULL);
+
+	vlp1 = VP2VNODELOCK(vp);
+	vlp2 = VP2VNODELOCK(dvp);
+	cache_sort(&vlp1, &vlp2);
+
+	if (vlp1 != NULL) {
+		mtx_lock(vlp1);
+		cel->vlp[0] = vlp1;
+	}
+	mtx_lock(vlp2);
+	cel->vlp[1] = vlp2;
+}
+
+static void
+cache_unlock_vnodes_cel(struct celockstate *cel)
+{
+
+	MPASS(cel->vlp[0] != NULL || cel->vlp[1] != NULL);
+
+	if (cel->vlp[0] != NULL)
+		mtx_unlock(cel->vlp[0]);
+	if (cel->vlp[1] != NULL)
+		mtx_unlock(cel->vlp[1]);
+	if (cel->vlp[2] != NULL)
+		mtx_unlock(cel->vlp[2]);
+}
+
+static bool
+cache_lock_vnodes_cel_3(struct celockstate *cel, struct vnode *vp)
+{
+	struct mtx *vlp;
+	bool ret;
+
+	cache_assert_vlp_locked(cel->vlp[0]);
+	cache_assert_vlp_locked(cel->vlp[1]);
+	MPASS(cel->vlp[2] == NULL);
+
+	vlp = VP2VNODELOCK(vp);
+	if (vlp == NULL)
+		return (true);
+
+	ret = true;
+	if (vlp >= cel->vlp[1]) {
+		mtx_lock(vlp);
+	} else {
+		if (mtx_trylock(vlp))
+			goto out;
+		cache_lock_vnodes_cel_3_failures++;
+		cache_unlock_vnodes_cel(cel);
+		if (vlp < cel->vlp[0]) {
+			mtx_lock(vlp);
+			mtx_lock(cel->vlp[0]);
+			mtx_lock(cel->vlp[1]);
+		} else {
+			if (cel->vlp[0] != NULL)
+				mtx_lock(cel->vlp[0]);
+			mtx_lock(vlp);
+			mtx_lock(cel->vlp[1]);
+		}
+		ret = false;
+	}
+out:
+	cel->vlp[2] = vlp;
+	return (ret);
+}
+
+static void
+cache_lock_buckets_cel(struct celockstate *cel, struct rwlock *blp1,
+    struct rwlock *blp2)
+{
+
+	MPASS(cel->blp[0] == NULL);
+	MPASS(cel->blp[1] == NULL);
+
+	cache_sort(&blp1, &blp2);
+
+	if (blp1 != NULL) {
+		rw_wlock(blp1);
+		cel->blp[0] = blp1;
+	}
+	rw_wlock(blp2);
+	cel->blp[1] = blp2;
+}
+
+static void
+cache_unlock_buckets_cel(struct celockstate *cel)
+{
+
+	if (cel->blp[0] != NULL)
+		rw_wunlock(cel->blp[0]);
+	rw_wunlock(cel->blp[1]);
+}
+
 /*
+ * Lock part of the cache affected by the insertion.
+ *
+ * This means vnodelocks for dvp, vp and the relevant bucketlock.
+ * However, insertion can result in removal of an old entry. In this
+ * case we have an additional vnode and bucketlock pair to lock. If the
+ * entry is negative, ncelock is locked instead of the vnode.
+ *
+ * That is, in the worst case we have to lock 3 vnodes and 2 bucketlocks, while
+ * preserving the locking order (smaller address first).
+ */
+static void
+cache_enter_lock(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
+    uint32_t hash)
+{
+	struct namecache *ncp;
+	struct rwlock *blps[2];
+
+	blps[0] = HASH2BUCKETLOCK(hash);
+	for (;;) {
+		blps[1] = NULL;
+		cache_lock_vnodes_cel(cel, dvp, vp);
+		if (vp == NULL || vp->v_type != VDIR)
+			break;
+		ncp = vp->v_cache_dd;
+		if (ncp == NULL)
+			break;
+		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
+			break;
+		MPASS(ncp->nc_dvp == vp);
+		blps[1] = NCP2BUCKETLOCK(ncp);
+		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
+			break;
+		/*
+		 * All vnodes got re-locked. Re-validate the state and if
+		 * nothing changed we are done. Otherwise restart.
+		 */
+		if (ncp == vp->v_cache_dd &&
+		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
+		    blps[1] == NCP2BUCKETLOCK(ncp) &&
+		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
+			break;
+		cache_unlock_vnodes_cel(cel);
+		cel->vlp[0] = NULL;
+		cel->vlp[1] = NULL;
+		cel->vlp[2] = NULL;
+	}
+	cache_lock_buckets_cel(cel, blps[0], blps[1]);
+}
+
+static void
+cache_enter_lock_dd(struct celockstate *cel, struct vnode *dvp, struct vnode *vp,
+    uint32_t hash)
+{
+	struct namecache *ncp;
+	struct rwlock *blps[2];
+
+	blps[0] = HASH2BUCKETLOCK(hash);
+	for (;;) {
+		blps[1] = NULL;
+		cache_lock_vnodes_cel(cel, dvp, vp);
+		ncp = dvp->v_cache_dd;
+		if (ncp == NULL)
+			break;
+		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
+			break;
+		MPASS(ncp->nc_dvp == dvp);
+		blps[1] = NCP2BUCKETLOCK(ncp);
+		if (cache_lock_vnodes_cel_3(cel, ncp->nc_vp))
+			break;
+		if (ncp == dvp->v_cache_dd &&
+		    (ncp->nc_flag & NCF_ISDOTDOT) != 0 &&
+		    blps[1] == NCP2BUCKETLOCK(ncp) &&
+		    VP2VNODELOCK(ncp->nc_vp) == cel->vlp[2])
+			break;
+		cache_unlock_vnodes_cel(cel);
+		cel->vlp[0] = NULL;
+		cel->vlp[1] = NULL;
+		cel->vlp[2] = NULL;
+	}
+	cache_lock_buckets_cel(cel, blps[0], blps[1]);
+}
+
+static void
+cache_enter_unlock(struct celockstate *cel)
+{
+
+	cache_unlock_buckets_cel(cel);
+	cache_unlock_vnodes_cel(cel);
+}
+
+/*
  * Add an entry to the cache.
  */
 void
 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
     struct timespec *tsp, struct timespec *dtsp)
 {
-	struct rwlock *bucketlock;
-	struct namecache *ncp, *n2, *ndd, *nneg;
+	struct celockstate cel;
+	struct namecache *ncp, *n2, *ndd;
 	struct namecache_ts *n3;
 	struct nchashhead *ncpp;
 	uint32_t hash;
 	int flag;
 	int len;
 
 	CTR3(KTR_VFS, "cache_enter(%p, %p, %s)", dvp, vp, cnp->cn_nameptr);
 	VNASSERT(vp == NULL || (vp->v_iflag & VI_DOOMED) == 0, vp,
 	    ("cache_enter: Adding a doomed vnode"));
 	VNASSERT(dvp == NULL || (dvp->v_iflag & VI_DOOMED) == 0, dvp,
 	    ("cache_enter: Doomed vnode used as src"));
 
 	if (!doingcache)
 		return;
 
 	/*
 	 * Avoid blowout in namecache entries.
 	 */
 	if (numcache >= desiredvnodes * ncsizefactor)
 		return;
 
-	ndd = nneg = NULL;
+	cache_celockstate_init(&cel);
+	ndd = NULL;
 	flag = 0;
 	if (cnp->cn_nameptr[0] == '.') {
 		if (cnp->cn_namelen == 1)
 			return;
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
-			CACHE_WLOCK();
+			len = cnp->cn_namelen;
+			hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
+			cache_enter_lock_dd(&cel, dvp, vp, hash);
 			/*
 			 * If dotdot entry already exists, just retarget it
 			 * to new parent vnode, otherwise continue with new
 			 * namecache entry allocation.
 			 */
 			if ((ncp = dvp->v_cache_dd) != NULL &&
 			    ncp->nc_flag & NCF_ISDOTDOT) {
 				KASSERT(ncp->nc_dvp == dvp,
 				    ("wrong isdotdot parent"));
 				if (ncp->nc_vp != NULL) {
 					TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
 					    ncp, nc_dst);
 				} else {
-					cache_negative_remove(ncp);
+					cache_negative_remove(ncp, false);
 				}
 				if (vp != NULL) {
 					TAILQ_INSERT_HEAD(&vp->v_cache_dst,
 					    ncp, nc_dst);
 				} else {
 					cache_negative_insert(ncp);
 				}
 				ncp->nc_vp = vp;
-				CACHE_WUNLOCK();
+				cache_enter_unlock(&cel);
 				return;
 			}
 			dvp->v_cache_dd = NULL;
+			cache_enter_unlock(&cel);
+			cache_celockstate_init(&cel);
 			SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp);
-			CACHE_WUNLOCK();
 			flag = NCF_ISDOTDOT;
 		}
 	}
 
 	/*
 	 * Calculate the hash key and setup as much of the new
 	 * namecache entry as possible before acquiring the lock.
 	 */
 	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
 	ncp->nc_vp = vp;
 	ncp->nc_dvp = dvp;
 	ncp->nc_flag = flag;
 	if (tsp != NULL) {
 		n3 = (struct namecache_ts *)ncp;
 		n3->nc_time = *tsp;
 		n3->nc_ticks = ticks;
 		n3->nc_flag |= NCF_TS;
 		if (dtsp != NULL) {
 			n3->nc_dotdottime = *dtsp;
 			n3->nc_flag |= NCF_DTS;
 		}
 	}
 	len = ncp->nc_nlen = cnp->cn_namelen;
 	hash = cache_get_hash(cnp->cn_nameptr, len, dvp);
 	strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1);
-	CACHE_WLOCK();
+	cache_enter_lock(&cel, dvp, vp, hash);
 
 	/*
 	 * See if this vnode or negative entry is already in the cache
 	 * with this name.  This can happen with concurrent lookups of
 	 * the same path name.
 	 */
 	ncpp = NCHHASH(hash);
 	LIST_FOREACH(n2, ncpp, nc_hash) {
 		if (n2->nc_dvp == dvp &&
 		    n2->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) {
 			if (tsp != NULL) {
 				KASSERT((n2->nc_flag & NCF_TS) != 0,
 				    ("no NCF_TS"));
 				n3 = (struct namecache_ts *)n2;
 				n3->nc_time =
 				    ((struct namecache_ts *)ncp)->nc_time;
 				n3->nc_ticks =
 				    ((struct namecache_ts *)ncp)->nc_ticks;
 				if (dtsp != NULL) {
 					n3->nc_dotdottime =
 					    ((struct namecache_ts *)ncp)->
 					    nc_dotdottime;
 					n3->nc_flag |= NCF_DTS;
 				}
 			}
-			CACHE_WUNLOCK();
-			cache_free(ncp);
-			return;
+			goto out_unlock_free;
 		}
 	}
 
 	if (flag == NCF_ISDOTDOT) {
 		/*
 		 * See if we are trying to add .. entry, but some other lookup
 		 * has populated v_cache_dd pointer already.
 		 */
-		if (dvp->v_cache_dd != NULL) {
-			CACHE_WUNLOCK();
-			cache_free(ncp);
-			return;
-		}
+		if (dvp->v_cache_dd != NULL)
+			goto out_unlock_free;
 		KASSERT(vp == NULL || vp->v_type == VDIR,
 		    ("wrong vnode type %p", vp));
 		dvp->v_cache_dd = ncp;
 	}
 
-	numcache++;
+	atomic_add_rel_long(&numcache, 1);
 	if (vp != NULL) {
 		if (vp->v_type == VDIR) {
 			if (flag != NCF_ISDOTDOT) {
 				/*
 				 * For this case, the cache entry maps both the
 				 * directory name in it and the name ".." for the
 				 * directory's parent.
 				 */
 				if ((ndd = vp->v_cache_dd) != NULL) {
 					if ((ndd->nc_flag & NCF_ISDOTDOT) != 0)
-						cache_zap(ndd);
+						cache_zap_locked(ndd, false);
 					else
 						ndd = NULL;
 				}
 				vp->v_cache_dd = ncp;
 			}
 		} else {
 			vp->v_cache_dd = NULL;
 		}
 	}
 
 	if (flag != NCF_ISDOTDOT) {
 		if (LIST_EMPTY(&dvp->v_cache_src)) {
 			vhold(dvp);
-			numcachehv++;
+			atomic_add_rel_long(&numcachehv, 1);
 		}
 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 	}
 
-	bucketlock = HASH2BUCKETLOCK(hash);
-	rw_wlock(bucketlock);
-
 	/*
 	 * Insert the new namecache entry into the appropriate chain
 	 * within the cache entries table.
 	 */
 	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 
 	/*
 	 * If the entry is "negative", we place it into the
 	 * "negative" cache queue, otherwise, we place it into the
 	 * destination vnode's cache entries queue.
 	 */
 	if (vp != NULL) {
 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
 		SDT_PROBE3(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
 		    vp);
 	} else {
 		if (cnp->cn_flags & ISWHITEOUT)
 			ncp->nc_flag |= NCF_WHITE;
 		cache_negative_insert(ncp);
 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 		    nc_get_name(ncp));
 	}
-	rw_wunlock(bucketlock);
+	cache_enter_unlock(&cel);
 	if (numneg * ncnegfactor > numcache)
-		nneg = cache_negative_zap_one();
-	CACHE_WUNLOCK();
+		cache_negative_zap_one();
 	cache_free(ndd);
-	cache_free(nneg);
+	return;
+out_unlock_free:
+	cache_enter_unlock(&cel);
+	cache_free(ncp);
+	return;
 }
 
 static u_int
 cache_roundup_2(u_int val)
 {
 	u_int res;
 
 	for (res = 1; res <= val; res <<= 1)
 		continue;
 
 	return (res);
 }
 
 /*
  * Name cache initialization, from vfs_init() when we are booting
  */
 static void
 nchinit(void *dummy __unused)
 {
 	u_int i;
 
 	TAILQ_INIT(&ncneg);
 
 	cache_zone_small = uma_zcreate("S VFS Cache",
 	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
 	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cache_zone_large = uma_zcreate("L VFS Cache",
 	    sizeof(struct namecache) + NAME_MAX + 1,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 	cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
 	    sizeof(struct namecache_ts) + NAME_MAX + 1,
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 
 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
-	numbucketlocks = cache_roundup_2(mp_ncpus * 16);
-	if (numbucketlocks > nchash)
-		numbucketlocks = nchash;
+	numbucketlocks = cache_roundup_2(mp_ncpus * 64);
 	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
 	    M_WAITOK | M_ZERO);
 	for (i = 0; i < numbucketlocks; i++)
-		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK);
+		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK | RW_RECURSE);
+	numvnodelocks = cache_roundup_2(mp_ncpus * 64);
+	vnodelocks = malloc(sizeof(*vnodelocks) * numvnodelocks, M_VFSCACHE,
+	    M_WAITOK | M_ZERO);
+	for (i = 0; i < numvnodelocks; i++)
+		mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
 
 	numcalls = counter_u64_alloc(M_WAITOK);
 	dothits = counter_u64_alloc(M_WAITOK);
 	dotdothits = counter_u64_alloc(M_WAITOK);
 	numchecks = counter_u64_alloc(M_WAITOK);
 	nummiss = counter_u64_alloc(M_WAITOK);
 	nummisszap = counter_u64_alloc(M_WAITOK);
 	numposzaps = counter_u64_alloc(M_WAITOK);
 	numposhits = counter_u64_alloc(M_WAITOK);
 	numnegzaps = counter_u64_alloc(M_WAITOK);
 	numneghits = counter_u64_alloc(M_WAITOK);
 	numfullpathcalls = counter_u64_alloc(M_WAITOK);
 	numfullpathfail1 = counter_u64_alloc(M_WAITOK);
 	numfullpathfail2 = counter_u64_alloc(M_WAITOK);
 	numfullpathfail4 = counter_u64_alloc(M_WAITOK);
 	numfullpathfound = counter_u64_alloc(M_WAITOK);
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
 
 void
 cache_changesize(int newmaxvnodes)
 {
 	struct nchashhead *new_nchashtbl, *old_nchashtbl;
 	u_long new_nchash, old_nchash;
 	struct namecache *ncp;
 	uint32_t hash;
 	int i;
 
 	new_nchashtbl = hashinit(newmaxvnodes * 2, M_VFSCACHE, &new_nchash);
 	/* If same hash table size, nothing to do */
 	if (nchash == new_nchash) {
 		free(new_nchashtbl, M_VFSCACHE);
 		return;
 	}
 	/*
 	 * Move everything from the old hash table to the new table.
 	 * None of the namecache entries in the table can be removed
 	 * because to do so, they have to be removed from the hash table.
 	 */
-	CACHE_WLOCK();
+	cache_lock_all_vnodes();
 	cache_lock_all_buckets();
 	old_nchashtbl = nchashtbl;
 	old_nchash = nchash;
 	nchashtbl = new_nchashtbl;
 	nchash = new_nchash;
 	for (i = 0; i <= old_nchash; i++) {
 		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
 			hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen,
 			    ncp->nc_dvp);
 			LIST_REMOVE(ncp, nc_hash);
 			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
 		}
 	}
 	cache_unlock_all_buckets();
-	CACHE_WUNLOCK();
+	cache_unlock_all_vnodes();
 	free(old_nchashtbl, M_VFSCACHE);
 }
 
 /*
  * Invalidate all entries to a particular vnode.
  */
 void
 cache_purge(struct vnode *vp)
 {
 	TAILQ_HEAD(, namecache) ncps;
 	struct namecache *ncp, *nnp;
+	struct mtx *vlp, *vlp2;
 
 	CTR1(KTR_VFS, "cache_purge(%p)", vp);
 	SDT_PROBE1(vfs, namecache, purge, done, vp);
+	if (LIST_EMPTY(&vp->v_cache_src) && TAILQ_EMPTY(&vp->v_cache_dst) &&
+	    vp->v_cache_dd == NULL)
+		return;
 	TAILQ_INIT(&ncps);
-	CACHE_WLOCK();
+	vlp = VP2VNODELOCK(vp);
+	vlp2 = NULL;
+	mtx_lock(vlp);
+retry:
 	while (!LIST_EMPTY(&vp->v_cache_src)) {
 		ncp = LIST_FIRST(&vp->v_cache_src);
-		cache_zap(ncp);
+		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
+			goto retry;
 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 	}
 	while (!TAILQ_EMPTY(&vp->v_cache_dst)) {
 		ncp = TAILQ_FIRST(&vp->v_cache_dst);
-		cache_zap(ncp);
+		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
+			goto retry;
 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 	}
-	if (vp->v_cache_dd != NULL) {
-		ncp = vp->v_cache_dd;
+	ncp = vp->v_cache_dd;
+	if (ncp != NULL) {
 		KASSERT(ncp->nc_flag & NCF_ISDOTDOT,
 		   ("lost dotdot link"));
-		cache_zap(ncp);
+		if (!cache_zap_locked_vnode_kl2(ncp, vp, &vlp2))
+			goto retry;
 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 	}
 	KASSERT(vp->v_cache_dd == NULL, ("incomplete purge"));
-	CACHE_WUNLOCK();
+	mtx_unlock(vlp);
+	if (vlp2 != NULL)
+		mtx_unlock(vlp2);
 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 		cache_free(ncp);
 	}
 }
 
 /*
  * Invalidate all negative entries for a particular directory vnode.
  */
 void
 cache_purge_negative(struct vnode *vp)
 {
 	TAILQ_HEAD(, namecache) ncps;
 	struct namecache *ncp, *nnp;
+	struct mtx *vlp;
 
 	CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
 	SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
 	TAILQ_INIT(&ncps);
-	CACHE_WLOCK();
+	vlp = VP2VNODELOCK(vp);
+	mtx_lock(vlp);
 	LIST_FOREACH_SAFE(ncp, &vp->v_cache_src, nc_src, nnp) {
 		if (ncp->nc_vp != NULL)
 			continue;
-		cache_zap(ncp);
+		cache_zap_negative_locked_vnode_kl(ncp, vp);
 		TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
 	}
-	CACHE_WUNLOCK();
+	mtx_unlock(vlp);
 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 		cache_free(ncp);
 	}
 }
 
 /*
  * Flush all entries referencing a particular filesystem.
  */
 void
 cache_purgevfs(struct mount *mp)
 {
 	TAILQ_HEAD(, namecache) ncps;
-	struct rwlock *bucketlock;
+	struct mtx *vlp1, *vlp2;
+	struct rwlock *blp;
 	struct nchashhead *bucket;
 	struct namecache *ncp, *nnp;
 	u_long i, j, n_nchash;
+	int error;
 
 	/* Scan hash tables for applicable entries */
 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
 	TAILQ_INIT(&ncps);
-	CACHE_WLOCK();
 	n_nchash = nchash + 1;
+	vlp1 = vlp2 = NULL;
 	for (i = 0; i < numbucketlocks; i++) {
-		bucketlock = (struct rwlock *)&bucketlocks[i];
-		rw_wlock(bucketlock);
+		blp = (struct rwlock *)&bucketlocks[i];
+		rw_wlock(blp);
 		for (j = i; j < n_nchash; j += numbucketlocks) {
+retry:
 			bucket = &nchashtbl[j];
 			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
 				cache_assert_bucket_locked(ncp, RA_WLOCKED);
 				if (ncp->nc_dvp->v_mount != mp)
 					continue;
-				cache_zap_locked(ncp);
+				error = cache_zap_wlocked_bucket_kl(ncp, blp,
+				    &vlp1, &vlp2);
+				if (error != 0)
+					goto retry;
 				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
 			}
 		}
-		rw_wunlock(bucketlock);
+		rw_wunlock(blp);
+		if (vlp1 == NULL && vlp2 == NULL)
+			cache_maybe_yield();
 	}
-	CACHE_WUNLOCK();
+	if (vlp1 != NULL)
+		mtx_unlock(vlp1);
+	if (vlp2 != NULL)
+		mtx_unlock(vlp2);
+
 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
 		cache_free(ncp);
 	}
 }
 
 /*
  * Perform canonical checks and cache lookup and pass on to filesystem
  * through the vop_cachedlookup only if needed.
  */
 
 int
 vfs_cache_lookup(struct vop_lookup_args *ap)
 {
 	struct vnode *dvp;
 	int error;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct ucred *cred = cnp->cn_cred;
 	int flags = cnp->cn_flags;
 	struct thread *td = cnp->cn_thread;
 
 	*vpp = NULL;
 	dvp = ap->a_dvp;
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
 	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
 		return (EROFS);
 
 	error = VOP_ACCESS(dvp, VEXEC, cred, td);
 	if (error)
 		return (error);
 
 	error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
 	if (error == 0)
 		return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
 	if (error == -1)
 		return (0);
 	return (error);
 }
 
 /*
  * XXX All of these sysctls would probably be more productive dead.
  */
 static int disablecwd;
 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable the getcwd syscall");
 
 /* Implementation of the getcwd syscall. */
 int
 sys___getcwd(struct thread *td, struct __getcwd_args *uap)
 {
 
 	return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
 	    MAXPATHLEN));
 }
 
 int
 kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, u_int buflen,
     u_int path_max)
 {
 	char *bp, *tmpbuf;
 	struct filedesc *fdp;
 	struct vnode *cdir, *rdir;
 	int error;
 
 	if (disablecwd)
 		return (ENODEV);
 	if (buflen < 2)
 		return (EINVAL);
 	if (buflen > path_max)
 		buflen = path_max;
 
 	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_SLOCK(fdp);
 	cdir = fdp->fd_cdir;
 	VREF(cdir);
 	rdir = fdp->fd_rdir;
 	VREF(rdir);
 	FILEDESC_SUNLOCK(fdp);
 	error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
 	vrele(rdir);
 	vrele(cdir);
 
 	if (!error) {
 		if (bufseg == UIO_SYSSPACE)
 			bcopy(bp, buf, strlen(bp) + 1);
 		else
 			error = copyout(bp, buf, strlen(bp) + 1);
 #ifdef KTRACE
 	if (KTRPOINT(curthread, KTR_NAMEI))
 		ktrnamei(bp);
 #endif
 	}
 	free(tmpbuf, M_TEMP);
 	return (error);
 }
 
 /*
  * Thus begins the fullpath magic.
  */
 
 static int disablefullpath;
 SYSCTL_INT(_debug, OID_AUTO, disablefullpath, CTLFLAG_RW, &disablefullpath, 0,
     "Disable the vn_fullpath function");
 
 /*
  * Retrieve the full filesystem path that correspond to a vnode from the name
  * cache (if available)
  */
 int
 vn_fullpath(struct thread *td, struct vnode *vn, char **retbuf, char **freebuf)
 {
 	char *buf;
 	struct filedesc *fdp;
 	struct vnode *rdir;
 	int error;
 
 	if (disablefullpath)
 		return (ENODEV);
 	if (vn == NULL)
 		return (EINVAL);
 
 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	fdp = td->td_proc->p_fd;
 	FILEDESC_SLOCK(fdp);
 	rdir = fdp->fd_rdir;
 	VREF(rdir);
 	FILEDESC_SUNLOCK(fdp);
 	error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
 	vrele(rdir);
 
 	if (!error)
 		*freebuf = buf;
 	else
 		free(buf, M_TEMP);
 	return (error);
 }
 
 /*
  * This function is similar to vn_fullpath, but it attempts to lookup the
  * pathname relative to the global root mount point.  This is required for the
  * auditing sub-system, as audited pathnames must be absolute, relative to the
  * global root mount point.
  */
 int
 vn_fullpath_global(struct thread *td, struct vnode *vn,
     char **retbuf, char **freebuf)
 {
 	char *buf;
 	int error;
 
 	if (disablefullpath)
 		return (ENODEV);
 	if (vn == NULL)
 		return (EINVAL);
 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	error = vn_fullpath1(td, vn, rootvnode, buf, retbuf, MAXPATHLEN);
 	if (!error)
 		*freebuf = buf;
 	else
 		free(buf, M_TEMP);
 	return (error);
 }
 
 int
 vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
 {
-	int error;
-
-	CACHE_RLOCK();
-	error = vn_vptocnp_locked(vp, cred, buf, buflen);
-	if (error == 0)
-		CACHE_RUNLOCK();
-	return (error);
-}
-
-static int
-vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
-    u_int *buflen)
-{
 	struct vnode *dvp;
 	struct namecache *ncp;
+	struct mtx *vlp;
 	int error;
 
+	vlp = VP2VNODELOCK(*vp);
+	mtx_lock(vlp);
 	TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 	}
 	if (ncp != NULL) {
 		if (*buflen < ncp->nc_nlen) {
-			CACHE_RUNLOCK();
+			mtx_unlock(vlp);
 			vrele(*vp);
 			counter_u64_add(numfullpathfail4, 1);
 			error = ENOMEM;
 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
 			    vp, NULL);
 			return (error);
 		}
 		*buflen -= ncp->nc_nlen;
 		memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen);
 		SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
 		    nc_get_name(ncp), vp);
 		dvp = *vp;
 		*vp = ncp->nc_dvp;
 		vref(*vp);
-		CACHE_RUNLOCK();
+		mtx_unlock(vlp);
 		vrele(dvp);
-		CACHE_RLOCK();
 		return (0);
 	}
 	SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
 
-	CACHE_RUNLOCK();
+	mtx_unlock(vlp);
 	vn_lock(*vp, LK_SHARED | LK_RETRY);
 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
 	vput(*vp);
 	if (error) {
 		counter_u64_add(numfullpathfail2, 1);
 		SDT_PROBE3(vfs, namecache, fullpath, return,  error, vp, NULL);
 		return (error);
 	}
 
 	*vp = dvp;
-	CACHE_RLOCK();
 	if (dvp->v_iflag & VI_DOOMED) {
 		/* forced unmount */
-		CACHE_RUNLOCK();
 		vrele(dvp);
 		error = ENOENT;
 		SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
 		return (error);
 	}
 	/*
 	 * *vp has its use count incremented still.
 	 */
 
 	return (0);
 }
 
 /*
  * The magic behind kern___getcwd() and vn_fullpath().
  */
 static int
 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, u_int buflen)
 {
 	int error, slash_prefixed;
 #ifdef KDTRACE_HOOKS
 	struct vnode *startvp = vp;
 #endif
 	struct vnode *vp1;
 
 	buflen--;
 	buf[buflen] = '\0';
 	error = 0;
 	slash_prefixed = 0;
 
 	SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
 	counter_u64_add(numfullpathcalls, 1);
 	vref(vp);
-	CACHE_RLOCK();
 	if (vp->v_type != VDIR) {
-		error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
+		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 		if (error)
 			return (error);
 		if (buflen == 0) {
-			CACHE_RUNLOCK();
 			vrele(vp);
 			return (ENOMEM);
 		}
 		buf[--buflen] = '/';
 		slash_prefixed = 1;
 	}
 	while (vp != rdir && vp != rootvnode) {
 		if (vp->v_vflag & VV_ROOT) {
 			if (vp->v_iflag & VI_DOOMED) {	/* forced unmount */
-				CACHE_RUNLOCK();
 				vrele(vp);
 				error = ENOENT;
 				SDT_PROBE3(vfs, namecache, fullpath, return,
 				    error, vp, NULL);
 				break;
 			}
 			vp1 = vp->v_mount->mnt_vnodecovered;
 			vref(vp1);
-			CACHE_RUNLOCK();
 			vrele(vp);
 			vp = vp1;
-			CACHE_RLOCK();
 			continue;
 		}
 		if (vp->v_type != VDIR) {
-			CACHE_RUNLOCK();
 			vrele(vp);
 			counter_u64_add(numfullpathfail1, 1);
 			error = ENOTDIR;
 			SDT_PROBE3(vfs, namecache, fullpath, return,
 			    error, vp, NULL);
 			break;
 		}
-		error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
+		error = vn_vptocnp(&vp, td->td_ucred, buf, &buflen);
 		if (error)
 			break;
 		if (buflen == 0) {
-			CACHE_RUNLOCK();
 			vrele(vp);
 			error = ENOMEM;
 			SDT_PROBE3(vfs, namecache, fullpath, return, error,
 			    startvp, NULL);
 			break;
 		}
 		buf[--buflen] = '/';
 		slash_prefixed = 1;
 	}
 	if (error)
 		return (error);
 	if (!slash_prefixed) {
 		if (buflen == 0) {
-			CACHE_RUNLOCK();
 			vrele(vp);
 			counter_u64_add(numfullpathfail4, 1);
 			SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
 			    startvp, NULL);
 			return (ENOMEM);
 		}
 		buf[--buflen] = '/';
 	}
 	counter_u64_add(numfullpathfound, 1);
-	CACHE_RUNLOCK();
 	vrele(vp);
 
 	SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
 	*retbuf = buf + buflen;
 	return (0);
 }
 
 struct vnode *
 vn_dir_dd_ino(struct vnode *vp)
 {
 	struct namecache *ncp;
 	struct vnode *ddvp;
+	struct mtx *vlp;
 
 	ASSERT_VOP_LOCKED(vp, "vn_dir_dd_ino");
-	CACHE_RLOCK();
+	vlp = VP2VNODELOCK(vp);
+	mtx_lock(vlp);
 	TAILQ_FOREACH(ncp, &(vp->v_cache_dst), nc_dst) {
 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 			continue;
 		ddvp = ncp->nc_dvp;
 		vhold(ddvp);
-		CACHE_RUNLOCK();
+		mtx_unlock(vlp);
 		if (vget(ddvp, LK_SHARED | LK_NOWAIT | LK_VNHELD, curthread))
 			return (NULL);
 		return (ddvp);
 	}
-	CACHE_RUNLOCK();
+	mtx_unlock(vlp);
 	return (NULL);
 }
 
 int
 vn_commname(struct vnode *vp, char *buf, u_int buflen)
 {
 	struct namecache *ncp;
+	struct mtx *vlp;
 	int l;
 
-	CACHE_RLOCK();
+	vlp = VP2VNODELOCK(vp);
+	mtx_lock(vlp);
 	TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst)
 		if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
 			break;
 	if (ncp == NULL) {
-		CACHE_RUNLOCK();
+		mtx_unlock(vlp);
 		return (ENOENT);
 	}
 	l = min(ncp->nc_nlen, buflen - 1);
 	memcpy(buf, nc_get_name(ncp), l);
-	CACHE_RUNLOCK();
+	mtx_unlock(vlp);
 	buf[l] = '\0';
 	return (0);
 }
 
 /* ABI compat shims for old kernel modules. */
 #undef cache_enter
 
 void	cache_enter(struct vnode *dvp, struct vnode *vp,
 	    struct componentname *cnp);
 
 void
 cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
 {
 
 	cache_enter_time(dvp, vp, cnp, NULL, NULL);
 }
 
 /*
  * This function updates path string to vnode's full global path
  * and checks the size of the new path string against the pathlen argument.
  *
  * Requires a locked, referenced vnode.
  * Vnode is re-locked on success or ENODEV, otherwise unlocked.
  *
  * If sysctl debug.disablefullpath is set, ENODEV is returned,
  * vnode is left locked and path remain untouched.
  *
  * If vp is a directory, the call to vn_fullpath_global() always succeeds
  * because it falls back to the ".." lookup if the namecache lookup fails.
  */
 int
 vn_path_to_global_path(struct thread *td, struct vnode *vp, char *path,
     u_int pathlen)
 {
 	struct nameidata nd;
 	struct vnode *vp1;
 	char *rpath, *fbuf;
 	int error;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
 
 	/* Return ENODEV if sysctl debug.disablefullpath==1 */
 	if (disablefullpath)
 		return (ENODEV);
 
 	/* Construct global filesystem path from vp. */
 	VOP_UNLOCK(vp, 0);
 	error = vn_fullpath_global(td, vp, &rpath, &fbuf);
 
 	if (error != 0) {
 		vrele(vp);
 		return (error);
 	}
 
 	if (strlen(rpath) >= pathlen) {
 		vrele(vp);
 		error = ENAMETOOLONG;
 		goto out;
 	}
 
 	/*
 	 * Re-lookup the vnode by path to detect a possible rename.
 	 * As a side effect, the vnode is relocked.
 	 * If vnode was renamed, return ENOENT.
 	 */
 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
 	    UIO_SYSSPACE, path, td);
 	error = namei(&nd);
 	if (error != 0) {
 		vrele(vp);
 		goto out;
 	}
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp1 = nd.ni_vp;
 	vrele(vp);
 	if (vp1 == vp)
 		strcpy(path, rpath);
 	else {
 		vput(vp1);
 		error = ENOENT;
 	}
 
 out:
 	free(fbuf, M_TEMP);
 	return (error);
 }
Index: user/alc/PQ_LAUNDRY/sys/sys/filedesc.h
===================================================================
--- user/alc/PQ_LAUNDRY/sys/sys/filedesc.h	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/sys/filedesc.h	(revision 306283)
@@ -1,246 +1,248 @@
 /*-
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)filedesc.h	8.1 (Berkeley) 6/2/93
  * $FreeBSD$
  */
 
 #ifndef _SYS_FILEDESC_H_
 #define	_SYS_FILEDESC_H_
 
 #include <sys/caprights.h>
 #include <sys/queue.h>
 #include <sys/event.h>
 #include <sys/lock.h>
 #include <sys/priority.h>
 #include <sys/seq.h>
 #include <sys/sx.h>
 
 #include <machine/_limits.h>
 
 struct filecaps {
 	cap_rights_t	 fc_rights;	/* per-descriptor capability rights */
 	u_long		*fc_ioctls;	/* per-descriptor allowed ioctls */
 	int16_t		 fc_nioctls;	/* fc_ioctls array size */
 	uint32_t	 fc_fcntls;	/* per-descriptor allowed fcntls */
 };
 
 struct filedescent {
 	struct file	*fde_file;	/* file structure for open file */
 	struct filecaps	 fde_caps;	/* per-descriptor rights */
 	uint8_t		 fde_flags;	/* per-process open file flags */
 	seq_t		 fde_seq;	/* keep file and caps in sync */
 };
 #define	fde_rights	fde_caps.fc_rights
 #define	fde_fcntls	fde_caps.fc_fcntls
 #define	fde_ioctls	fde_caps.fc_ioctls
 #define	fde_nioctls	fde_caps.fc_nioctls
 #define	fde_change_size	(offsetof(struct filedescent, fde_seq))
 
 struct fdescenttbl {
 	int	fdt_nfiles;		/* number of open files allocated */
 	struct	filedescent fdt_ofiles[0];	/* open files */
 };
 #define	fd_seq(fdt, fd)	(&(fdt)->fdt_ofiles[(fd)].fde_seq)
 
 /*
  * This structure is used for the management of descriptors.  It may be
  * shared by multiple processes.
  */
 #define NDSLOTTYPE	u_long
 
 struct filedesc {
 	struct	fdescenttbl *fd_files;	/* open files table */
 	struct	vnode *fd_cdir;		/* current directory */
 	struct	vnode *fd_rdir;		/* root directory */
 	struct	vnode *fd_jdir;		/* jail root directory */
 	NDSLOTTYPE *fd_map;		/* bitmap of free fds */
 	int	fd_lastfile;		/* high-water mark of fd_ofiles */
 	int	fd_freefile;		/* approx. next free file */
 	u_short	fd_cmask;		/* mask for file creation */
 	int	fd_refcnt;		/* thread reference count */
 	int	fd_holdcnt;		/* hold count on structure + mutex */
 	struct	sx fd_sx;		/* protects members of this struct */
 	struct	kqlist fd_kqlist;	/* list of kqueues on this filedesc */
 	int	fd_holdleaderscount;	/* block fdfree() for shared close() */
 	int	fd_holdleaderswakeup;	/* fdfree() needs wakeup */
 };
 
 /*
  * Structure to keep track of (process leader, struct fildedesc) tuples.
  * Each process has a pointer to such a structure when detailed tracking
  * is needed, e.g., when rfork(RFPROC | RFMEM) causes a file descriptor
  * table to be shared by processes having different "p_leader" pointers
  * and thus distinct POSIX style locks.
  *
  * fdl_refcount and fdl_holdcount are protected by struct filedesc mtx.
  */
 struct filedesc_to_leader {
 	int		fdl_refcount;	/* references from struct proc */
 	int		fdl_holdcount;	/* temporary hold during closef */
 	int		fdl_wakeup;	/* fdfree() waits on closef() */
 	struct proc	*fdl_leader;	/* owner of POSIX locks */
 	/* Circular list: */
 	struct filedesc_to_leader *fdl_prev;
 	struct filedesc_to_leader *fdl_next;
 };
 #define	fd_nfiles	fd_files->fdt_nfiles
 #define	fd_ofiles	fd_files->fdt_ofiles
 
 /*
  * Per-process open flags.
  */
 #define	UF_EXCLOSE	0x01		/* auto-close on exec */
 
 #ifdef _KERNEL
 
 /* Lock a file descriptor table. */
 #define	FILEDESC_LOCK_INIT(fdp)	sx_init(&(fdp)->fd_sx, "filedesc structure")
 #define	FILEDESC_LOCK_DESTROY(fdp)	sx_destroy(&(fdp)->fd_sx)
 #define	FILEDESC_LOCK(fdp)	(&(fdp)->fd_sx)
 #define	FILEDESC_XLOCK(fdp)	sx_xlock(&(fdp)->fd_sx)
 #define	FILEDESC_XUNLOCK(fdp)	sx_xunlock(&(fdp)->fd_sx)
 #define	FILEDESC_SLOCK(fdp)	sx_slock(&(fdp)->fd_sx)
 #define	FILEDESC_SUNLOCK(fdp)	sx_sunlock(&(fdp)->fd_sx)
 
 #define	FILEDESC_LOCK_ASSERT(fdp)	sx_assert(&(fdp)->fd_sx, SX_LOCKED | \
 					    SX_NOTRECURSED)
 #define	FILEDESC_XLOCK_ASSERT(fdp)	sx_assert(&(fdp)->fd_sx, SX_XLOCKED | \
 					    SX_NOTRECURSED)
 #define	FILEDESC_UNLOCK_ASSERT(fdp)	sx_assert(&(fdp)->fd_sx, SX_UNLOCKED)
 
 /* Operation types for kern_dup(). */
 enum {
 	FDDUP_NORMAL,		/* dup() behavior. */
 	FDDUP_FCNTL,		/* fcntl()-style errors. */
 	FDDUP_FIXED,		/* Force fixed allocation. */
 	FDDUP_MUSTREPLACE,	/* Target must exist. */
 	FDDUP_LASTMODE,
 };
 
 /* Flags for kern_dup(). */
 #define	FDDUP_FLAG_CLOEXEC	0x1	/* Atomically set UF_EXCLOSE. */
 
 /* For backward compatibility. */
 #define	falloc(td, resultfp, resultfd, flags) \
 	falloc_caps(td, resultfp, resultfd, flags, NULL)
 
 struct thread;
 
 void	filecaps_init(struct filecaps *fcaps);
 int	filecaps_copy(const struct filecaps *src, struct filecaps *dst,
 	    bool locked);
 void	filecaps_move(struct filecaps *src, struct filecaps *dst);
 void	filecaps_free(struct filecaps *fcaps);
 
 int	closef(struct file *fp, struct thread *td);
 int	dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
 	    int openerror, int *indxp);
 int	falloc_caps(struct thread *td, struct file **resultfp, int *resultfd,
 	    int flags, struct filecaps *fcaps);
 int	falloc_noinstall(struct thread *td, struct file **resultfp);
 void	_finstall(struct filedesc *fdp, struct file *fp, int fd, int flags,
 	    struct filecaps *fcaps);
 int	finstall(struct thread *td, struct file *fp, int *resultfd, int flags,
 	    struct filecaps *fcaps);
 int	fdalloc(struct thread *td, int minfd, int *result);
 int	fdallocn(struct thread *td, int minfd, int *fds, int n);
 int	fdcheckstd(struct thread *td);
 void	fdclose(struct thread *td, struct file *fp, int idx);
 void	fdcloseexec(struct thread *td);
 void	fdsetugidsafety(struct thread *td);
 struct	filedesc *fdcopy(struct filedesc *fdp);
 int	fdcopy_remapped(struct filedesc *fdp, const int *fds, size_t nfds,
 	    struct filedesc **newfdp);
 void	fdinstall_remapped(struct thread *td, struct filedesc *fdp);
 void	fdunshare(struct thread *td);
 void	fdescfree(struct thread *td);
 void	fdescfree_remapped(struct filedesc *fdp);
 struct	filedesc *fdinit(struct filedesc *fdp, bool prepfiles);
 struct	filedesc *fdshare(struct filedesc *fdp);
 struct filedesc_to_leader *
 	filedesc_to_leader_alloc(struct filedesc_to_leader *old,
 	    struct filedesc *fdp, struct proc *leader);
 int	getvnode(struct thread *td, int fd, cap_rights_t *rightsp,
 	    struct file **fpp);
 void	mountcheckdirs(struct vnode *olddp, struct vnode *newdp);
 
 int	fget_cap_locked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 	    struct file **fpp, struct filecaps *havecapsp);
 int	fget_cap(struct thread *td, int fd, cap_rights_t *needrightsp,
 	    struct file **fpp, struct filecaps *havecapsp);
 
 /* Return a referenced file from an unlocked descriptor. */
 int	fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
 	    struct file **fpp, seq_t *seqp);
 
 /* Requires a FILEDESC_{S,X}LOCK held and returns without a ref. */
 static __inline struct file *
 fget_locked(struct filedesc *fdp, int fd)
 {
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	if (fd < 0 || fd > fdp->fd_lastfile)
 		return (NULL);
 
 	return (fdp->fd_ofiles[fd].fde_file);
 }
 
 static __inline struct filedescent *
 fdeget_locked(struct filedesc *fdp, int fd)
 {
 	struct filedescent *fde;
 
 	FILEDESC_LOCK_ASSERT(fdp);
 
 	if (fd < 0 || fd > fdp->fd_lastfile)
 		return (NULL);
 
 	fde = &fdp->fd_ofiles[fd];
 	if (fde->fde_file == NULL)
 		return (NULL);
 
 	return (fde);
 }
 
+#ifdef CAPABILITIES
 static __inline bool
 fd_modified(struct filedesc *fdp, int fd, seq_t seq)
 {
 
 	return (!seq_consistent(fd_seq(fdp->fd_files, fd), seq));
 }
+#endif
 
 /* cdir/rdir/jdir manipulation functions. */
 void	pwd_chdir(struct thread *td, struct vnode *vp);
 int	pwd_chroot(struct thread *td, struct vnode *vp);
 void	pwd_ensure_dirs(void);
 
 #endif /* _KERNEL */
 
 #endif /* !_SYS_FILEDESC_H_ */
Index: user/alc/PQ_LAUNDRY/sys/sys/param.h
===================================================================
--- user/alc/PQ_LAUNDRY/sys/sys/param.h	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/sys/param.h	(revision 306283)
@@ -1,363 +1,363 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)param.h	8.3 (Berkeley) 4/4/95
  * $FreeBSD$
  */
 
 #ifndef _SYS_PARAM_H_
 #define _SYS_PARAM_H_
 
 #include <sys/_null.h>
 
 #define	BSD	199506		/* System version (year & month). */
 #define BSD4_3	1
 #define BSD4_4	1
 
 /* 
  * __FreeBSD_version numbers are documented in the Porter's Handbook.
  * If you bump the version for any reason, you should update the documentation
  * there.
  * Currently this lives here in the doc/ repository:
  *
  *	head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml
  *
  * scheme is:  <major><two digit minor>Rxx
  *		'R' is in the range 0 to 4 if this is a release branch or
  *		x.0-CURRENT before RELENG_*_0 is created, otherwise 'R' is
  *		in the range 5 to 9.
  */
 #undef __FreeBSD_version
-#define __FreeBSD_version 1200009	/* Master, propagated to newvers */
+#define __FreeBSD_version 1200010	/* Master, propagated to newvers */
 
 /*
  * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,
  * which by definition is always true on FreeBSD. This macro is also defined
  * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD.
  *
  * It is tempting to use this macro in userland code when we want to enable
  * kernel-specific routines, and in fact it's fine to do this in code that
  * is part of FreeBSD itself.  However, be aware that as presence of this
  * macro is still not widespread (e.g. older FreeBSD versions, 3rd party
  * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in
  * external applications without also checking for __FreeBSD__ as an
  * alternative.
  */
 #undef __FreeBSD_kernel__
 #define __FreeBSD_kernel__
 
 #ifdef _KERNEL
 #define	P_OSREL_SIGWAIT			700000
 #define	P_OSREL_SIGSEGV			700004
 #define	P_OSREL_MAP_ANON		800104
 #define	P_OSREL_MAP_FSTRICT		1100036
 #define	P_OSREL_SHUTDOWN_ENOTCONN	1100077
 
 #define	P_OSREL_MAJOR(x)		((x) / 100000)
 #endif
 
 #ifndef LOCORE
 #include <sys/types.h>
 #endif
 
 /*
  * Machine-independent constants (some used in following include files).
  * Redefined constants are from POSIX 1003.1 limits file.
  *
  * MAXCOMLEN should be >= sizeof(ac_comm) (see <acct.h>)
  */
 #include <sys/syslimits.h>
 
 #define	MAXCOMLEN	19		/* max command name remembered */
 #define	MAXINTERP	PATH_MAX	/* max interpreter file name length */
 #define	MAXLOGNAME	33		/* max login name length (incl. NUL) */
 #define	MAXUPRC		CHILD_MAX	/* max simultaneous processes */
 #define	NCARGS		ARG_MAX		/* max bytes for an exec function */
 #define	NGROUPS		(NGROUPS_MAX+1)	/* max number groups */
 #define	NOFILE		OPEN_MAX	/* max open files per process */
 #define	NOGROUP		65535		/* marker for empty group set member */
 #define MAXHOSTNAMELEN	256		/* max hostname size */
 #define SPECNAMELEN	63		/* max length of devicename */
 
 /* More types and definitions used throughout the kernel. */
 #ifdef _KERNEL
 #include <sys/cdefs.h>
 #include <sys/errno.h>
 #ifndef LOCORE
 #include <sys/time.h>
 #include <sys/priority.h>
 #endif
 
 #ifndef FALSE
 #define	FALSE	0
 #endif
 #ifndef TRUE
 #define	TRUE	1
 #endif
 #endif
 
 #ifndef _KERNEL
 /* Signals. */
 #include <sys/signal.h>
 #endif
 
 /* Machine type dependent parameters. */
 #include <machine/param.h>
 #ifndef _KERNEL
 #include <sys/limits.h>
 #endif
 
 #ifndef DEV_BSHIFT
 #define	DEV_BSHIFT	9		/* log2(DEV_BSIZE) */
 #endif
 #define	DEV_BSIZE	(1<<DEV_BSHIFT)
 
 #ifndef BLKDEV_IOSIZE
 #define BLKDEV_IOSIZE  PAGE_SIZE	/* default block device I/O size */
 #endif
 #ifndef DFLTPHYS
 #define DFLTPHYS	(64 * 1024)	/* default max raw I/O transfer size */
 #endif
 #ifndef MAXPHYS
 #define MAXPHYS		(128 * 1024)	/* max raw I/O transfer size */
 #endif
 #ifndef MAXDUMPPGS
 #define MAXDUMPPGS	(DFLTPHYS/PAGE_SIZE)
 #endif
 
 /*
  * Constants related to network buffer management.
  * MCLBYTES must be no larger than PAGE_SIZE.
  */
 #ifndef	MSIZE
 #define	MSIZE		256		/* size of an mbuf */
 #endif
 
 #ifndef	MCLSHIFT
 #define MCLSHIFT	11		/* convert bytes to mbuf clusters */
 #endif	/* MCLSHIFT */
 
 #define MCLBYTES	(1 << MCLSHIFT)	/* size of an mbuf cluster */
 
 #if PAGE_SIZE < 2048
 #define	MJUMPAGESIZE	MCLBYTES
 #elif PAGE_SIZE <= 8192
 #define	MJUMPAGESIZE	PAGE_SIZE
 #else
 #define	MJUMPAGESIZE	(8 * 1024)
 #endif
 
 #define	MJUM9BYTES	(9 * 1024)	/* jumbo cluster 9k */
 #define	MJUM16BYTES	(16 * 1024)	/* jumbo cluster 16k */
 
 /*
  * Some macros for units conversion
  */
 
 /* clicks to bytes */
 #ifndef ctob
 #define ctob(x)	((x)<<PAGE_SHIFT)
 #endif
 
 /* bytes to clicks */
 #ifndef btoc
 #define btoc(x)	(((vm_offset_t)(x)+PAGE_MASK)>>PAGE_SHIFT)
 #endif
 
 /*
  * btodb() is messy and perhaps slow because `bytes' may be an off_t.  We
  * want to shift an unsigned type to avoid sign extension and we don't
  * want to widen `bytes' unnecessarily.  Assume that the result fits in
  * a daddr_t.
  */
 #ifndef btodb
 #define btodb(bytes)	 		/* calculates (bytes / DEV_BSIZE) */ \
 	(sizeof (bytes) > sizeof(long) \
 	 ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \
 	 : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT))
 #endif
 
 #ifndef dbtob
 #define dbtob(db)			/* calculates (db * DEV_BSIZE) */ \
 	((off_t)(db) << DEV_BSHIFT)
 #endif
 
 #define	PRIMASK	0x0ff
 #define	PCATCH	0x100		/* OR'd with pri for tsleep to check signals */
 #define	PDROP	0x200	/* OR'd with pri to stop re-entry of interlock mutex */
 
 #define	NZERO	0		/* default "nice" */
 
 #define	NBBY	8		/* number of bits in a byte */
 #define	NBPW	sizeof(int)	/* number of bytes per word (integer) */
 
 #define	CMASK	022		/* default file mask: S_IWGRP|S_IWOTH */
 
 #define	NODEV	(dev_t)(-1)	/* non-existent device */
 
 /*
  * File system parameters and macros.
  *
  * MAXBSIZE -	Filesystems are made out of blocks of at most MAXBSIZE bytes
  *		per block.  MAXBSIZE may be made larger without effecting
  *		any existing filesystems as long as it does not exceed MAXPHYS,
  *		and may be made smaller at the risk of not being able to use
  *		filesystems which require a block size exceeding MAXBSIZE.
  *
  * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache.  This must
  *		be >= MAXBSIZE and can be set differently for different
  *		architectures by defining it in <machine/param.h>.
  *		Making this larger allows NFS to do larger reads/writes.
  *
  * BKVASIZE -	Nominal buffer space per buffer, in bytes.  BKVASIZE is the
  *		minimum KVM memory reservation the kernel is willing to make.
  *		Filesystems can of course request smaller chunks.  Actual 
  *		backing memory uses a chunk size of a page (PAGE_SIZE).
  *		The default value here can be overridden on a per-architecture
  *		basis by defining it in <machine/param.h>.  This should
  *		probably be done to increase its value, when MAXBCACHEBUF is
  *		defined as a larger value in <machine/param.h>.
  *
  *		If you make BKVASIZE too small you risk seriously fragmenting
  *		the buffer KVM map which may slow things down a bit.  If you
  *		make it too big the kernel will not be able to optimally use 
  *		the KVM memory reserved for the buffer cache and will wind 
  *		up with too-few buffers.
  *
  *		The default is 16384, roughly 2x the block size used by a
  *		normal UFS filesystem.
  */
 #define MAXBSIZE	65536	/* must be power of 2 */
 #ifndef	MAXBCACHEBUF
 #define	MAXBCACHEBUF	MAXBSIZE /* must be a power of 2 >= MAXBSIZE */
 #endif
 #ifndef	BKVASIZE
 #define BKVASIZE	16384	/* must be power of 2 */
 #endif
 #define BKVAMASK	(BKVASIZE-1)
 
 /*
  * MAXPATHLEN defines the longest permissible path length after expanding
  * symbolic links. It is used to allocate a temporary buffer from the buffer
  * pool in which to do the name expansion, hence should be a power of two,
  * and must be less than or equal to MAXBSIZE.  MAXSYMLINKS defines the
  * maximum number of symbolic links that may be expanded in a path name.
  * It should be set high enough to allow all legitimate uses, but halt
  * infinite loops reasonably quickly.
  */
 #define	MAXPATHLEN	PATH_MAX
 #define MAXSYMLINKS	32
 
 /* Bit map related macros. */
 #define	setbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY))
 #define	clrbit(a,i)	(((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY)))
 #define	isset(a,i)							\
 	(((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY)))
 #define	isclr(a,i)							\
 	((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0)
 
 /* Macros for counting and rounding. */
 #ifndef howmany
 #define	howmany(x, y)	(((x)+((y)-1))/(y))
 #endif
 #define	nitems(x)	(sizeof((x)) / sizeof((x)[0]))
 #define	rounddown(x, y)	(((x)/(y))*(y))
 #define	rounddown2(x, y) ((x)&(~((y)-1)))          /* if y is power of two */
 #define	roundup(x, y)	((((x)+((y)-1))/(y))*(y))  /* to any y */
 #define	roundup2(x, y)	(((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */
 #define powerof2(x)	((((x)-1)&(x))==0)
 
 /* Macros for min/max. */
 #define	MIN(a,b) (((a)<(b))?(a):(b))
 #define	MAX(a,b) (((a)>(b))?(a):(b))
 
 #ifdef _KERNEL
 /*
  * Basic byte order function prototypes for non-inline functions.
  */
 #ifndef LOCORE
 #ifndef _BYTEORDER_PROTOTYPED
 #define	_BYTEORDER_PROTOTYPED
 __BEGIN_DECLS
 __uint32_t	 htonl(__uint32_t);
 __uint16_t	 htons(__uint16_t);
 __uint32_t	 ntohl(__uint32_t);
 __uint16_t	 ntohs(__uint16_t);
 __END_DECLS
 #endif
 #endif
 
 #ifndef lint
 #ifndef _BYTEORDER_FUNC_DEFINED
 #define	_BYTEORDER_FUNC_DEFINED
 #define	htonl(x)	__htonl(x)
 #define	htons(x)	__htons(x)
 #define	ntohl(x)	__ntohl(x)
 #define	ntohs(x)	__ntohs(x)
 #endif /* !_BYTEORDER_FUNC_DEFINED */
 #endif /* lint */
 #endif /* _KERNEL */
 
 /*
  * Scale factor for scaled integers used to count %cpu time and load avgs.
  *
  * The number of CPU `tick's that map to a unique `%age' can be expressed
  * by the formula (1 / (2 ^ (FSHIFT - 11))).  The maximum load average that
  * can be calculated (assuming 32 bits) can be closely approximated using
  * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15).
  *
  * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age',
  * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024.
  */
 #define	FSHIFT	11		/* bits to right of fixed binary point */
 #define FSCALE	(1<<FSHIFT)
 
 #define dbtoc(db)			/* calculates devblks to pages */ \
 	((db + (ctodb(1) - 1)) >> (PAGE_SHIFT - DEV_BSHIFT))
  
 #define ctodb(db)			/* calculates pages to devblks */ \
 	((db) << (PAGE_SHIFT - DEV_BSHIFT))
 
 /*
  * Old spelling of __containerof().
  */
 #define	member2struct(s, m, x)						\
 	((struct s *)(void *)((char *)(x) - offsetof(struct s, m)))
 
 /*
  * Access a variable length array that has been declared as a fixed
  * length array.
  */
 #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset])
 
 #endif	/* _SYS_PARAM_H_ */
Index: user/alc/PQ_LAUNDRY/sys/sys/procctl.h
===================================================================
--- user/alc/PQ_LAUNDRY/sys/sys/procctl.h	(revision 306282)
+++ user/alc/PQ_LAUNDRY/sys/sys/procctl.h	(revision 306283)
@@ -1,117 +1,121 @@
 /*-
  * Copyright (c) 2013 Hudson River Trading LLC
+ * Copyright (c) 2014, 2016 The FreeBSD Foundation
  * Written by: John H. Baldwin <jhb@FreeBSD.org>
  * All rights reserved.
+ *
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef	_SYS_PROCCTL_H_
 #define	_SYS_PROCCTL_H_
 
 #ifndef _KERNEL
 #include <sys/types.h>
 #include <sys/wait.h>
 #endif
 
 #define	PROC_SPROTECT		1	/* set protected state */
 #define	PROC_REAP_ACQUIRE	2	/* reaping enable */
 #define	PROC_REAP_RELEASE	3	/* reaping disable */
 #define	PROC_REAP_STATUS	4	/* reaping status */
 #define	PROC_REAP_GETPIDS	5	/* get descendants */
 #define	PROC_REAP_KILL		6	/* kill descendants */
 #define	PROC_TRACE_CTL		7	/* en/dis ptrace and coredumps */
 #define	PROC_TRACE_STATUS	8	/* query tracing status */
 #define	PROC_TRAPCAP_CTL	9	/* trap capability errors */
 #define	PROC_TRAPCAP_STATUS	10	/* query trap capability status */
 
 /* Operations for PROC_SPROTECT (passed in integer arg). */
 #define	PPROT_OP(x)	((x) & 0xf)
 #define	PPROT_SET	1
 #define	PPROT_CLEAR	2
 
 /* Flags for PROC_SPROTECT (ORed in with operation). */
 #define	PPROT_FLAGS(x)	((x) & ~0xf)
 #define	PPROT_DESCEND	0x10
 #define	PPROT_INHERIT	0x20
 
 /* Result of PREAP_STATUS (returned by value). */
 struct procctl_reaper_status {
 	u_int	rs_flags;
 	u_int	rs_children;
 	u_int	rs_descendants;
 	pid_t	rs_reaper;
 	pid_t	rs_pid;
 	u_int	rs_pad0[15];
 };
 
 /* struct procctl_reaper_status rs_flags */
 #define	REAPER_STATUS_OWNED	0x00000001
 #define	REAPER_STATUS_REALINIT	0x00000002
 
 struct procctl_reaper_pidinfo {
 	pid_t	pi_pid;
 	pid_t	pi_subtree;
 	u_int	pi_flags;
 	u_int	pi_pad0[15];
 };
 
 #define	REAPER_PIDINFO_VALID	0x00000001
 #define	REAPER_PIDINFO_CHILD	0x00000002
 
 struct procctl_reaper_pids {
 	u_int	rp_count;
 	u_int	rp_pad0[15];
 	struct procctl_reaper_pidinfo *rp_pids;
 };
 
 struct procctl_reaper_kill {
 	int	rk_sig;		/* in  - signal to send */
 	u_int	rk_flags;	/* in  - REAPER_KILL flags */
 	pid_t	rk_subtree;	/* in  - subtree, if REAPER_KILL_SUBTREE */
 	u_int	rk_killed;	/* out - count of processes successfully
 				   killed */
 	pid_t	rk_fpid;	/* out - first failed pid for which error
 				   is returned */
 	u_int	rk_pad0[15];
 };
 
 #define	REAPER_KILL_CHILDREN	0x00000001
 #define	REAPER_KILL_SUBTREE	0x00000002
 
 #define	PROC_TRACE_CTL_ENABLE		1
 #define	PROC_TRACE_CTL_DISABLE		2
 #define	PROC_TRACE_CTL_DISABLE_EXEC	3
 
 #define	PROC_TRAPCAP_CTL_ENABLE		1
 #define	PROC_TRAPCAP_CTL_DISABLE	2
 
 #ifndef _KERNEL
 __BEGIN_DECLS
 int	procctl(idtype_t, id_t, int, void *);
 __END_DECLS
 
 #endif
 
 #endif /* !_SYS_PROCCTL_H_ */
Index: user/alc/PQ_LAUNDRY/targets/pseudo/userland/Makefile.depend
===================================================================
--- user/alc/PQ_LAUNDRY/targets/pseudo/userland/Makefile.depend	(revision 306282)
+++ user/alc/PQ_LAUNDRY/targets/pseudo/userland/Makefile.depend	(revision 306283)
@@ -1,930 +1,931 @@
 # $FreeBSD$
 
 # This file is not autogenerated - take care!
 
 .if !defined(MK_MANDOCDB)
 .include <src.opts.mk>
 .endif
 
 DIRDEPS=
 .if ${MK_MANDOCDB} == "no"
 DIRDEPS+= usr.bin/makewhatis
 .endif
 DIRDEPS+= \
 	bin/cat \
 	bin/chflags \
 	bin/chio \
 	bin/chmod \
 	bin/cp \
 	bin/csh \
 	bin/date \
 	bin/dd \
 	bin/df \
 	bin/domainname \
 	bin/echo \
 	bin/ed \
 	bin/expr \
 	bin/freebsd-version \
 	bin/getfacl \
 	bin/hostname \
 	bin/kenv \
 	bin/kill \
 	bin/ln \
 	bin/ls \
 	bin/mkdir \
 	bin/mv \
 	bin/pax \
 	bin/pkill \
 	bin/ps \
 	bin/pwait \
 	bin/pwd \
 	bin/rcp \
 	bin/realpath \
 	bin/rm \
 	bin/rmail \
 	bin/rmdir \
 	bin/setfacl \
 	bin/sh \
 	bin/sleep \
 	bin/stty \
 	bin/sync \
 	bin/test \
 	bin/uuidgen \
 	sbin/adjkerntz \
 	sbin/atacontrol \
 	sbin/atm/atmconfig \
 	sbin/badsect \
 	sbin/camcontrol \
 	sbin/ccdconfig \
 	sbin/clri \
 	sbin/comcontrol \
 	sbin/conscontrol \
 	sbin/ddb \
 	sbin/devd \
 	sbin/devfs \
 	sbin/dhclient \
 	sbin/dmesg \
 	sbin/dump \
 	sbin/dumpfs \
 	sbin/dumpon \
 	sbin/etherswitchcfg \
 	sbin/ffsinfo \
 	sbin/fsck \
 	sbin/fsck_ffs \
 	sbin/fsck_msdosfs \
 	sbin/fsdb \
 	sbin/fsirand \
 	sbin/gbde \
 	sbin/geom/class/cache \
 	sbin/geom/class/concat \
 	sbin/geom/class/eli \
 	sbin/geom/class/journal \
 	sbin/geom/class/label \
 	sbin/geom/class/mirror \
 	sbin/geom/class/mountver \
 	sbin/geom/class/multipath \
 	sbin/geom/class/nop \
 	sbin/geom/class/part \
 	sbin/geom/class/raid \
 	sbin/geom/class/raid3 \
 	sbin/geom/class/sched \
 	sbin/geom/class/shsec \
 	sbin/geom/class/stripe \
 	sbin/geom/class/virstor \
 	sbin/geom/core \
 	sbin/ggate/ggatec \
 	sbin/ggate/ggated \
 	sbin/ggate/ggatel \
 	sbin/growfs \
 	sbin/gvinum \
 	sbin/hastctl \
 	sbin/hastd \
 	sbin/ifconfig \
 	sbin/init \
 	sbin/ipf/ipf \
 	sbin/ipf/ipfs \
 	sbin/ipf/ipfstat \
 	sbin/ipf/ipftest \
 	sbin/ipf/ipmon \
 	sbin/ipf/ipnat \
 	sbin/ipf/ippool \
 	sbin/ipf/ipresend \
 	sbin/ipf/libipf \
 	sbin/ipfw \
 	sbin/iscontrol \
 	sbin/kldconfig \
 	sbin/kldload \
 	sbin/kldstat \
 	sbin/kldunload \
 	sbin/ldconfig \
 	sbin/md5 \
 	sbin/mdconfig \
 	sbin/mdmfs \
 	sbin/mknod \
 	sbin/mksnap_ffs \
 	sbin/mount \
 	sbin/mount_cd9660 \
 	sbin/mount_fusefs \
 	sbin/mount_msdosfs \
 	sbin/mount_nfs \
 	sbin/mount_nullfs \
 	sbin/mount_udf \
 	sbin/mount_unionfs \
 	sbin/natd \
 	sbin/newfs \
 	sbin/newfs_msdos \
 	sbin/nfsiod \
 	sbin/nos-tun \
 	sbin/pfctl \
 	sbin/pflogd \
 	sbin/ping \
 	sbin/ping6 \
 	sbin/quotacheck \
 	sbin/rcorder \
 	sbin/reboot \
 	sbin/recoverdisk \
 	sbin/resolvconf \
 	sbin/restore \
 	sbin/route \
 	sbin/routed \
 	sbin/routed/rtquery \
 	sbin/rtsol \
 	sbin/savecore \
 	sbin/setkey \
 	sbin/shutdown \
 	sbin/spppcontrol \
 	sbin/swapon \
 	sbin/sysctl \
 	sbin/tunefs \
 	sbin/umount \
 	usr.bin/alias \
 	usr.bin/apply \
 	usr.bin/ar \
 	usr.bin/asa \
 	usr.bin/at \
 	usr.bin/atf/atf-config \
 	usr.bin/atf/atf-report \
 	usr.bin/atf/atf-run \
 	usr.bin/atf/atf-version \
 	usr.bin/atm/sscop \
 	usr.bin/awk \
 	usr.bin/banner \
 	usr.bin/basename \
 	usr.bin/bc \
 	usr.bin/biff \
 	usr.bin/bluetooth/bthost \
 	usr.bin/bluetooth/btsockstat \
 	usr.bin/bluetooth/rfcomm_sppd \
 	usr.bin/bmake \
 	usr.bin/brandelf \
 	usr.bin/bsdiff/bsdiff \
 	usr.bin/bsdiff/bspatch \
 	usr.bin/bsdcat \
 	usr.bin/bzip2 \
 	usr.bin/bzip2recover \
 	usr.bin/c89 \
 	usr.bin/c99 \
 	usr.bin/calendar \
 	usr.bin/cap_mkdb \
 	usr.bin/catman \
 	usr.bin/chat \
 	usr.bin/checknr \
 	usr.bin/chkey \
 	usr.bin/chpass \
 	usr.bin/cksum \
 	usr.bin/cmp \
 	usr.bin/col \
 	usr.bin/colcrt \
 	usr.bin/colldef \
 	usr.bin/colrm \
 	usr.bin/column \
 	usr.bin/comm \
 	usr.bin/compile_et \
 	usr.bin/compress \
 	usr.bin/cpio \
 	usr.bin/cpuset \
 	usr.bin/csplit \
 	usr.bin/csup \
 	usr.bin/ctags \
 	usr.bin/ctlstat \
 	usr.bin/cut \
 	usr.bin/dc \
 	usr.bin/dig \
 	usr.bin/dirname \
 	usr.bin/dpv \
 	usr.bin/drill \
 	usr.bin/du \
 	usr.bin/ee \
 	usr.bin/elf2aout \
 	usr.bin/elfdump \
 	usr.bin/enigma \
 	usr.bin/env \
 	usr.bin/expand \
 	usr.bin/false \
 	usr.bin/fetch \
 	usr.bin/file \
 	usr.bin/file2c \
 	usr.bin/find \
 	usr.bin/finger \
 	usr.bin/fmt \
 	usr.bin/fold \
 	usr.bin/from \
 	usr.bin/fstat \
 	usr.bin/fsync \
 	usr.bin/ftp \
 	usr.bin/gcore \
 	usr.bin/gencat \
 	usr.bin/getconf \
 	usr.bin/getent \
 	usr.bin/getopt \
 	usr.bin/gprof \
 	usr.bin/grep \
 	usr.bin/gzip \
 	usr.bin/head \
 	usr.bin/hexdump \
 	usr.bin/host \
 	usr.bin/iconv \
 	usr.bin/id \
 	usr.bin/ident \
 	usr.bin/indent \
 	usr.bin/ipcrm \
 	usr.bin/ipcs \
 	usr.bin/iscsictl \
 	usr.bin/join \
 	usr.bin/jot \
 	usr.bin/kdump \
 	usr.bin/keylogin \
 	usr.bin/keylogout \
 	usr.bin/killall \
 	usr.bin/ktrace \
 	usr.bin/ktrdump \
 	usr.bin/lam \
 	usr.bin/last \
 	usr.bin/lastcomm \
 	usr.bin/lex \
 	usr.bin/ldd \
 	usr.bin/leave \
 	usr.bin/less \
 	usr.bin/lessecho \
 	usr.bin/lesskey \
 	usr.bin/lex/lib \
 	usr.bin/limits \
 	usr.bin/locale \
 	usr.bin/locate/bigram \
 	usr.bin/locate/code \
 	usr.bin/locate/locate \
 	usr.bin/lock \
 	usr.bin/lockf \
 	usr.bin/logger \
 	usr.bin/login \
 	usr.bin/logins \
 	usr.bin/logname \
 	usr.bin/look \
 	usr.bin/lorder \
 	usr.bin/lsvfs \
 	usr.bin/lzmainfo \
 	usr.bin/m4 \
 	usr.bin/mail \
 	usr.bin/man \
 	usr.bin/mandoc \
 	usr.bin/mesg \
 	usr.bin/minigzip \
 	usr.bin/ministat \
 	usr.bin/mkcsmapper \
 	usr.bin/mkdep \
 	usr.bin/mkesdb \
 	usr.bin/mkfifo \
 	usr.bin/mkimg \
 	usr.bin/mklocale \
 	usr.bin/mkstr \
 	usr.bin/mktemp \
 	usr.bin/mkuzip \
 	usr.bin/msgs \
 	usr.bin/mt \
 	usr.bin/nc \
 	usr.bin/ncal \
 	usr.bin/netstat \
 	usr.bin/newgrp \
 	usr.bin/newkey \
 	usr.bin/nfsstat \
 	usr.bin/nice \
 	usr.bin/nl \
 	usr.bin/nohup \
 	usr.bin/nslookup \
 	usr.bin/nsupdate \
 	usr.bin/numactl \
 	usr.bin/opieinfo \
 	usr.bin/opiekey \
 	usr.bin/opiepasswd \
 	usr.bin/pagesize \
 	usr.bin/passwd \
 	usr.bin/paste \
 	usr.bin/patch \
 	usr.bin/pathchk \
 	usr.bin/perror \
 	usr.bin/pr \
 	usr.bin/printenv \
 	usr.bin/printf \
+	usr.bin/proccontrol \
 	usr.bin/procstat \
 	usr.bin/protect \
 	usr.bin/quota \
 	usr.bin/rctl \
 	usr.bin/renice \
 	usr.bin/resizewin \
 	usr.bin/rev \
 	usr.bin/revoke \
 	usr.bin/rlogin \
 	usr.bin/rpcgen \
 	usr.bin/rpcinfo \
 	usr.bin/rs \
 	usr.bin/rsh \
 	usr.bin/rup \
 	usr.bin/ruptime \
 	usr.bin/rusers \
 	usr.bin/rwall \
 	usr.bin/rwho \
 	usr.bin/script \
 	usr.bin/sdiff \
 	usr.bin/sed \
 	usr.bin/send-pr \
 	usr.bin/seq \
 	usr.bin/shar \
 	usr.bin/showmount \
 	usr.bin/smbutil \
 	usr.bin/sockstat \
 	usr.bin/soelim \
 	usr.bin/sort \
 	usr.bin/split \
 	usr.bin/ssh-copy-id \
 	usr.bin/stat \
 	usr.bin/stdbuf \
 	usr.bin/su \
 	usr.bin/svn/svn \
 	usr.bin/svn/svnadmin \
 	usr.bin/svn/svnbench \
 	usr.bin/svn/svndumpfilter \
 	usr.bin/svn/svnfsfs \
 	usr.bin/svn/svnlook \
 	usr.bin/svn/svnmucc \
 	usr.bin/svn/svnrdump \
 	usr.bin/svn/svnserve \
 	usr.bin/svn/svnsync \
 	usr.bin/svn/svnversion \
 	usr.bin/systat \
 	usr.bin/tabs \
 	usr.bin/tail \
 	usr.bin/talk \
 	usr.bin/tar \
 	usr.bin/tcopy \
 	usr.bin/tee \
 	usr.bin/telnet \
 	usr.bin/tftp \
 	usr.bin/time \
 	usr.bin/timeout \
 	usr.bin/tip/tip \
 	usr.bin/top \
 	usr.bin/touch \
 	usr.bin/tput \
 	usr.bin/tr \
 	usr.bin/true \
 	usr.bin/truncate \
 	usr.bin/truss \
 	usr.bin/tset \
 	usr.bin/tsort \
 	usr.bin/tty \
 	usr.bin/ul \
 	usr.bin/uname \
 	usr.bin/unexpand \
 	usr.bin/unifdef \
 	usr.bin/uniq \
 	usr.bin/units \
 	usr.bin/unvis \
 	usr.bin/unzip \
 	usr.bin/usbhidaction \
 	usr.bin/usbhidctl \
 	usr.bin/users \
 	usr.bin/uudecode \
 	usr.bin/uuencode \
 	usr.bin/vacation \
 	usr.bin/vgrind \
 	usr.bin/vi \
 	usr.bin/vi/catalog \
 	usr.bin/vis \
 	usr.bin/vmstat \
 	usr.bin/vtfontcvt \
 	usr.bin/w \
 	usr.bin/wall \
 	usr.bin/wc \
 	usr.bin/what \
 	usr.bin/whereis \
 	usr.bin/which \
 	usr.bin/who \
 	usr.bin/whois \
 	usr.bin/write \
 	usr.bin/xargs \
 	usr.bin/xinstall \
 	usr.bin/xlint/lint1 \
 	usr.bin/xlint/lint2 \
 	usr.bin/xlint/llib \
 	usr.bin/xlint/xlint \
 	usr.bin/xo \
 	usr.bin/xstr \
 	usr.bin/xz \
 	usr.bin/xzdec \
 	usr.bin/yacc \
 	usr.bin/yes \
 	usr.bin/ypcat \
 	usr.bin/ypmatch \
 	usr.bin/ypwhich \
 	usr.sbin/IPXrouted \
 	usr.sbin/ac \
 	usr.sbin/accton \
 	usr.sbin/adduser \
 	usr.sbin/amd/amd \
 	usr.sbin/amd/amq \
 	usr.sbin/amd/doc \
 	usr.sbin/amd/fixmount \
 	usr.sbin/amd/fsinfo \
 	usr.sbin/amd/hlfsd \
 	usr.sbin/amd/include \
 	usr.sbin/amd/libamu \
 	usr.sbin/amd/mk-amd-map \
 	usr.sbin/amd/pawd \
 	usr.sbin/amd/scripts \
 	usr.sbin/amd/wire-test \
 	usr.sbin/ancontrol \
 	usr.sbin/apm \
 	usr.sbin/arp \
 	usr.sbin/arpaname \
 	usr.sbin/audit \
 	usr.sbin/auditd \
 	usr.sbin/auditdistd \
 	usr.sbin/auditreduce \
 	usr.sbin/authpf \
 	usr.sbin/autofs \
 	usr.sbin/binmiscctl \
 	usr.sbin/bluetooth/ath3kfw \
 	usr.sbin/bluetooth/bcmfw \
 	usr.sbin/bluetooth/bt3cfw \
 	usr.sbin/bluetooth/bthidcontrol \
 	usr.sbin/bluetooth/bthidd \
 	usr.sbin/bluetooth/btpand \
 	usr.sbin/bluetooth/hccontrol \
 	usr.sbin/bluetooth/hcsecd \
 	usr.sbin/bluetooth/hcseriald \
 	usr.sbin/bluetooth/l2control \
 	usr.sbin/bluetooth/l2ping \
 	usr.sbin/bluetooth/rfcomm_pppd \
 	usr.sbin/bluetooth/sdpcontrol \
 	usr.sbin/bluetooth/sdpd \
 	usr.sbin/bootparamd/bootparamd \
 	usr.sbin/bootparamd/callbootd \
 	usr.sbin/bsdconfig \
 	usr.sbin/bsdconfig/console \
 	usr.sbin/bsdconfig/console/include \
 	usr.sbin/bsdconfig/diskmgmt \
 	usr.sbin/bsdconfig/diskmgmt/include \
 	usr.sbin/bsdconfig/docsinstall \
 	usr.sbin/bsdconfig/docsinstall/include \
 	usr.sbin/bsdconfig/dot \
 	usr.sbin/bsdconfig/dot/include \
 	usr.sbin/bsdconfig/examples \
 	usr.sbin/bsdconfig/include \
 	usr.sbin/bsdconfig/includes \
 	usr.sbin/bsdconfig/includes/include \
 	usr.sbin/bsdconfig/mouse \
 	usr.sbin/bsdconfig/mouse/include \
 	usr.sbin/bsdconfig/networking \
 	usr.sbin/bsdconfig/networking/include \
 	usr.sbin/bsdconfig/networking/share \
 	usr.sbin/bsdconfig/packages \
 	usr.sbin/bsdconfig/packages/include \
 	usr.sbin/bsdconfig/password \
 	usr.sbin/bsdconfig/password/include \
 	usr.sbin/bsdconfig/password/share \
 	usr.sbin/bsdconfig/security \
 	usr.sbin/bsdconfig/security/include \
 	usr.sbin/bsdconfig/share \
 	usr.sbin/bsdconfig/share/media \
 	usr.sbin/bsdconfig/share/packages \
 	usr.sbin/bsdconfig/startup \
 	usr.sbin/bsdconfig/startup/include \
 	usr.sbin/bsdconfig/startup/share \
 	usr.sbin/bsdconfig/timezone \
 	usr.sbin/bsdconfig/timezone/include \
 	usr.sbin/bsdconfig/timezone/share \
 	usr.sbin/bsdconfig/ttys \
 	usr.sbin/bsdconfig/ttys/include \
 	usr.sbin/bsdconfig/usermgmt \
 	usr.sbin/bsdconfig/usermgmt/include \
 	usr.sbin/bsdconfig/usermgmt/share \
 	usr.sbin/bsdinstall/distextract \
 	usr.sbin/bsdinstall/distfetch \
 	usr.sbin/bsdinstall/partedit \
 	usr.sbin/bsdinstall/scripts \
 	usr.sbin/bsnmpd/bsnmpd \
 	usr.sbin/bsnmpd/gensnmptree \
 	usr.sbin/bsnmpd/modules/snmp_atm \
 	usr.sbin/bsnmpd/modules/snmp_bridge \
 	usr.sbin/bsnmpd/modules/snmp_hast \
 	usr.sbin/bsnmpd/modules/snmp_hostres \
 	usr.sbin/bsnmpd/modules/snmp_lm75 \
 	usr.sbin/bsnmpd/modules/snmp_mibII \
 	usr.sbin/bsnmpd/modules/snmp_netgraph \
 	usr.sbin/bsnmpd/modules/snmp_pf \
 	usr.sbin/bsnmpd/modules/snmp_target \
 	usr.sbin/bsnmpd/modules/snmp_usm \
 	usr.sbin/bsnmpd/modules/snmp_vacm \
 	usr.sbin/bsnmpd/modules/snmp_wlan \
 	usr.sbin/bsnmpd/tools/bsnmptools \
 	usr.sbin/bsnmpd/tools/libbsnmptools \
 	usr.sbin/burncd \
 	usr.sbin/cdcontrol \
 	usr.sbin/chkgrp \
 	usr.sbin/chown \
 	usr.sbin/chroot \
 	usr.sbin/ckdist \
 	usr.sbin/clear_locks \
 	usr.sbin/config \
 	usr.sbin/crashinfo \
 	usr.sbin/cron/cron \
 	usr.sbin/cron/crontab \
 	usr.sbin/cron/lib \
 	usr.sbin/crunch/crunchgen \
 	usr.sbin/crunch/crunchide \
 	usr.sbin/ctladm \
 	usr.sbin/ctld \
 	usr.sbin/ctm/ctm \
 	usr.sbin/ctm/ctm_dequeue \
 	usr.sbin/ctm/ctm_rmail \
 	usr.sbin/ctm/ctm_smail \
 	usr.sbin/daemon \
 	usr.sbin/dconschat \
 	usr.sbin/ddns-confgen \
 	usr.sbin/devctl \
 	usr.sbin/devinfo \
 	usr.sbin/digictl \
 	usr.sbin/diskinfo \
 	usr.sbin/dnssec-dsfromkey \
 	usr.sbin/dnssec-keyfromlabel \
 	usr.sbin/dnssec-keygen \
 	usr.sbin/dnssec-revoke \
 	usr.sbin/dnssec-settime \
 	usr.sbin/dnssec-signzone \
 	usr.sbin/dumpcis \
 	usr.sbin/editmap \
 	usr.sbin/edquota \
 	usr.sbin/etcupdate \
 	usr.sbin/extattr \
 	usr.sbin/extattrctl \
 	usr.sbin/fdcontrol \
 	usr.sbin/fdformat \
 	usr.sbin/fdread \
 	usr.sbin/fdwrite \
 	usr.sbin/fifolog/fifolog_create \
 	usr.sbin/fifolog/fifolog_reader \
 	usr.sbin/fifolog/fifolog_writer \
 	usr.sbin/fifolog/lib \
 	usr.sbin/flowctl \
 	usr.sbin/fmtree \
 	usr.sbin/freebsd-update \
 	usr.sbin/fstyp \
 	usr.sbin/ftp-proxy \
 	usr.sbin/fwcontrol \
 	usr.sbin/genrandom \
 	usr.sbin/getfmac \
 	usr.sbin/getpmac \
 	usr.sbin/gpioctl \
 	usr.sbin/gssd \
 	usr.sbin/gstat \
 	usr.sbin/i2c \
 	usr.sbin/ifmcstat \
 	usr.sbin/inetd \
 	usr.sbin/iostat \
 	usr.sbin/iovctl \
 	usr.sbin/ip6addrctl \
 	usr.sbin/ipfwpcap \
 	usr.sbin/isc-hmac-fixup \
 	usr.sbin/iscsid \
 	usr.sbin/isfctl \
 	usr.sbin/jail \
 	usr.sbin/jexec \
 	usr.sbin/jls \
 	usr.sbin/kbdcontrol \
 	usr.sbin/kbdmap \
 	usr.sbin/keyserv \
 	usr.sbin/kldxref \
 	usr.sbin/lastlogin \
 	usr.sbin/lmcconfig \
 	usr.sbin/lpr/chkprintcap \
 	usr.sbin/lpr/common_source \
 	usr.sbin/lpr/filters \
 	usr.sbin/lpr/filters.ru/koi2855 \
 	usr.sbin/lpr/filters.ru/koi2alt \
 	usr.sbin/lpr/lp \
 	usr.sbin/lpr/lpc \
 	usr.sbin/lpr/lpd \
 	usr.sbin/lpr/lpq \
 	usr.sbin/lpr/lpr \
 	usr.sbin/lpr/lprm \
 	usr.sbin/lpr/lptest \
 	usr.sbin/lpr/pac \
 	usr.sbin/mailstats \
 	usr.sbin/mailwrapper \
 	usr.sbin/makefs \
 	usr.sbin/makemap \
 	usr.sbin/manctl \
 	usr.sbin/memcontrol \
 	usr.sbin/mergemaster \
 	usr.sbin/mfiutil \
 	usr.sbin/mixer \
 	usr.sbin/mld6query \
 	usr.sbin/mlxcontrol \
 	usr.sbin/mount_smbfs \
 	usr.sbin/mountd \
 	usr.sbin/moused \
 	usr.sbin/mpsutil \
 	usr.sbin/mptutil \
 	usr.sbin/mtest \
 	usr.sbin/named \
 	usr.sbin/named-checkconf \
 	usr.sbin/named-checkzone \
 	usr.sbin/named-journalprint \
 	usr.sbin/ndp \
 	usr.sbin/newsyslog \
 	usr.sbin/nfscbd \
 	usr.sbin/nfsd \
 	usr.sbin/nfsdumpstate \
 	usr.sbin/nfsrevoke \
 	usr.sbin/nfsuserd \
 	usr.sbin/ngctl \
 	usr.sbin/nghook \
 	usr.sbin/nmtree \
 	usr.sbin/nologin \
 	usr.sbin/nscd \
 	usr.sbin/nsec3hash \
 	usr.sbin/ntp/doc \
 	usr.sbin/ntp/doc/drivers/icons \
 	usr.sbin/ntp/doc/drivers/scripts \
 	usr.sbin/ntp/doc/drivers \
 	usr.sbin/ntp/doc/hints \
 	usr.sbin/ntp/doc/icons \
 	usr.sbin/ntp/doc/pic \
 	usr.sbin/ntp/doc/scripts \
 	usr.sbin/ntp/libntp \
 	usr.sbin/ntp/libopts \
 	usr.sbin/ntp/libparse \
 	usr.sbin/ntp/ntp-keygen \
 	usr.sbin/ntp/ntpd \
 	usr.sbin/ntp/ntpdate \
 	usr.sbin/ntp/ntpdc \
 	usr.sbin/ntp/ntpq \
 	usr.sbin/ntp/ntptime \
 	usr.sbin/ntp/sntp \
 	usr.sbin/pc-sysinstall/backend \
 	usr.sbin/pc-sysinstall/backend-partmanager \
 	usr.sbin/pc-sysinstall/backend-query \
 	usr.sbin/pc-sysinstall/conf \
 	usr.sbin/pc-sysinstall/doc \
 	usr.sbin/pc-sysinstall/examples \
 	usr.sbin/pc-sysinstall/pc-sysinstall \
 	usr.sbin/pciconf \
 	usr.sbin/periodic \
 	usr.sbin/pkg \
 	usr.sbin/pkg_install/add \
 	usr.sbin/pkg_install/create \
 	usr.sbin/pkg_install/delete \
 	usr.sbin/pkg_install/info \
 	usr.sbin/pkg_install/lib \
 	usr.sbin/pkg_install/updating \
 	usr.sbin/pkg_install/version \
 	usr.sbin/pmcannotate \
 	usr.sbin/pmccontrol \
 	usr.sbin/pmcstat \
 	usr.sbin/pmcstudy \
 	usr.sbin/portsnap/make_index \
 	usr.sbin/portsnap/phttpget \
 	usr.sbin/portsnap/portsnap \
 	usr.sbin/powerd \
 	usr.sbin/ppp \
 	usr.sbin/pppctl \
 	usr.sbin/praliases \
 	usr.sbin/praudit \
 	usr.sbin/procctl \
 	usr.sbin/pstat \
 	usr.sbin/pw \
 	usr.sbin/pwd_mkdb \
 	usr.sbin/quot \
 	usr.sbin/quotaon \
 	usr.sbin/rarpd \
 	usr.sbin/repquota \
 	usr.sbin/rip6query \
 	usr.sbin/rmt \
 	usr.sbin/rndc \
 	usr.sbin/rndc-confgen \
 	usr.sbin/route6d \
 	usr.sbin/rpc.lockd \
 	usr.sbin/rpc.statd \
 	usr.sbin/rpc.umntall \
 	usr.sbin/rpc.yppasswdd \
 	usr.sbin/rpc.ypupdated \
 	usr.sbin/rpc.ypxfrd \
 	usr.sbin/rpcbind \
 	usr.sbin/rrenumd \
 	usr.sbin/rtadvctl \
 	usr.sbin/rtadvd \
 	usr.sbin/rtprio \
 	usr.sbin/rtsold \
 	usr.sbin/rwhod \
 	usr.sbin/sa \
 	usr.sbin/sendmail \
 	usr.sbin/service \
 	usr.sbin/services_mkdb \
 	usr.sbin/sesutil \
 	usr.sbin/setfib \
 	usr.sbin/setfmac \
 	usr.sbin/setpmac \
 	usr.sbin/smbmsg \
 	usr.sbin/snapinfo \
 	usr.sbin/spray \
 	usr.sbin/syslogd \
 	usr.sbin/sysrc \
 	usr.sbin/tcpdchk \
 	usr.sbin/tcpdmatch \
 	usr.sbin/tcpdrop \
 	usr.sbin/tcpdump/tcpdump \
 	usr.sbin/timed/timed \
 	usr.sbin/timed/timedc \
 	usr.sbin/traceroute \
 	usr.sbin/traceroute6 \
 	usr.sbin/trpt \
 	usr.sbin/tzsetup \
 	usr.sbin/uathload \
 	usr.sbin/uefisign \
 	usr.sbin/ugidfw \
 	usr.sbin/uhsoctl \
 	usr.sbin/unbound/anchor \
 	usr.sbin/unbound/checkconf \
 	usr.sbin/unbound/control \
 	usr.sbin/unbound/daemon \
 	usr.sbin/unbound/local-setup \
 	usr.sbin/usbconfig \
 	usr.sbin/usbdump \
 	usr.sbin/utx \
 	usr.sbin/vidcontrol \
 	usr.sbin/vigr \
 	usr.sbin/vipw \
 	usr.sbin/wake \
 	usr.sbin/watch \
 	usr.sbin/watchdogd \
 	usr.sbin/wlandebug \
 	usr.sbin/wpa/hostapd \
 	usr.sbin/wpa/hostapd_cli \
 	usr.sbin/wpa/ndis_events \
 	usr.sbin/wpa/wpa_cli \
 	usr.sbin/wpa/wpa_passphrase \
 	usr.sbin/wpa/wpa_supplicant \
 	usr.sbin/yp_mkdb \
 	usr.sbin/ypbind \
 	usr.sbin/ypldap \
 	usr.sbin/yppoll \
 	usr.sbin/yppush \
 	usr.sbin/ypserv \
 	usr.sbin/ypset \
 	usr.sbin/zic/zdump \
 	usr.sbin/zic/zic \
 	usr.sbin/zonectl \
 	${DEP_RELDIR}/cddl \
 	${DEP_RELDIR}/games \
 	${DEP_RELDIR}/gnu \
 	${DEP_RELDIR}/include \
 	${DEP_RELDIR}/kerberos5 \
 	${DEP_RELDIR}/lib \
 	${DEP_RELDIR}/libexec \
 	${DEP_RELDIR}/misc \
 	${DEP_RELDIR}/secure \
 	${DEP_RELDIR}/share \
 
 .if ${MK_NAND} != "no"
 DIRDEPS+= \
 	sbin/nandfs \
 	sbin/newfs_nandfs \
 	usr.sbin/nandsim \
 	usr.sbin/nandtool \
 
 .endif
 
 DIRDEPS.amd64= \
 	sbin/bsdlabel \
 	sbin/fdisk \
 	sbin/nvmecontrol \
 	usr.sbin/acpi/acpiconf \
 	usr.sbin/acpi/acpidb \
 	usr.sbin/acpi/acpidump \
 	usr.sbin/acpi/iasl \
 	usr.sbin/apm \
 	usr.sbin/asf \
 	usr.sbin/bhyve \
 	usr.sbin/bhyvectl \
 	usr.sbin/bhyveload \
 	usr.sbin/boot0cfg \
 	usr.sbin/btxld \
 	usr.sbin/camdd \
 	usr.sbin/cpucontrol \
 	usr.sbin/hyperv/tools \
 	usr.sbin/kgmon \
 	usr.sbin/lptcontrol \
 	usr.sbin/mptable \
 	usr.sbin/ndiscvt \
 	usr.sbin/spkrtest \
 	usr.sbin/sade \
 	usr.sbin/zzz
 
 DIRDEPS.arm= \
 	sbin/bsdlabel \
 	sbin/fdisk \
 	usr.sbin/ofwdump \
 	usr.sbin/kgmon
 
 DIRDEPS.i386= \
 	sbin/bsdlabel \
 	sbin/fdisk \
 	sbin/nvmecontrol \
 	sbin/sconfig \
 	usr.sbin/apm \
 	usr.sbin/apmd \
 	usr.sbin/asf \
 	usr.sbin/btxld \
 	usr.sbin/cpucontrol \
 	usr.sbin/hyperv/tools \
 	usr.sbin/kgmon \
 	usr.sbin/kgzip \
 	usr.sbin/lptcontrol \
 	usr.sbin/mptable \
 	usr.sbin/ndiscvt \
 	usr.sbin/pnpinfo \
 	usr.sbin/sade \
 	usr.sbin/spkrtest \
 	usr.sbin/zzz \
 	usr.sbin/acpi \
 	usr.sbin/boot0cfg
 
 DIRDEPS.arm64= \
 	usr.sbin/acpi \
 	usr.sbin/ofwdump
 
 DIRDEPS.mips= \
 	sbin/bsdlabel \
 	sbin/fdisk
 
 DIRDEPS.pc98= \
 	sbin/bsdlabel \
 	sbin/fdisk_pc98 \
 	sbin/sconfig
 
 DIRDEPS.sparc64= \
 	sbin/bsdlabel \
 	sbin/sunlabel \
 	usr.sbin/eeprom \
 	usr.sbin/ofwdump \
 	usr.sbin/sade
 
 DIRDEPS.powerpc= \
 	usr.sbin/nvram \
 	usr.sbin/ofwdump
 
 .if ${MK_BLACKLIST_SUPPORT} != "no"
 DIRDEPS+= \
 	usr.sbin/blacklistctl \
 	usr.sbin/blacklistd
 .endif
 
 .if ${MK_GPL_DTC} != "yes"
 DIRDEPS+= usr.bin/dtc
 .endif
 
 .if ${MK_OFED} != "no"
 DIRDEPS+= \
 	  contrib/ofed/usr.bin/ibaddr \
 	  contrib/ofed/usr.bin/ibnetdiscover \
 	  contrib/ofed/usr.bin/ibping \
 	  contrib/ofed/usr.bin/ibportstate \
 	  contrib/ofed/usr.bin/ibroute \
 	  contrib/ofed/usr.bin/ibsendtrap \
 	  contrib/ofed/usr.bin/ibstat \
 	  contrib/ofed/usr.bin/ibsysstat \
 	  contrib/ofed/usr.bin/ibtracert \
 	  contrib/ofed/usr.bin/opensm \
 	  contrib/ofed/usr.bin/osmtest \
 	  contrib/ofed/usr.bin/perfquery \
 	  contrib/ofed/usr.bin/saquery \
 	  contrib/ofed/usr.bin/sminfo \
 	  contrib/ofed/usr.bin/smpdump \
 	  contrib/ofed/usr.bin/smpquery \
 	  contrib/ofed/usr.bin/vendstat
 .endif
 
 DIRDEPS+= ${DIRDEPS.${MACHINE}:U}
 
 
 .include <dirdeps.mk>
Index: user/alc/PQ_LAUNDRY/tests/etc/rc.d/routing_test.sh
===================================================================
--- user/alc/PQ_LAUNDRY/tests/etc/rc.d/routing_test.sh	(revision 306282)
+++ user/alc/PQ_LAUNDRY/tests/etc/rc.d/routing_test.sh	(revision 306283)
@@ -1,73 +1,77 @@
 #
 #  Copyright (c) 2014 Spectra Logic Corporation
 #  All rights reserved.
 # 
 #  Redistribution and use in source and binary forms, with or without
 #  modification, are permitted provided that the following conditions
 #  are met:
 #  1. Redistributions of source code must retain the above copyright
 #     notice, this list of conditions, and the following disclaimer,
 #     without modification.
 #  2. Redistributions in binary form must reproduce at minimum a disclaimer
 #     substantially similar to the "NO WARRANTY" disclaimer below
 #     ("Disclaimer") and any redistribution must be conditioned upon
 #     including a substantially similar Disclaimer requirement for further
 #     binary redistribution.
 # 
 #  NO WARRANTY
 #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 #  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 #  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
 #  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 #  HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 #  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 #  OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 #  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 #  STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 #  IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 #  POSSIBILITY OF SUCH DAMAGES.
 # 
 #  Authors: Alan Somers         (Spectra Logic Corporation)
 #
 # $FreeBSD$
 
 atf_test_case static_ipv4_loopback_route_for_each_fib cleanup
 static_ipv4_loopback_route_for_each_fib_head()
 {
 	atf_set "descr" "Every FIB should have a static IPv4 loopback route"
 }
 static_ipv4_loopback_route_for_each_fib_body()
 {
 	local nfibs fib
 	nfibs=`sysctl -n net.fibs`
 
 	# Check for an IPv4 loopback route
 	for fib in `seq 0 $((${nfibs} - 1))`; do
 		atf_check -o match:"interface: lo0" -s exit:0 \
 			setfib -F ${fib} route -4 get 127.0.0.1
 	done
 }
 
 atf_test_case static_ipv6_loopback_route_for_each_fib cleanup
 static_ipv6_loopback_route_for_each_fib_head()
 {
 	atf_set "descr" "Every FIB should have a static IPv6 loopback route"
 }
 static_ipv6_loopback_route_for_each_fib_body()
 {
 	local nfibs fib
 	nfibs=`sysctl -n net.fibs`
 
+	if [ "`sysctl -in kern.features.inet6`" != "1" ]; then
+		atf_skip "This test requires IPv6 support"
+	fi
+
 	# Check for an IPv6 loopback route
 	for fib in `seq 0 $((${nfibs} - 1))`; do
 		atf_check -o match:"interface: lo0" -s exit:0 \
 			setfib -F ${fib} route -6 get ::1
 	done
 }
 
 atf_init_test_cases()
 {
 	atf_add_test_case static_ipv4_loopback_route_for_each_fib
 	atf_add_test_case static_ipv6_loopback_route_for_each_fib
 }
 
Index: user/alc/PQ_LAUNDRY/usr.bin/Makefile
===================================================================
--- user/alc/PQ_LAUNDRY/usr.bin/Makefile	(revision 306282)
+++ user/alc/PQ_LAUNDRY/usr.bin/Makefile	(revision 306283)
@@ -1,310 +1,311 @@
 #	From: @(#)Makefile	8.3 (Berkeley) 1/7/94
 # $FreeBSD$
 
 .include <src.opts.mk>
 
 # XXX MISSING:		deroff diction graph learn plot
 #			spell spline struct xsend
 # XXX Use GNU versions: diff ld patch
 # Moved to secure: bdes
 #
 
 SUBDIR=	alias \
 	apply \
 	asa \
 	awk \
 	banner \
 	basename \
 	brandelf \
 	bsdcat \
 	bsdiff \
 	bzip2 \
 	bzip2recover \
 	cap_mkdb \
 	chat \
 	chpass \
 	cksum \
 	cmp \
 	col \
 	colldef \
 	colrm \
 	column \
 	comm \
 	compress \
 	cpuset \
 	csplit \
 	ctlstat \
 	cut \
 	dirname \
 	dpv \
 	du \
 	elf2aout \
 	elfdump \
 	enigma \
 	env \
 	expand \
 	false \
 	fetch \
 	find \
 	fmt \
 	fold \
 	fstat \
 	fsync \
 	gcore \
 	gencat \
 	getconf \
 	getent \
 	getopt \
 	grep \
 	gzip \
 	head \
 	hexdump \
 	id \
 	ident \
 	ipcrm \
 	ipcs \
 	join \
 	jot \
 	keylogin \
 	keylogout \
 	killall \
 	ktrace \
 	ktrdump \
 	lam \
 	lastcomm \
 	ldd \
 	leave \
 	less \
 	lessecho \
 	lesskey \
 	limits \
 	locale \
 	localedef \
 	lock \
 	lockf \
 	logger \
 	login \
 	logins \
 	logname \
 	look \
 	lorder \
 	lsvfs \
 	lzmainfo \
 	m4 \
 	mandoc \
 	mesg \
 	minigzip \
 	ministat \
 	mkdep \
 	mkfifo \
 	mkimg \
 	mklocale \
 	mktemp \
 	mkuzip \
 	mt \
 	ncal \
 	netstat \
 	newgrp \
 	nfsstat \
 	nice \
 	nl \
 	numactl \
 	nohup \
 	opieinfo \
 	opiekey \
 	opiepasswd \
 	pagesize \
 	passwd \
 	paste \
 	patch \
 	pathchk \
 	perror \
 	pr \
 	printenv \
 	printf \
+	proccontrol \
 	procstat \
 	protect \
 	rctl \
 	renice \
 	resizewin \
 	rev \
 	revoke \
 	rpcinfo \
 	rs \
 	rup \
 	rusers \
 	rwall \
 	script \
 	sdiff \
 	sed \
 	send-pr \
 	seq \
 	shar \
 	showmount \
 	sockstat \
 	soelim \
 	sort \
 	split \
 	stat \
 	stdbuf \
 	su \
 	systat \
 	tabs \
 	tail \
 	tar \
 	tcopy \
 	tee \
 	time \
 	timeout \
 	tip \
 	top \
 	touch \
 	tput \
 	tr \
 	true \
 	truncate \
 	tset \
 	tsort \
 	tty \
 	uname \
 	unexpand \
 	uniq \
 	unzip \
 	units \
 	unvis \
 	uudecode \
 	uuencode \
 	vis \
 	vmstat \
 	w \
 	wall \
 	wc \
 	what \
 	whereis \
 	which \
 	whois \
 	write \
 	xargs \
 	xinstall \
 	xo \
 	xz \
 	xzdec \
 	yes
 
 # NB: keep these sorted by MK_* knobs
 
 SUBDIR.${MK_AT}+=	at
 SUBDIR.${MK_ATM}+=	atm
 SUBDIR.${MK_BLUETOOTH}+=	bluetooth
 SUBDIR.${MK_BSD_CPIO}+=	cpio
 SUBDIR.${MK_CALENDAR}+=	calendar
 SUBDIR.${MK_CLANG}+=	clang
 SUBDIR.${MK_EE}+=	ee
 SUBDIR.${MK_FILE}+=	file
 SUBDIR.${MK_FINGER}+=	finger
 SUBDIR.${MK_FTP}+=	ftp
 SUBDIR.${MK_GAMES}+=	caesar
 SUBDIR.${MK_GAMES}+=	factor
 SUBDIR.${MK_GAMES}+=	fortune
 SUBDIR.${MK_GAMES}+=	grdc
 SUBDIR.${MK_GAMES}+=	morse
 SUBDIR.${MK_GAMES}+=	number
 SUBDIR.${MK_GAMES}+=	pom
 SUBDIR.${MK_GAMES}+=	primes
 SUBDIR.${MK_GAMES}+=	random
 .if ${MK_GPL_DTC} != "yes"
 .if ${COMPILER_FEATURES:Mc++11}
 SUBDIR+=	dtc
 .endif
 .endif
 SUBDIR.${MK_GROFF}+=	vgrind
 SUBDIR.${MK_HESIOD}+=	hesinfo
 SUBDIR.${MK_ICONV}+=	iconv
 SUBDIR.${MK_ICONV}+=	mkcsmapper
 SUBDIR.${MK_ICONV}+=	mkesdb
 SUBDIR.${MK_ISCSI}+=	iscsictl
 SUBDIR.${MK_KDUMP}+=	kdump
 SUBDIR.${MK_KDUMP}+=	truss
 SUBDIR.${MK_KERBEROS_SUPPORT}+=	compile_et
 SUBDIR.${MK_LDNS_UTILS}+=	drill
 SUBDIR.${MK_LDNS_UTILS}+=	host
 SUBDIR.${MK_LOCATE}+=	locate
 # XXX msgs?
 SUBDIR.${MK_MAIL}+=	biff
 SUBDIR.${MK_MAIL}+=	from
 SUBDIR.${MK_MAIL}+=	mail
 SUBDIR.${MK_MAIL}+=	msgs
 SUBDIR.${MK_MAKE}+=	bmake
 SUBDIR.${MK_MAN_UTILS}+=	catman
 .if ${MK_MANDOCDB} == "no"	# AND
 SUBDIR.${MK_MAN_UTILS}+=	makewhatis
 .endif
 SUBDIR.${MK_MAN_UTILS}+=	man
 SUBDIR.${MK_NETCAT}+=	nc
 SUBDIR.${MK_NIS}+=	ypcat
 SUBDIR.${MK_NIS}+=	ypmatch
 SUBDIR.${MK_NIS}+=	ypwhich
 SUBDIR.${MK_OPENSSH}+=	ssh-copy-id
 SUBDIR.${MK_OPENSSL}+=	bc
 SUBDIR.${MK_OPENSSL}+=	chkey
 SUBDIR.${MK_OPENSSL}+=	dc
 SUBDIR.${MK_OPENSSL}+=	newkey
 SUBDIR.${MK_QUOTAS}+=	quota
 SUBDIR.${MK_RCMDS}+=	rlogin
 SUBDIR.${MK_RCMDS}+=	rsh
 SUBDIR.${MK_RCMDS}+=	ruptime
 SUBDIR.${MK_RCMDS}+=	rwho
 SUBDIR.${MK_SENDMAIL}+=	vacation
 SUBDIR.${MK_TALK}+=	talk
 SUBDIR.${MK_TELNET}+=	telnet
 SUBDIR.${MK_TESTS}+=	tests
 SUBDIR.${MK_TEXTPROC}+=	checknr
 SUBDIR.${MK_TEXTPROC}+=	colcrt
 SUBDIR.${MK_TEXTPROC}+=	ul
 SUBDIR.${MK_TFTP}+=	tftp
 SUBDIR.${MK_TOOLCHAIN}+=	addr2line
 SUBDIR.${MK_TOOLCHAIN}+=	ar
 SUBDIR.${MK_TOOLCHAIN}+=	c89
 SUBDIR.${MK_TOOLCHAIN}+=	c99
 SUBDIR.${MK_TOOLCHAIN}+=	ctags
 SUBDIR.${MK_TOOLCHAIN}+=	cxxfilt
 SUBDIR.${MK_TOOLCHAIN}+=	elfcopy
 SUBDIR.${MK_TOOLCHAIN}+=	file2c
 # ARM64TODO gprof does not build
 # RISCVTODO gprof does not build
 .if ${MACHINE_ARCH} != "aarch64" && ${MACHINE_CPUARCH} != "riscv"
 SUBDIR.${MK_TOOLCHAIN}+=	gprof
 .endif
 SUBDIR.${MK_TOOLCHAIN}+=	indent
 SUBDIR.${MK_TOOLCHAIN}+=	lex
 SUBDIR.${MK_TOOLCHAIN}+=	mkstr
 SUBDIR.${MK_TOOLCHAIN}+=	nm
 SUBDIR.${MK_TOOLCHAIN}+=	readelf
 SUBDIR.${MK_TOOLCHAIN}+=	rpcgen
 SUBDIR.${MK_TOOLCHAIN}+=	unifdef
 SUBDIR.${MK_TOOLCHAIN}+=	size
 SUBDIR.${MK_TOOLCHAIN}+=	strings
 .if ${MACHINE_ARCH} != "aarch64" # ARM64TODO xlint does not build
 SUBDIR.${MK_TOOLCHAIN}+=	xlint
 .endif
 SUBDIR.${MK_TOOLCHAIN}+=	xstr
 SUBDIR.${MK_TOOLCHAIN}+=	yacc
 SUBDIR.${MK_VI}+=	vi
 SUBDIR.${MK_VT}+=	vtfontcvt
 SUBDIR.${MK_USB}+=	usbhidaction
 SUBDIR.${MK_USB}+=	usbhidctl
 SUBDIR.${MK_UTMPX}+=	last
 .if ${MK_CXX} != "no"
 SUBDIR.${MK_UTMPX}+=	users
 .endif
 SUBDIR.${MK_UTMPX}+=	who
 SUBDIR.${MK_SVN}+=	svn
 SUBDIR.${MK_SVNLITE}+=	svn
 
 .include <bsd.arch.inc.mk>
 
 SUBDIR:=	${SUBDIR:O:u}
 
 SUBDIR_PARALLEL=
 
 .include <bsd.subdir.mk>
Index: user/alc/PQ_LAUNDRY/usr.bin/proccontrol/Makefile.depend
===================================================================
--- user/alc/PQ_LAUNDRY/usr.bin/proccontrol/Makefile.depend	(nonexistent)
+++ user/alc/PQ_LAUNDRY/usr.bin/proccontrol/Makefile.depend	(revision 306283)
@@ -0,0 +1,18 @@
+# $FreeBSD$
+# Autogenerated - do NOT edit!
+
+DIRDEPS = \
+	gnu/lib/csu \
+	gnu/lib/libgcc \
+	include \
+	include/xlocale \
+	lib/${CSU_DIR} \
+	lib/libc \
+	lib/libcompiler_rt \
+
+
+.include <dirdeps.mk>
+
+.if ${DEP_RELDIR} == ${_DEP_RELDIR}
+# local dependencies - needed for -jN in clean tree
+.endif

Property changes on: user/alc/PQ_LAUNDRY/usr.bin/proccontrol/Makefile.depend
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: user/alc/PQ_LAUNDRY/usr.bin/proccontrol/Makefile
===================================================================
--- user/alc/PQ_LAUNDRY/usr.bin/proccontrol/Makefile	(nonexistent)
+++ user/alc/PQ_LAUNDRY/usr.bin/proccontrol/Makefile	(revision 306283)
@@ -0,0 +1,7 @@
+# $FreeBSD$
+
+PROG=   proccontrol
+WARNS?=	6
+MK_MAN=no
+
+.include <bsd.prog.mk>

Property changes on: user/alc/PQ_LAUNDRY/usr.bin/proccontrol/Makefile
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: user/alc/PQ_LAUNDRY/usr.bin/proccontrol/proccontrol.c
===================================================================
--- user/alc/PQ_LAUNDRY/usr.bin/proccontrol/proccontrol.c	(nonexistent)
+++ user/alc/PQ_LAUNDRY/usr.bin/proccontrol/proccontrol.c	(revision 306283)
@@ -0,0 +1,180 @@
+/*-
+ * Copyright (c) 2016 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/procctl.h>
+#include <err.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+enum {
+	MODE_INVALID,
+	MODE_TRACE,
+	MODE_TRAPCAP,
+};
+
+static pid_t
+str2pid(const char *str)
+{
+	pid_t res;
+	char *tail;
+
+	res = strtol(str, &tail, 0);
+	if (*tail != '\0') {
+		warnx("non-numeric pid");
+		return (-1);
+	}
+	return (res);
+}
+
+static void __dead2
+usage(void)
+{
+
+	fprintf(stderr, "Usage: proccontrol -m (trace|trapcap) [-q] "
+	    "[-s (enable|disable)] [-p pid | command]\n");
+	exit(1);
+}
+
+int
+main(int argc, char *argv[])
+{
+	int arg, ch, error, mode;
+	pid_t pid;
+	bool enable, do_command, query;
+
+	mode = MODE_INVALID;
+	enable = true;
+	pid = -1;
+	query = false;
+	while ((ch = getopt(argc, argv, "m:qs:p:")) != -1) {
+		switch (ch) {
+		case 'm':
+			if (strcmp(optarg, "trace") == 0)
+				mode = MODE_TRACE;
+			else if (strcmp(optarg, "trapcap") == 0)
+				mode = MODE_TRAPCAP;
+			else
+				usage();
+			break;
+		case 's':
+			if (strcmp(optarg, "enable") == 0)
+				enable = true;
+			else if (strcmp(optarg, "disable") == 0)
+				enable = false;
+			else
+				usage();
+			break;
+		case 'p':
+			pid = str2pid(optarg);
+			break;
+		case 'q':
+			query = true;
+			break;
+		case '?':
+		default:
+			usage();
+			break;
+		}
+	}
+	argc -= optind;
+	argv += optind;
+	do_command = argc != 0;
+	if (do_command) {
+		if (pid != -1 || query)
+			usage();
+		pid = getpid();
+	} else if (pid == -1) {
+		pid = getpid();
+	}
+
+	if (query) {
+		switch (mode) {
+		case MODE_TRACE:
+			error = procctl(P_PID, pid, PROC_TRACE_STATUS, &arg);
+			break;
+		case MODE_TRAPCAP:
+			error = procctl(P_PID, pid, PROC_TRAPCAP_STATUS, &arg);
+			break;
+		default:
+			usage();
+			break;
+		}
+		if (error != 0)
+			err(1, "procctl status");
+		switch (mode) {
+		case MODE_TRACE:
+			if (arg == -1)
+				printf("disabled\n");
+			else if (arg == 0)
+				printf("enabled, no debugger\n");
+			else
+				printf("enabled, traced by %d\n", arg);
+			break;
+		case MODE_TRAPCAP:
+			switch (arg) {
+			case PROC_TRAPCAP_CTL_ENABLE:
+				printf("enabled\n");
+				break;
+			case PROC_TRAPCAP_CTL_DISABLE:
+				printf("disabled\n");
+				break;
+			}
+			break;
+		}
+	} else {
+		switch (mode) {
+		case MODE_TRACE:
+			arg = enable ? PROC_TRACE_CTL_ENABLE :
+			    PROC_TRACE_CTL_DISABLE;
+			error = procctl(P_PID, pid, PROC_TRACE_CTL, &arg);
+			break;
+		case MODE_TRAPCAP:
+			arg = enable ? PROC_TRAPCAP_CTL_ENABLE :
+			    PROC_TRAPCAP_CTL_DISABLE;
+			error = procctl(P_PID, pid, PROC_TRAPCAP_CTL, &arg);
+			break;
+		default:
+			usage();
+			break;
+		}
+		if (error != 0)
+			err(1, "procctl ctl");
+		if (do_command) {
+			error = execvp(argv[0], argv);
+			err(1, "exec");
+		}
+	}
+	exit(0);
+}

Property changes on: user/alc/PQ_LAUNDRY/usr.bin/proccontrol/proccontrol.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: user/alc/PQ_LAUNDRY
===================================================================
--- user/alc/PQ_LAUNDRY	(revision 306282)
+++ user/alc/PQ_LAUNDRY	(revision 306283)

Property changes on: user/alc/PQ_LAUNDRY
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r306221-306282