Index: projects/fuse2/include/stdlib.h
===================================================================
--- projects/fuse2/include/stdlib.h	(revision 350434)
+++ projects/fuse2/include/stdlib.h	(revision 350435)
@@ -1,347 +1,348 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)stdlib.h	8.5 (Berkeley) 5/19/95
  * $FreeBSD$
  */
 
 #ifndef _STDLIB_H_
 #define	_STDLIB_H_
 
 #include <sys/cdefs.h>
 #include <sys/_null.h>
 #include <sys/_types.h>
 
 __NULLABILITY_PRAGMA_PUSH
 
 #if __BSD_VISIBLE
 #ifndef _RUNE_T_DECLARED
 typedef	__rune_t	rune_t;
 #define	_RUNE_T_DECLARED
 #endif
 #endif
 
 #ifndef _SIZE_T_DECLARED
 typedef	__size_t	size_t;
 #define	_SIZE_T_DECLARED
 #endif
 
 #ifndef	__cplusplus
 #ifndef _WCHAR_T_DECLARED
 typedef	___wchar_t	wchar_t;
 #define	_WCHAR_T_DECLARED
 #endif
 #endif
 
 typedef struct {
 	int	quot;		/* quotient */
 	int	rem;		/* remainder */
 } div_t;
 
 typedef struct {
 	long	quot;
 	long	rem;
 } ldiv_t;
 
 #define	EXIT_FAILURE	1
 #define	EXIT_SUCCESS	0
 
 #define	RAND_MAX	0x7ffffffd
 
 __BEGIN_DECLS
 #ifdef _XLOCALE_H_
 #include <xlocale/_stdlib.h>
 #endif
 extern int __mb_cur_max;
 extern int ___mb_cur_max(void);
 #define	MB_CUR_MAX	((size_t)___mb_cur_max())
 
 _Noreturn void	 abort(void);
 int	 abs(int) __pure2;
 int	 atexit(void (* _Nonnull)(void));
 double	 atof(const char *);
 int	 atoi(const char *);
 long	 atol(const char *);
 void	*bsearch(const void *, const void *, size_t,
 	    size_t, int (*)(const void * _Nonnull, const void *));
 void	*calloc(size_t, size_t) __malloc_like __result_use_check
 	     __alloc_size2(1, 2);
 div_t	 div(int, int) __pure2;
 _Noreturn void	 exit(int);
 void	 free(void *);
 char	*getenv(const char *);
 long	 labs(long) __pure2;
 ldiv_t	 ldiv(long, long) __pure2;
 void	*malloc(size_t) __malloc_like __result_use_check __alloc_size(1);
 int	 mblen(const char *, size_t);
 size_t	 mbstowcs(wchar_t * __restrict , const char * __restrict, size_t);
 int	 mbtowc(wchar_t * __restrict, const char * __restrict, size_t);
 void	 qsort(void *, size_t, size_t,
 	    int (* _Nonnull)(const void *, const void *));
 int	 rand(void);
 void	*realloc(void *, size_t) __result_use_check __alloc_size(2);
 void	 srand(unsigned);
 double	 strtod(const char * __restrict, char ** __restrict);
 float	 strtof(const char * __restrict, char ** __restrict);
 long	 strtol(const char * __restrict, char ** __restrict, int);
 long double
 	 strtold(const char * __restrict, char ** __restrict);
 unsigned long
 	 strtoul(const char * __restrict, char ** __restrict, int);
 int	 system(const char *);
 int	 wctomb(char *, wchar_t);
 size_t	 wcstombs(char * __restrict, const wchar_t * __restrict, size_t);
 
 /*
  * Functions added in C99 which we make conditionally available in the
  * BSD^C89 namespace if the compiler supports `long long'.
  * The #if test is more complicated than it ought to be because
  * __BSD_VISIBLE implies __ISO_C_VISIBLE == 1999 *even if* `long long'
  * is not supported in the compilation environment (which therefore means
  * that it can't really be ISO C99).
  *
- * (The only other extension made by C99 in thie header is _Exit().)
+ * (The only other extension made by C99 in this header is _Exit().)
  */
 #if __ISO_C_VISIBLE >= 1999 || defined(__cplusplus)
 #ifdef __LONG_LONG_SUPPORTED
 /* LONGLONG */
 typedef struct {
 	long long quot;
 	long long rem;
 } lldiv_t;
 
 /* LONGLONG */
 long long
 	 atoll(const char *);
 /* LONGLONG */
 long long
 	 llabs(long long) __pure2;
 /* LONGLONG */
 lldiv_t	 lldiv(long long, long long) __pure2;
 /* LONGLONG */
 long long
 	 strtoll(const char * __restrict, char ** __restrict, int);
 /* LONGLONG */
 unsigned long long
 	 strtoull(const char * __restrict, char ** __restrict, int);
 #endif /* __LONG_LONG_SUPPORTED */
 
 _Noreturn void	 _Exit(int);
 #endif /* __ISO_C_VISIBLE >= 1999 */
 
 /*
  * If we're in a mode greater than C99, expose C11 functions.
  */
 #if __ISO_C_VISIBLE >= 2011 || __cplusplus >= 201103L
 void *	aligned_alloc(size_t, size_t) __malloc_like __alloc_align(1)
 	    __alloc_size(2);
 int	at_quick_exit(void (*)(void));
 _Noreturn void
 	quick_exit(int);
 #endif /* __ISO_C_VISIBLE >= 2011 */
 /*
  * Extensions made by POSIX relative to C.
  */
 #if __POSIX_VISIBLE >= 199506 || __XSI_VISIBLE
 char	*realpath(const char * __restrict, char * __restrict);
 #endif
 #if __POSIX_VISIBLE >= 199506
 int	 rand_r(unsigned *);			/* (TSF) */
 #endif
 #if __POSIX_VISIBLE >= 200112
 int	 posix_memalign(void **, size_t, size_t); /* (ADV) */
 int	 setenv(const char *, const char *, int);
 int	 unsetenv(const char *);
 #endif
 
 #if __POSIX_VISIBLE >= 200809 || __XSI_VISIBLE
 int	 getsubopt(char **, char *const *, char **);
 #ifndef _MKDTEMP_DECLARED
 char	*mkdtemp(char *);
 #define	_MKDTEMP_DECLARED
 #endif
 #ifndef _MKSTEMP_DECLARED
 int	 mkstemp(char *);
 #define	_MKSTEMP_DECLARED
 #endif
 #endif /* __POSIX_VISIBLE >= 200809 || __XSI_VISIBLE */
 
 /*
  * The only changes to the XSI namespace in revision 6 were the deletion
  * of the ttyslot() and valloc() functions, which FreeBSD never declared
  * in this header.  For revision 7, ecvt(), fcvt(), and gcvt(), which
  * FreeBSD also does not have, and mktemp(), are to be deleted.
  */
 #if __XSI_VISIBLE
 /* XXX XSI requires pollution from <sys/wait.h> here.  We'd rather not. */
 long	 a64l(const char *);
 double	 drand48(void);
 /* char	*ecvt(double, int, int * __restrict, int * __restrict); */
 double	 erand48(unsigned short[3]);
 /* char	*fcvt(double, int, int * __restrict, int * __restrict); */
 /* char	*gcvt(double, int, int * __restrict, int * __restrict); */
 int	 grantpt(int);
 char	*initstate(unsigned int, char *, size_t);
 long	 jrand48(unsigned short[3]);
 char	*l64a(long);
 void	 lcong48(unsigned short[7]);
 long	 lrand48(void);
 #if !defined(_MKTEMP_DECLARED) && (__BSD_VISIBLE || __XSI_VISIBLE <= 600)
 char	*mktemp(char *);
 #define	_MKTEMP_DECLARED
 #endif
 long	 mrand48(void);
 long	 nrand48(unsigned short[3]);
 int	 posix_openpt(int);
 char	*ptsname(int);
 int	 putenv(char *);
 long	 random(void);
 unsigned short
 	*seed48(unsigned short[3]);
 char	*setstate(/* const */ char *);
 void	 srand48(long);
 void	 srandom(unsigned int);
 int	 unlockpt(int);
 #endif /* __XSI_VISIBLE */
 
 #if __BSD_VISIBLE
 extern const char *malloc_conf;
 extern void (*malloc_message)(void *, const char *);
 
 /*
  * The alloca() function can't be implemented in C, and on some
  * platforms it can't be implemented at all as a callable function.
  * The GNU C compiler provides a built-in alloca() which we can use.
  * On platforms where alloca() is not in libc, programs which use it
  * will fail to link when compiled with non-GNU compilers.
  */
 #if __GNUC__ >= 2 || defined(__INTEL_COMPILER)
 #undef  alloca	/* some GNU bits try to get cute and define this on their own */
 #define alloca(sz) __builtin_alloca(sz)
 #endif
 
 void	 abort2(const char *, int, void **) __dead2;
 __uint32_t
 	 arc4random(void);
 void	 arc4random_buf(void *, size_t);
 __uint32_t 
 	 arc4random_uniform(__uint32_t);
 
 #ifdef __BLOCKS__
 int	 atexit_b(void (^ _Nonnull)(void));
 void	*bsearch_b(const void *, const void *, size_t,
 	    size_t, int (^ _Nonnull)(const void *, const void *));
 #endif
 char	*getbsize(int *, long *);
 					/* getcap(3) functions */
 char	*cgetcap(char *, const char *, int);
 int	 cgetclose(void);
 int	 cgetent(char **, char **, const char *);
 int	 cgetfirst(char **, char **);
 int	 cgetmatch(const char *, const char *);
 int	 cgetnext(char **, char **);
 int	 cgetnum(char *, const char *, long *);
 int	 cgetset(const char *);
 int	 cgetstr(char *, const char *, char **);
 int	 cgetustr(char *, const char *, char **);
 
 int	 daemon(int, int);
 int	 daemonfd(int, int);
 char	*devname(__dev_t, __mode_t);
 char	*devname_r(__dev_t, __mode_t, char *, int);
 char	*fdevname(int);
 char	*fdevname_r(int, char *, int);
 int	 getloadavg(double [], int);
 const char *
 	 getprogname(void);
 
 int	 heapsort(void *, size_t, size_t,
 	    int (* _Nonnull)(const void *, const void *));
 #ifdef __BLOCKS__
 int	 heapsort_b(void *, size_t, size_t,
 	    int (^ _Nonnull)(const void *, const void *));
 void	 qsort_b(void *, size_t, size_t,
 	    int (^ _Nonnull)(const void *, const void *));
 #endif
 int	 l64a_r(long, char *, int);
 int	 mergesort(void *, size_t, size_t, int (*)(const void *, const void *));
 #ifdef __BLOCKS__
 int	 mergesort_b(void *, size_t, size_t, int (^)(const void *, const void *));
 #endif
 int	 mkostemp(char *, int);
 int	 mkostemps(char *, int, int);
+int	 mkostempsat(int, char *, int, int);
 void	 qsort_r(void *, size_t, size_t, void *,
 	    int (*)(void *, const void *, const void *));
 int	 radixsort(const unsigned char **, int, const unsigned char *,
 	    unsigned);
 void	*reallocarray(void *, size_t, size_t) __result_use_check
 	    __alloc_size2(2, 3);
 void	*reallocf(void *, size_t) __result_use_check __alloc_size(2);
 int	 rpmatch(const char *);
 void	 setprogname(const char *);
 int	 sradixsort(const unsigned char **, int, const unsigned char *,
 	    unsigned);
 void	 sranddev(void);
 void	 srandomdev(void);
 long long
 	strtonum(const char *, long long, long long, const char **);
 
 /* Deprecated interfaces, to be removed. */
 __int64_t
 	 strtoq(const char *, char **, int);
 __uint64_t
 	 strtouq(const char *, char **, int);
 
 extern char *suboptarg;			/* getsubopt(3) external variable */
 #endif /* __BSD_VISIBLE */
 
 #if __EXT1_VISIBLE
 
 #ifndef _ERRNO_T_DEFINED
 #define _ERRNO_T_DEFINED
 typedef int errno_t;
 #endif
 
 /* K.3.6 */
 typedef void (*constraint_handler_t)(const char * __restrict,
     void * __restrict, errno_t);
 /* K.3.6.1.1 */
 constraint_handler_t set_constraint_handler_s(constraint_handler_t handler);
 /* K.3.6.1.2 */
 _Noreturn void abort_handler_s(const char * __restrict, void * __restrict,
     errno_t);
-/* K3.6.1.3 */
+/* K.3.6.1.3 */
 void ignore_handler_s(const char * __restrict, void * __restrict, errno_t);
 #endif /* __EXT1_VISIBLE */
 
 __END_DECLS
 __NULLABILITY_PRAGMA_POP
 
 #endif /* !_STDLIB_H_ */
Index: projects/fuse2/lib/libarchive/tests/Makefile
===================================================================
--- projects/fuse2/lib/libarchive/tests/Makefile	(revision 350434)
+++ projects/fuse2/lib/libarchive/tests/Makefile	(revision 350435)
@@ -1,629 +1,628 @@
 # $FreeBSD$
 
 PACKAGE=	tests
 
 _LIBARCHIVEDIR=	${SRCTOP}/contrib/libarchive
 
 ATF_TESTS_SH+=	functional_test
 
 TEST_METADATA.functional_test+=	timeout="600"
 
 BINDIR=	${TESTSDIR}
 
 PROGS+=	libarchive_test
 
 CFLAGS+= -I${.CURDIR} -I${.CURDIR:H} -I${.OBJDIR}
 CFLAGS+= -I${_LIBARCHIVEDIR}/libarchive -I${_LIBARCHIVEDIR}/libarchive/test
 CFLAGS+= -I${_LIBARCHIVEDIR}/test_utils
 CFLAGS+= -DHAVE_LIBLZMA=1 -DHAVE_LZMA_H=1
 
 # Uncomment to link against dmalloc
 #LDADD+= -L/usr/local/lib -ldmalloc
 #CFLAGS+= -I/usr/local/include -DUSE_DMALLOC
 
 .PATH: ${_LIBARCHIVEDIR}/libarchive/test
 TESTS_SRCS= \
 	test_acl_nfs4.c				\
 	test_acl_pax.c				\
 	test_acl_platform_nfs4.c		\
 	test_acl_platform_posix1e.c		\
 	test_acl_posix1e.c			\
 	test_acl_text.c				\
 	test_archive_api_feature.c		\
 	test_archive_clear_error.c		\
 	test_archive_cmdline.c			\
 	test_archive_digest.c			\
 	test_archive_getdate.c			\
 	test_archive_match_time.c		\
 	test_archive_match_owner.c		\
 	test_archive_match_path.c		\
 	test_archive_pathmatch.c		\
 	test_archive_read_add_passphrase.c	\
 	test_archive_read_close_twice.c		\
 	test_archive_read_close_twice_open_fd.c	\
 	test_archive_read_close_twice_open_filename.c	\
 	test_archive_read_multiple_data_objects.c	\
 	test_archive_read_next_header_empty.c	\
 	test_archive_read_next_header_raw.c	\
 	test_archive_read_open2.c		\
 	test_archive_read_set_filter_option.c	\
 	test_archive_read_set_format_option.c	\
 	test_archive_read_set_option.c		\
 	test_archive_read_set_options.c		\
 	test_archive_read_support.c		\
 	test_archive_set_error.c		\
 	test_archive_string.c			\
 	test_archive_string_conversion.c	\
 	test_archive_write_add_filter_by_name.c	\
 	test_archive_write_set_filter_option.c	\
 	test_archive_write_set_format_by_name.c	\
 	test_archive_write_set_format_filter_by_ext.c \
 	test_archive_write_set_format_option.c	\
 	test_archive_write_set_option.c		\
 	test_archive_write_set_options.c	\
 	test_archive_write_set_passphrase.c	\
 	test_bad_fd.c				\
 	test_compat_bzip2.c			\
 	test_compat_cpio.c			\
 	test_compat_gtar.c			\
 	test_compat_gzip.c			\
 	test_compat_lz4.c			\
 	test_compat_lzip.c			\
 	test_compat_lzma.c			\
 	test_compat_lzop.c			\
 	test_compat_mac.c			\
 	test_compat_perl_archive_tar.c		\
 	test_compat_plexus_archiver_tar.c	\
 	test_compat_solaris_tar_acl.c		\
 	test_compat_solaris_pax_sparse.c	\
 	test_compat_star_acl.c			\
 	test_compat_tar_hardlink.c		\
 	test_compat_uudecode.c			\
 	test_compat_uudecode_large.c		\
 	test_compat_xz.c			\
 	test_compat_zip.c			\
 	test_compat_zstd.c			\
 	test_empty_write.c			\
 	test_entry.c				\
 	test_entry_strmode.c			\
 	test_extattr_freebsd.c			\
 	test_filter_count.c			\
 	test_fuzz.c				\
 	test_gnutar_filename_encoding.c		\
 	test_link_resolver.c			\
 	test_open_fd.c				\
 	test_open_failure.c			\
 	test_open_file.c			\
 	test_open_filename.c			\
 	test_pax_filename_encoding.c		\
 	test_read_data_large.c			\
 	test_read_disk.c			\
 	test_read_disk_directory_traversals.c	\
 	test_read_disk_entry_from_file.c	\
 	test_read_extract.c			\
 	test_read_file_nonexistent.c		\
 	test_read_filter_compress.c		\
 	test_read_filter_grzip.c		\
 	test_read_filter_lrzip.c		\
 	test_read_filter_lzop.c			\
 	test_read_filter_lzop_multiple_parts.c	\
 	test_read_filter_program.c		\
 	test_read_filter_program_signature.c	\
 	test_read_filter_uudecode.c		\
 	test_read_format_7zip.c			\
 	test_read_format_7zip_encryption_data.c \
 	test_read_format_7zip_encryption_header.c	\
 	test_read_format_7zip_encryption_partially.c	\
 	test_read_format_7zip_malformed.c	\
 	test_read_format_ar.c			\
 	test_read_format_cab.c			\
 	test_read_format_cab_filename.c		\
 	test_read_format_cpio_afio.c		\
 	test_read_format_cpio_bin.c		\
 	test_read_format_cpio_bin_Z.c		\
 	test_read_format_cpio_bin_be.c		\
 	test_read_format_cpio_bin_bz2.c		\
 	test_read_format_cpio_bin_gz.c		\
 	test_read_format_cpio_bin_le.c		\
 	test_read_format_cpio_bin_lzip.c	\
 	test_read_format_cpio_bin_lzma.c	\
 	test_read_format_cpio_bin_xz.c		\
 	test_read_format_cpio_filename.c	\
 	test_read_format_cpio_odc.c		\
 	test_read_format_cpio_svr4_gzip.c	\
 	test_read_format_cpio_svr4c_Z.c		\
 	test_read_format_cpio_svr4_bzip2_rpm.c	\
 	test_read_format_cpio_svr4_gzip_rpm.c	\
 	test_read_format_empty.c		\
 	test_read_format_gtar_filename.c	\
 	test_read_format_gtar_gz.c		\
 	test_read_format_gtar_lzma.c		\
 	test_read_format_gtar_sparse.c		\
 	test_read_format_gtar_sparse_skip_entry.c \
 	test_read_format_iso_Z.c		\
 	test_read_format_iso_multi_extent.c	\
 	test_read_format_iso_xorriso.c		\
 	test_read_format_isorr_rr_moved.c	\
 	test_read_format_isojoliet_bz2.c	\
 	test_read_format_isojoliet_long.c	\
 	test_read_format_isojoliet_rr.c		\
 	test_read_format_isojoliet_versioned.c	\
 	test_read_format_isorr_bz2.c		\
 	test_read_format_isorr_ce.c		\
 	test_read_format_isorr_new_bz2.c	\
 	test_read_format_isozisofs_bz2.c	\
 	test_read_format_lha.c			\
 	test_read_format_lha_bugfix_0.c		\
 	test_read_format_lha_filename.c		\
 	test_read_format_mtree.c		\
 	test_read_format_mtree_crash747.c	\
 	test_read_format_pax_bz2.c		\
 	test_read_format_rar.c			\
 	test_read_format_rar5.c			\
 	test_read_format_rar_encryption_data.c	\
 	test_read_format_rar_encryption_header.c	\
 	test_read_format_rar_encryption_partially.c	\
 	test_read_format_rar_invalid1.c		\
 	test_read_format_raw.c			\
 	test_read_format_tar.c			\
 	test_read_format_tar_concatenated.c	\
 	test_read_format_tar_empty_filename.c	\
 	test_read_format_tar_empty_pax.c	\
 	test_read_format_tar_empty_with_gnulabel.c	\
 	test_read_format_tar_filename.c		\
 	test_read_format_tbz.c			\
 	test_read_format_tgz.c			\
 	test_read_format_tlz.c			\
 	test_read_format_txz.c			\
 	test_read_format_tz.c			\
 	test_read_format_ustar_filename.c	\
 	test_read_format_warc.c			\
 	test_read_format_xar.c			\
 	test_read_format_zip.c			\
 	test_read_format_zip_7075_utf8_paths.c	\
 	test_read_format_zip_comment_stored.c	\
 	test_read_format_zip_encryption_data.c	\
 	test_read_format_zip_encryption_header.c	\
 	test_read_format_zip_encryption_partially.c	\
 	test_read_format_zip_extra_padding.c	\
 	test_read_format_zip_filename.c		\
 	test_read_format_zip_high_compression.c	\
 	test_read_format_zip_jar.c		\
 	test_read_format_zip_mac_metadata.c	\
 	test_read_format_zip_malformed.c	\
 	test_read_format_zip_msdos.c		\
 	test_read_format_zip_nested.c		\
 	test_read_format_zip_nofiletype.c	\
 	test_read_format_zip_padded.c		\
 	test_read_format_zip_sfx.c		\
 	test_read_format_zip_traditional_encryption_data.c	\
 	test_read_format_zip_winzip_aes.c	\
 	test_read_format_zip_winzip_aes_large.c	\
 	test_read_format_zip_with_invalid_traditional_eocd.c	\
 	test_read_format_zip_zip64.c		\
 	test_read_large.c			\
 	test_read_pax_schily_xattr.c		\
 	test_read_pax_truncated.c		\
 	test_read_position.c			\
 	test_read_set_format.c			\
 	test_read_too_many_filters.c		\
 	test_read_truncated.c			\
 	test_read_truncated_filter.c		\
 	test_sparse_basic.c			\
 	test_tar_filenames.c			\
 	test_tar_large.c			\
 	test_warn_missing_hardlink_target.c	\
 	test_ustar_filenames.c			\
 	test_ustar_filename_encoding.c		\
 	test_write_disk.c			\
 	test_write_disk_appledouble.c		\
 	test_write_disk_failures.c		\
 	test_write_disk_hardlink.c		\
 	test_write_disk_hfs_compression.c	\
 	test_write_disk_lookup.c		\
 	test_write_disk_mac_metadata.c		\
 	test_write_disk_no_hfs_compression.c	\
 	test_write_disk_perms.c			\
 	test_write_disk_secure.c		\
 	test_write_disk_secure744.c		\
 	test_write_disk_secure745.c		\
 	test_write_disk_secure746.c		\
 	test_write_disk_sparse.c		\
 	test_write_disk_symlink.c		\
 	test_write_disk_times.c			\
 	test_write_filter_b64encode.c		\
 	test_write_filter_bzip2.c		\
 	test_write_filter_compress.c		\
 	test_write_filter_gzip.c		\
 	test_write_filter_gzip_timestamp.c	\
 	test_write_filter_lrzip.c		\
 	test_write_filter_lz4.c			\
 	test_write_filter_lzip.c		\
 	test_write_filter_lzma.c		\
 	test_write_filter_lzop.c		\
 	test_write_filter_program.c		\
 	test_write_filter_uuencode.c		\
 	test_write_filter_xz.c			\
 	test_write_filter_zstd.c		\
 	test_write_format_7zip.c		\
 	test_write_format_7zip_empty.c		\
 	test_write_format_7zip_large.c		\
 	test_write_format_ar.c			\
 	test_write_format_cpio.c		\
 	test_write_format_cpio_empty.c		\
 	test_write_format_cpio_newc.c		\
 	test_write_format_cpio_odc.c		\
 	test_write_format_gnutar.c		\
 	test_write_format_gnutar_filenames.c	\
 	test_write_format_iso9660.c		\
 	test_write_format_iso9660_boot.c	\
 	test_write_format_iso9660_empty.c	\
 	test_write_format_iso9660_filename.c	\
 	test_write_format_iso9660_zisofs.c	\
 	test_write_format_mtree.c		\
 	test_write_format_mtree_absolute_path.c	\
 	test_write_format_mtree_classic.c	\
 	test_write_format_mtree_classic_indent.c	\
 	test_write_format_mtree_fflags.c	\
 	test_write_format_mtree_no_separator.c	\
 	test_write_format_mtree_quoted_filename.c	\
 	test_write_format_pax.c			\
 	test_write_format_raw.c			\
 	test_write_format_raw_b64.c		\
 	test_write_format_shar_empty.c		\
 	test_write_format_tar.c			\
 	test_write_format_tar_empty.c		\
 	test_write_format_tar_sparse.c		\
 	test_write_format_tar_ustar.c		\
 	test_write_format_tar_v7tar.c		\
 	test_write_format_warc.c		\
 	test_write_format_warc_empty.c		\
 	test_write_format_xar.c			\
 	test_write_format_xar_empty.c		\
 	test_write_format_zip.c			\
 	test_write_format_zip_compression_store.c	\
 	test_write_format_zip_empty.c		\
 	test_write_format_zip_empty_zip64.c	\
 	test_write_format_zip_file.c		\
 	test_write_format_zip_file_zip64.c	\
 	test_write_format_zip_large.c		\
 	test_write_format_zip_zip64.c		\
 	test_write_open_memory.c		\
 	test_write_read_format_zip.c		\
 	test_xattr_platform.c			\
 	test_zip_filename_encoding.c
 
 # Deterministic failures:
 # Crashes with SIGBUS
 BROKEN_TESTS+=			test_archive_rmd160
 # Fails with `libarchive/test/test_archive_crypto.c:121: md != actualmd`
 BROKEN_TESTS+=			test_archive_sha384
 # Fails with `test_read_disk_directory_traversals.c:1094: File at has atime 886622, 1443306049 seconds ago`
 BROKEN_TESTS+=			test_read_disk_directory_traversals
 
 # Non-deterministic failures:
 # (Times out?) [and] crashes
 BROKEN_TESTS+=			test_fuzz_rar
 
 # Build the test program.
 SRCS.libarchive_test=		\
 	${TESTS_SRCS}		\
 	read_open_memory.c	\
 	list.h
 
 LIBADD.libarchive_test=	archive
 
 .PATH: ${_LIBARCHIVEDIR}/test_utils
 SRCS.libarchive_test+=	test_main.c	\
 			test_utils.c
 
 # list.h is just a list of all tests, as indicated by DEFINE_TEST macro lines
 list.h: ${TESTS_SRCS} Makefile
 	@(cd ${_LIBARCHIVEDIR}/libarchive/test && \
 	grep -E -h ^DEFINE_TEST ${.ALLSRC:N*Makefile} | \
 	    egrep -v '${BROKEN_TESTS:tW:C/ /|/g}') > ${.TARGET}.tmp
 	@mv ${.TARGET}.tmp ${.TARGET}
 
 CLEANTESTS+=	list.h list.h.tmp
 ${PACKAGE}FILES+=	README
 ${PACKAGE}FILES+=	test_acl_pax_posix1e.tar.uu
 ${PACKAGE}FILES+=	test_acl_pax_nfs4.tar.uu
 ${PACKAGE}FILES+=	test_archive_string_conversion.txt.Z.uu
 ${PACKAGE}FILES+=	test_compat_bzip2_1.tbz.uu
 ${PACKAGE}FILES+=	test_compat_bzip2_2.tbz.uu
 ${PACKAGE}FILES+=	test_compat_cpio_1.cpio.uu
 ${PACKAGE}FILES+=	test_compat_gtar_1.tar.uu
 ${PACKAGE}FILES+=	test_compat_gtar_2.tar.uu
 ${PACKAGE}FILES+=	test_compat_gzip_1.tgz.uu
 ${PACKAGE}FILES+=	test_compat_gzip_2.tgz.uu
 ${PACKAGE}FILES+=	test_compat_lz4_1.tar.lz4.uu
 ${PACKAGE}FILES+=	test_compat_lz4_2.tar.lz4.uu
 ${PACKAGE}FILES+=	test_compat_lz4_3.tar.lz4.uu
 ${PACKAGE}FILES+=	test_compat_lz4_B4.tar.lz4.uu
 ${PACKAGE}FILES+=	test_compat_lz4_B4BD.tar.lz4.uu
 ${PACKAGE}FILES+=	test_compat_lz4_B4BDBX.tar.lz4.uu
 ${PACKAGE}FILES+=	test_compat_lz4_B5.tar.lz4.uu
 ${PACKAGE}FILES+=	test_compat_lz4_B5BD.tar.lz4.uu
 ${PACKAGE}FILES+=	test_compat_lz4_B6.tar.lz4.uu
 ${PACKAGE}FILES+=	test_compat_lz4_B6BD.tar.lz4.uu
 ${PACKAGE}FILES+=	test_compat_lz4_B7.tar.lz4.uu
 ${PACKAGE}FILES+=	test_compat_lz4_B7BD.tar.lz4.uu
 ${PACKAGE}FILES+=	test_compat_lzip_1.tlz.uu
 ${PACKAGE}FILES+=	test_compat_lzip_2.tlz.uu
 ${PACKAGE}FILES+=	test_compat_lzma_1.tlz.uu
 ${PACKAGE}FILES+=	test_compat_lzma_2.tlz.uu
 ${PACKAGE}FILES+=	test_compat_lzma_3.tlz.uu
 ${PACKAGE}FILES+=	test_compat_lzop_1.tar.lzo.uu
 ${PACKAGE}FILES+=	test_compat_lzop_2.tar.lzo.uu
 ${PACKAGE}FILES+=	test_compat_lzop_3.tar.lzo.uu
 ${PACKAGE}FILES+=	test_compat_mac-1.tar.Z.uu
 ${PACKAGE}FILES+=	test_compat_mac-2.tar.Z.uu
 ${PACKAGE}FILES+=	test_compat_perl_archive_tar.tar.uu
 ${PACKAGE}FILES+=	test_compat_plexus_archiver_tar.tar.uu
 ${PACKAGE}FILES+=	test_compat_solaris_pax_sparse_1.pax.Z.uu
 ${PACKAGE}FILES+=	test_compat_solaris_pax_sparse_2.pax.Z.uu
 ${PACKAGE}FILES+=	test_compat_solaris_tar_acl.tar.uu
 ${PACKAGE}FILES+=	test_compat_star_acl_nfs4.tar.uu
 ${PACKAGE}FILES+=	test_compat_star_acl_posix1e.tar.uu
 ${PACKAGE}FILES+=	test_compat_tar_hardlink_1.tar.uu
 ${PACKAGE}FILES+=	test_compat_uudecode_large.tar.Z.uu
 ${PACKAGE}FILES+=	test_compat_xz_1.txz.uu
 ${PACKAGE}FILES+=	test_compat_zip_1.zip.uu
 ${PACKAGE}FILES+=	test_compat_zip_2.zip.uu
 ${PACKAGE}FILES+=	test_compat_zip_3.zip.uu
 ${PACKAGE}FILES+=	test_compat_zip_4.zip.uu
 ${PACKAGE}FILES+=	test_compat_zip_5.zip.uu
 ${PACKAGE}FILES+=	test_compat_zip_6.zip.uu
 ${PACKAGE}FILES+=	test_compat_zip_7.xps.uu
 ${PACKAGE}FILES+=	test_compat_zip_8.zip.uu
 ${PACKAGE}FILES+=	test_compat_zstd_1.tar.zst.uu
 ${PACKAGE}FILES+=	test_fuzz.cab.uu
 ${PACKAGE}FILES+=	test_fuzz.lzh.uu
 ${PACKAGE}FILES+=	test_fuzz_1.iso.Z.uu
 ${PACKAGE}FILES+=	test_pax_filename_encoding.tar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_multiple_files.part1.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_multiple_files.part2.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_multiple_files.part3.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_multiple_files.part4.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_multiple_files.part5.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_multiple_files.part6.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_single_file.part1.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_single_file.part2.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_single_file.part3.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_uncompressed_files.part01.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_uncompressed_files.part02.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_uncompressed_files.part03.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_uncompressed_files.part04.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_uncompressed_files.part05.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_uncompressed_files.part06.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_uncompressed_files.part07.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_uncompressed_files.part08.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_uncompressed_files.part09.rar.uu
 ${PACKAGE}FILES+=	test_rar_multivolume_uncompressed_files.part10.rar.uu
 ${PACKAGE}FILES+=	test_read_filter_grzip.tar.grz.uu
 ${PACKAGE}FILES+=	test_read_filter_lrzip.tar.lrz.uu
 ${PACKAGE}FILES+=	test_read_filter_lzop.tar.lzo.uu
 ${PACKAGE}FILES+=	test_read_filter_lzop_multiple_parts.tar.lzo.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj2_bzip2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj2_copy_1.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj2_copy_2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj2_copy_lzma.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj2_deflate.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj2_lzma1_1.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj2_lzma1_2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj2_lzma2_1.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj2_lzma2_2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj_bzip2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj_copy.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj_deflate.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj_lzma1.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bcj_lzma2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_bzip2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_copy.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_copy_2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_deflate.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_delta_lzma1.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_delta_lzma2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_empty_archive.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_empty_file.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_encryption.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_encryption_header.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_encryption_partially.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_lzma1.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_lzma1_2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_lzma1_lzma2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_lzma2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_malformed.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_malformed2.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_ppmd.7z.uu
 ${PACKAGE}FILES+=	test_read_format_7zip_symbolic_name.7z.uu
 ${PACKAGE}FILES+=	test_read_format_ar.ar.uu
 ${PACKAGE}FILES+=	test_read_format_cab_1.cab.uu
 ${PACKAGE}FILES+=	test_read_format_cab_2.cab.uu
 ${PACKAGE}FILES+=	test_read_format_cab_3.cab.uu
 ${PACKAGE}FILES+=	test_read_format_cab_filename_cp932.cab.uu
 ${PACKAGE}FILES+=	test_read_format_cpio_bin_be.cpio.uu
 ${PACKAGE}FILES+=	test_read_format_cpio_bin_le.cpio.uu
 ${PACKAGE}FILES+=	test_read_format_cpio_filename_cp866.cpio.uu
 ${PACKAGE}FILES+=	test_read_format_cpio_filename_eucjp.cpio.uu
 ${PACKAGE}FILES+=	test_read_format_cpio_filename_koi8r.cpio.uu
 ${PACKAGE}FILES+=	test_read_format_cpio_filename_utf8_jp.cpio.uu
 ${PACKAGE}FILES+=	test_read_format_cpio_filename_utf8_ru.cpio.uu
 ${PACKAGE}FILES+=	test_read_format_cpio_svr4_bzip2_rpm.rpm.uu
 ${PACKAGE}FILES+=	test_read_format_cpio_svr4_gzip_rpm.rpm.uu
 ${PACKAGE}FILES+=	test_read_format_gtar_filename_cp866.tar.Z.uu
 ${PACKAGE}FILES+=	test_read_format_gtar_filename_eucjp.tar.Z.uu
 ${PACKAGE}FILES+=	test_read_format_gtar_filename_koi8r.tar.Z.uu
 ${PACKAGE}FILES+=	test_read_format_gtar_sparse_1_13.tar.uu
 ${PACKAGE}FILES+=	test_read_format_gtar_sparse_1_17.tar.uu
 ${PACKAGE}FILES+=	test_read_format_gtar_sparse_1_17_posix00.tar.uu
 ${PACKAGE}FILES+=	test_read_format_gtar_sparse_1_17_posix01.tar.uu
 ${PACKAGE}FILES+=	test_read_format_gtar_sparse_1_17_posix10.tar.uu
 ${PACKAGE}FILES+=	test_read_format_gtar_sparse_1_17_posix10_modified.tar.uu
 ${PACKAGE}FILES+=	test_read_format_gtar_sparse_skip_entry.tar.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso_2.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso_joliet.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso_joliet_by_nero.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso_joliet_long.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso_joliet_rockridge.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso_multi_extent.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso_rockridge.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso_rockridge_ce.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso_rockridge_new.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso_rockridge_rr_moved.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso_xorriso.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_iso_zisofs.iso.Z.uu
 ${PACKAGE}FILES+=	test_read_format_lha_bugfix_0.lzh.uu
 ${PACKAGE}FILES+=	test_read_format_lha_filename_cp932.lzh.uu
 ${PACKAGE}FILES+=	test_read_format_lha_header0.lzh.uu
 ${PACKAGE}FILES+=	test_read_format_lha_header1.lzh.uu
 ${PACKAGE}FILES+=	test_read_format_lha_header2.lzh.uu
 ${PACKAGE}FILES+=	test_read_format_lha_header3.lzh.uu
 ${PACKAGE}FILES+=	test_read_format_lha_lh0.lzh.uu
 ${PACKAGE}FILES+=	test_read_format_lha_lh6.lzh.uu
 ${PACKAGE}FILES+=	test_read_format_lha_lh7.lzh.uu
 ${PACKAGE}FILES+=	test_read_format_lha_withjunk.lzh.uu
 ${PACKAGE}FILES+=	test_read_format_mtree.mtree.uu
 ${PACKAGE}FILES+=	test_read_format_mtree_crash747.mtree.bz2.uu
 ${PACKAGE}FILES+=	test_read_format_mtree_nomagic.mtree.uu
 ${PACKAGE}FILES+=	test_read_format_mtree_nomagic2.mtree.uu
 ${PACKAGE}FILES+=	test_read_format_mtree_nomagic3.mtree.uu
 ${PACKAGE}FILES+=	test_read_format_mtree_noprint.mtree.uu
 ${PACKAGE}FILES+=	test_read_format_rar.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_binary_data.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_compress_best.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_compress_normal.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_encryption_data.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_encryption_header.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_encryption_partially.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_invalid1.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_multi_lzss_blocks.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_multivolume.part0001.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_multivolume.part0002.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_multivolume.part0003.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_multivolume.part0004.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_noeof.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_ppmd_lzss_conversion.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_ppmd_use_after_free.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_ppmd_use_after_free2.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_sfx.exe.uu
 ${PACKAGE}FILES+=	test_read_format_rar_subblock.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_unicode.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar_windows.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_arm.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_arm_filter_on_window_boundary.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_blake2.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_compressed.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_different_window_size.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_distance_overflow.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_extra_field_version.rar.uu	
 ${PACKAGE}FILES+=	test_read_format_rar5_fileattr.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_hardlink.rar.uu	
 ${PACKAGE}FILES+=	test_read_format_rar5_invalid_dict_reference.rar.uu	
 ${PACKAGE}FILES+=	test_read_format_rar5_leftshift1.rar.uu	
 ${PACKAGE}FILES+=	test_read_format_rar5_leftshift2.rar.uu	
 ${PACKAGE}FILES+=	test_read_format_rar5_multiarchive.part01.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiarchive.part02.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiarchive.part03.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiarchive.part04.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiarchive.part05.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiarchive.part06.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiarchive.part07.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiarchive.part08.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiarchive_solid.part01.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiarchive_solid.part02.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiarchive_solid.part03.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiarchive_solid.part04.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiple_files.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_multiple_files_solid.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_nonempty_dir_stream.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_owner.rar.uu	
 ${PACKAGE}FILES+=	test_read_format_rar5_readtables_overflow.rar.uu	
 ${PACKAGE}FILES+=	test_read_format_rar5_solid.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_stored.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_stored_manyfiles.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_symlink.rar.uu
 ${PACKAGE}FILES+=	test_read_format_rar5_truncated_huff.rar.uu	
 ${PACKAGE}FILES+=	test_read_format_rar5_win32.rar.uu
 ${PACKAGE}FILES+=	test_read_format_raw.bufr.uu
 ${PACKAGE}FILES+=	test_read_format_raw.data.Z.uu
 ${PACKAGE}FILES+=	test_read_format_raw.data.gz.uu
 ${PACKAGE}FILES+=	test_read_format_raw.data.uu
 ${PACKAGE}FILES+=	test_read_format_tar_concatenated.tar.uu
 ${PACKAGE}FILES+=	test_read_format_tar_empty_filename.tar.uu
 ${PACKAGE}FILES+=	test_read_format_tar_empty_with_gnulabel.tar.uu
 ${PACKAGE}FILES+=	test_read_format_tar_empty_pax.tar.Z.uu
 ${PACKAGE}FILES+=	test_read_format_tar_filename_koi8r.tar.Z.uu
 ${PACKAGE}FILES+=	test_read_format_ustar_filename_cp866.tar.Z.uu
 ${PACKAGE}FILES+=	test_read_format_ustar_filename_eucjp.tar.Z.uu
 ${PACKAGE}FILES+=	test_read_format_ustar_filename_koi8r.tar.Z.uu
 ${PACKAGE}FILES+=	test_read_format_warc.warc.uu
 ${PACKAGE}FILES+=	test_read_format_zip.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_7075_utf8_paths.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_bz2_hang.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_bzip2.zipx.uu
 ${PACKAGE}FILES+=	test_read_format_zip_bzip2_multi.zipx.uu
 ${PACKAGE}FILES+=	test_read_format_zip_comment_stored_1.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_comment_stored_2.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_encryption_data.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_encryption_header.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_encryption_partially.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_extra_padding.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_filename_cp866.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_filename_cp932.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_filename_koi8r.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_filename_utf8_jp.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_filename_utf8_ru.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_filename_utf8_ru2.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_high_compression.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_jar.jar.uu
 ${PACKAGE}FILES+=	test_read_format_zip_length_at_end.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_lzma_alone_leak.zipx.uu
 ${PACKAGE}FILES+=	test_read_format_zip_lzma.zipx.uu
-${PACKAGE}FILES+=	test_read_format_zip_lzma.zipx.uu
 ${PACKAGE}FILES+=	test_read_format_zip_lzma_multi.zipx.uu
 ${PACKAGE}FILES+=	test_read_format_zip_mac_metadata.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_malformed1.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_msdos.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_nested.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_nofiletype.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_padded1.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_padded2.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_padded3.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_ppmd8.zipx.uu
 ${PACKAGE}FILES+=	test_read_format_zip_ppmd8_crash_1.zipx.uu
 ${PACKAGE}FILES+=	test_read_format_zip_ppmd8_crash_2.zipx.uu
 ${PACKAGE}FILES+=	test_read_format_zip_ppmd8_multi.zipx.uu
 ${PACKAGE}FILES+=	test_read_format_zip_sfx.uu
 ${PACKAGE}FILES+=	test_read_format_zip_symlink.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_traditional_encryption_data.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_ux.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_with_invalid_traditional_eocd.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_winzip_aes128.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_winzip_aes256.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_winzip_aes256_large.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_winzip_aes256_stored.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_xz_multi.zipx.uu
 ${PACKAGE}FILES+=	test_read_format_zip_zip64a.zip.uu
 ${PACKAGE}FILES+=	test_read_format_zip_zip64b.zip.uu
 ${PACKAGE}FILES+=	test_read_large_splitted_rar_aa.uu
 ${PACKAGE}FILES+=	test_read_large_splitted_rar_ab.uu
 ${PACKAGE}FILES+=	test_read_large_splitted_rar_ac.uu
 ${PACKAGE}FILES+=	test_read_large_splitted_rar_ad.uu
 ${PACKAGE}FILES+=	test_read_large_splitted_rar_ae.uu
 ${PACKAGE}FILES+=	test_read_pax_schily_xattr.tar.uu
 ${PACKAGE}FILES+=	test_read_splitted_rar_aa.uu
 ${PACKAGE}FILES+=	test_read_splitted_rar_ab.uu
 ${PACKAGE}FILES+=	test_read_splitted_rar_ac.uu
 ${PACKAGE}FILES+=	test_read_splitted_rar_ad.uu
 ${PACKAGE}FILES+=	test_read_too_many_filters.gz.uu
 ${PACKAGE}FILES+=	test_splitted_rar_seek_support_aa.uu
 ${PACKAGE}FILES+=	test_splitted_rar_seek_support_ab.uu
 ${PACKAGE}FILES+=	test_splitted_rar_seek_support_ac.uu
 ${PACKAGE}FILES+=	test_write_disk_appledouble.cpio.gz.uu
 ${PACKAGE}FILES+=	test_write_disk_hfs_compression.tgz.uu
 ${PACKAGE}FILES+=	test_write_disk_mac_metadata.tar.gz.uu
 ${PACKAGE}FILES+=	test_write_disk_no_hfs_compression.tgz.uu
 
 .include <bsd.test.mk>
Index: projects/fuse2/lib/libc/stdio/Makefile.inc
===================================================================
--- projects/fuse2/lib/libc/stdio/Makefile.inc	(revision 350434)
+++ projects/fuse2/lib/libc/stdio/Makefile.inc	(revision 350435)
@@ -1,88 +1,88 @@
 #	@(#)Makefile.inc	8.3 (Berkeley) 4/17/94
 # $FreeBSD$
 
 # stdio sources
 .PATH: ${LIBC_SRCTOP}/stdio
 
 SRCS+=	_flock_stub.c asprintf.c clrerr.c dprintf.c \
 	fclose.c fcloseall.c fdopen.c \
 	feof.c ferror.c fflush.c fgetc.c fgetln.c fgetpos.c fgets.c fgetwc.c \
 	fgetwln.c fgetws.c \
 	fileno.c findfp.c flags.c fmemopen.c fopen.c \
 	fopencookie.c fprintf.c fpurge.c \
 	fputc.c fputs.c \
 	fputwc.c fputws.c fread.c freopen.c fscanf.c fseek.c fsetpos.c \
 	ftell.c funopen.c fvwrite.c fwalk.c fwide.c fwprintf.c fwscanf.c \
 	fwrite.c getc.c getchar.c getdelim.c getline.c \
 	gets.c gets_s.c getw.c getwc.c getwchar.c makebuf.c mktemp.c \
 	open_memstream.c open_wmemstream.c \
 	perror.c printf.c printf-pos.c putc.c putchar.c \
 	puts.c putw.c putwc.c putwchar.c \
 	refill.c remove.c rewind.c rget.c scanf.c setbuf.c setbuffer.c \
 	setvbuf.c snprintf.c sprintf.c sscanf.c stdio.c swprintf.c swscanf.c \
 	tempnam.c tmpfile.c \
 	tmpnam.c ungetc.c ungetwc.c vasprintf.c vdprintf.c vfprintf.c \
 	vfscanf.c \
 	vfwprintf.c vfwscanf.c vprintf.c vscanf.c vsnprintf.c vsprintf.c \
 	vsscanf.c \
 	vswprintf.c vswscanf.c vwprintf.c vwscanf.c wbuf.c wprintf.c wscanf.c \
 	wsetup.c
 
 SRCS+=	xprintf.c xprintf_float.c xprintf_int.c xprintf_str.c
 SRCS+=	xprintf_errno.c xprintf_hexdump.c xprintf_quote.c 
 SRCS+=	xprintf_time.c xprintf_vis.c
 
 SYM_MAPS+=	${LIBC_SRCTOP}/stdio/Symbol.map
 
 MAN+=	fclose.3 ferror.3 fflush.3 fgetln.3 fgets.3 fgetwln.3 fgetws.3 \
 	flockfile.3 \
 	fopen.3 fopencookie.3 fputs.3 \
 	fputws.3 fread.3 fseek.3 funopen.3 fwide.3 getc.3 \
 	getline.3 getwc.3 mktemp.3 open_memstream.3 \
 	printf.3 printf_l.3 putc.3 putwc.3 remove.3 scanf.3 scanf_l.3 setbuf.3 \
 	stdio.3 tmpnam.3 \
 	ungetc.3 ungetwc.3 wprintf.3 wscanf.3
 
 MLINKS+=fclose.3 fcloseall.3 fclose.3 fdclose.3
 MLINKS+=ferror.3 ferror_unlocked.3 \
 	ferror.3 clearerr.3 ferror.3 clearerr_unlocked.3 \
 	ferror.3 feof.3 ferror.3 feof_unlocked.3 \
 	ferror.3 fileno.3 ferror.3 fileno_unlocked.3
 MLINKS+=fflush.3 fpurge.3
 MLINKS+=fgets.3 gets.3
 MLINKS+=fgets.3 gets_s.3
 MLINKS+=flockfile.3 ftrylockfile.3 flockfile.3 funlockfile.3
 MLINKS+=fopen.3 fdopen.3 fopen.3 freopen.3 fopen.3 fmemopen.3
 MLINKS+=fputs.3 puts.3
 MLINKS+=fread.3 fwrite.3
 MLINKS+=fseek.3 fgetpos.3 fseek.3 fseeko.3 fseek.3 fsetpos.3 fseek.3 ftell.3 \
 	fseek.3 ftello.3 fseek.3 rewind.3
 MLINKS+=funopen.3 fropen.3 funopen.3 fwopen.3
 MLINKS+=getc.3 fgetc.3 getc.3 getc_unlocked.3 getc.3 getchar.3 \
 	getc.3 getchar_unlocked.3 getc.3 getw.3
 MLINKS+=getline.3 getdelim.3
 MLINKS+=getwc.3 fgetwc.3 getwc.3 getwchar.3
 MLINKS+=mktemp.3 mkdtemp.3 mktemp.3 mkstemp.3 mktemp.3 mkstemps.3 \
-	mktemp.3 mkostemp.3 mktemp.3 mkostemps.3
+	mktemp.3 mkostemp.3 mktemp.3 mkostemps.3 mktemp.3 mkostempsat.3
 MLINKS+=open_memstream.3 open_wmemstream.3
 MLINKS+=printf.3 asprintf.3 printf.3 dprintf.3 printf.3 fprintf.3 \
 	printf.3 snprintf.3 printf.3 sprintf.3 \
 	printf.3 vasprintf.3 printf.3 vdprintf.3 \
 	printf.3 vfprintf.3 printf.3 vprintf.3 printf.3 vsnprintf.3 \
 	printf.3 vsprintf.3
 MLINKS+=printf_l.3 asprintf_l.3 printf_l.3 fprintf_l.3 printf_l.3 snprintf_l.3 \
 	printf_l.3 sprintf_l.3 printf_l.3 vasprintf_l.3 printf_l.3 vfprintf_l.3 \
 	printf_l.3 vprintf_l.3 printf_l.3 vsnprintf_l.3 printf_l.3 vsprintf_l.3
 MLINKS+=putc.3 fputc.3 putc.3 putc_unlocked.3 putc.3 putchar.3 \
 	putc.3 putchar_unlocked.3 putc.3 putw.3
 MLINKS+=putwc.3 fputwc.3 putwc.3 putwchar.3
 MLINKS+=scanf.3 fscanf.3 scanf.3 sscanf.3 scanf.3 vfscanf.3 scanf.3 vscanf.3 \
 	scanf.3 vsscanf.3
 MLINKS+=scanf_l.3 fscanf_l.3 scanf_l.3 sscanf_l.3 scanf_l.3 vfscanf_l.3 \
 	scanf_l.3 vscanf_l.3 scanf_l.3 vsscanf_l.3
 MLINKS+=setbuf.3 setbuffer.3 setbuf.3 setlinebuf.3 setbuf.3 setvbuf.3
 MLINKS+=tmpnam.3 tempnam.3 tmpnam.3 tmpfile.3
 MLINKS+=wprintf.3 fwprintf.3 wprintf.3 swprintf.3 \
 	wprintf.3 vwprintf.3 wprintf.3 vfwprintf.3 wprintf.3 vswprintf.3
 MLINKS+=wscanf.3 fwscanf.3 wscanf.3 swscanf.3 wscanf.3 vwscanf.3 \
 	wscanf.3 vswscanf.3 wscanf.3 vfwscanf.3
Index: projects/fuse2/lib/libc/stdio/Symbol.map
===================================================================
--- projects/fuse2/lib/libc/stdio/Symbol.map	(revision 350434)
+++ projects/fuse2/lib/libc/stdio/Symbol.map	(revision 350435)
@@ -1,212 +1,216 @@
 /*
  * $FreeBSD$
  */
 
 FBSD_1.0 {
 	flockfile;
 	ftrylockfile;
 	funlockfile;
 	asprintf;
 	clearerr;
 	fclose;
 	fcloseall;
 	fdopen;
 	feof;
 	ferror;
 	fflush;
 	fgetc;
 	fgetln;
 	fgetpos;
 	fgets;
 	fgetwc;
 	fgetwln;
 	fgetws;
 	fileno;
 	__sF;
 	__stdinp;
 	__stdoutp;
 	__stderrp;
 	f_prealloc;	/* deprecated??? */
 	fopen;
 	fprintf;
 	fpurge;
 	fputc;
 	fputs;
 	fputwc;
 	fputws;
 	fread;
 	freopen;
 	fscanf;
 	fseek;
 	fseeko;
 	fsetpos;
 	ftell;
 	ftello;
 	funopen;
 	fwide;
 	fwprintf;
 	fwrite;
 	fwscanf;
 	getc;
 	getchar;
 	gets;
 	getw;
 	getwc;
 	getwchar;
 	mkstemps;
 	mkstemp;
 	mkdtemp;
 	mktemp;
 	perror;
 	printf;
 	putc;
 	putchar;
 	puts;
 	putw;
 	putwc;
 	putwchar;
 	remove;
 	rewind;
 	__srget;
 	scanf;
 	setbuf;
 	setbuffer;
 	setlinebuf;
 	setvbuf;
 	snprintf;
 	sprintf;
 	sscanf;
 	swprintf;
 	swscanf;
 	tempnam;
 	tmpfile;
 	tmpnam;
 	ungetc;
 	ungetwc;
 	getchar_unlocked;
 	getc_unlocked;
 	putchar_unlocked;
 	putc_unlocked;
 	feof_unlocked;
 	ferror_unlocked;
 	clearerr_unlocked;
 	fileno_unlocked;
 	vasprintf;
 	vfprintf;
 	vfscanf;
 	vfwprintf;
 	vfwscanf;
 	vprintf;
 	vscanf;
 	vsnprintf;
 	vsprintf;
 	vsscanf;
 	vswprintf;
 	vswscanf;
 	vwprintf;
 	vwscanf;
 	__swbuf;
 	wprintf;
 	wscanf;
 };
 
 FBSD_1.1 {
 	 dprintf;
 	 getdelim;
 	 getline;
 	 vdprintf;
 };
 
 FBSD_1.3 {
 	asprintf_l;
 	fprintf_l;
 	fwprintf_l;
 	printf_l;
 	snprintf_l;
 	sprintf_l;
 	swprintf_l;
 	vasprintf_l;
 	vfprintf_l;
 	vfwprintf_l;
 	vprintf_l;
 	vsnprintf_l;
 	vsprintf_l;
 	vswprintf_l;
 	vwprintf_l;
 	wprintf_l;
 	fgetwc_l;
 	fputwc_l;
 	ungetwc_l;
 	vfwscanf_l;
 	vswscanf_l;
 	fscanf_l;
 	fwscanf_l;
 	scanf_l;
 	sscanf_l;
 	swscanf_l;
 	vfscanf_l;
 	vscanf_l;
 	vsscanf_l;
 	vwscanf_l;
 	wscanf_l;
 	fgetws_l;
 	fputws_l;
 	getwc_l;
 	getwchar_l;
 	putwc_l;
 	putwchar_l;
 	fmemopen;
 	open_memstream;
 	open_wmemstream;
 	mkostemp;
 	mkostemps;
 };
 
 FBSD_1.4 {
 	fdclose;
 	fopencookie;
 };
 
 FBSD_1.5 {
 	gets_s;
 };
 
+FBSD_1.6 {
+	mkostempsat;
+};
+
 FBSDprivate_1.0 {
 	_flockfile;
 	_flockfile_debug_stub;
 	_flockfile_debug;
 	_ftrylockfile;
 	_funlockfile;
 	__vfscanf;
 
 	/*
 	 * xprintf support
 	 */
 	__use_xprintf;
 	__lowercase_hex;
 	__uppercase_hex;
 	__printf_flush;
 	__printf_puts;
 	__printf_pad;
 	__printf_out;
 	__xvprintf;
 	register_printf_function;
 	register_printf_render;
 	register_printf_render_std;
 	__printf_arginfo_float;
 	__printf_render_float;
 	__printf_arginfo_hexdump;
 	__printf_render_hexdump;
 	__printf_arginfo_int;
 	__printf_render_int;
 	__printf_arginfo_ptr;
 	__printf_render_ptr;
 	__printf_arginfo_str;
 	__printf_render_str;
 	__printf_arginfo_chr;
 	__printf_render_chr;
 	__printf_arginfo_time;
 	__printf_render_time;
 	__printf_arginfo_vis;
 	__printf_render_vis;
 };
Index: projects/fuse2/lib/libc/stdio/mktemp.3
===================================================================
--- projects/fuse2/lib/libc/stdio/mktemp.3	(revision 350434)
+++ projects/fuse2/lib/libc/stdio/mktemp.3	(revision 350435)
@@ -1,326 +1,349 @@
 .\" Copyright (c) 1989, 1991, 1993
 .\"	The Regents of the University of California.  All rights reserved.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\"     @(#)mktemp.3	8.1 (Berkeley) 6/4/93
 .\" $FreeBSD$
 .\"
-.Dd August 8, 2013
+.Dd July 29, 2019
 .Dt MKTEMP 3
 .Os
 .Sh NAME
 .Nm mktemp
 .Nd make temporary file name (unique)
 .Sh LIBRARY
 .Lb libc
 .Sh SYNOPSIS
 .In stdlib.h
 .Ft char *
 .Fn mktemp "char *template"
 .Ft int
 .Fn mkstemp "char *template"
 .Ft int
 .Fn mkostemp "char *template" "int oflags"
 .Ft int
 .Fn mkostemps "char *template" "int suffixlen" "int oflags"
+.Ft int
+.Fn mkostempsat "int dfd" "char *template" "int suffixlen" "int oflags"
 .Ft char *
 .Fn mkdtemp "char *template"
 .In unistd.h
 .Ft int
 .Fn mkstemps "char *template" "int suffixlen"
 .Sh DESCRIPTION
 The
 .Fn mktemp
 function
 takes the given file name template and overwrites a portion of it
 to create a file name.
 This file name is guaranteed not to exist at the time of function invocation
 and is suitable for use
 by the application.
 The template may be any file name with some number of
 .Ql X Ns s
 appended
 to it, for example
 .Pa /tmp/temp.XXXXXX .
 The trailing
 .Ql X Ns s
 are replaced with a
 unique alphanumeric combination.
 The number of unique file names
 .Fn mktemp
 can return depends on the number of
 .Ql X Ns s
 provided; six
 .Ql X Ns s
 will
 result in
 .Fn mktemp
 selecting one of 56800235584 (62 ** 6) possible temporary file names.
 .Pp
 The
 .Fn mkstemp
 function
 makes the same replacement to the template and creates the template file,
 mode 0600, returning a file descriptor opened for reading and writing.
 This avoids the race between testing for a file's existence and opening it
 for use.
 .Pp
 The
 .Fn mkostemp
 function
 is like
 .Fn mkstemp
 but allows specifying additional
 .Xr open 2
 flags (defined in
 .In fcntl.h ) .
 The permitted flags are
 .Dv O_APPEND ,
 .Dv O_DIRECT ,
 .Dv O_SHLOCK ,
 .Dv O_EXLOCK ,
 .Dv O_SYNC
 and
 .Dv O_CLOEXEC .
 .Pp
 The
 .Fn mkstemps
 and
 .Fn mkostemps
 functions act the same as
 .Fn mkstemp
 and
 .Fn mkostemp
 respectively,
 except they permit a suffix to exist in the template.
 The template should be of the form
 .Pa /tmp/tmpXXXXXXsuffix .
 The
 .Fn mkstemps
 and
 .Fn mkostemps
-function
+functions
 are told the length of the suffix string.
 .Pp
 The
+.Fn mkostempsat
+function acts the same as
+.Fn mkostemps
+but takes an additional directory descriptor as a parameter.
+The temporary file is created relative to the corresponding
+directory, or to the current working directory if the special
+value
+.Dv AT_FDCWD
+is specified.
+If the template path is an absolute path, the
+.Fa dfd
+parameter is ignored and the behavior is identical to
+.Fn mkostemps .
+.Pp
+The
 .Fn mkdtemp
 function makes the same replacement to the template as in
 .Fn mktemp
 and creates the template directory, mode 0700.
 .Sh RETURN VALUES
 The
 .Fn mktemp
 and
 .Fn mkdtemp
 functions return a pointer to the template on success and
 .Dv NULL
 on failure.
 The
 .Fn mkstemp ,
-.Fn mkostemp
+.Fn mkostemp ,
 .Fn mkstemps
 and
 .Fn mkostemps
 functions
 return \-1 if no suitable file could be created.
-If either call fails an error code is placed in the global variable
+If any of these calls fails, an error code is placed in the global variable
 .Va errno .
 .Sh ERRORS
 The
 .Fn mkstemp ,
 .Fn mkostemp ,
 .Fn mkstemps ,
 .Fn mkostemps
 and
 .Fn mkdtemp
 functions
 may set
 .Va errno
 to one of the following values:
 .Bl -tag -width Er
 .It Bq Er ENOTDIR
 The pathname portion of the template is not an existing directory.
 .El
 .Pp
 The
 .Fn mkostemp
 and
 .Fn mkostemps
 functions
 may also set
 .Va errno
 to the following value:
 .Bl -tag -width Er
 .It Bq Er EINVAL
 The
 .Fa oflags
 argument is invalid.
 .El
 .Pp
 The
 .Fn mkstemp ,
 .Fn mkostemp ,
 .Fn mkstemps ,
 .Fn mkostemps
 and
 .Fn mkdtemp
 functions
 may also set
 .Va errno
 to any value specified by the
 .Xr stat 2
 function.
 .Pp
 The
 .Fn mkstemp ,
 .Fn mkostemp ,
 .Fn mkstemps
 and
 .Fn mkostemps
 functions
 may also set
 .Va errno
 to any value specified by the
 .Xr open 2
 function.
 .Pp
 The
 .Fn mkdtemp
 function
 may also set
 .Va errno
 to any value specified by the
 .Xr mkdir 2
 function.
 .Sh NOTES
 A common problem that results in a core dump is that the programmer
 passes in a read-only string to
 .Fn mktemp ,
 .Fn mkstemp ,
 .Fn mkstemps
 or
 .Fn mkdtemp .
 This is common with programs that were developed before
 .St -isoC
 compilers were common.
 For example, calling
 .Fn mkstemp
 with an argument of
 .Qq /tmp/tempfile.XXXXXX
 will result in a core dump due to
 .Fn mkstemp
 attempting to modify the string constant that was given.
 .Pp
 The
 .Fn mkdtemp ,
 .Fn mkstemp
 and
 .Fn mktemp
 function prototypes are also available from
 .In unistd.h .
 .Sh SEE ALSO
 .Xr chmod 2 ,
 .Xr getpid 2 ,
 .Xr mkdir 2 ,
 .Xr open 2 ,
 .Xr stat 2
 .Sh STANDARDS
 The
 .Fn mkstemp
 and
 .Fn mkdtemp
 functions are expected to conform to
 .St -p1003.1-2008 .
 The
 .Fn mktemp
 function is expected to conform to
 .St -p1003.1-2001
 and is not specified by
 .St -p1003.1-2008 .
 The
 .Fn mkostemp ,
-.Fn mkstemps
-and
+.Fn mkstemps ,
 .Fn mkostemps
+and
+.Fn mkostempsat
 functions do not conform to any standard.
 .Sh HISTORY
 A
 .Fn mktemp
 function appeared in
 .At v7 .
 The
 .Fn mkstemp
 function appeared in
 .Bx 4.4 .
 The
 .Fn mkdtemp
 function first appeared in
 .Ox 2.2 ,
 and later in
 .Fx 3.2 .
 The
 .Fn mkstemps
 function first appeared in
 .Ox 2.4 ,
 and later in
 .Fx 3.4 .
 The
 .Fn mkostemp
 and
 .Fn mkostemps
 functions appeared in
 .Fx 10.0 .
+The
+.Fn mkostempsat
+function appeared in
+.Fx 13.0 .
 .Sh BUGS
 This family of functions produces filenames which can be guessed,
 though the risk is minimized when large numbers of
 .Ql X Ns s
 are used to
 increase the number of possible temporary filenames.
 This makes the race in
 .Fn mktemp ,
 between testing for a file's existence (in the
 .Fn mktemp
 function call)
 and opening it for use
 (later in the user application)
 particularly dangerous from a security perspective.
 Whenever it is possible,
-.Fn mkstemp
-or
+.Fn mkstemp ,
 .Fn mkostemp
-should be used instead, since it does not have the race condition.
+or
+.Fn mkostempsat
+should be used instead, since they do not have the race condition.
 If
 .Fn mkstemp
 cannot be used, the filename created by
 .Fn mktemp
 should be created using the
 .Dv O_EXCL
 flag to
 .Xr open 2
 and the return status of the call should be tested for failure.
 This will ensure that the program does not continue blindly
 in the event that an attacker has already created the file
 with the intention of manipulating or reading its contents.
Index: projects/fuse2/lib/libc/stdio/mktemp.c
===================================================================
--- projects/fuse2/lib/libc/stdio/mktemp.c	(revision 350434)
+++ projects/fuse2/lib/libc/stdio/mktemp.c	(revision 350435)
@@ -1,212 +1,220 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1987, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
 static char sccsid[] = "@(#)mktemp.c	8.1 (Berkeley) 6/4/93";
 #endif /* LIBC_SCCS and not lint */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "namespace.h"
 #include <sys/param.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #include <unistd.h>
 #include "un-namespace.h"
 
 char *_mktemp(char *);
 
-static int _gettemp(char *, int *, int, int, int);
+static int _gettemp(int, char *, int *, int, int, int);
 
 static const unsigned char padchar[] =
 "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 
 int
+mkostempsat(int dfd, char *path, int slen, int oflags)
+{
+	int fd;
+
+	return (_gettemp(dfd, path, &fd, 0, slen, oflags) ? fd : -1);
+}
+
+int
 mkostemps(char *path, int slen, int oflags)
 {
 	int fd;
 
-	return (_gettemp(path, &fd, 0, slen, oflags) ? fd : -1);
+	return (_gettemp(AT_FDCWD, path, &fd, 0, slen, oflags) ? fd : -1);
 }
 
 int
 mkstemps(char *path, int slen)
 {
 	int fd;
 
-	return (_gettemp(path, &fd, 0, slen, 0) ? fd : -1);
+	return (_gettemp(AT_FDCWD, path, &fd, 0, slen, 0) ? fd : -1);
 }
 
 int
 mkostemp(char *path, int oflags)
 {
 	int fd;
 
-	return (_gettemp(path, &fd, 0, 0, oflags) ? fd : -1);
+	return (_gettemp(AT_FDCWD, path, &fd, 0, 0, oflags) ? fd : -1);
 }
 
 int
 mkstemp(char *path)
 {
 	int fd;
 
-	return (_gettemp(path, &fd, 0, 0, 0) ? fd : -1);
+	return (_gettemp(AT_FDCWD, path, &fd, 0, 0, 0) ? fd : -1);
 }
 
 char *
 mkdtemp(char *path)
 {
-	return (_gettemp(path, (int *)NULL, 1, 0, 0) ? path : (char *)NULL);
+	return (_gettemp(AT_FDCWD, path, (int *)NULL, 1, 0, 0) ? path : (char *)NULL);
 }
 
 char *
 _mktemp(char *path)
 {
-	return (_gettemp(path, (int *)NULL, 0, 0, 0) ? path : (char *)NULL);
+	return (_gettemp(AT_FDCWD, path, (int *)NULL, 0, 0, 0) ? path : (char *)NULL);
 }
 
 __warn_references(mktemp,
     "warning: mktemp() possibly used unsafely; consider using mkstemp()");
 
 char *
 mktemp(char *path)
 {
 	return (_mktemp(path));
 }
 
 static int
-_gettemp(char *path, int *doopen, int domkdir, int slen, int oflags)
+_gettemp(int dfd, char *path, int *doopen, int domkdir, int slen, int oflags)
 {
 	char *start, *trv, *suffp, *carryp;
 	char *pad;
 	struct stat sbuf;
 	int rval;
 	uint32_t rand;
 	char carrybuf[MAXPATHLEN];
 
 	if ((doopen != NULL && domkdir) || slen < 0 ||
 	    (oflags & ~(O_APPEND | O_DIRECT | O_SHLOCK | O_EXLOCK | O_SYNC |
 	    O_CLOEXEC)) != 0) {
 		errno = EINVAL;
 		return (0);
 	}
 
 	for (trv = path; *trv != '\0'; ++trv)
 		;
 	if (trv - path >= MAXPATHLEN) {
 		errno = ENAMETOOLONG;
 		return (0);
 	}
 	trv -= slen;
 	suffp = trv;
 	--trv;
 	if (trv < path || NULL != strchr(suffp, '/')) {
 		errno = EINVAL;
 		return (0);
 	}
 
 	/* Fill space with random characters */
 	while (trv >= path && *trv == 'X') {
 		rand = arc4random_uniform(sizeof(padchar) - 1);
 		*trv-- = padchar[rand];
 	}
 	start = trv + 1;
 
 	/* save first combination of random characters */
 	memcpy(carrybuf, start, suffp - start);
 
 	/*
 	 * check the target directory.
 	 */
 	if (doopen != NULL || domkdir) {
 		for (; trv > path; --trv) {
 			if (*trv == '/') {
 				*trv = '\0';
-				rval = stat(path, &sbuf);
+				rval = fstatat(dfd, path, &sbuf, 0);
 				*trv = '/';
 				if (rval != 0)
 					return (0);
 				if (!S_ISDIR(sbuf.st_mode)) {
 					errno = ENOTDIR;
 					return (0);
 				}
 				break;
 			}
 		}
 	}
 
+	oflags |= O_CREAT | O_EXCL | O_RDWR;
 	for (;;) {
 		if (doopen) {
-			if ((*doopen =
-			    _open(path, O_CREAT|O_EXCL|O_RDWR|oflags, 0600)) >=
-			    0)
+			*doopen = _openat(dfd, path, oflags, 0600);
+			if (*doopen >= 0)
 				return (1);
 			if (errno != EEXIST)
 				return (0);
 		} else if (domkdir) {
 			if (mkdir(path, 0700) == 0)
 				return (1);
 			if (errno != EEXIST)
 				return (0);
 		} else if (lstat(path, &sbuf))
 			return (errno == ENOENT);
 
 		/* If we have a collision, cycle through the space of filenames */
 		for (trv = start, carryp = carrybuf;;) {
 			/* have we tried all possible permutations? */
 			if (trv == suffp)
 				return (0); /* yes - exit with EEXIST */
 			pad = strchr(padchar, *trv);
 			if (pad == NULL) {
 				/* this should never happen */
 				errno = EIO;
 				return (0);
 			}
 			/* increment character */
 			*trv = (*++pad == '\0') ? padchar[0] : *pad;
 			/* carry to next position? */
 			if (*trv == *carryp) {
 				/* increment position and loop */
 				++trv;
 				++carryp;
 			} else {
 				/* try with new name */
 				break;
 			}
 		}
 	}
 	/*NOTREACHED*/
 }
Index: projects/fuse2/sbin/camcontrol/camcontrol.c
===================================================================
--- projects/fuse2/sbin/camcontrol/camcontrol.c	(revision 350434)
+++ projects/fuse2/sbin/camcontrol/camcontrol.c	(revision 350435)
@@ -1,10726 +1,10742 @@
 /*
  * Copyright (c) 1997-2007 Kenneth D. Merry
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/ioctl.h>
 #include <sys/stdint.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/endian.h>
 #include <sys/sbuf.h>
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <inttypes.h>
 #include <limits.h>
 #include <fcntl.h>
 #include <ctype.h>
 #include <err.h>
 #include <libutil.h>
 #include <limits.h>
 #include <inttypes.h>
 
 #include <cam/cam.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_ccb.h>
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/scsi/scsi_pass.h>
 #include <cam/scsi/scsi_message.h>
 #include <cam/scsi/smp_all.h>
 #include <cam/ata/ata_all.h>
 #include <cam/mmc/mmc_all.h>
 #include <camlib.h>
 #include "camcontrol.h"
 #ifdef WITH_NVME
 #include "nvmecontrol_ext.h"
 #endif
 
 /*
  * Identifier for the subcommand selected on the command line.  Despite
  * the "mask" in the type name the values are sequential numbers, not
  * bit flags; several option_table[] rows may share one value (aliases).
  */
 typedef enum {
 	CAM_CMD_NONE		= 0x00000000,
 	CAM_CMD_DEVLIST		= 0x00000001,
 	CAM_CMD_TUR		= 0x00000002,
 	CAM_CMD_INQUIRY		= 0x00000003,
 	CAM_CMD_STARTSTOP	= 0x00000004,
 	CAM_CMD_RESCAN		= 0x00000005,
 	CAM_CMD_READ_DEFECTS	= 0x00000006,
 	CAM_CMD_MODE_PAGE	= 0x00000007,
 	CAM_CMD_SCSI_CMD	= 0x00000008,
 	CAM_CMD_DEVTREE		= 0x00000009,
 	CAM_CMD_USAGE		= 0x0000000a,
 	CAM_CMD_DEBUG		= 0x0000000b,
 	CAM_CMD_RESET		= 0x0000000c,
 	CAM_CMD_FORMAT		= 0x0000000d,
 	CAM_CMD_TAG		= 0x0000000e,
 	CAM_CMD_RATE		= 0x0000000f,
 	CAM_CMD_DETACH		= 0x00000010,
 	CAM_CMD_REPORTLUNS	= 0x00000011,
 	CAM_CMD_READCAP		= 0x00000012,
 	CAM_CMD_IDENTIFY	= 0x00000013,
 	CAM_CMD_IDLE		= 0x00000014,
 	CAM_CMD_STANDBY		= 0x00000015,
 	CAM_CMD_SLEEP		= 0x00000016,
 	CAM_CMD_SMP_CMD		= 0x00000017,
 	CAM_CMD_SMP_RG		= 0x00000018,
 	CAM_CMD_SMP_PC		= 0x00000019,
 	CAM_CMD_SMP_PHYLIST	= 0x0000001a,
 	CAM_CMD_SMP_MANINFO	= 0x0000001b,
 	CAM_CMD_DOWNLOAD_FW	= 0x0000001c,
 	CAM_CMD_SECURITY	= 0x0000001d,
 	CAM_CMD_HPA		= 0x0000001e,
 	CAM_CMD_SANITIZE	= 0x0000001f,
 	CAM_CMD_PERSIST		= 0x00000020,
 	CAM_CMD_APM		= 0x00000021,
 	CAM_CMD_AAM		= 0x00000022,
 	CAM_CMD_ATTRIB		= 0x00000023,
 	CAM_CMD_OPCODES		= 0x00000024,
 	CAM_CMD_REPROBE		= 0x00000025,
 	CAM_CMD_ZONE		= 0x00000026,
 	CAM_CMD_EPC		= 0x00000027,
 	CAM_CMD_TIMESTAMP	= 0x00000028,
 	CAM_CMD_MMCSD_CMD	= 0x00000029,
 	CAM_CMD_POWER_MODE	= 0x0000002a,
 	CAM_CMD_DEVTYPE		= 0x0000002b,
 	CAM_CMD_AMA	= 0x0000002c,
 } cam_cmdmask;
 
 /*
  * Argument flags.  Each value other than CAM_ARG_NONE and
  * CAM_ARG_INQ_MASK is a single distinct bit, so flags can be OR'ed
  * together (option_table[] does this for "load").  CAM_ARG_INQ_MASK is
  * the union of the three CAM_ARG_GET_* bits.
  */
 typedef enum {
 	CAM_ARG_NONE		= 0x00000000,
 	CAM_ARG_VERBOSE		= 0x00000001,
 	CAM_ARG_DEVICE		= 0x00000002,
 	CAM_ARG_BUS		= 0x00000004,
 	CAM_ARG_TARGET		= 0x00000008,
 	CAM_ARG_LUN		= 0x00000010,
 	CAM_ARG_EJECT		= 0x00000020,
 	CAM_ARG_UNIT		= 0x00000040,
 	CAM_ARG_FORMAT_BLOCK	= 0x00000080,
 	CAM_ARG_FORMAT_BFI	= 0x00000100,
 	CAM_ARG_FORMAT_PHYS	= 0x00000200,
 	CAM_ARG_PLIST		= 0x00000400,
 	CAM_ARG_GLIST		= 0x00000800,
 	CAM_ARG_GET_SERIAL	= 0x00001000,
 	CAM_ARG_GET_STDINQ	= 0x00002000,
 	CAM_ARG_GET_XFERRATE	= 0x00004000,
 	CAM_ARG_INQ_MASK	= 0x00007000,
 	CAM_ARG_TIMEOUT		= 0x00020000,
 	CAM_ARG_CMD_IN		= 0x00040000,
 	CAM_ARG_CMD_OUT		= 0x00080000,
 	CAM_ARG_ERR_RECOVER	= 0x00200000,
 	CAM_ARG_RETRIES		= 0x00400000,
 	CAM_ARG_START_UNIT	= 0x00800000,
 	CAM_ARG_DEBUG_INFO	= 0x01000000,
 	CAM_ARG_DEBUG_TRACE	= 0x02000000,
 	CAM_ARG_DEBUG_SUBTRACE	= 0x04000000,
 	CAM_ARG_DEBUG_CDB	= 0x08000000,
 	CAM_ARG_DEBUG_XPT	= 0x10000000,
 	CAM_ARG_DEBUG_PERIPH	= 0x20000000,
 	CAM_ARG_DEBUG_PROBE	= 0x40000000,
 } cam_argmask;
 
 /*
  * One row of the subcommand table searched by getoption().
  */
 struct camcontrol_opts {
 	const char	*optname;	/* subcommand name; NULL ends the table */
 	uint32_t	cmdnum;		/* command id copied out by getoption() */
 	cam_argmask	argnum;		/* argument flags copied out by getoption() */
 	const char	*subopt;	/* option letters; may be NULL -- presumably a getopt(3) string, confirm */
 };
 
 /*
  * ATA result register block: five reserved 16-bit words followed by
  * twelve single-byte fields.
  * NOTE(review): presumably overlays the sense/descriptor data returned
  * for an ATA PASS-THROUGH(16) command -- confirm against the users of
  * this structure.
  */
 struct ata_res_pass16 {
 	u_int16_t reserved[5];
 	u_int8_t flags;
 	u_int8_t error;
 	u_int8_t sector_count_exp;
 	u_int8_t sector_count;
 	u_int8_t lba_low_exp;
 	u_int8_t lba_low;
 	u_int8_t lba_mid_exp;
 	u_int8_t lba_mid;
 	u_int8_t lba_high_exp;
 	u_int8_t lba_high;
 	u_int8_t device;
 	u_int8_t status;
 };
 
 /*
  * Password data block: one reserved word, a 32-byte password and 239
  * reserved words (2 + 32 + 478 = 512 bytes when no padding is inserted).
  * NOTE(review): presumably the payload of the ATA SET MAX password
  * commands -- confirm against the HPA code that uses it.
  */
 struct ata_set_max_pwd
 {
 	u_int16_t reserved1;
 	u_int8_t password[32];
 	u_int16_t reserved2[239];
 };
 
 /*
  * Name/value pairs associating a textual task attribute with its MSG_*
  * constant.
  * NOTE(review): presumably used to parse a task-attribute command-line
  * option -- the lookup code is outside this part of the file.
  */
 static struct scsi_nv task_attrs[] = {
 	{ "simple", MSG_SIMPLE_Q_TAG },
 	{ "head", MSG_HEAD_OF_Q_TAG },
 	{ "ordered", MSG_ORDERED_Q_TAG },
 	{ "iwr", MSG_IGN_WIDE_RESIDUE },
 	{ "aca", MSG_ACA_TASK }
 };
 
 /*
  * Option-letter strings shared by more than one option_table[] row
  * (each subcommand below has an alias that takes the same options).
  */
 static const char scsicmd_opts[] = "a:c:dfi:o:r";
 static const char readdefect_opts[] = "f:GPqsS:X";
 static const char negotiate_opts[] = "acD:M:O:qR:T:UW:";
 static const char smprg_opts[] = "l";
 static const char smppc_opts[] = "a:A:d:lm:M:o:p:s:S:T:";
 static const char smpphylist_opts[] = "lq";
 /* NOTE(review): written/read outside this part of the file; purpose not visible here. */
 static char pwd_opt;
 
 /*
  * Subcommand table: name, command id, argument flags and per-command
  * option letters.  getoption() matches the user's word as a prefix of
  * the name, so unique abbreviations are accepted.  The row with a NULL
  * name terminates the table.
  */
 static struct camcontrol_opts option_table[] = {
 	{"tur", CAM_CMD_TUR, CAM_ARG_NONE, NULL},
 	{"inquiry", CAM_CMD_INQUIRY, CAM_ARG_NONE, "DSR"},
 	{"identify", CAM_CMD_IDENTIFY, CAM_ARG_NONE, NULL},
 	{"start", CAM_CMD_STARTSTOP, CAM_ARG_START_UNIT, NULL},
 	{"stop", CAM_CMD_STARTSTOP, CAM_ARG_NONE, NULL},
 	{"load", CAM_CMD_STARTSTOP, CAM_ARG_START_UNIT | CAM_ARG_EJECT, NULL},
 	{"eject", CAM_CMD_STARTSTOP, CAM_ARG_EJECT, NULL},
 	{"reportluns", CAM_CMD_REPORTLUNS, CAM_ARG_NONE, "clr:"},
 	{"readcapacity", CAM_CMD_READCAP, CAM_ARG_NONE, "bhHlNqs"},
 	{"reprobe", CAM_CMD_REPROBE, CAM_ARG_NONE, NULL},
 	{"rescan", CAM_CMD_RESCAN, CAM_ARG_NONE, NULL},
 	{"reset", CAM_CMD_RESET, CAM_ARG_NONE, NULL},
 	{"cmd", CAM_CMD_SCSI_CMD, CAM_ARG_NONE, scsicmd_opts},
 	{"mmcsdcmd", CAM_CMD_MMCSD_CMD, CAM_ARG_NONE, "c:a:f:Wb:l:41S:I"},
 	{"command", CAM_CMD_SCSI_CMD, CAM_ARG_NONE, scsicmd_opts},
 	{"smpcmd", CAM_CMD_SMP_CMD, CAM_ARG_NONE, "r:R:"},
 	{"smprg", CAM_CMD_SMP_RG, CAM_ARG_NONE, smprg_opts},
 	{"smpreportgeneral", CAM_CMD_SMP_RG, CAM_ARG_NONE, smprg_opts},
 	{"smppc", CAM_CMD_SMP_PC, CAM_ARG_NONE, smppc_opts},
 	{"smpphycontrol", CAM_CMD_SMP_PC, CAM_ARG_NONE, smppc_opts},
 	{"smpplist", CAM_CMD_SMP_PHYLIST, CAM_ARG_NONE, smpphylist_opts},
 	{"smpphylist", CAM_CMD_SMP_PHYLIST, CAM_ARG_NONE, smpphylist_opts},
 	{"smpmaninfo", CAM_CMD_SMP_MANINFO, CAM_ARG_NONE, "l"},
 	{"defects", CAM_CMD_READ_DEFECTS, CAM_ARG_NONE, readdefect_opts},
 	{"defectlist", CAM_CMD_READ_DEFECTS, CAM_ARG_NONE, readdefect_opts},
 	{"devlist", CAM_CMD_DEVTREE, CAM_ARG_NONE, "-b"},
 	{"devtype", CAM_CMD_DEVTYPE, CAM_ARG_NONE, ""},
 	{"periphlist", CAM_CMD_DEVLIST, CAM_ARG_NONE, NULL},
 	{"modepage", CAM_CMD_MODE_PAGE, CAM_ARG_NONE, "bdelm:P:"},
 	{"tags", CAM_CMD_TAG, CAM_ARG_NONE, "N:q"},
 	{"negotiate", CAM_CMD_RATE, CAM_ARG_NONE, negotiate_opts},
 	{"rate", CAM_CMD_RATE, CAM_ARG_NONE, negotiate_opts},
 	{"debug", CAM_CMD_DEBUG, CAM_ARG_NONE, "IPTSXcp"},
 	{"format", CAM_CMD_FORMAT, CAM_ARG_NONE, "qrwy"},
 	{"sanitize", CAM_CMD_SANITIZE, CAM_ARG_NONE, "a:c:IP:qrUwy"},
 	{"idle", CAM_CMD_IDLE, CAM_ARG_NONE, "t:"},
 	{"standby", CAM_CMD_STANDBY, CAM_ARG_NONE, "t:"},
 	{"sleep", CAM_CMD_SLEEP, CAM_ARG_NONE, ""},
 	{"powermode", CAM_CMD_POWER_MODE, CAM_ARG_NONE, ""},
 	{"apm", CAM_CMD_APM, CAM_ARG_NONE, "l:"},
 	{"aam", CAM_CMD_AAM, CAM_ARG_NONE, "l:"},
 	{"fwdownload", CAM_CMD_DOWNLOAD_FW, CAM_ARG_NONE, "f:qsy"},
 	{"security", CAM_CMD_SECURITY, CAM_ARG_NONE, "d:e:fh:k:l:qs:T:U:y"},
 	{"hpa", CAM_CMD_HPA, CAM_ARG_NONE, "Pflp:qs:U:y"},
 	{"ama", CAM_CMD_AMA, CAM_ARG_NONE, "fqs:"},
 	{"persist", CAM_CMD_PERSIST, CAM_ARG_NONE, "ai:I:k:K:o:ps:ST:U"},
 	{"attrib", CAM_CMD_ATTRIB, CAM_ARG_NONE, "a:ce:F:p:r:s:T:w:V:"},
 	{"opcodes", CAM_CMD_OPCODES, CAM_ARG_NONE, "No:s:T"},
 	{"zone", CAM_CMD_ZONE, CAM_ARG_NONE, "ac:l:No:P:"},
 	{"epc", CAM_CMD_EPC, CAM_ARG_NONE, "c:dDeHp:Pr:sS:T:"},
 	{"timestamp", CAM_CMD_TIMESTAMP, CAM_ARG_NONE, "f:mrsUT:"},
 	{"help", CAM_CMD_USAGE, CAM_ARG_NONE, NULL},
 	{"-?", CAM_CMD_USAGE, CAM_ARG_NONE, NULL},
 	{"-h", CAM_CMD_USAGE, CAM_ARG_NONE, NULL},
 	{NULL, 0, 0, NULL}
 };
 
 /*
  * One device entry kept on a cam_devlist queue: the device match
  * result, its peripheral match results and a device-ID VPD buffer.
  */
 struct cam_devitem {
 	struct device_match_result dev_match;
 	int num_periphs;		/* presumably the count of periph_matches entries */
 	struct periph_match_result *periph_matches;
 	struct scsi_vpd_device_id *device_id;
 	int device_id_len;		/* presumably the byte length of device_id */
 	STAILQ_ENTRY(cam_devitem) links;	/* linkage on cam_devlist.dev_queue */
 };
 
 /*
  * Singly-linked tail queue of cam_devitem entries together with a
  * path (bus) identifier.
  */
 struct cam_devlist {
 	STAILQ_HEAD(, cam_devitem) dev_queue;
 	path_id_t path_id;
 };
 
 /*
  * File-scope command and argument state.  arglist is consulted by the
  * handlers below (e.g. for CAM_ARG_VERBOSE and CAM_ARG_ERR_RECOVER) and
  * updated by scsidoinquiry().
  */
 static cam_cmdmask cmdlist;
 static cam_argmask arglist;
 
 /*
  * Printable device-type names, indexed by camcontrol_devtype in
  * getdevtype().  The order must match that enumeration; the last entry
  * presumably corresponds to CC_DT_UNKNOWN (confirm in camcontrol.h).
  */
 static const char *devtype_names[] = {
 	"none",
 	"scsi",
 	"satl",
 	"ata",
 	"nvme",
 	"mmcsd",
 	"unknown",
 };
 
 /*
  * Forward declarations.  Everything except getoption() has internal
  * linkage.
  */
 camcontrol_optret getoption(struct camcontrol_opts *table, char *arg,
 			    uint32_t *cmdnum, cam_argmask *argnum,
 			    const char **subopt);
 static int getdevlist(struct cam_device *device);
 static int getdevtree(int argc, char **argv, char *combinedopt);
 static int getdevtype(struct cam_device *device);
 static int print_dev_scsi(struct device_match_result *dev_result, char *tmpstr);
 static int print_dev_ata(struct device_match_result *dev_result, char *tmpstr);
 static int print_dev_semb(struct device_match_result *dev_result, char *tmpstr);
 static int print_dev_mmcsd(struct device_match_result *dev_result,
     char *tmpstr);
 #ifdef WITH_NVME
 static int print_dev_nvme(struct device_match_result *dev_result, char *tmpstr);
 #endif
 static int testunitready(struct cam_device *device, int task_attr,
 			 int retry_count, int timeout, int quiet);
 static int scsistart(struct cam_device *device, int startstop, int loadeject,
 		     int task_attr, int retry_count, int timeout);
 static int scsiinquiry(struct cam_device *device, int task_attr,
 		       int retry_count, int timeout);
 static int scsiserial(struct cam_device *device, int task_attr,
 		      int retry_count, int timeout);
 static int parse_btl(char *tstr, path_id_t *bus, target_id_t *target,
 		     lun_id_t *lun, cam_argmask *arglst);
 static int reprobe(struct cam_device *device);
 static int dorescan_or_reset(int argc, char **argv, int rescan);
 static int rescan_or_reset_bus(path_id_t bus, int rescan);
 static int scanlun_or_reset_dev(path_id_t bus, target_id_t target,
     lun_id_t lun, int scan);
 static int readdefects(struct cam_device *device, int argc, char **argv,
 		       char *combinedopt, int task_attr, int retry_count,
 		       int timeout);
 static void modepage(struct cam_device *device, int argc, char **argv,
 		     char *combinedopt, int task_attr, int retry_count,
 		     int timeout);
 static int scsicmd(struct cam_device *device, int argc, char **argv,
 		   char *combinedopt, int task_attr, int retry_count,
 		   int timeout);
 static int smpcmd(struct cam_device *device, int argc, char **argv,
 		  char *combinedopt, int retry_count, int timeout);
 static int mmcsdcmd(struct cam_device *device, int argc, char **argv,
 		  char *combinedopt, int retry_count, int timeout);
 static int smpreportgeneral(struct cam_device *device, int argc, char **argv,
 			    char *combinedopt, int retry_count, int timeout);
 static int smpphycontrol(struct cam_device *device, int argc, char **argv,
 			 char *combinedopt, int retry_count, int timeout);
 static int smpmaninfo(struct cam_device *device, int argc, char **argv,
 		      char *combinedopt, int retry_count, int timeout);
 static int getdevid(struct cam_devitem *item);
 static int buildbusdevlist(struct cam_devlist *devlist);
 static void freebusdevlist(struct cam_devlist *devlist);
 static struct cam_devitem *findsasdevice(struct cam_devlist *devlist,
 					 uint64_t sasaddr);
 static int smpphylist(struct cam_device *device, int argc, char **argv,
 		      char *combinedopt, int retry_count, int timeout);
 static int tagcontrol(struct cam_device *device, int argc, char **argv,
 		      char *combinedopt);
 static void cts_print(struct cam_device *device,
 		      struct ccb_trans_settings *cts);
 static void cpi_print(struct ccb_pathinq *cpi);
 static int get_cpi(struct cam_device *device, struct ccb_pathinq *cpi);
 static int get_cgd(struct cam_device *device, struct ccb_getdev *cgd);
 static int get_print_cts(struct cam_device *device, int user_settings,
 			 int quiet, struct ccb_trans_settings *cts);
 static int ratecontrol(struct cam_device *device, int task_attr,
 		       int retry_count, int timeout, int argc, char **argv,
 		       char *combinedopt);
 static int scsiformat(struct cam_device *device, int argc, char **argv,
 		      char *combinedopt, int task_attr, int retry_count,
 		      int timeout);
 static int sanitize(struct cam_device *device, int argc, char **argv,
 			char *combinedopt, int task_attr, int retry_count,
 			int timeout);
 static int scsireportluns(struct cam_device *device, int argc, char **argv,
 			  char *combinedopt, int task_attr, int retry_count,
 			  int timeout);
 static int scsireadcapacity(struct cam_device *device, int argc, char **argv,
 			    char *combinedopt, int task_attr, int retry_count,
 			    int timeout);
 static int atapm(struct cam_device *device, int argc, char **argv,
 		 char *combinedopt, int retry_count, int timeout);
 static int atasecurity(struct cam_device *device, int retry_count, int timeout,
 		       int argc, char **argv, char *combinedopt);
 static int atahpa(struct cam_device *device, int retry_count, int timeout,
 		  int argc, char **argv, char *combinedopt);
 static int ataama(struct cam_device *device, int retry_count, int timeout,
 		  int argc, char **argv, char *combinedopt);
 static int scsiprintoneopcode(struct cam_device *device, int req_opcode,
 			      int sa_set, int req_sa, uint8_t *buf,
 			      uint32_t valid_len);
 static int scsiprintopcodes(struct cam_device *device, int td_req, uint8_t *buf,
 			    uint32_t valid_len);
 static int scsiopcodes(struct cam_device *device, int argc, char **argv,
 		       char *combinedopt, int task_attr, int retry_count,
 		       int timeout, int verbose);
 
 /*
  * Fallback min()/max(), defined only when no earlier header supplied
  * them.  These are plain macros: the selected argument is evaluated
  * twice, so do not pass expressions with side effects.
  */
 #ifndef min
 #define min(a,b) (((a)<(b))?(a):(b))
 #endif
 #ifndef max
 #define max(a,b) (((a)>(b))?(a):(b))
 #endif
 
 /*
  * Look up a (possibly abbreviated) subcommand name.
  *
  * arg is compared against the leading strlen(arg) characters of every
  * table entry, up to the entry whose optname is NULL.  On a match the
  * entry's command id, argument flags and option string are stored
  * through cmdnum, argnum and subopt.
  *
  * Returns CC_OR_FOUND for exactly one match, CC_OR_AMBIGUOUS as soon as
  * a second entry matches (the outputs then describe that second entry),
  * and CC_OR_NOT_FOUND when nothing matches or table is NULL.
  */
 camcontrol_optret
 getoption(struct camcontrol_opts *table, char *arg, uint32_t *cmdnum,
 	  cam_argmask *argnum, const char **subopt)
 {
 	struct camcontrol_opts *entry;
 	int hits;
 
 	if (table == NULL)
 		return (CC_OR_NOT_FOUND);
 
 	hits = 0;
 	for (entry = table; entry->optname != NULL; entry++) {
 		/* Skip rows whose name does not start with arg. */
 		if (strncmp(entry->optname, arg, strlen(arg)) != 0)
 			continue;
 
 		*cmdnum = entry->cmdnum;
 		*argnum = entry->argnum;
 		*subopt = entry->subopt;
 
 		hits++;
 		if (hits > 1)
 			return (CC_OR_AMBIGUOUS);
 	}
 
 	return ((hits > 0) ? CC_OR_FOUND : CC_OR_NOT_FOUND);
 }
 
 /*
  * Issue XPT_GDEVLIST requests for the given device and print one line
  * (peripheral name, unit, generation, index and status) per reply.
  *
  * Returns 0 on success and 1 when a CCB cannot be allocated, the CCB
  * cannot be sent, or a reply carries CAM_GDEVLIST_ERROR.
  */
 static int
 getdevlist(struct cam_device *device)
 {
 	union ccb *ccb;
 	char status[32];
 	int error = 0;
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		/* Do not dereference a failed allocation. */
 		warnx("couldn't allocate CCB");
 		return (1);
 	}
 
 	ccb->ccb_h.func_code = XPT_GDEVLIST;
 	ccb->ccb_h.flags = CAM_DIR_NONE;
 	ccb->ccb_h.retry_count = 1;
 	ccb->cgdl.index = 0;
 	/* Prime the status so the loop body runs at least once. */
 	ccb->cgdl.status = CAM_GDEVLIST_MORE_DEVS;
 	while (ccb->cgdl.status == CAM_GDEVLIST_MORE_DEVS) {
 		if (cam_send_ccb(device, ccb) < 0) {
 			perror("error getting device list");
 			cam_freeccb(ccb);
 			return (1);
 		}
 
 		/* An unrecognised status prints as an empty string. */
 		status[0] = '\0';
 
 		switch (ccb->cgdl.status) {
 			case CAM_GDEVLIST_MORE_DEVS:
 				strcpy(status, "MORE");
 				break;
 			case CAM_GDEVLIST_LAST_DEVICE:
 				strcpy(status, "LAST");
 				break;
 			case CAM_GDEVLIST_LIST_CHANGED:
 				strcpy(status, "CHANGED");
 				break;
 			case CAM_GDEVLIST_ERROR:
 				strcpy(status, "ERROR");
 				error = 1;
 				break;
 		}
 
 		fprintf(stdout, "%s%d:  generation: %d index: %d status: %s\n",
 			ccb->cgdl.periph_name,
 			ccb->cgdl.unit_number,
 			ccb->cgdl.generation,
 			ccb->cgdl.index,
 			status);
 
 		/*
 		 * If the list has changed, we need to start over from the
 		 * beginning.
 		 * NOTE(review): the loop condition only continues on
 		 * CAM_GDEVLIST_MORE_DEVS, so after this reset the loop
 		 * still terminates; confirm whether a restart is intended.
 		 */
 		if (ccb->cgdl.status == CAM_GDEVLIST_LIST_CHANGED)
 			ccb->cgdl.index = 0;
 	}
 
 	cam_freeccb(ccb);
 
 	return (error);
 }
 
 /*
  * Print the device tree ("devlist" subcommand).
  *
  * Opens the transport-layer device, fetches every node with
  * XPT_DEV_MATCH (no patterns, so everything matches) and prints buses,
  * devices and their peripherals.  The 'b' option restricts output to
  * buses unless verbose mode is on; without verbose mode, unconfigured
  * devices and bus lines are suppressed.
  *
  * Returns 0 on success, 1 on any open/allocation/ioctl/CAM error.
  */
 static int
 getdevtree(int argc, char **argv, char *combinedopt)
 {
 	union ccb ccb;
 	int bufsize, fd;
 	unsigned int i;
 	/*
 	 * need_close is non-zero while a "(" has been printed for the
 	 * current device; values above 1 mean at least one peripheral name
 	 * has already been emitted, so the next one needs a comma.
 	 */
 	int need_close = 0;
 	int error = 0;
 	/* Set when the current device was not printed; hides its peripherals. */
 	int skip_device = 0;
 	int busonly = 0;
 	int c;
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch(c) {
 		case 'b':
 			if ((arglist & CAM_ARG_VERBOSE) == 0)
 				busonly = 1;
 			break;
 		default:
 			break;
 		}
 	}
 
 	if ((fd = open(XPT_DEVICE, O_RDWR)) == -1) {
 		warn("couldn't open %s", XPT_DEVICE);
 		return (1);
 	}
 
 	bzero(&ccb, sizeof(union ccb));
 
 	ccb.ccb_h.path_id = CAM_XPT_PATH_ID;
 	ccb.ccb_h.target_id = CAM_TARGET_WILDCARD;
 	ccb.ccb_h.target_lun = CAM_LUN_WILDCARD;
 
 	ccb.ccb_h.func_code = XPT_DEV_MATCH;
 	bufsize = sizeof(struct dev_match_result) * 100;
 	ccb.cdm.match_buf_len = bufsize;
 	ccb.cdm.matches = (struct dev_match_result *)malloc(bufsize);
 	if (ccb.cdm.matches == NULL) {
 		warnx("can't malloc memory for matches");
 		close(fd);
 		return (1);
 	}
 	ccb.cdm.num_matches = 0;
 
 	/*
 	 * We fetch all nodes, since we display most of them in the default
 	 * case, and all in the verbose case.
 	 */
 	ccb.cdm.num_patterns = 0;
 	ccb.cdm.pattern_buf_len = 0;
 
 	/*
 	 * We do the ioctl multiple times if necessary, in case there are
 	 * more than 100 nodes in the EDT.
 	 */
 	do {
 		if (ioctl(fd, CAMIOCOMMAND, &ccb) == -1) {
 			warn("error sending CAMIOCOMMAND ioctl");
 			error = 1;
 			break;
 		}
 
 		if ((ccb.ccb_h.status != CAM_REQ_CMP)
 		 || ((ccb.cdm.status != CAM_DEV_MATCH_LAST)
 		    && (ccb.cdm.status != CAM_DEV_MATCH_MORE))) {
 			warnx("got CAM error %#x, CDM error %d\n",
 			      ccb.ccb_h.status, ccb.cdm.status);
 			error = 1;
 			break;
 		}
 
 		for (i = 0; i < ccb.cdm.num_matches; i++) {
 			switch (ccb.cdm.matches[i].type) {
 			case DEV_MATCH_BUS: {
 				struct bus_match_result *bus_result;
 
 				/*
 				 * Only print the bus information if the
 				 * user turns on the verbose flag.
 				 */
 				if ((busonly == 0) &&
 				    (arglist & CAM_ARG_VERBOSE) == 0)
 					break;
 
 				bus_result =
 					&ccb.cdm.matches[i].result.bus_result;
 
 				/* Finish the previous device's "(...)" list. */
 				if (need_close) {
 					fprintf(stdout, ")\n");
 					need_close = 0;
 				}
 
 				fprintf(stdout, "scbus%d on %s%d bus %d%s\n",
 					bus_result->path_id,
 					bus_result->dev_name,
 					bus_result->unit_number,
 					bus_result->bus_id,
 					(busonly ? "" : ":"));
 				break;
 			}
 			case DEV_MATCH_DEVICE: {
 				struct device_match_result *dev_result;
 				char tmpstr[256];
 
 				if (busonly == 1)
 					break;
 
 				dev_result =
 				     &ccb.cdm.matches[i].result.device_result;
 
 				if ((dev_result->flags
 				     & DEV_RESULT_UNCONFIGURED)
 				 && ((arglist & CAM_ARG_VERBOSE) == 0)) {
 					skip_device = 1;
 					break;
 				} else
 					skip_device = 0;
 
 				/*
 				 * Build the "<...>" identification string
 				 * with the helper for this protocol; a
 				 * helper failure skips the device.
 				 */
 				if (dev_result->protocol == PROTO_SCSI) {
 					if (print_dev_scsi(dev_result,
 					    &tmpstr[0]) != 0) {
 						skip_device = 1;
 						break;
 					}
 				} else if (dev_result->protocol == PROTO_ATA ||
 				    dev_result->protocol == PROTO_SATAPM) {
 					if (print_dev_ata(dev_result,
 					    &tmpstr[0]) != 0) {
 						skip_device = 1;
 						break;
 					}
 				} else if (dev_result->protocol == PROTO_MMCSD){
 					if (print_dev_mmcsd(dev_result,
 					    &tmpstr[0]) != 0) {
 						skip_device = 1;
 						break;
 					}
 				} else if (dev_result->protocol == PROTO_SEMB) {
 					if (print_dev_semb(dev_result,
 					    &tmpstr[0]) != 0) {
 						skip_device = 1;
 						break;
 					}
 #ifdef WITH_NVME
 				} else if (dev_result->protocol == PROTO_NVME) {
 					if (print_dev_nvme(dev_result,
 					    &tmpstr[0]) != 0) {
 						skip_device = 1;
 						break;
 					}
 #endif
 				} else {
 				    sprintf(tmpstr, "<>");
 				}
 				if (need_close) {
 					fprintf(stdout, ")\n");
 					need_close = 0;
 				}
 
 				fprintf(stdout, "%-33s  at scbus%d "
 					"target %d lun %jx (",
 					tmpstr,
 					dev_result->path_id,
 					dev_result->target_id,
 					(uintmax_t)dev_result->target_lun);
 
 				need_close = 1;
 
 				break;
 			}
 			case DEV_MATCH_PERIPH: {
 				struct periph_match_result *periph_result;
 
 				periph_result =
 				      &ccb.cdm.matches[i].result.periph_result;
 
 				if (busonly || skip_device != 0)
 					break;
 
 				/* Comma-separate the second and later names. */
 				if (need_close > 1)
 					fprintf(stdout, ",");
 
 				fprintf(stdout, "%s%d",
 					periph_result->periph_name,
 					periph_result->unit_number);
 
 				need_close++;
 				break;
 			}
 			default:
 				fprintf(stdout, "unknown match type\n");
 				break;
 			}
 		}
 
 	} while ((ccb.ccb_h.status == CAM_REQ_CMP)
 		&& (ccb.cdm.status == CAM_DEV_MATCH_MORE));
 
 	if (need_close)
 		fprintf(stdout, ")\n");
 
 	/* Release the match buffer allocated above; it was previously leaked. */
 	free(ccb.cdm.matches);
 	close(fd);
 
 	return (error);
 }
 
 /*
  * Print the device-type name for cam_dev on stdout.
  *
  * The type is obtained from get_device_type(); if that fails, or the
  * reported type is above CC_DT_UNKNOWN, "illegal" is printed and 1 is
  * returned.  Otherwise the matching devtype_names[] entry is printed
  * and 0 is returned.
  */
 static int
 getdevtype(struct cam_device *cam_dev)
 {
 	camcontrol_devtype devtype;
 	int rc;
 
 	/* Ask for the type only; the original comment notes no I/O is requested. */
 	rc = get_device_type(cam_dev, -1, 0, 0, &devtype);
 
 	if (rc == 0 && (unsigned)devtype <= CC_DT_UNKNOWN) {
 		fprintf(stdout, "%s\n", devtype_names[devtype]);
 		return (0);
 	}
 
 	fprintf(stdout, "illegal\n");
 	return (1);
 }
 
 /*
  * Format "<vendor product revision>" for a SCSI device into tmpstr,
  * using the inquiry data carried in the match result.  Each field is
  * passed through cam_strvis() first.  Always returns 0.
  *
  * tmpstr has no length parameter; the caller must supply a buffer large
  * enough for the three bounded fields plus separators.
  */
 static int
 print_dev_scsi(struct device_match_result *dev_result, char *tmpstr)
 {
 	char vend[16];
 	char prod[48];
 	char rev[16];
 
 	cam_strvis(vend, dev_result->inq_data.vendor,
 	    sizeof(dev_result->inq_data.vendor), sizeof(vend));
 
 	cam_strvis(prod, dev_result->inq_data.product,
 	    sizeof(dev_result->inq_data.product), sizeof(prod));
 
 	cam_strvis(rev, dev_result->inq_data.revision,
 	    sizeof(dev_result->inq_data.revision), sizeof(rev));
 
 	sprintf(tmpstr, "<%s %s %s>", vend, prod, rev);
 	return (0);
 }
 
 /*
  * Format "<model revision>" for an ATA device into tmpstr, using the
  * identify data carried in the match result.  Both fields are passed
  * through cam_strvis() first.  Always returns 0.
  */
 static int
 print_dev_ata(struct device_match_result *dev_result, char *tmpstr)
 {
 	char model[48];
 	char fwrev[16];
 
 	cam_strvis(model, dev_result->ident_data.model,
 	    sizeof(dev_result->ident_data.model), sizeof(model));
 
 	cam_strvis(fwrev, dev_result->ident_data.revision,
 	    sizeof(dev_result->ident_data.revision), sizeof(fwrev));
 
 	sprintf(tmpstr, "<%s %s>", model, fwrev);
 	return (0);
 }
 
 /*
  * Format "<vendor product revision firmware>" for a SEMB device into
  * tmpstr.  The match result's ident_data is reinterpreted as a
  * struct sep_identify_data and each field is passed through
  * cam_strvis().  Always returns 0.
  */
 static int
 print_dev_semb(struct device_match_result *dev_result, char *tmpstr)
 {
 	struct sep_identify_data *sep;
 	char vend[16];
 	char prod[48];
 	char rev[16];
 	char fwrev[5];
 
 	sep = (struct sep_identify_data *)&dev_result->ident_data;
 
 	cam_strvis(vend, sep->vendor_id,
 	    sizeof(sep->vendor_id), sizeof(vend));
 
 	cam_strvis(prod, sep->product_id,
 	    sizeof(sep->product_id), sizeof(prod));
 
 	cam_strvis(rev, sep->product_rev,
 	    sizeof(sep->product_rev), sizeof(rev));
 
 	cam_strvis(fwrev, sep->firmware_rev,
 	    sizeof(sep->firmware_rev), sizeof(fwrev));
 
 	sprintf(tmpstr, "<%s %s %s %s>", vend, prod, rev, fwrev);
 	return (0);
 }
 
 /*
  * Format an identification string for an MMC/SD device into tmpstr.
  *
  * The device is opened by bus/target/lun and its MMC parameters are
  * requested with XPT_DEV_ADVINFO.  If a model string is present the
  * result is "<model>"; otherwise it is "<SDIO card>" or
  * "<unknown card>" depending on the CARD_FEATURE_SDIO bit.
  *
  * Returns 0 on success and 1 if the device cannot be opened, a CCB
  * cannot be allocated, or the CCB cannot be sent.  The device and CCB
  * are released on every path.
  */
 static int
 print_dev_mmcsd(struct device_match_result *dev_result, char *tmpstr)
 {
 	union ccb *ccb;
 	struct ccb_dev_advinfo *advi;
 	struct cam_device *dev;
 	struct mmc_params mmc_ident_data;
 
 	/*
 	 * Start from a zeroed structure: the CCB status is not checked
 	 * below, so if the request completes without filling the buffer
 	 * the code must not read indeterminate stack contents.  A zeroed
 	 * structure yields "<unknown card>".
 	 */
 	memset(&mmc_ident_data, 0, sizeof(mmc_ident_data));
 
 	dev = cam_open_btl(dev_result->path_id, dev_result->target_id,
 	    dev_result->target_lun, O_RDWR, NULL);
 	if (dev == NULL) {
 		warnx("%s", cam_errbuf);
 		return (1);
 	}
 
 	ccb = cam_getccb(dev);
 	if (ccb == NULL) {
 		warnx("couldn't allocate CCB");
 		cam_close_device(dev);
 		return (1);
 	}
 
 	advi = &ccb->cdai;
 	advi->ccb_h.flags = CAM_DIR_IN;
 	advi->ccb_h.func_code = XPT_DEV_ADVINFO;
 	advi->flags = CDAI_FLAG_NONE;
 	advi->buftype = CDAI_TYPE_MMC_PARAMS;
 	advi->bufsiz = sizeof(struct mmc_params);
 	advi->buf = (uint8_t *)&mmc_ident_data;
 
 	if (cam_send_ccb(dev, ccb) < 0) {
 		warn("error sending CAMIOCOMMAND ioctl");
 		cam_freeccb(ccb);
 		cam_close_device(dev);
 		return (1);
 	}
 
 	if (strlen(mmc_ident_data.model) > 0) {
 		sprintf(tmpstr, "<%s>", mmc_ident_data.model);
 	} else {
 		sprintf(tmpstr, "<%s card>",
 		    mmc_ident_data.card_features &
 		    CARD_FEATURE_SDIO ? "SDIO" : "unknown");
 	}
 
 	cam_freeccb(ccb);
 	cam_close_device(dev);
 	return (0);
 }
 
 #ifdef WITH_NVME
 static int
 nvme_get_cdata(struct cam_device *dev, struct nvme_controller_data *cdata)
 {
 	union ccb *ccb;
 	struct ccb_dev_advinfo *advi;
 
 	ccb = cam_getccb(dev);
 	if (ccb == NULL) {
 		warnx("couldn't allocate CCB");
 		cam_close_device(dev);
 		return (1);
 	}
 
 	advi = &ccb->cdai;
 	advi->ccb_h.flags = CAM_DIR_IN;
 	advi->ccb_h.func_code = XPT_DEV_ADVINFO;
 	advi->flags = CDAI_FLAG_NONE;
 	advi->buftype = CDAI_TYPE_NVME_CNTRL;
 	advi->bufsiz = sizeof(struct nvme_controller_data);
 	advi->buf = (uint8_t *)cdata;
 
 	if (cam_send_ccb(dev, ccb) < 0) {
 		warn("error sending CAMIOCOMMAND ioctl");
 		cam_freeccb(ccb);
 		cam_close_device(dev);
 		return(1);
 	}
 	if (advi->ccb_h.status != CAM_REQ_CMP) {
 		warnx("got CAM error %#x", advi->ccb_h.status);
 		cam_freeccb(ccb);
 		cam_close_device(dev);
 		return(1);
 	}
 	cam_freeccb(ccb);
 	return 0;
 }
 
 static int
 print_dev_nvme(struct device_match_result *dev_result, char *tmpstr)
 {
 	struct cam_device *dev;
 	struct nvme_controller_data cdata;
 	char vendor[64], product[64];
 
 	dev = cam_open_btl(dev_result->path_id, dev_result->target_id,
 	    dev_result->target_lun, O_RDWR, NULL);
 	if (dev == NULL) {
 		warnx("%s", cam_errbuf);
 		return (1);
 	}
 
 	if (nvme_get_cdata(dev, &cdata))
 		return (1);
 
 	cam_strvis(vendor, cdata.mn, sizeof(cdata.mn), sizeof(vendor));
 	cam_strvis(product, cdata.fr, sizeof(cdata.fr), sizeof(product));
 	sprintf(tmpstr, "<%s %s>", vendor, product);
 
 	cam_close_device(dev);
 	return (0);
 }
 #endif
 
 /*
  * Send a TEST UNIT READY command to the device.
  *
  * task_attr and retry_count are passed to scsi_test_unit_ready();
  * timeout is passed through as well, with 0 replaced by a default of
  * 5000.  When quiet is non-zero the "Unit is ready"/"Unit is not ready"
  * messages and the allocation/send error messages are suppressed.
  * Verbose mode (CAM_ARG_VERBOSE in arglist) prints full CAM error
  * details to stderr.
  *
  * Returns 0 if the command completed with CAM_REQ_CMP, 1 otherwise
  * (including CCB allocation or send failure).
  */
 static int
 testunitready(struct cam_device *device, int task_attr, int retry_count,
 	      int timeout, int quiet)
 {
 	int error = 0;
 	union ccb *ccb;
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		/* Do not dereference a failed allocation. */
 		if (quiet == 0)
 			warnx("couldn't allocate CCB");
 		return (1);
 	}
 
 	scsi_test_unit_ready(&ccb->csio,
 			     /* retries */ retry_count,
 			     /* cbfcnp */ NULL,
 			     /* tag_action */ task_attr,
 			     /* sense_len */ SSD_FULL_SIZE,
 			     /* timeout */ timeout ? timeout : 5000);
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (arglist & CAM_ARG_ERR_RECOVER)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (cam_send_ccb(device, ccb) < 0) {
 		if (quiet == 0)
 			perror("error sending test unit ready");
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 
 		cam_freeccb(ccb);
 		return (1);
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 		if (quiet == 0)
 			fprintf(stdout, "Unit is ready\n");
 	} else {
 		if (quiet == 0)
 			fprintf(stdout, "Unit is not ready\n");
 		error = 1;
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 	}
 
 	cam_freeccb(ccb);
 
 	return (error);
 }
 
 /*
  * Send a START STOP UNIT command to the device.
  *
  * startstop non-zero starts the unit, zero stops it; loadeject is
  * passed through as the load/eject flag and selects the
  * "Media loaded"/"Media ejected" suffix of the success message.
  * timeout of 0 selects a default of 120000.
  *
  * Returns 0 if the command completed with CAM_REQ_CMP, 1 otherwise
  * (including CCB allocation or send failure).
  */
 static int
 scsistart(struct cam_device *device, int startstop, int loadeject,
 	  int task_attr, int retry_count, int timeout)
 {
 	union ccb *ccb;
 	int error = 0;
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		/* Do not dereference a failed allocation. */
 		warnx("couldn't allocate CCB");
 		return (1);
 	}
 
 	/*
 	 * If we're stopping, send an ordered tag so the drive in question
 	 * will finish any previously queued writes before stopping.  If
 	 * the device isn't capable of tagged queueing, or if tagged
 	 * queueing is turned off, the tag action is a no-op.  We override
 	 * the default simple tag, although this also has the effect of
 	 * overriding the user's wishes if he wanted to specify a simple
 	 * tag.
 	 */
 	if ((startstop == 0)
 	 && (task_attr == MSG_SIMPLE_Q_TAG))
 		task_attr = MSG_ORDERED_Q_TAG;
 
 	scsi_start_stop(&ccb->csio,
 			/* retries */ retry_count,
 			/* cbfcnp */ NULL,
 			/* tag_action */ task_attr,
 			/* start/stop */ startstop,
 			/* load_eject */ loadeject,
 			/* immediate */ 0,
 			/* sense_len */ SSD_FULL_SIZE,
 			/* timeout */ timeout ? timeout : 120000);
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (arglist & CAM_ARG_ERR_RECOVER)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (cam_send_ccb(device, ccb) < 0) {
 		perror("error sending start unit");
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 
 		cam_freeccb(ccb);
 		return (1);
 	}
 
 	/* Braced explicitly so the else below cannot bind to the inner if. */
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 		if (startstop) {
 			fprintf(stdout, "Unit started successfully");
 			if (loadeject)
 				fprintf(stdout,", Media loaded\n");
 			else
 				fprintf(stdout,"\n");
 		} else {
 			fprintf(stdout, "Unit stopped successfully");
 			if (loadeject)
 				fprintf(stdout, ", Media ejected\n");
 			else
 				fprintf(stdout, "\n");
 		}
 	} else {
 		error = 1;
 		if (startstop)
 			fprintf(stdout,
 				"Error received from start unit command\n");
 		else
 			fprintf(stdout,
 				"Error received from stop unit command\n");
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 	}
 
 	cam_freeccb(ccb);
 
 	return (error);
 }
 
 /*
  * Handle the "inquiry" subcommand.
  *
  * Options: -D requests the standard inquiry, -R the transfer rate and
  * -S the serial number; each sets the matching CAM_ARG_GET_* bit in
  * arglist.  With none of them given, all three are performed.
  *
  * The standard inquiry runs first and its failure ends the function
  * with that error.  The serial-number result is ignored.  Otherwise the
  * return value is that of camxferrate() when the transfer rate was
  * requested, or 0.
  */
 int
 scsidoinquiry(struct cam_device *device, int argc, char **argv,
 	      char *combinedopt, int task_attr, int retry_count, int timeout)
 {
 	int opt;
 	int rc;
 
 	for (;;) {
 		opt = getopt(argc, argv, combinedopt);
 		if (opt == -1)
 			break;
 
 		if (opt == 'D')
 			arglist |= CAM_ARG_GET_STDINQ;
 		else if (opt == 'R')
 			arglist |= CAM_ARG_GET_XFERRATE;
 		else if (opt == 'S')
 			arglist |= CAM_ARG_GET_SERIAL;
 		/* Any other option letter is ignored here. */
 	}
 
 	/*
 	 * No explicit selection means the user wants every inquiry item.
 	 */
 	if ((arglist & CAM_ARG_INQ_MASK) == 0)
 		arglist |= CAM_ARG_INQ_MASK;
 
 	if (arglist & CAM_ARG_GET_STDINQ) {
 		rc = scsiinquiry(device, task_attr, retry_count, timeout);
 		if (rc != 0)
 			return (rc);
 	}
 
 	/* The serial-number status deliberately does not affect the result. */
 	if (arglist & CAM_ARG_GET_SERIAL)
 		scsiserial(device, task_attr, retry_count, timeout);
 
 	if (arglist & CAM_ARG_GET_XFERRATE)
 		return (camxferrate(device));
 
 	return (0);
 }
 
 /*
  * Send a standard (non-EVPD) SCSI INQUIRY to the device and print the
  * result with scsi_print_inquiry(), prefixed by the device name and unit.
  *
  * task_attr is passed as the tag action, retry_count as the retry count and
  * timeout as the command timeout (5000 is used when timeout is 0).
  *
  * Returns 0 on success and 1 on any failure (CCB or buffer allocation,
  * failure to send the CCB, or a completion status other than CAM_REQ_CMP).
  * The CCB and the inquiry buffer are released on every path.
  */
 static int
 scsiinquiry(struct cam_device *device, int task_attr, int retry_count,
 	    int timeout)
 {
 	union ccb *ccb;
 	struct scsi_inquiry_data *inq_buf;
 	int error = 0;
 
 	ccb = cam_getccb(device);
 
 	if (ccb == NULL) {
 		warnx("couldn't allocate CCB");
 		return (1);
 	}
 
 	/* cam_getccb cleans up the header, caller has to zero the payload */
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	inq_buf = (struct scsi_inquiry_data *)malloc(
 		sizeof(struct scsi_inquiry_data));
 
 	if (inq_buf == NULL) {
 		cam_freeccb(ccb);
 		/* warnx() supplies the trailing newline itself. */
 		warnx("can't malloc memory for inquiry");
 		return (1);
 	}
 	bzero(inq_buf, sizeof(*inq_buf));
 
 	/*
 	 * Note that although the size of the inquiry buffer is the full
 	 * 256 bytes specified in the SCSI spec, we only tell the device
 	 * that we have allocated SHORT_INQUIRY_LENGTH bytes.  There are
 	 * two reasons for this:
 	 *
 	 *  - The SCSI spec says that when a length field is only 1 byte,
 	 *    a value of 0 will be interpreted as 256.  Therefore
 	 *    scsi_inquiry() will convert an inq_len (which is passed in as
 	 *    a u_int32_t, but the field in the CDB is only 1 byte) of 256
 	 *    to 0.  Evidently, very few devices meet the spec in that
 	 *    regard.  Some devices, like many Seagate disks, take the 0 as
 	 *    0, and don't return any data.  One Pioneer DVD-R drive
 	 *    returns more data than the command asked for.
 	 *
 	 *    So, since there are numerous devices that just don't work
 	 *    right with the full inquiry size, we don't send the full size.
 	 *
 	 *  - The second reason not to use the full inquiry data length is
 	 *    that we don't need it here.  The only reason we issue a
 	 *    standard inquiry is to get the vendor name, device name,
 	 *    and revision so scsi_print_inquiry() can print them.
 	 *
 	 * If, at some point in the future, more inquiry data is needed for
 	 * some reason, this code should use a procedure similar to the
 	 * probe code.  i.e., issue a short inquiry, and determine from
 	 * the additional length passed back from the device how much
 	 * inquiry data the device supports.  Once the amount the device
 	 * supports is determined, issue an inquiry for that amount and no
 	 * more.
 	 *
 	 * KDM, 2/18/2000
 	 */
 	scsi_inquiry(&ccb->csio,
 		     /* retries */ retry_count,
 		     /* cbfcnp */ NULL,
 		     /* tag_action */ task_attr,
 		     /* inq_buf */ (u_int8_t *)inq_buf,
 		     /* inq_len */ SHORT_INQUIRY_LENGTH,
 		     /* evpd */ 0,
 		     /* page_code */ 0,
 		     /* sense_len */ SSD_FULL_SIZE,
 		     /* timeout */ timeout ? timeout : 5000);
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (arglist & CAM_ARG_ERR_RECOVER)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (cam_send_ccb(device, ccb) < 0) {
 		perror("error sending SCSI inquiry");
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 
 		/* Leave through the common exit so inq_buf is released too. */
 		error = 1;
 		goto inquiry_bailout;
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		error = 1;
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 
 		goto inquiry_bailout;
 	}
 
 	fprintf(stdout, "%s%d: ", device->device_name,
 		device->dev_unit_num);
 	scsi_print_inquiry(inq_buf);
 
 inquiry_bailout:
 
 	cam_freeccb(ccb);
 	free(inq_buf);
 
 	return (error);
 }
 
 /*
  * Fetch the unit serial number VPD page (EVPD INQUIRY with page code
  * SVPD_UNIT_SERIAL_NUMBER) and print the serial number on stdout.
  *
  * When standard inquiry or transfer-rate output was also requested, the
  * line is prefixed with "<name><unit>: Serial Number ".
  *
  * Returns 0 on success and 1 on any failure (allocation, failure to send
  * the CCB, or a completion status other than CAM_REQ_CMP).
  */
 static int
 scsiserial(struct cam_device *device, int task_attr, int retry_count,
 	   int timeout)
 {
 	union ccb *ccb;
 	struct scsi_vpd_unit_serial_number *serial_buf;
 	char serial_num[SVPD_SERIAL_NUM_SIZE + 1];
 	size_t serial_len;
 	int error = 0;
 
 	ccb = cam_getccb(device);
 
 	if (ccb == NULL) {
 		warnx("couldn't allocate CCB");
 		return (1);
 	}
 
 	/* cam_getccb cleans up the header, caller has to zero the payload */
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	serial_buf = (struct scsi_vpd_unit_serial_number *)
 		malloc(sizeof(*serial_buf));
 
 	if (serial_buf == NULL) {
 		cam_freeccb(ccb);
 		warnx("can't malloc memory for serial number");
 		return (1);
 	}
 	/*
 	 * Start from a zeroed buffer so that a response shorter than the
 	 * structure never leaves uninitialized bytes to be copied below.
 	 */
 	bzero(serial_buf, sizeof(*serial_buf));
 
 	scsi_inquiry(&ccb->csio,
 		     /*retries*/ retry_count,
 		     /*cbfcnp*/ NULL,
 		     /* tag_action */ task_attr,
 		     /* inq_buf */ (u_int8_t *)serial_buf,
 		     /* inq_len */ sizeof(*serial_buf),
 		     /* evpd */ 1,
 		     /* page_code */ SVPD_UNIT_SERIAL_NUMBER,
 		     /* sense_len */ SSD_FULL_SIZE,
 		     /* timeout */ timeout ? timeout : 5000);
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (arglist & CAM_ARG_ERR_RECOVER)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (cam_send_ccb(device, ccb) < 0) {
 		warn("error getting serial number");
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 
 		error = 1;
 		goto serial_bailout;
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		error = 1;
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 
 		goto serial_bailout;
 	}
 
 	/*
 	 * The length comes from the device.  Never copy more than the local
 	 * buffer can hold (leaving room for the terminating NUL), whatever
 	 * the device claims.
 	 */
 	serial_len = serial_buf->length;
 	if (serial_len > sizeof(serial_num) - 1)
 		serial_len = sizeof(serial_num) - 1;
 
 	bcopy(serial_buf->serial_num, serial_num, serial_len);
 	serial_num[serial_len] = '\0';
 
 	if ((arglist & CAM_ARG_GET_STDINQ)
 	 || (arglist & CAM_ARG_GET_XFERRATE))
 		fprintf(stdout, "%s%d: Serial Number ",
 			device->device_name, device->dev_unit_num);
 
 	fprintf(stdout, "%.60s\n", serial_num);
 
 serial_bailout:
 
 	cam_freeccb(ccb);
 	free(serial_buf);
 
 	return (error);
 }
 
 /*
  * Query the current transfer settings of the device
  * (XPT_GET_TRAN_SETTINGS with CTS_TYPE_CURRENT_SETTINGS) and print a
  * one-line summary on stdout: the transfer speed followed by
  * transport-specific details for SPI, ATA and SATA, and a note when SCSI
  * tagged queueing is enabled.
  *
  * The speed starts as the path's base_transfer_speed and is overridden by
  * whatever the transport reports as valid.
  *
  * Returns 1 on failure; on success the (non-negative) result of
  * cam_send_ccb() is returned.
  */
 int
 camxferrate(struct cam_device *device)
 {
 	struct ccb_pathinq cpi;
 	u_int32_t freq = 0;
 	u_int32_t speed = 0;
 	union ccb *ccb;
 	u_int mb;
 	int retval = 0;
 
 	if (get_cpi(device, &cpi) != 0)
 		return (1);
 
 	ccb = cam_getccb(device);
 
 	if (ccb == NULL) {
 		warnx("couldn't allocate CCB");
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cts);
 
 	ccb->ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
 	ccb->cts.type = CTS_TYPE_CURRENT_SETTINGS;
 
 	if (((retval = cam_send_ccb(device, ccb)) < 0)
 	 || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) {
 		const char error_string[] = "error getting transfer settings";
 
 		/*
 		 * Only a failed send leaves a meaningful errno, so warn() is
 		 * used there and warnx() otherwise.  The message is passed as
 		 * an argument, never as the format string itself.
 		 */
 		if (retval < 0)
 			warn("%s", error_string);
 		else
 			warnx("%s", error_string);
 
 		if (arglist & CAM_ARG_VERBOSE)
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 
 		retval = 1;
 
 		goto xferrate_bailout;
 	}
 
 	/* Work out the speed from whatever the transport reports as valid. */
 	speed = cpi.base_transfer_speed;
 	freq = 0;
 	if (ccb->cts.transport == XPORT_SPI) {
 		struct ccb_trans_settings_spi *spi =
 		    &ccb->cts.xport_specific.spi;
 
 		if ((spi->valid & CTS_SPI_VALID_SYNC_RATE) != 0) {
 			freq = scsi_calc_syncsrate(spi->sync_period);
 			speed = freq;
 		}
 		if ((spi->valid & CTS_SPI_VALID_BUS_WIDTH) != 0) {
 			speed *= (0x01 << spi->bus_width);
 		}
 	} else if (ccb->cts.transport == XPORT_FC) {
 		struct ccb_trans_settings_fc *fc =
 		    &ccb->cts.xport_specific.fc;
 
 		if (fc->valid & CTS_FC_VALID_SPEED)
 			speed = fc->bitrate;
 	} else if (ccb->cts.transport == XPORT_SAS) {
 		struct ccb_trans_settings_sas *sas =
 		    &ccb->cts.xport_specific.sas;
 
 		if (sas->valid & CTS_SAS_VALID_SPEED)
 			speed = sas->bitrate;
 	} else if (ccb->cts.transport == XPORT_ATA) {
 		struct ccb_trans_settings_pata *pata =
 		    &ccb->cts.xport_specific.ata;
 
 		if (pata->valid & CTS_ATA_VALID_MODE)
 			speed = ata_mode2speed(pata->mode);
 	} else if (ccb->cts.transport == XPORT_SATA) {
 		struct	ccb_trans_settings_sata *sata =
 		    &ccb->cts.xport_specific.sata;
 
 		if (sata->valid & CTS_SATA_VALID_REVISION)
 			speed = ata_revision2speed(sata->revision);
 	}
 
 	/* mb, speed and freq are unsigned, so print them as such. */
 	mb = speed / 1000;
 	if (mb > 0) {
 		fprintf(stdout, "%s%d: %u.%03uMB/s transfers",
 			device->device_name, device->dev_unit_num,
 			mb, speed % 1000);
 	} else {
 		fprintf(stdout, "%s%d: %uKB/s transfers",
 			device->device_name, device->dev_unit_num,
 			speed);
 	}
 
 	/* Append the parenthesised transport-specific details. */
 	if (ccb->cts.transport == XPORT_SPI) {
 		struct ccb_trans_settings_spi *spi =
 		    &ccb->cts.xport_specific.spi;
 
 		if (((spi->valid & CTS_SPI_VALID_SYNC_OFFSET) != 0)
 		 && (spi->sync_offset != 0))
 			fprintf(stdout, " (%u.%03uMHz, offset %d", freq / 1000,
 				freq % 1000, spi->sync_offset);
 
 		if (((spi->valid & CTS_SPI_VALID_BUS_WIDTH) != 0)
 		 && (spi->bus_width > 0)) {
 			/*
 			 * Continue the parenthesis opened for the sync
 			 * offset above, or open one here.
 			 */
 			if (((spi->valid & CTS_SPI_VALID_SYNC_OFFSET) != 0)
 			 && (spi->sync_offset != 0)) {
 				fprintf(stdout, ", ");
 			} else {
 				fprintf(stdout, " (");
 			}
 			fprintf(stdout, "%dbit)", 8 * (0x01 << spi->bus_width));
 		} else if (((spi->valid & CTS_SPI_VALID_SYNC_OFFSET) != 0)
 		 && (spi->sync_offset != 0)) {
 			fprintf(stdout, ")");
 		}
 	} else if (ccb->cts.transport == XPORT_ATA) {
 		struct ccb_trans_settings_pata *pata =
 		    &ccb->cts.xport_specific.ata;
 
 		printf(" (");
 		if (pata->valid & CTS_ATA_VALID_MODE)
 			printf("%s, ", ata_mode2string(pata->mode));
 		if ((pata->valid & CTS_ATA_VALID_ATAPI) && pata->atapi != 0)
 			printf("ATAPI %dbytes, ", pata->atapi);
 		if (pata->valid & CTS_ATA_VALID_BYTECOUNT)
 			printf("PIO %dbytes", pata->bytecount);
 		printf(")");
 	} else if (ccb->cts.transport == XPORT_SATA) {
 		struct ccb_trans_settings_sata *sata =
 		    &ccb->cts.xport_specific.sata;
 
 		printf(" (");
 		if (sata->valid & CTS_SATA_VALID_REVISION)
 			printf("SATA %d.x, ", sata->revision);
 		else
 			printf("SATA, ");
 		if (sata->valid & CTS_SATA_VALID_MODE)
 			printf("%s, ", ata_mode2string(sata->mode));
 		if ((sata->valid & CTS_SATA_VALID_ATAPI) && sata->atapi != 0)
 			printf("ATAPI %dbytes, ", sata->atapi);
 		if (sata->valid & CTS_SATA_VALID_BYTECOUNT)
 			printf("PIO %dbytes", sata->bytecount);
 		printf(")");
 	}
 
 	if (ccb->cts.protocol == PROTO_SCSI) {
 		struct ccb_trans_settings_scsi *scsi =
 		    &ccb->cts.proto_specific.scsi;
 		if (scsi->valid & CTS_SCSI_VALID_TQ) {
 			if (scsi->flags & CTS_SCSI_FLAGS_TAG_ENB) {
 				fprintf(stdout, ", Command Queueing Enabled");
 			}
 		}
 	}
 
 	fprintf(stdout, "\n");
 
 xferrate_bailout:
 
 	cam_freeccb(ccb);
 
 	return (retval);
 }
 
 /*
  * Print the "Host Protected Area" rows of the feature table.
  *
  * parm    - parameter block describing the device.
  * hpasize - size compared against, and printed next to, the device's
  *           current sector count.
  * header  - when non-zero, print the table heading first.
  *
  * The current sector count is the lba_size48_* value when that is
  * non-zero, otherwise the lba_size_* value.  The "Enabled" column reads
  * "yes" when hpasize is larger than that count.
  */
 static void
 atahpa_print(struct ata_params *parm, u_int64_t hpasize, int header)
 {
 	u_int32_t lbasize = (u_int32_t)parm->lba_size_1 |
 				((u_int32_t)parm->lba_size_2 << 16);
 
 	u_int64_t lbasize48 = ((u_int64_t)parm->lba_size48_1) |
 				((u_int64_t)parm->lba_size48_2 << 16) |
 				((u_int64_t)parm->lba_size48_3 << 32) |
 				((u_int64_t)parm->lba_size48_4 << 48);
 
 	if (header) {
 		printf("\nFeature                      "
 		       "Support  Enabled   Value\n");
 	}
 
 	printf("Host Protected Area (HPA)      ");
 	if (parm->support.command1 & ATA_SUPPORT_PROTECTED) {
 		u_int64_t lba = lbasize48 ? lbasize48 : lbasize;
 
 		/* %ju takes a uintmax_t, so convert explicitly. */
 		printf("yes      %s     %ju/%ju\n", (hpasize > lba) ? "yes" : "no ",
 			(uintmax_t)lba, (uintmax_t)hpasize);
 
 		printf("HPA - Security                 ");
 		if (parm->support.command2 & ATA_SUPPORT_MAXSECURITY)
 			printf("yes      %s\n", (parm->enabled.command2 &
 			    ATA_SUPPORT_MAXSECURITY) ? "yes" : "no ");
 		else
 			printf("no\n");
 	} else {
 		printf("no\n");
 	}
 }
 
 /*
  * Print the "Accessible Max Address Config" row of the feature table.
  *
  * parm       - parameter block describing the device.
  * nativesize - size compared against, and printed next to, the device's
  *              current sector count.
  * header     - when non-zero, print the table heading first.
  *
  * The current sector count is the lba_size48_* value when that is
  * non-zero, otherwise the lba_size_* value.  The "Enabled" column reads
  * "yes" when nativesize is larger than that count.
  */
 static void
 ataama_print(struct ata_params *parm, u_int64_t nativesize, int header)
 {
 	u_int32_t lbasize = (u_int32_t)parm->lba_size_1 |
 				((u_int32_t)parm->lba_size_2 << 16);
 
 	u_int64_t lbasize48 = ((u_int64_t)parm->lba_size48_1) |
 				((u_int64_t)parm->lba_size48_2 << 16) |
 				((u_int64_t)parm->lba_size48_3 << 32) |
 				((u_int64_t)parm->lba_size48_4 << 48);
 
 	if (header) {
 		printf("\nFeature                      "
 		       "Support  Enabled   Value\n");
 	}
 
 	printf("Accessible Max Address Config  ");
 	if (parm->support2 & ATA_SUPPORT_AMAX_ADDR) {
 		u_int64_t lba = lbasize48 ? lbasize48 : lbasize;
 
 		/* %ju takes a uintmax_t, so convert explicitly. */
 		printf("yes      %s     %ju/%ju\n",
 		    (nativesize > lba) ? "yes" : "no ",
 		    (uintmax_t)lba, (uintmax_t)nativesize);
 	} else {
 		printf("no\n");
 	}
 }
 
 /*
  * Report whether the satacapabilities word carries usable information:
  * returns 0 when it is all zeroes or all ones (0xffff), 1 otherwise.
  */
 static int
 atasata(struct ata_params *parm)
 {
 
 	if (parm->satacapabilities == 0xffff ||
 	    parm->satacapabilities == 0x0000)
 		return 0;
 
 	return 1;
 }
 
 /*
  * Print a human-readable report of the fields of a struct ata_params on
  * stdout.
  *
  * The report has two parts: a block of identification and capability
  * lines (protocol, model, firmware, serial number, geometry, sector sizes,
  * LBA counts, PIO/DMA modes, rotation rate, zoned-device support) followed
  * by a "Feature / Support / Enabled / Value / Vendor" table with one row
  * per feature.
  */
 static void
 atacapprint(struct ata_params *parm)
 {
 	const char *proto;
 	/* Sector counts are stored in 16-bit steps; reassemble them. */
 	u_int32_t lbasize = (u_int32_t)parm->lba_size_1 |
 				((u_int32_t)parm->lba_size_2 << 16);
 
 	u_int64_t lbasize48 = ((u_int64_t)parm->lba_size48_1) |
 				((u_int64_t)parm->lba_size48_2 << 16) |
 				((u_int64_t)parm->lba_size48_3 << 32) |
 				((u_int64_t)parm->lba_size48_4 << 48);
 
 	printf("\n");
 	printf("protocol              ");
 	proto = (parm->config == ATA_PROTO_CFA) ? "CFA" :
 		(parm->config & ATA_PROTO_ATAPI) ? "ATAPI" : "ATA";
 	/*
 	 * The version number returned by ata_version() selects the naming
 	 * scheme: 0 prints the bare protocol, 1-7 "<proto>-<n>", 8
 	 * "<proto>8-ACS", and anything above "ACS-<n - 7> <proto>".
 	 */
 	if (ata_version(parm->version_major) == 0) {
 		printf("%s", proto);
 	} else if (ata_version(parm->version_major) <= 7) {
 		printf("%s-%d", proto,
 		    ata_version(parm->version_major));
 	} else if (ata_version(parm->version_major) == 8) {
 		printf("%s8-ACS", proto);
 	} else {
 		printf("ACS-%d %s",
 		    ata_version(parm->version_major) - 7, proto);
 	}
 	/* Append the highest SATA generation bit set, if any. */
 	if (parm->satacapabilities && parm->satacapabilities != 0xffff) {
 		if (parm->satacapabilities & ATA_SATA_GEN3)
 			printf(" SATA 3.x\n");
 		else if (parm->satacapabilities & ATA_SATA_GEN2)
 			printf(" SATA 2.x\n");
 		else if (parm->satacapabilities & ATA_SATA_GEN1)
 			printf(" SATA 1.x\n");
 		else
 			printf(" SATA\n");
 	}
 	else
 		printf("\n");
 	/* The precision limits each string to a fixed maximum width. */
 	printf("device model          %.40s\n", parm->model);
 	printf("firmware revision     %.8s\n", parm->revision);
 	printf("serial number         %.20s\n", parm->serial);
 	if (parm->enabled.extension & ATA_SUPPORT_64BITWWN) {
 		printf("WWN                   %04x%04x%04x%04x\n",
 		    parm->wwn[0], parm->wwn[1], parm->wwn[2], parm->wwn[3]);
 	}
+	printf("additional product id %.8s\n", parm->product_id);
 	if (parm->enabled.extension & ATA_SUPPORT_MEDIASN) {
 		printf("media serial number   %.30s\n",
 		    parm->media_serial);
 	}
 
 	printf("cylinders             %d\n", parm->cylinders);
 	printf("heads                 %d\n", parm->heads);
 	printf("sectors/track         %d\n", parm->sectors);
 	printf("sector size           logical %u, physical %lu, offset %lu\n",
 	    ata_logical_sector_size(parm),
 	    (unsigned long)ata_physical_sector_size(parm),
 	    (unsigned long)ata_logical_sector_offset(parm));
 
 	if (parm->config == ATA_PROTO_CFA ||
 	    (parm->support.command2 & ATA_SUPPORT_CFA))
 		printf("CFA supported\n");
 
 	/* The sector count is only appended when it is non-zero. */
 	printf("LBA%ssupported         ",
 		parm->capabilities1 & ATA_SUPPORT_LBA ? " " : " not ");
 	if (lbasize)
 		printf("%d sectors\n", lbasize);
 	else
 		printf("\n");
 
 	printf("LBA48%ssupported       ",
 		parm->support.command2 & ATA_SUPPORT_ADDRESS48 ? " " : " not ");
 	if (lbasize48)
 		printf("%ju sectors\n", (uintmax_t)lbasize48);
 	else
 		printf("\n");
 
 	printf("PIO supported         PIO");
 	switch (ata_max_pmode(parm)) {
 	case ATA_PIO4:
 		printf("4");
 		break;
 	case ATA_PIO3:
 		printf("3");
 		break;
 	case ATA_PIO2:
 		printf("2");
 		break;
 	case ATA_PIO1:
 		printf("1");
 		break;
 	default:
 		printf("0");
 	}
 	if ((parm->capabilities1 & ATA_SUPPORT_IORDY) == 0)
 		printf(" w/o IORDY");
 	printf("\n");
 
 	/* For each DMA family only the highest mode bit set is printed. */
 	printf("DMA%ssupported         ",
 		parm->capabilities1 & ATA_SUPPORT_DMA ? " " : " not ");
 	if (parm->capabilities1 & ATA_SUPPORT_DMA) {
 		if (parm->mwdmamodes & 0xff) {
 			printf("WDMA");
 			if (parm->mwdmamodes & 0x04)
 				printf("2");
 			else if (parm->mwdmamodes & 0x02)
 				printf("1");
 			else if (parm->mwdmamodes & 0x01)
 				printf("0");
 			printf(" ");
 		}
 		if ((parm->atavalid & ATA_FLAG_88) &&
 		    (parm->udmamodes & 0xff)) {
 			printf("UDMA");
 			if (parm->udmamodes & 0x40)
 				printf("6");
 			else if (parm->udmamodes & 0x20)
 				printf("5");
 			else if (parm->udmamodes & 0x10)
 				printf("4");
 			else if (parm->udmamodes & 0x08)
 				printf("3");
 			else if (parm->udmamodes & 0x04)
 				printf("2");
 			else if (parm->udmamodes & 0x02)
 				printf("1");
 			else if (parm->udmamodes & 0x01)
 				printf("0");
 			printf(" ");
 		}
 	}
 	printf("\n");
 
 	/*
 	 * A rotation rate of 1 is reported as non-rotating and values in
 	 * 0x0401..0xFFFE are printed as the RPM; any other value prints
 	 * nothing.
 	 */
 	if (parm->media_rotation_rate == 1) {
 		printf("media RPM             non-rotating\n");
 	} else if (parm->media_rotation_rate >= 0x0401 &&
 	    parm->media_rotation_rate <= 0xFFFE) {
 		printf("media RPM             %d\n",
 			parm->media_rotation_rate);
 	}
 
 	printf("Zoned-Device Commands ");
 	switch (parm->support3 & ATA_SUPPORT_ZONE_MASK) {
 		case ATA_SUPPORT_ZONE_DEV_MANAGED:
 			printf("device managed\n");
 			break;
 		case ATA_SUPPORT_ZONE_HOST_AWARE:
 			printf("host aware\n");
 			break;
 		default:
 			printf("no\n");
 	}
 
 	/*
 	 * Feature table.  Most rows print a "support" bit followed by the
 	 * matching "enabled" bit; some append a value column.
 	 */
 	printf("\nFeature                      "
 		"Support  Enabled   Value           Vendor\n");
 	printf("read ahead                     %s	%s\n",
 		parm->support.command1 & ATA_SUPPORT_LOOKAHEAD ? "yes" : "no",
 		parm->enabled.command1 & ATA_SUPPORT_LOOKAHEAD ? "yes" : "no");
 	printf("write cache                    %s	%s\n",
 		parm->support.command1 & ATA_SUPPORT_WRITECACHE ? "yes" : "no",
 		parm->enabled.command1 & ATA_SUPPORT_WRITECACHE ? "yes" : "no");
 	printf("flush cache                    %s	%s\n",
 		parm->support.command2 & ATA_SUPPORT_FLUSHCACHE ? "yes" : "no",
 		parm->enabled.command2 & ATA_SUPPORT_FLUSHCACHE ? "yes" : "no");
 	printf("overlap                        %s\n",
 		parm->capabilities1 & ATA_SUPPORT_OVERLAP ? "yes" : "no");
 	/*
 	 * NOTE: the extra indentation of the if/else after this printf (and
 	 * after similar ones further down) is cosmetic only; each is a
 	 * separate statement that finishes the row started by the printf.
 	 */
 	printf("Tagged Command Queuing (TCQ)   %s	%s",
 		parm->support.command2 & ATA_SUPPORT_QUEUED ? "yes" : "no",
 		parm->enabled.command2 & ATA_SUPPORT_QUEUED ? "yes" : "no");
 		if (parm->support.command2 & ATA_SUPPORT_QUEUED) {
 			printf("	%d tags\n",
 			    ATA_QUEUE_LEN(parm->queue) + 1);
 		} else
 			printf("\n");
 	printf("Native Command Queuing (NCQ)   ");
-	if (parm->satacapabilities != 0xffff &&
-	    (parm->satacapabilities & ATA_SUPPORT_NCQ)) {
+	if (atasata(parm) && (parm->satacapabilities & ATA_SUPPORT_NCQ)) {
 		printf("yes		%d tags\n",
 		    ATA_QUEUE_LEN(parm->queue) + 1);
+		printf("NCQ Priority Information       %s\n",
+		    parm->satacapabilities & ATA_SUPPORT_NCQ_PRIO ?
+		    "yes" : "no");
+		printf("NCQ Non-Data Command           %s\n",
+		    parm->satacapabilities2 & ATA_SUPPORT_NCQ_NON_DATA ?
+		    "yes" : "no");
+		printf("NCQ Streaming                  %s\n",
+		    parm->satacapabilities2 & ATA_SUPPORT_NCQ_STREAM ?
+		    "yes" : "no");
+		printf("Receive & Send FPDMA Queued    %s\n",
+		    parm->satacapabilities2 & ATA_SUPPORT_RCVSND_FPDMA_QUEUED ?
+		    "yes" : "no");
+		printf("NCQ Autosense                  %s\n",
+		    parm->satasupport & ATA_SUPPORT_NCQ_AUTOSENSE ?
+		    "yes" : "no");
 	} else
 		printf("no\n");
 
-	printf("NCQ Queue Management           %s\n", atasata(parm) &&
-		parm->satacapabilities2 & ATA_SUPPORT_NCQ_QMANAGEMENT ?
-		"yes" : "no");
-	printf("NCQ Streaming                  %s\n", atasata(parm) &&
-		parm->satacapabilities2 & ATA_SUPPORT_NCQ_STREAM ?
-		"yes" : "no");
-	printf("Receive & Send FPDMA Queued    %s\n", atasata(parm) &&
-		parm->satacapabilities2 & ATA_SUPPORT_RCVSND_FPDMA_QUEUED ?
-		"yes" : "no");
-
 	printf("SMART                          %s	%s\n",
 		parm->support.command1 & ATA_SUPPORT_SMART ? "yes" : "no",
 		parm->enabled.command1 & ATA_SUPPORT_SMART ? "yes" : "no");
-	printf("microcode download             %s	%s\n",
-		parm->support.command2 & ATA_SUPPORT_MICROCODE ? "yes" : "no",
-		parm->enabled.command2 & ATA_SUPPORT_MICROCODE ? "yes" : "no");
 	printf("security                       %s	%s\n",
 		parm->support.command1 & ATA_SUPPORT_SECURITY ? "yes" : "no",
 		parm->enabled.command1 & ATA_SUPPORT_SECURITY ? "yes" : "no");
 	printf("power management               %s	%s\n",
 		parm->support.command1 & ATA_SUPPORT_POWERMGT ? "yes" : "no",
 		parm->enabled.command1 & ATA_SUPPORT_POWERMGT ? "yes" : "no");
+	printf("microcode download             %s	%s\n",
+		parm->support.command2 & ATA_SUPPORT_MICROCODE ? "yes" : "no",
+		parm->enabled.command2 & ATA_SUPPORT_MICROCODE ? "yes" : "no");
 	printf("advanced power management      %s	%s",
 		parm->support.command2 & ATA_SUPPORT_APM ? "yes" : "no",
 		parm->enabled.command2 & ATA_SUPPORT_APM ? "yes" : "no");
 		if (parm->support.command2 & ATA_SUPPORT_APM) {
 			printf("	%d/0x%02X\n",
 			    parm->apm_value & 0xff, parm->apm_value & 0xff);
 		} else
 			printf("\n");
 	printf("automatic acoustic management  %s	%s",
 		parm->support.command2 & ATA_SUPPORT_AUTOACOUSTIC ? "yes" :"no",
 		parm->enabled.command2 & ATA_SUPPORT_AUTOACOUSTIC ? "yes" :"no");
 		if (parm->support.command2 & ATA_SUPPORT_AUTOACOUSTIC) {
 			printf("	%d/0x%02X	%d/0x%02X\n",
 			    ATA_ACOUSTIC_CURRENT(parm->acoustic),
 			    ATA_ACOUSTIC_CURRENT(parm->acoustic),
 			    ATA_ACOUSTIC_VENDOR(parm->acoustic),
 			    ATA_ACOUSTIC_VENDOR(parm->acoustic));
 		} else
 			printf("\n");
 	printf("media status notification      %s	%s\n",
 		parm->support.command2 & ATA_SUPPORT_NOTIFY ? "yes" : "no",
 		parm->enabled.command2 & ATA_SUPPORT_NOTIFY ? "yes" : "no");
 	printf("power-up in Standby            %s	%s\n",
 		parm->support.command2 & ATA_SUPPORT_STANDBY ? "yes" : "no",
 		parm->enabled.command2 & ATA_SUPPORT_STANDBY ? "yes" : "no");
 	printf("write-read-verify              %s	%s",
 		parm->support2 & ATA_SUPPORT_WRITEREADVERIFY ? "yes" : "no",
 		parm->enabled2 & ATA_SUPPORT_WRITEREADVERIFY ? "yes" : "no");
 		if (parm->support2 & ATA_SUPPORT_WRITEREADVERIFY) {
 			printf("	%d/0x%x\n",
 			    parm->wrv_mode, parm->wrv_mode);
 		} else
 			printf("\n");
 	printf("unload                         %s	%s\n",
 		parm->support.extension & ATA_SUPPORT_UNLOAD ? "yes" : "no",
 		parm->enabled.extension & ATA_SUPPORT_UNLOAD ? "yes" : "no");
 	printf("general purpose logging        %s	%s\n",
 		parm->support.extension & ATA_SUPPORT_GENLOG ? "yes" : "no",
 		parm->enabled.extension & ATA_SUPPORT_GENLOG ? "yes" : "no");
 	printf("free-fall                      %s	%s\n",
 		parm->support2 & ATA_SUPPORT_FREEFALL ? "yes" : "no",
 		parm->enabled2 & ATA_SUPPORT_FREEFALL ? "yes" : "no");
+	printf("sense data reporting           %s	%s\n",
+		parm->support2 & ATA_SUPPORT_SENSE_REPORT ? "yes" : "no",
+		parm->enabled2 & ATA_SUPPORT_SENSE_REPORT ? "yes" : "no");
+	printf("extended power conditions      %s	%s\n",
+		parm->support2 & ATA_SUPPORT_EPC ? "yes" : "no",
+		parm->enabled2 & ATA_SUPPORT_EPC ? "yes" : "no");
+	printf("device statistics notification %s	%s\n",
+		parm->support2 & ATA_SUPPORT_DSN ? "yes" : "no",
+		parm->enabled2 & ATA_SUPPORT_DSN ? "yes" : "no");
 	/* The DSM detail rows are only printed when TRIM is supported. */
 	printf("Data Set Management (DSM/TRIM) ");
 	if (parm->support_dsm & ATA_SUPPORT_DSM_TRIM) {
 		printf("yes\n");
 		printf("DSM - max 512byte blocks       ");
 		if (parm->max_dsm_blocks == 0x00)
 			printf("yes              not specified\n");
 		else
 			printf("yes              %d\n",
 				parm->max_dsm_blocks);
 
 		printf("DSM - deterministic read       ");
 		if (parm->support3 & ATA_SUPPORT_DRAT) {
 			if (parm->support3 & ATA_SUPPORT_RZAT)
 				printf("yes              zeroed\n");
 			else
 				printf("yes              any value\n");
 		} else {
 			printf("no\n");
 		}
 	} else {
 		printf("no\n");
 	}
+	printf("encrypts all user data         %s\n",
+		parm->support3 & ATA_ENCRYPTS_ALL_USER_DATA ? "yes" : "no");
 	/* The Sanitize detail rows are only printed when it is supported. */
 	printf("Sanitize                       ");
 	if (parm->multi & ATA_SUPPORT_SANITIZE) {
 		printf("yes\t\t%s%s%s\n",
 		    parm->multi & ATA_SUPPORT_BLOCK_ERASE_EXT ? "block, " : "",
 		    parm->multi & ATA_SUPPORT_OVERWRITE_EXT ? "overwrite, " : "",
 		    parm->multi & ATA_SUPPORT_CRYPTO_SCRAMBLE_EXT ? "crypto" : "");
 		printf("Sanitize - commands allowed    %s\n",
 		    parm->multi & ATA_SUPPORT_SANITIZE_ALLOWED ? "yes" : "no");
 		printf("Sanitize - antifreeze lock     %s\n",
 		    parm->multi & ATA_SUPPORT_ANTIFREEZE_LOCK_EXT ? "yes" : "no");
 	} else {
 		printf("no\n");
 	}
 }
 
 /*
  * Send a CCB whose CDB has already been filled in as an ATA pass-through
  * (16) command and check the outcome.
  *
  * quiet - when 1, failure messages are suppressed unless verbose mode
  *         (CAM_ARG_VERBOSE) is on.
  *
  * Returns 0 on success.  Returns 1 when the CCB could not be sent, or when
  * the CDB does not have AP_FLAG_CHK_COND set and the completion status is
  * not CAM_REQ_CMP.  With AP_FLAG_CHK_COND set the status is not examined
  * here.
  */
 static int
 scsi_cam_pass_16_send(struct cam_device *device, union ccb *ccb, int quiet)
 {
 	struct ata_pass_16 *ata_pass_16;
 	struct ata_cmd ata_cmd;
 
 	ata_pass_16 = (struct ata_pass_16 *)ccb->csio.cdb_io.cdb_bytes;
 
 	/*
 	 * ata_cmd exists only to name the command in messages.
 	 * ata_op_string() is handed the whole structure, so clear it first:
 	 * any member it consults beyond the three set here then has a
 	 * defined value instead of stack garbage.
 	 */
 	bzero(&ata_cmd, sizeof(ata_cmd));
 	ata_cmd.command = ata_pass_16->command;
 	ata_cmd.control = ata_pass_16->control;
 	ata_cmd.features = ata_pass_16->features;
 
 	if (arglist & CAM_ARG_VERBOSE) {
 		warnx("sending ATA %s via pass_16 with timeout of %u msecs",
 		      ata_op_string(&ata_cmd),
 		      ccb->csio.ccb_h.timeout);
 	}
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (arglist & CAM_ARG_ERR_RECOVER)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (cam_send_ccb(device, ccb) < 0) {
 		if (quiet != 1 || arglist & CAM_ARG_VERBOSE) {
 			warn("error sending ATA %s via pass_16",
 			     ata_op_string(&ata_cmd));
 		}
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 
 		return (1);
 	}
 
 	if (!(ata_pass_16->flags & AP_FLAG_CHK_COND) &&
 	    (ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		if (quiet != 1 || arglist & CAM_ARG_VERBOSE) {
 			warnx("ATA %s via pass_16 failed",
 			      ata_op_string(&ata_cmd));
 		}
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 
 		return (1);
 	}
 
 	return (0);
 }
 
 
 /*
  * Send an already filled-in ATA I/O CCB and check the outcome.
  *
  * quiet - when 1, failure messages are suppressed unless verbose mode
  *         (CAM_ARG_VERBOSE) is on.
  *
  * Returns 0 when the CCB was sent and completed with CAM_REQ_CMP, and 1
  * otherwise.
  */
 static int
 ata_cam_send(struct cam_device *device, union ccb *ccb, int quiet)
 {
 	const int verbose = (arglist & CAM_ARG_VERBOSE) != 0;
 	const int report = (quiet != 1) || verbose;
 	int sent;
 
 	if (verbose) {
 		warnx("sending ATA %s with timeout of %u msecs",
 		      ata_op_string(&(ccb->ataio.cmd)),
 		      ccb->ataio.ccb_h.timeout);
 	}
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (arglist & CAM_ARG_ERR_RECOVER)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	sent = cam_send_ccb(device, ccb);
 
 	/* Success: the CCB went out and completed cleanly. */
 	if (sent >= 0 &&
 	    (ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP)
 		return (0);
 
 	/*
 	 * Failure: a send error is reported with warn() (errno text), a bad
 	 * completion status with warnx().
 	 */
 	if (report) {
 		if (sent < 0) {
 			warn("error sending ATA %s",
 			     ata_op_string(&(ccb->ataio.cmd)));
 		} else {
 			warnx("ATA %s failed: %d",
 			      ata_op_string(&(ccb->ataio.cmd)), quiet);
 		}
 	}
 
 	if (verbose) {
 		cam_error_print(device, ccb, CAM_ESF_ALL,
 				CAM_EPF_ALL, stderr);
 	}
 
 	return (1);
 }
 
 /*
  * Build an ATA pass-through (16) command in ccb->csio with
  * scsi_ata_pass_16() and send it with scsi_cam_pass_16_send().
  *
  * The transfer-related bits of ata_flags are derived here: with no data
  * buffer the command is marked as carrying no data; otherwise it is marked
  * as a byte transfer whose length is in the sector count, and the
  * direction follows CAM_DIR_OUT in flags.
  *
  * Returns whatever scsi_cam_pass_16_send() returns.
  */
 static int
 ata_do_pass_16(struct cam_device *device, union ccb *ccb, int retries,
 	       u_int32_t flags, u_int8_t protocol, u_int8_t ata_flags,
 	       u_int8_t tag_action, u_int8_t command, u_int8_t features,
 	       u_int64_t lba, u_int8_t sector_count, u_int8_t *data_ptr,
 	       u_int16_t dxfer_len, int timeout, int quiet)
 {
 	if (data_ptr == NULL) {
 		ata_flags |= AP_FLAG_TLEN_NO_DATA;
 	} else {
 		ata_flags |= AP_FLAG_BYT_BLOK_BYTES |
 			    AP_FLAG_TLEN_SECT_CNT;
 		ata_flags |= (flags & CAM_DIR_OUT) ?
 		    AP_FLAG_TDIR_TO_DEV : AP_FLAG_TDIR_FROM_DEV;
 	}
 
 	/* Clear the payload before filling in the new command. */
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	scsi_ata_pass_16(&ccb->csio,
 			 /*retries*/retries,
 			 /*cbfcnp*/NULL,
 			 /*flags*/flags,
 			 /*tag_action*/tag_action,
 			 /*protocol*/protocol,
 			 /*ata_flags*/ata_flags,
 			 /*features*/features,
 			 /*sector_count*/sector_count,
 			 /*lba*/lba,
 			 /*command*/command,
 			 /*control*/0,
 			 /*data_ptr*/data_ptr,
 			 /*dxfer_len*/dxfer_len,
 			 /*sense_len*/SSD_FULL_SIZE,
 			 /*timeout*/timeout);
 
 	return scsi_cam_pass_16_send(device, ccb, quiet);
 }
 
 static int
 ata_try_pass_16(struct cam_device *device)
 {
 	struct ccb_pathinq cpi;
 
 	if (get_cpi(device, &cpi) != 0) {
 		warnx("couldn't get CPI");
 		return (-1);
 	}
 
 	if (cpi.protocol == PROTO_SCSI) {
 		/* possibly compatible with pass_16 */
 		return (1);
 	}
 
 	/* likely not compatible with pass_16 */
 	return (0);
 }
 
 /*
  * Issue a 28-bit ATA command, through pass_16 when ata_try_pass_16()
  * says the device may accept it and as a native ATA I/O CCB otherwise.
  *
  * Returns 1 when the transport check itself fails; otherwise the result
  * of ata_do_pass_16() or ata_cam_send().
  */
 static int
 ata_do_28bit_cmd(struct cam_device *device, union ccb *ccb, int retries,
 		 u_int32_t flags, u_int8_t protocol, u_int8_t tag_action,
 		 u_int8_t command, u_int8_t features, u_int32_t lba,
 		 u_int8_t sector_count, u_int8_t *data_ptr, u_int16_t dxfer_len,
 		 int timeout, int quiet)
 {
 	int pass16;
 
 	pass16 = ata_try_pass_16(device);
 	if (pass16 == -1)
 		return (1);
 
 	if (pass16 == 1) {
 		/* Try using SCSI Passthrough */
 		return ata_do_pass_16(device, ccb, retries, flags, protocol,
 				      /*ata_flags*/0, tag_action, command,
 				      features, lba, sector_count, data_ptr,
 				      dxfer_len, timeout, quiet);
 	}
 
 	/* Native ATA path: build the request from a clean payload. */
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->ataio);
 	cam_fill_ataio(&ccb->ataio,
 		       /*retries*/retries,
 		       /*cbfcnp*/NULL,
 		       /*flags*/flags,
 		       /*tag_action*/tag_action,
 		       /*data_ptr*/data_ptr,
 		       /*dxfer_len*/dxfer_len,
 		       /*timeout*/timeout);
 	ata_28bit_cmd(&ccb->ataio, command, features, lba, sector_count);
 
 	return ata_cam_send(device, ccb, quiet);
 }
 
 /*
  * Issue an ATA command, through pass_16 when ata_try_pass_16() says the
  * device may accept it and as a native ATA I/O CCB otherwise.
  *
  * ata_flags  - pass_16 flags.  When AP_FLAG_CHK_COND is set the caller
  *              wants the command result: on the pass_16 path it is decoded
  *              from the sense data into ccb->ataio.res, on the native path
  *              CAM_ATAIO_NEEDRESULT is requested.
  * force48bit - on the native path, use the 48-bit command form even when
  *              lba would fit the 28-bit form.
  *
  * Returns 1 when the transport check itself fails; otherwise the result
  * of ata_do_pass_16() or ata_cam_send() (both called with quiet == 0).
  */
 static int
 ata_do_cmd(struct cam_device *device, union ccb *ccb, int retries,
 	   u_int32_t flags, u_int8_t protocol, u_int8_t ata_flags,
 	   u_int8_t tag_action, u_int8_t command, u_int8_t features,
 	   u_int64_t lba, u_int8_t sector_count, u_int8_t *data_ptr,
 	   u_int16_t dxfer_len, int timeout, int force48bit)
 {
 	int retval;
 
 	retval = ata_try_pass_16(device);
 	if (retval == -1)
 		return (1);
 
 	if (retval == 1) {
 		int error;
 
 		/* Try using SCSI Passthrough */
 		error = ata_do_pass_16(device, ccb, retries, flags, protocol,
 				      ata_flags, tag_action, command, features,
 				      lba, sector_count, data_ptr, dxfer_len,
 				      timeout, 0);
 
 		/*
 		 * NOTE: the decoding below runs whenever the caller asked for
 		 * the result, regardless of the value of error.
 		 */
 		if (ata_flags & AP_FLAG_CHK_COND) {
 			/* Decode ata_res from sense data */
 			struct ata_res_pass16 *res_pass16;
 			struct ata_res *res;
 			u_int i;
 			u_int16_t *ptr;
 
 			/*
 			 * Convert the leading 16-bit words of the sense data
 			 * from little-endian to host order, in place.
 			 */
 			/* sense_data is 4 byte aligned */
 			ptr = (uint16_t*)(uintptr_t)&ccb->csio.sense_data;
 			for (i = 0; i < sizeof(*res_pass16) / 2; i++)
 				ptr[i] = le16toh(ptr[i]);
 
 			/*
 			 * Copy the fields into the ataio view of the same CCB
 			 * so callers can read ccb->ataio.res on either path.
 			 */
 			/* sense_data is 4 byte aligned */
 			res_pass16 = (struct ata_res_pass16 *)(uintptr_t)
 			    &ccb->csio.sense_data;
 			res = &ccb->ataio.res;
 			res->flags = res_pass16->flags;
 			res->status = res_pass16->status;
 			res->error = res_pass16->error;
 			res->lba_low = res_pass16->lba_low;
 			res->lba_mid = res_pass16->lba_mid;
 			res->lba_high = res_pass16->lba_high;
 			res->device = res_pass16->device;
 			res->lba_low_exp = res_pass16->lba_low_exp;
 			res->lba_mid_exp = res_pass16->lba_mid_exp;
 			res->lba_high_exp = res_pass16->lba_high_exp;
 			res->sector_count = res_pass16->sector_count;
 			res->sector_count_exp = res_pass16->sector_count_exp;
 			/*
 			 * Replace the CAM status with one derived from the
 			 * decoded ATA status register.
 			 */
 			ccb->ccb_h.status &= ~CAM_STATUS_MASK;
 			if (res->status & ATA_STATUS_ERROR)
 				ccb->ccb_h.status |= CAM_ATA_STATUS_ERROR;
 			else
 				ccb->ccb_h.status |= CAM_REQ_CMP;
 		}
 
 		return (error);
 	}
 
 	/* Native ATA path: build the request from a clean payload. */
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->ataio);
 	cam_fill_ataio(&ccb->ataio,
 		       retries,
 		       NULL,
 		       flags,
 		       tag_action,
 		       data_ptr,
 		       dxfer_len,
 		       timeout);
 
 	/* Use the 48-bit form when forced or when the LBA needs it. */
 	if (force48bit || lba > ATA_MAX_28BIT_LBA)
 		ata_48bit_cmd(&ccb->ataio, command, features, lba, sector_count);
 	else
 		ata_28bit_cmd(&ccb->ataio, command, features, lba, sector_count);
 
 	if (ata_flags & AP_FLAG_CHK_COND)
 		ccb->ataio.cmd.flags |= CAM_ATAIO_NEEDRESULT;
 
 	return ata_cam_send(device, ccb, 0);
 }
 
 /*
  * Hex-dump a buffer of 16-bit words on stdout.
  *
  * ptr - words to print.
  * len - size of the buffer in bytes; len / 2 words are printed.
  *
  * Eight words are printed per row, each row prefixed with the index of
  * its first word.  The output always ends with exactly one newline after
  * the last word, whether or not the last row is full; nothing is printed
  * for an empty buffer.
  */
 static void
 dump_data(uint16_t *ptr, uint32_t len)
 {
 	uint32_t i;
 
 	for (i = 0; i < len / 2; i++) {
 		if ((i % 8) == 0)
 			printf(" %3u: ", i);
 		printf("%04hx ", ptr[i]);
 		if ((i % 8) == 7)
 			printf("\n");
 	}
 	/*
 	 * Here i is the number of words printed.  A full last row has
 	 * already been terminated inside the loop; only a partial row still
 	 * needs its newline.
 	 */
 	if ((i % 8) != 0)
 		printf("\n");
 }
 
 /*
  * Examine the ATA result stored in ccb->ataio.res after a max-address
  * command.
  *
  * is48bit - selects how the returned address is assembled from the result
  *           registers.
  * hpasize - when not NULL, receives the returned address plus one.
  *
  * Returns 1 when the status register has ATA_STATUS_ERROR set, and 0
  * otherwise.  In verbose mode the raw result is also printed.
  */
 static int
 atahpa_proc_resp(struct cam_device *device, union ccb *ccb,
 		 int is48bit, u_int64_t *hpasize)
 {
 	struct ata_res *result = &ccb->ataio.res;
 	const int verbose = (arglist & CAM_ARG_VERBOSE) != 0;
 
 	if (result->status & ATA_STATUS_ERROR) {
 		if (verbose) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 			printf("error = 0x%02x, sector_count = 0x%04x, "
 			       "device = 0x%02x, status = 0x%02x\n",
 			       result->error, result->sector_count,
 			       result->device, result->status);
 		}
 
 		if (result->error & ATA_ERROR_ID_NOT_FOUND) {
 			warnx("Max address has already been set since "
 			      "last power-on or hardware reset");
 		}
 
 		return (1);
 	}
 
 	if (verbose) {
 		fprintf(stdout, "%s%d: Raw native max data:\n",
 			device->device_name, device->dev_unit_num);
 		/* res is 4 byte aligned */
 		dump_data((uint16_t*)(uintptr_t)result, sizeof(struct ata_res));
 
 		printf("error = 0x%02x, sector_count = 0x%04x, device = 0x%02x, "
 		       "status = 0x%02x\n", result->error, result->sector_count,
 		       result->device, result->status);
 	}
 
 	/* Nothing more to do when the caller does not want the size. */
 	if (hpasize == NULL)
 		return (0);
 
 	if (is48bit) {
 		/* The "exp" registers form the upper 24 bits. */
 		*hpasize = (((u_int64_t)((result->lba_high_exp << 16) |
 		    (result->lba_mid_exp << 8) |
 		    result->lba_low_exp) << 24) |
 		    ((result->lba_high << 16) |
 		    (result->lba_mid << 8) |
 		    result->lba_low)) + 1;
 	} else {
 		/* The low nibble of the device register is the top 4 bits. */
 		*hpasize = (((result->device & 0x0f) << 24) |
 		    (result->lba_high << 16) |
 		    (result->lba_mid << 8) |
 		    result->lba_low) + 1;
 	}
 
 	return (0);
 }
 
 /*
  * Issue READ NATIVE MAX ADDRESS (the 48-bit variant when the device
  * advertises ATA_SUPPORT_ADDRESS48) and hand the result to
  * atahpa_proc_resp().
  *
  * timeout - command timeout; 5000 is used when it is 0.
  * hpasize - passed through to atahpa_proc_resp().
  *
  * Returns the non-zero result of ata_do_cmd() on failure, otherwise the
  * result of atahpa_proc_resp().
  */
 static int
 ata_read_native_max(struct cam_device *device, int retry_count,
 		      u_int32_t timeout, union ccb *ccb,
 		      struct ata_params *parm, u_int64_t *hpasize)
 {
 	u_int is48bit = parm->support.command2 & ATA_SUPPORT_ADDRESS48;
 	u_int cmd = ATA_READ_NATIVE_MAX_ADDRESS;
 	u_int8_t protocol = AP_PROTO_NON_DATA;
 	int status;
 
 	/* Switch to the extended command when 48-bit addressing exists. */
 	if (is48bit) {
 		cmd = ATA_READ_NATIVE_MAX_ADDRESS48;
 		protocol |= AP_EXTEND;
 	}
 
 	status = ata_do_cmd(device,
 			    ccb,
 			    retry_count,
 			    /*flags*/CAM_DIR_NONE,
 			    /*protocol*/protocol,
 			    /*ata_flags*/AP_FLAG_CHK_COND,
 			    /*tag_action*/MSG_SIMPLE_Q_TAG,
 			    /*command*/cmd,
 			    /*features*/0,
 			    /*lba*/0,
 			    /*sector_count*/0,
 			    /*data_ptr*/NULL,
 			    /*dxfer_len*/0,
 			    /*timeout*/timeout ? timeout : 5000,
 			    /*force48bit*/is48bit);
 	if (status != 0)
 		return (status);
 
 	return atahpa_proc_resp(device, ccb, is48bit, hpasize);
 }
 
 /*
  * Issue SET MAX ADDRESS (48-bit variant when is48bit is non-zero) with the
  * ATA_HPA_FEAT_MAX_ADDR feature.  maxsize is a sector count; it is turned
  * into the highest LBA (maxsize - 1) unless it is zero.  persist is passed
  * through as the sector_count register value.
  *
  * A zero timeout selects 1000.  Returns the ata_do_cmd() error, or the
  * atahpa_proc_resp() verdict on the result registers.
  */
 static int
 atahpa_set_max(struct cam_device *device, int retry_count,
 	      u_int32_t timeout, union ccb *ccb,
 	      int is48bit, u_int64_t maxsize, int persist)
 {
 	u_int cmd = ATA_SET_MAX_ADDRESS;
 	u_int8_t protocol = AP_PROTO_NON_DATA;
 	int rc;
 
 	if (is48bit) {
 		cmd = ATA_SET_MAX_ADDRESS48;
 		protocol |= AP_EXTEND;
 	}
 
 	/* lba's are zero indexed so the max lba is requested max - 1 */
 	if (maxsize != 0)
 		maxsize -= 1;
 
 	rc = ata_do_cmd(device,
 			ccb,
 			retry_count,
 			/*flags*/CAM_DIR_NONE,
 			/*protocol*/protocol,
 			/*ata_flags*/AP_FLAG_CHK_COND,
 			/*tag_action*/MSG_SIMPLE_Q_TAG,
 			/*command*/cmd,
 			/*features*/ATA_HPA_FEAT_MAX_ADDR,
 			/*lba*/maxsize,
 			/*sector_count*/persist,
 			/*data_ptr*/NULL,
 			/*dxfer_len*/0,
 			timeout ? timeout : 1000,
 			is48bit);
 	if (rc != 0)
 		return (rc);
 
 	return atahpa_proc_resp(device, ccb, is48bit, NULL);
 }
 
 /*
  * Send the SET MAX "set password" feature (ATA_HPA_FEAT_SET_PWD) with *pwd
  * as the PIO-out payload.  The command opcode is the 48-bit or 28-bit
  * SET MAX ADDRESS depending on is48bit.
  *
  * A zero timeout selects 1000.  Returns the ata_do_cmd() error, or the
  * atahpa_proc_resp() verdict on the result registers.
  */
 static int
 atahpa_password(struct cam_device *device, int retry_count,
 		u_int32_t timeout, union ccb *ccb,
 		int is48bit, struct ata_set_max_pwd *pwd)
 {
 	u_int8_t protocol = AP_PROTO_PIO_OUT;
 	u_int cmd;
 	int rc;
 
 	if (is48bit)
 		cmd = ATA_SET_MAX_ADDRESS48;
 	else
 		cmd = ATA_SET_MAX_ADDRESS;
 
 	rc = ata_do_cmd(device,
 			ccb,
 			retry_count,
 			/*flags*/CAM_DIR_OUT,
 			/*protocol*/protocol,
 			/*ata_flags*/AP_FLAG_CHK_COND,
 			/*tag_action*/MSG_SIMPLE_Q_TAG,
 			/*command*/cmd,
 			/*features*/ATA_HPA_FEAT_SET_PWD,
 			/*lba*/0,
 			/*sector_count*/0,
 			/*data_ptr*/(u_int8_t*)pwd,
 			/*dxfer_len*/sizeof(struct ata_set_max_pwd),
 			timeout ? timeout : 1000,
 			is48bit);
 	if (rc != 0)
 		return (rc);
 
 	return atahpa_proc_resp(device, ccb, is48bit, NULL);
 }
 
 /*
  * Send the SET MAX "lock" feature (ATA_HPA_FEAT_LOCK) as a non-data
  * command, using the 48-bit or 28-bit SET MAX ADDRESS opcode depending on
  * is48bit.
  *
  * A zero timeout selects 1000.  Returns the ata_do_cmd() error, or the
  * atahpa_proc_resp() verdict on the result registers.
  */
 static int
 atahpa_lock(struct cam_device *device, int retry_count,
 	    u_int32_t timeout, union ccb *ccb, int is48bit)
 {
 	u_int8_t protocol = AP_PROTO_NON_DATA;
 	u_int cmd;
 	int rc;
 
 	if (is48bit)
 		cmd = ATA_SET_MAX_ADDRESS48;
 	else
 		cmd = ATA_SET_MAX_ADDRESS;
 
 	rc = ata_do_cmd(device,
 			ccb,
 			retry_count,
 			/*flags*/CAM_DIR_NONE,
 			/*protocol*/protocol,
 			/*ata_flags*/AP_FLAG_CHK_COND,
 			/*tag_action*/MSG_SIMPLE_Q_TAG,
 			/*command*/cmd,
 			/*features*/ATA_HPA_FEAT_LOCK,
 			/*lba*/0,
 			/*sector_count*/0,
 			/*data_ptr*/NULL,
 			/*dxfer_len*/0,
 			timeout ? timeout : 1000,
 			is48bit);
 	if (rc != 0)
 		return (rc);
 
 	return atahpa_proc_resp(device, ccb, is48bit, NULL);
 }
 
 /*
  * Send the SET MAX "unlock" feature (ATA_HPA_FEAT_UNLOCK) with *pwd as the
  * PIO-out payload, using the 48-bit or 28-bit SET MAX ADDRESS opcode
  * depending on is48bit.
  *
  * A zero timeout selects 1000.  Returns the ata_do_cmd() error, or the
  * atahpa_proc_resp() verdict on the result registers.
  */
 static int
 atahpa_unlock(struct cam_device *device, int retry_count,
 	      u_int32_t timeout, union ccb *ccb,
 	      int is48bit, struct ata_set_max_pwd *pwd)
 {
 	u_int8_t protocol = AP_PROTO_PIO_OUT;
 	u_int cmd;
 	int rc;
 
 	if (is48bit)
 		cmd = ATA_SET_MAX_ADDRESS48;
 	else
 		cmd = ATA_SET_MAX_ADDRESS;
 
 	rc = ata_do_cmd(device,
 			ccb,
 			retry_count,
 			/*flags*/CAM_DIR_OUT,
 			/*protocol*/protocol,
 			/*ata_flags*/AP_FLAG_CHK_COND,
 			/*tag_action*/MSG_SIMPLE_Q_TAG,
 			/*command*/cmd,
 			/*features*/ATA_HPA_FEAT_UNLOCK,
 			/*lba*/0,
 			/*sector_count*/0,
 			/*data_ptr*/(u_int8_t*)pwd,
 			/*dxfer_len*/sizeof(struct ata_set_max_pwd),
 			timeout ? timeout : 1000,
 			is48bit);
 	if (rc != 0)
 		return (rc);
 
 	return atahpa_proc_resp(device, ccb, is48bit, NULL);
 }
 
 /*
  * Send the SET MAX "freeze lock" feature (ATA_HPA_FEAT_FREEZE) as a
  * non-data command, using the 48-bit or 28-bit SET MAX ADDRESS opcode
  * depending on is48bit.
  *
  * A zero timeout selects 1000.  Returns the ata_do_cmd() error, or the
  * atahpa_proc_resp() verdict on the result registers.
  */
 static int
 atahpa_freeze_lock(struct cam_device *device, int retry_count,
 		   u_int32_t timeout, union ccb *ccb, int is48bit)
 {
 	u_int8_t protocol = AP_PROTO_NON_DATA;
 	u_int cmd;
 	int rc;
 
 	if (is48bit)
 		cmd = ATA_SET_MAX_ADDRESS48;
 	else
 		cmd = ATA_SET_MAX_ADDRESS;
 
 	rc = ata_do_cmd(device,
 			ccb,
 			retry_count,
 			/*flags*/CAM_DIR_NONE,
 			/*protocol*/protocol,
 			/*ata_flags*/AP_FLAG_CHK_COND,
 			/*tag_action*/MSG_SIMPLE_Q_TAG,
 			/*command*/cmd,
 			/*features*/ATA_HPA_FEAT_FREEZE,
 			/*lba*/0,
 			/*sector_count*/0,
 			/*data_ptr*/NULL,
 			/*dxfer_len*/0,
 			timeout ? timeout : 1000,
 			is48bit);
 	if (rc != 0)
 		return (rc);
 
 	return atahpa_proc_resp(device, ccb, is48bit, NULL);
 }
 
 /*
  * Issue ATA_AMAX_ADDR with the ATA_AMAX_ADDR_GET feature (always as a
  * 48-bit command) and decode the returned address into *nativesize via
  * atahpa_proc_resp().
  *
  * A zero timeout selects 30 * 1000.  Returns the ata_do_cmd() error, or
  * the atahpa_proc_resp() verdict on the result registers.
  */
 static int
 ata_get_native_max(struct cam_device *device, int retry_count,
 		      u_int32_t timeout, union ccb *ccb,
 		      u_int64_t *nativesize)
 {
 	int rc;
 
 	rc = ata_do_cmd(device,
 			ccb,
 			retry_count,
 			/*flags*/CAM_DIR_NONE,
 			/*protocol*/AP_PROTO_NON_DATA | AP_EXTEND,
 			/*ata_flags*/AP_FLAG_CHK_COND,
 			/*tag_action*/MSG_SIMPLE_Q_TAG,
 			/*command*/ATA_AMAX_ADDR,
 			/*features*/ATA_AMAX_ADDR_GET,
 			/*lba*/0,
 			/*sector_count*/0,
 			/*data_ptr*/NULL,
 			/*dxfer_len*/0,
 			timeout ? timeout : 30 * 1000,
 			/*force48bit*/1);
 	if (rc != 0)
 		return (rc);
 
 	return atahpa_proc_resp(device, ccb, /*is48bit*/1, nativesize);
 }
 
 /*
  * Issue ATA_AMAX_ADDR with the ATA_AMAX_ADDR_SET feature (always as a
  * 48-bit command).  maxsize is a sector count; it is converted to the
  * highest LBA (maxsize - 1) unless it is zero.
  *
  * A zero timeout selects 30 * 1000.  Returns the ata_do_cmd() error, or
  * the atahpa_proc_resp() verdict on the result registers.
  */
 static int
 ataama_set(struct cam_device *device, int retry_count,
 	      u_int32_t timeout, union ccb *ccb, u_int64_t maxsize)
 {
 	int rc;
 
 	/* lba's are zero indexed so the max lba is requested max - 1 */
 	if (maxsize != 0)
 		maxsize -= 1;
 
 	rc = ata_do_cmd(device,
 			ccb,
 			retry_count,
 			/*flags*/CAM_DIR_NONE,
 			/*protocol*/AP_PROTO_NON_DATA | AP_EXTEND,
 			/*ata_flags*/AP_FLAG_CHK_COND,
 			/*tag_action*/MSG_SIMPLE_Q_TAG,
 			/*command*/ATA_AMAX_ADDR,
 			/*features*/ATA_AMAX_ADDR_SET,
 			/*lba*/maxsize,
 			/*sector_count*/0,
 			/*data_ptr*/NULL,
 			/*dxfer_len*/0,
 			timeout ? timeout : 30 * 1000,
 			/*force48bit*/1);
 	if (rc != 0)
 		return (rc);
 
 	return atahpa_proc_resp(device, ccb, /*is48bit*/1, NULL);
 }
 
 /*
  * Issue ATA_AMAX_ADDR with the ATA_AMAX_ADDR_FREEZE feature (always as a
  * 48-bit, non-data command).
  *
  * A zero timeout selects 30 * 1000.  Returns the ata_do_cmd() error, or
  * the atahpa_proc_resp() verdict on the result registers.
  */
 static int
 ataama_freeze(struct cam_device *device, int retry_count,
 		   u_int32_t timeout, union ccb *ccb)
 {
 	int rc;
 
 	rc = ata_do_cmd(device,
 			ccb,
 			retry_count,
 			/*flags*/CAM_DIR_NONE,
 			/*protocol*/AP_PROTO_NON_DATA | AP_EXTEND,
 			/*ata_flags*/AP_FLAG_CHK_COND,
 			/*tag_action*/MSG_SIMPLE_Q_TAG,
 			/*command*/ATA_AMAX_ADDR,
 			/*features*/ATA_AMAX_ADDR_FREEZE,
 			/*lba*/0,
 			/*sector_count*/0,
 			/*data_ptr*/NULL,
 			/*dxfer_len*/0,
 			timeout ? timeout : 30 * 1000,
 			/*force48bit*/1);
 	if (rc != 0)
 		return (rc);
 
 	return atahpa_proc_resp(device, ccb, /*is48bit*/1, NULL);
 }
 
 /*
  * Run an ATA (or ATAPI) IDENTIFY against the device and return the fixed-up
  * identify data.
  *
  * When the path protocol is PROTO_ATA the device protocol from the CGD
  * picks the exact command; otherwise ATA IDENTIFY is tried first (quietly)
  * and ATAPI IDENTIFY is used as a fallback.
  *
  * On success 0 is returned and *ident_bufp points to a calloc()ed
  * struct ata_params that the caller must free().  On failure *ident_bufp is
  * left untouched and a non-zero value is returned: -1 if the CPI/CGD could
  * not be fetched, 1 for allocation failure, command failure or an all-zero
  * response.
  */
 int
 ata_do_identify(struct cam_device *device, int retry_count, int timeout,
 		union ccb *ccb, struct ata_params** ident_bufp)
 {
 	struct ata_params *ident_buf;
 	struct ccb_pathinq cpi;
 	struct ccb_getdev cgd;
 	uint16_t *ptr;
 	u_int i;
 	int error;
 	u_int8_t command, retry_command;
 
 	if (get_cpi(device, &cpi) != 0) {
 		warnx("couldn't get CPI");
 		return (-1);
 	}
 
 	/* Neither PROTO_ATAPI or PROTO_SATAPM are used in cpi.protocol */
 	if (cpi.protocol == PROTO_ATA) {
 		if (get_cgd(device, &cgd) != 0) {
 			warnx("couldn't get CGD");
 			return (-1);
 		}
 
 		command = (cgd.protocol == PROTO_ATA) ?
 		    ATA_ATA_IDENTIFY : ATA_ATAPI_IDENTIFY;
 		retry_command = 0;
 	} else {
 		/* We don't know which for sure so try both */
 		command = ATA_ATA_IDENTIFY;
 		retry_command = ATA_ATAPI_IDENTIFY;
 	}
 
 	/* Viewed as 16-bit words so the all-zero check below can scan it. */
 	ptr = calloc(1, sizeof(struct ata_params));
 	if (ptr == NULL) {
 		/* warnx() appends the newline itself. */
 		warnx("can't calloc memory for identify");
 		return (1);
 	}
 
 	error = ata_do_28bit_cmd(device,
 				 ccb,
 				 /*retries*/retry_count,
 				 /*flags*/CAM_DIR_IN,
 				 /*protocol*/AP_PROTO_PIO_IN,
 				 /*tag_action*/MSG_SIMPLE_Q_TAG,
 				 /*command*/command,
 				 /*features*/0,
 				 /*lba*/0,
 				 /*sector_count*/0,
 				 /*data_ptr*/(u_int8_t *)ptr,
 				 /*dxfer_len*/sizeof(struct ata_params),
 				 /*timeout*/timeout ? timeout : 30 * 1000,
 				 /*quiet*/1);
 
 	if (error != 0) {
 		if (retry_command == 0) {
 			free(ptr);
 			return (1);
 		}
 		/* Second attempt with the alternate command, not quiet. */
 		error = ata_do_28bit_cmd(device,
 					 ccb,
 					 /*retries*/retry_count,
 					 /*flags*/CAM_DIR_IN,
 					 /*protocol*/AP_PROTO_PIO_IN,
 					 /*tag_action*/MSG_SIMPLE_Q_TAG,
 					 /*command*/retry_command,
 					 /*features*/0,
 					 /*lba*/0,
 					 /*sector_count*/0,
 					 /*data_ptr*/(u_int8_t *)ptr,
 					 /*dxfer_len*/sizeof(struct ata_params),
 					 /*timeout*/timeout ? timeout : 30 * 1000,
 					 /*quiet*/0);
 
 		if (error != 0) {
 			free(ptr);
 			return (1);
 		}
 	}
 
 	ident_buf = (struct ata_params *)ptr;
 	ata_param_fixup(ident_buf);
 
 	/* check for invalid (all zero) response */
 	error = 1;
 	for (i = 0; i < sizeof(struct ata_params) / 2; i++) {
 		if (ptr[i] != 0) {
 			error = 0;
 			break;
 		}
 	}
 
 	if (error != 0) {
 		warnx("Invalid identify response detected");
 		free(ptr);
 		return (error);
 	}
 
 	*ident_bufp = ident_buf;
 
 	return (0);
 }
 
 
 /*
  * Identify an ATA device and print its identity, transfer rate,
  * capabilities, and HPA / Accessible Max Address information.
  *
  * The native max (HPA) and accessible max sizes are queried only when the
  * identify data advertises ATA_SUPPORT_PROTECTED / ATA_SUPPORT_AMAX_ADDR
  * respectively; otherwise 0 is reported for them.
  *
  * Returns 0 on success and 1 on any failure.  The CCB and the identify
  * buffer are released on every path once they have been obtained.
  */
 static int
 ataidentify(struct cam_device *device, int retry_count, int timeout)
 {
 	union ccb *ccb;
 	struct ata_params *ident_buf;
 	u_int64_t hpasize, nativesize;
 	int retval;
 
 	if ((ccb = cam_getccb(device)) == NULL) {
 		warnx("couldn't allocate CCB");
 		return (1);
 	}
 
 	if (ata_do_identify(device, retry_count, timeout, ccb, &ident_buf) != 0) {
 		cam_freeccb(ccb);
 		return (1);
 	}
 
 	/* Assume failure until everything has been printed. */
 	retval = 1;
 
 	if (arglist & CAM_ARG_VERBOSE) {
 		printf("%s%d: Raw identify data:\n",
 		    device->device_name, device->dev_unit_num);
 		dump_data((void*)ident_buf, sizeof(struct ata_params));
 	}
 
 	hpasize = 0;
 	if (ident_buf->support.command1 & ATA_SUPPORT_PROTECTED) {
 		if (ata_read_native_max(device, retry_count, timeout, ccb,
 					ident_buf, &hpasize) != 0)
 			goto out;
 	}
 
 	nativesize = 0;
 	if (ident_buf->support2 & ATA_SUPPORT_AMAX_ADDR) {
 		if (ata_get_native_max(device, retry_count, timeout, ccb,
 					&nativesize) != 0)
 			goto out;
 	}
 
 	printf("%s%d: ", device->device_name, device->dev_unit_num);
 	ata_print_ident(ident_buf);
 	camxferrate(device);
 	atacapprint(ident_buf);
 	atahpa_print(ident_buf, hpasize, 0);
 	ataama_print(ident_buf, nativesize, 0);
 	retval = 0;
 
 out:
 	/* Single exit so the identify buffer is never leaked on error. */
 	free(ident_buf);
 	cam_freeccb(ccb);
 
 	return (retval);
 }
 
 #ifdef WITH_NVME
 /*
  * Fetch the NVMe controller data for the device and print it.
  * retry_count and timeout are accepted for signature parity but unused.
  * Returns 1 if nvme_get_cdata() reports failure, 0 otherwise.
  */
 static int
 nvmeidentify(struct cam_device *device, int retry_count __unused, int timeout __unused)
 {
 	struct nvme_controller_data cdata;
 
 	if (nvme_get_cdata(device, &cdata) != 0)
 		return (1);
 
 	nvme_print_controller(&cdata);
 	return (0);
 }
 #endif
 
 /*
  * Dispatch the identify request.  When built WITH_NVME, the path inquiry
  * decides whether the NVMe variant is used (-1 is returned if the CPI
  * cannot be read); every other case goes to ataidentify().
  */
 static int
 identify(struct cam_device *device, int retry_count, int timeout)
 {
 #ifdef WITH_NVME
 	struct ccb_pathinq cpi;
 
 	if (get_cpi(device, &cpi) != 0) {
 		warnx("couldn't get CPI");
 		return (-1);
 	}
 
 	if (cpi.protocol == PROTO_NVME)
 		return (nvmeidentify(device, retry_count, timeout));
 #endif
 
 	return (ataidentify(device, retry_count, timeout));
 }
 
 
 /* Actions the ATA security subcommand can be asked to perform. */
 enum {
 	ATA_SECURITY_ACTION_PRINT = 0,
 	ATA_SECURITY_ACTION_FREEZE = 1,
 	ATA_SECURITY_ACTION_UNLOCK = 2,
 	ATA_SECURITY_ACTION_DISABLE = 3,
 	ATA_SECURITY_ACTION_ERASE = 4,
 	ATA_SECURITY_ACTION_ERASE_ENHANCED = 5,
 	ATA_SECURITY_ACTION_SET_PASSWORD = 6
 };
 
 /*
  * Print an erase-time word: 0 is shown as "unspecified", values of 255 and
  * above as "> 508 min", anything else as twice the value in minutes.
  * No trailing newline is written.
  */
 static void
 atasecurity_print_time(u_int16_t tw)
 {
 
 	if (tw >= 255) {
 		printf("> 508 min");
 		return;
 	}
 
 	if (tw == 0) {
 		printf("unspecified");
 		return;
 	}
 
 	printf("%i min", 2 * tw);
 }
 
 /*
  * Convert an erase-time word into a command timeout in milliseconds.
  *
  *   0        -> two hours (time not specified by the device)
  *   1..254   -> 2 * timeout minutes plus a 5 minute margin
  *   >= 255   -> 508 minutes plus a 60 minute margin
  *
  * 255 is the "> 508 minutes" marker, matching atasecurity_print_time(), so
  * it must take the long timeout rather than the 2 * timeout formula.
  */
 static u_int32_t
 atasecurity_erase_timeout_msecs(u_int16_t timeout)
 {
 
 	if (timeout == 0)
 		return 2 * 3600 * 1000; /* default: two hours */
 	else if (timeout >= 255)
 		return (508 + 60) * 60 * 1000; /* spec says > 508 minutes */
 
 	return ((2 * timeout) + 5) * 60 * 1000; /* add a 5min margin */
 }
 
 
 /*
  * Announce on stdout which security command is about to be issued.
  *
  * When pwd is not NULL the password (in clear text) and the selected
  * user/master identity are printed as well; for ATA_SECURITY_SET_PASSWORD
  * the security level is added.  The line is always newline terminated.
  */
 static void
 atasecurity_notify(u_int8_t command, struct ata_security_password *pwd)
 {
 	struct ata_cmd cmd;
 
 	/* Only the opcode is needed to look up the command name. */
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.command = command;
 	printf("Issuing %s", ata_op_string(&cmd));
 
 	if (pwd == NULL) {
 		printf("\n");
 		return;
 	}
 
 	{
 		char pass[sizeof(pwd->password)+1];
 
 		/* pwd->password may not be null terminated */
 		strncpy(pass, pwd->password, sizeof(pwd->password));
 		pass[sizeof(pwd->password)] = '\0';
 		printf(" password='%s', user='%s'",
 			pass,
 			(pwd->ctrl & ATA_SECURITY_PASSWORD_MASTER) ?
 			"master" : "user");
 	}
 
 	if (command == ATA_SECURITY_SET_PASSWORD)
 		printf(", mode='%s'",
 		       (pwd->ctrl & ATA_SECURITY_LEVEL_MAXIMUM) ?
 		       "maximum" : "high");
 
 	printf("\n");
 }
 
 /*
  * Issue ATA_SECURITY_FREEZE_LOCK as a non-data 28-bit command.  Unless
  * quiet is set the command is announced first.  Returns the
  * ata_do_28bit_cmd() result.
  */
 static int
 atasecurity_freeze(struct cam_device *device, union ccb *ccb,
 		   int retry_count, u_int32_t timeout, int quiet)
 {
 	int rc;
 
 	if (!quiet)
 		atasecurity_notify(ATA_SECURITY_FREEZE_LOCK, NULL);
 
 	rc = ata_do_28bit_cmd(device,
 			      ccb,
 			      retry_count,
 			      /*flags*/CAM_DIR_NONE,
 			      /*protocol*/AP_PROTO_NON_DATA,
 			      /*tag_action*/MSG_SIMPLE_Q_TAG,
 			      /*command*/ATA_SECURITY_FREEZE_LOCK,
 			      /*features*/0,
 			      /*lba*/0,
 			      /*sector_count*/0,
 			      /*data_ptr*/NULL,
 			      /*dxfer_len*/0,
 			      /*timeout*/timeout,
 			      /*quiet*/0);
 
 	return (rc);
 }
 
 /*
  * Issue ATA_SECURITY_UNLOCK, sending *pwd as the PIO-out payload.  Unless
  * quiet is set the command (including the password) is announced first.
  * Returns the ata_do_28bit_cmd() result.
  */
 static int
 atasecurity_unlock(struct cam_device *device, union ccb *ccb,
 		   int retry_count, u_int32_t timeout,
 		   struct ata_security_password *pwd, int quiet)
 {
 	int rc;
 
 	if (!quiet)
 		atasecurity_notify(ATA_SECURITY_UNLOCK, pwd);
 
 	rc = ata_do_28bit_cmd(device,
 			      ccb,
 			      retry_count,
 			      /*flags*/CAM_DIR_OUT,
 			      /*protocol*/AP_PROTO_PIO_OUT,
 			      /*tag_action*/MSG_SIMPLE_Q_TAG,
 			      /*command*/ATA_SECURITY_UNLOCK,
 			      /*features*/0,
 			      /*lba*/0,
 			      /*sector_count*/0,
 			      /*data_ptr*/(u_int8_t *)pwd,
 			      /*dxfer_len*/sizeof(*pwd),
 			      /*timeout*/timeout,
 			      /*quiet*/0);
 
 	return (rc);
 }
 
 /*
  * Issue ATA_SECURITY_DISABLE_PASSWORD, sending *pwd as the PIO-out
  * payload.  Unless quiet is set the command (including the password) is
  * announced first.  Returns the ata_do_28bit_cmd() result.
  */
 static int
 atasecurity_disable(struct cam_device *device, union ccb *ccb,
 		    int retry_count, u_int32_t timeout,
 		    struct ata_security_password *pwd, int quiet)
 {
 	int rc;
 
 	if (!quiet)
 		atasecurity_notify(ATA_SECURITY_DISABLE_PASSWORD, pwd);
 
 	rc = ata_do_28bit_cmd(device,
 			      ccb,
 			      retry_count,
 			      /*flags*/CAM_DIR_OUT,
 			      /*protocol*/AP_PROTO_PIO_OUT,
 			      /*tag_action*/MSG_SIMPLE_Q_TAG,
 			      /*command*/ATA_SECURITY_DISABLE_PASSWORD,
 			      /*features*/0,
 			      /*lba*/0,
 			      /*sector_count*/0,
 			      /*data_ptr*/(u_int8_t *)pwd,
 			      /*dxfer_len*/sizeof(*pwd),
 			      /*timeout*/timeout,
 			      /*quiet*/0);
 
 	return (rc);
 }
 
 
 /*
  * Interactively ask the user to confirm a secure erase of the device.
  *
  * The device names and identify string are printed, then the user is
  * prompted until the answer starts with "yes" (returns 1) or "no"
  * (returns 0), compared case-insensitively.  If stdin hits end-of-file or
  * a read error no answer can arrive, so this is treated as "no" and 0 is
  * returned instead of prompting forever.
  */
 static int
 atasecurity_erase_confirm(struct cam_device *device,
 			  struct ata_params* ident_buf)
 {
 
 	printf("\nYou are about to ERASE ALL DATA from the following"
 	       " device:\n%s%d,%s%d: ", device->device_name,
 	       device->dev_unit_num, device->given_dev_name,
 	       device->given_unit_number);
 	ata_print_ident(ident_buf);
 
 	for(;;) {
 		char str[50];
 		printf("\nAre you SURE you want to ERASE ALL DATA? (yes/no) ");
 
 		if (fgets(str, sizeof(str), stdin) == NULL) {
 			/* EOF or error: refuse rather than spin. */
 			return (0);
 		}
 
 		if (strncasecmp(str, "yes", 3) == 0) {
 			return (1);
 		} else if (strncasecmp(str, "no", 2) == 0) {
 			return (0);
 		} else {
 			printf("Please answer \"yes\" or "
 			       "\"no\"\n");
 		}
 	}
 
 	/* NOTREACHED */
 	return (0);
 }
 
 /*
  * Perform a security erase: ATA_SECURITY_ERASE_PREPARE (non-data, using
  * timeout) followed by ATA_SECURITY_ERASE_UNIT with *pwd as payload (using
  * erase_timeout).  Each step is announced unless quiet is set, and
  * "Erase Complete" is printed on success when not quiet.
  *
  * Returns the first non-zero ata_do_28bit_cmd() result, or 0.
  */
 static int
 atasecurity_erase(struct cam_device *device, union ccb *ccb,
 		  int retry_count, u_int32_t timeout,
 		  u_int32_t erase_timeout,
 		  struct ata_security_password *pwd, int quiet)
 {
 	int rc;
 
 	if (!quiet)
 		atasecurity_notify(ATA_SECURITY_ERASE_PREPARE, NULL);
 
 	rc = ata_do_28bit_cmd(device,
 			      ccb,
 			      retry_count,
 			      /*flags*/CAM_DIR_NONE,
 			      /*protocol*/AP_PROTO_NON_DATA,
 			      /*tag_action*/MSG_SIMPLE_Q_TAG,
 			      /*command*/ATA_SECURITY_ERASE_PREPARE,
 			      /*features*/0,
 			      /*lba*/0,
 			      /*sector_count*/0,
 			      /*data_ptr*/NULL,
 			      /*dxfer_len*/0,
 			      /*timeout*/timeout,
 			      /*quiet*/0);
 	if (rc != 0)
 		return (rc);
 
 	if (!quiet)
 		atasecurity_notify(ATA_SECURITY_ERASE_UNIT, pwd);
 
 	rc = ata_do_28bit_cmd(device,
 			      ccb,
 			      retry_count,
 			      /*flags*/CAM_DIR_OUT,
 			      /*protocol*/AP_PROTO_PIO_OUT,
 			      /*tag_action*/MSG_SIMPLE_Q_TAG,
 			      /*command*/ATA_SECURITY_ERASE_UNIT,
 			      /*features*/0,
 			      /*lba*/0,
 			      /*sector_count*/0,
 			      /*data_ptr*/(u_int8_t *)pwd,
 			      /*dxfer_len*/sizeof(*pwd),
 			      /*timeout*/erase_timeout,
 			      /*quiet*/0);
 	if (rc != 0)
 		return (rc);
 
 	if (!quiet)
 		printf("\nErase Complete\n");
 
 	return (0);
 }
 
 /*
  * Issue ATA_SECURITY_SET_PASSWORD, sending *pwd as the PIO-out payload.
  * Unless quiet is set the command (including password, identity and
  * level) is announced first.  Returns the ata_do_28bit_cmd() result.
  */
 static int
 atasecurity_set_password(struct cam_device *device, union ccb *ccb,
 			 int retry_count, u_int32_t timeout,
 			 struct ata_security_password *pwd, int quiet)
 {
 	int rc;
 
 	if (!quiet)
 		atasecurity_notify(ATA_SECURITY_SET_PASSWORD, pwd);
 
 	rc = ata_do_28bit_cmd(device,
 			      ccb,
 			      retry_count,
 			      /*flags*/CAM_DIR_OUT,
 			      /*protocol*/AP_PROTO_PIO_OUT,
 			      /*tag_action*/MSG_SIMPLE_Q_TAG,
 			      /*command*/ATA_SECURITY_SET_PASSWORD,
 			      /*features*/0,
 			      /*lba*/0,
 			      /*sector_count*/0,
 			      /*data_ptr*/(u_int8_t *)pwd,
 			      /*dxfer_len*/sizeof(*pwd),
 			      /*timeout*/timeout,
 			      /*quiet*/0);
 
 	return (rc);
 }
 
 /* Map a tested flag to the "yes"/"no" text used in the security table. */
 static const char *
 atasecurity_yesno(int set)
 {
 
 	return (set ? "yes" : "no");
 }
 
 /*
  * Print the security section of the identify data as a two-column table.
  * The raw status word is included in verbose mode.  If the device does not
  * report ATA_SECURITY_SUPPORTED only the "supported" row is printed.
  */
 static void
 atasecurity_print(struct ata_params *parm)
 {
 
 	printf("\nSecurity Option           Value\n");
 	if (arglist & CAM_ARG_VERBOSE)
 		printf("status                    %04x\n",
 		       parm->security_status);
 
 	printf("supported                 %s\n",
 		atasecurity_yesno(parm->security_status &
 		    ATA_SECURITY_SUPPORTED));
 	if ((parm->security_status & ATA_SECURITY_SUPPORTED) == 0)
 		return;
 
 	printf("enabled                   %s\n",
 		atasecurity_yesno(parm->security_status &
 		    ATA_SECURITY_ENABLED));
 	printf("drive locked              %s\n",
 		atasecurity_yesno(parm->security_status &
 		    ATA_SECURITY_LOCKED));
 	printf("security config frozen    %s\n",
 		atasecurity_yesno(parm->security_status &
 		    ATA_SECURITY_FROZEN));
 	printf("count expired             %s\n",
 		atasecurity_yesno(parm->security_status &
 		    ATA_SECURITY_COUNT_EXP));
 	printf("security level            %s\n",
 		(parm->security_status & ATA_SECURITY_LEVEL) ?
 		"maximum" : "high");
 	printf("enhanced erase supported  %s\n",
 		atasecurity_yesno(parm->security_status &
 		    ATA_SECURITY_ENH_SUPP));
 
 	printf("erase time                ");
 	atasecurity_print_time(parm->erase_time);
 	printf("\n");
 
 	printf("enhanced erase time       ");
 	atasecurity_print_time(parm->enhanced_erase_time);
 	printf("\n");
 
 	printf("master password rev       %04x%s\n",
 		parm->master_passwd_revision,
 		(parm->master_passwd_revision == 0x0000 ||
 		parm->master_passwd_revision == 0xFFFF) ?
 		" (unsupported)" : "");
 }
 
 /*
  * Validates and copies the password in optarg to the passed buffer.
  *
  * passwd is a buffer of max bytes that holds either nothing (first byte
  * zero) or a password stored by an earlier call.  If the password in optarg
  * is the same length as the buffer then the data will still be copied but
  * no null termination will occur; for that reason the buffer is never
  * handed to strlen()/strcmp(), only to length-bounded routines.
  *
  * opt is the option letter being processed, used in diagnostics and
  * remembered in pwd_opt on success.
  *
  * Returns 0 on success, 1 if the password is too long, empty, starts with
  * '-', or differs from a password already stored in passwd.
  */
 static int
 ata_getpwd(u_int8_t *passwd, int max, char opt)
 {
 	int len;
 
 	len = strlen(optarg);
 	if (len > max) {
 		warnx("-%c password is too long", opt);
 		return (1);
 	} else if (len == 0) {
 		warnx("-%c password is missing", opt);
 		return (1);
 	} else if (optarg[0] == '-'){
 		warnx("-%c password starts with '-' (generic arg?)", opt);
 		return (1);
 	} else if (passwd[0] != '\0' &&
 	    strncmp((const char *)passwd, optarg, (size_t)max) != 0) {
 		/*
 		 * Bounded compare: an existing max-length password has no
 		 * terminator, so comparing at most max bytes avoids reading
 		 * past the end of the buffer.
 		 */
 		warnx("-%c password conflicts with existing password from -%c",
 		      opt, pwd_opt);
 		return (1);
 	}
 
 	/* Callers pass in a buffer which does NOT need to be terminated */
 	strncpy((char *)passwd, optarg, max);
 	pwd_opt = opt;
 
 	return (0);
 }
 
 /* Actions the HPA subcommand can be asked to perform. */
 enum {
 	ATA_HPA_ACTION_PRINT = 0,
 	ATA_HPA_ACTION_SET_MAX = 1,
 	ATA_HPA_ACTION_SET_PWD = 2,
 	ATA_HPA_ACTION_LOCK = 3,
 	ATA_HPA_ACTION_UNLOCK = 4,
 	ATA_HPA_ACTION_FREEZE_LOCK = 5
 };
 
 /*
  * Interactively ask the user to confirm an HPA reconfiguration.
  *
  * The requested sector limit, whether it is persistent, the device names
  * and the identify string are printed, then the user is prompted until the
  * answer starts with "yes" (returns 1) or "no" (returns 0), compared
  * case-insensitively.  If stdin hits end-of-file or a read error no answer
  * can arrive, so this is treated as "no" and 0 is returned instead of
  * prompting forever.
  */
 static int
 atahpa_set_confirm(struct cam_device *device, struct ata_params* ident_buf,
 		   u_int64_t maxsize, int persist)
 {
 	/* %ju expects uintmax_t, so convert explicitly. */
 	printf("\nYou are about to configure HPA to limit the user accessible\n"
 	       "sectors to %ju %s on the device:\n%s%d,%s%d: ",
 	       (uintmax_t)maxsize,
 	       persist ? "persistently" : "temporarily",
 	       device->device_name, device->dev_unit_num,
 	       device->given_dev_name, device->given_unit_number);
 	ata_print_ident(ident_buf);
 
 	for(;;) {
 		char str[50];
 		printf("\nAre you SURE you want to configure HPA? (yes/no) ");
 
 		if (fgets(str, sizeof(str), stdin) == NULL) {
 			/* EOF or error: refuse rather than spin. */
 			return (0);
 		}
 
 		if (strncasecmp(str, "yes", 3) == 0) {
 			return (1);
 		} else if (strncasecmp(str, "no", 2) == 0) {
 			return (0);
 		} else {
 			printf("Please answer \"yes\" or "
 			       "\"no\"\n");
 		}
 	}
 
 	/* NOTREACHED */
 	return (0);
 }
 
 /*
  * Implementation of the HPA (Host Protected Area) subcommand.
  *
  * Options parsed from argv via combinedopt:
  *   -s size   set the max address (size in sectors)
  *   -p pwd    set the HPA password
  *   -l        lock
  *   -U pwd    unlock with the given password
  *   -f        freeze lock
  *   -P        make a -s setting persistent
  *   -y        skip the interactive confirmation for -s
  *   -q        suppress informational output
  * At most one of -s/-p/-l/-U/-f may be given; with none, the current HPA
  * information is printed.
  *
  * Returns 0 on success, non-zero on failure.  The CCB and identify buffer
  * are released on every path once obtained.
  */
 static int
 atahpa(struct cam_device *device, int retry_count, int timeout,
        int argc, char **argv, char *combinedopt)
 {
 	union ccb *ccb;
 	struct ata_params *ident_buf, *new_ident;
 	struct ccb_getdev cgd;
 	struct ata_set_max_pwd pwd;
 	int error, confirm, quiet, c, action, actions, persist;
 	int security, is48bit, pwdsize;
 	u_int64_t hpasize, maxsize;
 
 	actions = 0;
 	confirm = 0;
 	quiet = 0;
 	maxsize = 0;
 	persist = 0;
 	security = 0;
 
 	memset(&pwd, 0, sizeof(pwd));
 
 	/* default action is to print hpa information */
 	action = ATA_HPA_ACTION_PRINT;
 	pwdsize = sizeof(pwd.password);
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch(c){
 		case 's':
 			action = ATA_HPA_ACTION_SET_MAX;
 			maxsize = strtoumax(optarg, NULL, 0);
 			actions++;
 			break;
 
 		case 'p':
 			if (ata_getpwd(pwd.password, pwdsize, c) != 0)
 				return (1);
 			action = ATA_HPA_ACTION_SET_PWD;
 			security = 1;
 			actions++;
 			break;
 
 		case 'l':
 			action = ATA_HPA_ACTION_LOCK;
 			security = 1;
 			actions++;
 			break;
 
 		case 'U':
 			if (ata_getpwd(pwd.password, pwdsize, c) != 0)
 				return (1);
 			action = ATA_HPA_ACTION_UNLOCK;
 			security = 1;
 			actions++;
 			break;
 
 		case 'f':
 			action = ATA_HPA_ACTION_FREEZE_LOCK;
 			security = 1;
 			actions++;
 			break;
 
 		case 'P':
 			persist = 1;
 			break;
 
 		case 'y':
 			confirm++;
 			break;
 
 		case 'q':
 			quiet++;
 			break;
 		}
 	}
 
 	if (actions > 1) {
 		warnx("too many hpa actions specified");
 		return (1);
 	}
 
 	if (get_cgd(device, &cgd) != 0) {
 		warnx("couldn't get CGD");
 		return (1);
 	}
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("couldn't allocate CCB");
 		return (1);
 	}
 
 	error = ata_do_identify(device, retry_count, timeout, ccb, &ident_buf);
 	if (error != 0) {
 		cam_freeccb(ccb);
 		return (1);
 	}
 
 	if (quiet == 0) {
 		printf("%s%d: ", device->device_name, device->dev_unit_num);
 		ata_print_ident(ident_buf);
 		camxferrate(device);
 	}
 
 	if (action == ATA_HPA_ACTION_PRINT) {
 		error = ata_read_native_max(device, retry_count, timeout, ccb,
 					    ident_buf, &hpasize);
 		if (error == 0)
 			atahpa_print(ident_buf, hpasize, 1);
 		goto out;
 	}
 
 	if (!(ident_buf->support.command1 & ATA_SUPPORT_PROTECTED)) {
 		warnx("HPA is not supported by this device");
 		error = 1;
 		goto out;
 	}
 
 	if (security && !(ident_buf->support.command2 & ATA_SUPPORT_MAXSECURITY)) {
 		warnx("HPA Security is not supported by this device");
 		error = 1;
 		goto out;
 	}
 
 	is48bit = ident_buf->support.command2 & ATA_SUPPORT_ADDRESS48;
 
 	/*
 	 * The ATA spec requires:
 	 * 1. Read native max addr is called directly before set max addr
 	 * 2. Read native max addr is NOT called before any other set max call
 	 */
 	switch(action) {
 	case ATA_HPA_ACTION_SET_MAX:
 		if (confirm == 0 &&
 		    atahpa_set_confirm(device, ident_buf, maxsize,
 		    persist) == 0) {
 			error = 1;
 			goto out;
 		}
 
 		error = ata_read_native_max(device, retry_count, timeout,
 					    ccb, ident_buf, &hpasize);
 		if (error != 0)
 			break;
 
 		error = atahpa_set_max(device, retry_count, timeout,
 				       ccb, is48bit, maxsize, persist);
 		if (error != 0)
 			break;
 
 		if (quiet == 0) {
 			/*
 			 * redo identify to get new lba values.  Identify
 			 * into a separate pointer so the first buffer is
 			 * released rather than leaked, and so stale data is
 			 * not printed if the second identify fails.
 			 */
 			new_ident = NULL;
 			error = ata_do_identify(device, retry_count,
 						timeout, ccb, &new_ident);
 			if (error == 0) {
 				free(ident_buf);
 				ident_buf = new_ident;
 				atahpa_print(ident_buf, hpasize, 1);
 			}
 		}
 
 		/*
 		 * Hint CAM to reprobe the device.  The size changed
 		 * regardless of whether output was requested, so this is
 		 * not conditional on quiet.
 		 */
 		reprobe(device);
 		break;
 
 	case ATA_HPA_ACTION_SET_PWD:
 		error = atahpa_password(device, retry_count, timeout,
 					ccb, is48bit, &pwd);
 		if (error == 0 && quiet == 0)
 			printf("HPA password has been set\n");
 		break;
 
 	case ATA_HPA_ACTION_LOCK:
 		error = atahpa_lock(device, retry_count, timeout,
 				    ccb, is48bit);
 		if (error == 0 && quiet == 0)
 			printf("HPA has been locked\n");
 		break;
 
 	case ATA_HPA_ACTION_UNLOCK:
 		error = atahpa_unlock(device, retry_count, timeout,
 				      ccb, is48bit, &pwd);
 		if (error == 0 && quiet == 0)
 			printf("HPA has been unlocked\n");
 		break;
 
 	case ATA_HPA_ACTION_FREEZE_LOCK:
 		error = atahpa_freeze_lock(device, retry_count, timeout,
 					   ccb, is48bit);
 		if (error == 0 && quiet == 0)
 			printf("HPA has been frozen\n");
 		break;
 
 	default:
 		errx(1, "Option currently not supported");
 	}
 
 out:
 	cam_freeccb(ccb);
 	free(ident_buf);
 
 	return (error);
 }
 
 /* Actions the Accessible Max Address subcommand can be asked to perform. */
 enum {
 	ATA_AMA_ACTION_PRINT = 0,
 	ATA_AMA_ACTION_SET_MAX = 1,
 	ATA_AMA_ACTION_FREEZE_LOCK = 2
 };
 
 /*
  * Implementation of the Accessible Max Address (AMA) subcommand.
  *
  * Options parsed from argv via combinedopt:
  *   -s size   set the accessible max address (size in sectors)
  *   -f        freeze the accessible max address
  *   -q        suppress informational output
  * At most one of -s/-f may be given; with neither, the current AMA
  * information is printed.
  *
  * Returns 0 on success, non-zero on failure.  The CCB and identify buffer
  * are released on every path once obtained.
  */
 static int
 ataama(struct cam_device *device, int retry_count, int timeout,
        int argc, char **argv, char *combinedopt)
 {
 	union ccb *ccb;
 	struct ata_params *ident_buf, *new_ident;
 	struct ccb_getdev cgd;
 	int error, quiet, c, action, actions;
 	u_int64_t nativesize, maxsize;
 
 	actions = 0;
 	quiet = 0;
 	maxsize = 0;
 
 	/* default action is to print AMA information */
 	action = ATA_AMA_ACTION_PRINT;
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch(c){
 		case 's':
 			action = ATA_AMA_ACTION_SET_MAX;
 			maxsize = strtoumax(optarg, NULL, 0);
 			actions++;
 			break;
 
 		case 'f':
 			action = ATA_AMA_ACTION_FREEZE_LOCK;
 			actions++;
 			break;
 
 		case 'q':
 			quiet++;
 			break;
 		}
 	}
 
 	if (actions > 1) {
 		warnx("too many AMA actions specified");
 		return (1);
 	}
 
 	if (get_cgd(device, &cgd) != 0) {
 		warnx("couldn't get CGD");
 		return (1);
 	}
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("couldn't allocate CCB");
 		return (1);
 	}
 
 	error = ata_do_identify(device, retry_count, timeout, ccb, &ident_buf);
 	if (error != 0) {
 		cam_freeccb(ccb);
 		return (1);
 	}
 
 	if (quiet == 0) {
 		printf("%s%d: ", device->device_name, device->dev_unit_num);
 		ata_print_ident(ident_buf);
 		camxferrate(device);
 	}
 
 	if (action == ATA_AMA_ACTION_PRINT) {
 		error = ata_get_native_max(device, retry_count, timeout, ccb,
 					   &nativesize);
 		if (error == 0)
 			ataama_print(ident_buf, nativesize, 1);
 		goto out;
 	}
 
 	if (!(ident_buf->support2 & ATA_SUPPORT_AMAX_ADDR)) {
 		warnx("Accessible Max Address is not supported by this device");
 		error = 1;
 		goto out;
 	}
 
 	switch(action) {
 	case ATA_AMA_ACTION_SET_MAX:
 		error = ata_get_native_max(device, retry_count, timeout, ccb,
 					   &nativesize);
 		if (error != 0)
 			break;
 
 		error = ataama_set(device, retry_count, timeout,
 			       ccb, maxsize);
 		if (error != 0)
 			break;
 
 		if (quiet == 0) {
 			/*
 			 * redo identify to get new lba values.  Identify
 			 * into a separate pointer so the first buffer is
 			 * released rather than leaked, and so stale data is
 			 * not printed if the second identify fails.
 			 */
 			new_ident = NULL;
 			error = ata_do_identify(device, retry_count,
 			    timeout, ccb, &new_ident);
 			if (error == 0) {
 				free(ident_buf);
 				ident_buf = new_ident;
 				ataama_print(ident_buf, nativesize, 1);
 			}
 		}
 
 		/*
 		 * Hint CAM to reprobe the device.  The size changed
 		 * regardless of whether output was requested, so this is
 		 * not conditional on quiet.
 		 */
 		reprobe(device);
 		break;
 
 	case ATA_AMA_ACTION_FREEZE_LOCK:
 		error = ataama_freeze(device, retry_count, timeout,
 					   ccb);
 		if (error == 0 && quiet == 0)
 			printf("Accessible Max Address has been frozen\n");
 		break;
 
 	default:
 		errx(1, "Option currently not supported");
 	}
 
 out:
 	cam_freeccb(ccb);
 	free(ident_buf);
 
 	return (error);
 }
 
 static int
 atasecurity(struct cam_device *device, int retry_count, int timeout,
 	    int argc, char **argv, char *combinedopt)
 {
 	union ccb *ccb;
 	struct ata_params *ident_buf;
 	int error, confirm, quiet, c, action, actions, setpwd;
 	int security_enabled, erase_timeout, pwdsize;
 	struct ata_security_password pwd;
 
 	actions = 0;
 	setpwd = 0;
 	erase_timeout = 0;
 	confirm = 0;
 	quiet = 0;
 
 	memset(&pwd, 0, sizeof(pwd));
 
 	/* default action is to print security information */
 	action = ATA_SECURITY_ACTION_PRINT;
 
 	/* user is master by default as its safer that way */
 	pwd.ctrl |= ATA_SECURITY_PASSWORD_MASTER;
 	pwdsize = sizeof(pwd.password);
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch(c){
 		case 'f':
 			action = ATA_SECURITY_ACTION_FREEZE;
 			actions++;
 			break;
 
 		case 'U':
 			if (strcasecmp(optarg, "user") == 0) {
 				pwd.ctrl |= ATA_SECURITY_PASSWORD_USER;
 				pwd.ctrl &= ~ATA_SECURITY_PASSWORD_MASTER;
 			} else if (strcasecmp(optarg, "master") == 0) {
 				pwd.ctrl |= ATA_SECURITY_PASSWORD_MASTER;
 				pwd.ctrl &= ~ATA_SECURITY_PASSWORD_USER;
 			} else {
 				warnx("-U argument '%s' is invalid (must be "
 				      "'user' or 'master')", optarg);
 				return (1);
 			}
 			break;
 
 		case 'l':
 			if (strcasecmp(optarg, "high") == 0) {
 				pwd.ctrl |= ATA_SECURITY_LEVEL_HIGH;
 				pwd.ctrl &= ~ATA_SECURITY_LEVEL_MAXIMUM;
 			} else if (strcasecmp(optarg, "maximum") == 0) {
 				pwd.ctrl |= ATA_SECURITY_LEVEL_MAXIMUM;
 				pwd.ctrl &= ~ATA_SECURITY_LEVEL_HIGH;
 			} else {
 				warnx("-l argument '%s' is unknown (must be "
 				      "'high' or 'maximum')", optarg);
 				return (1);
 			}
 			break;
 
 		case 'k':
 			if (ata_getpwd(pwd.password, pwdsize, c) != 0)
 				return (1);
 			action = ATA_SECURITY_ACTION_UNLOCK;
 			actions++;
 			break;
 
 		case 'd':
 			if (ata_getpwd(pwd.password, pwdsize, c) != 0)
 				return (1);
 			action = ATA_SECURITY_ACTION_DISABLE;
 			actions++;
 			break;
 
 		case 'e':
 			if (ata_getpwd(pwd.password, pwdsize, c) != 0)
 				return (1);
 			action = ATA_SECURITY_ACTION_ERASE;
 			actions++;
 			break;
 
 		case 'h':
 			if (ata_getpwd(pwd.password, pwdsize, c) != 0)
 				return (1);
 			pwd.ctrl |= ATA_SECURITY_ERASE_ENHANCED;
 			action = ATA_SECURITY_ACTION_ERASE_ENHANCED;
 			actions++;
 			break;
 
 		case 's':
 			if (ata_getpwd(pwd.password, pwdsize, c) != 0)
 				return (1);
 			setpwd = 1;
 			if (action == ATA_SECURITY_ACTION_PRINT)
 				action = ATA_SECURITY_ACTION_SET_PASSWORD;
 			/*
 			 * Don't increment action as this can be combined
 			 * with other actions.
 			 */
 			break;
 
 		case 'y':
 			confirm++;
 			break;
 
 		case 'q':
 			quiet++;
 			break;
 
 		case 'T':
 			erase_timeout = atoi(optarg) * 1000;
 			break;
 		}
 	}
 
 	if (actions > 1) {
 		warnx("too many security actions specified");
 		return (1);
 	}
 
 	if ((ccb = cam_getccb(device)) == NULL) {
 		warnx("couldn't allocate CCB");
 		return (1);
 	}
 
 	error = ata_do_identify(device, retry_count, timeout, ccb, &ident_buf);
 	if (error != 0) {
 		cam_freeccb(ccb);
 		return (1);
 	}
 
 	if (quiet == 0) {
 		printf("%s%d: ", device->device_name, device->dev_unit_num);
 		ata_print_ident(ident_buf);
 		camxferrate(device);
 	}
 
 	if (action == ATA_SECURITY_ACTION_PRINT) {
 		atasecurity_print(ident_buf);
 		free(ident_buf);
 		cam_freeccb(ccb);
 		return (0);
 	}
 
 	if ((ident_buf->support.command1 & ATA_SUPPORT_SECURITY) == 0) {
 		warnx("Security not supported");
 		free(ident_buf);
 		cam_freeccb(ccb);
 		return (1);
 	}
 
 	/* default timeout 15 seconds the same as linux hdparm */
 	timeout = timeout ? timeout : 15 * 1000;
 
 	security_enabled = ident_buf->security_status & ATA_SECURITY_ENABLED;
 
 	/* first set the password if requested */
 	if (setpwd == 1) {
 		/* confirm we can erase before setting the password if erasing */
 		if (confirm == 0 &&
 		    (action == ATA_SECURITY_ACTION_ERASE_ENHANCED ||
 		    action == ATA_SECURITY_ACTION_ERASE) &&
 		    atasecurity_erase_confirm(device, ident_buf) == 0) {
 			cam_freeccb(ccb);
 			free(ident_buf);
 			return (error);
 		}
 
 		if (pwd.ctrl & ATA_SECURITY_PASSWORD_MASTER) {
 			pwd.revision = ident_buf->master_passwd_revision;
 			if (pwd.revision != 0 && pwd.revision != 0xfff &&
 			    --pwd.revision == 0) {
 				pwd.revision = 0xfffe;
 			}
 		}
 		error = atasecurity_set_password(device, ccb, retry_count,
 						 timeout, &pwd, quiet);
 		if (error != 0) {
 			cam_freeccb(ccb);
 			free(ident_buf);
 			return (error);
 		}
 		security_enabled = 1;
 	}
 
 	switch(action) {
 	case ATA_SECURITY_ACTION_FREEZE:
 		error = atasecurity_freeze(device, ccb, retry_count,
 					   timeout, quiet);
 		break;
 
 	case ATA_SECURITY_ACTION_UNLOCK:
 		if (security_enabled) {
 			if (ident_buf->security_status & ATA_SECURITY_LOCKED) {
 				error = atasecurity_unlock(device, ccb,
 					retry_count, timeout, &pwd, quiet);
 			} else {
 				warnx("Can't unlock, drive is not locked");
 				error = 1;
 			}
 		} else {
 			warnx("Can't unlock, security is disabled");
 			error = 1;
 		}
 		break;
 
 	case ATA_SECURITY_ACTION_DISABLE:
 		if (security_enabled) {
 			/* First unlock the drive if its locked */
 			if (ident_buf->security_status & ATA_SECURITY_LOCKED) {
 				error = atasecurity_unlock(device, ccb,
 							   retry_count,
 							   timeout,
 							   &pwd,
 							   quiet);
 			}
 
 			if (error == 0) {
 				error = atasecurity_disable(device,
 							    ccb,
 							    retry_count,
 							    timeout,
 							    &pwd,
 							    quiet);
 			}
 		} else {
 			warnx("Can't disable security (already disabled)");
 			error = 1;
 		}
 		break;
 
 	case ATA_SECURITY_ACTION_ERASE:
 		if (security_enabled) {
 			if (erase_timeout == 0) {
 				erase_timeout = atasecurity_erase_timeout_msecs(
 				    ident_buf->erase_time);
 			}
 
 			error = atasecurity_erase(device, ccb, retry_count,
 			    timeout, erase_timeout, &pwd, quiet);
 		} else {
 			warnx("Can't secure erase (security is disabled)");
 			error = 1;
 		}
 		break;
 
 	case ATA_SECURITY_ACTION_ERASE_ENHANCED:
 		if (security_enabled) {
 			if (ident_buf->security_status & ATA_SECURITY_ENH_SUPP) {
 				if (erase_timeout == 0) {
 					erase_timeout =
 					    atasecurity_erase_timeout_msecs(
 						ident_buf->enhanced_erase_time);
 				}
 
 				error = atasecurity_erase(device, ccb,
 							  retry_count, timeout,
 							  erase_timeout, &pwd,
 							  quiet);
 			} else {
 				warnx("Enhanced erase is not supported");
 				error = 1;
 			}
 		} else {
 			warnx("Can't secure erase (enhanced), "
 			      "(security is disabled)");
 			error = 1;
 		}
 		break;
 	}
 
 	cam_freeccb(ccb);
 	free(ident_buf);
 
 	return (error);
 }
 
 /*
  * Convert periph name into a bus, target and lun.
  *
  * tstr is split into a peripheral name and unit number by
  * cam_get_device(); the transport layer device (XPT_DEVICE) is then asked,
  * through the CAMGETPASSTHRU ioctl, to fill in the path of that peripheral.
  *
  * On success *bus, *target and *lun are set from the returned CCB header,
  * CAM_ARG_BUS, CAM_ARG_TARGET and CAM_ARG_LUN are or'ed into *arglst and
  * 3 is returned.  On any failure a warning is printed, the output
  * arguments are left untouched and 0 is returned.
  *
  * Returns the number of parsed components, or 0.
  */
 static int
 parse_btl_name(char *tstr, path_id_t *bus, target_id_t *target, lun_id_t *lun,
     cam_argmask *arglst)
 {
 	int fd;
 	union ccb ccb;
 
 	bzero(&ccb, sizeof(ccb));
 	ccb.ccb_h.func_code = XPT_GDEVLIST;
 	if (cam_get_device(tstr, ccb.cgdl.periph_name,
 	    sizeof(ccb.cgdl.periph_name), &ccb.cgdl.unit_number) == -1) {
 		warnx("%s", cam_errbuf);
 		return (0);
 	}
 
 	/*
 	 * Attempt to get the passthrough device.  This ioctl will
 	 * fail if the device name is null, if the device doesn't
 	 * exist, or if the passthrough driver isn't in the kernel.
 	 */
 	if ((fd = open(XPT_DEVICE, O_RDWR)) == -1) {
 		warn("Unable to open %s", XPT_DEVICE);
 		return (0);
 	}
 	if (ioctl(fd, CAMGETPASSTHRU, &ccb) == -1) {
 		warn("Unable to find bus:target:lun for device %s%d",
 		    ccb.cgdl.periph_name, ccb.cgdl.unit_number);
 		close(fd);
 		return (0);
 	}
 	/* The descriptor is only needed for the ioctl itself. */
 	close(fd);
 
 	/* The ioctl can succeed while the CCB itself reports a failure. */
 	if ((ccb.ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		const struct cam_status_entry *entry;
 
 		entry = cam_fetch_status_entry(ccb.ccb_h.status);
 		warnx("Unable to find bus:target:lun for device %s%d, "
 		    "CAM status: %s (%#x)",
 		    ccb.cgdl.periph_name, ccb.cgdl.unit_number,
 		    entry ? entry->status_text : "Unknown",
 		    ccb.ccb_h.status);
 		return (0);
 	}
 
 	/*
 	 * The kernel fills in the bus/target/lun.  We don't
 	 * need the passthrough device name and unit number since
 	 * we aren't going to open it.
 	 */
 	*bus = ccb.ccb_h.path_id;
 	*target = ccb.ccb_h.target_id;
 	*lun = ccb.ccb_h.target_lun;
 	*arglst |= CAM_ARG_BUS | CAM_ARG_TARGET | CAM_ARG_LUN;
 	return (3);
 }
 
 /*
  * Parse out a bus, or a bus, target and lun in the following
  * format:
  * bus
  * bus:target
  * bus:target:lun
  *
  * Leading white space is skipped.  Two other forms are accepted:
  *  - a string starting with "all" (compared case-insensitively) selects
  *    every bus: CAM_ARG_BUS is set, *bus stays CAM_BUS_WILDCARD and 1 is
  *    returned;
  *  - any other string that does not start with a digit is handed to
  *    parse_btl_name().
  *
  * *bus, *target and *lun are always reset to their wildcard values first.
  * For every numeric component parsed (strtol()/strtoll() with base 0) the
  * matching CAM_ARG_* bit is or'ed into *arglst.  tstr is modified in place
  * by strsep().  If a later component has trailing garbage, 0 is returned
  * although earlier components have already been stored.
  *
  * Returns the number of parsed components, or 0.
  */
 static int
 parse_btl(char *tstr, path_id_t *bus, target_id_t *target, lun_id_t *lun,
     cam_argmask *arglst)
 {
 	char *tmpstr, *end;
 	int convs = 0;
 
 	*bus = CAM_BUS_WILDCARD;
 	*target = CAM_TARGET_WILDCARD;
 	*lun = CAM_LUN_WILDCARD;
 
 	/* <ctype.h> functions require a value in unsigned char range. */
 	while (isspace((unsigned char)*tstr))
 		tstr++;
 
 	if (strncasecmp(tstr, "all", strlen("all")) == 0) {
 		/*
 		 * Report through the caller's mask like every other branch
 		 * does, rather than through the global arglist.
 		 */
 		*arglst |= CAM_ARG_BUS;
 		return (1);
 	}
 
 	if (!isdigit((unsigned char)*tstr))
 		return (parse_btl_name(tstr, bus, target, lun, arglst));
 
 	tmpstr = strsep(&tstr, ":");
 	if ((tmpstr != NULL) && (*tmpstr != '\0')) {
 		*bus = strtol(tmpstr, &end, 0);
 		if (*end != '\0')
 			return (0);
 		*arglst |= CAM_ARG_BUS;
 		convs++;
 		tmpstr = strsep(&tstr, ":");
 		if ((tmpstr != NULL) && (*tmpstr != '\0')) {
 			*target = strtol(tmpstr, &end, 0);
 			if (*end != '\0')
 				return (0);
 			*arglst |= CAM_ARG_TARGET;
 			convs++;
 			tmpstr = strsep(&tstr, ":");
 			if ((tmpstr != NULL) && (*tmpstr != '\0')) {
 				*lun = strtoll(tmpstr, &end, 0);
 				if (*end != '\0')
 					return (0);
 				*arglst |= CAM_ARG_LUN;
 				convs++;
 			}
 		}
 	}
 
 	return (convs);
 }
 
 /*
  * Entry point for the rescan and reset commands.
  *
  * argv[optind] names what to act on: "all", a bus number, a
  * bus:target:lun triple or a peripheral name.  rescan selects between a
  * rescan (non-zero) and a reset (zero).
  *
  * When a lun was specified the request goes to scanlun_or_reset_dev(),
  * otherwise to rescan_or_reset_bus().  Returns the result of that call,
  * or 1 after printing a usage warning when the argument is missing or
  * cannot be parsed.
  */
 static int
 dorescan_or_reset(int argc, char **argv, int rescan)
 {
 	static const char must[] =
 	    "you must specify \"all\", a bus, a bus:target:lun or periph to %s";
 	int rv, error = 0;
 	path_id_t bus = CAM_BUS_WILDCARD;
 	target_id_t target = CAM_TARGET_WILDCARD;
 	lun_id_t lun = CAM_LUN_WILDCARD;
 	char *tstr;
 
 	/*
 	 * argv[argc] is NULL, so also make sure option parsing has left an
 	 * operand for us before dereferencing argv[optind].
 	 */
 	if (argc < 3 || optind >= argc) {
 		warnx(must, rescan ? "rescan" : "reset");
 		return (1);
 	}
 
 	tstr = argv[optind];
 	/* <ctype.h> functions require a value in unsigned char range. */
 	while (isspace((unsigned char)*tstr))
 		tstr++;
 	if (strncasecmp(tstr, "all", strlen("all")) == 0)
 		arglist |= CAM_ARG_BUS;
 	else {
 		rv = parse_btl(argv[optind], &bus, &target, &lun, &arglist);
 		/* Only a lone bus (1) or a full bus:target:lun (3) is valid. */
 		if (rv != 1 && rv != 3) {
 			warnx(must, rescan ? "rescan" : "reset");
 			return (1);
 		}
 	}
 
 	if (arglist & CAM_ARG_LUN)
 		error = scanlun_or_reset_dev(bus, target, lun, rescan);
 	else
 		error = rescan_or_reset_bus(bus, rescan);
 
 	return (error);
 }
 
 /*
  * Rescan (rescan != 0) or reset (rescan == 0) one bus, or every bus when
  * bus is CAM_BUS_WILDCARD.
  *
  * The request is sent as an XPT_SCAN_BUS or XPT_RESET_BUS CCB through the
  * CAMIOCOMMAND ioctl on XPT_DEVICE.  For the wildcard case the buses are
  * first enumerated with XPT_DEV_MATCH and each one except the xpt bus
  * itself (CAM_XPT_PATH_ID) is handled in turn; a failure on one bus does
  * not stop the remaining buses from being processed.
  *
  * A line is printed for every bus handled.  Returns 0 if everything
  * succeeded and 1 if any step failed.
  */
 static int
 rescan_or_reset_bus(path_id_t bus, int rescan)
 {
 	union ccb *ccb = NULL, *matchccb = NULL;
 	int fd = -1, retval;
 	int bufsize;
 
 	retval = 0;
 
 	if ((fd = open(XPT_DEVICE, O_RDWR)) < 0) {
 		warnx("error opening transport layer device %s", XPT_DEVICE);
 		warn("%s", XPT_DEVICE);
 		return (1);
 	}
 
 	ccb = calloc(1, sizeof(*ccb));
 	if (ccb == NULL) {
 		warn("failed to allocate CCB");
 		retval = 1;
 		goto bailout;
 	}
 
 	if (bus != CAM_BUS_WILDCARD) {
 		ccb->ccb_h.func_code = rescan ? XPT_SCAN_BUS : XPT_RESET_BUS;
 		ccb->ccb_h.path_id = bus;
 		ccb->ccb_h.target_id = CAM_TARGET_WILDCARD;
 		ccb->ccb_h.target_lun = CAM_LUN_WILDCARD;
 		ccb->crcn.flags = CAM_FLAG_NONE;
 
 		/* run this at a low priority */
 		ccb->ccb_h.pinfo.priority = 5;
 
 		if (ioctl(fd, CAMIOCOMMAND, ccb) == -1) {
 			warn("CAMIOCOMMAND ioctl failed");
 			retval = 1;
 			goto bailout;
 		}
 
 		if ((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 			fprintf(stdout, "%s of bus %d was successful\n",
 			    rescan ? "Re-scan" : "Reset", bus);
 		} else {
 			fprintf(stdout, "%s of bus %d returned error %#x\n",
 				rescan ? "Re-scan" : "Reset", bus,
 				ccb->ccb_h.status & CAM_STATUS_MASK);
 			retval = 1;
 		}
 
 		goto bailout;
 	}
 
 
 	/*
 	 * The right way to handle this is to modify the xpt so that it can
 	 * handle a wildcarded bus in a rescan or reset CCB.  At the moment
 	 * that isn't implemented, so instead we enumerate the buses and
 	 * send the rescan or reset to those buses in the case where the
 	 * given bus is -1 (wildcard).  We don't send a rescan or reset
 	 * to the xpt bus; sending a rescan to the xpt bus is effectively a
 	 * no-op, sending a reset to the xpt bus would result in a status of
 	 * CAM_REQ_INVALID.
 	 */
 	matchccb = calloc(1, sizeof(*matchccb));
 	if (matchccb == NULL) {
 		warn("failed to allocate CCB");
 		retval = 1;
 		goto bailout;
 	}
 	matchccb->ccb_h.func_code = XPT_DEV_MATCH;
 	matchccb->ccb_h.path_id = CAM_BUS_WILDCARD;
 	/* Room for 20 results per ioctl; the loop below fetches the rest. */
 	bufsize = sizeof(struct dev_match_result) * 20;
 	matchccb->cdm.match_buf_len = bufsize;
 	matchccb->cdm.matches = malloc(bufsize);
 	if (matchccb->cdm.matches == NULL) {
 		warnx("can't malloc memory for matches");
 		retval = 1;
 		goto bailout;
 	}
 	matchccb->cdm.num_matches = 0;
 
 	matchccb->cdm.num_patterns = 1;
 	matchccb->cdm.pattern_buf_len = sizeof(struct dev_match_pattern);
 
 	matchccb->cdm.patterns = malloc(matchccb->cdm.pattern_buf_len);
 	if (matchccb->cdm.patterns == NULL) {
 		warnx("can't malloc memory for patterns");
 		retval = 1;
 		goto bailout;
 	}
 	matchccb->cdm.patterns[0].type = DEV_MATCH_BUS;
 	matchccb->cdm.patterns[0].pattern.bus_pattern.flags = BUS_MATCH_ANY;
 
 	do {
 		unsigned int i;
 
 		if (ioctl(fd, CAMIOCOMMAND, matchccb) == -1) {
 			warn("CAMIOCOMMAND ioctl failed");
 			retval = 1;
 			goto bailout;
 		}
 
 		if ((matchccb->ccb_h.status != CAM_REQ_CMP)
 		 || ((matchccb->cdm.status != CAM_DEV_MATCH_LAST)
 		   && (matchccb->cdm.status != CAM_DEV_MATCH_MORE))) {
 			/* warnx() appends the newline itself. */
 			warnx("got CAM error %#x, CDM error %d",
 			      matchccb->ccb_h.status, matchccb->cdm.status);
 			retval = 1;
 			goto bailout;
 		}
 
 		for (i = 0; i < matchccb->cdm.num_matches; i++) {
 			struct bus_match_result *bus_result;
 
 			/* This shouldn't happen. */
 			if (matchccb->cdm.matches[i].type != DEV_MATCH_BUS)
 				continue;
 
 			bus_result =&matchccb->cdm.matches[i].result.bus_result;
 
 			/*
 			 * We don't want to rescan or reset the xpt bus.
 			 * See above.
 			 */
 			if (bus_result->path_id == CAM_XPT_PATH_ID)
 				continue;
 
 			ccb->ccb_h.func_code = rescan ? XPT_SCAN_BUS :
 						       XPT_RESET_BUS;
 			ccb->ccb_h.path_id = bus_result->path_id;
 			ccb->ccb_h.target_id = CAM_TARGET_WILDCARD;
 			ccb->ccb_h.target_lun = CAM_LUN_WILDCARD;
 			ccb->crcn.flags = CAM_FLAG_NONE;
 
 			/* run this at a low priority */
 			ccb->ccb_h.pinfo.priority = 5;
 
 			if (ioctl(fd, CAMIOCOMMAND, ccb) == -1) {
 				warn("CAMIOCOMMAND ioctl failed");
 				retval = 1;
 				goto bailout;
 			}
 
 			if ((ccb->ccb_h.status & CAM_STATUS_MASK)==CAM_REQ_CMP){
 				fprintf(stdout, "%s of bus %d was successful\n",
 					rescan? "Re-scan" : "Reset",
 					bus_result->path_id);
 			} else {
 				/*
 				 * Don't bail out just yet, maybe the other
 				 * rescan or reset commands will complete
 				 * successfully.
 				 */
 				fprintf(stderr, "%s of bus %d returned error "
 					"%#x\n", rescan? "Re-scan" : "Reset",
 					bus_result->path_id,
 					ccb->ccb_h.status & CAM_STATUS_MASK);
 				retval = 1;
 			}
 		}
 	} while ((matchccb->ccb_h.status == CAM_REQ_CMP)
 		 && (matchccb->cdm.status == CAM_DEV_MATCH_MORE));
 
 bailout:
 
 	if (fd != -1)
 		close(fd);
 
 	/* free(NULL) is a no-op, so partially set up state is fine here. */
 	if (matchccb != NULL) {
 		free(matchccb->cdm.patterns);
 		free(matchccb->cdm.matches);
 		free(matchccb);
 	}
 	free(ccb);
 
 	return (retval);
 }
 
 /*
  * Rescan (scan != 0) or reset (scan == 0) a single bus:target:lun.
  *
  * None of bus, target and lun may be a wildcard.  A rescan is sent as an
  * XPT_SCAN_LUN CCB through the CAMIOCOMMAND ioctl on XPT_DEVICE; a reset
  * is sent as an XPT_RESET_DEV CCB with cam_send_ccb() on the device
  * opened by cam_open_btl().
  *
  * The outcome is printed on stdout.  Returns 0 on success and 1 on any
  * failure.
  */
 static int
 scanlun_or_reset_dev(path_id_t bus, target_id_t target, lun_id_t lun, int scan)
 {
 	union ccb ccb;
 	struct cam_device *device = NULL;
 	int fd = -1;
 
 	if (bus == CAM_BUS_WILDCARD) {
 		warnx("invalid bus number %d", bus);
 		return (1);
 	}
 
 	if (target == CAM_TARGET_WILDCARD) {
 		warnx("invalid target number %d", target);
 		return (1);
 	}
 
 	if (lun == CAM_LUN_WILDCARD) {
 		warnx("invalid lun number %jx", (uintmax_t)lun);
 		return (1);
 	}
 
 	bzero(&ccb, sizeof(union ccb));
 
 	if (scan) {
 		if ((fd = open(XPT_DEVICE, O_RDWR)) < 0) {
 			/* warnx() appends the newline itself. */
 			warnx("error opening transport layer device %s",
 			    XPT_DEVICE);
 			warn("%s", XPT_DEVICE);
 			return (1);
 		}
 	} else {
 		device = cam_open_btl(bus, target, lun, O_RDWR, NULL);
 		if (device == NULL) {
 			warnx("%s", cam_errbuf);
 			return (1);
 		}
 	}
 
 	ccb.ccb_h.func_code = (scan)? XPT_SCAN_LUN : XPT_RESET_DEV;
 	ccb.ccb_h.path_id = bus;
 	ccb.ccb_h.target_id = target;
 	ccb.ccb_h.target_lun = lun;
 	ccb.ccb_h.timeout = 5000;
 	ccb.crcn.flags = CAM_FLAG_NONE;
 
 	/* run this at a low priority */
 	ccb.ccb_h.pinfo.priority = 5;
 
 	if (scan) {
 		if (ioctl(fd, CAMIOCOMMAND, &ccb) < 0) {
 			warn("CAMIOCOMMAND ioctl failed");
 			close(fd);
 			return (1);
 		}
 	} else {
 		if (cam_send_ccb(device, &ccb) < 0) {
 			warn("error sending XPT_RESET_DEV CCB");
 			cam_close_device(device);
 			return (1);
 		}
 	}
 
 	/* The CCB is a local copy, so the handle can be released now. */
 	if (scan)
 		close(fd);
 	else
 		cam_close_device(device);
 
 	/*
 	 * An error code of CAM_BDR_SENT is normal for a BDR request.
 	 */
 	if (((ccb.ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP)
 	 || ((!scan)
 	  && ((ccb.ccb_h.status & CAM_STATUS_MASK) == CAM_BDR_SENT))) {
 		fprintf(stdout, "%s of %d:%d:%jx was successful\n",
 		    scan? "Re-scan" : "Reset", bus, target, (uintmax_t)lun);
 		return (0);
 	} else {
 		fprintf(stdout, "%s of %d:%d:%jx returned error %#x\n",
 		    scan? "Re-scan" : "Reset", bus, target, (uintmax_t)lun,
 		    ccb.ccb_h.status & CAM_STATUS_MASK);
 		return (1);
 	}
 }
 
 
 /*
  * Name/value table of the defect list formats.  readdefects() looks up
  * the argument of its -f option here with scsi_get_nv() (ignoring case)
  * and maps a format code back to its name with scsi_nv_to_str() when
  * reporting what the device returned.
  */
 static struct scsi_nv defect_list_type_map[] = {
 	{ "block", SRDD10_BLOCK_FORMAT },
 	{ "extbfi", SRDD10_EXT_BFI_FORMAT },
 	{ "extphys", SRDD10_EXT_PHYS_FORMAT },
 	{ "longblock", SRDD10_LONG_BLOCK_FORMAT },
 	{ "bfi", SRDD10_BYTES_FROM_INDEX_FORMAT },
 	{ "phys", SRDD10_PHYSICAL_SECTOR_FORMAT }
 };
 
 /*
  * Read the defect list(s) of a device and print them.
  *
  * Options taken from argv (getopt() with combinedopt):
  *   -f fmt	defect list format, a name from defect_list_type_map
  *		(required)
  *   -G		request the GLIST (sets CAM_ARG_GLIST in arglist)
  *   -P		request the PLIST (sets CAM_ARG_PLIST in arglist)
  *   -q		in summary mode, print the bare number only
  *   -s		summary: print only the number of defects
  *   -S off	starting address descriptor index
  *   -X		print the entries in hexadecimal
  * If neither -G nor -P is given a summary is implied.
  *
  * The command is first issued for the header only to learn the list
  * length, switching from the 10 byte to the 12 byte command when the
  * list does not fit, and is then repeated with a buffer of at most
  * SRDD10_MAX_LENGTH bytes.  When fewer entries are valid than the device
  * reported, further batches are fetched by advancing the starting offset.
  *
  * The count is reported on stderr ("Got N defects"), or on stdout in
  * summary mode; the entries themselves go to stdout.
  *
  * Returns 0 on success and 1 on any error.
  */
 static int
 readdefects(struct cam_device *device, int argc, char **argv,
 	    char *combinedopt, int task_attr, int retry_count, int timeout)
 {
 	union ccb *ccb = NULL;
 	struct scsi_read_defect_data_hdr_10 *hdr10 = NULL;
 	struct scsi_read_defect_data_hdr_12 *hdr12 = NULL;
 	size_t hdr_size = 0, entry_size = 0;
 	int use_12byte = 0;
 	int hex_format = 0;
 	u_int8_t *defect_list = NULL;
 	u_int8_t list_format = 0;
 	int list_type_set = 0;
 	u_int32_t dlist_length = 0;
 	u_int32_t returned_length = 0, valid_len = 0;
 	u_int32_t num_returned = 0, num_valid = 0;
 	u_int32_t max_possible_size = 0, hdr_max = 0;
 	u_int32_t starting_offset = 0;
 	u_int8_t returned_format, returned_type;
 	unsigned int i;
 	int summary = 0, quiet = 0;
 	int c, error = 0;
 	int lists_specified = 0;
 	int get_length = 1, first_pass = 1;
 	int mads = 0;
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch(c){
 		case 'f':
 		{
 			scsi_nv_status status;
 			int entry_num = 0;
 
 			status = scsi_get_nv(defect_list_type_map,
 			    sizeof(defect_list_type_map) /
 			    sizeof(defect_list_type_map[0]), optarg,
 			    &entry_num, SCSI_NV_FLAG_IG_CASE);
 
 			if (status == SCSI_NV_FOUND) {
 				list_format = defect_list_type_map[
 				    entry_num].value;
 				list_type_set = 1;
 			} else {
 				warnx("%s: %s %s option %s", __func__,
 				    (status == SCSI_NV_AMBIGUOUS) ?
 				    "ambiguous" : "invalid", "defect list type",
 				    optarg);
 				error = 1;
 				goto defect_bailout;
 			}
 			break;
 		}
 		case 'G':
 			arglist |= CAM_ARG_GLIST;
 			break;
 		case 'P':
 			arglist |= CAM_ARG_PLIST;
 			break;
 		case 'q':
 			quiet = 1;
 			break;
 		case 's':
 			summary = 1;
 			break;
 		case 'S': {
 			char *endptr;
 
 			starting_offset = strtoul(optarg, &endptr, 0);
 			if (*endptr != '\0') {
 				error = 1;
 				warnx("invalid starting offset %s", optarg);
 				goto defect_bailout;
 			}
 			break;
 		}
 		case 'X':
 			hex_format = 1;
 			break;
 		default:
 			break;
 		}
 	}
 
 	if (list_type_set == 0) {
 		error = 1;
 		warnx("no defect list format specified");
 		goto defect_bailout;
 	}
 
 	if (arglist & CAM_ARG_PLIST) {
 		list_format |= SRDD10_PLIST;
 		lists_specified++;
 	}
 
 	if (arglist & CAM_ARG_GLIST) {
 		list_format |= SRDD10_GLIST;
 		lists_specified++;
 	}
 
 	/*
 	 * This implies a summary, and was the previous behavior.
 	 */
 	if (lists_specified == 0)
 		summary = 1;
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		/* Everything below dereferences the CCB. */
 		warnx("couldn't allocate CCB");
 		error = 1;
 		goto defect_bailout;
 	}
 
 retry_12byte:
 
 	/*
 	 * We start off asking for just the header to determine how much
 	 * defect data is available.  Some Hitachi drives return an error
 	 * if you ask for more data than the drive has.  Once we know the
 	 * length, we retry the command with the returned length.
 	 */
 	if (use_12byte == 0)
 		dlist_length = sizeof(*hdr10);
 	else
 		dlist_length = sizeof(*hdr12);
 
 retry:
 	/* Drop the buffer of the previous attempt, if any. */
 	free(defect_list);
 	defect_list = malloc(dlist_length);
 	if (defect_list == NULL) {
 		warnx("can't malloc memory for defect list");
 		error = 1;
 		goto defect_bailout;
 	}
 
 next_batch:
 	bzero(defect_list, dlist_length);
 
 	/*
 	 * cam_getccb() zeros the CCB header only.  So we need to zero the
 	 * payload portion of the ccb.
 	 */
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	scsi_read_defects(&ccb->csio,
 			  /*retries*/ retry_count,
 			  /*cbfcnp*/ NULL,
 			  /*tag_action*/ task_attr,
 			  /*list_format*/ list_format,
 			  /*addr_desc_index*/ starting_offset,
 			  /*data_ptr*/ defect_list,
 			  /*dxfer_len*/ dlist_length,
 			  /*minimum_cmd_size*/ use_12byte ? 12 : 0,
 			  /*sense_len*/ SSD_FULL_SIZE,
 			  /*timeout*/ timeout ? timeout : 5000);
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (cam_send_ccb(device, ccb) < 0) {
 		perror("error reading defect list");
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 
 		error = 1;
 		goto defect_bailout;
 	}
 
 	/* Number of bytes the device actually transferred. */
 	valid_len = ccb->csio.dxfer_len - ccb->csio.resid;
 
 	if (use_12byte == 0) {
 		hdr10 = (struct scsi_read_defect_data_hdr_10 *)defect_list;
 		hdr_size = sizeof(*hdr10);
 		hdr_max = SRDDH10_MAX_LENGTH;
 
 		if (valid_len >= hdr_size) {
 			returned_length = scsi_2btoul(hdr10->length);
 			returned_format = hdr10->format;
 		} else {
 			returned_length = 0;
 			returned_format = 0;
 		}
 	} else {
 		hdr12 = (struct scsi_read_defect_data_hdr_12 *)defect_list;
 		hdr_size = sizeof(*hdr12);
 		hdr_max = SRDDH12_MAX_LENGTH;
 
 		if (valid_len >= hdr_size) {
 			returned_length = scsi_4btoul(hdr12->length);
 			returned_format = hdr12->format;
 		} else {
 			returned_length = 0;
 			returned_format = 0;
 		}
 	}
 
 	returned_type = returned_format & SRDDH10_DLIST_FORMAT_MASK;
 	switch (returned_type) {
 	case SRDD10_BLOCK_FORMAT:
 		entry_size = sizeof(struct scsi_defect_desc_block);
 		break;
 	case SRDD10_LONG_BLOCK_FORMAT:
 		entry_size = sizeof(struct scsi_defect_desc_long_block);
 		break;
 	case SRDD10_EXT_PHYS_FORMAT:
 	case SRDD10_PHYSICAL_SECTOR_FORMAT:
 		entry_size = sizeof(struct scsi_defect_desc_phys_sector);
 		break;
 	case SRDD10_EXT_BFI_FORMAT:
 	case SRDD10_BYTES_FROM_INDEX_FORMAT:
 		entry_size = sizeof(struct scsi_defect_desc_bytes_from_index);
 		break;
 	default:
 		/* warnx() appends the newline itself. */
 		warnx("Unknown defect format 0x%x", returned_type);
 		error = 1;
 		goto defect_bailout;
 	}
 
 	max_possible_size = (hdr_max / entry_size) * entry_size;
 	num_returned = returned_length / entry_size;
 	/*
 	 * If less than a header came back, returned_length is 0, so the
 	 * unsigned wrap of the subtraction below is harmless.
 	 */
 	num_valid = min(returned_length, valid_len - hdr_size);
 	num_valid /= entry_size;
 
 	if (get_length != 0) {
 		get_length = 0;
 
 		if ((ccb->ccb_h.status & CAM_STATUS_MASK) ==
 		     CAM_SCSI_STATUS_ERROR) {
 			struct scsi_sense_data *sense;
 			int error_code, sense_key, asc, ascq;
 
 			sense = &ccb->csio.sense_data;
 			scsi_extract_sense_len(sense, ccb->csio.sense_len -
 			    ccb->csio.sense_resid, &error_code, &sense_key,
 			    &asc, &ascq, /*show_errors*/ 1);
 
 			/*
 			 * If the drive is reporting that it just doesn't
 			 * support the defect list format, go ahead and use
 			 * the length it reported.  Otherwise, the length
 			 * may not be valid, so use the maximum.
 			 */
 			if ((sense_key == SSD_KEY_RECOVERED_ERROR)
 			 && (asc == 0x1c) && (ascq == 0x00)
 			 && (returned_length > 0)) {
 				if ((use_12byte == 0)
 				 && (returned_length >= max_possible_size)) {
 					get_length = 1;
 					use_12byte = 1;
 					goto retry_12byte;
 				}
 				dlist_length = returned_length + hdr_size;
 			} else if ((sense_key == SSD_KEY_RECOVERED_ERROR)
 				&& (asc == 0x1f) && (ascq == 0x00)
 				&& (returned_length > 0)) {
 				/* Partial defect list transfer */
 				/*
 				 * Hitachi drives return this error
 				 * along with a partial defect list if they
 				 * have more defects than the 10 byte
 				 * command can support.  Retry with the 12
 				 * byte command.
 				 */
 				if (use_12byte == 0) {
 					get_length = 1;
 					use_12byte = 1;
 					goto retry_12byte;
 				}
 				dlist_length = returned_length + hdr_size;
 			} else if ((sense_key == SSD_KEY_ILLEGAL_REQUEST)
 				&& (asc == 0x24) && (ascq == 0x00)) {
 				/* Invalid field in CDB */
 				/*
 				 * SBC-3 says that if the drive has more
 				 * defects than can be reported with the
 				 * 10 byte command, it should return this
 				 * error and no data.  Retry with the 12
 				 * byte command.
 				 */
 				if (use_12byte == 0) {
 					get_length = 1;
 					use_12byte = 1;
 					goto retry_12byte;
 				}
 				dlist_length = returned_length + hdr_size;
 			} else {
 				/*
 				 * If we got a SCSI error and no valid length,
 				 * just use the 10 byte maximum.  The 12
 				 * byte maximum is too large.
 				 */
 				if (returned_length == 0)
 					dlist_length = SRDD10_MAX_LENGTH;
 				else {
 					if ((use_12byte == 0)
 					 && (returned_length >=
 					     max_possible_size)) {
 						get_length = 1;
 						use_12byte = 1;
 						goto retry_12byte;
 					}
 					dlist_length = returned_length +
 					    hdr_size;
 				}
 			}
 		} else if ((ccb->ccb_h.status & CAM_STATUS_MASK) !=
 			    CAM_REQ_CMP){
 			error = 1;
 			warnx("Error reading defect header");
 			if (arglist & CAM_ARG_VERBOSE)
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 			goto defect_bailout;
 		} else {
 			if ((use_12byte == 0)
 			 && (returned_length >= max_possible_size)) {
 				get_length = 1;
 				use_12byte = 1;
 				goto retry_12byte;
 			}
 			dlist_length = returned_length + hdr_size;
 		}
 		if (summary != 0) {
 			fprintf(stdout, "%u", num_returned);
 			if (quiet == 0) {
 				fprintf(stdout, " defect%s",
 					(num_returned != 1) ? "s" : "");
 			}
 			fprintf(stdout, "\n");
 
 			goto defect_bailout;
 		}
 
 		/*
 		 * We always limit the list length to the 10-byte maximum
 		 * length (0xffff).  The reason is that some controllers
 		 * can't handle larger I/Os, and we can transfer the entire
 		 * 10 byte list in one shot.  For drives that support the 12
 		 * byte read defects command, we'll step through the list
 		 * by specifying a starting offset.  For drives that don't
 		 * support the 12 byte command's starting offset, we'll
 		 * just display the first 64K.
 		 */
 		dlist_length = min(dlist_length, SRDD10_MAX_LENGTH);
 
 		goto retry;
 	}
 
 
 	if (((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_SCSI_STATUS_ERROR)
 	 && (ccb->csio.scsi_status == SCSI_STATUS_CHECK_COND)
 	 && ((ccb->ccb_h.status & CAM_AUTOSNS_VALID) != 0)) {
 		struct scsi_sense_data *sense;
 		int error_code, sense_key, asc, ascq;
 
 		sense = &ccb->csio.sense_data;
 		scsi_extract_sense_len(sense, ccb->csio.sense_len -
 		    ccb->csio.sense_resid, &error_code, &sense_key, &asc,
 		    &ascq, /*show_errors*/ 1);
 
 		/*
 		 * According to the SCSI spec, if the disk doesn't support
 		 * the requested format, it will generally return a sense
 		 * key of RECOVERED ERROR, and an additional sense code
 		 * of "DEFECT LIST NOT FOUND".  HGST drives also return
 		 * Primary/Grown defect list not found errors.  So just
 		 * check for an ASC of 0x1c.
 		 */
 		if ((sense_key == SSD_KEY_RECOVERED_ERROR)
 		 && (asc == 0x1c)) {
 			const char *format_str;
 
 			format_str = scsi_nv_to_str(defect_list_type_map,
 			    sizeof(defect_list_type_map) /
 			    sizeof(defect_list_type_map[0]),
 			    list_format & SRDD10_DLIST_FORMAT_MASK);
 			warnx("requested defect format %s not available",
 			    format_str ? format_str : "unknown");
 
 			format_str = scsi_nv_to_str(defect_list_type_map,
 			    sizeof(defect_list_type_map) /
 			    sizeof(defect_list_type_map[0]), returned_type);
 			if (format_str != NULL) {
 				warnx("Device returned %s format",
 				    format_str);
 			} else {
 				error = 1;
 				warnx("Device returned unknown defect"
 				     " data format %#x", returned_type);
 				goto defect_bailout;
 			}
 		} else {
 			error = 1;
 			warnx("Error returned from read defect data command");
 			if (arglist & CAM_ARG_VERBOSE)
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 			goto defect_bailout;
 		}
 	} else if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		error = 1;
 		warnx("Error returned from read defect data command");
 		if (arglist & CAM_ARG_VERBOSE)
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		goto defect_bailout;
 	}
 
 	if (first_pass != 0) {
 		/* num_returned is unsigned. */
 		fprintf(stderr, "Got %u defect", num_returned);
 
 		if ((lists_specified == 0) || (num_returned == 0)) {
 			fprintf(stderr, "s.\n");
 			goto defect_bailout;
 		} else if (num_returned == 1)
 			fprintf(stderr, ":\n");
 		else
 			fprintf(stderr, "s:\n");
 
 		first_pass = 0;
 	}
 
 	/*
 	 * XXX KDM  I should probably clean up the printout format for the
 	 * disk defects.
 	 */
 	switch (returned_type) {
 	case SRDD10_PHYSICAL_SECTOR_FORMAT:
 	case SRDD10_EXT_PHYS_FORMAT:
 	{
 		struct scsi_defect_desc_phys_sector *dlist;
 
 		dlist = (struct scsi_defect_desc_phys_sector *)
 			(defect_list + hdr_size);
 
 		for (i = 0; i < num_valid; i++) {
 			uint32_t sector;
 
 			sector = scsi_4btoul(dlist[i].sector);
 			if (returned_type == SRDD10_EXT_PHYS_FORMAT) {
 				/*
 				 * A set MADS bit marks the start of a
 				 * range, so the next descriptor is printed
 				 * on the same line after " - ".  The flag
 				 * bits are not part of the sector number.
 				 */
 				mads = (sector & SDD_EXT_PHYS_MADS) ? 1 : 0;
 				sector &= ~SDD_EXT_PHYS_FLAG_MASK;
 			}
 			if (hex_format == 0)
 				fprintf(stdout, "%d:%d:%d%s",
 					scsi_3btoul(dlist[i].cylinder),
 					dlist[i].head,
 					sector,
 					mads ? " - " : "\n");
 			else
 				fprintf(stdout, "0x%x:0x%x:0x%x%s",
 					scsi_3btoul(dlist[i].cylinder),
 					dlist[i].head,
 					sector,
 					mads ? " - " : "\n");
 			mads = 0;
 		}
 		if (num_valid < num_returned) {
 			starting_offset += num_valid;
 			goto next_batch;
 		}
 		break;
 	}
 	case SRDD10_BYTES_FROM_INDEX_FORMAT:
 	case SRDD10_EXT_BFI_FORMAT:
 	{
 		struct scsi_defect_desc_bytes_from_index *dlist;
 
 		dlist = (struct scsi_defect_desc_bytes_from_index *)
 			(defect_list + hdr_size);
 
 		for (i = 0; i < num_valid; i++) {
 			uint32_t bfi;
 
 			bfi = scsi_4btoul(dlist[i].bytes_from_index);
 			if (returned_type == SRDD10_EXT_BFI_FORMAT) {
 				/* Same convention as the physical format. */
 				mads = (bfi & SDD_EXT_BFI_MADS) ? 1 : 0;
 				bfi &= ~SDD_EXT_BFI_FLAG_MASK;
 			}
 			if (hex_format == 0)
 				fprintf(stdout, "%d:%d:%d%s",
 					scsi_3btoul(dlist[i].cylinder),
 					dlist[i].head,
 					bfi,
 					mads ? " - " : "\n");
 			else
 				fprintf(stdout, "0x%x:0x%x:0x%x%s",
 					scsi_3btoul(dlist[i].cylinder),
 					dlist[i].head,
 					bfi,
 					mads ? " - " : "\n");
 
 			mads = 0;
 		}
 		if (num_valid < num_returned) {
 			starting_offset += num_valid;
 			goto next_batch;
 		}
 		break;
 	}
 	case SRDDH10_BLOCK_FORMAT:
 	{
 		struct scsi_defect_desc_block *dlist;
 
 		dlist = (struct scsi_defect_desc_block *)
 			(defect_list + hdr_size);
 
 		for (i = 0; i < num_valid; i++) {
 			if (hex_format == 0)
 				fprintf(stdout, "%u\n",
 					scsi_4btoul(dlist[i].address));
 			else
 				fprintf(stdout, "0x%x\n",
 					scsi_4btoul(dlist[i].address));
 		}
 
 		if (num_valid < num_returned) {
 			starting_offset += num_valid;
 			goto next_batch;
 		}
 
 		break;
 	}
 	case SRDD10_LONG_BLOCK_FORMAT:
 	{
 		struct scsi_defect_desc_long_block *dlist;
 
 		dlist = (struct scsi_defect_desc_long_block *)
 			(defect_list + hdr_size);
 
 		for (i = 0; i < num_valid; i++) {
 			if (hex_format == 0)
 				fprintf(stdout, "%ju\n",
 					(uintmax_t)scsi_8btou64(
 					dlist[i].address));
 			else
 				fprintf(stdout, "0x%jx\n",
 					(uintmax_t)scsi_8btou64(
 					dlist[i].address));
 		}
 
 		if (num_valid < num_returned) {
 			starting_offset += num_valid;
 			goto next_batch;
 		}
 		break;
 	}
 	default:
 		fprintf(stderr, "Unknown defect format 0x%x\n",
 			returned_type);
 		error = 1;
 		break;
 	}
 defect_bailout:
 
 	/* free(NULL) is a no-op. */
 	free(defect_list);
 
 	if (ccb != NULL)
 		cam_freeccb(ccb);
 
 	return (error);
 }
 
 /*
  * Compiled-out stub: it only allocates a CCB and frees it again, without
  * using the blocks/num_blocks arguments.
  */
 #if 0
 void
 reassignblocks(struct cam_device *device, u_int32_t *blocks, int num_blocks)
 {
 	union ccb *ccb;
 
 	ccb = cam_getccb(device);
 
 	cam_freeccb(ccb);
 }
 #endif
 
 /*
  * Send a MODE SENSE for the given page and subpage and have the result
  * stored in data (datalen bytes).
  *
  * pc is shifted into place (pc << 6) before being handed to
  * scsi_mode_sense_subpage(); a timeout of 0 selects 5000.  When
  * CAM_ARG_ERR_RECOVER is set in arglist, CAM_PASS_ERR_RECOVER is added to
  * the CCB flags.
  *
  * Returns only on success.  If the CCB cannot be allocated, cannot be sent
  * or does not complete with CAM_REQ_CMP, the process exits with status 1;
  * in the latter two cases the device is closed first.
  */
 void
 mode_sense(struct cam_device *device, int dbd, int pc, int page, int subpage,
 	   int task_attr, int retry_count, int timeout, u_int8_t *data,
 	   int datalen)
 {
 	union ccb *ccb;
 	int retval;
 
 	if ((ccb = cam_getccb(device)) == NULL)
 		errx(1, "mode_sense: couldn't allocate CCB");
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	if (timeout == 0)
 		timeout = 5000;
 
 	scsi_mode_sense_subpage(&ccb->csio, retry_count, /* cbfcnp */ NULL,
 	    task_attr, dbd, pc << 6, page, subpage, data, datalen,
 	    /* minimum_cmd_size */ 0, SSD_FULL_SIZE, timeout);
 
 	/* Never freeze the device queue; error recovery is opt-in. */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 	if ((arglist & CAM_ARG_ERR_RECOVER) != 0)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	retval = cam_send_ccb(device, ccb);
 	if (retval >= 0 &&
 	    (ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 		cam_freeccb(ccb);
 		return;
 	}
 
 	/* Failure: report, release everything and exit. */
 	if ((arglist & CAM_ARG_VERBOSE) != 0)
 		cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
 	cam_freeccb(ccb);
 	cam_close_device(device);
 
 	/* Only a failed send is reported together with errno. */
 	if (retval < 0)
 		err(1, "error sending mode sense command");
 	errx(1, "error sending mode sense command");
 }
 
 /*
  * Send a MODE SELECT carrying the datalen bytes at data.
  *
  * The command is built by scsi_mode_select() with scsi_page_fmt set to 1
  * and the caller's save_pages value; a timeout of 0 selects 5000.  When
  * CAM_ARG_ERR_RECOVER is set in arglist, CAM_PASS_ERR_RECOVER is added to
  * the CCB flags.
  *
  * Returns only on success.  If the CCB cannot be allocated, cannot be sent
  * or does not complete with CAM_REQ_CMP, the process exits with status 1;
  * in the latter two cases the device is closed first.
  */
 void
 mode_select(struct cam_device *device, int save_pages, int task_attr,
 	    int retry_count, int timeout, u_int8_t *data, int datalen)
 {
 	union ccb *ccb;
 	int retval;
 
 	if ((ccb = cam_getccb(device)) == NULL)
 		errx(1, "mode_select: couldn't allocate CCB");
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	if (timeout == 0)
 		timeout = 5000;
 
 	scsi_mode_select(&ccb->csio, retry_count, /* cbfcnp */ NULL,
 	    task_attr, /* scsi_page_fmt */ 1, save_pages, data, datalen,
 	    SSD_FULL_SIZE, timeout);
 
 	/* Never freeze the device queue; error recovery is opt-in. */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 	if ((arglist & CAM_ARG_ERR_RECOVER) != 0)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	retval = cam_send_ccb(device, ccb);
 	if (retval >= 0 &&
 	    (ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP) {
 		cam_freeccb(ccb);
 		return;
 	}
 
 	/* Failure: report, release everything and exit. */
 	if ((arglist & CAM_ARG_VERBOSE) != 0)
 		cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
 	cam_freeccb(ccb);
 	cam_close_device(device);
 
 	/* Only a failed send is reported together with errno. */
 	if (retval < 0)
 		err(1, "error sending mode select command");
 	errx(1, "error sending mode select command");
 }
 
 /*
  * Handle the mode page command line and dispatch to mode_list() or
  * mode_edit().
  *
  * Options taken from argv (getopt() with combinedopt):
  *   -b		passed on to mode_edit() as "binary"
  *   -d		passed on as "dbd"
  *   -e		passed on to mode_edit() as "edit"
  *   -l		list; given more than once, mode_list() gets a non-zero
  *		fourth argument
  *   -m p[,s]	mode page and optional subpage (subpage defaults to 0)
  *   -P pc	page control field, 0 through 3
  *
  * Exits through errx() on a negative page or subpage, an out-of-range
  * page control value, or when neither a page nor -l was given.
  */
 void
 modepage(struct cam_device *device, int argc, char **argv, char *combinedopt,
 	 int task_attr, int retry_count, int timeout)
 {
 	char *str_subpage;
 	int binary = 0, dbd = 0, edit = 0, list = 0;
 	int page = -1, subpage = -1, pc = 0;
 	int c;
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch (c) {
 		case 'b':
 			binary = 1;
 			break;
 		case 'd':
 			dbd = 1;
 			break;
 		case 'e':
 			edit = 1;
 			break;
 		case 'l':
 			list++;
 			break;
 		case 'm':
 			/*
 			 * strsep() cuts optarg at the first comma and leaves
 			 * str_subpage pointing past it, or NULL if there
 			 * was no comma.
 			 */
 			str_subpage = optarg;
 			strsep(&str_subpage, ",");
 			page = strtol(optarg, NULL, 0);
 			subpage = (str_subpage != NULL) ?
 			    strtol(str_subpage, NULL, 0) : 0;
 			if (page < 0)
 				errx(1, "invalid mode page %d", page);
 			if (subpage < 0)
 				errx(1, "invalid mode subpage %d", subpage);
 			break;
 		case 'P':
 			pc = strtol(optarg, NULL, 0);
 			if ((pc < 0) || (pc > 3))
 				errx(1, "invalid page control field %d", pc);
 			break;
 		default:
 			break;
 		}
 	}
 
 	if (list == 0) {
 		/* Without -l a page number is mandatory. */
 		if (page == -1)
 			errx(1, "you must specify a mode page!");
 		mode_edit(device, dbd, pc, page, subpage, edit, binary,
 		    task_attr, retry_count, timeout);
 		return;
 	}
 
 	mode_list(device, dbd, pc, list > 1, task_attr, retry_count,
 	    timeout);
 }
 
 /*
  * Send a user-specified SCSI CDB or ATA command to the device.
  *
  * Options handled here:
  *   -a cmd        ATA command, encoded with buff_encode_visit()
  *   -c cmd        SCSI CDB, encoded with buff_encode_visit()
  *   -d            mark the ATA command as DMA
  *   -f            mark the ATA command as FPDMA
  *   -i len fmt    read len bytes from the device; fmt "-" dumps the raw
  *                 data to stdout, otherwise it is decoded with fmt
  *   -o len fmt    write len bytes to the device; fmt "-" takes the raw
  *                 data from stdin, otherwise it is encoded from fmt
  *   -r fmt        request the ATA result registers; fmt "-" prints them
  *                 as hex bytes, otherwise they are decoded with fmt
  *
  * If no CDB was encoded, the ATA path is taken.
  *
  * Returns 0 on success and 1 on any failure.
  */
 static int
 scsicmd(struct cam_device *device, int argc, char **argv, char *combinedopt,
 	int task_attr, int retry_count, int timeout)
 {
 	union ccb *ccb;
 	u_int32_t flags = CAM_DIR_NONE;
 	u_int8_t *data_ptr = NULL;
 	u_int8_t cdb[20];
 	u_int8_t atacmd[12];
 	struct get_hook hook;
 	int c, data_bytes = 0, valid_bytes;
 	int cdb_len = 0;
 	int atacmd_len = 0;
 	int dmacmd = 0;
 	int fpdmacmd = 0;
 	int need_res = 0;
 	char *datastr = NULL, *tstr, *resstr = NULL;
 	int error = 0;
 	int fd_data = 0, fd_res = 0;
 	int retval;
 
 	ccb = cam_getccb(device);
 
 	if (ccb == NULL) {
 		warnx("scsicmd: error allocating ccb");
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(ccb);
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch(c) {
 		case 'a':
 			tstr = optarg;
 			/*
 			 * The <ctype.h> functions require a value
 			 * representable as unsigned char; plain char may be
 			 * negative.  isspace() is false for '\0', so the
 			 * loop stops at the end of the string.
 			 */
 			while (isspace((unsigned char)*tstr))
 				tstr++;
 			hook.argc = argc - optind;
 			hook.argv = argv + optind;
 			hook.got = 0;
 			atacmd_len = buff_encode_visit(atacmd, sizeof(atacmd), tstr,
 						    iget, &hook);
 			/*
 			 * Increment optind by the number of arguments the
 			 * encoding routine processed.  After each call to
 			 * getopt(3), optind points to the argument that
 			 * getopt should process _next_.  In this case,
 			 * that means it points to the first command string
 			 * argument, if there is one.  Once we increment
 			 * this, it should point to either the next command
 			 * line argument, or it should be past the end of
 			 * the list.
 			 */
 			optind += hook.got;
 			break;
 		case 'c':
 			tstr = optarg;
 			while (isspace((unsigned char)*tstr))
 				tstr++;
 			hook.argc = argc - optind;
 			hook.argv = argv + optind;
 			hook.got = 0;
 			cdb_len = buff_encode_visit(cdb, sizeof(cdb), tstr,
 						    iget, &hook);
 			/*
 			 * Skip the arguments consumed by the encoder; see
 			 * the explanation in the 'a' case.
 			 */
 			optind += hook.got;
 			break;
 		case 'd':
 			dmacmd = 1;
 			break;
 		case 'f':
 			fpdmacmd = 1;
 			break;
 		case 'i':
 			if (arglist & CAM_ARG_CMD_OUT) {
 				warnx("command must either be "
 				      "read or write, not both");
 				error = 1;
 				goto scsicmd_bailout;
 			}
 			arglist |= CAM_ARG_CMD_IN;
 			flags = CAM_DIR_IN;
 			data_bytes = strtol(optarg, NULL, 0);
 			if (data_bytes <= 0) {
 				warnx("invalid number of input bytes %d",
 				      data_bytes);
 				error = 1;
 				goto scsicmd_bailout;
 			}
 			hook.argc = argc - optind;
 			hook.argv = argv + optind;
 			hook.got = 0;
 			optind++;
 			datastr = cget(&hook, NULL);
 			/*
 			 * A format of "-" means the data read from the
 			 * device is written raw to stdout.
 			 */
 			if ((datastr != NULL)
 			 && (datastr[0] == '-'))
 				fd_data = 1;
 
 			data_ptr = (u_int8_t *)malloc(data_bytes);
 			if (data_ptr == NULL) {
 				warnx("can't malloc memory for data_ptr");
 				error = 1;
 				goto scsicmd_bailout;
 			}
 			break;
 		case 'o':
 			if (arglist & CAM_ARG_CMD_IN) {
 				warnx("command must either be "
 				      "read or write, not both");
 				error = 1;
 				goto scsicmd_bailout;
 			}
 			arglist |= CAM_ARG_CMD_OUT;
 			flags = CAM_DIR_OUT;
 			data_bytes = strtol(optarg, NULL, 0);
 			if (data_bytes <= 0) {
 				warnx("invalid number of output bytes %d",
 				      data_bytes);
 				error = 1;
 				goto scsicmd_bailout;
 			}
 			hook.argc = argc - optind;
 			hook.argv = argv + optind;
 			hook.got = 0;
 			datastr = cget(&hook, NULL);
 			data_ptr = (u_int8_t *)malloc(data_bytes);
 			if (data_ptr == NULL) {
 				warnx("can't malloc memory for data_ptr");
 				error = 1;
 				goto scsicmd_bailout;
 			}
 			bzero(data_ptr, data_bytes);
 			/*
 			 * A format of "-" means the data to be written is
 			 * read raw from stdin (done after option parsing).
 			 */
 			if ((datastr != NULL)
 			 && (datastr[0] == '-'))
 				fd_data = 1;
 			else
 				buff_encode_visit(data_ptr, data_bytes, datastr,
 						  iget, &hook);
 			optind += hook.got;
 			break;
 		case 'r':
 			need_res = 1;
 			hook.argc = argc - optind;
 			hook.argv = argv + optind;
 			hook.got = 0;
 			resstr = cget(&hook, NULL);
 			if ((resstr != NULL) && (resstr[0] == '-'))
 				fd_res = 1;
 			optind += hook.got;
 			break;
 		default:
 			break;
 		}
 	}
 
 	/*
 	 * If fd_data is set, and we're writing to the device, we need to
 	 * read the data the user wants written from stdin.
 	 */
 	if ((fd_data == 1) && (arglist & CAM_ARG_CMD_OUT)) {
 		ssize_t amt_read;
 		int amt_to_read = data_bytes;
 		u_int8_t *buf_ptr = data_ptr;
 
 		while (amt_to_read > 0) {
 			amt_read = read(STDIN_FILENO, buf_ptr, amt_to_read);
 			if (amt_read == -1) {
 				warn("error reading data from stdin");
 				error = 1;
 				goto scsicmd_bailout;
 			}
 			/*
 			 * read(2) returns 0 at end of file.  Without this
 			 * check the loop would spin forever when stdin
 			 * supplies fewer bytes than requested.
 			 */
 			if (amt_read == 0) {
 				warnx("only read %d bytes out of %d from stdin",
 				      data_bytes - amt_to_read, data_bytes);
 				error = 1;
 				goto scsicmd_bailout;
 			}
 			amt_to_read -= amt_read;
 			buf_ptr += amt_read;
 		}
 	}
 
 	if (arglist & CAM_ARG_ERR_RECOVER)
 		flags |= CAM_PASS_ERR_RECOVER;
 
 	/* Disable freezing the device queue */
 	flags |= CAM_DEV_QFRZDIS;
 
 	if (cdb_len) {
 		/*
 		 * This is taken from the SCSI-3 draft spec.
 		 * (T10/1157D revision 0.3)
 		 * The top 3 bits of an opcode are the group code.
 		 * The next 5 bits are the command code.
 		 * Group 0:  six byte commands
 		 * Group 1:  ten byte commands
 		 * Group 2:  ten byte commands
 		 * Group 3:  reserved
 		 * Group 4:  sixteen byte commands
 		 * Group 5:  twelve byte commands
 		 * Group 6:  vendor specific
 		 * Group 7:  vendor specific
 		 */
 		switch((cdb[0] >> 5) & 0x7) {
 			case 0:
 				cdb_len = 6;
 				break;
 			case 1:
 			case 2:
 				cdb_len = 10;
 				break;
 			case 3:
 			case 6:
 			case 7:
 				/* computed by buff_encode_visit */
 				break;
 			case 4:
 				cdb_len = 16;
 				break;
 			case 5:
 				cdb_len = 12;
 				break;
 		}
 
 		/*
 		 * We should probably use csio_build_visit or something like that
 		 * here, but it's easier to encode arguments as you go.  The
 		 * alternative would be skipping the CDB argument and then encoding
 		 * it here, since we've got the data buffer argument by now.
 		 */
 		bcopy(cdb, &ccb->csio.cdb_io.cdb_bytes, cdb_len);
 
 		cam_fill_csio(&ccb->csio,
 		      /*retries*/ retry_count,
 		      /*cbfcnp*/ NULL,
 		      /*flags*/ flags,
 		      /*tag_action*/ task_attr,
 		      /*data_ptr*/ data_ptr,
 		      /*dxfer_len*/ data_bytes,
 		      /*sense_len*/ SSD_FULL_SIZE,
 		      /*cdb_len*/ cdb_len,
 		      /*timeout*/ timeout ? timeout : 5000);
 	} else {
 		/* No CDB was given: send the 12-byte ATA command block. */
 		atacmd_len = 12;
 		bcopy(atacmd, &ccb->ataio.cmd.command, atacmd_len);
 		if (need_res)
 			ccb->ataio.cmd.flags |= CAM_ATAIO_NEEDRESULT;
 		if (dmacmd)
 			ccb->ataio.cmd.flags |= CAM_ATAIO_DMA;
 		if (fpdmacmd)
 			ccb->ataio.cmd.flags |= CAM_ATAIO_FPDMA;
 
 		cam_fill_ataio(&ccb->ataio,
 		      /*retries*/ retry_count,
 		      /*cbfcnp*/ NULL,
 		      /*flags*/ flags,
 		      /*tag_action*/ 0,
 		      /*data_ptr*/ data_ptr,
 		      /*dxfer_len*/ data_bytes,
 		      /*timeout*/ timeout ? timeout : 5000);
 	}
 
 	if (((retval = cam_send_ccb(device, ccb)) < 0)
 	 || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) {
 		const char warnstr[] = "error sending command";
 
 		/* Pass the message as an argument, never as the format. */
 		if (retval < 0)
 			warn("%s", warnstr);
 		else
 			warnx("%s", warnstr);
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 
 		error = 1;
 		goto scsicmd_bailout;
 	}
 
 	/* Report the ATA result registers if they were requested. */
 	if (atacmd_len && need_res) {
 		if (fd_res == 0) {
 			buff_decode_visit(&ccb->ataio.res.status, 11, resstr,
 					  arg_put, NULL);
 			fprintf(stdout, "\n");
 		} else {
 			fprintf(stdout,
 			    "%02X %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X\n",
 			    ccb->ataio.res.status,
 			    ccb->ataio.res.error,
 			    ccb->ataio.res.lba_low,
 			    ccb->ataio.res.lba_mid,
 			    ccb->ataio.res.lba_high,
 			    ccb->ataio.res.device,
 			    ccb->ataio.res.lba_low_exp,
 			    ccb->ataio.res.lba_mid_exp,
 			    ccb->ataio.res.lba_high_exp,
 			    ccb->ataio.res.sector_count,
 			    ccb->ataio.res.sector_count_exp);
 			fflush(stdout);
 		}
 	}
 
 	/* Number of bytes actually transferred: request minus residual. */
 	if (cdb_len)
 		valid_bytes = ccb->csio.dxfer_len - ccb->csio.resid;
 	else
 		valid_bytes = ccb->ataio.dxfer_len - ccb->ataio.resid;
 	if (((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP)
 	 && (arglist & CAM_ARG_CMD_IN)
 	 && (valid_bytes > 0)) {
 		if (fd_data == 0) {
 			buff_decode_visit(data_ptr, valid_bytes, datastr,
 					  arg_put, NULL);
 			fprintf(stdout, "\n");
 		} else {
 			ssize_t amt_written;
 			int amt_to_write = valid_bytes;
 			u_int8_t *buf_ptr = data_ptr;
 
 			for (amt_written = 0; (amt_to_write > 0) &&
 			     (amt_written =write(1, buf_ptr,amt_to_write))> 0;){
 				amt_to_write -= amt_written;
 				buf_ptr += amt_written;
 			}
 			if (amt_written == -1) {
 				warn("error writing data to stdout");
 				error = 1;
 				goto scsicmd_bailout;
 			} else if ((amt_written == 0)
 				&& (amt_to_write > 0)) {
 				/* Both values are int, so print with %d. */
 				warnx("only wrote %d bytes out of %d",
 				      valid_bytes - amt_to_write, valid_bytes);
 			}
 		}
 	}
 
 scsicmd_bailout:
 
 	if ((data_bytes > 0) && (data_ptr != NULL))
 		free(data_ptr);
 
 	cam_freeccb(ccb);
 
 	return (error);
 }
 
 /*
  * Turn CAM debugging on or off by sending an XPT_DEBUG CCB through the
  * transport layer device (XPT_DEVICE).
  *
  * Options select the debug categories: -I info, -P periph, -S subtrace,
  * -T trace, -X xpt, -c CDB, -p probe.  The first non-option argument is
  * either a string starting with "off" (disable debugging) or a target
  * specification handed to parse_btl().
  *
  * Returns 0 on success and 1 on any failure.
  */
 static int
 camdebug(int argc, char **argv, char *combinedopt)
 {
 	int c, fd;
 	path_id_t bus = CAM_BUS_WILDCARD;
 	target_id_t target = CAM_TARGET_WILDCARD;
 	lun_id_t lun = CAM_LUN_WILDCARD;
 	char *tstr;
 	union ccb ccb;
 	int error = 0, rv;
 
 	bzero(&ccb, sizeof(union ccb));
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch(c) {
 		case 'I':
 			arglist |= CAM_ARG_DEBUG_INFO;
 			ccb.cdbg.flags |= CAM_DEBUG_INFO;
 			break;
 		case 'P':
 			arglist |= CAM_ARG_DEBUG_PERIPH;
 			ccb.cdbg.flags |= CAM_DEBUG_PERIPH;
 			break;
 		case 'S':
 			arglist |= CAM_ARG_DEBUG_SUBTRACE;
 			ccb.cdbg.flags |= CAM_DEBUG_SUBTRACE;
 			break;
 		case 'T':
 			arglist |= CAM_ARG_DEBUG_TRACE;
 			ccb.cdbg.flags |= CAM_DEBUG_TRACE;
 			break;
 		case 'X':
 			arglist |= CAM_ARG_DEBUG_XPT;
 			ccb.cdbg.flags |= CAM_DEBUG_XPT;
 			break;
 		case 'c':
 			arglist |= CAM_ARG_DEBUG_CDB;
 			ccb.cdbg.flags |= CAM_DEBUG_CDB;
 			break;
 		case 'p':
 			arglist |= CAM_ARG_DEBUG_PROBE;
 			ccb.cdbg.flags |= CAM_DEBUG_PROBE;
 			break;
 		default:
 			break;
 		}
 	}
 
 	argc -= optind;
 	argv += optind;
 
 	if (argc <= 0) {
 		warnx("you must specify \"off\", \"all\" or a bus,");
 		warnx("bus:target, bus:target:lun or periph");
 		return (1);
 	}
 
 	tstr = *argv;
 	/*
 	 * The <ctype.h> functions require a value representable as
 	 * unsigned char; plain char may be negative.  isspace() is false
 	 * for '\0', so the loop stops at the end of the string.
 	 */
 	while (isspace((unsigned char)*tstr))
 		tstr++;
 
 	/* Only the first three characters are compared. */
 	if (strncmp(tstr, "off", 3) == 0) {
 		ccb.cdbg.flags = CAM_DEBUG_NONE;
 		/*
 		 * Clear every debug category bit that the option loop
 		 * above can set, including the CDB bit.
 		 */
 		arglist &= ~(CAM_ARG_DEBUG_INFO|CAM_ARG_DEBUG_PERIPH|
 			     CAM_ARG_DEBUG_TRACE|CAM_ARG_DEBUG_SUBTRACE|
 			     CAM_ARG_DEBUG_XPT|CAM_ARG_DEBUG_CDB|
 			     CAM_ARG_DEBUG_PROBE);
 	} else {
 		rv = parse_btl(tstr, &bus, &target, &lun, &arglist);
 		if (rv < 1) {
 			warnx("you must specify \"all\", \"off\", or a bus,");
 			warnx("bus:target, bus:target:lun or periph to debug");
 			return (1);
 		}
 	}
 
 	if ((fd = open(XPT_DEVICE, O_RDWR)) < 0) {
 		warnx("error opening transport layer device %s", XPT_DEVICE);
 		warn("%s", XPT_DEVICE);
 		return (1);
 	}
 
 	ccb.ccb_h.func_code = XPT_DEBUG;
 	ccb.ccb_h.path_id = bus;
 	ccb.ccb_h.target_id = target;
 	ccb.ccb_h.target_lun = lun;
 
 	if (ioctl(fd, CAMIOCOMMAND, &ccb) == -1) {
 		warn("CAMIOCOMMAND ioctl failed");
 		error = 1;
 	} else {
 		if ((ccb.ccb_h.status & CAM_STATUS_MASK) ==
 		     CAM_FUNC_NOTAVAIL) {
 			warnx("CAM debugging not available");
 			warnx("you need to put options CAMDEBUG in"
 			      " your kernel config file!");
 			error = 1;
 		} else if ((ccb.ccb_h.status & CAM_STATUS_MASK) !=
 			    CAM_REQ_CMP) {
 			warnx("XPT_DEBUG CCB failed with status %#x",
 			      ccb.ccb_h.status);
 			error = 1;
 		} else {
 			if (ccb.cdbg.flags == CAM_DEBUG_NONE) {
 				fprintf(stderr,
 					"Debugging turned off\n");
 			} else {
 				fprintf(stderr,
 					"Debugging enabled for "
 					"%d:%d:%jx\n",
 					bus, target, (uintmax_t)lun);
 			}
 		}
 	}
 	close(fd);
 
 	return (error);
 }
 
 /*
  * Optionally adjust the number of tagged openings of a device (-N count)
  * and then report its queue statistics.
  *
  * With -N, an XPT_REL_SIMQ CCB carrying RELSIM_ADJUST_OPENINGS is sent
  * first.  Afterwards an XPT_GDEV_STATS CCB is sent and its contents are
  * printed: all counters in verbose mode, otherwise the sum of
  * dev_openings and dev_active.  -q suppresses the descriptive prefixes.
  *
  * Returns 0 on success and 1 on any failure.
  */
 static int
 tagcontrol(struct cam_device *device, int argc, char **argv,
 	   char *combinedopt)
 {
 	char pathstr[1024];
 	union ccb *ccb;
 	int opt;
 	int numtags = -1;
 	int quiet = 0;
 	/* Assume failure; flipped to 0 only once everything succeeded. */
 	int retval = 1;
 
 	if ((ccb = cam_getccb(device)) == NULL) {
 		warnx("tagcontrol: error allocating ccb");
 		return (1);
 	}
 
 	while ((opt = getopt(argc, argv, combinedopt)) != -1) {
 		if (opt == 'N') {
 			numtags = strtol(optarg, NULL, 0);
 			if (numtags < 0) {
 				warnx("tag count %d is < 0", numtags);
 				goto tagcontrol_bailout;
 			}
 		} else if (opt == 'q') {
 			quiet++;
 		}
 	}
 
 	cam_path_string(device, pathstr, sizeof(pathstr));
 
 	/* Step 1: change the number of openings, if asked to. */
 	if (numtags >= 0) {
 		CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->crs);
 		ccb->ccb_h.func_code = XPT_REL_SIMQ;
 		ccb->ccb_h.flags = CAM_DEV_QFREEZE;
 		ccb->crs.release_flags = RELSIM_ADJUST_OPENINGS;
 		ccb->crs.openings = numtags;
 
 		if (cam_send_ccb(device, ccb) < 0) {
 			perror("error sending XPT_REL_SIMQ CCB");
 			goto tagcontrol_bailout;
 		}
 
 		if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 			warnx("XPT_REL_SIMQ CCB failed");
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 			goto tagcontrol_bailout;
 		}
 
 		if (quiet == 0)
 			printf("%stagged openings now %d\n",
 			       pathstr, ccb->crs.openings);
 	}
 
 	/* Step 2: fetch the device statistics. */
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cgds);
 	ccb->ccb_h.func_code = XPT_GDEV_STATS;
 
 	if (cam_send_ccb(device, ccb) < 0) {
 		perror("error sending XPT_GDEV_STATS CCB");
 		goto tagcontrol_bailout;
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		warnx("XPT_GDEV_STATS CCB failed");
 		cam_error_print(device, ccb, CAM_ESF_ALL,
 				CAM_EPF_ALL, stderr);
 		goto tagcontrol_bailout;
 	}
 
 	/* Step 3: print them. */
 	if ((arglist & CAM_ARG_VERBOSE) == 0) {
 		if (quiet == 0) {
 			printf("%s", pathstr);
 			printf("device openings: ");
 		}
 		printf("%d\n", ccb->cgds.dev_openings +
 		       ccb->cgds.dev_active);
 	} else {
 		printf("%s", pathstr);
 		printf("dev_openings  %d\n", ccb->cgds.dev_openings);
 		printf("%s", pathstr);
 		printf("dev_active    %d\n", ccb->cgds.dev_active);
 		printf("%s", pathstr);
 		printf("allocated     %d\n", ccb->cgds.allocated);
 		printf("%s", pathstr);
 		printf("queued        %d\n", ccb->cgds.queued);
 		printf("%s", pathstr);
 		printf("held          %d\n", ccb->cgds.held);
 		printf("%s", pathstr);
 		printf("mintags       %d\n", ccb->cgds.mintags);
 		printf("%s", pathstr);
 		printf("maxtags       %d\n", ccb->cgds.maxtags);
 	}
 
 	retval = 0;
 
 tagcontrol_bailout:
 	cam_freeccb(ccb);
 	return (retval);
 }
 
 static void
 cts_print(struct cam_device *device, struct ccb_trans_settings *cts)
 {
 	char pathstr[1024];
 
 	cam_path_string(device, pathstr, sizeof(pathstr));
 
 	if (cts->transport == XPORT_SPI) {
 		struct ccb_trans_settings_spi *spi =
 		    &cts->xport_specific.spi;
 
 		if ((spi->valid & CTS_SPI_VALID_SYNC_RATE) != 0) {
 
 			fprintf(stdout, "%ssync parameter: %d\n", pathstr,
 				spi->sync_period);
 
 			if (spi->sync_offset != 0) {
 				u_int freq;
 
 				freq = scsi_calc_syncsrate(spi->sync_period);
 				fprintf(stdout, "%sfrequency: %d.%03dMHz\n",
 					pathstr, freq / 1000, freq % 1000);
 			}
 		}
 
 		if (spi->valid & CTS_SPI_VALID_SYNC_OFFSET) {
 			fprintf(stdout, "%soffset: %d\n", pathstr,
 			    spi->sync_offset);
 		}
 
 		if (spi->valid & CTS_SPI_VALID_BUS_WIDTH) {
 			fprintf(stdout, "%sbus width: %d bits\n", pathstr,
 				(0x01 << spi->bus_width) * 8);
 		}
 
 		if (spi->valid & CTS_SPI_VALID_DISC) {
 			fprintf(stdout, "%sdisconnection is %s\n", pathstr,
 				(spi->flags & CTS_SPI_FLAGS_DISC_ENB) ?
 				"enabled" : "disabled");
 		}
 	}
 	if (cts->transport == XPORT_FC) {
 		struct ccb_trans_settings_fc *fc =
 		    &cts->xport_specific.fc;
 
 		if (fc->valid & CTS_FC_VALID_WWNN)
 			fprintf(stdout, "%sWWNN: 0x%llx\n", pathstr,
 			    (long long) fc->wwnn);
 		if (fc->valid & CTS_FC_VALID_WWPN)
 			fprintf(stdout, "%sWWPN: 0x%llx\n", pathstr,
 			    (long long) fc->wwpn);
 		if (fc->valid & CTS_FC_VALID_PORT)
 			fprintf(stdout, "%sPortID: 0x%x\n", pathstr, fc->port);
 		if (fc->valid & CTS_FC_VALID_SPEED)
 			fprintf(stdout, "%stransfer speed: %d.%03dMB/s\n",
 			    pathstr, fc->bitrate / 1000, fc->bitrate % 1000);
 	}
 	if (cts->transport == XPORT_SAS) {
 		struct ccb_trans_settings_sas *sas =
 		    &cts->xport_specific.sas;
 
 		if (sas->valid & CTS_SAS_VALID_SPEED)
 			fprintf(stdout, "%stransfer speed: %d.%03dMB/s\n",
 			    pathstr, sas->bitrate / 1000, sas->bitrate % 1000);
 	}
 	if (cts->transport == XPORT_ATA) {
 		struct ccb_trans_settings_pata *pata =
 		    &cts->xport_specific.ata;
 
 		if ((pata->valid & CTS_ATA_VALID_MODE) != 0) {
 			fprintf(stdout, "%sATA mode: %s\n", pathstr,
 				ata_mode2string(pata->mode));
 		}
 		if ((pata->valid & CTS_ATA_VALID_ATAPI) != 0) {
 			fprintf(stdout, "%sATAPI packet length: %d\n", pathstr,
 				pata->atapi);
 		}
 		if ((pata->valid & CTS_ATA_VALID_BYTECOUNT) != 0) {
 			fprintf(stdout, "%sPIO transaction length: %d\n",
 				pathstr, pata->bytecount);
 		}
 	}
 	if (cts->transport == XPORT_SATA) {
 		struct ccb_trans_settings_sata *sata =
 		    &cts->xport_specific.sata;
 
 		if ((sata->valid & CTS_SATA_VALID_REVISION) != 0) {
 			fprintf(stdout, "%sSATA revision: %d.x\n", pathstr,
 				sata->revision);
 		}
 		if ((sata->valid & CTS_SATA_VALID_MODE) != 0) {
 			fprintf(stdout, "%sATA mode: %s\n", pathstr,
 				ata_mode2string(sata->mode));
 		}
 		if ((sata->valid & CTS_SATA_VALID_ATAPI) != 0) {
 			fprintf(stdout, "%sATAPI packet length: %d\n", pathstr,
 				sata->atapi);
 		}
 		if ((sata->valid & CTS_SATA_VALID_BYTECOUNT) != 0) {
 			fprintf(stdout, "%sPIO transaction length: %d\n",
 				pathstr, sata->bytecount);
 		}
 		if ((sata->valid & CTS_SATA_VALID_PM) != 0) {
 			fprintf(stdout, "%sPMP presence: %d\n", pathstr,
 				sata->pm_present);
 		}
 		if ((sata->valid & CTS_SATA_VALID_TAGS) != 0) {
 			fprintf(stdout, "%sNumber of tags: %d\n", pathstr,
 				sata->tags);
 		}
 		if ((sata->valid & CTS_SATA_VALID_CAPS) != 0) {
 			fprintf(stdout, "%sSATA capabilities: %08x\n", pathstr,
 				sata->caps);
 		}
 	}
 	if (cts->protocol == PROTO_ATA) {
 		struct ccb_trans_settings_ata *ata=
 		    &cts->proto_specific.ata;
 
 		if (ata->valid & CTS_ATA_VALID_TQ) {
 			fprintf(stdout, "%stagged queueing: %s\n", pathstr,
 				(ata->flags & CTS_ATA_FLAGS_TAG_ENB) ?
 				"enabled" : "disabled");
 		}
 	}
 	if (cts->protocol == PROTO_SCSI) {
 		struct ccb_trans_settings_scsi *scsi=
 		    &cts->proto_specific.scsi;
 
 		if (scsi->valid & CTS_SCSI_VALID_TQ) {
 			fprintf(stdout, "%stagged queueing: %s\n", pathstr,
 				(scsi->flags & CTS_SCSI_FLAGS_TAG_ENB) ?
 				"enabled" : "disabled");
 		}
 	}
 #ifdef WITH_NVME
 	if (cts->protocol == PROTO_NVME) {
 		struct ccb_trans_settings_nvme *nvmex =
 		    &cts->xport_specific.nvme;
 
 		if (nvmex->valid & CTS_NVME_VALID_SPEC) {
 			fprintf(stdout, "%sNVMe Spec: %d.%d\n", pathstr,
 			    NVME_MAJOR(nvmex->spec),
 			    NVME_MINOR(nvmex->spec));
 		}
 		if (nvmex->valid & CTS_NVME_VALID_LINK) {
 			fprintf(stdout, "%sPCIe lanes: %d (%d max)\n", pathstr,
 			    nvmex->lanes, nvmex->max_lanes);
 			fprintf(stdout, "%sPCIe Generation: %d (%d max)\n", pathstr,
 			    nvmex->speed, nvmex->max_speed);
 		}
 	}
 #endif
 }
 
 /*
  * Fetch the path inquiry data for a device.
  *
  * An XPT_PATH_INQ CCB is sent to the device and, on success, its contents
  * are copied into *cpi.  Returns 0 on success and 1 on failure; on
  * failure *cpi is left untouched.
  */
 static int
 get_cpi(struct cam_device *device, struct ccb_pathinq *cpi)
 {
 	union ccb *ccb;
 	int failed;
 
 	if ((ccb = cam_getccb(device)) == NULL) {
 		warnx("get_cpi: couldn't allocate CCB");
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cpi);
 	ccb->ccb_h.func_code = XPT_PATH_INQ;
 
 	if (cam_send_ccb(device, ccb) < 0) {
 		/* Only a failed send is reported unconditionally. */
 		warn("get_cpi: error sending Path Inquiry CCB");
 		failed = 1;
 	} else {
 		failed =
 		    (ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP;
 	}
 
 	if (failed) {
 		if (arglist & CAM_ARG_VERBOSE)
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 	} else {
 		bcopy(&ccb->cpi, cpi, sizeof(struct ccb_pathinq));
 	}
 
 	cam_freeccb(ccb);
 	return (failed ? 1 : 0);
 }
 
 /*
  * Fetch the "get device" data for a device.
  *
  * An XPT_GDEV_TYPE CCB is sent to the device and, on success, its
  * contents are copied into *cgd.  Returns 0 on success and 1 on failure;
  * on failure *cgd is left untouched.
  */
 static int
 get_cgd(struct cam_device *device, struct ccb_getdev *cgd)
 {
 	union ccb *ccb;
 	int retval = 0;
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("get_cgd: couldn't allocate CCB");
 		return (1);
 	}
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cgd);
 	ccb->ccb_h.func_code = XPT_GDEV_TYPE;
 	if (cam_send_ccb(device, ccb) < 0) {
 		/*
 		 * This is an XPT_GDEV_TYPE request, not a path inquiry;
 		 * name the right CCB so the message is not misleading.
 		 */
 		warn("get_cgd: error sending Get type information CCB");
 		if (arglist & CAM_ARG_VERBOSE)
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		retval = 1;
 		goto get_cgd_bailout;
 	}
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		if (arglist & CAM_ARG_VERBOSE)
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		retval = 1;
 		goto get_cgd_bailout;
 	}
 	bcopy(&ccb->cgd, cgd, sizeof(struct ccb_getdev));
 
 get_cgd_bailout:
 	cam_freeccb(ccb);
 	return (retval);
 }
 
 /*
  * Check whether a device lists a given VPD page in its Supported VPD
  * Pages page.
  *
  * dev         - device to query
  * page_id     - VPD page code to look for
  * retry_count - retries for the INQUIRY; a non-zero value also enables
  *               CAM_PASS_ERR_RECOVER
  * timeout     - command timeout; 0 selects the default of 5000
  * verbosemode - non-zero to print CAM error details when the command
  *               completes with an error status
  *
  * Returns 1 if the device has the VPD page, 0 if it does not, and -1 on an
  * error.
  */
 int
 dev_has_vpd_page(struct cam_device *dev, uint8_t page_id, int retry_count,
 		 int timeout, int verbosemode)
 {
 	union ccb *ccb = NULL;
 	struct scsi_vpd_supported_page_list sup_pages;
 	size_t i, num_pages;
 	int retval = 0;
 
 	ccb = cam_getccb(dev);
 	if (ccb == NULL) {
 		warn("Unable to allocate CCB");
 		retval = -1;
 		goto bailout;
 	}
 
 	/* cam_getccb cleans up the header, caller has to zero the payload */
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	bzero(&sup_pages, sizeof(sup_pages));
 
 	scsi_inquiry(&ccb->csio,
 		     /*retries*/ retry_count,
 		     /*cbfcnp*/ NULL,
 		     /* tag_action */ MSG_SIMPLE_Q_TAG,
 		     /* inq_buf */ (u_int8_t *)&sup_pages,
 		     /* inq_len */ sizeof(sup_pages),
 		     /* evpd */ 1,
 		     /* page_code */ SVPD_SUPPORTED_PAGE_LIST,
 		     /* sense_len */ SSD_FULL_SIZE,
 		     /* timeout */ timeout ? timeout : 5000);
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (retry_count != 0)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (cam_send_ccb(dev, ccb) < 0) {
 		/* The CCB is released at the bailout label. */
 		retval = -1;
 		goto bailout;
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		if (verbosemode != 0)
 			cam_error_print(dev, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		retval = -1;
 		goto bailout;
 	}
 
 	/*
 	 * The length field comes from the device.  Never trust it beyond
 	 * the capacity of the local list array, or the scan below could
 	 * read past the end of sup_pages.
 	 */
 	num_pages = sup_pages.length;
 	if (num_pages > sizeof(sup_pages.list) / sizeof(sup_pages.list[0]))
 		num_pages = sizeof(sup_pages.list) / sizeof(sup_pages.list[0]);
 
 	for (i = 0; i < num_pages; i++) {
 		if (sup_pages.list[i] == page_id) {
 			retval = 1;
 			goto bailout;
 		}
 	}
 bailout:
 	if (ccb != NULL)
 		cam_freeccb(ccb);
 
 	return (retval);
 }
 
 /*
  * Classify a device and store the result in *devtype.
  *
  * The protocol reported by get_cgd() decides the type directly for
  * everything except SCSI.  A SCSI device is further classified as either
  * plain SCSI or an ATA device behind a SCSI to ATA translation layer
  * (SATL).
  *
  * Returns 0 for success, non-zero for failure.  When get_cgd() fails,
  * *devtype is not written.
  */
 int
 get_device_type(struct cam_device *dev, int retry_count, int timeout,
 		    int verbosemode, camcontrol_devtype *devtype)
 {
 	struct ccb_getdev cgd;
 	int is_satl;
 	int rc;
 
 	rc = get_cgd(dev, &cgd);
 	if (rc != 0)
 		return (rc);
 
 	switch (cgd.protocol) {
 	case PROTO_SCSI:
 		/* Needs the SATL check below. */
 		break;
 	case PROTO_ATA:
 	case PROTO_ATAPI:
 	case PROTO_SATAPM:
 		*devtype = CC_DT_ATA;
 		return (0);
 	case PROTO_NVME:
 		*devtype = CC_DT_NVME;
 		return (0);
 	case PROTO_MMCSD:
 		*devtype = CC_DT_MMCSD;
 		return (0);
 	default:
 		*devtype = CC_DT_UNKNOWN;
 		return (0);
 	}
 
 	if (retry_count == -1) {
 		/*
 		 * For a retry count of -1, use only the cached data to avoid
 		 * I/O to the drive.  Sending the identify command to the
 		 * drive can cause issues for SATL attached drives since
 		 * identify is not an NCQ command.
 		 */
 		is_satl = (cgd.ident_data.config != 0);
 	} else {
 		/*
 		 * Check for the ATA Information VPD page (0x89).  If this is
 		 * an ATA device behind a SATL, this VPD page should be
 		 * present.
 		 *
 		 * If that VPD page isn't present, or we get an error back
 		 * from the INQUIRY command, we'll just treat it as a normal
 		 * SCSI device.
 		 */
 		is_satl = (dev_has_vpd_page(dev, SVPD_ATA_INFORMATION,
 		    retry_count, timeout, verbosemode) == 1);
 	}
 
 	*devtype = is_satl ? CC_DT_SATL : CC_DT_SCSI;
 	return (0);
 }
 
 /*
  * Fill in a CCB with an ATA command, using the form that matches the
  * device type.
  *
  * For CC_DT_ATA devices a native ATA I/O CCB is built with
  * cam_fill_ataio() and the 28-bit or 48-bit command helpers.  For every
  * other device type the command is wrapped by scsi_ata_pass(), using
  * cdb_storage/cdb_storage_len and sense_len, which the native path does
  * not use.
  *
  * The 48-bit/extended form is chosen when is48bit is set or when lba
  * exceeds ATA_MAX_28BIT_LBA.
  *
  * Returns 0 for the native ATA path, otherwise the return value of
  * scsi_ata_pass().
  */
 int
 build_ata_cmd(union ccb *ccb, uint32_t retry_count, uint32_t flags,
     uint8_t tag_action, uint8_t protocol, uint8_t ata_flags, uint16_t features,
     uint16_t sector_count, uint64_t lba, uint8_t command, uint32_t auxiliary,
     uint8_t *data_ptr, uint32_t dxfer_len, uint8_t *cdb_storage,
     size_t cdb_storage_len, uint8_t sense_len, uint32_t timeout,
     int is48bit, camcontrol_devtype devtype)
 {
 	const int extended = (is48bit || lba > ATA_MAX_28BIT_LBA);
 
 	/* Non-ATA devices get the command through ATA PASS-THROUGH. */
 	if (devtype != CC_DT_ATA) {
 		if (extended)
 			protocol |= AP_EXTEND;
 
 		return (scsi_ata_pass(&ccb->csio,
 		    /*retries*/ retry_count,
 		    /*cbfcnp*/ NULL,
 		    /*flags*/ flags,
 		    /*tag_action*/ tag_action,
 		    /*protocol*/ protocol,
 		    /*ata_flags*/ ata_flags,
 		    /*features*/ features,
 		    /*sector_count*/ sector_count,
 		    /*lba*/ lba,
 		    /*command*/ command,
 		    /*device*/ 0,
 		    /*icc*/ 0,
 		    /*auxiliary*/ auxiliary,
 		    /*control*/ 0,
 		    /*data_ptr*/ data_ptr,
 		    /*dxfer_len*/ dxfer_len,
 		    /*cdb_storage*/ cdb_storage,
 		    /*cdb_storage_len*/ cdb_storage_len,
 		    /*minimum_cmd_size*/ 0,
 		    /*sense_len*/ sense_len,
 		    /*timeout*/ timeout));
 	}
 
 	/* Native ATA device. */
 	cam_fill_ataio(&ccb->ataio,
 	    /*retries*/ retry_count,
 	    /*cbfcnp*/ NULL,
 	    /*flags*/ flags,
 	    /*tag_action*/ tag_action,
 	    /*data_ptr*/ data_ptr,
 	    /*dxfer_len*/ dxfer_len,
 	    /*timeout*/ timeout);
 
 	if (extended)
 		ata_48bit_cmd(&ccb->ataio, command, features, lba,
 		    sector_count);
 	else
 		ata_28bit_cmd(&ccb->ataio, command, features, lba,
 		    sector_count);
 
 	if (auxiliary != 0) {
 		ccb->ataio.ata_flags |= ATA_FLAG_AUX;
 		ccb->ataio.aux = auxiliary;
 	}
 
 	/* The check-condition flag asks for the result registers. */
 	if (ata_flags & AP_FLAG_CHK_COND)
 		ccb->ataio.cmd.flags |= CAM_ATAIO_NEEDRESULT;
 
 	switch (protocol & AP_PROTO_MASK) {
 	case AP_PROTO_DMA:
 		ccb->ataio.cmd.flags |= CAM_ATAIO_DMA;
 		break;
 	case AP_PROTO_FPDMA:
 		ccb->ataio.cmd.flags |= CAM_ATAIO_FPDMA;
 		break;
 	default:
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Extract the ATA status registers from a completed CCB.
  *
  * Two kinds of CCB are understood:
  *  - XPT_SCSI_IO carrying an ATA PASS-THROUGH (12 or 16 byte) command:
  *    the values are taken from the ATA return descriptor in
  *    descriptor-format sense data.  Fixed-format sense data is not
  *    supported.
  *  - XPT_ATA_IO: the values are taken from the result register set.
  *
  * On success *error, *count, *lba, *device and *status are filled in
  * and 0 is returned.  Any failure (including an unsupported CCB type or
  * opcode) returns 1.
  */
 int
 get_ata_status(struct cam_device *dev, union ccb *ccb, uint8_t *error,
 	       uint16_t *count, uint64_t *lba, uint8_t *device, uint8_t *status)
 {
 	switch (ccb->ccb_h.func_code) {
 	case XPT_SCSI_IO: {
 		struct scsi_sense_data_desc *sense;
 		struct scsi_sense_ata_ret_desc *desc;
 		uint8_t *desc_ptr;
 		uint8_t opcode;
 		int error_code = 0, sense_key = 0, asc = 0, ascq = 0;
 
 		/*
 		 * Only ATA PASS-THROUGH commands carry ATA status; find the
 		 * opcode wherever the CDB is stored.
 		 */
 		opcode = (ccb->ccb_h.flags & CAM_CDB_POINTER) ?
 		    ccb->csio.cdb_io.cdb_ptr[0] :
 		    ccb->csio.cdb_io.cdb_bytes[0];
 		if (opcode != ATA_PASS_12 && opcode != ATA_PASS_16) {
 			warnx("%s: unsupported opcode %02x", __func__, opcode);
 			return (1);
 		}
 
 		/* Note: the _ccb() variant returns 0 for an error */
 		if (scsi_extract_sense_ccb(ccb, &error_code, &sense_key,
 		    &asc, &ascq) == 0)
 			return (1);
 
 		if (error_code == SSD_CURRENT_ERROR ||
 		    error_code == SSD_DEFERRED_ERROR) {
 			/*
 			 * XXX KDM need to support fixed sense data.
 			 */
 			warnx("%s: Fixed sense data not supported yet",
 			    __func__);
 			return (1);
 		}
 		if (error_code != SSD_DESC_CURRENT_ERROR &&
 		    error_code != SSD_DESC_DEFERRED_ERROR)
 			return (1);
 
 		/* Descriptor sense: look for the ATA return descriptor. */
 		sense = (struct scsi_sense_data_desc *)
 		    &ccb->csio.sense_data;
 		desc_ptr = scsi_find_desc(sense, ccb->csio.sense_len -
 		    ccb->csio.sense_resid, SSD_DESC_ATA);
 		if (desc_ptr == NULL) {
 			cam_error_print(dev, ccb, CAM_ESF_ALL,
 			    CAM_EPF_ALL, stderr);
 			return (1);
 		}
 		desc = (struct scsi_sense_ata_ret_desc *)desc_ptr;
 
 		*error = desc->error;
 		*count = (desc->count_15_8 << 8) |
 			  desc->count_7_0;
 		*lba = ((uint64_t)desc->lba_47_40 << 40) |
 		       ((uint64_t)desc->lba_39_32 << 32) |
 		       ((uint64_t)desc->lba_31_24 << 24) |
 		       (desc->lba_23_16 << 16) |
 		       (desc->lba_15_8  <<  8) |
 			desc->lba_7_0;
 		*device = desc->device;
 		*status = desc->status;
 
 		/*
 		 * If the extend bit isn't set, the result is for a
 		 * 12-byte ATA PASS-THROUGH command or a 16 or 32 byte
 		 * command without the extend bit set.  The device is then
 		 * supposed to return 28-bit status, so keep only the low
 		 * 8 bits of the count and the low 28 bits of the LBA.
 		 */
 		if ((desc->flags & SSD_DESC_ATA_FLAG_EXTEND) == 0) {
 			*count &= 0xff;
 			*lba &= 0x0fffffff;
 		}
 		return (0);
 	}
 	case XPT_ATA_IO: {
 		struct ata_res *res = &ccb->ataio.res;
 
 		/*
 		 * Native ATA command: copy the requested values out of
 		 * the result register set.
 		 */
 		*error = res->error;
 		*status = res->status;
 		*device = res->device;
 		*count = res->sector_count;
 		*lba = (res->lba_high << 16) |
 		       (res->lba_mid << 8) |
 		       (res->lba_low);
 		if (res->flags & CAM_ATAIO_48BIT) {
 			/* 48-bit result: add the "exp" register values. */
 			*count |= (res->sector_count_exp << 8);
 			*lba |= ((uint64_t)res->lba_low_exp << 24) |
 				((uint64_t)res->lba_mid_exp << 32) |
 				((uint64_t)res->lba_high_exp << 40);
 		} else {
 			/* LBA bits 27:24 come from the device register. */
 			*lba |= (res->device & 0xf) << 24;
 		}
 		return (0);
 	}
 	default:
 		return (1);
 	}
 }
 
 /*
  * Print the contents of a path inquiry CCB to stdout, one item per line,
  * each line prefixed with "<dev_name><unit_number>:".
  *
  * The hba_inquiry, hba_misc and target_sprt bit fields are decoded one bit
  * at a time; a set bit that has no case label below is reported as an
  * "unknown ... bit set" line rather than being silently dropped.
  */
 static void
 cpi_print(struct ccb_pathinq *cpi)
 {
 	char adapter_str[1024];
 	uint64_t i;
 
 	snprintf(adapter_str, sizeof(adapter_str),
 		 "%s%d:", cpi->dev_name, cpi->unit_number);
 
 	fprintf(stdout, "%s SIM/HBA version: %d\n", adapter_str,
 		cpi->version_num);
 
 	/* Walk bits 0..7 of the HBA inquiry flags. */
 	for (i = 1; i < UINT8_MAX; i = i << 1) {
 		const char *str;
 
 		if ((i & cpi->hba_inquiry) == 0)
 			continue;
 
 		fprintf(stdout, "%s supports ", adapter_str);
 
 		switch(i) {
 		case PI_MDP_ABLE:
 			str = "MDP message";
 			break;
 		case PI_WIDE_32:
 			str = "32 bit wide SCSI";
 			break;
 		case PI_WIDE_16:
 			str = "16 bit wide SCSI";
 			break;
 		case PI_SDTR_ABLE:
 			str = "SDTR message";
 			break;
 		case PI_LINKED_CDB:
 			str = "linked CDBs";
 			break;
 		case PI_TAG_ABLE:
 			str = "tag queue messages";
 			break;
 		case PI_SOFT_RST:
 			str = "soft reset alternative";
 			break;
 		case PI_SATAPM:
 			str = "SATA Port Multiplier";
 			break;
 		default:
 			str = "unknown PI bit set";
 			break;
 		}
 		fprintf(stdout, "%s\n", str);
 	}
 
 	/* Walk bits 0..31 of the miscellaneous HBA flags. */
 	for (i = 1; i < UINT32_MAX; i = i << 1) {
 		const char *str;
 
 		if ((i & cpi->hba_misc) == 0)
 			continue;
 
 		fprintf(stdout, "%s ", adapter_str);
 
 		switch(i) {
 		case PIM_ATA_EXT:
 			str = "can understand ata_ext requests";
 			break;
 		case PIM_EXTLUNS:
 			str = "64bit extended LUNs supported";
 			break;
 		case PIM_SCANHILO:
 			str = "bus scans from high ID to low ID";
 			break;
 		case PIM_NOREMOVE:
 			str = "removable devices not included in scan";
 			break;
 		case PIM_NOINITIATOR:
 			str = "initiator role not supported";
 			break;
 		case PIM_NOBUSRESET:
 			str = "user has disabled initial BUS RESET or"
 			      " controller is in target/mixed mode";
 			break;
 		case PIM_NO_6_BYTE:
 			str = "do not send 6-byte commands";
 			break;
 		case PIM_SEQSCAN:
 			str = "scan bus sequentially";
 			break;
 		case PIM_UNMAPPED:
 			str = "unmapped I/O supported";
 			break;
 		case PIM_NOSCAN:
 			str = "does its own scanning";
 			break;
 		default:
 			str = "unknown PIM bit set";
 			break;
 		}
 		fprintf(stdout, "%s\n", str);
 	}
 
 	/* Walk bits 0..15 of the target mode support flags. */
 	for (i = 1; i < UINT16_MAX; i = i << 1) {
 		const char *str;
 
 		if ((i & cpi->target_sprt) == 0)
 			continue;
 
 		fprintf(stdout, "%s supports ", adapter_str);
 		switch(i) {
 		case PIT_PROCESSOR:
 			str = "target mode processor mode";
 			break;
 		case PIT_PHASE:
 			str = "target mode phase cog. mode";
 			break;
 		case PIT_DISCONNECT:
 			str = "disconnects in target mode";
 			break;
 		case PIT_TERM_IO:
 			str = "terminate I/O message in target mode";
 			break;
 		case PIT_GRP_6:
 			str = "group 6 commands in target mode";
 			break;
 		case PIT_GRP_7:
 			str = "group 7 commands in target mode";
 			break;
 		default:
 			str = "unknown PIT bit set";
 			break;
 		}
 
 		fprintf(stdout, "%s\n", str);
 	}
 	fprintf(stdout, "%s HBA engine count: %d\n", adapter_str,
 		cpi->hba_eng_cnt);
 	fprintf(stdout, "%s maximum target: %d\n", adapter_str,
 		cpi->max_target);
 	fprintf(stdout, "%s maximum LUN: %d\n", adapter_str,
 		cpi->max_lun);
 	fprintf(stdout, "%s highest path ID in subsystem: %d\n",
 		adapter_str, cpi->hpath_id);
 	fprintf(stdout, "%s initiator ID: %d\n", adapter_str,
 		cpi->initiator_id);
 	fprintf(stdout, "%s SIM vendor: %s\n", adapter_str, cpi->sim_vid);
 	fprintf(stdout, "%s HBA vendor: %s\n", adapter_str, cpi->hba_vid);
 	fprintf(stdout, "%s HBA vendor ID: 0x%04x\n",
 	    adapter_str, cpi->hba_vendor);
 	fprintf(stdout, "%s HBA device ID: 0x%04x\n",
 	    adapter_str, cpi->hba_device);
 	fprintf(stdout, "%s HBA subvendor ID: 0x%04x\n",
 	    adapter_str, cpi->hba_subvendor);
 	fprintf(stdout, "%s HBA subdevice ID: 0x%04x\n",
 	    adapter_str, cpi->hba_subdevice);
 	fprintf(stdout, "%s bus ID: %d\n", adapter_str, cpi->bus_id);
 	fprintf(stdout, "%s base transfer speed: ", adapter_str);
 	/*
 	 * The MB/sec branch divides by 1000, so the field is in KB/sec.
 	 * Values that are not above 1000 are therefore printed unchanged;
 	 * scaling them again would report e.g. 500 as "500000KB/sec" and
 	 * 1000 as "0KB/sec".
 	 */
 	if (cpi->base_transfer_speed > 1000)
 		fprintf(stdout, "%d.%03dMB/sec\n",
 			cpi->base_transfer_speed / 1000,
 			cpi->base_transfer_speed % 1000);
 	else
 		fprintf(stdout, "%dKB/sec\n",
 			cpi->base_transfer_speed);
 	fprintf(stdout, "%s maximum transfer size: %u bytes\n",
 	    adapter_str, cpi->maxio);
 }
 
 /*
  * Issue an XPT_GET_TRAN_SETTINGS request for the given device.
  *
  * user_settings selects CTS_TYPE_USER_SETTINGS when non-zero and
  * CTS_TYPE_CURRENT_SETTINGS otherwise.  Unless quiet is non-zero the
  * settings are printed with cts_print(); if cts is not NULL the whole
  * resulting ccb_trans_settings is copied into it.
  *
  * Returns 0 on success and 1 on any failure.
  */
 static int
 get_print_cts(struct cam_device *device, int user_settings, int quiet,
 	      struct ccb_trans_settings *cts)
 {
 	union ccb *ccb;
 	int failed;
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("get_print_cts: error allocating ccb");
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cts);
 	ccb->ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
 	if (user_settings != 0)
 		ccb->cts.type = CTS_TYPE_USER_SETTINGS;
 	else
 		ccb->cts.type = CTS_TYPE_CURRENT_SETTINGS;
 
 	/* Either a transport failure or a non-CMP status counts as failure. */
 	failed = 1;
 	if (cam_send_ccb(device, ccb) < 0)
 		perror("error sending XPT_GET_TRAN_SETTINGS CCB");
 	else if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
 		warnx("XPT_GET_TRANS_SETTINGS CCB failed");
 	else
 		failed = 0;
 
 	if (failed != 0) {
 		if (arglist & CAM_ARG_VERBOSE)
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 	} else {
 		if (quiet == 0)
 			cts_print(device, &ccb->cts);
 		if (cts != NULL)
 			bcopy(&ccb->cts, cts,
 			    sizeof(struct ccb_trans_settings));
 	}
 
 	cam_freeccb(ccb);
 
 	return (failed);
 }
 
 /*
  * Display and optionally change the transfer settings of a device.
  *
  * Options are parsed from argc/argv with getopt() using combinedopt.
  * The function then:
  *   1. issues XPT_PATH_INQ and keeps a copy of the result in cpi,
  *   2. fetches (and, unless quiet, prints) the current or user settings,
  *   3. if any setting option was given, fills in the matching transport
  *      or protocol specific fields and issues XPT_SET_TRAN_SETTINGS,
  *   4. if -a was given, sends a TEST UNIT READY,
  *   5. prints the resulting settings when appropriate.
  *
  * task_attr, retry_count and timeout are only passed on to
  * testunitready().
  *
  * Returns 0 on success and non-zero on failure.
  */
 static int
 ratecontrol(struct cam_device *device, int task_attr, int retry_count,
 	    int timeout, int argc, char **argv, char *combinedopt)
 {
 	int c;
 	union ccb *ccb;
 	int user_settings = 0;
 	int retval = 0;
 	/* -1 means "not specified on the command line" for the values below. */
 	int disc_enable = -1, tag_enable = -1;
 	int mode = -1;
 	int offset = -1;
 	double syncrate = -1;
 	int bus_width = -1;
 	int quiet = 0;
 	int change_settings = 0, send_tur = 0;
 	struct ccb_pathinq cpi;
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("ratecontrol: error allocating ccb");
 		return (1);
 	}
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch(c){
 		case 'a':
 			/* Send a TEST UNIT READY after changing settings. */
 			send_tur = 1;
 			break;
 		case 'c':
 			/* Operate on the current settings. */
 			user_settings = 0;
 			break;
 		case 'D':
 			/*
 			 * Only the leading characters are compared, so any
 			 * argument beginning with "enable"/"disable" matches.
 			 */
 			if (strncasecmp(optarg, "enable", 6) == 0)
 				disc_enable = 1;
 			else if (strncasecmp(optarg, "disable", 7) == 0)
 				disc_enable = 0;
 			else {
 				warnx("-D argument \"%s\" is unknown", optarg);
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			change_settings = 1;
 			break;
 		case 'M':
 			mode = ata_string2mode(optarg);
 			if (mode < 0) {
 				warnx("unknown mode '%s'", optarg);
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			change_settings = 1;
 			break;
 		case 'O':
 			offset = strtol(optarg, NULL, 0);
 			if (offset < 0) {
 				warnx("offset value %d is < 0", offset);
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			change_settings = 1;
 			break;
 		case 'q':
 			quiet++;
 			break;
 		case 'R':
 			syncrate = atof(optarg);
 			if (syncrate < 0) {
 				warnx("sync rate %f is < 0", syncrate);
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			change_settings = 1;
 			break;
 		case 'T':
 			/* Same prefix matching as for -D above. */
 			if (strncasecmp(optarg, "enable", 6) == 0)
 				tag_enable = 1;
 			else if (strncasecmp(optarg, "disable", 7) == 0)
 				tag_enable = 0;
 			else {
 				warnx("-T argument \"%s\" is unknown", optarg);
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			change_settings = 1;
 			break;
 		case 'U':
 			/* Operate on the user settings. */
 			user_settings = 1;
 			break;
 		case 'W':
 			bus_width = strtol(optarg, NULL, 0);
 			if (bus_width < 0) {
 				warnx("bus width %d is < 0", bus_width);
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			change_settings = 1;
 			break;
 		default:
 			break;
 		}
 	}
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cpi);
 	/*
 	 * Grab path inquiry information, so we can determine whether
 	 * or not the initiator is capable of the things that the user
 	 * requests.
 	 */
 	ccb->ccb_h.func_code = XPT_PATH_INQ;
 	if (cam_send_ccb(device, ccb) < 0) {
 		perror("error sending XPT_PATH_INQ CCB");
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 		retval = 1;
 		goto ratecontrol_bailout;
 	}
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		warnx("XPT_PATH_INQ CCB failed");
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 		retval = 1;
 		goto ratecontrol_bailout;
 	}
 	/*
 	 * Save the path inquiry result before the CCB is reused for the
 	 * transfer settings below.
 	 */
 	bcopy(&ccb->cpi, &cpi, sizeof(struct ccb_pathinq));
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->cts);
 	if (quiet == 0) {
 		fprintf(stdout, "%s parameters:\n",
 		    user_settings ? "User" : "Current");
 	}
 	/* On success ccb->cts holds the settings that were just fetched. */
 	retval = get_print_cts(device, user_settings, quiet, &ccb->cts);
 	if (retval != 0)
 		goto ratecontrol_bailout;
 
 	if (arglist & CAM_ARG_VERBOSE)
 		cpi_print(&cpi);
 
 	if (change_settings) {
 		/* Number of requested settings that applied to this device. */
 		int didsettings = 0;
 		/*
 		 * Each pointer stays NULL unless the fetched settings report
 		 * the corresponding transport or protocol.
 		 */
 		struct ccb_trans_settings_spi *spi = NULL;
 		struct ccb_trans_settings_pata *pata = NULL;
 		struct ccb_trans_settings_sata *sata = NULL;
 		struct ccb_trans_settings_ata *ata = NULL;
 		struct ccb_trans_settings_scsi *scsi = NULL;
 
 		if (ccb->cts.transport == XPORT_SPI)
 			spi = &ccb->cts.xport_specific.spi;
 		if (ccb->cts.transport == XPORT_ATA)
 			pata = &ccb->cts.xport_specific.ata;
 		if (ccb->cts.transport == XPORT_SATA)
 			sata = &ccb->cts.xport_specific.sata;
 		if (ccb->cts.protocol == PROTO_ATA)
 			ata = &ccb->cts.proto_specific.ata;
 		if (ccb->cts.protocol == PROTO_SCSI)
 			scsi = &ccb->cts.proto_specific.scsi;
 		/*
 		 * Start with no fields marked valid; only the fields set
 		 * below get their valid bits turned on.
 		 */
 		ccb->cts.xport_specific.valid = 0;
 		ccb->cts.proto_specific.valid = 0;
 		if (spi && disc_enable != -1) {
 			spi->valid |= CTS_SPI_VALID_DISC;
 			if (disc_enable == 0)
 				spi->flags &= ~CTS_SPI_FLAGS_DISC_ENB;
 			else
 				spi->flags |= CTS_SPI_FLAGS_DISC_ENB;
 			didsettings++;
 		}
 		if (tag_enable != -1) {
 			if ((cpi.hba_inquiry & PI_TAG_ABLE) == 0) {
 				warnx("HBA does not support tagged queueing, "
 				      "so you cannot modify tag settings");
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			if (ata) {
 				/*
 				 * NOTE(review): this uses the SCSI valid-bit
 				 * name on the ATA settings; confirm whether an
 				 * ATA-specific constant is intended.
 				 */
 				ata->valid |= CTS_SCSI_VALID_TQ;
 				if (tag_enable == 0)
 					ata->flags &= ~CTS_ATA_FLAGS_TAG_ENB;
 				else
 					ata->flags |= CTS_ATA_FLAGS_TAG_ENB;
 				didsettings++;
 			} else if (scsi) {
 				scsi->valid |= CTS_SCSI_VALID_TQ;
 				if (tag_enable == 0)
 					scsi->flags &= ~CTS_SCSI_FLAGS_TAG_ENB;
 				else
 					scsi->flags |= CTS_SCSI_FLAGS_TAG_ENB;
 				didsettings++;
 			}
 		}
 		if (spi && offset != -1) {
 			if ((cpi.hba_inquiry & PI_SDTR_ABLE) == 0) {
 				warnx("HBA is not capable of changing offset");
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			spi->valid |= CTS_SPI_VALID_SYNC_OFFSET;
 			spi->sync_offset = offset;
 			didsettings++;
 		}
 		if (spi && syncrate != -1) {
 			int prelim_sync_period;
 
 			if ((cpi.hba_inquiry & PI_SDTR_ABLE) == 0) {
 				warnx("HBA is not capable of changing "
 				      "transfer rates");
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			spi->valid |= CTS_SPI_VALID_SYNC_RATE;
 			/*
 			 * The sync rate the user gives us is in MHz.
 			 * We need to translate it into KHz for this
 			 * calculation.
 			 */
 			syncrate *= 1000;
 			/*
 			 * Next, we calculate a "preliminary" sync period
 			 * in tenths of a nanosecond.
 			 */
 			if (syncrate == 0)
 				prelim_sync_period = 0;
 			else
 				prelim_sync_period = 10000000 / syncrate;
 			spi->sync_period =
 				scsi_calc_syncparam(prelim_sync_period);
 			didsettings++;
 		}
 		if (sata && syncrate != -1) {
 			if ((cpi.hba_inquiry & PI_SDTR_ABLE) == 0) {
 				warnx("HBA is not capable of changing "
 				      "transfer rates");
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			if  (!user_settings) {
 				warnx("You can modify only user rate "
 				    "settings for SATA");
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			sata->revision = ata_speed2revision(syncrate * 100);
 			if (sata->revision < 0) {
 				warnx("Invalid rate %f", syncrate);
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			sata->valid |= CTS_SATA_VALID_REVISION;
 			didsettings++;
 		}
 		if ((pata || sata) && mode != -1) {
 			if ((cpi.hba_inquiry & PI_SDTR_ABLE) == 0) {
 				warnx("HBA is not capable of changing "
 				      "transfer rates");
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			if  (!user_settings) {
 				warnx("You can modify only user mode "
 				    "settings for ATA/SATA");
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			if (pata) {
 				pata->mode = mode;
 				pata->valid |= CTS_ATA_VALID_MODE;
 			} else {
 				sata->mode = mode;
 				sata->valid |= CTS_SATA_VALID_MODE;
 			}
 			didsettings++;
 		}
 		/*
 		 * The bus_width argument goes like this:
 		 * 0 == 8 bit
 		 * 1 == 16 bit
 		 * 2 == 32 bit
 		 * Therefore, if you shift the number of bits given on the
 		 * command line right by 4, you should get the correct
 		 * number.
 		 */
 		if (spi && bus_width != -1) {
 			/*
 			 * We might as well validate things here with a
 			 * decipherable error message, rather than what
 			 * will probably be an indecipherable error message
 			 * by the time it gets back to us.
 			 */
 			if ((bus_width == 16)
 			 && ((cpi.hba_inquiry & PI_WIDE_16) == 0)) {
 				warnx("HBA does not support 16 bit bus width");
 				retval = 1;
 				goto ratecontrol_bailout;
 			} else if ((bus_width == 32)
 				&& ((cpi.hba_inquiry & PI_WIDE_32) == 0)) {
 				warnx("HBA does not support 32 bit bus width");
 				retval = 1;
 				goto ratecontrol_bailout;
 			} else if ((bus_width != 8)
 				&& (bus_width != 16)
 				&& (bus_width != 32)) {
 				warnx("Invalid bus width %d", bus_width);
 				retval = 1;
 				goto ratecontrol_bailout;
 			}
 			spi->valid |= CTS_SPI_VALID_BUS_WIDTH;
 			spi->bus_width = bus_width >> 4;
 			didsettings++;
 		}
 		/*
 		 * None of the requested options applied to this device's
 		 * transport/protocol, so there is nothing to send.  retval
 		 * is still 0 here.
 		 */
 		if  (didsettings == 0) {
 			goto ratecontrol_bailout;
 		}
 		ccb->ccb_h.func_code = XPT_SET_TRAN_SETTINGS;
 		if (cam_send_ccb(device, ccb) < 0) {
 			perror("error sending XPT_SET_TRAN_SETTINGS CCB");
 			if (arglist & CAM_ARG_VERBOSE) {
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 			}
 			retval = 1;
 			goto ratecontrol_bailout;
 		}
 		if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 			warnx("XPT_SET_TRANS_SETTINGS CCB failed");
 			if (arglist & CAM_ARG_VERBOSE) {
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 			}
 			retval = 1;
 			goto ratecontrol_bailout;
 		}
 	}
 	if (send_tur) {
 		retval = testunitready(device, task_attr, retry_count, timeout,
 				       (arglist & CAM_ARG_VERBOSE) ? 0 : 1);
 		/*
 		 * If the TUR didn't succeed, just bail.
 		 */
 		if (retval != 0) {
 			if (quiet == 0)
 				fprintf(stderr, "Test Unit Ready failed\n");
 			goto ratecontrol_bailout;
 		}
 	}
 	/*
 	 * Re-read and print the settings if a TUR was sent, or if settings
 	 * were changed on an ATA or SATA transport.
 	 */
 	if ((change_settings || send_tur) && !quiet &&
 	    (ccb->cts.transport == XPORT_ATA ||
 	     ccb->cts.transport == XPORT_SATA || send_tur)) {
 		fprintf(stdout, "New parameters:\n");
 		retval = get_print_cts(device, user_settings, 0, NULL);
 	}
 
 ratecontrol_bailout:
 	cam_freeccb(ccb);
 	return (retval);
 }
 
 static int
 scsiformat(struct cam_device *device, int argc, char **argv,
 	   char *combinedopt, int task_attr, int retry_count, int timeout)
 {
 	union ccb *ccb;
 	int c;
 	int ycount = 0, quiet = 0;
 	int error = 0, retval = 0;
 	int use_timeout = 10800 * 1000;
 	int immediate = 1;
 	struct format_defect_list_header fh;
 	u_int8_t *data_ptr = NULL;
 	u_int32_t dxfer_len = 0;
 	u_int8_t byte2 = 0;
 	int num_warnings = 0;
 	int reportonly = 0;
 
 	ccb = cam_getccb(device);
 
 	if (ccb == NULL) {
 		warnx("scsiformat: error allocating ccb");
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch(c) {
 		case 'q':
 			quiet++;
 			break;
 		case 'r':
 			reportonly = 1;
 			break;
 		case 'w':
 			immediate = 0;
 			break;
 		case 'y':
 			ycount++;
 			break;
 		}
 	}
 
 	if (reportonly)
 		goto doreport;
 
 	if (quiet == 0) {
 		fprintf(stdout, "You are about to REMOVE ALL DATA from the "
 			"following device:\n");
 
 		error = scsidoinquiry(device, argc, argv, combinedopt,
 				      task_attr, retry_count, timeout);
 
 		if (error != 0) {
 			warnx("scsiformat: error sending inquiry");
 			goto scsiformat_bailout;
 		}
 	}
 
 	if (ycount == 0) {
 		if (!get_confirmation()) {
 			error = 1;
 			goto scsiformat_bailout;
 		}
 	}
 
 	if (timeout != 0)
 		use_timeout = timeout;
 
 	if (quiet == 0) {
 		fprintf(stdout, "Current format timeout is %d seconds\n",
 			use_timeout / 1000);
 	}
 
 	/*
 	 * If the user hasn't disabled questions and didn't specify a
 	 * timeout on the command line, ask them if they want the current
 	 * timeout.
 	 */
 	if ((ycount == 0)
 	 && (timeout == 0)) {
 		char str[1024];
 		int new_timeout = 0;
 
 		fprintf(stdout, "Enter new timeout in seconds or press\n"
 			"return to keep the current timeout [%d] ",
 			use_timeout / 1000);
 
 		if (fgets(str, sizeof(str), stdin) != NULL) {
 			if (str[0] != '\0')
 				new_timeout = atoi(str);
 		}
 
 		if (new_timeout != 0) {
 			use_timeout = new_timeout * 1000;
 			fprintf(stdout, "Using new timeout value %d\n",
 				use_timeout / 1000);
 		}
 	}
 
 	/*
 	 * Keep this outside the if block below to silence any unused
 	 * variable warnings.
 	 */
 	bzero(&fh, sizeof(fh));
 
 	/*
 	 * If we're in immediate mode, we've got to include the format
 	 * header
 	 */
 	if (immediate != 0) {
 		fh.byte2 = FU_DLH_IMMED;
 		data_ptr = (u_int8_t *)&fh;
 		dxfer_len = sizeof(fh);
 		byte2 = FU_FMT_DATA;
 	} else if (quiet == 0) {
 		fprintf(stdout, "Formatting...");
 		fflush(stdout);
 	}
 
 	scsi_format_unit(&ccb->csio,
 			 /* retries */ retry_count,
 			 /* cbfcnp */ NULL,
 			 /* tag_action */ task_attr,
 			 /* byte2 */ byte2,
 			 /* ileave */ 0,
 			 /* data_ptr */ data_ptr,
 			 /* dxfer_len */ dxfer_len,
 			 /* sense_len */ SSD_FULL_SIZE,
 			 /* timeout */ use_timeout);
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (arglist & CAM_ARG_ERR_RECOVER)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (((retval = cam_send_ccb(device, ccb)) < 0)
 	 || ((immediate == 0)
 	   && ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP))) {
 		const char errstr[] = "error sending format command";
 
 		if (retval < 0)
 			warn(errstr);
 		else
 			warnx(errstr);
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 		error = 1;
 		goto scsiformat_bailout;
 	}
 
 	/*
 	 * If we ran in non-immediate mode, we already checked for errors
 	 * above and printed out any necessary information.  If we're in
 	 * immediate mode, we need to loop through and get status
 	 * information periodically.
 	 */
 	if (immediate == 0) {
 		if (quiet == 0) {
 			fprintf(stdout, "Format Complete\n");
 		}
 		goto scsiformat_bailout;
 	}
 
 doreport:
 	do {
 		cam_status status;
 
 		CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 		/*
 		 * There's really no need to do error recovery or
 		 * retries here, since we're just going to sit in a
 		 * loop and wait for the device to finish formatting.
 		 */
 		scsi_test_unit_ready(&ccb->csio,
 				     /* retries */ 0,
 				     /* cbfcnp */ NULL,
 				     /* tag_action */ task_attr,
 				     /* sense_len */ SSD_FULL_SIZE,
 				     /* timeout */ 5000);
 
 		/* Disable freezing the device queue */
 		ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 		retval = cam_send_ccb(device, ccb);
 
 		/*
 		 * If we get an error from the ioctl, bail out.  SCSI
 		 * errors are expected.
 		 */
 		if (retval < 0) {
 			warn("error sending CAMIOCOMMAND ioctl");
 			if (arglist & CAM_ARG_VERBOSE) {
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 			}
 			error = 1;
 			goto scsiformat_bailout;
 		}
 
 		status = ccb->ccb_h.status & CAM_STATUS_MASK;
 
 		if ((status != CAM_REQ_CMP)
 		 && (status == CAM_SCSI_STATUS_ERROR)
 		 && ((ccb->ccb_h.status & CAM_AUTOSNS_VALID) != 0)) {
 			struct scsi_sense_data *sense;
 			int error_code, sense_key, asc, ascq;
 
 			sense = &ccb->csio.sense_data;
 			scsi_extract_sense_len(sense, ccb->csio.sense_len -
 			    ccb->csio.sense_resid, &error_code, &sense_key,
 			    &asc, &ascq, /*show_errors*/ 1);
 
 			/*
 			 * According to the SCSI-2 and SCSI-3 specs, a
 			 * drive that is in the middle of a format should
 			 * return NOT READY with an ASC of "logical unit
 			 * not ready, format in progress".  The sense key
 			 * specific bytes will then be a progress indicator.
 			 */
 			if ((sense_key == SSD_KEY_NOT_READY)
 			 && (asc == 0x04) && (ascq == 0x04)) {
 				uint8_t sks[3];
 
 				if ((scsi_get_sks(sense, ccb->csio.sense_len -
 				     ccb->csio.sense_resid, sks) == 0)
 				 && (quiet == 0)) {
 					uint32_t val;
 					u_int64_t percentage;
 
 					val = scsi_2btoul(&sks[1]);
 					percentage = 10000ull * val;
 
 					fprintf(stdout,
 						"\rFormatting:  %ju.%02u %% "
 						"(%u/%d) done",
 						(uintmax_t)(percentage /
 						(0x10000 * 100)),
 						(unsigned)((percentage /
 						0x10000) % 100),
 						val, 0x10000);
 					fflush(stdout);
 				} else if ((quiet == 0)
 					&& (++num_warnings <= 1)) {
 					warnx("Unexpected SCSI Sense Key "
 					      "Specific value returned "
 					      "during format:");
 					scsi_sense_print(device, &ccb->csio,
 							 stderr);
 					warnx("Unable to print status "
 					      "information, but format will "
 					      "proceed.");
 					warnx("will exit when format is "
 					      "complete");
 				}
 				sleep(1);
 			} else {
 				warnx("Unexpected SCSI error during format");
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 				error = 1;
 				goto scsiformat_bailout;
 			}
 
 		} else if (status != CAM_REQ_CMP) {
 			warnx("Unexpected CAM status %#x", status);
 			if (arglist & CAM_ARG_VERBOSE)
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 			error = 1;
 			goto scsiformat_bailout;
 		}
 
 	} while((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP);
 
 	if (quiet == 0)
 		fprintf(stdout, "\nFormat Complete\n");
 
 scsiformat_bailout:
 
 	cam_freeccb(ccb);
 
 	return (error);
 }
 
 /*
  * Poll an ATA device until a sanitize operation finishes.
  *
  * Each iteration issues ATA_SANITIZE with a features value of 0x00
  * (SANITIZE STATUS EXT) through ata_do_cmd() and inspects the returned
  * sector_count_exp register:
  *   - bit 0x40 set: still in progress; print progress (unless quiet) and
  *     sleep for a second before polling again,
  *   - bit 0x40 clear and bit 0x80 clear: finished with an error,
  *   - otherwise: finished successfully.
  * A CAM status of CAM_REQUEUE_REQ causes the command to be reissued
  * immediately.
  *
  * Returns 0 on successful completion and 1 on any error.
  */
 static int
 sanitize_wait_ata(struct cam_device *device, union ccb *ccb, int quiet)
 {
 	struct ata_res *res;
 	int retval;
 	cam_status status;
 	u_int val, perc;
 
 	do {
 		retval = ata_do_cmd(device,
 				   ccb,
 				   /*retries*/1,
 				   /*flags*/CAM_DIR_NONE,
 				   /*protocol*/AP_PROTO_NON_DATA | AP_EXTEND,
 				   /*ata_flags*/AP_FLAG_CHK_COND,
 				   /*tag_action*/MSG_SIMPLE_Q_TAG,
 				   /*command*/ATA_SANITIZE,
 				   /*features*/0x00, /* SANITIZE STATUS EXT */
 				   /*lba*/0,
 				   /*sector_count*/0,
 				   /*data_ptr*/NULL,
 				   /*dxfer_len*/0,
 				   /*timeout*/10000,
 				   /*is48bit*/1);
 		if (retval < 0) {
 			warn("error sending CAMIOCOMMAND ioctl");
 			if (arglist & CAM_ARG_VERBOSE) {
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 			}
 			return (1);
 		}
 
 		status = ccb->ccb_h.status & CAM_STATUS_MASK;
 		if (status == CAM_REQ_CMP) {
 			res = &ccb->ataio.res;
 			if (res->sector_count_exp & 0x40) {
 				if (quiet == 0) {
 					/*
 					 * Progress is a fraction of 0x10000;
 					 * scale by 10000 to print two decimal
 					 * places with integer arithmetic.
 					 */
 					val = (res->lba_mid << 8) + res->lba_low;
 					perc = 10000 * val;
 					fprintf(stdout,
 					    "Sanitizing: %u.%02u%% (%u/%d)\r",
 					    (perc / (0x10000 * 100)),
 					    ((perc / 0x10000) % 100),
 					    val, 0x10000);
 					fflush(stdout);
 				}
 				sleep(1);
 			} else if ((res->sector_count_exp & 0x80) == 0) {
 				/*
 				 * The trailing blanks pad the message so it
 				 * covers the longer progress line written
 				 * with '\r' above.
 				 */
 				warnx("Sanitize complete with an error.     ");
 				return (1);
 			} else
 				break;
 
 		} else if (status != CAM_REQUEUE_REQ) {
 			warnx("Unexpected CAM status %#x", status);
 			if (arglist & CAM_ARG_VERBOSE)
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 			return (1);
 		}
 	} while (1);
 	return (0);
 }
 
 /*
  * Poll a SCSI device with TEST UNIT READY until a sanitize operation
  * finishes.
  *
  * While the device answers NOT READY with ASC/ASCQ 0x04/0x1b the sense
  * key specific bytes are printed as a progress indicator (unless quiet)
  * and the loop sleeps for a second.  Any other SCSI error, or any CAM
  * status other than CAM_REQ_CMP and CAM_REQUEUE_REQ, ends the wait with
  * an error.  The loop exits once the command completes with CAM_REQ_CMP.
  *
  * Returns 0 on completion and 1 on any error.
  */
 static int
 sanitize_wait_scsi(struct cam_device *device, union ccb *ccb, int task_attr, int quiet)
 {
 	int warnings = 0, retval;
 	cam_status status;
 	u_int val, perc;
 
 	do {
 		CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 		/*
 		 * There's really no need to do error recovery or
 		 * retries here, since we're just going to sit in a
 		 * loop and wait for the device to finish sanitizing.
 		 */
 		scsi_test_unit_ready(&ccb->csio,
 				     /* retries */ 0,
 				     /* cbfcnp */ NULL,
 				     /* tag_action */ task_attr,
 				     /* sense_len */ SSD_FULL_SIZE,
 				     /* timeout */ 5000);
 
 		/* Disable freezing the device queue */
 		ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 		retval = cam_send_ccb(device, ccb);
 
 		/*
 		 * If we get an error from the ioctl, bail out.  SCSI
 		 * errors are expected.
 		 */
 		if (retval < 0) {
 			warn("error sending CAMIOCOMMAND ioctl");
 			if (arglist & CAM_ARG_VERBOSE) {
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 			}
 			return (1);
 		}
 
 		status = ccb->ccb_h.status & CAM_STATUS_MASK;
 		if ((status == CAM_SCSI_STATUS_ERROR) &&
 		    ((ccb->ccb_h.status & CAM_AUTOSNS_VALID) != 0)) {
 			struct scsi_sense_data *sense;
 			int error_code, sense_key, asc, ascq;
 
 			sense = &ccb->csio.sense_data;
 			scsi_extract_sense_len(sense, ccb->csio.sense_len -
 			    ccb->csio.sense_resid, &error_code, &sense_key,
 			    &asc, &ascq, /*show_errors*/ 1);
 
 			/*
 			 * According to the SCSI-3 spec, a drive that is in the
 			 * middle of a sanitize should return NOT READY with an
 			 * ASC of "logical unit not ready, sanitize in
 			 * progress". The sense key specific bytes will then
 			 * be a progress indicator.
 			 */
 			if ((sense_key == SSD_KEY_NOT_READY)
 			 && (asc == 0x04) && (ascq == 0x1b)) {
 				uint8_t sks[3];
 
 				if ((scsi_get_sks(sense, ccb->csio.sense_len -
 				     ccb->csio.sense_resid, sks) == 0)
 				 && (quiet == 0)) {
 					/*
 					 * Progress is a fraction of 0x10000;
 					 * scale by 10000 to print two decimal
 					 * places with integer arithmetic.
 					 */
 					val = scsi_2btoul(&sks[1]);
 					perc = 10000 * val;
 					fprintf(stdout,
 					    "Sanitizing: %u.%02u%% (%u/%d)\r",
 					    (perc / (0x10000 * 100)),
 					    ((perc / 0x10000) % 100),
 					    val, 0x10000);
 					fflush(stdout);
 				} else if ((quiet == 0) && (++warnings <= 1)) {
 					/* Warn only once per invocation. */
 					warnx("Unexpected SCSI Sense Key "
 					      "Specific value returned "
 					      "during sanitize:");
 					scsi_sense_print(device, &ccb->csio,
 							 stderr);
 					warnx("Unable to print status "
 					      "information, but sanitize will "
 					      "proceed.");
 					warnx("will exit when sanitize is "
 					      "complete");
 				}
 				sleep(1);
 			} else {
 				warnx("Unexpected SCSI error during sanitize");
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 				return (1);
 			}
 
 		} else if (status != CAM_REQ_CMP && status != CAM_REQUEUE_REQ) {
 			warnx("Unexpected CAM status %#x", status);
 			if (arglist & CAM_ARG_VERBOSE)
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 			return (1);
 		}
 	} while ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP);
 	return (0);
 }
 
 static int
 sanitize(struct cam_device *device, int argc, char **argv,
 	     char *combinedopt, int task_attr, int retry_count, int timeout)
 {
 	union ccb *ccb;
 	u_int8_t action = 0;
 	int c;
 	int ycount = 0, quiet = 0;
 	int error = 0;
 	int use_timeout;
 	int immediate = 1;
 	int invert = 0;
 	int passes = 0;
 	int ause = 0;
 	int fd = -1;
 	const char *pattern = NULL;
 	u_int8_t *data_ptr = NULL;
 	u_int32_t dxfer_len = 0;
 	uint8_t byte2;
 	uint16_t feature, count;
 	uint64_t lba;
 	int reportonly = 0;
 	camcontrol_devtype dt;
 
 	/*
 	 * Get the device type, request no I/O be done to do this.
 	 */
 	error = get_device_type(device, -1, 0, 0, &dt);
 	if (error != 0 || (unsigned)dt > CC_DT_UNKNOWN) {
 		warnx("sanitize: can't get device type");
 		return (1);
 	}
 
 	ccb = cam_getccb(device);
 
 	if (ccb == NULL) {
 		warnx("sanitize: error allocating ccb");
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch(c) {
 		case 'a':
 			if (strcasecmp(optarg, "overwrite") == 0)
 				action = SSZ_SERVICE_ACTION_OVERWRITE;
 			else if (strcasecmp(optarg, "block") == 0)
 				action = SSZ_SERVICE_ACTION_BLOCK_ERASE;
 			else if (strcasecmp(optarg, "crypto") == 0)
 				action = SSZ_SERVICE_ACTION_CRYPTO_ERASE;
 			else if (strcasecmp(optarg, "exitfailure") == 0)
 				action = SSZ_SERVICE_ACTION_EXIT_MODE_FAILURE;
 			else {
 				warnx("invalid service operation \"%s\"",
 				      optarg);
 				error = 1;
 				goto sanitize_bailout;
 			}
 			break;
 		case 'c':
 			passes = strtol(optarg, NULL, 0);
 			if (passes < 1 || passes > 31) {
 				warnx("invalid passes value %d", passes);
 				error = 1;
 				goto sanitize_bailout;
 			}
 			break;
 		case 'I':
 			invert = 1;
 			break;
 		case 'P':
 			pattern = optarg;
 			break;
 		case 'q':
 			quiet++;
 			break;
 		case 'U':
 			ause = 1;
 			break;
 		case 'r':
 			reportonly = 1;
 			break;
 		case 'w':
 			/* ATA supports only immediate commands. */
 			if (dt == CC_DT_SCSI)
 				immediate = 0;
 			break;
 		case 'y':
 			ycount++;
 			break;
 		}
 	}
 
 	if (reportonly)
 		goto doreport;
 
 	if (action == 0) {
 		warnx("an action is required");
 		error = 1;
 		goto sanitize_bailout;
 	} else if (action == SSZ_SERVICE_ACTION_OVERWRITE) {
 		struct scsi_sanitize_parameter_list *pl;
 		struct stat sb;
 		ssize_t sz, amt;
 
 		if (pattern == NULL) {
 			warnx("overwrite action requires -P argument");
 			error = 1;
 			goto sanitize_bailout;
 		}
 		fd = open(pattern, O_RDONLY);
 		if (fd < 0) {
 			warn("cannot open pattern file %s", pattern);
 			error = 1;
 			goto sanitize_bailout;
 		}
 		if (fstat(fd, &sb) < 0) {
 			warn("cannot stat pattern file %s", pattern);
 			error = 1;
 			goto sanitize_bailout;
 		}
 		sz = sb.st_size;
 		if (sz > SSZPL_MAX_PATTERN_LENGTH) {
 			warnx("pattern file size exceeds maximum value %d",
 			      SSZPL_MAX_PATTERN_LENGTH);
 			error = 1;
 			goto sanitize_bailout;
 		}
 		dxfer_len = sizeof(*pl) + sz;
 		data_ptr = calloc(1, dxfer_len);
 		if (data_ptr == NULL) {
 			warnx("cannot allocate parameter list buffer");
 			error = 1;
 			goto sanitize_bailout;
 		}
 
 		amt = read(fd, data_ptr + sizeof(*pl), sz);
 		if (amt < 0) {
 			warn("cannot read pattern file");
 			error = 1;
 			goto sanitize_bailout;
 		} else if (amt != sz) {
 			warnx("short pattern file read");
 			error = 1;
 			goto sanitize_bailout;
 		}
 
 		pl = (struct scsi_sanitize_parameter_list *)data_ptr;
 		if (passes == 0)
 			pl->byte1 = 1;
 		else
 			pl->byte1 = passes;
 		if (invert != 0)
 			pl->byte1 |= SSZPL_INVERT;
 		scsi_ulto2b(sz, pl->length);
 	} else {
 		const char *arg;
 
 		if (passes != 0)
 			arg = "-c";
 		else if (invert != 0)
 			arg = "-I";
 		else if (pattern != NULL)
 			arg = "-P";
 		else
 			arg = NULL;
 		if (arg != NULL) {
 			warnx("%s argument only valid with overwrite "
 			      "operation", arg);
 			error = 1;
 			goto sanitize_bailout;
 		}
 	}
 
 	if (quiet == 0) {
 		fprintf(stdout, "You are about to REMOVE ALL DATA from the "
 			"following device:\n");
 
 		if (dt == CC_DT_SCSI) {
 			error = scsidoinquiry(device, argc, argv, combinedopt,
 					      task_attr, retry_count, timeout);
 		} else if (dt == CC_DT_ATA || dt == CC_DT_SATL) {
 			struct ata_params *ident_buf;
 			error = ata_do_identify(device, retry_count, timeout,
 						ccb, &ident_buf);
 			if (error == 0) {
 				printf("%s%d: ", device->device_name,
 				    device->dev_unit_num);
 				ata_print_ident(ident_buf);
 				free(ident_buf);
 			}
 		} else
 			error = 1;
 
 		if (error != 0) {
 			warnx("sanitize: error sending inquiry");
 			goto sanitize_bailout;
 		}
 	}
 
 	if (ycount == 0) {
 		if (!get_confirmation()) {
 			error = 1;
 			goto sanitize_bailout;
 		}
 	}
 
 	if (timeout != 0)
 		use_timeout = timeout;
 	else
 		use_timeout = (immediate ? 10 : 10800) * 1000;
 
 	if (immediate == 0 && quiet == 0) {
 		fprintf(stdout, "Current sanitize timeout is %d seconds\n",
 			use_timeout / 1000);
 	}
 
 	/*
 	 * If the user hasn't disabled questions and didn't specify a
 	 * timeout on the command line, ask them if they want the current
 	 * timeout.
 	 */
 	if (immediate == 0 && ycount == 0 && timeout == 0) {
 		char str[1024];
 		int new_timeout = 0;
 
 		fprintf(stdout, "Enter new timeout in seconds or press\n"
 			"return to keep the current timeout [%d] ",
 			use_timeout / 1000);
 
 		if (fgets(str, sizeof(str), stdin) != NULL) {
 			if (str[0] != '\0')
 				new_timeout = atoi(str);
 		}
 
 		if (new_timeout != 0) {
 			use_timeout = new_timeout * 1000;
 			fprintf(stdout, "Using new timeout value %d\n",
 				use_timeout / 1000);
 		}
 	}
 
 	if (dt == CC_DT_SCSI) {
 		byte2 = action;
 		if (ause != 0)
 			byte2 |= SSZ_UNRESTRICTED_EXIT;
 		if (immediate != 0)
 			byte2 |= SSZ_IMMED;
 		scsi_sanitize(&ccb->csio,
 			      /* retries */ retry_count,
 			      /* cbfcnp */ NULL,
 			      /* tag_action */ task_attr,
 			      /* byte2 */ byte2,
 			      /* control */ 0,
 			      /* data_ptr */ data_ptr,
 			      /* dxfer_len */ dxfer_len,
 			      /* sense_len */ SSD_FULL_SIZE,
 			      /* timeout */ use_timeout);
 
 		ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 		if (arglist & CAM_ARG_ERR_RECOVER)
 			ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 		if (cam_send_ccb(device, ccb) < 0) {
 			warn("error sending sanitize command");
 			error = 1;
 			goto sanitize_bailout;
 		}
 	} else if (dt == CC_DT_ATA || dt == CC_DT_SATL) {
 		if (action == SSZ_SERVICE_ACTION_OVERWRITE) {
 			feature = 0x14; /* OVERWRITE EXT */
 			lba = 0x4F5700000000 | scsi_4btoul(data_ptr + 4);
 			count = (passes == 0) ? 1 : (passes >= 16) ? 0 : passes;
 			if (invert)
 				count |= 0x80; /* INVERT PATTERN */
 			if (ause)
 				count |= 0x10; /* FAILURE MODE */
 		} else if (action == SSZ_SERVICE_ACTION_BLOCK_ERASE) {
 			feature = 0x12; /* BLOCK ERASE EXT */
 			lba = 0x0000426B4572;
 			count = 0;
 			if (ause)
 				count |= 0x10; /* FAILURE MODE */
 		} else if (action == SSZ_SERVICE_ACTION_CRYPTO_ERASE) {
 			feature = 0x11; /* CRYPTO SCRAMBLE EXT */
 			lba = 0x000043727970;
 			count = 0;
 			if (ause)
 				count |= 0x10; /* FAILURE MODE */
 		} else if (action == SSZ_SERVICE_ACTION_EXIT_MODE_FAILURE) {
 			feature = 0x00; /* SANITIZE STATUS EXT */
 			lba = 0;
 			count = 1; /* CLEAR SANITIZE OPERATION FAILED */
 		} else {
 			error = 1;
 			goto sanitize_bailout;
 		}
 
 		error = ata_do_cmd(device,
 				   ccb,
 				   retry_count,
 				   /*flags*/CAM_DIR_NONE,
 				   /*protocol*/AP_PROTO_NON_DATA | AP_EXTEND,
 				   /*ata_flags*/AP_FLAG_CHK_COND,
 				   /*tag_action*/MSG_SIMPLE_Q_TAG,
 				   /*command*/ATA_SANITIZE,
 				   /*features*/feature,
 				   /*lba*/lba,
 				   /*sector_count*/count,
 				   /*data_ptr*/NULL,
 				   /*dxfer_len*/0,
 				   /*timeout*/ use_timeout,
 				   /*is48bit*/1);
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		struct scsi_sense_data *sense;
 		int error_code, sense_key, asc, ascq;
 
 		if ((ccb->ccb_h.status & CAM_STATUS_MASK) ==
 		    CAM_SCSI_STATUS_ERROR) {
 			sense = &ccb->csio.sense_data;
 			scsi_extract_sense_len(sense, ccb->csio.sense_len -
 			    ccb->csio.sense_resid, &error_code, &sense_key,
 			    &asc, &ascq, /*show_errors*/ 1);
 
 			if (sense_key == SSD_KEY_ILLEGAL_REQUEST &&
 			    asc == 0x20 && ascq == 0x00)
 				warnx("sanitize is not supported by "
 				      "this device");
 			else
 				warnx("error sanitizing this device");
 		} else
 			warnx("error sanitizing this device");
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 		error = 1;
 		goto sanitize_bailout;
 	}
 
 	/*
 	 * If we ran in non-immediate mode, we already checked for errors
 	 * above and printed out any necessary information.  If we're in
 	 * immediate mode, we need to loop through and get status
 	 * information periodically.
 	 */
 	if (immediate == 0) {
 		if (quiet == 0) {
 			fprintf(stdout, "Sanitize Complete\n");
 		}
 		goto sanitize_bailout;
 	}
 
 doreport:
 	if (dt == CC_DT_SCSI) {
 		error = sanitize_wait_scsi(device, ccb, task_attr, quiet);
 	} else if (dt == CC_DT_ATA || dt == CC_DT_SATL) {
 		error = sanitize_wait_ata(device, ccb, quiet);
 	} else
 		error = 1;
 	if (error == 0 && quiet == 0)
 		fprintf(stdout, "Sanitize Complete                      \n");
 
 sanitize_bailout:
 	if (fd >= 0)
 		close(fd);
 	if (data_ptr != NULL)
 		free(data_ptr);
 	cam_freeccb(ccb);
 
 	return (error);
 }
 
 /*
  * Issue a SCSI REPORT LUNS command to the device and print the result.
  *
  * Options parsed from argv via combinedopt:
  *   -c         print only the number of LUNs
  *   -l         print only the LUN list (suppress the count line)
  *   -r type    select the report type: "default", "wellknown" or "all"
  *
  * The first request uses a buffer sized for the header plus one LUN.  If
  * the device reports a longer list and the list itself is wanted, the
  * buffer is reallocated to fit and the command is issued again.
  *
  * Returns 0 on success and 1 on any error.
  */
 static int
 scsireportluns(struct cam_device *device, int argc, char **argv,
 	       char *combinedopt, int task_attr, int retry_count, int timeout)
 {
 	union ccb *ccb;
 	int c, countonly, lunsonly;
 	struct scsi_report_luns_data *lundata;
 	int alloc_len;
 	uint8_t report_type;
 	uint32_t list_len, i, j;
 	int retval;
 
 	retval = 0;
 	lundata = NULL;
 	report_type = RPL_REPORT_DEFAULT;
 	ccb = cam_getccb(device);
 
 	if (ccb == NULL) {
 		warnx("%s: error allocating ccb", __func__);
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	countonly = 0;
 	lunsonly = 0;
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch (c) {
 		case 'c':
 			countonly++;
 			break;
 		case 'l':
 			lunsonly++;
 			break;
 		case 'r':
 			if (strcasecmp(optarg, "default") == 0)
 				report_type = RPL_REPORT_DEFAULT;
 			else if (strcasecmp(optarg, "wellknown") == 0)
 				report_type = RPL_REPORT_WELLKNOWN;
 			else if (strcasecmp(optarg, "all") == 0)
 				report_type = RPL_REPORT_ALL;
 			else {
 				warnx("%s: invalid report type \"%s\"",
 				      __func__, optarg);
 				retval = 1;
 				goto bailout;
 			}
 			break;
 		default:
 			break;
 		}
 	}
 
 	if ((countonly != 0)
 	 && (lunsonly != 0)) {
 		warnx("%s: you can only specify one of -c or -l", __func__);
 		retval = 1;
 		goto bailout;
 	}
 	/*
 	 * According to SPC-4, the allocation length must be at least 16
 	 * bytes -- enough for the header and one LUN.
 	 */
 	alloc_len = sizeof(*lundata) + 8;
 
 retry:
 
 	lundata = malloc(alloc_len);
 
 	if (lundata == NULL) {
 		warn("%s: error mallocing %d bytes", __func__, alloc_len);
 		retval = 1;
 		goto bailout;
 	}
 
 	scsi_report_luns(&ccb->csio,
 			 /*retries*/ retry_count,
 			 /*cbfcnp*/ NULL,
 			 /*tag_action*/ task_attr,
 			 /*select_report*/ report_type,
 			 /*rpl_buf*/ lundata,
 			 /*alloc_len*/ alloc_len,
 			 /*sense_len*/ SSD_FULL_SIZE,
 			 /*timeout*/ timeout ? timeout : 5000);
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (arglist & CAM_ARG_ERR_RECOVER)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (cam_send_ccb(device, ccb) < 0) {
 		warn("error sending REPORT LUNS command");
 
 		if (arglist & CAM_ARG_VERBOSE)
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 
 		retval = 1;
 		goto bailout;
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
 		retval = 1;
 		goto bailout;
 	}
 
 	/* Length in bytes of the LUN list as reported by the device. */
 	list_len = scsi_4btoul(lundata->length);
 
 	/*
 	 * If we need to list the LUNs, and our allocation
 	 * length was too short, reallocate and retry.
 	 */
 	if ((countonly == 0)
 	 && (list_len > (alloc_len - sizeof(*lundata)))) {
 		alloc_len = list_len + sizeof(*lundata);
 		free(lundata);
 		goto retry;
 	}
 
 	/* Each LUN entry occupies 8 bytes. */
 	if (lunsonly == 0)
 		fprintf(stdout, "%u LUN%s found\n", list_len / 8,
 			((list_len / 8) > 1) ? "s" : "");
 
 	if (countonly != 0)
 		goto bailout;
 
 	for (i = 0; i < (list_len / 8); i++) {
 		int no_more;
 
 		no_more = 0;
 		/*
 		 * Walk the entry two bytes (one addressing level) at a
 		 * time, separating the levels with commas.
 		 */
 		for (j = 0; j < sizeof(lundata->luns[i].lundata); j += 2) {
 			if (j != 0)
 				fprintf(stdout, ",");
 			switch (lundata->luns[i].lundata[j] &
 				RPL_LUNDATA_ATYP_MASK) {
 			case RPL_LUNDATA_ATYP_PERIPH:
 				if ((lundata->luns[i].lundata[j] &
 				    RPL_LUNDATA_PERIPH_BUS_MASK) != 0)
 					fprintf(stdout, "%d:",
 						lundata->luns[i].lundata[j] &
 						RPL_LUNDATA_PERIPH_BUS_MASK);
 				else if ((j == 0)
 				      && ((lundata->luns[i].lundata[j+2] &
 					  RPL_LUNDATA_PERIPH_BUS_MASK) == 0))
 					no_more = 1;
 
 				fprintf(stdout, "%d",
 					lundata->luns[i].lundata[j+1]);
 				break;
 			case RPL_LUNDATA_ATYP_FLAT: {
 				uint8_t tmplun[2];
 				tmplun[0] = lundata->luns[i].lundata[j] &
 					RPL_LUNDATA_FLAT_LUN_MASK;
 				tmplun[1] = lundata->luns[i].lundata[j+1];
 
 				fprintf(stdout, "%u", scsi_2btoul(tmplun));
 				no_more = 1;
 				break;
 			}
 			case RPL_LUNDATA_ATYP_LUN:
 				fprintf(stdout, "%d:%d:%d",
 					(lundata->luns[i].lundata[j+1] &
 					RPL_LUNDATA_LUN_BUS_MASK) >> 5,
 					lundata->luns[i].lundata[j] &
 					RPL_LUNDATA_LUN_TARG_MASK,
 					lundata->luns[i].lundata[j+1] &
 					RPL_LUNDATA_LUN_LUN_MASK);
 				break;
 			case RPL_LUNDATA_ATYP_EXTLUN: {
 				int field_len_code, eam_code;
 
 				eam_code = lundata->luns[i].lundata[j] &
 					RPL_LUNDATA_EXT_EAM_MASK;
 				field_len_code = (lundata->luns[i].lundata[j] &
 					RPL_LUNDATA_EXT_LEN_MASK) >> 4;
 
 				if ((eam_code == RPL_LUNDATA_EXT_EAM_WK)
 				 && (field_len_code == 0x00)) {
 					fprintf(stdout, "%d",
 						lundata->luns[i].lundata[j+1]);
 				} else if ((eam_code ==
 					    RPL_LUNDATA_EXT_EAM_NOT_SPEC)
 					&& (field_len_code == 0x03)) {
 					uint8_t tmp_lun[8];
 
 					/*
 					 * This format takes up all 8 bytes.
 					 * If we aren't starting at offset 0,
 					 * that's a bug.
 					 */
 					if (j != 0) {
 						fprintf(stdout, "Invalid "
 							"offset %u for "
 							"Extended LUN not "
 							"specified format", j);
 						no_more = 1;
 						break;
 					}
 					/*
 					 * Drop the first (address method)
 					 * byte and print the remaining seven
 					 * bytes as one big-endian number.
 					 */
 					bzero(tmp_lun, sizeof(tmp_lun));
 					bcopy(&lundata->luns[i].lundata[j+1],
 					      &tmp_lun[1], sizeof(tmp_lun) - 1);
 					fprintf(stdout, "%#jx",
 					       (uintmax_t)scsi_8btou64(tmp_lun));
 					no_more = 1;
 				} else {
 					fprintf(stderr, "Unknown Extended LUN "
 						"Address method %#x, length "
 						"code %#x", eam_code,
 						field_len_code);
 					no_more = 1;
 				}
 				break;
 			}
 			default:
 				/* Report the level actually being decoded. */
 				fprintf(stderr, "Unknown LUN address method "
 					"%#x\n", lundata->luns[i].lundata[j] &
 					RPL_LUNDATA_ATYP_MASK);
 				break;
 			}
 			/*
 			 * For the flat addressing method, there are no
 			 * other levels after it.
 			 */
 			if (no_more != 0)
 				break;
 		}
 		fprintf(stdout, "\n");
 	}
 
 bailout:
 
 	cam_freeccb(ccb);
 
 	free(lundata);
 
 	return (retval);
 }
 
 static int
 scsireadcapacity(struct cam_device *device, int argc, char **argv,
 		 char *combinedopt, int task_attr, int retry_count, int timeout)
 {
 	union ccb *ccb;
 	int blocksizeonly, humanize, numblocks, quiet, sizeonly, baseten, longonly;
 	struct scsi_read_capacity_data rcap;
 	struct scsi_read_capacity_data_long rcaplong;
 	uint64_t maxsector;
 	uint32_t block_len;
 	int retval;
 	int c;
 
 	blocksizeonly = 0;
 	humanize = 0;
 	longonly = 0;
 	numblocks = 0;
 	quiet = 0;
 	sizeonly = 0;
 	baseten = 0;
 	retval = 0;
 
 	ccb = cam_getccb(device);
 
 	if (ccb == NULL) {
 		warnx("%s: error allocating ccb", __func__);
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch (c) {
 		case 'b':
 			blocksizeonly++;
 			break;
 		case 'h':
 			humanize++;
 			baseten = 0;
 			break;
 		case 'H':
 			humanize++;
 			baseten++;
 			break;
 		case 'l':
 			longonly++;
 			break;
 		case 'N':
 			numblocks++;
 			break;
 		case 'q':
 			quiet++;
 			break;
 		case 's':
 			sizeonly++;
 			break;
 		default:
 			break;
 		}
 	}
 
 	if ((blocksizeonly != 0)
 	 && (numblocks != 0)) {
 		warnx("%s: you can only specify one of -b or -N", __func__);
 		retval = 1;
 		goto bailout;
 	}
 
 	if ((blocksizeonly != 0)
 	 && (sizeonly != 0)) {
 		warnx("%s: you can only specify one of -b or -s", __func__);
 		retval = 1;
 		goto bailout;
 	}
 
 	if ((humanize != 0)
 	 && (quiet != 0)) {
 		warnx("%s: you can only specify one of -h/-H or -q", __func__);
 		retval = 1;
 		goto bailout;
 	}
 
 	if ((humanize != 0)
 	 && (blocksizeonly != 0)) {
 		warnx("%s: you can only specify one of -h/-H or -b", __func__);
 		retval = 1;
 		goto bailout;
 	}
 
 	if (longonly != 0)
 		goto long_only;
 
 	scsi_read_capacity(&ccb->csio,
 			   /*retries*/ retry_count,
 			   /*cbfcnp*/ NULL,
 			   /*tag_action*/ task_attr,
 			   &rcap,
 			   SSD_FULL_SIZE,
 			   /*timeout*/ timeout ? timeout : 5000);
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (arglist & CAM_ARG_ERR_RECOVER)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (cam_send_ccb(device, ccb) < 0) {
 		warn("error sending READ CAPACITY command");
 
 		if (arglist & CAM_ARG_VERBOSE)
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 
 		retval = 1;
 		goto bailout;
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
 		retval = 1;
 		goto bailout;
 	}
 
 	maxsector = scsi_4btoul(rcap.addr);
 	block_len = scsi_4btoul(rcap.length);
 
 	/*
 	 * A last block of 2^32-1 means that the true capacity is over 2TB,
 	 * and we need to issue the long READ CAPACITY to get the real
 	 * capacity.  Otherwise, we're all set.
 	 */
 	if (maxsector != 0xffffffff)
 		goto do_print;
 
 long_only:
 	scsi_read_capacity_16(&ccb->csio,
 			      /*retries*/ retry_count,
 			      /*cbfcnp*/ NULL,
 			      /*tag_action*/ task_attr,
 			      /*lba*/ 0,
 			      /*reladdr*/ 0,
 			      /*pmi*/ 0,
 			      /*rcap_buf*/ (uint8_t *)&rcaplong,
 			      /*rcap_buf_len*/ sizeof(rcaplong),
 			      /*sense_len*/ SSD_FULL_SIZE,
 			      /*timeout*/ timeout ? timeout : 5000);
 
 	/* Disable freezing the device queue */
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (arglist & CAM_ARG_ERR_RECOVER)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (cam_send_ccb(device, ccb) < 0) {
 		warn("error sending READ CAPACITY (16) command");
 
 		if (arglist & CAM_ARG_VERBOSE)
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 
 		retval = 1;
 		goto bailout;
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		cam_error_print(device, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
 		retval = 1;
 		goto bailout;
 	}
 
 	maxsector = scsi_8btou64(rcaplong.addr);
 	block_len = scsi_4btoul(rcaplong.length);
 
 do_print:
 	if (blocksizeonly == 0) {
 		/*
 		 * Humanize implies !quiet, and also implies numblocks.
 		 */
 		if (humanize != 0) {
 			char tmpstr[6];
 			int64_t tmpbytes;
 			int ret;
 
 			tmpbytes = (maxsector + 1) * block_len;
 			ret = humanize_number(tmpstr, sizeof(tmpstr),
 					      tmpbytes, "", HN_AUTOSCALE,
 					      HN_B | HN_DECIMAL |
 					      ((baseten != 0) ?
 					      HN_DIVISOR_1000 : 0));
 			if (ret == -1) {
 				warnx("%s: humanize_number failed!", __func__);
 				retval = 1;
 				goto bailout;
 			}
 			fprintf(stdout, "Device Size: %s%s", tmpstr,
 				(sizeonly == 0) ?  ", " : "\n");
 		} else if (numblocks != 0) {
 			fprintf(stdout, "%s%ju%s", (quiet == 0) ?
 				"Blocks: " : "", (uintmax_t)maxsector + 1,
 				(sizeonly == 0) ? ", " : "\n");
 		} else {
 			fprintf(stdout, "%s%ju%s", (quiet == 0) ?
 				"Last Block: " : "", (uintmax_t)maxsector,
 				(sizeonly == 0) ? ", " : "\n");
 		}
 	}
 	if (sizeonly == 0)
 		fprintf(stdout, "%s%u%s\n", (quiet == 0) ?
 			"Block Length: " : "", block_len, (quiet == 0) ?
 			" bytes" : "");
 bailout:
 	cam_freeccb(ccb);
 
 	return (retval);
 }
 
 /*
  * Send a user-supplied SMP request to the device and show the response.
  *
  * Options parsed from argv via combinedopt:
  *   -r len fmt [args]   request of len bytes, encoded from fmt and its
  *                       arguments, or read from stdin when fmt is "-"
  *   -R len fmt [args]   response buffer of len bytes, decoded with fmt,
  *                       or written raw to stdout when fmt is "-"
  *
  * Both -r and -R are required.
  *
  * NOTE(review): -r and -R share the single datastr variable, so the
  * format used to decode the response is whichever option was parsed
  * last -- confirm that -R is always expected to follow -r.
  *
  * Returns 0 on success and 1 on any error, including a failed command.
  */
 static int
 smpcmd(struct cam_device *device, int argc, char **argv, char *combinedopt,
        int retry_count, int timeout)
 {
 	int c, error = 0;
 	union ccb *ccb;
 	uint8_t *smp_request = NULL, *smp_response = NULL;
 	int request_size = 0, response_size = 0;
 	int fd_request = 0, fd_response = 0;
 	char *datastr = NULL;
 	struct get_hook hook;
 	int retval;
 	int flags = 0;
 
 	/*
 	 * Note that at the moment we don't support sending SMP CCBs to
 	 * devices that aren't probed by CAM.
 	 */
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("%s: error allocating CCB", __func__);
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio);
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch (c) {
 		case 'R':
 			arglist |= CAM_ARG_CMD_IN;
 			response_size = strtol(optarg, NULL, 0);
 			if (response_size <= 0) {
 				warnx("invalid number of response bytes %d",
 				      response_size);
 				error = 1;
 				goto smpcmd_bailout;
 			}
 			hook.argc = argc - optind;
 			hook.argv = argv + optind;
 			hook.got = 0;
 			optind++;
 			datastr = cget(&hook, NULL);
 			/*
 			 * If the user supplied "-" instead of a format, he
 			 * wants the data to be written to stdout.
 			 */
 			if ((datastr != NULL)
 			 && (datastr[0] == '-'))
 				fd_response = 1;
 
 			/* Don't leak the buffer if -R is given twice. */
 			free(smp_response);
 			smp_response = malloc(response_size);
 			if (smp_response == NULL) {
 				warn("can't malloc memory for SMP response");
 				error = 1;
 				goto smpcmd_bailout;
 			}
 			break;
 		case 'r':
 			arglist |= CAM_ARG_CMD_OUT;
 			request_size = strtol(optarg, NULL, 0);
 			if (request_size <= 0) {
 				warnx("invalid number of request bytes %d",
 				      request_size);
 				error = 1;
 				goto smpcmd_bailout;
 			}
 			hook.argc = argc - optind;
 			hook.argv = argv + optind;
 			hook.got = 0;
 			datastr = cget(&hook, NULL);
 
 			/* Don't leak the buffer if -r is given twice. */
 			free(smp_request);
 			smp_request = calloc(1, request_size);
 			if (smp_request == NULL) {
 				warn("can't malloc memory for SMP request");
 				error = 1;
 				goto smpcmd_bailout;
 			}
 			/*
 			 * If the user supplied "-" instead of a format, he
 			 * wants the data to be read from stdin.
 			 */
 			if ((datastr != NULL)
 			 && (datastr[0] == '-'))
 				fd_request = 1;
 			else
 				buff_encode_visit(smp_request, request_size,
 						  datastr,
 						  iget, &hook);
 			optind += hook.got;
 			break;
 		default:
 			break;
 		}
 	}
 
 	/* Validate the arguments before consuming anything from stdin. */
 	if (((arglist & CAM_ARG_CMD_IN) == 0)
 	 || ((arglist & CAM_ARG_CMD_OUT) == 0)) {
 		warnx("%s: need both the request (-r) and response (-R) "
 		      "arguments", __func__);
 		error = 1;
 		goto smpcmd_bailout;
 	}
 
 	/*
 	 * If fd_request is set, and we're writing to the device, we need
 	 * to read the data the user wants written from stdin.
 	 */
 	if ((fd_request == 1) && (arglist & CAM_ARG_CMD_OUT)) {
 		ssize_t amt_read;
 		int amt_to_read = request_size;
 		uint8_t *buf_ptr = smp_request;
 
 		while (amt_to_read > 0) {
 			amt_read = read(STDIN_FILENO, buf_ptr, amt_to_read);
 			if (amt_read == -1) {
 				warn("error reading data from stdin");
 				error = 1;
 				goto smpcmd_bailout;
 			}
 			/*
 			 * read() returns 0 at end of file.  Without this
 			 * check the loop would never terminate on short
 			 * input.
 			 */
 			if (amt_read == 0) {
 				warnx("unexpected end of input: read %d of "
 				      "%d request bytes",
 				      request_size - amt_to_read,
 				      request_size);
 				error = 1;
 				goto smpcmd_bailout;
 			}
 			amt_to_read -= amt_read;
 			buf_ptr += amt_read;
 		}
 	}
 
 	flags |= CAM_DEV_QFRZDIS;
 
 	cam_fill_smpio(&ccb->smpio,
 		       /*retries*/ retry_count,
 		       /*cbfcnp*/ NULL,
 		       /*flags*/ flags,
 		       /*smp_request*/ smp_request,
 		       /*smp_request_len*/ request_size,
 		       /*smp_response*/ smp_response,
 		       /*smp_response_len*/ response_size,
 		       /*timeout*/ timeout ? timeout : 5000);
 
 	ccb->smpio.flags = SMP_FLAG_NONE;
 
 	if (((retval = cam_send_ccb(device, ccb)) < 0)
 	 || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) {
 		const char warnstr[] = "error sending command";
 
 		/* errno is only meaningful when the send itself failed. */
 		if (retval < 0)
 			warn("%s", warnstr);
 		else
 			warnx("%s", warnstr);
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 		/* A failed command must not be reported as success. */
 		error = 1;
 		goto smpcmd_bailout;
 	}
 
 	if (response_size > 0) {
 		if (fd_response == 0) {
 			buff_decode_visit(smp_response, response_size,
 					  datastr, arg_put, NULL);
 			fprintf(stdout, "\n");
 		} else {
 			ssize_t amt_written;
 			int amt_to_write = response_size;
 			uint8_t *buf_ptr = smp_response;
 
 			/* Keep writing until done, error, or no progress. */
 			for (amt_written = 0; (amt_to_write > 0) &&
 			     (amt_written = write(STDOUT_FILENO, buf_ptr,
 						  amt_to_write)) > 0;){
 				amt_to_write -= amt_written;
 				buf_ptr += amt_written;
 			}
 			if (amt_written == -1) {
 				warn("error writing data to stdout");
 				error = 1;
 				goto smpcmd_bailout;
 			} else if ((amt_written == 0)
 				&& (amt_to_write > 0)) {
 				warnx("only wrote %d bytes out of %d",
 				      response_size - amt_to_write,
 				      response_size);
 			}
 		}
 	}
 smpcmd_bailout:
 	cam_freeccb(ccb);
 
 	free(smp_request);
 	free(smp_response);
 
 	return (error);
 }
 
 static int
 mmcsdcmd(struct cam_device *device, int argc, char **argv, char *combinedopt,
        int retry_count, int timeout)
 {
 	int c, error = 0;
 	union ccb *ccb;
 	int32_t mmc_opcode = 0, mmc_arg = 0;
 	int32_t mmc_flags = -1;
 	int retval;
 	int is_write = 0;
 	int is_bw_4 = 0, is_bw_1 = 0;
 	int is_highspeed = 0, is_stdspeed = 0;
 	int is_info_request = 0;
 	int flags = 0;
 	uint8_t mmc_data_byte = 0;
 
 	/* For IO_RW_EXTENDED command */
 	uint8_t *mmc_data = NULL;
 	struct mmc_data mmc_d;
 	int mmc_data_len = 0;
 
 	/*
 	 * Note that at the moment we don't support sending SMP CCBs to
 	 * devices that aren't probed by CAM.
 	 */
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("%s: error allocating CCB", __func__);
 		return (1);
 	}
 
 	bzero(&(&ccb->ccb_h)[1],
 	      sizeof(union ccb) - sizeof(struct ccb_hdr));
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch (c) {
 		case '4':
 			is_bw_4 = 1;
 			break;
 		case '1':
 			is_bw_1 = 1;
 			break;
 		case 'S':
 			if (!strcmp(optarg, "high"))
 				is_highspeed = 1;
 			else
 				is_stdspeed = 1;
 			break;
 		case 'I':
 			is_info_request = 1;
 			break;
 		case 'c':
 			mmc_opcode = strtol(optarg, NULL, 0);
 			if (mmc_opcode < 0) {
 				warnx("invalid MMC opcode %d",
 				      mmc_opcode);
 				error = 1;
 				goto mmccmd_bailout;
 			}
 			break;
 		case 'a':
 			mmc_arg = strtol(optarg, NULL, 0);
 			if (mmc_arg < 0) {
 				warnx("invalid MMC arg %d",
 				      mmc_arg);
 				error = 1;
 				goto mmccmd_bailout;
 			}
 			break;
 		case 'f':
 			mmc_flags = strtol(optarg, NULL, 0);
 			if (mmc_flags < 0) {
 				warnx("invalid MMC flags %d",
 				      mmc_flags);
 				error = 1;
 				goto mmccmd_bailout;
 			}
 			break;
 		case 'l':
 			mmc_data_len = strtol(optarg, NULL, 0);
 			if (mmc_data_len <= 0) {
 				warnx("invalid MMC data len %d",
 				      mmc_data_len);
 				error = 1;
 				goto mmccmd_bailout;
 			}
 			break;
 		case 'W':
 			is_write = 1;
 			break;
 		case 'b':
 			mmc_data_byte = strtol(optarg, NULL, 0);
 			break;
 		default:
 			break;
 		}
 	}
 	flags |= CAM_DEV_QFRZDIS; /* masks are broken?! */
 
 	/* If flags are left default, supply the right flags */
 	if (mmc_flags < 0)
 		switch (mmc_opcode) {
 		case MMC_GO_IDLE_STATE:
 			mmc_flags = MMC_RSP_NONE | MMC_CMD_BC;
 			break;
 		case IO_SEND_OP_COND:
 			mmc_flags = MMC_RSP_R4;
 			break;
 		case SD_SEND_RELATIVE_ADDR:
 			mmc_flags = MMC_RSP_R6 | MMC_CMD_BCR;
 			break;
 		case MMC_SELECT_CARD:
 			mmc_flags = MMC_RSP_R1B | MMC_CMD_AC;
 			mmc_arg = mmc_arg << 16;
 			break;
 		case SD_IO_RW_DIRECT:
 			mmc_flags = MMC_RSP_R5 | MMC_CMD_AC;
 			mmc_arg = SD_IO_RW_ADR(mmc_arg);
 			if (is_write)
 				mmc_arg |= SD_IO_RW_WR | SD_IO_RW_RAW | SD_IO_RW_DAT(mmc_data_byte);
 			break;
 		case SD_IO_RW_EXTENDED:
 			mmc_flags = MMC_RSP_R5 | MMC_CMD_ADTC;
 			mmc_arg = SD_IO_RW_ADR(mmc_arg);
 			int len_arg = mmc_data_len;
 			if (mmc_data_len == 512)
 				len_arg = 0;
 
 			// Byte mode
 			mmc_arg |= SD_IOE_RW_LEN(len_arg) | SD_IO_RW_INCR;
 			// Block mode
 //                        mmc_arg |= SD_IOE_RW_BLK | SD_IOE_RW_LEN(len_arg) | SD_IO_RW_INCR;
 			break;
 		default:
 			mmc_flags = MMC_RSP_R1;
 			break;
 		}
 
 	// Switch bus width instead of sending IO command
 	if (is_bw_4 || is_bw_1) {
 		struct ccb_trans_settings_mmc *cts;
 		ccb->ccb_h.func_code = XPT_SET_TRAN_SETTINGS;
 		ccb->ccb_h.flags = 0;
 		cts = &ccb->cts.proto_specific.mmc;
 		cts->ios.bus_width = is_bw_4 == 1 ? bus_width_4 : bus_width_1;
 		cts->ios_valid = MMC_BW;
 		if (((retval = cam_send_ccb(device, ccb)) < 0)
 		    || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) {
 			warn("Error sending command");
 		} else {
 			printf("Parameters set OK\n");
 		}
 		cam_freeccb(ccb);
 		return (retval);
 	}
 
 	// Switch bus speed instead of sending IO command
 	if (is_stdspeed || is_highspeed) {
 		struct ccb_trans_settings_mmc *cts;
 		ccb->ccb_h.func_code = XPT_SET_TRAN_SETTINGS;
 		ccb->ccb_h.flags = 0;
 		cts = &ccb->cts.proto_specific.mmc;
 		cts->ios.timing = is_highspeed == 1 ? bus_timing_hs : bus_timing_normal;
 		cts->ios_valid = MMC_BT;
 		if (((retval = cam_send_ccb(device, ccb)) < 0)
 		    || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) {
 			warn("Error sending command");
 		} else {
 			printf("Speed set OK (HS: %d)\n", is_highspeed);
 		}
 		cam_freeccb(ccb);
 		return (retval);
 	}
 
 	// Get information about controller and its settings
 	if (is_info_request) {
 		ccb->ccb_h.func_code = XPT_GET_TRAN_SETTINGS;
 		ccb->ccb_h.flags = 0;
 		struct ccb_trans_settings_mmc *cts;
 		cts = &ccb->cts.proto_specific.mmc;
 		if (((retval = cam_send_ccb(device, ccb)) < 0)
 		    || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) {
 			warn("Error sending command");
 			return (retval);
 		}
 		printf("Host controller information\n");
 		printf("Host OCR: 0x%x\n", cts->host_ocr);
 		printf("Min frequency: %u KHz\n", cts->host_f_min / 1000);
 		printf("Max frequency: %u MHz\n", cts->host_f_max / 1000000);
 		printf("Supported bus width: ");
 		if (cts->host_caps & MMC_CAP_4_BIT_DATA)
 			printf(" 4 bit\n");
 		if (cts->host_caps & MMC_CAP_8_BIT_DATA)
 			printf(" 8 bit\n");
 		printf("\nCurrent settings:\n");
 		printf("Bus width: ");
 		switch (cts->ios.bus_width) {
 		case bus_width_1:
 			printf("1 bit\n");
 			break;
 		case bus_width_4:
 			printf("4 bit\n");
 			break;
 		case bus_width_8:
 			printf("8 bit\n");
 			break;
 		}
 		printf("Freq: %d.%03d MHz%s\n",
 		       cts->ios.clock / 1000000,
 		       (cts->ios.clock / 1000) % 1000,
 		       cts->ios.timing == bus_timing_hs ? "(high-speed timing)" : "");
 		return (0);
 	}
 
 	printf("CMD %d arg %d flags %02x\n", mmc_opcode, mmc_arg, mmc_flags);
 
 	if (mmc_data_len > 0) {
 		flags |= CAM_DIR_IN;
 		mmc_data = malloc(mmc_data_len);
 		memset(mmc_data, 0, mmc_data_len);
 		memset(&mmc_d, 0, sizeof(mmc_d));
 		mmc_d.len = mmc_data_len;
 		mmc_d.data = mmc_data;
 		mmc_d.flags = MMC_DATA_READ;
 	} else flags |= CAM_DIR_NONE;
 
 	cam_fill_mmcio(&ccb->mmcio,
 		       /*retries*/ retry_count,
 		       /*cbfcnp*/ NULL,
 		       /*flags*/ flags,
 		       /*mmc_opcode*/ mmc_opcode,
 		       /*mmc_arg*/ mmc_arg,
 		       /*mmc_flags*/ mmc_flags,
 		       /*mmc_data*/ mmc_data_len > 0 ? &mmc_d : NULL,
 		       /*timeout*/ timeout ? timeout : 5000);
 
 	if (((retval = cam_send_ccb(device, ccb)) < 0)
 	 || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) {
 		const char warnstr[] = "error sending command";
 
 		if (retval < 0)
 			warn(warnstr);
 		else
 			warnx(warnstr);
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 	}
 
 	if (((ccb->ccb_h.status & CAM_STATUS_MASK) == CAM_REQ_CMP)) {
 		printf("MMCIO: error %d, %08x %08x %08x %08x\n",
 		       ccb->mmcio.cmd.error, ccb->mmcio.cmd.resp[0],
 		       ccb->mmcio.cmd.resp[1],
 		       ccb->mmcio.cmd.resp[2],
 		       ccb->mmcio.cmd.resp[3]);
 
 		switch (mmc_opcode) {
 		case SD_IO_RW_DIRECT:
 			printf("IO_RW_DIRECT: resp byte %02x, cur state %d\n",
 			       SD_R5_DATA(ccb->mmcio.cmd.resp),
 			       (ccb->mmcio.cmd.resp[0] >> 12) & 0x3);
 			break;
 		case SD_IO_RW_EXTENDED:
 			printf("IO_RW_EXTENDED: read %d bytes w/o error:\n", mmc_data_len);
 			hexdump(mmc_data, mmc_data_len, NULL, 0);
 			break;
 		case SD_SEND_RELATIVE_ADDR:
 			printf("SEND_RELATIVE_ADDR: published RCA %02x\n", ccb->mmcio.cmd.resp[0] >> 16);
 			break;
 		default:
 			printf("No command-specific decoder for CMD %d\n", mmc_opcode);
 		}
 	}
 mmccmd_bailout:
 	if (ccb != NULL)
 		cam_freeccb(ccb);
 
 	if (mmc_data_len > 0 && mmc_data != NULL)
 		free(mmc_data);
 
 	return (error);
 }
 
 /*
  * Send an SMP REPORT GENERAL request to the device and print the decoded
  * response.
  *
  * Options parsed from argv via combinedopt:
  *   -l   request the long response format
  *
  * When -l was not given and the response has SMP_RG_LONG_RESPONSE set,
  * the request is reissued once with the long response format enabled.
  *
  * Returns 0 on success and 1 on any error, including failure to build
  * the output buffer.
  */
 static int
 smpreportgeneral(struct cam_device *device, int argc, char **argv,
 		 char *combinedopt, int retry_count, int timeout)
 {
 	union ccb *ccb;
 	struct smp_report_general_request *request = NULL;
 	struct smp_report_general_response *response = NULL;
 	struct sbuf *sb = NULL;
 	int error = 0;
 	int c, long_response = 0;
 	int retval;
 
 	/*
 	 * Note that at the moment we don't support sending SMP CCBs to
 	 * devices that aren't probed by CAM.
 	 */
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("%s: error allocating CCB", __func__);
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio);
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch (c) {
 		case 'l':
 			long_response = 1;
 			break;
 		default:
 			break;
 		}
 	}
 	request = malloc(sizeof(*request));
 	if (request == NULL) {
 		warn("%s: unable to allocate %zu bytes", __func__,
 		     sizeof(*request));
 		error = 1;
 		goto bailout;
 	}
 
 	response = malloc(sizeof(*response));
 	if (response == NULL) {
 		warn("%s: unable to allocate %zu bytes", __func__,
 		     sizeof(*response));
 		error = 1;
 		goto bailout;
 	}
 
 try_long:
 	smp_report_general(&ccb->smpio,
 			   retry_count,
 			   /*cbfcnp*/ NULL,
 			   request,
 			   /*request_len*/ sizeof(*request),
 			   (uint8_t *)response,
 			   /*response_len*/ sizeof(*response),
 			   /*long_response*/ long_response,
 			   timeout);
 
 	if (((retval = cam_send_ccb(device, ccb)) < 0)
 	 || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) {
 		const char warnstr[] = "error sending command";
 
 		/* errno is only meaningful when the send itself failed. */
 		if (retval < 0)
 			warn("%s", warnstr);
 		else
 			warnx("%s", warnstr);
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 		error = 1;
 		goto bailout;
 	}
 
 	/*
 	 * If the device supports the long response bit, try again and see
 	 * if we can get all of the data.
 	 */
 	if ((response->long_response & SMP_RG_LONG_RESPONSE)
 	 && (long_response == 0)) {
 		/* Reset the CCB so it can be reused for the second pass. */
 		ccb->ccb_h.status = CAM_REQ_INPROG;
 		CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio);
 		long_response = 1;
 		goto try_long;
 	}
 
 	/*
 	 * XXX KDM detect and decode SMP errors here.
 	 */
 	sb = sbuf_new_auto();
 	if (sb == NULL) {
 		warnx("%s: error allocating sbuf", __func__);
 		error = 1;
 		goto bailout;
 	}
 
 	smp_report_general_sbuf(response, sizeof(*response), sb);
 
 	if (sbuf_finish(sb) != 0) {
 		warnx("%s: sbuf_finish", __func__);
 		error = 1;
 		goto bailout;
 	}
 
 	printf("%s", sbuf_data(sb));
 
 bailout:
 	cam_freeccb(ccb);
 
 	free(request);
 	free(response);
 
 	if (sb != NULL)
 		sbuf_delete(sb);
 
 	return (error);
 }
 
 /*
  * Table pairing PHY operation names with their SMP_PC_PHY_OP_* codes.
  * Every entry uses CAM_ARG_NONE and a NULL final field; the list ends
  * with an all-NULL/zero sentinel entry.
  *
  * NOTE(review): presumably used to translate a user-supplied operation
  * name into the phy_operation value in smpphycontrol() -- confirm
  * against the lookup code.
  */
 static struct camcontrol_opts phy_ops[] = {
 	{"nop", SMP_PC_PHY_OP_NOP, CAM_ARG_NONE, NULL},
 	{"linkreset", SMP_PC_PHY_OP_LINK_RESET, CAM_ARG_NONE, NULL},
 	{"hardreset", SMP_PC_PHY_OP_HARD_RESET, CAM_ARG_NONE, NULL},
 	{"disable", SMP_PC_PHY_OP_DISABLE, CAM_ARG_NONE, NULL},
 	{"clearerrlog", SMP_PC_PHY_OP_CLEAR_ERR_LOG, CAM_ARG_NONE, NULL},
 	{"clearaffiliation", SMP_PC_PHY_OP_CLEAR_AFFILIATON, CAM_ARG_NONE,NULL},
 	{"sataportsel", SMP_PC_PHY_OP_TRANS_SATA_PSS, CAM_ARG_NONE, NULL},
 	{"clearitnl", SMP_PC_PHY_OP_CLEAR_STP_ITN_LS, CAM_ARG_NONE, NULL},
 	{"setdevname", SMP_PC_PHY_OP_SET_ATT_DEV_NAME, CAM_ARG_NONE, NULL},
 	{NULL, 0, 0, NULL}
 };
 
 /*
  * Build and send an SMP PHY CONTROL request to the given device.
  *
  * Options are taken from argc/argv via getopt(3) using combinedopt:
  *   -p phy     phy number (required)
  *   -o op      phy operation, either a name from phy_ops or a number
  *              in the range 1-0xff; may only be given once
  *   -d name    attached device name; must be paired with "-o setdevname"
  *   -m rate    minimum link rate, 1-0xf
  *   -M rate    maximum link rate, 1-0xf
  *   -T val     partial pathway timeout value, 0-15
  *   -s/-S      SAS slumber / SAS partial field, "enable" or "disable"
  *   -a/-A      SATA slumber / SATA partial field, "enable" or "disable"
  *   -l         request a long response
  * Unrecognized option letters are ignored.
  *
  * Returns 0 on success and 1 on any failure; a diagnostic is printed with
  * warn(3)/warnx(3) in the failure case.
  */
 static int
 smpphycontrol(struct cam_device *device, int argc, char **argv,
 	      char *combinedopt, int retry_count, int timeout)
 {
 	union ccb *ccb;
 	struct smp_phy_control_request *request = NULL;
 	struct smp_phy_control_response *response = NULL;
 	int long_response = 0;
 	int retval = 0;
 	int phy = -1;
 	uint32_t phy_operation = SMP_PC_PHY_OP_NOP;
 	int phy_op_set = 0;
 	uint64_t attached_dev_name = 0;
 	int dev_name_set = 0;
 	uint32_t min_plr = 0, max_plr = 0;
 	uint32_t pp_timeout_val = 0;
 	int slumber_partial = 0;
 	int set_pp_timeout_val = 0;
 	int c;
 
 	/*
 	 * Note that at the moment we don't support sending SMP CCBs to
 	 * devices that aren't probed by CAM.
 	 */
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("%s: error allocating CCB", __func__);
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio);
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch (c) {
 		case 'a':
 		case 'A':
 		case 's':
 		case 'S': {
 			int enable = -1;
 
 			/* The field value is 1 for enable and 2 for disable. */
 			if (strcasecmp(optarg, "enable") == 0)
 				enable = 1;
 			else if (strcasecmp(optarg, "disable") == 0)
 				enable = 2;
 			else {
 				warnx("%s: Invalid argument %s", __func__,
 				      optarg);
 				retval = 1;
 				goto bailout;
 			}
 			switch (c) {
 			case 's':
 				slumber_partial |= enable <<
 						   SMP_PC_SAS_SLUMBER_SHIFT;
 				break;
 			case 'S':
 				slumber_partial |= enable <<
 						   SMP_PC_SAS_PARTIAL_SHIFT;
 				break;
 			case 'a':
 				slumber_partial |= enable <<
 						   SMP_PC_SATA_SLUMBER_SHIFT;
 				break;
 			case 'A':
 				slumber_partial |= enable <<
 						   SMP_PC_SATA_PARTIAL_SHIFT;
 				break;
 			default:
 				warnx("%s: programmer error", __func__);
 				retval = 1;
 				goto bailout;
 				break; /*NOTREACHED*/
 			}
 			break;
 		}
 		case 'd':
 			attached_dev_name = (uintmax_t)strtoumax(optarg,
 								 NULL,0);
 			dev_name_set = 1;
 			break;
 		case 'l':
 			long_response = 1;
 			break;
 		case 'm':
 			/*
 			 * We don't do extensive checking here, so this
 			 * will continue to work when new speeds come out.
 			 */
 			min_plr = strtoul(optarg, NULL, 0);
 			if ((min_plr == 0)
 			 || (min_plr > 0xf)) {
 				warnx("%s: invalid link rate %x",
 				      __func__, min_plr);
 				retval = 1;
 				goto bailout;
 			}
 			break;
 		case 'M':
 			/*
 			 * We don't do extensive checking here, so this
 			 * will continue to work when new speeds come out.
 			 */
 			max_plr = strtoul(optarg, NULL, 0);
 			if ((max_plr == 0)
 			 || (max_plr > 0xf)) {
 				warnx("%s: invalid link rate %x",
 				      __func__, max_plr);
 				retval = 1;
 				goto bailout;
 			}
 			break;
 		case 'o': {
 			camcontrol_optret optreturn;
 			cam_argmask argnums;
 			const char *subopt;
 
 			if (phy_op_set != 0) {
 				warnx("%s: only one phy operation argument "
 				      "(-o) allowed", __func__);
 				retval = 1;
 				goto bailout;
 			}
 
 			phy_op_set = 1;
 
 			/*
 			 * Allow the user to specify the phy operation
 			 * numerically, as well as with a name.  This will
 			 * future-proof it a bit, so options that are added
 			 * in future specs can be used.
 			 *
 			 * The cast keeps isdigit() defined for bytes with
 			 * the high bit set when char is signed.
 			 */
 			if (isdigit((unsigned char)optarg[0])) {
 				phy_operation = strtoul(optarg, NULL, 0);
 				if ((phy_operation == 0)
 				 || (phy_operation > 0xff)) {
 					warnx("%s: invalid phy operation %#x",
 					      __func__, phy_operation);
 					retval = 1;
 					goto bailout;
 				}
 				break;
 			}
 			optreturn = getoption(phy_ops, optarg, &phy_operation,
 					      &argnums, &subopt);
 
 			if (optreturn == CC_OR_AMBIGUOUS) {
 				warnx("%s: ambiguous option %s", __func__,
 				      optarg);
 				usage(0);
 				retval = 1;
 				goto bailout;
 			} else if (optreturn == CC_OR_NOT_FOUND) {
 				warnx("%s: option %s not found", __func__,
 				      optarg);
 				usage(0);
 				retval = 1;
 				goto bailout;
 			}
 			break;
 		}
 		case 'p':
 			phy = atoi(optarg);
 			break;
 		case 'T':
 			pp_timeout_val = strtoul(optarg, NULL, 0);
 			if (pp_timeout_val > 15) {
 				warnx("%s: invalid partial pathway timeout "
 				      "value %u, need a value less than 16",
 				      __func__, pp_timeout_val);
 				retval = 1;
 				goto bailout;
 			}
 			set_pp_timeout_val = 1;
 			break;
 		default:
 			break;
 		}
 	}
 
 	if (phy == -1) {
 		warnx("%s: a PHY (-p phy) argument is required",__func__);
 		retval = 1;
 		goto bailout;
 	}
 
 	/* -d and "-o setdevname" are only meaningful together. */
 	if (((dev_name_set != 0)
 	  && (phy_operation != SMP_PC_PHY_OP_SET_ATT_DEV_NAME))
 	 || ((phy_operation == SMP_PC_PHY_OP_SET_ATT_DEV_NAME)
 	  && (dev_name_set == 0))) {
 		warnx("%s: -d name and -o setdevname arguments both "
 		      "required to set device name", __func__);
 		retval = 1;
 		goto bailout;
 	}
 
 	request = malloc(sizeof(*request));
 	if (request == NULL) {
 		warn("%s: unable to allocate %zu bytes", __func__,
 		     sizeof(*request));
 		retval = 1;
 		goto bailout;
 	}
 
 	response = malloc(sizeof(*response));
 	if (response == NULL) {
 		warn("%s: unable to allocate %zu bytes", __func__,
 		     sizeof(*response));
 		retval = 1;
 		goto bailout;
 	}
 
 	smp_phy_control(&ccb->smpio,
 			retry_count,
 			/*cbfcnp*/ NULL,
 			request,
 			sizeof(*request),
 			(uint8_t *)response,
 			sizeof(*response),
 			long_response,
 			/*expected_exp_change_count*/ 0,
 			phy,
 			phy_operation,
 			(set_pp_timeout_val != 0) ? 1 : 0,
 			attached_dev_name,
 			min_plr,
 			max_plr,
 			slumber_partial,
 			pp_timeout_val,
 			timeout);
 
 	if (((retval = cam_send_ccb(device, ccb)) < 0)
 	 || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) {
 		const char warnstr[] = "error sending command";
 
 		/* Only the send failure path has a meaningful errno. */
 		if (retval < 0)
 			warn(warnstr);
 		else
 			warnx(warnstr);
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			/*
 			 * Use CAM_EPF_NORMAL so we only get one line of
 			 * SMP command decoding.
 			 */
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_NORMAL, stderr);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	/* XXX KDM print out something here for success? */
 bailout:
 	if (ccb != NULL)
 		cam_freeccb(ccb);
 
 	if (request != NULL)
 		free(request);
 
 	if (response != NULL)
 		free(response);
 
 	return (retval);
 }
 
 /*
  * Send an SMP REPORT MANUFACTURER INFORMATION request to the given device
  * and print the decoded response on stdout.
  *
  * The only option examined is -l, which requests a long response; other
  * option letters returned by getopt(3) are ignored.
  *
  * Returns 0 on success and 1 on any failure, including failure to
  * allocate or finish the sbuf used to format the response.
  */
 static int
 smpmaninfo(struct cam_device *device, int argc, char **argv,
 	   char *combinedopt, int retry_count, int timeout)
 {
 	union ccb *ccb;
 	struct smp_report_manuf_info_request request;
 	struct smp_report_manuf_info_response response;
 	struct sbuf *sb = NULL;
 	int long_response = 0;
 	int retval = 0;
 	int c;
 
 	/*
 	 * Note that at the moment we don't support sending SMP CCBs to
 	 * devices that aren't probed by CAM.
 	 */
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("%s: error allocating CCB", __func__);
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio);
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch (c) {
 		case 'l':
 			long_response = 1;
 			break;
 		default:
 			break;
 		}
 	}
 	bzero(&request, sizeof(request));
 	bzero(&response, sizeof(response));
 
 	smp_report_manuf_info(&ccb->smpio,
 			      retry_count,
 			      /*cbfcnp*/ NULL,
 			      &request,
 			      sizeof(request),
 			      (uint8_t *)&response,
 			      sizeof(response),
 			      long_response,
 			      timeout);
 
 	if (((retval = cam_send_ccb(device, ccb)) < 0)
 	 || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) {
 		const char warnstr[] = "error sending command";
 
 		/* Only the send failure path has a meaningful errno. */
 		if (retval < 0)
 			warn(warnstr);
 		else
 			warnx(warnstr);
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	/*
 	 * retval is 0 at this point, so every failure below has to set it
 	 * explicitly or the caller would see success.
 	 */
 	sb = sbuf_new_auto();
 	if (sb == NULL) {
 		warnx("%s: error allocating sbuf", __func__);
 		retval = 1;
 		goto bailout;
 	}
 
 	smp_report_manuf_info_sbuf(&response, sizeof(response), sb);
 
 	if (sbuf_finish(sb) != 0) {
 		warnx("%s: sbuf_finish", __func__);
 		retval = 1;
 		goto bailout;
 	}
 
 	printf("%s", sbuf_data(sb));
 
 bailout:
 
 	if (ccb != NULL)
 		cam_freeccb(ccb);
 
 	if (sb != NULL)
 		sbuf_delete(sb);
 
 	return (retval);
 }
 
 /*
  * Fetch the SCSI device ID (CDAI_TYPE_SCSI_DEVID) for the device described
  * by item->dev_match and store it in item->device_id / item->device_id_len.
  *
  * The XPT_DEV_ADVINFO CCB is issued twice: once with a zero-length buffer
  * to learn the size the device provides, and once more with a buffer of
  * that size to collect the data.
  *
  * Returns 0 on success, 1 on failure.  A buffer allocated here is left in
  * item->device_id even when a later step fails.
  */
 static int
 getdevid(struct cam_devitem *item)
 {
 	struct cam_device *cam_dev;
 	union ccb *req = NULL;
 	int status = 0;
 
 	cam_dev = cam_open_btl(item->dev_match.path_id,
 			       item->dev_match.target_id,
 			       item->dev_match.target_lun, O_RDWR, NULL);
 	if (cam_dev == NULL) {
 		/* Nothing has been acquired yet, so there is nothing to undo. */
 		warnx("%s", cam_errbuf);
 		return (1);
 	}
 
 	item->device_id_len = 0;
 
 	req = cam_getccb(cam_dev);
 	if (req == NULL) {
 		warnx("%s: error allocating CCB", __func__);
 		status = 1;
 		goto done;
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&req->cdai);
 
 	/*
 	 * First pass: probe for the size of the data.  Second pass: fetch
 	 * the data into a buffer of that size.
 	 */
 	for (;;) {
 		req->ccb_h.func_code = XPT_DEV_ADVINFO;
 		req->ccb_h.flags = CAM_DIR_IN;
 		req->cdai.flags = CDAI_FLAG_NONE;
 		req->cdai.buftype = CDAI_TYPE_SCSI_DEVID;
 		req->cdai.bufsiz = item->device_id_len;
 		if (item->device_id_len != 0)
 			req->cdai.buf = (uint8_t *)item->device_id;
 
 		if (cam_send_ccb(cam_dev, req) < 0) {
 			warn("%s: error sending XPT_GDEV_ADVINFO CCB", __func__);
 			status = 1;
 			break;
 		}
 
 		if (req->ccb_h.status != CAM_REQ_CMP) {
 			warnx("%s: CAM status %#x", __func__, req->ccb_h.status);
 			status = 1;
 			break;
 		}
 
 		/* A non-zero length means the data pass just completed. */
 		if (item->device_id_len != 0)
 			break;
 
 		/* Size probe finished: allocate and go around again. */
 		if (req->cdai.provsiz == 0) {
 			warnx("%s: invalid .provsiz field returned with "
 			     "XPT_GDEV_ADVINFO CCB", __func__);
 			status = 1;
 			break;
 		}
 		item->device_id_len = req->cdai.provsiz;
 		item->device_id = malloc(item->device_id_len);
 		if (item->device_id == NULL) {
 			warn("%s: unable to allocate %d bytes", __func__,
 			     item->device_id_len);
 			status = 1;
 			break;
 		}
 		req->ccb_h.status = CAM_REQ_INPROG;
 	}
 
 done:
 	cam_close_device(cam_dev);
 
 	if (req != NULL)
 		cam_freeccb(req);
 
 	return (status);
 }
 
 /*
  * XXX KDM merge this code with getdevtree()?
  */
 /*
  * Populate devlist->dev_queue with one cam_devitem per configured device
  * found on the bus devlist->path_id, together with the peripheral matches
  * that follow each device in the XPT_DEV_MATCH results and the device ID
  * obtained through getdevid().
  *
  * Returns 0 on success and 1 on failure.  On failure any items already
  * queued are released with freebusdevlist().
  */
 static int
 buildbusdevlist(struct cam_devlist *devlist)
 {
 	union ccb ccb;
 	int bufsize, fd = -1;
 	struct dev_match_pattern *patterns = NULL;
 	struct cam_devitem *item = NULL;
 	int skip_device = 0;
 	int retval = 0;
 
 	if ((fd = open(XPT_DEVICE, O_RDWR)) == -1) {
 		warn("couldn't open %s", XPT_DEVICE);
 		return (1);
 	}
 
 	bzero(&ccb, sizeof(union ccb));
 
 	ccb.ccb_h.path_id = CAM_XPT_PATH_ID;
 	ccb.ccb_h.target_id = CAM_TARGET_WILDCARD;
 	ccb.ccb_h.target_lun = CAM_LUN_WILDCARD;
 
 	ccb.ccb_h.func_code = XPT_DEV_MATCH;
 	bufsize = sizeof(struct dev_match_result) * 100;
 	ccb.cdm.match_buf_len = bufsize;
 	ccb.cdm.matches = (struct dev_match_result *)malloc(bufsize);
 	if (ccb.cdm.matches == NULL) {
 		warnx("can't malloc memory for matches");
 		close(fd);
 		return (1);
 	}
 	ccb.cdm.num_matches = 0;
 	ccb.cdm.num_patterns = 2;
 	ccb.cdm.pattern_buf_len = sizeof(struct dev_match_pattern) *
 		ccb.cdm.num_patterns;
 
 	patterns = (struct dev_match_pattern *)malloc(ccb.cdm.pattern_buf_len);
 	if (patterns == NULL) {
 		warnx("can't malloc memory for patterns");
 		retval = 1;
 		goto bailout;
 	}
 
 	ccb.cdm.patterns = patterns;
 	bzero(patterns, ccb.cdm.pattern_buf_len);
 
 	/* Match both devices and peripherals on the requested path. */
 	patterns[0].type = DEV_MATCH_DEVICE;
 	patterns[0].pattern.device_pattern.flags = DEV_MATCH_PATH;
 	patterns[0].pattern.device_pattern.path_id = devlist->path_id;
 	patterns[1].type = DEV_MATCH_PERIPH;
 	patterns[1].pattern.periph_pattern.flags = PERIPH_MATCH_PATH;
 	patterns[1].pattern.periph_pattern.path_id = devlist->path_id;
 
 	/*
 	 * We do the ioctl multiple times if necessary, in case there are
 	 * more than 100 nodes in the EDT.
 	 */
 	do {
 		unsigned int i;
 
 		if (ioctl(fd, CAMIOCOMMAND, &ccb) == -1) {
 			warn("error sending CAMIOCOMMAND ioctl");
 			retval = 1;
 			goto bailout;
 		}
 
 		if ((ccb.ccb_h.status != CAM_REQ_CMP)
 		 || ((ccb.cdm.status != CAM_DEV_MATCH_LAST)
 		    && (ccb.cdm.status != CAM_DEV_MATCH_MORE))) {
 			warnx("got CAM error %#x, CDM error %d\n",
 			      ccb.ccb_h.status, ccb.cdm.status);
 			retval = 1;
 			goto bailout;
 		}
 
 		for (i = 0; i < ccb.cdm.num_matches; i++) {
 			switch (ccb.cdm.matches[i].type) {
 			case DEV_MATCH_DEVICE: {
 				struct device_match_result *dev_result;
 
 				dev_result =
 				     &ccb.cdm.matches[i].result.device_result;
 
 				/*
 				 * Unconfigured devices are not queued, and
 				 * the peripheral matches that follow them
 				 * are skipped as well.
 				 */
 				if (dev_result->flags &
 				    DEV_RESULT_UNCONFIGURED) {
 					skip_device = 1;
 					break;
 				} else
 					skip_device = 0;
 
 				item = malloc(sizeof(*item));
 				if (item == NULL) {
 					warn("%s: unable to allocate %zu bytes",
 					     __func__, sizeof(*item));
 					retval = 1;
 					goto bailout;
 				}
 				bzero(item, sizeof(*item));
 				bcopy(dev_result, &item->dev_match,
 				      sizeof(*dev_result));
 				STAILQ_INSERT_TAIL(&devlist->dev_queue, item,
 						   links);
 
 				if (getdevid(item) != 0) {
 					retval = 1;
 					goto bailout;
 				}
 				break;
 			}
 			case DEV_MATCH_PERIPH: {
 				struct periph_match_result *periph_result;
 				struct periph_match_result *new_matches;
 
 				periph_result =
 				      &ccb.cdm.matches[i].result.periph_result;
 
 				if (skip_device != 0)
 					break;
 
 				/*
 				 * A peripheral can only be attached to a
 				 * device we have already queued.
 				 */
 				if (item == NULL) {
 					warnx("%s: peripheral match without "
 					      "a preceding device match",
 					      __func__);
 					retval = 1;
 					goto bailout;
 				}
 
 				/*
 				 * Grow through a temporary so that a failed
 				 * realloc() leaves the existing array and
 				 * count intact for freebusdevlist().
 				 */
 				new_matches = realloc(
 					item->periph_matches,
 					(item->num_periphs + 1) *
 					sizeof(struct periph_match_result));
 				if (new_matches == NULL) {
 					warn("%s: error allocating periph "
 					     "list", __func__);
 					retval = 1;
 					goto bailout;
 				}
 				item->periph_matches = new_matches;
 				item->num_periphs++;
 				bcopy(periph_result, &item->periph_matches[
 				      item->num_periphs - 1],
 				      sizeof(*periph_result));
 				break;
 			}
 			default:
 				fprintf(stderr, "%s: unexpected match "
 					"type %d\n", __func__,
 					ccb.cdm.matches[i].type);
 				retval = 1;
 				goto bailout;
 				break; /*NOTREACHED*/
 			}
 		}
 	} while ((ccb.ccb_h.status == CAM_REQ_CMP)
 		&& (ccb.cdm.status == CAM_DEV_MATCH_MORE));
 bailout:
 
 	if (fd != -1)
 		close(fd);
 
 	free(patterns);
 
 	free(ccb.cdm.matches);
 
 	if (retval != 0)
 		freebusdevlist(devlist);
 
 	return (retval);
 }
 
 static void
 freebusdevlist(struct cam_devlist *devlist)
 {
 	struct cam_devitem *item, *item2;
 
 	STAILQ_FOREACH_SAFE(item, &devlist->dev_queue, links, item2) {
 		STAILQ_REMOVE(&devlist->dev_queue, item, cam_devitem,
 			      links);
 		free(item->device_id);
 		free(item->periph_matches);
 		free(item);
 	}
 }
 
 /*
  * Search devlist for the first item whose device ID contains a SAS target
  * descriptor (per scsi_devid_is_sas_target) equal to sasaddr.
  *
  * Returns the matching item, or NULL when no item matches.
  */
 static struct cam_devitem *
 findsasdevice(struct cam_devlist *devlist, uint64_t sasaddr)
 {
 	struct cam_devitem *candidate;
 	struct scsi_vpd_id_descriptor *desc;
 
 	STAILQ_FOREACH(candidate, &devlist->dev_queue, links) {
 		/*
 		 * XXX KDM look for LUN IDs as well?
 		 */
 		desc = scsi_get_devid(candidate->device_id,
 				      candidate->device_id_len,
 				      scsi_devid_is_sas_target);
 
 		/* Items without a SAS target descriptor cannot match. */
 		if (desc != NULL &&
 		    scsi_8btou64(desc->identifier) == sasaddr)
 			return (candidate);
 	}
 
 	return (NULL);
 }
 
 static int
 smpphylist(struct cam_device *device, int argc, char **argv,
 	   char *combinedopt, int retry_count, int timeout)
 {
 	struct smp_report_general_request *rgrequest = NULL;
 	struct smp_report_general_response *rgresponse = NULL;
 	struct smp_discover_request *disrequest = NULL;
 	struct smp_discover_response *disresponse = NULL;
 	struct cam_devlist devlist;
 	union ccb *ccb;
 	int long_response = 0;
 	int num_phys = 0;
 	int quiet = 0;
 	int retval;
 	int i, c;
 
 	/*
 	 * Note that at the moment we don't support sending SMP CCBs to
 	 * devices that aren't probed by CAM.
 	 */
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("%s: error allocating CCB", __func__);
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio);
 	STAILQ_INIT(&devlist.dev_queue);
 
 	rgrequest = malloc(sizeof(*rgrequest));
 	if (rgrequest == NULL) {
 		warn("%s: unable to allocate %zd bytes", __func__,
 		     sizeof(*rgrequest));
 		retval = 1;
 		goto bailout;
 	}
 
 	rgresponse = malloc(sizeof(*rgresponse));
 	if (rgresponse == NULL) {
 		warn("%s: unable to allocate %zd bytes", __func__,
 		     sizeof(*rgresponse));
 		retval = 1;
 		goto bailout;
 	}
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch (c) {
 		case 'l':
 			long_response = 1;
 			break;
 		case 'q':
 			quiet = 1;
 			break;
 		default:
 			break;
 		}
 	}
 
 	smp_report_general(&ccb->smpio,
 			   retry_count,
 			   /*cbfcnp*/ NULL,
 			   rgrequest,
 			   /*request_len*/ sizeof(*rgrequest),
 			   (uint8_t *)rgresponse,
 			   /*response_len*/ sizeof(*rgresponse),
 			   /*long_response*/ long_response,
 			   timeout);
 
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (((retval = cam_send_ccb(device, ccb)) < 0)
 	 || ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)) {
 		const char warnstr[] = "error sending command";
 
 		if (retval < 0)
 			warn(warnstr);
 		else
 			warnx(warnstr);
 
 		if (arglist & CAM_ARG_VERBOSE) {
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	num_phys = rgresponse->num_phys;
 
 	if (num_phys == 0) {
 		if (quiet == 0)
 			fprintf(stdout, "%s: No Phys reported\n", __func__);
 		retval = 1;
 		goto bailout;
 	}
 
 	devlist.path_id = device->path_id;
 
 	retval = buildbusdevlist(&devlist);
 	if (retval != 0)
 		goto bailout;
 
 	if (quiet == 0) {
 		fprintf(stdout, "%d PHYs:\n", num_phys);
 		fprintf(stdout, "PHY  Attached SAS Address\n");
 	}
 
 	disrequest = malloc(sizeof(*disrequest));
 	if (disrequest == NULL) {
 		warn("%s: unable to allocate %zd bytes", __func__,
 		     sizeof(*disrequest));
 		retval = 1;
 		goto bailout;
 	}
 
 	disresponse = malloc(sizeof(*disresponse));
 	if (disresponse == NULL) {
 		warn("%s: unable to allocate %zd bytes", __func__,
 		     sizeof(*disresponse));
 		retval = 1;
 		goto bailout;
 	}
 
 	for (i = 0; i < num_phys; i++) {
 		struct cam_devitem *item;
 		struct device_match_result *dev_match;
 		char vendor[16], product[48], revision[16];
 		char tmpstr[256];
 		int j;
 
 		CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->smpio);
 
 		ccb->ccb_h.status = CAM_REQ_INPROG;
 		ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 		smp_discover(&ccb->smpio,
 			     retry_count,
 			     /*cbfcnp*/ NULL,
 			     disrequest,
 			     sizeof(*disrequest),
 			     (uint8_t *)disresponse,
 			     sizeof(*disresponse),
 			     long_response,
 			     /*ignore_zone_group*/ 0,
 			     /*phy*/ i,
 			     timeout);
 
 		if (((retval = cam_send_ccb(device, ccb)) < 0)
 		 || (((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP)
 		  && (disresponse->function_result != SMP_FR_PHY_VACANT))) {
 			const char warnstr[] = "error sending command";
 
 			if (retval < 0)
 				warn(warnstr);
 			else
 				warnx(warnstr);
 
 			if (arglist & CAM_ARG_VERBOSE) {
 				cam_error_print(device, ccb, CAM_ESF_ALL,
 						CAM_EPF_ALL, stderr);
 			}
 			retval = 1;
 			goto bailout;
 		}
 
 		if (disresponse->function_result == SMP_FR_PHY_VACANT) {
 			if (quiet == 0)
 				fprintf(stdout, "%3d  <vacant>\n", i);
 			continue;
 		}
 
 		if (disresponse->attached_device == SMP_DIS_AD_TYPE_NONE) {
 			item = NULL;
 		} else {
 			item = findsasdevice(&devlist,
 			    scsi_8btou64(disresponse->attached_sas_address));
 		}
 
 		if ((quiet == 0)
 		 || (item != NULL)) {
 			fprintf(stdout, "%3d  0x%016jx", i,
 				(uintmax_t)scsi_8btou64(
 				disresponse->attached_sas_address));
 			if (item == NULL) {
 				fprintf(stdout, "\n");
 				continue;
 			}
 		} else if (quiet != 0)
 			continue;
 
 		dev_match = &item->dev_match;
 
 		if (dev_match->protocol == PROTO_SCSI) {
 			cam_strvis(vendor, dev_match->inq_data.vendor,
 				   sizeof(dev_match->inq_data.vendor),
 				   sizeof(vendor));
 			cam_strvis(product, dev_match->inq_data.product,
 				   sizeof(dev_match->inq_data.product),
 				   sizeof(product));
 			cam_strvis(revision, dev_match->inq_data.revision,
 				   sizeof(dev_match->inq_data.revision),
 				   sizeof(revision));
 			sprintf(tmpstr, "<%s %s %s>", vendor, product,
 				revision);
 		} else if ((dev_match->protocol == PROTO_ATA)
 			|| (dev_match->protocol == PROTO_SATAPM)) {
 			cam_strvis(product, dev_match->ident_data.model,
 				   sizeof(dev_match->ident_data.model),
 				   sizeof(product));
 			cam_strvis(revision, dev_match->ident_data.revision,
 				   sizeof(dev_match->ident_data.revision),
 				   sizeof(revision));
 			sprintf(tmpstr, "<%s %s>", product, revision);
 		} else {
 			sprintf(tmpstr, "<>");
 		}
 		fprintf(stdout, "   %-33s ", tmpstr);
 
 		/*
 		 * If we have 0 periphs, that's a bug...
 		 */
 		if (item->num_periphs == 0) {
 			fprintf(stdout, "\n");
 			continue;
 		}
 
 		fprintf(stdout, "(");
 		for (j = 0; j < item->num_periphs; j++) {
 			if (j > 0)
 				fprintf(stdout, ",");
 
 			fprintf(stdout, "%s%d",
 				item->periph_matches[j].periph_name,
 				item->periph_matches[j].unit_number);
 
 		}
 		fprintf(stdout, ")\n");
 	}
 bailout:
 	if (ccb != NULL)
 		cam_freeccb(ccb);
 
 	free(rgrequest);
 
 	free(rgresponse);
 
 	free(disrequest);
 
 	free(disresponse);
 
 	freebusdevlist(&devlist);
 
 	return (retval);
 }
 
 static int
 atapm_proc_resp(struct cam_device *device, union ccb *ccb)
 {
     struct ata_res *res;
 
     res = &ccb->ataio.res;
     if (res->status & ATA_STATUS_ERROR) {
         if (arglist & CAM_ARG_VERBOSE) {
             cam_error_print(device, ccb, CAM_ESF_ALL,
                     CAM_EPF_ALL, stderr);
             printf("error = 0x%02x, sector_count = 0x%04x, "
                    "device = 0x%02x, status = 0x%02x\n",
                    res->error, res->sector_count,
                    res->device, res->status);
         }
 
         return (1);
     }
 
     if (arglist & CAM_ARG_VERBOSE) {
         fprintf(stdout, "%s%d: Raw native check power data:\n",
             device->device_name, device->dev_unit_num);
         /* res is 4 byte aligned */
         dump_data((uint16_t*)(uintptr_t)res, sizeof(struct ata_res));
 
         printf("error = 0x%02x, sector_count = 0x%04x, device = 0x%02x, "
                "status = 0x%02x\n", res->error, res->sector_count,
                res->device, res->status);
     }
 
     printf("%s%d: ", device->device_name, device->dev_unit_num);
     switch (res->sector_count) {
     case 0x00:
        printf("Standby mode\n");
        break;
     case 0x40:
        printf("NV Cache Power Mode and the spindle is spun down or spinning down\n");
        break;
     case 0x41:
        printf("NV Cache Power Mode and the spindle is spun up or spinning up\n");
        break;
     case 0x80:
        printf("Idle mode\n");
        break;
     case 0xff:
        printf("Active or Idle mode\n");
        break;
     default:
        printf("Unknown mode 0x%02x\n", res->sector_count);
        break;
     }
 
     return (0);
 }
 
 /*
  * Issue an ATA power management command selected by argv[1]:
  *   "idle"      ATA_IDLE_IMMEDIATE, or ATA_IDLE_CMD when -t is given
  *   "standby"   ATA_STANDBY_IMMEDIATE, or ATA_STANDBY_CMD when -t is given
  *   "powermode" ATA_CHECK_POWER_MODE; the result is decoded and printed
  *               by atapm_proc_resp()
  *   otherwise   ATA_SLEEP
  *
  * The -t argument is converted to the sector count value sent with the
  * command; it is ignored for "powermode" and for the sleep case.
  *
  * Returns the result of ata_do_cmd(), or for "powermode" the result of
  * atapm_proc_resp() when the command itself succeeded; 1 if no CCB could
  * be allocated.
  */
 static int
 atapm(struct cam_device *device, int argc, char **argv,
 		 char *combinedopt, int retry_count, int timeout)
 {
 	union ccb *ccb;
 	int retval = 0;
 	int t = -1;
 	int c;
 	u_int8_t ata_flags = 0;
 	u_char cmd, sc;
 
 	ccb = cam_getccb(device);
 
 	if (ccb == NULL) {
 		warnx("%s: error allocating ccb", __func__);
 		return (1);
 	}
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch (c) {
 		case 't':
 			t = atoi(optarg);
 			break;
 		default:
 			break;
 		}
 	}
 	if (strcmp(argv[1], "idle") == 0) {
 		if (t == -1)
 			cmd = ATA_IDLE_IMMEDIATE;
 		else
 			cmd = ATA_IDLE_CMD;
 	} else if (strcmp(argv[1], "standby") == 0) {
 		if (t == -1)
 			cmd = ATA_STANDBY_IMMEDIATE;
 		else
 			cmd = ATA_STANDBY_CMD;
 	} else if (strcmp(argv[1], "powermode") == 0) {
 		cmd = ATA_CHECK_POWER_MODE;
 		ata_flags = AP_FLAG_CHK_COND;
 		t = -1;
 	} else {
 		cmd = ATA_SLEEP;
 		t = -1;
 	}
 
 	/*
 	 * Encode the timer value: up to 240 * 5 it is rounded up to a
 	 * multiple of 5 and divided by 5; larger values map onto the codes
 	 * 241 and up in steps of 30 * 60, with 253 as the catch-all.
 	 */
 	if (t < 0)
 		sc = 0;
 	else if (t <= (240 * 5))
 		sc = (t + 4) / 5;
 	else if (t <= (252 * 5))
 		/* special encoding for 21 minutes */
 		sc = 252;
 	else if (t <= (11 * 30 * 60))
 		sc = (t - 1) / (30 * 60) + 241;
 	else
 		sc = 253;
 
 	retval = ata_do_cmd(device,
 	    ccb,
 	    /*retries*/retry_count,
 	    /*flags*/CAM_DIR_NONE,
 	    /*protocol*/AP_PROTO_NON_DATA,
 	    /*ata_flags*/ata_flags,
 	    /*tag_action*/MSG_SIMPLE_Q_TAG,
 	    /*command*/cmd,
 	    /*features*/0,
 	    /*lba*/0,
 	    /*sector_count*/sc,
 	    /*data_ptr*/NULL,
 	    /*dxfer_len*/0,
 	    /*timeout*/timeout ? timeout : 30 * 1000,
 	    /*quiet*/1);
 
 	/*
 	 * The response has to be decoded while the CCB is still valid;
 	 * free it only after atapm_proc_resp() is done with it.
 	 */
 	if (retval == 0 && cmd == ATA_CHECK_POWER_MODE)
 		retval = atapm_proc_resp(device, ccb);
 
 	cam_freeccb(ccb);
 
 	return (retval);
 }
 
 /*
  * Issue an ATA SET FEATURES command for the "apm" or "aam" subcommand
  * named by argv[1] (anything other than "apm" is handled as aam).
  *
  * Without -l the feature code is 0x85 (apm) or 0xC2 (aam) and the sector
  * count is 0.  With -l level the feature code is 0x05 (apm) or 0x42 (aam)
  * and the level is passed as the sector count.
  *
  * Returns the result of ata_do_28bit_cmd(), or 1 if no CCB could be
  * allocated.
  */
 static int
 ataaxm(struct cam_device *device, int argc, char **argv,
 		 char *combinedopt, int retry_count, int timeout)
 {
 	union ccb *ccb;
 	u_char feature, count;
 	int level = -1;
 	int is_apm;
 	int status;
 	int opt;
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("%s: error allocating ccb", __func__);
 		return (1);
 	}
 
 	while ((opt = getopt(argc, argv, combinedopt)) != -1) {
 		if (opt == 'l')
 			level = atoi(optarg);
 	}
 
 	is_apm = (strcmp(argv[1], "apm") == 0);
 
 	if (level == -1) {
 		/* No level given: use the codes that take no argument. */
 		feature = is_apm ? 0x85 : 0xC2;
 		count = 0;
 	} else {
 		feature = is_apm ? 0x05 : 0x42;
 		count = level;
 	}
 
 	status = ata_do_28bit_cmd(device,
 	    ccb,
 	    /*retries*/retry_count,
 	    /*flags*/CAM_DIR_NONE,
 	    /*protocol*/AP_PROTO_NON_DATA,
 	    /*tag_action*/MSG_SIMPLE_Q_TAG,
 	    /*command*/ATA_SETFEATURES,
 	    /*features*/feature,
 	    /*lba*/0,
 	    /*sector_count*/count,
 	    /*data_ptr*/NULL,
 	    /*dxfer_len*/0,
 	    /*timeout*/timeout ? timeout : 30 * 1000,
 	    /*quiet*/1);
 
 	cam_freeccb(ccb);
 	return (status);
 }
 
 /*
  * Issue REPORT SUPPORTED OPERATION CODES and hand the raw response back
  * to the caller.
  *
  * With opcode_set the request is for the single opcode given (optionally
  * qualified by service_action when sa_set is non-zero); otherwise the
  * full list is requested.  timeout_desc asks for timeout descriptors.
  *
  * On success 0 is returned, *data_ptr points to a malloc(3)ed buffer the
  * caller must free, and *fill_len is the number of valid bytes in it.
  * On failure 1 is returned, *data_ptr is NULL and *fill_len is 0.
  *
  * If the response header indicates more data than the buffer could hold,
  * the command is reissued with a buffer of the advertised size.
  */
 int
 scsigetopcodes(struct cam_device *device, int opcode_set, int opcode,
 	       int show_sa_errors, int sa_set, int service_action,
 	       int timeout_desc, int task_attr, int retry_count, int timeout,
 	       int verbosemode, uint32_t *fill_len, uint8_t **data_ptr)
 {
 	union ccb *ccb = NULL;
 	uint8_t *buf = NULL;
 	uint32_t alloc_len = 0, num_opcodes;
 	uint32_t valid_len = 0;
 	uint32_t avail_len = 0;
 	struct scsi_report_supported_opcodes_all *all_hdr;
 	struct scsi_report_supported_opcodes_one *one;
 	int options = 0;
 	int retval = 0;
 
 	/*
 	 * Make it clear that we haven't yet allocated or filled anything.
 	 */
 	*fill_len = 0;
 	*data_ptr = NULL;
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("couldn't allocate CCB");
 		retval = 1;
 		goto bailout;
 	}
 
 	/* cam_getccb cleans up the header, caller has to zero the payload */
 	CCB_CLEAR_ALL_EXCEPT_HDR(&ccb->csio);
 
 	if (opcode_set != 0) {
 		options |= RSO_OPTIONS_OC;
 		num_opcodes = 1;
 		alloc_len = sizeof(*one) + CAM_MAX_CDBLEN;
 	} else {
 		num_opcodes = 256;
 		alloc_len = sizeof(*all_hdr) + (num_opcodes *
 		    sizeof(struct scsi_report_supported_opcodes_descr));
 	}
 
 	if (timeout_desc != 0) {
 		options |= RSO_RCTD;
 		alloc_len += num_opcodes *
 		    sizeof(struct scsi_report_supported_opcodes_timeout);
 	}
 
 	if (sa_set != 0) {
 		options |= RSO_OPTIONS_OC_SA;
 		if (show_sa_errors != 0)
 			options &= ~RSO_OPTIONS_OC;
 	}
 
 retry_alloc:
 	if (buf != NULL) {
 		free(buf);
 		buf = NULL;
 	}
 
 	buf = malloc(alloc_len);
 	if (buf == NULL) {
 		warn("Unable to allocate %u bytes", alloc_len);
 		retval = 1;
 		goto bailout;
 	}
 	bzero(buf, alloc_len);
 
 	scsi_report_supported_opcodes(&ccb->csio,
 				      /*retries*/ retry_count,
 				      /*cbfcnp*/ NULL,
 				      /*tag_action*/ task_attr,
 				      /*options*/ options,
 				      /*req_opcode*/ opcode,
 				      /*req_service_action*/ service_action,
 				      /*data_ptr*/ buf,
 				      /*dxfer_len*/ alloc_len,
 				      /*sense_len*/ SSD_FULL_SIZE,
 				      /*timeout*/ timeout ? timeout : 10000);
 
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 
 	if (retry_count != 0)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	if (cam_send_ccb(device, ccb) < 0) {
 		perror("error sending REPORT SUPPORTED OPERATION CODES");
 		retval = 1;
 		goto bailout;
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		if (verbosemode != 0)
 			cam_error_print(device, ccb, CAM_ESF_ALL,
 					CAM_EPF_ALL, stderr);
 		retval = 1;
 		goto bailout;
 	}
 
 	valid_len = ccb->csio.dxfer_len - ccb->csio.resid;
 
 	/*
 	 * Work out how much data the target says is available, based on
 	 * whichever response format was requested.
 	 */
 	if (((options & RSO_OPTIONS_MASK) == RSO_OPTIONS_ALL)
 	 && (valid_len >= sizeof(*all_hdr))) {
 		all_hdr = (struct scsi_report_supported_opcodes_all *)buf;
 		avail_len = scsi_4btoul(all_hdr->length) + sizeof(*all_hdr);
 	} else if (((options & RSO_OPTIONS_MASK) != RSO_OPTIONS_ALL)
 		&& (valid_len >= sizeof(*one))) {
 		uint32_t cdb_length;
 
 		one = (struct scsi_report_supported_opcodes_one *)buf;
 		cdb_length = scsi_2btoul(one->cdb_length);
 		avail_len = sizeof(*one) + cdb_length;
 		if (one->support & RSO_ONE_CTDP) {
 			struct scsi_report_supported_opcodes_timeout *td;
 
 			td = (struct scsi_report_supported_opcodes_timeout *)
 			    &buf[avail_len];
 			if (valid_len >= (avail_len + sizeof(td->length))) {
 				avail_len += scsi_2btoul(td->length) +
 				    sizeof(td->length);
 			} else {
 				avail_len += sizeof(*td);
 			}
 		}
 	}
 
 	/*
 	 * avail_len could be zero if we didn't get enough data back from
 	 * the target to determine how much is available.
 	 *
 	 * Only retry when the buffer really was too small for what the
 	 * target advertises.  If the buffer was large enough and the target
 	 * still returned less than it advertised, asking again would give
 	 * the same answer and never terminate, so return what we have.
 	 */
 	if ((avail_len != 0)
 	 && (avail_len > valid_len)
 	 && (avail_len > alloc_len)) {
 		alloc_len = avail_len;
 		goto retry_alloc;
 	}
 
 	*fill_len = valid_len;
 	*data_ptr = buf;
 bailout:
 	/* On success the buffer now belongs to the caller. */
 	if (retval != 0)
 		free(buf);
 
 	cam_freeccb(ccb);
 
 	return (retval);
 }
 
 /*
  * Print the single-opcode REPORT SUPPORTED OPERATION CODES response held
  * in buf (valid_len bytes) for req_opcode, and for service action req_sa
  * when sa_set is non-zero.
  *
  * The support status is always printed when enough data is present; the
  * CDB usage bitmap and timeout descriptor are printed only when the valid
  * data covers them.
  *
  * Returns 1 when the response is too short to evaluate, when the command
  * is reported as not supported, when the valid data does not cover the
  * reported CDB length, or when the reported CDB length is 1 or less;
  * returns 0 otherwise.
  */
 static int
 scsiprintoneopcode(struct cam_device *device, int req_opcode, int sa_set,
 		   int req_sa, uint8_t *buf, uint32_t valid_len)
 {
 	struct scsi_report_supported_opcodes_one *one;
 	struct scsi_report_supported_opcodes_timeout *td;
 	uint32_t cdb_len = 0, td_len = 0;
 	const char *op_desc = NULL;
 	unsigned int i;
 	int retval = 0;
 
 	one = (struct scsi_report_supported_opcodes_one *)buf;
 
 	/*
 	 * If we don't have the full single opcode descriptor, no point in
 	 * continuing.
 	 */
 	if (valid_len < __offsetof(struct scsi_report_supported_opcodes_one,
 	    cdb_length)) {
 		warnx("Only %u bytes returned, not enough to verify support",
 		      valid_len);
 		retval = 1;
 		goto bailout;
 	}
 
 	op_desc = scsi_op_desc(req_opcode, &device->inq_data);
 
 	printf("%s (0x%02x)", op_desc != NULL ? op_desc : "UNKNOWN",
 	       req_opcode);
 	if (sa_set != 0)
 		printf(", SA 0x%x", req_sa);
 	printf(": ");
 
 	switch (one->support & RSO_ONE_SUP_MASK) {
 	case RSO_ONE_SUP_UNAVAIL:
 		printf("No command support information currently available\n");
 		break;
 	case RSO_ONE_SUP_NOT_SUP:
 		printf("Command not supported\n");
 		retval = 1;
 		goto bailout;
 		break; /*NOTREACHED*/
 	case RSO_ONE_SUP_AVAIL:
 		printf("Command is supported, complies with a SCSI standard\n");
 		break;
 	case RSO_ONE_SUP_VENDOR:
 		printf("Command is supported, vendor-specific "
 		       "implementation\n");
 		break;
 	default:
 		/* A literal "0x" plus %x; "%#x" would double the prefix. */
 		printf("Unknown command support flags 0x%x\n",
 		       one->support & RSO_ONE_SUP_MASK);
 		break;
 	}
 
 	/*
 	 * If we don't have the CDB length, it isn't exactly an error, the
 	 * command probably isn't supported.
 	 */
 	if (valid_len < __offsetof(struct scsi_report_supported_opcodes_one,
 	    cdb_usage))
 		goto bailout;
 
 	cdb_len = scsi_2btoul(one->cdb_length);
 
 	/*
 	 * If our valid data doesn't include the full reported length,
 	 * return.  The caller should have detected this and adjusted his
 	 * allocation length to get all of the available data.
 	 */
 	if (valid_len < sizeof(*one) + cdb_len) {
 		retval = 1;
 		goto bailout;
 	}
 
 	/*
 	 * If all we have is the opcode, there is no point in printing out
 	 * the usage bitmap.
 	 */
 	if (cdb_len <= 1) {
 		retval = 1;
 		goto bailout;
 	}
 
 	printf("CDB usage bitmap:");
 	for (i = 0; i < cdb_len; i++) {
 		printf(" %02x", one->cdb_usage[i]);
 	}
 	printf("\n");
 
 	/*
 	 * If we don't have a timeout descriptor, we're done.
 	 */
 	if ((one->support & RSO_ONE_CTDP) == 0)
 		goto bailout;
 
 	/*
 	 * If we don't have enough valid length to include the timeout
 	 * descriptor length, we're done.
 	 */
 	if (valid_len < (sizeof(*one) + cdb_len + sizeof(td->length)))
 		goto bailout;
 
 	/* The timeout descriptor follows the CDB usage data. */
 	td = (struct scsi_report_supported_opcodes_timeout *)
 	    &buf[sizeof(*one) + cdb_len];
 	td_len = scsi_2btoul(td->length);
 	td_len += sizeof(td->length);
 
 	/*
 	 * If we don't have the full timeout descriptor, we're done.
 	 */
 	if (td_len < sizeof(*td))
 		goto bailout;
 
 	/*
 	 * If we don't have enough valid length to contain the full timeout
 	 * descriptor, we're done.
 	 */
 	if (valid_len < (sizeof(*one) + cdb_len + td_len))
 		goto bailout;
 
 	printf("Timeout information:\n");
 	printf("Command-specific:    0x%02x\n", td->cmd_specific);
 	printf("Nominal timeout:     %u seconds\n",
 	       scsi_4btoul(td->nominal_time));
 	printf("Recommended timeout: %u seconds\n",
 	       scsi_4btoul(td->recommended_time));
 
 bailout:
 	return (retval);
 }
 
 /*
  * Print the "all commands" form of REPORT SUPPORTED OPERATION CODES data
  * as a table with one row per command descriptor.
  *
  * device     - device whose inquiry data is handed to scsi_op_desc() to
  *              look up a description for each opcode
  * td_req     - non-zero if the timeout columns should appear in the header
  * buf        - returned data, beginning with the 4 byte length field
  * valid_len  - number of bytes in buf that were actually returned
  *
  * Returns 0 on success, or 1 if valid_len is too small to hold the header.
  */
 static int
 scsiprintopcodes(struct cam_device *device, int td_req, uint8_t *buf,
 		 uint32_t valid_len)
 {
 	struct scsi_report_supported_opcodes_all *hdr;
 	struct scsi_report_supported_opcodes_descr *desc;
 	uint32_t avail_len = 0, used_len = 0;
 	uint8_t *cur_ptr;
 	int retval = 0;
 
 	if (valid_len < sizeof(*hdr)) {
 		warnx("%s: not enough returned data (%u bytes) opcode list",
 		      __func__, valid_len);
 		retval = 1;
 		goto bailout;
 	}
 	hdr = (struct scsi_report_supported_opcodes_all *)buf;
 
 	/*
 	 * Take the lesser of the amount of data the drive claims is
 	 * available, and the amount of data the HBA says was returned.
 	 * The comparison is made before the size of the length field is
 	 * added in, so that a bogus length close to the 32-bit maximum
 	 * cannot wrap around to a tiny value.
 	 */
 	avail_len = scsi_4btoul(hdr->length);
 	if (avail_len > valid_len - sizeof(hdr->length))
 		avail_len = valid_len;
 	else
 		avail_len += sizeof(hdr->length);
 
 	used_len = sizeof(hdr->length);
 
 	printf("%-6s %4s %8s ",
 	       "Opcode", "SA", "CDB len" );
 
 	if (td_req != 0)
 		printf("%5s %6s %6s ", "CS", "Nom", "Rec");
 	printf(" Description\n");
 
 	/*
 	 * Invariant: used_len <= avail_len, so the subtraction cannot wrap.
 	 * Use >= so that a final descriptor that exactly fills the
 	 * remaining data is still printed.
 	 */
 	while ((avail_len - used_len) >= sizeof(*desc)) {
 		struct scsi_report_supported_opcodes_timeout *td;
 		uint32_t td_len;
 		const char *op_desc = NULL;
 
 		cur_ptr = &buf[used_len];
 		desc = (struct scsi_report_supported_opcodes_descr *)cur_ptr;
 
 		op_desc = scsi_op_desc(desc->opcode, &device->inq_data);
 		if (op_desc == NULL)
 			op_desc = "UNKNOWN";
 
 		printf("0x%02x   %#4x %8u ", desc->opcode,
 		       scsi_2btoul(desc->service_action),
 		       scsi_2btoul(desc->cdb_length));
 
 		used_len += sizeof(*desc);
 
 		/* No timeout descriptor follows this command descriptor. */
 		if ((desc->flags & RSO_CTDP) == 0) {
 			printf(" %s\n", op_desc);
 			continue;
 		}
 
 		/*
 		 * If we don't have enough space to fit a timeout
 		 * descriptor, then we're done.
 		 */
 		if (avail_len - used_len < sizeof(*td)) {
 			used_len = avail_len;
 			printf(" %s\n", op_desc);
 			continue;
 		}
 		cur_ptr = &buf[used_len];
 		td = (struct scsi_report_supported_opcodes_timeout *)cur_ptr;
 		td_len = scsi_2btoul(td->length);
 		td_len += sizeof(td->length);
 
 		/*
 		 * Skip over the timeout descriptor, but never past the end
 		 * of the available data; otherwise the unsigned subtraction
 		 * in the loop condition would wrap and we would walk off
 		 * the end of the buffer.
 		 */
 		if (td_len > avail_len - used_len)
 			used_len = avail_len;
 		else
 			used_len += td_len;
 
 		/*
 		 * If the given timeout descriptor length is less than what
 		 * we understand, skip it.
 		 */
 		if (td_len < sizeof(*td)) {
 			printf(" %s\n", op_desc);
 			continue;
 		}
 
 		printf(" 0x%02x %6u %6u  %s\n", td->cmd_specific,
 		       scsi_4btoul(td->nominal_time),
 		       scsi_4btoul(td->recommended_time), op_desc);
 	}
 bailout:
 	return (retval);
 }
 
 /*
  * Handler for the "opcodes" subcommand.
  *
  * Parses the subcommand options (-N, -o opcode, -s service_action, -T),
  * fetches the data with scsigetopcodes(), and prints it either as a single
  * command report (when an opcode or service action was given) or as a
  * table of all commands.
  *
  * Returns 0 on success and non-zero on a usage error or if fetching or
  * printing the data fails.  The buffer handed back by scsigetopcodes() is
  * freed here.
  */
 static int
 scsiopcodes(struct cam_device *device, int argc, char **argv,
 	    char *combinedopt, int task_attr, int retry_count, int timeout,
 	    int verbosemode)
 {
 	int c;
 	uint32_t opcode = 0, service_action = 0;
 	int td_set = 0, opcode_set = 0, sa_set = 0;
 	int show_sa_errors = 1;
 	uint32_t valid_len = 0;
 	uint8_t *buf = NULL;
 	unsigned long val;
 	char *endptr;
 	int retval = 0;
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch (c) {
 		case 'N':
 			show_sa_errors = 0;
 			break;
 		case 'o':
 			/*
 			 * Range check the full unsigned long so a large
 			 * value cannot be truncated into the valid range.
 			 */
 			val = strtoul(optarg, &endptr, 0);
 			if (endptr == optarg || *endptr != '\0') {
 				warnx("Invalid opcode \"%s\", must be a number",
 				      optarg);
 				retval = 1;
 				goto bailout;
 			}
 			if (val > 0xff) {
 				warnx("Invalid opcode %#lx, must be between "
 				      "0 and 0xff inclusive", val);
 				retval = 1;
 				goto bailout;
 			}
 			opcode = (uint32_t)val;
 			opcode_set = 1;
 			break;
 		case 's':
 			val = strtoul(optarg, &endptr, 0);
 			if (endptr == optarg || *endptr != '\0') {
 				warnx("Invalid service action \"%s\", must "
 				      "be a number", optarg);
 				retval = 1;
 				goto bailout;
 			}
 			if (val > 0xffff) {
 				warnx("Invalid service action %#lx, must "
 				      "be between 0 and 0xffff inclusive",
 				      val);
 				retval = 1;
 				/* Do not send a command with a bad SA. */
 				goto bailout;
 			}
 			service_action = (uint32_t)val;
 			sa_set = 1;
 			break;
 		case 'T':
 			td_set = 1;
 			break;
 		default:
 			/* Options not handled by this subcommand are ignored. */
 			break;
 		}
 	}
 
 	if ((sa_set != 0)
 	 && (opcode_set == 0)) {
 		warnx("You must specify an opcode with -o if a service "
 		      "action is given");
 		retval = 1;
 		goto bailout;
 	}
 	retval = scsigetopcodes(device, opcode_set, opcode, show_sa_errors,
 				sa_set, service_action, td_set, task_attr,
 				retry_count, timeout, verbosemode, &valid_len,
 				&buf);
 	if (retval != 0)
 		goto bailout;
 
 	if ((opcode_set != 0)
 	 || (sa_set != 0)) {
 		retval = scsiprintoneopcode(device, opcode, sa_set,
 					    service_action, buf, valid_len);
 	} else {
 		retval = scsiprintopcodes(device, td_set, buf, valid_len);
 	}
 
 bailout:
 	/* buf is NULL if we never got as far as fetching the data. */
 	free(buf);
 
 	return (retval);
 }
 
 
 /*
  * Send an XPT_REPROBE_LUN CCB to the given device.
  *
  * Returns 0 if the CCB completed with CAM_REQ_CMP, and 1 if the CCB could
  * not be allocated, could not be sent, or completed with any other status.
  */
 static int
 reprobe(struct cam_device *device)
 {
 	union ccb *req;
 	int error;
 
 	req = cam_getccb(device);
 	if (req == NULL) {
 		warnx("%s: error allocating ccb", __func__);
 		return (1);
 	}
 
 	CCB_CLEAR_ALL_EXCEPT_HDR(req);
 	req->ccb_h.func_code = XPT_REPROBE_LUN;
 
 	error = 0;
 	if (cam_send_ccb(device, req) < 0) {
 		/* The send itself failed. */
 		warn("error sending XPT_REPROBE_LUN CCB");
 		error = 1;
 	} else if ((req->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		/* The CCB was sent, but did not complete successfully. */
 		cam_error_print(device, req, CAM_ESF_ALL, CAM_EPF_ALL, stderr);
 		error = 1;
 	}
 
 	cam_freeccb(req);
 
 	return (error);
 }
 
 /*
  * Print the camcontrol usage message.
  *
  * The command synopsis is always printed: to stdout when printlong is
  * non-zero, otherwise to stderr.  When printlong is non-zero, the
  * description of every subcommand and its arguments is then printed to
  * stdout as well.
  */
 void
 usage(int printlong)
 {
 
 	fprintf(printlong ? stdout : stderr,
 "usage:  camcontrol <command>  [device id][generic args][command args]\n"
 "        camcontrol devlist    [-b] [-v]\n"
 "        camcontrol periphlist [dev_id][-n dev_name] [-u unit]\n"
 "        camcontrol tur        [dev_id][generic args]\n"
 "        camcontrol inquiry    [dev_id][generic args] [-D] [-S] [-R]\n"
 "        camcontrol identify   [dev_id][generic args] [-v]\n"
 "        camcontrol reportluns [dev_id][generic args] [-c] [-l] [-r report]\n"
 "        camcontrol readcap    [dev_id][generic args] [-b] [-h] [-H] [-N]\n"
 "                              [-q] [-s] [-l]\n"
 "        camcontrol start      [dev_id][generic args]\n"
 "        camcontrol stop       [dev_id][generic args]\n"
 "        camcontrol load       [dev_id][generic args]\n"
 "        camcontrol eject      [dev_id][generic args]\n"
 "        camcontrol reprobe    [dev_id][generic args]\n"
 "        camcontrol rescan     <all | bus[:target:lun] | dev_id>\n"
 "        camcontrol reset      <all | bus[:target:lun] | dev_id>\n"
 "        camcontrol defects    [dev_id][generic args] <-f format> [-P][-G]\n"
 "                              [-q][-s][-S offset][-X]\n"
 "        camcontrol modepage   [dev_id][generic args] <-m page | -l>\n"
 "                              [-P pagectl][-e | -b][-d]\n"
 "        camcontrol cmd        [dev_id][generic args]\n"
 "                              <-a cmd [args] | -c cmd [args]>\n"
 "                              [-d] [-f] [-i len fmt|-o len fmt [args]] [-r fmt]\n"
 "        camcontrol smpcmd     [dev_id][generic args]\n"
 "                              <-r len fmt [args]> <-R len fmt [args]>\n"
 "        camcontrol smprg      [dev_id][generic args][-l]\n"
 "        camcontrol smppc      [dev_id][generic args] <-p phy> [-l]\n"
 "                              [-o operation][-d name][-m rate][-M rate]\n"
 "                              [-T pp_timeout][-a enable|disable]\n"
 "                              [-A enable|disable][-s enable|disable]\n"
 "                              [-S enable|disable]\n"
 "        camcontrol smpphylist [dev_id][generic args][-l][-q]\n"
 "        camcontrol smpmaninfo [dev_id][generic args][-l]\n"
 "        camcontrol debug      [-I][-P][-T][-S][-X][-c]\n"
 "                              <all|dev_id|bus[:target[:lun]]|off>\n"
 "        camcontrol tags       [dev_id][generic args] [-N tags] [-q] [-v]\n"
 "        camcontrol negotiate  [dev_id][generic args] [-a][-c]\n"
 "                              [-D <enable|disable>][-M mode][-O offset]\n"
 "                              [-q][-R syncrate][-v][-T <enable|disable>]\n"
 "                              [-U][-W bus_width]\n"
 "        camcontrol format     [dev_id][generic args][-q][-r][-w][-y]\n"
 "        camcontrol sanitize   [dev_id][generic args]\n"
 "                              [-a overwrite|block|crypto|exitfailure]\n"
 "                              [-c passes][-I][-P pattern][-q][-U][-r][-w]\n"
 "                              [-y]\n"
 "        camcontrol idle       [dev_id][generic args][-t time]\n"
 "        camcontrol standby    [dev_id][generic args][-t time]\n"
 "        camcontrol sleep      [dev_id][generic args]\n"
 "        camcontrol powermode  [dev_id][generic args]\n"
 "        camcontrol apm        [dev_id][generic args][-l level]\n"
 "        camcontrol aam        [dev_id][generic args][-l level]\n"
 "        camcontrol fwdownload [dev_id][generic args] <-f fw_image> [-q]\n"
 "                              [-s][-y]\n"
 "        camcontrol security   [dev_id][generic args]\n"
 "                              <-d pwd | -e pwd | -f | -h pwd | -k pwd>\n"
 "                              [-l <high|maximum>] [-q] [-s pwd] [-T timeout]\n"
 "                              [-U <user|master>] [-y]\n"
 "        camcontrol hpa        [dev_id][generic args] [-f] [-l] [-P] [-p pwd]\n"
 "                              [-q] [-s max_sectors] [-U pwd] [-y]\n"
 "        camcontrol ama        [dev_id][generic args] [-f] [-q] [-s max_sectors]\n"
 "        camcontrol persist    [dev_id][generic args] <-i action|-o action>\n"
 "                              [-a][-I tid][-k key][-K sa_key][-p][-R rtp]\n"
 "                              [-s scope][-S][-T type][-U]\n"
 "        camcontrol attrib     [dev_id][generic args] <-r action|-w attr>\n"
 "                              [-a attr_num][-c][-e elem][-F form1,form2]\n"
 "                              [-p part][-s start][-T type][-V vol]\n"
 "        camcontrol opcodes    [dev_id][generic args][-o opcode][-s SA]\n"
 "                              [-N][-T]\n"
 "        camcontrol zone       [dev_id][generic args]<-c cmd> [-a] [-l LBA]\n"
 "                              [-o rep_opts] [-P print_opts]\n"
 "        camcontrol epc        [dev_id][generic args]<-c cmd> [-d] [-D] [-e]\n"
 "                              [-H] [-p power_cond] [-P] [-r rst_src] [-s]\n"
 "                              [-S power_src] [-T timer]\n"
 "        camcontrol timestamp  [dev_id][generic args] <-r [-f format|-m|-U]>|\n"
 "                              <-s <-f format -T time | -U >>\n"
 "        camcontrol devtype    [dev_id]\n"
 "                              \n"
 "        camcontrol help\n");
 	/* The short form stops after the synopsis. */
 	if (!printlong)
 		return;
 	fprintf(stdout,
 "Specify one of the following options:\n"
 "devlist     list all CAM devices\n"
 "periphlist  list all CAM peripheral drivers attached to a device\n"
 "tur         send a test unit ready to the named device\n"
 "inquiry     send a SCSI inquiry command to the named device\n"
 "identify    send an ATA identify command to the named device\n"
 "reportluns  send a SCSI report luns command to the device\n"
 "readcap     send a SCSI read capacity command to the device\n"
 "start       send a Start Unit command to the device\n"
 "stop        send a Stop Unit command to the device\n"
 "load        send a Start Unit command to the device with the load bit set\n"
 "eject       send a Stop Unit command to the device with the eject bit set\n"
 "reprobe     update capacity information of the given device\n"
 "rescan      rescan all buses, the given bus, bus:target:lun or device\n"
 "reset       reset all buses, the given bus, bus:target:lun or device\n"
 "defects     read the defect list of the specified device\n"
 "modepage    display or edit (-e) the given mode page\n"
 "cmd         send the given SCSI command, may need -i or -o as well\n"
 "smpcmd      send the given SMP command, requires -o and -i\n"
 "smprg       send the SMP Report General command\n"
 "smppc       send the SMP PHY Control command, requires -p\n"
 "smpphylist  display phys attached to a SAS expander\n"
 "smpmaninfo  send the SMP Report Manufacturer Info command\n"
 "debug       turn debugging on/off for a bus, target, or lun, or all devices\n"
 "tags        report or set the number of transaction slots for a device\n"
 "negotiate   report or set device negotiation parameters\n"
 "format      send the SCSI FORMAT UNIT command to the named device\n"
 "sanitize    send the SCSI SANITIZE command to the named device\n"
 "idle        send the ATA IDLE command to the named device\n"
 "standby     send the ATA STANDBY command to the named device\n"
 "sleep       send the ATA SLEEP command to the named device\n"
 "powermode   send the ATA CHECK POWER MODE command to the named device\n"
 "fwdownload  program firmware of the named device with the given image\n"
 "security    report or send ATA security commands to the named device\n"
 "persist     send the SCSI PERSISTENT RESERVE IN or OUT commands\n"
 "attrib      send the SCSI READ or WRITE ATTRIBUTE commands\n"
 "opcodes     send the SCSI REPORT SUPPORTED OPCODES command\n"
 "zone        manage Zoned Block (Shingled) devices\n"
 "epc         send ATA Extended Power Conditions commands\n"
 "timestamp   report or set the device's timestamp\n"
 "devtype     report the type of device\n"
 "help        this message\n"
 "Device Identifiers:\n"
 "bus:target        specify the bus and target, lun defaults to 0\n"
 "bus:target:lun    specify the bus, target and lun\n"
 "deviceUNIT        specify the device name, like \"da4\" or \"cd2\"\n"
 "Generic arguments:\n"
 "-v                be verbose, print out sense information\n"
 "-t timeout        command timeout in seconds, overrides default timeout\n"
 "-n dev_name       specify device name, e.g. \"da\", \"cd\"\n"
 "-u unit           specify unit number, e.g. \"0\", \"5\"\n"
 "-E                have the kernel attempt to perform SCSI error recovery\n"
 "-C count          specify the SCSI command retry count (needs -E to work)\n"
 "-Q task_attr      specify ordered, simple or head tag type for SCSI cmds\n"
 "modepage arguments:\n"
 "-l                list all available mode pages\n"
 "-m page           specify the mode page to view or edit\n"
 "-e                edit the specified mode page\n"
 "-b                force view to binary mode\n"
 "-d                disable block descriptors for mode sense\n"
 "-P pgctl          page control field 0-3\n"
 "defects arguments:\n"
 "-f format         specify defect list format (block, bfi or phys)\n"
 "-G                get the grown defect list\n"
 "-P                get the permanent defect list\n"
 "inquiry arguments:\n"
 "-D                get the standard inquiry data\n"
 "-S                get the serial number\n"
 "-R                get the transfer rate, etc.\n"
 "reportluns arguments:\n"
 "-c                only report a count of available LUNs\n"
 "-l                only print out luns, and not a count\n"
 "-r <reporttype>   specify \"default\", \"wellknown\" or \"all\"\n"
 "readcap arguments:\n"
 "-b                only report the blocksize\n"
 "-h                human readable device size, base 2\n"
 "-H                human readable device size, base 10\n"
 "-N                print the number of blocks instead of last block\n"
 "-q                quiet, print numbers only\n"
 "-s                only report the last block/device size\n"
 "cmd arguments:\n"
 "-c cdb [args]     specify the SCSI CDB\n"
 "-i len fmt        specify input data and input data format\n"
 "-o len fmt [args] specify output data and output data fmt\n"
 "smpcmd arguments:\n"
 "-r len fmt [args] specify the SMP command to be sent\n"
 "-R len fmt [args] specify SMP response format\n"
 "smprg arguments:\n"
 "-l                specify the long response format\n"
 "smppc arguments:\n"
 "-p phy            specify the PHY to operate on\n"
 "-l                specify the long request/response format\n"
 "-o operation      specify the phy control operation\n"
 "-d name           set the attached device name\n"
 "-m rate           set the minimum physical link rate\n"
 "-M rate           set the maximum physical link rate\n"
 "-T pp_timeout     set the partial pathway timeout value\n"
 "-a enable|disable enable or disable SATA slumber\n"
 "-A enable|disable enable or disable SATA partial phy power\n"
 "-s enable|disable enable or disable SAS slumber\n"
 "-S enable|disable enable or disable SAS partial phy power\n"
 "smpphylist arguments:\n"
 "-l                specify the long response format\n"
 "-q                only print phys with attached devices\n"
 "smpmaninfo arguments:\n"
 "-l                specify the long response format\n"
 "debug arguments:\n"
 "-I                CAM_DEBUG_INFO -- scsi commands, errors, data\n"
 "-T                CAM_DEBUG_TRACE -- routine flow tracking\n"
 "-S                CAM_DEBUG_SUBTRACE -- internal routine command flow\n"
 "-c                CAM_DEBUG_CDB -- print out SCSI CDBs only\n"
 "tags arguments:\n"
 "-N tags           specify the number of tags to use for this device\n"
 "-q                be quiet, don't report the number of tags\n"
 "-v                report a number of tag-related parameters\n"
 "negotiate arguments:\n"
 "-a                send a test unit ready after negotiation\n"
 "-c                report/set current negotiation settings\n"
 "-D <arg>          \"enable\" or \"disable\" disconnection\n"
 "-M mode           set ATA mode\n"
 "-O offset         set command delay offset\n"
 "-q                be quiet, don't report anything\n"
 "-R syncrate       synchronization rate in MHz\n"
 "-T <arg>          \"enable\" or \"disable\" tagged queueing\n"
 "-U                report/set user negotiation settings\n"
 "-W bus_width      set the bus width in bits (8, 16 or 32)\n"
 "-v                also print a Path Inquiry CCB for the controller\n"
 "format arguments:\n"
 "-q                be quiet, don't print status messages\n"
 "-r                run in report only mode\n"
 "-w                don't send immediate format command\n"
 "-y                don't ask any questions\n"
 "sanitize arguments:\n"
 "-a operation      operation mode: overwrite, block, crypto or exitfailure\n"
 "-c passes         overwrite passes to perform (1 to 31)\n"
 "-I                invert overwrite pattern after each pass\n"
 "-P pattern        path to overwrite pattern file\n"
 "-q                be quiet, don't print status messages\n"
 "-r                run in report only mode\n"
 "-U                run operation in unrestricted completion exit mode\n"
 "-w                don't send immediate sanitize command\n"
 "-y                don't ask any questions\n"
 "idle/standby arguments:\n"
 "-t <arg>          number of seconds before respective state.\n"
 "fwdownload arguments:\n"
 "-f fw_image       path to firmware image file\n"
 "-q                don't print informational messages, only errors\n"
 "-s                run in simulation mode\n"
 "-v                print info for every firmware segment sent to device\n"
 "-y                don't ask any questions\n"
 "security arguments:\n"
 "-d pwd            disable security using the given password for the selected\n"
 "                  user\n"
 "-e pwd            erase the device using the given pwd for the selected user\n"
 "-f                freeze the security configuration of the specified device\n"
 "-h pwd            enhanced erase the device using the given pwd for the\n"
 "                  selected user\n"
 "-k pwd            unlock the device using the given pwd for the selected\n"
 "                  user\n"
 "-l <high|maximum> specifies which security level to set: high or maximum\n"
 "-q                be quiet, do not print any status messages\n"
 "-s pwd            password the device (enable security) using the given\n"
 "                  pwd for the selected user\n"
 "-T timeout        overrides the timeout (seconds) used for erase operation\n"
 "-U <user|master>  specifies which user to set: user or master\n"
 "-y                don't ask any questions\n"
 "hpa arguments:\n"
 "-f                freeze the HPA configuration of the device\n"
 "-l                lock the HPA configuration of the device\n"
 "-P                make the HPA max sectors persist\n"
 "-p pwd            Set the HPA configuration password required for unlock\n"
 "                  calls\n"
 "-q                be quiet, do not print any status messages\n"
 "-s sectors        configures the maximum user accessible sectors of the\n"
 "                  device\n"
 "-U pwd            unlock the HPA configuration of the device\n"
 "-y                don't ask any questions\n"
 "ama arguments:\n"
 "-f                freeze the AMA configuration of the device\n"
 "-q                be quiet, do not print any status messages\n"
 "-s sectors        configures the maximum user accessible sectors of the\n"
 "                  device\n"
 "persist arguments:\n"
 "-i action         specify read_keys, read_reservation, report_cap, or\n"
 "                  read_full_status\n"
 "-o action         specify register, register_ignore, reserve, release,\n"
 "                  clear, preempt, preempt_abort, register_move, replace_lost\n"
 "-a                set the All Target Ports (ALL_TG_PT) bit\n"
 "-I tid            specify a Transport ID, e.g.: sas,0x1234567812345678\n"
 "-k key            specify the Reservation Key\n"
 "-K sa_key         specify the Service Action Reservation Key\n"
 "-p                set the Activate Persist Through Power Loss bit\n"
 "-R rtp            specify the Relative Target Port\n"
 "-s scope          specify the scope: lun, extent, element or a number\n"
 "-S                specify Transport ID for register, requires -I\n"
 "-T res_type       specify the reservation type: read_shared, wr_ex, rd_ex,\n"
 "                  ex_ac, wr_ex_ro, ex_ac_ro, wr_ex_ar, ex_ac_ar\n"
 "-U                unregister the current initiator for register_move\n"
 "attrib arguments:\n"
 "-r action         specify attr_values, attr_list, lv_list, part_list, or\n"
 "                  supp_attr\n"
 "-w attr           specify an attribute to write, one -w argument per attr\n"
 "-a attr_num       only display this attribute number\n"
 "-c                get cached attributes\n"
 "-e elem_addr      request attributes for the given element in a changer\n"
 "-F form1,form2    output format, comma separated list: text_esc, text_raw,\n"
 "                  nonascii_esc, nonascii_trim, nonascii_raw, field_all,\n"
 "                  field_none, field_desc, field_num, field_size, field_rw\n"
 "-p partition      request attributes for the given partition\n"
 "-s start_attr     request attributes starting at the given number\n"
 "-T elem_type      specify the element type (used with -e)\n"
 "-V logical_vol    specify the logical volume ID\n"
 "opcodes arguments:\n"
 "-o opcode         specify the individual opcode to list\n"
 "-s service_action specify the service action for the opcode\n"
 "-N                do not return SCSI error for unsupported SA\n"
 "-T                request nominal and recommended timeout values\n"
 "zone arguments:\n"
 "-c cmd            required: rz, open, close, finish, or rwp\n"
 "-a                apply the action to all zones\n"
 "-l LBA            specify the zone starting LBA\n"
 "-o rep_opts       report zones options: all, empty, imp_open, exp_open,\n"
 "                  closed, full, ro, offline, reset, nonseq, nonwp\n"
 "-P print_opt      report zones printing:  normal, summary, script\n"
 "epc arguments:\n"
 "-c cmd            required: restore, goto, timer, state, enable, disable,\n"
 "                  source, status, list\n"
 "-d                disable power mode (timer, state)\n"
 "-D                delayed entry (goto)\n"
 "-e                enable power mode (timer, state)\n"
 "-H                hold power mode (goto)\n"
 "-p power_cond     Idle_a, Idle_b, Idle_c, Standby_y, Standby_z (timer,\n"
 "                  state, goto)\n"
 "-P                only display power mode (status)\n"
 "-r rst_src        restore settings from: default, saved (restore)\n"
 "-s                save mode (timer, state, restore)\n"
 "-S power_src      set power source: battery, nonbattery (source)\n"
 "-T timer          set timer, seconds, .1 sec resolution (timer)\n"
 "timestamp arguments:\n"
 "-r                report the timestamp of the device\n"
 "-f format         report the timestamp of the device with the given\n"
 "                  strftime(3) format string\n"
 "-m                report the timestamp of the device as milliseconds since\n"
 "                  January 1st, 1970\n"
 "-U                report the time with UTC instead of the local time zone\n"
 "-s                set the timestamp of the device\n"
 "-f format         the format of the time string passed into strptime(3)\n"
 "-T time           the time value passed into strptime(3)\n"
 "-U                set the timestamp of the device to UTC time\n"
 );
 }
 
 int
 main(int argc, char **argv)
 {
 	int c;
 	char *device = NULL;
 	int unit = 0;
 	struct cam_device *cam_dev = NULL;
 	int timeout = 0, retry_count = 1;
 	camcontrol_optret optreturn;
 	char *tstr;
 	const char *mainopt = "C:En:Q:t:u:v";
 	const char *subopt = NULL;
 	char combinedopt[256];
 	int error = 0, optstart = 2;
 	int task_attr = MSG_SIMPLE_Q_TAG;
 	int devopen = 1;
 	path_id_t bus;
 	target_id_t target;
 	lun_id_t lun;
 
 	cmdlist = CAM_CMD_NONE;
 	arglist = CAM_ARG_NONE;
 
 	if (argc < 2) {
 		usage(0);
 		exit(1);
 	}
 
 	/*
 	 * Get the base option.
 	 */
 	optreturn = getoption(option_table,argv[1], &cmdlist, &arglist,&subopt);
 
 	if (optreturn == CC_OR_AMBIGUOUS) {
 		warnx("ambiguous option %s", argv[1]);
 		usage(0);
 		exit(1);
 	} else if (optreturn == CC_OR_NOT_FOUND) {
 		warnx("option %s not found", argv[1]);
 		usage(0);
 		exit(1);
 	}
 
 	/*
 	 * Ahh, getopt(3) is a pain.
 	 *
 	 * This is a gross hack.  There really aren't many other good
 	 * options (excuse the pun) for parsing options in a situation like
 	 * this.  getopt is kinda braindead, so you end up having to run
 	 * through the options twice, and give each invocation of getopt
 	 * the option string for the other invocation.
 	 *
 	 * You would think that you could just have two groups of options.
 	 * The first group would get parsed by the first invocation of
 	 * getopt, and the second group would get parsed by the second
 	 * invocation of getopt.  It doesn't quite work out that way.  When
 	 * the first invocation of getopt finishes, it leaves optind pointing
 	 * to the argument _after_ the first argument in the second group.
 	 * So when the second invocation of getopt comes around, it doesn't
 	 * recognize the first argument it gets and then bails out.
 	 *
 	 * A nice alternative would be to have a flag for getopt that says
 	 * "just keep parsing arguments even when you encounter an unknown
 	 * argument", but there isn't one.  So there's no real clean way to
 	 * easily parse two sets of arguments without having one invocation
 	 * of getopt know about the other.
 	 *
 	 * Without this hack, the first invocation of getopt would work as
 	 * long as the generic arguments are first, but the second invocation
 	 * (in the subfunction) would fail in one of two ways.  In the case
 	 * where you don't set optreset, it would fail because optind may be
 	 * pointing to the argument after the one it should be pointing at.
 	 * In the case where you do set optreset, and reset optind, it would
 	 * fail because getopt would run into the first set of options, which
 	 * it doesn't understand.
 	 *
 	 * All of this would "sort of" work if you could somehow figure out
 	 * whether optind had been incremented one option too far.  The
 	 * mechanics of that, however, are more daunting than just giving
 	 * both invocations all of the expect options for either invocation.
 	 *
 	 * Needless to say, I wouldn't mind if someone invented a better
 	 * (non-GPL!) command line parsing interface than getopt.  I
 	 * wouldn't mind if someone added more knobs to getopt to make it
 	 * work better.  Who knows, I may talk myself into doing it someday,
 	 * if the standards weenies let me.  As it is, it just leads to
 	 * hackery like this and causes people to avoid it in some cases.
 	 *
 	 * KDM, September 8th, 1998
 	 */
 	if (subopt != NULL)
 		sprintf(combinedopt, "%s%s", mainopt, subopt);
 	else
 		sprintf(combinedopt, "%s", mainopt);
 
 	/*
 	 * For these options we do not parse optional device arguments and
 	 * we do not open a passthrough device.
 	 */
 	if ((cmdlist == CAM_CMD_RESCAN)
 	 || (cmdlist == CAM_CMD_RESET)
 	 || (cmdlist == CAM_CMD_DEVTREE)
 	 || (cmdlist == CAM_CMD_USAGE)
 	 || (cmdlist == CAM_CMD_DEBUG))
 		devopen = 0;
 
 	if ((devopen == 1)
 	 && (argc > 2 && argv[2][0] != '-')) {
 		char name[30];
 		int rv;
 
 		if (isdigit(argv[2][0])) {
 			/* device specified as bus:target[:lun] */
 			rv = parse_btl(argv[2], &bus, &target, &lun, &arglist);
 			if (rv < 2)
 				errx(1, "numeric device specification must "
 				     "be either bus:target, or "
 				     "bus:target:lun");
 			/* default to 0 if lun was not specified */
 			if ((arglist & CAM_ARG_LUN) == 0) {
 				lun = 0;
 				arglist |= CAM_ARG_LUN;
 			}
 			optstart++;
 		} else {
 			if (cam_get_device(argv[2], name, sizeof name, &unit)
 			    == -1)
 				errx(1, "%s", cam_errbuf);
 			device = strdup(name);
 			arglist |= CAM_ARG_DEVICE | CAM_ARG_UNIT;
 			optstart++;
 		}
 	}
 	/*
 	 * Start getopt processing at argv[2/3], since we've already
 	 * accepted argv[1..2] as the command name, and as a possible
 	 * device name.
 	 */
 	optind = optstart;
 
 	/*
 	 * Now we run through the argument list looking for generic
 	 * options, and ignoring options that possibly belong to
 	 * subfunctions.
 	 */
 	while ((c = getopt(argc, argv, combinedopt))!= -1){
 		switch(c) {
 			case 'C':
 				retry_count = strtol(optarg, NULL, 0);
 				if (retry_count < 0)
 					errx(1, "retry count %d is < 0",
 					     retry_count);
 				arglist |= CAM_ARG_RETRIES;
 				break;
 			case 'E':
 				arglist |= CAM_ARG_ERR_RECOVER;
 				break;
 			case 'n':
 				arglist |= CAM_ARG_DEVICE;
 				tstr = optarg;
 				while (isspace(*tstr) && (*tstr != '\0'))
 					tstr++;
 				device = (char *)strdup(tstr);
 				break;
 			case 'Q': {
 				char *endptr;
 				int table_entry = 0;
 
 				tstr = optarg;
 				while (isspace(*tstr) && (*tstr != '\0'))
 					tstr++;
 				if (isdigit(*tstr)) {
 					task_attr = strtol(tstr, &endptr, 0);
 					if (*endptr != '\0') {
 						errx(1, "Invalid queue option "
 						    "%s", tstr);
 					}
 				} else {
 					size_t table_size;
 					scsi_nv_status status;
 
 					table_size = sizeof(task_attrs) /
 						     sizeof(task_attrs[0]);
 					status = scsi_get_nv(task_attrs,
 					    table_size, tstr, &table_entry,
 					    SCSI_NV_FLAG_IG_CASE);
 					if (status == SCSI_NV_FOUND)
 						task_attr = task_attrs[
 						    table_entry].value;
 					else {
 						errx(1, "%s option %s",
 						  (status == SCSI_NV_AMBIGUOUS)?
 						    "ambiguous" : "invalid",
 						    tstr);
 					}
 				}
 				break;
 			}
 			case 't':
 				timeout = strtol(optarg, NULL, 0);
 				if (timeout < 0)
 					errx(1, "invalid timeout %d", timeout);
 				/* Convert the timeout from seconds to ms */
 				timeout *= 1000;
 				arglist |= CAM_ARG_TIMEOUT;
 				break;
 			case 'u':
 				arglist |= CAM_ARG_UNIT;
 				unit = strtol(optarg, NULL, 0);
 				break;
 			case 'v':
 				arglist |= CAM_ARG_VERBOSE;
 				break;
 			default:
 				break;
 		}
 	}
 
 	/*
 	 * For most commands we'll want to open the passthrough device
 	 * associated with the specified device.  In the case of the rescan
 	 * commands, we don't use a passthrough device at all, just the
 	 * transport layer device.
 	 */
 	if (devopen == 1) {
 		if (((arglist & (CAM_ARG_BUS|CAM_ARG_TARGET)) == 0)
 		 && (((arglist & CAM_ARG_DEVICE) == 0)
 		  || ((arglist & CAM_ARG_UNIT) == 0))) {
 			errx(1, "subcommand \"%s\" requires a valid device "
 			     "identifier", argv[1]);
 		}
 
 		if ((cam_dev = ((arglist & (CAM_ARG_BUS | CAM_ARG_TARGET))?
 				cam_open_btl(bus, target, lun, O_RDWR, NULL) :
 				cam_open_spec_device(device,unit,O_RDWR,NULL)))
 		     == NULL)
 			errx(1,"%s", cam_errbuf);
 	}
 
 	/*
 	 * Reset optind to 2, and reset getopt, so these routines can parse
 	 * the arguments again.
 	 */
 	optind = optstart;
 	optreset = 1;
 
 	switch(cmdlist) {
 	case CAM_CMD_DEVLIST:
 		error = getdevlist(cam_dev);
 		break;
 	case CAM_CMD_HPA:
 		error = atahpa(cam_dev, retry_count, timeout,
 			       argc, argv, combinedopt);
 		break;
 	case CAM_CMD_AMA:
 		error = ataama(cam_dev, retry_count, timeout,
 			       argc, argv, combinedopt);
 		break;
 	case CAM_CMD_DEVTREE:
 		error = getdevtree(argc, argv, combinedopt);
 		break;
 	case CAM_CMD_DEVTYPE:
 		error = getdevtype(cam_dev);
 		break;
 	case CAM_CMD_TUR:
 		error = testunitready(cam_dev, task_attr, retry_count,
 		    timeout, 0);
 		break;
 	case CAM_CMD_INQUIRY:
 		error = scsidoinquiry(cam_dev, argc, argv, combinedopt,
 				      task_attr, retry_count, timeout);
 		break;
 	case CAM_CMD_IDENTIFY:
 		error = identify(cam_dev, retry_count, timeout);
 		break;
 	case CAM_CMD_STARTSTOP:
 		error = scsistart(cam_dev, arglist & CAM_ARG_START_UNIT,
 				  arglist & CAM_ARG_EJECT, task_attr,
 				  retry_count, timeout);
 		break;
 	case CAM_CMD_RESCAN:
 		error = dorescan_or_reset(argc, argv, 1);
 		break;
 	case CAM_CMD_RESET:
 		error = dorescan_or_reset(argc, argv, 0);
 		break;
 	case CAM_CMD_READ_DEFECTS:
 		error = readdefects(cam_dev, argc, argv, combinedopt,
 				    task_attr, retry_count, timeout);
 		break;
 	case CAM_CMD_MODE_PAGE:
 		modepage(cam_dev, argc, argv, combinedopt,
 			 task_attr, retry_count, timeout);
 		break;
 	case CAM_CMD_SCSI_CMD:
 		error = scsicmd(cam_dev, argc, argv, combinedopt,
 				task_attr, retry_count, timeout);
 		break;
 	case CAM_CMD_MMCSD_CMD:
 		error = mmcsdcmd(cam_dev, argc, argv, combinedopt,
 					retry_count, timeout);
 		break;
 	case CAM_CMD_SMP_CMD:
 		error = smpcmd(cam_dev, argc, argv, combinedopt,
 			       retry_count, timeout);
 		break;
 	case CAM_CMD_SMP_RG:
 		error = smpreportgeneral(cam_dev, argc, argv,
 					 combinedopt, retry_count,
 					 timeout);
 		break;
 	case CAM_CMD_SMP_PC:
 		error = smpphycontrol(cam_dev, argc, argv, combinedopt,
 				      retry_count, timeout);
 		break;
 	case CAM_CMD_SMP_PHYLIST:
 		error = smpphylist(cam_dev, argc, argv, combinedopt,
 				   retry_count, timeout);
 		break;
 	case CAM_CMD_SMP_MANINFO:
 		error = smpmaninfo(cam_dev, argc, argv, combinedopt,
 				   retry_count, timeout);
 		break;
 	case CAM_CMD_DEBUG:
 		error = camdebug(argc, argv, combinedopt);
 		break;
 	case CAM_CMD_TAG:
 		error = tagcontrol(cam_dev, argc, argv, combinedopt);
 		break;
 	case CAM_CMD_RATE:
 		error = ratecontrol(cam_dev, task_attr, retry_count,
 				    timeout, argc, argv, combinedopt);
 		break;
 	case CAM_CMD_FORMAT:
 		error = scsiformat(cam_dev, argc, argv,
 				   combinedopt, task_attr, retry_count,
 				   timeout);
 		break;
 	case CAM_CMD_REPORTLUNS:
 		error = scsireportluns(cam_dev, argc, argv,
 				       combinedopt, task_attr,
 				       retry_count, timeout);
 		break;
 	case CAM_CMD_READCAP:
 		error = scsireadcapacity(cam_dev, argc, argv,
 					 combinedopt, task_attr,
 					 retry_count, timeout);
 		break;
 	case CAM_CMD_IDLE:
 	case CAM_CMD_STANDBY:
 	case CAM_CMD_SLEEP:
 	case CAM_CMD_POWER_MODE:
 		error = atapm(cam_dev, argc, argv,
 			      combinedopt, retry_count, timeout);
 		break;
 	case CAM_CMD_APM:
 	case CAM_CMD_AAM:
 		error = ataaxm(cam_dev, argc, argv,
 			      combinedopt, retry_count, timeout);
 		break;
 	case CAM_CMD_SECURITY:
 		error = atasecurity(cam_dev, retry_count, timeout,
 				    argc, argv, combinedopt);
 		break;
 	case CAM_CMD_DOWNLOAD_FW:
 		error = fwdownload(cam_dev, argc, argv, combinedopt,
 		    arglist & CAM_ARG_VERBOSE, task_attr, retry_count,
 		    timeout);
 		break;
 	case CAM_CMD_SANITIZE:
 		error = sanitize(cam_dev, argc, argv, combinedopt, task_attr,
 				 retry_count, timeout);
 		break;
 	case CAM_CMD_PERSIST:
 		error = scsipersist(cam_dev, argc, argv, combinedopt,
 		    task_attr, retry_count, timeout,
 		    arglist & CAM_ARG_VERBOSE,
 		    arglist & CAM_ARG_ERR_RECOVER);
 		break;
 	case CAM_CMD_ATTRIB:
 		error = scsiattrib(cam_dev, argc, argv, combinedopt,
 		    task_attr, retry_count, timeout,
 		    arglist & CAM_ARG_VERBOSE,
 		    arglist & CAM_ARG_ERR_RECOVER);
 		break;
 	case CAM_CMD_OPCODES:
 		error = scsiopcodes(cam_dev, argc, argv, combinedopt,
 		    task_attr, retry_count, timeout,
 		    arglist & CAM_ARG_VERBOSE);
 		break;
 	case CAM_CMD_REPROBE:
 		error = reprobe(cam_dev);
 		break;
 	case CAM_CMD_ZONE:
 		error = zone(cam_dev, argc, argv, combinedopt,
 		    task_attr, retry_count, timeout,
 		    arglist & CAM_ARG_VERBOSE);
 		break;
 	case CAM_CMD_EPC:
 		error = epc(cam_dev, argc, argv, combinedopt,
 		    retry_count, timeout, arglist & CAM_ARG_VERBOSE);
 		break;
 	case CAM_CMD_TIMESTAMP:
 		error = timestamp(cam_dev, argc, argv, combinedopt,
 		    task_attr, retry_count, timeout,
 		    arglist & CAM_ARG_VERBOSE);
 		break;
 	case CAM_CMD_USAGE:
 		usage(1);
 		break;
 	default:
 		usage(0);
 		error = 1;
 		break;
 	}
 
 	if (cam_dev != NULL)
 		cam_close_device(cam_dev);
 
 	exit(error);
 }
Index: projects/fuse2/sbin/camcontrol/timestamp.c
===================================================================
--- projects/fuse2/sbin/camcontrol/timestamp.c	(revision 350434)
+++ projects/fuse2/sbin/camcontrol/timestamp.c	(revision 350435)
@@ -1,508 +1,501 @@
 /*-
  * Copyright (c) 2016 Spectra Logic Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    substantially similar to the "NO WARRANTY" disclaimer below
  *    ("Disclaimer") and any redistribution must be conditioned upon
  *    including a substantially similar Disclaimer requirement for further
  *    binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  *
  * Authors: Ken Merry           (Spectra Logic Corporation)
  *          Reid Linnemann      (Spectra Logic Corporation)
  *          Samuel Klopsch      (Spectra Logic Corporation)
  */
 /*
  * SCSI tape drive timestamp support
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <string.h>
 #include <err.h>
 #include <time.h>
 #include <locale.h>
 
 #include <cam/cam.h>
 #include <cam/cam_debug.h>
 #include <cam/cam_ccb.h>
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_message.h>
 #include <camlib.h>
 #include "camcontrol.h"
 
 #define TIMESTAMP_REPORT 0
 #define TIMESTAMP_SET    1
 #define MIL              "milliseconds"
 #define UTC              "utc"
 
 static int set_restore_flags(struct cam_device *device, uint8_t *flags,
 			     int set_flag, int task_attr, int retry_count,
 			     int timeout);
 static int report_timestamp(struct cam_device *device, uint64_t *ts,
 			    int task_attr, int retry_count, int timeout);
 static int set_timestamp(struct cam_device *device, char *format_string,
 			 char *timestamp_string, int task_attr, int retry_count,
 			 int timeout);
 
 static int
 set_restore_flags(struct cam_device *device, uint8_t *flags, int set_flag,
 		  int task_attr, int retry_count, int timeout)
 {
 	unsigned long blk_desc_length, hdr_and_blk_length;
 	int error = 0;
 	struct scsi_control_ext_page *control_page = NULL;
 	struct scsi_mode_header_10 *mode_hdr = NULL;
-	struct scsi_mode_sense_10 *cdb = NULL;
 	union ccb *ccb = NULL;
 	unsigned long mode_buf_size = sizeof(struct scsi_mode_header_10) +
 	    sizeof(struct scsi_mode_blk_desc) +
 	    sizeof(struct scsi_control_ext_page);
 	uint8_t mode_buf[mode_buf_size];
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("%s: error allocating CCB", __func__);
 		error = 1;
 		goto bailout;
 	}
 	/*
 	 * Get the control extension subpage, we'll send it back modified to
 	 * enable SCSI control over the tape drive's timestamp
 	 */
-	scsi_mode_sense_len(&ccb->csio,
+	scsi_mode_sense_subpage(&ccb->csio,
 	    /*retries*/ retry_count,
 	    /*cbfcnp*/ NULL,
 	    /*tag_action*/ task_attr,
 	    /*dbd*/ 0,
 	    /*page_control*/ SMS_PAGE_CTRL_CURRENT,
 	    /*page*/ SCEP_PAGE_CODE,
+	    /*subpage*/ SCEP_SUBPAGE_CODE,
 	    /*param_buf*/ &mode_buf[0],
 	    /*param_len*/ mode_buf_size,
 	    /*minimum_cmd_size*/ 10,
 	    /*sense_len*/ SSD_FULL_SIZE,
 	    /*timeout*/ timeout ? timeout : 5000);
-	/*
-	 * scsi_mode_sense_len does not have a subpage argument at the moment,
-	 * so we have to manually set the subpage code before calling
-	 * cam_send_ccb().
-	 */
-	cdb = (struct scsi_mode_sense_10 *)ccb->csio.cdb_io.cdb_bytes;
-	cdb->subpage = SCEP_SUBPAGE_CODE;
 
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 	if (retry_count > 0)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	error = cam_send_ccb(device, ccb);
 	if (error != 0) {
 		warn("error sending Mode Sense");
 		goto bailout;
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		cam_error_print(device, ccb, CAM_ESF_ALL,
 				CAM_EPF_ALL, stderr);
 		error = 1;
 		goto bailout;
 	}
 
 	mode_hdr = (struct scsi_mode_header_10 *)&mode_buf[0];
 	blk_desc_length = scsi_2btoul(mode_hdr->blk_desc_len);
 	hdr_and_blk_length = sizeof(struct scsi_mode_header_10)+blk_desc_length;
 	/*
 	 * Create the control page at the correct point in the mode_buf, it
 	 * starts after the header and the blk description.
 	 */
 	assert(hdr_and_blk_length <=
 	    sizeof(mode_buf) - sizeof(struct scsi_control_ext_page));
 	control_page = (struct scsi_control_ext_page *)&mode_buf
 	    [hdr_and_blk_length];
 	if (set_flag != 0) {
 		*flags = control_page->flags;
 		/*
 		 * Set the SCSIP flag to enable SCSI to change the
 		 * tape drive's timestamp.
 		 */
 		control_page->flags |= SCEP_SCSIP;
 	} else {
 		control_page->flags = *flags;
 	}
 
 	scsi_mode_select_len(&ccb->csio,
 	    /*retries*/ retry_count,
 	    /*cbfcnp*/ NULL,
 	    /*tag_action*/ task_attr,
 	    /*scsi_page_fmt*/ 1,
 	    /*save_pages*/ 0,
 	    /*param_buf*/ &mode_buf[0],
 	    /*param_len*/ mode_buf_size,
 	    /*minimum_cmd_size*/ 10,
 	    /*sense_len*/ SSD_FULL_SIZE,
 	    /*timeout*/ timeout ? timeout : 5000);
 
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 	if (retry_count > 0)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	error = cam_send_ccb(device, ccb);
 	if (error != 0) {
 		warn("error sending Mode Select");
 		goto bailout;
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		cam_error_print(device, ccb, CAM_ESF_ALL,
 				CAM_EPF_ALL, stderr);
 		error = 1;
 		goto bailout;
 	}
 
 bailout:
 	if (ccb != NULL)
 		cam_freeccb(ccb);
 
 	return error;
 }
 
 static int
 report_timestamp(struct cam_device *device, uint64_t *ts, int task_attr,
 		 int retry_count, int timeout)
 {
 	int error = 0;
 	struct scsi_report_timestamp_data *report_buf = malloc(
 		sizeof(struct scsi_report_timestamp_data));
 	uint8_t temp_timestamp[8];
 	uint32_t report_buf_size = sizeof(
 	    struct scsi_report_timestamp_data);
 	union ccb *ccb = NULL;
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("%s: error allocating CCB", __func__);
 		error = 1;
 		goto bailout;
 	}
 
 	scsi_report_timestamp(&ccb->csio,
 	    /*retries*/ retry_count,
 	    /*cbfcnp*/ NULL,
 	    /*tag_action*/ task_attr,
 	    /*pdf*/ 0,
 	    /*buf*/ report_buf,
 	    /*buf_len*/ report_buf_size,
 	    /*sense_len*/ SSD_FULL_SIZE,
 	    /*timeout*/ timeout ? timeout : 5000);
 
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 	if (retry_count > 0)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	error = cam_send_ccb(device, ccb);
 	if (error != 0) {
 		warn("error sending Report Timestamp");
 		goto bailout;
 	}
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		cam_error_print(device, ccb, CAM_ESF_ALL,
 				CAM_EPF_ALL, stderr);
 		error = 1;
 		goto bailout;
 	}
 
 	bzero(temp_timestamp, sizeof(temp_timestamp));
 	memcpy(&temp_timestamp[2], &report_buf->timestamp, 6);
 
 	*ts = scsi_8btou64(temp_timestamp);
 
 bailout:
 	if (ccb != NULL)
 		cam_freeccb(ccb);
 	free(report_buf);
 
 	return error;
 }
 
 static int
 set_timestamp(struct cam_device *device, char *format_string,
 	      char *timestamp_string, int task_attr, int retry_count,
 	      int timeout)
 {
 	struct scsi_set_timestamp_parameters ts_p;
 	time_t time_value;
 	struct tm time_struct;
 	uint8_t flags = 0;
 	int error = 0;
 	uint64_t ts = 0;
 	union ccb *ccb = NULL;
 	int do_restore_flags = 0;
 
 	error = set_restore_flags(device, &flags, /*set_flag*/ 1, task_attr,
 				  retry_count, timeout);
 	if (error != 0)
 		goto bailout;
 
 	do_restore_flags = 1;
 
 	ccb = cam_getccb(device);
 	if (ccb == NULL) {
 		warnx("%s: error allocating CCB", __func__);
 		error = 1;
 		goto bailout;
 	}
 
 	if (strcmp(format_string, UTC) == 0) {
 		time(&time_value);
 		ts = (uint64_t) time_value;
 	} else {
 		bzero(&time_struct, sizeof(struct tm));
 		if (strptime(timestamp_string, format_string,
 		    &time_struct) == NULL) {
 			warnx("%s: strptime(3) failed", __func__);
 			error = 1;
 			goto bailout;
 		}
 		time_value = mktime(&time_struct);
 		ts = (uint64_t) time_value;
 	}
 	/* Convert time from seconds to milliseconds */
 	ts *= 1000;
 	bzero(&ts_p, sizeof(ts_p));
 	scsi_create_timestamp(ts_p.timestamp, ts);
 
 	scsi_set_timestamp(&ccb->csio,
 	    /*retries*/ retry_count,
 	    /*cbfcnp*/ NULL,
 	    /*tag_action*/ task_attr,
 	    /*buf*/ &ts_p,
 	    /*buf_len*/ sizeof(ts_p),
 	    /*sense_len*/ SSD_FULL_SIZE,
 	    /*timeout*/ timeout ? timeout : 5000);
 
 	ccb->ccb_h.flags |= CAM_DEV_QFRZDIS;
 	if (retry_count > 0)
 		ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER;
 
 	error = cam_send_ccb(device, ccb);
 	if (error != 0) {
 		warn("error sending Set Timestamp");
 		goto bailout;
 	}
 
 	if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) {
 		cam_error_print(device, ccb, CAM_ESF_ALL,
 				CAM_EPF_ALL, stderr);
 		error = 1;
 		goto bailout;
 	}
 
 	printf("Timestamp set to %ju\n", (uintmax_t)ts);
 
 bailout:
 	if (do_restore_flags != 0)
 		error = set_restore_flags(device, &flags, /*set_flag*/ 0,
 					  task_attr, retry_count, timeout);
 	if (ccb != NULL)
 		cam_freeccb(ccb);
 
 	return error;
 }
 
 int
 timestamp(struct cam_device *device, int argc, char **argv, char *combinedopt,
 	  int task_attr, int retry_count, int timeout, int verbosemode __unused)
 {
 	int c;
 	uint64_t ts = 0;
 	char *format_string = NULL;
 	char *timestamp_string = NULL;
 	int action = -1;
 	int error = 0;
 	int single_arg = 0;
 	int do_utc = 0;
 
 	while ((c = getopt(argc, argv, combinedopt)) != -1) {
 		switch (c) {
 		case 'r': {
 			if (action != -1) {
 				warnx("Use only one -r or only one -s");
 				error =1;
 				goto bailout;
 			}
 			action = TIMESTAMP_REPORT;
 			break;
 		}
 		case 's': {
 			if (action != -1) {
 				warnx("Use only one -r or only one -s");
 				error = 1;
 				goto bailout;
 			}
 			action = TIMESTAMP_SET;
 			break;
 		}
 		case 'f': {
 			single_arg++;
 			free(format_string);
 			format_string = strdup(optarg);
 			if (format_string == NULL) {
 				warn("Error allocating memory for format "
 				   "argument");
 				error = 1;
 				goto bailout;
 			}
 			break;
 		}
 		case 'm': {
 			single_arg++;
 			free(format_string);
 			format_string = strdup(MIL);
 			if (format_string == NULL) {
 				warn("Error allocating memory");
 				error = 1;
 				goto bailout;
 			}
 			break;
 		}
 		case 'U': {
 			do_utc = 1;
 			break;
 		}
 		case 'T':
 			free(timestamp_string);
 			timestamp_string = strdup(optarg);
 			if (timestamp_string == NULL) {
 				warn("Error allocating memory for format "
 				   "argument");
 				error = 1;
 				goto bailout;
 			}
 			break;
 		default:
 			break;
 		}
 	}
 
 	if (action == -1) {
 		warnx("Must specify an action, either -r or -s");
 		error = 1;
 		goto bailout;
 	}
 
 	if (single_arg > 1) {
 		warnx("Select only one: %s",
 		    (action == TIMESTAMP_REPORT) ?
 		    "-f format or -m for the -r flag" : 
 		    "-f format -T time or -U for the -s flag");
 		error = 1;
 		goto bailout;
 	}
 
 	if (action == TIMESTAMP_SET) {
 		if ((format_string == NULL)
 		 && (do_utc == 0)) {
 			warnx("Must specify either -f format or -U for "
 			    "setting the timestamp");
 			error = 1;
 		} else if ((format_string != NULL)
 			&& (do_utc != 0)) {
 			warnx("Must specify only one of -f or -U to set "
 			    "the timestamp");
 			error = 1;
 		} else if ((format_string != NULL)
 			&& (strcmp(format_string, MIL) == 0)) {
 			warnx("-m is not allowed for setting the "
 			    "timestamp");
 			error = 1;
 		} else if ((do_utc == 0)
 			&& (timestamp_string == NULL)) {
 			warnx("Must specify the time (-T) to set as the "
 			    "timestamp");
 			error = 1;
 		}
 		if (error != 0)
 			goto bailout;
 	} else if (action == TIMESTAMP_REPORT) {
 		if (format_string == NULL) {
 			format_string = strdup("%c %Z");
 			if (format_string == NULL) {
 				warn("Error allocating memory for format "
 				    "string");
 				error = 1;
 				goto bailout;
 			}
 		}
 	}
 
 	if (action == TIMESTAMP_REPORT) {
 		error = report_timestamp(device, &ts, task_attr, retry_count,
 		    timeout);
 		if (error != 0) {
 			goto bailout;
 		} else if (strcmp(format_string, MIL) == 0) {
 			printf("Timestamp in milliseconds: %ju\n",
 			    (uintmax_t)ts);
 		} else {
 			char temp_timestamp_string[100];
 			time_t time_var = ts / 1000;
 			const struct tm *restrict cur_time;
 
 			setlocale(LC_TIME, "");
 			if (do_utc != 0)
 				cur_time = gmtime(&time_var);
 			else
 				cur_time = localtime(&time_var);
 
 			strftime(temp_timestamp_string,
 			    sizeof(temp_timestamp_string), format_string,
 			    cur_time);
 			printf("Formatted timestamp: %s\n",
 			    temp_timestamp_string);
 		}
 	} else if (action == TIMESTAMP_SET) {
 		if (do_utc != 0) {
 			format_string = strdup(UTC);
 			if (format_string == NULL) {
 				warn("Error allocating memory for format "
 				    "string");
 				error = 1;
 				goto bailout;
 			}
 		}
 
 		error = set_timestamp(device, format_string, timestamp_string,
 		    task_attr, retry_count, timeout);
 	}
 
 bailout:
 	free(format_string);
 	free(timestamp_string);
 
 	return (error);
 }
Index: projects/fuse2/sbin/nvmecontrol/identify_ext.c
===================================================================
--- projects/fuse2/sbin/nvmecontrol/identify_ext.c	(revision 350434)
+++ projects/fuse2/sbin/nvmecontrol/identify_ext.c	(revision 350435)
@@ -1,223 +1,246 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (C) 2012-2013 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 
 #include <ctype.h>
 #include <err.h>
 #include <fcntl.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "nvmecontrol.h"
 #include "nvmecontrol_ext.h"
 
 void
 nvme_print_controller(struct nvme_controller_data *cdata)
 {
 	uint8_t str[128];
 	char cbuf[UINT128_DIG + 1];
 	uint16_t oncs, oacs;
-	uint8_t compare, write_unc, dsm, vwc_present;
+	uint8_t compare, write_unc, dsm, t;
 	uint8_t security, fmt, fw, nsmgmt;
 	uint8_t	fw_slot1_ro, fw_num_slots;
 	uint8_t ns_smart;
 	uint8_t sqes_max, sqes_min;
 	uint8_t cqes_max, cqes_min;
 
 	oncs = cdata->oncs;
 	compare = (oncs >> NVME_CTRLR_DATA_ONCS_COMPARE_SHIFT) &
 		NVME_CTRLR_DATA_ONCS_COMPARE_MASK;
 	write_unc = (oncs >> NVME_CTRLR_DATA_ONCS_WRITE_UNC_SHIFT) &
 		NVME_CTRLR_DATA_ONCS_WRITE_UNC_MASK;
 	dsm = (oncs >> NVME_CTRLR_DATA_ONCS_DSM_SHIFT) &
 		NVME_CTRLR_DATA_ONCS_DSM_MASK;
-	vwc_present = (cdata->vwc >> NVME_CTRLR_DATA_VWC_PRESENT_SHIFT) &
-		NVME_CTRLR_DATA_VWC_PRESENT_MASK;
 
 	oacs = cdata->oacs;
 	security = (oacs >> NVME_CTRLR_DATA_OACS_SECURITY_SHIFT) &
 		NVME_CTRLR_DATA_OACS_SECURITY_MASK;
 	fmt = (oacs >> NVME_CTRLR_DATA_OACS_FORMAT_SHIFT) &
 		NVME_CTRLR_DATA_OACS_FORMAT_MASK;
 	fw = (oacs >> NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT) &
 		NVME_CTRLR_DATA_OACS_FIRMWARE_MASK;
 	nsmgmt = (oacs >> NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT) &
 		NVME_CTRLR_DATA_OACS_NSMGMT_MASK;
 
 	fw_num_slots = (cdata->frmw >> NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT) &
 		NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK;
 	fw_slot1_ro = (cdata->frmw >> NVME_CTRLR_DATA_FRMW_SLOT1_RO_SHIFT) &
 		NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK;
 
 	ns_smart = (cdata->lpa >> NVME_CTRLR_DATA_LPA_NS_SMART_SHIFT) &
 		NVME_CTRLR_DATA_LPA_NS_SMART_MASK;
 
 	sqes_min = (cdata->sqes >> NVME_CTRLR_DATA_SQES_MIN_SHIFT) &
 		NVME_CTRLR_DATA_SQES_MIN_MASK;
 	sqes_max = (cdata->sqes >> NVME_CTRLR_DATA_SQES_MAX_SHIFT) &
 		NVME_CTRLR_DATA_SQES_MAX_MASK;
 
 	cqes_min = (cdata->cqes >> NVME_CTRLR_DATA_CQES_MIN_SHIFT) &
 		NVME_CTRLR_DATA_CQES_MIN_MASK;
 	cqes_max = (cdata->cqes >> NVME_CTRLR_DATA_CQES_MAX_SHIFT) &
 		NVME_CTRLR_DATA_CQES_MAX_MASK;
 
 	printf("Controller Capabilities/Features\n");
 	printf("================================\n");
 	printf("Vendor ID:                   %04x\n", cdata->vid);
 	printf("Subsystem Vendor ID:         %04x\n", cdata->ssvid);
 	nvme_strvis(str, cdata->sn, sizeof(str), NVME_SERIAL_NUMBER_LENGTH);
 	printf("Serial Number:               %s\n", str);
 	nvme_strvis(str, cdata->mn, sizeof(str), NVME_MODEL_NUMBER_LENGTH);
 	printf("Model Number:                %s\n", str);
 	nvme_strvis(str, cdata->fr, sizeof(str), NVME_FIRMWARE_REVISION_LENGTH);
 	printf("Firmware Version:            %s\n", str);
 	printf("Recommended Arb Burst:       %d\n", cdata->rab);
 	printf("IEEE OUI Identifier:         %02x %02x %02x\n",
 		cdata->ieee[0], cdata->ieee[1], cdata->ieee[2]);
-	printf("Multi-Path I/O Capabilities: %s%s%s%s\n",
+	printf("Multi-Path I/O Capabilities: %s%s%s%s%s\n",
 	    (cdata->mic == 0) ? "Not Supported" : "",
+	    ((cdata->mic >> NVME_CTRLR_DATA_MIC_ANAR_SHIFT) &
+	     NVME_CTRLR_DATA_MIC_ANAR_MASK) ? "Asymmetric, " : "",
 	    ((cdata->mic >> NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT) &
 	     NVME_CTRLR_DATA_MIC_SRIOVVF_MASK) ? "SR-IOV VF, " : "",
 	    ((cdata->mic >> NVME_CTRLR_DATA_MIC_MCTRLRS_SHIFT) &
 	     NVME_CTRLR_DATA_MIC_MCTRLRS_MASK) ? "Multiple controllers, " : "",
 	    ((cdata->mic >> NVME_CTRLR_DATA_MIC_MPORTS_SHIFT) &
 	     NVME_CTRLR_DATA_MIC_MPORTS_MASK) ? "Multiple ports" : "");
 	/* TODO: Use CAP.MPSMIN to determine true memory page size. */
 	printf("Max Data Transfer Size:      ");
 	if (cdata->mdts == 0)
 		printf("Unlimited\n");
 	else
 		printf("%ld\n", PAGE_SIZE * (1L << cdata->mdts));
 	printf("Controller ID:               0x%02x\n", cdata->ctrlr_id);
 	printf("Version:                     %d.%d.%d\n",
 	    (cdata->ver >> 16) & 0xffff, (cdata->ver >> 8) & 0xff,
 	    cdata->ver & 0xff);
 	printf("\n");
 
 	printf("Admin Command Set Attributes\n");
 	printf("============================\n");
 	printf("Security Send/Receive:       %s\n",
 		security ? "Supported" : "Not Supported");
 	printf("Format NVM:                  %s\n",
 		fmt ? "Supported" : "Not Supported");
 	printf("Firmware Activate/Download:  %s\n",
 		fw ? "Supported" : "Not Supported");
 	printf("Namespace Managment:         %s\n",
 		nsmgmt ? "Supported" : "Not Supported");
 	printf("Device Self-test:            %sSupported\n",
 	    ((oacs >> NVME_CTRLR_DATA_OACS_SELFTEST_SHIFT) &
 	     NVME_CTRLR_DATA_OACS_SELFTEST_MASK) ? "" : "Not ");
 	printf("Directives:                  %sSupported\n",
 	    ((oacs >> NVME_CTRLR_DATA_OACS_DIRECTIVES_SHIFT) &
 	     NVME_CTRLR_DATA_OACS_DIRECTIVES_MASK) ? "" : "Not ");
 	printf("NVMe-MI Send/Receive:        %sSupported\n",
 	    ((oacs >> NVME_CTRLR_DATA_OACS_NVMEMI_SHIFT) &
 	     NVME_CTRLR_DATA_OACS_NVMEMI_MASK) ? "" : "Not ");
 	printf("Virtualization Management:   %sSupported\n",
 	    ((oacs >> NVME_CTRLR_DATA_OACS_VM_SHIFT) &
 	     NVME_CTRLR_DATA_OACS_VM_MASK) ? "" : "Not ");
-	printf("Doorbell Buffer Config       %sSupported\n",
+	printf("Doorbell Buffer Config:      %sSupported\n",
 	    ((oacs >> NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT) &
 	     NVME_CTRLR_DATA_OACS_DBBUFFER_MASK) ? "" : "Not ");
+	printf("Get LBA Status:              %sSupported\n",
+	    ((oacs >> NVME_CTRLR_DATA_OACS_GETLBA_SHIFT) &
+	     NVME_CTRLR_DATA_OACS_GETLBA_MASK) ? "" : "Not ");
+	printf("Sanitize:                    ");
+	if (cdata->sanicap != 0) {
+		printf("%s%s%s\n",
+		    ((cdata->sanicap >> NVME_CTRLR_DATA_SANICAP_CES_SHIFT) &
+		     NVME_CTRLR_DATA_SANICAP_CES_MASK) ? "crypto, " : "",
+		    ((cdata->sanicap >> NVME_CTRLR_DATA_SANICAP_BES_SHIFT) &
+		     NVME_CTRLR_DATA_SANICAP_BES_MASK) ? "block, " : "",
+		    ((cdata->sanicap >> NVME_CTRLR_DATA_SANICAP_OWS_SHIFT) &
+		     NVME_CTRLR_DATA_SANICAP_OWS_MASK) ? "overwrite" : "");
+	} else {
+		printf("Not Supported\n");
+	}
 	printf("Abort Command Limit:         %d\n", cdata->acl+1);
 	printf("Async Event Request Limit:   %d\n", cdata->aerl+1);
 	printf("Number of Firmware Slots:    ");
 	if (fw != 0)
 		printf("%d\n", fw_num_slots);
 	else
 		printf("N/A\n");
 	printf("Firmware Slot 1 Read-Only:   ");
 	if (fw != 0)
 		printf("%s\n", fw_slot1_ro ? "Yes" : "No");
 	else
 		printf("N/A\n");
 	printf("Per-Namespace SMART Log:     %s\n",
 		ns_smart ? "Yes" : "No");
 	printf("Error Log Page Entries:      %d\n", cdata->elpe+1);
 	printf("Number of Power States:      %d\n", cdata->npss+1);
 
 	printf("\n");
 	printf("NVM Command Set Attributes\n");
 	printf("==========================\n");
 	printf("Submission Queue Entry Size\n");
 	printf("  Max:                       %d\n", 1 << sqes_max);
 	printf("  Min:                       %d\n", 1 << sqes_min);
 	printf("Completion Queue Entry Size\n");
 	printf("  Max:                       %d\n", 1 << cqes_max);
 	printf("  Min:                       %d\n", 1 << cqes_min);
 	printf("Number of Namespaces:        %d\n", cdata->nn);
 	printf("Compare Command:             %s\n",
 		compare ? "Supported" : "Not Supported");
 	printf("Write Uncorrectable Command: %s\n",
 		write_unc ? "Supported" : "Not Supported");
 	printf("Dataset Management Command:  %s\n",
 		dsm ? "Supported" : "Not Supported");
 	printf("Write Zeroes Command:        %sSupported\n",
 	    ((oncs >> NVME_CTRLR_DATA_ONCS_WRZERO_SHIFT) &
 	     NVME_CTRLR_DATA_ONCS_WRZERO_MASK) ? "" : "Not ");
 	printf("Save Features:               %sSupported\n",
 	    ((oncs >> NVME_CTRLR_DATA_ONCS_SAVEFEAT_SHIFT) &
 	     NVME_CTRLR_DATA_ONCS_SAVEFEAT_MASK) ? "" : "Not ");
 	printf("Reservations:                %sSupported\n",
 	    ((oncs >> NVME_CTRLR_DATA_ONCS_RESERV_SHIFT) &
 	     NVME_CTRLR_DATA_ONCS_RESERV_MASK) ? "" : "Not ");
 	printf("Timestamp feature:           %sSupported\n",
 	    ((oncs >> NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT) &
 	     NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK) ? "" : "Not ");
+	printf("Verify feature:              %sSupported\n",
+	    ((oncs >> NVME_CTRLR_DATA_ONCS_VERIFY_SHIFT) &
+	     NVME_CTRLR_DATA_ONCS_VERIFY_MASK) ? "" : "Not ");
 	printf("Fused Operation Support:     %s%s\n",
 	    (cdata->fuses == 0) ? "Not Supported" : "",
 	    ((cdata->fuses >> NVME_CTRLR_DATA_FUSES_CNW_SHIFT) &
 	     NVME_CTRLR_DATA_FUSES_CNW_MASK) ? "Compare and Write" : "");
 	printf("Format NVM Attributes:       %s%s Erase, %s Format\n",
 	    ((cdata->fna >> NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_SHIFT) &
 	     NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK) ? "Crypto Erase, " : "",
 	    ((cdata->fna >> NVME_CTRLR_DATA_FNA_ERASE_ALL_SHIFT) &
 	     NVME_CTRLR_DATA_FNA_ERASE_ALL_MASK) ? "All-NVM" : "Per-NS",
 	    ((cdata->fna >> NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT) &
 	     NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK) ? "All-NVM" : "Per-NS");
-	printf("Volatile Write Cache:        %s\n",
-		vwc_present ? "Present" : "Not Present");
+	t = (cdata->vwc >> NVME_CTRLR_DATA_VWC_ALL_SHIFT) &
+	    NVME_CTRLR_DATA_VWC_ALL_MASK;
+	printf("Volatile Write Cache:        %s%s\n",
+	    ((cdata->vwc >> NVME_CTRLR_DATA_VWC_PRESENT_SHIFT) &
+	     NVME_CTRLR_DATA_VWC_PRESENT_MASK) ? "Present" : "Not Present",
+	    (t == NVME_CTRLR_DATA_VWC_ALL_NO) ? ", no flush all" :
+	    (t == NVME_CTRLR_DATA_VWC_ALL_YES) ? ", flush all" : "");
 
 	if (nsmgmt) {
 		printf("\n");
 		printf("Namespace Drive Attributes\n");
 		printf("==========================\n");
 		printf("NVM total cap:               %s\n",
 			   uint128_to_str(to128(cdata->untncap.tnvmcap), cbuf, sizeof(cbuf)));
 		printf("NVM unallocated cap:         %s\n",
 			   uint128_to_str(to128(cdata->untncap.unvmcap), cbuf, sizeof(cbuf)));
 	}
 }
Index: projects/fuse2/share/man/man4/cc_dctcp.4
===================================================================
--- projects/fuse2/share/man/man4/cc_dctcp.4	(revision 350434)
+++ projects/fuse2/share/man/man4/cc_dctcp.4	(revision 350435)
@@ -1,133 +1,146 @@
 .\"
 .\" Copyright (c) 2014 Midori Kato <katoon@sfc.wide.ad.jp>
 .\" Copyright (c) 2014 The FreeBSD Foundation
 .\" All rights reserved.
 .\"
 .\" Portions of this documentation were written at Keio University, Japan.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
 .\" ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\" $FreeBSD$
 .\"
-.Dd January 12, 2015
+.Dd July 29, 2019
 .Dt CC_DCTCP 4
 .Os
 .Sh NAME
 .Nm cc_dctcp
 .Nd DCTCP Congestion Control Algorithm
 .Sh DESCRIPTION
 The DCTCP (data center TCP) congestion control algorithm aims to maximise
 throughput and minimise latency in data center networks by utilising the
 proportion of Explicit Congestion Notification (ECN) marks received from capable
 hardware as a congestion signal.
 .Pp
 DCTCP uses fraction of ECN marked packets to update congestion window.
 The window reduction ratio is always <= 1/2.
 Only when all of the packets are
 marked, congestion window is halved.
 .Pp
 In order to keep the accuracy of the ECN marked fraction, a DCTCP receiver
 mirrors back incoming (or missing) CE marks by setting (or clearing) ECE marks.
 This feedback methodology is also adopted when the receiver uses delayed ACK.
 .Pp
 The
 .Fx
 DCTCP implementation includes two minor modifications for the one-sided
 deployment.
 Considering the situation that DCTCP is used as sender and classic
 ECN is used as receiver, DCTCP sets the CWR flag as the reaction to the ECE
 flag.
 In addition, when classic ECN is used as sender and DCTCP is used as
 receiver, DCTCP avoids to mirror back ACKs only when the CWR flag is
 set in the incoming packet.
 .Pp
-The other specifications are based on the paper and Internet Draft referenced
+The other specifications are based on the paper and the RFC referenced
 in the
 .Sx SEE ALSO
 section below.
 .Sh MIB Variables
 The algorithm exposes the following tunable variables in the
 .Va net.inet.tcp.cc.dctcp
 branch of the
 .Xr sysctl 3
 MIB:
-.Bl -tag -width ".Va alpha"
+.Bl -tag -width ".Va slowstart"
 .It Va alpha
-An initial estimator of the congestion on the link.
-Default is 0.
-.It Va dctcp_shift_g
-An estimation gain in the alpha calculation.
-Default is 16.
+The initial value of the congestion estimate on the link.
+The valid range is from 0 to 1024, where 1024 halves the congestion
+window if a CE mark is observed in the first window, before
+.Va alpha
+has been able to adjust to the congestion level on that path.
+Default is 1024.
+.It Va shift_g
+An estimation gain in the
+.Va alpha
+calculation.
+This influences the responsiveness when adjusting alpha
+to the most recently observed window.
+The valid range is from 0 to 10; the default is 4, resulting in an effective
+gain of 1 / ( 2 ^
+.Va shift_g
+), or 1/16th.
 .It Va slowstart
-A trigger to halve congestion window after slow start.
-Default does nothing to halve window.
+A flag selecting whether the congestion window is reduced by one half after slow start.
+Valid settings are 0 and 1; the default is 0.
 .El
 .Sh SEE ALSO
 .Xr cc_chd 4 ,
 .Xr cc_cubic 4 ,
 .Xr cc_hd 4 ,
 .Xr cc_htcp 4 ,
 .Xr cc_newreno 4 ,
 .Xr cc_vegas 4 ,
 .Xr mod_cc 4 ,
 .Xr tcp 4 ,
 .Xr mod_cc 9
 .Rs
 .%A "Mohammad Alizadeh"
 .%A "Albert Greenberg"
 .%A "David A. Maltz"
 .%A "Jitendra Padhye"
 .%A "Parveen Patel"
 .%A "Balaji Prabhakar"
 .%A "Sudipta Sengupta"
 .%A "Murari Sridharan"
 .%T "Data Center TCP (DCTCP)"
 .%U "http://research.microsoft.com/pubs/121386/dctcp-public.pdf"
 .%J "ACM SIGCOMM 2010"
 .%D "July 2010"
 .%P "63-74"
 .Re
 .Rs
 .%A "Stephen Bensley"
-.%A "Lars Eggert"
 .%A "Dave Thaler"
-.%T "Microsoft's Datacenter TCP (DCTCP): TCP Congestion Control for Datacenters"
-.%U "http://tools.ietf.org/html/draft-bensley-tcpm-dctcp-01"
+.%A "Praveen Balasubramanian"
+.%A "Lars Eggert"
+.%A "Glenn Judd"
+.%T "Data Center TCP (DCTCP): TCP Congestion Control for Data Centers"
+.%U "https://tools.ietf.org/html/rfc8257"
 .Re
 .Sh HISTORY
 The
 .Nm
 congestion control module first appeared in
 .Fx 11.0 .
 .Pp
 The module was first released in 2014 by Midori Kato studying at Keio
 University, Japan.
 .Sh AUTHORS
 .An -nosplit
 The
 .Nm
 congestion control module and this manual page were written by
 .An Midori Kato Mt katoon@sfc.wide.ad.jp
 and
 .An Lars Eggert Mt lars@netapp.com
 with help and modifications from
 .An Hiren Panchasara Mt hiren@FreeBSD.org
Index: projects/fuse2/sys/arm/ti/cpsw/if_cpsw.c
===================================================================
--- projects/fuse2/sys/arm/ti/cpsw/if_cpsw.c	(revision 350434)
+++ projects/fuse2/sys/arm/ti/cpsw/if_cpsw.c	(revision 350435)
@@ -1,2995 +1,2997 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Damjan Marion <dmarion@Freebsd.org>
  * Copyright (c) 2016 Rubicon Communications, LLC (Netgate)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * TI Common Platform Ethernet Switch (CPSW) Driver
  * Found in TI8148 "DaVinci" and AM335x "Sitara" SoCs.
  *
  * This controller is documented in the AM335x Technical Reference
  * Manual, in the TMS320DM814x DaVinci Digital Video Processors TRM
  * and in the TMS320C6452 3 Port Switch Ethernet Subsystem TRM.
  *
  * It is basically a single Ethernet port (port 0) wired internally to
  * a 3-port store-and-forward switch connected to two independent
  * "sliver" controllers (port 1 and port 2).  You can operate the
  * controller in a variety of different ways by suitably configuring
  * the slivers and the Address Lookup Engine (ALE) that routes packets
  * between the ports.
  *
  * This code was developed and tested on a BeagleBone with
  * an AM335x SoC.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_cpsw.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/rman.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <machine/stdarg.h>
 
 #include <net/ethernet.h>
 #include <net/bpf.h>
 #include <net/if.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 
 #include <arm/ti/ti_scm.h>
 #include <arm/ti/am335x/am335x_scm.h>
 
 #include <dev/mii/mii.h>
 #include <dev/mii/miivar.h>
 
 #include <dev/ofw/ofw_bus.h>
 #include <dev/ofw/ofw_bus_subr.h>
 
 #include <dev/fdt/fdt_common.h>
  
 #ifdef CPSW_ETHERSWITCH
 #include <dev/etherswitch/etherswitch.h>
 #include "etherswitch_if.h"
 #endif
 
 #include "if_cpswreg.h"
 #include "if_cpswvar.h"
 
 #include "miibus_if.h"
 
 /* Device probe/attach/detach. */
 static int cpsw_probe(device_t);
 static int cpsw_attach(device_t);
 static int cpsw_detach(device_t);
 static int cpswp_probe(device_t);
 static int cpswp_attach(device_t);
 static int cpswp_detach(device_t);
 
 static phandle_t cpsw_get_node(device_t, device_t);
 
 /* Device Init/shutdown. */
 static int cpsw_shutdown(device_t);
 static void cpswp_init(void *);
 static void cpswp_init_locked(void *);
 static void cpswp_stop_locked(struct cpswp_softc *);
 
 /* Device Suspend/Resume. */
 static int cpsw_suspend(device_t);
 static int cpsw_resume(device_t);
 
 /* Ioctl. */
 static int cpswp_ioctl(struct ifnet *, u_long command, caddr_t data);
 
 static int cpswp_miibus_readreg(device_t, int phy, int reg);
 static int cpswp_miibus_writereg(device_t, int phy, int reg, int value);
 static void cpswp_miibus_statchg(device_t);
 
 /* Send/Receive packets. */
 static void cpsw_intr_rx(void *arg);
 static struct mbuf *cpsw_rx_dequeue(struct cpsw_softc *);
 static void cpsw_rx_enqueue(struct cpsw_softc *);
 static void cpswp_start(struct ifnet *);
 static void cpsw_intr_tx(void *);
 static void cpswp_tx_enqueue(struct cpswp_softc *);
 static int cpsw_tx_dequeue(struct cpsw_softc *);
 
 /* Misc interrupts and watchdog. */
 static void cpsw_intr_rx_thresh(void *);
 static void cpsw_intr_misc(void *);
 static void cpswp_tick(void *);
 static void cpswp_ifmedia_sts(struct ifnet *, struct ifmediareq *);
 static int cpswp_ifmedia_upd(struct ifnet *);
 static void cpsw_tx_watchdog(void *);
 
 /* ALE support */
 static void cpsw_ale_read_entry(struct cpsw_softc *, uint16_t, uint32_t *);
 static void cpsw_ale_write_entry(struct cpsw_softc *, uint16_t, uint32_t *);
 static int cpsw_ale_mc_entry_set(struct cpsw_softc *, uint8_t, int, uint8_t *);
 static void cpsw_ale_dump_table(struct cpsw_softc *);
 static int cpsw_ale_update_vlan_table(struct cpsw_softc *, int, int, int, int,
 	int);
 static int cpswp_ale_update_addresses(struct cpswp_softc *, int);
 
 /* Statistics and sysctls. */
 static void cpsw_add_sysctls(struct cpsw_softc *);
 static void cpsw_stats_collect(struct cpsw_softc *);
 static int cpsw_stats_sysctl(SYSCTL_HANDLER_ARGS);
 
 #ifdef CPSW_ETHERSWITCH
 static etherswitch_info_t *cpsw_getinfo(device_t);
 static int cpsw_getport(device_t, etherswitch_port_t *);
 static int cpsw_setport(device_t, etherswitch_port_t *);
 static int cpsw_getconf(device_t, etherswitch_conf_t *);
 static int cpsw_getvgroup(device_t, etherswitch_vlangroup_t *);
 static int cpsw_setvgroup(device_t, etherswitch_vlangroup_t *);
 static int cpsw_readreg(device_t, int);
 static int cpsw_writereg(device_t, int, int);
 static int cpsw_readphy(device_t, int, int);
 static int cpsw_writephy(device_t, int, int, int);
 #endif
 
 /*
  * Arbitrary limit on number of segments in an mbuf to be transmitted.
  * Packets with more segments than this will be defragmented before
  * they are queued.
  */
 #define	CPSW_TXFRAGS		16
 
 /* Shared resources. */
 static device_method_t cpsw_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		cpsw_probe),
 	DEVMETHOD(device_attach,	cpsw_attach),
 	DEVMETHOD(device_detach,	cpsw_detach),
 	DEVMETHOD(device_shutdown,	cpsw_shutdown),
 	DEVMETHOD(device_suspend,	cpsw_suspend),
 	DEVMETHOD(device_resume,	cpsw_resume),
 	/* Bus interface */
 	DEVMETHOD(bus_add_child,	device_add_child_ordered),
 	/* OFW methods */
 	DEVMETHOD(ofw_bus_get_node,	cpsw_get_node),
 #ifdef CPSW_ETHERSWITCH
 	/* etherswitch interface */
 	DEVMETHOD(etherswitch_getinfo,	cpsw_getinfo),
 	DEVMETHOD(etherswitch_readreg,	cpsw_readreg),
 	DEVMETHOD(etherswitch_writereg,	cpsw_writereg),
 	DEVMETHOD(etherswitch_readphyreg,	cpsw_readphy),
 	DEVMETHOD(etherswitch_writephyreg,	cpsw_writephy),
 	DEVMETHOD(etherswitch_getport,	cpsw_getport),
 	DEVMETHOD(etherswitch_setport,	cpsw_setport),
 	DEVMETHOD(etherswitch_getvgroup,	cpsw_getvgroup),
 	DEVMETHOD(etherswitch_setvgroup,	cpsw_setvgroup),
 	DEVMETHOD(etherswitch_getconf,	cpsw_getconf),
 #endif
 	DEVMETHOD_END
 };
 
 static driver_t cpsw_driver = {
 	"cpswss",
 	cpsw_methods,
 	sizeof(struct cpsw_softc),
 };
 
 static devclass_t cpsw_devclass;
 
 DRIVER_MODULE(cpswss, simplebus, cpsw_driver, cpsw_devclass, 0, 0);
 
 /* Port/Slave resources. */
 static device_method_t cpswp_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe,		cpswp_probe),
 	DEVMETHOD(device_attach,	cpswp_attach),
 	DEVMETHOD(device_detach,	cpswp_detach),
 	/* MII interface */
 	DEVMETHOD(miibus_readreg,	cpswp_miibus_readreg),
 	DEVMETHOD(miibus_writereg,	cpswp_miibus_writereg),
 	DEVMETHOD(miibus_statchg,	cpswp_miibus_statchg),
 	DEVMETHOD_END
 };
 
 static driver_t cpswp_driver = {
 	"cpsw",
 	cpswp_methods,
 	sizeof(struct cpswp_softc),
 };
 
 static devclass_t cpswp_devclass;
 
 #ifdef CPSW_ETHERSWITCH
 DRIVER_MODULE(etherswitch, cpswss, etherswitch_driver, etherswitch_devclass, 0, 0);
 MODULE_DEPEND(cpswss, etherswitch, 1, 1, 1);
 #endif
 
 DRIVER_MODULE(cpsw, cpswss, cpswp_driver, cpswp_devclass, 0, 0);
 DRIVER_MODULE(miibus, cpsw, miibus_driver, miibus_devclass, 0, 0);
 MODULE_DEPEND(cpsw, ether, 1, 1, 1);
 MODULE_DEPEND(cpsw, miibus, 1, 1, 1);
 
 #ifdef CPSW_ETHERSWITCH
 static struct cpsw_vlangroups cpsw_vgroups[CPSW_VLANS];
 #endif
 
 static uint32_t slave_mdio_addr[] = { 0x4a100200, 0x4a100300 };
 
 static struct resource_spec irq_res_spec[] = {
 	{ SYS_RES_IRQ, 0, RF_ACTIVE | RF_SHAREABLE },
 	{ SYS_RES_IRQ, 1, RF_ACTIVE | RF_SHAREABLE },
 	{ SYS_RES_IRQ, 2, RF_ACTIVE | RF_SHAREABLE },
 	{ SYS_RES_IRQ, 3, RF_ACTIVE | RF_SHAREABLE },
 	{ -1, 0 }
 };
 
 static struct {
 	void (*cb)(void *);
 } cpsw_intr_cb[] = {
 	{ cpsw_intr_rx_thresh },
 	{ cpsw_intr_rx },
 	{ cpsw_intr_tx },
 	{ cpsw_intr_misc },
 };
 
 /* Number of entries here must match size of stats
  * array in struct cpswp_softc. */
 static struct cpsw_stat {
 	int	reg;
 	char *oid;
 } cpsw_stat_sysctls[CPSW_SYSCTL_COUNT] = {
 	{0x00, "GoodRxFrames"},
 	{0x04, "BroadcastRxFrames"},
 	{0x08, "MulticastRxFrames"},
 	{0x0C, "PauseRxFrames"},
 	{0x10, "RxCrcErrors"},
 	{0x14, "RxAlignErrors"},
 	{0x18, "OversizeRxFrames"},
 	{0x1c, "RxJabbers"},
 	{0x20, "ShortRxFrames"},
 	{0x24, "RxFragments"},
 	{0x30, "RxOctets"},
 	{0x34, "GoodTxFrames"},
 	{0x38, "BroadcastTxFrames"},
 	{0x3c, "MulticastTxFrames"},
 	{0x40, "PauseTxFrames"},
 	{0x44, "DeferredTxFrames"},
 	{0x48, "CollisionsTxFrames"},
 	{0x4c, "SingleCollisionTxFrames"},
 	{0x50, "MultipleCollisionTxFrames"},
 	{0x54, "ExcessiveCollisions"},
 	{0x58, "LateCollisions"},
 	{0x5c, "TxUnderrun"},
 	{0x60, "CarrierSenseErrors"},
 	{0x64, "TxOctets"},
 	{0x68, "RxTx64OctetFrames"},
 	{0x6c, "RxTx65to127OctetFrames"},
 	{0x70, "RxTx128to255OctetFrames"},
 	{0x74, "RxTx256to511OctetFrames"},
 	{0x78, "RxTx512to1024OctetFrames"},
 	{0x7c, "RxTx1024upOctetFrames"},
 	{0x80, "NetOctets"},
 	{0x84, "RxStartOfFrameOverruns"},
 	{0x88, "RxMiddleOfFrameOverruns"},
 	{0x8c, "RxDmaOverruns"}
 };
 
 /*
  * Basic debug support.
  */
 
 static void
 cpsw_debugf_head(const char *funcname)
 {
 	int t = (int)(time_second % (24 * 60 * 60));
 
 	printf("%02d:%02d:%02d %s ", t / (60 * 60), (t / 60) % 60, t % 60, funcname);
 }
 
 static void
 cpsw_debugf(const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
 	va_end(ap);
 	printf("\n");
 
 }
 
 #define	CPSW_DEBUGF(_sc, a) do {					\
 	if ((_sc)->debug) {						\
 		cpsw_debugf_head(__func__);				\
 		cpsw_debugf a;						\
 	}								\
 } while (0)
 
 /*
  * Locking macros
  */
 #define	CPSW_TX_LOCK(sc) do {						\
 		mtx_assert(&(sc)->rx.lock, MA_NOTOWNED);		\
 		mtx_lock(&(sc)->tx.lock);				\
 } while (0)
 
 #define	CPSW_TX_UNLOCK(sc)	mtx_unlock(&(sc)->tx.lock)
 #define	CPSW_TX_LOCK_ASSERT(sc)	mtx_assert(&(sc)->tx.lock, MA_OWNED)
 
 #define	CPSW_RX_LOCK(sc) do {						\
 		mtx_assert(&(sc)->tx.lock, MA_NOTOWNED);		\
 		mtx_lock(&(sc)->rx.lock);				\
 } while (0)
 
 #define	CPSW_RX_UNLOCK(sc)		mtx_unlock(&(sc)->rx.lock)
 #define	CPSW_RX_LOCK_ASSERT(sc)	mtx_assert(&(sc)->rx.lock, MA_OWNED)
 
 #define CPSW_PORT_LOCK(_sc) do {					\
 		mtx_assert(&(_sc)->lock, MA_NOTOWNED);			\
 		mtx_lock(&(_sc)->lock);					\
 } while (0)
 
 #define	CPSW_PORT_UNLOCK(_sc)	mtx_unlock(&(_sc)->lock)
 #define	CPSW_PORT_LOCK_ASSERT(_sc)	mtx_assert(&(_sc)->lock, MA_OWNED)
 
 /*
  * Read/Write macros
  */
 #define	cpsw_read_4(_sc, _reg)		bus_read_4((_sc)->mem_res, (_reg))
 #define	cpsw_write_4(_sc, _reg, _val)					\
 	bus_write_4((_sc)->mem_res, (_reg), (_val))
 
 #define	cpsw_cpdma_bd_offset(i)	(CPSW_CPPI_RAM_OFFSET + ((i)*16))
 
 #define	cpsw_cpdma_bd_paddr(sc, slot)					\
 	BUS_SPACE_PHYSADDR(sc->mem_res, slot->bd_offset)
 #define	cpsw_cpdma_read_bd(sc, slot, val)				\
 	bus_read_region_4(sc->mem_res, slot->bd_offset, (uint32_t *) val, 4)
 #define	cpsw_cpdma_write_bd(sc, slot, val)				\
 	bus_write_region_4(sc->mem_res, slot->bd_offset, (uint32_t *) val, 4)
 #define	cpsw_cpdma_write_bd_next(sc, slot, next_slot)			\
 	cpsw_write_4(sc, slot->bd_offset, cpsw_cpdma_bd_paddr(sc, next_slot))
 #define	cpsw_cpdma_write_bd_flags(sc, slot, val)			\
 	bus_write_2(sc->mem_res, slot->bd_offset + 14, val)
 #define	cpsw_cpdma_read_bd_flags(sc, slot)				\
 	bus_read_2(sc->mem_res, slot->bd_offset + 14)
 #define	cpsw_write_hdp_slot(sc, queue, slot)				\
 	cpsw_write_4(sc, (queue)->hdp_offset, cpsw_cpdma_bd_paddr(sc, slot))
 #define	CP_OFFSET (CPSW_CPDMA_TX_CP(0) - CPSW_CPDMA_TX_HDP(0))
 #define	cpsw_read_cp(sc, queue)						\
 	cpsw_read_4(sc, (queue)->hdp_offset + CP_OFFSET) 
 #define	cpsw_write_cp(sc, queue, val)					\
 	cpsw_write_4(sc, (queue)->hdp_offset + CP_OFFSET, (val))
 #define	cpsw_write_cp_slot(sc, queue, slot)				\
 	cpsw_write_cp(sc, queue, cpsw_cpdma_bd_paddr(sc, slot))
 
 #if 0
 /* XXX temporary function versions for debugging. */
 static void
 cpsw_write_hdp_slotX(struct cpsw_softc *sc, struct cpsw_queue *queue, struct cpsw_slot *slot)
 {
 	uint32_t reg = queue->hdp_offset;
 	uint32_t v = cpsw_cpdma_bd_paddr(sc, slot);
 	CPSW_DEBUGF(("HDP <=== 0x%08x (was 0x%08x)", v, cpsw_read_4(sc, reg)));
 	cpsw_write_4(sc, reg, v);
 }
 
 static void
 cpsw_write_cp_slotX(struct cpsw_softc *sc, struct cpsw_queue *queue, struct cpsw_slot *slot)
 {
 	uint32_t v = cpsw_cpdma_bd_paddr(sc, slot);
 	CPSW_DEBUGF(("CP <=== 0x%08x (expecting 0x%08x)", v, cpsw_read_cp(sc, queue)));
 	cpsw_write_cp(sc, queue, v);
 }
 #endif
 
 /*
  * Expanded dump routines for verbose debugging.
  */
 static void
 cpsw_dump_slot(struct cpsw_softc *sc, struct cpsw_slot *slot)
 {
 	static const char *flags[] = {"SOP", "EOP", "Owner", "EOQ",
 	    "TDownCmplt", "PassCRC", "Long", "Short", "MacCtl", "Overrun",
 	    "PktErr1", "PortEn/PktErr0", "RxVlanEncap", "Port2", "Port1",
 	    "Port0"};
 	struct cpsw_cpdma_bd bd;
 	const char *sep;
 	int i;
 
 	cpsw_cpdma_read_bd(sc, slot, &bd);
 	printf("BD Addr : 0x%08x   Next  : 0x%08x\n",
 	    cpsw_cpdma_bd_paddr(sc, slot), bd.next);
 	printf("  BufPtr: 0x%08x   BufLen: 0x%08x\n", bd.bufptr, bd.buflen);
 	printf("  BufOff: 0x%08x   PktLen: 0x%08x\n", bd.bufoff, bd.pktlen);
 	printf("  Flags: ");
 	sep = "";
 	for (i = 0; i < 16; ++i) {
 		if (bd.flags & (1 << (15 - i))) {
 			printf("%s%s", sep, flags[i]);
 			sep = ",";
 		}
 	}
 	printf("\n");
 	if (slot->mbuf) {
 		printf("  Ether:  %14D\n",
 		    (char *)(slot->mbuf->m_data), " ");
 		printf("  Packet: %16D\n",
 		    (char *)(slot->mbuf->m_data) + 14, " ");
 	}
 }
 
 #define	CPSW_DUMP_SLOT(cs, slot) do {				\
 	IF_DEBUG(sc) {						\
 		cpsw_dump_slot(sc, slot);			\
 	}							\
 } while (0)
 
 static void
 cpsw_dump_queue(struct cpsw_softc *sc, struct cpsw_slots *q)
 {
 	struct cpsw_slot *slot;
 	int i = 0;
 	int others = 0;
 
 	STAILQ_FOREACH(slot, q, next) {
 		if (i > CPSW_TXFRAGS)
 			++others;
 		else
 			cpsw_dump_slot(sc, slot);
 		++i;
 	}
 	if (others)
 		printf(" ... and %d more.\n", others);
 	printf("\n");
 }
 
 #define CPSW_DUMP_QUEUE(sc, q) do {				\
 	IF_DEBUG(sc) {						\
 		cpsw_dump_queue(sc, q);				\
 	}							\
 } while (0)
 
 static void
 cpsw_init_slots(struct cpsw_softc *sc)
 {
 	struct cpsw_slot *slot;
 	int i;
 
 	STAILQ_INIT(&sc->avail);
 
 	/* Put the slot descriptors onto the global avail list. */
 	for (i = 0; i < nitems(sc->_slots); i++) {
 		slot = &sc->_slots[i];
 		slot->bd_offset = cpsw_cpdma_bd_offset(i);
 		STAILQ_INSERT_TAIL(&sc->avail, slot, next);
 	}
 }
 
 static int
 cpsw_add_slots(struct cpsw_softc *sc, struct cpsw_queue *queue, int requested)
 {
 	const int max_slots = nitems(sc->_slots);
 	struct cpsw_slot *slot;
 	int i;
 
 	if (requested < 0)
 		requested = max_slots;
 
 	for (i = 0; i < requested; ++i) {
 		slot = STAILQ_FIRST(&sc->avail);
 		if (slot == NULL)
 			return (0);
 		if (bus_dmamap_create(sc->mbuf_dtag, 0, &slot->dmamap)) {
 			device_printf(sc->dev, "failed to create dmamap\n");
 			return (ENOMEM);
 		}
 		STAILQ_REMOVE_HEAD(&sc->avail, next);
 		STAILQ_INSERT_TAIL(&queue->avail, slot, next);
 		++queue->avail_queue_len;
 		++queue->queue_slots;
 	}
 	return (0);
 }
 
 static void
 cpsw_free_slot(struct cpsw_softc *sc, struct cpsw_slot *slot)
 {
 	int error;
 
 	if (slot->dmamap) {
 		if (slot->mbuf)
 			bus_dmamap_unload(sc->mbuf_dtag, slot->dmamap);
 		error = bus_dmamap_destroy(sc->mbuf_dtag, slot->dmamap);
 		KASSERT(error == 0, ("Mapping still active"));
 		slot->dmamap = NULL;
 	}
 	if (slot->mbuf) {
 		m_freem(slot->mbuf);
 		slot->mbuf = NULL;
 	}
 }
 
 static void
 cpsw_reset(struct cpsw_softc *sc)
 {
 	int i;
 
 	callout_stop(&sc->watchdog.callout);
 
 	/* Reset RMII/RGMII wrapper. */
 	cpsw_write_4(sc, CPSW_WR_SOFT_RESET, 1);
 	while (cpsw_read_4(sc, CPSW_WR_SOFT_RESET) & 1)
 		;
 
 	/* Disable TX and RX interrupts for all cores. */
 	for (i = 0; i < 3; ++i) {
 		cpsw_write_4(sc, CPSW_WR_C_RX_THRESH_EN(i), 0x00);
 		cpsw_write_4(sc, CPSW_WR_C_TX_EN(i), 0x00);
 		cpsw_write_4(sc, CPSW_WR_C_RX_EN(i), 0x00);
 		cpsw_write_4(sc, CPSW_WR_C_MISC_EN(i), 0x00);
 	}
 
 	/* Reset CPSW subsystem. */
 	cpsw_write_4(sc, CPSW_SS_SOFT_RESET, 1);
 	while (cpsw_read_4(sc, CPSW_SS_SOFT_RESET) & 1)
 		;
 
 	/* Reset Sliver port 1 and 2 */
 	for (i = 0; i < 2; i++) {
 		/* Reset */
 		cpsw_write_4(sc, CPSW_SL_SOFT_RESET(i), 1);
 		while (cpsw_read_4(sc, CPSW_SL_SOFT_RESET(i)) & 1)
 			;
 	}
 
 	/* Reset DMA controller. */
 	cpsw_write_4(sc, CPSW_CPDMA_SOFT_RESET, 1);
 	while (cpsw_read_4(sc, CPSW_CPDMA_SOFT_RESET) & 1)
 		;
 
 	/* Disable TX & RX DMA */
 	cpsw_write_4(sc, CPSW_CPDMA_TX_CONTROL, 0);
 	cpsw_write_4(sc, CPSW_CPDMA_RX_CONTROL, 0);
 
 	/* Clear all queues. */
 	for (i = 0; i < 8; i++) {
 		cpsw_write_4(sc, CPSW_CPDMA_TX_HDP(i), 0);
 		cpsw_write_4(sc, CPSW_CPDMA_RX_HDP(i), 0);
 		cpsw_write_4(sc, CPSW_CPDMA_TX_CP(i), 0);
 		cpsw_write_4(sc, CPSW_CPDMA_RX_CP(i), 0);
 	}
 
 	/* Clear all interrupt Masks */
 	cpsw_write_4(sc, CPSW_CPDMA_RX_INTMASK_CLEAR, 0xFFFFFFFF);
 	cpsw_write_4(sc, CPSW_CPDMA_TX_INTMASK_CLEAR, 0xFFFFFFFF);
 }
 
 static void
 cpsw_init(struct cpsw_softc *sc)
 {
 	struct cpsw_slot *slot;
 	uint32_t reg;
 
 	/* Disable the interrupt pacing. */
 	reg = cpsw_read_4(sc, CPSW_WR_INT_CONTROL);
 	reg &= ~(CPSW_WR_INT_PACE_EN | CPSW_WR_INT_PRESCALE_MASK);
 	cpsw_write_4(sc, CPSW_WR_INT_CONTROL, reg);
 
 	/* Clear ALE */
 	cpsw_write_4(sc, CPSW_ALE_CONTROL, CPSW_ALE_CTL_CLEAR_TBL);
 
 	/* Enable ALE */
 	reg = CPSW_ALE_CTL_ENABLE;
 	if (sc->dualemac)
 		reg |= CPSW_ALE_CTL_VLAN_AWARE;
 	cpsw_write_4(sc, CPSW_ALE_CONTROL, reg);
 
 	/* Set Host Port Mapping. */
 	cpsw_write_4(sc, CPSW_PORT_P0_CPDMA_TX_PRI_MAP, 0x76543210);
 	cpsw_write_4(sc, CPSW_PORT_P0_CPDMA_RX_CH_MAP, 0);
 
 	/* Initialize ALE: set host port to forwarding(3). */
 	cpsw_write_4(sc, CPSW_ALE_PORTCTL(0),
 	    ALE_PORTCTL_INGRESS | ALE_PORTCTL_FORWARD);
 
 	cpsw_write_4(sc, CPSW_SS_PTYPE, 0);
 
 	/* Enable statistics for ports 0, 1 and 2 */
 	cpsw_write_4(sc, CPSW_SS_STAT_PORT_EN, 7);
 
 	/* Turn off flow control. */
 	cpsw_write_4(sc, CPSW_SS_FLOW_CONTROL, 0);
 
 	/* Make IP hdr aligned with 4 */
 	cpsw_write_4(sc, CPSW_CPDMA_RX_BUFFER_OFFSET, 2);
 
 	/* Initialize RX Buffer Descriptors */
 	cpsw_write_4(sc, CPSW_CPDMA_RX_PENDTHRESH(0), 0);
 	cpsw_write_4(sc, CPSW_CPDMA_RX_FREEBUFFER(0), 0);
 
 	/* Enable TX & RX DMA */
 	cpsw_write_4(sc, CPSW_CPDMA_TX_CONTROL, 1);
 	cpsw_write_4(sc, CPSW_CPDMA_RX_CONTROL, 1);
 
 	/* Enable Interrupts for core 0 */
 	cpsw_write_4(sc, CPSW_WR_C_RX_THRESH_EN(0), 0xFF);
 	cpsw_write_4(sc, CPSW_WR_C_RX_EN(0), 0xFF);
 	cpsw_write_4(sc, CPSW_WR_C_TX_EN(0), 0xFF);
 	cpsw_write_4(sc, CPSW_WR_C_MISC_EN(0), 0x1F);
 
 	/* Enable host Error Interrupt */
 	cpsw_write_4(sc, CPSW_CPDMA_DMA_INTMASK_SET, 3);
 
 	/* Enable interrupts for RX and TX on Channel 0 */
 	cpsw_write_4(sc, CPSW_CPDMA_RX_INTMASK_SET,
 	    CPSW_CPDMA_RX_INT(0) | CPSW_CPDMA_RX_INT_THRESH(0));
 	cpsw_write_4(sc, CPSW_CPDMA_TX_INTMASK_SET, 1);
 
 	/* Initialze MDIO - ENABLE, PREAMBLE=0, FAULTENB, CLKDIV=0xFF */
 	/* TODO Calculate MDCLK=CLK/(CLKDIV+1) */
 	cpsw_write_4(sc, MDIOCONTROL, MDIOCTL_ENABLE | MDIOCTL_FAULTENB | 0xff);
 
 	/* Select MII in GMII_SEL, Internal Delay mode */
 	//ti_scm_reg_write_4(0x650, 0);
 
 	/* Initialize active queues. */
 	slot = STAILQ_FIRST(&sc->tx.active);
 	if (slot != NULL)
 		cpsw_write_hdp_slot(sc, &sc->tx, slot);
 	slot = STAILQ_FIRST(&sc->rx.active);
 	if (slot != NULL)
 		cpsw_write_hdp_slot(sc, &sc->rx, slot);
 	cpsw_rx_enqueue(sc);
 	cpsw_write_4(sc, CPSW_CPDMA_RX_FREEBUFFER(0), sc->rx.active_queue_len);
 	cpsw_write_4(sc, CPSW_CPDMA_RX_PENDTHRESH(0), CPSW_TXFRAGS);
 
 	/* Activate network interface. */
 	sc->rx.running = 1;
 	sc->tx.running = 1;
 	sc->watchdog.timer = 0;
 	callout_init(&sc->watchdog.callout, 0);
 	callout_reset(&sc->watchdog.callout, hz, cpsw_tx_watchdog, sc);
 }
 
 /*
  *
  * Device Probe, Attach, Detach.
  *
  */
 
 static int
 cpsw_probe(device_t dev)
 {
 
 	if (!ofw_bus_status_okay(dev))
 		return (ENXIO);
 
 	if (!ofw_bus_is_compatible(dev, "ti,cpsw"))
 		return (ENXIO);
 
 	device_set_desc(dev, "3-port Switch Ethernet Subsystem");
 	return (BUS_PROBE_DEFAULT);
 }
 
 static int
 cpsw_intr_attach(struct cpsw_softc *sc)
 {
 	int i;
 
 	for (i = 0; i < CPSW_INTR_COUNT; i++) {
 		if (bus_setup_intr(sc->dev, sc->irq_res[i],
 		    INTR_TYPE_NET | INTR_MPSAFE, NULL,
 		    cpsw_intr_cb[i].cb, sc, &sc->ih_cookie[i]) != 0) {
 			return (-1);
 		}
 	}
 
 	return (0);
 }
 
 static void
 cpsw_intr_detach(struct cpsw_softc *sc)
 {
 	int i;
 
 	for (i = 0; i < CPSW_INTR_COUNT; i++) {
 		if (sc->ih_cookie[i]) {
 			bus_teardown_intr(sc->dev, sc->irq_res[i],
 			    sc->ih_cookie[i]);
 		}
 	}
 }
 
 static int
 cpsw_get_fdt_data(struct cpsw_softc *sc, int port)
 {
 	char *name;
 	int len, phy, vlan;
 	pcell_t phy_id[3], vlan_id;
 	phandle_t child;
 	unsigned long mdio_child_addr;
 
 	/* Find any slave with phy-handle/phy_id */
 	phy = -1;
 	vlan = -1;
 	for (child = OF_child(sc->node); child != 0; child = OF_peer(child)) {
 		if (OF_getprop_alloc(child, "name", (void **)&name) < 0)
 			continue;
 		if (sscanf(name, "slave@%lx", &mdio_child_addr) != 1) {
 			OF_prop_free(name);
 			continue;
 		}
 		OF_prop_free(name);
-		if (mdio_child_addr != slave_mdio_addr[port])
+		/* Accept the unit address as the full slave address or its low 12 bits. */
+		if (mdio_child_addr != slave_mdio_addr[port] &&
+		    mdio_child_addr != (slave_mdio_addr[port] & 0xFFF))
 			continue;
 
 		if (fdt_get_phyaddr(child, NULL, &phy, NULL) != 0){
 			/* Users with old DTB will have phy_id instead */
 			phy = -1;
 			len = OF_getproplen(child, "phy_id");
 			if (len / sizeof(pcell_t) == 2) {
 				/* Get phy address from fdt */
 				if (OF_getencprop(child, "phy_id", phy_id, len) > 0)
 					phy = phy_id[1];
 			}
 		}
 
 		len = OF_getproplen(child, "dual_emac_res_vlan");
 		if (len / sizeof(pcell_t) == 1) {
 			/* Get phy address from fdt */
 			if (OF_getencprop(child, "dual_emac_res_vlan",
 			    &vlan_id, len) > 0) {
 				vlan = vlan_id;
 			}
 		}
 
 		break;
 	}
 	if (phy == -1)
 		return (ENXIO);
 	sc->port[port].phy = phy;
 	sc->port[port].vlan = vlan;
 
 	return (0);
 }
 
 static int
 cpsw_attach(device_t dev)
 {
 	int error, i;
 	struct cpsw_softc *sc;
 	uint32_t reg;
 
 	sc = device_get_softc(dev);
 	sc->dev = dev;
 	sc->node = ofw_bus_get_node(dev);
 	getbinuptime(&sc->attach_uptime);
 
 	if (OF_getencprop(sc->node, "active_slave", &sc->active_slave,
 	    sizeof(sc->active_slave)) <= 0) {
 		sc->active_slave = 0;
 	}
 	if (sc->active_slave > 1)
 		sc->active_slave = 1;
 
 	if (OF_hasprop(sc->node, "dual_emac"))
 		sc->dualemac = 1;
 
 	for (i = 0; i < CPSW_PORTS; i++) {
 		if (!sc->dualemac && i != sc->active_slave)
 			continue;
 		if (cpsw_get_fdt_data(sc, i) != 0) {
 			device_printf(dev,
 			    "failed to get PHY address from FDT\n");
 			return (ENXIO);
 		}
 	}
 
 	/* Initialize mutexes */
 	mtx_init(&sc->tx.lock, device_get_nameunit(dev),
 	    "cpsw TX lock", MTX_DEF);
 	mtx_init(&sc->rx.lock, device_get_nameunit(dev),
 	    "cpsw RX lock", MTX_DEF);
 
 	/* Allocate IRQ resources */
 	error = bus_alloc_resources(dev, irq_res_spec, sc->irq_res);
 	if (error) {
 		device_printf(dev, "could not allocate IRQ resources\n");
 		cpsw_detach(dev);
 		return (ENXIO);
 	}
 
 	sc->mem_rid = 0;
 	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY, 
 	    &sc->mem_rid, RF_ACTIVE);
 	if (sc->mem_res == NULL) {
 		device_printf(sc->dev, "failed to allocate memory resource\n");
 		cpsw_detach(dev);
 		return (ENXIO);
 	}
 
 	reg = cpsw_read_4(sc, CPSW_SS_IDVER);
 	device_printf(dev, "CPSW SS Version %d.%d (%d)\n", (reg >> 8 & 0x7),
 		reg & 0xFF, (reg >> 11) & 0x1F);
 
 	cpsw_add_sysctls(sc);
 
 	/* Allocate a busdma tag and DMA safe memory for mbufs. */
 	error = bus_dma_tag_create(
 		bus_get_dma_tag(sc->dev),	/* parent */
 		1, 0,				/* alignment, boundary */
 		BUS_SPACE_MAXADDR_32BIT,	/* lowaddr */
 		BUS_SPACE_MAXADDR,		/* highaddr */
 		NULL, NULL,			/* filtfunc, filtfuncarg */
 		MCLBYTES, CPSW_TXFRAGS,		/* maxsize, nsegments */
 		MCLBYTES, 0,			/* maxsegsz, flags */
 		NULL, NULL,			/* lockfunc, lockfuncarg */
 		&sc->mbuf_dtag);		/* dmatag */
 	if (error) {
 		device_printf(dev, "bus_dma_tag_create failed\n");
 		cpsw_detach(dev);
 		return (error);
 	}
 
 	/* Allocate a NULL buffer for padding. */
 	sc->nullpad = malloc(ETHER_MIN_LEN, M_DEVBUF, M_WAITOK | M_ZERO);
 
 	cpsw_init_slots(sc);
 
 	/* Allocate slots to TX and RX queues. */
 	STAILQ_INIT(&sc->rx.avail);
 	STAILQ_INIT(&sc->rx.active);
 	STAILQ_INIT(&sc->tx.avail);
 	STAILQ_INIT(&sc->tx.active);
 	// For now:  128 slots to TX, rest to RX.
 	// XXX TODO: start with 32/64 and grow dynamically based on demand.
 	if (cpsw_add_slots(sc, &sc->tx, 128) ||
 	    cpsw_add_slots(sc, &sc->rx, -1)) {
 		device_printf(dev, "failed to allocate dmamaps\n");
 		cpsw_detach(dev);
 		return (ENOMEM);
 	}
 	device_printf(dev, "Initial queue size TX=%d RX=%d\n",
 	    sc->tx.queue_slots, sc->rx.queue_slots);
 
 	sc->tx.hdp_offset = CPSW_CPDMA_TX_HDP(0);
 	sc->rx.hdp_offset = CPSW_CPDMA_RX_HDP(0);
 
 	if (cpsw_intr_attach(sc) == -1) {
 		device_printf(dev, "failed to setup interrupts\n");
 		cpsw_detach(dev);
 		return (ENXIO);
 	}
 
 #ifdef CPSW_ETHERSWITCH
 	for (i = 0; i < CPSW_VLANS; i++)
 		cpsw_vgroups[i].vid = -1;
 #endif
 
 	/* Reset the controller. */
 	cpsw_reset(sc);
 	cpsw_init(sc);
 
 	for (i = 0; i < CPSW_PORTS; i++) {
 		if (!sc->dualemac && i != sc->active_slave)
 			continue;
 		sc->port[i].dev = device_add_child(dev, "cpsw", i);
 		if (sc->port[i].dev == NULL) {
 			cpsw_detach(dev);
 			return (ENXIO);
 		}
 	}
 	bus_generic_probe(dev);
 	bus_generic_attach(dev);
 
 	return (0);
 }
 
 /*
  * Tear down the switch controller device.
  *
  * Detaches and deletes the port children, stops the watchdog callout,
  * releases interrupts, DMA slots/tag, the padding buffer, bus resources
  * and the RX/TX mutexes.  Besides being the detach method, this is also
  * what cpsw_attach() calls to unwind after a failure, so every release
  * below has to cope with a partially initialized softc.
  *
  * Returns the result of the final bus_generic_detach() /
  * device_delete_children() calls.
  */
 static int
 cpsw_detach(device_t dev)
 {
 	struct cpsw_softc *sc;
 	int error, i;
 
 	/*
 	 * NOTE(review): the return value of this first call is ignored and
 	 * the same call is repeated (and checked) at the end of the function,
 	 * after all resources have already been released -- confirm this is
 	 * intentional.
 	 */
 	bus_generic_detach(dev);
  	sc = device_get_softc(dev);
 
 	/* Remove the per-port child devices that were added in attach. */
 	for (i = 0; i < CPSW_PORTS; i++) {
 		if (sc->port[i].dev)
 			device_delete_child(dev, sc->port[i].dev);
 	}
 
 	/* The watchdog callout is only touched once the device is attached. */
 	if (device_is_attached(dev)) {
 		callout_stop(&sc->watchdog.callout);
 		callout_drain(&sc->watchdog.callout);
 	}
 
 	/* Stop and release all interrupts */
 	cpsw_intr_detach(sc);
 
 	/* Free dmamaps and mbufs */
 	for (i = 0; i < nitems(sc->_slots); ++i)
 		cpsw_free_slot(sc, &sc->_slots[i]);
 
 	/* Free null padding buffer. */
 	if (sc->nullpad)
 		free(sc->nullpad, M_DEVBUF);
 
 	/* Free DMA tag */
 	if (sc->mbuf_dtag) {
 		error = bus_dma_tag_destroy(sc->mbuf_dtag);
 		KASSERT(error == 0, ("Unable to destroy DMA tag"));
 	}
 
 	/* Free IO memory handler */
 	if (sc->mem_res != NULL)
 		bus_release_resource(dev, SYS_RES_MEMORY, sc->mem_rid, sc->mem_res);
 	bus_release_resources(dev, irq_res_spec, sc->irq_res);
 
 	/* Destroy mutexes */
 	mtx_destroy(&sc->rx.lock);
 	mtx_destroy(&sc->tx.lock);
 
 	/* Detach the switch device, if present. */
 	error = bus_generic_detach(dev);
 	if (error != 0)
 		return (error);
         
 	/* Delete whatever children are still hanging off the controller. */
 	return (device_delete_children(dev));
 }
 
 /*
  * Return the OFW node to use for a child device.  The child argument is
  * not consulted: the controller's own node is handed out instead.
  */
 static phandle_t
 cpsw_get_node(device_t bus, device_t dev)
 {
 	phandle_t node;
 
 	/* Port devices share the controller's node. */
 	node = ofw_bus_get_node(bus);
 
 	return (node);
 }
 
 /*
  * Probe a port device.  Only units 0 and 1 are accepted; any higher unit
  * number is rejected with ENXIO.
  */
 static int
 cpswp_probe(device_t dev)
 {
 	int unit;
 
 	unit = device_get_unit(dev);
 	if (unit <= 1) {
 		device_set_desc(dev, "Ethernet Switch Port");
 		return (BUS_PROBE_DEFAULT);
 	}
 
 	device_printf(dev, "Only two ports are supported.\n");
 	return (ENXIO);
 }
 
 /*
  * Attach one switch port.
  *
  * Copies the per-port configuration (PHY address, VLAN) out of the parent
  * controller softc, allocates and initializes the network interface,
  * reads the MAC address from the control module, attaches the PHY and
  * finally publishes the interface with ether_ifattach().
  *
  * Returns 0 on success, ENXIO if the interface cannot be allocated, or
  * the error reported by mii_attach().
  */
 static int
 cpswp_attach(device_t dev)
 {
 	int error;
 	struct ifnet *ifp;
 	struct cpswp_softc *sc;
 	uint32_t reg;
 	uint8_t mac_addr[ETHER_ADDR_LEN];
 
 	sc = device_get_softc(dev);
 	sc->dev = dev;
 	sc->pdev = device_get_parent(dev);
 	sc->swsc = device_get_softc(sc->pdev);
 	sc->unit = device_get_unit(dev);
 	sc->phy = sc->swsc->port[sc->unit].phy;
 	sc->vlan = sc->swsc->port[sc->unit].vlan;
 	/* In dual-EMAC mode a port without a configured VLAN gets unit + 1. */
 	if (sc->swsc->dualemac && sc->vlan == -1)
 		sc->vlan = sc->unit + 1;
 
 	/* Each port has its own MDIO user access/select register pair. */
 	if (sc->unit == 0) {
 		sc->physel = MDIOUSERPHYSEL0;
 		sc->phyaccess = MDIOUSERACCESS0;
 	} else {
 		sc->physel = MDIOUSERPHYSEL1;
 		sc->phyaccess = MDIOUSERACCESS1;
 	}
 
 	mtx_init(&sc->lock, device_get_nameunit(dev), "cpsw port lock",
 	    MTX_DEF);
 
 	/*
 	 * Initialize the MII tick callout before the interface can become
 	 * visible: once ether_ifattach() has run, an ioctl may bring the port
 	 * up and cpswp_init_locked() will arm this callout.
 	 */
 	callout_init(&sc->mii_callout, 0);
 
 	/* Allocate network interface */
 	ifp = sc->ifp = if_alloc(IFT_ETHER);
 	if (ifp == NULL) {
 		/*
 		 * Unwind by hand instead of going through cpswp_detach(),
 		 * which would hand the NULL ifp to if_free().  Only the
 		 * port lock has been set up at this point.
 		 */
 		mtx_destroy(&sc->lock);
 		return (ENXIO);
 	}
 
 	if_initname(ifp, device_get_name(sc->dev), sc->unit);
 	ifp->if_softc = sc;
 	ifp->if_flags = IFF_SIMPLEX | IFF_MULTICAST | IFF_BROADCAST;
 	ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_HWCSUM; //FIXME VLAN?
 	ifp->if_capenable = ifp->if_capabilities;
 
 	ifp->if_init = cpswp_init;
 	ifp->if_start = cpswp_start;
 	ifp->if_ioctl = cpswp_ioctl;
 
 	/* Size the software send queue to match the TX descriptor slots. */
 	ifp->if_snd.ifq_drv_maxlen = sc->swsc->tx.queue_slots;
 	IFQ_SET_MAXLEN(&ifp->if_snd, ifp->if_snd.ifq_drv_maxlen);
 	IFQ_SET_READY(&ifp->if_snd);
 
 	/* Get high part of MAC address from control module (mac_id[0|1]_hi) */
 	ti_scm_reg_read_4(SCM_MAC_ID0_HI + sc->unit * 8, &reg);
 	mac_addr[0] = reg & 0xFF;
 	mac_addr[1] = (reg >>  8) & 0xFF;
 	mac_addr[2] = (reg >> 16) & 0xFF;
 	mac_addr[3] = (reg >> 24) & 0xFF;
 
 	/* Get low part of MAC address from control module (mac_id[0|1]_lo) */
 	ti_scm_reg_read_4(SCM_MAC_ID0_LO + sc->unit * 8, &reg);
 	mac_addr[4] = reg & 0xFF;
 	mac_addr[5] = (reg >>  8) & 0xFF;
 
 	error = mii_attach(dev, &sc->miibus, ifp, cpswp_ifmedia_upd,
 	    cpswp_ifmedia_sts, BMSR_DEFCAPMASK, sc->phy, MII_OFFSET_ANY, 0);
 	if (error) {
 		device_printf(dev, "attaching PHYs failed\n");
 		cpswp_detach(dev);
 		return (error);
 	}
 	sc->mii = device_get_softc(sc->miibus);
 
 	/* Select PHY and enable interrupts */
 	cpsw_write_4(sc->swsc, sc->physel,
 	    MDIO_PHYSEL_LINKINTENB | (sc->phy & 0x1F));
 
 	ether_ifattach(sc->ifp, mac_addr);
 
 	return (0);
 }
 
 /*
  * Detach one switch port.
  *
  * When the device is fully attached the interface is first removed from
  * the network stack, the port is stopped under the port lock and the MII
  * tick callout is drained.  The interface and the port lock are then
  * released.  This routine is also used by cpswp_attach() to unwind after
  * a failure, so it must tolerate a softc whose interface was never
  * allocated.
  *
  * Always returns 0.
  */
 static int
 cpswp_detach(device_t dev)
 {
 	struct cpswp_softc *sc;
 
 	sc = device_get_softc(dev);
 	CPSW_DEBUGF(sc->swsc, (""));
 	if (device_is_attached(dev)) {
 		ether_ifdetach(sc->ifp);
 		CPSW_PORT_LOCK(sc);
 		cpswp_stop_locked(sc);
 		CPSW_PORT_UNLOCK(sc);
 		callout_drain(&sc->mii_callout);
 	}
 
 	bus_generic_detach(dev);
 
 	/*
 	 * The attach error path gets here with sc->ifp == NULL when
 	 * if_alloc() failed; if_free() must not be handed a NULL ifnet.
 	 */
 	if (sc->ifp != NULL)
 		if_free(sc->ifp);
 	mtx_destroy(&sc->lock);
 
 	return (0);
 }
 
 /*
  *
  * Init/Shutdown.
  *
  */
 
 /*
  * Report whether the shared controller may be considered idle.
  *
  * Outside dual-EMAC mode this is always true (1).  In dual-EMAC mode it
  * is true only when neither port's interface has IFF_UP set.
  */
 static int
 cpsw_ports_down(struct cpsw_softc *sc)
 {
 	struct cpswp_softc *port0, *port1;
 	int flags;
 
 	if (!sc->dualemac)
 		return (1);
 
 	port0 = device_get_softc(sc->port[0].dev);
 	port1 = device_get_softc(sc->port[1].dev);
 
 	/* IFF_UP is clear in the union only if it is clear on both ports. */
 	flags = port0->ifp->if_flags | port1->ifp->if_flags;
 
 	return ((flags & IFF_UP) == 0 ? 1 : 0);
 }
 
 /*
  * if_init entry point: take the port lock and run the locked
  * initialization routine.  'arg' is the port softc.
  */
 static void
 cpswp_init(void *arg)
 {
 	struct cpswp_softc *sc;
 
 	sc = arg;
 	CPSW_DEBUGF(sc->swsc, (""));
 
 	CPSW_PORT_LOCK(sc);
 	cpswp_init_locked(sc);
 	CPSW_PORT_UNLOCK(sc);
 }
 
 /*
  * Bring a port up.  Must be called with the port lock held; 'arg' is the
  * port softc.
  *
  * Does nothing if the interface is already marked IFF_DRV_RUNNING.
  * Otherwise it resets/re-initializes the shared controller if neither DMA
  * direction is running, programs the per-port priority maps, RX maximum
  * length, MAC control and ALE port state, sets up the port VLAN in
  * dual-EMAC mode, kicks the media selection, arms the MII tick callout
  * and finally marks the interface running.
  */
 static void
 cpswp_init_locked(void *arg)
 {
 #ifdef CPSW_ETHERSWITCH
 	int i;
 #endif
 	struct cpswp_softc *sc = arg;
 	struct ifnet *ifp;
 	uint32_t reg;
 
 	CPSW_DEBUGF(sc->swsc, (""));
 	CPSW_PORT_LOCK_ASSERT(sc);
 	ifp = sc->ifp;
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0)
 		return;
 
 	/* Remember when this port was (re)initialized. */
 	getbinuptime(&sc->init_uptime);
 
 	/* Only reset the shared controller when both DMA directions are idle. */
 	if (!sc->swsc->rx.running && !sc->swsc->tx.running) {
 		/* Reset the controller. */
 		cpsw_reset(sc->swsc);
 		cpsw_init(sc->swsc);
 	}
 
 	/* Set Slave Mapping. */
 	cpsw_write_4(sc->swsc, CPSW_SL_RX_PRI_MAP(sc->unit), 0x76543210);
 	cpsw_write_4(sc->swsc, CPSW_PORT_P_TX_PRI_MAP(sc->unit + 1),
 	    0x33221100);
 	/* 0x5f2 == 1522 */
 	cpsw_write_4(sc->swsc, CPSW_SL_RX_MAXLEN(sc->unit), 0x5f2);
 	/* Enable MAC RX/TX modules. */
 	/* TODO: Docs claim that IFCTL_B and IFCTL_A do the same thing? */
 	/* Huh?  Docs call bit 0 "Loopback" some places, "FullDuplex" others. */
 	reg = cpsw_read_4(sc->swsc, CPSW_SL_MACCONTROL(sc->unit));
 	reg |= CPSW_SL_MACTL_GMII_ENABLE;
 	cpsw_write_4(sc->swsc, CPSW_SL_MACCONTROL(sc->unit), reg);
 
 	/* Initialize ALE: set port to forwarding, initialize addrs */
 	/* Slave port N is addressed as N + 1 here; index 0 is the host port. */
 	cpsw_write_4(sc->swsc, CPSW_ALE_PORTCTL(sc->unit + 1),
 	    ALE_PORTCTL_INGRESS | ALE_PORTCTL_FORWARD);
 	cpswp_ale_update_addresses(sc, 1);
 
 	if (sc->swsc->dualemac) {
 		/* Set Port VID. */
 		cpsw_write_4(sc->swsc, CPSW_PORT_P_VLAN(sc->unit + 1),
 		    sc->vlan & 0xfff);
 		/* Each bitmask below names this slave port plus bit 0. */
 		cpsw_ale_update_vlan_table(sc->swsc, sc->vlan,
 		    (1 << (sc->unit + 1)) | (1 << 0), /* Member list */
 		    (1 << (sc->unit + 1)) | (1 << 0), /* Untagged egress */
 		    (1 << (sc->unit + 1)) | (1 << 0), 0); /* mcast reg flood */
 #ifdef CPSW_ETHERSWITCH
 		/* Record the port VLAN in the first unused vgroup entry. */
 		for (i = 0; i < CPSW_VLANS; i++) {
 			if (cpsw_vgroups[i].vid != -1)
 				continue;
 			cpsw_vgroups[i].vid = sc->vlan;
 			break;
 		}
 #endif
 	}
 
 	mii_mediachg(sc->mii);
 	/* First MII tick fires after hz ticks (one second). */
 	callout_reset(&sc->mii_callout, hz, cpswp_tick, sc);
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 }
 
 /*
  * Shutdown method: stop every port that has a device.  Always returns 0.
  */
 static int
 cpsw_shutdown(device_t dev)
 {
 	struct cpsw_softc *sc = device_get_softc(dev);
 	struct cpswp_softc *psc;
 	int port;
 
 	CPSW_DEBUGF(sc, (""));
 	for (port = 0; port < CPSW_PORTS; port++) {
 		/* Outside dual-EMAC mode only the active slave is handled. */
 		if (sc->dualemac || port == sc->active_slave) {
 			psc = device_get_softc(sc->port[port].dev);
 			CPSW_PORT_LOCK(psc);
 			cpswp_stop_locked(psc);
 			CPSW_PORT_UNLOCK(psc);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Request an RX channel teardown and wait for the receiver to report that
  * it is no longer running.  The wait polls rx.running with 200us delays
  * and gives up (with a console message) after ten of them.
  */
 static void
 cpsw_rx_teardown(struct cpsw_softc *sc)
 {
 	int polls;
 
 	/* Flag the teardown and issue it to the hardware under the RX lock. */
 	CPSW_RX_LOCK(sc);
 	CPSW_DEBUGF(sc, ("starting RX teardown"));
 	sc->rx.teardown = 1;
 	cpsw_write_4(sc, CPSW_CPDMA_RX_TEARDOWN, 0);
 	CPSW_RX_UNLOCK(sc);
 
 	/* rx.running is cleared elsewhere once the teardown is observed. */
 	for (polls = 0; sc->rx.running; ) {
 		if (++polls > 10) {
 			device_printf(sc->dev,
 			    "Unable to cleanly shutdown receiver\n");
 			return;
 		}
 		DELAY(200);
 	}
 	if (!sc->rx.running)
 		CPSW_DEBUGF(sc, ("finished RX teardown (%d retries)", polls));
 }
 
 /*
  * Tear down the TX channel.  Runs entirely under the TX lock: a hardware
  * teardown is issued only if descriptors are still queued, otherwise the
  * teardown flag is set directly.  Completed descriptors are then reaped
  * until the transmitter stops or the retry budget is used up.
  */
 static void
 cpsw_tx_teardown(struct cpsw_softc *sc)
 {
 	int tries = 0;
 
 	CPSW_TX_LOCK(sc);
 	CPSW_DEBUGF(sc, ("starting TX teardown"));
 	/* Start the TX queue teardown if queue is not empty. */
 	if (STAILQ_EMPTY(&sc->tx.active))
 		sc->tx.teardown = 1;
 	else
 		cpsw_write_4(sc, CPSW_CPDMA_TX_TEARDOWN, 0);
 
 	cpsw_tx_dequeue(sc);
 	for (;;) {
 		/* The counter only advances while the transmitter still runs. */
 		if (!sc->tx.running || ++tries >= 10)
 			break;
 		DELAY(200);
 		cpsw_tx_dequeue(sc);
 	}
 	if (sc->tx.running) {
 		device_printf(sc->dev,
 		    "Unable to cleanly shutdown transmitter\n");
 	}
 	CPSW_DEBUGF(sc,
 	    ("finished TX teardown (%d retries, %d idle buffers)", tries,
 	     sc->tx.active_queue_len));
 	CPSW_TX_UNLOCK(sc);
 }
 
 /*
  * Stop a port.  Must be called with the port lock held.
  *
  * A port that is not marked running is left alone.  Otherwise the
  * interface is marked down, the MII tick is stopped and the port's MAC is
  * disabled.  The shared DMA queues are torn down, and the controller is
  * reset, only when cpsw_ports_down() reports that no port needs them.
  */
 static void
 cpswp_stop_locked(struct cpswp_softc *sc)
 {
 	struct cpsw_softc *swsc;
 	struct ifnet *ifp;
 	uint32_t macctl;
 
 	ifp = sc->ifp;
 	swsc = sc->swsc;
 	CPSW_DEBUGF(swsc, (""));
 	CPSW_PORT_LOCK_ASSERT(sc);
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 		return;
 
 	/* Mark the interface as stopped and busy. */
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 	ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 
 	/* No more periodic MII polling. */
 	callout_stop(&sc->mii_callout);
 
 	/* Shut down the shared RX/TX queues if nobody else is using them. */
 	if (cpsw_ports_down(swsc)) {
 		cpsw_rx_teardown(swsc);
 		cpsw_tx_teardown(swsc);
 	}
 
 	/* Turn off this port's MAC RX/TX modules. */
 	macctl = cpsw_read_4(swsc, CPSW_SL_MACCONTROL(sc->unit));
 	macctl &= ~CPSW_SL_MACTL_GMII_ENABLE;
 	cpsw_write_4(swsc, CPSW_SL_MACCONTROL(sc->unit), macctl);
 
 	if (cpsw_ports_down(swsc)) {
 		/* Save the statistics first; the reset would lose them. */
 		cpsw_stats_collect(swsc);
 
 		cpsw_reset(swsc);
 		cpsw_init(swsc);
 	}
 }
 
 /*
  *  Suspend/Resume.
  */
 
 /*
  * Suspend method: stop each port that has a device, under that port's
  * lock.  Always returns 0.
  */
 static int
 cpsw_suspend(device_t dev)
 {
 	struct cpsw_softc *sc = device_get_softc(dev);
 	struct cpswp_softc *psc;
 	int port;
 
 	CPSW_DEBUGF(sc, (""));
 	for (port = 0; port < CPSW_PORTS; port++) {
 		/* Without dual-EMAC only the active slave is considered. */
 		if (sc->dualemac || port == sc->active_slave) {
 			psc = device_get_softc(sc->port[port].dev);
 			CPSW_PORT_LOCK(psc);
 			cpswp_stop_locked(psc);
 			CPSW_PORT_UNLOCK(psc);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Resume method.  Nothing is restored here; the call is only traced when
  * debugging is enabled.  Always returns 0.
  */
 static int
 cpsw_resume(device_t dev)
 {
 	struct cpsw_softc *sc = device_get_softc(dev);
 
 	CPSW_DEBUGF(sc, ("UNIMPLEMENTED"));
 
 	return (0);
 }
 
 /*
  *
  *  IOCTL
  *
  */
 
 /*
  * Turn promiscuous reception on (set != 0) or off (set == 0) by toggling
  * the ALE bypass bit in the ALE control register.
  */
 static void
 cpsw_set_promisc(struct cpswp_softc *sc, int set)
 {
 	uint32_t ale_ctl;
 
 	/*
 	 * Promiscuous mode is implemented with ALE_BYPASS.  With bypass
 	 * enabled the ALE forwarding logic is disabled and every received
 	 * packet goes only to the host port, while packets transmitted by
 	 * the host port are still processed as in normal mode.
 	 */
 	ale_ctl = cpsw_read_4(sc->swsc, CPSW_ALE_CONTROL);
 	if (set)
 		ale_ctl |= CPSW_ALE_CTL_BYPASS;
 	else
 		ale_ctl &= ~CPSW_ALE_CTL_BYPASS;
 	cpsw_write_4(sc->swsc, CPSW_ALE_CONTROL, ale_ctl);
 }
 
 /*
  * All-multicast mode is not implemented: a request to enable it only
  * produces a console message, and a request to disable it is a no-op.
  */
 static void
 cpsw_set_allmulti(struct cpswp_softc *sc, int set)
 {
 	if (!set)
 		return;
 
 	printf("All-multicast mode unimplemented\n");
 }
 
 /*
  * Interface ioctl handler for a port.
  *
  * Handles capability changes (hardware checksum only), interface flag
  * changes (start/stop, promiscuous, all-multicast), multicast list
  * updates and media requests; everything else goes to ether_ioctl().
  * Returns 0 or the error from the media/ethernet helpers.
  */
 static int
 cpswp_ioctl(struct ifnet *ifp, u_long command, caddr_t data)
 {
 	struct cpswp_softc *sc = ifp->if_softc;
 	struct ifreq *ifr = (struct ifreq *)data;
 	uint32_t changed;
 	int error = 0;
 
 	switch (command) {
 	case SIOCSIFCAP:
 		/* Only the hardware checksum capability can be toggled. */
 		changed = ifp->if_capenable ^ ifr->ifr_reqcap;
 		if ((changed & IFCAP_HWCSUM) != 0) {
 			if (((ifr->ifr_reqcap & changed) & IFCAP_HWCSUM) != 0)
 				ifp->if_capenable |= IFCAP_HWCSUM;
 			else
 				ifp->if_capenable &= ~IFCAP_HWCSUM;
 		}
 		break;
 	case SIOCSIFFLAGS:
 		CPSW_PORT_LOCK(sc);
 		if ((ifp->if_flags & IFF_UP) == 0) {
 			/* Administratively down: stop the port if it runs. */
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 				CPSW_DEBUGF(sc->swsc,
 				    ("SIOCSIFFLAGS: shutting down"));
 				cpswp_stop_locked(sc);
 			}
 		} else if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
 			/* Up but not yet running: start the port. */
 			CPSW_DEBUGF(sc->swsc, ("SIOCSIFFLAGS: starting up"));
 			cpswp_init_locked(sc);
 		} else {
 			/* Up and running: apply whichever flags changed. */
 			changed = ifp->if_flags ^ sc->if_flags;
 			CPSW_DEBUGF(sc->swsc,
 			    ("SIOCSIFFLAGS: UP & RUNNING (changed=0x%x)",
 			    changed));
 			if (changed & IFF_PROMISC)
 				cpsw_set_promisc(sc,
 				    ifp->if_flags & IFF_PROMISC);
 			if (changed & IFF_ALLMULTI)
 				cpsw_set_allmulti(sc,
 				    ifp->if_flags & IFF_ALLMULTI);
 		}
 
 		/* Remember the flags so the next call can diff against them. */
 		sc->if_flags = ifp->if_flags;
 		CPSW_PORT_UNLOCK(sc);
 		break;
 	case SIOCADDMULTI:
 		cpswp_ale_update_addresses(sc, 0);
 		break;
 	case SIOCDELMULTI:
 		/*
 		 * DELMULTI does not say which address went away, so the
 		 * whole address list is removed and rebuilt.
 		 */
 		cpswp_ale_update_addresses(sc, 1);
 		break;
 	case SIOCGIFMEDIA:
 	case SIOCSIFMEDIA:
 		error = ifmedia_ioctl(ifp, ifr, &sc->mii->mii_media, command);
 		break;
 	default:
 		error = ether_ioctl(ifp, command, data);
 		break;
 	}
 
 	return (error);
 }
 
 /*
  *
  * MIIBUS
  *
  */
 /*
  * Poll the given MDIO access register until its GO bit is clear.
  *
  * Returns 1 as soon as the bit reads clear, or 0 once the retry budget
  * (CPSW_MIIBUS_RETRIES - 1 polls, CPSW_MIIBUS_DELAY apart) is exhausted.
  */
 static int
 cpswp_miibus_ready(struct cpsw_softc *sc, uint32_t reg)
 {
 	uint32_t left, val;
 
 	for (left = (CPSW_MIIBUS_RETRIES) - 1; left != 0; left--) {
 		val = cpsw_read_4(sc, reg);
 		if ((val & MDIO_PHYACCESS_GO) == 0)
 			return (1);
 		DELAY(CPSW_MIIBUS_DELAY);
 	}
 
 	return (0);
 }
 
 /*
  * MII bus read method: read register 'reg' of PHY 'phy' through this
  * port's MDIO user access register.
  *
  * Returns the low 16 bits read back, or 0 when the MDIO interface is
  * busy, times out, or the PHY does not acknowledge the access.
  */
 static int
 cpswp_miibus_readreg(device_t dev, int phy, int reg)
 {
 	struct cpswp_softc *sc = device_get_softc(dev);
 	uint32_t cmd, result;
 
 	/* A previous access must have finished before a new one starts. */
 	if (!cpswp_miibus_ready(sc->swsc, sc->phyaccess)) {
 		device_printf(dev, "MDIO not ready to read\n");
 		return (0);
 	}
 
 	/* Compose the command: GO, register number, PHY address. */
 	cmd = MDIO_PHYACCESS_GO | (reg & 0x1F) << 21 | (phy & 0x1F) << 16;
 	cpsw_write_4(sc->swsc, sc->phyaccess, cmd);
 
 	if (!cpswp_miibus_ready(sc->swsc, sc->phyaccess)) {
 		device_printf(dev, "MDIO timed out during read\n");
 		return (0);
 	}
 
 	result = cpsw_read_4(sc->swsc, sc->phyaccess);
 	if ((result & MDIO_PHYACCESS_ACK) != 0)
 		return (result & 0xFFFF);
 
 	device_printf(dev, "Failed to read from PHY.\n");
 	return (0);
 }
 
 /*
  * MII bus write method: write 'value' to register 'reg' of PHY 'phy'
  * through this port's MDIO user access register.
  *
  * Failures are reported on the console only; the return value is 0 in
  * every case.
  */
 static int
 cpswp_miibus_writereg(device_t dev, int phy, int reg, int value)
 {
 	struct cpswp_softc *sc = device_get_softc(dev);
 	uint32_t cmd;
 
 	/* Do not start a write while an earlier access is still pending. */
 	if (!cpswp_miibus_ready(sc->swsc, sc->phyaccess)) {
 		device_printf(dev, "MDIO not ready to write\n");
 		return (0);
 	}
 
 	/* Compose the command: GO, WRITE, register, PHY and the data. */
 	cmd = MDIO_PHYACCESS_GO | MDIO_PHYACCESS_WRITE |
 	    (reg & 0x1F) << 21 | (phy & 0x1F) << 16 | (value & 0xFFFF);
 	cpsw_write_4(sc->swsc, sc->phyaccess, cmd);
 
 	/* Wait for completion; a timeout is logged but not propagated. */
 	if (!cpswp_miibus_ready(sc->swsc, sc->phyaccess))
 		device_printf(dev, "MDIO timed out during write\n");
 
 	return (0);
 }
 
 /*
  * MII status change method: reprogram this port's MAC control register
  * so that its speed and duplex bits match the currently active media.
  */
 static void
 cpswp_miibus_statchg(device_t dev)
 {
 	struct cpswp_softc *sc = device_get_softc(dev);
 	uint32_t ctl, ctl_reg;
 
 	CPSW_DEBUGF(sc->swsc, (""));
 
 	/* Start from the current value with all speed/duplex bits cleared. */
 	ctl_reg = CPSW_SL_MACCONTROL(sc->unit);
 	ctl = cpsw_read_4(sc->swsc, ctl_reg);
 	ctl &= ~(CPSW_SL_MACTL_GIG | CPSW_SL_MACTL_IFCTL_A |
 	    CPSW_SL_MACTL_IFCTL_B | CPSW_SL_MACTL_FULLDUPLEX);
 
 	/* Any other media subtype leaves the speed bits clear. */
 	switch (IFM_SUBTYPE(sc->mii->mii_media_active)) {
 	case IFM_100_TX:
 		ctl |= CPSW_SL_MACTL_IFCTL_A;
 		break;
 
 	case IFM_1000_T:
 	case IFM_1000_CX:
 	case IFM_1000_LX:
 	case IFM_1000_SX:
 		ctl |= CPSW_SL_MACTL_GIG;
 		break;
 	}
 
 	if ((sc->mii->mii_media_active & IFM_FDX) != 0)
 		ctl |= CPSW_SL_MACTL_FULLDUPLEX;
 
 	cpsw_write_4(sc->swsc, ctl_reg, ctl);
 }
 
 /*
  *
  * Transmit/Receive Packets.
  *
  */
 /*
  * RX interrupt handler.  'arg' is the controller softc.
  *
  * Under the RX lock it finishes a pending teardown, pulls completed
  * packets off the hardware queue, refills the queue and acknowledges the
  * interrupt.  The packets are handed to the stack after the lock has been
  * dropped.
  */
 static void
 cpsw_intr_rx(void *arg)
 {
 	struct cpsw_softc *sc = arg;
 	struct mbuf *pkt, *next_pkt;
 	struct ifnet *ifp;
 
 	CPSW_RX_LOCK(sc);
 	if (sc->rx.teardown) {
 		/* A teardown was requested: mark RX stopped and ack it. */
 		sc->rx.running = 0;
 		sc->rx.teardown = 0;
 		cpsw_write_cp(sc, &sc->rx, 0xfffffffc);
 	}
 	pkt = cpsw_rx_dequeue(sc);
 	cpsw_rx_enqueue(sc);
 	cpsw_write_4(sc, CPSW_CPDMA_CPDMA_EOI_VECTOR, 1);
 	CPSW_RX_UNLOCK(sc);
 
 	/* Deliver each packet of the m_nextpkt-linked list on its own. */
 	for (; pkt != NULL; pkt = next_pkt) {
 		next_pkt = pkt->m_nextpkt;
 		pkt->m_nextpkt = NULL;
 		ifp = pkt->m_pkthdr.rcvif;
 		(*ifp->if_input)(ifp, pkt);
 		if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	}
 }
 
 /*
  * Pull completed receive buffers off the hardware RX queue.
  *
  * Walks the active slot list from the head until it meets a descriptor
  * still owned by hardware, moving each finished slot to the avail list
  * and detaching its mbuf.  The mbufs are assembled into a list that is
  * returned to the caller: a buffer flagged start-of-packet begins a new
  * packet linked through m_nextpkt, any other buffer is appended through
  * m_next.  Returns the head of that list, or NULL if nothing completed.
  *
  * The callers in this file invoke it with the RX lock held.
  */
 static struct mbuf *
 cpsw_rx_dequeue(struct cpsw_softc *sc)
 {
 	int nsegs, port, removed;
 	struct cpsw_cpdma_bd bd;
 	struct cpsw_slot *last, *slot;
 	struct cpswp_softc *psc;
 	struct mbuf *m, *m0, *mb_head, *mb_tail;
 	uint16_t m0_flags;
 
 	nsegs = 0;
 	m0 = NULL;
 	last = NULL;
 	mb_head = NULL;
 	mb_tail = NULL;
 	removed = 0;
 
 	/* Pull completed packets off hardware RX queue. */
 	while ((slot = STAILQ_FIRST(&sc->rx.active)) != NULL) {
 		cpsw_cpdma_read_bd(sc, slot, &bd);
 
 		/*
 		 * Stop on packets still in use by hardware, but do not stop
 		 * on packets with the teardown complete flag, they will be
 		 * discarded later.
 		 */
 		if ((bd.flags & (CPDMA_BD_OWNER | CPDMA_BD_TDOWNCMPLT)) ==
 		    CPDMA_BD_OWNER)
 			break;
 
 		/* The slot is finished: recycle it onto the avail list. */
 		last = slot;
 		++removed;
 		STAILQ_REMOVE_HEAD(&sc->rx.active, next);
 		STAILQ_INSERT_TAIL(&sc->rx.avail, slot, next);
 
 		bus_dmamap_sync(sc->mbuf_dtag, slot->dmamap, BUS_DMASYNC_POSTREAD);
 		bus_dmamap_unload(sc->mbuf_dtag, slot->dmamap);
 
 		/* Take ownership of the mbuf away from the slot. */
 		m = slot->mbuf;
 		slot->mbuf = NULL;
 
 		if (bd.flags & CPDMA_BD_TDOWNCMPLT) {
 			CPSW_DEBUGF(sc, ("RX teardown is complete"));
 			m_freem(m);
 			sc->rx.running = 0;
 			sc->rx.teardown = 0;
 			break;
 		}
 
 		/* The descriptor's port field is 1-based; port[] is 0-based. */
 		port = (bd.flags & CPDMA_BD_PORT_MASK) - 1;
 		KASSERT(port >= 0 && port <= 1,
 		    ("patcket received with invalid port: %d", port));
 		psc = device_get_softc(sc->port[port].dev);
 
 		/* Set up mbuf */
 		m->m_data += bd.bufoff;
 		m->m_len = bd.buflen;
 		if (bd.flags & CPDMA_BD_SOP) {
 			/* First buffer of a packet: it carries the header. */
 			m->m_pkthdr.len = bd.pktlen;
 			m->m_pkthdr.rcvif = psc->ifp;
 			m->m_flags |= M_PKTHDR;
 			m0_flags = bd.flags;
 			m0 = m;
 		}
 		nsegs++;
 		m->m_next = NULL;
 		m->m_nextpkt = NULL;
 		/* m0 != NULL guarantees m0_flags was set by the SOP branch. */
 		if (bd.flags & CPDMA_BD_EOP && m0 != NULL) {
 			/* Strip the trailing CRC if hardware passed it up. */
 			if (m0_flags & CPDMA_BD_PASS_CRC)
 				m_adj(m0, -ETHER_CRC_LEN);
 			m0_flags = 0;
 			m0 = NULL;
 			if (nsegs > sc->rx.longest_chain)
 				sc->rx.longest_chain = nsegs;
 			nsegs = 0;
 		}
 
 		if ((psc->ifp->if_capenable & IFCAP_RXCSUM) != 0) {
 			/* check for valid CRC by looking into pkt_err[5:4] */
 			if ((bd.flags &
 			    (CPDMA_BD_SOP | CPDMA_BD_PKT_ERR_MASK)) ==
 			    CPDMA_BD_SOP) {
 				m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED;
 				m->m_pkthdr.csum_flags |= CSUM_IP_VALID;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 		}
 
 		/*
 		 * This descriptor is flagged both end-of-packet and
 		 * end-of-queue while more slots are still active: point the
 		 * head descriptor pointer at the next active slot to restart
 		 * the queue.
 		 */
 		if (STAILQ_FIRST(&sc->rx.active) != NULL &&
 		    (bd.flags & (CPDMA_BD_EOP | CPDMA_BD_EOQ)) ==
 		    (CPDMA_BD_EOP | CPDMA_BD_EOQ)) {
 			cpsw_write_hdp_slot(sc, &sc->rx,
 			    STAILQ_FIRST(&sc->rx.active));
 			sc->rx.queue_restart++;
 		}
 
 		/* Add mbuf to packet list to be returned. */
 		if (mb_tail != NULL && (bd.flags & CPDMA_BD_SOP)) {
 			/* New packet: link it after the previous one. */
 			mb_tail->m_nextpkt = m;
 		} else if (mb_tail != NULL) {
 			/* Continuation buffer: extend the current chain. */
 			mb_tail->m_next = m;
 		} else if (mb_tail == NULL && (bd.flags & CPDMA_BD_SOP) == 0) {
 			/* List is empty and this is not a packet start. */
 			if (bootverbose)
 				printf(
 				    "%s: %s: discanding fragment packet w/o header\n",
 				    __func__, psc->ifp->if_xname);
 			m_freem(m);
 			continue;
 		} else {
 			mb_head = m;
 		}
 		mb_tail = m;
 	}
 
 	if (removed != 0) {
 		/* Acknowledge the last processed slot and update statistics. */
 		cpsw_write_cp_slot(sc, &sc->rx, last);
 		sc->rx.queue_removes += removed;
 		sc->rx.avail_queue_len += removed;
 		sc->rx.active_queue_len -= removed;
 		if (sc->rx.avail_queue_len > sc->rx.max_avail_queue_len)
 			sc->rx.max_avail_queue_len = sc->rx.avail_queue_len;
 		CPSW_DEBUGF(sc, ("Removed %d received packet(s) from RX queue", removed));
 	}
 
 	return (mb_head);
 }
 
 /*
  * Refill the hardware RX queue from the avail slot list.
  *
  * For every available slot an mbuf cluster is allocated (unless the slot
  * still holds one), loaded for DMA and described to the hardware; the
  * slot is then moved to the active list.  The run of new descriptors is
  * either started as a fresh queue or chained onto the previous tail, and
  * the queue statistics are updated.  The loop stops early if an mbuf
  * cannot be allocated or mapped.
  *
  * The callers in this file invoke it with the RX lock held.
  */
 static void
 cpsw_rx_enqueue(struct cpsw_softc *sc)
 {
 	bus_dma_segment_t seg[1];
 	struct cpsw_cpdma_bd bd;
 	struct cpsw_slot *first_new_slot, *last_old_slot, *next, *slot;
 	int error, nsegs, added = 0;
 
 	/* Register new mbufs with hardware. */
 	first_new_slot = NULL;
 	/* Remember the current tail so the new run can be chained onto it. */
 	last_old_slot = STAILQ_LAST(&sc->rx.active, cpsw_slot, next);
 	while ((slot = STAILQ_FIRST(&sc->rx.avail)) != NULL) {
 		if (first_new_slot == NULL)
 			first_new_slot = slot;
 		if (slot->mbuf == NULL) {
 			slot->mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 			if (slot->mbuf == NULL) {
 				device_printf(sc->dev,
 				    "Unable to fill RX queue\n");
 				break;
 			}
 			/* Expose the whole external buffer to the DMA load. */
 			slot->mbuf->m_len =
 			    slot->mbuf->m_pkthdr.len =
 			    slot->mbuf->m_ext.ext_size;
 		}
 
 		error = bus_dmamap_load_mbuf_sg(sc->mbuf_dtag, slot->dmamap,
 		    slot->mbuf, seg, &nsegs, BUS_DMA_NOWAIT);
 
 		KASSERT(nsegs == 1, ("More than one segment (nsegs=%d)", nsegs));
 		KASSERT(error == 0, ("DMA error (error=%d)", error));
 		/* Same conditions again, for kernels where KASSERT is a no-op. */
 		if (error != 0 || nsegs != 1) {
 			device_printf(sc->dev,
 			    "%s: Can't prep RX buf for DMA (nsegs=%d, error=%d)\n",
 			    __func__, nsegs, error);
 			bus_dmamap_unload(sc->mbuf_dtag, slot->dmamap);
 			m_freem(slot->mbuf);
 			slot->mbuf = NULL;
 			break;
 		}
 
 		bus_dmamap_sync(sc->mbuf_dtag, slot->dmamap, BUS_DMASYNC_PREREAD);
 
 		/* Create and submit new rx descriptor. */
 		/*
 		 * NOTE(review): bd.next is taken from the avail list, so the
 		 * last descriptor written can point at a slot that never gets
 		 * queued if the loop breaks on the following iteration --
 		 * confirm the hardware cannot follow that link.
 		 */
 		if ((next = STAILQ_NEXT(slot, next)) != NULL)
 			bd.next = cpsw_cpdma_bd_paddr(sc, next);
 		else
 			bd.next = 0;
 		bd.bufptr = seg->ds_addr;
 		bd.bufoff = 0;
 		bd.buflen = MCLBYTES - 1;
 		bd.pktlen = bd.buflen;
 		bd.flags = CPDMA_BD_OWNER;
 		cpsw_cpdma_write_bd(sc, slot, &bd);
 		++added;
 
 		STAILQ_REMOVE_HEAD(&sc->rx.avail, next);
 		STAILQ_INSERT_TAIL(&sc->rx.active, slot, next);
 	}
 
 	/* Nothing was queued: leave the hardware queue untouched. */
 	if (added == 0 || first_new_slot == NULL)
 		return;
 
 	CPSW_DEBUGF(sc, ("Adding %d buffers to RX queue", added));
 
 	/* Link new entries to hardware RX queue. */
 	if (last_old_slot == NULL) {
 		/* Start a fresh queue. */
 		cpsw_write_hdp_slot(sc, &sc->rx, first_new_slot);
 	} else {
 		/* Add buffers to end of current queue. */
 		cpsw_cpdma_write_bd_next(sc, last_old_slot, first_new_slot);
 	}
 	sc->rx.queue_adds += added;
 	sc->rx.avail_queue_len -= added;
 	sc->rx.active_queue_len += added;
 	/* Report the number of buffers just added to the free-buffer register. */
 	cpsw_write_4(sc, CPSW_CPDMA_RX_FREEBUFFER(0), added);
 	if (sc->rx.active_queue_len > sc->rx.max_active_queue_len)
 		sc->rx.max_active_queue_len = sc->rx.active_queue_len;
 }
 
 /*
  * if_start entry point: queue pending packets for transmission and reap
  * completed ones.  Does nothing unless the interface is marked running
  * and the shared TX channel is running.
  */
 static void
 cpswp_start(struct ifnet *ifp)
 {
 	struct cpswp_softc *sc = ifp->if_softc;
 
 	if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0 &&
 	    sc->swsc->tx.running != 0) {
 		CPSW_TX_LOCK(sc->swsc);
 		cpswp_tx_enqueue(sc);
 		cpsw_tx_dequeue(sc->swsc);
 		CPSW_TX_UNLOCK(sc->swsc);
 	}
 }
 
 static void
 cpsw_intr_tx(void *arg)
 {
 	struct cpsw_softc *sc;
 
 	sc = (struct cpsw_softc *)arg;
 	CPSW_TX_LOCK(sc);
 	if (cpsw_read_4(sc, CPSW_CPDMA_TX_CP(0)) == 0xfffffffc)
 		cpsw_write_cp(sc, &sc->tx, 0xfffffffc);
 	cpsw_tx_dequeue(sc);
 	cpsw_write_4(sc, CPSW_CPDMA_CPDMA_EOI_VECTOR, 2);
 	CPSW_TX_UNLOCK(sc);
 }
 
 /*
  * Move packets from the interface send queue onto the hardware TX queue.
  *
  * Each packet is padded to the minimum Ethernet frame length, loaded for
  * DMA and described by one descriptor per DMA segment, the first flagged
  * start-of-packet and the last end-of-packet.  Packets that map to too
  * many segments are defragmented and put back on the send queue.  The
  * run of new descriptors is finally chained onto the existing hardware
  * queue, or started as a fresh queue, and the statistics are updated.
  *
  * cpswp_start() calls this with the controller TX lock held.
  */
 static void
 cpswp_tx_enqueue(struct cpswp_softc *sc)
 {
 	bus_dma_segment_t segs[CPSW_TXFRAGS];
 	struct cpsw_cpdma_bd bd;
 	struct cpsw_slot *first_new_slot, *last, *last_old_slot, *next, *slot;
 	struct mbuf *m0;
 	int error, nsegs, seg, added = 0, padlen;
 
 	/* Pull pending packets from IF queue and prep them for DMA. */
 	last = NULL;
 	first_new_slot = NULL;
 	/* Tail of the queue as it was before this call, for chaining later. */
 	last_old_slot = STAILQ_LAST(&sc->swsc->tx.active, cpsw_slot, next);
 	while ((slot = STAILQ_FIRST(&sc->swsc->tx.avail)) != NULL) {
 		IF_DEQUEUE(&sc->ifp->if_snd, m0);
 		if (m0 == NULL)
 			break;
 
 		slot->mbuf = m0;
 		/* Pad short frames up to the minimum length (without CRC). */
 		padlen = ETHER_MIN_LEN - ETHER_CRC_LEN - m0->m_pkthdr.len;
 		if (padlen < 0)
 			padlen = 0;
 		else if (padlen > 0)
 			/* NOTE(review): m_append()'s result is not checked. */
 			m_append(slot->mbuf, padlen, sc->swsc->nullpad);
 
 		/* Create mapping in DMA memory */
 		error = bus_dmamap_load_mbuf_sg(sc->swsc->mbuf_dtag,
 		    slot->dmamap, slot->mbuf, segs, &nsegs, BUS_DMA_NOWAIT);
 		/* If the packet is too fragmented, try to simplify. */
 		/*
 		 * NOTE(review): tx.avail_queue_len is only decremented after
 		 * this loop, so the comparison below uses the count from
 		 * before this call rather than the slots actually left --
 		 * confirm the segment loop further down can never run out of
 		 * avail slots.
 		 */
 		if (error == EFBIG ||
 		    (error == 0 && nsegs > sc->swsc->tx.avail_queue_len)) {
 			bus_dmamap_unload(sc->swsc->mbuf_dtag, slot->dmamap);
 			m0 = m_defrag(slot->mbuf, M_NOWAIT);
 			if (m0 == NULL) {
 				device_printf(sc->dev,
 				    "Can't defragment packet; dropping\n");
 				m_freem(slot->mbuf);
 			} else {
 				CPSW_DEBUGF(sc->swsc,
 				    ("Requeueing defragmented packet"));
 				IF_PREPEND(&sc->ifp->if_snd, m0);
 			}
 			/* The slot stays on the avail list for the next try. */
 			slot->mbuf = NULL;
 			continue;
 		}
 		if (error != 0) {
 			device_printf(sc->dev,
 			    "%s: Can't setup DMA (error=%d), dropping packet\n",
 			    __func__, error);
 			bus_dmamap_unload(sc->swsc->mbuf_dtag, slot->dmamap);
 			m_freem(slot->mbuf);
 			slot->mbuf = NULL;
 			break;
 		}
 
 		bus_dmamap_sync(sc->swsc->mbuf_dtag, slot->dmamap,
 				BUS_DMASYNC_PREWRITE);
 
 		CPSW_DEBUGF(sc->swsc,
 		    ("Queueing TX packet: %d segments + %d pad bytes",
 		    nsegs, padlen));
 
 		if (first_new_slot == NULL)
 			first_new_slot = slot;
 
 		/* Link from the previous descriptor. */
 		if (last != NULL)
 			cpsw_cpdma_write_bd_next(sc->swsc, last, slot);
 
 		/* Remember the interface for the completion-time counters. */
 		slot->ifp = sc->ifp;
 
 		/* If there is only one segment, the for() loop
 		 * gets skipped and the single buffer gets set up
 		 * as both SOP and EOP. */
 		if (nsegs > 1) {
 			next = STAILQ_NEXT(slot, next);
 			bd.next = cpsw_cpdma_bd_paddr(sc->swsc, next);
 		} else
 			bd.next = 0;
 		/* Start by setting up the first buffer. */
 		bd.bufptr = segs[0].ds_addr;
 		bd.bufoff = 0;
 		bd.buflen = segs[0].ds_len;
 		bd.pktlen = m_length(slot->mbuf, NULL);
 		bd.flags =  CPDMA_BD_SOP | CPDMA_BD_OWNER;
 		if (sc->swsc->dualemac) {
 			/* Direct the packet to this port (1-based number). */
 			bd.flags |= CPDMA_BD_TO_PORT;
 			bd.flags |= ((sc->unit + 1) & CPDMA_BD_PORT_MASK);
 		}
 		for (seg = 1; seg < nsegs; ++seg) {
 			/* Save the previous buffer (which isn't EOP) */
 			cpsw_cpdma_write_bd(sc->swsc, slot, &bd);
 			STAILQ_REMOVE_HEAD(&sc->swsc->tx.avail, next);
 			STAILQ_INSERT_TAIL(&sc->swsc->tx.active, slot, next);
 			/* Only the first slot of a packet holds the mbuf. */
 			slot = STAILQ_FIRST(&sc->swsc->tx.avail);
 
 			/* Setup next buffer (which isn't SOP) */
 			if (nsegs > seg + 1) {
 				next = STAILQ_NEXT(slot, next);
 				bd.next = cpsw_cpdma_bd_paddr(sc->swsc, next);
 			} else
 				bd.next = 0;
 			bd.bufptr = segs[seg].ds_addr;
 			bd.bufoff = 0;
 			bd.buflen = segs[seg].ds_len;
 			bd.pktlen = 0;
 			bd.flags = CPDMA_BD_OWNER;
 		}
 
 		/* Save the final buffer. */
 		bd.flags |= CPDMA_BD_EOP;
 		cpsw_cpdma_write_bd(sc->swsc, slot, &bd);
 		STAILQ_REMOVE_HEAD(&sc->swsc->tx.avail, next);
 		STAILQ_INSERT_TAIL(&sc->swsc->tx.active, slot, next);
 
 		last = slot;
 		added += nsegs;
 		if (nsegs > sc->swsc->tx.longest_chain)
 			sc->swsc->tx.longest_chain = nsegs;
 
 		BPF_MTAP(sc->ifp, m0);
 	}
 
 	/* No packet was queued: nothing to hand to the hardware. */
 	if (first_new_slot == NULL)
 		return;
 
 	/* Attach the list of new buffers to the hardware TX queue. */
 	if (last_old_slot != NULL &&
 	    (cpsw_cpdma_read_bd_flags(sc->swsc, last_old_slot) &
 	     CPDMA_BD_EOQ) == 0) {
 		/* Add buffers to end of current queue. */
 		cpsw_cpdma_write_bd_next(sc->swsc, last_old_slot,
 		    first_new_slot);
 	} else {
 		/* Start a fresh queue. */
 		cpsw_write_hdp_slot(sc->swsc, &sc->swsc->tx, first_new_slot);
 	}
 	sc->swsc->tx.queue_adds += added;
 	sc->swsc->tx.avail_queue_len -= added;
 	sc->swsc->tx.active_queue_len += added;
 	if (sc->swsc->tx.active_queue_len > sc->swsc->tx.max_active_queue_len) {
 		sc->swsc->tx.max_active_queue_len = sc->swsc->tx.active_queue_len;
 	}
 	CPSW_DEBUGF(sc->swsc, ("Queued %d TX packet(s)", added));
 }
 
 /*
  * Reap completed transmit buffers from the hardware TX queue.
  *
  * Walks the active slot list from the head, freeing the mbuf of every
  * packet the hardware has released (or every packet at all once a
  * teardown has been seen), returning the packet's slots to the avail
  * list and acknowledging them through the completion pointer.  Restarts
  * the queue if the hardware stopped at a descriptor that still has a
  * successor, and finishes a teardown once the active list is empty.
  *
  * Returns the number of slots moved back to the avail list.  The callers
  * in this file invoke it with the TX lock held.
  */
 static int
 cpsw_tx_dequeue(struct cpsw_softc *sc)
 {
 	struct cpsw_slot *slot, *last_removed_slot = NULL;
 	struct cpsw_cpdma_bd bd;
 	uint32_t flags, removed = 0;
 
 	/* Pull completed buffers off the hardware TX queue. */
 	slot = STAILQ_FIRST(&sc->tx.active);
 	while (slot != NULL) {
 		flags = cpsw_cpdma_read_bd_flags(sc, slot);
 
 		/* TearDown complete is only marked on the SOP for the packet. */
 		if ((flags & (CPDMA_BD_SOP | CPDMA_BD_TDOWNCMPLT)) ==
 		    (CPDMA_BD_SOP | CPDMA_BD_TDOWNCMPLT)) {
 			sc->tx.teardown = 1;
 		}
 
 		/* During a teardown, hardware-owned packets are reaped too. */
 		if ((flags & (CPDMA_BD_SOP | CPDMA_BD_OWNER)) ==
 		    (CPDMA_BD_SOP | CPDMA_BD_OWNER) && sc->tx.teardown == 0)
 			break; /* Hardware is still using this packet. */
 
 		bus_dmamap_sync(sc->mbuf_dtag, slot->dmamap, BUS_DMASYNC_POSTWRITE);
 		bus_dmamap_unload(sc->mbuf_dtag, slot->dmamap);
 		m_freem(slot->mbuf);
 		slot->mbuf = NULL;
 
 		/* Count the packet as sent, or as dropped during teardown. */
 		if (slot->ifp) {
 			if (sc->tx.teardown == 0)
 				if_inc_counter(slot->ifp, IFCOUNTER_OPACKETS, 1);
 			else
 				if_inc_counter(slot->ifp, IFCOUNTER_OQDROPS, 1);
 		}
 
 		/* Dequeue any additional buffers used by this packet. */
 		/*
 		 * Slots with a NULL mbuf are treated as part of this packet;
 		 * the walk stops at the next slot that holds an mbuf.
 		 */
 		while (slot != NULL && slot->mbuf == NULL) {
 			STAILQ_REMOVE_HEAD(&sc->tx.active, next);
 			STAILQ_INSERT_TAIL(&sc->tx.avail, slot, next);
 			++removed;
 			last_removed_slot = slot;
 			slot = STAILQ_FIRST(&sc->tx.active);
 		}
 
 		/* Acknowledge everything up to the last slot just removed. */
 		cpsw_write_cp_slot(sc, &sc->tx, last_removed_slot);
 
 		/* Restart the TX queue if necessary. */
 		cpsw_cpdma_read_bd(sc, last_removed_slot, &bd);
 		if (slot != NULL && bd.next != 0 && (bd.flags &
 		    (CPDMA_BD_EOP | CPDMA_BD_OWNER | CPDMA_BD_EOQ)) ==
 		    (CPDMA_BD_EOP | CPDMA_BD_EOQ)) {
 			cpsw_write_hdp_slot(sc, &sc->tx, slot);
 			sc->tx.queue_restart++;
 			break;
 		}
 	}
 
 	if (removed != 0) {
 		sc->tx.queue_removes += removed;
 		sc->tx.active_queue_len -= removed;
 		sc->tx.avail_queue_len += removed;
 		if (sc->tx.avail_queue_len > sc->tx.max_avail_queue_len)
 			sc->tx.max_avail_queue_len = sc->tx.avail_queue_len;
 		CPSW_DEBUGF(sc, ("TX removed %d completed packet(s)", removed));
 	}
 
 	/* A teardown is finished once nothing is left on the active list. */
 	if (sc->tx.teardown && STAILQ_EMPTY(&sc->tx.active)) {
 		CPSW_DEBUGF(sc, ("TX teardown is complete"));
 		sc->tx.teardown = 0;
 		sc->tx.running = 0;
 	}
 
 	return (removed);
 }
 
 /*
  *
  * Miscellaneous interrupts.
  *
  */
 
 /*
  * RX threshold interrupt handler.  'arg' is the controller softc.
  *
  * Under the RX lock it drains completed packets, refills the RX queue
  * and acknowledges the interrupt; the packets are passed to the stack
  * once the lock has been released.
  */
 static void
 cpsw_intr_rx_thresh(void *arg)
 {
 	struct cpsw_softc *sc = arg;
 	struct mbuf *pkt, *next_pkt;
 	struct ifnet *ifp;
 
 	CPSW_RX_LOCK(sc);
 	pkt = cpsw_rx_dequeue(sc);
 	cpsw_rx_enqueue(sc);
 	cpsw_write_4(sc, CPSW_CPDMA_CPDMA_EOI_VECTOR, 0);
 	CPSW_RX_UNLOCK(sc);
 
 	/* Unlink each packet from the list before handing it up. */
 	for (; pkt != NULL; pkt = next_pkt) {
 		next_pkt = pkt->m_nextpkt;
 		pkt->m_nextpkt = NULL;
 		ifp = pkt->m_pkthdr.rcvif;
 		(*ifp->if_input)(ifp, pkt);
 		if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	}
 }
 
 /*
  * Handle a host error interrupt from the DMA engine.
  *
  * Prints the masked interrupt status and the DMA status register,
  * decodes the TX and RX error codes and channels out of the latter,
  * dumps the affected descriptor queue(s) and the ALE table, and then
  * panics.
  */
 static void
 cpsw_intr_misc_host_error(struct cpsw_softc *sc)
 {
 	uint32_t intstat;
 	uint32_t dmastat;
 	int txerr, rxerr, txchan, rxchan;
 
 	printf("\n\n");
 	device_printf(sc->dev,
 	    "HOST ERROR:  PROGRAMMING ERROR DETECTED BY HARDWARE\n");
 	printf("\n\n");
 	intstat = cpsw_read_4(sc, CPSW_CPDMA_DMA_INTSTAT_MASKED);
 	device_printf(sc->dev, "CPSW_CPDMA_DMA_INTSTAT_MASKED=0x%x\n", intstat);
 	dmastat = cpsw_read_4(sc, CPSW_CPDMA_DMASTATUS);
 	device_printf(sc->dev, "CPSW_CPDMA_DMASTATUS=0x%x\n", dmastat);
 
 	/* Split the DMA status word into error code and channel fields. */
 	txerr = (dmastat >> 20) & 15;
 	txchan = (dmastat >> 16) & 7;
 	rxerr = (dmastat >> 12) & 15;
 	rxchan = (dmastat >> 8) & 7;
 
 	/* Translate the TX error code; 0 means no TX error. */
 	switch (txerr) {
 	case 0: break;
 	case 1:	printf("SOP error on TX channel %d\n", txchan);
 		break;
 	case 2:	printf("Ownership bit not set on SOP buffer on TX channel %d\n", txchan);
 		break;
 	case 3:	printf("Zero Next Buffer but not EOP on TX channel %d\n", txchan);
 		break;
 	case 4:	printf("Zero Buffer Pointer on TX channel %d\n", txchan);
 		break;
 	case 5:	printf("Zero Buffer Length on TX channel %d\n", txchan);
 		break;
 	case 6:	printf("Packet length error on TX channel %d\n", txchan);
 		break;
 	default: printf("Unknown error on TX channel %d\n", txchan);
 		break;
 	}
 
 	/* On a TX error also show the channel pointers and the TX queue. */
 	if (txerr != 0) {
 		printf("CPSW_CPDMA_TX%d_HDP=0x%x\n",
 		    txchan, cpsw_read_4(sc, CPSW_CPDMA_TX_HDP(txchan)));
 		printf("CPSW_CPDMA_TX%d_CP=0x%x\n",
 		    txchan, cpsw_read_4(sc, CPSW_CPDMA_TX_CP(txchan)));
 		cpsw_dump_queue(sc, &sc->tx.active);
 	}
 
 	/* Translate the RX error code; 0 means no RX error. */
 	switch (rxerr) {
 	case 0: break;
 	case 2:	printf("Ownership bit not set on RX channel %d\n", rxchan);
 		break;
 	case 4:	printf("Zero Buffer Pointer on RX channel %d\n", rxchan);
 		break;
 	case 5:	printf("Zero Buffer Length on RX channel %d\n", rxchan);
 		break;
 	case 6:	printf("Buffer offset too big on RX channel %d\n", rxchan);
 		break;
 	default: printf("Unknown RX error on RX channel %d\n", rxchan);
 		break;
 	}
 
 	/* On an RX error also show the channel pointers and the RX queue. */
 	if (rxerr != 0) {
 		printf("CPSW_CPDMA_RX%d_HDP=0x%x\n",
 		    rxchan, cpsw_read_4(sc,CPSW_CPDMA_RX_HDP(rxchan)));
 		printf("CPSW_CPDMA_RX%d_CP=0x%x\n",
 		    rxchan, cpsw_read_4(sc, CPSW_CPDMA_RX_CP(rxchan)));
 		cpsw_dump_queue(sc, &sc->rx.active);
 	}
 
 	printf("\nALE Table\n");
 	cpsw_ale_dump_table(sc);
 
 	// XXX do something useful here??
 	panic("CPSW HOST ERROR INTERRUPT");
 
 	/*
 	 * NOTE(review): the statements below only run if panic() returns,
 	 * which it is not expected to do -- confirm whether they are kept on
 	 * purpose as the non-panicking alternative.
 	 */
 	// Suppress this interrupt in the future.
 	cpsw_write_4(sc, CPSW_CPDMA_DMA_INTMASK_CLEAR, intstat);
 	printf("XXX HOST ERROR INTERRUPT SUPPRESSED\n");
 	// The watchdog will probably reset the controller
 	// in a little while.  It will probably fail again.
 }
 
 /*
  * Miscellaneous interrupt handler.
  *
  * Reads the MISC status register once and services each pending source in
  * turn: time sync events and MDIO user interrupts are only logged,
  * statistics interrupts fold the hardware counters into the shadow copy,
  * host errors are reported, and MDIO link interrupts are handled by writing
  * the value read from MDIOLINKINTMASKED back to it.  Finally 3 is written
  * to the CPDMA EOI vector register.
  */
 static void
 cpsw_intr_misc(void *arg)
 {
 	struct cpsw_softc *sc;
 	uint32_t pending, linkint;
 
 	sc = arg;
 	pending = cpsw_read_4(sc, CPSW_WR_C_MISC_STAT(0));
 
 	if ((pending & CPSW_WR_C_MISC_EVNT_PEND) != 0) {
 		CPSW_DEBUGF(sc, ("Time sync event interrupt unimplemented"));
 	}
 	if ((pending & CPSW_WR_C_MISC_STAT_PEND) != 0) {
 		cpsw_stats_collect(sc);
 	}
 	if ((pending & CPSW_WR_C_MISC_HOST_PEND) != 0) {
 		cpsw_intr_misc_host_error(sc);
 	}
 	if ((pending & CPSW_WR_C_MISC_MDIOLINK) != 0) {
 		/* Write back exactly what was read. */
 		linkint = cpsw_read_4(sc, MDIOLINKINTMASKED);
 		cpsw_write_4(sc, MDIOLINKINTMASKED, linkint);
 	}
 	if ((pending & CPSW_WR_C_MISC_MDIOUSER) != 0) {
 		CPSW_DEBUGF(sc,
 		    ("MDIO operation completed interrupt unimplemented"));
 	}
 
 	cpsw_write_4(sc, CPSW_CPDMA_CPDMA_EOI_VECTOR, 3);
 }
 
 /*
  *
  * Periodic Checks and Watchdog.
  *
  */
 
 /*
  * Per-port periodic callout.  Runs the MII tick, re-applies the media
  * configuration when the MII layer's current media word no longer matches
  * the value cached in the softc, and re-arms itself.
  */
 static void
 cpswp_tick(void *msc)
 {
 	struct cpswp_softc *sc;
 
 	sc = msc;
 
 	mii_tick(sc->mii);
 
 	/* A mismatch with the cached media word means the media changed. */
 	if (sc->mii->mii_media.ifm_media != sc->media_status) {
 		printf("%s: media type changed (ifm_media=%x)\n", __func__,
 		    sc->mii->mii_media.ifm_media);
 		cpswp_ifmedia_upd(sc->ifp);
 	}
 
 	/* Run again hz ticks from now. */
 	callout_reset(&sc->mii_callout, hz, cpswp_tick, sc);
 }
 
 /*
  * ifmedia status callback: poll the PHY through the MII layer and copy the
  * resulting active/status words into the request, under the port lock.
  */
 static void
 cpswp_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
 {
 	struct cpswp_softc *sc = ifp->if_softc;
 
 	CPSW_DEBUGF(sc->swsc, (""));
 
 	CPSW_PORT_LOCK(sc);
 	mii_pollstat(sc->mii);
 	ifmr->ifm_active = sc->mii->mii_media_active;
 	ifmr->ifm_status = sc->mii->mii_media_status;
 	CPSW_PORT_UNLOCK(sc);
 }
 
 static int
 cpswp_ifmedia_upd(struct ifnet *ifp)
 {
 	struct cpswp_softc *sc;
 
 	sc = ifp->if_softc;
 	CPSW_DEBUGF(sc->swsc, (""));
 	CPSW_PORT_LOCK(sc);
 	mii_mediachg(sc->mii);
 	sc->media_status = sc->mii->mii_media.ifm_media;
 	CPSW_PORT_UNLOCK(sc);
 
 	return (0);
 }
 
 /*
  * Watchdog recovery path: log the timeout together with the TX channel 0
  * HDP/CP registers and the active TX queue, then stop every port that is
  * in use (all ports in dual-EMAC mode, otherwise only the active slave).
  */
 static void
 cpsw_tx_watchdog_full_reset(struct cpsw_softc *sc)
 {
 	struct cpswp_softc *psc;
 	int port;
 
 	cpsw_debugf_head("CPSW watchdog");
 	device_printf(sc->dev, "watchdog timeout\n");
 	printf("CPSW_CPDMA_TX%d_HDP=0x%x\n", 0,
 	    cpsw_read_4(sc, CPSW_CPDMA_TX_HDP(0)));
 	printf("CPSW_CPDMA_TX%d_CP=0x%x\n", 0,
 	    cpsw_read_4(sc, CPSW_CPDMA_TX_CP(0)));
 	cpsw_dump_queue(sc, &sc->tx.active);
 
 	for (port = 0; port < CPSW_PORTS; port++) {
 		if (sc->dualemac || port == sc->active_slave) {
 			psc = device_get_softc(sc->port[port].dev);
 			CPSW_PORT_LOCK(psc);
 			cpswp_stop_locked(psc);
 			CPSW_PORT_UNLOCK(psc);
 		}
 	}
 }
 
 /*
  * Periodic TX watchdog callout.
  *
  * Under the TX lock, decides whether the transmitter is making progress.
  * After more than five consecutive ticks without progress the timer is
  * cleared, the reset counter is bumped and a full reset is performed.
  * The callout then re-arms itself.
  */
 static void
 cpsw_tx_watchdog(void *msc)
 {
 	struct cpsw_softc *sc;
 	int progress;
 
 	sc = msc;
 	CPSW_TX_LOCK(sc);
 
 	/*
 	 * Progress means: nothing is queued or TX is not running, or buffers
 	 * were removed since the last tick, or a dequeue attempt right now
 	 * reclaims something.  The tests short-circuit, so the dequeue is
 	 * only attempted when the earlier checks all fail.
 	 */
 	progress = sc->tx.active_queue_len == 0 || !sc->tx.running ||
 	    sc->tx.queue_removes > sc->tx.queue_removes_at_last_tick ||
 	    cpsw_tx_dequeue(sc) > 0;
 
 	if (progress) {
 		sc->watchdog.timer = 0;
 	} else if (++sc->watchdog.timer > 5) {
 		/* There was something to do but it didn't get done. */
 		sc->watchdog.timer = 0;
 		++sc->watchdog.resets;
 		cpsw_tx_watchdog_full_reset(sc);
 	}
 
 	sc->tx.queue_removes_at_last_tick = sc->tx.queue_removes;
 	CPSW_TX_UNLOCK(sc);
 
 	/* Run again hz ticks from now. */
 	callout_reset(&sc->watchdog.callout, hz, cpsw_tx_watchdog, sc);
 }
 
 /*
  *
  * ALE support routines.
  *
  */
 
 /*
  * Read one ALE table entry into ale_entry[0..2].
  *
  * The index (masked to 10 bits) is written to the table control register
  * first, then the three table word registers are read back; ale_entry[0]
  * receives word 0 and ale_entry[2] word 2.
  */
 static void
 cpsw_ale_read_entry(struct cpsw_softc *sc, uint16_t idx, uint32_t *ale_entry)
 {
 	cpsw_write_4(sc, CPSW_ALE_TBLCTL, idx & 1023);
 	ale_entry[0] = cpsw_read_4(sc, CPSW_ALE_TBLW0);
 	ale_entry[1] = cpsw_read_4(sc, CPSW_ALE_TBLW1);
 	ale_entry[2] = cpsw_read_4(sc, CPSW_ALE_TBLW2);
 }
 
 /*
  * Write ale_entry[0..2] to ALE table slot idx.
  *
  * The three table word registers are loaded first; the write to the table
  * control register, carrying the 10-bit index with bit 31 set, is issued
  * last.
  */
 static void
 cpsw_ale_write_entry(struct cpsw_softc *sc, uint16_t idx, uint32_t *ale_entry)
 {
 	cpsw_write_4(sc, CPSW_ALE_TBLW0, ale_entry[0]);
 	cpsw_write_4(sc, CPSW_ALE_TBLW1, ale_entry[1]);
 	cpsw_write_4(sc, CPSW_ALE_TBLW2, ale_entry[2]);
 	/* Use an unsigned constant: 1 << 31 overflows a signed int. */
 	cpsw_write_4(sc, CPSW_ALE_TBLCTL, 1U << 31 | (idx & 1023));
 }
 
 /*
  * Clear every multicast address entry in the ALE table.
  *
  * The scan starts at index 10, so lower slots are never touched here.
  * Any address or VLAN+address entry whose multicast flag is set is
  * overwritten with an all-zero (free) entry.
  */
 static void
 cpsw_ale_remove_all_mc_entries(struct cpsw_softc *sc)
 {
 	uint32_t ale_entry[3];
 	int idx;
 
 	for (idx = 10; idx < CPSW_MAX_ALE_ENTRIES; idx++) {
 		cpsw_ale_read_entry(sc, idx, ale_entry);
 
 		/* Only address-type entries can be multicast entries. */
 		if (ALE_TYPE(ale_entry) != ALE_TYPE_ADDR &&
 		    ALE_TYPE(ale_entry) != ALE_TYPE_VLAN_ADDR)
 			continue;
 		if (ALE_MCAST(ale_entry) != 1)
 			continue;
 
 		ale_entry[0] = 0;
 		ale_entry[1] = 0;
 		ale_entry[2] = 0;
 		cpsw_ale_write_entry(sc, idx, ale_entry);
 	}
 }
 
 /*
  * Install (or refresh) a multicast address entry in the ALE table.
  *
  * Slots from index 10 upwards are searched for an entry whose MAC bytes
  * equal 'mac'; failing that, the first free slot is used.  The entry is
  * written as a VLAN+address entry when vlan != -1, otherwise as a plain
  * address entry, with the forward state set and 'portmap' (low 3 bits)
  * as the port mask.
  *
  * Returns 0 on success or ENOMEM when there is neither a match nor a free
  * slot.
  *
  * NOTE(review): the match compares only the MAC bytes, not the entry type
  * or VLAN id -- confirm this is intended when the same group address is
  * used on more than one VLAN.
  */
 static int
 cpsw_ale_mc_entry_set(struct cpsw_softc *sc, uint8_t portmap, int vlan,
 	uint8_t *mac)
 {
 	int free_index = -1, matching_index = -1, i;
 	uint32_t ale_entry[3], ale_type;
 
 	/* Find a matching entry or a free entry. */
 	for (i = 10; i < CPSW_MAX_ALE_ENTRIES; i++) {
 		cpsw_ale_read_entry(sc, i, ale_entry);
 
 		/* Entry Type[61:60] is 0 for free entry */
 		if (free_index < 0 && ALE_TYPE(ale_entry) == 0)
 			free_index = i;
 
 		if ((((ale_entry[1] >> 8) & 0xFF) == mac[0]) &&
 		    (((ale_entry[1] >> 0) & 0xFF) == mac[1]) &&
 		    (((ale_entry[0] >>24) & 0xFF) == mac[2]) &&
 		    (((ale_entry[0] >>16) & 0xFF) == mac[3]) &&
 		    (((ale_entry[0] >> 8) & 0xFF) == mac[4]) &&
 		    (((ale_entry[0] >> 0) & 0xFF) == mac[5])) {
 			matching_index = i;
 			break;
 		}
 	}
 
 	/* On a match 'i' already names the slot; otherwise use the free one. */
 	if (matching_index < 0) {
 		if (free_index < 0)
 			return (ENOMEM);
 		i = free_index;
 	}
 
 	if (vlan != -1)
 		ale_type = ALE_TYPE_VLAN_ADDR << 28 | vlan << 16;
 	else
 		ale_type = ALE_TYPE_ADDR << 28;
 
 	/*
 	 * Set MAC address.  mac[2] is widened to uint32_t before the shift:
 	 * as a promoted int, a byte >= 0x80 shifted by 24 would overflow.
 	 */
 	ale_entry[0] = (uint32_t)mac[2] << 24 | mac[3] << 16 | mac[4] << 8 |
 	    mac[5];
 	ale_entry[1] = mac[0] << 8 | mac[1];
 
 	/* Entry type[61:60] and Mcast fwd state[63:62] is fw(3). */
 	ale_entry[1] |= ALE_MCAST_FWD | ale_type;
 
 	/* Set portmask [68:66] */
 	ale_entry[2] = (portmap & 7) << 2;
 
 	cpsw_ale_write_entry(sc, i, ale_entry);
 
 	return (0);
 }
 
 /*
  * Print every in-use ALE table entry to the console.
  *
  * VLAN entries show their VLAN id, untag, flood and member fields;
  * address and VLAN+address entries show the MAC address, whether it is
  * multicast or unicast, the VLAN id (VLAN+address only) and the port
  * field.  Entries of any other type are skipped.
  */
 static void
 cpsw_ale_dump_table(struct cpsw_softc *sc)
 {
 	uint32_t ale_entry[3];
 	int i;
 
 	for (i = 0; i < CPSW_MAX_ALE_ENTRIES; i++) {
 		cpsw_ale_read_entry(sc, i, ale_entry);
 
 		if (ALE_TYPE(ale_entry) != ALE_TYPE_VLAN &&
 		    ALE_TYPE(ale_entry) != ALE_TYPE_ADDR &&
 		    ALE_TYPE(ale_entry) != ALE_TYPE_VLAN_ADDR)
 			continue;
 
 		/* Raw words and the type are printed for every entry kind. */
 		printf("ALE[%4u] %08x %08x %08x ", i, ale_entry[2],
 		    ale_entry[1], ale_entry[0]);
 		printf("type: %u ", ALE_TYPE(ale_entry));
 
 		if (ALE_TYPE(ale_entry) == ALE_TYPE_VLAN) {
 			printf("vlan: %u ", ALE_VLAN(ale_entry));
 			printf("untag: %u ", ALE_VLAN_UNTAG(ale_entry));
 			printf("reg flood: %u ", ALE_VLAN_REGFLOOD(ale_entry));
 			printf("unreg flood: %u ", ALE_VLAN_UNREGFLOOD(ale_entry));
 			printf("members: %u ", ALE_VLAN_MEMBERS(ale_entry));
 		} else {
 			printf("mac: %02x:%02x:%02x:%02x:%02x:%02x ",
 			    (ale_entry[1] >> 8) & 0xFF,
 			    (ale_entry[1] >> 0) & 0xFF,
 			    (ale_entry[0] >>24) & 0xFF,
 			    (ale_entry[0] >>16) & 0xFF,
 			    (ale_entry[0] >> 8) & 0xFF,
 			    (ale_entry[0] >> 0) & 0xFF);
 			printf(ALE_MCAST(ale_entry) ? "mcast " : "ucast ");
 			if (ALE_TYPE(ale_entry) == ALE_TYPE_VLAN_ADDR)
 				printf("vlan: %u ", ALE_VLAN(ale_entry));
 			printf("port: %u ", ALE_PORTS(ale_entry));
 		}
 		printf("\n");
 	}
 	printf("\n");
 }
 
 static int
 cpswp_ale_update_addresses(struct cpswp_softc *sc, int purge)
 {
 	uint8_t *mac;
 	uint32_t ale_entry[3], ale_type, portmask;
 	struct ifmultiaddr *ifma;
 
 	if (sc->swsc->dualemac) {
 		ale_type = ALE_TYPE_VLAN_ADDR << 28 | sc->vlan << 16;
 		portmask = 1 << (sc->unit + 1) | 1 << 0;
 	} else {
 		ale_type = ALE_TYPE_ADDR << 28;
 		portmask = 7;
 	}
 
 	/*
 	 * Route incoming packets for our MAC address to Port 0 (host).
 	 * For simplicity, keep this entry at table index 0 for port 1 and
 	 * at index 2 for port 2 in the ALE.
 	 */
         if_addr_rlock(sc->ifp);
 	mac = LLADDR((struct sockaddr_dl *)sc->ifp->if_addr->ifa_addr);
 	ale_entry[0] = mac[2] << 24 | mac[3] << 16 | mac[4] << 8 | mac[5];
 	ale_entry[1] = ale_type | mac[0] << 8 | mac[1]; /* addr entry + mac */
 	ale_entry[2] = 0; /* port = 0 */
 	cpsw_ale_write_entry(sc->swsc, 0 + 2 * sc->unit, ale_entry);
 
 	/* Set outgoing MAC Address for slave port. */
 	cpsw_write_4(sc->swsc, CPSW_PORT_P_SA_HI(sc->unit + 1),
 	    mac[3] << 24 | mac[2] << 16 | mac[1] << 8 | mac[0]);
 	cpsw_write_4(sc->swsc, CPSW_PORT_P_SA_LO(sc->unit + 1),
 	    mac[5] << 8 | mac[4]);
         if_addr_runlock(sc->ifp);
 
 	/* Keep the broadcast address at table entry 1 (or 3). */
 	ale_entry[0] = 0xffffffff; /* Lower 32 bits of MAC */
 	/* ALE_MCAST_FWD, Addr type, upper 16 bits of Mac */ 
 	ale_entry[1] = ALE_MCAST_FWD | ale_type | 0xffff;
 	ale_entry[2] = portmask << 2;
 	cpsw_ale_write_entry(sc->swsc, 1 + 2 * sc->unit, ale_entry);
 
 	/* SIOCDELMULTI doesn't specify the particular address
 	   being removed, so we have to remove all and rebuild. */
 	if (purge)
 		cpsw_ale_remove_all_mc_entries(sc->swsc);
 
         /* Set other multicast addrs desired. */
         if_maddr_rlock(sc->ifp);
         CK_STAILQ_FOREACH(ifma, &sc->ifp->if_multiaddrs, ifma_link) {
                 if (ifma->ifma_addr->sa_family != AF_LINK)
                         continue;
 		cpsw_ale_mc_entry_set(sc->swsc, portmask, sc->vlan,
 		    LLADDR((struct sockaddr_dl *)ifma->ifma_addr));
         }
         if_maddr_runlock(sc->ifp);
 
 	return (0);
 }
 
 /*
  * Create or update the ALE VLAN entry for 'vlan'.
  *
  * Slots from index 5 upwards are searched for an existing VLAN entry with
  * the same VLAN id; failing that, the first free slot is used.  The entry
  * is then (re)written with the given member, untag and flood port masks
  * (low 3 bits of each).
  *
  * Returns 0 on success, -1 when there is neither a match nor a free slot.
  */
 static int
 cpsw_ale_update_vlan_table(struct cpsw_softc *sc, int vlan, int ports,
 	int untag, int mcregflood, int mcunregflood)
 {
 	int free_index, i, matching_index;
 	uint32_t ale_entry[3];
 
 	free_index = matching_index = -1;
 	/* Find a matching entry or a free entry. */
 	for (i = 5; i < CPSW_MAX_ALE_ENTRIES; i++) {
 		cpsw_ale_read_entry(sc, i, ale_entry);
 
 		/* Entry Type[61:60] is 0 for free entry */
 		if (free_index < 0 && ALE_TYPE(ale_entry) == 0)
 			free_index = i;
 
 		/*
 		 * Only a VLAN entry may match.  Without the type test a
 		 * free slot or a VLAN+address entry carrying the same VLAN
 		 * id bits would be picked and overwritten.
 		 */
 		if (ALE_TYPE(ale_entry) == ALE_TYPE_VLAN &&
 		    ALE_VLAN(ale_entry) == vlan) {
 			matching_index = i;
 			break;
 		}
 	}
 
 	/* On a match 'i' already names the slot; otherwise use the free one. */
 	if (matching_index < 0) {
 		if (free_index < 0)
 			return (-1);
 		i = free_index;
 	}
 
 	ale_entry[0] = (untag & 7) << 24 | (mcregflood & 7) << 16 |
 	    (mcunregflood & 7) << 8 | (ports & 7);
 	ale_entry[1] = ALE_TYPE_VLAN << 28 | vlan << 16;
 	ale_entry[2] = 0;
 	cpsw_ale_write_entry(sc, i, ale_entry);
 
 	return (0);
 }
 
 /*
  *
  * Statistics and Sysctls.
  *
  */
 
 #if 0
 /*
  * Debug helper (compiled out): for every statistics sysctl, log the shadow
  * counter, the current hardware register value and their sum.
  */
 static void
 cpsw_stats_dump(struct cpsw_softc *sc)
 {
 	int i;
 	uint32_t r;
 
 	for (i = 0; i < CPSW_SYSCTL_COUNT; ++i) {
 		r = cpsw_read_4(sc, CPSW_STATS_OFFSET +
 		    cpsw_stat_sysctls[i].reg);
 		CPSW_DEBUGF(sc, ("%s: %ju + %u = %ju", cpsw_stat_sysctls[i].oid,
 		    (intmax_t)sc->shadow_stats[i], r,
 		    (intmax_t)sc->shadow_stats[i] + r));
 	}
 }
 #endif
 
 /*
  * Fold the hardware statistics counters into the software shadow copy.
  *
  * Each counter register is read, added to the matching shadow_stats[]
  * slot, and the value just read is written back to the same register
  * (presumably a write-to-decrement scheme -- confirm against the TRM).
  */
 static void
 cpsw_stats_collect(struct cpsw_softc *sc)
 {
 	uint32_t count;
 	int idx;
 
 	CPSW_DEBUGF(sc, ("Controller shadow statistics updated."));
 
 	for (idx = 0; idx < CPSW_SYSCTL_COUNT; ++idx) {
 		count = cpsw_read_4(sc,
 		    CPSW_STATS_OFFSET + cpsw_stat_sysctls[idx].reg);
 		sc->shadow_stats[idx] += count;
 		cpsw_write_4(sc,
 		    CPSW_STATS_OFFSET + cpsw_stat_sysctls[idx].reg, count);
 	}
 }
 
 /*
  * Sysctl handler for one statistics counter.  The OID number selects the
  * counter; the reported value is the shadow total plus whatever is
  * currently in the hardware register.
  */
 static int
 cpsw_stats_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct cpsw_softc *sc;
 	uint64_t total;
 
 	sc = (struct cpsw_softc *)arg1;
 
 	total = sc->shadow_stats[oidp->oid_number];
 	total += cpsw_read_4(sc,
 	    CPSW_STATS_OFFSET + cpsw_stat_sysctls[oidp->oid_number].reg);
 
 	return (sysctl_handle_64(oidp, &total, 0, req));
 }
 
 static int
 cpsw_stat_attached(SYSCTL_HANDLER_ARGS)
 {
 	struct cpsw_softc *sc;
 	struct bintime t;
 	unsigned result;
 
 	sc = (struct cpsw_softc *)arg1;
 	getbinuptime(&t);
 	bintime_sub(&t, &sc->attach_uptime);
 	result = t.sec;
 	return (sysctl_handle_int(oidp, &result, 0, req));
 }
 
 /*
  * Sysctl handler for the interrupt coalescing interval (sc->coal_us).
  *
  * The new value is stored directly into sc->coal_us by
  * sysctl_handle_int(); on error, or when no new value was supplied, the
  * handler returns without touching the hardware.  A value of 0 disables
  * interrupt pacing.  Any other value is clamped to
  * [CPSW_WR_C_IMAX_US_MIN, CPSW_WR_C_IMAX_US_MAX], converted to interrupts
  * per millisecond, clamped again to [CPSW_WR_C_IMAX_MIN,
  * CPSW_WR_C_IMAX_MAX] and programmed into the RX/TX IMAX registers.
  */
 static int
 cpsw_intr_coalesce(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	struct cpsw_softc *sc;
 	uint32_t ctrl, intr_per_ms;
 
 	sc = (struct cpsw_softc *)arg1;
 	error = sysctl_handle_int(oidp, &sc->coal_us, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	/* Start from the current control word with pacing and prescale cleared. */
 	ctrl = cpsw_read_4(sc, CPSW_WR_INT_CONTROL);
 	ctrl &= ~(CPSW_WR_INT_PACE_EN | CPSW_WR_INT_PRESCALE_MASK);
 	if (sc->coal_us == 0) {
 		/* Disable the interrupt pace hardware. */
 		cpsw_write_4(sc, CPSW_WR_INT_CONTROL, ctrl);
 		cpsw_write_4(sc, CPSW_WR_C_RX_IMAX(0), 0);
 		cpsw_write_4(sc, CPSW_WR_C_TX_IMAX(0), 0);
 		return (0);
 	}
 
 	/* Clamp the stored interval so the division below is well defined. */
 	if (sc->coal_us > CPSW_WR_C_IMAX_US_MAX)
 		sc->coal_us = CPSW_WR_C_IMAX_US_MAX;
 	if (sc->coal_us < CPSW_WR_C_IMAX_US_MIN)
 		sc->coal_us = CPSW_WR_C_IMAX_US_MIN;
 	intr_per_ms = 1000 / sc->coal_us;
 	/* Just to make sure... */
 	if (intr_per_ms > CPSW_WR_C_IMAX_MAX)
 		intr_per_ms = CPSW_WR_C_IMAX_MAX;
 	if (intr_per_ms < CPSW_WR_C_IMAX_MIN)
 		intr_per_ms = CPSW_WR_C_IMAX_MIN;
 
 	/* Set the prescale to produce 4us pulses from the 125 Mhz clock. */
 	ctrl |= (125 * 4) & CPSW_WR_INT_PRESCALE_MASK;
 
 	/* Enable the interrupt pace hardware. */
 	cpsw_write_4(sc, CPSW_WR_C_RX_IMAX(0), intr_per_ms);
 	cpsw_write_4(sc, CPSW_WR_C_TX_IMAX(0), intr_per_ms);
 	ctrl |= CPSW_WR_INT_C0_RX_PULSE | CPSW_WR_INT_C0_TX_PULSE;
 	cpsw_write_4(sc, CPSW_WR_INT_CONTROL, ctrl);
 
 	return (0);
 }
 
 /*
  * Sysctl handler reporting, for the port selected by arg2, whole seconds
  * elapsed since the uptime stamp in init_uptime.  Reports 0 while the
  * interface is not marked running.
  */
 static int
 cpsw_stat_uptime(SYSCTL_HANDLER_ARGS)
 {
 	struct cpsw_softc *swsc;
 	struct cpswp_softc *sc;
 	struct bintime t;
 	unsigned result;
 
 	swsc = arg1;
 	sc = device_get_softc(swsc->port[arg2].dev);
 
 	result = 0;
 	if ((sc->ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
 		getbinuptime(&t);
 		bintime_sub(&t, &sc->init_uptime);
 		result = t.sec;
 	}
 
 	return (sysctl_handle_int(oidp, &result, 0, req));
 }
 
 /*
  * Attach the read-only per-queue statistics sysctls under 'node'.
  * Each OID exposes one counter field of 'queue' directly.
  */
 static void
 cpsw_add_queue_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *node,
 	struct cpsw_queue *queue)
 {
 	struct sysctl_oid_list *parent;
 
 	parent = SYSCTL_CHILDREN(node);
 	/* Buffer accounting, exported as signed ints. */
 	SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "totalBuffers",
 	    CTLFLAG_RD, &queue->queue_slots, 0,
 	    "Total buffers currently assigned to this queue");
 	SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "activeBuffers",
 	    CTLFLAG_RD, &queue->active_queue_len, 0,
 	    "Buffers currently registered with hardware controller");
 	SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "maxActiveBuffers",
 	    CTLFLAG_RD, &queue->max_active_queue_len, 0,
 	    "Max value of activeBuffers since last driver reset");
 	SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "availBuffers",
 	    CTLFLAG_RD, &queue->avail_queue_len, 0,
 	    "Buffers allocated to this queue but not currently "
 	    "registered with hardware controller");
 	SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "maxAvailBuffers",
 	    CTLFLAG_RD, &queue->max_avail_queue_len, 0,
 	    "Max value of availBuffers since last driver reset");
 	/* Running totals, exported as unsigned ints. */
 	SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "totalEnqueued",
 	    CTLFLAG_RD, &queue->queue_adds, 0,
 	    "Total buffers added to queue");
 	SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "totalDequeued",
 	    CTLFLAG_RD, &queue->queue_removes, 0,
 	    "Total buffers removed from queue");
 	SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "queueRestart",
 	    CTLFLAG_RD, &queue->queue_restart, 0,
 	    "Total times the queue has been restarted");
 	SYSCTL_ADD_UINT(ctx, parent, OID_AUTO, "longestChain",
 	    CTLFLAG_RD, &queue->longest_chain, 0,
 	    "Max buffers used for a single packet");
 }
 
 /*
  * Attach the watchdog statistics under 'node': a single read-only
  * "resets" OID backed by sc->watchdog.resets.
  */
 static void
 cpsw_add_watchdog_sysctls(struct sysctl_ctx_list *ctx, struct sysctl_oid *node,
 	struct cpsw_softc *sc)
 {
 	struct sysctl_oid_list *children = SYSCTL_CHILDREN(node);
 
 	SYSCTL_ADD_INT(ctx, children, OID_AUTO, "resets", CTLFLAG_RD,
 	    &sc->watchdog.resets, 0, "Total number of watchdog resets");
 }
 
 /*
  * Build the device's sysctl tree.
  *
  * Adds, below the device's own sysctl node: the "debug" knob, the
  * "attachedSecs" and "intr_coalesce_us" handlers, a "ports" node with one
  * child per in-use port (all ports in dual-EMAC mode, otherwise only the
  * active slave) carrying an "uptime" handler, a "stats" node with one
  * 64-bit OID per entry of cpsw_stat_sysctls[], a "queue" node with "tx"
  * and "rx" children, and a "watchdog" node.
  */
 static void
 cpsw_add_sysctls(struct cpsw_softc *sc)
 {
 	struct sysctl_ctx_list *ctx;
 	struct sysctl_oid *stats_node, *queue_node, *node;
 	struct sysctl_oid_list *parent, *stats_parent, *queue_parent;
 	struct sysctl_oid_list *ports_parent, *port_parent;
 	char port[16];
 	int i;
 
 	ctx = device_get_sysctl_ctx(sc->dev);
 	parent = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
 
 	SYSCTL_ADD_INT(ctx, parent, OID_AUTO, "debug",
 	    CTLFLAG_RW, &sc->debug, 0, "Enable switch debug messages");
 
 	SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "attachedSecs",
 	    CTLTYPE_UINT | CTLFLAG_RD, sc, 0, cpsw_stat_attached, "IU",
 	    "Time since driver attach");
 
 	SYSCTL_ADD_PROC(ctx, parent, OID_AUTO, "intr_coalesce_us",
 	    CTLTYPE_UINT | CTLFLAG_RW, sc, 0, cpsw_intr_coalesce, "IU",
 	    "minimum time between interrupts");
 
 	node = SYSCTL_ADD_NODE(ctx, parent, OID_AUTO, "ports",
 	    CTLFLAG_RD, NULL, "CPSW Ports Statistics");
 	ports_parent = SYSCTL_CHILDREN(node);
 	for (i = 0; i < CPSW_PORTS; i++) {
 		if (!sc->dualemac && i != sc->active_slave)
 			continue;
 		/* The node name is the single-digit port index. */
 		port[0] = '0' + i;
 		port[1] = '\0';
 		node = SYSCTL_ADD_NODE(ctx, ports_parent, OID_AUTO,
 		    port, CTLFLAG_RD, NULL, "CPSW Port Statistics");
 		port_parent = SYSCTL_CHILDREN(node);
 		SYSCTL_ADD_PROC(ctx, port_parent, OID_AUTO, "uptime",
 		    CTLTYPE_UINT | CTLFLAG_RD, sc, i,
 		    cpsw_stat_uptime, "IU", "Seconds since driver init");
 	}
 
 	stats_node = SYSCTL_ADD_NODE(ctx, parent, OID_AUTO, "stats",
 	    CTLFLAG_RD, NULL, "CPSW Statistics");
 	stats_parent = SYSCTL_CHILDREN(stats_node);
 	for (i = 0; i < CPSW_SYSCTL_COUNT; ++i) {
 		/*
 		 * The OID number doubles as the index used by
 		 * cpsw_stats_sysctl().  The handler reports a uint64_t, so
 		 * the format is "QU" to agree with CTLTYPE_U64.
 		 */
 		SYSCTL_ADD_PROC(ctx, stats_parent, i,
 		    cpsw_stat_sysctls[i].oid,
 		    CTLTYPE_U64 | CTLFLAG_RD, sc, 0,
 		    cpsw_stats_sysctl, "QU",
 		    cpsw_stat_sysctls[i].oid);
 	}
 
 	queue_node = SYSCTL_ADD_NODE(ctx, parent, OID_AUTO, "queue",
 	    CTLFLAG_RD, NULL, "CPSW Queue Statistics");
 	queue_parent = SYSCTL_CHILDREN(queue_node);
 
 	node = SYSCTL_ADD_NODE(ctx, queue_parent, OID_AUTO, "tx",
 	    CTLFLAG_RD, NULL, "TX Queue Statistics");
 	cpsw_add_queue_sysctls(ctx, node, &sc->tx);
 
 	node = SYSCTL_ADD_NODE(ctx, queue_parent, OID_AUTO, "rx",
 	    CTLFLAG_RD, NULL, "RX Queue Statistics");
 	cpsw_add_queue_sysctls(ctx, node, &sc->rx);
 
 	node = SYSCTL_ADD_NODE(ctx, parent, OID_AUTO, "watchdog",
 	    CTLFLAG_RD, NULL, "Watchdog Statistics");
 	cpsw_add_watchdog_sysctls(ctx, node, sc);
 }
 
 #ifdef CPSW_ETHERSWITCH
 /*
  * Static switch description handed out by cpsw_getinfo(): CPSW_PORTS + 1
  * ports, CPSW_VLANS VLAN groups and 802.1Q VLAN capability.
  */
 static etherswitch_info_t etherswitch_info = {
 	.es_nports =		CPSW_PORTS + 1,
 	.es_nvlangroups =	CPSW_VLANS,
 	.es_name =		"TI Common Platform Ethernet Switch (CPSW)",
 	.es_vlan_caps =		ETHERSWITCH_VLAN_DOT1Q,
 };
 
 /* Return the static switch description; 'dev' is not consulted. */
 static etherswitch_info_t *
 cpsw_getinfo(device_t dev)
 {
 	return (&etherswitch_info);
 }
 
 /*
  * Report the state of switch port p->es_port.
  *
  * For the CPU port a fixed 1000baseT full-duplex, active media record is
  * synthesised; for the other ports the media is queried from the port's
  * MII via ifmedia_ioctl(SIOCGIFMEDIA).  In both cases the port VLAN id
  * and the drop-untagged / ingress-check flags are read from hardware.
  *
  * Returns ENXIO for an out-of-range port, otherwise the ifmedia_ioctl()
  * result (0 for the CPU port).
  */
 static int
 cpsw_getport(device_t dev, etherswitch_port_t *p)
 {
 	struct cpsw_softc *sc;
 	struct cpswp_softc *psc;
 	uint32_t reg;
 	int err;
 
 	if (p->es_port < 0 || p->es_port > CPSW_PORTS)
 		return (ENXIO);
 
 	sc = device_get_softc(dev);
 	err = 0;
 	if (p->es_port != CPSW_CPU_PORT) {
 		/* Switch port N is backed by sc->port[N - 1]. */
 		psc = device_get_softc(sc->port[p->es_port - 1].dev);
 		err = ifmedia_ioctl(psc->ifp, &p->es_ifr,
 		    &psc->mii->mii_media, SIOCGIFMEDIA);
 	} else {
 		p->es_flags |= ETHERSWITCH_PORT_CPU;
 		p->es_ifmr.ifm_active = IFM_ETHER | IFM_1000_T | IFM_FDX;
 		p->es_ifmr.ifm_current = p->es_ifmr.ifm_active;
 		p->es_ifmr.ifm_mask = 0;
 		p->es_ifmr.ifm_status = IFM_ACTIVE | IFM_AVALID;
 		p->es_ifmr.ifm_count = 0;
 	}
 
 	reg = cpsw_read_4(sc, CPSW_PORT_P_VLAN(p->es_port));
 	p->es_pvid = reg & ETHERSWITCH_VID_MASK;
 
 	reg = cpsw_read_4(sc, CPSW_ALE_PORTCTL(p->es_port));
 	if ((reg & ALE_PORTCTL_DROP_UNTAGGED) != 0)
 		p->es_flags |= ETHERSWITCH_PORT_DROPUNTAGGED;
 	if ((reg & ALE_PORTCTL_INGRESS) != 0)
 		p->es_flags |= ETHERSWITCH_PORT_INGRESS;
 
 	return (err);
 }
 
 /*
  * Apply the settings in 'p' to switch port p->es_port.
  *
  * A non-zero es_pvid is written to the port VLAN register; the
  * drop-untagged and ingress-check bits of the ALE port control register
  * are set or cleared to mirror es_flags; and, except for the CPU port,
  * the media request is forwarded to ifmedia_ioctl(SIOCSIFMEDIA).
  *
  * Returns ENXIO for an out-of-range port, 0 for the CPU port, otherwise
  * the ifmedia_ioctl() result.
  */
 static int
 cpsw_setport(device_t dev, etherswitch_port_t *p)
 {
 	struct cpsw_softc *sc;
 	struct cpswp_softc *psc;
 	uint32_t reg;
 
 	if (p->es_port < 0 || p->es_port > CPSW_PORTS)
 		return (ENXIO);
 
 	sc = device_get_softc(dev);
 	if (p->es_pvid != 0) {
 		cpsw_write_4(sc, CPSW_PORT_P_VLAN(p->es_port),
 		    p->es_pvid & ETHERSWITCH_VID_MASK);
 	}
 
 	/* Clear both policy bits, then set the ones that were requested. */
 	reg = cpsw_read_4(sc, CPSW_ALE_PORTCTL(p->es_port));
 	reg &= ~ALE_PORTCTL_DROP_UNTAGGED;
 	reg &= ~ALE_PORTCTL_INGRESS;
 	if ((p->es_flags & ETHERSWITCH_PORT_DROPUNTAGGED) != 0)
 		reg |= ALE_PORTCTL_DROP_UNTAGGED;
 	if ((p->es_flags & ETHERSWITCH_PORT_INGRESS) != 0)
 		reg |= ALE_PORTCTL_INGRESS;
 	cpsw_write_4(sc, CPSW_ALE_PORTCTL(p->es_port), reg);
 
 	/* CPU port does not allow media settings. */
 	if (p->es_port == CPSW_CPU_PORT)
 		return (0);
 
 	psc = device_get_softc(sc->port[p->es_port - 1].dev);
 
 	return (ifmedia_ioctl(psc->ifp, &p->es_ifr, &psc->mii->mii_media,
 	    SIOCSIFMEDIA));
 }
 
 /*
  * Report the switch configuration: only the VLAN mode is filled in, and it
  * is always 802.1Q.  'dev' is not consulted.  Always returns 0.
  */
 static int
 cpsw_getconf(device_t dev, etherswitch_conf_t *conf)
 {
 
 	/* Return the VLAN mode. */
 	conf->cmd = ETHERSWITCH_CONF_VLAN_MODE;
 	conf->vlan_mode = ETHERSWITCH_VLAN_DOT1Q;
 
 	return (0);
 }
 
 /*
  * Report the configuration of VLAN group vg->es_vlangroup.
  *
  * Returns EINVAL when the group number is >= CPSW_VLANS.  es_vid is set
  * to 0 and left that way when the group has no VLAN id assigned (-1) or
  * when no VLAN entry with that id exists in the ALE table.  Otherwise
  * the id (flagged valid), member ports and untagged ports are filled in
  * from the matching ALE VLAN entry.  The whole table is scanned, so a
  * later matching entry overrides an earlier one.
  */
 static int
 cpsw_getvgroup(device_t dev, etherswitch_vlangroup_t *vg)
 {
 	struct cpsw_softc *sc;
 	uint32_t ale_entry[3];
 	int idx, vid;
 
 	sc = device_get_softc(dev);
 
 	if (vg->es_vlangroup >= CPSW_VLANS)
 		return (EINVAL);
 
 	vg->es_vid = 0;
 	vid = cpsw_vgroups[vg->es_vlangroup].vid;
 	if (vid == -1)
 		return (0);
 
 	for (idx = 0; idx < CPSW_MAX_ALE_ENTRIES; idx++) {
 		cpsw_ale_read_entry(sc, idx, ale_entry);
 		if (ALE_TYPE(ale_entry) == ALE_TYPE_VLAN &&
 		    vid == ALE_VLAN(ale_entry)) {
 			vg->es_fid = 0;
 			vg->es_vid = ALE_VLAN(ale_entry) |
 			    ETHERSWITCH_VID_VALID;
 			vg->es_member_ports = ALE_VLAN_MEMBERS(ale_entry);
 			vg->es_untagged_ports = ALE_VLAN_UNTAG(ale_entry);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Remove the ALE VLAN entry for 'vlan', if any.  Only the first matching
  * VLAN entry is cleared (overwritten with zeros).
  */
 static void
 cpsw_remove_vlan(struct cpsw_softc *sc, int vlan)
 {
 	uint32_t ale_entry[3];
 	int idx;
 
 	for (idx = 0; idx < CPSW_MAX_ALE_ENTRIES; idx++) {
 		cpsw_ale_read_entry(sc, idx, ale_entry);
 		if (ALE_TYPE(ale_entry) == ALE_TYPE_VLAN &&
 		    vlan == ALE_VLAN(ale_entry)) {
 			ale_entry[0] = 0;
 			ale_entry[1] = 0;
 			ale_entry[2] = 0;
 			cpsw_ale_write_entry(sc, idx, ale_entry);
 			return;
 		}
 	}
 }
 
 /*
  * Configure VLAN group vg->es_vlangroup.
  *
  * An es_vid of 0 releases the group: its ALE VLAN entry is removed and
  * the group is marked unused.  Otherwise the VLAN id (masked to the valid
  * range) is bound to the group and the ALE VLAN entry is created or
  * updated with the requested member and untagged port masks.
  *
  * Returns EINVAL when the group number is out of range, when the VLAN id
  * is already used by another group, or when the group is already bound
  * to a different id; ENOMEM when the ALE table has no room for the entry;
  * 0 otherwise.
  */
 static int
 cpsw_setvgroup(device_t dev, etherswitch_vlangroup_t *vg)
 {
 	int i;
 	struct cpsw_softc *sc;
 
 	sc = device_get_softc(dev);
 
 	/*
 	 * es_vlangroup indexes cpsw_vgroups[] below, so reject values
 	 * outside the array (cpsw_getvgroup() applies the same upper bound).
 	 */
 	if (vg->es_vlangroup < 0 || vg->es_vlangroup >= CPSW_VLANS)
 		return (EINVAL);
 
 	for (i = 0; i < CPSW_VLANS; i++) {
 		/* Is this Vlan ID in use by another vlangroup ? */
 		if (vg->es_vlangroup != i && cpsw_vgroups[i].vid == vg->es_vid)
 			return (EINVAL);
 	}
 
 	/* A VLAN id of 0 means "release this group". */
 	if (vg->es_vid == 0) {
 		if (cpsw_vgroups[vg->es_vlangroup].vid == -1)
 			return (0);
 		cpsw_remove_vlan(sc, cpsw_vgroups[vg->es_vlangroup].vid);
 		cpsw_vgroups[vg->es_vlangroup].vid = -1;
 		vg->es_untagged_ports = 0;
 		vg->es_member_ports = 0;
 		vg->es_vid = 0;
 		return (0);
 	}
 
 	vg->es_vid &= ETHERSWITCH_VID_MASK;
 	vg->es_member_ports &= CPSW_PORTS_MASK;
 	vg->es_untagged_ports &= CPSW_PORTS_MASK;
 
 	/* A group that is already bound cannot be re-bound to another id. */
 	if (cpsw_vgroups[vg->es_vlangroup].vid != -1 &&
 	    cpsw_vgroups[vg->es_vlangroup].vid != vg->es_vid)
 		return (EINVAL);
 
 	/* Only record the binding once the hardware entry is in place. */
 	if (cpsw_ale_update_vlan_table(sc, vg->es_vid, vg->es_member_ports,
 	    vg->es_untagged_ports, vg->es_member_ports, 0) != 0)
 		return (ENOMEM);
 	cpsw_vgroups[vg->es_vlangroup].vid = vg->es_vid;
 
 	return (0);
 }
 
 /* Register read stub: ignores its arguments and returns 0. */
 static int
 cpsw_readreg(device_t dev, int addr)
 {
 
 	/* Not supported. */
 	return (0);
 }
 
 /* Register write stub: ignores its arguments and returns 0. */
 static int
 cpsw_writereg(device_t dev, int addr, int value)
 {
 
 	/* Not supported. */
 	return (0);
 }
 
 /* PHY register read stub: ignores its arguments and returns 0. */
 static int
 cpsw_readphy(device_t dev, int phy, int reg)
 {
 
 	/* Not supported. */
 	return (0);
 }
 
 /* PHY register write stub: ignores its arguments and returns 0. */
 static int
 cpsw_writephy(device_t dev, int phy, int reg, int data)
 {
 
 	/* Not supported. */
 	return (0);
 }
 #endif
Index: projects/fuse2/sys/arm/ti/ti_hwmods.c
===================================================================
--- projects/fuse2/sys/arm/ti/ti_hwmods.c	(revision 350434)
+++ projects/fuse2/sys/arm/ti/ti_hwmods.c	(revision 350435)
@@ -1,204 +1,214 @@
 /*-
  * Copyright (c) 2015 Oleksandr Tymoshenko <gonzo@freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 
 #include <dev/ofw/openfirm.h>
 #include <dev/ofw/ofw_bus.h>
 #include <dev/ofw/ofw_bus_subr.h>
 
 #include <machine/bus.h>
 #include <machine/fdt.h>
 
 #include <arm/ti/ti_prcm.h>
 #include <arm/ti/ti_hwmods.h>
 
 struct hwmod {
 	const char	*name;
 	int		clock_id;
 };
 
 /*
  * Table of known hwmod names and their clocks, searched linearly by
  * ti_hwmods_get_clock().  The entry with a NULL name terminates the table.
  */
 struct hwmod ti_hwmods[] = {
 	{"i2c1",	I2C1_CLK},
 	{"i2c2",	I2C2_CLK},
 	{"i2c3",	I2C3_CLK},
 	{"i2c4",	I2C4_CLK},
 	{"i2c5",	I2C5_CLK},
 
 	{"gpio1",	GPIO1_CLK},
 	{"gpio2",	GPIO2_CLK},
 	{"gpio3",	GPIO3_CLK},
 	{"gpio4",	GPIO4_CLK},
 	{"gpio5",	GPIO5_CLK},
 	{"gpio6",	GPIO6_CLK},
 	{"gpio7",	GPIO7_CLK},
 
 	{"mmc1",	MMC1_CLK},
 	{"mmc2",	MMC2_CLK},
 	{"mmc3",	MMC3_CLK},
 	{"mmc4",	MMC4_CLK},
 	{"mmc5",	MMC5_CLK},
 	{"mmc6",	MMC6_CLK},
 
 	{"epwmss0",	PWMSS0_CLK},
 	{"epwmss1",	PWMSS1_CLK},
 	{"epwmss2",	PWMSS2_CLK},
 
 	{"spi0",	SPI0_CLK},
 	{"spi1",	SPI1_CLK},
 
 	{"timer1",	TIMER1_CLK},
 	{"timer2",	TIMER2_CLK},
 	{"timer3",	TIMER3_CLK},
 	{"timer4",	TIMER4_CLK},
 	{"timer5",	TIMER5_CLK},
 	{"timer6",	TIMER6_CLK},
 	{"timer7",	TIMER7_CLK},
 
 	{"uart1",	UART1_CLK},
 	{"uart2",	UART2_CLK},
 	{"uart3",	UART3_CLK},
 	{"uart4",	UART4_CLK},
 	{"uart5",	UART5_CLK},
 	{"uart6",	UART6_CLK},
 	{"uart7",	UART7_CLK},
 
 	{NULL,		0}
 };
 
 /*
  * Fetch the "ti,hwmods" property for 'node' into *name.  The node itself
  * is tried first; if that lookup does not return a positive length, the
  * parent node is tried.  Returns the OF_getprop_alloc() result of
  * whichever lookup was used last.
  */
+static inline int
+ti_get_hwmods_prop(phandle_t node, void **name)
+{
+	int len;
+
+	if ((len = OF_getprop_alloc(node, "ti,hwmods", name)) > 0)
+		return (len);
+	return (OF_getprop_alloc(OF_parent(node), "ti,hwmods", name));
+}
+
 /*
  * Look up the clock identifier for 'dev' from its "ti,hwmods" property.
  *
  * The property is walked as a sequence of NUL-terminated strings; the
  * first one equal to a ti_hwmods[] name selects the clock.  Returns
  * INVALID_CLK_IDENT when the device has no OFW node, the property lookup
  * yields no data, or no string matches.  A warning is printed when
  * strings remain unexamined after the walk stops.
  */
 clk_ident_t
 ti_hwmods_get_clock(device_t dev)
 {
 	phandle_t node;
 	int len, l;
 	char *name;
 	char *buf;
 	int clk;
 	struct hwmod *hw;
 
 	if ((node = ofw_bus_get_node(dev)) == 0)
 		return (INVALID_CLK_IDENT);
 
-	if ((len = OF_getprop_alloc(OF_parent(node), "ti,hwmods", (void**)&name)) <= 0)
+	if ((len = ti_get_hwmods_prop(node, (void **)&name)) <= 0)
 		return (INVALID_CLK_IDENT);
 
 	/* Remember the start of the buffer; 'name' is advanced below. */
 	buf = name;
 
 	clk = INVALID_CLK_IDENT;
 	while ((len > 0) && (clk == INVALID_CLK_IDENT)) {
 		for (hw = ti_hwmods; hw->name != NULL; ++hw) {
 			if (strcmp(hw->name, name) == 0) {
 				clk = hw->clock_id;
 				break;
 			}
 		}
 
 		/* Slide to the next sub-string. */
 		l = strlen(name) + 1;
 		name += l;
 		len -= l;
 	}
 
 	if (len > 0)
 		device_printf(dev, "WARNING: more than one ti,hwmod \n");
 
 	OF_prop_free(buf);
 	return (clk);
 }
 
 /*
  * Return 1 when one of the strings in the device's "ti,hwmods" property
  * is exactly 'hwmod', otherwise 0 (also when the device has no OFW node
  * or the property lookup yields no data).
  */
 int ti_hwmods_contains(device_t dev, const char *hwmod)
 {
 	phandle_t node;
 	int len, l;
 	char *name;
 	char *buf;
 	int result;
 
 	if ((node = ofw_bus_get_node(dev)) == 0)
 		return (0);
 
-	if ((len = OF_getprop_alloc(OF_parent(node), "ti,hwmods", (void**)&name)) <= 0)
+	if ((len = ti_get_hwmods_prop(node, (void **)&name)) <= 0)
 		return (0);
 
 	/* Remember the start of the buffer; 'name' is advanced below. */
 	buf = name;
 
 	result = 0;
 	while (len > 0) {
 		if (strcmp(name, hwmod) == 0) {
 			result = 1;
 			break;
 		}
 
 		/* Slide to the next sub-string. */
 		l = strlen(name) + 1;
 		name += l;
 		len -= l;
 	}
 
 	OF_prop_free(buf);
 
 	return (result);
 }
 
 /*
  * Return the number that follows the prefix 'hwmod' in the device's
  * "ti,hwmods" property.
  *
  * The first string that starts with 'hwmod' is used; whatever follows the
  * prefix is parsed as a decimal number with strtoul().  Returns 0 when the
  * device has no OFW node, the property lookup yields no data, or no
  * string has the prefix, so a result of 0 does not by itself distinguish
  * "unit 0" from "not found".
  */
 int 
 ti_hwmods_get_unit(device_t dev, const char *hwmod)
 {
 	phandle_t node;
 	int l, len, hwmodlen, result;
 	char *name;
 	char *buf;
 
 	if ((node = ofw_bus_get_node(dev)) == 0)
 		return (0);
 
-	if ((len = OF_getprop_alloc(OF_parent(node), "ti,hwmods", (void**)&name)) <= 0)
+	if ((len = ti_get_hwmods_prop(node, (void **)&name)) <= 0)
 		return (0);
 
 	/* Remember the start of the buffer; 'name' is advanced below. */
 	buf = name;
 	hwmodlen = strlen(hwmod);
 	result = 0;
 	while (len > 0) {
 		if (strncmp(name, hwmod, hwmodlen) == 0) {
                         result = (int)strtoul(name + hwmodlen, NULL, 10);
 			break;
 		}
 		/* Slide to the next sub-string. */
 		l = strlen(name) + 1;
 		name += l;
 		len -= l;
 	}
 
 	OF_prop_free(buf);
 	return (result);
 }
Index: projects/fuse2/sys/arm64/arm64/pmap.c
===================================================================
--- projects/fuse2/sys/arm64/arm64/pmap.c	(revision 350434)
+++ projects/fuse2/sys/arm64/arm64/pmap.c	(revision 350435)
@@ -1,5930 +1,5930 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2003 Peter Wemm
  * All rights reserved.
  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  * Copyright (c) 2014 Andrew Turner
  * All rights reserved.
  * Copyright (c) 2014-2016 The FreeBSD Foundation
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * This software was developed by Andrew Turner under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
  * Safeport Network Services, and Network Associates Laboratories, the
  * Security Research Division of Network Associates, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  * CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  *	Manages physical address maps.
  *
  *	Since the information managed by this module is
  *	also stored by the logical address mapping module,
  *	this module may throw away valid virtual-to-physical
  *	mappings at almost any time.  However, invalidations
  *	of virtual-to-physical mappings must be done as
  *	requested.
  *
  *	In order to cope with hardware architectures which
  *	make virtual-to-physical map invalidates expensive,
  *	this module may delay invalidate or reduced protection
  *	operations until such time as they are actually
  *	necessary.  This module is given full information as
  *	to which processors are currently using which maps,
  *	and to when physical maps must be made correct.
  */
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/bitstring.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/_unrhdr.h>
 #include <sys/smp.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
 #include <machine/machdep.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 
 #include <arm/include/physmem.h>
 
 /*
  * Number of entries that fit in one page of an L0, L1, L2 or L3 table,
  * computed from PAGE_SIZE and the size of one entry.
  */
 #define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
 #define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
 #define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
 #define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))
 
 /*
  * NUL0E is L0_ENTRIES; NUL1E and NUL2E scale it by the per-page entry
  * counts of the following levels.
  * NOTE(review): presumably the total number of user L0/L1/L2 entries --
  * confirm against the users of these macros.
  */
 #define	NUL0E		L0_ENTRIES
 #define	NUL1E		(NUL0E * NL1PG)
 #define	NUL2E		(NUL1E * NL2PG)
 
 /*
  * PMAP_INLINE expands to an inline specifier when DIAGNOSTIC is not
  * defined (the gnu_inline attribute form if __GNUC_GNU_INLINE__ is
  * defined, "extern inline" otherwise) and to nothing when DIAGNOSTIC is
  * defined.
  */
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
 #else
 #define PMAP_INLINE	extern inline
 #endif
 #else
 #define PMAP_INLINE
 #endif
 
 /*
  * These are configured by the mair_el1 register. This is set up in locore.S
  */
 #define	DEVICE_MEMORY	0
 #define	UNCACHED_MEMORY	1
 #define	CACHED_MEMORY	2
 
 
 /*
  * PV_STAT(x) executes the statement x only when PV_STATS is defined;
  * otherwise it expands to an empty statement.
  */
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
 #define PV_STAT(x)	do { } while (0)
 #endif
 
 /* Index of the L2-sized region that contains v. */
 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
 /* pv_table entry for the L2-sized region that contains physical address pa. */
 #define	pa_to_pvh(pa)		(&pv_table[pmap_l2_pindex(pa)])
 
 /* Size of the pv_list_locks[] pool. */
 #define	NPV_LIST_LOCKS	MAXCPU
 
 /* Pick the pool lock for pa: pa_index(pa) modulo the pool size. */
 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
 			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
 
 /*
  * Make *lockp the write-locked pv list lock for pa.  If *lockp already is
  * that lock nothing happens; otherwise the currently held lock (if any) is
  * write-unlocked before the new one is write-locked and stored in *lockp.
  */
 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
 	struct rwlock **_lockp = (lockp);		\
 	struct rwlock *_new_lock;			\
 							\
 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
 	if (_new_lock != *_lockp) {			\
 		if (*_lockp != NULL)			\
 			rw_wunlock(*_lockp);		\
 		*_lockp = _new_lock;			\
 		rw_wlock(*_lockp);			\
 	}						\
 } while (0)
 
 /* Same as CHANGE_PV_LIST_LOCK_TO_PHYS(), keyed by the page's physical address. */
 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
 
 /* Write-unlock *lockp if it is non-NULL and reset it to NULL. */
 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
 	struct rwlock **_lockp = (lockp);		\
 							\
 	if (*_lockp != NULL) {				\
 		rw_wunlock(*_lockp);			\
 		*_lockp = NULL;				\
 	}						\
 } while (0)
 
 /* Pool lock for the physical address of page m. */
 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
 
 /*
  * The presence of this flag indicates that the mapping is writeable.
  * If the ATTR_AP_RO bit is also set, then the mapping is clean, otherwise it is
  * dirty.  This flag may only be set on managed mappings.
  *
  * The value is chosen at run time in pmap_bootstrap().
  */
 static pt_entry_t ATTR_SW_DBM;
 
 /* Storage for the kernel pmap; its pm_l0 is set in pmap_bootstrap(). */
 struct pmap kernel_pmap_store;
 
 /* Used for mapping ACPI memory before VM is initialized */
 #define	PMAP_PREINIT_MAPPING_COUNT	32
 #define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
 static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
 static int vm_initialized = 0;		/* No need to use pre-init maps when set */
 
 /*
  * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
  * Always map entire L2 block for simplicity.
  * VA of L2 block = preinit_map_va + i * L2_SIZE
  *
  * NOTE(review): pa/va/size presumably describe one recorded pre-init
  * mapping each; the code that fills them is not in this part of the file.
  */
 static struct pmap_preinit_mapping {
 	vm_paddr_t	pa;
 	vm_offset_t	va;
 	vm_size_t	size;
 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 /* Initialised to virtual_avail by pmap_bootstrap(). */
 vm_offset_t kernel_vm_end = 0;
 
 /*
  * Data for the pv entry allocation mechanism.
  *
  * pv_chunks_mutex, pv_list_locks[] and pv_table are initialised in
  * pmap_init(); pv_table holds one md_page per L2_SIZE of physical address
  * space up to the end of the last vm_phys segment.
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static struct mtx pv_chunks_mutex;
 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
 static struct md_page *pv_table;
 static struct md_page pv_dummy;
 
 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
 
 /* This code assumes all L1 DMAP entries will be used */
 CTASSERT((DMAP_MIN_ADDRESS  & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
 CTASSERT((DMAP_MAX_ADDRESS  & ~L0_OFFSET) == DMAP_MAX_ADDRESS);
 
 /*
  * Size of the DMAP VA range in L0_SIZE units; pmap_bootstrap_dmap() clears
  * PAGE_SIZE * DMAP_TABLES bytes of pagetable_dmap.
  */
 #define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
 extern pt_entry_t pagetable_dmap[];
 
 /*
  * physmap[] holds (start, end) physical address pairs filled in by
  * arm_physmem_avail() in pmap_bootstrap(); physmap_idx is the number of
  * pairs.
  */
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 static vm_paddr_t physmap[PHYSMAP_SIZE];
 static u_int physmap_idx;
 
 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
 
 /* Tunable; fetched explicitly in pmap_init() (CTLFLAG_NOFETCH). */
 static int superpages_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
     "Are large page mappings enabled?");
 
 /*
  * Internal flags for pmap_enter()'s helper functions.
  */
 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
 
 static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
 
 static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode);
 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
     vm_offset_t va, struct rwlock **lockp);
 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
     u_int flags, vm_page_t m, struct rwlock **lockp);
 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
     vm_page_t m, struct rwlock **lockp);
 
 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
 		struct rwlock **lockp);
 
 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct spglist *free);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
 
 /*
  * These load the old table data and store the new value.
  * They need to be atomic as the System MMU may write to the table at
  * the same time as the CPU.
  *
  *   pmap_clear(t)           store 0 to *t
  *   pmap_clear_bits(t, b)   atomically clear bits b in *t
  *   pmap_load(t)            read *t
  *   pmap_load_clear(t)      atomically swap 0 into *t, yielding the old value
  *   pmap_load_store(t, e)   atomically swap e into *t, yielding the old value
  *   pmap_set_bits(t, b)     atomically set bits b in *t
  *   pmap_store(t, e)        store e to *t
  *
  * NOTE(review): pmap_load() is a plain dereference rather than an
  * atomic_load_64(); confirm this is intended given the statement above.
  */
 #define	pmap_clear(table)		atomic_store_64(table, 0)
 #define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
 #define	pmap_load(table)		(*table)
 #define	pmap_load_clear(table)		atomic_swap_64(table, 0)
 #define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
 #define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
 #define	pmap_store(table, entry)	atomic_store_64(table, entry)
 
 /********************/
 /* Inline functions */
 /********************/
 
 /*
  * Copy one page (PAGE_SIZE bytes) from s to d.  Note the argument order:
  * source first, destination second.
  */
 static __inline void
 pagecopy(void *s, void *d)
 {
 
 	(void)memcpy(d, s, PAGE_SIZE);
 }
 
 /*
  * Return a pointer to the L0 entry of pmap that covers va.
  */
 static __inline pd_entry_t *
 pmap_l0(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l0_table;
 
 	l0_table = pmap->pm_l0;
 	return (l0_table + pmap_l0_index(va));
 }
 
 /*
  * Given an L0 entry, return a pointer to the L1 entry covering va.  The
  * L1 table is reached through the direct map using the address bits of
  * *l0; the caller is expected to have checked that *l0 is a table entry.
  */
 static __inline pd_entry_t *
 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
 {
 	vm_paddr_t l1_pa;
 
 	l1_pa = pmap_load(l0) & ~ATTR_MASK;
 	return ((pd_entry_t *)PHYS_TO_DMAP(l1_pa) + pmap_l1_index(va));
 }
 
 /*
  * Return a pointer to the L1 entry of pmap covering va, or NULL when the
  * L0 entry for va is not a table descriptor.
  */
 static __inline pd_entry_t *
 pmap_l1(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l0e = pmap_l0(pmap, va);
 
 	if ((pmap_load(l0e) & ATTR_DESCR_MASK) == L0_TABLE)
 		return (pmap_l0_to_l1(l0e, va));
 
 	return (NULL);
 }
 
 /*
  * Given an L1 entry, return a pointer to the L2 entry covering va.  The
  * L2 table is reached through the direct map using the address bits of
  * *l1; the caller is expected to have checked that *l1 is a table entry.
  */
 static __inline pd_entry_t *
 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
 {
 	vm_paddr_t l2_pa;
 
 	l2_pa = pmap_load(l1) & ~ATTR_MASK;
 	return ((pd_entry_t *)PHYS_TO_DMAP(l2_pa) + pmap_l2_index(va));
 }
 
 /*
  * Return a pointer to the L2 entry of pmap covering va, or NULL when no
  * L2 table exists for va (either the L0 entry or the L1 entry is not a
  * table descriptor).
  */
 static __inline pd_entry_t *
 pmap_l2(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l1;
 
 	l1 = pmap_l1(pmap, va);
 	/*
 	 * pmap_l1() returns NULL when the L0 entry is not a table; do not
 	 * dereference it in that case.
 	 */
 	if (l1 == NULL)
 		return (NULL);
 	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
 		return (NULL);
 
 	return (pmap_l1_to_l2(l1, va));
 }
 
 /*
  * Given an L2 entry, return a pointer to the L3 entry covering va.  The
  * L3 table is reached through the direct map using the address bits of
  * *l2; the caller is expected to have checked that *l2 is a table entry.
  */
 static __inline pt_entry_t *
 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
 {
 	vm_paddr_t l3_pa;
 
 	l3_pa = pmap_load(l2) & ~ATTR_MASK;
 	return ((pt_entry_t *)PHYS_TO_DMAP(l3_pa) + pmap_l3_index(va));
 }
 
 /*
  * Walk the table levels for va and return the deepest entry that was
  * reached through table descriptors only.  The entry below the returned
  * one may or may not be a valid page or block.
  *
  * *level is set to the level of the returned entry (0, 1 or 2).  When even
  * the L0 entry is not a table, NULL is returned and *level is -1.
  */
 static __inline pd_entry_t *
 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
 {
 	pd_entry_t *found, *cur;
 	int lvl;
 
 	found = NULL;
 	lvl = -1;
 
 	cur = pmap_l0(pmap, va);
 	if ((pmap_load(cur) & ATTR_DESCR_MASK) == L0_TABLE) {
 		found = cur;
 		lvl = 0;
 
 		cur = pmap_l0_to_l1(found, va);
 		if ((pmap_load(cur) & ATTR_DESCR_MASK) == L1_TABLE) {
 			found = cur;
 			lvl = 1;
 
 			cur = pmap_l1_to_l2(found, va);
 			if ((pmap_load(cur) & ATTR_DESCR_MASK) == L2_TABLE) {
 				found = cur;
 				lvl = 2;
 			}
 		}
 	}
 
 	*level = lvl;
 	return (found);
 }
 
 /*
  * Look up the block or page entry that maps va.
  *
  * On success the entry is returned and *level is its level: 1 or 2 for a
  * block, 3 for a page.  When no valid mapping exists NULL is returned and
  * *level is the first level at which the walk found an invalid entry
  * (0 when there is no L1 table at all).
  */
 static __inline pt_entry_t *
 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
 {
 	pd_entry_t *l1, *l2, l1_desc, l2_desc;
 	pt_entry_t *l3;
 
 	l1 = pmap_l1(pmap, va);
 	if (l1 == NULL) {
 		*level = 0;
 		return (NULL);
 	}
 
 	/* Level 1: either a block, a table to descend into, or invalid. */
 	*level = 1;
 	l1_desc = pmap_load(l1) & ATTR_DESCR_MASK;
 	if (l1_desc == L1_BLOCK)
 		return (l1);
 	if (l1_desc != L1_TABLE)
 		return (NULL);
 
 	/* Level 2: same three possibilities. */
 	*level = 2;
 	l2 = pmap_l1_to_l2(l1, va);
 	l2_desc = pmap_load(l2) & ATTR_DESCR_MASK;
 	if (l2_desc == L2_BLOCK)
 		return (l2);
 	if (l2_desc != L2_TABLE)
 		return (NULL);
 
 	/* Level 3: only a page descriptor counts as valid. */
 	*level = 3;
 	l3 = pmap_l2_to_l3(l2, va);
 	return ((pmap_load(l3) & ATTR_DESCR_MASK) == L3_PAGE ? l3 : NULL);
 }
 
 /*
  * Report whether superpage mappings are enabled.  The pmap argument is not
  * consulted; the answer comes from the superpages_enabled setting.
  */
 bool
 pmap_ps_enabled(pmap_t pmap __unused)
 {
 
 	return (superpages_enabled != 0 ? true : false);
 }
 
 /*
  * Walk pmap's tables for va and report the entry pointer found at each
  * level through *l0 .. *l3.
  *
  * Returns true when the walk ends at an L1 block, an L2 block, or reaches
  * the L3 level; output pointers for levels below a block are set to NULL.
  * Returns false when pmap has no L0 table or an entry on the way is
  * neither a block nor a table; in that case only the outputs for the
  * levels actually visited have been written.
  */
 bool
 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
     pd_entry_t **l2, pt_entry_t **l3)
 {
 	pd_entry_t *l0e, *l1e, *l2e;
 
 	if (pmap->pm_l0 == NULL)
 		return (false);
 
 	/* Level 0 must be a table. */
 	*l0 = l0e = pmap_l0(pmap, va);
 	if ((pmap_load(l0e) & ATTR_DESCR_MASK) != L0_TABLE)
 		return (false);
 
 	/* Level 1 may be a block (walk ends) or a table. */
 	*l1 = l1e = pmap_l0_to_l1(l0e, va);
 	if ((pmap_load(l1e) & ATTR_DESCR_MASK) == L1_BLOCK) {
 		*l2 = NULL;
 		*l3 = NULL;
 		return (true);
 	} else if ((pmap_load(l1e) & ATTR_DESCR_MASK) != L1_TABLE) {
 		return (false);
 	}
 
 	/* Level 2 may be a block (walk ends) or a table. */
 	*l2 = l2e = pmap_l1_to_l2(l1e, va);
 	if ((pmap_load(l2e) & ATTR_DESCR_MASK) == L2_BLOCK) {
 		*l3 = NULL;
 		return (true);
 	} else if ((pmap_load(l2e) & ATTR_DESCR_MASK) != L2_TABLE) {
 		return (false);
 	}
 
 	/* The L3 entry is reported without checking its validity. */
 	*l3 = pmap_l2_to_l3(l2e, va);
 
 	return (true);
 }
 
 /*
  * Return non-zero when the L3 entry value l3 is a page descriptor.
  */
 static __inline int
 pmap_l3_valid(pt_entry_t l3)
 {
 	pt_entry_t desc;
 
 	desc = l3 & ATTR_DESCR_MASK;
 	return (desc == L3_PAGE);
 }
 
 
 /*
  * Code in this file checks both L1 and L2 block entries against L1_BLOCK
  * (see pmap_extract_and_hold()), so the two encodings must be equal.
  */
 CTASSERT(L1_BLOCK == L2_BLOCK);
 
 /*
  * Return non-zero when the managed entry value pte is dirty, i.e. when it
  * carries ATTR_SW_DBM and its access-permission bits say read/write.
  * Asserts that pte is managed and that the tested bits are not all clear.
  */
 static inline int
 pmap_pte_dirty(pt_entry_t pte)
 {
 	pt_entry_t wbits;
 
 	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
 
 	wbits = pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM);
 	KASSERT(wbits != 0,
 	    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
 
 	return (wbits == (ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM));
 }
 
 /*
  * Add count to pmap's resident page count.  The pmap lock must be held.
  */
 static __inline void
 pmap_resident_count_inc(pmap_t pmap, int count)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	pmap->pm_stats.resident_count = pmap->pm_stats.resident_count + count;
 }
 
 /*
  * Subtract count from pmap's resident page count.  The pmap lock must be
  * held, and the count must not drop below zero (asserted).
  */
 static __inline void
 pmap_resident_count_dec(pmap_t pmap, int count)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(pmap->pm_stats.resident_count >= count,
 	    ("pmap %p resident count underflow %ld %d", pmap,
 	    pmap->pm_stats.resident_count, count));
 	pmap->pm_stats.resident_count = pmap->pm_stats.resident_count - count;
 }
 
 /*
  * Compute the bootstrap L1 and L2 slot numbers for va and return the
  * bootstrap L2 table.
  *
  * l1pt is the address of the bootstrap L1 table; it is only used to assert
  * that the slot for va holds a table descriptor.  The returned L2 table is
  * always the one at init_pt_va.
  */
 static pt_entry_t *
 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
     u_int *l2_slot)
 {
 	pd_entry_t *l1_table;
 	u_int slot;
 
 	l1_table = (pd_entry_t *)l1pt;
 	slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
 	*l1_slot = slot;
 
 	/* locore is expected to have installed a table entry here. */
 	KASSERT((l1_table[slot] & ATTR_DESCR_MASK) == L1_TABLE,
 	   ("Invalid bootstrap L1 table"));
 
 	*l2_slot = pmap_l2_index(va);
 
 	/* The bootstrap L2 table lives at init_pt_va. */
 	return ((pt_entry_t *)init_pt_va);
 }
 
 /*
  * Translate va to a physical address using the bootstrap L2 table: the
  * address bits of the L2 entry for va plus va's offset within the L2
  * block.
  */
 static vm_paddr_t
 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
 {
 	pt_entry_t *l2_table;
 	vm_paddr_t block_pa;
 	u_int l1_idx, l2_idx;
 
 	l2_table = pmap_early_page_idx(l1pt, va, &l1_idx, &l2_idx);
 	block_pa = l2_table[l2_idx] & ~ATTR_MASK;
 
 	return (block_pa + (va & L2_OFFSET));
 }
 
 /*
  * Build the direct map in pagetable_dmap for every range in physmap[].
  *
  * Each range is mapped in up to three pieces: an unaligned head using L2
  * blocks, an L1-aligned middle using L1 blocks while at least L1_SIZE
  * remains, and a tail using L2 blocks.  L2 tables are taken one page at a
  * time from freemempos.
  *
  * kern_l1 is the bootstrap L1 table used to translate the new L2 tables'
  * addresses, min_pa the lowest physical address to cover.  Sets
  * dmap_phys_base, dmap_phys_max and dmap_max_addr, and returns the
  * updated freemempos.
  */
 static vm_offset_t
 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa,
     vm_offset_t freemempos)
 {
 	pt_entry_t *l2;
 	vm_offset_t va;
 	vm_paddr_t l2_pa, pa;
 	u_int l1_slot, l2_slot, prev_l1_slot;
 	int i;
 
 	dmap_phys_base = min_pa & ~L1_OFFSET;
 	dmap_phys_max = 0;
 	dmap_max_addr = 0;
 	l2 = NULL;
 	/* No L2 table allocated yet; -1 wraps to a slot that never matches. */
 	prev_l1_slot = -1;
 
 /* NOTE(review): repeats the identical file-scope definition of DMAP_TABLES. */
 #define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
 	memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES);
 
 	/* physmap[] holds (start, end) pairs. */
 	for (i = 0; i < (physmap_idx * 2); i += 2) {
 		pa = physmap[i] & ~L2_OFFSET;
 		va = pa - dmap_phys_base + DMAP_MIN_ADDRESS;
 
 		/* Create L2 mappings at the start of the region */
 		if ((pa & L1_OFFSET) != 0) {
 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
 			/* Reuse the L2 table if the previous piece used this slot. */
 			if (l1_slot != prev_l1_slot) {
 				prev_l1_slot = l1_slot;
 				l2 = (pt_entry_t *)freemempos;
 				l2_pa = pmap_early_vtophys(kern_l1,
 				    (vm_offset_t)l2);
 				freemempos += PAGE_SIZE;
 
 				pmap_store(&pagetable_dmap[l1_slot],
 				    (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);
 
 				memset(l2, 0, PAGE_SIZE);
 			}
 			KASSERT(l2 != NULL,
 			    ("pmap_bootstrap_dmap: NULL l2 map"));
 			for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
 			    pa += L2_SIZE, va += L2_SIZE) {
 				/*
 				 * We are on a boundary, stop to
 				 * create a level 1 block
 				 */
 				if ((pa & L1_OFFSET) == 0)
 					break;
 
 				l2_slot = pmap_l2_index(va);
 				KASSERT(l2_slot != 0, ("..."));
 				pmap_store(&l2[l2_slot],
 				    (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN |
 				    ATTR_IDX(CACHED_MEMORY) | L2_BLOCK);
 			}
 			KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS),
 			    ("..."));
 		}
 
 		/* Map the aligned middle with L1 blocks while a full one fits. */
 		for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] &&
 		    (physmap[i + 1] - pa) >= L1_SIZE;
 		    pa += L1_SIZE, va += L1_SIZE) {
 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
 			pmap_store(&pagetable_dmap[l1_slot],
 			    (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN |
 			    ATTR_IDX(CACHED_MEMORY) | L1_BLOCK);
 		}
 
 		/* Create L2 mappings at the end of the region */
 		if (pa < physmap[i + 1]) {
 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
 			if (l1_slot != prev_l1_slot) {
 				prev_l1_slot = l1_slot;
 				l2 = (pt_entry_t *)freemempos;
 				l2_pa = pmap_early_vtophys(kern_l1,
 				    (vm_offset_t)l2);
 				freemempos += PAGE_SIZE;
 
 				pmap_store(&pagetable_dmap[l1_slot],
 				    (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);
 
 				memset(l2, 0, PAGE_SIZE);
 			}
 			KASSERT(l2 != NULL,
 			    ("pmap_bootstrap_dmap: NULL l2 map"));
 			for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
 			    pa += L2_SIZE, va += L2_SIZE) {
 				l2_slot = pmap_l2_index(va);
 				pmap_store(&l2[l2_slot],
 				    (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN |
 				    ATTR_IDX(CACHED_MEMORY) | L2_BLOCK);
 			}
 		}
 
 		/* Track the highest physical/virtual address mapped so far. */
 		if (pa > dmap_phys_max) {
 			dmap_phys_max = pa;
 			dmap_max_addr = va;
 		}
 	}
 
 	cpu_tlb_flushID();
 
 	return (freemempos);
 }
 
 /*
  * Install an L2 table in the bootstrap L1 table for every L1 slot from va
  * up to VM_MAX_KERNEL_ADDRESS.  The tables are taken consecutively from
  * l2_start, one page each, and are zeroed once all entries are installed.
  *
  * va must be L1_SIZE aligned.  Returns the first address past the last
  * table that was used.
  */
 static vm_offset_t
 pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
 {
 	pd_entry_t *l1_table;
 	vm_offset_t next_l2;
 	vm_paddr_t l2_pa;
 	u_int slot;
 
 	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
 
 	l1_table = (pd_entry_t *)l1pt;
 	next_l2 = l2_start;
 	slot = pmap_l1_index(va);
 
 	while (va < VM_MAX_KERNEL_ADDRESS) {
 		KASSERT(slot < Ln_ENTRIES, ("Invalid L1 index"));
 
 		l2_pa = pmap_early_vtophys(l1pt, next_l2);
 		pmap_store(&l1_table[slot],
 		    (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);
 
 		next_l2 += PAGE_SIZE;
 		slot++;
 		va += L1_SIZE;
 	}
 
 	/* Zero every L2 table handed out above. */
 	memset((void *)l2_start, 0, next_l2 - l2_start);
 
 	return (next_l2);
 }
 
 /*
  * Install an L3 table in the kernel L2 table for every L2 slot from va up
  * to VM_MAX_KERNEL_ADDRESS.  The tables are taken consecutively from
  * l3_start, one page each, and are zeroed once all entries are installed.
  *
  * va must be L2_SIZE aligned.  Returns the first address past the last
  * table that was used.
  */
 static vm_offset_t
 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
 {
 	pd_entry_t *l2_table;
 	vm_offset_t next_l3;
 	vm_paddr_t l3_pa;
 	u_int slot;
 
 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
 
 	/* Locate the L2 entry for va, then step back to the table's base. */
 	l2_table = pmap_l2(kernel_pmap, va);
 	l2_table = (pd_entry_t *)rounddown2((uintptr_t)l2_table, PAGE_SIZE);
 	slot = pmap_l2_index(va);
 	next_l3 = l3_start;
 
 	while (va < VM_MAX_KERNEL_ADDRESS) {
 		KASSERT(slot < Ln_ENTRIES, ("Invalid L2 index"));
 
 		l3_pa = pmap_early_vtophys(l1pt, next_l3);
 		pmap_store(&l2_table[slot],
 		    (l3_pa & ~Ln_TABLE_MASK) | L2_TABLE);
 
 		next_l3 += PAGE_SIZE;
 		slot++;
 		va += L2_SIZE;
 	}
 
 	/* Zero every L3 table handed out above. */
 	memset((void *)l3_start, 0, next_l3 - l3_start);
 
 	return (next_l3);
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	l0pt and l1pt are the addresses of the bootstrap L0 and L1 tables,
  *	kernstart is the physical address the kernel was loaded at and
  *	kernlen its length.  Memory for page tables, the dynamic per-cpu
  *	area and the message buffer is carved sequentially from freemempos,
  *	which starts at the page-rounded end of the kernel.
  */
 void
 pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
     vm_size_t kernlen)
 {
 	u_int l1_slot, l2_slot;
 	pt_entry_t *l2;
 	vm_offset_t va, freemempos;
 	vm_offset_t dpcpu, msgbufpv;
 	vm_paddr_t start_pa, pa, min_pa;
 	uint64_t kern_delta;
 	int i;
 
 #ifdef notyet
 	/* Determine whether the hardware implements DBM management. */
 	uint64_t reg = READ_SPECIALREG(ID_AA64MMFR1_EL1);
 	ATTR_SW_DBM = ID_AA64MMFR1_HAFDBS(reg) == ID_AA64MMFR1_HAFDBS_AF_DBS ?
 	    ATTR_DBM : _ATTR_SW_DBM;
 #else
 	ATTR_SW_DBM = _ATTR_SW_DBM;
 #endif
 
 	/* Offset between the kernel's virtual base and its load address. */
 	kern_delta = KERNBASE - kernstart;
 
 	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
 	printf("%lx\n", l1pt);
 	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
 
 	/* Set this early so we can use the pagetable walking functions */
 	kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
 	PMAP_LOCK_INIT(kernel_pmap);
 
 	/* Assume the address we were loaded to is a valid physical address */
 	min_pa = KERNBASE - kern_delta;
 
 	/* arm_physmem_avail() returns a count of array slots; keep pairs. */
 	physmap_idx = arm_physmem_avail(physmap, nitems(physmap));
 	physmap_idx /= 2;
 
 	/*
 	 * Find the minimum physical address. physmap is sorted,
 	 * but may contain empty ranges.
 	 */
 	for (i = 0; i < (physmap_idx * 2); i += 2) {
 		if (physmap[i] == physmap[i + 1])
 			continue;
 		if (physmap[i] <= min_pa)
 			min_pa = physmap[i];
 	}
 
 	freemempos = KERNBASE + kernlen;
 	freemempos = roundup2(freemempos, PAGE_SIZE);
 
 	/* Create a direct map region early so we can use it for pa -> va */
 	freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos);
 
 	va = KERNBASE;
 	start_pa = pa = KERNBASE - kern_delta;
 
 	/*
 	 * Read the page table to find out what is already mapped.
 	 * This assumes we have mapped a block of memory from KERNBASE
 	 * using a single L1 entry.
 	 */
 	l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
 
 	/* Sanity check the index, KERNBASE should be the first VA */
 	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
 
 	/* Find how many pages we have mapped */
 	for (; l2_slot < Ln_ENTRIES; l2_slot++) {
 		if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0)
 			break;
 
 		/* Check locore used L2 blocks */
 		KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK,
 		    ("Invalid bootstrap L2 table"));
 		KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa,
 		    ("Incorrect PA in L2 table"));
 
 		va += L2_SIZE;
 		pa += L2_SIZE;
 	}
 
 	va = roundup2(va, L1_SIZE);
 
 	/* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */
 	freemempos = pmap_bootstrap_l2(l1pt, va, freemempos);
 	/* And the l3 tables for the early devmap */
 	freemempos = pmap_bootstrap_l3(l1pt,
 	    VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos);
 
 	cpu_tlb_flushID();
 
 /*
  * Take np zeroed pages from freemempos and store their address in var.
  * NOTE(review): expands to several statements without a do/while wrapper
  * and uses np unparenthesized in one place; only safe as used below.
  */
 #define alloc_pages(var, np)						\
 	(var) = freemempos;						\
 	freemempos += (np * PAGE_SIZE);					\
 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
 
 	/* Allocate dynamic per-cpu area. */
 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
 	dpcpu_init((void *)dpcpu, 0);
 
 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
 	msgbufp = (void *)msgbufpv;
 
 	/* Reserve some VA space for early BIOS/ACPI mapping */
 	preinit_map_va = roundup2(freemempos, L2_SIZE);
 
 	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
 	virtual_avail = roundup2(virtual_avail, L1_SIZE);
 	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
 	kernel_vm_end = virtual_avail;
 
 	/*
 	 * Exclude everything from the start of the kernel up to the end of
 	 * the memory consumed above from the allocatable physical memory.
 	 */
 	pa = pmap_early_vtophys(l1pt, freemempos);
 
 	arm_physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
 
 	cpu_tlb_flushID();
 }
 
 /*
  *	Set up the machine-dependent part of a vm_page: the memory
  *	attribute defaults to write-back and the pv list starts out empty.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
 	TAILQ_INIT(&m->md.pv_list);
 }
 
 /*
  *	Initialize the pmap module.
  *	Called by vm_init, to initialize any structures that the pmap
  *	system needs to map virtual memory.
  *
  *	Fetches the superpage tunable, initialises the pv chunk mutex and
  *	the pv list lock pool, allocates the superpage pv head table and
  *	finally marks the VM as initialised.
  */
 void
 pmap_init(void)
 {
 	vm_size_t table_bytes;
 	int idx, pv_npg;
 
 	/* Superpages: honour the tunable and publish the large page size. */
 	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
 	if (superpages_enabled) {
 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 		    ("pmap_init: can't assign to pagesizes[1]"));
 		pagesizes[1] = L2_SIZE;
 	}
 
 	/* Lock protecting the pv chunk list. */
 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
 
 	/* Pool of pv list locks. */
 	idx = 0;
 	while (idx < NPV_LIST_LOCKS) {
 		rw_init(&pv_list_locks[idx], "pmap pv list");
 		idx++;
 	}
 
 	/*
 	 * The superpage pv head table needs one entry per L2_SIZE of
 	 * physical address space up to the end of the last segment.
 	 */
 	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
 	table_bytes = (vm_size_t)(pv_npg * sizeof(struct md_page));
 	table_bytes = round_page(table_bytes);
 	pv_table = (struct md_page *)kmem_malloc(table_bytes,
 	    M_WAITOK | M_ZERO);
 
 	idx = 0;
 	while (idx < pv_npg) {
 		TAILQ_INIT(&pv_table[idx].pv_list);
 		idx++;
 	}
 	TAILQ_INIT(&pv_dummy.pv_list);
 
 	vm_initialized = 1;
 }
 
 /*
  * Read-only counters exported under vm.pmap.l2: demotions, mappings,
  * promotion failures and promotions of 2MB pages.
  */
 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
     "2MB page mapping counters");
 
 static u_long pmap_l2_demotions;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
     &pmap_l2_demotions, 0, "2MB page demotions");
 
 static u_long pmap_l2_mappings;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
     &pmap_l2_mappings, 0, "2MB page mappings");
 
 static u_long pmap_l2_p_failures;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
     &pmap_l2_p_failures, 0, "2MB page promotion failures");
 
 static u_long pmap_l2_promotions;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
     &pmap_l2_promotions, 0, "2MB page promotions");
 
 /*
  * Invalidate a single TLB entry.
  *
  * Issues "tlbi vaae1is" for the page containing va (operand is
  * va >> PAGE_SHIFT), bracketed by a store barrier before and a full
  * barrier plus isb after.  The thread is pinned for the duration.  The
  * pmap argument is not used by this implementation.
  */
 static __inline void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 
 	sched_pin();
 	__asm __volatile(
 	    "dsb  ishst		\n"
 	    "tlbi vaae1is, %0	\n"
 	    "dsb  ish		\n"
 	    "isb		\n"
 	    : : "r"(va >> PAGE_SHIFT));
 	sched_unpin();
 }
 
 /*
  * Invalidate the TLB entries for every page in [sva, eva) with one
  * "tlbi vaae1is" per PAGE_SIZE step, using a single barrier pair around
  * the whole loop.  Unlike pmap_invalidate_range() this does not pin the
  * thread itself.  The pmap argument is not used by this implementation.
  */
 static __inline void
 pmap_invalidate_range_nopin(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t addr;
 
 	dsb(ishst);
 	for (addr = sva; addr < eva; addr += PAGE_SIZE) {
 		__asm __volatile(
 		    "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT));
 	}
 	__asm __volatile(
 	    "dsb  ish	\n"
 	    "isb	\n");
 }
 
 /*
  * Invalidate the TLB entries for [sva, eva) with the thread pinned; a thin
  * wrapper around pmap_invalidate_range_nopin().
  */
 static __inline void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 
 	sched_pin();
 	pmap_invalidate_range_nopin(pmap, sva, eva);
 	sched_unpin();
 }
 
 /*
  * Invalidate TLB entries with "tlbi vmalle1is", bracketed by barriers and
  * with the thread pinned.  The pmap argument is not used, so the
  * invalidation is not restricted to that pmap.
  */
 static __inline void
 pmap_invalidate_all(pmap_t pmap)
 {
 
 	sched_pin();
 	__asm __volatile(
 	    "dsb  ishst		\n"
 	    "tlbi vmalle1is	\n"
 	    "dsb  ish		\n"
 	    "isb		\n");
 	sched_unpin();
 }
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Return the physical address that va maps to in pmap,
  *		or 0 when va has no valid block or page mapping.  The
  *		lookup runs with the pmap lock held.
  */
 vm_paddr_t
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	pt_entry_t *ptep, entry;
 	vm_paddr_t phys;
 	int level;
 
 	phys = 0;
 	PMAP_LOCK(pmap);
 
 	/* pmap_pte() yields a valid block/page entry or NULL. */
 	ptep = pmap_pte(pmap, va, &level);
 	if (ptep != NULL) {
 		entry = pmap_load(ptep);
 
 		/* Frame address from the entry, offset from va. */
 		phys = entry & ~ATTR_MASK;
 		if (level == 1) {
 			KASSERT((entry & ATTR_DESCR_MASK) == L1_BLOCK,
 			    ("pmap_extract: Invalid L1 pte found: %lx",
 			    entry & ATTR_DESCR_MASK));
 			phys |= (va & L1_OFFSET);
 		} else if (level == 2) {
 			KASSERT((entry & ATTR_DESCR_MASK) == L2_BLOCK,
 			    ("pmap_extract: Invalid L2 pte found: %lx",
 			    entry & ATTR_DESCR_MASK));
 			phys |= (va & L2_OFFSET);
 		} else if (level == 3) {
 			KASSERT((entry & ATTR_DESCR_MASK) == L3_PAGE,
 			    ("pmap_extract: Invalid L3 pte found: %lx",
 			    entry & ATTR_DESCR_MASK));
 			phys |= (va & L3_OFFSET);
 		}
 	}
 
 	PMAP_UNLOCK(pmap);
 	return (phys);
 }
 
 /*
  *	Routine:	pmap_extract_and_hold
  *	Function:
  *		Atomically extract and hold the physical page
  *		with the given pmap and virtual address pair
  *		if that mapping permits the given protection.
  *
  *		Returns the wired vm_page, or NULL when va has no
  *		valid mapping or write access was requested on a
  *		mapping that is not read/write.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pt_entry_t *pte, tpte;
 	vm_offset_t off;
 	vm_paddr_t pa;
 	vm_page_t m;
 	int lvl;
 
 	pa = 0;
 	m = NULL;
 	PMAP_LOCK(pmap);
 retry:
 	pte = pmap_pte(pmap, va, &lvl);
 	if (pte != NULL) {
 		tpte = pmap_load(pte);
 
 		KASSERT(lvl > 0 && lvl <= 3,
 		    ("pmap_extract_and_hold: Invalid level %d", lvl));
 		CTASSERT(L1_BLOCK == L2_BLOCK);
 		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
 		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
 		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
 		     tpte & ATTR_DESCR_MASK));
 		/* Proceed if the mapping is read/write or no write was asked for. */
 		if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) ||
 		    ((prot & VM_PROT_WRITE) == 0)) {
 			/* Offset of va inside an L1/L2 block; 0 for a page. */
 			switch(lvl) {
 			case 1:
 				off = va & L1_OFFSET;
 				break;
 			case 2:
 				off = va & L2_OFFSET;
 				break;
 			case 3:
 			default:
 				off = 0;
 			}
 			/*
 			 * NOTE(review): presumably a non-zero return means the
 			 * pmap lock was dropped while taking the page lock, so
 			 * the lookup must be redone -- confirm against
 			 * vm_page_pa_tryrelock().
 			 */
 			if (vm_page_pa_tryrelock(pmap,
 			    (tpte & ~ATTR_MASK) | off, &pa))
 				goto retry;
 			m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
 			vm_page_wire(m);
 		}
 	}
 	PA_UNLOCK_COND(pa);
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
 
 /*
  * Translate a kernel virtual address into a physical address.
  *
  * Addresses inside the direct map are converted with DMAP_TO_PHYS().
  * Any other address is looked up in the kernel pmap's page tables; 0 is
  * returned when pmap_pte() finds no entry for it.
  */
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 	pt_entry_t *ptep, entry;
 	vm_paddr_t phys;
 	int level;
 
 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 		return (DMAP_TO_PHYS(va));
 
 	ptep = pmap_pte(kernel_pmap, va, &level);
 	if (ptep == NULL)
 		return (0);
 
 	/* Output address from the entry plus the offset inside the mapping. */
 	entry = pmap_load(ptep);
 	phys = entry & ~ATTR_MASK;
 	switch (level) {
 	case 1:
 		KASSERT((entry & ATTR_DESCR_MASK) == L1_BLOCK,
 		    ("pmap_kextract: Invalid L1 pte found: %lx",
 		    entry & ATTR_DESCR_MASK));
 		phys |= (va & L1_OFFSET);
 		break;
 	case 2:
 		KASSERT((entry & ATTR_DESCR_MASK) == L2_BLOCK,
 		    ("pmap_kextract: Invalid L2 pte found: %lx",
 		    entry & ATTR_DESCR_MASK));
 		phys |= (va & L2_OFFSET);
 		break;
 	case 3:
 		KASSERT((entry & ATTR_DESCR_MASK) == L3_PAGE,
 		    ("pmap_kextract: Invalid L3 pte found: %lx",
 		    entry & ATTR_DESCR_MASK));
 		phys |= (va & L3_OFFSET);
 		break;
 	}
 	return (phys);
 }
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 /*
  * Map the physical range [pa, pa + size) at kernel virtual address sva
  * using level 3 page entries with the memory attribute index "mode".
  * DEVICE_MEMORY mappings additionally get ATTR_XN.  sva and pa must be
  * page aligned and size a multiple of the page size; the level 3 table
  * covering each page must already exist.  The whole range is invalidated
  * once all entries have been written.
  */
 void
 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
 {
 	pd_entry_t *l2e;
 	pt_entry_t *l3e, l3_attr;
 	vm_offset_t cur;
 	int level;
 
 	KASSERT((pa & L3_OFFSET) == 0,
 	   ("pmap_kenter: Invalid physical address"));
 	KASSERT((sva & L3_OFFSET) == 0,
 	   ("pmap_kenter: Invalid virtual address"));
 	KASSERT((size & PAGE_MASK) == 0,
 	    ("pmap_kenter: Mapping is not page-sized"));
 
 	/* Attribute bits shared by every entry written below. */
 	l3_attr = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE;
 	if (mode == DEVICE_MEMORY)
 		l3_attr |= ATTR_XN;
 
 	for (cur = sva; size != 0;
 	    cur += PAGE_SIZE, pa += PAGE_SIZE, size -= PAGE_SIZE) {
 		l2e = pmap_pde(kernel_pmap, cur, &level);
 		KASSERT(l2e != NULL,
 		    ("pmap_kenter: Invalid page entry, va: 0x%lx", cur));
 		KASSERT(level == 2, ("pmap_kenter: Invalid level %d", level));
 
 		l3e = pmap_l2_to_l3(l2e, cur);
 		pmap_load_store(l3e, (pa & ~L3_OFFSET) | l3_attr);
 	}
 	pmap_invalidate_range(kernel_pmap, sva, cur);
 }
 
 /*
  * Map the physical range [pa, pa + size) at kernel virtual address sva
  * with the DEVICE_MEMORY memory attribute.  This is a thin wrapper around
  * pmap_kenter().
  */
 void
 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
 {
 
 	pmap_kenter(sva, size, pa, DEVICE_MEMORY);
 }
 
 /*
  * Remove a single page mapping from the kernel page tables and invalidate
  * that page in the kernel pmap.  The address must be mapped by a level 3
  * entry.
  */
 PMAP_INLINE void
 pmap_kremove(vm_offset_t va)
 {
 	pt_entry_t *l3e;
 	int level;
 
 	l3e = pmap_pte(kernel_pmap, va, &level);
 	KASSERT(l3e != NULL, ("pmap_kremove: Invalid address"));
 	KASSERT(level == 3, ("pmap_kremove: Invalid pte level %d", level));
 
 	pmap_clear(l3e);
 	pmap_invalidate_page(kernel_pmap, va);
 }
 
 /*
  * Clear the level 3 entries mapping [sva, sva + size) in the kernel pmap
  * and invalidate the range afterwards.  sva must be page aligned, size a
  * multiple of the page size, and every page must be mapped by a level 3
  * entry.
  */
 void
 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
 {
 	pt_entry_t *l3e;
 	vm_offset_t cur;
 	int level;
 
 	KASSERT((sva & L3_OFFSET) == 0,
 	   ("pmap_kremove_device: Invalid virtual address"));
 	KASSERT((size & PAGE_MASK) == 0,
 	    ("pmap_kremove_device: Mapping is not page-sized"));
 
 	for (cur = sva; size != 0; cur += PAGE_SIZE, size -= PAGE_SIZE) {
 		l3e = pmap_pte(kernel_pmap, cur, &level);
 		KASSERT(l3e != NULL, ("Invalid page table, va: 0x%lx", cur));
 		KASSERT(level == 3,
 		    ("Invalid device pagetable level: %d != 3", level));
 		pmap_clear(l3e);
 	}
 	pmap_invalidate_range(kernel_pmap, sva, cur);
 }
 
 /*
  *	Map a range of physical addresses into kernel virtual address
  *	space and return the virtual address of the start of the mapping.
  *
  *	The value passed in '*virt' is only a suggested virtual address.
  *	This implementation relies on the direct map: it returns the
  *	direct-map address of 'start' and leaves '*virt' unchanged.  The
  *	'end' and 'prot' arguments are not used.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 
 	return (PHYS_TO_DMAP(start));
 }
 
 
 /*
  * Enter the "count" pages listed in "ma" into consecutive kernel virtual
  * pages starting at sva.  This routine is only used for temporary kernel
  * mappings that do not need to have page modification or references
  * recorded.  Existing mappings are simply overwritten.  The pages *must*
  * be wired.  Each mapping is read/write and uses the page's own memory
  * attribute; DEVICE_MEMORY pages additionally get ATTR_XN.  The whole
  * range is invalidated once after all entries have been written.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 {
 	pd_entry_t *l2e;
 	pt_entry_t *l3e, entry;
 	vm_offset_t cur;
 	vm_page_t page;
 	int idx, level;
 
 	cur = sva;
 	for (idx = 0; idx < count; idx++, cur += L3_SIZE) {
 		l2e = pmap_pde(kernel_pmap, cur, &level);
 		KASSERT(l2e != NULL,
 		    ("pmap_qenter: Invalid page entry, va: 0x%lx", cur));
 		KASSERT(level == 2,
 		    ("pmap_qenter: Invalid level %d", level));
 
 		page = ma[idx];
 		entry = VM_PAGE_TO_PHYS(page) | ATTR_DEFAULT |
 		    ATTR_AP(ATTR_AP_RW) | ATTR_IDX(page->md.pv_memattr) |
 		    L3_PAGE;
 		if (page->md.pv_memattr == DEVICE_MEMORY)
 			entry |= ATTR_XN;
 		l3e = pmap_l2_to_l3(l2e, cur);
 		pmap_load_store(l3e, entry);
 	}
 	pmap_invalidate_range(kernel_pmap, sva, cur);
 }
 
 /*
  * Tear out "count" consecutive page mappings from the kernel pmap,
  * starting at sva.  It is meant only for temporary mappings.  Pages for
  * which pmap_pte() returns no entry are skipped; the whole range is
  * invalidated at the end.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	pt_entry_t *l3e;
 	vm_offset_t cur;
 	int level;
 
 	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
 
 	for (cur = sva; count > 0; count--, cur += PAGE_SIZE) {
 		l3e = pmap_pte(kernel_pmap, cur, &level);
 		KASSERT(level == 3,
 		    ("Invalid device pagetable level: %d != 3", level));
 		if (l3e != NULL)
 			pmap_clear(l3e);
 	}
 	pmap_invalidate_range(kernel_pmap, sva, cur);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 /*
  * Queue an unused page table page on the given list of pages that are to
  * be released to the physical memory manager after the TLB has been
  * updated.  PG_ZERO is set on the page when set_PG_ZERO is true and
  * cleared otherwise.
  */
 static __inline void
 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
     boolean_t set_PG_ZERO)
 {
 
 	if (!set_PG_ZERO)
 		m->flags &= ~PG_ZERO;
 	else
 		m->flags |= PG_ZERO;
 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 }
 
 /*
  * Drop one reference from a page table page.  The wire count of such a
  * page records the number of valid page table entries it contains.  When
  * the count reaches zero the page is unmapped via _pmap_unwire_l3() and
  * TRUE is returned; otherwise FALSE is returned.
  */
 static inline boolean_t
 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 
 	if (--m->wire_count != 0)
 		return (FALSE);
 	_pmap_unwire_l3(pmap, va, m, free);
 	return (TRUE);
 }
 
 /*
  * Unmap the page table page m from the given pmap and queue it on "free"
  * for later release.  pmap_unwire_l3() calls this once the page's wire
  * count has dropped to zero.  The page's pindex selects the table level it
  * belongs to and therefore which parent entry is cleared.  Releasing an L3
  * or L2 table page also drops one reference on the parent table page,
  * which may in turn unmap that page as well.  The pmap lock must be held.
  */
 static void
 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	/*
 	 * Unmap the page table page by clearing the entry that refers to
 	 * it in the next higher level table.
 	 */
 	if (m->pindex >= (NUL2E + NUL1E)) {
 		/* l1 page */
 		pd_entry_t *l0;
 
 		l0 = pmap_l0(pmap, va);
 		pmap_clear(l0);
 	} else if (m->pindex >= NUL2E) {
 		/* l2 page */
 		pd_entry_t *l1;
 
 		l1 = pmap_l1(pmap, va);
 		pmap_clear(l1);
 	} else {
 		/* l3 page */
 		pd_entry_t *l2;
 
 		l2 = pmap_l2(pmap, va);
 		pmap_clear(l2);
 	}
 	pmap_resident_count_dec(pmap, 1);
 	/*
 	 * The cleared entry no longer counts towards the parent table
 	 * page, so drop the reference it held there.  L1 table pages
 	 * (the remaining pindex range) get no such treatment.
 	 */
 	if (m->pindex < NUL2E) {
 		/* We just released an l3, unhold the matching l2 */
 		pd_entry_t *l1, tl1;
 		vm_page_t l2pg;
 
 		l1 = pmap_l1(pmap, va);
 		tl1 = pmap_load(l1);
 		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
 		pmap_unwire_l3(pmap, va, l2pg, free);
 	} else if (m->pindex < (NUL2E + NUL1E)) {
 		/* We just released an l2, unhold the matching l1 */
 		pd_entry_t *l0, tl0;
 		vm_page_t l1pg;
 
 		l0 = pmap_l0(pmap, va);
 		tl0 = pmap_load(l0);
 		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
 		pmap_unwire_l3(pmap, va, l1pg, free);
 	}
 	pmap_invalidate_page(pmap, va);
 
 	/*
 	 * Put page on a list so that it is released after
 	 * *ALL* TLB shootdown is done
 	 */
 	pmap_add_delayed_free_list(m, free, TRUE);
 }
 
 /*
  * Called after a page table entry has been removed: drop the reference
  * that entry held on its page table page, which is located through the
  * page directory entry "ptepde".  Addresses at or above
  * VM_MAXUSER_ADDRESS are ignored and 0 is returned.  Otherwise the result
  * of pmap_unwire_l3() is returned, i.e. non-zero when the page table page
  * was unmapped.
  */
 static int
 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
     struct spglist *free)
 {
 	vm_page_t ptpage;
 
 	if (va >= VM_MAXUSER_ADDRESS)
 		return (0);
 
 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 	ptpage = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
 	return (pmap_unwire_l3(pmap, va, ptpage, free));
 }
 
 /*
  * Initialize a pmap that shares the kernel pmap's level 0 table: set up
  * its lock, point pm_l0 at the kernel's table, reset pm_root and zero the
  * statistics.
  */
 void
 pmap_pinit0(pmap_t pmap)
 {
 
 	PMAP_LOCK_INIT(pmap);
 	pmap->pm_l0 = kernel_pmap->pm_l0;
 	pmap->pm_root.rt_root = 0;
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
 }
 
 /*
  * Initialize a pmap by giving it a freshly allocated, zeroed level 0
  * table that is accessed through the direct map.  The allocation is
  * retried after vm_wait() until it succeeds.  Always returns 1.
  */
 int
 pmap_pinit(pmap_t pmap)
 {
 	vm_paddr_t table_pa;
 	vm_page_t table_pg;
 
 	/* Allocate the level 0 table page, waiting for memory if needed. */
 	for (;;) {
 		table_pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 		if (table_pg != NULL)
 			break;
 		vm_wait(NULL);
 	}
 
 	table_pa = VM_PAGE_TO_PHYS(table_pg);
 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(table_pa);
 
 	/* VM_ALLOC_ZERO is only a hint; zero the page if it was not. */
 	if ((table_pg->flags & PG_ZERO) == 0)
 		pagezero(pmap->pm_l0);
 
 	pmap->pm_root.rt_root = 0;
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
 
 	return (1);
 }
 
 /*
  * Allocate a page table page and install it in the given pmap.
  *
  * This routine is called if the desired page table page does not exist.
  * The table level is selected by ptepindex:
  *   [0, NUL2E)              an L3 table page, entered into an L2 entry;
  *   [NUL2E, NUL2E + NUL1E)  an L2 table page, entered into an L1 entry;
  *   NUL2E + NUL1E and up    an L1 table page, entered into an L0 entry.
  * A missing parent table page is allocated by recursion; an existing
  * parent has its wire count incremented on behalf of the new child.
  *
  * If page table page allocation fails, this routine may sleep before
  * returning NULL.  It sleeps only if a lock pointer was given.
  *
  * Note: If a page allocation fails at page table level two or three,
  * one or two pages may be held during the wait, only to be released
  * afterwards.  This conservative approach is easily argued to avoid
  * race conditions.
  *
  * The pmap lock must be held; it is dropped and reacquired around the
  * sleep.  On success the new page is returned and the pmap's resident
  * count is incremented.
  */
 static vm_page_t
 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 {
 	vm_page_t m, l1pg, l2pg;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Allocate a page table page.
 	 */
 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if (lockp != NULL) {
 			/* Drop our locks before sleeping for free pages. */
 			RELEASE_PV_LIST_LOCK(lockp);
 			PMAP_UNLOCK(pmap);
 			vm_wait(NULL);
 			PMAP_LOCK(pmap);
 		}
 
 		/*
 		 * Indicate the need to retry.  While waiting, the page table
 		 * page may have been allocated.
 		 */
 		return (NULL);
 	}
 	/* The page is zeroed here unless it already carries PG_ZERO. */
 	if ((m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 
 	/*
 	 * Map the pagetable page into the process address space, if
 	 * it isn't already there.
 	 */
 
 	if (ptepindex >= (NUL2E + NUL1E)) {
 		/* New L1 table page: enter it directly into the L0 table. */
 		pd_entry_t *l0;
 		vm_pindex_t l0index;
 
 		l0index = ptepindex - (NUL2E + NUL1E);
 		l0 = &pmap->pm_l0[l0index];
 		pmap_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE);
 	} else if (ptepindex >= NUL2E) {
 		/* New L2 table page: needs an L1 table to be entered into. */
 		vm_pindex_t l0index, l1index;
 		pd_entry_t *l0, *l1;
 		pd_entry_t tl0;
 
 		l1index = ptepindex - NUL2E;
 		l0index = l1index >> L0_ENTRIES_SHIFT;
 
 		l0 = &pmap->pm_l0[l0index];
 		tl0 = pmap_load(l0);
 		if (tl0 == 0) {
 			/* recurse for allocating page dir */
 			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
 			    lockp) == NULL) {
 				/* Undo our own allocation before failing. */
 				vm_page_unwire_noq(m);
 				vm_page_free_zero(m);
 				return (NULL);
 			}
 		} else {
 			/* The L1 table exists; count the entry added below. */
 			l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
 			l1pg->wire_count++;
 		}
 
 		/* Reload the L0 entry: the recursion may have just set it. */
 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
 		l1 = &l1[ptepindex & Ln_ADDR_MASK];
 		pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
 	} else {
 		/* New L3 table page: needs an L2 table to be entered into. */
 		vm_pindex_t l0index, l1index;
 		pd_entry_t *l0, *l1, *l2;
 		pd_entry_t tl0, tl1;
 
 		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
 		l0index = l1index >> L0_ENTRIES_SHIFT;
 
 		l0 = &pmap->pm_l0[l0index];
 		tl0 = pmap_load(l0);
 		if (tl0 == 0) {
 			/*
 			 * Neither the L1 nor the L2 table exists.  Asking
 			 * for the L2 table page makes the recursive call
 			 * allocate the missing L1 table as well.
 			 */
 			/* recurse for allocating page dir */
 			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
 			    lockp) == NULL) {
 				vm_page_unwire_noq(m);
 				vm_page_free_zero(m);
 				return (NULL);
 			}
 			tl0 = pmap_load(l0);
 			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
 			l1 = &l1[l1index & Ln_ADDR_MASK];
 		} else {
 			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
 			l1 = &l1[l1index & Ln_ADDR_MASK];
 			tl1 = pmap_load(l1);
 			if (tl1 == 0) {
 				/* recurse for allocating page dir */
 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
 				    lockp) == NULL) {
 					vm_page_unwire_noq(m);
 					vm_page_free_zero(m);
 					return (NULL);
 				}
 			} else {
 				/* The L2 table exists; count the new entry. */
 				l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
 				l2pg->wire_count++;
 			}
 		}
 
 		/* Reload the L1 entry: the recursion may have just set it. */
 		l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
 		pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
 	}
 
 	pmap_resident_count_inc(pmap, 1);
 
 	return (m);
 }
 
 /*
  * Return the L2 table page that covers va in the given pmap.  If the L1
  * entry already refers to a table, that page's wire count is incremented.
  * Otherwise a new L2 table page is allocated with _pmap_alloc_l3().  A
  * failed allocation is retried when a lock pointer was given; without
  * one, NULL is returned.
  */
 static vm_page_t
 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	pd_entry_t *l1e;
 	vm_page_t l2page;
 	vm_pindex_t l2index;
 
 	for (;;) {
 		l1e = pmap_l1(pmap, va);
 		if (l1e != NULL &&
 		    (pmap_load(l1e) & ATTR_DESCR_MASK) == L1_TABLE) {
 			/* Add a reference to the existing L2 page. */
 			l2page = PHYS_TO_VM_PAGE(pmap_load(l1e) & ~ATTR_MASK);
 			l2page->wire_count++;
 			return (l2page);
 		}
 
 		/* No L2 page yet: allocate one. */
 		l2index = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
 		l2page = _pmap_alloc_l3(pmap, NUL2E + l2index, lockp);
 		if (l2page != NULL || lockp == NULL)
 			return (l2page);
 	}
 }
 
 /*
  * Return the L3 page table page that covers va in the given pmap.  If a
  * level 2 entry already refers to one, that page's wire count is
  * incremented.  Otherwise the page is allocated with _pmap_alloc_l3().  A
  * failed allocation is retried when a lock pointer was given; without
  * one, NULL is returned.
  */
 static vm_page_t
 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t l3index;
 	pd_entry_t *pdep, l2entry;
 #ifdef INVARIANTS
 	pt_entry_t *nextp;
 #endif
 	vm_page_t l3page;
 	int level;
 
 	/* Index of the page table page that covers va. */
 	l3index = pmap_l2_pindex(va);
 	for (;;) {
 		/* Find the deepest page directory entry for va. */
 		pdep = pmap_pde(pmap, va, &level);
 
 		/*
 		 * Only a level 2 entry can point to a level 3 table.  If it
 		 * does, take another reference on that table and use it.
 		 */
 		switch (level) {
 		case -1:
 			break;
 		case 0:
 #ifdef INVARIANTS
 			nextp = pmap_l0_to_l1(pdep, va);
 			KASSERT(pmap_load(nextp) == 0,
 			    ("pmap_alloc_l3: TODO: l0 superpages"));
 #endif
 			break;
 		case 1:
 #ifdef INVARIANTS
 			nextp = pmap_l1_to_l2(pdep, va);
 			KASSERT(pmap_load(nextp) == 0,
 			    ("pmap_alloc_l3: TODO: l1 superpages"));
 #endif
 			break;
 		case 2:
 			l2entry = pmap_load(pdep);
 			if (l2entry != 0) {
 				l3page = PHYS_TO_VM_PAGE(l2entry & ~ATTR_MASK);
 				l3page->wire_count++;
 				return (l3page);
 			}
 			break;
 		default:
 			panic("pmap_alloc_l3: Invalid level %d", level);
 		}
 
 		/*
 		 * The pte page isn't mapped, or it has been deallocated:
 		 * allocate it now.
 		 */
 		l3page = _pmap_alloc_l3(pmap, l3index, lockp);
 		if (l3page != NULL || lockp == NULL)
 			return (l3page);
 	}
 }
 
 /***************************************************
  * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Release the resources held by a pmap that was initialized by
  * pmap_pinit(): its level 0 table page is unwired and freed.  The pmap
  * must contain no valid mappings and no reserved page table pages.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	vm_page_t l0page;
 
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_release: pmap has reserved page table page(s)"));
 
 	/* pm_l0 is a direct-map address; recover the page behind it. */
 	l0page = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0));
 	vm_page_unwire_noq(l0page);
 	vm_page_free_zero(l0page);
 }
 
 /*
  * Sysctl handler for vm.kvm_size: reports the size of the kernel virtual
  * address range, VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS.
  */
 static int
 kvm_size(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long span;
 
 	span = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
 	return (sysctl_handle_long(oidp, &span, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
     0, 0, kvm_size, "LU", "Size of KVM");
 
 /*
  * Sysctl handler for vm.kvm_free: reports the distance between
  * kernel_vm_end and VM_MAX_KERNEL_ADDRESS.
  */
 static int
 kvm_free(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long remaining;
 
 	remaining = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 	return (sysctl_handle_long(oidp, &remaining, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
     0, 0, kvm_free, "LU", "Amount of KVM free");
 
 /*
  * Grow the number of kernel page table entries, if needed, so that the
  * kernel page tables cover addresses up to "addr".  The target is rounded
  * up to an L2_SIZE boundary and clamped to vm_map_max(kernel_map);
  * kernel_vm_end is then advanced one L2_SIZE step at a time, allocating
  * page table pages where the L1 or L2 entry is missing.  The kernel map's
  * system mutex must be held.  Panics if a page cannot be allocated.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	vm_paddr_t paddr;
 	vm_page_t nkpg;
 	pd_entry_t *l0, *l1, *l2;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 
 	addr = roundup2(addr, L2_SIZE);
 	/*
 	 * NOTE(review): comparing "addr - 1" presumably also catches the
 	 * case where roundup2() wrapped addr around to 0 -- confirm.
 	 */
 	if (addr - 1 >= vm_map_max(kernel_map))
 		addr = vm_map_max(kernel_map);
 	while (kernel_vm_end < addr) {
 		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
 		KASSERT(pmap_load(l0) != 0,
 		    ("pmap_growkernel: No level 0 kernel entry"));
 
 		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
 		if (pmap_load(l1) == 0) {
 			/* We need a new PDP entry */
 			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
 			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 			if (nkpg == NULL)
 				panic("pmap_growkernel: no memory to grow kernel");
 			if ((nkpg->flags & PG_ZERO) == 0)
 				pmap_zero_page(nkpg);
 			paddr = VM_PAGE_TO_PHYS(nkpg);
 			pmap_store(l1, paddr | L1_TABLE);
 			continue; /* try again */
 		}
 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
 		/*
 		 * An L2 entry with ATTR_AF set is treated as already
 		 * populated; just step over the range it covers.
 		 */
 		if ((pmap_load(l2) & ATTR_AF) != 0) {
 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 				kernel_vm_end = vm_map_max(kernel_map);
 				break;
 			}
 			continue;
 		}
 
 		/* Allocate and install the page table page for this L2 entry. */
 		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 		    VM_ALLOC_ZERO);
 		if (nkpg == NULL)
 			panic("pmap_growkernel: no memory to grow kernel");
 		if ((nkpg->flags & PG_ZERO) == 0)
 			pmap_zero_page(nkpg);
 		paddr = VM_PAGE_TO_PHYS(nkpg);
 		pmap_load_store(l2, paddr | L2_TABLE);
 		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
 
 		/* Advance to the next L2_SIZE boundary, clamped to the map end. */
 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 			kernel_vm_end = vm_map_max(kernel_map);
 			break;
 		}
 	}
 }
 
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 /*
  * Compile-time checks of the pv_chunk layout that the code below relies
  * on: a chunk occupies exactly one page, its free bitmap has _NPCM (3)
  * words, and _NPCPV is 168.
  */
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 3);
 CTASSERT(_NPCPV == 168);
 
 /*
  * Return the pv chunk that contains the given pv entry.  The chunk starts
  * at the page boundary below the entry, so the entry's address is simply
  * rounded down with PAGE_MASK.
  */
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
 {
 	uintptr_t chunk_addr;
 
 	chunk_addr = (uintptr_t)pv & ~(uintptr_t)PAGE_MASK;
 	return ((struct pv_chunk *)chunk_addr);
 }
 
 /* The pmap that owns the chunk containing the given pv entry. */
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 
 /*
  * Values of the three pc_map words when every entry in a chunk is free
  * (a set bit marks a free entry).  The masks hold 64 + 64 + 40 = 168 set
  * bits, matching the _NPCPV == 168 compile-time assertion.
  */
 #define	PC_FREE0	0xfffffffffffffffful
 #define	PC_FREE1	0xfffffffffffffffful
 #define	PC_FREE2	0x000000fffffffffful
 
 /* The same masks, indexed by pc_map word. */
 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 
 /*
  * PV entry statistics counters and their vm.pmap sysctl nodes.  The whole
  * section sits inside "#if 0", so none of it is compiled, even when
  * PV_STATS is defined.
  */
 #if 0
 #ifdef PV_STATS
 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 	"Current number of pv entry chunks");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 	"Current number of pv entry chunks allocated");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 	"Current number of pv entry chunks frees");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 	"Number of times tried to get a chunk page but failed.");
 
 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
 static int pv_entry_spare;
 
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 	"Current number of pv entry frees");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 	"Current number of pv entry allocs");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 	"Current number of pv entries");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
 #endif
 #endif /* 0 */
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
  * another pv entry chunk.
  *
  * The global pv_chunks list is scanned between two private marker
  * entries.  For each chunk, every non-wired 4 KB page mapping that it
  * tracks is destroyed, which frees the corresponding pv entries.
  *
  * locked_pmap is the pmap the caller already holds locked; lockp is the
  * caller's PV list lock pointer and must not be NULL.  The PV list lock
  * may be released or changed.
  *
  * Returns a page that the caller can reuse: either the page of a chunk
  * that became entirely free, or a page table page that was freed as a
  * side effect (its wire count is reset to 1).  Returns NULL if PV entries
  * were reclaimed from the specified pmap without freeing a page, or if
  * nothing could be reclaimed.
  *
  * We do not, however, unmap 2mpages because subsequent accesses will
  * allocate per-page pv entries until repromotion occurs, thereby
  * exacerbating the shortage of free pv entries.
  */
 static vm_page_t
 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 {
 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
 	struct md_page *pvh;
 	pd_entry_t *pde;
 	pmap_t next_pmap, pmap;
 	pt_entry_t *pte, tpte;
 	pv_entry_t pv;
 	vm_offset_t va;
 	vm_page_t m, m_pc;
 	struct spglist free;
 	uint64_t inuse;
 	int bit, field, freed, lvl;
 	static int active_reclaims = 0;
 
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
 
 	pmap = NULL;
 	m_pc = NULL;
 	SLIST_INIT(&free);
 	/*
 	 * The markers are zeroed chunk headers; their NULL pc_pmap is what
 	 * identifies an entry on the list as a marker.
 	 */
 	bzero(&pc_marker_b, sizeof(pc_marker_b));
 	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
 	pc_marker = (struct pv_chunk *)&pc_marker_b;
 	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
 
 	mtx_lock(&pv_chunks_mutex);
 	active_reclaims++;
 	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
 	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
 	/* Scan until the end marker is reached or a page has been freed. */
 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
 	    SLIST_EMPTY(&free)) {
 		next_pmap = pc->pc_pmap;
 		if (next_pmap == NULL) {
 			/*
 			 * The next chunk is a marker.  However, it is
 			 * not our marker, so active_reclaims must be
 			 * > 1.  Consequently, the next_chunk code
 			 * will not rotate the pv_chunks list.
 			 */
 			goto next_chunk;
 		}
 		mtx_unlock(&pv_chunks_mutex);
 
 		/*
 		 * A pv_chunk can only be removed from the pc_lru list
 		 * when both pv_chunks_mutex is owned and the
 		 * corresponding pmap is locked.
 		 */
 		if (pmap != next_pmap) {
 			if (pmap != NULL && pmap != locked_pmap)
 				PMAP_UNLOCK(pmap);
 			pmap = next_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap) {
 				/*
 				 * Block on the pmap lock after dropping the
 				 * PV list lock, then re-examine the list.
 				 */
 				RELEASE_PV_LIST_LOCK(lockp);
 				PMAP_LOCK(pmap);
 				mtx_lock(&pv_chunks_mutex);
 				continue;
 			} else if (pmap != locked_pmap) {
 				/* Only try-lock pmaps at lower addresses. */
 				if (PMAP_TRYLOCK(pmap)) {
 					mtx_lock(&pv_chunks_mutex);
 					continue;
 				} else {
 					pmap = NULL; /* pmap is not locked */
 					mtx_lock(&pv_chunks_mutex);
 					pc = TAILQ_NEXT(pc_marker, pc_lru);
 					if (pc == NULL ||
 					    pc->pc_pmap != next_pmap)
 						continue;
 					goto next_chunk;
 				}
 			}
 		}
 
 		/*
 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
 		 */
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			/* Visit each allocated (clear) bit of this map word. */
 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 			    inuse != 0; inuse &= ~(1UL << bit)) {
 				bit = ffsl(inuse) - 1;
 				pv = &pc->pc_pventry[field * 64 + bit];
 				va = pv->pv_va;
 				pde = pmap_pde(pmap, va, &lvl);
 				/* Skip anything not mapped via a level 3 table. */
 				if (lvl != 2)
 					continue;
 				pte = pmap_l2_to_l3(pde, va);
 				tpte = pmap_load(pte);
 				if ((tpte & ATTR_SW_WIRED) != 0)
 					continue;
 				tpte = pmap_load_clear(pte);
 				pmap_invalidate_page(pmap, va);
 				/* Carry dirty/referenced state over to the page. */
 				m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
 				if (pmap_pte_dirty(tpte))
 					vm_page_dirty(m);
 				if ((tpte & ATTR_AF) != 0)
 					vm_page_aflag_set(m, PGA_REFERENCED);
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 				m->md.pv_gen++;
 				if (TAILQ_EMPTY(&m->md.pv_list) &&
 				    (m->flags & PG_FICTITIOUS) == 0) {
 					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						vm_page_aflag_clear(m,
 						    PGA_WRITEABLE);
 					}
 				}
 				/* Mark the pv entry free in the chunk. */
 				pc->pc_map[field] |= 1UL << bit;
 				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
 				freed++;
 			}
 		}
 		if (freed == 0) {
 			mtx_lock(&pv_chunks_mutex);
 			goto next_chunk;
 		}
 		/* Every freed mapping is for a 4 KB page. */
 		pmap_resident_count_dec(pmap, freed);
 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
 		    pc->pc_map[2] == PC_FREE2) {
 			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 			/* Entire chunk is free; return it. */
 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 			dump_drop_page(m_pc->phys_addr);
 			mtx_lock(&pv_chunks_mutex);
 			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 			break;
 		}
 		/* The chunk now has free entries; keep it at the list head. */
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		mtx_lock(&pv_chunks_mutex);
 		/* One freed pv entry in locked_pmap is sufficient. */
 		if (pmap == locked_pmap)
 			break;
 
 next_chunk:
 		/* Move our marker past the chunk that was just examined. */
 		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
 		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
 		if (active_reclaims == 1 && pmap != NULL) {
 			/*
 			 * Rotate the pv chunks list so that we do not
 			 * scan the same pv chunks that could not be
 			 * freed (because they contained a wired
 			 * and/or superpage mapping) on every
 			 * invocation of reclaim_pv_chunk().
 			 */
 			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
 				MPASS(pc->pc_pmap != NULL);
 				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 			}
 		}
 	}
 	/* Take both markers off the list before dropping the mutex. */
 	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
 	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
 	active_reclaims--;
 	mtx_unlock(&pv_chunks_mutex);
 	if (pmap != NULL && pmap != locked_pmap)
 		PMAP_UNLOCK(pmap);
 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
 		m_pc = SLIST_FIRST(&free);
 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 		/* Recycle a freed page table page. */
 		m_pc->wire_count = 1;
 	}
 	/* Release whatever page table pages remain on the list. */
 	vm_page_free_pages_toq(&free, true);
 	return (m_pc);
 }
 
 /*
  * Give a pv entry back to its chunk by setting its bit in the chunk's
  * free bitmap.  A chunk that still has entries in use is moved to the
  * head of the pmap's chunk list, where get_pv_entry() looks first.  A
  * chunk that has become entirely free is unlinked from the pmap and
  * released.  The pmap lock must be held.
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
 	struct pv_chunk *chunk;
 	int slot, word, shift;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 
 	/* Locate the entry's bit and mark it free. */
 	chunk = pv_to_chunk(pv);
 	slot = pv - &chunk->pc_pventry[0];
 	word = slot / 64;
 	shift = slot % 64;
 	chunk->pc_map[word] |= 1ul << shift;
 
 	if (chunk->pc_map[0] == PC_FREE0 && chunk->pc_map[1] == PC_FREE1 &&
 	    chunk->pc_map[2] == PC_FREE2) {
 		/* Nothing in the chunk is in use any more. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, chunk, pc_list);
 		free_pv_chunk(chunk);
 		return;
 	}
 
 	/* The chunk is expected to be at the head of the list already. */
 	if (__predict_false(chunk != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, chunk, pc_list);
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, chunk, pc_list);
 	}
 }
 
 /*
  * Release the page backing a pv chunk.  The chunk is unlinked from the
  * global pv_chunks list under pv_chunks_mutex, its page is handed to
  * dump_drop_page(), and the page is then unwired and freed.  The caller
  * has already removed the chunk from its pmap's list.
  */
 static void
 free_pv_chunk(struct pv_chunk *pc)
 {
 	vm_page_t chunk_pg;
 
 	mtx_lock(&pv_chunks_mutex);
 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
 
 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 
 	/* The chunk is addressed through the direct map; find its page. */
 	chunk_pg = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 	dump_drop_page(chunk_pg->phys_addr);
 	vm_page_unwire_noq(chunk_pg);
 	vm_page_free(chunk_pg);
 }
 
 /*
  * Returns a new PV entry, allocating a new PV chunk from the system when
  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
  * returned.
  *
  * Only the first chunk on the pmap's chunk list is searched for a free
  * entry.  The pmap lock must be held.
  *
  * The given PV list lock may be released.
  */
 static pv_entry_t
 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 {
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		/* Find the first map word with a free (set) bit. */
 		for (field = 0; field < _NPCM; field++) {
 			if (pc->pc_map[field]) {
 				bit = ffsl(pc->pc_map[field]) - 1;
 				break;
 			}
 		}
 		if (field < _NPCM) {
 			/* Claim the entry by clearing its bit. */
 			pv = &pc->pc_pventry[field * 64 + bit];
 			pc->pc_map[field] &= ~(1ul << bit);
 			/* If this was the last item, move it to tail */
 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 			    pc->pc_map[2] == 0) {
 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 				    pc_list);
 			}
 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 			return (pv);
 		}
 	}
 	/* No free items, allocate another chunk */
 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED);
 	if (m == NULL) {
 		if (lockp == NULL) {
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
 		/*
 		 * reclaim_pv_chunk() returns a page to build a new chunk
 		 * from, or NULL, in which case the search is repeated.
 		 */
 		m = reclaim_pv_chunk(pmap, lockp);
 		if (m == NULL)
 			goto retry;
 	}
 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 	dump_add_page(m->phys_addr);
 	/* The new chunk is accessed through the direct map. */
 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
 	pc->pc_map[1] = PC_FREE1;
 	pc->pc_map[2] = PC_FREE2;
 	mtx_lock(&pv_chunks_mutex);
 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
 	/* Entry 0 is the one whose bit was left clear above. */
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 	return (pv);
 }
 
 /*
  * Ensure that the number of spare PV entries in the specified pmap meets or
  * exceeds the given count, "needed".
  *
  * Free entries in the pmap's existing chunks are counted first; new chunks
  * are then allocated, or reclaimed when allocation fails, until enough
  * entries are available.  The pmap lock must be held and lockp must not be
  * NULL.
  *
  * The given PV list lock may be released.
  */
 static void
 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 {
 	struct pch new_tail;
 	struct pv_chunk *pc;
 	vm_page_t m;
 	int avail, free;
 	bool reclaimed;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
 
 	/*
 	 * Newly allocated PV chunks must be stored in a private list until
 	 * the required number of PV chunks have been allocated.  Otherwise,
 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
 	 * contrast, these chunks must be added to the pmap upon allocation.
 	 */
 	TAILQ_INIT(&new_tail);
 retry:
 	avail = 0;
 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
 		/* Count the set (free) bits in this chunk's bitmap. */
 		bit_count((bitstr_t *)pc->pc_map, 0,
 		    sizeof(pc->pc_map) * NBBY, &free);
 		/*
 		 * A chunk without free entries ends the scan; presumably
 		 * exhausted chunks are kept at the tail of the list, as
 		 * get_pv_entry() does -- confirm for other callers.
 		 */
 		if (free == 0)
 			break;
 		avail += free;
 		if (avail >= needed)
 			break;
 	}
 	/* Each new chunk contributes _NPCPV free entries. */
 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 		    VM_ALLOC_WIRED);
 		if (m == NULL) {
 			m = reclaim_pv_chunk(pmap, lockp);
 			if (m == NULL)
 				goto retry;
 			reclaimed = true;
 		}
 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 		dump_add_page(m->phys_addr);
 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 		pc->pc_pmap = pmap;
 		pc->pc_map[0] = PC_FREE0;
 		pc->pc_map[1] = PC_FREE1;
 		pc->pc_map[2] = PC_FREE2;
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
 
 		/*
 		 * The reclaim might have freed a chunk from the current pmap.
 		 * If that chunk contained available entries, we need to
 		 * re-count the number of available entries.
 		 */
 		if (reclaimed)
 			goto retry;
 	}
 	/* Publish the privately held chunks on the global pv_chunks list. */
 	if (!TAILQ_EMPTY(&new_tail)) {
 		mtx_lock(&pv_chunks_mutex);
 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 		mtx_unlock(&pv_chunks_mutex);
 	}
 }
 
 /*
  * Search the given pv list for the entry that belongs to the specified
  * pmap and virtual address.  If one is found it is unlinked from the
  * list, the list's generation count is incremented, and the entry is
  * returned; otherwise NULL is returned.  This operation can be performed
  * on pv lists for either 4KB or 2MB page mappings.
  */
 static __inline pv_entry_t
 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t cand;
 
 	TAILQ_FOREACH(cand, &pvh->pv_list, pv_next) {
 		if (pmap != PV_PMAP(cand) || va != cand->pv_va)
 			continue;
 		TAILQ_REMOVE(&pvh->pv_list, cand, pv_next);
 		pvh->pv_gen++;
 		return (cand);
 	}
 	return (NULL);
 }
 
 /*
  * After demotion from a 2MB page mapping to 512 4KB page mappings,
  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
  * entries for each of the 4KB page mappings.
  *
  * "va" and "pa" must both be 2MB aligned (asserted below) and the pmap lock
  * must be held.  The pv list lock referenced by "lockp" is switched to the
  * one covering "pa".  The new pv entries are taken from the spare slots of
  * the chunks on the pmap's pm_pvchunk list; nothing is allocated here.
  * NOTE(review): this presumably relies on the caller having reserved
  * Ln_ENTRIES - 1 spare entries beforehand -- confirm against the callers.
  */
 static void
 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	struct pv_chunk *pc;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 	int bit, field;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((va & L2_OFFSET) == 0,
 	    ("pmap_pv_demote_l2: va is not 2mpage aligned"));
 	KASSERT((pa & L2_OFFSET) == 0,
 	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Transfer the 2mpage's pv entry for this mapping to the first
 	 * page's pv list.  Once this transfer begins, the pv list lock
 	 * must not be released until the last pv entry is reinstantiated.
 	 */
 	pvh = pa_to_pvh(pa);
 	pv = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 	m->md.pv_gen++;
 	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
 	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
 	/* Address of the last 4KB page within the 2MB region. */
 	va_last = va + L2_SIZE - PAGE_SIZE;
 	for (;;) {
 		/* Always draw spare entries from the chunk at the list head. */
 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
 		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
 		for (field = 0; field < _NPCM; field++) {
 			/* A set bit in pc_map[] marks a free pv entry slot. */
 			while (pc->pc_map[field]) {
 				/* Claim the lowest free slot in this word. */
 				bit = ffsl(pc->pc_map[field]) - 1;
 				pc->pc_map[field] &= ~(1ul << bit);
 				pv = &pc->pc_pventry[field * 64 + bit];
 				/* Advance to the next 4KB page and its va. */
 				va += PAGE_SIZE;
 				pv->pv_va = va;
 				m++;
 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 			    ("pmap_pv_demote_l2: page %p is not managed", m));
 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 				m->md.pv_gen++;
 				if (va == va_last)
 					goto out;
 			}
 		}
 		/*
 		 * This chunk has no free slots left.  Move it to the tail so
 		 * that the next iteration picks up a different chunk.
 		 */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 out:
 	/* If the last chunk used is now completely full, move it to the tail. */
 	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
 	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
 }
 
 /*
  * Locate the pv entry for the given pmap and virtual address on the given
  * pv list, unlink it, and release it back to the pmap.  The entry is
  * expected to exist (asserted).  Usable with pv lists for either 4KB or 2MB
  * page mappings.
  */
 static void
 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t victim;
 
 	victim = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(victim != NULL, ("pmap_pvh_free: pv not found"));
 	free_pv_entry(pmap, victim);
 }
 
 /*
  * Try to create the PV entry for a 4KB page mapping without resorting to
  * reclamation.  Returns TRUE if the entry was obtained and appended to the
  * page's pv list, and FALSE if no entry could be obtained.
  */
 static boolean_t
 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct rwlock **lockp)
 {
 	pv_entry_t entry;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* A NULL lock pointer tells get_pv_entry() not to reclaim. */
 	entry = get_pv_entry(pmap, NULL);
 	if (entry == NULL)
 		return (FALSE);
 
 	entry->pv_va = va;
 	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, entry, pv_next);
 	m->md.pv_gen++;
 	return (TRUE);
 }
 
 /*
  * Create the PV entry for a 2MB page mapping and append it to the pv list
  * of the superpage named by "l2e".  Returns true on success.  When the flag
  * PMAP_ENTER_NORECLAIM is given, reclamation is disabled and false is
  * returned if a PV entry cannot be obtained without it.
  */
 static bool
 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
     struct rwlock **lockp)
 {
 	struct rwlock **reclaim_lockp;
 	struct md_page *pvh;
 	pv_entry_t entry;
 	vm_paddr_t pa;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Pass NULL instead of the lock pointer to disable reclamation. */
 	if ((flags & PMAP_ENTER_NORECLAIM) != 0)
 		reclaim_lockp = NULL;
 	else
 		reclaim_lockp = lockp;
 	entry = get_pv_entry(pmap, reclaim_lockp);
 	if (entry == NULL)
 		return (false);
 
 	entry->pv_va = va;
 	pa = l2e & ~ATTR_MASK;
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 	pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&pvh->pv_list, entry, pv_next);
 	pvh->pv_gen++;
 	return (true);
 }
 
 /*
  * Replace a removed kernel 2MB mapping with a pointer to its saved L3 page
  * table page.  The page table page is taken out of the pmap's collection of
  * idle page table pages, zeroed if it still holds mappings, and installed
  * in the (already cleared) L2 entry as a table descriptor.  Panics if no
  * page table page is found for "va".
  */
 static void
 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
 {
 	pt_entry_t prev_l2, table_l2;
 	vm_paddr_t ptpage_pa;
 	vm_page_t ptpage;
 
 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	if ((ptpage = pmap_remove_pt_page(pmap, va)) == NULL)
 		panic("pmap_remove_kernel_l2: Missing pt page");
 	ptpage_pa = VM_PAGE_TO_PHYS(ptpage);
 
 	/*
 	 * A page table page that was unmapped by a promotion still contains
 	 * valid mappings.  Zero it so that none of them are reinstated.
 	 */
 	if (ptpage->valid != 0)
 		pagezero((void *)PHYS_TO_DMAP(ptpage_pa));
 
 	/*
 	 * Install the table descriptor.  The caller must already have
 	 * invalidated the old mapping (the "break" in break-before-make),
 	 * so the entry is expected to be zero here.
 	 */
 	table_l2 = ptpage_pa | L2_TABLE;
 	prev_l2 = pmap_load_store(l2, table_l2);
 	KASSERT(prev_l2 == 0, ("%s: found existing mapping at %p: %#lx",
 	    __func__, l2, prev_l2));
 }
 
 /*
  * pmap_remove_l2: tear down a single 2MB (level 2) block mapping.
  *
  * The L2 entry is cleared and its TLB entry invalidated, and the pmap's
  * wired and resident counts are adjusted.  For a managed mapping, the pv
  * entry is freed and the dirty/referenced state is propagated to every 4KB
  * page of the superpage.  The page table page saved for the mapping is then
  * reinstalled (kernel pmap) or, if one exists, queued on "free".  Returns
  * the result of pmap_unuse_pt() for the containing page table.
  */
 static int
 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pt_entry_t old_l2;
 	vm_paddr_t pa;
 	vm_offset_t eva, va;
 	vm_page_t m, ptpage;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
 	old_l2 = pmap_load_clear(l2);
 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
 	    ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
 
 	/*
 	 * A promotion breaks the 4KB page mappings before it makes the 2MB
 	 * page mapping, so invalidating a single page is sufficient.
 	 */
 	pmap_invalidate_page(pmap, sva);
 
 	if ((old_l2 & ATTR_SW_WIRED) != 0)
 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
 
 	if ((old_l2 & ATTR_SW_MANAGED) != 0) {
 		pa = old_l2 & ~ATTR_MASK;
 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 		pvh = pa_to_pvh(pa);
 		pmap_pvh_free(pvh, pmap, sva);
 
 		/* Push the mapping's state down to each constituent page. */
 		m = PHYS_TO_VM_PAGE(pa);
 		eva = sva + L2_SIZE;
 		for (va = sva; va < eva; va += PAGE_SIZE, m++) {
 			if (pmap_pte_dirty(old_l2))
 				vm_page_dirty(m);
 			if ((old_l2 & ATTR_AF) != 0)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
 			    TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 		}
 	}
 
 	if (pmap == kernel_pmap)
 		pmap_remove_kernel_l2(pmap, l2, sva);
 	else if ((ptpage = pmap_remove_pt_page(pmap, sva)) != NULL) {
 		KASSERT(ptpage->valid == VM_PAGE_BITS_ALL,
 		    ("pmap_remove_l2: l3 page not promoted"));
 		pmap_resident_count_dec(pmap, 1);
 		KASSERT(ptpage->wire_count == NL3PG,
 		    ("pmap_remove_l2: l3 page wire count error"));
 		ptpage->wire_count = 0;
 		pmap_add_delayed_free_list(ptpage, free, FALSE);
 	}
 	return (pmap_unuse_pt(pmap, sva, l1e, free));
 }
 
 /*
  * pmap_remove_l3: tear down a single 4KB (level 3) page mapping.
  *
  * The L3 entry is cleared and its TLB entry invalidated, and the pmap's
  * wired and resident counts are adjusted.  For a managed mapping, the
  * dirty/referenced state is propagated to the page and its pv entry is
  * freed.  Returns the result of pmap_unuse_pt() for the containing page
  * table page.
  */
 static int
 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
 {
 	pt_entry_t old_l3;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	old_l3 = pmap_load_clear(l3);
 	pmap_invalidate_page(pmap, va);
 	if ((old_l3 & ATTR_SW_WIRED) != 0)
 		pmap->pm_stats.wired_count--;
 	pmap_resident_count_dec(pmap, 1);
 
 	/* Unmanaged mappings have no pv entry or page state to update. */
 	if ((old_l3 & ATTR_SW_MANAGED) == 0)
 		return (pmap_unuse_pt(pmap, va, l2e, free));
 
 	m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
 	if (pmap_pte_dirty(old_l3))
 		vm_page_dirty(m);
 	if ((old_l3 & ATTR_AF) != 0)
 		vm_page_aflag_set(m, PGA_REFERENCED);
 	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 	pmap_pvh_free(&m->md, pmap, va);
 
 	/*
 	 * Once neither the page's own pv list nor the pv list of its
 	 * containing superpage has any entries, no mapping of the page
 	 * remains.
 	 */
 	if (TAILQ_EMPTY(&m->md.pv_list) &&
 	    (m->flags & PG_FICTITIOUS) == 0 &&
 	    TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list))
 		vm_page_aflag_clear(m, PGA_WRITEABLE);
 
 	return (pmap_unuse_pt(pmap, va, l2e, free));
 }
 
 /*
  * Remove the specified range of addresses from the L3 page table that is
  * identified by the given L2 entry.
  *
  * The range [sva, eva) must lie within a single L3 page table (asserted).
  * TLB invalidations are batched: "va" records the start of a run of removed
  * mappings that has not been invalidated yet, and "va == eva" means that no
  * invalidation is pending.  "*lockp" is switched to the pv list lock of
  * each managed page as needed and is left held on return.
  */
 static void
 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
     vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	struct rwlock *new_lock;
 	pt_entry_t *l3, old_l3;
 	vm_offset_t va;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
 	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
 	/* No invalidation is pending initially. */
 	va = eva;
 	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
 		if (!pmap_l3_valid(pmap_load(l3))) {
 			/*
 			 * An invalid entry ends the current run of removed
 			 * mappings; flush the run now.
 			 */
 			if (va != eva) {
 				pmap_invalidate_range(pmap, va, sva);
 				va = eva;
 			}
 			continue;
 		}
 		old_l3 = pmap_load_clear(l3);
 		if ((old_l3 & ATTR_SW_WIRED) != 0)
 			pmap->pm_stats.wired_count--;
 		pmap_resident_count_dec(pmap, 1);
 		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
 			m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
 			if (pmap_pte_dirty(old_l3))
 				vm_page_dirty(m);
 			if ((old_l3 & ATTR_AF) != 0)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m));
 			if (new_lock != *lockp) {
 				if (*lockp != NULL) {
 					/*
 					 * Pending TLB invalidations must be
 					 * performed before the PV list lock is
 					 * released.  Otherwise, a concurrent
 					 * pmap_remove_all() on a physical page
 					 * could return while a stale TLB entry
 					 * still provides access to that page.
 					 */
 					if (va != eva) {
 						pmap_invalidate_range(pmap, va,
 						    sva);
 						va = eva;
 					}
 					rw_wunlock(*lockp);
 				}
 				*lockp = new_lock;
 				rw_wlock(*lockp);
 			}
 			pmap_pvh_free(&m->md, pmap, sva);
 			/*
 			 * Clear PGA_WRITEABLE once neither the page's own pv
 			 * list nor its superpage's pv list has any entries.
 			 */
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
 			    (m->flags & PG_FICTITIOUS) == 0) {
 				pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 				if (TAILQ_EMPTY(&pvh->pv_list))
 					vm_page_aflag_clear(m, PGA_WRITEABLE);
 			}
 		}
 		/* Start a new run of pending invalidations if none is open. */
 		if (va == eva)
 			va = sva;
 		/*
 		 * Stop when pmap_unuse_pt() returns non-zero.  Advance "sva"
 		 * first so that the final invalidation below still covers the
 		 * mapping that was just removed.
 		 * NOTE(review): a non-zero return presumably means that the
 		 * page table page itself was released -- confirm against
 		 * pmap_unuse_pt().
 		 */
 		if (pmap_unuse_pt(pmap, sva, l2e, free)) {
 			sva += L3_SIZE;
 			break;
 		}
 	}
 	/* Flush whatever run of removed mappings is still pending. */
 	if (va != eva)
 		pmap_invalidate_range(pmap, va, sva);
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  *
  *	The range is walked one L2 entry (2MB) at a time.  A 2MB block
  *	mapping that lies entirely within the range is removed as a unit.
  *	One that is only partly covered is demoted first.  Page table
  *	pages released along the way are collected on a local list and
  *	freed after the pmap lock has been dropped.
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	struct rwlock *lock;
 	vm_offset_t va_next;
 	pd_entry_t *l0, *l1, *l2;
 	pt_entry_t l3_paddr;
 	struct spglist free;
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	SLIST_INIT(&free);
 
 	PMAP_LOCK(pmap);
 
 	/* No pv list lock is held yet; the helpers acquire one as needed. */
 	lock = NULL;
 	for (; sva < eva; sva = va_next) {
 
 		/* Nothing is left to remove once the pmap is empty. */
 		if (pmap->pm_stats.resident_count == 0)
 			break;
 
 		/*
 		 * At each level, skip to the next boundary when the entry is
 		 * empty.  If the addition wraps around, stop at "eva".
 		 */
 		l0 = pmap_l0(pmap, sva);
 		if (pmap_load(l0) == 0) {
 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		l1 = pmap_l0_to_l1(l0, sva);
 		if (pmap_load(l1) == 0) {
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		/*
 		 * Calculate index for next page table.
 		 */
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if (l2 == NULL)
 			continue;
 
 		l3_paddr = pmap_load(l2);
 
 		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
 			/*
 			 * Remove the whole superpage if the range covers it
 			 * completely.  Otherwise demote it to 4KB mappings
 			 * and reload the L2 entry; if demotion fails, move on
 			 * to the next L2 entry.
 			 */
 			if (sva + L2_SIZE == va_next && eva >= va_next) {
 				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
 				    &free, &lock);
 				continue;
 			} else if (pmap_demote_l2_locked(pmap, l2, sva,
 			    &lock) == NULL)
 				continue;
 			l3_paddr = pmap_load(l2);
 		}
 
 		/*
 		 * Weed out invalid mappings.
 		 */
 		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
 			continue;
 
 		/*
 		 * Limit our scan to either the end of the va represented
 		 * by the current page table page, or to the end of the
 		 * range being removed.
 		 */
 		if (va_next > eva)
 			va_next = eva;
 
 		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
 		    &lock);
 	}
 	/* Drop the pv list lock if one of the helpers left it held. */
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  *
  *		The work is done in two passes while the page's pv
  *		list lock is held: first every 2MB mapping that covers
  *		the page is demoted, then every 4KB mapping of the page
  *		is destroyed.  The page must be managed (asserted).
  */
 
 void
 pmap_remove_all(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pd_entry_t *pde, tpde;
 	pt_entry_t *pte, tpte;
 	vm_offset_t va;
 	struct spglist free;
 	int lvl, pvh_gen, md_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	SLIST_INIT(&free);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	/* Fictitious pages use the dummy pv head for the 2MB pass. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 retry:
 	rw_wlock(lock);
 	/* Pass 1: demote every 2MB mapping that includes this page. */
 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock cannot be taken without blocking, drop
 		 * the pv list lock, take both locks in pmap-then-pv order,
 		 * and start over if the pv list changed in the meantime, as
 		 * detected through its generation count.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				rw_wunlock(lock);
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		va = pv->pv_va;
 		pte = pmap_pte(pmap, va, &lvl);
 		KASSERT(pte != NULL,
 		    ("pmap_remove_all: no page table entry found"));
 		KASSERT(lvl == 2,
 		    ("pmap_remove_all: invalid pte level %d", lvl));
 
 		pmap_demote_l2_locked(pmap, pte, va, &lock);
 		PMAP_UNLOCK(pmap);
 	}
 	/* Pass 2: destroy every 4KB mapping of this page. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * Same lock-order protocol as above, but both the superpage
 		 * and the page generation counts must be unchanged.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				rw_wunlock(lock);
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		pmap_resident_count_dec(pmap, 1);
 
 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
 		KASSERT(pde != NULL,
 		    ("pmap_remove_all: no page directory entry found"));
 		KASSERT(lvl == 2,
 		    ("pmap_remove_all: invalid pde level %d", lvl));
 		tpde = pmap_load(pde);
 
 		/* Clear the L3 entry and flush its TLB entry. */
 		pte = pmap_l2_to_l3(pde, pv->pv_va);
 		tpte = pmap_load_clear(pte);
 		pmap_invalidate_page(pmap, pv->pv_va);
 		if (tpte & ATTR_SW_WIRED)
 			pmap->pm_stats.wired_count--;
 		if ((tpte & ATTR_AF) != 0)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if (pmap_pte_dirty(tpte))
 			vm_page_dirty(m);
 		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		free_pv_entry(pmap, pv);
 		PMAP_UNLOCK(pmap);
 	}
 	/* No mappings remain, so the page can no longer be written to. */
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(lock);
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  * pmap_protect_l2: apply new access restrictions to a 2MB block mapping.
  *
  * The bits selected by "mask" in the L2 entry are replaced by "nbits" with
  * a compare-and-set loop, so concurrent updates to the entry are not lost.
  * Nothing is done if the entry already has the requested restrictions.
  */
 static void
 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
     pt_entry_t nbits)
 {
 	pd_entry_t old_l2;
 	vm_page_t first;
 	u_long i;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & L2_OFFSET) == 0,
 	    ("pmap_protect_l2: sva is not 2mpage aligned"));
 	old_l2 = pmap_load(l2);
 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
 	    ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
 
 	for (;;) {
 		/*
 		 * Return if the L2 entry already has the desired access
 		 * restrictions in place.
 		 */
 		if ((old_l2 & mask) == nbits)
 			return;
 
 		/*
 		 * Write protecting a dirty, managed superpage mapping must
 		 * first mark each of its constituent 4KB pages dirty.
 		 */
 		if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
 		    (nbits & ATTR_AP(ATTR_AP_RO)) != 0 &&
 		    pmap_pte_dirty(old_l2)) {
 			first = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
 			for (i = 0; i < L2_SIZE / PAGE_SIZE; i++)
 				vm_page_dirty(&first[i]);
 		}
 
 		/* On failure, old_l2 is refreshed and the checks are redone. */
 		if (atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
 			break;
 	}
 
 	/*
 	 * A promotion breaks the 4KB page mappings before it makes the 2MB
 	 * page mapping, so invalidating a single page is sufficient.
 	 */
 	pmap_invalidate_page(pmap, sva);
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  *
  *	VM_PROT_NONE is handled by removing the range.  Otherwise, write
  *	and/or execute permission is taken away according to "prot".
  *	Nothing is done if "prot" allows both.  The range is walked one
  *	L2 entry (2MB) at a time; a 2MB block mapping that is only
  *	partly covered by the range is demoted before it is updated.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	vm_offset_t va, va_next;
 	pd_entry_t *l0, *l1, *l2;
 	pt_entry_t *l3p, l3, mask, nbits;
 
 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
 	if (prot == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 
 	/*
 	 * "mask" selects the entry bits to replace and "nbits" holds their
 	 * new values.
 	 */
 	mask = nbits = 0;
 	if ((prot & VM_PROT_WRITE) == 0) {
 		mask |= ATTR_AP_RW_BIT | ATTR_SW_DBM;
 		nbits |= ATTR_AP(ATTR_AP_RO);
 	}
 	if ((prot & VM_PROT_EXECUTE) == 0) {
 		mask |= ATTR_XN;
 		nbits |= ATTR_XN;
 	}
 	/* Neither write nor execute permission is being removed. */
 	if (mask == 0)
 		return;
 
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 
 		/*
 		 * At each level, skip to the next boundary when the entry is
 		 * empty.  If the addition wraps around, stop at "eva".
 		 */
 		l0 = pmap_l0(pmap, sva);
 		if (pmap_load(l0) == 0) {
 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		l1 = pmap_l0_to_l1(l0, sva);
 		if (pmap_load(l1) == 0) {
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if (pmap_load(l2) == 0)
 			continue;
 
 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
 			/*
 			 * Update the whole superpage if the range covers it
 			 * completely.  Otherwise demote it to 4KB mappings;
 			 * if demotion fails, move on to the next L2 entry.
 			 */
 			if (sva + L2_SIZE == va_next && eva >= va_next) {
 				pmap_protect_l2(pmap, l2, sva, mask, nbits);
 				continue;
 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
 				continue;
 		}
 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
 		    ("pmap_protect: Invalid L2 entry after demotion"));
 
 		if (va_next > eva)
 			va_next = eva;
 
 		/*
 		 * "va" records the start of a run of updated entries that has
 		 * not been invalidated yet; "va == va_next" means that no
 		 * invalidation is pending.
 		 */
 		va = va_next;
 		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
 		    sva += L3_SIZE) {
 			l3 = pmap_load(l3p);
 retry:
 			/*
 			 * Go to the next L3 entry if the current one is
 			 * invalid or already has the desired access
 			 * restrictions in place.  (The latter case occurs
 			 * frequently.  For example, in a "buildworld"
 			 * workload, almost 1 out of 4 L3 entries already
 			 * have the desired restrictions.)
 			 */
 			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
 				if (va != va_next) {
 					pmap_invalidate_range(pmap, va, sva);
 					va = va_next;
 				}
 				continue;
 			}
 
 			/*
 			 * When a dirty read/write mapping is write protected,
 			 * update the page's dirty field.
 			 */
 			if ((l3 & ATTR_SW_MANAGED) != 0 &&
 			    (nbits & ATTR_AP(ATTR_AP_RO)) != 0 &&
 			    pmap_pte_dirty(l3))
 				vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK));
 
 			/*
 			 * On failure, "l3" is refreshed with the entry's
 			 * current value and the checks above are redone.
 			 */
 			if (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | nbits))
 				goto retry;
 			/* Start a new run of pending invalidations. */
 			if (va == va_next)
 				va = sva;
 		}
 		/* Flush whatever run of updated entries is still pending. */
 		if (va != va_next)
 			pmap_invalidate_range(pmap, va, sva);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Adds the page table page "mpte" to the pmap's collection of idle page
  * table pages, which is kept in the radix tree rooted at pm_root.  Each of
  * a pmap's page table pages maps a distinct range of virtual addresses, and
  * the collection is ordered by that range.
  *
  * The page's "valid" field records whether it still holds mappings: it is
  * set to VM_PAGE_BITS_ALL when "promoted" is true and to zero otherwise, in
  * which case the page must be zero filled.  Returns the result of
  * vm_radix_insert().
  */
 static __inline int
 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (promoted)
 		mpte->valid = VM_PAGE_BITS_ALL;
 	else
 		mpte->valid = 0;
 	return (vm_radix_insert(&pmap->pm_root, mpte));
 }
 
 /*
  * Takes the page table page that maps the given virtual address out of the
  * pmap's collection of idle page table pages and returns it.  Returns NULL
  * if the collection holds no page table page for that address.
  */
 static __inline vm_page_t
 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 {
 	vm_pindex_t ptpindex;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	ptpindex = pmap_l2_pindex(va);
 	return (vm_radix_remove(&pmap->pm_root, ptpindex));
 }
 
 /*
  * Performs a break-before-make update of a pmap entry. This is needed when
  * either promoting or demoting pages to ensure the TLB doesn't get into an
  * inconsistent state.
  *
  * "pte" is the entry to rewrite, "newpte" is its new contents, and
  * [va, va + size) is the virtual address range whose TLB entries are
  * invalidated between the clear and the store.  The pmap lock must be held.
  */
 static void
 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
     vm_offset_t va, vm_size_t size)
 {
 	register_t intr;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Ensure we don't get switched out with the page table in an
 	 * inconsistent state. We also need to ensure no interrupts fire
 	 * as they may make use of an address we are about to invalidate.
 	 */
 	intr = intr_disable();
 	critical_enter();
 
 	/* Clear the old mapping */
 	pmap_clear(pte);
 	pmap_invalidate_range_nopin(pmap, va, va + size);
 
 	/* Create the new mapping */
 	pmap_store(pte, newpte);
 	dsb(ishst);
 
 	/* Leave the protected region in the reverse order of entry. */
 	critical_exit();
 	intr_restore(intr);
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
  * collapse the pv entries for the 4KB page mappings into one pv entry for
  * the 2MB page mapping.  "pa" must be 2MB aligned (asserted); "va" is
  * rounded down to a 2MB boundary here.
  */
 static void
 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *superpage_pvh;
 	pv_entry_t first_pv;
 	vm_offset_t last_va;
 	vm_page_t page;
 
 	KASSERT((pa & L2_OFFSET) == 0,
 	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Move the first page's pv entry for this mapping onto the 2mpage's
 	 * pv list rather than allocating a new one.  Besides saving a call
 	 * to get_pv_entry(), this rules out get_pv_entry() calling
 	 * reclaim_pv_chunk() and reclaim_pv_chunk() removing one of the
 	 * mappings that is being promoted.
 	 */
 	va &= ~L2_OFFSET;
 	page = PHYS_TO_VM_PAGE(pa);
 	first_pv = pmap_pvh_remove(&page->md, pmap, va);
 	KASSERT(first_pv != NULL, ("pmap_pv_promote_l2: pv not found"));
 	superpage_pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&superpage_pvh->pv_list, first_pv, pv_next);
 	superpage_pvh->pv_gen++;
 
 	/* Free the pv entries of the remaining pages of the superpage. */
 	last_va = va + L2_SIZE - PAGE_SIZE;
 	for (;;) {
 		page++;
 		va += PAGE_SIZE;
 		pmap_pvh_free(&page->md, pmap, va);
 		if (va >= last_va)
 			break;
 	}
 }
 
 /*
  * Tries to promote the 512, contiguous 4KB page mappings that are within a
  * single level 2 table entry to a single 2MB page mapping.  For promotion
  * to occur, two conditions must be met: (1) the 4KB page mappings must map
  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
  * identical characteristics.
  *
  * "l2" is the L2 entry that currently points to the L3 page table and "va"
  * is any address within the 2MB region.  On failure the page table is left
  * in place and pmap_l2_p_failures is incremented.  On success the L3 page
  * table page is saved in the pmap's collection of idle page table pages,
  * the pv entries are collapsed (managed mappings only), and the L2 entry is
  * replaced by a block mapping using break-before-make.
  */
 static void
 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
     struct rwlock **lockp)
 {
 	pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
 	vm_page_t mpte;
 	vm_offset_t sva;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	sva = va & ~L2_OFFSET;
 	firstl3 = pmap_l2_to_l3(l2, sva);
 	newl2 = pmap_load(firstl3);
 
 setl2:
 	/*
 	 * The first L3 entry must map 2MB-aligned physical memory and have
 	 * its access flag set.
 	 */
 	if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) {
 		atomic_add_long(&pmap_l2_p_failures, 1);
 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return;
 	}
 
 	/*
 	 * A clean mapping (read-only with ATTR_SW_DBM set) has ATTR_SW_DBM
 	 * cleared so that all 512 entries compare equal below.  The
 	 * compare-and-set must target the first L3 entry, from which newl2
 	 * was loaded.  Targeting the L2 entry would compare against the
 	 * table descriptor, which does not match, and the resulting retry
 	 * would wrongly abandon the promotion.
 	 */
 	if ((newl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) ==
 	    (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) {
 		if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
 			goto setl2;
 		newl2 &= ~ATTR_SW_DBM;
 	}
 
 	/*
 	 * Walk the remaining entries from last to first.  "pa" is the value
 	 * that each entry must have: the first entry's contents plus the
 	 * entry's byte offset within the 2MB region.
 	 */
 	pa = newl2 + L2_SIZE - PAGE_SIZE;
 	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
 		oldl3 = pmap_load(l3);
 setl3:
 		/* As above: clear ATTR_SW_DBM in a clean mapping. */
 		if ((oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) ==
 		    (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) {
 			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
 			    ~ATTR_SW_DBM))
 				goto setl3;
 			oldl3 &= ~ATTR_SW_DBM;
 		}
 		if (oldl3 != pa) {
 			atomic_add_long(&pmap_l2_p_failures, 1);
 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return;
 		}
 		pa -= PAGE_SIZE;
 	}
 
 	/*
 	 * Save the page table page in its current state until the L2
 	 * mapping the superpage is demoted by pmap_demote_l2() or
 	 * destroyed by pmap_remove_l3().
 	 */
 	mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
 	KASSERT(mpte >= vm_page_array &&
 	    mpte < &vm_page_array[vm_page_array_size],
 	    ("pmap_promote_l2: page table page is out of range"));
 	KASSERT(mpte->pindex == pmap_l2_pindex(va),
 	    ("pmap_promote_l2: page table page's pindex is wrong"));
 	if (pmap_insert_pt_page(pmap, mpte, true)) {
 		atomic_add_long(&pmap_l2_p_failures, 1);
 		CTR2(KTR_PMAP,
 		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
 		    pmap);
 		return;
 	}
 
 	if ((newl2 & ATTR_SW_MANAGED) != 0)
 		pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);
 
 	/* Turn the L3 page descriptor into an L2 block descriptor. */
 	newl2 &= ~ATTR_DESCR_MASK;
 	newl2 |= L2_BLOCK;
 
 	pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE);
 
 	atomic_add_long(&pmap_l2_promotions, 1);
 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
 		    pmap);
 }
 #endif /* VM_NRESERVLEVEL > 0 */
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  *
  *	Returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE when
  *	PMAP_ENTER_NOSLEEP is given and an L3 page table page cannot
  *	be allocated.  When "psind" is 1, the result of pmap_enter_l2()
  *	is returned instead.
  */
 int
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     u_int flags, int8_t psind)
 {
 	struct rwlock *lock;
 	pd_entry_t *pde;
 	pt_entry_t new_l3, orig_l3;
 	pt_entry_t *l2, *l3;
 	pv_entry_t pv;
 	vm_paddr_t opa, pa;
 	vm_page_t mpte, om;
 	boolean_t nosleep;
 	int lvl, rv;
 
 	va = trunc_page(va);
 	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
 		VM_OBJECT_ASSERT_LOCKED(m->object);
 	/* Build the new L3 entry from the page, protection, and flags. */
 	pa = VM_PAGE_TO_PHYS(m);
 	new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
 	    L3_PAGE);
 	if ((prot & VM_PROT_WRITE) == 0)
 		new_l3 |= ATTR_AP(ATTR_AP_RO);
 	if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
 		new_l3 |= ATTR_XN;
 	if ((flags & PMAP_ENTER_WIRED) != 0)
 		new_l3 |= ATTR_SW_WIRED;
 	if (va < VM_MAXUSER_ADDRESS)
 		new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN;
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		new_l3 |= ATTR_SW_MANAGED;
 		/*
 		 * A managed, writeable mapping is tagged with ATTR_SW_DBM.
 		 * It is entered read-only unless "flags" includes
 		 * VM_PROT_WRITE.
 		 * NOTE(review): "flags" presumably carries the access type
 		 * that triggered this call -- confirm against the callers.
 		 */
 		if ((prot & VM_PROT_WRITE) != 0) {
 			new_l3 |= ATTR_SW_DBM;
 			if ((flags & VM_PROT_WRITE) == 0)
 				new_l3 |= ATTR_AP(ATTR_AP_RO);
 		}
 	}
 
 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
 
 	lock = NULL;
 	PMAP_LOCK(pmap);
 	if (psind == 1) {
 		/* Assert the required virtual and physical alignment. */
 		KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
 		/* Enter a 2MB mapping using the same attributes. */
 		rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
 		    flags, m, &lock);
 		goto out;
 	}
 	mpte = NULL;
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 retry:
 	pde = pmap_pde(pmap, va, &lvl);
 	if (pde != NULL && lvl == 2) {
 		/* An L3 table exists for "va". */
 		l3 = pmap_l2_to_l3(pde, va);
 		/*
 		 * For a user address, take a reference on the page table
 		 * page, unless one was already obtained before a retry.
 		 */
 		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
 			mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
 			mpte->wire_count++;
 		}
 		goto havel3;
 	} else if (pde != NULL && lvl == 1) {
 		/*
 		 * The walk stopped at level 1.  If the L2 entry is a 2MB
 		 * block mapping, try to demote it to obtain an L3 table.
 		 */
 		l2 = pmap_l1_to_l2(pde, va);
 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
 		    (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
 			l3 = &l3[pmap_l3_index(va)];
 			if (va < VM_MAXUSER_ADDRESS) {
 				mpte = PHYS_TO_VM_PAGE(
 				    pmap_load(l2) & ~ATTR_MASK);
 				mpte->wire_count++;
 			}
 			goto havel3;
 		}
 		/* We need to allocate an L3 table. */
 	}
 	if (va < VM_MAXUSER_ADDRESS) {
 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
 
 		/*
 		 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
 		 * to handle the possibility that a superpage mapping for "va"
 		 * was created while we slept.
 		 */
 		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
 		    nosleep ? NULL : &lock);
 		if (mpte == NULL && nosleep) {
 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
 			rv = KERN_RESOURCE_SHORTAGE;
 			goto out;
 		}
 		goto retry;
 	} else
 		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
 
 havel3:
 	orig_l3 = pmap_load(l3);
 	opa = orig_l3 & ~ATTR_MASK;
 	pv = NULL;
 
 	/*
 	 * Is the specified virtual address already mapped?
 	 */
 	if (pmap_l3_valid(orig_l3)) {
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
 		    (orig_l3 & ATTR_SW_WIRED) == 0)
 			pmap->pm_stats.wired_count++;
 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
 		    (orig_l3 & ATTR_SW_WIRED) != 0)
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * Remove the extra PT page reference.
 		 */
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			KASSERT(mpte->wire_count > 0,
 			    ("pmap_enter: missing reference to page table page,"
 			     " va: 0x%lx", va));
 		}
 
 		/*
 		 * Has the physical page changed?
 		 */
 		if (opa == pa) {
 			/*
 			 * No, might be a protection or wiring change.
 			 */
 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
 			    (new_l3 & ATTR_SW_DBM) != 0)
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 			goto validate;
 		}
 
 		/*
 		 * The physical page has changed.  Temporarily invalidate
 		 * the mapping.
 		 */
 		orig_l3 = pmap_load_clear(l3);
 		KASSERT((orig_l3 & ~ATTR_MASK) == opa,
 		    ("pmap_enter: unexpected pa update for %#lx", va));
 		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
 			om = PHYS_TO_VM_PAGE(opa);
 
 			/*
 			 * The pmap lock is sufficient to synchronize with
 			 * concurrent calls to pmap_page_test_mappings() and
 			 * pmap_ts_referenced().
 			 */
 			if (pmap_pte_dirty(orig_l3))
 				vm_page_dirty(om);
 			if ((orig_l3 & ATTR_AF) != 0)
 				vm_page_aflag_set(om, PGA_REFERENCED);
 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
 			/*
 			 * Detach the old page's pv entry.  It is reused for
 			 * the new page below when that page is managed, and
 			 * freed here otherwise.
 			 */
 			pv = pmap_pvh_remove(&om->md, pmap, va);
 			if ((m->oflags & VPO_UNMANAGED) != 0)
 				free_pv_entry(pmap, pv);
 			if ((om->aflags & PGA_WRITEABLE) != 0 &&
 			    TAILQ_EMPTY(&om->md.pv_list) &&
 			    ((om->flags & PG_FICTITIOUS) != 0 ||
 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 				vm_page_aflag_clear(om, PGA_WRITEABLE);
 		}
 		pmap_invalidate_page(pmap, va);
 		/* From here on, treat the entry as a brand new mapping. */
 		orig_l3 = 0;
 	} else {
 		/*
 		 * Increment the counters.
 		 */
 		if ((new_l3 & ATTR_SW_WIRED) != 0)
 			pmap->pm_stats.wired_count++;
 		pmap_resident_count_inc(pmap, 1);
 	}
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		/* Allocate a pv entry unless one was recycled above. */
 		if (pv == NULL) {
 			pv = get_pv_entry(pmap, &lock);
 			pv->pv_va = va;
 		}
 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		if ((new_l3 & ATTR_SW_DBM) != 0)
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
 
 validate:
 	/*
 	 * Sync the icache if execute permission is requested and the memory
 	 * attribute is VM_MEMATTR_WRITE_BACK.  Do it now, before the mapping
 	 * is stored and made valid for the hardware table walk.  If done
 	 * later, another thread could access this page before the caches are
 	 * properly synced.
 	 * Don't do it for kernel memory, which is mapped with execute
 	 * permission even if the memory isn't going to hold executable
 	 * code.  The only time an icache sync is needed there is after a
 	 * kernel module is loaded and its relocation info is processed,
 	 * and that is done in elf_cpu_load_file().
 	 */
 	if ((prot & VM_PROT_EXECUTE) &&  pmap != kernel_pmap &&
 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
 	    (opa != pa || (orig_l3 & ATTR_XN)))
 		cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
 
 	/*
 	 * Update the L3 entry
 	 */
 	if (pmap_l3_valid(orig_l3)) {
 		KASSERT(opa == pa, ("pmap_enter: invalid update"));
 		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
 			/* same PA, different attributes */
 			/* XXXMJ need to reload orig_l3 for hardware DBM. */
 			pmap_load_store(l3, new_l3);
 			pmap_invalidate_page(pmap, va);
 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
 			    pmap_pte_dirty(orig_l3))
 				vm_page_dirty(m);
 		} else {
 			/*
 			 * orig_l3 == new_l3
 			 * This can happen if multiple threads simultaneously
 			 * access a page that is not yet mapped.  This is bad
 			 * for performance since it can cause a full
 			 * demotion-NOP-promotion cycle.
 			 * Other possible reasons are:
 			 * - the VM and pmap memory layouts have diverged
 			 * - a TLB flush is missing somewhere and the CPU
 			 *   doesn't see the actual mapping.
 			 */
 			CTR4(KTR_PMAP, "%s: already mapped page - "
 			    "pmap %p va 0x%#lx pte 0x%lx",
 			    __func__, pmap, va, new_l3);
 		}
 	} else {
 		/* New mapping */
 		pmap_store(l3, new_l3);
 		dsb(ishst);
 	}
 
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Attempt a 2MB promotion for a user pmap when the L3 page table
 	 * page's wire count equals NL3PG (or no page table page reference
 	 * is held), superpages are enabled, the page is not fictitious, and
 	 * vm_reserv_level_iffullpop() returns 0 for the page.
 	 */
 	if (pmap != pmap_kernel() &&
 	    (mpte == NULL || mpte->wire_count == NL3PG) &&
 	    pmap_ps_enabled(pmap) &&
 	    (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0) {
 		pmap_promote_l2(pmap, pde, va, &lock);
 	}
 #endif
 
 	rv = KERN_SUCCESS;
 out:
 	/* Drop the pv list lock if one was acquired along the way. */
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  * Attempts to install a 2MB (L2 block) mapping of the superpage that starts
  * at page "m" at virtual address "va".  The mapping is always created
  * read-only; it is also marked execute-never unless "prot" includes
  * VM_PROT_EXECUTE and the page's memory attribute is not DEVICE_MEMORY.
  *
  * The request is forwarded to pmap_enter_l2() with PMAP_ENTER_NOSLEEP,
  * PMAP_ENTER_NOREPLACE and PMAP_ENTER_NORECLAIM, so it fails instead of
  * (1) sleeping for a page table page, (2) replacing a mapping that already
  * exists at "va", or (3) reclaiming another PV entry.
  *
  * Returns true if the mapping was created and false otherwise.  The pmap
  * lock must be held by the caller.
  */
 static bool
 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     struct rwlock **lockp)
 {
 	const u_int flags = PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE |
 	    PMAP_ENTER_NORECLAIM;
 	pd_entry_t entry;
 	int error;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Start from a read-only block entry with the page's memory type. */
 	entry = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
 	    ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L2_BLOCK);
 
 	/* Managed mappings are tagged and start with the access flag clear. */
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		entry = (entry | ATTR_SW_MANAGED) & ~ATTR_AF;
 
 	/* No execute permission requested, or device memory: forbid exec. */
 	if ((prot & VM_PROT_EXECUTE) == 0 ||
 	    m->md.pv_memattr == DEVICE_MEMORY)
 		entry |= ATTR_XN;
 
 	/* User addresses are user-accessible and never privileged-exec. */
 	if (va < VM_MAXUSER_ADDRESS)
 		entry |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN;
 
 	error = pmap_enter_l2(pmap, va, entry, flags, NULL, lockp);
 	return (error == KERN_SUCCESS);
 }
 
 /*
  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
  * a mapping already exists at the specified virtual address.  Returns
  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
  *
  * The parameter "m" is only used when creating a managed, writeable mapping.
  *
  * "new_l2" is the complete L2 block entry to install, "flags" is a mask of
  * PMAP_ENTER_* options, and "lockp" points to the caller's PV list lock
  * pointer, which is handed on to the helpers called below.  The pmap lock
  * must be held by the caller.
  */
 static int
 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
     vm_page_t m, struct rwlock **lockp)
 {
 	struct spglist free;
 	pd_entry_t *l2, old_l2;
 	vm_page_t l2pg, mt;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Find (or allocate) the L2 page table page covering "va".  With
 	 * PMAP_ENTER_NOSLEEP the lock pointer is withheld (NULL is passed).
 	 */
 	if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
 	    NULL : lockp)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
 		    va, pmap);
 		return (KERN_RESOURCE_SHORTAGE);
 	}
 
 	/* Locate the L2 entry for "va" through the direct map. */
 	l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
 	l2 = &l2[pmap_l2_index(va)];
 	if ((old_l2 = pmap_load(l2)) != 0) {
 		/* Something is already mapped here: fail or tear it down. */
 		KASSERT(l2pg->wire_count > 1,
 		    ("pmap_enter_l2: l2pg's wire count is too low"));
 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
 			/*
 			 * NOTE(review): presumably undoes the reference taken
 			 * by pmap_alloc_l2() above -- confirm.
 			 */
 			l2pg->wire_count--;
 			CTR2(KTR_PMAP,
 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
 			    va, pmap);
 			return (KERN_FAILURE);
 		}
 		SLIST_INIT(&free);
 		/* Remove the old superpage, or the old range of L3 mappings. */
 		if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
 			(void)pmap_remove_l2(pmap, l2, va,
 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
 		else
 			pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
 			    &free, lockp);
 		vm_page_free_pages_toq(&free, true);
 		if (va >= VM_MAXUSER_ADDRESS) {
 			/*
 			 * Both pmap_remove_l2() and pmap_remove_l3_range()
 			 * will leave the kernel page table page zero filled.
 			 * Nonetheless, the TLB could have an intermediate
 			 * entry for the kernel page table page.
 			 */
 			mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
 			if (pmap_insert_pt_page(pmap, mt, false))
 				panic("pmap_enter_l2: trie insert failed");
 			pmap_clear(l2);
 			pmap_invalidate_page(pmap, va);
 		} else
 			KASSERT(pmap_load(l2) == 0,
 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
 	}
 
 	if ((new_l2 & ATTR_SW_MANAGED) != 0) {
 		/*
 		 * Abort this mapping if its PV entry could not be created.
 		 */
 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
 			SLIST_INIT(&free);
 			if (pmap_unwire_l3(pmap, va, l2pg, &free)) {
 				/*
 				 * Although "va" is not mapped, the TLB could
 				 * nonetheless have intermediate entries that
 				 * refer to the freed page table pages.
 				 * Invalidate those entries.
 				 *
 				 * XXX redundant invalidation (See
 				 * _pmap_unwire_l3().)
 				 */
 				pmap_invalidate_page(pmap, va);
 				vm_page_free_pages_toq(&free, true);
 			}
 			CTR2(KTR_PMAP,
 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
 			    va, pmap);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 		/*
 		 * A managed mapping with ATTR_SW_DBM set: flag every 4KB page
 		 * of the superpage as PGA_WRITEABLE.  This is the only use
 		 * of "m".
 		 */
 		if ((new_l2 & ATTR_SW_DBM) != 0)
 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
 				vm_page_aflag_set(mt, PGA_WRITEABLE);
 	}
 
 	/*
 	 * Increment counters.
 	 */
 	if ((new_l2 & ATTR_SW_WIRED) != 0)
 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
 
 	/*
 	 * Map the superpage.
 	 */
 	pmap_store(l2, new_l2);
 	dsb(ishst);
 
 	atomic_add_long(&pmap_l2_mappings, 1);
 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
 	    va, pmap);
 
 	return (KERN_SUCCESS);
 }
 
 /*
  * Maps a run of resident pages that belong to one object.
  *
  * The run starts at page "m_start", which is mapped at virtual address
  * "start".  Every following page on the object's page list is mapped at
  * "start" plus the distance (in pages) between its index and the index of
  * "m_start".  The walk stops at the first page whose virtual address would
  * be at or beyond "end", or when the list is exhausted.  Holes in the
  * object stay unmapped: only pages that are actually resident are entered.
  *
  * A 2MB mapping is attempted when the address is 2MB-aligned, the whole
  * 2MB range fits below "end", the page heads a superpage (psind == 1) and
  * superpages are enabled for the pmap; otherwise, or if that attempt
  * fails, a single page is entered with pmap_enter_quick_locked().
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	struct rwlock *pvlock;
 	vm_page_t l3pg, pg;
 	vm_pindex_t npages, off;
 	vm_offset_t va;
 
 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
 
 	npages = atop(end - start);
 	l3pg = NULL;
 	pvlock = NULL;
 	PMAP_LOCK(pmap);
 	for (pg = m_start; pg != NULL; pg = TAILQ_NEXT(pg, listq)) {
 		off = pg->pindex - m_start->pindex;
 		if (off >= npages)
 			break;
 		va = start + ptoa(off);
 
 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
 		    pg->psind == 1 && pmap_ps_enabled(pmap) &&
 		    pmap_enter_2mpage(pmap, va, pg, prot, &pvlock)) {
 			/*
 			 * The superpage is mapped; step to its last page so
 			 * that the loop increment moves past all of it.
 			 */
 			pg += L2_SIZE / PAGE_SIZE - 1;
 			continue;
 		}
 		l3pg = pmap_enter_quick_locked(pmap, va, pg, prot, l3pg,
 		    &pvlock);
 	}
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Lightweight alternative to pmap_enter() for a single page.  It relies on
  * the following assumptions:
  * 1. The current pmap and the given pmap exist.
  * 2. The mapping is not wired.
  * 3. Only read access is needed.
  * 4. The page is not a page table page.
  * In exchange it is much faster than pmap_enter().
  *
  * This is a locking wrapper: it takes the pmap lock, delegates to
  * pmap_enter_quick_locked(), and releases any PV list lock that call left
  * held before dropping the pmap lock.
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	struct rwlock *pvlock = NULL;
 
 	PMAP_LOCK(pmap);
 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &pvlock);
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Creates a read-only L3 page mapping of page "m" at "va" without sleeping.
  * The pmap lock must be held by the caller.
  *
  * "mpte" is an optional hint: the L3 page table page returned by a previous
  * call.  If it covers "va" it is reused, which avoids a page table walk.
  * "lockp" points to the caller's PV list lock pointer.
  *
  * Returns the L3 page table page that now holds the mapping when "va" is a
  * user address.  Returns NULL for kernel addresses and whenever the mapping
  * was not created: the address is covered by an L2 block, a page table page
  * could not be allocated, a mapping already exists, or a PV entry could not
  * be obtained.
  */
 static vm_page_t
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
 {
 	struct spglist free;
 	pd_entry_t *pde;
 	pt_entry_t *l2, *l3, l3_val;
 	vm_paddr_t pa;
 	int lvl;
 
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		vm_pindex_t l2pindex;
 
 		/*
 		 * Calculate pagetable page index
 		 */
 		l2pindex = pmap_l2_pindex(va);
 		if (mpte && (mpte->pindex == l2pindex)) {
 			/* The caller's hint covers "va": take a reference. */
 			mpte->wire_count++;
 		} else {
 			/*
 			 * Get the l2 entry
 			 */
 			pde = pmap_pde(pmap, va, &lvl);
 
 			/*
 			 * If the page table page is mapped, we just increment
 			 * the hold count, and activate it.  Otherwise, we
 			 * attempt to allocate a page table page.  If this
 			 * attempt fails, we don't retry.  Instead, we give up.
 			 */
 			if (lvl == 1) {
 				/* Do not touch an existing 2MB block mapping. */
 				l2 = pmap_l1_to_l2(pde, va);
 				if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
 				    L2_BLOCK)
 					return (NULL);
 			}
 			if (lvl == 2 && pmap_load(pde) != 0) {
 				mpte =
 				    PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
 				mpte->wire_count++;
 			} else {
 				/*
 				 * Pass NULL instead of the PV list lock
 				 * pointer, because we don't intend to sleep.
 				 */
 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
 				if (mpte == NULL)
 					return (mpte);
 			}
 		}
 		/* Locate the L3 entry through the direct map. */
 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 		l3 = &l3[pmap_l3_index(va)];
 	} else {
 		/* Kernel address: the L3 table is asserted to exist already. */
 		mpte = NULL;
 		pde = pmap_pde(kernel_pmap, va, &lvl);
 		KASSERT(pde != NULL,
 		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
 		     va));
 		KASSERT(lvl == 2,
 		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
 		l3 = pmap_l2_to_l3(pde, va);
 	}
 
 	/*
 	 * Abort if a mapping already exists.
 	 */
 	if (pmap_load(l3) != 0) {
 		if (mpte != NULL) {
 			/* Give back the reference taken above. */
 			mpte->wire_count--;
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 		if (mpte != NULL) {
 			/*
 			 * No PV entry: drop the page table page reference and
 			 * free the page table page if that was the last one.
 			 */
 			SLIST_INIT(&free);
 			if (pmap_unwire_l3(pmap, va, mpte, &free)) {
 				pmap_invalidate_page(pmap, va);
 				vm_page_free_pages_toq(&free, true);
 			}
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Increment counters
 	 */
 	pmap_resident_count_inc(pmap, 1);
 
 	/* Build a read-only L3 page entry with the page's memory type. */
 	pa = VM_PAGE_TO_PHYS(m);
 	l3_val = pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
 	    ATTR_AP(ATTR_AP_RO) | L3_PAGE;
 	if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
 		l3_val |= ATTR_XN;
 	else if (va < VM_MAXUSER_ADDRESS)
 		l3_val |= ATTR_PXN;
 
 	/*
 	 * Now validate mapping with RO protection
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		/* Managed mappings start with the access flag clear. */
 		l3_val |= ATTR_SW_MANAGED;
 		l3_val &= ~ATTR_AF;
 	}
 
 	/* Sync icache before the mapping is stored to PTE */
 	if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
 		cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
 
 	pmap_store(l3, l3_val);
 	dsb(ishst);
 
 	return (mpte);
 }
 
 /*
  * Hook for pre-mapping large physical (device) mmap regions into the
  * processor address space.
  *
  * This implementation creates no mappings.  It only asserts that the
  * object is write-locked and that it is a device (OBJT_DEVICE) or
  * scatter/gather (OBJT_SG) object; "pmap", "addr", "pindex" and "size"
  * are unused.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 	    ("pmap_object_init_pt: non-device object"));
 }
 
 /*
  *	Clear the wired attribute from the mappings for the specified range of
  *	addresses in the given pmap.  Every valid mapping within that range
  *	must have the wired attribute set.  In contrast, invalid mappings
  *	cannot have the wired attribute set, so they are ignored.
  *
  *	The wired attribute of the page table entry is not a hardware feature,
  *	so there is no need to invalidate any TLB entries.
  *
  *	The range is [sva, eva).  A valid mapping in the range that lacks
  *	ATTR_SW_WIRED causes a panic, as does a failed superpage demotion.
  *	The pmap's wired_count statistic is decreased for every mapping
  *	that is unwired.
  */
 void
 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t va_next;
 	pd_entry_t *l0, *l1, *l2;
 	pt_entry_t *l3;
 
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		/*
 		 * At each level, an empty entry lets us skip to the next
 		 * boundary of that level.  If computing that boundary wrapped
 		 * around the address space, stop at eva instead.
 		 */
 		l0 = pmap_l0(pmap, sva);
 		if (pmap_load(l0) == 0) {
 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		l1 = pmap_l0_to_l1(l0, sva);
 		if (pmap_load(l1) == 0) {
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if (pmap_load(l2) == 0)
 			continue;
 
 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
 			if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
 				panic("pmap_unwire: l2 %#jx is missing "
 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
 
 			/*
 			 * Are we unwiring the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + L2_SIZE == va_next && eva >= va_next) {
 				pmap_clear_bits(l2, ATTR_SW_WIRED);
 				pmap->pm_stats.wired_count -= L2_SIZE /
 				    PAGE_SIZE;
 				continue;
 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
 				panic("pmap_unwire: demotion failed");
 		}
 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
 		    ("pmap_unwire: Invalid l2 entry after demotion"));
 
 		/* Do not run past the end of the requested range. */
 		if (va_next > eva)
 			va_next = eva;
 		/* Unwire each valid L3 entry in [sva, va_next). */
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
 			if (pmap_load(l3) == 0)
 				continue;
 			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
 				panic("pmap_unwire: l3 %#jx is missing "
 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
 
 			/*
 			 * ATTR_SW_WIRED must be cleared atomically.  Although
 			 * the pmap lock synchronizes access to ATTR_SW_WIRED,
 			 * the System MMU may write to the entry concurrently.
 			 */
 			pmap_clear_bits(l3, ATTR_SW_WIRED);
 			pmap->pm_stats.wired_count--;
 		}
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  *
  *	Because the executable mappings created by this routine are copied,
  *	it should not have to flush the instruction cache.
  *
  *	Nothing is copied unless dst_addr equals src_addr.  Unmanaged 4KB
  *	mappings are skipped.  Copied entries have ATTR_AF and ATTR_SW_WIRED
  *	cleared, and entries with ATTR_SW_DBM set additionally get
  *	ATTR_AP_RW_BIT set.  The copy stops early, without error, when a
  *	page table page or PV entry cannot be obtained.
  */
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
     vm_offset_t src_addr)
 {
 	struct rwlock *lock;
 	struct spglist free;
 	pd_entry_t *l0, *l1, *l2, srcptepaddr;
 	pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
 	vm_offset_t addr, end_addr, va_next;
 	vm_page_t dst_l2pg, dstmpte, srcmpte;
 
 	if (dst_addr != src_addr)
 		return;
 	end_addr = src_addr + len;
 	lock = NULL;
 	/*
 	 * Take both pmap locks, lower address first.
 	 * NOTE(review): presumably a fixed order to avoid deadlock between
 	 * concurrent copies -- confirm.
 	 */
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 	for (addr = src_addr; addr < end_addr; addr = va_next) {
 		/*
 		 * Walk the source page table.  An empty entry at any level
 		 * lets us skip to that level's next boundary (clamped to
 		 * end_addr if the computation wrapped around).
 		 */
 		l0 = pmap_l0(src_pmap, addr);
 		if (pmap_load(l0) == 0) {
 			va_next = (addr + L0_SIZE) & ~L0_OFFSET;
 			if (va_next < addr)
 				va_next = end_addr;
 			continue;
 		}
 		l1 = pmap_l0_to_l1(l0, addr);
 		if (pmap_load(l1) == 0) {
 			va_next = (addr + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < addr)
 				va_next = end_addr;
 			continue;
 		}
 		va_next = (addr + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < addr)
 			va_next = end_addr;
 		l2 = pmap_l1_to_l2(l1, addr);
 		srcptepaddr = pmap_load(l2);
 		if (srcptepaddr == 0)
 			continue;
 		if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
 			/*
 			 * Source is a 2MB block.  Copy it only if the whole
 			 * block lies inside the requested range.
 			 */
 			if ((addr & L2_OFFSET) != 0 ||
 			    addr + L2_SIZE > end_addr)
 				continue;
 			dst_l2pg = pmap_alloc_l2(dst_pmap, addr, NULL);
 			if (dst_l2pg == NULL)
 				break;
 			/* From here on "l2" refers to the destination entry. */
 			l2 = (pd_entry_t *)
 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_l2pg));
 			l2 = &l2[pmap_l2_index(addr)];
 			if (pmap_load(l2) == 0 &&
 			    ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
 			    pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
 			    PMAP_ENTER_NORECLAIM, &lock))) {
 				mask = ATTR_AF | ATTR_SW_WIRED;
 				nbits = 0;
 				if ((srcptepaddr & ATTR_SW_DBM) != 0)
 					nbits |= ATTR_AP_RW_BIT;
 				pmap_store(l2, (srcptepaddr & ~mask) | nbits);
 				pmap_resident_count_inc(dst_pmap, L2_SIZE /
 				    PAGE_SIZE);
 				atomic_add_long(&pmap_l2_mappings, 1);
 			} else
 				dst_l2pg->wire_count--;
 			continue;
 		}
 		KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
 		    ("pmap_copy: invalid L2 entry"));
 		/* Source is an L3 table: copy its entries one by one. */
 		srcptepaddr &= ~ATTR_MASK;
 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 		KASSERT(srcmpte->wire_count > 0,
 		    ("pmap_copy: source page table page is unused"));
 		if (va_next > end_addr)
 			va_next = end_addr;
 		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
 		src_pte = &src_pte[pmap_l3_index(addr)];
 		dstmpte = NULL;
 		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
 			ptetemp = pmap_load(src_pte);
 
 			/*
 			 * We only virtual copy managed pages.
 			 */
 			if ((ptetemp & ATTR_SW_MANAGED) == 0)
 				continue;
 
 			/*
 			 * Reuse the destination L3 page table page from the
 			 * previous iteration if there is one; otherwise
 			 * obtain it now.
 			 */
 			if (dstmpte != NULL) {
 				KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
 				    ("dstmpte pindex/addr mismatch"));
 				dstmpte->wire_count++;
 			} else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
 			    NULL)) == NULL)
 				goto out;
 			dst_pte = (pt_entry_t *)
 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 			dst_pte = &dst_pte[pmap_l3_index(addr)];
 			if (pmap_load(dst_pte) == 0 &&
 			    pmap_try_insert_pv_entry(dst_pmap, addr,
 			    PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) {
 				/*
 				 * Clear the wired, modified, and accessed
 				 * (referenced) bits during the copy.
 				 */
 				mask = ATTR_AF | ATTR_SW_WIRED;
 				nbits = 0;
 				if ((ptetemp & ATTR_SW_DBM) != 0)
 					nbits |= ATTR_AP_RW_BIT;
 				pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
 				pmap_resident_count_inc(dst_pmap, 1);
 			} else {
 				SLIST_INIT(&free);
 				if (pmap_unwire_l3(dst_pmap, addr, dstmpte,
 				    &free)) {
 					/*
 					 * Although "addr" is not mapped,
 					 * the TLB could nonetheless have
 					 * intermediate entries that refer
 					 * to the freed page table pages.
 					 * Invalidate those entries.
 					 *
 					 * XXX redundant invalidation
 					 */
 					pmap_invalidate_page(dst_pmap, addr);
 					vm_page_free_pages_toq(&free, true);
 				}
 				goto out;
 			}
 			/* Have we copied all of the valid mappings? */ 
 			if (dstmpte->wire_count >= srcmpte->wire_count)
 				break;
 		}
 	}
 out:
 	/*
 	 * XXX This barrier may not be needed because the destination pmap is
 	 * not active.
 	 */
 	dsb(ishst);
 
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }
 
 /*
  *	pmap_zero_page zeroes the specified hardware page.  The page is
  *	reached through its direct map (DMAP) address and cleared with
  *	pagezero().
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 
 	pagezero((void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
 }
 
 /*
  *	pmap_zero_page_area zeroes "size" bytes starting "off" bytes into
  *	the specified hardware page.  The page is reached through its direct
  *	map (DMAP) address.  A request covering the whole page is handed to
  *	pagezero(); anything smaller is cleared with bzero().
  *
  *	off and size may not cover an area beyond a single hardware page.
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	vm_offset_t dmap_va;
 
 	dmap_va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	if (off != 0 || size != PAGE_SIZE) {
 		bzero((char *)dmap_va + off, size);
 		return;
 	}
 	pagezero((void *)dmap_va);
 }
 
 /*
  *	pmap_copy_page copies the contents of the (machine independent)
  *	page "msrc" into the page "mdst".  Both pages are reached through
  *	their direct map (DMAP) addresses and the copy is performed by
  *	pagecopy().
  */
 void
 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 {
 	void *from, *to;
 
 	from = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 	to = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 	pagecopy(from, to);
 }
 
 /*
  * NOTE(review): not referenced in this part of the file.  Presumably read
  * by the buffer cache to decide whether unmapped buffers may be used on
  * this platform -- confirm against its consumers.
  */
 int unmapped_buf_allowed = 1;
 
 /*
  * Copies "xfersize" bytes from the page array "ma", starting at byte offset
  * "a_offset", to the page array "mb", starting at byte offset "b_offset".
  *
  * The copy proceeds in chunks that never cross a page boundary on either
  * side.  Pages are accessed through the direct map; a page whose physical
  * address is not covered by the direct map causes a panic.
  */
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
 {
 	vm_paddr_t dst_pa, src_pa;
 	vm_offset_t dst_off, src_off;
 	char *dst, *src;
 	int chunk;
 
 	for (; xfersize > 0; xfersize -= chunk) {
 		/* Page and in-page offset on each side. */
 		src_off = a_offset & PAGE_MASK;
 		src_pa = ma[a_offset >> PAGE_SHIFT]->phys_addr;
 		dst_off = b_offset & PAGE_MASK;
 		dst_pa = mb[b_offset >> PAGE_SHIFT]->phys_addr;
 
 		/* Largest piece that stays within both current pages. */
 		chunk = min(xfersize, PAGE_SIZE - src_off);
 		chunk = min(chunk, PAGE_SIZE - dst_off);
 
 		if (__predict_false(!PHYS_IN_DMAP(src_pa)))
 			panic("!DMAP a %lx", src_pa);
 		src = (char *)PHYS_TO_DMAP(src_pa) + src_off;
 
 		if (__predict_false(!PHYS_IN_DMAP(dst_pa)))
 			panic("!DMAP b %lx", dst_pa);
 		dst = (char *)PHYS_TO_DMAP(dst_pa) + dst_off;
 
 		bcopy(src, dst, chunk);
 		a_offset += chunk;
 		b_offset += chunk;
 	}
 }
 
 /*
  * Returns a kernel virtual address through which page "m" can be accessed.
  * No mapping is created: the address is the page's direct map (DMAP)
  * address.
  */
 vm_offset_t
 pmap_quick_enter_page(vm_page_t m)
 {
 	vm_paddr_t pa;
 
 	pa = VM_PAGE_TO_PHYS(m);
 	return (PHYS_TO_DMAP(pa));
 }
 
 /*
  * Counterpart of pmap_quick_enter_page().  There is nothing to undo here,
  * because pmap_quick_enter_page() hands out a direct map address instead of
  * creating a mapping; "addr" is unused.
  */
 void
 pmap_quick_remove_page(vm_offset_t addr)
 {
 }
 
 /*
  * Reports whether "pmap" owns one of the first 16 PV entries linked to
  * page "m".  The page's own PV list is searched first; for non-fictitious
  * pages the search then continues on the PV list of the enclosing
  * superpage, with both lists sharing the same budget of 16 entries.
  *
  * The limit may be raised or lowered in the future; for proper page aging
  * it is only necessary that TRUE be returned for a small subset of pmaps.
  *
  * The page must be managed.  The page's PV list lock is read-held for the
  * duration of the search.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	const int limit = 16;
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pv_entry_t pv;
 	boolean_t found;
 	int visited;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	found = FALSE;
 	visited = 0;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 
 	/* 4KB mappings of the page. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			goto done;
 		}
 		if (++visited >= limit)
 			break;
 	}
 
 	/* Superpage mappings, if any of the search budget is left. */
 	if (visited < limit && (m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			if (PV_PMAP(pv) == pmap) {
 				found = TRUE;
 				goto done;
 			}
 			if (++visited >= limit)
 				break;
 		}
 	}
 done:
 	rw_runlock(lock);
 	return (found);
 }
 
 /*
  *	pmap_page_wired_mappings:
  *
  *	Return the number of managed mappings to the given physical page
  *	that are wired.
  *
  *	Unmanaged pages always yield 0.  Both the page's own PV list and,
  *	for non-fictitious pages, the PV list of the enclosing superpage are
  *	examined, and a mapping counts when its entry has ATTR_SW_WIRED set.
  */
 int
 pmap_page_wired_mappings(vm_page_t m)
 {
 	struct rwlock *lock;
 	struct md_page *pvh;
 	pmap_t pmap;
 	pt_entry_t *pte;
 	pv_entry_t pv;
 	int count, lvl, md_gen, pvh_gen;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (0);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	count = 0;
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock cannot be taken without blocking, drop the
 		 * PV list lock, block on the pmap lock, and retake the PV
 		 * list lock.  A changed generation count means the list was
 		 * modified in the meantime, so start over.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
 		if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0)
 			count++;
 		PMAP_UNLOCK(pmap);
 	}
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		/* Same procedure for the superpage PV list. */
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			pmap = PV_PMAP(pv);
 			if (!PMAP_TRYLOCK(pmap)) {
 				/* Either list may have changed; check both. */
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			pte = pmap_pte(pmap, pv->pv_va, &lvl);
 			if (pte != NULL &&
 			    (pmap_load(pte) & ATTR_SW_WIRED) != 0)
 				count++;
 			PMAP_UNLOCK(pmap);
 		}
 	}
 	rw_runlock(lock);
 	return (count);
 }
 
 /*
  * Destroy all managed, non-wired mappings in the given user-space
  * pmap.  This pmap cannot be active on any processor besides the
  * caller.
  *
  * This function cannot be applied to the kernel pmap.  Moreover, it
  * is not intended for general use.  It is only to be used during
  * process termination.  Consequently, it can be implemented in ways
  * that make it faster than pmap_remove().  First, it can more quickly
  * destroy mappings by iterating over the pmap's collection of PV
  * entries, rather than searching the page table.  Second, it doesn't
  * have to test and clear the page table entries atomically, because
  * no processor is currently accessing the user address space.  In
  * particular, a page table entry's dirty bit won't change state once
  * this function starts.
  *
  * Wired mappings are left in place, and a PV chunk that still holds a
  * wired mapping is not freed.  Page table pages released along the way
  * are collected on a local list and freed only after the TLB has been
  * invalidated and the locks have been dropped.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	pd_entry_t *pde;
 	pt_entry_t *pte, tpte;
 	struct spglist free;
 	vm_page_t m, ml3, mt;
 	pv_entry_t pv;
 	struct md_page *pvh;
 	struct pv_chunk *pc, *npc;
 	struct rwlock *lock;
 	int64_t bit;
 	uint64_t inuse, bitmask;
 	int allfree, field, freed, idx, lvl;
 	vm_paddr_t pa;
 
 	lock = NULL;
 
 	SLIST_INIT(&free);
 	PMAP_LOCK(pmap);
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		allfree = 1;
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			/* A clear bit in pc_map marks an allocated PV entry. */
 			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = ffsl(inuse) - 1;
 				bitmask = 1UL << bit;
 				idx = field * 64 + bit;
 				pv = &pc->pc_pventry[idx];
 				inuse &= ~bitmask;
 
 				pde = pmap_pde(pmap, pv->pv_va, &lvl);
 				KASSERT(pde != NULL,
 				    ("Attempting to remove an unmapped page"));
 
 				/*
 				 * lvl 1: the PV entry describes a 2MB block.
 				 * lvl 2: the PV entry describes a 4KB page.
 				 */
 				switch(lvl) {
 				case 1:
 					pte = pmap_l1_to_l2(pde, pv->pv_va);
 					tpte = pmap_load(pte); 
 					KASSERT((tpte & ATTR_DESCR_MASK) ==
 					    L2_BLOCK,
 					    ("Attempting to remove an invalid "
 					    "block: %lx", tpte));
 					/*
 					 * NOTE(review): this second load looks
 					 * redundant; tpte was loaded just above.
 					 */
 					tpte = pmap_load(pte);
 					break;
 				case 2:
 					pte = pmap_l2_to_l3(pde, pv->pv_va);
 					tpte = pmap_load(pte);
 					KASSERT((tpte & ATTR_DESCR_MASK) ==
 					    L3_PAGE,
 					    ("Attempting to remove an invalid "
 					     "page: %lx", tpte));
 					break;
 				default:
 					panic(
 					    "Invalid page directory level: %d",
 					    lvl);
 				}
 
 				/*
 				 * We cannot remove wired pages from a process'
 				 * mapping at this time
 				 */
 				if (tpte & ATTR_SW_WIRED) {
 					allfree = 0;
 					continue;
 				}
 
 				pa = tpte & ~ATTR_MASK;
 
 				m = PHYS_TO_VM_PAGE(pa);
 				KASSERT(m->phys_addr == pa,
 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 				    m, (uintmax_t)m->phys_addr,
 				    (uintmax_t)tpte));
 
 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 				    m < &vm_page_array[vm_page_array_size],
 				    ("pmap_remove_pages: bad pte %#jx",
 				    (uintmax_t)tpte));
 
 				/*
 				 * Because this pmap is not active on other
 				 * processors, the dirty bit cannot have
 				 * changed state since we last loaded pte.
 				 */
 				pmap_clear(pte);
 
 				/*
 				 * Update the vm_page_t clean/reference bits.
 				 */
 				if (pmap_pte_dirty(tpte)) {
 					switch (lvl) {
 					case 1:
 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
 							vm_page_dirty(mt);
 						break;
 					case 2:
 						vm_page_dirty(m);
 						break;
 					}
 				}
 
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 
 				/* Mark free */
 				pc->pc_map[field] |= bitmask;
 				switch (lvl) {
 				case 1:
 					pmap_resident_count_dec(pmap,
 					    L2_SIZE / PAGE_SIZE);
 					pvh = pa_to_pvh(tpte & ~ATTR_MASK);
 					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
 					pvh->pv_gen++;
 					/*
 					 * With no superpage mapping left, clear
 					 * PGA_WRITEABLE on each constituent page
 					 * that has no 4KB mapping either.
 					 */
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
 							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
 							    TAILQ_EMPTY(&mt->md.pv_list))
 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
 					}
 					/*
 					 * Release the L3 page table page that
 					 * was saved for this superpage, if any.
 					 */
 					ml3 = pmap_remove_pt_page(pmap,
 					    pv->pv_va);
 					if (ml3 != NULL) {
 						KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
 						    ("pmap_remove_pages: l3 page not promoted"));
 						pmap_resident_count_dec(pmap,1);
 						KASSERT(ml3->wire_count == NL3PG,
 						    ("pmap_remove_pages: l3 page wire count error"));
 						ml3->wire_count = 0;
 						pmap_add_delayed_free_list(ml3,
 						    &free, FALSE);
 					}
 					break;
 				case 2:
 					pmap_resident_count_dec(pmap, 1);
 					TAILQ_REMOVE(&m->md.pv_list, pv,
 					    pv_next);
 					m->md.pv_gen++;
 					/*
 					 * Clear PGA_WRITEABLE once neither a 4KB
 					 * nor a superpage mapping remains.
 					 */
 					if ((m->aflags & PGA_WRITEABLE) != 0 &&
 					    TAILQ_EMPTY(&m->md.pv_list) &&
 					    (m->flags & PG_FICTITIOUS) == 0) {
 						pvh = pa_to_pvh(
 						    VM_PAGE_TO_PHYS(m));
 						if (TAILQ_EMPTY(&pvh->pv_list))
 							vm_page_aflag_clear(m,
 							    PGA_WRITEABLE);
 					}
 					break;
 				}
 				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
 				    &free);
 				freed++;
 			}
 		}
 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		/* Free the chunk unless a wired mapping still uses it. */
 		if (allfree) {
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			free_pv_chunk(pc);
 		}
 	}
 	pmap_invalidate_all(pmap);
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  * This is used to check if a page has been accessed or modified. As we
  * don't have a bit to see if it has been modified we have to assume it
  * has been if the page is read/write.
  *
  * Returns TRUE as soon as one mapping of "m" satisfies every requested
  * test, and FALSE if none does.  With "modified" set, the entry must be
  * read/write; with "accessed" set, the entry must be valid at its level
  * and have ATTR_AF set.  Both the page's own (L3) PV list and, for
  * non-fictitious pages, the superpage (L2) PV list are examined.
  */
 static boolean_t
 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
 {
 	struct rwlock *lock;
 	pv_entry_t pv;
 	struct md_page *pvh;
 	pt_entry_t *pte, mask, value;
 	pmap_t pmap;
 	int lvl, md_gen, pvh_gen;
 	boolean_t rv;
 
 	rv = FALSE;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock cannot be taken without blocking, drop the
 		 * PV list lock, block on the pmap lock, and retake the PV
 		 * list lock.  A changed generation count means the list was
 		 * modified in the meantime, so start over.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
 		KASSERT(lvl == 3,
 		    ("pmap_page_test_mappings: Invalid level %d", lvl));
 		/* Build the mask/value pair the entry has to match. */
 		mask = 0;
 		value = 0;
 		if (modified) {
 			mask |= ATTR_AP_RW_BIT;
 			value |= ATTR_AP(ATTR_AP_RW);
 		}
 		if (accessed) {
 			mask |= ATTR_AF | ATTR_DESCR_MASK;
 			value |= ATTR_AF | L3_PAGE;
 		}
 		rv = (pmap_load(pte) & mask) == value;
 		PMAP_UNLOCK(pmap);
 		if (rv)
 			goto out;
 	}
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		/* Same test for superpage mappings, found at level 2. */
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			pmap = PV_PMAP(pv);
 			if (!PMAP_TRYLOCK(pmap)) {
 				/* Either list may have changed; check both. */
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			pte = pmap_pte(pmap, pv->pv_va, &lvl);
 			KASSERT(lvl == 2,
 			    ("pmap_page_test_mappings: Invalid level %d", lvl));
 			mask = 0;
 			value = 0;
 			if (modified) {
 				mask |= ATTR_AP_RW_BIT;
 				value |= ATTR_AP(ATTR_AP_RW);
 			}
 			if (accessed) {
 				mask |= ATTR_AF | ATTR_DESCR_MASK;
 				value |= ATTR_AF | L2_BLOCK;
 			}
 			rv = (pmap_load(pte) & mask) == value;
 			PMAP_UNLOCK(pmap);
 			if (rv)
 				goto out;
 		}
 	}
 out:
 	rw_runlock(lock);
 	return (rv);
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Report whether the given physical page has been modified through
  *	any of its mappings.  The page must be managed and its object
  *	must be write-locked.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_modified: page %p is not managed", m));
 
 	/*
 	 * While the object is locked, PGA_WRITEABLE can only be set
 	 * concurrently if the page is exclusive busied.  So the mappings
 	 * need to be inspected only when the page is exclusive busied or
 	 * PGA_WRITEABLE is already set; otherwise no PTE can be dirty.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (vm_page_xbusied(m) || (m->aflags & PGA_WRITEABLE) != 0)
 		return (pmap_page_test_mappings(m, FALSE, TRUE));
 	return (FALSE);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is eligible
  *	for prefault.
  *
  *	An address is eligible if and only if the L3 page table covering it
  *	is already allocated and the L3 entry for the address is still
  *	invalid.  An address that is already mapped (by a 4KB page or a 2MB
  *	block), or whose L3 page table does not exist, is not eligible.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	boolean_t rv;
 	int lvl;
 
 	rv = FALSE;
 	PMAP_LOCK(pmap);
 	/*
 	 * pmap_pde() reports level 2 only when the walk reached an L2 entry
 	 * that points to an L3 table; any other level means there is no L3
 	 * entry to examine (missing tables or a block mapping).
 	 */
 	pde = pmap_pde(pmap, addr, &lvl);
 	if (pde != NULL && lvl == 2) {
 		pte = pmap_l2_to_l3(pde, addr);
 		rv = pmap_load(pte) == 0;
 	}
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  *	pmap_is_referenced:
  *
  *	Return whether or not the specified physical page was referenced
  *	in any physical maps.
  */
 boolean_t
 pmap_is_referenced(vm_page_t m)
 {
 	boolean_t referenced;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 
 	/* Test the mappings for the accessed state only, not modified. */
 	referenced = pmap_page_test_mappings(m, TRUE, FALSE);
 	return (referenced);
 }
 
 /*
  * Clear the write and modified bits in each of the given page's mappings.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	struct md_page *pvh;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pv_entry_t next_pv, pv;
 	pt_entry_t oldpte, *pte;
 	vm_offset_t va;
 	int lvl, md_gen, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_write: page %p is not managed", m));
 
 	/*
 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 	 * set by another thread while the object is locked.  Thus,
 	 * if PGA_WRITEABLE is clear, no page table entries need updating.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	/* Fictitious pages use the shared dummy PV header. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 retry_pv_loop:
 	rw_wlock(lock);
 	/*
 	 * First pass: walk the PV entries on "pvh" and demote every mapping
 	 * that has ATTR_SW_DBM set, so that the second pass can update the
 	 * resulting entries individually.
 	 */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * The pmap lock could not be taken without blocking.
 			 * Drop the PV list lock, block on the pmap lock, and
 			 * retake the PV list lock.  If the list generation
 			 * changed in the meantime, start over.
 			 */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(lock);
 				goto retry_pv_loop;
 			}
 		}
 		va = pv->pv_va;
 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
 		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
 			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
 		/* The demotion must not have switched to a different PV lock. */
 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 		    ("inconsistent pv lock %p %p for page %p",
 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 		PMAP_UNLOCK(pmap);
 	}
 	/*
 	 * Second pass: walk the page's own PV list and write protect each
 	 * mapping that has ATTR_SW_DBM set.
 	 */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/* Same lock dance as above, checking both generations. */
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen ||
 			    md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(lock);
 				goto retry_pv_loop;
 			}
 		}
 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
 		oldpte = pmap_load(pte);
 retry:
 		if ((oldpte & ATTR_SW_DBM) != 0) {
 			/*
 			 * Atomically replace the entry with a copy that has
 			 * ATTR_AP_RW_BIT set and ATTR_SW_DBM cleared.  A
 			 * failed compare-and-set refreshes "oldpte", so the
 			 * ATTR_SW_DBM test is repeated on the new value.
 			 */
 			if (!atomic_fcmpset_long(pte, &oldpte,
 			    (oldpte | ATTR_AP_RW_BIT) & ~ATTR_SW_DBM))
 				goto retry;
 			/* The old entry was writeable: record the page as dirty. */
 			if ((oldpte & ATTR_AP_RW_BIT) ==
 			    ATTR_AP(ATTR_AP_RW))
 				vm_page_dirty(m);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits.  This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls
  *	to pmap_is_modified().  However, since this function stops after
  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
  *	dirty pages.  Those dirty pages will only be detected by a future call
  *	to pmap_is_modified().
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv, pvf;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pd_entry_t *pde, tpde;
 	pt_entry_t *pte, tpte;
 	vm_offset_t va;
 	vm_paddr_t pa;
 	int cleared, lvl, md_gen, not_cleared, pvh_gen;
 	struct spglist free;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	/*
 	 * NOTE(review): "free" is initialized here and handed to
 	 * vm_page_free_pages_toq() at the end, but nothing in this function
 	 * adds a page to it.
 	 */
 	SLIST_INIT(&free);
 	cleared = 0;
 	pa = VM_PAGE_TO_PHYS(m);
 	lock = PHYS_TO_PV_LIST_LOCK(pa);
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
 	rw_wlock(lock);
 retry:
 	/*
 	 * "cleared" survives a retry because those bits really were cleared;
 	 * "not_cleared" is recounted from scratch.
 	 */
 	not_cleared = 0;
 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 		goto small_mappings;
 	pv = pvf;
 	/*
 	 * Pass over the 2MB mappings on "pvh".  Each visited entry is rotated
 	 * to the tail, so the loop ends once the first entry seen ("pvf") is
 	 * at the head again.
 	 */
 	do {
 		if (pvf == NULL)
 			pvf = pv;
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * Block on the pmap lock with the PV list lock
 			 * dropped; restart if the list changed meanwhile.
 			 */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		/* NOTE(review): "va" is assigned here but not read afterwards. */
 		va = pv->pv_va;
 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
 		KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found"));
 		KASSERT(lvl == 1,
 		    ("pmap_ts_referenced: invalid pde level %d", lvl));
 		tpde = pmap_load(pde);
 		KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE,
 		    ("pmap_ts_referenced: found an invalid l1 table"));
 		pte = pmap_l1_to_l2(pde, pv->pv_va);
 		tpte = pmap_load(pte);
 		if (pmap_pte_dirty(tpte)) {
 			/*
 			 * Although "tpte" is mapping a 2MB page, because
 			 * this function is called at a 4KB page granularity,
 			 * we only update the 4KB page under test.
 			 */
 			vm_page_dirty(m);
 		}
 
 		if ((tpte & ATTR_AF) != 0) {
 			/*
 			 * Since this reference bit is shared by 512 4KB pages,
 			 * it should not be cleared every time it is tested.
 			 * Apply a simple "hash" function on the physical page
 			 * number, the virtual superpage number, and the pmap
 			 * address to select one 4KB page out of the 512 on
 			 * which testing the reference bit will result in
 			 * clearing that reference bit.  This function is
 			 * designed to avoid the selection of the same 4KB page
 			 * for every 2MB page mapping.
 			 *
 			 * On demotion, a mapping that hasn't been referenced
 			 * is simply destroyed.  To avoid the possibility of a
 			 * subsequent page fault on a demoted wired mapping,
 			 * always leave its reference bit set.  Moreover,
 			 * since the superpage is wired, the current state of
 			 * its reference bit won't affect page replacement.
 			 */
 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
 			    (tpte & ATTR_SW_WIRED) == 0) {
 				pmap_clear_bits(pte, ATTR_AF);
 				pmap_invalidate_page(pmap, pv->pv_va);
 				cleared++;
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 			pvh->pv_gen++;
 		}
 		/* Stop early once enough reference bits have been counted. */
 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
 			goto out;
 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 small_mappings:
 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 		goto out;
 	pv = pvf;
 	/*
 	 * Pass over the page's own PV list, using the same rotation scheme.
 	 * Here a set reference bit is cleared unless the mapping is wired.
 	 */
 	do {
 		if (pvf == NULL)
 			pvf = pv;
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
 		KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found"));
 		KASSERT(lvl == 2,
 		    ("pmap_ts_referenced: invalid pde level %d", lvl));
 		tpde = pmap_load(pde);
 		KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE,
 		    ("pmap_ts_referenced: found an invalid l2 table"));
 		pte = pmap_l2_to_l3(pde, pv->pv_va);
 		tpte = pmap_load(pte);
 		/* Opportunistically propagate the modified state to the page. */
 		if (pmap_pte_dirty(tpte))
 			vm_page_dirty(m);
 		if ((tpte & ATTR_AF) != 0) {
 			if ((tpte & ATTR_SW_WIRED) == 0) {
 				pmap_clear_bits(pte, ATTR_AF);
 				pmap_invalidate_page(pmap, pv->pv_va);
 				cleared++;
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 			m->md.pv_gen++;
 		}
 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
 	    not_cleared < PMAP_TS_REFERENCED_MAX);
 out:
 	rw_wunlock(lock);
 	vm_page_free_pages_toq(&free, true);
 	/* Wired mappings whose bit was left set still count as referenced. */
 	return (cleared + not_cleared);
 }
 
 /*
  *	Apply the given advice to the specified range of addresses within the
  *	given pmap.  Depending on the advice, clear the referenced and/or
  *	modified flags in each mapping and set the mapped page's dirty field.
  */
 void
 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 {
 	struct rwlock *lock;
 	vm_offset_t va, va_next;
 	vm_page_t m;
 	pd_entry_t *l0, *l1, *l2, oldl2;
 	pt_entry_t *l3, oldl3;
 
 	/* Only MADV_DONTNEED and MADV_FREE are acted upon. */
 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
 		return;
 
 	PMAP_LOCK(pmap);
 	/*
 	 * Walk the range one L2 region at a time.  "va_next" is the start of
 	 * the next region to visit; it is clamped to "eva" if the address
 	 * computation wraps.
 	 */
 	for (; sva < eva; sva = va_next) {
 		l0 = pmap_l0(pmap, sva);
 		if (pmap_load(l0) == 0) {
 			/* Nothing mapped under this L0 entry: skip it whole. */
 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 		l1 = pmap_l0_to_l1(l0, sva);
 		if (pmap_load(l1) == 0) {
 			/* Nothing mapped under this L1 entry: skip it whole. */
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 		l2 = pmap_l1_to_l2(l1, sva);
 		oldl2 = pmap_load(l2);
 		if (oldl2 == 0)
 			continue;
 		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
 			/* Unmanaged block mappings are left untouched. */
 			if ((oldl2 & ATTR_SW_MANAGED) == 0)
 				continue;
 			lock = NULL;
 			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
 				if (lock != NULL)
 					rw_wunlock(lock);
 
 				/*
 				 * The 2MB page mapping was destroyed.
 				 */
 				continue;
 			}
 
 			/*
 			 * Unless the page mappings are wired, remove the
 			 * mapping to a single page so that a subsequent
 			 * access may repromote.  Since the underlying page
 			 * table page is fully populated, this removal never
 			 * frees a page table page.
 			 */
 			if ((oldl2 & ATTR_SW_WIRED) == 0) {
 				l3 = pmap_l2_to_l3(l2, sva);
 				KASSERT(pmap_load(l3) != 0,
 				    ("pmap_advise: invalid PTE"));
 				pmap_remove_l3(pmap, l3, sva, pmap_load(l2),
 				    NULL, &lock);
 			}
 			if (lock != NULL)
 				rw_wunlock(lock);
 		}
 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
 		    ("pmap_advise: invalid L2 entry after demotion"));
 		if (va_next > eva)
 			va_next = eva;
 		/*
 		 * "va" is the start of the current run of modified entries
 		 * awaiting TLB invalidation.  va == va_next means that no run
 		 * is pending.
 		 */
 		va = va_next;
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
 			oldl3 = pmap_load(l3);
 			/* Skip entries that are not valid, managed L3 pages. */
 			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
 			    (ATTR_SW_MANAGED | L3_PAGE))
 				goto maybe_invlrng;
 			else if (pmap_pte_dirty(oldl3)) {
 				if (advice == MADV_DONTNEED) {
 					/*
 					 * Future calls to pmap_is_modified()
 					 * can be avoided by making the page
 					 * dirty now.
 					 */
 					m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK);
 					vm_page_dirty(m);
 				}
 				/*
 				 * Clear the access flag and set the read-only
 				 * permission in one atomic update; retry
 				 * until the compare-and-set succeeds.
 				 */
 				while (!atomic_fcmpset_long(l3, &oldl3,
 				    (oldl3 & ~ATTR_AF) | ATTR_AP(ATTR_AP_RO)))
 					cpu_spinwait();
 			} else if ((oldl3 & ATTR_AF) != 0)
 				pmap_clear_bits(l3, ATTR_AF);
 			else
 				goto maybe_invlrng;
 			/* The entry was changed: open a run if none is pending. */
 			if (va == va_next)
 				va = sva;
 			continue;
 maybe_invlrng:
 			/* An unchanged entry ends the pending run: flush it. */
 			if (va != va_next) {
 				pmap_invalidate_range(pmap, va, sva);
 				va = va_next;
 			}
 		}
 		/* Flush a run that reached the end of this L2 region. */
 		if (va != va_next)
 			pmap_invalidate_range(pmap, va, sva);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Clear the modify bits on the specified physical page.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pmap_t pmap;
 	pv_entry_t next_pv, pv;
 	pd_entry_t *l2, oldl2;
 	pt_entry_t *l3, oldl3;
 	vm_offset_t va;
 	int md_gen, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	KASSERT(!vm_page_xbusied(m),
 	    ("pmap_clear_modify: page %p is exclusive busied", m));
 
 	/*
 	 * If the page is not PGA_WRITEABLE, then no PTEs can have ATTR_SW_DBM
 	 * set.  If the object containing the page is locked and the page is not
 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	/* Fictitious pages use the shared dummy PV header. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_wlock(lock);
 restart:
 	/*
 	 * First pass: demote each mapping on "pvh" that has ATTR_SW_DBM set
 	 * and, unless it is wired, write protect only the 4KB entry for "m".
 	 */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * Block on the pmap lock with the PV list lock
 			 * dropped; restart if the list changed meanwhile.
 			 */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		va = pv->pv_va;
 		l2 = pmap_l2(pmap, va);
 		oldl2 = pmap_load(l2);
 		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
 		if ((oldl2 & ATTR_SW_DBM) != 0 &&
 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
 		    (oldl2 & ATTR_SW_WIRED) == 0) {
 			/*
 			 * Write protect the mapping to a single page so that
 			 * a subsequent write access may repromote.
 			 */
 			/* Advance "va" by the offset of "m" within the 2MB frame. */
 			va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK);
 			l3 = pmap_l2_to_l3(l2, va);
 			oldl3 = pmap_load(l3);
 			while (!atomic_fcmpset_long(l3, &oldl3,
 			    (oldl3 & ~ATTR_SW_DBM) | ATTR_AP(ATTR_AP_RO)))
 				cpu_spinwait();
 			vm_page_dirty(m);
 			pmap_invalidate_page(pmap, va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	/*
 	 * Second pass: for each mapping on the page's own PV list that is
 	 * valid, has ATTR_SW_DBM set and has ATTR_AP_RW_BIT clear, set the
 	 * read-only permission and invalidate the TLB entry.
 	 */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			md_gen = m->md.pv_gen;
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		l2 = pmap_l2(pmap, pv->pv_va);
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
 		oldl3 = pmap_load(l3);
 		if (pmap_l3_valid(oldl3) &&
 		    (oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
 			pmap_set_bits(l3, ATTR_AP(ATTR_AP_RO));
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 }
 
 /*
  * Map the physical range starting at "pa" and spanning "size" bytes into
  * kernel virtual address space, and return a pointer to the virtual address
  * that corresponds to "pa".
  *
  * Before the VM system is initialized, the range is mapped with whole L2
  * blocks taken from the reserved region at preinit_map_va, and the mapping
  * is recorded in pmap_preinit_mapping[].  In that mode NULL is returned if
  * "size" is zero or preinit_map_va is unset, and the function panics if no
  * sufficiently long run of free slots exists.  Afterwards, KVA is obtained
  * from kva_alloc() and the pages are entered with pmap_kenter(); a failed
  * KVA allocation panics.
  */
 void *
 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_offset_t va, offset;
 	pd_entry_t *pde;
 	pt_entry_t *l2;
 	int i, lvl, l2_blocks, free_l2_count, start_idx;
 
 	if (!vm_initialized) {
 		/*
 		 * No L3 ptables so map entire L2 blocks where start VA is:
 		 * 	preinit_map_va + start_idx * L2_SIZE
 		 * There may be duplicate mappings (multiple VA -> same PA) but
 		 * ARM64 dcache is always PIPT so that's acceptable.
 		 */
 		 if (size == 0)
 			 return (NULL);
 
 		 /* Calculate how many L2 blocks are needed for the mapping */
 		l2_blocks = (roundup2(pa + size, L2_SIZE) -
 		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
 
 		/* Offset of "pa" within its first L2 block. */
 		offset = pa & L2_OFFSET;
 
 		if (preinit_map_va == 0)
 			return (NULL);
 
 		/* Map 2MiB L2 blocks from reserved VA space */
 
 		free_l2_count = 0;
 		start_idx = -1;
 		/* Find enough free contiguous VA space */
 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 			ppim = pmap_preinit_mapping + i;
 			if (free_l2_count > 0 && ppim->pa != 0) {
 				/* Not enough space here */
 				free_l2_count = 0;
 				start_idx = -1;
 				continue;
 			}
 
 			/* A slot whose "pa" is zero is treated as free. */
 			if (ppim->pa == 0) {
 				/* Free L2 block */
 				if (start_idx == -1)
 					start_idx = i;
 				free_l2_count++;
 				if (free_l2_count == l2_blocks)
 					break;
 			}
 		}
 		if (free_l2_count != l2_blocks)
 			panic("%s: too many preinit mappings", __func__);
 
 		va = preinit_map_va + (start_idx * L2_SIZE);
 		/*
 		 * Every slot of the run records the same pa, va and size, so
 		 * pmap_unmapbios() can find all of them by matching on "va".
 		 */
 		for (i = start_idx; i < start_idx + l2_blocks; i++) {
 			/* Mark entries as allocated */
 			ppim = pmap_preinit_mapping + i;
 			ppim->pa = pa;
 			ppim->va = va + offset;
 			ppim->size = size;
 		}
 
 		/* Map L2 blocks */
 		pa = rounddown2(pa, L2_SIZE);
 		for (i = 0; i < l2_blocks; i++) {
 			pde = pmap_pde(kernel_pmap, va, &lvl);
 			KASSERT(pde != NULL,
 			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
 			    va));
 			KASSERT(lvl == 1,
 			    ("pmap_mapbios: Invalid level %d", lvl));
 
 			/* Insert L2_BLOCK */
 			l2 = pmap_l1_to_l2(pde, va);
 			pmap_load_store(l2,
 			    pa | ATTR_DEFAULT | ATTR_XN |
 			    ATTR_IDX(CACHED_MEMORY) | L2_BLOCK);
 
 			va += L2_SIZE;
 			pa += L2_SIZE;
 		}
 		pmap_invalidate_all(kernel_pmap);
 
 		/* Rewind "va" to the start of the run for the return value. */
 		va = preinit_map_va + (start_idx * L2_SIZE);
 
 	} else {
 		/* kva_alloc may be used to map the pages */
 		offset = pa & PAGE_MASK;
 		size = round_page(offset + size);
 
 		va = kva_alloc(size);
 		if (va == 0)
 			panic("%s: Couldn't allocate KVA", __func__);
 
 		/* The lookup is only used to assert the level of the entry. */
 		pde = pmap_pde(kernel_pmap, va, &lvl);
 		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
 
 		/* L3 table is linked */
 		va = trunc_page(va);
 		pa = trunc_page(pa);
 		pmap_kenter(va, size, pa, CACHED_MEMORY);
 	}
 
 	return ((void *)(va + offset));
 }
 
 /*
  * Undo a mapping established by pmap_mapbios().  "va" and "size" are matched
  * against the pmap_preinit_mapping[] records first; if any record matches,
  * its L2 block entries are cleared and the function returns.  Otherwise, once
  * the VM system is initialized, the pages are removed with pmap_kremove() and
  * the KVA is released with kva_free().
  */
 void
 pmap_unmapbios(vm_offset_t va, vm_size_t size)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_offset_t offset, tmpsize, va_trunc;
 	pd_entry_t *pde;
 	pt_entry_t *l2;
 	int i, lvl, l2_blocks, block;
 	bool preinit_map;
 
 	/* Number of L2 blocks spanned by [va, va + size). */
 	l2_blocks =
 	   (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
 	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
 
 	/* Remove preinit mapping */
 	preinit_map = false;
 	block = 0;
 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 		ppim = pmap_preinit_mapping + i;
 		if (ppim->va == va) {
 			KASSERT(ppim->size == size,
 			    ("pmap_unmapbios: size mismatch"));
 			/* Release the slot. */
 			ppim->va = 0;
 			ppim->pa = 0;
 			ppim->size = 0;
 			preinit_map = true;
 			/* The "block"-th matching slot maps the "block"-th L2 block. */
 			offset = block * L2_SIZE;
 			va_trunc = rounddown2(va, L2_SIZE) + offset;
 
 			/* Remove L2_BLOCK */
 			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
 			KASSERT(pde != NULL,
 			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
 			    va_trunc));
 			l2 = pmap_l1_to_l2(pde, va_trunc);
 			pmap_clear(l2);
 
 			if (block == (l2_blocks - 1))
 				break;
 			block++;
 		}
 	}
 	if (preinit_map) {
 		/* One invalidation covers all of the cleared entries. */
 		pmap_invalidate_all(kernel_pmap);
 		return;
 	}
 
 	/* Unmap the pages reserved with kva_alloc. */
 	if (vm_initialized) {
 		offset = va & PAGE_MASK;
 		size = round_page(offset + size);
 		va = trunc_page(va);
 
 		pde = pmap_pde(kernel_pmap, va, &lvl);
 		KASSERT(pde != NULL,
 		    ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va));
 		KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl));
 
 		/* Unmap and invalidate the pages */
                 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
 			pmap_kremove(va + tmpsize);
 
 		kva_free(va, size);
 	}
 }
 
 /*
  * Sets the memory attribute for the specified page.
  */
 void
 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 {
 
 	/* Record the attribute on the page itself. */
 	m->md.pv_memattr = ma;
 
 	/* Fictitious pages have no direct mapping to update. */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 
 	/*
 	 * "m" is a normal page, so bring its direct mapping in line with the
 	 * new attribute.  This update can be relied upon to perform any cache
 	 * operations that are required for data coherence.  A failure here
 	 * is fatal.
 	 */
 	if (pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
 	    m->md.pv_memattr) != 0)
 		panic("memory attribute change on the direct map failed");
 }
 
 /*
  * Changes the specified virtual address range's memory type to that given by
  * the parameter "mode".  The specified virtual address range must be
  * completely contained within either the direct map or the kernel map.  If
  * the virtual address range is contained within the kernel map, then the
  * memory type for each of the corresponding ranges of the direct map is also
  * changed.  (The corresponding ranges of the direct map are those ranges that
  * map the same physical pages as the specified virtual address range.)  These
  * changes to the direct map are necessary because Intel describes the
  * behavior of their processors as "undefined" if two or more mappings to the
  * same physical page have different memory types.
  *
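  * Returns zero if the change completed successfully, and either EINVAL or
  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
  * of the virtual address range was not mapped, and ENOMEM is returned if
  * there was insufficient memory available to complete the change.  In the
  * latter case, the memory type may have been changed on some part of the
  * virtual address range or the direct map.
  *
  * NOTE: As implemented, pmap_change_attr_locked() returns EINVAL for any
  * range whose base address lies outside the direct map, and also returns
  * EINVAL (rather than ENOMEM) when demoting a block mapping fails.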
  */
 static int
 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 {
 	int rv;
 
 	/* Run the locked variant with the kernel pmap lock held. */
 	PMAP_LOCK(kernel_pmap);
 	rv = pmap_change_attr_locked(va, size, mode);
 	PMAP_UNLOCK(kernel_pmap);
 
 	return (rv);
 }
 
 /*
  * Worker for pmap_change_attr(); the kernel pmap lock must be held.  Sets the
  * memory attribute index of every page in the page-aligned range covering
  * [va, va + size) to "mode", demoting block mappings as needed.  Returns 0 on
  * success and EINVAL if the range does not start in the direct map, if an
  * address in it is unmapped, or if a demotion fails.
  */
 static int
 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
 {
 	vm_offset_t base, offset, tmpva;
 	pt_entry_t l3, *pte, *newpte;
 	int lvl;
 
 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 	/* Expand the range outward to page boundaries. */
 	base = trunc_page(va);
 	offset = va & PAGE_MASK;
 	size = round_page(offset + size);
 
 	if (!VIRT_IN_DMAP(base))
 		return (EINVAL);
 
 	for (tmpva = base; tmpva < base + size; ) {
 		pte = pmap_pte(kernel_pmap, tmpva, &lvl);
 		if (pte == NULL)
 			return (EINVAL);
 
 		if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) {
 			/*
 			 * We already have the correct attribute,
 			 * ignore this entry.
 			 */
 			/* Advance to the end of whatever this entry maps. */
 			switch (lvl) {
 			default:
 				panic("Invalid DMAP table level: %d\n", lvl);
 			case 1:
 				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
 				break;
 			case 2:
 				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
 				break;
 			case 3:
 				tmpva += PAGE_SIZE;
 				break;
 			}
 		} else {
 			/*
 			 * Split the entry to an level 3 table, then
 			 * set the new attribute.
 			 */
 			/*
 			 * The cases below intentionally fall through: an L1
 			 * block is demoted to L2, then the L2 block to L3,
 			 * and finally the L3 entry is updated.
 			 */
 			switch (lvl) {
 			default:
 				panic("Invalid DMAP table level: %d\n", lvl);
 			case 1:
 				newpte = pmap_demote_l1(kernel_pmap, pte,
 				    tmpva & ~L1_OFFSET);
 				if (newpte == NULL)
 					return (EINVAL);
 				pte = pmap_l1_to_l2(pte, tmpva);
 				/* FALLTHROUGH */
 			case 2:
 				newpte = pmap_demote_l2(kernel_pmap, pte,
 				    tmpva);
 				if (newpte == NULL)
 					return (EINVAL);
 				pte = pmap_l2_to_l3(pte, tmpva);
 				/* FALLTHROUGH */
 			case 3:
 				/* Update the entry */
 				l3 = pmap_load(pte);
 				l3 &= ~ATTR_IDX_MASK;
 				l3 |= ATTR_IDX(mode);
 				/* Device memory is additionally marked ATTR_XN. */
 				if (mode == DEVICE_MEMORY)
 					l3 |= ATTR_XN;
 
 				pmap_update_entry(kernel_pmap, pte, l3, tmpva,
 				    PAGE_SIZE);
 
 				/*
 				 * If moving to a non-cacheable entry flush
 				 * the cache.
 				 */
 				if (mode == VM_MEMATTR_UNCACHEABLE)
 					cpu_dcache_wbinv_range(tmpva, L3_SIZE);
 
 				break;
 			}
 			/* Only one page was changed, so advance by one page. */
 			tmpva += PAGE_SIZE;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Create an L2 table to map all addresses within an L1 mapping.
  */
 /*
  * Replace the L1 block entry "l1", which maps the L1-aligned address "va",
  * with a newly allocated L2 table whose entries map the same physical range
  * with the same attributes.  The pmap lock must be held.
  *
  * Returns a direct-map pointer to the new L2 table, or NULL if the temporary
  * KVA or the page table page could not be allocated.  On failure the L1 entry
  * is left unchanged.
  */
 static pt_entry_t *
 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
 {
 	pt_entry_t *l2, newl2, oldl1;
 	vm_offset_t tmpl1;
 	vm_paddr_t l2phys, phys;
 	vm_page_t ml2;
 	int i;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldl1 = pmap_load(l1);
 	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
 	    ("pmap_demote_l1: Demoting a non-block entry"));
 	KASSERT((va & L1_OFFSET) == 0,
 	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
 	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
 	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
 
 	/*
 	 * If the L1 entry itself is addressed through the range being
 	 * demoted, reserve a temporary KVA page through which it can be
 	 * reached while the entry is updated.
 	 */
 	tmpl1 = 0;
 	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
 		tmpl1 = kva_alloc(PAGE_SIZE);
 		if (tmpl1 == 0)
 			return (NULL);
 	}
 
 	if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		/*
 		 * Release the temporary KVA reserved above.  Nothing has been
 		 * mapped there yet, so only the reservation is undone.
 		 */
 		if (tmpl1 != 0)
 			kva_free(tmpl1, PAGE_SIZE);
 		return (NULL);
 	}
 
 	l2phys = VM_PAGE_TO_PHYS(ml2);
 	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
 
 	/* Address the range points at */
 	phys = oldl1 & ~ATTR_MASK;
 	/* The attributes from the old l1 entry to be copied */
 	newl2 = oldl1 & ATTR_MASK;
 
 	/* Create the new entries */
 	for (i = 0; i < Ln_ENTRIES; i++) {
 		l2[i] = newl2 | phys;
 		phys += L2_SIZE;
 	}
 	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
 	    ("Invalid l2 page (%lx != %lx)", l2[0],
 	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
 
 	/* Switch to the temporary alias of the L1 entry, if one is needed. */
 	if (tmpl1 != 0) {
 		pmap_kenter(tmpl1, PAGE_SIZE,
 		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY);
 		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
 	}
 
 	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
 
 	if (tmpl1 != 0) {
 		pmap_kremove(tmpl1);
 		kva_free(tmpl1, PAGE_SIZE);
 	}
 
 	return (l2);
 }
 
 /*
  * Initialize all Ln_ENTRIES slots of the L3 table at "firstl3".  The first
  * slot receives "newl3" and each later slot receives the previous value plus
  * L3_SIZE.
  */
 static void
 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
 {
 	int idx;
 
 	for (idx = 0; idx < Ln_ENTRIES; idx++) {
 		firstl3[idx] = newl3;
 		newl3 += L3_SIZE;
 	}
 }
 
 /*
  * Give up on demoting the L2 mapping at "va": remove the mapping with
  * pmap_remove_l2() and free whatever pages that removal queued.
  */
 static void
 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
     struct rwlock **lockp)
 {
 	struct spglist pages;
 	pd_entry_t l1e;
 
 	SLIST_INIT(&pages);
 	/* pmap_remove_l2() takes the current value of the parent L1 entry. */
 	l1e = pmap_load(pmap_l1(pmap, va));
 	(void)pmap_remove_l2(pmap, l2, va, l1e, &pages, lockp);
 	vm_page_free_pages_toq(&pages, true);
 }
 
 /*
  * Create an L3 table to map all addresses within an L2 mapping.
  */
 static pt_entry_t *
 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
     struct rwlock **lockp)
 {
 	pt_entry_t *l3, newl3, oldl2;
 	vm_offset_t tmpl2;
 	vm_paddr_t l3phys;
 	vm_page_t ml3;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	/* "l3" remains NULL on every failure path. */
 	l3 = NULL;
 	oldl2 = pmap_load(l2);
 	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
 	    ("pmap_demote_l2: Demoting a non-block entry"));
 	va &= ~L2_OFFSET;
 
 	/*
 	 * If the L2 entry itself is addressed through the range being
 	 * demoted, reserve a temporary KVA page through which it can be
 	 * reached while the entry is updated.
 	 */
 	tmpl2 = 0;
 	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
 		tmpl2 = kva_alloc(PAGE_SIZE);
 		if (tmpl2 == 0)
 			return (NULL);
 	}
 
 	/*
 	 * Invalidate the 2MB page mapping and return "failure" if the
 	 * mapping was never accessed.
 	 */
 	if ((oldl2 & ATTR_AF) == 0) {
 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
 		    ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
 		pmap_demote_l2_abort(pmap, va, l2, lockp);
 		CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
 		    va, pmap);
 		goto fail;
 	}
 
 	/* Prefer the page table page saved for this address, if any. */
 	if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
 		    ("pmap_demote_l2: page table page for a wired mapping"
 		    " is missing"));
 
 		/*
 		 * If the page table page is missing and the mapping
 		 * is for a kernel address, the mapping must belong to
 		 * the direct map.  Page table pages are preallocated
 		 * for every other part of the kernel address space,
 		 * so the direct map region is the only part of the
 		 * kernel address space that must be handled here.
 		 */
 		KASSERT(va < VM_MAXUSER_ADDRESS || VIRT_IN_DMAP(va),
 		    ("pmap_demote_l2: No saved mpte for va %#lx", va));
 
 		/*
 		 * If the 2MB page mapping belongs to the direct map
 		 * region of the kernel's address space, then the page
 		 * allocation request specifies the highest possible
 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
 		 * priority is normal.
 		 */
 		ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va),
 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
 
 		/*
 		 * If the allocation of the new page table page fails,
 		 * invalidate the 2MB page mapping and return "failure".
 		 */
 		if (ml3 == NULL) {
 			pmap_demote_l2_abort(pmap, va, l2, lockp);
 			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			goto fail;
 		}
 
 		/* For user addresses, account for the new page table page. */
 		if (va < VM_MAXUSER_ADDRESS) {
 			ml3->wire_count = NL3PG;
 			pmap_resident_count_inc(pmap, 1);
 		}
 	}
 	l3phys = VM_PAGE_TO_PHYS(ml3);
 	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
 	/* The first L3 entry: the old L2 entry with its descriptor type swapped. */
 	newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
 	KASSERT((oldl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) !=
 	    (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM),
 	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));
 
 	/*
 	 * If the page table page is not leftover from an earlier promotion,
 	 * or the mapping attributes have changed, (re)initialize the L3 table.
 	 */
 	if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK))
 		pmap_fill_l3(l3, newl3);
 
 	/*
 	 * Map the temporary page so we don't lose access to the l2 table.
 	 */
 	if (tmpl2 != 0) {
 		pmap_kenter(tmpl2, PAGE_SIZE,
 		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY);
 		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
 	}
 
 	/*
 	 * The spare PV entries must be reserved prior to demoting the
 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
 	 * of the L2 and the PV lists will be inconsistent, which can result
 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
 	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
 	 * PV entry for the 2MB page mapping that is being demoted.
 	 */
 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
 
 	/*
 	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
 	 * the 2MB page mapping.
 	 */
 	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
 
 	/*
 	 * Demote the PV entry.
 	 */
 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
 		pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);
 
 	atomic_add_long(&pmap_l2_demotions, 1);
 	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
 	    " in pmap %p %lx", va, pmap, l3[0]);
 
 	/* Reached on success as well: release the temporary KVA, if any. */
 fail:
 	if (tmpl2 != 0) {
 		pmap_kremove(tmpl2);
 		kva_free(tmpl2, PAGE_SIZE);
 	}
 
 	return (l3);
 
 }
 
 /*
  * Convenience wrapper around pmap_demote_l2_locked() for callers that do not
  * already hold a PV list lock.  Any lock the demotion acquired is released
  * before returning.
  */
 static pt_entry_t *
 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
 {
 	struct rwlock *pvlock;
 	pt_entry_t *table;
 
 	pvlock = NULL;
 	table = pmap_demote_l2_locked(pmap, l2, va, &pvlock);
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 
 	return (table);
 }
 
 /*
  * perform the pmap work for mincore
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 {
 	pt_entry_t *pte, tpte;
 	vm_paddr_t mask, pa;
 	int lvl, val;
 	bool managed;
 
 	PMAP_LOCK(pmap);
 retry:
 	/* "val" stays 0 if no mapping is found for "addr". */
 	val = 0;
 	pte = pmap_pte(pmap, addr, &lvl);
 	if (pte != NULL) {
 		tpte = pmap_load(pte);
 
 		/* Select the offset mask for the size of the mapping found. */
 		switch (lvl) {
 		case 3:
 			mask = L3_OFFSET;
 			break;
 		case 2:
 			mask = L2_OFFSET;
 			break;
 		case 1:
 			mask = L1_OFFSET;
 			break;
 		default:
 			panic("pmap_mincore: invalid level %d", lvl);
 		}
 
 		managed = (tpte & ATTR_SW_MANAGED) != 0;
 		val = MINCORE_INCORE;
 		/* Anything above level 3 is reported as a superpage. */
 		if (lvl != 3)
 			val |= MINCORE_SUPER;
 		/*
 		 * A managed mapping counts as modified when it is dirty; an
 		 * unmanaged mapping counts as modified when it is writeable.
 		 */
 		if ((managed && pmap_pte_dirty(tpte)) || (!managed &&
 		    (tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)))
 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 		if ((tpte & ATTR_AF) == ATTR_AF)
 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 
 		/* Physical address of "addr" itself within the mapping. */
 		pa = (tpte & ~ATTR_MASK) | (addr & mask);
 	} else
 		managed = false;
 
 	/*
 	 * "pa" is only read when "managed" is true, which implies that it was
 	 * assigned above.
 	 */
 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
 			goto retry;
 	} else
 		PA_UNLOCK_COND(*locked_pa);
 	PMAP_UNLOCK(pmap);
 
 	return (val);
 }
 
 /*
  * Make the pmap of the process owning thread "td" the active user address
  * space: record the physical address of its L0 table in the process's
  * machine-dependent data, load that address into TTBR0_EL1, and call
  * pmap_invalidate_all() on the pmap.  The sequence runs inside a critical
  * section.
  */
 void
 pmap_activate(struct thread *td)
 {
 	pmap_t	pmap;
 
 	critical_enter();
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	/* pmap_switch() compares md_l0addr to detect address space changes. */
 	td->td_proc->p_md.md_l0addr = vtophys(pmap->pm_l0);
 	__asm __volatile(
 	    "msr ttbr0_el1, %0	\n"
 	    "isb		\n"
 	    : : "r"(td->td_proc->p_md.md_l0addr));
 	pmap_invalidate_all(pmap);
 	critical_exit();
 }
 
 /*
  * Per-CPU bookkeeping for a context switch from thread "old" (may be NULL)
  * to thread "new".  Updates the per-CPU curthread and curpcb, and, if the two
  * threads' processes have different recorded L0 table addresses (or "old" is
  * NULL), loads the new one into TTBR0_EL1, invalidates the TLB, and runs the
  * per-CPU branch predictor hardening hook if one is set.  Returns the pcb of
  * "new".
  */
 struct pcb *
 pmap_switch(struct thread *old, struct thread *new)
 {
 	pcpu_bp_harden bp_harden;
 	struct pcb *pcb;
 
 	/* Store the new curthread */
 	PCPU_SET(curthread, new);
 
 	/* And the new pcb */
 	pcb = new->td_pcb;
 	PCPU_SET(curpcb, pcb);
 
 	/*
 	 * TODO: We may need to flush the cache here if switching
 	 * to a user process.
 	 */
 
 	/* Skip the TTBR0 reload when the address space is unchanged. */
 	if (old == NULL ||
 	    old->td_proc->p_md.md_l0addr != new->td_proc->p_md.md_l0addr) {
 		__asm __volatile(
 		    /* Switch to the new pmap */
 		    "msr	ttbr0_el1, %0	\n"
 		    "isb			\n"
 
 		    /* Invalidate the TLB */
 		    "dsb	ishst		\n"
 		    "tlbi	vmalle1is	\n"
 		    "dsb	ish		\n"
 		    "isb			\n"
 		    : : "r"(new->td_proc->p_md.md_l0addr));
 
 		/*
 		 * Stop userspace from training the branch predictor against
 		 * other processes. This will call into a CPU specific
 		 * function that clears the branch predictor state.
 		 */
 		bp_harden = PCPU_GET(bp_harden);
 		if (bp_harden != NULL)
 			bp_harden();
 	}
 
 	return (pcb);
 }
 
 /*
  * Synchronize the instruction cache for the range [va, va + sz).
  *
  * Kernel addresses (at or above VM_MIN_KERNEL_ADDRESS) are synchronized
  * directly in one call.  For lower addresses the range is walked one page
  * at a time: each page is looked up in "pmap" with pmap_extract() and, if
  * it is mapped, synchronized through its DMAP alias.  Pages for which
  * pmap_extract() returns 0 are skipped.
  *
  * The per-page length is computed in vm_size_t.  Routing "sz" through
  * imin() would truncate it to int, so a size of 2GB or more could yield a
  * negative or zero chunk length.
  */
 void
 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
 {
 	vm_size_t len, room;
 	vm_paddr_t pa;
 
 	if (va >= VM_MIN_KERNEL_ADDRESS) {
 		cpu_icache_sync_range(va, sz);
 		return;
 	}
 
 	while (sz != 0) {
 		/* Never cross a page boundary within a single step. */
 		room = PAGE_SIZE - (va & PAGE_MASK);
 		len = sz < room ? sz : room;
 
 		/* Extract the physical address & find it in the DMAP */
 		pa = pmap_extract(pmap, va);
 		if (pa != 0)
 			cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
 
 		/* Move to the next page */
 		sz -= len;
 		va += len;
 	}
 }
 
 /*
  * Try to resolve an instruction or data abort inside the pmap layer.
  *
  * "esr" is the exception syndrome value and "far" the faulting address.
  * Only the instruction/data abort exception classes are considered; the
  * fault status code then selects the handling:
  *  - access flag faults: set ATTR_AF in the entry, if one is found;
  *  - permission faults: only for a data abort with ISS_DATA_WnR set, and
  *    only if the entry is read-only and carries ATTR_SW_DBM; in that case
  *    ATTR_AP_RW_BIT is cleared and the page is invalidated;
  *  - translation faults: ask the MMU to translate the address again and
  *    report success if that translation succeeds.
  *
  * \returns KERN_SUCCESS if the fault was handled here, KERN_FAILURE
  *          otherwise.
  */
 int
 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
 {
 	pt_entry_t *pte;
 	register_t intr;
 	uint64_t ec, par;
 	int lvl, rv;
 
 	rv = KERN_FAILURE;
 
 	/* Anything other than an instruction or data abort is not ours. */
 	ec = ESR_ELx_EXCEPTION(esr);
 	switch (ec) {
 	case EXCP_INSN_ABORT_L:
 	case EXCP_INSN_ABORT:
 	case EXCP_DATA_ABORT_L:
 	case EXCP_DATA_ABORT:
 		break;
 	default:
 		return (rv);
 	}
 
 	/* Data and insn aborts use same encoding for FSC field. */
 	switch (esr & ISS_DATA_DFSC_MASK) {
 	case ISS_DATA_DFSC_AFF_L1:
 	case ISS_DATA_DFSC_AFF_L2:
 	case ISS_DATA_DFSC_AFF_L3:
 		PMAP_LOCK(pmap);
 		pte = pmap_pte(pmap, far, &lvl);
 		if (pte != NULL) {
 			pmap_set_bits(pte, ATTR_AF);
 			rv = KERN_SUCCESS;
 			/*
 			 * XXXMJ as an optimization we could mark the entry
 			 * dirty if this is a write fault.
 			 */
 		}
 		PMAP_UNLOCK(pmap);
 		break;
 	case ISS_DATA_DFSC_PF_L1:
 	case ISS_DATA_DFSC_PF_L2:
 	case ISS_DATA_DFSC_PF_L3:
 		/* Only a data abort with the WnR bit set is fixed up here. */
 		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
 		    (esr & ISS_DATA_WnR) == 0)
 			return (rv);
 		PMAP_LOCK(pmap);
 		pte = pmap_pte(pmap, far, &lvl);
 		/* The entry must be read-only and have ATTR_SW_DBM set. */
 		if (pte != NULL &&
 		    (pmap_load(pte) & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) ==
 		    (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) {
 			pmap_clear_bits(pte, ATTR_AP_RW_BIT);
-			pmap_invalidate_page(pmap, trunc_page(far));
+			pmap_invalidate_page(pmap, far);
 			rv = KERN_SUCCESS;
 		}
 		PMAP_UNLOCK(pmap);
 		break;
 	case ISS_DATA_DFSC_TF_L0:
 	case ISS_DATA_DFSC_TF_L1:
 	case ISS_DATA_DFSC_TF_L2:
 	case ISS_DATA_DFSC_TF_L3:
 		PMAP_LOCK(pmap);
 		/* Ask the MMU to check the address */
 		intr = intr_disable();
 		if (pmap == kernel_pmap)
 			par = arm64_address_translate_s1e1r(far);
 		else
 			par = arm64_address_translate_s1e0r(far);
 		intr_restore(intr);
 		PMAP_UNLOCK(pmap);
 
 		/*
 		 * If the translation was successful the address was invalid
 		 * due to a break-before-make sequence. We can unlock and
 		 * return success to the trap handler.
 		 */
 		if (PAR_SUCCESS(par))
 			rv = KERN_SUCCESS;
 		break;
 	}
 
 	return (rv);
 }
 
 /*
  *	Adjust *addr upwards so that its offset within an L2 superpage
  *	matches the offset of the backing object (including the object's
  *	color, when it has one).  Nothing is changed when the mapping is
  *	smaller than L2_SIZE, when it could not contain a full superpage
  *	after alignment, or when the offsets already agree.
  */
 void
 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t size)
 {
 	vm_offset_t base, cur_off, want_off;
 
 	/* Too small to ever hold a superpage. */
 	if (size < L2_SIZE)
 		return;
 
 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 		offset += ptoa(object->pg_color);
 	want_off = offset & L2_OFFSET;
 
 	/* After skipping to the next boundary, is a full superpage left? */
 	if (size - ((L2_SIZE - want_off) & L2_OFFSET) < L2_SIZE)
 		return;
 
 	cur_off = *addr & L2_OFFSET;
 	if (cur_off == want_off)
 		return;
 
 	/* Stay in the current superpage if possible, else use the next one. */
 	if (cur_off < want_off)
 		base = *addr & ~L2_OFFSET;
 	else
 		base = (*addr + L2_OFFSET) & ~L2_OFFSET;
 	*addr = base + want_off;
 }
 
 /**
  * Obtain kernel virtual addresses for a set of physical pages.  Pages that
  * are covered by the DMAP are returned through their DMAP address; for any
  * other page KVA is allocated for a transient mapping that must later be
  * released with pmap_unmap_io_transient.
  *
  * \param page        The pages whose kernel virtual addresses the caller
  *                    wants to obtain.
  * \param vaddr       On return contains the kernel virtual memory address
  *                    of each page passed in the page parameter.
  * \param count       Number of pages passed in.
  * \param can_fault   TRUE if the thread using the mapped pages can take
  *                    page faults, FALSE otherwise.
  *
  * \returns TRUE if the caller must call pmap_unmap_io_transient when
  *          finished or FALSE otherwise.
  *
  */
 boolean_t
 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
     boolean_t can_fault)
 {
 	vm_paddr_t pa;
 	boolean_t outside_dmap;
 	int error, idx;
 
 	/*
 	 * First pass: hand out DMAP addresses and reserve KVA for everything
 	 * else.  This is kept separate from the second pass so that
 	 * vmem_alloc is never called while pinned.
 	 */
 	outside_dmap = FALSE;
 	for (idx = 0; idx < count; idx++) {
 		pa = VM_PAGE_TO_PHYS(page[idx]);
 		if (__predict_false(!PHYS_IN_DMAP(pa))) {
 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
 			    M_BESTFIT | M_WAITOK, &vaddr[idx]);
 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
 			outside_dmap = TRUE;
 			continue;
 		}
 		vaddr[idx] = PHYS_TO_DMAP(pa);
 	}
 
 	/* Nothing left to do when the DMAP covers every page. */
 	if (!outside_dmap)
 		return (FALSE);
 
 	if (!can_fault)
 		sched_pin();
 
 	/* Second pass: mapping pages outside the DMAP is not implemented. */
 	for (idx = 0; idx < count; idx++) {
 		pa = VM_PAGE_TO_PHYS(page[idx]);
 		if (PHYS_IN_DMAP(pa))
 			continue;
 		panic(
 		   "pmap_map_io_transient: TODO: Map out of DMAP data");
 	}
 
 	return (TRUE);
 }
 
 /*
  * Counterpart of pmap_map_io_transient: drop the scheduler pin taken for
  * callers that cannot fault and walk the pages again.  Releasing a page
  * that lies outside the DMAP is not implemented and panics.
  */
 void
 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
     boolean_t can_fault)
 {
 	vm_paddr_t pa;
 	int idx;
 
 	if (!can_fault)
 		sched_unpin();
 
 	for (idx = 0; idx < count; idx++) {
 		pa = VM_PAGE_TO_PHYS(page[idx]);
 		if (PHYS_IN_DMAP(pa))
 			continue;
 		panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
 	}
 }
 
 /*
  * Report whether "mode" lies in the inclusive range
  * [VM_MEMATTR_DEVICE, VM_MEMATTR_WRITE_THROUGH].  The pmap argument is
  * not used.
  */
 boolean_t
 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
 {
 
 	if (mode < VM_MEMATTR_DEVICE)
 		return (FALSE);
 	if (mode > VM_MEMATTR_WRITE_THROUGH)
 		return (FALSE);
 	return (TRUE);
 }
Index: projects/fuse2/sys/arm64/conf/GENERIC
===================================================================
--- projects/fuse2/sys/arm64/conf/GENERIC	(revision 350434)
+++ projects/fuse2/sys/arm64/conf/GENERIC	(revision 350435)
@@ -1,327 +1,328 @@
 #
 # GENERIC -- Generic kernel configuration file for FreeBSD/arm64
 #
 # For more information on this file, please read the config(5) manual page,
 # and/or the handbook section on Kernel Configuration Files:
 #
 #    https://www.FreeBSD.org/doc/en_US.ISO8859-1/books/handbook/kernelconfig-config.html
 #
 # The handbook is also available locally in /usr/share/doc/handbook
 # if you've installed the doc distribution, otherwise always see the
 # FreeBSD World Wide Web server (https://www.FreeBSD.org/) for the
 # latest information.
 #
 # An exhaustive list of options and more detailed explanations of the
 # device lines is also present in the ../../conf/NOTES and NOTES files.
 # If you are in doubt as to the purpose or necessity of a line, check first
 # in NOTES.
 #
 # $FreeBSD$
 
 cpu		ARM64
 ident		GENERIC
 
 makeoptions	DEBUG=-g		# Build kernel with gdb(1) debug symbols
 makeoptions	WITH_CTF=1		# Run ctfconvert(1) for DTrace support
 
 options 	SCHED_ULE		# ULE scheduler
 options 	PREEMPTION		# Enable kernel thread preemption
 options 	VIMAGE			# Subsystem virtualization, e.g. VNET
 options 	INET			# InterNETworking
 options 	INET6			# IPv6 communications protocols
 options 	IPSEC_SUPPORT		# Allow kldload of ipsec and tcpmd5
 options 	TCP_HHOOK		# hhook(9) framework for TCP
 options 	TCP_OFFLOAD		# TCP offload
 options		TCP_RFC7413		# TCP Fast Open
 options 	SCTP			# Stream Control Transmission Protocol
 options 	FFS			# Berkeley Fast Filesystem
 options 	SOFTUPDATES		# Enable FFS soft updates support
 options 	UFS_ACL			# Support for access control lists
 options 	UFS_DIRHASH		# Improve performance on big directories
 options 	UFS_GJOURNAL		# Enable gjournal-based UFS journaling
 options 	QUOTA			# Enable disk quotas for UFS
 options 	MD_ROOT			# MD is a potential root device
 options 	NFSCL			# Network Filesystem Client
 options 	NFSD			# Network Filesystem Server
 options 	NFSLOCKD		# Network Lock Manager
 options 	NFS_ROOT		# NFS usable as /, requires NFSCL
 options 	MSDOSFS			# MSDOS Filesystem
 options 	CD9660			# ISO 9660 Filesystem
 options 	PROCFS			# Process filesystem (requires PSEUDOFS)
 options 	PSEUDOFS		# Pseudo-filesystem framework
 options 	GEOM_RAID		# Soft RAID functionality.
 options 	GEOM_LABEL		# Provides labelization
 options 	COMPAT_FREEBSD32	# Compatible with FreeBSD/arm
 options 	COMPAT_FREEBSD11	# Compatible with FreeBSD11
 options 	COMPAT_FREEBSD12	# Compatible with FreeBSD12
 options 	SCSI_DELAY=5000		# Delay (in ms) before probing SCSI
 options 	KTRACE			# ktrace(1) support
 options 	STACK			# stack(9) support
 options 	SYSVSHM			# SYSV-style shared memory
 options 	SYSVMSG			# SYSV-style message queues
 options 	SYSVSEM			# SYSV-style semaphores
 options 	_KPOSIX_PRIORITY_SCHEDULING # POSIX P1003_1B real-time extensions
 options 	PRINTF_BUFR_SIZE=128	# Prevent printf output being interspersed.
 options 	KBD_INSTALL_CDEV	# install a CDEV entry in /dev
 options 	HWPMC_HOOKS		# Necessary kernel hooks for hwpmc(4)
 options 	AUDIT			# Security event auditing
 options 	CAPABILITY_MODE		# Capsicum capability mode
 options 	CAPABILITIES		# Capsicum capabilities
 options 	MAC			# TrustedBSD MAC Framework
 options 	KDTRACE_FRAME		# Ensure frames are compiled in
 options 	KDTRACE_HOOKS		# Kernel DTrace hooks
 options 	VFP			# Floating-point support
 options 	RACCT			# Resource accounting framework
 options 	RACCT_DEFAULT_TO_DISABLED # Set kern.racct.enable=0 by default
 options 	RCTL			# Resource limits
 options 	SMP
 options 	INTRNG
 
 # Debugging support.  Always need this:
 options 	KDB			# Enable kernel debugger support.
 options 	KDB_TRACE		# Print a stack trace for a panic.
 # For full debugger support use (turn off in stable branch):
 options 	DDB			# Support DDB.
 #options 	GDB			# Support remote GDB.
 options 	DEADLKRES		# Enable the deadlock resolver
 options 	INVARIANTS		# Enable calls of extra sanity checking
 options 	INVARIANT_SUPPORT	# Extra sanity checks of internal structures, required by INVARIANTS
 options 	WITNESS			# Enable checks to detect deadlocks and cycles
 options 	WITNESS_SKIPSPIN	# Don't run witness on spinlocks for speed
 options 	MALLOC_DEBUG_MAXZONES=8	# Separate malloc(9) zones
 options 	ALT_BREAK_TO_DEBUGGER	# Enter debugger on keyboard escape sequence
 options 	USB_DEBUG		# enable debug msgs
 options 	VERBOSE_SYSINIT=0	# Support debug.verbose_sysinit, off by default
 
 # Kernel Sanitizers
 #options 	COVERAGE		# Generic kernel coverage. Used by KCOV
 #options 	KCOV			# Kernel Coverage Sanitizer
 # Warning: KUBSAN can result in a kernel too large for loader to load
 #options 	KUBSAN			# Kernel Undefined Behavior Sanitizer
 
 # Kernel dump features.
 options 	EKCD			# Support for encrypted kernel dumps
 options 	GZIO			# gzip-compressed kernel and user dumps
 options 	ZSTDIO			# zstd-compressed kernel and user dumps
 options 	NETDUMP			# netdump(4) client support
 
 # SoC support
 options 	SOC_ALLWINNER_A64
 options 	SOC_ALLWINNER_H5
 options 	SOC_CAVM_THUNDERX
 options 	SOC_HISI_HI6220
 options 	SOC_BRCM_BCM2837
 options 	SOC_MARVELL_8K
 options 	SOC_ROCKCHIP_RK3328
 options 	SOC_ROCKCHIP_RK3399
 options 	SOC_XILINX_ZYNQ
 
 # Timer drivers
 device		a10_timer
 
 # Annapurna Alpine drivers
 device		al_ccu			# Alpine Cache Coherency Unit
 device		al_nb_service		# Alpine North Bridge Service
 device		al_iofic		# I/O Fabric Interrupt Controller
 device		al_serdes		# Serializer/Deserializer
 device		al_udma			# Universal DMA
 
 # Qualcomm Snapdragon drivers
 device		qcom_gcc		# Global Clock Controller
 
 # VirtIO support
 device		virtio
 device		virtio_pci
 device		virtio_mmio
 device		virtio_blk
 device		vtnet
 
 # CPU frequency control
 device		cpufreq
 
 # Bus drivers
 device		pci
 device		al_pci		# Annapurna Alpine PCI-E
 options 	PCI_HP			# PCI-Express native HotPlug
 options 	PCI_IOV		# PCI SR-IOV support
 
 # PCI/PCI-X/PCIe Ethernet NICs that use iflib infrastructure
 device		iflib
 device		em		# Intel PRO/1000 Gigabit Ethernet Family
 device		ix		# Intel 10Gb Ethernet Family
 
 # Ethernet NICs
 device		mdio
 device		mii
 device		miibus		# MII bus support
 device		awg		# Allwinner EMAC Gigabit Ethernet
 device		axgbe		# AMD Opteron A1100 integrated NIC
 device		msk		# Marvell/SysKonnect Yukon II Gigabit Ethernet
 device		neta		# Marvell Armada 370/38x/XP/3700 NIC
 device  	smc		# SMSC LAN91C111
 device		vnic		# Cavium ThunderX NIC
 device		al_eth		# Annapurna Alpine Ethernet NIC
 device		dwc_rk		# Rockchip Designware
+device		dwc_socfpga	# Altera SOCFPGA Ethernet MAC
 
 # Etherswitch devices
 device		etherswitch	# Enable etherswitch support
 device		miiproxy	# Required for etherswitch
 device		e6000sw		# Marvell mv88e6085 based switches
 
 # Block devices
 device		ahci
 device		scbus
 device		da
 
 # ATA/SCSI peripherals
 device		pass		# Passthrough device (direct ATA/SCSI access)
 
 # NVM Express (NVMe) support
 device		nvme		# base NVMe driver
 options 	NVME_USE_NVD=0	# prefer the cam(4) based nda(4) driver
 device		nvd		# expose NVMe namespaces as disks, depends on nvme
 
 # MMC/SD/SDIO Card slot support
 device		sdhci
 device		sdhci_xenon		# Marvell Xenon SD/MMC controller
 device		aw_mmc			# Allwinner SD/MMC controller
 device		mmc			# mmc/sd bus
 device		mmcsd			# mmc/sd flash cards
 device		dwmmc
 device		rk_emmcphy
 
 # Serial (COM) ports
 device		uart		# Generic UART driver
 device		uart_msm	# Qualcomm MSM UART driver
 device		uart_mu		# RPI3 aux port
 device		uart_mvebu	# Armada 3700 UART driver
 device		uart_ns8250	# ns8250-type UART driver
 device		uart_snps
 device		pl011
 
 # USB support
 device		aw_ehci			# Allwinner EHCI USB interface (USB 2.0)
 device		aw_usbphy		# Allwinner USB PHY
 device		dwcotg			# DWC OTG controller
 device		ohci			# OHCI USB interface
 device		ehci			# EHCI USB interface (USB 2.0)
 device		ehci_mv			# Marvell EHCI USB interface
 device		xhci			# XHCI USB interface (USB 3.0)
 device		usb			# USB Bus (required)
 device		ukbd			# Keyboard
 device		umass			# Disks/Mass storage - Requires scbus and da
 
 # USB ethernet support
 device		muge
 device		smcphy
 device		smsc
 
 # Sound support
 device sound
 device a10_codec
 
 # DMA controller
 device		a31_dmac
 
 # GPIO / PINCTRL
 device		a37x0_gpio	# Marvell Armada 37x0 GPIO controller
 device		aw_gpio		# Allwinner GPIO controller
 device		gpio
 device		gpioled
 device		fdt_pinctrl
 device		gpioregulator
 device		mv_gpio		# Marvell GPIO controller
 device		mvebu_pinctrl	# Marvell Pinmux Controller
 device		rk_gpio		# RockChip GPIO Controller
 device		rk_pinctrl	# RockChip Pinmux Controller
 
 # I2C
 device		aw_rsb		# Allwinner Reduced Serial Bus
 device		bcm2835_bsc	# Broadcom BCM283x I2C bus
 device		iicbus
 device		iic
 device		twsi		# Allwinner I2C controller
 device		rk_i2c		# RockChip I2C controller
 device		syr827		# Silergy SYR827 PMIC
 device		sy8106a		# SY8106A Buck Regulator
 
 # Clock and reset controllers
 device		aw_ccu		# Allwinner clock controller
 
 # Interrupt controllers
 device		aw_nmi		# Allwinner NMI support
 device		mv_cp110_icu	# Marvell CP110 ICU
 device		mv_ap806_gicp	# Marvell AP806 GICP
 
 # Real-time clock support
 device		aw_rtc		# Allwinner Real-time Clock
 device		mv_rtc		# Marvell Real-time Clock
 
 # Watchdog controllers
 device		aw_wdog		# Allwinner Watchdog
 
 # Power management controllers
 device		axp81x		# X-Powers AXP81x PMIC
 device		rk805		# RockChip RK805 PMIC
 
 # EFUSE
 device		aw_sid		# Allwinner Secure ID EFUSE
 
 # Thermal sensors
 device		aw_thermal	# Allwinner Thermal Sensor Controller
 device		mv_thermal	# Marvell Thermal Sensor Controller
 
 # SPI
 device		spibus
 device		bcm2835_spi	# Broadcom BCM283x SPI bus
 
 # PWM
 device		pwm
 device		aw_pwm
 
 # Console
 device		vt
 device		kbdmux
 
 device		vt_efifb
 
 # EVDEV support
 device		evdev			# input event device support
 options		EVDEV_SUPPORT		# evdev support in legacy drivers
 device		uinput			# install /dev/uinput cdev
 device		aw_cir
 
 # Pseudo devices.
 device		crypto		# core crypto support
 device		loop		# Network loopback
 device		ether		# Ethernet support
 device		vlan		# 802.1Q VLAN support
 device		tuntap		# Packet tunnel.
 device		md		# Memory "disks"
 device		gif		# IPv6 and IPv4 tunneling
 device		firmware	# firmware assist module
 options 	EFIRT		# EFI Runtime Services
 
 # EXT_RESOURCES pseudo devices
 options 	EXT_RESOURCES
 device		clk
 device		phy
 device		hwreset
 device		nvmem
 device		regulator
 device		syscon
 device		aw_syscon
 
 # The `bpf' device enables the Berkeley Packet Filter.
 # Be aware of the administrative consequences of enabling this!
 # Note that 'bpf' is required for DHCP.
 device		bpf		# Berkeley packet filter
 
 # Chip-specific errata
 options 	THUNDERX_PASS_1_1_ERRATA
 
 options 	FDT
 device		acpi
 
 # DTBs
 makeoptions	MODULES_EXTRA="dtb/allwinner dtb/rockchip dtb/rpi"
Index: projects/fuse2/sys/compat/freebsd32/freebsd32_capability.c
===================================================================
--- projects/fuse2/sys/compat/freebsd32/freebsd32_capability.c	(revision 350434)
+++ projects/fuse2/sys/compat/freebsd32/freebsd32_capability.c	(revision 350435)
@@ -1,156 +1,157 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2013 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by Pawel Jakub Dawidek under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/filedesc.h>
+#include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysproto.h>
 
 #include <security/audit/audit.h>
 
 #include <compat/freebsd32/freebsd32_proto.h>
 
 #ifdef CAPABILITIES
 
 MALLOC_DECLARE(M_FILECAPS);
 
 int
 freebsd32_cap_ioctls_limit(struct thread *td,
     struct freebsd32_cap_ioctls_limit_args *uap)
 {
 	u_long *cmds;
 	uint32_t *cmds32;
 	size_t ncmds;
 	u_int i;
 	int error;
 
 	ncmds = uap->ncmds;
 
 	if (ncmds > 256)	/* XXX: Is 256 sane? */
 		return (EINVAL);
 
 	if (ncmds == 0) {
 		cmds = NULL;
 	} else {
 		cmds32 = malloc(sizeof(cmds32[0]) * ncmds, M_FILECAPS, M_WAITOK);
 		error = copyin(uap->cmds, cmds32, sizeof(cmds32[0]) * ncmds);
 		if (error != 0) {
 			free(cmds32, M_FILECAPS);
 			return (error);
 		}
 		cmds = malloc(sizeof(cmds[0]) * ncmds, M_FILECAPS, M_WAITOK);
 		for (i = 0; i < ncmds; i++)
 			cmds[i] = cmds32[i];
 		free(cmds32, M_FILECAPS);
 	}
 
 	return (kern_cap_ioctls_limit(td, uap->fd, cmds, ncmds));
 }
 
 int
 freebsd32_cap_ioctls_get(struct thread *td,
     struct freebsd32_cap_ioctls_get_args *uap)
 {
 	struct filedesc *fdp;
 	struct filedescent *fdep;
 	uint32_t *cmds32;
 	u_long *cmds;
 	size_t maxcmds;
 	int error, fd;
 	u_int i;
 
 	fd = uap->fd;
 	cmds32 = uap->cmds;
 	maxcmds = uap->maxcmds;
 
 	AUDIT_ARG_FD(fd);
 
 	fdp = td->td_proc->p_fd;
 	FILEDESC_SLOCK(fdp);
 
 	if (fget_locked(fdp, fd) == NULL) {
 		error = EBADF;
 		goto out;
 	}
 
 	/*
 	 * If all ioctls are allowed (fde_nioctls == -1 && fde_ioctls == NULL)
 	 * the only sane thing we can do is to not populate the given array and
 	 * return CAP_IOCTLS_ALL (actually, INT_MAX).
 	 */
 
 	fdep = &fdp->fd_ofiles[fd];
 	cmds = fdep->fde_ioctls;
 	if (cmds32 != NULL && cmds != NULL) {
 		for (i = 0; i < MIN(fdep->fde_nioctls, maxcmds); i++) {
 			error = suword32(&cmds32[i], cmds[i]);
 			if (error != 0)
 				goto out;
 		}
 	}
 	if (fdep->fde_nioctls == -1)
 		td->td_retval[0] = INT_MAX;
 	else
 		td->td_retval[0] = fdep->fde_nioctls;
 
 	error = 0;
 out:
 	FILEDESC_SUNLOCK(fdp);
 	return (error);
 }
 
 #else /* !CAPABILITIES */
 
 /*
  * Stub used when the kernel is built without CAPABILITIES: the syscall is
  * not implemented and always fails with ENOSYS.
  */
 int
 freebsd32_cap_ioctls_limit(struct thread *td,
     struct freebsd32_cap_ioctls_limit_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 /*
  * Stub used when the kernel is built without CAPABILITIES: the syscall is
  * not implemented and always fails with ENOSYS.
  */
 int
 freebsd32_cap_ioctls_get(struct thread *td,
     struct freebsd32_cap_ioctls_get_args *uap)
 {
 
 	return (ENOSYS);
 }
 
 #endif /* CAPABILITIES */
Index: projects/fuse2/sys/conf/files.arm64
===================================================================
--- projects/fuse2/sys/conf/files.arm64	(revision 350434)
+++ projects/fuse2/sys/conf/files.arm64	(revision 350435)
@@ -1,291 +1,292 @@
 # $FreeBSD$
 cloudabi32_vdso.o		optional	compat_cloudabi32	\
 	dependency	"$S/contrib/cloudabi/cloudabi_vdso_armv6_on_64bit.S"	\
 	compile-with	"${CC} -x assembler-with-cpp -m32 -shared -nostdinc -nostdlib -Wl,-T$S/compat/cloudabi/cloudabi_vdso.lds $S/contrib/cloudabi/cloudabi_vdso_armv6_on_64bit.S -o ${.TARGET}" \
 	no-obj no-implicit-rule						\
 	clean		"cloudabi32_vdso.o"
 #
 cloudabi32_vdso_blob.o		optional	compat_cloudabi32	\
 	dependency 	"cloudabi32_vdso.o"			\
 	compile-with	"${OBJCOPY} --input-target binary --output-target elf64-littleaarch64 --binary-architecture aarch64 cloudabi32_vdso.o ${.TARGET}" \
 	no-implicit-rule						\
 	clean		"cloudabi32_vdso_blob.o"
 #
 cloudabi64_vdso.o		optional	compat_cloudabi64	\
 	dependency	"$S/contrib/cloudabi/cloudabi_vdso_aarch64.S"	\
 	compile-with	"${CC} -x assembler-with-cpp -shared -nostdinc -nostdlib -Wl,-T$S/compat/cloudabi/cloudabi_vdso.lds $S/contrib/cloudabi/cloudabi_vdso_aarch64.S -o ${.TARGET}" \
 	no-obj no-implicit-rule						\
 	clean		"cloudabi64_vdso.o"
 #
 cloudabi64_vdso_blob.o		optional	compat_cloudabi64	\
 	dependency 	"cloudabi64_vdso.o"			\
 	compile-with	"${OBJCOPY} --input-target binary --output-target elf64-littleaarch64 --binary-architecture aarch64 cloudabi64_vdso.o ${.TARGET}" \
 	no-implicit-rule						\
 	clean		"cloudabi64_vdso_blob.o"
 #
 
 # Allwinner common files
 arm/allwinner/a10_ehci.c	optional	ehci aw_ehci fdt
 arm/allwinner/a10_timer.c	optional	a10_timer fdt
 arm/allwinner/a10_codec.c	optional	sound a10_codec
 arm/allwinner/a31_dmac.c	optional	a31_dmac
 arm/allwinner/sunxi_dma_if.m	optional	a31_dmac
 arm/allwinner/aw_cir.c		optional	evdev aw_cir fdt
 arm/allwinner/aw_gpio.c		optional	gpio aw_gpio fdt
 arm/allwinner/aw_mmc.c		optional	mmc aw_mmc fdt | mmccam aw_mmc fdt
 arm/allwinner/aw_nmi.c		optional	aw_nmi fdt \
 	compile-with "${NORMAL_C} -I$S/gnu/dts/include"
 arm/allwinner/aw_pwm.c		optional	aw_pwm fdt
 arm/allwinner/aw_rsb.c		optional	aw_rsb fdt
 arm/allwinner/aw_rtc.c		optional	aw_rtc fdt
 arm/allwinner/aw_sid.c		optional	aw_sid nvmem fdt
 arm/allwinner/aw_spi.c		optional	aw_spi fdt
 arm/allwinner/aw_syscon.c	optional	aw_syscon ext_resources syscon fdt
 arm/allwinner/aw_thermal.c	optional	aw_thermal nvmem fdt
 arm/allwinner/aw_usbphy.c	optional	ehci aw_usbphy fdt
 arm/allwinner/aw_wdog.c		optional	aw_wdog fdt
 arm/allwinner/axp81x.c		optional	axp81x fdt
 arm/allwinner/if_awg.c		optional	awg ext_resources syscon aw_sid nvmem fdt
 
 # Allwinner clock driver
 arm/allwinner/clkng/aw_ccung.c		optional	aw_ccu fdt
 arm/allwinner/clkng/aw_clk_frac.c	optional	aw_ccu fdt
 arm/allwinner/clkng/aw_clk_nkmp.c	optional	aw_ccu fdt
 arm/allwinner/clkng/aw_clk_nm.c		optional	aw_ccu fdt
 arm/allwinner/clkng/aw_clk_prediv_mux.c	optional	aw_ccu fdt
 arm/allwinner/clkng/ccu_a64.c		optional	soc_allwinner_a64 aw_ccu fdt
 arm/allwinner/clkng/ccu_h3.c		optional	soc_allwinner_h5 aw_ccu fdt
 arm/allwinner/clkng/ccu_sun8i_r.c	optional	aw_ccu fdt
 arm/allwinner/clkng/ccu_de2.c		optional	aw_ccu fdt
 
 # Allwinner padconf files
 arm/allwinner/a64/a64_padconf.c	optional	soc_allwinner_a64 fdt
 arm/allwinner/a64/a64_r_padconf.c optional	soc_allwinner_a64 fdt
 arm/allwinner/h3/h3_padconf.c	optional	soc_allwinner_h5 fdt
 arm/allwinner/h3/h3_r_padconf.c optional	soc_allwinner_h5 fdt
 
 arm/annapurna/alpine/alpine_ccu.c		optional	al_ccu fdt
 arm/annapurna/alpine/alpine_nb_service.c	optional	al_nb_service fdt
 arm/annapurna/alpine/alpine_pci.c		optional	al_pci fdt
 arm/annapurna/alpine/alpine_pci_msix.c		optional	al_pci fdt
 arm/annapurna/alpine/alpine_serdes.c		optional al_serdes fdt		\
 	no-depend	\
 	compile-with "${CC} -c -o ${.TARGET} ${CFLAGS} -I$S/contrib/alpine-hal -I$S/contrib/alpine-hal/eth ${PROF} ${.IMPSRC}"
 arm/arm/generic_timer.c		standard
 arm/arm/gic.c			standard
 arm/arm/gic_acpi.c		optional	acpi
 arm/arm/gic_fdt.c		optional	fdt
 arm/arm/pmu.c			standard
 arm/arm/physmem.c		standard
 arm/broadcom/bcm2835/bcm2835_audio.c		optional sound vchiq fdt \
 	compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq"
 arm/broadcom/bcm2835/bcm2835_bsc.c		optional bcm2835_bsc soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_cpufreq.c		optional soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_dma.c		optional soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_fbd.c		optional vt soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_ft5406.c		optional evdev bcm2835_ft5406 soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_gpio.c		optional gpio soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_intr.c		optional soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_mbox.c		optional soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_rng.c		optional !random_loadable soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_sdhci.c		optional sdhci soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_sdhost.c		optional sdhci soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_spi.c		optional bcm2835_spi soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_vcio.c		optional soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2835_wdog.c		optional soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm2836.c			optional soc_brcm_bcm2837 fdt
 arm/broadcom/bcm2835/bcm283x_dwc_fdt.c		optional dwcotg fdt soc_brcm_bcm2837
 arm/mv/a37x0_gpio.c				optional a37x0_gpio gpio fdt
 arm/mv/gpio.c					optional mv_gpio fdt
 arm/mv/mvebu_pinctrl.c				optional mvebu_pinctrl fdt
 arm/mv/mv_cp110_icu.c				optional mv_cp110_icu fdt
 arm/mv/mv_ap806_gicp.c				optional mv_ap806_gicp fdt
 arm/mv/mv_ap806_clock.c				optional SOC_MARVELL_8K fdt
 arm/mv/mv_cp110_clock.c				optional SOC_MARVELL_8K fdt
 arm/mv/mv_thermal.c				optional SOC_MARVELL_8K mv_thermal fdt
 arm/mv/armada38x/armada38x_rtc.c		optional mv_rtc fdt
 arm/xilinx/uart_dev_cdnc.c			optional uart soc_xilinx_zynq
 arm64/acpica/acpi_iort.c	optional	acpi
 arm64/acpica/acpi_machdep.c	optional	acpi
 arm64/acpica/OsdEnvironment.c	optional	acpi
 arm64/acpica/acpi_wakeup.c	optional	acpi
 arm64/acpica/pci_cfgreg.c	optional	acpi	pci
 arm64/arm64/autoconf.c		standard
 arm64/arm64/bus_machdep.c	standard
 arm64/arm64/bus_space_asm.S	standard
 arm64/arm64/busdma_bounce.c	standard
 arm64/arm64/busdma_machdep.c	standard
 arm64/arm64/bzero.S		standard
 arm64/arm64/clock.c		standard
 arm64/arm64/copyinout.S		standard
 arm64/arm64/copystr.c		standard
 arm64/arm64/cpu_errata.c	standard
 arm64/arm64/cpufunc_asm.S	standard
 arm64/arm64/db_disasm.c		optional	ddb
 arm64/arm64/db_interface.c	optional	ddb
 arm64/arm64/db_trace.c		optional	ddb
 arm64/arm64/debug_monitor.c	optional	ddb
 arm64/arm64/disassem.c		optional	ddb
 arm64/arm64/dump_machdep.c	standard
 arm64/arm64/efirt_machdep.c	optional	efirt
 arm64/arm64/elf32_machdep.c	optional	compat_freebsd32
 arm64/arm64/elf_machdep.c	standard
 arm64/arm64/exception.S		standard
 arm64/arm64/freebsd32_machdep.c	optional	compat_freebsd32
 arm64/arm64/gicv3_its.c		optional	intrng fdt
 arm64/arm64/gic_v3.c		standard
 arm64/arm64/gic_v3_acpi.c	optional	acpi
 arm64/arm64/gic_v3_fdt.c	optional	fdt
 arm64/arm64/identcpu.c		standard
 arm64/arm64/in_cksum.c		optional	inet | inet6
 arm64/arm64/locore.S		standard	no-obj
 arm64/arm64/machdep.c		standard
 arm64/arm64/mem.c		standard
 arm64/arm64/memcpy.S		standard
 arm64/arm64/memmove.S		standard
 arm64/arm64/minidump_machdep.c	standard
 arm64/arm64/mp_machdep.c	optional	smp
 arm64/arm64/nexus.c		standard
 arm64/arm64/ofw_machdep.c	optional	fdt
 arm64/arm64/pmap.c		standard
 arm64/arm64/stack_machdep.c	optional	ddb | stack
 arm64/arm64/support.S		standard
 arm64/arm64/swtch.S		standard
 arm64/arm64/sys_machdep.c	standard
 arm64/arm64/trap.c		standard
 arm64/arm64/uio_machdep.c	standard
 arm64/arm64/uma_machdep.c	standard
 arm64/arm64/undefined.c		standard
 arm64/arm64/unwind.c		optional	ddb | kdtrace_hooks | stack
 arm64/arm64/vfp.c		standard
 arm64/arm64/vm_machdep.c	standard
 arm64/cavium/thunder_pcie_fdt.c		optional	soc_cavm_thunderx pci fdt
 arm64/cavium/thunder_pcie_pem.c		optional	soc_cavm_thunderx pci
 arm64/cavium/thunder_pcie_pem_fdt.c	optional	soc_cavm_thunderx pci fdt
 arm64/cavium/thunder_pcie_common.c	optional	soc_cavm_thunderx pci
 arm64/cloudabi32/cloudabi32_sysvec.c	optional compat_cloudabi32
 arm64/cloudabi64/cloudabi64_sysvec.c	optional compat_cloudabi64
 arm64/coresight/coresight.c			standard
 arm64/coresight/coresight_if.m			standard
 arm64/coresight/coresight-cmd.c			standard
 arm64/coresight/coresight-cpu-debug.c		standard
 arm64/coresight/coresight-dynamic-replicator.c	standard
 arm64/coresight/coresight-etm4x.c		standard
 arm64/coresight/coresight-funnel.c		standard
 arm64/coresight/coresight-tmc.c			standard
 arm64/qualcomm/qcom_gcc.c			optional qcom_gcc fdt
 contrib/vchiq/interface/compat/vchi_bsd.c	optional vchiq soc_brcm_bcm2837 \
 	compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq"
 contrib/vchiq/interface/vchiq_arm/vchiq_2835_arm.c	optional vchiq soc_brcm_bcm2837 \
 	compile-with "${NORMAL_C} -Wno-unused -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq"
 contrib/vchiq/interface/vchiq_arm/vchiq_arm.c	optional vchiq soc_brcm_bcm2837 \
 	compile-with "${NORMAL_C} -Wno-unused -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq"
 contrib/vchiq/interface/vchiq_arm/vchiq_connected.c	optional vchiq soc_brcm_bcm2837 \
 	compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq"
 contrib/vchiq/interface/vchiq_arm/vchiq_core.c	optional vchiq soc_brcm_bcm2837 \
 	compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq"
 contrib/vchiq/interface/vchiq_arm/vchiq_kern_lib.c	optional vchiq soc_brcm_bcm2837 \
 	compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq"
 contrib/vchiq/interface/vchiq_arm/vchiq_kmod.c	optional vchiq soc_brcm_bcm2837 \
 	compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq"
 contrib/vchiq/interface/vchiq_arm/vchiq_shim.c	optional vchiq soc_brcm_bcm2837 \
 	compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq"
 contrib/vchiq/interface/vchiq_arm/vchiq_util.c	optional vchiq soc_brcm_bcm2837 \
 	compile-with "${NORMAL_C} -DUSE_VCHIQ_ARM -D__VCCOREVER__=0x04000000 -I$S/contrib/vchiq"
 crypto/armv8/armv8_crypto.c	optional	armv8crypto
 armv8_crypto_wrap.o		optional	armv8crypto		\
 	dependency	"$S/crypto/armv8/armv8_crypto_wrap.c"		\
 	compile-with	"${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc:N-mgeneral-regs-only} -I$S/crypto/armv8/ ${WERROR} ${NO_WCAST_QUAL} ${PROF} -march=armv8-a+crypto ${.IMPSRC}" \
 	no-implicit-rule						\
 	clean		"armv8_crypto_wrap.o"
 crypto/blowfish/bf_enc.c	optional	crypto | ipsec | ipsec_support
 crypto/des/des_enc.c		optional	crypto | ipsec | ipsec_support | netsmb
 dev/acpica/acpi_bus_if.m	optional	acpi
 dev/acpica/acpi_if.m		optional	acpi
 dev/acpica/acpi_pci_link.c	optional	acpi pci
 dev/acpica/acpi_pcib.c		optional	acpi pci
 dev/acpica/acpi_pxm.c		optional	acpi
 dev/ahci/ahci_generic.c		optional	ahci
+dev/altera/dwc/if_dwc_socfpga.c	optional	fdt dwc_socfpga
 dev/axgbe/if_axgbe.c		optional	axgbe
 dev/axgbe/xgbe-desc.c		optional	axgbe
 dev/axgbe/xgbe-dev.c		optional	axgbe
 dev/axgbe/xgbe-drv.c		optional	axgbe
 dev/axgbe/xgbe-mdio.c		optional	axgbe
 dev/cpufreq/cpufreq_dt.c	optional	cpufreq fdt
 dev/iicbus/sy8106a.c		optional	sy8106a fdt
 dev/iicbus/twsi/mv_twsi.c	optional	twsi fdt
 dev/iicbus/twsi/a10_twsi.c	optional	twsi fdt
 dev/iicbus/twsi/twsi.c		optional	twsi fdt
 dev/hwpmc/hwpmc_arm64.c		optional	hwpmc
 dev/hwpmc/hwpmc_arm64_md.c	optional	hwpmc
 dev/mbox/mbox_if.m		optional	soc_brcm_bcm2837
 dev/mmc/host/dwmmc.c		optional	dwmmc fdt
 dev/mmc/host/dwmmc_hisi.c	optional	dwmmc fdt soc_hisi_hi6220
 dev/mmc/host/dwmmc_rockchip.c	optional	dwmmc fdt soc_rockchip_rk3328
 dev/neta/if_mvneta_fdt.c	optional	neta fdt
 dev/neta/if_mvneta.c		optional	neta mdio mii
 dev/ofw/ofw_cpu.c		optional	fdt
 dev/ofw/ofwpci.c		optional 	fdt pci
 dev/pci/pci_host_generic.c	optional	pci
 dev/pci/pci_host_generic_acpi.c	optional	pci acpi
 dev/pci/pci_host_generic_fdt.c	optional	pci fdt
 dev/psci/psci.c			standard
 dev/psci/psci_arm64.S		standard
 dev/psci/smccc.c		standard
 dev/sdhci/sdhci_xenon.c		optional	sdhci_xenon sdhci fdt
 dev/uart/uart_cpu_arm64.c	optional	uart
 dev/uart/uart_dev_mu.c		optional	uart uart_mu
 dev/uart/uart_dev_pl011.c	optional	uart pl011
 dev/usb/controller/dwc_otg_hisi.c optional	dwcotg fdt soc_hisi_hi6220
 dev/usb/controller/ehci_mv.c	optional	ehci_mv fdt
 dev/usb/controller/generic_ehci.c optional	ehci acpi
 dev/usb/controller/generic_ohci.c optional	ohci fdt
 dev/usb/controller/generic_usb_if.m optional	ohci fdt
 dev/usb/controller/usb_nop_xceiv.c	optional fdt ext_resources
 dev/usb/controller/generic_xhci.c	optional	xhci
 dev/usb/controller/generic_xhci_acpi.c	optional	xhci acpi
 dev/usb/controller/generic_xhci_fdt.c	optional	xhci fdt
 dev/vnic/mrml_bridge.c		optional	vnic fdt
 dev/vnic/nic_main.c		optional	vnic pci
 dev/vnic/nicvf_main.c		optional	vnic pci pci_iov
 dev/vnic/nicvf_queues.c		optional	vnic pci pci_iov
 dev/vnic/thunder_bgx_fdt.c	optional	vnic fdt
 dev/vnic/thunder_bgx.c		optional	vnic pci
 dev/vnic/thunder_mdio_fdt.c	optional	vnic fdt
 dev/vnic/thunder_mdio.c		optional	vnic
 dev/vnic/lmac_if.m		optional	inet | inet6 | vnic
 kern/kern_clocksource.c		standard
 kern/msi_if.m			optional	intrng
 kern/pic_if.m			optional	intrng
 kern/subr_devmap.c		standard
 kern/subr_intr.c		optional	intrng
 libkern/bcmp.c			standard
 libkern/memcmp.c		standard
 libkern/memset.c		standard
 libkern/arm64/crc32c_armv8.S	standard
 cddl/contrib/opensolaris/common/atomic/aarch64/opensolaris_atomic.S	optional zfs | dtrace compile-with "${CDDL_C}"
 cddl/dev/dtrace/aarch64/dtrace_asm.S			optional dtrace compile-with "${DTRACE_S}"
 cddl/dev/dtrace/aarch64/dtrace_subr.c			optional dtrace compile-with "${DTRACE_C}"
 cddl/dev/fbt/aarch64/fbt_isa.c				optional dtrace_fbt | dtraceall compile-with "${FBT_C}"
 
 # RockChip Drivers
 arm64/rockchip/rk3399_emmcphy.c		optional fdt rk_emmcphy soc_rockchip_rk3399
 arm64/rockchip/rk_i2c.c			optional fdt rk_i2c soc_rockchip_rk3328 | fdt rk_i2c soc_rockchip_rk3399
 arm64/rockchip/rk805.c			optional fdt rk805 soc_rockchip_rk3328 | fdt rk805 soc_rockchip_rk3399
 arm64/rockchip/rk_grf.c			optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399
 arm64/rockchip/rk_pinctrl.c		optional fdt rk_pinctrl soc_rockchip_rk3328 | fdt rk_pinctrl soc_rockchip_rk3399
 arm64/rockchip/rk_gpio.c		optional fdt rk_gpio soc_rockchip_rk3328 | fdt rk_gpio soc_rockchip_rk3399
 arm64/rockchip/if_dwc_rk.c		optional fdt dwc_rk soc_rockchip_rk3328 | fdt dwc_rk soc_rockchip_rk3399
 dev/dwc/if_dwc.c			optional fdt dwc_rk soc_rockchip_rk3328 | fdt dwc_rk soc_rockchip_rk3399
 dev/dwc/if_dwc_if.m			optional fdt dwc_rk soc_rockchip_rk3328 | fdt dwc_rk soc_rockchip_rk3399
 
 # RockChip Clock support
 arm64/rockchip/clk/rk_cru.c		optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399
 arm64/rockchip/clk/rk_clk_armclk.c	optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399
 arm64/rockchip/clk/rk_clk_composite.c	optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399
 arm64/rockchip/clk/rk_clk_gate.c	optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399
 arm64/rockchip/clk/rk_clk_mux.c		optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399
 arm64/rockchip/clk/rk_clk_pll.c		optional fdt soc_rockchip_rk3328 | fdt soc_rockchip_rk3399
 arm64/rockchip/clk/rk3328_cru.c		optional fdt soc_rockchip_rk3328
 arm64/rockchip/clk/rk3399_cru.c		optional fdt soc_rockchip_rk3399
 arm64/rockchip/clk/rk3399_pmucru.c	optional fdt soc_rockchip_rk3399
Index: projects/fuse2/sys/dev/altera/dwc/if_dwc_socfpga.c
===================================================================
--- projects/fuse2/sys/dev/altera/dwc/if_dwc_socfpga.c	(nonexistent)
+++ projects/fuse2/sys/dev/altera/dwc/if_dwc_socfpga.c	(revision 350435)
@@ -0,0 +1,113 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2019 Ruslan Bukin <br@bsdpad.com>
+ *
+ * This software was developed by SRI International and the University of
+ * Cambridge Computer Laboratory (Department of Computer Science and
+ * Technology) under DARPA contract HR0011-18-C-0016 ("ECATS"), as part of the
+ * DARPA SSITH research programme.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+
+#include <machine/bus.h>
+
+#include <dev/dwc/if_dwc.h>
+#include <dev/dwc/if_dwcvar.h>
+#include <dev/ofw/ofw_bus.h>
+#include <dev/ofw/ofw_bus_subr.h>
+
+#include "if_dwc_if.h"
+
+static int
+if_dwc_socfpga_probe(device_t dev)
+{
+
+	if (!ofw_bus_status_okay(dev))
+		return (ENXIO);
+
+	if (!ofw_bus_is_compatible(dev, "altr,socfpga-stmmac"))
+		return (ENXIO);
+
+	device_set_desc(dev, "Altera SOCFPGA Ethernet MAC");
+
+	return (BUS_PROBE_DEFAULT);
+}
+
+static int
+if_dwc_socfpga_init(device_t dev)
+{
+
+	return (0);
+}
+
+static int
+if_dwc_socfpga_mac_type(device_t dev)
+{
+
+	return (DWC_GMAC);
+}
+
+static int
+if_dwc_socfpga_mii_clk(device_t dev)
+{
+	phandle_t root;
+
+	root = OF_finddevice("/");
+
+	if (ofw_bus_node_is_compatible(root, "altr,socfpga-stratix10"))
+		return (GMAC_MII_CLK_35_60M_DIV26);
+
+	/* Default value. */
+	return (GMAC_MII_CLK_25_35M_DIV16);
+}
+
+static device_method_t dwc_socfpga_methods[] = {
+	DEVMETHOD(device_probe,		if_dwc_socfpga_probe),
+
+	DEVMETHOD(if_dwc_init,		if_dwc_socfpga_init),
+	DEVMETHOD(if_dwc_mac_type,	if_dwc_socfpga_mac_type),
+	DEVMETHOD(if_dwc_mii_clk,	if_dwc_socfpga_mii_clk),
+
+	DEVMETHOD_END
+};
+
+static devclass_t dwc_socfpga_devclass;
+
+extern driver_t dwc_driver;
+
+DEFINE_CLASS_1(dwc, dwc_socfpga_driver, dwc_socfpga_methods,
+    sizeof(struct dwc_softc), dwc_driver);
+EARLY_DRIVER_MODULE(dwc_socfpga, simplebus, dwc_socfpga_driver,
+    dwc_socfpga_devclass, 0, 0, BUS_PASS_SUPPORTDEV + BUS_PASS_ORDER_MIDDLE);
+
+MODULE_DEPEND(dwc_socfpga, dwc, 1, 1, 1);

Property changes on: projects/fuse2/sys/dev/altera/dwc/if_dwc_socfpga.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+FreeBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Index: projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_data_tlv.c
===================================================================
--- projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_data_tlv.c	(revision 350434)
+++ projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_data_tlv.c	(revision 350435)
@@ -1,885 +1,886 @@
 /*-
  * Copyright (c) 2016 Landon Fuller <landonf@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
  *    redistribution must be conditioned upon including a substantially
  *    similar Disclaimer requirement for further binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
  * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGES.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifdef _KERNEL
 #include <sys/param.h>
 #include <sys/ctype.h>
+#include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
 #else /* !_KERNEL */
 #include <ctype.h>
 #include <errno.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #endif /* _KERNEL */
 
 #include "bhnd_nvram_private.h"
 
 #include "bhnd_nvram_datavar.h"
 
 #include "bhnd_nvram_data_tlvreg.h"
 
 /*
  * CFE TLV NVRAM data class.
  * 
  * The CFE-defined TLV NVRAM format is used on the WGT634U.
  */
 
 struct bhnd_nvram_tlv {
 	struct bhnd_nvram_data	 nv;	/**< common instance state */
 	struct bhnd_nvram_io	*data;	/**< backing buffer */
 	size_t			 count;	/**< variable count */
 };
 
 BHND_NVRAM_DATA_CLASS_DEFN(tlv, "WGT634U", BHND_NVRAM_DATA_CAP_DEVPATHS,
     sizeof(struct bhnd_nvram_tlv))
 
 /** Minimal TLV_ENV record header */
 struct bhnd_nvram_tlv_env_hdr {
 	uint8_t		tag;
 	uint8_t		size;
 } __packed;
 
 /** Minimal TLV_ENV record */
 struct bhnd_nvram_tlv_env {
 	struct bhnd_nvram_tlv_env_hdr	hdr;
 	uint8_t				flags;
 	char				envp[];
 } __packed;
 
 /* Return the length in bytes of an TLV_ENV's envp data */
 #define	NVRAM_TLV_ENVP_DATA_LEN(_env)	\
 	(((_env)->hdr.size < sizeof((_env)->flags)) ? 0 :	\
 	    ((_env)->hdr.size - sizeof((_env)->flags)))
 
 /* Maximum supported length of the envp data field, in bytes */
 #define	NVRAM_TLV_ENVP_DATA_MAX_LEN	\
 	(UINT8_MAX - sizeof(uint8_t) /* flags */)
 
 	
 static int				 bhnd_nvram_tlv_parse_size(
 					     struct bhnd_nvram_io *io,
 					     size_t *size);
 
 static int				 bhnd_nvram_tlv_next_record(
 					     struct bhnd_nvram_io *io,
 					     size_t *next, size_t *offset,
 					     uint8_t *tag);
 
 static struct bhnd_nvram_tlv_env	*bhnd_nvram_tlv_next_env(
 					     struct bhnd_nvram_tlv *tlv,
 					     size_t *next, void **cookiep);
 
 static struct bhnd_nvram_tlv_env	*bhnd_nvram_tlv_get_env(
 					     struct bhnd_nvram_tlv *tlv,
 					     void *cookiep);
 
 static void				*bhnd_nvram_tlv_to_cookie(
 					     struct bhnd_nvram_tlv *tlv,
 					     size_t io_offset);
 static size_t				 bhnd_nvram_tlv_to_offset(
 					     struct bhnd_nvram_tlv *tlv,
 					     void *cookiep);
 
 static int
 bhnd_nvram_tlv_probe(struct bhnd_nvram_io *io)
 {
 	struct bhnd_nvram_tlv_env	ident;
 	size_t				nbytes;
 	int				error;
 
 	nbytes = bhnd_nvram_io_getsize(io);
 
 	/* Handle what might be an empty TLV image */
 	if (nbytes < sizeof(ident)) {
 		uint8_t tag;
 
 		/* Fetch just the first tag */
 		error = bhnd_nvram_io_read(io, 0x0, &tag, sizeof(tag));
 		if (error)
 			return (error);
 
 		/* This *could* be an empty TLV image, but all we're
 		 * testing for here is a single 0x0 byte followed by EOF */
 		if (tag == NVRAM_TLV_TYPE_END)
 			return (BHND_NVRAM_DATA_PROBE_MAYBE);
 
 		return (ENXIO);
 	}
 
 	/* Otherwise, look at the initial header for a valid TLV ENV tag,
 	 * plus one byte of the entry data */
 	error = bhnd_nvram_io_read(io, 0x0, &ident,
 	    sizeof(ident) + sizeof(ident.envp[0]));
 	if (error)
 		return (error);
 
 	/* First entry should be a variable record (which we statically
 	 * assert as being defined to use a single byte size field) */
 	if (ident.hdr.tag != NVRAM_TLV_TYPE_ENV)
 		return (ENXIO);
 
 	_Static_assert(NVRAM_TLV_TYPE_ENV & NVRAM_TLV_TF_U8_LEN,
 	    "TYPE_ENV is not a U8-sized field");
 
 	/* The entry must be at least 3 characters ('x=\0') in length */
 	if (ident.hdr.size < 3)
 		return (ENXIO);
 
 	/* The first character should be a valid key char (alpha) */
 	if (!bhnd_nv_isalpha(ident.envp[0]))
 		return (ENXIO);
 
 	return (BHND_NVRAM_DATA_PROBE_DEFAULT);
 }
 
 static int
 bhnd_nvram_tlv_getvar_direct(struct bhnd_nvram_io *io, const char *name,
     void *buf, size_t *len, bhnd_nvram_type type)
 {
 	struct bhnd_nvram_tlv_env	 env;
 	char				 data[NVRAM_TLV_ENVP_DATA_MAX_LEN];
 	size_t				 data_len;
 	const char			*key, *value;
 	size_t				 keylen, vlen;
 	size_t				 namelen;
 	size_t				 next, off;
 	uint8_t				 tag;
 	int				 error;
 
 	namelen = strlen(name);
 
 	/* Iterate over the input looking for the requested variable */
 	next = 0;
 	while (!(error = bhnd_nvram_tlv_next_record(io, &next, &off, &tag))) {
 		switch (tag) {
 		case NVRAM_TLV_TYPE_END:
 			/* Not found */
 			return (ENOENT);
 
 		case NVRAM_TLV_TYPE_ENV:
 			/* Read the record header */
 			error = bhnd_nvram_io_read(io, off, &env, sizeof(env));
 			if (error) {
 				BHND_NV_LOG("error reading TLV_ENV record "
 				    "header: %d\n", error);
 				return (error);
 			}
 
 			/* Read the record data */
 			data_len = NVRAM_TLV_ENVP_DATA_LEN(&env);
 			error = bhnd_nvram_io_read(io, off + sizeof(env), data,
 			    data_len);
 			if (error) {
 				BHND_NV_LOG("error reading TLV_ENV record "
 				    "data: %d\n", error);
 				return (error);
 			}
 
 			/* Parse the key=value string */
 			error = bhnd_nvram_parse_env(data, data_len, '=', &key,
 			    &keylen, &value, &vlen);
 			if (error) {
 				BHND_NV_LOG("error parsing TLV_ENV data: %d\n",
 				    error);
 				return (error);
 			}
 
 			/* Match against requested variable name */
 			if (keylen == namelen && 
 			    strncmp(key, name, namelen) == 0)
 			{
 				return (bhnd_nvram_value_coerce(value, vlen,
 				    BHND_NVRAM_TYPE_STRING, buf, len, type));
 			}
 
 			break;
 
 		default:
 			/* Skip unknown tags */
 			break;
 		}
 	}
 
 	/* Hit I/O error */
 	return (error);
 }
 
 static int
 bhnd_nvram_tlv_serialize(bhnd_nvram_data_class *cls, bhnd_nvram_plist *props,
     bhnd_nvram_plist *options, void *outp, size_t *olen)
 {
 	bhnd_nvram_prop	*prop;
 	size_t		 limit, nbytes;
 	int		 error;
 
 	/* Determine output byte limit */
 	if (outp != NULL)
 		limit = *olen;
 	else
 		limit = 0;
 
 	nbytes = 0;
 
 	/* Write all properties */
 	prop = NULL;
 	while ((prop = bhnd_nvram_plist_next(props, prop)) != NULL) {
 		struct bhnd_nvram_tlv_env	 env;
 		const char			*name;
 		uint8_t				*p;
 		size_t				 name_len, value_len;
 		size_t				 rec_size;
 
 		env.hdr.tag = NVRAM_TLV_TYPE_ENV;
 		env.hdr.size = sizeof(env.flags);
 		env.flags = 0x0;
 
 		/* Fetch name value and add to record length */
 		name = bhnd_nvram_prop_name(prop);
 		name_len = strlen(name) + 1 /* '=' */;
 
 		if (UINT8_MAX - env.hdr.size < name_len) {
 			BHND_NV_LOG("%s name exceeds maximum TLV record "
 			    "length\n", name);
 			return (EFTYPE); /* would overflow TLV size */
 		}
 
 		env.hdr.size += name_len;
 
 		/* Add string value to record length */
 		error = bhnd_nvram_prop_encode(prop, NULL, &value_len,
 		    BHND_NVRAM_TYPE_STRING);
 		if (error) {
 			BHND_NV_LOG("error serializing %s to required type "
 			    "%s: %d\n", name,
 			    bhnd_nvram_type_name(BHND_NVRAM_TYPE_STRING),
 			    error);
 			return (error);
 		}
 
 		if (UINT8_MAX - env.hdr.size < value_len) {
 			BHND_NV_LOG("%s value exceeds maximum TLV record "
 			    "length\n", name);
 			return (EFTYPE); /* would overflow TLV size */
 		}
 
 		env.hdr.size += value_len;
 
 		/* Calculate total record size */
 		rec_size = sizeof(env.hdr) + env.hdr.size;
 		if (SIZE_MAX - nbytes < rec_size)
 			return (EFTYPE); /* would overflow size_t */
 
 		/* Calculate our output pointer */
 		if (nbytes > limit || limit - nbytes < rec_size) {
 			/* buffer is full; cannot write */
 			p = NULL;
 		} else {
 			p = (uint8_t *)outp + nbytes;
 		}
 
 		/* Write to output */
 		if (p != NULL) {
 			memcpy(p, &env, sizeof(env));
 			p += sizeof(env);
 	
 			memcpy(p, name, name_len - 1);
 			p[name_len - 1] = '=';
 			p += name_len;
 
 			error = bhnd_nvram_prop_encode(prop, p, &value_len,
 			    BHND_NVRAM_TYPE_STRING);
 			if (error) {
 				BHND_NV_LOG("error serializing %s to required "
 				    "type %s: %d\n", name,
 				    bhnd_nvram_type_name(
 					BHND_NVRAM_TYPE_STRING),
 				    error);
 				return (error);
 			}
 		}
 
 		nbytes += rec_size;
 	}
 
 	/* Write terminating END record */
 	if (limit > nbytes)
 		*((uint8_t *)outp + nbytes) = NVRAM_TLV_TYPE_END;
 
 	if (nbytes == SIZE_MAX)
 		return (EFTYPE); /* would overflow size_t */
 	nbytes++;
 
 	/* Provide required length */
 	*olen = nbytes;
 	if (limit < *olen) {
 		if (outp == NULL)
 			return (0);
 
 		return (ENOMEM);
 	}
 
 	return (0);
 }
 
 /**
  * Initialize @p tlv with the provided NVRAM TLV data mapped by @p src.
  * 
  * @param tlv A newly allocated data instance.
  */
 static int
 bhnd_nvram_tlv_init(struct bhnd_nvram_tlv *tlv, struct bhnd_nvram_io *src)
 {
 	struct bhnd_nvram_tlv_env	*env;
 	size_t				 size;
 	size_t				 next;
 	int				 error;
 
 	BHND_NV_ASSERT(tlv->data == NULL, ("tlv data already initialized"));
 
 	/* Determine the actual size of the TLV source data */
 	if ((error = bhnd_nvram_tlv_parse_size(src, &size)))
 		return (error);
 
 	/* Copy to our own internal buffer */
 	if ((tlv->data = bhnd_nvram_iobuf_copy_range(src, 0x0, size)) == NULL)
 		return (ENOMEM);
 
 	/* Initialize our backing buffer */
 	tlv->count = 0;
 	next = 0;
 	while ((env = bhnd_nvram_tlv_next_env(tlv, &next, NULL)) != NULL) {
 		size_t env_len;
 		size_t name_len;
 
 		/* TLV_ENV data must not be empty */
 		env_len = NVRAM_TLV_ENVP_DATA_LEN(env);
 		if (env_len == 0) {
 			BHND_NV_LOG("cannot parse zero-length TLV_ENV record "
 			    "data\n");
 			return (EINVAL);
 		}
 
 		/* Parse the key=value string, and then replace the '='
 		 * delimiter with '\0' to allow us to provide direct 
 		 * name pointers from our backing buffer */
 		error = bhnd_nvram_parse_env(env->envp, env_len, '=', NULL,
 		    &name_len, NULL, NULL);
 		if (error) {
 			BHND_NV_LOG("error parsing TLV_ENV data: %d\n", error);
 			return (error);
 		}
 
 		/* Replace '=' with '\0' */
 		*(env->envp + name_len) = '\0';
 
 		/* Add to variable count */
 		tlv->count++;
 	};
 
 	return (0);
 }
 
 static int
 bhnd_nvram_tlv_new(struct bhnd_nvram_data *nv, struct bhnd_nvram_io *io)
 {
 	
 	struct bhnd_nvram_tlv	*tlv;
 	int			 error;
 
 	/* Allocate and initialize the TLV data instance */
 	tlv = (struct bhnd_nvram_tlv *)nv;
 
 	/* Parse the TLV input data and initialize our backing
 	 * data representation */
 	if ((error = bhnd_nvram_tlv_init(tlv, io))) {
 		bhnd_nvram_tlv_free(nv);
 		return (error);
 	}
 
 	return (0);
 }
 
 static void
 bhnd_nvram_tlv_free(struct bhnd_nvram_data *nv)
 {
 	struct bhnd_nvram_tlv *tlv = (struct bhnd_nvram_tlv *)nv;
 	if (tlv->data != NULL)
 		bhnd_nvram_io_free(tlv->data);
 }
 
 size_t
 bhnd_nvram_tlv_count(struct bhnd_nvram_data *nv)
 {
 	struct bhnd_nvram_tlv *tlv = (struct bhnd_nvram_tlv *)nv;
 	return (tlv->count);
 }
 
 
 static bhnd_nvram_plist *
 bhnd_nvram_tlv_options(struct bhnd_nvram_data *nv)
 {
 	return (NULL);
 }
 
 static uint32_t
 bhnd_nvram_tlv_caps(struct bhnd_nvram_data *nv)
 {
 	return (BHND_NVRAM_DATA_CAP_READ_PTR|BHND_NVRAM_DATA_CAP_DEVPATHS);
 }
 
 static const char *
 bhnd_nvram_tlv_next(struct bhnd_nvram_data *nv, void **cookiep)
 {
 	struct bhnd_nvram_tlv		*tlv;
 	struct bhnd_nvram_tlv_env	*env;
 	size_t				 io_offset;
 
 	tlv = (struct bhnd_nvram_tlv *)nv;
 
 	/* Find next readable TLV record */
 	if (*cookiep == NULL) {
 		/* Start search at offset 0x0 */
 		io_offset = 0x0;
 		env = bhnd_nvram_tlv_next_env(tlv, &io_offset, cookiep);
 	} else {
 		/* Seek past the previous env record */
 		io_offset = bhnd_nvram_tlv_to_offset(tlv, *cookiep);
 		env = bhnd_nvram_tlv_next_env(tlv, &io_offset, NULL);
 		if (env == NULL)
 			BHND_NV_PANIC("invalid cookiep; record missing");
 
 		/* Advance to next env record, update the caller's cookiep */
 		env = bhnd_nvram_tlv_next_env(tlv, &io_offset, cookiep);
 	}
 
 	/* Check for EOF */
 	if (env == NULL)
 		return (NULL);
 
 	/* Return the NUL terminated name */
 	return (env->envp);
 }
 
 static void *
 bhnd_nvram_tlv_find(struct bhnd_nvram_data *nv, const char *name)
 {
 	return (bhnd_nvram_data_generic_find(nv, name));
 }
 
 static int
 bhnd_nvram_tlv_getvar_order(struct bhnd_nvram_data *nv, void *cookiep1,
     void *cookiep2)
 {
 	if (cookiep1 < cookiep2)
 		return (-1);
 
 	if (cookiep1 > cookiep2)
 		return (1);
 
 	return (0);
 }
 
 static int
 bhnd_nvram_tlv_getvar(struct bhnd_nvram_data *nv, void *cookiep, void *buf,
     size_t *len, bhnd_nvram_type type)
 {
 	return (bhnd_nvram_data_generic_rp_getvar(nv, cookiep, buf, len, type));
 }
 
 static int
 bhnd_nvram_tlv_copy_val(struct bhnd_nvram_data *nv, void *cookiep,
     bhnd_nvram_val **value)
 {
 	return (bhnd_nvram_data_generic_rp_copy_val(nv, cookiep, value));
 }
 
 static const void *
 bhnd_nvram_tlv_getvar_ptr(struct bhnd_nvram_data *nv, void *cookiep,
     size_t *len, bhnd_nvram_type *type)
 {
 	struct bhnd_nvram_tlv		*tlv;
 	struct bhnd_nvram_tlv_env	*env;
 	const char			*val;
 	int				 error;
 
 	tlv = (struct bhnd_nvram_tlv *)nv;
 
 	/* Fetch pointer to the TLV_ENV record */
 	if ((env = bhnd_nvram_tlv_get_env(tlv, cookiep)) == NULL)
 		BHND_NV_PANIC("invalid cookiep: %p", cookiep);
 
 	/* Parse value pointer and length from key\0value data */
 	error = bhnd_nvram_parse_env(env->envp, NVRAM_TLV_ENVP_DATA_LEN(env),
 	    '\0', NULL, NULL, &val, len);
 	if (error)
 		BHND_NV_PANIC("unexpected error parsing '%s'", env->envp);
 
 	/* Type is always CSTR */
 	*type = BHND_NVRAM_TYPE_STRING;
 
 	return (val);
 }
 
 static const char *
 bhnd_nvram_tlv_getvar_name(struct bhnd_nvram_data *nv, void *cookiep)
 {
 	struct bhnd_nvram_tlv		*tlv;
 	const struct bhnd_nvram_tlv_env	*env;
 
 	tlv = (struct bhnd_nvram_tlv *)nv;
 
 	/* Fetch pointer to the TLV_ENV record */
 	if ((env = bhnd_nvram_tlv_get_env(tlv, cookiep)) == NULL)
 		BHND_NV_PANIC("invalid cookiep: %p", cookiep);
 
 	/* Return name pointer */
 	return (&env->envp[0]);
 }
 
 static int
 bhnd_nvram_tlv_filter_setvar(struct bhnd_nvram_data *nv, const char *name,
     bhnd_nvram_val *value, bhnd_nvram_val **result)
 {
 	bhnd_nvram_val	*str;
 	const char	*inp;
 	bhnd_nvram_type	 itype;
 	size_t		 ilen;
 	size_t		 name_len, tlv_nremain;
 	int		 error;
 
 	tlv_nremain = NVRAM_TLV_ENVP_DATA_MAX_LEN;
 
 	/* Name (trimmed of any path prefix) must be valid */
 	if (!bhnd_nvram_validate_name(bhnd_nvram_trim_path_name(name)))
 		return (EINVAL);
 
 	/* 'name=' must fit within the maximum TLV_ENV record length */
 	name_len = strlen(name) + 1; /* '=' */
 	if (tlv_nremain < name_len) {
 		BHND_NV_LOG("'%s=' exceeds maximum TLV_ENV record length\n",
 		    name);
 		return (EINVAL);
 	}
 	tlv_nremain -= name_len;
 
 	/* Convert value to a (bcm-formatted) string */
 	error = bhnd_nvram_val_convert_new(&str, &bhnd_nvram_val_bcm_string_fmt,
 	    value, BHND_NVRAM_VAL_DYNAMIC);
 	if (error)
 		return (error);
 
 	/* The string value must fit within remaining TLV_ENV record length */
 	inp = bhnd_nvram_val_bytes(str, &ilen, &itype);
 	if (tlv_nremain < ilen) {
 		BHND_NV_LOG("'%.*s\\0' exceeds maximum TLV_ENV record length\n",
 		    BHND_NV_PRINT_WIDTH(ilen), inp);
 
 		bhnd_nvram_val_release(str);
 		return (EINVAL);
 	}
 	tlv_nremain -= name_len;
 
 	/* Success. Transfer result ownership to the caller. */
 	*result = str;
 	return (0);
 }
 
 static int
 bhnd_nvram_tlv_filter_unsetvar(struct bhnd_nvram_data *nv, const char *name)
 {
 	/* We permit deletion of any variable */
 	return (0);
 }
 
 /**
  * Iterate over the records starting at @p next, returning the parsed
  * record's @p tag, @p size, and @p offset.
  * 
  * @param		io		The I/O context to parse.
  * @param[in,out]	next		The next offset to be parsed, or 0x0
  *					to begin parsing. Upon successful
  *					return, will be set to the offset of the
  *					next record (or EOF, if
  *					NVRAM_TLV_TYPE_END was parsed).
  * @param[out]		offset		The record's value offset.
  * @param[out]		tag		The record's tag.
  * 
  * @retval 0		success
  * @retval EINVAL	if parsing @p io as TLV fails.
  * @retval non-zero	if reading @p io otherwise fails, a regular unix error
  *			code will be returned.
  */
 static int
 bhnd_nvram_tlv_next_record(struct bhnd_nvram_io *io, size_t *next, size_t
     *offset, uint8_t *tag)
 {
 	size_t		io_offset, io_size;
 	uint16_t	parsed_len;
 	uint8_t		len_hdr[2];
 	int		error;
 
 	io_offset = *next;
 	io_size = bhnd_nvram_io_getsize(io);
 
 	/* Save the record offset */
 	if (offset != NULL)
 		*offset = io_offset;
 
 	/* Fetch initial tag */
 	error = bhnd_nvram_io_read(io, io_offset, tag, sizeof(*tag));
 	if (error)
 		return (error);
 	io_offset++;
 
 	/* EOF */
 	if (*tag == NVRAM_TLV_TYPE_END) {
 		*next = io_offset;
 		return (0);
 	}
 
 	/* Read length field */
 	if (*tag & NVRAM_TLV_TF_U8_LEN) {
 		error = bhnd_nvram_io_read(io, io_offset, &len_hdr,
 		    sizeof(len_hdr[0]));
 		if (error) {
 			BHND_NV_LOG("error reading TLV record size: %d\n",
 			    error);
 			return (error);
 		}
 
 		parsed_len = len_hdr[0];
 		io_offset++;
 	} else {
 		error = bhnd_nvram_io_read(io, io_offset, &len_hdr,
 		    sizeof(len_hdr));
 		if (error) {
 			BHND_NV_LOG("error reading 16-bit TLV record "
 			    "size: %d\n", error);
 			return (error);
 		}
 
 		parsed_len = (len_hdr[0] << 8) | len_hdr[1];
 		io_offset += 2;
 	}
 
 	/* Advance to next record */
 	if (parsed_len > io_size || io_size - parsed_len < io_offset) {
 		/* Hit early EOF */
 		BHND_NV_LOG("TLV record length %hu truncated by input "
 		    "size of %zu\n", parsed_len, io_size);
 		return (EINVAL);
 	}
 
 	*next = io_offset + parsed_len;
 
 	/* Valid record found */
 	return (0);
 }
 
 /**
  * Parse the TLV data in @p io to determine the total size of the TLV
  * data mapped by @p io (which may be less than the size of @p io).
  */
 static int
 bhnd_nvram_tlv_parse_size(struct bhnd_nvram_io *io, size_t *size)
 {
 	size_t		next;
 	uint8_t		tag;
 	int		error;
 
 	/* We have to perform a minimal parse to determine the actual length */
 	next = 0x0;
 	*size = 0x0;
 
 	/* Iterate over the input until we hit END tag or the read fails */
 	do {
 		error = bhnd_nvram_tlv_next_record(io, &next, NULL, &tag);
 		if (error)
 			return (error);
 	} while (tag != NVRAM_TLV_TYPE_END);
 
 	/* Offset should now point to EOF */
 	BHND_NV_ASSERT(next <= bhnd_nvram_io_getsize(io),
 	    ("parse returned invalid EOF offset"));
 
 	*size = next;
 	return (0);
 }
 
 /**
  * Iterate over the records in @p tlv, returning a pointer to the next
  * NVRAM_TLV_TYPE_ENV record, or NULL if EOF is reached.
  * 
  * @param		tlv		The TLV instance.
  * @param[in,out]	next		The next offset to be parsed, or 0x0
  *					to begin parsing. Upon successful
  *					return, will be set to the offset of the
  *					next record.
  */
 static struct bhnd_nvram_tlv_env *
 bhnd_nvram_tlv_next_env(struct bhnd_nvram_tlv *tlv, size_t *next,
     void **cookiep)
 {
 	uint8_t	tag;
 	int	error;
 
 	/* Find the next TLV_ENV record, starting at @p next */
 	do {
 		void	*c;
 		size_t	 offset;
 
 		/* Fetch the next TLV record */
 		error = bhnd_nvram_tlv_next_record(tlv->data, next, &offset,
 		    &tag);
 		if (error) {
 			BHND_NV_LOG("unexpected error in next_record(): %d\n",
 			    error);
 			return (NULL);
 		}
 
 		/* Only interested in ENV records */
 		if (tag != NVRAM_TLV_TYPE_ENV)
 			continue;
 
 		/* Map and return TLV_ENV record pointer */
 		c = bhnd_nvram_tlv_to_cookie(tlv, offset);
 
 		/* Provide the cookiep value for the returned record */
 		if (cookiep != NULL)
 			*cookiep = c;
 
 		return (bhnd_nvram_tlv_get_env(tlv, c));
 	} while (tag != NVRAM_TLV_TYPE_END);
 
 	/* No remaining ENV records */
 	return (NULL);
 }
 
 /**
  * Return a pointer to the TLV_ENV record for @p cookiep, or NULL
  * if none vailable.
  */
 static struct bhnd_nvram_tlv_env *
 bhnd_nvram_tlv_get_env(struct bhnd_nvram_tlv *tlv, void *cookiep)
 {
 	struct bhnd_nvram_tlv_env	*env;
 	void				*ptr;
 	size_t				 navail;
 	size_t				 io_offset, io_size;
 	int				 error;
 	
 	io_size = bhnd_nvram_io_getsize(tlv->data);
 	io_offset = bhnd_nvram_tlv_to_offset(tlv, cookiep);
 
 	/* At EOF? */
 	if (io_offset == io_size)
 		return (NULL);
 
 	/* Fetch non-const pointer to the record entry */
 	error = bhnd_nvram_io_write_ptr(tlv->data, io_offset, &ptr,
 	    sizeof(env->hdr), &navail);
 	if (error) {
 		/* Should never occur with a valid cookiep */
 		BHND_NV_LOG("error mapping record for cookiep: %d\n", error);
 		return (NULL);
 	}
 
 	/* Validate the record pointer */
 	env = ptr;
 	if (env->hdr.tag != NVRAM_TLV_TYPE_ENV) {
 		/* Should never occur with a valid cookiep */
 		BHND_NV_LOG("non-ENV record mapped for %p\n", cookiep);
 		return (NULL);
 	}
 
 	/* Is the required variable name data is mapped? */
 	if (navail < sizeof(struct bhnd_nvram_tlv_env_hdr) + env->hdr.size ||
 	    env->hdr.size == sizeof(env->flags))
 	{
 		/* Should never occur with a valid cookiep */
 		BHND_NV_LOG("TLV_ENV variable data not mapped for %p\n",
 		    cookiep);
 		return (NULL);
 	}
 
 	return (env);
 }
 
 /**
  * Return a cookiep for the given I/O offset.
  */
 static void *
 bhnd_nvram_tlv_to_cookie(struct bhnd_nvram_tlv *tlv, size_t io_offset)
 {
 	const void	*ptr;
 	int		 error;
 
 	BHND_NV_ASSERT(io_offset < bhnd_nvram_io_getsize(tlv->data),
 	    ("io_offset %zu out-of-range", io_offset));
 	BHND_NV_ASSERT(io_offset < UINTPTR_MAX,
 	    ("io_offset %#zx exceeds UINTPTR_MAX", io_offset));
 
 	error = bhnd_nvram_io_read_ptr(tlv->data, 0x0, &ptr, io_offset, NULL);
 	if (error)
 		BHND_NV_PANIC("error mapping offset %zu: %d", io_offset, error);
 
 	ptr = (const uint8_t *)ptr + io_offset;
 	return (__DECONST(void *, ptr));
 }
 
 /* Convert a cookiep back to an I/O offset */
 static size_t
 bhnd_nvram_tlv_to_offset(struct bhnd_nvram_tlv *tlv, void *cookiep)
 {
 	const void	*ptr;
 	intptr_t	 offset;
 	size_t		 io_size;
 	int		 error;
 
 	BHND_NV_ASSERT(cookiep != NULL, ("null cookiep"));
 
 	io_size = bhnd_nvram_io_getsize(tlv->data);
 
 	error = bhnd_nvram_io_read_ptr(tlv->data, 0x0, &ptr, io_size, NULL);
 	if (error)
 		BHND_NV_PANIC("error mapping offset %zu: %d", io_size, error);
 
 	offset = (const uint8_t *)cookiep - (const uint8_t *)ptr;
 	BHND_NV_ASSERT(offset >= 0, ("invalid cookiep"));
 	BHND_NV_ASSERT((uintptr_t)offset < SIZE_MAX, ("cookiep > SIZE_MAX)"));
 	BHND_NV_ASSERT((uintptr_t)offset <= io_size, ("cookiep > io_size)"));
 
 	return ((size_t)offset);
 }
Index: projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_store.c
===================================================================
--- projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_store.c	(revision 350434)
+++ projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_store.c	(revision 350435)
@@ -1,1268 +1,1269 @@
 /*-
  * Copyright (c) 2015-2016 Landon Fuller <landonf@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
  *    redistribution must be conditioned upon including a substantially
  *    similar Disclaimer requirement for further binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
  * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGES.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/hash.h>
+#include <sys/limits.h>
 #include <sys/queue.h>
 
 #ifdef _KERNEL
 
 #include <sys/ctype.h>
 #include <sys/systm.h>
 
 #include <machine/_inttypes.h>
 
 #else /* !_KERNEL */
 
 #include <ctype.h>
 #include <errno.h>
 #include <inttypes.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
 #endif /* _KERNEL */
 
 #include "bhnd_nvram_private.h"
 #include "bhnd_nvram_datavar.h"
 
 #include "bhnd_nvram_storevar.h"
 
 /*
  * BHND NVRAM Store
  *
  * Manages in-memory and persistent representations of NVRAM data.
  */
 
 static int			 bhnd_nvstore_parse_data(
 				     struct bhnd_nvram_store *sc);
 
 static int			 bhnd_nvstore_parse_path_entries(
 				     struct bhnd_nvram_store *sc);
 
 static int			 bhnd_nvram_store_export_child(
 				     struct bhnd_nvram_store *sc,
 				     bhnd_nvstore_path *top,
 				     bhnd_nvstore_path *child,
 				     bhnd_nvram_plist *plist,
 				     uint32_t flags);
 
 static int			 bhnd_nvstore_export_merge(
 				     struct bhnd_nvram_store *sc,
 				     bhnd_nvstore_path *path,
 				     bhnd_nvram_plist *merged,
 				     uint32_t flags);
 
 static int			 bhnd_nvstore_export_devpath_alias(
 				     struct bhnd_nvram_store *sc,
 				     bhnd_nvstore_path *path,
 				     const char *devpath,
 				     bhnd_nvram_plist *plist,
 				     u_long *alias_val);
 
 /**
  * Allocate a new NVRAM data store instance and populate it from @p data.
  *
  * The caller is responsible for deallocating the instance via
  * bhnd_nvram_store_free().
  * 
  * @param[out] store On success, a pointer to the newly allocated NVRAM data
  * store instance. Not written on failure.
  * @param data The NVRAM data to be managed by the returned NVRAM data store
  * instance.
  *
  * @retval 0 success
  * @retval non-zero if an error occurs during allocation or initialization, a
  * regular unix error code will be returned.
  */
 int
 bhnd_nvram_store_new(struct bhnd_nvram_store **store,
     struct bhnd_nvram_data *data)
 {
 	struct bhnd_nvram_store	*sc;
 	size_t			 slot;
 	int			 rc;
 
 	sc = bhnd_nv_calloc(1, sizeof(*sc));
 	if (sc == NULL)
 		return (ENOMEM);
 
 	BHND_NVSTORE_LOCK_INIT(sc);
 	BHND_NVSTORE_LOCK(sc);
 
 	/* Start with an empty path hash table */
 	sc->num_paths = 0;
 	for (slot = 0; slot < nitems(sc->paths); slot++)
 		LIST_INIT(&sc->paths[slot]);
 
 	/* Start with an empty alias hash table */
 	sc->num_aliases = 0;
 	for (slot = 0; slot < nitems(sc->aliases); slot++)
 		LIST_INIT(&sc->aliases[slot]);
 
 	/* Take our own reference to the backing NVRAM data */
 	sc->data = bhnd_nvram_data_retain(data);
 	sc->data_caps = bhnd_nvram_data_caps(data);
 
 	/* Use the data's own options if it has any; otherwise, fall back to
 	 * a new, empty property list */
 	sc->data_opts = bhnd_nvram_data_options(data);
 	if (sc->data_opts == NULL) {
 		sc->data_opts = bhnd_nvram_plist_new();
 		if (sc->data_opts == NULL) {
 			rc = ENOMEM;
 			goto fail;
 		}
 	} else {
 		bhnd_nvram_plist_retain(sc->data_opts);
 	}
 
 	/* The root path is always required */
 	rc = bhnd_nvstore_register_path(sc, BHND_NVSTORE_ROOT_PATH,
 	    BHND_NVSTORE_ROOT_PATH_LEN);
 	if (rc != 0)
 		goto fail;
 
 	sc->root_path = bhnd_nvstore_get_path(sc, BHND_NVSTORE_ROOT_PATH,
 	    BHND_NVSTORE_ROOT_PATH_LEN);
 	BHND_NV_ASSERT(sc->root_path, ("missing root path"));
 
 	/* Generate all path entries, alias entries, and variable indexes
 	 * from the variables vended by the backing NVRAM data */
 	rc = bhnd_nvstore_parse_data(sc);
 	if (rc != 0)
 		goto fail;
 
 	*store = sc;
 
 	BHND_NVSTORE_UNLOCK(sc);
 	return (0);
 
 fail:
 	/* Drop the lock before tearing down the partially built store */
 	BHND_NVSTORE_UNLOCK(sc);
 	bhnd_nvram_store_free(sc);
 	return (rc);
 }
 
 /**
  * Parse the NVRAM data mapped by @p io, and allocate a new NVRAM data store
  * instance managing the parsed result.
  *
  * The caller is responsible for deallocating the instance via
  * bhnd_nvram_store_free().
  * 
  * The NVRAM data mapped by @p io will be copied, and @p io may be safely
  * deallocated after this function returns.
  * 
  * @param[out] store On success, a pointer to the newly allocated NVRAM data
  * store instance.
  * @param io An I/O context mapping the NVRAM data to be copied and parsed.
  * @param cls The NVRAM data class to be used when parsing @p io, or NULL
  * to perform runtime identification of the appropriate data class.
  *
  * @retval 0 success
  * @retval non-zero if an error occurs during parsing, allocation, or
  * initialization, a regular unix error code will be returned.
  */
 int
 bhnd_nvram_store_parse_new(struct bhnd_nvram_store **store,
     struct bhnd_nvram_io *io, bhnd_nvram_data_class *cls)
 {
 	struct bhnd_nvram_data	*parsed;
 	int			 rc;
 
 	/* Parse the raw I/O data */
 	rc = bhnd_nvram_data_new(cls, &parsed, io);
 	if (rc != 0)
 		return (rc);
 
 	/* Create the store, then drop our local data reference whether or
 	 * not store creation succeeded */
 	rc = bhnd_nvram_store_new(store, parsed);
 	bhnd_nvram_data_release(parsed);
 
 	return (rc);
 }
 
 /**
  * Free an NVRAM store instance, releasing all associated resources.
  * 
  * @param sc A store instance previously allocated via
  * bhnd_nvram_store_new().
  */
 void
 bhnd_nvram_store_free(struct bhnd_nvram_store *sc)
 {
 	
 	/* Clean up alias hash table */
 	for (size_t i = 0; i < nitems(sc->aliases); i++) {
 		bhnd_nvstore_alias *alias, *anext;
 		LIST_FOREACH_SAFE(alias, &sc->aliases[i], na_link, anext)
 			bhnd_nv_free(alias);
 	}
 
 	/* Clean up path hash table */
 	for (size_t i = 0; i < nitems(sc->paths); i++) {
 		bhnd_nvstore_path *path, *pnext;
 		LIST_FOREACH_SAFE(path, &sc->paths[i], np_link, pnext)
 			bhnd_nvstore_path_free(path);
 	}
 
 	if (sc->data != NULL)
 		bhnd_nvram_data_release(sc->data);
 
 	if (sc->data_opts != NULL)
 		bhnd_nvram_plist_release(sc->data_opts);
 
 	BHND_NVSTORE_LOCK_DESTROY(sc);
 	bhnd_nv_free(sc);
 }
 
 /**
  * Parse all variables vended by our backing NVRAM data instance,
  * generating all path entries, alias entries, and variable indexes.
  * 
  * @param	sc	The NVRAM store instance to be initialized with
  *			paths, aliases, and data parsed from its backing
  *			data.
  *
  * @retval 0		success
  * @retval non-zero	if an error occurs during parsing, a regular unix error
  *			code will be returned.
  */
 static int
 bhnd_nvstore_parse_data(struct bhnd_nvram_store *sc)
 {
 	/*
 	 * Overview of the passes performed below:
 	 *  1. register all device paths and path aliases;
 	 *  2. count the variables defined in each path;
 	 *  3. allocate, populate, and prepare the per-path indexes (this
 	 *     step is skipped entirely when only the root path exists and
 	 *     an index would be redundant or not worthwhile).
 	 */
 	const char	*name;
 	void		*cookiep;
 	int		 error;
 
 	/* Parse and register all device paths and path aliases. This enables
 	 * resolution of _forward_ references to device paths aliases when
 	 * scanning variable entries below */
 	if ((error = bhnd_nvstore_parse_path_entries(sc)))
 		return (error);
 
 	/* Calculate the per-path variable counts, and report dangling alias
 	 * references as an error. */
 	cookiep = NULL;
 	while ((name = bhnd_nvram_data_next(sc->data, &cookiep))) {
 		bhnd_nvstore_path	*path;
 		bhnd_nvstore_name_info	 info;
 
 		/* Parse the name info */
 		error = bhnd_nvstore_parse_name_info(name,
 		    BHND_NVSTORE_NAME_INTERNAL, sc->data_caps, &info);
 		if (error)
 			return (error);
 
 		switch (info.type) {
 		case BHND_NVSTORE_VAR:
 			/* Fetch referenced path */
 			path = bhnd_nvstore_var_get_path(sc, &info);
 			if (path == NULL) {
 				BHND_NV_LOG("variable '%s' has dangling "
 					    "path reference\n", name);
 				return (EFTYPE);
 			}
 
 			/* Increment path variable count */
 			/* (refusing to wrap the size_t counter) */
 			if (path->num_vars == SIZE_MAX) {
 				BHND_NV_LOG("more than SIZE_MAX variables in "
 				    "path %s\n", path->path_str);
 				return (EFTYPE);
 			}
 			path->num_vars++;
 			break;
 
 		case BHND_NVSTORE_ALIAS_DECL:
 			/* Skip -- path alias already parsed and recorded */
 			break;
 		}
 	}
 
 	/* If the backing NVRAM data instance vends only a single root ("/")
 	 * path, we may be able to skip generating an index for the root
 	 * path */
 	if (sc->num_paths == 1) {
 		bhnd_nvstore_path *path;
 
 		/* If the backing instance provides its own name-based lookup
 		 * indexing, we can skip generating a duplicate here */
 		if (sc->data_caps & BHND_NVRAM_DATA_CAP_INDEXED)
 			return (0);
 
 		/* If the sole root path contains fewer variables than the
 		 * minimum indexing threshhold, we do not need to generate an
 		 * index */
 		path = bhnd_nvstore_get_root_path(sc);
 		if (path->num_vars < BHND_NV_IDX_VAR_THRESHOLD)
 			return (0);
 	}
 
 	/* Allocate per-path index instances */
 	/* (each index is sized using the count gathered above) */
 	for (size_t i = 0; i < nitems(sc->paths); i++) {
 		bhnd_nvstore_path	*path;
 
 		LIST_FOREACH(path, &sc->paths[i], np_link) {
 			path->index = bhnd_nvstore_index_new(path->num_vars);
 			if (path->index == NULL)
 				return (ENOMEM);
 		}
 	}
 
 	/* Populate per-path indexes */
 	/* (restart iteration of the backing data from its first entry) */
 	cookiep = NULL;
 	while ((name = bhnd_nvram_data_next(sc->data, &cookiep))) {
 		bhnd_nvstore_name_info	 info;
 		bhnd_nvstore_path	*path;
 
 		/* Parse the name info */
 		error = bhnd_nvstore_parse_name_info(name,
 		    BHND_NVSTORE_NAME_INTERNAL, sc->data_caps, &info);
 		if (error)
 			return (error);
 
 		switch (info.type) {
 		case BHND_NVSTORE_VAR:
 			/* Fetch referenced path */
 			/* (dangling references were already rejected by the
 			 * counting pass above, hence only an assertion) */
 			path = bhnd_nvstore_var_get_path(sc, &info);
 			BHND_NV_ASSERT(path != NULL,
 			    ("dangling path reference"));
 
 			/* Append to index */
 			error = bhnd_nvstore_index_append(sc, path->index,
 			    cookiep);
 			if (error)
 				return (error);
 			break;
 
 		case BHND_NVSTORE_ALIAS_DECL:
 			/* Skip */
 			break;
 		}
 	}
 
 	/* Prepare indexes for querying */
 	for (size_t i = 0; i < nitems(sc->paths); i++) {
 		bhnd_nvstore_path	*path;
 
 		LIST_FOREACH(path, &sc->paths[i], np_link) {
 			error = bhnd_nvstore_index_prepare(sc, path->index);
 			if (error)
 				return (error);
 		}
 	}
 
 	return (0);
 }
 
 
 /**
  * Register a path or path alias entry for every declaration found in the
  * NVRAM data backing @p sc.
  * 
  * The store lock must be held by the caller.
  * 
  * @param sc		The NVRAM store instance.
  *
  * @retval 0		success
  * @retval non-zero	If parsing fails, a regular unix error code will be
  *			returned.
  */
 static int
 bhnd_nvstore_parse_path_entries(struct bhnd_nvram_store *sc)
 {
 	bhnd_nvstore_name_info	 info;
 	const char		*varname;
 	void			*cookiep;
 	int			 rc;
 
 	BHND_NVSTORE_LOCK_ASSERT(sc, MA_OWNED);
 
 	/* Nothing to register if the data source does not support device
 	 * paths. */
 	if ((sc->data_caps & BHND_NVRAM_DATA_CAP_DEVPATHS) == 0) {
 		BHND_NV_ASSERT(sc->root_path != NULL, ("missing root path"));
 		return (0);
 	}
 
 	/* Walk every variable vended by the backing data */
 	for (cookiep = NULL;
 	    (varname = bhnd_nvram_data_next(sc->data, &cookiep)) != NULL; )
 	{
 		/* Parse the name info */
 		rc = bhnd_nvstore_parse_name_info(varname,
 		    BHND_NVSTORE_NAME_INTERNAL, sc->data_caps, &info);
 		if (rc != 0)
 			return (rc);
 
 		/* Register the path */
 		rc = bhnd_nvstore_var_register_path(sc, &info, cookiep);
 		if (rc != 0) {
 			BHND_NV_LOG("failed to register path for %s: %d\n",
 			    varname, rc);
 			return (rc);
 		}
 	}
 
 	return (0);
 }
 
 
 /**
  * Merge exported per-path variables (uncommitted, committed, or both) into 
  * the empty @p merged property list.
  * 
  * @param	sc	The NVRAM store instance.
  * @param	path	The NVRAM path to be exported.
  * @param	merged	The property list to populate with the merged results.
  * @param	flags	Export flags. See BHND_NVSTORE_EXPORT_*.
  * 
  * @retval 0		success
  * @retval ENOMEM	If allocation fails.
  * @retval non-zero	If merging the variables defined in @p path otherwise
  *			fails, a regular unix error code will be returned.
  */
 static int
 bhnd_nvstore_export_merge(struct bhnd_nvram_store *sc,
     bhnd_nvstore_path *path, bhnd_nvram_plist *merged, uint32_t flags)
 {
 	void	*cookiep, *idxp;
 	int	 error;
 
 	/* Populate merged list with all pending variables */
 	if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_UNCOMMITTED)) {
 		bhnd_nvram_prop *prop;
 
 		prop = NULL;
 		while ((prop = bhnd_nvram_plist_next(path->pending, prop))) {
 			/* Skip variables marked for deletion */
 			/* (unless export of deleted entries was requested) */
 			if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_DELETED)) {
 				if (bhnd_nvram_prop_is_null(prop))
 					continue;
 			}
 
 			/* Append to merged list */
 			error = bhnd_nvram_plist_append(merged, prop);
 			if (error)
 				return (error);
 		}
 	}
 
 	/* Skip merging committed variables? */
 	if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMMITTED))
 		return (0);
 
 	/* Merge in the committed NVRAM variables */
 	idxp = NULL;
 	while ((cookiep = bhnd_nvstore_path_data_next(sc, path, &idxp))) {
 		const char	*name;
 		bhnd_nvram_val	*val;
 
 		/* Fetch the variable name */
 		name = bhnd_nvram_data_getvar_name(sc->data, cookiep);
 
 		/* Trim device path prefix */
 		if (sc->data_caps & BHND_NVRAM_DATA_CAP_DEVPATHS)
 			name = bhnd_nvram_trim_path_name(name);
 
 		/* Skip if already defined in pending updates */
 		/* (this also hides committed values whose pending update is
 		 * a deletion that was skipped above) */
 		if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_UNCOMMITTED)) {
 			if (bhnd_nvram_plist_contains(path->pending, name))
 				continue;
 		}
 
 		/* Skip if higher precedence value was already defined. This
 		 * may occur if the underlying data store contains duplicate
 		 * keys; iteration will always return the definition with
 		 * the highest precedence first */
 		if (bhnd_nvram_plist_contains(merged, name))
 			continue;
 
 		/* Fetch the variable's value representation */
 		if ((error = bhnd_nvram_data_copy_val(sc->data, cookiep, &val)))
 			return (error);
 
 		/* Add to path variable list */
 		/* (our local reference to val is dropped whether or not the
 		 * append succeeded) */
 		error = bhnd_nvram_plist_append_val(merged, name, val);
 		bhnd_nvram_val_release(val);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 /**
  * Find a free alias value for @p path, and append the devpathXX alias
  * declaration to @p plist.
  * 
  * If an alias is already reserved for @p path in @p sc, that alias value is
  * reused; otherwise, the lowest alias value that is neither reserved in
  * @p sc nor already declared in @p plist is selected.
  * 
  * @param	sc		The NVRAM store instance.
  * @param	path		The NVRAM path for which a devpath alias
  *				variable should be produced.
  * @param	devpath		The devpathXX path value for @p path.
  * @param	plist		The property list to which @p path's devpath
  *				variable will be appended.
  * @param[out]	alias_val	On success, will be set to the alias value
  *				allocated for @p path.
  * 
  * @retval 0		success
  * @retval ENOMEM	If allocation fails, or if no free alias value
  *			remains.
  * @retval non-zero	If appending the devpathXX variable to @p plist
  *			otherwise fails, a regular unix error code will be
  *			returned.
  */
 static int
 bhnd_nvstore_export_devpath_alias(struct bhnd_nvram_store *sc,
     bhnd_nvstore_path *path, const char *devpath, bhnd_nvram_plist *plist,
     u_long *alias_val)
 {
 	bhnd_nvstore_alias	*alias;
 	char			*pathvar;
 	int			 error;
 
 	/* Begin the search for a free alias value at 0 */
 	*alias_val = 0;
 
 	/* Prefer alias value already reserved for this path. */
 	alias = bhnd_nvstore_find_alias(sc, path->path_str);
 	if (alias != NULL) {
 		*alias_val = alias->alias;
 
 		/* Allocate devpathXX variable name */
 		bhnd_nv_asprintf(&pathvar, "devpath%lu", *alias_val);
 		if (pathvar == NULL)
 			return (ENOMEM);
 
 		/* Append alias variable to property list */
 		error = bhnd_nvram_plist_append_string(plist, pathvar, devpath);
 
 		BHND_NV_ASSERT(error != EEXIST, ("reserved alias %lu:%s in use",
 		   * alias_val, path->path_str));
 
 		bhnd_nv_free(pathvar);
 		return (error);
 	}
 
 	/* Find the next free devpathXX alias entry */
 	while (1) {
 		/* Skip existing reserved alias values */
 		while (bhnd_nvstore_get_alias(sc, *alias_val) != NULL) {
 			/* Alias values exhausted; avoid wrapping to 0 */
 			if (*alias_val == ULONG_MAX)
 				return (ENOMEM);
 
 			(*alias_val)++;
 		}
 
 		/* Allocate devpathXX variable name */
 		bhnd_nv_asprintf(&pathvar, "devpath%lu", *alias_val);
 		if (pathvar == NULL)
 			return (ENOMEM);
 
 		/* If not in-use, we can terminate the search */
 		/* (pathvar remains allocated for the append below) */
 		if (!bhnd_nvram_plist_contains(plist, pathvar))
 			break;
 
 		/* Keep searching */
 		bhnd_nv_free(pathvar);
 
 		/* Alias values exhausted; avoid wrapping to 0 */
 		if (*alias_val == ULONG_MAX)
 			return (ENOMEM);
 
 		(*alias_val)++;
 	}
 
 	/* Append alias variable to property list */
 	error = bhnd_nvram_plist_append_string(plist, pathvar, devpath);
 
 	bhnd_nv_free(pathvar);
 	return (error);
 }
 
 /**
  * Export a single @p child path's properties, appending the result to @p plist.
  * 
  * The store lock must be held by the caller. Paths that are not equal to, or
  * a child of, @p top are silently skipped, as are sub-paths when
  * EXPORT_CHILDREN was not requested.
  * 
  * @param	sc		The NVRAM store instance.
  * @param	top		The root NVRAM path being exported.
  * @param	child		The NVRAM path to be exported.
  * @param	plist		The property list to which @p child's exported
  *				properties should be appended.
  * @param	flags		Export flags. See BHND_NVSTORE_EXPORT_*.
  * 
  * @retval 0		success
  * @retval EINVAL	If @p flags does not contain a device path encoding
  *			flag.
  * @retval ENOMEM	If allocation fails.
  * @retval non-zero	If merging the variables defined in @p child otherwise
  *			fails, a regular unix error code will be returned.
  */
 static int
 bhnd_nvram_store_export_child(struct bhnd_nvram_store *sc,
     bhnd_nvstore_path *top, bhnd_nvstore_path *child, bhnd_nvram_plist *plist,
     uint32_t flags)
 {
 	bhnd_nvram_plist	*path_vars;
 	bhnd_nvram_prop		*prop;
 	const char		*relpath;
 	char			*prefix, *namebuf;
 	size_t			 prefix_len, relpath_len;
 	size_t			 namebuf_size;
 	bool			 emit_compact_devpath;
 	int			 error;
 
 	BHND_NVSTORE_LOCK_ASSERT(sc, MA_OWNED);
 
 	/* Everything released at 'finished' must start out NULL */
 	prefix = NULL;
 	prefix_len = 0;
 	path_vars = NULL;
 	namebuf = NULL;
 
 	/* Determine the path relative to the top-level path */
 	relpath = bhnd_nvstore_parse_relpath(top->path_str, child->path_str);
 	if (relpath == NULL) {
 		/* Skip -- not a child of the root path */
 		return (0);
 	}
 	relpath_len = strlen(relpath);
 
 	/* Skip sub-path if export of children was not requested */
 	if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_CHILDREN) && relpath_len > 0)
 		return (0);
 
 	/* Collect all variables to be included in the export */
 	if ((path_vars = bhnd_nvram_plist_new()) == NULL)
 		return (ENOMEM);
 
 	if ((error = bhnd_nvstore_export_merge(sc, child, path_vars, flags))) {
 		bhnd_nvram_plist_release(path_vars);
 		return (error);
 	}
 
 	/* Skip if this path has no variables to be exported */
 	if (bhnd_nvram_plist_count(path_vars) == 0) {
 		bhnd_nvram_plist_release(path_vars);
 		return (0);
 	}
 
 	/* Determine appropriate device path encoding */
 	emit_compact_devpath = false;
 	if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMPACT_DEVPATHS)) {
 		/* Re-encode as compact (if non-empty path) */
 		if (relpath_len > 0)
 			emit_compact_devpath = true;
 	} else if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_EXPAND_DEVPATHS)) {
 		/* Re-encode with fully expanded device path */
 		emit_compact_devpath = false;
 	} else if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_PRESERVE_DEVPATHS)) {
 		/* Preserve existing encoding of this path */
 		if (bhnd_nvstore_find_alias(sc, child->path_str) != NULL)
 			emit_compact_devpath = true;
 	} else {
 		BHND_NV_LOG("invalid device path flag: %#" PRIx32, flags);
 		error = EINVAL;
 		goto finished;
 	}
 
 	/* Allocate variable device path prefix to use for all property names,
 	 * and if using compact encoding, emit the devpathXX= variable */
 	if (emit_compact_devpath) {
 		u_long	alias_val;
 		int	len;
 
 		/* Reserve an alias value and append the devpathXX= variable to
 		 * the property list */
 		error = bhnd_nvstore_export_devpath_alias(sc, child, relpath,
 		    plist, &alias_val);
 		if (error)
 			goto finished;
 
 		/* Allocate variable name prefix */
 		len = bhnd_nv_asprintf(&prefix, "%lu:", alias_val);
 		if (prefix == NULL) {
 			error = ENOMEM;
 			goto finished;
 		}
 	
 		prefix_len = len;
 	} else if (relpath_len > 0) {
 		int len;
 
 		/* Allocate the variable name prefix, appending '/' to the
 		 * relative path */
 		len = bhnd_nv_asprintf(&prefix, "%s/", relpath);
 		if (prefix == NULL) {
 			error = ENOMEM;
 			goto finished;
 		}
 
 		prefix_len = len;
 	}
 
 	/* If prefixing of variable names is required, allocate a name
 	 * formatting buffer */
 	namebuf_size = 0;
 	if (prefix != NULL) {
 		size_t	maxlen;
 
 		/* Find the maximum name length */
 		maxlen = 0;
 		prop = NULL;
 		while ((prop = bhnd_nvram_plist_next(path_vars, prop))) {
 			const char *name;
 
 			name = bhnd_nvram_prop_name(prop);
 			maxlen = bhnd_nv_ummax(strlen(name), maxlen);
 		}
 
 		/* Allocate name buffer (path-prefix + name + '\0') */
 		namebuf_size = prefix_len + maxlen + 1;
 		namebuf = bhnd_nv_malloc(namebuf_size);
 		if (namebuf == NULL) {
 			error = ENOMEM;
 			goto finished;
 		}
 	}
 
 	/* Append all path variables to the export plist, prepending the
 	 * device-path prefix to the variable names, if required */
 	prop = NULL;
 	while ((prop = bhnd_nvram_plist_next(path_vars, prop)) != NULL) {
 		const char *name;
 
 		/* Prepend device prefix to the variable name */
 		name = bhnd_nvram_prop_name(prop);
 		if (prefix != NULL) {
 			int len;
 
 			/*
 			 * Write prefixed variable name to our name buffer.
 			 * 
 			 * We precalculate the size when scanning all names
 			 * above, so this should always succeed.
 			 */
 			len = snprintf(namebuf, namebuf_size, "%s%s", prefix,
 			    name);
 			if (len < 0 || (size_t)len >= namebuf_size)
 				BHND_NV_PANIC("invalid max_name_len");
 
 			name = namebuf;
 		}
 
 		/* Add property to export plist */
 		error = bhnd_nvram_plist_append_val(plist, name,
 		    bhnd_nvram_prop_val(prop));
 		if (error)
 			goto finished;
 	}
 
 	/* Success */
 	error = 0;
 
 finished:
 	if (prefix != NULL)
 		bhnd_nv_free(prefix);
 
 	if (namebuf != NULL)
 		bhnd_nv_free(namebuf);
 
 	if (path_vars != NULL)
 		bhnd_nvram_plist_release(path_vars);
 
 	return (error);
 }
 
 /**
  * Export a flat, ordered NVRAM property list representation of all NVRAM
  * properties at @p path.
  * 
  * @param	sc	The NVRAM store instance.
  * @param	path	The NVRAM path to export, or NULL to select the root
  *			path.
  * @param[out]	cls	On success, will be set to the backing data class
  *			of @p sc. If the data class is not desired,
  *			a NULL pointer may be provided.
  * @param[out]	props	On success, will be set to a caller-owned property
  *			list containing the exported properties. The caller is
  *			responsible for releasing this value via
  *			bhnd_nvram_plist_release(). Set to NULL on failure.
  * @param[out]	options	On success, will be set to a caller-owned property
  *			list containing the current NVRAM serialization options
  *			for @p sc. The caller is responsible for releasing this
  *			value via bhnd_nvram_plist_release(). If the options
  *			are not desired, a NULL pointer may be provided. Set to
  *			NULL on failure.
  * @param	flags	Export flags. See BHND_NVSTORE_EXPORT_*.
  * 
  * @retval 0		success
  * @retval EINVAL	If @p flags is invalid: more than one device path
  *			encoding flag is set, or EXPORT_DELETED is set
  *			without EXPORT_UNCOMMITTED.
  * @retval ENOENT	The requested path was not found.
  * @retval ENOMEM	If allocation fails.
  * @retval non-zero	If export of  @p path otherwise fails, a regular unix
  *			error code will be returned.
  */
 int
 bhnd_nvram_store_export(struct bhnd_nvram_store *sc, const char *path,
     bhnd_nvram_data_class **cls, bhnd_nvram_plist **props,
     bhnd_nvram_plist **options, uint32_t flags)
 {
 	bhnd_nvram_plist	*unordered;
 	bhnd_nvstore_path	*top;
 	bhnd_nvram_prop		*prop;
 	const char		*name;
 	void			*cookiep;
 	size_t			 num_dpath_flags;
 	int			 error;
 	
 	*props = NULL;
 	unordered = NULL;
 	num_dpath_flags = 0;
 	if (options != NULL)
 		*options = NULL;
 
 	/* Default to exporting root path */
 	if (path == NULL)
 		path = BHND_NVSTORE_ROOT_PATH;
 
 	/* Default to exporting all properties */
 	if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMMITTED) &&
 	    !BHND_NVSTORE_GET_FLAG(flags, EXPORT_UNCOMMITTED))
 	{
 		flags |= BHND_NVSTORE_EXPORT_ALL_VARS;
 	}
 
 	/* Default to preserving the current device path encoding */
 	if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMPACT_DEVPATHS) &&
 	    !BHND_NVSTORE_GET_FLAG(flags, EXPORT_EXPAND_DEVPATHS))
 	{
 		flags |= BHND_NVSTORE_EXPORT_PRESERVE_DEVPATHS;
 	}
 
 	/* Exactly one device path encoding flag must be set */
 	if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_COMPACT_DEVPATHS))
 		num_dpath_flags++;
 
 	if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_EXPAND_DEVPATHS))
 		num_dpath_flags++;
 
 	if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_PRESERVE_DEVPATHS))
 		num_dpath_flags++;
 
 	if (num_dpath_flags != 1)
 		return (EINVAL);
 
 	/* If EXPORT_DELETED is set, EXPORT_UNCOMMITTED must be set too;
 	 * deletions are only recorded in the uncommitted (pending) set */
 	if (BHND_NVSTORE_GET_FLAG(flags, EXPORT_DELETED) &&
 	    !BHND_NVSTORE_GET_FLAG(flags, EXPORT_UNCOMMITTED))
 	{
 		return (EINVAL);
 	}
 
 	/* Lock internal state before querying paths/properties */
 	BHND_NVSTORE_LOCK(sc);
 
 	/* Fetch referenced path */
 	top = bhnd_nvstore_get_path(sc, path, strlen(path));
 	if (top == NULL) {
 		error = ENOENT;
 		goto failed;
 	}
 
 	/* Allocate new, empty property list */
 	if ((unordered = bhnd_nvram_plist_new()) == NULL) {
 		error = ENOMEM;
 		goto failed;
 	}
 
 	/* Export the top-level path first */
 	error = bhnd_nvram_store_export_child(sc, top, top, unordered, flags);
 	if (error)
 		goto failed;
 
 	/* Attempt to export any children of the root path */
 	for (size_t i = 0; i < nitems(sc->paths); i++) {
 		bhnd_nvstore_path *child;
 
 		LIST_FOREACH(child, &sc->paths[i], np_link) {
 			/* Top-level path was already exported */
 			if (child == top)
 				continue;
 
 			error = bhnd_nvram_store_export_child(sc, top,
 			    child, unordered, flags);
 			if (error)
 				goto failed;
 		}
 	}
 
 	/* If requested, provide the current class and serialization options */
 	if (cls != NULL)
 		*cls = bhnd_nvram_data_get_class(sc->data);
 
 	if (options != NULL)
 		*options = bhnd_nvram_plist_retain(sc->data_opts);
 
 	/*
 	 * If we're re-encoding device paths, don't bother preserving the
 	 * existing NVRAM variable order; our variable names will not match
 	 * the existing backing NVRAM data.
 	 */
 	if (!BHND_NVSTORE_GET_FLAG(flags, EXPORT_PRESERVE_DEVPATHS)) {
 		/* Ownership of the list transfers to the caller */
 		*props = unordered;
 		unordered = NULL;
 
 		goto finished;
 	}
 
 	/* 
 	 * Re-order the flattened output to match the existing NVRAM variable
 	 * ordering.
 	 * 
 	 * We append all new variables at the end of the input; this should
 	 * reduce the delta that needs to be written (e.g. to flash) when
 	 * committing NVRAM updates, and should result in a serialization
 	 * identical to the input serialization if uncommitted updates are
 	 * excluded from the export.
 	 */
 	if ((*props = bhnd_nvram_plist_new()) == NULL) {
 		error = ENOMEM;
 		goto failed;
 	}
 
 	/* Using the backing NVRAM data ordering to order all variables
 	 * currently defined in the backing store */ 
 	cookiep = NULL;
 	while ((name = bhnd_nvram_data_next(sc->data, &cookiep))) {
 		prop = bhnd_nvram_plist_get_prop(unordered, name);
 		if (prop == NULL)
 			continue;
 
 		/* Append to ordered result */
 		if ((error = bhnd_nvram_plist_append(*props, prop)))
 			goto failed;
 	
 		/* Remove from unordered list */
 		bhnd_nvram_plist_remove(unordered, name);
 	}
 
 	/* Any remaining variables are new, and should be appended to the
 	 * end of the export list */
 	prop = NULL;
 	while ((prop = bhnd_nvram_plist_next(unordered, prop)) != NULL) {
 		if ((error = bhnd_nvram_plist_append(*props, prop)))
 			goto failed;
 	}
 
 	/* Export complete */
 finished:
 	BHND_NVSTORE_UNLOCK(sc);
 
 	if (unordered != NULL)
 		bhnd_nvram_plist_release(unordered);
 
 	return (0);
 
 failed:
 	BHND_NVSTORE_UNLOCK(sc);
 
 	if (unordered != NULL)
 		bhnd_nvram_plist_release(unordered);
 
 	/* Release any partially populated outputs, and reset them to NULL
 	 * so that the caller is never handed a released pointer */
 	if (options != NULL && *options != NULL) {
 		bhnd_nvram_plist_release(*options);
 		*options = NULL;
 	}
 
 	if (*props != NULL) {
 		bhnd_nvram_plist_release(*props);
 		*props = NULL;
 	}
 
 	return (error);
 }
 
 /**
  * Serialize all NVRAM properties at @p path, using the data class and
  * serialization options reported by bhnd_nvram_store_export() for @p sc.
  * 
  * @param	sc	The NVRAM store instance.
  * @param	path	The NVRAM path to export, or NULL to select the root
  *			path.
  * @param[out]	data	On success, will be set to the newly serialized value.
  *			The caller is responsible for freeing this value
  *			via bhnd_nvram_io_free().
  * @param	flags	Export flags. See BHND_NVSTORE_EXPORT_*.
  *
  * @retval 0		success
  * @retval EINVAL	If @p flags is invalid.
  * @retval ENOENT	The requested path was not found.
  * @retval ENOMEM	If allocation fails.
  * @retval non-zero	If serialization of  @p path otherwise fails, a regular
  *			unix error code will be returned.
  */
 int
 bhnd_nvram_store_serialize(struct bhnd_nvram_store *sc, const char *path,
    struct bhnd_nvram_io **data,  uint32_t flags)
 {
 	bhnd_nvram_data_class	*cls;
 	bhnd_nvram_plist	*props, *opts;
 	struct bhnd_nvram_io	*buf;
 	void			*wptr;
 	size_t			 nbytes;
 	int			 rc;
 
 	props = NULL;
 	opts = NULL;
 	buf = NULL;
 
 	/* Gather the properties, class, and options to be serialized */
 	rc = bhnd_nvram_store_export(sc, path, &cls, &props, &opts, flags);
 	if (rc != 0)
 		return (rc);
 
 	/* First pass: NULL output buffer, fetching the size in nbytes */
 	rc = bhnd_nvram_data_serialize(cls, props, opts, NULL, &nbytes);
 	if (rc != 0)
 		goto fail;
 
 	/* Allocate the output buffer */
 	buf = bhnd_nvram_iobuf_empty(nbytes, nbytes);
 	if (buf == NULL) {
 		rc = ENOMEM;
 		goto fail;
 	}
 
 	/* Fetch a write pointer into the output buffer */
 	rc = bhnd_nvram_io_write_ptr(buf, 0, &wptr, nbytes, NULL);
 	if (rc != 0)
 		goto fail;
 
 	/* Second pass: serialize into the output buffer */
 	rc = bhnd_nvram_data_serialize(cls, props, opts, wptr, &nbytes);
 	if (rc != 0)
 		goto fail;
 
 	/* Resize the buffer to the size reported by the second pass */
 	rc = bhnd_nvram_io_setsize(buf, nbytes);
 	if (rc != 0)
 		goto fail;
 
 	bhnd_nvram_plist_release(props);
 	bhnd_nvram_plist_release(opts);
 
 	*data = buf;
 	return (0);
 
 fail:
 	if (props != NULL)
 		bhnd_nvram_plist_release(props);
 
 	if (opts != NULL)
 		bhnd_nvram_plist_release(opts);
 
 	if (buf != NULL)
 		bhnd_nvram_io_free(buf);
 
 	return (rc);
 }
 
 /**
  * Read an NVRAM variable, preferring any uncommitted update over the value
  * held by the backing NVRAM data.
  *
  * @param		sc	The NVRAM parser state.
  * @param		name	The NVRAM variable name.
  * @param[out]		outp	On success, the requested value will be written
  *				to this buffer. This argument may be NULL if
  *				the value is not desired.
  * @param[in,out]	olen	The capacity of @p outp. On success, will be set
  *				to the actual size of the requested value.
  * @param		otype	The requested data type to be written to
  *				@p outp.
  *
  * @retval 0		success
  * @retval ENOENT	The requested variable was not found, or is marked
  *			for deletion by an uncommitted update.
  * @retval ENOMEM	If @p outp is non-NULL and a buffer of @p olen is too
  *			small to hold the requested value.
  * @retval non-zero	If reading @p name otherwise fails, a regular unix
  *			error code will be returned.
  */
 int
 bhnd_nvram_store_getvar(struct bhnd_nvram_store *sc, const char *name,
     void *outp, size_t *olen, bhnd_nvram_type otype)
 {
 	bhnd_nvstore_name_info	 info;
 	bhnd_nvstore_path	*path;
 	bhnd_nvram_prop		*pending;
 	void			*cookiep;
 	int			 rc;
 
 	BHND_NVSTORE_LOCK(sc);
 
 	/* Parse the variable name */
 	rc = bhnd_nvstore_parse_name_info(name, BHND_NVSTORE_NAME_EXTERNAL,
 	    sc->data_caps, &info);
 	if (rc != 0)
 		goto done;
 
 	/* Fetch the variable's enclosing path entry */
 	path = bhnd_nvstore_var_get_path(sc, &info);
 	if (path == NULL) {
 		rc = ENOENT;
 		goto done;
 	}
 
 	/* Uncommitted updates take precedence over the backing data */
 	pending = bhnd_nvstore_path_get_update(sc, path, info.name);
 	if (pending != NULL) {
 		/* A NULL-valued update denotes a pending deletion */
 		if (bhnd_nvram_prop_is_null(pending)) {
 			rc = ENOENT;
 		} else {
 			rc = bhnd_nvram_prop_encode(pending, outp, olen,
 			    otype);
 		}
 		goto done;
 	}
 
 	/* Fall back to the backing NVRAM data */
 	cookiep = bhnd_nvstore_path_data_lookup(sc, path, info.name);
 	if (cookiep == NULL) {
 		/* Not found */
 		rc = ENOENT;
 		goto done;
 	}
 
 	rc = bhnd_nvram_data_getvar(sc->data, cookiep, outp, olen, otype);
 
 done:
 	BHND_NVSTORE_UNLOCK(sc);
 	return (rc);
 }
 
 /**
  * Common bhnd_nvram_store_set*() and bhnd_nvram_store_unsetvar()
  * implementation.
  * 
  * If @p value is NULL, the variable will be marked for deletion.
  * 
  * The store lock must be held by the caller.
  * 
  * @param	sc	The NVRAM store instance.
  * @param	name	The NVRAM variable name.
  * @param	value	The new value, or NULL to mark @p name for deletion.
  *
  * @retval 0		success
  * @retval ENOENT	If the path referenced by @p name was not found.
  * @retval non-zero	If parsing @p name or registering the update
  *			otherwise fails, a regular unix error code will be
  *			returned.
  */
 static int
 bhnd_nvram_store_setval_common(struct bhnd_nvram_store *sc, const char *name,
     bhnd_nvram_val *value)
 {
 	bhnd_nvstore_path	*path;
 	bhnd_nvstore_name_info	 info;
 	int			 error;
 
 	BHND_NVSTORE_LOCK_ASSERT(sc, MA_OWNED);
 
 	/* Parse the variable name */
 	error = bhnd_nvstore_parse_name_info(name, BHND_NVSTORE_NAME_EXTERNAL,
 	    sc->data_caps, &info);
 	if (error)
 		return (error);
 
 	/* Fetch the variable's enclosing path entry. 'error' is 0 at this
 	 * point, so a failed lookup must be reported explicitly rather than
 	 * by returning 'error' (which would signal success to the caller) */
 	if ((path = bhnd_nvstore_var_get_path(sc, &info)) == NULL)
 		return (ENOENT);
 
 	/* Register the update entry */
 	return (bhnd_nvstore_path_register_update(sc, path, info.name, value));
 }
 
 /**
  * Set an NVRAM variable to @p value, holding the store lock for the
  * duration of the update.
  * 
  * @param	sc	The NVRAM parser state.
  * @param	name	The NVRAM variable name.
  * @param	value	The new value.
  *
  * @retval 0		success
  * @retval non-zero	The error code returned by
  *			bhnd_nvram_store_setval_common().
  */
 int
 bhnd_nvram_store_setval(struct bhnd_nvram_store *sc, const char *name,
     bhnd_nvram_val *value)
 {
 	int rc;
 
 	BHND_NVSTORE_LOCK(sc);
 	rc = bhnd_nvram_store_setval_common(sc, name, value);
 	BHND_NVSTORE_UNLOCK(sc);
 
 	return (rc);
 }
 
 /**
  * Set an NVRAM variable from a raw input buffer.
  * 
  * The buffer is wrapped in a temporary value initialized with
  * BHND_NVRAM_VAL_FIXED|BHND_NVRAM_VAL_BORROW_DATA, which is released before
  * returning.
  * 
  * @param	sc	The NVRAM parser state.
  * @param	name	The NVRAM variable name.
  * @param	inp	The new value.
  * @param	ilen	The size of @p inp.
  * @param	itype	The data type of @p inp.
  *
  * @retval 0		success
  * @retval EINVAL	If a value cannot be initialized from @p inp.
  * @retval non-zero	Otherwise, the error code returned by
  *			bhnd_nvram_store_setval_common().
  */
 int
 bhnd_nvram_store_setvar(struct bhnd_nvram_store *sc, const char *name,
     const void *inp, size_t ilen, bhnd_nvram_type itype)
 {
 	bhnd_nvram_val	 wrapped;
 	int		 rc;
 
 	/* Wrap the caller's buffer in a temporary value */
 	rc = bhnd_nvram_val_init(&wrapped, NULL, inp, ilen, itype,
 	    BHND_NVRAM_VAL_FIXED|BHND_NVRAM_VAL_BORROW_DATA);
 	if (rc != 0) {
 		BHND_NV_LOG("error initializing value: %d\n", rc);
 		return (EINVAL);
 	}
 
 	BHND_NVSTORE_LOCK(sc);
 	rc = bhnd_nvram_store_setval_common(sc, name, &wrapped);
 	BHND_NVSTORE_UNLOCK(sc);
 
 	/* The temporary value is no longer needed */
 	bhnd_nvram_val_release(&wrapped);
 
 	return (rc);
 }
 
 /**
  * Unset an NVRAM variable by registering BHND_NVRAM_VAL_NULL as its new
  * value, holding the store lock for the duration of the update.
  * 
  * @param		sc	The NVRAM parser state.
  * @param		name	The NVRAM variable name.
  *
  * @retval 0		success
  * @retval non-zero	The error code returned by
  *			bhnd_nvram_store_setval_common().
  */
 int
 bhnd_nvram_store_unsetvar(struct bhnd_nvram_store *sc, const char *name)
 {
 	int rc;
 
 	BHND_NVSTORE_LOCK(sc);
 	rc = bhnd_nvram_store_setval_common(sc, name, BHND_NVRAM_VAL_NULL);
 	BHND_NVSTORE_UNLOCK(sc);
 
 	return (rc);
 }
Index: projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_value.c
===================================================================
--- projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_value.c	(revision 350434)
+++ projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_value.c	(revision 350435)
@@ -1,1936 +1,1937 @@
 /*-
  * Copyright (c) 2015-2016 Landon Fuller <landonf@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
  *    redistribution must be conditioned upon including a substantially
  *    similar Disclaimer requirement for further binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
  * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGES.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/limits.h>
 #include <sys/sbuf.h>
 
 #ifdef _KERNEL
 
 #include <sys/ctype.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
 
 #include <machine/_inttypes.h>
 
 #else /* !_KERNEL */
 
 #include <ctype.h>
 #include <inttypes.h>
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
 
 #endif /* _KERNEL */
 
 #include "bhnd_nvram_private.h"
 
 #include "bhnd_nvram_valuevar.h"
 
 /* Forward declarations for file-local helpers. */
 
 /* Format selection/delegation for a given input buffer */
 static int	 bhnd_nvram_val_fmt_filter(const bhnd_nvram_val_fmt **fmt,
 		     const void *inp, size_t ilen, bhnd_nvram_type itype);
 
 /* Backing data management for a value instance */
 static void	*bhnd_nvram_val_alloc_bytes(bhnd_nvram_val *value, size_t ilen,
 		     bhnd_nvram_type itype, uint32_t flags);
 static int	 bhnd_nvram_val_set(bhnd_nvram_val *value, const void *inp,
 		     size_t ilen, bhnd_nvram_type itype, uint32_t flags);
 static int	 bhnd_nvram_val_set_inline(bhnd_nvram_val *value,
 		     const void *inp, size_t ilen, bhnd_nvram_type itype);
 
 
 /* Per-input-type encoders; all share the same signature: an input buffer
  * described by (inp, ilen, itype) and an output described by
  * (outp, olen, otype) */
 static int	 bhnd_nvram_val_encode_data(const void *inp, size_t ilen,
 		     bhnd_nvram_type itype, void *outp, size_t *olen,
 		     bhnd_nvram_type otype);
 static int	 bhnd_nvram_val_encode_int(const void *inp, size_t ilen,
 		     bhnd_nvram_type itype, void *outp, size_t *olen,
 		     bhnd_nvram_type otype);
 static int	 bhnd_nvram_val_encode_null(const void *inp, size_t ilen,
 		     bhnd_nvram_type itype, void *outp, size_t *olen,
 		     bhnd_nvram_type otype);
 static int	 bhnd_nvram_val_encode_bool(const void *inp, size_t ilen,
 		     bhnd_nvram_type itype, void *outp, size_t *olen,
 		     bhnd_nvram_type otype);
 static int	 bhnd_nvram_val_encode_string(const void *inp, size_t ilen,
 		     bhnd_nvram_type itype, void *outp, size_t *olen,
 		     bhnd_nvram_type otype);
 
 /** Initialize an empty value instance with @p _fmt, @p _storage, and
  *  an implicit callee-owned reference.
  *
  *  Expands to a parenthesized compound literal with no trailing semicolon,
  *  so it may be used anywhere an expression of type bhnd_nvram_val is
  *  valid; the caller supplies the statement terminator. */
 #define	BHND_NVRAM_VAL_INITIALIZER(_fmt, _storage)		\
 	((bhnd_nvram_val) {					\
 		.refs = 1,					\
 		.val_storage = (_storage),			\
 		.fmt = (_fmt),					\
 		.data_storage = BHND_NVRAM_VAL_DATA_NONE,	\
 	})
 
 /** Assert that @p _value's backing representation state has been
  *  initialized as empty: no data storage, zero length, and a NULL data
  *  pointer. @p _value is a pointer to the value instance. */
 #define	BHND_NVRAM_VAL_ASSERT_EMPTY(_value)			\
 	BHND_NV_ASSERT(						\
 	    (_value)->data_storage == BHND_NVRAM_VAL_DATA_NONE &&	\
 	    (_value)->data_len == 0 &&				\
 	    (_value)->data.ptr == NULL,				\
 	    ("previously initialized value"))
 
 /** Evaluates to non-zero if either BHND_NVRAM_VAL_BORROW_DATA or
  *  BHND_NVRAM_VAL_STATIC_DATA is set in @p _flags (i.e. we should attempt
  *  to directly reference external data). */
 #define	BHND_NVRAM_VAL_EXTREF_BORROWED_DATA(_flags)		\
 	(((_flags) & (BHND_NVRAM_VAL_BORROW_DATA |		\
 	    BHND_NVRAM_VAL_STATIC_DATA)) != 0)
 
 /** Mask of the flags accepted by val-based initialization, i.e. by
  *  bhnd_nvram_val_convert_init() and bhnd_nvram_val_convert_new() */
 #define	BHND_NVRAM_VALID_CONV_FLAGS				\
 	(BHND_NVRAM_VAL_COPY_DATA | BHND_NVRAM_VAL_DYNAMIC |	\
 	 BHND_NVRAM_VAL_FIXED)
 
 /** Evaluates to non-zero if @p _val must be duplicated by
  *  bhnd_nvram_val_copy() -- its value structure has AUTO storage or its
  *  data is an EXT_WEAK reference -- and to zero if its reference count
  *  may simply be incremented instead */
 #define	BHND_NVRAM_VAL_NEED_COPY(_val)				\
 	(!((_val)->val_storage != BHND_NVRAM_VAL_STORAGE_AUTO &&	\
 	   (_val)->data_storage != BHND_NVRAM_VAL_DATA_EXT_WEAK))
 
 /*
  * NOTE(review): these are file-scope variable definitions whose names and
  * comments mirror members accessed on bhnd_nvram_val elsewhere in this file
  * (e.g. value->refs, value->val_storage, value->data_storage). Nothing in
  * the visible code references them as globals -- confirm whether they are
  * a leftover paste of the struct body and can be removed.
  */
 volatile u_int			 refs;		/**< reference count */
 bhnd_nvram_val_storage		 val_storage;	/**< value structure storage */
 const bhnd_nvram_val_fmt	*fmt;		/**< value format */
 bhnd_nvram_val_data_storage	 data_storage;	/**< data storage */
 bhnd_nvram_type			 data_type;	/**< data type */
 size_t				 data_len;	/**< data size */
 
 /* Shared NULL value instance: a statically stored value using the NULL
  * format, with zero-length inline data of type BHND_NVRAM_TYPE_NULL */
 bhnd_nvram_val bhnd_nvram_val_null = {
 	/* value structure */
 	.fmt		= &bhnd_nvram_val_null_fmt,
 	.refs		= 1,
 	.val_storage	= BHND_NVRAM_VAL_STORAGE_STATIC,
 
 	/* backing data */
 	.data_type	= BHND_NVRAM_TYPE_NULL,
 	.data_storage	= BHND_NVRAM_VAL_DATA_INLINE,
 	.data_len	= 0,
 };
 
 /**
  * Return the human-readable name of @p fmt.
  */
 const char *
 bhnd_nvram_val_fmt_name(const bhnd_nvram_val_fmt *fmt)
 {
 	const char *label;
 
 	/* The name is stored directly on the format descriptor */
 	label = fmt->name;
 
 	return (label);
 }
 
 /**
  * Return the default format for values of @p type.
  */
 const bhnd_nvram_val_fmt *
 bhnd_nvram_val_default_fmt(bhnd_nvram_type type)
 {
 	const bhnd_nvram_val_fmt *dflt;
 
 	dflt = NULL;
 
 	switch (type) {
 	/* Unsigned integer scalars */
 	case BHND_NVRAM_TYPE_UINT8:
 		dflt = &bhnd_nvram_val_uint8_fmt;
 		break;
 	case BHND_NVRAM_TYPE_UINT16:
 		dflt = &bhnd_nvram_val_uint16_fmt;
 		break;
 	case BHND_NVRAM_TYPE_UINT32:
 		dflt = &bhnd_nvram_val_uint32_fmt;
 		break;
 	case BHND_NVRAM_TYPE_UINT64:
 		dflt = &bhnd_nvram_val_uint64_fmt;
 		break;
 
 	/* Signed integer scalars */
 	case BHND_NVRAM_TYPE_INT8:
 		dflt = &bhnd_nvram_val_int8_fmt;
 		break;
 	case BHND_NVRAM_TYPE_INT16:
 		dflt = &bhnd_nvram_val_int16_fmt;
 		break;
 	case BHND_NVRAM_TYPE_INT32:
 		dflt = &bhnd_nvram_val_int32_fmt;
 		break;
 	case BHND_NVRAM_TYPE_INT64:
 		dflt = &bhnd_nvram_val_int64_fmt;
 		break;
 
 	/* Remaining scalar types */
 	case BHND_NVRAM_TYPE_CHAR:
 		dflt = &bhnd_nvram_val_char_fmt;
 		break;
 	case BHND_NVRAM_TYPE_STRING:
 		dflt = &bhnd_nvram_val_string_fmt;
 		break;
 	case BHND_NVRAM_TYPE_BOOL:
 		dflt = &bhnd_nvram_val_bool_fmt;
 		break;
 	case BHND_NVRAM_TYPE_NULL:
 		dflt = &bhnd_nvram_val_null_fmt;
 		break;
 	case BHND_NVRAM_TYPE_DATA:
 		dflt = &bhnd_nvram_val_data_fmt;
 		break;
 
 	/* Unsigned integer arrays */
 	case BHND_NVRAM_TYPE_UINT8_ARRAY:
 		dflt = &bhnd_nvram_val_uint8_array_fmt;
 		break;
 	case BHND_NVRAM_TYPE_UINT16_ARRAY:
 		dflt = &bhnd_nvram_val_uint16_array_fmt;
 		break;
 	case BHND_NVRAM_TYPE_UINT32_ARRAY:
 		dflt = &bhnd_nvram_val_uint32_array_fmt;
 		break;
 	case BHND_NVRAM_TYPE_UINT64_ARRAY:
 		dflt = &bhnd_nvram_val_uint64_array_fmt;
 		break;
 
 	/* Signed integer arrays */
 	case BHND_NVRAM_TYPE_INT8_ARRAY:
 		dflt = &bhnd_nvram_val_int8_array_fmt;
 		break;
 	case BHND_NVRAM_TYPE_INT16_ARRAY:
 		dflt = &bhnd_nvram_val_int16_array_fmt;
 		break;
 	case BHND_NVRAM_TYPE_INT32_ARRAY:
 		dflt = &bhnd_nvram_val_int32_array_fmt;
 		break;
 	case BHND_NVRAM_TYPE_INT64_ARRAY:
 		dflt = &bhnd_nvram_val_int64_array_fmt;
 		break;
 
 	/* Remaining array types */
 	case BHND_NVRAM_TYPE_CHAR_ARRAY:
 		dflt = &bhnd_nvram_val_char_array_fmt;
 		break;
 	case BHND_NVRAM_TYPE_STRING_ARRAY:
 		dflt = &bhnd_nvram_val_string_array_fmt;
 		break;
 	case BHND_NVRAM_TYPE_BOOL_ARRAY:
 		dflt = &bhnd_nvram_val_bool_array_fmt;
 		break;
 	}
 
 	if (dflt != NULL)
 		return (dflt);
 
 	/* No case matched. Quiesce gcc4.2 */
 	BHND_NV_PANIC("bhnd nvram type %u unknown", type);
 }
 
 /**
  * Determine whether @p fmt (or new format delegated to by @p fmt) is
  * capable of direct initialization from buffer @p inp.
  * 
  * @param[in,out]	fmt	Indirect pointer to the NVRAM value format. If
  *				the format instance cannot handle the data type
  *				directly, it may delegate to a new format
  *				instance. On success, this parameter will be
  *				set to the format that should be used when
  *				performing initialization from @p inp.
  * @param		inp	Input data.
  * @param		ilen	Input data length.
  * @param		itype	Input data type.
  *
  * @retval 0		If initialization from @p inp is supported.
  * @retval EFTYPE	If initialization from @p inp is unsupported.
  * @retval EFAULT	if @p ilen is not correctly aligned for elements of
  *			@p itype.
  */
 static int
 bhnd_nvram_val_fmt_filter(const bhnd_nvram_val_fmt **fmt, const void *inp,
     size_t ilen, bhnd_nvram_type itype)
 {
 	const bhnd_nvram_val_fmt	*cur, *next;
 	int				 rc;
 
 	cur = *fmt;
 	next = cur;
 
 	/* Reject input that fails the alignment check for itype */
 	rc = bhnd_nvram_value_check_aligned(inp, ilen, itype);
 	if (rc != 0)
 		return (rc);
 
 	/* A format without a filter op can only be initialized directly
 	 * from its own native type */
 	if (cur->op_filter == NULL)
 		return ((itype == cur->native_type) ? 0 : EFTYPE);
 
 	/* Ask the format whether itype is acceptable; it may also replace
 	 * 'next' with a format to delegate to */
 	rc = cur->op_filter(&next, inp, ilen, itype);
 	if (rc != 0)
 		return (rc);
 
 	/* No delegation requested: the provided format handles the input */
 	if (next == cur)
 		return (0);
 
 	/* Delegated: re-run the filter against the new format, and only
 	 * publish it to the caller once it has accepted the input */
 	rc = bhnd_nvram_val_fmt_filter(&next, inp, ilen, itype);
 	if (rc != 0)
 		return (rc);
 
 	*fmt = next;
 
 	return (0);
 }
 
 /* Shared implementation behind bhnd_nvram_val_init() and
  * bhnd_nvram_val_new(): resets @p value to an empty instance using
  * @p val_storage, then either sets it directly from @p inp or coerces
  * @p inp into the format's native type. */
 static int
 bhnd_nvram_val_init_common(bhnd_nvram_val *value,
     bhnd_nvram_val_storage val_storage, const bhnd_nvram_val_fmt *fmt,
     const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags)
 {
 	void		*buf;
 	bhnd_nvram_type	 target;
 	size_t		 nbytes;
 	int		 rc;
 
 	/* Fall back on the default format for the input type when the
 	 * caller did not specify one */
 	if (fmt == NULL)
 		fmt = bhnd_nvram_val_default_fmt(itype);
 
 	/* Let the format accept the input type (possibly delegating to a
 	 * new format). If it declines, the value must instead be
 	 * initialized with the format's native type. */
 	rc = bhnd_nvram_val_fmt_filter(&fmt, inp, ilen, itype);
 	target = (rc != 0) ? fmt->native_type : itype;
 
 	/* Start from an empty value instance */
 	*value = BHND_NVRAM_VAL_INITIALIZER(fmt, val_storage);
 
 	/* Input is already of the target type; no coercion is required */
 	if (target == itype)
 		return (bhnd_nvram_val_set(value, inp, ilen, itype, flags));
 
 	/* First pass: query the size of the coerced representation */
 	rc = bhnd_nvram_value_coerce(inp, ilen, itype, NULL, &nbytes, target);
 	if (rc != 0)
 		return (rc);
 
 	/* Fetch reference to (or allocate) a buffer of that size */
 	buf = bhnd_nvram_val_alloc_bytes(value, nbytes, target, flags);
 	if (buf == NULL)
 		return (ENOMEM);
 
 	/* Second pass: perform the actual encode into the buffer */
 	return (bhnd_nvram_value_coerce(inp, ilen, itype, buf, &nbytes,
 	    target));
 }
 
 /**
  * Initialize an externally allocated instance of @p value with @p fmt from the
  * given @p inp buffer of @p itype and @p ilen.
  *
  * On success, the caller owns a reference to @p value, and is responsible for
  * freeing any resources allocated for @p value via bhnd_nvram_val_release().
  *
  * @param	value	The externally allocated value instance to be
  *			initialized.
  * @param	fmt	The value's format, or NULL to use the default format
  *			for @p itype.
  * @param	inp	Input buffer.
  * @param	ilen	Input buffer length.
  * @param	itype	Input buffer type.
  * @param	flags	Value flags (see BHND_NVRAM_VAL_*).
  * 
  * @retval 0		success
  * @retval ENOMEM	If allocation fails.
  * @retval EFTYPE	If @p fmt initialization from @p itype is unsupported.
  * @retval EFAULT	if @p ilen is not correctly aligned for elements of
  *			@p itype.
  * @retval ERANGE	If value coercion would overflow (or underflow) the
  *			@p fmt representation.
  */
 int
 bhnd_nvram_val_init(bhnd_nvram_val *value, const bhnd_nvram_val_fmt *fmt,
     const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags)
 {
 	int rc;
 
 	/* The caller provides the value structure itself (AUTO storage) */
 	rc = bhnd_nvram_val_init_common(value, BHND_NVRAM_VAL_STORAGE_AUTO,
 	    fmt, inp, ilen, itype, flags);
 	if (rc == 0)
 		return (0);
 
 	/* Initialization failed; release whatever was set up */
 	bhnd_nvram_val_release(value);
 
 	return (rc);
 }
 
 /**
  * Allocate a value instance with @p fmt, and attempt to initialize its internal
  * representation from the given @p inp buffer of @p itype and @p ilen.
  *
  * On success, the caller owns a reference to @p value, and is responsible for
  * freeing any resources allocated for @p value via bhnd_nvram_val_release().
  *
  * On failure, @p value is set to NULL; the partially initialized instance
  * is released before returning, so the caller never observes a pointer to
  * freed memory.
  *
  * @param[out]	value	On success, the allocated value instance. On
  *			failure, NULL.
  * @param	fmt	The value's format, or NULL to use the default format
  *			for @p itype.
  * @param	inp	Input buffer.
  * @param	ilen	Input buffer length.
  * @param	itype	Input buffer type.
  * @param	flags	Value flags (see BHND_NVRAM_VAL_*).
  * 
  * @retval 0		success
  * @retval ENOMEM	If allocation fails.
  * @retval EFTYPE	If @p fmt initialization from @p itype is unsupported.
  * @retval EFAULT	if @p ilen is not correctly aligned for elements of
  *			@p itype.
  * @retval ERANGE	If value coercion would overflow (or underflow) the
  *			@p fmt representation.
  */
 int
 bhnd_nvram_val_new(bhnd_nvram_val **value, const bhnd_nvram_val_fmt *fmt,
     const void *inp, size_t ilen, bhnd_nvram_type itype, uint32_t flags)
 {
 	int error;
 
 	/* Allocate new instance */
 	if ((*value = bhnd_nv_malloc(sizeof(**value))) == NULL)
 		return (ENOMEM);
 
 	/* Perform common initialization. */
 	error = bhnd_nvram_val_init_common(*value,
 	    BHND_NVRAM_VAL_STORAGE_DYNAMIC, fmt, inp, ilen, itype, flags);
 	if (error) {
 		/* Will also free() the value allocation */
 		bhnd_nvram_val_release(*value);
 
 		/* Don't leave the caller holding a dangling pointer */
 		*value = NULL;
 	}
 
 	return (error);
 }
 
 
 /* Shared implementation behind bhnd_nvram_val_convert_init() and
  * bhnd_nvram_val_convert_new(): initializes @p value with @p fmt from the
  * existing value @p src, either directly from src's bytes or by encoding
  * src into the format's native type. */
 static int
 bhnd_nvram_val_convert_common(bhnd_nvram_val *value,
     bhnd_nvram_val_storage val_storage, const bhnd_nvram_val_fmt *fmt,
     bhnd_nvram_val *src, uint32_t flags)
 {
 	const void	*src_bytes;
 	void		*buf;
 	bhnd_nvram_type	 src_type, native;
 	size_t		 src_len, nbytes;
 	int		 rc;
 
 	/* Can the new format be initialized directly from the source
 	 * value's existing representation? */
 	src_bytes = bhnd_nvram_val_bytes(src, &src_len, &src_type);
 	rc = bhnd_nvram_val_fmt_filter(&fmt, src_bytes, src_len, src_type);
 	if (rc == 0) {
 		/* If the source data has static storage duration, apply
 		 * that transitively to a borrowing value; every other
 		 * source storage class leaves the flags untouched */
 		if (src->data_storage == BHND_NVRAM_VAL_DATA_EXT_STATIC &&
 		    (flags & BHND_NVRAM_VAL_BORROW_DATA) != 0)
 			flags |= BHND_NVRAM_VAL_STATIC_DATA;
 
 		/* Delegate to standard initialization */
 		return (bhnd_nvram_val_init_common(value, val_storage, fmt,
 		    src_bytes, src_len, src_type, flags));
 	}
 
 	/* Otherwise the value must hold the format's native type */
 	native = fmt->native_type;
 
 	/* Start from an empty value instance */
 	*value = BHND_NVRAM_VAL_INITIALIZER(fmt, val_storage);
 
 	/* First pass: query the size of the native encoding */
 	rc = bhnd_nvram_val_encode(src, NULL, &nbytes, native);
 	if (rc != 0)
 		return (rc);
 
 	/* Fetch reference to (or allocate) a buffer of that size */
 	buf = bhnd_nvram_val_alloc_bytes(value, nbytes, native, flags);
 	if (buf == NULL)
 		return (ENOMEM);
 
 	/* Second pass: perform the actual encode into the buffer */
 	return (bhnd_nvram_val_encode(src, buf, &nbytes, native));
 }
 
 /**
  * Initialize an externally allocated instance of @p value with @p fmt, and
  * attempt to initialize its internal representation from the given @p src
  * value.
  *
  * On success, the caller owns a reference to @p value, and is responsible for
  * freeing any resources allocated for @p value via bhnd_nvram_val_release().
  *
  * @param	value	The externally allocated value instance to be
  *			initialized.
  * @param	fmt	The value's format.
  * @param	src	Input value to be converted.
  * @param	flags	Value flags (see BHND_NVRAM_VAL_*).
  * 
  * @retval 0		success
  * @retval ENOMEM	If allocation fails.
  * @retval EFTYPE	If @p fmt initialization from @p src is unsupported.
  * @retval EFAULT	if @p ilen is not correctly aligned for elements of
  *			@p itype.
  * @retval ERANGE	If value coercion of @p src would overflow
  *			(or underflow) the @p fmt representation.
  */
 int
 bhnd_nvram_val_convert_init(bhnd_nvram_val *value,
     const bhnd_nvram_val_fmt *fmt, bhnd_nvram_val *src, uint32_t flags)
 {
 	int rc;
 
 	/* The caller provides the value structure itself (AUTO storage) */
 	rc = bhnd_nvram_val_convert_common(value,
 	    BHND_NVRAM_VAL_STORAGE_AUTO, fmt, src, flags);
 	if (rc == 0)
 		return (0);
 
 	/* Conversion failed; release whatever was set up */
 	bhnd_nvram_val_release(value);
 
 	return (rc);
 }
 
 /**
  * Allocate a value instance with @p fmt, and attempt to initialize its internal
  * representation from the given @p src value.
  *
  * On success, the caller owns a reference to @p value, and is responsible for
  * freeing any resources allocated for @p value via bhnd_nvram_val_release().
  *
  * On failure, @p value is set to NULL; the partially initialized instance
  * is released before returning, so the caller never observes a pointer to
  * freed memory.
  *
  * @param[out]	value	On success, the allocated value instance. On
  *			failure, NULL.
  * @param	fmt	The value's format.
  * @param	src	Input value to be converted.
  * @param	flags	Value flags (see BHND_NVRAM_VAL_*).
  * 
  * @retval 0		success
  * @retval ENOMEM	If allocation fails.
  * @retval EFTYPE	If @p fmt initialization from @p src is unsupported.
  * @retval EFAULT	if the data of @p src is not correctly aligned for
  *			elements of its type.
  * @retval ERANGE	If value coercion of @p src would overflow
  *			(or underflow) the @p fmt representation.
  */
 int
 bhnd_nvram_val_convert_new(bhnd_nvram_val **value,
     const bhnd_nvram_val_fmt *fmt, bhnd_nvram_val *src, uint32_t flags)
 {
 	int error;
 
 	/* Allocate new instance */
 	if ((*value = bhnd_nv_malloc(sizeof(**value))) == NULL)
 		return (ENOMEM);
 
 	/* Perform common initialization. */
 	error = bhnd_nvram_val_convert_common(*value,
 	    BHND_NVRAM_VAL_STORAGE_DYNAMIC, fmt, src, flags);
 	if (error) {
 		/* Will also free() the value allocation */
 		bhnd_nvram_val_release(*value);
 
 		/* Don't leave the caller holding a dangling pointer */
 		*value = NULL;
 	}
 
 	return (error);
 }
 
 /**
  * Copy or retain a reference to @p value.
  * 
  * On success, the caller is responsible for freeing the result via
  * bhnd_nvram_val_release().
  * 
  * @param	value	The value to be copied (or retained).
  * 
  * @retval bhnd_nvram_val	if @p value was successfully copied or retained.
  * @retval NULL			if allocation failed.
  */
 bhnd_nvram_val *
 bhnd_nvram_val_copy(bhnd_nvram_val *value)
 {
 	bhnd_nvram_val		*dup;
 	const void		*src_bytes;
 	bhnd_nvram_type		 src_type;
 	size_t			 src_len;
 	uint32_t		 new_flags;
 	int			 rc;
 
 	/* Decide whether a real copy is needed at all */
 	if (value->val_storage == BHND_NVRAM_VAL_STORAGE_STATIC) {
 		/* If static, can return as-is */
 		return (value);
 	} else if (value->val_storage == BHND_NVRAM_VAL_STORAGE_DYNAMIC) {
 		/* A dynamic value that needs no copy is shared by taking
 		 * an additional reference */
 		if (!BHND_NVRAM_VAL_NEED_COPY(value)) {
 			refcount_acquire(&value->refs);
 			return (value);
 		}
 	} else if (value->val_storage == BHND_NVRAM_VAL_STORAGE_AUTO) {
 		BHND_NV_ASSERT(value->refs == 1, ("non-allocated value has "
 		    "active refcount (%u)", value->refs));
 	}
 
 	/* A copy is required. Derive the new value's flags from how the
 	 * source value stores its data. */
 	switch (value->data_storage) {
 	case BHND_NVRAM_VAL_DATA_EXT_STATIC:
 		new_flags = BHND_NVRAM_VAL_STATIC_DATA;
 		break;
 	case BHND_NVRAM_VAL_DATA_NONE:
 	case BHND_NVRAM_VAL_DATA_INLINE:
 	case BHND_NVRAM_VAL_DATA_EXT_WEAK:
 	case BHND_NVRAM_VAL_DATA_EXT_ALLOC:
 		/* Copy the source data and permit additional allocation if the
 		 * value cannot be represented inline */
 		new_flags = BHND_NVRAM_VAL_COPY_DATA|BHND_NVRAM_VAL_DYNAMIC;
 		break;
 	default:
 		BHND_NV_PANIC("invalid storage type: %d", value->data_storage);
 	}
 
 	/* Build the new value from the source value's bytes and format */
 	src_bytes = bhnd_nvram_val_bytes(value, &src_len, &src_type);
 	rc = bhnd_nvram_val_new(&dup, value->fmt, src_bytes, src_len,
 	    src_type, new_flags);
 	if (rc != 0) {
 		BHND_NV_LOG("copy failed: %d", rc);
 		return (NULL);
 	}
 
 	return (dup);
 }
 
 /**
  * Release a reference to @p value.
  *
  * If this is the last reference, all associated resources will be freed.
  * 
  * @param	value	The value to be released.
  */
 void
 bhnd_nvram_val_release(bhnd_nvram_val *value)
 {
 	BHND_NV_ASSERT(value->refs >= 1, ("value over-released"));
 
 	/* Statically stored values are left untouched */
 	if (value->val_storage == BHND_NVRAM_VAL_STORAGE_STATIC)
 		return;
 
 	/* Drop our reference; only the final release tears the value down */
 	if (refcount_release(&value->refs)) {
 		/* Of all data storage classes, only EXT_ALLOC has external
 		 * representation data that must be freed here */
 		if (value->data_storage == BHND_NVRAM_VAL_DATA_EXT_ALLOC)
 			bhnd_nv_free(__DECONST(void *, value->data.ptr));
 
 		/* Free the instance itself if dynamically allocated */
 		if (value->val_storage == BHND_NVRAM_VAL_STORAGE_DYNAMIC)
 			bhnd_nv_free(value);
 	}
 }
 
 /**
  * Standard BHND_NVRAM_TYPE_NULL encoding implementation.
  *
  * A NULL value can only be encoded as BHND_NVRAM_TYPE_NULL, in which case
  * it occupies zero bytes; any other @p otype yields EFTYPE.
  */
 static int
 bhnd_nvram_val_encode_null(const void *inp, size_t ilen, bhnd_nvram_type itype,
     void *outp, size_t *olen, bhnd_nvram_type otype)
 {
 	const size_t	required = 0;	/* a NULL value has no payload */
 	size_t		avail;
 
 	BHND_NV_ASSERT(itype == BHND_NVRAM_TYPE_NULL,
 	    ("unsupported type: %d", itype));
 
 	/* Output capacity; zero when only the length is being queried */
 	avail = (outp != NULL) ? *olen : 0;
 
 	/* Only NULL -> NULL is representable */
 	if (otype != BHND_NVRAM_TYPE_NULL)
 		return (EFTYPE);
 
 	/* Provide required length */
 	*olen = required;
 	if (avail >= required)
 		return (0);
 
 	return ((outp == NULL) ? 0 : ENOMEM);
 }
 
 /**
  * Standard BHND_NVRAM_TYPE_BOOL encoding implementation.
  *
  * Encodes the single boolean element at @p inp as @p otype:
  * - NULL: only a false value is representable (zero-length output).
  * - STRING / STRING_ARRAY: the NUL-terminated text "true" or "false".
  * - integer types: delegated to bhnd_nvram_val_encode_int() as 1 or 0.
  *
  * @param		inp	Input buffer; must hold exactly one element.
  * @param		ilen	Size of @p inp.
  * @param		itype	Input type; must be BHND_NVRAM_TYPE_BOOL.
  * @param[out]		outp	Output buffer, or NULL to only query the
  *				required length.
  * @param[in,out]	olen	On input, the capacity of @p outp (ignored
  *				if @p outp is NULL). On return, the required
  *				output length.
  * @param		otype	Requested output type.
  *
  * @retval 0		success
  * @retval ENOMEM	If @p outp is non-NULL and too small for the output.
  * @retval EFTYPE	If the value is not representable as @p otype, or
  *			@p inp does not contain exactly one element.
  */
 static int
 bhnd_nvram_val_encode_bool(const void *inp, size_t ilen, bhnd_nvram_type itype,
     void *outp, size_t *olen, bhnd_nvram_type otype)
 {
 	bhnd_nvram_bool_t	bval;
 	size_t			limit, nbytes, nelem;
 	int			error;
 
 	BHND_NV_ASSERT(itype == BHND_NVRAM_TYPE_BOOL,
 	    ("unsupported type: %d", itype));
 
 	/* Determine output byte limit */
 	if (outp != NULL)
 		limit = *olen;
 	else
 		limit = 0;
 
 	/* Must be exactly one element in input */
 	if ((error = bhnd_nvram_value_nelem(inp, ilen, itype, &nelem)))
 		return (error);
 
 	if (nelem != 1)
 		return (EFTYPE);
 
 	/* Fetch (and normalize) boolean value */
 	bval = (*(const bhnd_nvram_bool_t *)inp != 0) ? true : false;
 
 	/* Write to output */
 	switch (otype) {
 	case BHND_NVRAM_TYPE_NULL:
 		/* False can be directly encoded as a zero-length NULL value */
 		if (bval != false)
 			return (EFTYPE);
 
 		nbytes = 0;
 		break;
 
 	case BHND_NVRAM_TYPE_STRING:
 	case BHND_NVRAM_TYPE_STRING_ARRAY: {
 		/* Can encode as "true" or "false" */
 		const char *str = bval ? "true" : "false";
 
 		/* nbytes includes the trailing NUL, so a buffer of exactly
 		 * nbytes is sufficient. Callers that size the buffer from a
 		 * prior length query pass exactly this much, so the copy
 		 * must not be skipped when limit == nbytes. */
 		nbytes = strlen(str) + 1;
 		if (limit >= nbytes)
 			strcpy(outp, str);
 
 		break;
 	}
 
 	default:
 		/* If output type is an integer, we can delegate to standard
 		 * integer encoding to encode as zero or one. */
 		if (bhnd_nvram_is_int_type(otype)) {
 			uint8_t	ival = bval ? 1 : 0;
 
 			return (bhnd_nvram_val_encode_int(&ival, sizeof(ival),
 			    BHND_NVRAM_TYPE_UINT8, outp, olen, otype));
 		}
 
 		/* Otherwise not representable */
 		return (EFTYPE);
 	}
 
 	/* Provide required length */
 	*olen = nbytes;
 	if (limit < *olen) {
 		/* A NULL outp is a pure length query, not an error */
 		if (outp == NULL)
 			return (0);
 
 		return (ENOMEM);
 	}
 
 	return (0);
 }
 
 /**
  * Standard BHND_NVRAM_TYPE_DATA encoding implementation.
  *
  * String output is formatted via bhnd_nvram_value_printf(); every other
  * output type is produced by coercing the input as a uint8 array.
  */
 static int
 bhnd_nvram_val_encode_data(const void *inp, size_t ilen, bhnd_nvram_type itype,
     void *outp, size_t *olen, bhnd_nvram_type otype)
 {
 	bool	as_string;
 
 	BHND_NV_ASSERT(itype == BHND_NVRAM_TYPE_DATA,
 	    ("unsupported type: %d", itype));
 
 	as_string = (otype == BHND_NVRAM_TYPE_STRING ||
 	    otype == BHND_NVRAM_TYPE_STRING_ARRAY);
 
 	if (!as_string) {
 		/* Fall back on direct interpretation of the octet string
 		 * as an array of 8-bit integers */
 		return (bhnd_nvram_value_coerce(inp, ilen,
 		    BHND_NVRAM_TYPE_UINT8_ARRAY, outp, olen, otype));
 	}
 
 	/* If encoding as a string, produce an EFI-style hexadecimal
 	 * byte array (HF1F...) by interpreting the octet string
 	 * as an array of uint8 values */
 	return (bhnd_nvram_value_printf("H%[]02hhX", inp, ilen,
 	    BHND_NVRAM_TYPE_UINT8_ARRAY, outp, olen, ""));
 }
 
 
 /**
  * Return true if the @p plen byte field at @p p is exactly equal to the
  * NUL-terminated literal @p lit, ignoring case.
  *
  * Unlike a bare strncasecmp(p, lit, plen), neither an empty field nor a
  * proper prefix of @p lit is considered a match.
  */
 static bool
 bhnd_nvram_val_field_ieq(const char *p, size_t plen, const char *lit)
 {
 	return (plen == strlen(lit) && strncasecmp(p, lit, plen) == 0);
 }
 
 /**
  * Standard string/char array/char encoding implementation.
  *
  * Input type must be one of:
  * - BHND_NVRAM_TYPE_STRING
  * - BHND_NVRAM_TYPE_CHAR
  * - BHND_NVRAM_TYPE_CHAR_ARRAY
  *
  * The input is treated as at most @p ilen characters, ending at the first
  * NUL (if any), and is parsed according to @p otype:
  * - NULL: only an empty string is representable.
  * - CHAR / CHAR_ARRAY: characters are copied out without a trailing NUL.
  * - BOOL / BOOL_ARRAY: "true"/"yes"/"1" or "false"/"no"/"0" (whitespace
  *   trimmed; the words are matched case-insensitively and in full).
  * - DATA: an EFI-style 'H'-prefixed hexadecimal octet string.
  * - integer types: a single integer value.
  * - STRING / STRING_ARRAY: copied out with a trailing NUL.
  *
  * @param		inp	Input characters.
  * @param		ilen	Size of @p inp.
  * @param		itype	Input type (see above).
  * @param[out]		outp	Output buffer, or NULL to only query the
  *				required length.
  * @param[in,out]	olen	On input, the capacity of @p outp (ignored
  *				if @p outp is NULL). On return, the required
  *				output length.
  * @param		otype	Requested output type.
  *
  * @retval 0		success
  * @retval ENOMEM	If @p outp is non-NULL and too small for the output.
  * @retval EFTYPE	If the string is not representable as @p otype.
  * @retval non-zero	Any error returned by bhnd_nvram_parse_int().
  */
 static int
 bhnd_nvram_val_encode_string(const void *inp, size_t ilen,
     bhnd_nvram_type itype, void *outp, size_t *olen, bhnd_nvram_type otype)
 {
 	const char	*cstr;
 	bhnd_nvram_type	 otype_base;
 	size_t		 cstr_size, cstr_len;
 	size_t		 limit, nbytes;
 
 	BHND_NV_ASSERT(
 	    itype == BHND_NVRAM_TYPE_STRING ||
 	    itype == BHND_NVRAM_TYPE_CHAR ||
 	    itype == BHND_NVRAM_TYPE_CHAR_ARRAY,
 	    ("unsupported type: %d", itype));
 
 	cstr = inp;
 	cstr_size = ilen;
 	nbytes = 0;
 	otype_base = bhnd_nvram_base_type(otype);
 
 	/* Determine output byte limit */
 	if (outp != NULL)
 		limit = *olen;
 	else
 		limit = 0;
 
 	/* Determine string length, minus trailing NUL (if any) */
 	cstr_len = strnlen(cstr, cstr_size);
 
 	/* Parse the string data and write to output */
 	switch (otype) {
 	case BHND_NVRAM_TYPE_NULL:
 		/* Only an empty string may be represented as a NULL value */
 		if (cstr_len != 0)
 			return (EFTYPE);
 
 		*olen = 0;
 		return (0);
 
 	case BHND_NVRAM_TYPE_CHAR:
 	case BHND_NVRAM_TYPE_CHAR_ARRAY:
 		/* String must contain exactly 1 non-terminating-NUL character
 		 * to be represented as a single char */
 		if (!bhnd_nvram_is_array_type(otype)) {
 			if (cstr_len != 1)
 				return (EFTYPE);
 		}
 
 		/* Copy out the characters directly (excluding trailing NUL);
 		 * nbytes keeps counting past the limit so that the full
 		 * required length can be reported */
 		for (size_t i = 0; i < cstr_len; i++) {
 			if (limit > nbytes)
 				*((uint8_t *)outp + nbytes) = cstr[i];
 			nbytes++;
 		}
 
 		/* Provide required length */
 		*olen = nbytes;
 		if (limit < *olen && outp != NULL)
 			return (ENOMEM);
 
 		return (0);
 
 	case BHND_NVRAM_TYPE_BOOL:
 	case BHND_NVRAM_TYPE_BOOL_ARRAY: {
 		const char		*p;
 		size_t			 plen;
 		bhnd_nvram_bool_t	 bval;
 
 		/* Trim leading/trailing whitespace */
 		p = cstr;
 		plen = bhnd_nvram_trim_field(&p, cstr_len, '\0');
 
 		/* Parse string representation. The whole field must match;
 		 * a length-limited compare alone would also accept the empty
 		 * string and any prefix of these words. */
 		if (bhnd_nvram_val_field_ieq(p, plen, "true") ||
 		    bhnd_nvram_val_field_ieq(p, plen, "yes") ||
 		    bhnd_nvram_val_field_ieq(p, plen, "1"))
 		{
 			bval = true;
 		} else if (bhnd_nvram_val_field_ieq(p, plen, "false") ||
 		    bhnd_nvram_val_field_ieq(p, plen, "no") ||
 		    bhnd_nvram_val_field_ieq(p, plen, "0"))
 		{
 			bval = false;
 		} else {
 			/* Not a recognized boolean string */
 			return (EFTYPE);
 		}
 
 		/* Write to output */
 		nbytes = sizeof(bhnd_nvram_bool_t);
 		if (limit >= nbytes)
 			*((bhnd_nvram_bool_t *)outp) = bval;
 
 		/* Provide required length */
 		*olen = nbytes;
 		if (limit < *olen && outp != NULL)
 			return (ENOMEM);
 
 		return (0);
 	}
 
 	case BHND_NVRAM_TYPE_DATA: {
 		const char	*p;
 		size_t		 plen, parsed_len;
 		int		 error;
 
 		/* Trim leading/trailing whitespace */
 		p = cstr;
 		plen = bhnd_nvram_trim_field(&p, cstr_len, '\0');
 
 		/* Check for EFI-style hexadecimal byte array string format.
 		 * Must have a 'H' prefix  */
 		if (plen < 1 || bhnd_nv_toupper(*p) != 'H')
 			return (EFTYPE);
 
 		/* Skip leading 'H' */
 		p++;
 		plen--;
 
 		/* Parse the input string's two-char octets until the end
 		 * of input is reached. The last octet may contain only
 		 * one char */
 		while (plen > 0) {
 			uint8_t	byte;
 			size_t	byte_len = sizeof(byte);
 
 			/* Parse next two-character hex octet. The parse
 			 * target is the local uint8_t, so its type is passed
 			 * explicitly rather than the base type derived from
 			 * otype (BHND_NVRAM_TYPE_DATA here), which is not an
 			 * integer type. */
 			error = bhnd_nvram_parse_int(p, bhnd_nv_ummin(plen, 2),
 			    16, &parsed_len, &byte, &byte_len,
 			    BHND_NVRAM_TYPE_UINT8);
 			if (error) {
 				BHND_NV_DEBUG("error parsing '%.*s' as "
 				    "integer: %d\n", BHND_NV_PRINT_WIDTH(plen),
 				     p, error);
 
 				return (error);
 			}
 
 			/* Write to output */
 			if (limit > nbytes)
 				*((uint8_t *)outp + nbytes) = byte;
 			nbytes++;
 
 			/* Advance input */
 			p += parsed_len;
 			plen -= parsed_len;
 		}
 
 		/* Provide required length */
 		*olen = nbytes;
 		if (limit < *olen && outp != NULL)
 			return (ENOMEM);
 
 		return (0);
 	}
 
 	case BHND_NVRAM_TYPE_UINT8:
 	case BHND_NVRAM_TYPE_UINT8_ARRAY:
 	case BHND_NVRAM_TYPE_UINT16:
 	case BHND_NVRAM_TYPE_UINT16_ARRAY:
 	case BHND_NVRAM_TYPE_UINT32:
 	case BHND_NVRAM_TYPE_UINT32_ARRAY:
 	case BHND_NVRAM_TYPE_UINT64:
 	case BHND_NVRAM_TYPE_UINT64_ARRAY:
 	case BHND_NVRAM_TYPE_INT8:
 	case BHND_NVRAM_TYPE_INT8_ARRAY:
 	case BHND_NVRAM_TYPE_INT16:
 	case BHND_NVRAM_TYPE_INT16_ARRAY:
 	case BHND_NVRAM_TYPE_INT32:
 	case BHND_NVRAM_TYPE_INT32_ARRAY:
 	case BHND_NVRAM_TYPE_INT64:
 	case BHND_NVRAM_TYPE_INT64_ARRAY: {
 		const char	*p;
 		size_t		 plen, parsed_len;
 		int		 error;
 
 		/* Trim leading/trailing whitespace */
 		p = cstr;
 		plen = bhnd_nvram_trim_field(&p, cstr_len, '\0');
 
 		/* Try to parse the integer value; the parser writes to outp
 		 * and updates olen itself */
 		error = bhnd_nvram_parse_int(p, plen, 0, &parsed_len, outp,
 		    olen, otype_base);
 		if (error) {
 			BHND_NV_DEBUG("error parsing '%.*s' as integer: %d\n",
 			    BHND_NV_PRINT_WIDTH(plen), p, error);
 			return (error);
 		}
 
 		/* Do additional bytes remain unparsed? */
 		if (plen != parsed_len) {
 			BHND_NV_DEBUG("error parsing '%.*s' as a single "
 			    "integer value; trailing garbage '%.*s'\n",
 			    BHND_NV_PRINT_WIDTH(plen), p,
 			    BHND_NV_PRINT_WIDTH(plen-parsed_len), p+parsed_len);
 			return (EFTYPE);
 		}
 
 		return (0);
 	}
 
 	case BHND_NVRAM_TYPE_STRING:
 	case BHND_NVRAM_TYPE_STRING_ARRAY:
 		/* Copy out the string representation as-is */
 		*olen = cstr_size;
 
 		/* Need additional space for trailing NUL? */
 		if (cstr_len == cstr_size)
 			(*olen)++;
 
 		/* Skip output? */
 		if (outp == NULL)
 			return (0);
 
 		/* Verify required length */
 		if (limit < *olen)
 			return (ENOMEM);
 
 		/* Copy and NUL terminate */
 		strncpy(outp, cstr, cstr_len);
 		*((char *)outp + cstr_len) = '\0';
 
 		return (0);
 	}
 
 	BHND_NV_PANIC("unknown type %s", bhnd_nvram_type_name(otype));
 }
 
 /**
  * Standard integer encoding implementation.
  *
  * The input integer is promoted to a common 64-bit representation,
  * range-checked against the requested output type, and then written to
  * @p outp as @p otype.
  *
  * @param		inp	The integer value to be encoded.
  * @param		ilen	The size of @p inp, in bytes. Must be equal to
  *				the width of @p itype.
  * @param		itype	The integer type of @p inp.
  * @param[out]		outp	On success, the value will be written to this
  *				buffer. This argument may be NULL if the value
  *				is not desired.
  * @param[in,out]	olen	The capacity of @p outp. On success (or ENOMEM),
  *				will be set to the size required to represent
  *				the value as @p otype.
  * @param		otype	The data type to be written to @p outp.
  *
  * @retval 0		success
  * @retval EFAULT	If @p ilen does not match the width of @p itype.
  * @retval ENOMEM	If @p outp is non-NULL and the provided @p olen is too
  *			small to hold the encoded value.
  * @retval EFTYPE	If the value cannot be encoded as @p otype (e.g. a
  *			NULL or unrecognized output type).
  * @retval ERANGE	If the value cannot be represented by @p otype without
  *			overflow, underflow, or loss of information.
  */
 static int
 bhnd_nvram_val_encode_int(const void *inp, size_t ilen, bhnd_nvram_type itype,
     void *outp, size_t *olen, bhnd_nvram_type otype)
 {
 	bhnd_nvram_type	 otype_base;
 	size_t		 limit, nbytes;
 	bool		 itype_signed, otype_signed, otype_int;
 	union {
 		uint64_t	u64;
 		int64_t		i64;
 	} intv;
 
 	BHND_NV_ASSERT(bhnd_nvram_is_int_type(itype), ("non-integer type"));
 
 	/* Determine output byte limit */
 	if (outp != NULL)
 		limit = *olen;
 	else
 		limit = 0;
 
 	/* Fetch output type info */
 	otype_base = bhnd_nvram_base_type(otype);
 	otype_int = bhnd_nvram_is_int_type(otype);
 	otype_signed = bhnd_nvram_is_signed_type(otype_base);
 
 	/*
 	 * Promote integer value to a common 64-bit representation.
 	 *
 	 * Unsigned inputs are stored in intv.u64, signed inputs in intv.i64;
 	 * itype_signed records which member is authoritative.
 	 */
 	switch (itype) {
 	case BHND_NVRAM_TYPE_UINT8:
 		if (ilen != sizeof(uint8_t))
 			return (EFAULT);
 
 		itype_signed = false;
 		intv.u64 = *(const uint8_t *)inp;
 		break;
 
 	case BHND_NVRAM_TYPE_UINT16:
 		if (ilen != sizeof(uint16_t))
 			return (EFAULT);
 
 		itype_signed = false;
 		intv.u64 = *(const uint16_t *)inp;
 		break;
 
 	case BHND_NVRAM_TYPE_UINT32:
 		if (ilen != sizeof(uint32_t))
 			return (EFAULT);
 
 		itype_signed = false;
 		intv.u64 = *(const uint32_t *)inp;
 		break;
 
 	case BHND_NVRAM_TYPE_UINT64:
 		if (ilen != sizeof(uint64_t))
 			return (EFAULT);
 
 		itype_signed = false;
 		intv.u64 = *(const uint64_t *)inp;
 		break;
 
 	case BHND_NVRAM_TYPE_INT8:
 		if (ilen != sizeof(int8_t))
 			return (EFAULT);
 
 		itype_signed = true;
 		intv.i64 = *(const int8_t *)inp;
 		break;
 
 	case BHND_NVRAM_TYPE_INT16:
 		if (ilen != sizeof(int16_t))
 			return (EFAULT);
 
 		itype_signed = true;
 		intv.i64 = *(const int16_t *)inp;
 		break;
 
 	case BHND_NVRAM_TYPE_INT32:
 		if (ilen != sizeof(int32_t))
 			return (EFAULT);
 
 		itype_signed = true;
 		intv.i64 = *(const int32_t *)inp;
 		break;
 
 	case BHND_NVRAM_TYPE_INT64:
 		/* A 64-bit input must be validated and loaded at its full
 		 * width */
 		if (ilen != sizeof(int64_t))
 			return (EFAULT);
 
 		itype_signed = true;
 		intv.i64 = *(const int64_t *)inp;
 		break;
 
 	default:
 		BHND_NV_PANIC("invalid type %d\n", itype);
 	}
 
 	/* Perform signed/unsigned conversion */
 	if (itype_signed && otype_int && !otype_signed) {
 		if (intv.i64 < 0) {
 			/* Can't represent negative value */
 			BHND_NV_LOG("cannot represent %" PRId64 " as %s\n",
 			    intv.i64, bhnd_nvram_type_name(otype));
 
 			return (ERANGE);
 		}
 
 		/* Convert to unsigned representation */
 		intv.u64 = intv.i64;
 
 	} else if (!itype_signed && otype_int && otype_signed) {
 		/* Handle unsigned -> signed coercions */
 		if (intv.u64 > INT64_MAX) {
 			/* Can't represent positive value */
 			BHND_NV_LOG("cannot represent %" PRIu64 " as %s\n",
 			    intv.u64, bhnd_nvram_type_name(otype));
 			return (ERANGE);
 		}
 
 		/* Convert to signed representation */
 		intv.i64 = intv.u64;
 	}
 
 	/* Write output; each fixed-width case only stores the value if the
 	 * caller's buffer is large enough, but always reports nbytes */
 	switch (otype) {
 	case BHND_NVRAM_TYPE_NULL:
 		/* Cannot encode an integer value as NULL */
 		return (EFTYPE);
 
 	case BHND_NVRAM_TYPE_BOOL: {
 		bhnd_nvram_bool_t bval;
 
 		if (intv.u64 == 0 || intv.u64 == 1) {
 			bval = intv.u64;
 		} else {
 			/* Encoding as a bool would lose information */
 			return (ERANGE);
 		}
 
 		nbytes = sizeof(bhnd_nvram_bool_t);
 		if (limit >= nbytes)
 			*((bhnd_nvram_bool_t *)outp) = bval;
 
 		break;
 	}
 
 	case BHND_NVRAM_TYPE_CHAR:
 	case BHND_NVRAM_TYPE_CHAR_ARRAY:
 	case BHND_NVRAM_TYPE_DATA:
 	case BHND_NVRAM_TYPE_UINT8:
 	case BHND_NVRAM_TYPE_UINT8_ARRAY:
 		if (intv.u64 > UINT8_MAX)
 			return (ERANGE);
 
 		nbytes = sizeof(uint8_t);
 		if (limit >= nbytes)
 			*((uint8_t *)outp) = (uint8_t)intv.u64;
 		break;
 
 	case BHND_NVRAM_TYPE_UINT16:
 	case BHND_NVRAM_TYPE_UINT16_ARRAY:
 		if (intv.u64 > UINT16_MAX)
 			return (ERANGE);
 
 		nbytes = sizeof(uint16_t);
 		if (limit >= nbytes)
 			*((uint16_t *)outp) = (uint16_t)intv.u64;
 		break;
 
 	case BHND_NVRAM_TYPE_UINT32:
 	case BHND_NVRAM_TYPE_UINT32_ARRAY:
 		if (intv.u64 > UINT32_MAX)
 			return (ERANGE);
 
 		nbytes = sizeof(uint32_t);
 		if (limit >= nbytes)
 			*((uint32_t *)outp) = (uint32_t)intv.u64;
 		break;
 
 	case BHND_NVRAM_TYPE_UINT64:
 	case BHND_NVRAM_TYPE_UINT64_ARRAY:
 		nbytes = sizeof(uint64_t);
 		if (limit >= nbytes)
 			*((uint64_t *)outp) = intv.u64;
 		break;
 
 	case BHND_NVRAM_TYPE_INT8:
 	case BHND_NVRAM_TYPE_INT8_ARRAY:
 		if (intv.i64 < INT8_MIN || intv.i64 > INT8_MAX)
 			return (ERANGE);
 
 		nbytes = sizeof(int8_t);
 		if (limit >= nbytes)
 			*((int8_t *)outp) = (int8_t)intv.i64;
 		break;
 
 	case BHND_NVRAM_TYPE_INT16:
 	case BHND_NVRAM_TYPE_INT16_ARRAY:
 		if (intv.i64 < INT16_MIN || intv.i64 > INT16_MAX)
 			return (ERANGE);
 
 		nbytes = sizeof(int16_t);
 		if (limit >= nbytes)
 			*((int16_t *)outp) = (int16_t)intv.i64;
 		break;
 
 	case BHND_NVRAM_TYPE_INT32:
 	case BHND_NVRAM_TYPE_INT32_ARRAY:
 		if (intv.i64 < INT32_MIN || intv.i64 > INT32_MAX)
 			return (ERANGE);
 
 		nbytes = sizeof(int32_t);
 		if (limit >= nbytes)
 			*((int32_t *)outp) = (int32_t)intv.i64;
 		break;
 
 	case BHND_NVRAM_TYPE_INT64:
 	case BHND_NVRAM_TYPE_INT64_ARRAY:
 		nbytes = sizeof(int64_t);
 		if (limit >= nbytes)
 			*((int64_t *)outp) = intv.i64;
 		break;
 
 	case BHND_NVRAM_TYPE_STRING:
 	case BHND_NVRAM_TYPE_STRING_ARRAY: {
 		ssize_t len;
 
 		/*
 		 * Attempt to write the entry + NUL.
 		 *
 		 * The signed/unsigned coercion above only applies to integer
 		 * output types, so for string output the value is still held
 		 * in the union member selected by the input type's
 		 * signedness; format accordingly.
 		 */
 		if (itype_signed) {
 			len = snprintf(outp, limit, "%" PRId64, intv.i64);
 		} else {
 			len = snprintf(outp, limit, "%" PRIu64, intv.u64);
 		}
 
 		if (len < 0) {
 			BHND_NV_LOG("snprintf() failed: %zd\n", len);
 			return (EFTYPE);
 		}
 
 		/* Set total length to the formatted string length, plus
 		 * trailing NUL */
 		nbytes = len + 1;
 		break;
 	}
 
 	default:
 		BHND_NV_LOG("unknown type %s\n", bhnd_nvram_type_name(otype));
 		return (EFTYPE);
 	}
 
 	/* Provide required length */
 	*olen = nbytes;
 	if (limit < *olen) {
 		/* A NULL output buffer is a length query, not an error */
 		if (outp == NULL)
 			return (0);
 
 		return (ENOMEM);
 	}
 
 	return (0);
 }
 
 /**
  * Encode @p value as @p otype and write the result to @p outp.
  *
  * The value format's own op_encode() callback is used when one is provided;
  * otherwise, encoding is delegated to bhnd_nvram_val_generic_encode().
  *
  * @param		value	The value to be encoded.
  * @param[out]		outp	On success, the value will be written to this
  *				buffer. This argument may be NULL if the value
  *				is not desired.
  * @param[in,out]	olen	The capacity of @p outp. On success, will be set
  *				to the actual size of the requested value.
  * @param		otype	The data type to be written to @p outp.
  *
  * @retval 0		success
  * @retval ENOMEM	If @p outp is non-NULL, and the provided @p olen
  *			is too small to hold the encoded value.
  * @retval EFTYPE	If value coercion from @p value to @p otype is
  *			impossible.
  * @retval ERANGE	If value coercion would overflow (or underflow) the
  *			@p otype representation.
  */
 int
 bhnd_nvram_val_encode(bhnd_nvram_val *value, void *outp, size_t *olen,
     bhnd_nvram_type otype)
 {
 	/* No format-specific encoder; use the generic implementation */
 	if (value->fmt->op_encode == NULL) {
 		return (bhnd_nvram_val_generic_encode(value, outp, olen,
 		    otype));
 	}
 
 	return (value->fmt->op_encode(value, outp, olen, otype));
 }
 
 /**
  * Encode a single element of @p value as @p otype and write the result to
  * @p outp.
  *
  * The value format's own op_encode_elem() callback is used when one is
  * provided; otherwise, encoding is delegated to
  * bhnd_nvram_val_generic_encode_elem().
  *
  * @param		value	The value containing the element.
  * @param		inp	The element to be encoded. Must be a value
  *				previously returned by bhnd_nvram_val_next()
  *				or bhnd_nvram_val_elem().
  * @param		ilen	The size of @p inp, as returned by
  *				bhnd_nvram_val_next() or bhnd_nvram_val_elem().
  * @param[out]		outp	On success, the value will be written to this
  *				buffer. This argument may be NULL if the value
  *				is not desired.
  * @param[in,out]	olen	The capacity of @p outp. On success, will be set
  *				to the actual size of the requested value.
  * @param		otype	The data type to be written to @p outp.
  *
  * @retval 0		success
  * @retval ENOMEM	If @p outp is non-NULL, and the provided @p olen
  *			is too small to hold the encoded value.
  * @retval EFTYPE	If value coercion from @p value to @p otype is
  *			impossible.
  * @retval ERANGE	If value coercion would overflow (or underflow) the
  *			@p otype representation.
  */
 int
 bhnd_nvram_val_encode_elem(bhnd_nvram_val *value, const void *inp,
     size_t ilen, void *outp, size_t *olen, bhnd_nvram_type otype)
 {
 	/* No format-specific element encoder; use the generic one */
 	if (value->fmt->op_encode_elem == NULL) {
 		return (bhnd_nvram_val_generic_encode_elem(value, inp, ilen,
 		    outp, olen, otype));
 	}
 
 	return (value->fmt->op_encode_elem(value, inp, ilen, outp, olen,
 	    otype));
 }
 
 /**
  * Provide the type and size of @p value, and return a pointer to its
  * internal representation.
  *
  * Panics if @p value has no data storage assigned, or if its storage type
  * is not recognized.
  *
  * @param	value	The value to be queried.
  * @param[out]	olen	Size of the returned data, in bytes.
  * @param[out]	otype	Data type.
  */
 const void *
 bhnd_nvram_val_bytes(bhnd_nvram_val *value, size_t *olen,
     bhnd_nvram_type *otype)
 {
 	/* Report length and type to the caller */
 	*olen = value->data_len;
 	*otype = value->data_type;
 
 	switch (value->data_storage) {
 	case BHND_NVRAM_VAL_DATA_INLINE:
 		/* Data lives directly within the value */
 		return (&value->data);
 
 	case BHND_NVRAM_VAL_DATA_EXT_WEAK:
 	case BHND_NVRAM_VAL_DATA_EXT_STATIC:
 	case BHND_NVRAM_VAL_DATA_EXT_ALLOC:
 		/* Data lives in a buffer referenced by the value */
 		return (value->data.ptr);
 
 	case BHND_NVRAM_VAL_DATA_NONE:
 		BHND_NV_PANIC("uninitialized value");
 	}
 
 	BHND_NV_PANIC("unknown storage type: %d", value->data_storage);
 }
 
 /**
  * Iterate over the array elements of @p value.
  *
  * The value format's own op_next() callback is used when one is provided;
  * otherwise, iteration is delegated to bhnd_nvram_val_generic_next().
  *
  * @param		value	The value to be iterated
  * @param		prev	A value pointer previously returned by
  *				bhnd_nvram_val_next() or bhnd_nvram_val_elem(),
  *				or NULL to begin iteration at the first element.
  * @param[in,out]	olen	If @p prev is non-NULL, @p olen must be a
  *				pointer to the length previously returned by
  *				bhnd_nvram_val_next() or bhnd_nvram_val_elem().
  *				On success, will be set to the next element's
  *				length, in bytes.
  *
  * @retval non-NULL	A borrowed reference to the element data.
  * @retval NULL		If the end of the element array is reached.
  */
 const void *
 bhnd_nvram_val_next(bhnd_nvram_val *value, const void *prev, size_t *olen)
 {
 	/* No format-specific iterator; use the generic implementation */
 	if (value->fmt->op_next == NULL)
 		return (bhnd_nvram_val_generic_next(value, prev, olen));
 
 	return (value->fmt->op_next(value, prev, olen));
 }
 
 /**
  * Return the data type recorded for @p value.
  *
  * @param	value	The value to be queried.
  */
 bhnd_nvram_type
 bhnd_nvram_val_type(bhnd_nvram_val *value)
 {
 	bhnd_nvram_type type;
 
 	type = value->data_type;
 	return (type);
 }
 
 /**
  * Return the element data type of @p value, i.e. the base type of the
  * value's data type as computed by bhnd_nvram_base_type().
  *
  * @param	value	The value to be queried.
  */
 bhnd_nvram_type
 bhnd_nvram_val_elem_type(bhnd_nvram_val *value)
 {
 	bhnd_nvram_type full_type;
 
 	full_type = value->data_type;
 	return (bhnd_nvram_base_type(full_type));
 }
 
 /**
  * Return the total number of elements represented by @p value.
  *
  * The count is obtained, in order of preference, from the format's
  * op_nelem() callback, by walking the elements with bhnd_nvram_val_next()
  * when the format defines op_next(), or from bhnd_nvram_value_nelem()
  * applied to the backing data. A failure of the latter results in a panic.
  *
  * @param	value	The value to be queried.
  */
 size_t
 bhnd_nvram_val_nelem(bhnd_nvram_val *value)
 {
 	const void	*data;
 	bhnd_nvram_type	 data_type;
 	size_t		 count, data_len;
 	int		 error;
 
 	/* The format may know its own element count */
 	if (value->fmt->op_nelem != NULL)
 		return (value->fmt->op_nelem(value));
 
 	/*
 	 * A format with a custom op_next() may not use the standard data
 	 * layout assumed by bhnd_nvram_value_nelem(); in that case, count
 	 * the elements by iterating over them via bhnd_nvram_val_next().
 	 */
 	if (value->fmt->op_next != NULL) {
 		const void *elem;
 
 		count = 0;
 		for (elem = bhnd_nvram_val_next(value, NULL, &data_len);
 		    elem != NULL;
 		    elem = bhnd_nvram_val_next(value, elem, &data_len))
 		{
 			count++;
 		}
 
 		return (count);
 	}
 
 	/* Standard layout; derive the count from the backing data */
 	data = bhnd_nvram_val_bytes(value, &data_len, &data_type);
 	error = bhnd_nvram_value_nelem(data, data_len, data_type, &count);
 	if (error != 0) {
 		/* Should always succeed */
 		BHND_NV_PANIC("error calculating element count for type '%s' "
 		    "with length %zu: %d\n", bhnd_nvram_type_name(data_type),
 		    data_len, error);
 	}
 
 	return (count);
 }
 
 /**
  * Generic implementation of bhnd_nvram_val_op_encode(), compatible with
  * all supported NVRAM data types.
  *
  * Scalar-to-scalar and CHAR_ARRAY<->STRING conversions are handed to
  * bhnd_nvram_val_encode_elem() in a single call. Otherwise, each element of
  * @p value is encoded in turn as the base type of @p otype, and the results
  * are written consecutively to @p outp.
  *
  * @param		value	The value to be encoded.
  * @param[out]		outp	Output buffer, or NULL if only the required
  *				length is desired.
  * @param[in,out]	olen	The capacity of @p outp. When element iteration
  *				completes, will be set to the total encoded
  *				size, in bytes.
  * @param		otype	The data type to be written to @p outp.
  *
  * @retval 0		success
  * @retval ENOMEM	If @p outp is non-NULL and @p olen is too small to
  *			hold the encoded value.
  * @retval EFTYPE	If @p value has more than one element and @p otype is
  *			not an array type, or if the total encoded size would
  *			overflow size_t.
  * @retval non-zero	If encoding an element otherwise fails, the error
  *			returned by bhnd_nvram_val_encode_elem().
  */
 int
 bhnd_nvram_val_generic_encode(bhnd_nvram_val *value, void *outp, size_t *olen,
     bhnd_nvram_type otype)
 {
 	const void	*src, *elem;
 	bhnd_nvram_type	 src_type, elem_otype;
 	size_t		 src_len, elem_len;
 	size_t		 capacity, count, total;
 	bool		 str_conv;
 	int		 error;
 
 	total = 0;
 	count = 0;
 	elem_otype = bhnd_nvram_base_type(otype);
 	src = bhnd_nvram_val_bytes(value, &src_len, &src_type);
 
 	/*
 	 * An array type is not, in general, representable as a non-array
 	 * type. The exception is conversion between CHAR_ARRAY and STRING,
 	 * in either direction:
 	 *	CHAR_ARRAY -> STRING	The character array is interpreted as
 	 *				a non-NUL-terminated string.
 	 *	STRING -> CHAR_ARRAY	The trailing NUL is trimmed from the
 	 *				string.
 	 */
 	str_conv = (src_type == BHND_NVRAM_TYPE_CHAR_ARRAY &&
 	    otype == BHND_NVRAM_TYPE_STRING) ||
 	    (src_type == BHND_NVRAM_TYPE_STRING &&
 	    otype == BHND_NVRAM_TYPE_CHAR_ARRAY);
 
 	if (str_conv) {
 		return (bhnd_nvram_val_encode_elem(value, src, src_len, outp,
 		    olen, otype));
 	}
 
 	/* Scalar input and scalar output can be encoded directly, without
 	 * iterating over elements */
 	if (!bhnd_nvram_is_array_type(src_type) &&
 	    !bhnd_nvram_is_array_type(otype))
 	{
 		return (bhnd_nvram_val_encode_elem(value, src, src_len, outp,
 		    olen, otype));
 	}
 
 	/* Without an output buffer, the usable capacity is zero */
 	capacity = (outp != NULL) ? *olen : 0;
 
 	/* Encode each element as the requested base type */
 	for (elem = bhnd_nvram_val_next(value, NULL, &elem_len);
 	    elem != NULL;
 	    elem = bhnd_nvram_val_next(value, elem, &elem_len))
 	{
 		void	*dst;
 		size_t	 dst_len;
 
 		/* A non-array output type can hold only a single element */
 		count++;
 		if (count > 1 && !bhnd_nvram_is_array_type(otype))
 			return (EFTYPE);
 
 		/* Once the buffer is exhausted, continue in length-query
 		 * mode so that the total required size is still computed */
 		if (total < capacity) {
 			dst_len = capacity - total;
 			dst = (uint8_t *)outp + total;
 		} else {
 			dst_len = 0;
 			dst = NULL;
 		}
 
 		error = bhnd_nvram_val_encode_elem(value, elem, elem_len, dst,
 		    &dst_len, elem_otype);
 
 		/* ENOMEM is detected and reported after the loop; anything
 		 * else is fatal */
 		if (error != 0 && error != ENOMEM)
 			return (error);
 
 		/* Accumulate the total length, guarding against size_t
 		 * overflow */
 		if (dst_len > SIZE_MAX - total)
 			return (EFTYPE);
 
 		total += dst_len;
 	}
 
 	/* Provide the actual length */
 	*olen = total;
 
 	/* Only report a too-small buffer if output was actually requested */
 	if (outp != NULL && capacity < total)
 		return (ENOMEM);
 
 	return (0);
 }
 
 /**
  * Generic implementation of bhnd_nvram_val_op_encode_elem(), compatible with
  * all supported NVRAM data types.
  *
  * Dispatches to the type-specific encoder matching the element type of
  * @p value, and panics if no encoder exists for that type.
  *
  * @param		value	The value containing the element.
  * @param		inp	The element to be encoded.
  * @param		ilen	The size of @p inp, in bytes.
  * @param[out]		outp	Output buffer, passed through to the selected
  *				encoder.
  * @param[in,out]	olen	Output capacity/length, passed through to the
  *				selected encoder.
  * @param		otype	The data type to be written to @p outp.
  *
  * @return The result of the selected type-specific encoder.
  */
 int
 bhnd_nvram_val_generic_encode_elem(bhnd_nvram_val *value, const void *inp,
     size_t ilen, void *outp, size_t *olen, bhnd_nvram_type otype)
 {
 	bhnd_nvram_type elem_type;
 
 	elem_type = bhnd_nvram_val_elem_type(value);
 
 	switch (elem_type) {
 	case BHND_NVRAM_TYPE_INT8:
 	case BHND_NVRAM_TYPE_INT16:
 	case BHND_NVRAM_TYPE_INT32:
 	case BHND_NVRAM_TYPE_INT64:
 	case BHND_NVRAM_TYPE_UINT8:
 	case BHND_NVRAM_TYPE_UINT16:
 	case BHND_NVRAM_TYPE_UINT32:
 	case BHND_NVRAM_TYPE_UINT64:
 		return (bhnd_nvram_val_encode_int(inp, ilen, elem_type, outp,
 		    olen, otype));
 
 	case BHND_NVRAM_TYPE_BOOL:
 		return (bhnd_nvram_val_encode_bool(inp, ilen, elem_type, outp,
 		    olen, otype));
 
 	case BHND_NVRAM_TYPE_CHAR:
 	case BHND_NVRAM_TYPE_STRING:
 		return (bhnd_nvram_val_encode_string(inp, ilen, elem_type,
 		    outp, olen, otype));
 
 	case BHND_NVRAM_TYPE_DATA:
 		return (bhnd_nvram_val_encode_data(inp, ilen, elem_type, outp,
 		    olen, otype));
 
 	case BHND_NVRAM_TYPE_NULL:
 		return (bhnd_nvram_val_encode_null(inp, ilen, elem_type, outp,
 		    olen, otype));
 
 	default:
 		BHND_NV_PANIC("missing encode_elem() implementation");
 	}
 }
 
 /**
  * Generic implementation of bhnd_nvram_val_op_next(), compatible with
  * all supported NVRAM data types.
  *
  * Iteration is performed over the backing representation returned by
  * bhnd_nvram_val_bytes(), using bhnd_nvram_value_array_next().
  *
  * @param		value	The value to be iterated.
  * @param		prev	The previously returned element, or NULL to
  *				begin iteration.
  * @param[in,out]	olen	Element length, passed through to
  *				bhnd_nvram_value_array_next().
  */
 const void *
 bhnd_nvram_val_generic_next(bhnd_nvram_val *value, const void *prev,
     size_t *olen)
 {
 	const uint8_t	*data;
 	bhnd_nvram_type	 data_type;
 	size_t		 data_len;
 
 	/* Fetch the backing representation, then step through it */
 	data = bhnd_nvram_val_bytes(value, &data_len, &data_type);
 
 	return (bhnd_nvram_value_array_next(data, data_len, data_type, prev,
 	    olen));
 }
 
 /**
  * Initialize the representation of @p value with @p inp.
  *
  * If either BHND_NVRAM_VAL_BORROW_DATA or BHND_NVRAM_VAL_STATIC_DATA is set
  * in @p flags, @p value will reference @p inp directly; otherwise, the data
  * is copied into storage obtained from bhnd_nvram_val_alloc_bytes().
  *
  * @param	value	The value to be initialized.
  * @param	inp	The external representation.
  * @param	ilen	The external representation length, in bytes.
  * @param	itype	The external representation's data type.
  * @param	flags	Value flags.
  *
  * @retval 0		success.
  * @retval ENOMEM	if allocation fails
  * @retval EFTYPE	if @p itype is not an array type, and @p ilen is not
  *			equal to the size of a single element of @p itype.
  * @retval EFAULT	if @p ilen is not correctly aligned for elements of
  *			@p itype.
  */
 static int
 bhnd_nvram_val_set(bhnd_nvram_val *value, const void *inp, size_t ilen,
     bhnd_nvram_type itype, uint32_t flags)
 {
 	void	*dest;
 	int	 error;
 
 	BHND_NVRAM_VAL_ASSERT_EMPTY(value);
 
 	/* Reject incorrectly aligned or sized input */
 	error = bhnd_nvram_value_check_aligned(inp, ilen, itype);
 	if (error != 0)
 		return (error);
 
 	/* Borrowed or static data is referenced in place, never copied */
 	if ((flags & (BHND_NVRAM_VAL_BORROW_DATA |
 	    BHND_NVRAM_VAL_STATIC_DATA)) != 0)
 	{
 		value->data_storage = (flags & BHND_NVRAM_VAL_STATIC_DATA) ?
 		    BHND_NVRAM_VAL_DATA_EXT_STATIC :
 		    BHND_NVRAM_VAL_DATA_EXT_WEAK;
 
 		value->data.ptr = inp;
 		value->data_type = itype;
 		value->data_len = ilen;
 		return (0);
 	}
 
 	/* Obtain inline or newly allocated storage of the required size */
 	dest = bhnd_nvram_val_alloc_bytes(value, ilen, itype, flags);
 	if (dest == NULL)
 		return (ENOMEM);
 
 	/* Populate the storage with a copy of the input */
 	memcpy(dest, inp, ilen);
 
 	return (0);
 }
 
 /**
  * Initialize the internal inline representation of @p value with a copy of
  * the data referenced by @p inp of @p itype.
  *
  * If @p inp is NULL, @p itype and @p ilen will be validated, but no data will
  * be copied.
  *
  * This function only updates the inline data, data_len, and data_type of
  * @p value; it does not modify the value's data_storage.
  *
  * @param	value	The value to be initialized.
  * @param	inp	The input data to be copied, or NULL to verify
  *			that data of @p ilen and @p itype can be represented
  *			inline.
  * @param	ilen	The size of the input data, in bytes.
  * @param	itype	The type of the input data.
  *
  * @retval 0		success
  * @retval ENOMEM	if @p ilen is too large to be represented inline.
  * @retval EFAULT	if @p ilen is not correctly aligned for elements of
  *			@p itype.
  */
 static int
 bhnd_nvram_val_set_inline(bhnd_nvram_val *value, const void *inp, size_t ilen,
     bhnd_nvram_type itype)
 {
 	BHND_NVRAM_VAL_ASSERT_EMPTY(value);
 
 /* Record the length and type of the inline data */
 #define	NV_STORE_INIT_INLINE()	do {					\
 	value->data_len = ilen;						\
 	value->data_type = itype;					\
 } while(0)
 
 /* Validate (and, if inp is non-NULL, store) a single scalar of _type */
 #define	NV_STORE_INLINE(_type, _dest)	do {				\
 	if (ilen != sizeof(_type))					\
 		return (EFAULT);					\
 									\
 	if (inp != NULL) {						\
 		value->data._dest[0] = *(const _type *)inp;		\
 		NV_STORE_INIT_INLINE();					\
 	}								\
 } while (0)
 
 /*
  * Validate (and, if inp is non-NULL, copy) an array of _type elements.
  *
  * ilen is a byte count, and must be compared against the inline buffer's
  * capacity in bytes rather than its element count.
  */
 #define	NV_COPY_ARRAY_INLINE(_type, _dest)	do {		\
 	if (ilen % sizeof(_type) != 0)				\
 		return (EFAULT);				\
 								\
 	if (ilen > sizeof(value->data._dest))			\
 		return (ENOMEM);				\
 								\
 	if (inp != NULL) {					\
 		memcpy(&value->data._dest, inp, ilen);		\
 		NV_STORE_INIT_INLINE();				\
 	}							\
 } while (0)
 
 	/* Attempt to copy to inline storage */
 	switch (itype) {
 	case BHND_NVRAM_TYPE_NULL:
 		if (ilen != 0)
 			return (EFAULT);
 
 		/* Nothing to copy */
 		NV_STORE_INIT_INLINE();
 		return (0);
 
 	case BHND_NVRAM_TYPE_CHAR:
 		NV_STORE_INLINE(uint8_t, ch);
 		return (0);
 
 	case BHND_NVRAM_TYPE_BOOL:
 		NV_STORE_INLINE(bhnd_nvram_bool_t, b);
 		return(0);
 
 	case BHND_NVRAM_TYPE_UINT8:
 	case BHND_NVRAM_TYPE_INT8:
 		NV_STORE_INLINE(uint8_t, u8);
 		return (0);
 
 	case BHND_NVRAM_TYPE_UINT16:
 	case BHND_NVRAM_TYPE_INT16:
 		NV_STORE_INLINE(uint16_t, u16);
 		return (0);
 
 	case BHND_NVRAM_TYPE_UINT32:
 	case BHND_NVRAM_TYPE_INT32:
 		NV_STORE_INLINE(uint32_t, u32);
 		return (0);
 
 	case BHND_NVRAM_TYPE_UINT64:
 	case BHND_NVRAM_TYPE_INT64:
 		/* 64-bit scalars must be validated and stored at full
 		 * width */
 		NV_STORE_INLINE(uint64_t, u64);
 		return (0);
 
 	case BHND_NVRAM_TYPE_CHAR_ARRAY:
 		NV_COPY_ARRAY_INLINE(uint8_t, ch);
 		return (0);
 
 	case BHND_NVRAM_TYPE_DATA:
 	case BHND_NVRAM_TYPE_UINT8_ARRAY:
 	case BHND_NVRAM_TYPE_INT8_ARRAY:
 		NV_COPY_ARRAY_INLINE(uint8_t, u8);
 		return (0);
 
 	case BHND_NVRAM_TYPE_UINT16_ARRAY:
 	case BHND_NVRAM_TYPE_INT16_ARRAY:
 		NV_COPY_ARRAY_INLINE(uint16_t, u16);
 		return (0);
 
 	case BHND_NVRAM_TYPE_UINT32_ARRAY:
 	case BHND_NVRAM_TYPE_INT32_ARRAY:
 		NV_COPY_ARRAY_INLINE(uint32_t, u32);
 		return (0);
 
 	case BHND_NVRAM_TYPE_UINT64_ARRAY:
 	case BHND_NVRAM_TYPE_INT64_ARRAY:
 		NV_COPY_ARRAY_INLINE(uint64_t, u64);
 		return (0);
 
 	case BHND_NVRAM_TYPE_BOOL_ARRAY:
 		NV_COPY_ARRAY_INLINE(bhnd_nvram_bool_t, b);
 		return(0);
 
 	case BHND_NVRAM_TYPE_STRING:
 	case BHND_NVRAM_TYPE_STRING_ARRAY:
 		if (ilen > sizeof(value->data.ch))
 			return (ENOMEM);
 
 		if (inp != NULL) {
 			memcpy(&value->data.ch, inp, ilen);
 			NV_STORE_INIT_INLINE();
 		}
 
 		return (0);
 	}
 
 #undef	NV_STORE_INIT_INLINE
 #undef	NV_STORE_INLINE
 #undef	NV_COPY_ARRAY_INLINE
 
 	BHND_NV_PANIC("unknown data type %d", itype);
 }
 
 /**
  * Initialize the internal representation of @p value with a buffer of
  * @p ilen bytes and @p itype, returning a pointer to that buffer.
  *
  * If bhnd_nvram_val_set_inline() reports that data of @p ilen and @p itype
  * can be represented inline, no external buffer will be allocated, and a
  * pointer to the inline data representation will be returned instead.
  *
  * @param	value	The value to be initialized.
  * @param	ilen	The size of the buffer to be allocated, in bytes.
  * @param	itype	The type of the data to be stored in the buffer.
  * @param	flags	Value flags.
  *
  * @retval non-null	The inline or newly allocated buffer.
  * @retval NULL		If allocation failed.
  * @retval NULL		If inline storage cannot be used and @p flags does
  *			not include BHND_NVRAM_VAL_DYNAMIC.
  */
 static void *
 bhnd_nvram_val_alloc_bytes(bhnd_nvram_val *value, size_t ilen,
     bhnd_nvram_type itype, uint32_t flags)
 {
 	void	*buf;
 	int	 inline_error;
 
 	BHND_NVRAM_VAL_ASSERT_EMPTY(value);
 
 	/* Probe (without copying) whether the data fits inline */
 	inline_error = bhnd_nvram_val_set_inline(value, NULL, ilen, itype);
 	if (inline_error == 0) {
 		BHND_NV_ASSERT(sizeof(value->data) >= ilen,
 		    ("ilen exceeds inline storage"));
 
 		value->data_storage = BHND_NVRAM_VAL_DATA_INLINE;
 		value->data_len = ilen;
 		value->data_type = itype;
 		return (&value->data);
 	}
 
 	/* External storage requires that dynamic allocation be permitted */
 	if ((flags & BHND_NVRAM_VAL_DYNAMIC) == 0)
 		return (NULL);
 
 	buf = bhnd_nv_malloc(ilen);
 	if (buf == NULL)
 		return (NULL);
 
 	/* Record the externally allocated buffer */
 	value->data_storage = BHND_NVRAM_VAL_DATA_EXT_ALLOC;
 	value->data_type = itype;
 	value->data_len = ilen;
 	value->data.ptr = buf;
 
 	return (buf);
 }
Index: projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_value_prf.c
===================================================================
--- projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_value_prf.c	(revision 350434)
+++ projects/fuse2/sys/dev/bhnd/nvram/bhnd_nvram_value_prf.c	(revision 350435)
@@ -1,882 +1,883 @@
 /*-
  * Copyright (c) 2015-2016 Landon Fuller <landonf@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
  *    redistribution must be conditioned upon including a substantially
  *    similar Disclaimer requirement for further binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
  * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGES.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/limits.h>
 #include <sys/sbuf.h>
 
 #ifdef _KERNEL
 
 #include <sys/ctype.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
 
 #include <machine/_inttypes.h>
 
 #else /* !_KERNEL */
 
 #include <ctype.h>
 #include <inttypes.h>
 #include <errno.h>
 #include <stdlib.h>
 #include <string.h>
 
 #endif /* _KERNEL */
 
 #include "bhnd_nvram_private.h"
 #include "bhnd_nvram_valuevar.h"
 
 #ifdef _KERNEL
 #define	bhnd_nv_hex2ascii(hex)	hex2ascii(hex)
 #else /* !_KERNEL */
 static char const bhnd_nv_hex2ascii[] = "0123456789abcdefghijklmnopqrstuvwxyz";
 #define	bhnd_nv_hex2ascii(hex)		(bhnd_nv_hex2ascii[hex])
 #endif /* _KERNEL */
 
 /**
  * Maximum size, in bytes, of a string-encoded NVRAM integer value, not
  * including any prefix (0x, 0, etc).
  * 
  * We assume the largest possible encoding is the base-2 representation
  * of a 64-bit integer.
  */
 #define NV_NUMSTR_MAX	((sizeof(uint64_t) * CHAR_BIT) + 1)
 
 /**
  * Format a string representation of @p value using @p fmt, writing the
  * result to @p outp.
  *
  * This is the variadic front end to bhnd_nvram_val_vprintf(); refer to that
  * function for full format string documentation.
  *
  * @param		value	The value to be formatted.
  * @param		fmt	The format string.
  * @param[out]		outp	On success, the string will be written to this
  *				buffer. This argument may be NULL if the value
  *				is not desired.
  * @param[in,out]	olen	The capacity of @p outp. On success, will be set
  *				to the actual number of bytes required for the
  *				requested string encoding (including a trailing
  *				NUL).
  *
  * @retval 0		success
  * @retval EINVAL	If @p fmt contains unrecognized format string
  *			specifiers.
  * @retval ENOMEM	If @p outp is non-NULL, and the provided @p olen
  *			is too small to hold the encoded value.
  * @retval EFTYPE	If value coercion from @p value to a single string
  *			value via @p fmt is unsupported.
  * @retval ERANGE	If value coercion of @p value would overflow (or
  *			underflow) the representation defined by @p fmt.
  */
 int
 bhnd_nvram_val_printf(bhnd_nvram_val *value, const char *fmt, char *outp,
     size_t *olen, ...)
 {
 	va_list	args;
 	int	result;
 
 	/* Collect the variadic arguments and defer to the va_list variant */
 	va_start(args, olen);
 	result = bhnd_nvram_val_vprintf(value, fmt, outp, olen, args);
 	va_end(args);
 
 	return (result);
 }
 
 
 /**
  * Format a string representation of the elements of @p value using @p fmt,
  * writing the result to @p outp.
  *
  * @param		value	The value to be formatted.
  * @param		fmt	The format string.
  * @param[out]		outp	On success, the string will be written to this 
  *				buffer. This argument may be NULL if the value is
  *				not desired.
  * @param[in,out]	olen	The capacity of @p outp. On success, will be set
  *				to the actual number of bytes required for the
  *				requested string encoding (including a trailing
  *				NUL).
  * @param		ap	Argument list.
  *
  * @par Format Strings
  * 
  * Value format strings are similar, but not identical to, those used
  * by printf(3).
  * 
  * Format specifier format:
  *     %[repeat][flags][width][.precision][length modifier][specifier]
  *
  * The format specifier is interpreted as an encoding directive for an
  * individual value element; each format specifier will fetch the next element
  * from the value, encode the element as the appropriate type based on the
  * length modifiers and specifier, and then format the result as a string.
  * 
  * For example, given a string value of '0x000F', and a format specifier of
  * '%#hhx', the value will be asked to encode its first element as
  * BHND_NVRAM_TYPE_UINT8. String formatting will then be applied to the 8-bit
  * unsigned integer representation, producing a string value of "0xF".
  * 
  * Repeat:
  * - [digits]		Repeatedly apply the format specifier to the input
  *			value's elements up to `digits` times. The delimiter
  *			must be passed as a string in the next variadic
  *			argument.
  * - []			Repeatedly apply the format specifier to the input
  *			value's elements until all elements have been
  *			processed. The delimiter must be passed as a string in
  *			the next variadic argument.
  * - [*]		Repeatedly apply the format specifier to the input
  *			value's elements. The repeat count is read from the
  *			next variadic argument as a size_t value
  * 
  * Flags:
  * - '#'		use alternative form (e.g. 0x/0X prefixing of hex
  *			strings).
  * - '0'		zero padding
  * - '-'		left adjust padding
  * - '+'		include a sign character
  * - ' '		include a space in place of a sign character for
  *			positive numbers.
  * 
  * Width/Precision:
  * - digits		minimum field width.
  * - *			read the minimum field width from the next variadic
  *			argument as a ssize_t value. A negative value enables
  *			left adjustment.
  * - .digits		field precision.
  * - .*			read the field precision from the next variadic argument
  *			as a ssize_t value. A negative value enables left
  *			adjustment.
  *
  * Length Modifiers:
  * - 'hh', 'I8'		Convert the value to an 8-bit signed or unsigned
  *			integer.
  * - 'h', 'I16'		Convert the value to a 16-bit signed or unsigned
  *			integer.
  * - 'l', 'I32'		Convert the value to a 32-bit signed or unsigned
  *			integer.
  * - 'll', 'j', 'I64'	Convert the value to a 64-bit signed or unsigned
  *			integer.
  * 
  * Data Specifiers:
  * - 'd', 'i'		Convert and format as a signed decimal integer.
  * - 'u'		Convert and format as an unsigned decimal integer.
  * - 'o'		Convert and format as an unsigned octal integer.
  * - 'x'		Convert and format as an unsigned hexadecimal integer,
  *			using lowercase hex digits.
  * - 'X'		Convert and format as an unsigned hexadecimal integer,
  *			using uppercase hex digits.
  * - 's'		Convert and format as a string.
  * - '%'		Print a literal '%' character.
  *
  * @retval 0		success
  * @retval EINVAL	If @p fmt contains unrecognized format string
  *			specifiers.
  * @retval ENOMEM	If the @p outp is non-NULL, and the provided @p olen
  *			is too small to hold the encoded value.
  * @retval EFTYPE	If value coercion from @p value to a single string
  *			value via @p fmt is unsupported.
  * @retval ERANGE	If value coercion of @p value would overflow (or
  *			underflow) the representation defined by @p fmt.
  */
 int
 bhnd_nvram_val_vprintf(bhnd_nvram_val *value, const char *fmt, char *outp,
     size_t *olen, va_list ap)
 {
 	const void	*elem;
 	size_t		 elen;
 	size_t		 limit, nbytes;
 	int		 error;
 
 	/* Element iterator state; shared by all conversions in fmt */
 	elem = NULL;
 
 	/* Determine output byte limit; a NULL outp means "count only" */
 	nbytes = 0;
 	if (outp != NULL)
 		limit = *olen;
 	else
 		limit = 0;
 
 /*
  * Emit a single character if it fits within the output limit, and always
  * account for it in nbytes so that the required size can be reported.
  * Returns EFTYPE from the enclosing function if nbytes would overflow.
  */
 #define	WRITE_CHAR(_c)	do {			\
 	if (limit > nbytes)			\
 		*(outp + nbytes) = _c;		\
 						\
 	if (nbytes == SIZE_MAX)			\
 		return (EFTYPE);		\
 	nbytes++;				\
 } while (0)
 
 	/* Encode string value as per the format string */
 	for (const char *p = fmt; *p != '\0'; p++) {
 		const char	*delim;
 		size_t		 precision, width, delim_len;
 		u_long		 repeat, bits;
 		bool		 alt_form, ladjust, have_precision;
 		char		 padc, signc, lenc;
 
 		padc = ' ';
 		signc = '\0';
 		lenc = '\0';
 		delim = "";
 		delim_len = 0;
 
 		ladjust = false;
 		alt_form = false;
 
 		have_precision = false;
 		precision = 1;
 		bits = 32;
 		width = 0;
 		repeat = 1;
 
 		/* Copy all input to output until we hit a format specifier */
 		if (*p != '%') {
 			WRITE_CHAR(*p);
 			continue;
 		}
 
 		/* Hit '%' -- is this followed by an escaped '%' literal? */
 		p++;
 		if (*p == '%') {
 			/* The loop increment steps past the second '%';
 			 * advancing here as well would drop the following
 			 * character (or step over the terminating NUL) */
 			WRITE_CHAR('%');
 			continue;
 		}
 
 		/* Parse repeat specifier */
 		if (*p == '[') {
 			p++;
 
 			/* Determine repeat count */
 			if (*p == ']') {
 				/* Repeat consumes all input */
 				repeat = bhnd_nvram_val_nelem(value);
 			} else if (*p == '*') {
 				/* Repeat is supplied as an argument */
 				repeat = va_arg(ap, size_t);
 				p++;
 			} else {
 				char *endp;
 
 				/* Repeat is specified inline in the format
 				 * string */
 				repeat = strtoul(p, &endp, 10);
 				if (p == endp) {
 					BHND_NV_LOG("error parsing repeat "
 						    "count at '%s'", p);
 					return (EINVAL);
 				}
 
 				/* Advance past repeat count */
 				p = endp;
 			}
 
 			/* Advance past terminating ']' */
 			if (*p != ']') {
 				BHND_NV_LOG("error parsing repeat count at "
 				    "'%s'", p);
 				return (EINVAL);
 			}
 			p++;
 
 			/* A repeat specifier always consumes a delimiter
 			 * string argument */
 			delim = va_arg(ap, const char *);
 			delim_len = strlen(delim);
 		}
 
 		/* Parse flags */
 		while (*p != '\0') {
 			const char	*np;
 			bool		 stop;
 
 			stop = false;
 			np = p+1;
 
 			switch (*p) {
 			case '#':
 				alt_form = true;
 				break;
 			case '0':
 				padc = '0';
 				break;
 			case '-':
 				ladjust = true;
 				break;
 			case ' ':
 				/* Must not override '+' */
 				if (signc != '+')
 					signc = ' ';
 				break;
 			case '+':
 				signc = '+';
 				break;
 			default:
 				/* Non-flag character */
 				stop = true;
 				break;
 			}
 
 			if (stop)
 				break;
 			else
 				p = np;
 		}
 
 		/* Parse minimum width */
 		if (*p == '*') {
 			ssize_t arg;
 
 			/* Width is supplied as an argument */
 			arg = va_arg(ap, int);
 
 			/* Negative width argument is interpreted as
 			 * '-' flag followed by positive width */
 			if (arg < 0) {
 				ladjust = true;
 				arg = -arg;
 			}
 
 			width = arg;
 			p++;
 		} else if (bhnd_nv_isdigit(*p)) {
 			uint32_t	v;
 			size_t		len, parsed;
 
 			/* Parse width value */
 			len = sizeof(v);
 			error = bhnd_nvram_parse_int(p, strlen(p), 10, &parsed,
 			    &v, &len, BHND_NVRAM_TYPE_UINT32);
 			if (error) {
 				BHND_NV_LOG("error parsing width %s: %d\n", p,
 				    error);
 				return (EINVAL);
 			}
 
 			/* Save width and advance input */
 			width = v;
 			p += parsed;
 		}
 
 		/* Parse precision */
 		if (*p == '.') {
 			uint32_t	v;
 			size_t		len, parsed;
 
 			p++;
 			have_precision = true;
 
 			if (*p == '*') {
 				ssize_t arg;
 
 				/* Precision is specified as an argument */
 				arg = va_arg(ap, int);
 
 				/* Negative precision argument is interpreted
 				 * as '-' flag followed by positive
 				 * precision */
 				if (arg < 0) {
 					ladjust = true;
 					arg = -arg;
 				}
 
 				precision = arg;
 
 				/* Advance past the '*'; otherwise it would be
 				 * mistaken for the conversion specifier */
 				p++;
 			} else if (!bhnd_nv_isdigit(*p)) {
 				/* Implicit precision of 0 */
 				precision = 0;
 			} else {
 				/* Parse precision value */
 				len = sizeof(v);
 				error = bhnd_nvram_parse_int(p, strlen(p), 10,
 				    &parsed, &v, &len,
 				    BHND_NVRAM_TYPE_UINT32);
 				if (error) {
 					BHND_NV_LOG("error parsing precision "
 					    "%s: %d\n", p, error);
 					return (EINVAL);
 				}
 
 				/* Save precision and advance input */
 				precision = v;
 				p += parsed;
 			}
 		}
 
 		/* Parse length modifiers */
 		while (*p != '\0') {
 			const char	*np;
 			bool		 stop;
 
 			stop = false;
 			np = p+1;
 
 			switch (*p) {
 			case 'h':
 				if (lenc == '\0') {
 					/* Set initial length value */
 					lenc = *p;
 					bits = 16;
 				} else if (lenc == *p && bits == 16) {
 					/* Modify previous length value */
 					bits = 8;
 				} else {
 					BHND_NV_LOG("invalid length modifier "
 					    "%c\n", *p);
 					return (EINVAL);
 				}
 				break;
 
 			case 'l':
 				if (lenc == '\0') {
 					/* Set initial length value */
 					lenc = *p;
 					bits = 32;
 				} else if (lenc == *p && bits == 32) {
 					/* Modify previous length value */
 					bits = 64;
 				} else {
 					BHND_NV_LOG("invalid length modifier "
 					    "%c\n", *p);
 					return (EINVAL);
 				}
 				break;
 
 			case 'j':
 				/* Conflicts with all other length
 				 * specifications, and may only occur once */
 				if (lenc != '\0') {
 					BHND_NV_LOG("invalid length modifier "
 					    "%c\n", *p);
 					return (EINVAL);
 				}
 
 				lenc = *p;
 				bits = 64;
 				break;
 
 			case 'I': {
 				char	*endp;
 
 				/* Conflicts with all other length
 				 * specifications, and may only occur once */
 				if (lenc != '\0') {
 					BHND_NV_LOG("invalid length modifier "
 					    "%c\n", *p);
 					return (EINVAL);
 				}
 
 				lenc = *p;
 
 				/* Parse the length specifier value; the bit
 				 * width itself is validated when the integer
 				 * is encoded below */
 				p++;
 				bits = strtoul(p, &endp, 10);
 				if (p == endp) {
 					BHND_NV_LOG("invalid size specifier: "
 					    "%s\n", p);
 					return (EINVAL);
 				}
 
 				/* Advance input past the parsed integer */
 				np = endp;
 				break;
 			}
 			default:
 				/* Non-length modifier character */
 				stop = true;
 				break;
 			}
 
 			if (stop)
 				break;
 			else
 				p = np;
 		}
 
 		/*
 		 * Validate the conversion specifier before formatting.
 		 *
 		 * This rejects unrecognized specifiers even when the repeat
 		 * count is zero, and rejects a format string that ends in the
 		 * middle of a specification; in the latter case the loop
 		 * increment would otherwise step past the terminating NUL.
 		 */
 		switch (*p) {
 		case 'c':
 		case 'd':
 		case 'i':
 		case 'o':
 		case 's':
 		case 'u':
 		case 'x':
 		case 'X':
 			break;
 		default:
 			BHND_NV_LOG("unrecognized conversion specifier at "
 			    "'%s'\n", p);
 			return (EINVAL);
 		}
 
 		/* Parse conversion specifier and format the value(s) */
 		for (u_long n = 0; n < repeat; n++) {
 			bhnd_nvram_type	arg_type;
 			size_t		arg_size;
 			size_t		i;
 			u_long		base;
 			bool		is_signed, is_upper;
 
 			is_signed = false;
 			is_upper = false;
 			base = 0;
 
 			/* Fetch next element */
 			elem = bhnd_nvram_val_next(value, elem, &elen);
 			if (elem == NULL) {
 				BHND_NV_LOG("format string references more "
 				    "than %zu available value elements\n",
 				    bhnd_nvram_val_nelem(value));
 				return (EINVAL);
 			}
 
 			/*
 			 * If this is not the first value, append the delimiter.
 			 */
 			if (n > 0) {
 				size_t nremain = 0;
 				if (limit > nbytes)
 					nremain = limit - nbytes;
 
 				/* Skip empty delimiters; this also avoids
 				 * handing memcpy() a NULL destination when
 				 * only the length is being computed */
 				if (delim_len > 0 && nremain >= delim_len)
 					memcpy(outp + nbytes, delim, delim_len);
 
 				/* Add delimiter length to the total byte count */
 				if (SIZE_MAX - nbytes < delim_len)
 					return (EFTYPE); /* overflows size_t */
 
 				nbytes += delim_len;
 			}
 
 			/* Parse integer conversion specifiers */
 			switch (*p) {
 			case 'd':
 			case 'i':
 				base = 10;
 				is_signed = true;
 				break;
 
 			case 'u':
 				base = 10;
 				break;
 
 			case 'o':
 				base = 8;
 				break;
 
 			case 'x':
 				base = 16;
 				break;
 
 			case 'X':
 				base = 16;
 				is_upper = true;
 				break;
 			}
 
 			/* Format argument */
 			switch (*p) {
 /*
  * Encode the current element as a (u)int<_width>_t, then store its
  * magnitude in numval, setting add_neg if the value was negative.
  *
  * The magnitude is computed in unsigned arithmetic so that negating the
  * most negative int64_t value is well defined.
  */
 #define	NV_ENCODE_INT(_width) do { 					\
 	arg_type = (is_signed) ? BHND_NVRAM_TYPE_INT ## _width :	\
 	    BHND_NVRAM_TYPE_UINT ## _width;				\
 	arg_size = sizeof(v.u ## _width);				\
 	error = bhnd_nvram_val_encode_elem(value, elem, elen,		\
 	    &v.u ## _width, &arg_size, arg_type);			\
 	if (error) {							\
 		BHND_NV_LOG("error encoding argument as %s: %d\n",	\
 		     bhnd_nvram_type_name(arg_type), error);		\
 		return (error);						\
 	}								\
 									\
 	if (is_signed) {						\
 		if (v.i ## _width < 0) {				\
 			add_neg = true;					\
 			numval = (uint64_t)0 -				\
 			    (uint64_t)(int64_t)(v.i ## _width);		\
 		} else {						\
 			numval = (int64_t) (v.i ## _width);		\
 		}							\
 	} else {							\
 		numval = v.u ## _width;					\
 	}								\
 } while(0)
 			case 'd':
 			case 'i':
 			case 'u':
 			case 'o':
 			case 'x':
 			case 'X': {
 				char		 numbuf[NV_NUMSTR_MAX];
 				char		*sptr;
 				uint64_t	 numval;
 				size_t		 slen;
 				bool		 add_neg, is_zero, use_alt;
 				char		 elem_signc;
 				union {
 					uint8_t		u8;
 					uint16_t	u16;
 					uint32_t	u32;
 					uint64_t	u64;
 					int8_t		i8;
 					int16_t		i16;
 					int32_t		i32;
 					int64_t		i64;
 				} v;
 
 				add_neg = false;
 
 				/* If precision is specified, it overrides
 				 * (and behaves identically) to a zero-prefixed
 				 * minimum width */
 				if (have_precision) {
 					padc = '0';
 					width = precision;
 					ladjust = false;
 				}
 
 				/* If zero-padding is used, value must be right
 				 * adjusted */
 				if (padc == '0')
 					ladjust = false;
 
 				/* Request encode to the appropriate integer
 				 * type, and then promote to common 64-bit
 				 * representation */
 				switch (bits) {
 				case 8:
 					NV_ENCODE_INT(8);
 					break;
 				case 16:
 					NV_ENCODE_INT(16);
 					break;
 				case 32:
 					NV_ENCODE_INT(32);
 					break;
 				case 64:
 					NV_ENCODE_INT(64);
 					break;
 				default:
 					BHND_NV_LOG("invalid length specifier: "
 					    "%lu\n", bits);
 					return (EINVAL);
 				}
 #undef	NV_ENCODE_INT
 
 				/* If a precision of 0 is specified and the
 				 * value is also zero, no characters should
 				 * be produced */
 				if (have_precision && precision == 0 &&
 				    numval == 0)
 				{
 					break;
 				}
 
 				/* The digit loop below consumes numval, so
 				 * record whether the value is zero first */
 				is_zero = (numval == 0);
 
 				/* Emit string representation to local buffer,
 				 * least significant digit first, filling the
 				 * buffer backwards from its end */
 				BHND_NV_ASSERT(base <= 16, ("invalid base"));
 				sptr = numbuf + nitems(numbuf) - 1;
 				for (slen = 0; slen < sizeof(numbuf); slen++) {
 					char		c;
 					uint64_t	digit;
 
 					digit = numval % base;
 					c = bhnd_nv_hex2ascii(digit);
 					if (is_upper)
 						c = bhnd_nv_toupper(c);
 
 					sptr--;
 					*sptr = c;
 
 					numval /= (uint64_t)base;
 					if (numval == 0) {
 						slen++;
 						break;
 					}
 				}
 
 				/* arg_size is the full field size (digits +
 				 * prefix + sign), used for space padding;
 				 * slen counts the digits only */
 				arg_size = slen;
 
 				/* Reserve space for 0/0x prefix? A zero value
 				 * never takes a prefix. This is tracked per
 				 * element so that one zero element does not
 				 * disable the prefix for later repeats */
 				use_alt = (alt_form && !is_zero);
 				if (use_alt) {
 					if (base == 8) {
 						arg_size += 1; /* 0 */
 					} else if (base == 16) {
 						arg_size += 2; /* 0x/0X */
 					}
 				}
 
 				/* Reserve space for ' ', '+', or '-' prefix?
 				 * The sign is likewise per element, so that
 				 * one negative element does not force a '-'
 				 * onto later repeats */
 				elem_signc = add_neg ? '-' : signc;
 				if (elem_signc != '\0')
 					arg_size++;
 
 				/* Right adjust (if using spaces) */
 				if (!ladjust && padc != '0') {
 					for (i = arg_size;  i < width; i++)
 						WRITE_CHAR(padc);
 				}
 
 				if (elem_signc != '\0')
 					WRITE_CHAR(elem_signc);
 
 				if (use_alt) {
 					if (base == 8) {
 						WRITE_CHAR('0');
 					} else if (base == 16) {
 						WRITE_CHAR('0');
 						if (is_upper)
 							WRITE_CHAR('X');
 						else
 							WRITE_CHAR('x');
 					}
 				}
 
 				/* Right adjust (if using zeros); note that
 				 * zero padding is measured against the digit
 				 * count only, excluding any sign or prefix */
 				if (!ladjust && padc == '0') {
 					for (i = slen;  i < width; i++)
 						WRITE_CHAR(padc);
 				}
 
 				/* Write the string to our output buffer */
 				if (limit > nbytes && limit - nbytes >= slen)
 					memcpy(outp + nbytes, sptr, slen);
 
 				/* Update the total byte count. Only the digits
 				 * are added here: the sign and prefix were
 				 * already counted by WRITE_CHAR() above */
 				if (SIZE_MAX - nbytes < slen)
 					return (EFTYPE); /* overflows size_t */
 
 				nbytes += slen;
 
 				/* Left adjust */
 				for (i = arg_size; ladjust && i < width; i++)
 					WRITE_CHAR(padc);
 
 				break;
 			}
 
 			case 's': {
 				char	*s;
 				size_t	 slen;
 
 				/* Query the total length of the element when
 				 * converted to a string */
 				arg_type = BHND_NVRAM_TYPE_STRING;
 				error = bhnd_nvram_val_encode_elem(value, elem,
 				    elen, NULL, &arg_size, arg_type);
 				if (error) {
 					BHND_NV_LOG("error encoding argument "
 					    "as %s: %d\n",
 					    bhnd_nvram_type_name(arg_type),
 					    error);
 					return (error);
 				}
 
 				/* Do not include trailing NUL in the string
 				 * length */
 				if (arg_size > 0)
 					arg_size--;
 
 				/* Right adjust */
 				for (i = arg_size; !ladjust && i < width; i++)
 					WRITE_CHAR(padc);
 
 				/* Determine output position and remaining
 				 * buffer space */
 				if (limit > nbytes) {
 					s = outp + nbytes;
 					slen = limit - nbytes;
 				} else {
 					s = NULL;
 					slen = 0;
 				}
 
 				/* Encode the string to our output buffer;
 				 * ENOMEM is tolerated here, as insufficient
 				 * space is reported once the total required
 				 * size is known */
 				error = bhnd_nvram_val_encode_elem(value, elem,
 				    elen, s, &slen, arg_type);
 				if (error && error != ENOMEM) {
 					BHND_NV_LOG("error encoding argument "
 					    "as %s: %d\n",
 					    bhnd_nvram_type_name(arg_type),
 					    error);
 					return (error);
 				}
 
 				/* Update the total byte count */
 				if (SIZE_MAX - nbytes < arg_size)
 					return (EFTYPE); /* overflows size_t */
 
 				nbytes += arg_size;
 
 				/* Left adjust */
 				for (i = arg_size; ladjust && i < width; i++)
 					WRITE_CHAR(padc);
 
 				break;
 			}
 
 			case 'c': {
 				char c;
 
 				arg_type = BHND_NVRAM_TYPE_CHAR;
 				arg_size = bhnd_nvram_type_width(arg_type);
 
 				/* Encode as single character */
 				error = bhnd_nvram_val_encode_elem(value, elem,
 				    elen, &c, &arg_size, arg_type);
 				if (error) {
 					BHND_NV_LOG("error encoding argument "
 					    "as %s: %d\n",
 					    bhnd_nvram_type_name(arg_type),
 					    error);
 					return (error);
 				}
 
 				BHND_NV_ASSERT(arg_size == sizeof(c),
 				    ("invalid encoded size"));
 
 				/* Right adjust */
 				for (i = arg_size; !ladjust && i < width; i++)
 					WRITE_CHAR(padc);
 
 				/* Emit the encoded character itself */
 				WRITE_CHAR(c);
 
 				/* Left adjust */
 				for (i = arg_size; ladjust && i < width; i++)
 					WRITE_CHAR(padc);
 
 				break;
 			}
 			}
 		}
 	}
 
 	/* Append terminating NUL */
 	if (limit > nbytes)
 		*(outp + nbytes) = '\0';
 
 	if (nbytes < SIZE_MAX)
 		nbytes++;
 	else
 		return (EFTYPE);
 
 	/* Report required space (including the terminating NUL) */
 	*olen = nbytes;
 	if (limit < nbytes) {
 		/* Only an error if the caller actually supplied a buffer */
 		if (outp != NULL)
 			return (ENOMEM);
 	}
 
 	return (0);
 }
Index: projects/fuse2/sys/dev/drm2/drmP.h
===================================================================
--- projects/fuse2/sys/dev/drm2/drmP.h	(revision 350434)
+++ projects/fuse2/sys/dev/drm2/drmP.h	(revision 350435)
@@ -1,1955 +1,1956 @@
 /**
  * \file drmP.h
  * Private header for Direct Rendering Manager
  *
  * \author Rickard E. (Rik) Faith <faith@valinux.com>
  * \author Gareth Hughes <gareth@valinux.com>
  */
 
 /*
  * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas.
  * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California.
  * Copyright (c) 2009-2010, Code Aurora Forum.
  * All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
  * to deal in the Software without restriction, including without limitation
  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  * and/or sell copies of the Software, and to permit persons to whom the
  * Software is furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice (including the next
  * paragraph) shall be included in all copies or substantial portions of the
  * Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  * OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifndef _DRM_P_H_
 #define _DRM_P_H_
 
 #if defined(_KERNEL) || defined(__KERNEL__)
 
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/sglist.h>
 #include <sys/stat.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/fcntl.h>
 #include <sys/uio.h>
 #include <sys/filio.h>
 #include <sys/rwlock.h>
 #include <sys/selinfo.h>
 #include <sys/sysctl.h>
 #include <sys/bus.h>
 #include <sys/queue.h>
 #include <sys/signalvar.h>
 #include <sys/poll.h>
 #include <sys/sbuf.h>
 #include <sys/taskqueue.h>
 #include <sys/tree.h>
 #include <sys/vmmeter.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_param.h>
 #include <vm/vm_phys.h>
 #include <machine/bus.h>
 #include <machine/resource.h>
 #if defined(__i386__) || defined(__amd64__)
 #include <machine/specialreg.h>
 #endif
 #include <machine/sysarch.h>
 #include <sys/endian.h>
 #include <sys/mman.h>
 #include <sys/rman.h>
 #include <sys/memrange.h>
 #include <dev/agp/agpvar.h>
 #include <sys/agpio.h>
 #include <sys/mutex.h>
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 #include <sys/selinfo.h>
 #include <sys/bus.h>
 
 #include <dev/drm2/drm.h>
 #include <dev/drm2/drm_sarea.h>
 
 #include <dev/drm2/drm_atomic.h>
 #include <dev/drm2/drm_linux_list.h>
 #include <dev/drm2/drm_gem_names.h>
 
 #include <dev/drm2/drm_os_freebsd.h>
 
 #if defined(CONFIG_AGP) || (defined(CONFIG_AGP_MODULE) && defined(MODULE))
 #define __OS_HAS_AGP 1
 #else
 #define __OS_HAS_AGP 0
 #endif
 #if defined(CONFIG_MTRR)
 #define __OS_HAS_MTRR 1
 #else
 #define __OS_HAS_MTRR 0
 #endif
 
 struct drm_file;
 struct drm_device;
 
 #include <dev/drm2/drm_hashtab.h>
 #include <dev/drm2/drm_mm.h>
 
 #include "opt_drm.h"
 #include "opt_syscons.h"
 #ifdef DRM_DEBUG
 #undef DRM_DEBUG
 #define DRM_DEBUG_DEFAULT_ON 1
 #endif /* DRM_DEBUG */
 
 #define	DRM_DEBUGBITS_DEBUG		0x1
 #define	DRM_DEBUGBITS_KMS		0x2
 #define	DRM_DEBUGBITS_FAILED_IOCTL	0x4
 
 #undef DRM_LINUX
 #define DRM_LINUX 0
 
 /***********************************************************************/
 /** \name DRM template customization defaults */
 /*@{*/
 
 /* driver capabilities and requirements mask */
 #define DRIVER_USE_AGP     0x1
 #define DRIVER_REQUIRE_AGP 0x2
 #define DRIVER_USE_MTRR    0x4
 #define DRIVER_PCI_DMA     0x8
 #define DRIVER_SG          0x10
 #define DRIVER_HAVE_DMA    0x20
 #define DRIVER_HAVE_IRQ    0x40
 #define DRIVER_IRQ_SHARED  0x80
 #define DRIVER_IRQ_VBL     0x100
 #define DRIVER_DMA_QUEUE   0x200
 #define DRIVER_FB_DMA      0x400
 #define DRIVER_IRQ_VBL2    0x800
 #define DRIVER_GEM         0x1000
 #define DRIVER_MODESET     0x2000
 #define DRIVER_PRIME       0x4000
 
 #define DRIVER_BUS_PCI 0x1
 #define DRIVER_BUS_PLATFORM 0x2
 #define DRIVER_BUS_USB 0x3
 
 /***********************************************************************/
 /** \name Begin the DRM... */
 /*@{*/
 
 #define DRM_DEBUG_CODE 2	  /**< Include debugging code if > 1, then
 				     also include looping detection. */
 
 #define DRM_MAGIC_HASH_ORDER  4  /**< Size of key hash table. Must be power of 2. */
 #define DRM_KERNEL_CONTEXT    0	 /**< Change drm_resctx if changed */
 #define DRM_RESERVED_CONTEXTS 1	 /**< Change drm_resctx if changed */
 #define DRM_LOOPING_LIMIT     5000000
 #define DRM_TIME_SLICE	      (HZ/20)  /**< Time slice for GLXContexts */
 #define DRM_LOCK_SLICE	      1	/**< Time slice for lock, in jiffies */
 
 #define DRM_FLAG_DEBUG	  0x01
 
 #define DRM_MAX_CTXBITMAP (PAGE_SIZE * 8)
 #define DRM_MAP_HASH_OFFSET 0x10000000
 
 /*@}*/
 
 /***********************************************************************/
 /** \name Macros to make printk easier */
 /*@{*/
 
 /**
  * Error output.
  *
  * \param fmt printf() like format string.
  * \param arg arguments
  */
 #define DRM_ERROR(fmt, ...) \
 	printf("error: [" DRM_NAME ":pid%d:%s] *ERROR* " fmt,		\
 	    DRM_CURRENTPID, __func__ , ##__VA_ARGS__)
 
 #define DRM_WARNING(fmt, ...)  printf("warning: [" DRM_NAME "] " fmt , ##__VA_ARGS__)
 #define DRM_INFO(fmt, ...)  printf("info: [" DRM_NAME "] " fmt , ##__VA_ARGS__)
 
 /**
  * Debug output.
  *
  * Prints a message prefixed with the DRM name, the current PID and the
  * calling function's name, but only when DRM_DEBUGBITS_DEBUG is set in
  * drm_debug.
  *
  * \param fmt printf() like format string.
  * \param arg arguments
  */
 #define DRM_DEBUG(fmt, ...) do {					\
 	if ((drm_debug & DRM_DEBUGBITS_DEBUG) != 0)			\
 		printf("[" DRM_NAME ":pid%d:%s] " fmt, DRM_CURRENTPID,	\
 			__func__ , ##__VA_ARGS__);			\
 } while (0)
 
 /*
  * Driver/KMS debug output: as DRM_DEBUG(), but tagged "KMS" and gated on
  * DRM_DEBUGBITS_KMS instead.  DRM_DEBUG_DRIVER() and DRM_DEBUG_KMS() have
  * identical expansions.
  */
 #define DRM_DEBUG_DRIVER(fmt, ...) do {					\
 	if ((drm_debug & DRM_DEBUGBITS_KMS) != 0)			\
 		printf("[" DRM_NAME ":KMS:pid%d:%s] " fmt, DRM_CURRENTPID,\
 			__func__ , ##__VA_ARGS__);			\
 } while (0)
 
 #define DRM_DEBUG_KMS(fmt, ...) do {					\
 	if ((drm_debug & DRM_DEBUGBITS_KMS) != 0)			\
 		printf("[" DRM_NAME ":KMS:pid%d:%s] " fmt, DRM_CURRENTPID,\
 			__func__ , ##__VA_ARGS__);			\
 } while (0)
 
 /*
  * Log output.  All four DRM_LOG*() variants are gated on DRM_DEBUGBITS_KMS
  * in drm_debug; DRM_LOG()/DRM_LOG_MODE() share one expansion and
  * DRM_LOG_KMS()/DRM_LOG_DRIVER() share another (tagged "KMS").
  *
  * NOTE(review): the prefix literals below close the bracket twice
  * ("[name]:pid..:func]") and, unlike the DRM_DEBUG*() macros, put no space
  * before fmt -- confirm whether this is intentional before changing the
  * emitted text.
  */
 #define DRM_LOG(fmt, ...) do {						\
 	if ((drm_debug & DRM_DEBUGBITS_KMS) != 0)			\
 		printf("[" DRM_NAME "]:pid%d:%s]" fmt, DRM_CURRENTPID,	\
 			__func__ , ##__VA_ARGS__);			\
 } while (0)
 
 #define DRM_LOG_KMS(fmt, ...) do {					\
 	if ((drm_debug & DRM_DEBUGBITS_KMS) != 0)			\
 		printf("[" DRM_NAME "]:KMS:pid%d:%s]" fmt, DRM_CURRENTPID,\
 			__func__ , ##__VA_ARGS__);			\
 } while (0)
 
 #define DRM_LOG_MODE(fmt, ...) do {					\
 	if ((drm_debug & DRM_DEBUGBITS_KMS) != 0)			\
 		printf("[" DRM_NAME "]:pid%d:%s]" fmt, DRM_CURRENTPID,	\
 			__func__ , ##__VA_ARGS__);			\
 } while (0)
 
 #define DRM_LOG_DRIVER(fmt, ...) do {					\
 	if ((drm_debug & DRM_DEBUGBITS_KMS) != 0)			\
 		printf("[" DRM_NAME "]:KMS:pid%d:%s]" fmt, DRM_CURRENTPID,\
 			__func__ , ##__VA_ARGS__);			\
 } while (0)
 
 /*@}*/
 
 /***********************************************************************/
 /** \name Internal types and structures */
 /*@{*/
 
 #define DRM_ARRAY_SIZE(x) ARRAY_SIZE(x)
 
 #define DRM_LEFTCOUNT(x) (((x)->rp + (x)->count - (x)->wp) % ((x)->count + 1))
 #define DRM_BUFCOUNT(x) ((x)->count - DRM_LEFTCOUNT(x))
 
 /* Pack an interface version as (major << 16) | minor; arguments are
  * parenthesized so that expressions may be passed safely. */
 #define DRM_IF_VERSION(maj, min) (((maj) << 16) | (min))
 
 /**
  * Test that the hardware lock is held by the caller, returning otherwise.
  *
  * Expands to a statement that logs an error and returns -EINVAL from the
  * enclosing function unless the master's hardware lock is held and is
  * owned by \p _file_priv.
  *
  * \param dev DRM device (not referenced by the expansion).
  * \param _file_priv DRM file private pointer of the caller.
  */
 #define LOCK_TEST_WITH_RETURN( dev, _file_priv )				\
 do {										\
 	if (!_DRM_LOCK_IS_HELD((_file_priv)->master->lock.hw_lock->lock) ||	\
 	    (_file_priv)->master->lock.file_priv != (_file_priv))	{	\
 		DRM_ERROR( "%s called without lock held, held  %d owner %p %p\n",\
 			   __func__, _DRM_LOCK_IS_HELD((_file_priv)->master->lock.hw_lock->lock),\
 			   (_file_priv)->master->lock.file_priv, (_file_priv));	\
 		return -EINVAL;							\
 	}									\
 } while (0)
 
 /**
  * Ioctl function type.
  *
  * \param dev DRM device.
  * \param data ioctl argument data (NOTE(review): presumably the kernel-side
  *        copy of the user argument -- confirm against the ioctl dispatcher).
  * \param file_priv DRM file private pointer.
  */
 typedef int drm_ioctl_t(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 
 #define DRM_IOCTL_NR(n)                ((n) & 0xff)
 #define DRM_MAJOR       226
 
 #define DRM_AUTH	0x1
 #define	DRM_MASTER	0x2
 #define DRM_ROOT_ONLY	0x4
 #define DRM_CONTROL_ALLOW 0x8
 #define DRM_UNLOCKED	0x10
 
 /**
  * Ioctl table entry, as initialized by the DRM_IOCTL_DEF() and
  * DRM_IOCTL_DEF_DRV() macros.
  */
 struct drm_ioctl_desc {
 	unsigned long cmd;	/* ioctl command value */
 	int flags;		/* presumably a mask of DRM_AUTH, DRM_MASTER,
 				 * DRM_ROOT_ONLY, ... -- confirm */
 	drm_ioctl_t *func;	/* handler function */
 	unsigned int cmd_drv;	/* 0 for DRM_IOCTL_DEF() entries;
 				 * DRM_IOCTL_<name> for DRM_IOCTL_DEF_DRV() */
 };
 
 /**
  * Creates a driver or general drm_ioctl_desc array entry for the given
  * ioctl, for use by drm_ioctl().
  */
 
 #define DRM_IOCTL_DEF(ioctl, _func, _flags) \
 	[DRM_IOCTL_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, .cmd_drv = 0}
 
 #define DRM_IOCTL_DEF_DRV(ioctl, _func, _flags)			\
 	[DRM_IOCTL_NR(DRM_##ioctl)] = {.cmd = DRM_##ioctl, .func = _func, .flags = _flags, .cmd_drv = DRM_IOCTL_##ioctl}
 
 /**
  * Authentication magic entry.
  *
  * NOTE(review): presumably tracked through drm_master::magiclist (hash) and
  * drm_master::magicfree (list) -- confirm against the auth code.
  */
 struct drm_magic_entry {
 	struct list_head head;		/* list linkage */
 	struct drm_hash_item hash_item;	/* hash table linkage */
 	struct drm_file *priv;		/* associated file private data */
 };
 
 /**
  * DMA buffer.
  */
 struct drm_buf {
 	int idx;		       /**< Index into master buflist */
 	int total;		       /**< Buffer size */
 	int order;		       /**< log-base-2(total) */
 	int used;		       /**< Amount of buffer in use (for DMA) */
 	unsigned long offset;	       /**< Byte offset (used internally) */
 	void *address;		       /**< Address of buffer */
 	unsigned long bus_address;     /**< Bus address of buffer */
 	struct drm_buf *next;	       /**< Kernel-only: used for free list */
 	__volatile__ int waiting;      /**< On kernel DMA queue */
 	__volatile__ int pending;      /**< On hardware DMA queue */
 	struct drm_file *file_priv;    /**< Private of holding file descr */
 	int context;		       /**< Kernel queue for this buffer */
 	int while_locked;	       /**< Dispatch this buffer while locked */
 	enum {
 		DRM_LIST_NONE = 0,
 		DRM_LIST_FREE = 1,
 		DRM_LIST_WAIT = 2,
 		DRM_LIST_PEND = 3,
 		DRM_LIST_PRIO = 4,
 		DRM_LIST_RECLAIM = 5
 	} list;			       /**< Which list we're on */
 
 	int dev_priv_size;		 /**< Size of buffer private storage */
 	void *dev_private;		 /**< Per-buffer private storage */
 };
 
 struct drm_freelist {
 	int initialized;	       /**< Freelist in use */
 	atomic_t count;		       /**< Number of free buffers */
 	struct drm_buf *next;	       /**< End pointer */
 
 #ifdef FREEBSD_NOTYET
 	wait_queue_head_t waiting;     /**< Processes waiting on free bufs */
 #endif /* defined(FREEBSD_NOTYET) */
 	int low_mark;		       /**< Low water mark */
 	int high_mark;		       /**< High water mark */
 #ifdef FREEBSD_NOTYET
 	atomic_t wfh;		       /**< If waiting for high mark */
 	spinlock_t lock;
 #endif /* defined(FREEBSD_NOTYET) */
 };
 
 /**
  * Handle describing a DMA memory allocation.
  *
  * NOTE(review): field meanings below are inferred from their names and
  * bus_dma types -- confirm against the allocator that fills them in.
  */
 typedef struct drm_dma_handle {
 	void *vaddr;		/* presumably the kernel virtual address */
 	bus_addr_t busaddr;	/* presumably the device-visible bus address */
 	bus_dma_tag_t tag;	/* bus_dma tag */
 	bus_dmamap_t map;	/* bus_dma map */
 } drm_dma_handle_t;
 
 /**
  * Buffer entry.  There is one of these for each buffer size order (see
  * drm_device_dma::bufs).
  */
 struct drm_buf_entry {
 	int buf_size;			/**< size */
 	int buf_count;			/**< number of buffers */
 	struct drm_buf *buflist;		/**< buffer list */
 	int seg_count;			/* presumably the number of entries
 					 * in seglist -- confirm */
 	int page_order;
 	struct drm_dma_handle **seglist;	/* array of DMA handle pointers */
 
 	struct drm_freelist freelist;
 };
 
 /* Event queued up for userspace to read */
 struct drm_pending_event {
 	struct drm_event *event;
 	struct list_head link;
 	struct drm_file *file_priv;
 	pid_t pid; /* pid of requester, no guarantee it's valid by the time
 		      we deliver the event, for tracing only */
 	void (*destroy)(struct drm_pending_event *event);
 };
 
 /* initial implementation using a linked list - todo hashtab */
 struct drm_prime_file_private {
 	struct list_head head;	/* list head */
 	struct mtx lock;	/* presumably protects the list -- confirm */
 };
 
 /**
  * DRM file private data (the "file_priv" passed to ioctl handlers, see
  * drm_ioctl_t).
  *
  * NOTE(review): comments marked "presumably" are inferred from field names
  * only -- confirm against the open/close paths.
  */
 struct drm_file {
 	int authenticated;
 	pid_t pid;		/* presumably the opener's pid */
 	uid_t uid;		/* presumably the opener's uid */
 	drm_magic_t magic;
 	unsigned long ioctl_count;
 	struct list_head lhead;
 	struct drm_minor *minor;
 	unsigned long lock_count;
 
 	void *driver_priv;	/* driver private data */
 	struct drm_gem_names object_names;
 
 	int is_master; /* this file private is a master for a minor */
 	struct drm_master *master; /* master this node is currently associated with
 				      N.B. not always minor->master */
 	struct list_head fbs;
 
 	struct selinfo event_poll;
 	struct list_head event_list;	/* presumably pending drm_pending_event
 					 * entries -- confirm */
 	int event_space;
 
 	struct drm_prime_file_private prime;
 };
 
 /**
  * Lock data.
  */
 struct drm_lock_data {
 	struct drm_hw_lock *hw_lock;	/**< Hardware lock */
 	/** Private of lock holder's file (NULL=kernel) */
 	struct drm_file *file_priv;
 	wait_queue_head_t lock_queue;	/**< Queue of blocked processes */
 	unsigned long lock_time;	/**< Time of last lock in jiffies */
 	struct mtx spinlock;
 	uint32_t kernel_waiters;
 	uint32_t user_waiters;
 	int idle_has_lock;
 };
 
 /**
  * DMA data.
  */
 struct drm_device_dma {
 
 	struct drm_buf_entry bufs[DRM_MAX_ORDER + 1];	/**< buffers, grouped by their size order */
 	int buf_count;			/**< total number of buffers */
 	struct drm_buf **buflist;		/**< Vector of pointers into drm_device_dma::bufs */
 	int seg_count;
 	int page_count;			/**< number of pages */
 	unsigned long *pagelist;	/**< page list */
 	unsigned long byte_count;
 	enum {
 		_DRM_DMA_USE_AGP = 0x01,
 		_DRM_DMA_USE_SG = 0x02,
 		_DRM_DMA_USE_FB = 0x04,
 		_DRM_DMA_USE_PCI_RO = 0x08
 	} flags;
 
 };
 
 /**
  * AGP memory entry.  Stored as a doubly linked list.
  */
 struct drm_agp_mem {
 	unsigned long handle;		/**< handle */
 	DRM_AGP_MEM *memory;
 	unsigned long bound;		/**< address */
 	int pages;
 	struct list_head head;
 };
 
 /**
  * AGP data.
  *
  * \sa drm_agp_init() and drm_device::agp.
  */
 struct drm_agp_head {
 	DRM_AGP_KERN agp_info;		/**< AGP device information */
 	struct list_head memory;	/* presumably the list of drm_agp_mem
 					 * entries -- confirm */
 	unsigned long mode;		/**< AGP mode */
 	device_t bridge;
 	int enabled;			/**< whether the AGP bus has been enabled */
 	int acquired;			/**< whether the AGP device has been acquired */
 	unsigned long base;
 	int agp_mtrr;
 	int cant_use_aperture;
 };
 
 /**
  * Scatter-gather memory.
  *
  * NOTE(review): field meanings are inferred from names and types -- confirm
  * against the scatter-gather allocation code.
  */
 struct drm_sg_mem {
 	vm_offset_t vaddr;	/* presumably the kernel virtual address */
 	vm_paddr_t *busaddr;	/* presumably one address per page */
 	vm_pindex_t pages;	/* presumably the page count */
 };
 
 /* Pairs a context number with a hardware lock pointer.
  * NOTE(review): purpose is not visible in this header -- confirm usage. */
 struct drm_sigdata {
 	int context;
 	struct drm_hw_lock *lock;
 };
 
 /**
  * Kernel side of a mapping
  */
 #define DRM_MAP_HANDLE_BITS	(sizeof(void *) == 4 ? 4 : 24)
 #define DRM_MAP_HANDLE_SHIFT	(sizeof(void *) * 8 - DRM_MAP_HANDLE_BITS)
 
 struct drm_local_map {
 	resource_size_t offset;	 /**< Requested physical address (0 for SAREA)*/
 	unsigned long size;	 /**< Requested physical size (bytes) */
 	enum drm_map_type type;	 /**< Type of memory to map */
 	enum drm_map_flags flags;	 /**< Flags */
 	void *handle;		 /**< User-space: "Handle" to pass to mmap() */
 				 /**< Kernel-space: kernel-virtual address */
 	int mtrr;		 /**< MTRR slot used */
 
 				  /* Private data                         */
 	drm_dma_handle_t *dmah;
 };
 
 typedef struct drm_local_map drm_local_map_t;
 
 /**
  * Mappings list
  */
 struct drm_map_list {
 	struct list_head head;		/**< list head */
 	struct drm_hash_item hash;
 	struct drm_local_map *map;	/**< mapping */
 	uint64_t user_token;
 	struct drm_master *master;
 	struct drm_mm_node *file_offset_node;	/**< fake offset */
 };
 
 /**
  * Context handle list
  */
 struct drm_ctx_list {
 	struct list_head head;		/**< list head */
 	drm_context_t handle;		/**< context handle */
 	struct drm_file *tag;		/**< associated fd private data */
 };
 
 /* location of GART table */
 #define DRM_ATI_GART_MAIN 1
 #define DRM_ATI_GART_FB   2
 
 #define DRM_ATI_GART_PCI 1
 #define DRM_ATI_GART_PCIE 2
 #define DRM_ATI_GART_IGP 3
 
 /**
  * ATI PCI GART description.
  *
  * NOTE(review): the value sets noted below are inferred from the
  * DRM_ATI_GART_* constants defined alongside this structure -- confirm.
  */
 struct drm_ati_pcigart_info {
 	int gart_table_location;	/* presumably DRM_ATI_GART_MAIN or
 					 * DRM_ATI_GART_FB */
 	int gart_reg_if;		/* presumably DRM_ATI_GART_PCI, _PCIE
 					 * or _IGP */
 	void *addr;
 	dma_addr_t bus_addr;
 	dma_addr_t table_mask;
 	struct drm_dma_handle *table_handle;
 	struct drm_local_map mapping;
 	int table_size;
 	struct drm_dma_handle *dmah; /* handle for ATI PCIGART table FIXME */
 };
 
 /**
  * GEM specific mm private for tracking GEM objects
  */
 struct drm_gem_mm {
 	struct unrhdr *idxunr;	/* unit number allocator; NOTE(review):
 				 * presumably hands out map offsets/indices
 				 * -- confirm */
 	struct drm_open_hash offset_hash; /**< User token hash table for maps */
 };
 
 /**
  * This structure defines the drm_mm memory object, which will be used by the
  * DRM for its buffer objects.
  */
 struct drm_gem_object {
 	/** Reference count of this object */
 	u_int refcount;
 
 	/** Handle count of this object. Each handle also holds a reference */
 	atomic_t handle_count; /* number of handles on this object */
 
 	/** Related drm device */
 	struct drm_device *dev;
 
 	/** File representing the shmem storage: filp in Linux parlance */
 	vm_object_t vm_obj;
 
 	/* Mapping info for this object */
 	bool on_map;
 	struct drm_hash_item map_list;
 
 	/**
 	 * Size of the object, in bytes.  Immutable over the object's
 	 * lifetime.
 	 */
 	size_t size;
 
 	/**
 	 * Global name for this object, starts at 1. 0 means unnamed.
 	 * Access is covered by the object_name_lock in the related drm_device
 	 */
 	int name;
 
 	/**
 	 * Memory domains. These monitor which caches contain read/write data
 	 * related to the object. When transitioning from one set of domains
 	 * to another, the driver is called to ensure that caches are suitably
 	 * flushed and invalidated
 	 */
 	uint32_t read_domains;
 	uint32_t write_domain;
 
 	/**
 	 * While validating an exec operation, the
 	 * new read/write domain values are computed here.
 	 * They will be transferred to the above values
 	 * at the point that any cache flushing occurs
 	 */
 	uint32_t pending_read_domains;
 	uint32_t pending_write_domain;
 
 	void *driver_private;
 
 #ifdef FREEBSD_NOTYET
 	/* dma buf exported from this GEM object */
 	struct dma_buf *export_dma_buf;
 
 	/* dma buf attachment backing this object */
 	struct dma_buf_attachment *import_attach;
 #endif /* FREEBSD_NOTYET */
 };
 
 #include <dev/drm2/drm_crtc.h>
 
 /*
  * Per-master structure.
  *
  * Carries the state kept for one master of a minor: its reference count,
  * the unique identifier string, the authentication magic bookkeeping and
  * the hardware lock data.  struct drm_minor points at the currently active
  * master and keeps every master on its master_list.
  */
 struct drm_master {
 
 	u_int refcount; /* refcount for this master */
 
 	struct list_head head; /**< each minor contains a list of masters */
 	struct drm_minor *minor; /**< link back to minor we are a master for */
 
 	char *unique;			/**< Unique identifier: e.g., busid */
 	int unique_len;			/**< Length of unique field */
 	int unique_size;		/**< amount allocated */
 
 	int blocked;			/**< Blocked due to VC switch? */
 
 	/** \name Authentication */
 	/*@{ */
 	struct drm_open_hash magiclist;
 	struct list_head magicfree;
 	/*@} */
 
 	struct drm_lock_data lock;	/**< Information on hardware lock */
 
 	void *driver_priv; /**< Private structure for driver to use */
 };
 
 /* Size of ringbuffer for vblank timestamps. Just double-buffer
  * in initial implementation.
  */
 #define DRM_VBLANKTIME_RBSIZE 2
 
 /* Flags and return codes for get_vblank_timestamp() driver function. */
 #define DRM_CALLED_FROM_VBLIRQ 1
 #define DRM_VBLANKTIME_SCANOUTPOS_METHOD (1 << 0)
 #define DRM_VBLANKTIME_INVBL             (1 << 1)
 
 /* get_scanout_position() return flags */
 #define DRM_SCANOUTPOS_VALID        (1 << 0)
 #define DRM_SCANOUTPOS_INVBL        (1 << 1)
 #define DRM_SCANOUTPOS_ACCURATE     (1 << 2)
 
 /*
  * Table of bus-specific callbacks.  A driver references one of these
  * through the bus member of struct drm_driver.
  */
 struct drm_bus {
 	int bus_type;
 	/* Called by drm_dev_to_irq() to obtain the IRQ for the device. */
 	int (*get_irq)(struct drm_device *dev);
 	void (*free_irq)(struct drm_device *dev);
 	const char *(*get_name)(struct drm_device *dev);
 	int (*set_busid)(struct drm_device *dev, struct drm_master *master);
 	int (*set_unique)(struct drm_device *dev, struct drm_master *master,
 			  struct drm_unique *unique);
 	int (*irq_by_busid)(struct drm_device *dev, struct drm_irq_busid *p);
 	/* hooks that are for PCI */
 	int (*agp_init)(struct drm_device *dev);
 
 };
 
 /**
  * DRM driver structure. This structure represents the common code for
  * a family of cards. There will be one drm_device for each card present
  * in this family.
  */
 struct drm_driver {
 	int (*load) (struct drm_device *, unsigned long flags);
 	int (*firstopen) (struct drm_device *);
 	int (*open) (struct drm_device *, struct drm_file *);
 	void (*preclose) (struct drm_device *, struct drm_file *file_priv);
 	void (*postclose) (struct drm_device *, struct drm_file *);
 	void (*lastclose) (struct drm_device *);
 	int (*unload) (struct drm_device *);
 	int (*suspend) (struct drm_device *, pm_message_t state);
 	int (*resume) (struct drm_device *);
 	int (*dma_ioctl) (struct drm_device *dev, void *data, struct drm_file *file_priv);
 	int (*dma_quiescent) (struct drm_device *);
 	int (*context_dtor) (struct drm_device *dev, int context);
 
 	/**
 	 * get_vblank_counter - get raw hardware vblank counter
 	 * @dev: DRM device
 	 * @crtc: counter to fetch
 	 *
 	 * Driver callback for fetching a raw hardware vblank counter for @crtc.
 	 * If a device doesn't have a hardware counter, the driver can simply
 	 * return the value of drm_vblank_count. The DRM core will account for
 	 * missed vblank events while interrupts were disabled based on system
 	 * timestamps.
 	 *
 	 * Wraparound handling and loss of events due to modesetting is dealt
 	 * with in the DRM core code.
 	 *
 	 * RETURNS
 	 * Raw vblank counter value.
 	 */
 	u32 (*get_vblank_counter) (struct drm_device *dev, int crtc);
 
 	/**
 	 * enable_vblank - enable vblank interrupt events
 	 * @dev: DRM device
 	 * @crtc: which irq to enable
 	 *
 	 * Enable vblank interrupts for @crtc.  If the device doesn't have
 	 * a hardware vblank counter, this routine should be a no-op, since
 	 * interrupts will have to stay on to keep the count accurate.
 	 *
 	 * RETURNS
 	 * Zero on success, appropriate errno if the given @crtc's vblank
 	 * interrupt cannot be enabled.
 	 */
 	int (*enable_vblank) (struct drm_device *dev, int crtc);
 
 	/**
 	 * disable_vblank - disable vblank interrupt events
 	 * @dev: DRM device
 	 * @crtc: which irq to disable
 	 *
 	 * Disable vblank interrupts for @crtc.  If the device doesn't have
 	 * a hardware vblank counter, this routine should be a no-op, since
 	 * interrupts will have to stay on to keep the count accurate.
 	 */
 	void (*disable_vblank) (struct drm_device *dev, int crtc);
 
 	/**
 	 * Called by \c drm_device_is_agp.  Typically used to determine if a
 	 * card is really attached to AGP or not.
 	 *
 	 * \param dev  DRM device handle
 	 *
 	 * \returns
 	 * One of three values is returned depending on whether or not the
 	 * card is absolutely \b not AGP (return of 0), absolutely \b is AGP
 	 * (return of 1), or may or may not be AGP (return of 2).
 	 */
 	int (*device_is_agp) (struct drm_device *dev);
 
 	/**
 	 * Called by vblank timestamping code.
 	 *
 	 * Return the current display scanout position from a crtc.
 	 *
 	 * \param dev  DRM device.
 	 * \param crtc Id of the crtc to query.
 	 * \param *vpos Target location for current vertical scanout position.
 	 * \param *hpos Target location for current horizontal scanout position.
 	 *
 	 * Returns vpos as a positive number while in active scanout area.
 	 * Returns vpos as a negative number inside vblank, counting the number
 	 * of scanlines to go until end of vblank, e.g., -1 means "one scanline
 	 * until start of active scanout / end of vblank."
 	 *
 	 * \return Flags, or'ed together as follows:
 	 *
 	 * DRM_SCANOUTPOS_VALID = Query successful.
 	 * DRM_SCANOUTPOS_INVBL = Inside vblank.
 	 * DRM_SCANOUTPOS_ACCURATE = Returned position is accurate. A lack of
 	 * this flag means that returned position may be offset by a constant
 	 * but unknown small number of scanlines wrt. real scanout position.
 	 *
 	 */
 	int (*get_scanout_position) (struct drm_device *dev, int crtc,
 				     int *vpos, int *hpos);
 
 	/**
 	 * Called by \c drm_get_last_vbltimestamp. Should return a precise
 	 * timestamp when the most recent VBLANK interval ended or will end.
 	 *
 	 * Specifically, the timestamp in @vblank_time should correspond as
 	 * closely as possible to the time when the first video scanline of
 	 * the video frame after the end of VBLANK will start scanning out,
 	 * the time immediately after end of the VBLANK interval. If the
 	 * @crtc is currently inside VBLANK, this will be a time in the future.
 	 * If the @crtc is currently scanning out a frame, this will be the
 	 * past start time of the current scanout. This is meant to adhere
 	 * to the OpenML OML_sync_control extension specification.
 	 *
 	 * \param dev DRM device handle.
 	 * \param crtc crtc for which timestamp should be returned.
 	 * \param *max_error Maximum allowable timestamp error in nanoseconds.
 	 *                   Implementation should strive to provide timestamp
 	 *                   with an error of at most *max_error nanoseconds.
 	 *                   Returns true upper bound on error for timestamp.
 	 * \param *vblank_time Target location for returned vblank timestamp.
 	 * \param flags 0 = Defaults, no special treatment needed.
 	 * \param       DRM_CALLED_FROM_VBLIRQ = Function is called from vblank
 	 *	        irq handler. Some drivers need to apply some workarounds
 	 *              for gpu-specific vblank irq quirks if flag is set.
 	 *
 	 * \returns
 	 * Zero if timestamping isn't supported in current display mode or a
 	 * negative number on failure. A positive status code on success,
 	 * which describes how the vblank_time timestamp was computed.
 	 */
 	int (*get_vblank_timestamp) (struct drm_device *dev, int crtc,
 				     int *max_error,
 				     struct timeval *vblank_time,
 				     unsigned flags);
 
 	/* these have to be filled in */
 
 	irqreturn_t(*irq_handler) (DRM_IRQ_ARGS);
 	void (*irq_preinstall) (struct drm_device *dev);
 	int (*irq_postinstall) (struct drm_device *dev);
 	void (*irq_uninstall) (struct drm_device *dev);
 	void (*set_version) (struct drm_device *dev,
 			     struct drm_set_version *sv);
 
 	/* Master routines */
 	int (*master_create)(struct drm_device *dev, struct drm_master *master);
 	void (*master_destroy)(struct drm_device *dev, struct drm_master *master);
 	/**
 	 * master_set is called whenever the minor master is set.
 	 * master_drop is called whenever the minor master is dropped.
 	 */
 
 	int (*master_set)(struct drm_device *dev, struct drm_file *file_priv,
 			  bool from_open);
 	void (*master_drop)(struct drm_device *dev, struct drm_file *file_priv,
 			    bool from_release);
 
 	/**
 	 * Driver-specific constructor for drm_gem_objects, to set up
 	 * obj->driver_private.
 	 *
 	 * Returns 0 on success.
 	 */
 	int (*gem_init_object) (struct drm_gem_object *obj);
 	void (*gem_free_object) (struct drm_gem_object *obj);
 	int (*gem_open_object) (struct drm_gem_object *, struct drm_file *);
 	void (*gem_close_object) (struct drm_gem_object *, struct drm_file *);
 
 #ifdef FREEBSD_NOTYET
 	/* prime: */
 	/* export handle -> fd (see drm_gem_prime_handle_to_fd() helper) */
 	int (*prime_handle_to_fd)(struct drm_device *dev, struct drm_file *file_priv,
 				uint32_t handle, uint32_t flags, int *prime_fd);
 	/* import fd -> handle (see drm_gem_prime_fd_to_handle() helper) */
 	int (*prime_fd_to_handle)(struct drm_device *dev, struct drm_file *file_priv,
 				int prime_fd, uint32_t *handle);
 	/* export GEM -> dmabuf */
 	struct dma_buf * (*gem_prime_export)(struct drm_device *dev,
 				struct drm_gem_object *obj, int flags);
 	/* import dmabuf -> GEM */
 	struct drm_gem_object * (*gem_prime_import)(struct drm_device *dev,
 				struct dma_buf *dma_buf);
 #endif /* defined(FREEBSD_NOTYET) */
 
 	/* dumb alloc support */
 	int (*dumb_create)(struct drm_file *file_priv,
 			   struct drm_device *dev,
 			   struct drm_mode_create_dumb *args);
 	int (*dumb_map_offset)(struct drm_file *file_priv,
 			       struct drm_device *dev, uint32_t handle,
 			       uint64_t *offset);
 	int (*dumb_destroy)(struct drm_file *file_priv,
 			    struct drm_device *dev,
 			    uint32_t handle);
 
 	/* Driver private ops for this object */
 	struct cdev_pager_ops *gem_pager_ops;
 
 	/* Optional-looking sysctl hooks -- NOTE(review): confirm whether NULL is allowed */
 	int	(*sysctl_init)(struct drm_device *dev,
 		    struct sysctl_ctx_list *ctx, struct sysctl_oid *top);
 	void	(*sysctl_cleanup)(struct drm_device *dev);
 
 	/* Driver version triple and descriptive strings */
 	int major;
 	int minor;
 	int patchlevel;
 	char *name;
 	char *desc;
 	char *date;
 
 	/* Feature bit mask tested by drm_core_check_feature() */
 	u32 driver_features;
 	int dev_priv_size;
 	struct drm_ioctl_desc *ioctls;
 	int num_ioctls;
 	/* Bus callbacks; drm_dev_to_irq() dereferences bus->get_irq */
 	struct drm_bus *bus;
 #ifdef COMPAT_FREEBSD32
 	struct drm_ioctl_desc *compat_ioctls;
 	/* NOTE(review): a pointer to the count, unlike num_ioctls above -- confirm intent */
 	int *num_compat_ioctls;
 #endif
 
 	int	buf_priv_size;
 };
 
 #define DRM_MINOR_UNASSIGNED 0
 #define DRM_MINOR_LEGACY 1
 #define DRM_MINOR_CONTROL 2
 #define DRM_MINOR_RENDER 3
 
 /**
  * DRM minor structure. This structure represents a drm minor number.
  *
  * The type member takes one of the DRM_MINOR_* values defined just above
  * this structure.
  */
 struct drm_minor {
 	int index;			/**< Minor device number */
 	int type;                       /**< Control or render */
 	struct cdev *device;		/**< Device number for mknod */
 	device_t kdev;			/**< OS device */
 	struct drm_device *dev;		/**< DRM device this minor belongs to */
 
 	struct drm_master *master; /* currently active master for this node */
 	struct list_head master_list;	/* masters of this minor, linked via drm_master::head */
 	struct drm_mode_group mode_group;
 
 	struct sigio *buf_sigio;	/* Processes waiting for SIGIO     */
 };
 
 /*
  * Mode specified on the command line.
  *
  * Passed to drm_mode_parse_command_line_for_connector() and to
  * drm_mode_create_from_cmdline_mode(), both declared later in this header.
  * The *_specified flags sit next to the values they qualify
  * (NOTE(review): presumably set only when the option string carried that
  * value -- confirm against the parser).
  */
 struct drm_cmdline_mode {
 	bool specified;
 	bool refresh_specified;
 	bool bpp_specified;
 	int xres, yres;
 	int bpp;
 	int refresh;
 	bool rb;
 	bool interlace;
 	bool cvt;
 	bool margins;
 	enum drm_connector_force force;
 };
 
 
 /*
  * A pending vblank event: the generic pending-event header, followed by a
  * pipe number and the vblank event payload.  Instances are handed to
  * drm_send_vblank_event(), declared later in this header.
  */
 struct drm_pending_vblank_event {
 	struct drm_pending_event base;
 	int pipe;
 	struct drm_event_vblank event;
 };
 
 /**
  * DRM device structure. This structure represents a complete card that
  * may contain multiple heads.
  */
 struct drm_device {
 	int if_version;			/**< Highest interface version set */
 
 	/** \name Locks */
 	/*@{ */
 	struct mtx count_lock;		/**< For inuse, drm_device::open_count, drm_device::buf_use */
 	struct sx dev_struct_lock;	/**< For others */
 	/*@} */
 
 	/** \name Usage Counters */
 	/*@{ */
 	int open_count;			/**< Outstanding files open */
 	atomic_t ioctl_count;		/**< Outstanding IOCTLs pending */
 	atomic_t vma_count;		/**< Outstanding vma areas open */
 	int buf_use;			/**< Buffers in use -- cannot alloc */
 	atomic_t buf_alloc;		/**< Buffer allocation in progress */
 	/*@} */
 
 	/** \name Performance counters */
 	/*@{ */
 	unsigned long counters;
 	enum drm_stat_type types[15];
 	atomic_t counts[15];
 	/*@} */
 
 	struct list_head filelist;
 
 	/** \name Memory management */
 	/*@{ */
 	struct list_head maplist;	/**< Linked list of regions */
 	int map_count;			/**< Number of mappable regions */
 	struct drm_open_hash map_hash;	/**< User token hash table for maps */
 	/*@} */
 
 	/** \name Context handle management */
 	/*@{ */
 	struct list_head ctxlist;	/**< Linked list of context handles */
 	int ctx_count;			/**< Number of context handles */
 	struct mtx ctxlist_mutex;	/**< For ctxlist */
 	drm_local_map_t **context_sareas;
 	int max_context;
 	unsigned long *ctx_bitmap;
 
 	/*@} */
 
 	/** \name DMA support */
 	/*@{ */
 	struct drm_device_dma *dma;		/**< Optional pointer for DMA support */
 	/*@} */
 
 	/** \name Context support */
 	/*@{ */
 	int irq_enabled;		/**< True if irq handler is enabled */
 	atomic_t context_flag;		/**< Context swapping flag */
 	atomic_t interrupt_flag;	/**< Interruption handler flag */
 	atomic_t dma_flag;		/**< DMA dispatch flag */
 	wait_queue_head_t context_wait;	/**< Processes waiting on ctx switch */
 	int last_checked;		/**< Last context checked for DMA */
 	int last_context;		/**< Last current context */
 	unsigned long last_switch;	/**< jiffies at last context switch */
 	/*@} */
 
 	/** \name VBLANK IRQ support */
 	/*@{ */
 
 	/*
 	 * At load time, disabling the vblank interrupt won't be allowed since
 	 * old clients may not call the modeset ioctl and therefore misbehave.
 	 * Once the modeset ioctl *has* been called though, we can safely
 	 * disable them when unused.
 	 */
 	int vblank_disable_allowed;
 
 	atomic_t *_vblank_count;        /**< number of VBLANK interrupts (driver must alloc the right number of counters) */
 	struct timeval *_vblank_time;   /**< timestamp of current vblank_count (drivers must alloc right number of fields) */
 	struct mtx vblank_time_lock;    /**< Protects vblank count and time updates during vblank enable/disable */
 	struct mtx vbl_lock;
 	atomic_t *vblank_refcount;      /* number of users of vblank interrupts per crtc */
 	u32 *last_vblank;               /* protected by dev->vbl_lock, used */
 					/* for wraparound handling */
 	int *vblank_enabled;            /* so we don't call enable more than
 					   once per disable */
 	int *vblank_inmodeset;          /* Display driver is setting mode */
 	u32 *last_vblank_wait;		/* Last vblank seqno waited per CRTC */
 	struct callout vblank_disable_callout;
 
 	u32 max_vblank_count;           /**< size of vblank counter register */
 
 	/**
 	 * List of events
 	 */
 	struct list_head vblank_event_list;
 	struct mtx event_lock;
 
 	/*@} */
 
 	struct drm_agp_head *agp;	/**< AGP data */
 
 	device_t dev;			/* Device instance from newbus */
 	uint16_t pci_device;		/* PCI device id */
 	uint16_t pci_vendor;		/* PCI vendor id */
 	uint16_t pci_subdevice;		/* PCI subsystem device id */
 	uint16_t pci_subvendor;		/* PCI subsystem vendor id */
 
 	struct drm_sg_mem *sg;	/**< Scatter gather memory */
 	unsigned int num_crtcs;                  /**< Number of CRTCs on this device */
 	void *dev_private;		/**< device private data */
 	void *mm_private;
 	struct drm_sigdata sigdata;	   /**< For block_all_signals */
 	sigset_t sigmask;
 
 	struct drm_driver *driver;	/**< Driver callbacks and feature mask */
 	struct drm_local_map *agp_buffer_map;
 	unsigned int agp_buffer_token;
 	struct drm_minor *control;		/**< Control node for card */
 	struct drm_minor *primary;		/**< render type primary screen head */
 
         struct drm_mode_config mode_config;	/**< Current mode config */
 
 	/** \name GEM information */
 	/*@{ */
 	struct sx object_name_lock;
 	struct drm_gem_names object_names;
 	/*@} */
 	int switch_power_state;		/* one of the DRM_SWITCH_POWER_* values defined below */
 
 	atomic_t unplugged; /* device has been unplugged or gone away */
 
 				/* Locks */
 	struct mtx	  dma_lock;	/* protects dev->dma */
 	struct mtx	  irq_lock;	/* protects irq condition checks */
 
 				/* Context support */
 	int		  irq;		/* Interrupt used by board	   */
 	int		  msi_enabled;	/* MSI enabled */
 	int		  irqrid;	/* Interrupt used by board */
 	struct resource   *irqr;	/* Resource for interrupt used by board	   */
 	void		  *irqh;	/* Handle from bus_setup_intr      */
 
 	/* Storage of resource pointers for drm_get_resource_* */
 #define	DRM_MAX_PCI_RESOURCE	6
 	struct resource   *pcir[DRM_MAX_PCI_RESOURCE];
 	int		  pcirid[DRM_MAX_PCI_RESOURCE];
 	struct mtx	  pcir_lock;
 
 	int		  pci_domain;
 	int		  pci_bus;
 	int		  pci_slot;
 	int		  pci_func;
 
 				/* Sysctl support */
 	struct drm_sysctl_info *sysctl;
 	int		  sysctl_node_idx;
 
 	void		  *drm_ttm_bdev;
 
 	void *sysctl_private;
 	char busid_str[128];
 	int modesetting;
 
 	const drm_pci_id_list_t *id_entry;	/* PCI ID, name, and chipset private */
 };
 
 #define DRM_SWITCH_POWER_ON 0
 #define DRM_SWITCH_POWER_OFF 1
 #define DRM_SWITCH_POWER_CHANGING 2
 
 /**
  * Test whether the driver bound to a device advertises a feature.
  *
  * \param dev      DRM device; its driver->driver_features mask is consulted.
  * \param feature  Feature bit(s) to test against that mask.
  * \return 1 if any requested bit is set in the mask, 0 otherwise.
  */
 static __inline__ int drm_core_check_feature(struct drm_device *dev,
 					     int feature)
 {
 	if ((dev->driver->driver_features & feature) != 0)
 		return (1);
 	return (0);
 }
 
 static inline int drm_dev_to_irq(struct drm_device *dev)
 {
 	return dev->driver->bus->get_irq(dev);
 }
 
 
 #if __OS_HAS_AGP
 /* Report whether the driver bound to dev sets the DRIVER_USE_AGP feature. */
 static inline int drm_core_has_AGP(struct drm_device *dev)
 {
 	const int uses_agp = drm_core_check_feature(dev, DRIVER_USE_AGP);
 
 	return (uses_agp);
 }
 #else
 /* Built without AGP: constant 0; the argument is not evaluated. */
 #define drm_core_has_AGP(dev) (0)
 #endif
 
 #if __OS_HAS_MTRR
 /* Report whether the driver bound to dev sets the DRIVER_USE_MTRR feature. */
 static inline int drm_core_has_MTRR(struct drm_device *dev)
 {
 	const int uses_mtrr = drm_core_check_feature(dev, DRIVER_USE_MTRR);
 
 	return (uses_mtrr);
 }
 
 #define DRM_MTRR_WC		MDF_WRITECOMBINE
 
 int drm_mtrr_add(unsigned long offset, unsigned long size, unsigned int flags);
 int drm_mtrr_del(int handle, unsigned long offset, unsigned long size, unsigned int flags);
 
 #else
 /* Built without MTRR: the feature test is constant 0 and the helpers are stubs. */
 #define drm_core_has_MTRR(dev) (0)
 
 #define DRM_MTRR_WC		0
 
 /* Stub: ignores every argument and returns 0. */
 static inline int drm_mtrr_add(unsigned long offset, unsigned long size,
 			       unsigned int flags)
 {
 	(void)offset;
 	(void)size;
 	(void)flags;
 	return (0);
 }
 
 /* Stub: ignores every argument and returns 0. */
 static inline int drm_mtrr_del(int handle, unsigned long offset,
 			       unsigned long size, unsigned int flags)
 {
 	(void)handle;
 	(void)offset;
 	(void)size;
 	(void)flags;
 	return (0);
 }
 #endif
 
 /******************************************************************/
 /** \name Internal function definitions */
 /*@{*/
 
 				/* Driver support (drm_drv.h) */
 d_ioctl_t drm_ioctl;
 extern int drm_lastclose(struct drm_device *dev);
 
 				/* Device support (drm_fops.h) */
 extern struct sx drm_global_mutex;
 d_open_t drm_open;
 d_read_t drm_read;
 extern void drm_release(void *data);
 
 				/* Mapping support (drm_vm.h) */
 d_mmap_t drm_mmap;
 int	drm_mmap_single(struct cdev *kdev, vm_ooffset_t *offset,
 	    vm_size_t size, struct vm_object **obj_res, int nprot);
 d_poll_t drm_poll;
 
 
 				/* Misc. IOCTL support (drm_ioctl.h) */
 extern int drm_irq_by_busid(struct drm_device *dev, void *data,
 			    struct drm_file *file_priv);
 extern int drm_getunique(struct drm_device *dev, void *data,
 			 struct drm_file *file_priv);
 extern int drm_setunique(struct drm_device *dev, void *data,
 			 struct drm_file *file_priv);
 extern int drm_getmap(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
 extern int drm_getclient(struct drm_device *dev, void *data,
 			 struct drm_file *file_priv);
 extern int drm_getstats(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 extern int drm_getcap(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
 extern int drm_setversion(struct drm_device *dev, void *data,
 			  struct drm_file *file_priv);
 extern int drm_noop(struct drm_device *dev, void *data,
 		    struct drm_file *file_priv);
 
 				/* Context IOCTL support (drm_context.h) */
 extern int drm_resctx(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
 extern int drm_addctx(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
 extern int drm_modctx(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
 extern int drm_getctx(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
 extern int drm_switchctx(struct drm_device *dev, void *data,
 			 struct drm_file *file_priv);
 extern int drm_newctx(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
 extern int drm_rmctx(struct drm_device *dev, void *data,
 		     struct drm_file *file_priv);
 
 extern int drm_ctxbitmap_init(struct drm_device *dev);
 extern void drm_ctxbitmap_cleanup(struct drm_device *dev);
 extern void drm_ctxbitmap_free(struct drm_device *dev, int ctx_handle);
 
 extern int drm_setsareactx(struct drm_device *dev, void *data,
 			   struct drm_file *file_priv);
 extern int drm_getsareactx(struct drm_device *dev, void *data,
 			   struct drm_file *file_priv);
 
 				/* Authentication IOCTL support (drm_auth.h) */
 extern int drm_getmagic(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 extern int drm_authmagic(struct drm_device *dev, void *data,
 			 struct drm_file *file_priv);
 extern int drm_remove_magic(struct drm_master *master, drm_magic_t magic);
 
 /* Cache management (drm_cache.c) */
 void drm_clflush_pages(vm_page_t *pages, unsigned long num_pages);
 void drm_clflush_virt_range(char *addr, unsigned long length);
 
 				/* Locking IOCTL support (drm_lock.h) */
 extern int drm_lock(struct drm_device *dev, void *data,
 		    struct drm_file *file_priv);
 extern int drm_unlock(struct drm_device *dev, void *data,
 		      struct drm_file *file_priv);
 extern int drm_lock_free(struct drm_lock_data *lock_data, unsigned int context);
 extern void drm_idlelock_take(struct drm_lock_data *lock_data);
 extern void drm_idlelock_release(struct drm_lock_data *lock_data);
 
 /*
  * These are exported to drivers so that they can implement fencing using
  * DMA quiescent + idle. DMA quiescent usually requires the hardware lock.
  */
 
 extern int drm_i_have_hw_lock(struct drm_device *dev, struct drm_file *file_priv);
 
 				/* Buffer management support (drm_bufs.h) */
 extern int drm_addbufs_agp(struct drm_device *dev, struct drm_buf_desc * request);
 extern int drm_addbufs_pci(struct drm_device *dev, struct drm_buf_desc * request);
 extern int drm_addmap(struct drm_device *dev, resource_size_t offset,
 		      unsigned int size, enum drm_map_type type,
 		      enum drm_map_flags flags, struct drm_local_map **map_ptr);
 extern int drm_addmap_ioctl(struct drm_device *dev, void *data,
 			    struct drm_file *file_priv);
 extern int drm_rmmap(struct drm_device *dev, struct drm_local_map *map);
 extern int drm_rmmap_locked(struct drm_device *dev, struct drm_local_map *map);
 extern int drm_rmmap_ioctl(struct drm_device *dev, void *data,
 			   struct drm_file *file_priv);
 extern int drm_addbufs(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv);
 extern int drm_infobufs(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 extern int drm_markbufs(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 extern int drm_freebufs(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 extern int drm_mapbufs(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv);
 extern int drm_order(unsigned long size);
 
 				/* DMA support (drm_dma.h) */
 extern int drm_dma_setup(struct drm_device *dev);
 extern void drm_dma_takedown(struct drm_device *dev);
 extern void drm_free_buffer(struct drm_device *dev, struct drm_buf * buf);
 extern void drm_core_reclaim_buffers(struct drm_device *dev,
 				     struct drm_file *filp);
 
 				/* IRQ support (drm_irq.h) */
 extern int drm_control(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv);
 extern int drm_irq_install(struct drm_device *dev);
 extern int drm_irq_uninstall(struct drm_device *dev);
 
 extern int drm_vblank_init(struct drm_device *dev, int num_crtcs);
 extern int drm_wait_vblank(struct drm_device *dev, void *data,
 			   struct drm_file *filp);
 extern int drm_vblank_wait(struct drm_device *dev, unsigned int *vbl_seq);
 extern u32 drm_vblank_count(struct drm_device *dev, int crtc);
 extern u32 drm_vblank_count_and_time(struct drm_device *dev, int crtc,
 				     struct timeval *vblanktime);
 extern void drm_send_vblank_event(struct drm_device *dev, int crtc,
 				     struct drm_pending_vblank_event *e);
 extern bool drm_handle_vblank(struct drm_device *dev, int crtc);
 extern int drm_vblank_get(struct drm_device *dev, int crtc);
 extern void drm_vblank_put(struct drm_device *dev, int crtc);
 extern void drm_vblank_off(struct drm_device *dev, int crtc);
 extern void drm_vblank_cleanup(struct drm_device *dev);
 extern u32 drm_get_last_vbltimestamp(struct drm_device *dev, int crtc,
 				     struct timeval *tvblank, unsigned flags);
 extern int drm_calc_vbltimestamp_from_scanoutpos(struct drm_device *dev,
 						 int crtc, int *max_error,
 						 struct timeval *vblank_time,
 						 unsigned flags,
 						 struct drm_crtc *refcrtc);
 extern void drm_calc_timestamping_constants(struct drm_crtc *crtc);
 
 extern bool
 drm_mode_parse_command_line_for_connector(const char *mode_option,
 					  struct drm_connector *connector,
 					  struct drm_cmdline_mode *mode);
 
 extern struct drm_display_mode *
 drm_mode_create_from_cmdline_mode(struct drm_device *dev,
 				  struct drm_cmdline_mode *cmd);
 
 /* Modesetting support */
 extern void drm_vblank_pre_modeset(struct drm_device *dev, int crtc);
 extern void drm_vblank_post_modeset(struct drm_device *dev, int crtc);
 extern int drm_modeset_ctl(struct drm_device *dev, void *data,
 			   struct drm_file *file_priv);
 
 
 				/* Stub support (drm_stub.h) */
 extern int drm_setmaster_ioctl(struct drm_device *dev, void *data,
 			       struct drm_file *file_priv);
 extern int drm_dropmaster_ioctl(struct drm_device *dev, void *data,
 				struct drm_file *file_priv);
 struct drm_master *drm_master_create(struct drm_minor *minor);
 extern struct drm_master *drm_master_get(struct drm_master *master);
 extern void drm_master_put(struct drm_master **master);
 
 extern void drm_put_dev(struct drm_device *dev);
 extern int drm_put_minor(struct drm_minor **minor);
 extern void drm_unplug_dev(struct drm_device *dev);
 extern unsigned int drm_debug;
 extern unsigned int drm_notyet;
 
 extern unsigned int drm_vblank_offdelay;
 extern unsigned int drm_timestamp_precision;
 extern unsigned int drm_timestamp_monotonic;
 
 extern struct drm_local_map *drm_getsarea(struct drm_device *dev);
 
 
 #ifdef FREEBSD_NOTYET
 extern int drm_gem_prime_handle_to_fd(struct drm_device *dev,
 		struct drm_file *file_priv, uint32_t handle, uint32_t flags,
 		int *prime_fd);
 extern int drm_gem_prime_fd_to_handle(struct drm_device *dev,
 		struct drm_file *file_priv, int prime_fd, uint32_t *handle);
 
 extern int drm_prime_handle_to_fd_ioctl(struct drm_device *dev, void *data,
 					struct drm_file *file_priv);
 extern int drm_prime_fd_to_handle_ioctl(struct drm_device *dev, void *data,
 					struct drm_file *file_priv);
 
 extern int drm_prime_sg_to_page_addr_arrays(struct sg_table *sgt, vm_page_t *pages,
 					    dma_addr_t *addrs, int max_pages);
 extern struct sg_table *drm_prime_pages_to_sg(vm_page_t *pages, int nr_pages);
 extern void drm_prime_gem_destroy(struct drm_gem_object *obj, struct sg_table *sg);
 
 
 void drm_prime_init_file_private(struct drm_prime_file_private *prime_fpriv);
 void drm_prime_destroy_file_private(struct drm_prime_file_private *prime_fpriv);
 int drm_prime_add_imported_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf, uint32_t handle);
 int drm_prime_lookup_imported_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf, uint32_t *handle);
 void drm_prime_remove_imported_buf_handle(struct drm_prime_file_private *prime_fpriv, struct dma_buf *dma_buf);
 
 int drm_prime_add_dma_buf(struct drm_device *dev, struct drm_gem_object *obj);
 int drm_prime_lookup_obj(struct drm_device *dev, struct dma_buf *buf,
 			 struct drm_gem_object **obj);
 #endif /* FREEBSD_NOTYET */
 
 				/* Scatter Gather Support (drm_scatter.h) */
 extern void drm_sg_cleanup(struct drm_sg_mem * entry);
 extern int drm_sg_alloc_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 extern int drm_sg_alloc(struct drm_device *dev, struct drm_scatter_gather * request);
 extern int drm_sg_free(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv);
 
 			       /* ATI PCIGART support (ati_pcigart.h) */
 extern int drm_ati_pcigart_init(struct drm_device *dev,
 				struct drm_ati_pcigart_info * gart_info);
 extern int drm_ati_pcigart_cleanup(struct drm_device *dev,
 				   struct drm_ati_pcigart_info * gart_info);
 
 extern drm_dma_handle_t *drm_pci_alloc(struct drm_device *dev, size_t size,
 				       size_t align, dma_addr_t maxaddr);
 extern void __drm_pci_free(struct drm_device *dev, drm_dma_handle_t * dmah);
 extern void drm_pci_free(struct drm_device *dev, drm_dma_handle_t * dmah);
 
 /* Graphics Execution Manager library functions (drm_gem.c) */
 int drm_gem_init(struct drm_device *dev);
 void drm_gem_destroy(struct drm_device *dev);
 void drm_gem_object_release(struct drm_gem_object *obj);
 void drm_gem_object_free(struct drm_gem_object *obj);
 struct drm_gem_object *drm_gem_object_alloc(struct drm_device *dev,
 					    size_t size);
 int drm_gem_object_init(struct drm_device *dev,
 			struct drm_gem_object *obj, size_t size);
 int drm_gem_private_object_init(struct drm_device *dev,
 			struct drm_gem_object *obj, size_t size);
 void drm_gem_object_handle_free(struct drm_gem_object *obj);
 int drm_gem_mmap_single(struct drm_device *dev, vm_ooffset_t *offset,
     vm_size_t size, struct vm_object **obj_res, int nprot);
 void drm_gem_pager_dtr(void *obj);
 
 #include <dev/drm2/drm_global.h>
 
 static inline void
 drm_gem_object_reference(struct drm_gem_object *obj)
 {
 
 	KASSERT(obj->refcount > 0, ("Dangling obj %p", obj));
 	refcount_acquire(&obj->refcount);
 }
 
 /*
  * Drop one reference on a GEM object; a NULL obj is ignored.
  *
  * When refcount_release() reports that this was the last reference, the
  * object is handed to drm_gem_object_free().
  */
 static inline void
 drm_gem_object_unreference(struct drm_gem_object *obj)
 {
 
 	if (obj != NULL && refcount_release(&obj->refcount))
 		drm_gem_object_free(obj);
 }
 
 /*
  * Drop one reference on a GEM object, wrapping the release in
  * DRM_LOCK()/DRM_UNLOCK() on the object's device.  A NULL obj is ignored.
  */
 static inline void
 drm_gem_object_unreference_unlocked(struct drm_gem_object *obj)
 {
 	struct drm_device *dev;
 
 	if (obj == NULL)
 		return;
 
 	/* Read the device first: the unreference below may free obj. */
 	dev = obj->dev;
 	DRM_LOCK(dev);
 	drm_gem_object_unreference(obj);
 	DRM_UNLOCK(dev);
 }
 
 int drm_gem_handle_create(struct drm_file *file_priv,
 			  struct drm_gem_object *obj,
 			  u32 *handlep);
 int drm_gem_handle_delete(struct drm_file *filp, u32 handle);
 
 static inline void
 drm_gem_object_handle_reference(struct drm_gem_object *obj)
 {
 	drm_gem_object_reference(obj);
 	atomic_inc(&obj->handle_count);
 }
 
 /*
  * Drop one handle on a GEM object together with the reference that handle
  * held.  Does nothing for a NULL obj or when the handle count is already 0.
  */
 static inline void
 drm_gem_object_handle_unreference(struct drm_gem_object *obj)
 {
 	if (obj == NULL || atomic_read(&obj->handle_count) == 0)
 		return;
 
 	/*
 	 * The handle count has to be adjusted before the reference is
 	 * dropped: this may be the last ref, in which case the object
 	 * would disappear before we checked for a name.
 	 */
 	if (atomic_dec_and_test(&obj->handle_count)) {
 		drm_gem_object_handle_free(obj);
 	}
 	drm_gem_object_unreference(obj);
 }
 
 /*
  * Same as drm_gem_object_handle_unreference(), except that the final
  * reference drop goes through drm_gem_object_unreference_unlocked().
  * Does nothing for a NULL obj or when the handle count is already 0.
  */
 static inline void
 drm_gem_object_handle_unreference_unlocked(struct drm_gem_object *obj)
 {
 	if (obj == NULL || atomic_read(&obj->handle_count) == 0)
 		return;
 
 	/*
 	 * The handle count has to be adjusted before the reference is
 	 * dropped: this may be the last ref, in which case the object
 	 * would disappear before we checked for a name.
 	 */
 	if (atomic_dec_and_test(&obj->handle_count)) {
 		drm_gem_object_handle_free(obj);
 	}
 	drm_gem_object_unreference_unlocked(obj);
 }
 
 void drm_gem_free_mmap_offset(struct drm_gem_object *obj);
 int drm_gem_create_mmap_offset(struct drm_gem_object *obj);
 
 struct drm_gem_object *drm_gem_object_lookup(struct drm_device *dev,
 					     struct drm_file *filp,
 					     u32 handle);
 int drm_gem_close_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 int drm_gem_flink_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 int drm_gem_open_ioctl(struct drm_device *dev, void *data,
 		       struct drm_file *file_priv);
 void drm_gem_open(struct drm_device *dev, struct drm_file *file_private);
 void drm_gem_release(struct drm_device *dev, struct drm_file *file_private);
 
 extern void drm_core_ioremap(struct drm_local_map *map, struct drm_device *dev);
 extern void drm_core_ioremap_wc(struct drm_local_map *map, struct drm_device *dev);
 extern void drm_core_ioremapfree(struct drm_local_map *map, struct drm_device *dev);
 
 static __inline__ struct drm_local_map *drm_core_findmap(struct drm_device *dev,
 							 unsigned int token)
 {
 	struct drm_map_list *_entry;
 	list_for_each_entry(_entry, &dev->maplist, head)
 	    if (_entry->user_token == token)
 		return _entry->map;
 	return NULL;
 }
 
 /* No-op: the body is empty, so dropping a map does no work here. */
 static __inline__ void drm_core_dropmap(struct drm_local_map *map)
 {
 }
 
 #include <dev/drm2/drm_mem_util.h>
 
 extern int drm_fill_in_dev(struct drm_device *dev,
 			   struct drm_driver *driver);
 extern void drm_cancel_fill_in_dev(struct drm_device *dev);
 int drm_get_minor(struct drm_device *dev, struct drm_minor **minor, int type);
 /*@}*/
 
 /* PCI section */
 int drm_pci_device_is_agp(struct drm_device *dev);
 int drm_pci_device_is_pcie(struct drm_device *dev);
 
 extern int drm_get_pci_dev(device_t kdev, struct drm_device *dev,
 			   struct drm_driver *driver);
 
 #define DRM_PCIE_SPEED_25 1
 #define DRM_PCIE_SPEED_50 2
 #define DRM_PCIE_SPEED_80 4
 
 extern int drm_pcie_get_speed_cap_mask(struct drm_device *dev, u32 *speed_mask);
 
 #define	drm_can_sleep()	(DRM_HZ & 1)
 
 /* Platform section */
 int drm_get_platform_dev(device_t kdev, struct drm_device *dev,
 			 struct drm_driver *driver);
 
 /* FreeBSD specific -- should be moved to drm_os_freebsd.h */
 
 /*
  * Encoding of GEM mapping offsets in a 64-bit value:
  *   - DRM_GEM_MAPPING_MASK selects bits 63:62, and DRM_GEM_MAPPING_KEY is the
  *     value 2 in that two-bit field (bit 63 set, bit 62 clear);
  *   - the index is a 22-bit field (DRM_GEM_MAX_IDX) starting at bit 40,
  *     extracted by DRM_GEM_MAPPING_IDX() and positioned by
  *     DRM_GEM_MAPPING_OFF();
  *   - DRM_GEM_MAPPING_MAPOFF() clears the index field and the key bit.
  */
 #define	DRM_GEM_MAPPING_MASK	(3ULL << 62)
 #define	DRM_GEM_MAPPING_KEY	(2ULL << 62) /* Non-canonical address form */
 #define	DRM_GEM_MAX_IDX		0x3fffff
 #define	DRM_GEM_MAPPING_IDX(o)	(((o) >> 40) & DRM_GEM_MAX_IDX)
 #define	DRM_GEM_MAPPING_OFF(i)	(((uint64_t)(i)) << 40)
 #define	DRM_GEM_MAPPING_MAPOFF(o) \
     ((o) & ~(DRM_GEM_MAPPING_OFF(DRM_GEM_MAX_IDX) | DRM_GEM_MAPPING_KEY))
 
 SYSCTL_DECL(_hw_drm);
 
 #define DRM_DEV_MODE	(S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP)
 #define DRM_DEV_UID	UID_ROOT
 #define DRM_DEV_GID	GID_VIDEO
 
 /*
  * Wait-channel helpers.
  *
  * DRM_WAKEUP() casts its argument to void * before handing it to wakeup();
  * the argument is parenthesized so that an expression argument such as
  * "p + 1" is cast as a whole rather than only its first operand.
  * DRM_WAKEUP_INT() passes its argument to wakeup() unchanged.
  * DRM_INIT_WAITQUEUE() only evaluates its argument and otherwise does
  * nothing.
  */
 #define DRM_WAKEUP(w)		wakeup((void *)(w))
 #define DRM_WAKEUP_INT(w)	wakeup(w)
 #define DRM_INIT_WAITQUEUE(queue) do {(void)(queue);} while (0)
 
 /*
  * Thread and lock shims.  The DRM_SPIN* names expand to mtx(9) calls on a
  * struct mtx that DRM_SPININIT() initialises with MTX_DEF.
  */
 #define DRM_CURPROC		curthread
 #define DRM_STRUCTPROC		struct thread
 #define DRM_SPINTYPE		struct mtx
 #define DRM_SPININIT(l,name)	mtx_init(l, name, NULL, MTX_DEF)
 #define DRM_SPINUNINIT(l)	mtx_destroy(l)
 #define DRM_SPINLOCK(l)		mtx_lock(l)
 #define DRM_SPINUNLOCK(u)	mtx_unlock(u)
 /* irqflags is evaluated and discarded; only the mutex is taken. */
 #define DRM_SPINLOCK_IRQSAVE(l, irqflags) do {		\
 	mtx_lock(l);					\
 	(void)irqflags;					\
 } while (0)
 /* irqflags is not used at all by the unlock side. */
 #define DRM_SPINUNLOCK_IRQRESTORE(u, irqflags) mtx_unlock(u)
 #define DRM_SPINLOCK_ASSERT(l)	mtx_assert(l, MA_OWNED)
 /* Sleep on 'chan' with the device's dev_struct_lock sx lock as interlock. */
 #define	DRM_LOCK_SLEEP(dev, chan, flags, msg, timeout)			\
     (sx_sleep((chan), &(dev)->dev_struct_lock, (flags), (msg), (timeout)))
 /* The lock assertions expand to nothing unless INVARIANTS is defined. */
 #if defined(INVARIANTS)
 #define	DRM_LOCK_ASSERT(dev)	sx_assert(&(dev)->dev_struct_lock, SA_XLOCKED)
 #define	DRM_UNLOCK_ASSERT(dev)	sx_assert(&(dev)->dev_struct_lock, SA_UNLOCKED)
 #else
 #define	DRM_LOCK_ASSERT(d)
 #define	DRM_UNLOCK_ASSERT(d)
 #endif
 
 #define DRM_SYSCTL_HANDLER_ARGS	(SYSCTL_HANDLER_ARGS)
 
 /* Three-way AGP classification: definitely not, definitely, or possibly. */
 enum {
 	DRM_IS_NOT_AGP,
 	DRM_IS_AGP,
 	DRM_MIGHT_BE_AGP
 };
 
 /*
  * User-memory access shims.
  *
  * DRM_VERIFYAREA_READ() is the negation of useracc(..., VM_PROT_READ), so it
  * is non-zero when the check fails.
  */
 #define DRM_VERIFYAREA_READ( uaddr, size )		\
 	(!useracc(__DECONST(caddr_t, uaddr), size, VM_PROT_READ))
 
 /*
  * The copy macros take their arguments in (destination, source, size) order
  * and forward them to copyout()/copyin(), which take the source first.
  */
 #define DRM_COPY_TO_USER(user, kern, size) \
 	copyout(kern, user, size)
 #define DRM_COPY_FROM_USER(kern, user, size) \
 	copyin(user, kern, size)
 #define DRM_COPY_FROM_USER_UNCHECKED(arg1, arg2, arg3) 	\
 	copyin(arg2, arg1, arg3)
 #define DRM_COPY_TO_USER_UNCHECKED(arg1, arg2, arg3)	\
 	copyout(arg2, arg1, arg3)
 /*
  * Stores the result of fuword32() in 'val' and always evaluates to 0.
  * NOTE(review): a fuword32() failure is therefore not reported to the caller
  * through the macro's value -- confirm that callers do not rely on it.
  */
 #define DRM_GET_USER_UNCHECKED(val, uaddr)		\
 	((val) = fuword32(uaddr), 0)
 
 /* Fetch the entry for context '_ctx' from the device's context_sareas[]. */
 #define DRM_GET_PRIV_SAREA(_dev, _ctx, _map) do {	\
 	(_map) = (_dev)->context_sareas[_ctx];		\
 } while(0)
 
 /*
  * Wait until 'condition' becomes true or a sleep on 'queue' fails.
  *
  * Returns -errno to shared code through 'ret': 0 once the condition holds,
  * otherwise the negated mtx_sleep() error, with -ERESTART translated to
  * -ERESTARTSYS.
  *
  * The macro uses a variable named 'dev' from the caller's scope: the DRM
  * lock is dropped around the sleep and re-taken afterwards, and
  * dev->irq_lock is held while the condition is re-checked and slept on.
  * 'condition' is evaluated more than once per iteration.
  */
 #define DRM_WAIT_ON( ret, queue, timeout, condition )		\
 for ( ret = 0 ; !ret && !(condition) ; ) {			\
 	DRM_UNLOCK(dev);					\
 	mtx_lock(&dev->irq_lock);				\
 	if (!(condition)) {					\
 		ret = -mtx_sleep(&(queue), &dev->irq_lock,	\
 		    PCATCH, "drmwtq", (timeout));		\
 		if (ret == -ERESTART)				\
 			ret = -ERESTARTSYS;			\
 	}							\
 	mtx_unlock(&dev->irq_lock);				\
 	DRM_LOCK(dev);						\
 }
 
 /*
  * Logging shims built on device_printf().  Each one prepends a fixed
  * severity prefix ("error: ", "warning: ", "info: ", "debug: ") to the
  * caller's format string.  dev_dbg() prints only when the DRM_DEBUGBITS_KMS
  * bit is set in drm_debug.
  */
 #define	dev_err(dev, fmt, ...)						\
 	device_printf((dev), "error: " fmt, ## __VA_ARGS__)
 #define	dev_warn(dev, fmt, ...)						\
 	device_printf((dev), "warning: " fmt, ## __VA_ARGS__)
 #define	dev_info(dev, fmt, ...)						\
 	device_printf((dev), "info: " fmt, ## __VA_ARGS__)
 #define	dev_dbg(dev, fmt, ...) do {					\
 	if ((drm_debug& DRM_DEBUGBITS_KMS) != 0) {			\
 		device_printf((dev), "debug: " fmt, ## __VA_ARGS__);	\
 	}								\
 } while (0)
 
 /*
  * One entry of an MSI blacklist, identified by a vendor/device pair.
  * NOTE(review): presumably PCI vendor and device IDs -- confirm against the
  * code that consumes this table.
  */
 struct drm_msi_blacklist_entry
 {
 	int vendor;
 	int device;
 };
 
 /*
  * Vblank bookkeeping: a wait queue, an interrupt counter, a user reference
  * count, the last counter value used for wraparound handling, and the
  * enable/modeset state flags (see the per-field comments).
  */
 struct drm_vblank_info {
 	wait_queue_head_t queue;	/* vblank wait queue */
 	atomic_t count;			/* number of VBLANK interrupts */
 					/* (driver must alloc the right number of counters) */
 	atomic_t refcount;		/* number of users of vblank interrupts */
 	u32 last;			/* protected by dev->vbl_lock, used */
 					/* for wraparound handling */
 	int enabled;			/* so we don't call enable more than */
 					/* once per disable */
 	int inmodeset;			/* Display driver is setting mode */
 };
 
 /*
  * Mask with the low 'n' bits set.  n == 64 is special-cased because shifting
  * a 64-bit value by 64 is undefined in C.  Only defined here when no earlier
  * header has provided DMA_BIT_MASK.
  */
 #ifndef DMA_BIT_MASK
 #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : (1ULL<<(n)) - 1)
 #endif
 
 /*
  * Bits 63:32 of 'n' as a u32.  The shift is done as two 16-bit steps so the
  * macro stays well defined when 'n' is only 32 bits wide (a single shift by
  * 32 would then be undefined).
  */
 #define upper_32_bits(n) ((u32)(((n) >> 16) >> 16))
 
 /*
  * Identifiers for DMI string fields.  DMI_NONE is 0; DMI_STRING_MAX is the
  * last enumerator and therefore one greater than the last field identifier.
  */
 enum dmi_field {
         DMI_NONE,
         DMI_BIOS_VENDOR,
         DMI_BIOS_VERSION,
         DMI_BIOS_DATE,
         DMI_SYS_VENDOR,
         DMI_PRODUCT_NAME,
         DMI_PRODUCT_VERSION,
         DMI_PRODUCT_SERIAL,
         DMI_PRODUCT_UUID,
         DMI_BOARD_VENDOR,
         DMI_BOARD_NAME,
         DMI_BOARD_VERSION,
         DMI_BOARD_SERIAL,
         DMI_BOARD_ASSET_TAG,
         DMI_CHASSIS_VENDOR,
         DMI_CHASSIS_TYPE,
         DMI_CHASSIS_VERSION,
         DMI_CHASSIS_SERIAL,
         DMI_CHASSIS_ASSET_TAG,
         DMI_STRING_MAX,
 };
 
 /*
  * One DMI match rule: a slot number and a substring of up to 79 bytes.
  * NOTE(review): 'slot' is presumably an enum dmi_field value and 'substr' a
  * NUL-terminated string -- confirm against dmi_check_system().
  */
 struct dmi_strmatch {
 	unsigned char slot;
 	char substr[79];
 };
 
 /*
  * Description of one system to match: an optional callback taking the
  * entry itself, an identification string, and up to four match rules.
  */
 struct dmi_system_id {
         int (*callback)(const struct dmi_system_id *);
         const char *ident;
         struct dmi_strmatch matches[4];
 };
 /* Brace initializer for one struct dmi_strmatch: {slot, substr}. */
 #define	DMI_MATCH(a, b) {(a), (b)}
 bool dmi_check_system(const struct dmi_system_id *);
 
 /* Device setup support (drm_drv.c) */
 int	drm_probe_helper(device_t kdev, const drm_pci_id_list_t *idlist);
 int	drm_attach_helper(device_t kdev, const drm_pci_id_list_t *idlist,
 	    struct drm_driver *driver);
 int	drm_generic_suspend(device_t kdev);
 int	drm_generic_resume(device_t kdev);
 int	drm_generic_detach(device_t kdev);
 
 void drm_event_wakeup(struct drm_pending_event *e);
 
 int drm_add_busid_modesetting(struct drm_device *dev,
     struct sysctl_ctx_list *ctx, struct sysctl_oid *top);
 
 /* Buffer management support (drm_bufs.c) */
 unsigned long drm_get_resource_start(struct drm_device *dev,
 				     unsigned int resource);
 unsigned long drm_get_resource_len(struct drm_device *dev,
 				   unsigned int resource);
 
 /* IRQ support (drm_irq.c) */
 irqreturn_t drm_irq_handler(DRM_IRQ_ARGS);
 void	drm_driver_irq_preinstall(struct drm_device *dev);
 void	drm_driver_irq_postinstall(struct drm_device *dev);
 void	drm_driver_irq_uninstall(struct drm_device *dev);
 
 /* sysctl support (drm_sysctl.h) */
 extern int		drm_sysctl_init(struct drm_device *dev);
 extern int		drm_sysctl_cleanup(struct drm_device *dev);
 
 int	drm_version(struct drm_device *dev, void *data,
 		    struct drm_file *file_priv);
 
 /* consistent PCI memory functions (drm_pci.c) */
 int	drm_pci_set_busid(struct drm_device *dev, struct drm_master *master);
 int	drm_pci_set_unique(struct drm_device *dev, struct drm_master *master,
 	    struct drm_unique *u);
 int	drm_pci_agp_init(struct drm_device *dev);
 int	drm_pci_enable_msi(struct drm_device *dev);
 void	drm_pci_disable_msi(struct drm_device *dev);
 
 struct ttm_bo_device;
 int ttm_bo_mmap_single(struct ttm_bo_device *bdev, vm_ooffset_t *offset,
     vm_size_t size, struct vm_object **obj_res, int nprot);
 struct ttm_buffer_object;
 void ttm_bo_release_mmap(struct ttm_buffer_object *bo);
 
 #if  __OS_HAS_AGP
 				/* Memory management support (drm_memory.h) */
 extern void drm_free_agp(DRM_AGP_MEM * handle, int pages);
 extern int drm_bind_agp(DRM_AGP_MEM * handle, unsigned int start);
 #ifdef FREEBSD_NOTYET
 extern DRM_AGP_MEM *drm_agp_bind_pages(struct drm_device *dev,
 				       struct page **pages,
 				       unsigned long num_pages,
 				       uint32_t gtt_offset,
 				       uint32_t type);
 #endif /* FREEBSD_NOTYET */
 extern int drm_unbind_agp(DRM_AGP_MEM * handle);
 
 				/* AGP/GART support (drm_agpsupport.h) */
 extern struct drm_agp_head *drm_agp_init(struct drm_device *dev);
 extern int drm_agp_acquire(struct drm_device *dev);
 extern int drm_agp_acquire_ioctl(struct drm_device *dev, void *data,
 				 struct drm_file *file_priv);
 extern int drm_agp_release(struct drm_device *dev);
 extern int drm_agp_release_ioctl(struct drm_device *dev, void *data,
 				 struct drm_file *file_priv);
 extern int drm_agp_enable(struct drm_device *dev, struct drm_agp_mode mode);
 extern int drm_agp_enable_ioctl(struct drm_device *dev, void *data,
 				struct drm_file *file_priv);
 extern int drm_agp_info(struct drm_device *dev, struct drm_agp_info *info);
 extern int drm_agp_info_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 extern int drm_agp_alloc(struct drm_device *dev, struct drm_agp_buffer *request);
 extern int drm_agp_alloc_ioctl(struct drm_device *dev, void *data,
 			 struct drm_file *file_priv);
 extern int drm_agp_free(struct drm_device *dev, struct drm_agp_buffer *request);
 extern int drm_agp_free_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 extern int drm_agp_unbind(struct drm_device *dev, struct drm_agp_binding *request);
 extern int drm_agp_unbind_ioctl(struct drm_device *dev, void *data,
 			  struct drm_file *file_priv);
 extern int drm_agp_bind(struct drm_device *dev, struct drm_agp_binding *request);
 extern int drm_agp_bind_ioctl(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 
 #else
 
 /*
  * Stubs compiled when __OS_HAS_AGP is false (the #else branch of the AGP
  * section).  Functions returning int report -ENODEV, functions returning a
  * pointer return NULL, and void functions do nothing.
  */
 static inline void drm_free_agp(DRM_AGP_MEM * handle, int pages)
 {
 }
 
 static inline int drm_bind_agp(DRM_AGP_MEM * handle, unsigned int start)
 {
 	return -ENODEV;
 }
 
 static inline int drm_unbind_agp(DRM_AGP_MEM * handle)
 {
 	return -ENODEV;
 }
 #ifdef FREEBSD_NOTYET
 static inline struct agp_memory *drm_agp_bind_pages(struct drm_device *dev,
 					      struct page **pages,
 					      unsigned long num_pages,
 					      uint32_t gtt_offset,
 					      uint32_t type)
 {
 	return NULL;
 }
 #endif
 static inline struct drm_agp_head *drm_agp_init(struct drm_device *dev)
 {
 	return NULL;
 }
 
 static inline void drm_agp_clear(struct drm_device *dev)
 {
 }
 
 static inline int drm_agp_acquire(struct drm_device *dev)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_acquire_ioctl(struct drm_device *dev, void *data,
 					struct drm_file *file_priv)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_release(struct drm_device *dev)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_release_ioctl(struct drm_device *dev, void *data,
 					struct drm_file *file_priv)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_enable(struct drm_device *dev,
 				 struct drm_agp_mode mode)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_enable_ioctl(struct drm_device *dev, void *data,
 				       struct drm_file *file_priv)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_info(struct drm_device *dev,
 			       struct drm_agp_info *info)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_info_ioctl(struct drm_device *dev, void *data,
 				     struct drm_file *file_priv)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_alloc(struct drm_device *dev,
 				struct drm_agp_buffer *request)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_alloc_ioctl(struct drm_device *dev, void *data,
 				      struct drm_file *file_priv)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_free(struct drm_device *dev,
 			       struct drm_agp_buffer *request)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_free_ioctl(struct drm_device *dev, void *data,
 				     struct drm_file *file_priv)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_unbind(struct drm_device *dev,
 				 struct drm_agp_binding *request)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_unbind_ioctl(struct drm_device *dev, void *data,
 				       struct drm_file *file_priv)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_bind(struct drm_device *dev,
 			       struct drm_agp_binding *request)
 {
 	return -ENODEV;
 }
 
 static inline int drm_agp_bind_ioctl(struct drm_device *dev, void *data,
 				     struct drm_file *file_priv)
 {
 	return -ENODEV;
 }
 
 #endif /* __OS_HAS_AGP */
 
 #endif				/* __KERNEL__ */
 #endif
Index: projects/fuse2/sys/dev/mii/micphy.c
===================================================================
--- projects/fuse2/sys/dev/mii/micphy.c	(revision 350434)
+++ projects/fuse2/sys/dev/mii/micphy.c	(revision 350435)
@@ -1,324 +1,328 @@
 /*-
- * Copyright (c) 2014 Ruslan Bukin <br@bsdpad.com>
+ * Copyright (c) 2014,2019 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * This software was developed by SRI International and the University of
  * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
  * ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
- * Micrel KSZ9021 Gigabit Ethernet Transceiver
+ * Micrel KSZ8081/KSZ9021/KSZ9031 Gigabit Ethernet Transceiver
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/socket.h>
 #include <sys/errno.h>
 #include <sys/module.h>
 #include <sys/bus.h>
 #include <sys/malloc.h>
 
 #include <machine/bus.h>
 
 #include <net/if.h>
 #include <net/if_media.h>
 
 #include <dev/mii/mii.h>
 #include <dev/mii/miivar.h>
 #include "miidevs.h"
 
 #include "miibus_if.h"
 
 #include <dev/fdt/fdt_common.h>
 #include <dev/ofw/openfirm.h>
 #include <dev/ofw/ofw_bus.h>
 #include <dev/ofw/ofw_bus_subr.h>
+#include <dev/mii/mii_fdt.h>
 
 #define	MII_KSZPHY_EXTREG			0x0b
 #define	 KSZPHY_EXTREG_WRITE			(1 << 15)
 #define	MII_KSZPHY_EXTREG_WRITE			0x0c
 #define	MII_KSZPHY_EXTREG_READ			0x0d
 #define	MII_KSZPHY_CLK_CONTROL_PAD_SKEW		0x104
 #define	MII_KSZPHY_RX_DATA_PAD_SKEW		0x105
 #define	MII_KSZPHY_TX_DATA_PAD_SKEW		0x106
 /* KSZ9031 */
 #define	MII_KSZ9031_MMD_ACCESS_CTRL		0x0d
 #define	MII_KSZ9031_MMD_ACCESS_DATA		0x0e
 #define	 MII_KSZ9031_MMD_DATA_NOINC		(1 << 14)
 #define	MII_KSZ9031_CONTROL_PAD_SKEW		0x4
 #define	MII_KSZ9031_RX_DATA_PAD_SKEW		0x5
 #define	MII_KSZ9031_TX_DATA_PAD_SKEW		0x6
 #define	MII_KSZ9031_CLOCK_PAD_SKEW		0x8
 
 #define	MII_KSZ8081_PHYCTL2			0x1f
 
 /*
  * Scale a skew value down by 200 (integer division) to obtain the value
  * written into a pad skew register field.
  * NOTE(review): the input is presumably in picoseconds, as suggested by the
  * "*-skew-ps" property names -- confirm against the PHY datasheet.
  */
 #define	PS_TO_REG(p)	((p) / 200)
 
 static int micphy_probe(device_t);
 static int micphy_attach(device_t);
 static void micphy_reset(struct mii_softc *);
 static int micphy_service(struct mii_softc *, struct mii_data *, int);
 
 static device_method_t micphy_methods[] = {
 	/* device interface */
 	DEVMETHOD(device_probe,		micphy_probe),
 	DEVMETHOD(device_attach,	micphy_attach),
 	DEVMETHOD(device_detach,	mii_phy_detach),
 	DEVMETHOD(device_shutdown,	bus_generic_shutdown),
 	DEVMETHOD_END
 };
 
 static devclass_t micphy_devclass;
 
 static driver_t micphy_driver = {
 	"micphy",
 	micphy_methods,
 	sizeof(struct mii_softc)
 };
 
 DRIVER_MODULE(micphy, miibus, micphy_driver, micphy_devclass, 0, 0);
 
 static const struct mii_phydesc micphys[] = {
 	MII_PHY_DESC(MICREL, KSZ8081),
 	MII_PHY_DESC(MICREL, KSZ9021),
 	MII_PHY_DESC(MICREL, KSZ9031),
 	MII_PHY_END
 };
 
 static const struct mii_phy_funcs micphy_funcs = {
 	micphy_service,
 	ukphy_status,
 	micphy_reset
 };
 
 /*
  * Read register 'reg' of MMD device 'devaddr' on a KSZ9031 through the
  * MMD access control/data register pair.
  */
 static uint32_t
 ksz9031_read(struct mii_softc *sc, uint32_t devaddr, uint32_t reg)
 {
 	uint32_t data;
 
 	/* Select the device address, then the register within it. */
 	PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_CTRL, devaddr);
 	PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_DATA, reg);
 
 	/* Switch the data register to register data (NOINC) and read it. */
 	PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_CTRL,
 	    MII_KSZ9031_MMD_DATA_NOINC | devaddr);
 	data = PHY_READ(sc, MII_KSZ9031_MMD_ACCESS_DATA);
 
 	return (data);
 }
 
 /*
  * Write 'val' to register 'reg' of MMD device 'devaddr' on a KSZ9031
  * through the MMD access control/data register pair.
  */
 static void
 ksz9031_write(struct mii_softc *sc, uint32_t devaddr, uint32_t reg,
     uint32_t val)
 {
 	const uint32_t data_mode = MII_KSZ9031_MMD_DATA_NOINC | devaddr;
 
 	/* Select the device address, then the register within it. */
 	PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_CTRL, devaddr);
 	PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_DATA, reg);
 
 	/* Switch the data register to register data (NOINC) and store val. */
 	PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_CTRL, data_mode);
 	PHY_WRITE(sc, MII_KSZ9031_MMD_ACCESS_DATA, val);
 }
 
 /*
  * Read extended register 'reg': select it through MII_KSZPHY_EXTREG, then
  * fetch its value from MII_KSZPHY_EXTREG_READ.
  */
 static uint32_t
 ksz9021_read(struct mii_softc *sc, uint32_t reg)
 {
 	uint32_t data;
 
 	PHY_WRITE(sc, MII_KSZPHY_EXTREG, reg);
 	data = PHY_READ(sc, MII_KSZPHY_EXTREG_READ);
 
 	return (data);
 }
 
 /*
  * Write 'val' to extended register 'reg': select it through
  * MII_KSZPHY_EXTREG with the write bit set, then store the value through
  * MII_KSZPHY_EXTREG_WRITE.
  */
 static void
 ksz9021_write(struct mii_softc *sc, uint32_t reg, uint32_t val)
 {
 	const uint32_t select = KSZPHY_EXTREG_WRITE | reg;
 
 	PHY_WRITE(sc, MII_KSZPHY_EXTREG, select);
 	PHY_WRITE(sc, MII_KSZPHY_EXTREG_WRITE, val);
 }
 
 /*
  * Fold one optional skew property into a pad skew register image.
  *
  * If 'field' is non-NULL and names a property of 'node' holding at least one
  * cell, the first cell is converted with PS_TO_REG(), masked with 'mask' and
  * placed at bit offset 'off' of 'val'.  Otherwise 'val' is left unchanged.
  * Returns the resulting register image.
  */
 static int
 ksz90x1_merge_skew(phandle_t node, char *field, uint32_t mask, int off,
     int val)
 {
 	pcell_t dts_value;
 
 	if (field == NULL)
 		return (val);
 
 	/*
 	 * Bound the read by the size of the buffer rather than by the length
 	 * of the property, so that an oversized property cannot overrun the
 	 * stack, and ignore properties too short to fill a whole cell.
 	 */
 	if (OF_getencprop(node, field, &dts_value, sizeof(dts_value)) <
 	    (ssize_t)sizeof(dts_value))
 		return (val);
 
 	val &= ~(mask << off);
 	val |= (PS_TO_REG(dts_value) & mask) << off;
 
 	return (val);
 }
 
 /*
  * Read-modify-write one pad skew register from device-tree properties.
  *
  * The register is read with the accessor matching the PHY model (the MMD
  * accessors with device 'dev' for the KSZ9031, the extended-register
  * accessors otherwise).  Up to four properties of 'node' are then merged
  * in, each with its own field mask and bit offset, and the result is
  * written back.  Properties that are absent, and field names passed as
  * NULL, leave their field untouched.
  */
 static void
 ksz90x1_load_values(struct mii_softc *sc, phandle_t node,
     uint32_t dev, uint32_t reg, char *field1, uint32_t f1mask, int f1off,
     char *field2, uint32_t f2mask, int f2off, char *field3, uint32_t f3mask,
     int f3off, char *field4, uint32_t f4mask, int f4off)
 {
 	int val;
 
 	if (sc->mii_mpd_model == MII_MODEL_MICREL_KSZ9031)
 		val = ksz9031_read(sc, dev, reg);
 	else
 		val = ksz9021_read(sc, reg);
 
 	val = ksz90x1_merge_skew(node, field1, f1mask, f1off, val);
 	val = ksz90x1_merge_skew(node, field2, f2mask, f2off, val);
 	val = ksz90x1_merge_skew(node, field3, f3mask, f3off, val);
 	val = ksz90x1_merge_skew(node, field4, f4mask, f4off, val);
 
 	if (sc->mii_mpd_model == MII_MODEL_MICREL_KSZ9031)
 		ksz9031_write(sc, dev, reg, val);
 	else
 		ksz9021_write(sc, reg, val);
 }
 
 /*
  * Program the KSZ9031 pad skew registers (MMD device 2) from the
  * "*-skew-ps" properties of 'node'.  Each call names one register and the
  * properties that map onto its bit fields as (name, mask, bit offset).
  */
 static void
 ksz9031_load_values(struct mii_softc *sc, phandle_t node)
 {
 
 	/* Control pad skew: two 4-bit fields. */
 	ksz90x1_load_values(sc, node, 2, MII_KSZ9031_CONTROL_PAD_SKEW,
 	    "txen-skew-ps", 0xf, 0, "rxdv-skew-ps", 0xf, 4,
 	    NULL, 0, 0, NULL, 0, 0);
 	/* RX data pad skew: four 4-bit fields. */
 	ksz90x1_load_values(sc, node, 2, MII_KSZ9031_RX_DATA_PAD_SKEW,
 	    "rxd0-skew-ps", 0xf, 0, "rxd1-skew-ps", 0xf, 4,
 	    "rxd2-skew-ps", 0xf, 8, "rxd3-skew-ps", 0xf, 12);
 	/* TX data pad skew: four 4-bit fields. */
 	ksz90x1_load_values(sc, node, 2, MII_KSZ9031_TX_DATA_PAD_SKEW,
 	    "txd0-skew-ps", 0xf, 0, "txd1-skew-ps", 0xf, 4,
 	    "txd2-skew-ps", 0xf, 8, "txd3-skew-ps", 0xf, 12);
 	/* Clock pad skew: two 5-bit fields. */
 	ksz90x1_load_values(sc, node, 2, MII_KSZ9031_CLOCK_PAD_SKEW,
 	    "rxc-skew-ps", 0x1f, 0, "txc-skew-ps", 0x1f, 5,
 	    NULL, 0, 0, NULL, 0, 0);
 }
 
 /*
  * Program the KSZ9021 pad skew extended registers from the "*-skew-ps"
  * properties of 'node'.  Each call names one register and the four
  * properties that map onto its 4-bit fields as (name, mask, bit offset).
  * The device argument is passed as 0; ksz90x1_load_values() only uses it
  * for the KSZ9031.
  */
 static void
 ksz9021_load_values(struct mii_softc *sc, phandle_t node)
 {
 
 	/* Clock and control pad skew. */
 	ksz90x1_load_values(sc, node, 0, MII_KSZPHY_CLK_CONTROL_PAD_SKEW,
 	    "txen-skew-ps", 0xf, 0, "txc-skew-ps", 0xf, 4,
 	    "rxdv-skew-ps", 0xf, 8, "rxc-skew-ps", 0xf, 12);
 	/* RX data pad skew. */
 	ksz90x1_load_values(sc, node, 0, MII_KSZPHY_RX_DATA_PAD_SKEW,
 	    "rxd0-skew-ps", 0xf, 0, "rxd1-skew-ps", 0xf, 4,
 	    "rxd2-skew-ps", 0xf, 8, "rxd3-skew-ps", 0xf, 12);
 	/* TX data pad skew. */
 	ksz90x1_load_values(sc, node, 0, MII_KSZPHY_TX_DATA_PAD_SKEW,
 	    "txd0-skew-ps", 0xf, 0, "txd1-skew-ps", 0xf, 4,
 	    "txd2-skew-ps", 0xf, 8, "txd3-skew-ps", 0xf, 12);
 }
 
 /*
  * Probe method: match the device against the micphys[] table at
  * BUS_PROBE_DEFAULT priority.
  */
 static int
 micphy_probe(device_t dev)
 {
 
 	return (mii_phy_dev_probe(dev, micphys, BUS_PROBE_DEFAULT));
 }
 
 /*
  * Attach method: register the PHY with the MII layer, set the media, and,
  * for every model except the KSZ8081, load the pad skew values from the
  * device tree.  Returns 0 on success, or ENXIO when the parent of the
  * miibus has no OFW node.
  */
 static int
 micphy_attach(device_t dev)
 {
+	mii_fdt_phy_config_t *cfg;
 	struct mii_softc *sc;
 	phandle_t node;
 	device_t miibus;
 	device_t parent;
 
 	sc = device_get_softc(dev);
 
 	mii_phy_dev_attach(dev, MIIF_NOMANPAUSE, &micphy_funcs, 1);
 	mii_phy_setmedia(sc);
 
 	/* Nothing further to configure for 8081 model. */
 	if (sc->mii_mpd_model == MII_MODEL_MICREL_KSZ8081)
 		return (0);
 
 	miibus = device_get_parent(dev);
 	parent = device_get_parent(miibus);
 
 	if ((node = ofw_bus_get_node(parent)) == -1)
 		return (ENXIO);
 
 	/*
 	 * NOTE(review): cfg is not released anywhere in this function --
 	 * confirm whether it must be handed back (e.g. through
 	 * mii_fdt_free_config()) once the skew values have been loaded.
 	 */
+	cfg = mii_fdt_get_config(dev);
+
 	if (sc->mii_mpd_model == MII_MODEL_MICREL_KSZ9031)
-		ksz9031_load_values(sc, node);
+		ksz9031_load_values(sc, cfg->phynode);
 	else
-		ksz9021_load_values(sc, node);
+		ksz9021_load_values(sc, cfg->phynode);
 
 	return (0);
 }
 
 /*
  * Reset the PHY.  On the KSZ8081 the contents of PHY Control Register 2 are
  * saved before the reset and written back afterwards.
  */
 static void
 micphy_reset(struct mii_softc *sc)
 {
 	int phyctl2;
 
 	if (sc->mii_mpd_model != MII_MODEL_MICREL_KSZ8081) {
 		mii_phy_reset(sc);
 		return;
 	}
 
 	/*
 	 * The 8081 has no "sticky bits" that survive a soft reset, yet
 	 * several bits in PHY Control Register 2 must be preserved across
 	 * it.  The bootloader sets them up; they control how the phy
 	 * interfaces to the board (such as clock frequency and LED
 	 * behavior).
 	 */
 	phyctl2 = PHY_READ(sc, MII_KSZ8081_PHYCTL2);
 	mii_phy_reset(sc);
 	PHY_WRITE(sc, MII_KSZ8081_PHYCTL2, phyctl2);
 }
 
 /*
  * MII service routine.  MII_MEDIACHG re-applies the media selection and
  * MII_TICK runs the periodic tick, returning early when the tick reports
  * EJUSTRETURN.  In every other case, including MII_POLLSTAT, the media
  * status is refreshed and the MII layer is notified.  Always returns 0.
  */
 static int
 micphy_service(struct mii_softc *sc, struct mii_data *mii, int cmd)
 {
 
 	if (cmd == MII_MEDIACHG) {
 		mii_phy_setmedia(sc);
 	} else if (cmd == MII_TICK) {
 		if (mii_phy_tick(sc) == EJUSTRETURN)
 			return (0);
 	}
 
 	/* Refresh the media status, then report any change. */
 	PHY_STATUS(sc);
 	mii_phy_update(sc, cmd);
 
 	return (0);
 }
Index: projects/fuse2/sys/dev/nvme/nvme.h
===================================================================
--- projects/fuse2/sys/dev/nvme/nvme.h	(revision 350434)
+++ projects/fuse2/sys/dev/nvme/nvme.h	(revision 350435)
@@ -1,1536 +1,1674 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (C) 2012-2013 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef __NVME_H__
 #define __NVME_H__
 
 #ifdef _KERNEL
 #include <sys/types.h>
 #endif
 
 #include <sys/param.h>
 #include <sys/endian.h>
 
 #define	NVME_PASSTHROUGH_CMD		_IOWR('n', 0, struct nvme_pt_command)
 #define	NVME_RESET_CONTROLLER		_IO('n', 1)
 
 #define	NVME_IO_TEST			_IOWR('n', 100, struct nvme_io_test)
 #define	NVME_BIO_TEST			_IOWR('n', 101, struct nvme_io_test)
 
 /*
  * Macros to deal with NVMe revisions, as defined by the VS register
  */
 #define NVME_REV(x, y)			(((x) << 16) | ((y) << 8))
 #define NVME_MAJOR(r)			(((r) >> 16) & 0xffff)
 #define NVME_MINOR(r)			(((r) >> 8) & 0xff)
 
 /*
  * Used to mark a command to apply to all namespaces, or to retrieve global
  *  log pages.
  */
 #define NVME_GLOBAL_NAMESPACE_TAG	((uint32_t)0xFFFFFFFF)
 
 /* Cap nvme to 1MB transfers; the driver explodes with larger sizes */
 #define NVME_MAX_XFER_SIZE		(MAXPHYS < (1<<20) ? MAXPHYS : (1<<20))
 
 /* Register field definitions */
 #define NVME_CAP_LO_REG_MQES_SHIFT			(0)
 #define NVME_CAP_LO_REG_MQES_MASK			(0xFFFF)
 #define NVME_CAP_LO_REG_CQR_SHIFT			(16)
 #define NVME_CAP_LO_REG_CQR_MASK			(0x1)
 #define NVME_CAP_LO_REG_AMS_SHIFT			(17)
 #define NVME_CAP_LO_REG_AMS_MASK			(0x3)
 #define NVME_CAP_LO_REG_TO_SHIFT			(24)
 #define NVME_CAP_LO_REG_TO_MASK				(0xFF)
 #define NVME_CAP_LO_MQES(x) \
 	(((x) >> NVME_CAP_LO_REG_MQES_SHIFT) & NVME_CAP_LO_REG_MQES_MASK)
 #define NVME_CAP_LO_CQR(x) \
 	(((x) >> NVME_CAP_LO_REG_CQR_SHIFT) & NVME_CAP_LO_REG_CQR_MASK)
 #define NVME_CAP_LO_AMS(x) \
 	(((x) >> NVME_CAP_LO_REG_AMS_SHIFT) & NVME_CAP_LO_REG_AMS_MASK)
 #define NVME_CAP_LO_TO(x) \
 	(((x) >> NVME_CAP_LO_REG_TO_SHIFT) & NVME_CAP_LO_REG_TO_MASK)
 
 #define NVME_CAP_HI_REG_DSTRD_SHIFT			(0)
 #define NVME_CAP_HI_REG_DSTRD_MASK			(0xF)
 #define NVME_CAP_HI_REG_CSS_NVM_SHIFT			(5)
 #define NVME_CAP_HI_REG_CSS_NVM_MASK			(0x1)
 #define NVME_CAP_HI_REG_MPSMIN_SHIFT			(16)
 #define NVME_CAP_HI_REG_MPSMIN_MASK			(0xF)
 #define NVME_CAP_HI_REG_MPSMAX_SHIFT			(20)
 #define NVME_CAP_HI_REG_MPSMAX_MASK			(0xF)
 #define NVME_CAP_HI_DSTRD(x) \
 	(((x) >> NVME_CAP_HI_REG_DSTRD_SHIFT) & NVME_CAP_HI_REG_DSTRD_MASK)
 #define NVME_CAP_HI_CSS_NVM(x) \
 	(((x) >> NVME_CAP_HI_REG_CSS_NVM_SHIFT) & NVME_CAP_HI_REG_CSS_NVM_MASK)
 #define NVME_CAP_HI_MPSMIN(x) \
 	(((x) >> NVME_CAP_HI_REG_MPSMIN_SHIFT) & NVME_CAP_HI_REG_MPSMIN_MASK)
 #define NVME_CAP_HI_MPSMAX(x) \
 	(((x) >> NVME_CAP_HI_REG_MPSMAX_SHIFT) & NVME_CAP_HI_REG_MPSMAX_MASK)
 
 #define NVME_CC_REG_EN_SHIFT				(0)
 #define NVME_CC_REG_EN_MASK				(0x1)
 #define NVME_CC_REG_CSS_SHIFT				(4)
 #define NVME_CC_REG_CSS_MASK				(0x7)
 #define NVME_CC_REG_MPS_SHIFT				(7)
 #define NVME_CC_REG_MPS_MASK				(0xF)
 #define NVME_CC_REG_AMS_SHIFT				(11)
 #define NVME_CC_REG_AMS_MASK				(0x7)
 #define NVME_CC_REG_SHN_SHIFT				(14)
 #define NVME_CC_REG_SHN_MASK				(0x3)
 #define NVME_CC_REG_IOSQES_SHIFT			(16)
 #define NVME_CC_REG_IOSQES_MASK				(0xF)
 #define NVME_CC_REG_IOCQES_SHIFT			(20)
 #define NVME_CC_REG_IOCQES_MASK				(0xF)
 
 #define NVME_CSTS_REG_RDY_SHIFT				(0)
 #define NVME_CSTS_REG_RDY_MASK				(0x1)
 #define NVME_CSTS_REG_CFS_SHIFT				(1)
 #define NVME_CSTS_REG_CFS_MASK				(0x1)
 #define NVME_CSTS_REG_SHST_SHIFT			(2)
 #define NVME_CSTS_REG_SHST_MASK				(0x3)
 
 #define NVME_CSTS_GET_SHST(csts)			(((csts) >> NVME_CSTS_REG_SHST_SHIFT) & NVME_CSTS_REG_SHST_MASK)
 
 #define NVME_AQA_REG_ASQS_SHIFT				(0)
 #define NVME_AQA_REG_ASQS_MASK				(0xFFF)
 #define NVME_AQA_REG_ACQS_SHIFT				(16)
 #define NVME_AQA_REG_ACQS_MASK				(0xFFF)
 
 /* Command field definitions */
 
 #define NVME_CMD_FUSE_SHIFT				(8)
 #define NVME_CMD_FUSE_MASK				(0x3)
 
 #define NVME_STATUS_P_SHIFT				(0)
 #define NVME_STATUS_P_MASK				(0x1)
 #define NVME_STATUS_SC_SHIFT				(1)
 #define NVME_STATUS_SC_MASK				(0xFF)
 #define NVME_STATUS_SCT_SHIFT				(9)
 #define NVME_STATUS_SCT_MASK				(0x7)
 #define NVME_STATUS_M_SHIFT				(14)
 #define NVME_STATUS_M_MASK				(0x1)
 #define NVME_STATUS_DNR_SHIFT				(15)
 #define NVME_STATUS_DNR_MASK				(0x1)
 
 #define NVME_STATUS_GET_P(st)				(((st) >> NVME_STATUS_P_SHIFT) & NVME_STATUS_P_MASK)
 #define NVME_STATUS_GET_SC(st)				(((st) >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)
 #define NVME_STATUS_GET_SCT(st)				(((st) >> NVME_STATUS_SCT_SHIFT) & NVME_STATUS_SCT_MASK)
 #define NVME_STATUS_GET_M(st)				(((st) >> NVME_STATUS_M_SHIFT) & NVME_STATUS_M_MASK)
 #define NVME_STATUS_GET_DNR(st)				(((st) >> NVME_STATUS_DNR_SHIFT) & NVME_STATUS_DNR_MASK)
 
 #define NVME_PWR_ST_MPS_SHIFT				(0)
 #define NVME_PWR_ST_MPS_MASK				(0x1)
 #define NVME_PWR_ST_NOPS_SHIFT				(1)
 #define NVME_PWR_ST_NOPS_MASK				(0x1)
 #define NVME_PWR_ST_RRT_SHIFT				(0)
 #define NVME_PWR_ST_RRT_MASK				(0x1F)
 #define NVME_PWR_ST_RRL_SHIFT				(0)
 #define NVME_PWR_ST_RRL_MASK				(0x1F)
 #define NVME_PWR_ST_RWT_SHIFT				(0)
 #define NVME_PWR_ST_RWT_MASK				(0x1F)
 #define NVME_PWR_ST_RWL_SHIFT				(0)
 #define NVME_PWR_ST_RWL_MASK				(0x1F)
 #define NVME_PWR_ST_IPS_SHIFT				(6)
 #define NVME_PWR_ST_IPS_MASK				(0x3)
 #define NVME_PWR_ST_APW_SHIFT				(0)
 #define NVME_PWR_ST_APW_MASK				(0x7)
 #define NVME_PWR_ST_APS_SHIFT				(6)
 #define NVME_PWR_ST_APS_MASK				(0x3)
 
 /** Controller Multi-path I/O and Namespace Sharing Capabilities */
 /* More than one port */
 #define NVME_CTRLR_DATA_MIC_MPORTS_SHIFT		(0)
 #define NVME_CTRLR_DATA_MIC_MPORTS_MASK			(0x1)
 /* More than one controller */
 #define NVME_CTRLR_DATA_MIC_MCTRLRS_SHIFT		(1)
 #define NVME_CTRLR_DATA_MIC_MCTRLRS_MASK		(0x1)
 /* SR-IOV Virtual Function */
 #define NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT		(2)
 #define NVME_CTRLR_DATA_MIC_SRIOVVF_MASK		(0x1)
+/* Asymmetric Namespace Access Reporting */
+#define NVME_CTRLR_DATA_MIC_ANAR_SHIFT			(3)
+#define NVME_CTRLR_DATA_MIC_ANAR_MASK			(0x1)
 
 /** OACS - optional admin command support */
 /* supports security send/receive commands */
 #define NVME_CTRLR_DATA_OACS_SECURITY_SHIFT		(0)
 #define NVME_CTRLR_DATA_OACS_SECURITY_MASK		(0x1)
 /* supports format nvm command */
 #define NVME_CTRLR_DATA_OACS_FORMAT_SHIFT		(1)
 #define NVME_CTRLR_DATA_OACS_FORMAT_MASK		(0x1)
 /* supports firmware activate/download commands */
 #define NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT		(2)
 #define NVME_CTRLR_DATA_OACS_FIRMWARE_MASK		(0x1)
 /* supports namespace management commands */
 #define NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT		(3)
 #define NVME_CTRLR_DATA_OACS_NSMGMT_MASK		(0x1)
 /* supports Device Self-test command */
 #define NVME_CTRLR_DATA_OACS_SELFTEST_SHIFT		(4)
 #define NVME_CTRLR_DATA_OACS_SELFTEST_MASK		(0x1)
 /* supports Directives */
 #define NVME_CTRLR_DATA_OACS_DIRECTIVES_SHIFT		(5)
 #define NVME_CTRLR_DATA_OACS_DIRECTIVES_MASK		(0x1)
 /* supports NVMe-MI Send/Receive */
 #define NVME_CTRLR_DATA_OACS_NVMEMI_SHIFT		(6)
 #define NVME_CTRLR_DATA_OACS_NVMEMI_MASK		(0x1)
 /* supports Virtualization Management */
 #define NVME_CTRLR_DATA_OACS_VM_SHIFT			(7)
 #define NVME_CTRLR_DATA_OACS_VM_MASK			(0x1)
 /* supports Doorbell Buffer Config */
 #define NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT		(8)
 #define NVME_CTRLR_DATA_OACS_DBBUFFER_MASK		(0x1)
+/* supports Get LBA Status */
+#define NVME_CTRLR_DATA_OACS_GETLBA_SHIFT		(9)
+#define NVME_CTRLR_DATA_OACS_GETLBA_MASK		(0x1)
 
 /** firmware updates */
 /* first slot is read-only */
 #define NVME_CTRLR_DATA_FRMW_SLOT1_RO_SHIFT		(0)
 #define NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK		(0x1)
 /* number of firmware slots */
 #define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT		(1)
 #define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK		(0x7)
+/* firmware activation without reset */
+#define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_SHIFT		(4)
+#define NVME_CTRLR_DATA_FRMW_ACT_WO_RESET_MASK		(0x1)
 
 /** log page attributes */
 /* per namespace smart/health log page */
 #define NVME_CTRLR_DATA_LPA_NS_SMART_SHIFT		(0)
 #define NVME_CTRLR_DATA_LPA_NS_SMART_MASK		(0x1)
 
 /** AVSCC - admin vendor specific command configuration */
 /* admin vendor specific commands use spec format */
 #define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_SHIFT		(0)
 #define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_MASK		(0x1)
 
 /** Autonomous Power State Transition Attributes */
 /* Autonomous Power State Transitions supported */
 #define NVME_CTRLR_DATA_APSTA_APST_SUPP_SHIFT		(0)
 #define NVME_CTRLR_DATA_APSTA_APST_SUPP_MASK		(0x1)
 
+/** Sanitize Capabilities */
+/* Crypto Erase Support  */
+#define NVME_CTRLR_DATA_SANICAP_CES_SHIFT		(0)
+#define NVME_CTRLR_DATA_SANICAP_CES_MASK		(0x1)
+/* Block Erase Support */
+#define NVME_CTRLR_DATA_SANICAP_BES_SHIFT		(1)
+#define NVME_CTRLR_DATA_SANICAP_BES_MASK		(0x1)
+/* Overwrite Support */
+#define NVME_CTRLR_DATA_SANICAP_OWS_SHIFT		(2)
+#define NVME_CTRLR_DATA_SANICAP_OWS_MASK		(0x1)
+/* No-Deallocate Inhibited  */
+#define NVME_CTRLR_DATA_SANICAP_NDI_SHIFT		(29)
+#define NVME_CTRLR_DATA_SANICAP_NDI_MASK		(0x1)
+/* No-Deallocate Modifies Media After Sanitize */
+#define NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT		(30)
+#define NVME_CTRLR_DATA_SANICAP_NODMMAS_MASK		(0x3)
+#define NVME_CTRLR_DATA_SANICAP_NODMMAS_UNDEF		(0)
+#define NVME_CTRLR_DATA_SANICAP_NODMMAS_NO		(1)
+#define NVME_CTRLR_DATA_SANICAP_NODMMAS_YES		(2)
+
 /** submission queue entry size */
 #define NVME_CTRLR_DATA_SQES_MIN_SHIFT			(0)
 #define NVME_CTRLR_DATA_SQES_MIN_MASK			(0xF)
 #define NVME_CTRLR_DATA_SQES_MAX_SHIFT			(4)
 #define NVME_CTRLR_DATA_SQES_MAX_MASK			(0xF)
 
 /** completion queue entry size */
 #define NVME_CTRLR_DATA_CQES_MIN_SHIFT			(0)
 #define NVME_CTRLR_DATA_CQES_MIN_MASK			(0xF)
 #define NVME_CTRLR_DATA_CQES_MAX_SHIFT			(4)
 #define NVME_CTRLR_DATA_CQES_MAX_MASK			(0xF)
 
 /** optional nvm command support */
 #define NVME_CTRLR_DATA_ONCS_COMPARE_SHIFT		(0)
 #define NVME_CTRLR_DATA_ONCS_COMPARE_MASK		(0x1)
 #define NVME_CTRLR_DATA_ONCS_WRITE_UNC_SHIFT		(1)
 #define NVME_CTRLR_DATA_ONCS_WRITE_UNC_MASK		(0x1)
 #define NVME_CTRLR_DATA_ONCS_DSM_SHIFT			(2)
 #define NVME_CTRLR_DATA_ONCS_DSM_MASK			(0x1)
 #define NVME_CTRLR_DATA_ONCS_WRZERO_SHIFT		(3)
 #define NVME_CTRLR_DATA_ONCS_WRZERO_MASK		(0x1)
 #define NVME_CTRLR_DATA_ONCS_SAVEFEAT_SHIFT		(4)
 #define NVME_CTRLR_DATA_ONCS_SAVEFEAT_MASK		(0x1)
 #define NVME_CTRLR_DATA_ONCS_RESERV_SHIFT		(5)
 #define NVME_CTRLR_DATA_ONCS_RESERV_MASK		(0x1)
 #define NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT		(6)
 #define NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK		(0x1)
+#define NVME_CTRLR_DATA_ONCS_VERIFY_SHIFT		(7)
+#define NVME_CTRLR_DATA_ONCS_VERIFY_MASK		(0x1)
 
 /** Fused Operation Support */
 #define NVME_CTRLR_DATA_FUSES_CNW_SHIFT		(0)
 #define NVME_CTRLR_DATA_FUSES_CNW_MASK		(0x1)
 
 /** Format NVM Attributes */
 #define NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT		(0)
 #define NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK		(0x1)
 #define NVME_CTRLR_DATA_FNA_ERASE_ALL_SHIFT		(1)
 #define NVME_CTRLR_DATA_FNA_ERASE_ALL_MASK		(0x1)
 #define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_SHIFT		(2)
 #define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK		(0x1)
 
 /** volatile write cache */
+/* volatile write cache present */
 #define NVME_CTRLR_DATA_VWC_PRESENT_SHIFT		(0)
 #define NVME_CTRLR_DATA_VWC_PRESENT_MASK		(0x1)
+/* flush all namespaces supported */
+#define NVME_CTRLR_DATA_VWC_ALL_SHIFT			(1)
+#define NVME_CTRLR_DATA_VWC_ALL_MASK			(0x3)
+#define NVME_CTRLR_DATA_VWC_ALL_UNKNOWN			(0)
+#define NVME_CTRLR_DATA_VWC_ALL_NO			(2)
+#define NVME_CTRLR_DATA_VWC_ALL_YES			(3)
 
 /** namespace features */
 /* thin provisioning */
 #define NVME_NS_DATA_NSFEAT_THIN_PROV_SHIFT		(0)
 #define NVME_NS_DATA_NSFEAT_THIN_PROV_MASK		(0x1)
 /* NAWUN, NAWUPF, and NACWU fields are valid */
 #define NVME_NS_DATA_NSFEAT_NA_FIELDS_SHIFT		(1)
 #define NVME_NS_DATA_NSFEAT_NA_FIELDS_MASK		(0x1)
 /* Deallocated or Unwritten Logical Block errors supported */
 #define NVME_NS_DATA_NSFEAT_DEALLOC_SHIFT		(2)
 #define NVME_NS_DATA_NSFEAT_DEALLOC_MASK		(0x1)
 /* NGUID and EUI64 fields are not reusable */
 #define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_SHIFT		(3)
 #define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_MASK		(0x1)
+/* NPWG, NPWA, NPDG, NPDA, and NOWS are valid */
+#define NVME_NS_DATA_NSFEAT_NPVALID_SHIFT		(4)
+#define NVME_NS_DATA_NSFEAT_NPVALID_MASK		(0x1)
 
 /** formatted lba size */
 #define NVME_NS_DATA_FLBAS_FORMAT_SHIFT			(0)
 #define NVME_NS_DATA_FLBAS_FORMAT_MASK			(0xF)
 #define NVME_NS_DATA_FLBAS_EXTENDED_SHIFT		(4)
 #define NVME_NS_DATA_FLBAS_EXTENDED_MASK		(0x1)
 
 /** metadata capabilities */
 /* metadata can be transferred as part of data prp list */
 #define NVME_NS_DATA_MC_EXTENDED_SHIFT			(0)
 #define NVME_NS_DATA_MC_EXTENDED_MASK			(0x1)
 /* metadata can be transferred with separate metadata pointer */
 #define NVME_NS_DATA_MC_POINTER_SHIFT			(1)
 #define NVME_NS_DATA_MC_POINTER_MASK			(0x1)
 
 /** end-to-end data protection capabilities */
 /* protection information type 1 */
 #define NVME_NS_DATA_DPC_PIT1_SHIFT			(0)
 #define NVME_NS_DATA_DPC_PIT1_MASK			(0x1)
 /* protection information type 2 */
 #define NVME_NS_DATA_DPC_PIT2_SHIFT			(1)
 #define NVME_NS_DATA_DPC_PIT2_MASK			(0x1)
 /* protection information type 3 */
 #define NVME_NS_DATA_DPC_PIT3_SHIFT			(2)
 #define NVME_NS_DATA_DPC_PIT3_MASK			(0x1)
 /* first eight bytes of metadata */
 #define NVME_NS_DATA_DPC_MD_START_SHIFT			(3)
 #define NVME_NS_DATA_DPC_MD_START_MASK			(0x1)
 /* last eight bytes of metadata */
 #define NVME_NS_DATA_DPC_MD_END_SHIFT			(4)
 #define NVME_NS_DATA_DPC_MD_END_MASK			(0x1)
 
 /** end-to-end data protection type settings */
 /* protection information type */
 #define NVME_NS_DATA_DPS_PIT_SHIFT			(0)
 #define NVME_NS_DATA_DPS_PIT_MASK			(0x7)
 /* 1 == protection info transferred at start of metadata */
 /* 0 == protection info transferred at end of metadata */
 #define NVME_NS_DATA_DPS_MD_START_SHIFT			(3)
 #define NVME_NS_DATA_DPS_MD_START_MASK			(0x1)
 
 /** Namespace Multi-path I/O and Namespace Sharing Capabilities */
 /* the namespace may be attached to two or more controllers */
 #define NVME_NS_DATA_NMIC_MAY_BE_SHARED_SHIFT		(0)
 #define NVME_NS_DATA_NMIC_MAY_BE_SHARED_MASK		(0x1)
 
 /** Reservation Capabilities */
 /* Persist Through Power Loss */
 #define NVME_NS_DATA_RESCAP_PTPL_SHIFT		(0)
 #define NVME_NS_DATA_RESCAP_PTPL_MASK		(0x1)
 /* supports the Write Exclusive */
 #define NVME_NS_DATA_RESCAP_WR_EX_SHIFT		(1)
 #define NVME_NS_DATA_RESCAP_WR_EX_MASK		(0x1)
 /* supports the Exclusive Access */
 #define NVME_NS_DATA_RESCAP_EX_AC_SHIFT		(2)
 #define NVME_NS_DATA_RESCAP_EX_AC_MASK		(0x1)
 /* supports the Write Exclusive – Registrants Only */
 #define NVME_NS_DATA_RESCAP_WR_EX_RO_SHIFT	(3)
 #define NVME_NS_DATA_RESCAP_WR_EX_RO_MASK	(0x1)
 /* supports the Exclusive Access - Registrants Only */
 #define NVME_NS_DATA_RESCAP_EX_AC_RO_SHIFT	(4)
 #define NVME_NS_DATA_RESCAP_EX_AC_RO_MASK	(0x1)
 /* supports the Write Exclusive – All Registrants */
 #define NVME_NS_DATA_RESCAP_WR_EX_AR_SHIFT	(5)
 #define NVME_NS_DATA_RESCAP_WR_EX_AR_MASK	(0x1)
 /* supports the Exclusive Access - All Registrants */
 #define NVME_NS_DATA_RESCAP_EX_AC_AR_SHIFT	(6)
 #define NVME_NS_DATA_RESCAP_EX_AC_AR_MASK	(0x1)
 /* Ignore Existing Key is used as defined in revision 1.3 or later */
 #define NVME_NS_DATA_RESCAP_IEKEY13_SHIFT	(7)
 #define NVME_NS_DATA_RESCAP_IEKEY13_MASK	(0x1)
 
 /** Format Progress Indicator */
 /* percentage of the Format NVM command that remains to be completed */
 #define NVME_NS_DATA_FPI_PERC_SHIFT		(0)
 #define NVME_NS_DATA_FPI_PERC_MASK		(0x7f)
 /* namespace supports the Format Progress Indicator */
 #define NVME_NS_DATA_FPI_SUPP_SHIFT		(7)
 #define NVME_NS_DATA_FPI_SUPP_MASK		(0x1)
 
 /** Deallocate Logical Block Features */
 /* deallocated logical block read behavior */
 #define NVME_NS_DATA_DLFEAT_READ_SHIFT		(0)
 #define NVME_NS_DATA_DLFEAT_READ_MASK		(0x07)
 #define NVME_NS_DATA_DLFEAT_READ_NR		(0x00)
 #define NVME_NS_DATA_DLFEAT_READ_00		(0x01)
 #define NVME_NS_DATA_DLFEAT_READ_FF		(0x02)
 /* supports the Deallocate bit in the Write Zeroes */
 #define NVME_NS_DATA_DLFEAT_DWZ_SHIFT		(3)
 #define NVME_NS_DATA_DLFEAT_DWZ_MASK		(0x01)
 /* Guard field for deallocated logical blocks is set to the CRC  */
 #define NVME_NS_DATA_DLFEAT_GCRC_SHIFT		(4)
 #define NVME_NS_DATA_DLFEAT_GCRC_MASK		(0x01)
 
 /** lba format support */
 /* metadata size */
 #define NVME_NS_DATA_LBAF_MS_SHIFT			(0)
 #define NVME_NS_DATA_LBAF_MS_MASK			(0xFFFF)
 /* lba data size */
 #define NVME_NS_DATA_LBAF_LBADS_SHIFT			(16)
 #define NVME_NS_DATA_LBAF_LBADS_MASK			(0xFF)
 /* relative performance */
 #define NVME_NS_DATA_LBAF_RP_SHIFT			(24)
 #define NVME_NS_DATA_LBAF_RP_MASK			(0x3)
 
 enum nvme_critical_warning_state {
 	NVME_CRIT_WARN_ST_AVAILABLE_SPARE		= 0x1,
 	NVME_CRIT_WARN_ST_TEMPERATURE			= 0x2,
 	NVME_CRIT_WARN_ST_DEVICE_RELIABILITY		= 0x4,
 	NVME_CRIT_WARN_ST_READ_ONLY			= 0x8,
 	NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP	= 0x10,
 };
 #define NVME_CRIT_WARN_ST_RESERVED_MASK			(0xE0)
 
 /* slot for current FW */
 #define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT		(0)
 #define NVME_FIRMWARE_PAGE_AFI_SLOT_MASK		(0x7)
 
 /* CC register SHN field values */
 enum shn_value {
 	NVME_SHN_NORMAL		= 0x1,
 	NVME_SHN_ABRUPT		= 0x2,
 };
 
 /* CSTS register SHST field values */
 enum shst_value {
 	NVME_SHST_NORMAL	= 0x0,
 	NVME_SHST_OCCURRING	= 0x1,
 	NVME_SHST_COMPLETE	= 0x2,
 };
 
 struct nvme_registers
 {
 	/** controller capabilities */
 	uint32_t		cap_lo;
 	uint32_t		cap_hi;
 
 	uint32_t		vs;	/* version */
 	uint32_t		intms;	/* interrupt mask set */
 	uint32_t		intmc;	/* interrupt mask clear */
 
 	/** controller configuration */
 	uint32_t		cc;
 
 	uint32_t		reserved1;
 
 	/** controller status */
 	uint32_t		csts;
 
 	uint32_t		reserved2;
 
 	/** admin queue attributes */
 	uint32_t		aqa;
 
 	uint64_t		asq;	/* admin submission queue base addr */
 	uint64_t		acq;	/* admin completion queue base addr */
 	uint32_t		reserved3[0x3f2];
 
 	struct {
 	    uint32_t		sq_tdbl; /* submission queue tail doorbell */
 	    uint32_t		cq_hdbl; /* completion queue head doorbell */
 	} doorbell[1] __packed;
 } __packed;
 
 _Static_assert(sizeof(struct nvme_registers) == 0x1008, "bad size for nvme_registers");
 
 struct nvme_command
 {
 	/* dword 0 */
 	uint8_t opc;		/* opcode */
 	uint8_t fuse;		/* fused operation */
 	uint16_t cid;		/* command identifier */
 
 	/* dword 1 */
 	uint32_t nsid;		/* namespace identifier */
 
 	/* dword 2-3 */
 	uint32_t rsvd2;
 	uint32_t rsvd3;
 
 	/* dword 4-5 */
 	uint64_t mptr;		/* metadata pointer */
 
 	/* dword 6-7 */
 	uint64_t prp1;		/* prp entry 1 */
 
 	/* dword 8-9 */
 	uint64_t prp2;		/* prp entry 2 */
 
 	/* dword 10-15 */
 	uint32_t cdw10;		/* command-specific */
 	uint32_t cdw11;		/* command-specific */
 	uint32_t cdw12;		/* command-specific */
 	uint32_t cdw13;		/* command-specific */
 	uint32_t cdw14;		/* command-specific */
 	uint32_t cdw15;		/* command-specific */
 } __packed;
 
 _Static_assert(sizeof(struct nvme_command) == 16 * 4, "bad size for nvme_command");
 
 struct nvme_completion {
 
 	/* dword 0 */
 	uint32_t		cdw0;	/* command-specific */
 
 	/* dword 1 */
 	uint32_t		rsvd1;
 
 	/* dword 2 */
 	uint16_t		sqhd;	/* submission queue head pointer */
 	uint16_t		sqid;	/* submission queue identifier */
 
 	/* dword 3 */
 	uint16_t		cid;	/* command identifier */
 	uint16_t		status;
 } __packed;
 
 _Static_assert(sizeof(struct nvme_completion) == 4 * 4, "bad size for nvme_completion");
 
 struct nvme_dsm_range {
 	uint32_t attributes;
 	uint32_t length;
 	uint64_t starting_lba;
 } __packed;
 
 /* Largest DSM Trim that can be done */
 #define NVME_MAX_DSM_TRIM		4096
 
 _Static_assert(sizeof(struct nvme_dsm_range) == 16, "bad size for nvme_dsm_ranage");
 
 /* status code types */
 enum nvme_status_code_type {
 	NVME_SCT_GENERIC		= 0x0,
 	NVME_SCT_COMMAND_SPECIFIC	= 0x1,
 	NVME_SCT_MEDIA_ERROR		= 0x2,
 	/* 0x3-0x6 - reserved */
 	NVME_SCT_VENDOR_SPECIFIC	= 0x7,
 };
 
 /* generic command status codes */
 enum nvme_generic_command_status_code {
 	NVME_SC_SUCCESS				= 0x00,
 	NVME_SC_INVALID_OPCODE			= 0x01,
 	NVME_SC_INVALID_FIELD			= 0x02,
 	NVME_SC_COMMAND_ID_CONFLICT		= 0x03,
 	NVME_SC_DATA_TRANSFER_ERROR		= 0x04,
 	NVME_SC_ABORTED_POWER_LOSS		= 0x05,
 	NVME_SC_INTERNAL_DEVICE_ERROR		= 0x06,
 	NVME_SC_ABORTED_BY_REQUEST		= 0x07,
 	NVME_SC_ABORTED_SQ_DELETION		= 0x08,
 	NVME_SC_ABORTED_FAILED_FUSED		= 0x09,
 	NVME_SC_ABORTED_MISSING_FUSED		= 0x0a,
 	NVME_SC_INVALID_NAMESPACE_OR_FORMAT	= 0x0b,
 	NVME_SC_COMMAND_SEQUENCE_ERROR		= 0x0c,
 	NVME_SC_INVALID_SGL_SEGMENT_DESCR	= 0x0d,
 	NVME_SC_INVALID_NUMBER_OF_SGL_DESCR	= 0x0e,
 	NVME_SC_DATA_SGL_LENGTH_INVALID		= 0x0f,
 	NVME_SC_METADATA_SGL_LENGTH_INVALID	= 0x10,
 	NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID	= 0x11,
 	NVME_SC_INVALID_USE_OF_CMB		= 0x12,
 	NVME_SC_PRP_OFFET_INVALID		= 0x13,
 	NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED	= 0x14,
 	NVME_SC_OPERATION_DENIED		= 0x15,
 	NVME_SC_SGL_OFFSET_INVALID		= 0x16,
 	/* 0x17 - reserved */
 	NVME_SC_HOST_ID_INCONSISTENT_FORMAT	= 0x18,
 	NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED	= 0x19,
 	NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID	= 0x1a,
 	NVME_SC_ABORTED_DUE_TO_PREEMPT		= 0x1b,
 	NVME_SC_SANITIZE_FAILED			= 0x1c,
 	NVME_SC_SANITIZE_IN_PROGRESS		= 0x1d,
 	NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID	= 0x1e,
 	NVME_SC_NOT_SUPPORTED_IN_CMB		= 0x1f,
 
 	NVME_SC_LBA_OUT_OF_RANGE		= 0x80,
 	NVME_SC_CAPACITY_EXCEEDED		= 0x81,
 	NVME_SC_NAMESPACE_NOT_READY		= 0x82,
 	NVME_SC_RESERVATION_CONFLICT		= 0x83,
 	NVME_SC_FORMAT_IN_PROGRESS		= 0x84,
 };
 
 /* command specific status codes */
 enum nvme_command_specific_status_code {
 	NVME_SC_COMPLETION_QUEUE_INVALID	= 0x00,
 	NVME_SC_INVALID_QUEUE_IDENTIFIER	= 0x01,
 	NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED	= 0x02,
 	NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED	= 0x03,
 	/* 0x04 - reserved */
 	NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED = 0x05,
 	NVME_SC_INVALID_FIRMWARE_SLOT		= 0x06,
 	NVME_SC_INVALID_FIRMWARE_IMAGE		= 0x07,
 	NVME_SC_INVALID_INTERRUPT_VECTOR	= 0x08,
 	NVME_SC_INVALID_LOG_PAGE		= 0x09,
 	NVME_SC_INVALID_FORMAT			= 0x0a,
 	NVME_SC_FIRMWARE_REQUIRES_RESET		= 0x0b,
 	NVME_SC_INVALID_QUEUE_DELETION		= 0x0c,
 	NVME_SC_FEATURE_NOT_SAVEABLE		= 0x0d,
 	NVME_SC_FEATURE_NOT_CHANGEABLE		= 0x0e,
 	NVME_SC_FEATURE_NOT_NS_SPECIFIC		= 0x0f,
 	NVME_SC_FW_ACT_REQUIRES_NVMS_RESET	= 0x10,
 	NVME_SC_FW_ACT_REQUIRES_RESET		= 0x11,
 	NVME_SC_FW_ACT_REQUIRES_TIME		= 0x12,
 	NVME_SC_FW_ACT_PROHIBITED		= 0x13,
 	NVME_SC_OVERLAPPING_RANGE		= 0x14,
 	NVME_SC_NS_INSUFFICIENT_CAPACITY	= 0x15,
 	NVME_SC_NS_ID_UNAVAILABLE		= 0x16,
 	/* 0x17 - reserved */
 	NVME_SC_NS_ALREADY_ATTACHED		= 0x18,
 	NVME_SC_NS_IS_PRIVATE			= 0x19,
 	NVME_SC_NS_NOT_ATTACHED			= 0x1a,
 	NVME_SC_THIN_PROV_NOT_SUPPORTED		= 0x1b,
 	NVME_SC_CTRLR_LIST_INVALID		= 0x1c,
 	NVME_SC_SELT_TEST_IN_PROGRESS		= 0x1d,
 	NVME_SC_BOOT_PART_WRITE_PROHIB		= 0x1e,
 	NVME_SC_INVALID_CTRLR_ID		= 0x1f,
 	NVME_SC_INVALID_SEC_CTRLR_STATE		= 0x20,
 	NVME_SC_INVALID_NUM_OF_CTRLR_RESRC	= 0x21,
 	NVME_SC_INVALID_RESOURCE_ID		= 0x22,
 
 	NVME_SC_CONFLICTING_ATTRIBUTES		= 0x80,
 	NVME_SC_INVALID_PROTECTION_INFO		= 0x81,
 	NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE	= 0x82,
 };
 
 /* media error status codes */
 enum nvme_media_error_status_code {
 	NVME_SC_WRITE_FAULTS			= 0x80,
 	NVME_SC_UNRECOVERED_READ_ERROR		= 0x81,
 	NVME_SC_GUARD_CHECK_ERROR		= 0x82,
 	NVME_SC_APPLICATION_TAG_CHECK_ERROR	= 0x83,
 	NVME_SC_REFERENCE_TAG_CHECK_ERROR	= 0x84,
 	NVME_SC_COMPARE_FAILURE			= 0x85,
 	NVME_SC_ACCESS_DENIED			= 0x86,
 	NVME_SC_DEALLOCATED_OR_UNWRITTEN	= 0x87,
 };
 
 /* admin opcodes */
 enum nvme_admin_opcode {
 	NVME_OPC_DELETE_IO_SQ			= 0x00,
 	NVME_OPC_CREATE_IO_SQ			= 0x01,
 	NVME_OPC_GET_LOG_PAGE			= 0x02,
 	/* 0x03 - reserved */
 	NVME_OPC_DELETE_IO_CQ			= 0x04,
 	NVME_OPC_CREATE_IO_CQ			= 0x05,
 	NVME_OPC_IDENTIFY			= 0x06,
 	/* 0x07 - reserved */
 	NVME_OPC_ABORT				= 0x08,
 	NVME_OPC_SET_FEATURES			= 0x09,
 	NVME_OPC_GET_FEATURES			= 0x0a,
 	/* 0x0b - reserved */
 	NVME_OPC_ASYNC_EVENT_REQUEST		= 0x0c,
 	NVME_OPC_NAMESPACE_MANAGEMENT		= 0x0d,
 	/* 0x0e-0x0f - reserved */
 	NVME_OPC_FIRMWARE_ACTIVATE		= 0x10,
 	NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD	= 0x11,
 	NVME_OPC_DEVICE_SELF_TEST		= 0x14,
 	NVME_OPC_NAMESPACE_ATTACHMENT		= 0x15,
 	NVME_OPC_KEEP_ALIVE			= 0x18,
 	NVME_OPC_DIRECTIVE_SEND			= 0x19,
 	NVME_OPC_DIRECTIVE_RECEIVE		= 0x1a,
 	NVME_OPC_VIRTUALIZATION_MANAGEMENT	= 0x1c,
 	NVME_OPC_NVME_MI_SEND			= 0x1d,
 	NVME_OPC_NVME_MI_RECEIVE		= 0x1e,
 	NVME_OPC_DOORBELL_BUFFER_CONFIG		= 0x7c,
 
 	NVME_OPC_FORMAT_NVM			= 0x80,
 	NVME_OPC_SECURITY_SEND			= 0x81,
 	NVME_OPC_SECURITY_RECEIVE		= 0x82,
 	NVME_OPC_SANITIZE			= 0x84,
 };
 
 /* nvme nvm opcodes */
 enum nvme_nvm_opcode {
 	NVME_OPC_FLUSH				= 0x00,
 	NVME_OPC_WRITE				= 0x01,
 	NVME_OPC_READ				= 0x02,
 	/* 0x03 - reserved */
 	NVME_OPC_WRITE_UNCORRECTABLE		= 0x04,
 	NVME_OPC_COMPARE			= 0x05,
 	/* 0x06 - reserved */
 	NVME_OPC_WRITE_ZEROES			= 0x08,
 	/* 0x07 - reserved */
 	NVME_OPC_DATASET_MANAGEMENT		= 0x09,
 	/* 0x0a-0x0c - reserved */
 	NVME_OPC_RESERVATION_REGISTER		= 0x0d,
 	NVME_OPC_RESERVATION_REPORT		= 0x0e,
 	/* 0x0f-0x10 - reserved */
 	NVME_OPC_RESERVATION_ACQUIRE		= 0x11,
 	/* 0x12-0x14 - reserved */
 	NVME_OPC_RESERVATION_RELEASE		= 0x15,
 };
 
 enum nvme_feature {
 	/* 0x00 - reserved */
 	NVME_FEAT_ARBITRATION			= 0x01,
 	NVME_FEAT_POWER_MANAGEMENT		= 0x02,
 	NVME_FEAT_LBA_RANGE_TYPE		= 0x03,
 	NVME_FEAT_TEMPERATURE_THRESHOLD		= 0x04,
 	NVME_FEAT_ERROR_RECOVERY		= 0x05,
 	NVME_FEAT_VOLATILE_WRITE_CACHE		= 0x06,
 	NVME_FEAT_NUMBER_OF_QUEUES		= 0x07,
 	NVME_FEAT_INTERRUPT_COALESCING		= 0x08,
 	NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION = 0x09,
 	NVME_FEAT_WRITE_ATOMICITY		= 0x0A,
 	NVME_FEAT_ASYNC_EVENT_CONFIGURATION	= 0x0B,
 	NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION = 0x0C,
 	NVME_FEAT_HOST_MEMORY_BUFFER		= 0x0D,
 	NVME_FEAT_TIMESTAMP			= 0x0E,
 	NVME_FEAT_KEEP_ALIVE_TIMER		= 0x0F,
 	NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT	= 0x10,
 	NVME_FEAT_NON_OP_POWER_STATE_CONFIG	= 0x11,
 	/* 0x12-0x77 - reserved */
 	/* 0x78-0x7f - NVMe Management Interface */
 	NVME_FEAT_SOFTWARE_PROGRESS_MARKER	= 0x80,
 	/* 0x81-0xBF - command set specific (reserved) */
 	/* 0xC0-0xFF - vendor specific */
 };
 
 enum nvme_dsm_attribute {
 	NVME_DSM_ATTR_INTEGRAL_READ		= 0x1,
 	NVME_DSM_ATTR_INTEGRAL_WRITE		= 0x2,
 	NVME_DSM_ATTR_DEALLOCATE		= 0x4,
 };
 
 enum nvme_activate_action {
 	NVME_AA_REPLACE_NO_ACTIVATE		= 0x0,
 	NVME_AA_REPLACE_ACTIVATE		= 0x1,
 	NVME_AA_ACTIVATE			= 0x2,
 };
 
 struct nvme_power_state {
 	/** Maximum Power */
 	uint16_t	mp;			/* Maximum Power */
 	uint8_t		ps_rsvd1;
 	uint8_t		mps_nops;		/* Max Power Scale, Non-Operational State */
 
 	uint32_t	enlat;			/* Entry Latency */
 	uint32_t	exlat;			/* Exit Latency */
 
 	uint8_t		rrt;			/* Relative Read Throughput */
 	uint8_t		rrl;			/* Relative Read Latency */
 	uint8_t		rwt;			/* Relative Write Throughput */
 	uint8_t		rwl;			/* Relative Write Latency */
 
 	uint16_t	idlp;			/* Idle Power */
 	uint8_t		ips;			/* Idle Power Scale */
 	uint8_t		ps_rsvd8;
 
 	uint16_t	actp;			/* Active Power */
 	uint8_t		apw_aps;		/* Active Power Workload, Active Power Scale */
 	uint8_t		ps_rsvd10[9];
 } __packed;
 
 _Static_assert(sizeof(struct nvme_power_state) == 32, "bad size for nvme_power_state");
 
 #define NVME_SERIAL_NUMBER_LENGTH	20
 #define NVME_MODEL_NUMBER_LENGTH	40
 #define NVME_FIRMWARE_REVISION_LENGTH	8
 
 struct nvme_controller_data {
 
 	/* bytes 0-255: controller capabilities and features */
 
 	/** pci vendor id */
 	uint16_t		vid;
 
 	/** pci subsystem vendor id */
 	uint16_t		ssvid;
 
 	/** serial number */
 	uint8_t			sn[NVME_SERIAL_NUMBER_LENGTH];
 
 	/** model number */
 	uint8_t			mn[NVME_MODEL_NUMBER_LENGTH];
 
 	/** firmware revision */
 	uint8_t			fr[NVME_FIRMWARE_REVISION_LENGTH];
 
 	/** recommended arbitration burst */
 	uint8_t			rab;
 
 	/** ieee oui identifier */
 	uint8_t			ieee[3];
 
 	/** multi-interface capabilities */
 	uint8_t			mic;
 
 	/** maximum data transfer size */
 	uint8_t			mdts;
 
 	/** Controller ID */
 	uint16_t		ctrlr_id;
 
 	/** Version */
 	uint32_t		ver;
 
 	/** RTD3 Resume Latency */
 	uint32_t		rtd3r;
 
 	/** RTD3 Enter Latency */
 	uint32_t		rtd3e;
 
 	/** Optional Asynchronous Events Supported */
 	uint32_t		oaes;	/* bitfield really */
 
 	/** Controller Attributes */
 	uint32_t		ctratt;	/* bitfield really */
 
-	uint8_t			reserved1[12];
+	/** Read Recovery Levels Supported */
+	uint16_t		rrls;
 
+	uint8_t			reserved1[9];
+
+	/** Controller Type */
+	uint8_t			cntrltype;
+
 	/** FRU Globally Unique Identifier */
 	uint8_t			fguid[16];
 
-	uint8_t			reserved2[128];
+	/** Command Retry Delay Time 1 */
+	uint16_t		crdt1;
 
+	/** Command Retry Delay Time 2 */
+	uint16_t		crdt2;
+
+	/** Command Retry Delay Time 3 */
+	uint16_t		crdt3;
+
+	uint8_t			reserved2[122];
+
 	/* bytes 256-511: admin command set attributes */
 
 	/** optional admin command support */
 	uint16_t		oacs;
 
 	/** abort command limit */
 	uint8_t			acl;
 
 	/** asynchronous event request limit */
 	uint8_t			aerl;
 
 	/** firmware updates */
 	uint8_t			frmw;
 
 	/** log page attributes */
 	uint8_t			lpa;
 
 	/** error log page entries */
 	uint8_t			elpe;
 
 	/** number of power states supported */
 	uint8_t			npss;
 
 	/** admin vendor specific command configuration */
 	uint8_t			avscc;
 
 	/** Autonomous Power State Transition Attributes */
 	uint8_t			apsta;
 
 	/** Warning Composite Temperature Threshold */
 	uint16_t		wctemp;
 
 	/** Critical Composite Temperature Threshold */
 	uint16_t		cctemp;
 
 	/** Maximum Time for Firmware Activation */
 	uint16_t		mtfa;
 
 	/** Host Memory Buffer Preferred Size */
 	uint32_t		hmpre;
 
 	/** Host Memory Buffer Minimum Size */
 	uint32_t		hmmin;
 
 	/** Namespace capabilities */
 	struct {
 		/* if nsmgmt, report tnvmcap and unvmcap */
 		uint8_t    tnvmcap[16];
 		uint8_t    unvmcap[16];
 	} __packed untncap;
 
 	/** Replay Protected Memory Block Support */
 	uint32_t		rpmbs; /* Really a bitfield */
 
 	/** Extended Device Self-test Time */
 	uint16_t		edstt;
 
 	/** Device Self-test Options */
 	uint8_t			dsto; /* Really a bitfield */
 
 	/** Firmware Update Granularity */
 	uint8_t			fwug;
 
 	/** Keep Alive Support */
 	uint16_t		kas;
 
 	/** Host Controlled Thermal Management Attributes */
 	uint16_t		hctma; /* Really a bitfield */
 
 	/** Minimum Thermal Management Temperature */
 	uint16_t		mntmt;
 
 	/** Maximum Thermal Management Temperature */
 	uint16_t		mxtmt;
 
 	/** Sanitize Capabilities */
 	uint32_t		sanicap; /* Really a bitfield */
 
-	uint8_t			reserved3[180];
+	/** Host Memory Buffer Minimum Descriptor Entry Size */
+	uint32_t		hmminds;
+
+	/** Host Memory Maximum Descriptors Entries */
+	uint16_t		hmmaxd;
+
+	/** NVM Set Identifier Maximum */
+	uint16_t		nsetidmax;
+
+	/** Endurance Group Identifier Maximum */
+	uint16_t		endgidmax;
+
+	/** ANA Transition Time */
+	uint8_t			anatt;
+
+	/** Asymmetric Namespace Access Capabilities */
+	uint8_t			anacap;
+
+	/** ANA Group Identifier Maximum */
+	uint32_t		anagrpmax;
+
+	/** Number of ANA Group Identifiers */
+	uint32_t		nanagrpid;
+
+	/** Persistent Event Log Size */
+	uint32_t		pels;
+
+	uint8_t			reserved3[156];
 	/* bytes 512-703: nvm command set attributes */
 
 	/** submission queue entry size */
 	uint8_t			sqes;
 
 	/** completion queue entry size */
 	uint8_t			cqes;
 
 	/** Maximum Outstanding Commands */
 	uint16_t		maxcmd;
 
 	/** number of namespaces */
 	uint32_t		nn;
 
 	/** optional nvm command support */
 	uint16_t		oncs;
 
 	/** fused operation support */
 	uint16_t		fuses;
 
 	/** format nvm attributes */
 	uint8_t			fna;
 
 	/** volatile write cache */
 	uint8_t			vwc;
 
 	/** Atomic Write Unit Normal */
 	uint16_t		awun;
 
 	/** Atomic Write Unit Power Fail */
 	uint16_t		awupf;
 
 	/** NVM Vendor Specific Command Configuration */
 	uint8_t			nvscc;
-	uint8_t			reserved5;
 
+	/** Namespace Write Protection Capabilities */
+	uint8_t			nwpc;
+
 	/** Atomic Compare & Write Unit */
 	uint16_t		acwu;
 	uint16_t		reserved6;
 
 	/** SGL Support */
 	uint32_t		sgls;
 
+	/** Maximum Number of Allowed Namespaces */
+	uint32_t		mnan;
+
 	/* bytes 544-767: Reserved */
-	uint8_t			reserved7[228];
+	uint8_t			reserved7[224];
 
 	/** NVM Subsystem NVMe Qualified Name */
 	uint8_t			subnqn[256];
 
 	/* bytes 1024-1791: Reserved */
 	uint8_t			reserved8[768];
 
 	/* bytes 1792-2047: NVMe over Fabrics specification */
 	uint8_t			reserved9[256];
 
 	/* bytes 2048-3071: power state descriptors */
 	struct nvme_power_state power_state[32];
 
 	/* bytes 3072-4095: vendor specific */
 	uint8_t			vs[1024];
 } __packed __aligned(4);
 
 _Static_assert(sizeof(struct nvme_controller_data) == 4096, "bad size for nvme_controller_data");
 
 struct nvme_namespace_data {
 
 	/** namespace size */
 	uint64_t		nsze;
 
 	/** namespace capacity */
 	uint64_t		ncap;
 
 	/** namespace utilization */
 	uint64_t		nuse;
 
 	/** namespace features */
 	uint8_t			nsfeat;
 
 	/** number of lba formats */
 	uint8_t			nlbaf;
 
 	/** formatted lba size */
 	uint8_t			flbas;
 
 	/** metadata capabilities */
 	uint8_t			mc;
 
 	/** end-to-end data protection capabilities */
 	uint8_t			dpc;
 
 	/** end-to-end data protection type settings */
 	uint8_t			dps;
 
 	/** Namespace Multi-path I/O and Namespace Sharing Capabilities */
 	uint8_t			nmic;
 
 	/** Reservation Capabilities */
 	uint8_t			rescap;
 
 	/** Format Progress Indicator */
 	uint8_t			fpi;
 
 	/** Deallocate Logical Block Features */
 	uint8_t			dlfeat;
 
 	/** Namespace Atomic Write Unit Normal  */
 	uint16_t		nawun;
 
 	/** Namespace Atomic Write Unit Power Fail */
 	uint16_t		nawupf;
 
 	/** Namespace Atomic Compare & Write Unit */
 	uint16_t		nacwu;
 
 	/** Namespace Atomic Boundary Size Normal */
 	uint16_t		nabsn;
 
 	/** Namespace Atomic Boundary Offset */
 	uint16_t		nabo;
 
 	/** Namespace Atomic Boundary Size Power Fail */
 	uint16_t		nabspf;
 
 	/** Namespace Optimal IO Boundary */
 	uint16_t		noiob;
 
 	/** NVM Capacity */
 	uint8_t			nvmcap[16];
 
-	/* bytes 64-103: Reserved */
-	uint8_t			reserved5[40];
+	/** Namespace Preferred Write Granularity  */
+	uint16_t		npwg;
 
+	/** Namespace Preferred Write Alignment */
+	uint16_t		npwa;
+
+	/** Namespace Preferred Deallocate Granularity */
+	uint16_t		npdg;
+
+	/** Namespace Preferred Deallocate Alignment */
+	uint16_t		npda;
+
+	/** Namespace Optimal Write Size */
+	uint16_t		nows;
+
+	/* bytes 74-91: Reserved */
+	uint8_t			reserved5[18];
+
+	/** ANA Group Identifier */
+	uint32_t		anagrpid;
+
+	/* bytes 96-98: Reserved */
+	uint8_t			reserved6[3];
+
+	/** Namespace Attributes */
+	uint8_t			nsattr;
+
+	/** NVM Set Identifier */
+	uint16_t		nvmsetid;
+
+	/** Endurance Group Identifier */
+	uint16_t		endgid;
+
 	/** Namespace Globally Unique Identifier */
 	uint8_t			nguid[16];
 
 	/** IEEE Extended Unique Identifier */
 	uint8_t			eui64[8];
 
 	/** lba format support */
 	uint32_t		lbaf[16];
 
-	uint8_t			reserved6[192];
+	uint8_t			reserved7[192];
 
 	uint8_t			vendor_specific[3712];
 } __packed __aligned(4);
 
 /* Fail the build if the nvme_namespace_data layout drifts from 4096 bytes. */
 _Static_assert(sizeof(struct nvme_namespace_data) == 4096, "bad size for nvme_namespace_data");
 
 /*
  * Log page identifiers.  The inline range comments describe how the 8-bit
  * identifier space is carved up; only the pages named here are defined.
  */
 enum nvme_log_page {
 
 	/* 0x00 - reserved */
 	NVME_LOG_ERROR			= 0x01,
 	NVME_LOG_HEALTH_INFORMATION	= 0x02,
 	NVME_LOG_FIRMWARE_SLOT		= 0x03,
 	NVME_LOG_CHANGED_NAMESPACE	= 0x04,
 	NVME_LOG_COMMAND_EFFECT		= 0x05,
 	/* 0x06-0x7F - reserved */
 	/* 0x80-0xBF - I/O command set specific */
 	NVME_LOG_RES_NOTIFICATION	= 0x80,
 	/* 0xC0-0xFF - vendor specific */
 
 	/*
 	 * The following are Intel Specific log pages, but they seem
 	 * to be widely implemented.
 	 */
 	INTEL_LOG_READ_LAT_LOG		= 0xc1,
 	INTEL_LOG_WRITE_LAT_LOG		= 0xc2,
 	INTEL_LOG_TEMP_STATS		= 0xc5,
 	INTEL_LOG_ADD_SMART		= 0xca,
 	INTEL_LOG_DRIVE_MKT_NAME	= 0xdd,
 
 	/*
 	 * HGST log page, with lots of sub pages.
 	 * Note: this shares the value 0xc1 with INTEL_LOG_READ_LAT_LOG.
 	 */
 	HGST_INFO_LOG			= 0xc1,
 };
 
 /*
  * Layout of one error information entry.  The fields add up to 64 bytes
  * (8 + 2 + 2 + 2 + 2 + 8 + 4 + 1 + 35).  Multi-byte fields are converted
  * from little-endian by nvme_error_information_entry_swapbytes().
  */
 struct nvme_error_information_entry {
 
 	uint64_t		error_count;
 	uint16_t		sqid;
 	uint16_t		cid;
 	uint16_t		status;
 	uint16_t		error_location;
 	uint64_t		lba;
 	uint32_t		nsid;
 	uint8_t			vendor_specific;
 	/* pads the entry out to 64 bytes */
 	uint8_t			reserved[35];
 } __packed __aligned(4);
 
 _Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry");
 
 /*
  * Layout of the health information page; the fields add up to 512 bytes.
  * nvme_health_information_page_swapbytes() converts the multi-byte fields
  * (including the 128-bit counters) to host byte order.
  */
 struct nvme_health_information_page {
 
 	uint8_t			critical_warning;
 	uint16_t		temperature;
 	uint8_t			available_spare;
 	uint8_t			available_spare_threshold;
 	uint8_t			percentage_used;
 
 	uint8_t			reserved[26];
 
 	/*
 	 * Note that the following are 128-bit values, but are
 	 *  defined as an array of 2 64-bit values.
 	 */
 	/* Data Units Read is always in 512-byte units. */
 	uint64_t		data_units_read[2];
 	/* Data Units Written is always in 512-byte units. */
 	uint64_t		data_units_written[2];
 	/* For NVM command set, this includes Compare commands. */
 	uint64_t		host_read_commands[2];
 	uint64_t		host_write_commands[2];
 	/* Controller Busy Time is reported in minutes. */
 	uint64_t		controller_busy_time[2];
 	uint64_t		power_cycles[2];
 	uint64_t		power_on_hours[2];
 	uint64_t		unsafe_shutdowns[2];
 	uint64_t		media_errors[2];
 	uint64_t		num_error_info_log_entries[2];
 	uint32_t		warning_temp_time;
 	uint32_t		error_temp_time;
 	uint16_t		temp_sensor[8];
 
 	/* pads the page out to 512 bytes */
 	uint8_t			reserved2[296];
 } __packed __aligned(4);
 
 _Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page");
 
 /*
  * Layout of the firmware page: 1 + 7 + 7 * 8 + 448 = 512 bytes.  The
  * revision[] entries are converted by nvme_firmware_page_swapbytes().
  */
 struct nvme_firmware_page {
 
 	uint8_t			afi;
 	uint8_t			reserved[7];
 	uint64_t		revision[7]; /* revisions for 7 slots */
 	uint8_t			reserved2[448];
 } __packed __aligned(4);
 
 _Static_assert(sizeof(struct nvme_firmware_page) == 512, "bad size for nvme_firmware_page");
 
 /*
  * A list of 1024 32-bit entries (4096 bytes in total).  Each entry is
  * converted from little-endian by nvme_ns_list_swapbytes().
  */
 struct nvme_ns_list {
 	uint32_t		ns[1024];
 } __packed __aligned(4);
 
 _Static_assert(sizeof(struct nvme_ns_list) == 4096, "bad size for nvme_ns_list");
 
 /*
  * Layout of the Intel temperature statistics data: thirteen 64-bit words.
  * intel_log_temp_stats_swapbytes() converts every word except _rsvd[].
  */
 struct intel_log_temp_stats
 {
 	uint64_t	current;
 	uint64_t	overtemp_flag_last;
 	uint64_t	overtemp_flag_life;
 	uint64_t	max_temp;
 	uint64_t	min_temp;
 	uint64_t	_rsvd[5];
 	uint64_t	max_oper_temp;
 	uint64_t	min_oper_temp;
 	uint64_t	est_offset;
 } __packed __aligned(4);
 
 _Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats");
 
 #define NVME_TEST_MAX_THREADS	128
 
 /*
  * Parameters and results of an I/O test run.
  * NOTE(review): io_completed[] presumably holds one completion counter per
  * test thread, up to NVME_TEST_MAX_THREADS -- confirm against the test code.
  */
 struct nvme_io_test {
 
 	enum nvme_nvm_opcode	opc;
 	uint32_t		size;
 	uint32_t		time;	/* in seconds */
 	uint32_t		num_threads;
 	uint32_t		flags;	/* presumably enum nvme_io_test_flags bits -- confirm */
 	uint64_t		io_completed[NVME_TEST_MAX_THREADS];
 };
 
 /* Flag bits for an I/O test; currently a single bit is defined. */
 enum nvme_io_test_flags {
 
 	/*
 	 * Specifies whether dev_refthread/dev_relthread should be
 	 *  called during NVME_BIO_TEST.  Ignored for other test
 	 *  types.
 	 */
 	NVME_TEST_FLAG_REFTHREAD =	0x1,
 };
 
 /*
  * Descriptor for a passthrough command: the command to issue, the
  * completion returned for it, and the data buffer (address, length and
  * direction) that goes with it.  Consumed by nvme_ctrlr_passthrough_cmd().
  */
 struct nvme_pt_command {
 
 	/*
 	 * cmd is used to specify a passthrough command to a controller or
 	 *  namespace.
 	 *
 	 * The following fields from cmd may be specified by the caller:
 	 *	* opc  (opcode)
 	 *	* nsid (namespace id) - for admin commands only
 	 *	* cdw10-cdw15
 	 *
 	 * Remaining fields must be set to 0 by the caller.
 	 */
 	struct nvme_command	cmd;
 
 	/*
 	 * cpl returns completion status for the passthrough command
 	 *  specified by cmd.
 	 *
 	 * The following fields will be filled out by the driver, for
 	 *  consumption by the caller:
 	 *	* cdw0
 	 *	* status (except for phase)
 	 *
 	 * Remaining fields will be set to 0 by the driver.
 	 */
 	struct nvme_completion	cpl;
 
 	/* buf is the data buffer associated with this passthrough command. */
 	void *			buf;
 
 	/*
 	 * len is the length of the data buffer associated with this
 	 *  passthrough command.
 	 */
 	uint32_t		len;
 
 	/*
 	 * is_read = 1 if the passthrough command will read data into the
 	 *  supplied buffer from the controller.
 	 *
 	 * is_read = 0 if the passthrough command will write data from the
 	 *  supplied buffer to the controller.
 	 */
 	uint32_t		is_read;
 
 	/*
 	 * driver_lock is used by the driver only.  It must be set to 0
 	 *  by the caller.
 	 */
 	struct mtx *		driver_lock;
 };
 
 /*
  * Non-zero when either the SC or the SCT field extracted from the
  * completion's status word is non-zero.  The argument is expanded twice.
  */
 #define nvme_completion_is_error(cpl)					\
 	(!(NVME_STATUS_GET_SC((cpl)->status) == 0 &&			\
 	   NVME_STATUS_GET_SCT((cpl)->status) == 0))
 
 void	nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen);
 
 #ifdef _KERNEL
 
 struct bio;
 
 struct nvme_namespace;
 struct nvme_controller;
 struct nvme_consumer;
 
 typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *);
 
 typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *);
 typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *);
 typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *,
 				     uint32_t, void *, uint32_t);
 typedef void (*nvme_cons_fail_fn_t)(void *);
 
 /* Bit flags; presumably the values reported by nvme_ns_get_flags() -- confirm. */
 enum nvme_namespace_flags {
 	NVME_NS_DEALLOCATE_SUPPORTED	= 0x1,
 	NVME_NS_FLUSH_SUPPORTED		= 0x2,
 };
 
 int	nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
 				   struct nvme_pt_command *pt,
 				   uint32_t nsid, int is_user_buffer,
 				   int is_admin_cmd);
 
 /* Admin functions */
 void	nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr,
 				   uint8_t feature, uint32_t cdw11,
 				   void *payload, uint32_t payload_size,
 				   nvme_cb_fn_t cb_fn, void *cb_arg);
 void	nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr,
 				   uint8_t feature, uint32_t cdw11,
 				   void *payload, uint32_t payload_size,
 				   nvme_cb_fn_t cb_fn, void *cb_arg);
 void	nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr,
 				    uint8_t log_page, uint32_t nsid,
 				    void *payload, uint32_t payload_size,
 				    nvme_cb_fn_t cb_fn, void *cb_arg);
 
 /* NVM I/O functions */
 int	nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload,
 			  uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn,
 			  void *cb_arg);
 int	nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp,
 			      nvme_cb_fn_t cb_fn, void *cb_arg);
 int	nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload,
 			 uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn,
 			 void *cb_arg);
 int	nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp,
 			      nvme_cb_fn_t cb_fn, void *cb_arg);
 int	nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload,
 			       uint8_t num_ranges, nvme_cb_fn_t cb_fn,
 			       void *cb_arg);
 int	nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn,
 			  void *cb_arg);
 int	nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset,
 		     size_t len);
 
 /* Registration functions */
 struct nvme_consumer *	nvme_register_consumer(nvme_cons_ns_fn_t    ns_fn,
 					       nvme_cons_ctrlr_fn_t ctrlr_fn,
 					       nvme_cons_async_fn_t async_fn,
 					       nvme_cons_fail_fn_t  fail_fn);
 void		nvme_unregister_consumer(struct nvme_consumer *consumer);
 
 /* Controller helper functions */
 device_t	nvme_ctrlr_get_device(struct nvme_controller *ctrlr);
 const struct nvme_controller_data *
 		nvme_ctrlr_get_data(struct nvme_controller *ctrlr);
 /*
  * Report whether the DSM field of the controller's ONCS word is non-zero.
  * The caller must pass data that nvme_controller_data_swapbytes() has
  * already converted to host byte order.
  */
 static inline bool
 nvme_ctrlr_has_dataset_mgmt(const struct nvme_controller_data *cd)
 {
 	return (((cd->oncs >> NVME_CTRLR_DATA_ONCS_DSM_SHIFT) &
 	    NVME_CTRLR_DATA_ONCS_DSM_MASK) != 0);
 }
 
 /* Namespace helper functions */
 uint32_t	nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns);
 uint32_t	nvme_ns_get_sector_size(struct nvme_namespace *ns);
 uint64_t	nvme_ns_get_num_sectors(struct nvme_namespace *ns);
 uint64_t	nvme_ns_get_size(struct nvme_namespace *ns);
 uint32_t	nvme_ns_get_flags(struct nvme_namespace *ns);
 const char *	nvme_ns_get_serial_number(struct nvme_namespace *ns);
 const char *	nvme_ns_get_model_number(struct nvme_namespace *ns);
 const struct nvme_namespace_data *
 		nvme_ns_get_data(struct nvme_namespace *ns);
 uint32_t	nvme_ns_get_stripesize(struct nvme_namespace *ns);
 
 int	nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
 			    nvme_cb_fn_t cb_fn);
 
 /*
  * Command building helper functions -- shared with CAM
  * These functions assume allocator zeros out cmd structure
  * CAM's xpt_get_ccb and the request allocator for nvme both
  * do zero'd allocations.
  */
 /*
  * Build a flush command: only the opcode and the little-endian namespace
  * id are written; every other field of cmd is left as the caller had it.
  */
 static inline void
 nvme_ns_flush_cmd(struct nvme_command *cmd, uint32_t nsid)
 {
 	const uint32_t nsid_le = htole32(nsid);
 
 	cmd->nsid = nsid_le;
 	cmd->opc = NVME_OPC_FLUSH;
 }
 
 /*
  * Build a read/write style command.  The 64-bit starting LBA is split
  * across cdw10 (low half) and cdw11 (high half); cdw12 receives the block
  * count minus one.  All values are stored little-endian.
  */
 static inline void
 nvme_ns_rw_cmd(struct nvme_command *cmd, uint32_t rwcmd, uint32_t nsid,
     uint64_t lba, uint32_t count)
 {
 	const uint32_t lba_lo = (uint32_t)(lba & 0xffffffffu);
 	const uint32_t lba_hi = (uint32_t)(lba >> 32);
 
 	cmd->opc = rwcmd;
 	cmd->nsid = htole32(nsid);
 	cmd->cdw10 = htole32(lba_lo);
 	cmd->cdw11 = htole32(lba_hi);
 	cmd->cdw12 = htole32(count - 1);
 }
 
 /* Build a write command; thin wrapper passing NVME_OPC_WRITE to nvme_ns_rw_cmd(). */
 static inline void
 nvme_ns_write_cmd(struct nvme_command *cmd, uint32_t nsid, uint64_t lba,
     uint32_t count)
 {
 
 	nvme_ns_rw_cmd(cmd, NVME_OPC_WRITE, nsid, lba, count);
 }
 
 /* Build a read command; thin wrapper passing NVME_OPC_READ to nvme_ns_rw_cmd(). */
 static inline void
 nvme_ns_read_cmd(struct nvme_command *cmd, uint32_t nsid, uint64_t lba,
     uint32_t count)
 {
 
 	nvme_ns_rw_cmd(cmd, NVME_OPC_READ, nsid, lba, count);
 }
 
 /*
  * Build a dataset management command with the deallocate attribute.
  * cdw10 receives num_ranges minus one; values are stored little-endian.
  */
 static inline void
 nvme_ns_trim_cmd(struct nvme_command *cmd, uint32_t nsid, uint32_t num_ranges)
 {
 	const uint32_t last_range = num_ranges - 1;
 
 	cmd->opc = NVME_OPC_DATASET_MANAGEMENT;
 	cmd->nsid = htole32(nsid);
 	cmd->cdw10 = htole32(last_range);
 	cmd->cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
 }
 
 extern int nvme_use_nvd;
 
 #endif /* _KERNEL */
 
 /* Endianness conversion functions for NVMe structs */
 /*
  * Convert a completion entry from little-endian to host byte order in
  * place.  rsvd1 and cid are deliberately left untouched.
  */
 static inline void
 nvme_completion_swapbytes(struct nvme_completion *s)
 {
 	/* 32-bit field */
 	s->cdw0 = le32toh(s->cdw0);
 
 	/* 16-bit fields */
 	s->sqhd = le16toh(s->sqhd);
 	s->sqid = le16toh(s->sqid);
 	s->status = le16toh(s->status);
 }
 
 /*
  * Convert the multi-byte fields of one power state descriptor from
  * little-endian to host byte order in place.
  */
 static inline void
 nvme_power_state_swapbytes(struct nvme_power_state *s)
 {
 	/* 16-bit fields */
 	s->mp = le16toh(s->mp);
 	s->idlp = le16toh(s->idlp);
 	s->actp = le16toh(s->actp);
 
 	/* 32-bit fields */
 	s->enlat = le32toh(s->enlat);
 	s->exlat = le32toh(s->exlat);
 }
 
 static inline
 void	nvme_controller_data_swapbytes(struct nvme_controller_data *s)
 {
 	int i;
 
 	s->vid = le16toh(s->vid);
 	s->ssvid = le16toh(s->ssvid);
 	s->ctrlr_id = le16toh(s->ctrlr_id);
 	s->ver = le32toh(s->ver);
 	s->rtd3r = le32toh(s->rtd3r);
 	s->rtd3e = le32toh(s->rtd3e);
 	s->oaes = le32toh(s->oaes);
 	s->ctratt = le32toh(s->ctratt);
+	s->rrls = le16toh(s->rrls);
+	s->crdt1 = le16toh(s->crdt1);
+	s->crdt2 = le16toh(s->crdt2);
+	s->crdt3 = le16toh(s->crdt3);
 	s->oacs = le16toh(s->oacs);
 	s->wctemp = le16toh(s->wctemp);
 	s->cctemp = le16toh(s->cctemp);
 	s->mtfa = le16toh(s->mtfa);
 	s->hmpre = le32toh(s->hmpre);
 	s->hmmin = le32toh(s->hmmin);
 	s->rpmbs = le32toh(s->rpmbs);
 	s->edstt = le16toh(s->edstt);
 	s->kas = le16toh(s->kas);
 	s->hctma = le16toh(s->hctma);
 	s->mntmt = le16toh(s->mntmt);
 	s->mxtmt = le16toh(s->mxtmt);
 	s->sanicap = le32toh(s->sanicap);
+	s->hmminds = le32toh(s->hmminds);
+	s->hmmaxd = le16toh(s->hmmaxd);
+	s->nsetidmax = le16toh(s->nsetidmax);
+	s->endgidmax = le16toh(s->endgidmax);
+	s->anagrpmax = le32toh(s->anagrpmax);
+	s->nanagrpid = le32toh(s->nanagrpid);
+	s->pels = le32toh(s->pels);
 	s->maxcmd = le16toh(s->maxcmd);
 	s->nn = le32toh(s->nn);
 	s->oncs = le16toh(s->oncs);
 	s->fuses = le16toh(s->fuses);
 	s->awun = le16toh(s->awun);
 	s->awupf = le16toh(s->awupf);
 	s->acwu = le16toh(s->acwu);
 	s->sgls = le32toh(s->sgls);
+	s->mnan = le32toh(s->mnan);
 	for (i = 0; i < 32; i++)
 		nvme_power_state_swapbytes(&s->power_state[i]);
 }
 
 static inline
 void	nvme_namespace_data_swapbytes(struct nvme_namespace_data *s)
 {
 	int i;
 
 	s->nsze = le64toh(s->nsze);
 	s->ncap = le64toh(s->ncap);
 	s->nuse = le64toh(s->nuse);
 	s->nawun = le16toh(s->nawun);
 	s->nawupf = le16toh(s->nawupf);
 	s->nacwu = le16toh(s->nacwu);
 	s->nabsn = le16toh(s->nabsn);
 	s->nabo = le16toh(s->nabo);
 	s->nabspf = le16toh(s->nabspf);
 	s->noiob = le16toh(s->noiob);
+	s->npwg = le16toh(s->npwg);
+	s->npwa = le16toh(s->npwa);
+	s->npdg = le16toh(s->npdg);
+	s->npda = le16toh(s->npda);
+	s->nows = le16toh(s->nows);
+	s->anagrpid = le32toh(s->anagrpid);
+	s->nvmsetid = le16toh(s->nvmsetid);
+	s->endgid = le16toh(s->endgid);
 	for (i = 0; i < 16; i++)
 		s->lbaf[i] = le32toh(s->lbaf[i]);
 }
 
 /*
  * Convert the multi-byte fields of an error information entry from
  * little-endian to host byte order in place.
  */
 static inline void
 nvme_error_information_entry_swapbytes(struct nvme_error_information_entry *s)
 {
 	/* 64-bit fields */
 	s->error_count = le64toh(s->error_count);
 	s->lba = le64toh(s->lba);
 
 	/* 32-bit field */
 	s->nsid = le32toh(s->nsid);
 
 	/* 16-bit fields */
 	s->sqid = le16toh(s->sqid);
 	s->cid = le16toh(s->cid);
 	s->status = le16toh(s->status);
 	s->error_location = le16toh(s->error_location);
 }
 
 /*
  * Convert a 16-byte little-endian quantity at p to host byte order in
  * place.  When _BYTE_ORDER differs from _LITTLE_ENDIAN the 16 bytes are
  * reversed; otherwise this is a no-op.
  */
 static inline void
 nvme_le128toh(void *p)
 {
 #if _BYTE_ORDER != _LITTLE_ENDIAN
 	char *lo = (char *)p;
 	char *hi = lo + 15;
 	char saved;
 
 	/* Walk inward from both ends, exchanging mirrored bytes. */
 	while (lo < hi) {
 		saved = *lo;
 		*lo++ = *hi;
 		*hi-- = saved;
 	}
 #else
 	(void)p;
 #endif
 }
 
 /*
  * Convert a health information page from little-endian to host byte order
  * in place: the 16-bit temperature, the ten 128-bit counters, the two
  * 32-bit temperature times and all eight temperature sensor readings.
  */
 static inline void
 nvme_health_information_page_swapbytes(struct nvme_health_information_page *s)
 {
 	int sensor;
 
 	s->temperature = le16toh(s->temperature);
 
 	/* 128-bit counters, each stored as two 64-bit words */
 	nvme_le128toh((void *)s->data_units_read);
 	nvme_le128toh((void *)s->data_units_written);
 	nvme_le128toh((void *)s->host_read_commands);
 	nvme_le128toh((void *)s->host_write_commands);
 	nvme_le128toh((void *)s->controller_busy_time);
 	nvme_le128toh((void *)s->power_cycles);
 	nvme_le128toh((void *)s->power_on_hours);
 	nvme_le128toh((void *)s->unsafe_shutdowns);
 	nvme_le128toh((void *)s->media_errors);
 	nvme_le128toh((void *)s->num_error_info_log_entries);
 
 	s->warning_temp_time = le32toh(s->warning_temp_time);
 	s->error_temp_time = le32toh(s->error_temp_time);
 
 	for (sensor = 0; sensor < 8; sensor++) {
 		s->temp_sensor[sensor] = le16toh(s->temp_sensor[sensor]);
 	}
 }
 
 
 /*
  * Convert the seven 64-bit revision entries of a firmware page from
  * little-endian to host byte order in place.
  */
 static inline void
 nvme_firmware_page_swapbytes(struct nvme_firmware_page *s)
 {
 	int slot;
 
 	for (slot = 0; slot < 7; slot++) {
 		s->revision[slot] = le64toh(s->revision[slot]);
 	}
 }
 
 /*
  * Convert all 1024 entries of a namespace list from little-endian to host
  * byte order in place.
  */
 static inline void
 nvme_ns_list_swapbytes(struct nvme_ns_list *s)
 {
 	int entry;
 
 	for (entry = 0; entry < 1024; entry++) {
 		s->ns[entry] = le32toh(s->ns[entry]);
 	}
 }
 
 /*
  * Convert every 64-bit word of an Intel temperature statistics structure,
  * except the reserved words, from little-endian to host byte order in place.
  */
 static inline void
 intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s)
 {
 	/* words preceding _rsvd[] */
 	s->current = le64toh(s->current);
 	s->overtemp_flag_last = le64toh(s->overtemp_flag_last);
 	s->overtemp_flag_life = le64toh(s->overtemp_flag_life);
 	s->max_temp = le64toh(s->max_temp);
 	s->min_temp = le64toh(s->min_temp);
 
 	/* _rsvd[] is skipped; words following it */
 	s->max_oper_temp = le64toh(s->max_oper_temp);
 	s->min_oper_temp = le64toh(s->min_oper_temp);
 	s->est_offset = le64toh(s->est_offset);
 }
 
 #endif /* __NVME_H__ */
Index: projects/fuse2/sys/dev/usb/net/if_urndis.c
===================================================================
--- projects/fuse2/sys/dev/usb/net/if_urndis.c	(revision 350434)
+++ projects/fuse2/sys/dev/usb/net/if_urndis.c	(revision 350435)
@@ -1,1055 +1,1058 @@
 /*	$OpenBSD: if_urndis.c,v 1.46 2013/12/09 15:45:29 pirofti Exp $ */
 
 /*
  * Copyright (c) 2010 Jonathan Armani <armani@openbsd.org>
  * Copyright (c) 2010 Fabien Romano <fabien@openbsd.org>
  * Copyright (c) 2010 Michael Knudsen <mk@openbsd.org>
  * Copyright (c) 2014 Hans Petter Selasky <hselasky@freebsd.org>
  * All rights reserved.
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
  * copyright notice and this permission notice appear in all copies.
  *
  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/stdint.h>
 #include <sys/stddef.h>
 #include <sys/param.h>
 #include <sys/queue.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/module.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
 #include <sys/unistd.h>
 #include <sys/callout.h>
 #include <sys/malloc.h>
 #include <sys/priv.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/rndis.h>
 
 #include <dev/usb/usb.h>
 #include <dev/usb/usbdi.h>
 #include <dev/usb/usbdi_util.h>
 #include "usbdevs.h"
 
 #define	USB_DEBUG_VAR urndis_debug
 #include <dev/usb/usb_debug.h>
 #include <dev/usb/usb_process.h>
 #include "usb_if.h"
 
 #include <dev/usb/net/usb_ethernet.h>
 #include <dev/usb/net/if_urndisreg.h>
 
 #include <dev/usb/usb_cdc.h>
 
 static device_probe_t urndis_probe;
 static device_attach_t urndis_attach;
 static device_detach_t urndis_detach;
 static device_suspend_t urndis_suspend;
 static device_resume_t urndis_resume;
 
 static usb_callback_t urndis_bulk_write_callback;
 static usb_callback_t urndis_bulk_read_callback;
 static usb_callback_t urndis_intr_read_callback;
 
 static uether_fn_t urndis_attach_post;
 static uether_fn_t urndis_init;
 static uether_fn_t urndis_stop;
 static uether_fn_t urndis_start;
 static uether_fn_t urndis_setmulti;
 static uether_fn_t urndis_setpromisc;
 
 static uint32_t	urndis_ctrl_query(struct urndis_softc *sc, uint32_t oid,
 		    struct rndis_query_req *msg, uint16_t len,
 		    const void **rbuf, uint16_t *rbufsz);
 static uint32_t	urndis_ctrl_set(struct urndis_softc *sc, uint32_t oid,
 		    struct rndis_set_req *msg, uint16_t len);
 static uint32_t	urndis_ctrl_handle_init(struct urndis_softc *sc,
 		    const struct rndis_comp_hdr *hdr);
 static uint32_t	urndis_ctrl_handle_query(struct urndis_softc *sc,
 		    const struct rndis_comp_hdr *hdr, const void **buf,
 		    uint16_t *bufsz);
 static uint32_t	urndis_ctrl_handle_reset(struct urndis_softc *sc,
 		    const struct rndis_comp_hdr *hdr);
 static uint32_t	urndis_ctrl_init(struct urndis_softc *sc);
 static uint32_t	urndis_ctrl_halt(struct urndis_softc *sc);
 
 #ifdef USB_DEBUG
 static int urndis_debug = 0;
 static	SYSCTL_NODE(_hw_usb, OID_AUTO, urndis, CTLFLAG_RW, 0, "USB RNDIS-Ethernet");
 SYSCTL_INT(_hw_usb_urndis, OID_AUTO, debug, CTLFLAG_RWTUN, &urndis_debug, 0,
     "Debug level");
 #endif
 
 /*
  * USB transfer templates, indexed by transfer id.  Both bulk transfers use
  * if_index 0 and the interrupt transfer uses if_index 1; urndis_attach()
  * passes a two-entry iface_index[] array alongside this table to
  * usbd_transfer_setup().
  */
 static const struct usb_config urndis_config[URNDIS_N_TRANSFER] = {
 	/* bulk receive: single frame, short transfers accepted */
 	[URNDIS_BULK_RX] = {
 		.type = UE_BULK,
 		.endpoint = UE_ADDR_ANY,
 		.direction = UE_DIR_RX,
 		.if_index = 0,
 		.frames = 1,
 		.bufsize = RNDIS_RX_MAXLEN,
 		.flags = {.short_xfer_ok = 1,},
 		.callback = urndis_bulk_read_callback,
 		.timeout = 0,		/* no timeout */
 		.usb_mode = USB_MODE_HOST,
 	},
 
 	/* bulk transmit: up to RNDIS_TX_FRAMES_MAX frames per transfer */
 	[URNDIS_BULK_TX] = {
 		.type = UE_BULK,
 		.endpoint = UE_ADDR_ANY,
 		.direction = UE_DIR_TX,
 		.if_index = 0,
 		.frames = RNDIS_TX_FRAMES_MAX,
 		.bufsize = (RNDIS_TX_FRAMES_MAX * RNDIS_TX_MAXLEN),
 		.flags = {
 			.force_short_xfer = 1,
 		},
 		.callback = urndis_bulk_write_callback,
 		.timeout = 10000,	/* 10 seconds */
 		.usb_mode = USB_MODE_HOST,
 	},
 
 	/*
 	 * interrupt receive; no_pipe_ok presumably lets setup succeed when
 	 * the endpoint is absent ("if any" in urndis_attach) -- confirm
 	 */
 	[URNDIS_INTR_RX] = {
 		.type = UE_INTERRUPT,
 		.endpoint = UE_ADDR_ANY,
 		.direction = UE_DIR_RX,
 		.if_index = 1,
 		.bufsize = 0,	/* use wMaxPacketSize */
 		.flags = {.short_xfer_ok = 1,.no_pipe_ok = 1,},
 		.callback = urndis_intr_read_callback,
 		.timeout = 0,
 		.usb_mode = USB_MODE_HOST,
 	},
 };
 
 /* Device method table wiring the urndis_* entry points into the driver. */
 static device_method_t urndis_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe, urndis_probe),
 	DEVMETHOD(device_attach, urndis_attach),
 	DEVMETHOD(device_detach, urndis_detach),
 	DEVMETHOD(device_suspend, urndis_suspend),
 	DEVMETHOD(device_resume, urndis_resume),
 
 	DEVMETHOD_END
 };
 
 /* Driver description: name, method table, and per-device softc size. */
 static driver_t urndis_driver = {
 	.name = "urndis",
 	.methods = urndis_methods,
 	.size = sizeof(struct urndis_softc),
 };
 
 static devclass_t urndis_devclass;
 
 static const STRUCT_USB_HOST_ID urndis_host_devs[] = {
 	/* Generic RNDIS class match */
 	{USB_IFACE_CLASS(UICLASS_CDC),
 		USB_IFACE_SUBCLASS(UISUBCLASS_ABSTRACT_CONTROL_MODEL),
 		USB_IFACE_PROTOCOL(0xff)},
 	{USB_IFACE_CLASS(UICLASS_WIRELESS), USB_IFACE_SUBCLASS(UISUBCLASS_RF),
 		USB_IFACE_PROTOCOL(UIPROTO_RNDIS)},
 	{USB_IFACE_CLASS(UICLASS_IAD), USB_IFACE_SUBCLASS(UISUBCLASS_SYNC),
 		USB_IFACE_PROTOCOL(UIPROTO_ACTIVESYNC)},
 	/* HP-WebOS */
 	{USB_VENDOR(USB_VENDOR_PALM), USB_IFACE_CLASS(UICLASS_CDC),
 		USB_IFACE_SUBCLASS(UISUBCLASS_ABSTRACT_CONTROL_MODEL),
 		USB_IFACE_PROTOCOL(0xff)},
+	/* Nokia 7 plus */
+	{USB_IFACE_CLASS(UICLASS_IAD), USB_IFACE_SUBCLASS(0x4),
+		USB_IFACE_PROTOCOL(UIPROTO_ACTIVESYNC)},
 };
 
 DRIVER_MODULE(urndis, uhub, urndis_driver, urndis_devclass, NULL, NULL);
 MODULE_VERSION(urndis, 1);
 MODULE_DEPEND(urndis, uether, 1, 1, 1);
 MODULE_DEPEND(urndis, usb, 1, 1, 1);
 MODULE_DEPEND(urndis, ether, 1, 1, 1);
 USB_PNP_HOST_INFO(urndis_host_devs);
 
 /* usb_ether callbacks; installed as ue->ue_methods in urndis_attach(). */
 static const struct usb_ether_methods urndis_ue_methods = {
 	.ue_attach_post = urndis_attach_post,
 	.ue_start = urndis_start,
 	.ue_init = urndis_init,
 	.ue_stop = urndis_stop,
 	.ue_setmulti = urndis_setmulti,
 	.ue_setpromisc = urndis_setpromisc,
 };
 
 /*
  * Probe: look the attaching device up in urndis_host_devs and return the
  * result of that lookup unchanged.
  */
 static int
 urndis_probe(device_t dev)
 {
 	struct usb_attach_arg *uaa;
 	int match;
 
 	uaa = device_get_ivars(dev);
 	match = usbd_lookup_id_by_uaa(urndis_host_devs,
 	    sizeof(urndis_host_devs), uaa);
 
 	return (match);
 }
 
 /* ue_attach_post callback; this driver has nothing to do here. */
 static void
 urndis_attach_post(struct usb_ether *ue)
 {
 
 	/* no-op */
 }
 
 /*
  * Attach: set up the USB transfers, initialize the RNDIS device, read its
  * permanent MAC address, program the packet filter and attach the network
  * interface.  Returns 0 on success; on any failure urndis_detach() is
  * called and ENXIO is returned.
  */
 static int
 urndis_attach(device_t dev)
 {
 	/*
 	 * Request header followed by its payload.  The lengths passed to
 	 * urndis_ctrl_query()/urndis_ctrl_set() below cover the header plus
 	 * the payload member in use.  Note that this object is static, so it
 	 * is shared by every call to this function.
 	 */
 	static struct {
 		union {
 			struct rndis_query_req query;
 			struct rndis_set_req set;
 		} hdr;
 		union {
 			uint8_t eaddr[ETHER_ADDR_LEN];
 			uint32_t filter;
 		} ibuf;
 	} msg;
 	struct urndis_softc *sc = device_get_softc(dev);
 	struct usb_ether *ue = &sc->sc_ue;
 	struct usb_attach_arg *uaa = device_get_ivars(dev);
 	struct usb_cdc_cm_descriptor *cmd;
 	const void *buf;
 	uint16_t bufsz;
 	/* [0] defaults to the interface after the matched one, [1] is the matched one */
 	uint8_t iface_index[2] = { uaa->info.bIfaceIndex + 1, uaa->info.bIfaceIndex };
 	int error;
 	uint8_t i;
 
 	sc->sc_ue.ue_udev = uaa->device;
 	sc->sc_ifaceno_ctl = uaa->info.bIfaceNum;
 
 	/* a CDC CM descriptor, when present, overrides the default in iface_index[0] */
 	cmd = usbd_find_descriptor(uaa->device, NULL, uaa->info.bIfaceIndex,
 	    UDESC_CS_INTERFACE, 0xFF, UDESCSUB_CDC_CM, 0xFF);
 	if (cmd != NULL) {
 		DPRINTF("Call Mode Descriptor found, dataif=%d\n", cmd->bDataInterface);
 		iface_index[0] = cmd->bDataInterface;
 	}
 
 	device_set_usb_desc(dev);
 
 	mtx_init(&sc->sc_mtx, device_get_nameunit(dev), NULL, MTX_DEF);
 
 	/* scan the alternate settings looking for a valid one */
 	for (i = 0; i != 32; i++) {
 		error = usbd_set_alt_interface_index(uaa->device,
 		    iface_index[0], i);
 
 		/* stop at the first alternate setting that cannot be selected */
 		if (error != 0)
 			break;
 
 		error = usbd_transfer_setup(uaa->device,
 		    iface_index, sc->sc_xfer, urndis_config,
 		    URNDIS_N_TRANSFER, sc, &sc->sc_mtx);
 
 		/* transfers set up successfully: keep this alternate setting */
 		if (error == 0)
 			break;
 	}
 	if ((error != 0) || (i == 32)) {
 		device_printf(dev, "No valid alternate setting found\n");
 		goto detach;
 	}
 
 	/* Initialize device - must be done before even querying it */
 	URNDIS_LOCK(sc);
 	error = urndis_ctrl_init(sc);
 	URNDIS_UNLOCK(sc);
 	if (error != (int)RNDIS_STATUS_SUCCESS) {
 		device_printf(dev, "Unable to initialize hardware\n");
 		goto detach;
 	}
 
 	/* Determine MAC address */
 	memset(msg.ibuf.eaddr, 0, sizeof(msg.ibuf.eaddr));
 	URNDIS_LOCK(sc);
 	error = urndis_ctrl_query(sc, OID_802_3_PERMANENT_ADDRESS,
 	    &msg.hdr.query, sizeof(msg.hdr.query) + sizeof(msg.ibuf.eaddr),
 	    &buf, &bufsz);
 	URNDIS_UNLOCK(sc);
 	if (error != (int)RNDIS_STATUS_SUCCESS) {
 		device_printf(dev, "Unable to get hardware address\n");
 		goto detach;
 	}
 	/* the reply must be exactly one Ethernet address */
 	if (bufsz != ETHER_ADDR_LEN) {
 		device_printf(dev, "Invalid address length: %d bytes\n", bufsz);
 		goto detach;
 	}
 	memcpy(&sc->sc_ue.ue_eaddr, buf, ETHER_ADDR_LEN);
 
 	/* Initialize packet filter */
 	sc->sc_filter = NDIS_PACKET_TYPE_BROADCAST |
 	    NDIS_PACKET_TYPE_ALL_MULTICAST;
 	msg.ibuf.filter = htole32(sc->sc_filter);
 	URNDIS_LOCK(sc);
 	error = urndis_ctrl_set(sc, OID_GEN_CURRENT_PACKET_FILTER,
 	    &msg.hdr.set, sizeof(msg.hdr.set) + sizeof(msg.ibuf.filter));
 	URNDIS_UNLOCK(sc);
 	if (error != (int)RNDIS_STATUS_SUCCESS) {
 		device_printf(dev, "Unable to set data filters\n");
 		goto detach;
 	}
 
 	/* fill in the usb_ether glue before attaching the interface */
 	ue->ue_sc = sc;
 	ue->ue_dev = dev;
 	ue->ue_udev = uaa->device;
 	ue->ue_mtx = &sc->sc_mtx;
 	ue->ue_methods = &urndis_ue_methods;
 
 	error = uether_ifattach(ue);
 	if (error) {
 		device_printf(dev, "Could not attach interface\n");
 		goto detach;
 	}
 
 	URNDIS_LOCK(sc);
 	/* start interrupt endpoint, if any */
 	usbd_transfer_start(sc->sc_xfer[URNDIS_INTR_RX]);
 	URNDIS_UNLOCK(sc);
 
 	return (0);			/* success */
 
 detach:
 	/* common failure path: undo whatever was set up so far */
 	(void)urndis_detach(dev);
 	return (ENXIO);			/* failure */
 }
 
 /*
  * Detach: tear down the USB transfers, detach the network interface, send
  * a halt to the device and destroy the softc mutex.  Always returns 0.
  * Also used by urndis_attach() as its failure path.
  */
 static int
 urndis_detach(device_t dev)
 {
 	struct urndis_softc *sc = device_get_softc(dev);
 	struct usb_ether *ue = &sc->sc_ue;
 
 	/* stop all USB transfers first */
 	usbd_transfer_unsetup(sc->sc_xfer, URNDIS_N_TRANSFER);
 
 	uether_ifdetach(ue);
 
 	/* the halt result is deliberately ignored */
 	URNDIS_LOCK(sc);
 	(void)urndis_ctrl_halt(sc);
 	URNDIS_UNLOCK(sc);
 
 	mtx_destroy(&sc->sc_mtx);
 
 	return (0);
 }
 
 /* ue_start callback: kick off the bulk transmit and receive transfers. */
 static void
 urndis_start(struct usb_ether *ue)
 {
 	struct urndis_softc *sc = uether_getsc(ue);
 
 	/*
 	 * Start the USB transfers, if not already started:
 	 */
 	usbd_transfer_start(sc->sc_xfer[URNDIS_BULK_TX]);
 	usbd_transfer_start(sc->sc_xfer[URNDIS_BULK_RX]);
 }
 
 /*
  * ue_init callback: mark the interface running, set the stall flag on the
  * bulk transmit transfer and start the data transfers.  The softc lock
  * must be held by the caller (asserted below).
  */
 static void
 urndis_init(struct usb_ether *ue)
 {
 	struct urndis_softc *sc = uether_getsc(ue);
 	struct ifnet *ifp = uether_getifp(ue);
 
 	URNDIS_LOCK_ASSERT(sc, MA_OWNED);
 
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 
 	/* stall data write direction, which depends on USB mode */
 	usbd_xfer_set_stall(sc->sc_xfer[URNDIS_BULK_TX]);
 
 	/* start data transfers */
 	urndis_start(ue);
 }
 
 /*
  * ue_stop callback: clear the running flag and stop both bulk transfers.
  * The softc lock must be held by the caller (asserted below).
  */
 static void
 urndis_stop(struct usb_ether *ue)
 {
 	struct urndis_softc *sc = uether_getsc(ue);
 	struct ifnet *ifp = uether_getifp(ue);
 
 	URNDIS_LOCK_ASSERT(sc, MA_OWNED);
 
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 
 	/*
 	 * stop all the transfers, if not already stopped:
 	 */
 	usbd_transfer_stop(sc->sc_xfer[URNDIS_BULK_RX]);
 	usbd_transfer_stop(sc->sc_xfer[URNDIS_BULK_TX]);
 }
 
 /* ue_setmulti callback; intentionally does nothing in this driver. */
 static void
 urndis_setmulti(struct usb_ether *ue)
 {
 
 	/* no-op */
 }
 
 /* ue_setpromisc callback; intentionally does nothing in this driver. */
 static void
 urndis_setpromisc(struct usb_ether *ue)
 {
 
 	/* no-op */
 }
 
 /* Suspend method: only logs a message; always returns 0. */
 static int
 urndis_suspend(device_t dev)
 {
 
 	device_printf(dev, "Suspending\n");
 	return (0);
 }
 
 /* Resume method: only logs a message; always returns 0. */
 static int
 urndis_resume(device_t dev)
 {
 
 	device_printf(dev, "Resuming\n");
 	return (0);
 }
 
 /*
  * Issue one control request with a 2000 ms timeout and return the USB
  * error code.  Short transfers are permitted only when rt has UT_READ set.
  */
 static usb_error_t
 urndis_ctrl_msg(struct urndis_softc *sc, uint8_t rt, uint8_t r,
     uint16_t index, uint16_t value, void *buf, uint16_t buflen)
 {
 	usb_device_request_t req;
 	uint16_t xfer_flags;
 
 	if (rt & UT_READ)
 		xfer_flags = USB_SHORT_XFER_OK;
 	else
 		xfer_flags = 0;
 
 	req.bmRequestType = rt;
 	req.bRequest = r;
 	USETW(req.wIndex, index);
 	USETW(req.wValue, value);
 	USETW(req.wLength, buflen);
 
 	return (usbd_do_request_flags(sc->sc_ue.ue_udev, &sc->sc_mtx, &req,
 	    buf, xfer_flags, NULL, 2000 /* ms */ ));
 }
 
 /*
  * Send "len" bytes from "buf" to the control interface as an encapsulated
  * command (UCDC_SEND_ENCAPSULATED_COMMAND) and return the USB status.
  */
 static usb_error_t
 urndis_ctrl_send(struct urndis_softc *sc, void *buf, uint16_t len)
 {
 	usb_error_t status = urndis_ctrl_msg(sc, UT_WRITE_CLASS_INTERFACE,
 	    UCDC_SEND_ENCAPSULATED_COMMAND, sc->sc_ifaceno_ctl, 0, buf, len);
 
 	DPRINTF("%s\n", usbd_errstr(status));
 	return (status);
 }
 
 /*
  * Fetch the device's encapsulated response into sc->sc_response_buf.
  *
  * Returns a pointer to the completion header at the start of that buffer,
  * or NULL when the control request fails or the message length announced
  * by the device exceeds RNDIS_RESPONSE_LEN.
  */
 static struct rndis_comp_hdr *
 urndis_ctrl_recv(struct urndis_softc *sc)
 {
 	struct rndis_comp_hdr *reply;
 	uint32_t reply_len;
 
 	if (urndis_ctrl_msg(sc, UT_READ_CLASS_INTERFACE,
 	    UCDC_GET_ENCAPSULATED_RESPONSE, sc->sc_ifaceno_ctl, 0,
 	    sc->sc_response_buf, RNDIS_RESPONSE_LEN) !=
 	    USB_ERR_NORMAL_COMPLETION)
 		return (NULL);
 
 	reply = (struct rndis_comp_hdr *)sc->sc_response_buf;
 	reply_len = le32toh(reply->rm_len);
 
 	DPRINTF("type 0x%x len %u\n", le32toh(reply->rm_type),
 	    reply_len);
 
 	if (reply_len <= RNDIS_RESPONSE_LEN)
 		return (reply);
 
 	DPRINTF("ctrl message error: wrong size %u > %u\n",
 	    reply_len, RNDIS_RESPONSE_LEN);
 	return (NULL);
 }
 
 /*
  * Dispatch a received RNDIS completion message to the handler for its
  * message type and return the resulting RNDIS status.
  *
  * "buf" and "bufsz" are optional output pointers.  When both are supplied
  * they are cleared first, and only a query completion may set them.
  *
  * The message type is chosen by the device, so a query completion can
  * arrive even when the caller supplied no output pointers (for example in
  * reply to an init or set request).  The query handler stores through both
  * pointers unconditionally, so that combination is rejected here with
  * RNDIS_STATUS_FAILURE instead of being passed on.
  *
  * Unknown message types are logged and reported as RNDIS_STATUS_FAILURE.
  */
 static uint32_t
 urndis_ctrl_handle(struct urndis_softc *sc, struct rndis_comp_hdr *hdr,
     const void **buf, uint16_t *bufsz)
 {
 	uint32_t rval;
 
 	DPRINTF("\n");
 
 	if (buf != NULL && bufsz != NULL) {
 		*buf = NULL;
 		*bufsz = 0;
 	}
 	switch (le32toh(hdr->rm_type)) {
 	case REMOTE_NDIS_INITIALIZE_CMPLT:
 		rval = urndis_ctrl_handle_init(sc, hdr);
 		break;
 
 	case REMOTE_NDIS_QUERY_CMPLT:
 		/* Never hand NULL output pointers to the query handler. */
 		if (buf == NULL || bufsz == NULL) {
 			DPRINTF("ctrl message error: unexpected query "
 			    "completion\n");
 			rval = RNDIS_STATUS_FAILURE;
 			break;
 		}
 		rval = urndis_ctrl_handle_query(sc, hdr, buf, bufsz);
 		break;
 
 	case REMOTE_NDIS_RESET_CMPLT:
 		rval = urndis_ctrl_handle_reset(sc, hdr);
 		break;
 
 	case REMOTE_NDIS_KEEPALIVE_CMPLT:
 	case REMOTE_NDIS_SET_CMPLT:
 		/* These carry nothing but a status word. */
 		rval = le32toh(hdr->rm_status);
 		break;
 
 	default:
 		device_printf(sc->sc_ue.ue_dev,
 		    "ctrl message error: unknown event 0x%x\n",
 		    le32toh(hdr->rm_type));
 		rval = RNDIS_STATUS_FAILURE;
 		break;
 	}
 	return (rval);
 }
 
 /*
  * Process an RNDIS initialize completion.
  *
  * Fails with the device's own status when that is not success, and with
  * RNDIS_STATUS_FAILURE when the device is not connectionless or its medium
  * is not 802.3.  On success the device's maximum packet size is recorded
  * in sc->sc_lim_pktsz and the (successful) status is returned.
  */
 static uint32_t
 urndis_ctrl_handle_init(struct urndis_softc *sc,
     const struct rndis_comp_hdr *hdr)
 {
 	const struct rndis_init_comp *cmplt =
 	    (const struct rndis_init_comp *)hdr;
 	uint32_t devflags, medium, status;
 
 	DPRINTF("len %u rid %u status 0x%x "
 	    "ver_major %u ver_minor %u devflags 0x%x medium 0x%x pktmaxcnt %u "
 	    "pktmaxsz %u align %u aflistoffset %u aflistsz %u\n",
 	    le32toh(cmplt->rm_len),
 	    le32toh(cmplt->rm_rid),
 	    le32toh(cmplt->rm_status),
 	    le32toh(cmplt->rm_ver_major),
 	    le32toh(cmplt->rm_ver_minor),
 	    le32toh(cmplt->rm_devflags),
 	    le32toh(cmplt->rm_medium),
 	    le32toh(cmplt->rm_pktmaxcnt),
 	    le32toh(cmplt->rm_pktmaxsz),
 	    le32toh(cmplt->rm_align),
 	    le32toh(cmplt->rm_aflistoffset),
 	    le32toh(cmplt->rm_aflistsz));
 
 	status = le32toh(cmplt->rm_status);
 	if (status != RNDIS_STATUS_SUCCESS) {
 		DPRINTF("init failed 0x%x\n", status);
 		return (status);
 	}
 
 	devflags = le32toh(cmplt->rm_devflags);
 	if (devflags != RNDIS_DF_CONNECTIONLESS) {
 		DPRINTF("wrong device type (current type: 0x%x)\n",
 		    devflags);
 		return (RNDIS_STATUS_FAILURE);
 	}
 
 	medium = le32toh(cmplt->rm_medium);
 	if (medium != RNDIS_MEDIUM_802_3) {
 		DPRINTF("medium not 802.3 (current medium: 0x%x)\n",
 		    medium);
 		return (RNDIS_STATUS_FAILURE);
 	}
 
 	sc->sc_lim_pktsz = le32toh(cmplt->rm_pktmaxsz);
 
 	return (status);
 }
 
 /*
  * Process an RNDIS query completion.
  *
  * *buf and *bufsz are always written: they are cleared up front and set to
  * the information buffer inside the message only on success.  The device's
  * status is returned when it is not success.  RNDIS_STATUS_FAILURE is
  * returned when the information buffer (offset + length, measured from
  * RNDIS_HEADER_OFFSET) would extend past the message length.
  */
 static uint32_t
 urndis_ctrl_handle_query(struct urndis_softc *sc,
     const struct rndis_comp_hdr *hdr, const void **buf, uint16_t *bufsz)
 {
 	const struct rndis_query_comp *cmplt;
 	uint32_t info_len, info_off, status;
 	uint64_t end;
 
 	cmplt = (const struct rndis_query_comp *)hdr;
 	status = le32toh(cmplt->rm_status);
 	info_len = le32toh(cmplt->rm_infobuflen);
 	info_off = le32toh(cmplt->rm_infobufoffset);
 
 	DPRINTF("len %u rid %u status 0x%x "
 	    "buflen %u bufoff %u\n",
 	    le32toh(cmplt->rm_len),
 	    le32toh(cmplt->rm_rid),
 	    status,
 	    info_len,
 	    info_off);
 
 	*buf = NULL;
 	*bufsz = 0;
 
 	if (status != RNDIS_STATUS_SUCCESS) {
 		DPRINTF("query failed 0x%x\n", status);
 		return (status);
 	}
 
 	/* Sum in 64 bits so device-chosen values cannot wrap the check. */
 	end = info_len;
 	end += info_off;
 	end += RNDIS_HEADER_OFFSET;
 
 	if (end > (uint64_t)le32toh(cmplt->rm_len)) {
 		DPRINTF("ctrl message error: invalid query info "
 		    "len/offset/end_position(%u/%u/%u) -> "
 		    "go out of buffer limit %u\n",
 		    info_len,
 		    info_off,
 		    info_len +
 		    info_off + RNDIS_HEADER_OFFSET,
 		    le32toh(cmplt->rm_len));
 		return (RNDIS_STATUS_FAILURE);
 	}
 
 	*buf = ((const uint8_t *)cmplt) + RNDIS_HEADER_OFFSET + info_off;
 	*bufsz = info_len;
 
 	return (status);
 }
 
 /*
  * Process an RNDIS reset completion.
  *
  * Returns the device's status when the reset failed.  When the completion
  * has a non-zero rm_adrreset field, the packet filter saved in
  * sc->sc_filter is written back with a set request and the status of
  * that request is returned instead.
  */
 static uint32_t
 urndis_ctrl_handle_reset(struct urndis_softc *sc,
     const struct rndis_comp_hdr *hdr)
 {
 	const struct rndis_reset_comp *cmplt =
 	    (const struct rndis_reset_comp *)hdr;
 	struct {
 		struct rndis_set_req hdr;
 		uint32_t filter;
 	} filter_req;
 	uint32_t status;
 
 	status = le32toh(cmplt->rm_status);
 
 	DPRINTF("len %u status 0x%x "
 	    "adrreset %u\n",
 	    le32toh(cmplt->rm_len),
 	    status,
 	    le32toh(cmplt->rm_adrreset));
 
 	if (status != RNDIS_STATUS_SUCCESS) {
 		DPRINTF("reset failed 0x%x\n", status);
 		return (status);
 	}
 
 	/* A zero test needs no byte swap. */
 	if (cmplt->rm_adrreset == 0)
 		return (status);
 
 	/* The request header is filled in by urndis_ctrl_set(). */
 	filter_req.filter = htole32(sc->sc_filter);
 
 	status = urndis_ctrl_set(sc, OID_GEN_CURRENT_PACKET_FILTER,
 	    &filter_req.hdr, sizeof(filter_req));
 	if (status != RNDIS_STATUS_SUCCESS)
 		DPRINTF("unable to reset data filters\n");
 
 	return (status);
 }
 
 static uint32_t
 urndis_ctrl_init(struct urndis_softc *sc)
 {
 	struct rndis_init_req msg;
 	struct rndis_comp_hdr *hdr;
 	uint32_t rval;
 
 	msg.rm_type = htole32(REMOTE_NDIS_INITIALIZE_MSG);
 	msg.rm_len = htole32(sizeof(msg));
 	msg.rm_rid = 0;
 	msg.rm_ver_major = htole32(RNDIS_VERSION_MAJOR);
 	msg.rm_ver_minor = htole32(1);
 	msg.rm_max_xfersz = htole32(RNDIS_RX_MAXLEN);
 
 	DPRINTF("type %u len %u rid %u ver_major %u "
 	    "ver_minor %u max_xfersz %u\n",
 	    le32toh(msg.rm_type),
 	    le32toh(msg.rm_len),
 	    le32toh(msg.rm_rid),
 	    le32toh(msg.rm_ver_major),
 	    le32toh(msg.rm_ver_minor),
 	    le32toh(msg.rm_max_xfersz));
 
 	rval = urndis_ctrl_send(sc, &msg, sizeof(msg));
 
 	if (rval != RNDIS_STATUS_SUCCESS) {
 		DPRINTF("init failed\n");
 		return (rval);
 	}
 	if ((hdr = urndis_ctrl_recv(sc)) == NULL) {
 		DPRINTF("unable to get init response\n");
 		return (RNDIS_STATUS_FAILURE);
 	}
 	rval = urndis_ctrl_handle(sc, hdr, NULL, NULL);
 
 	return (rval);
 }
 
 static uint32_t
 urndis_ctrl_halt(struct urndis_softc *sc)
 {
 	struct rndis_halt_req msg;
 	uint32_t rval;
 
 	msg.rm_type = htole32(REMOTE_NDIS_HALT_MSG);
 	msg.rm_len = htole32(sizeof(msg));
 	msg.rm_rid = 0;
 
 	DPRINTF("type %u len %u rid %u\n",
 	    le32toh(msg.rm_type),
 	    le32toh(msg.rm_len),
 	    le32toh(msg.rm_rid));
 
 	rval = urndis_ctrl_send(sc, &msg, sizeof(msg));
 
 	if (rval != RNDIS_STATUS_SUCCESS)
 		DPRINTF("halt failed\n");
 
 	return (rval);
 }
 
 /*
  * Send an RNDIS query for "oid" and process the device's reply.
  *
  * "msg" is a caller-supplied buffer of "len" bytes that begins with the
  * query request header; every header field is filled in here.  "rbuf" and
  * "rbufsz" are handed to urndis_ctrl_handle() to receive the location and
  * size of the returned information buffer.
  *
  * NB: Querying a device has the requirement of using an input buffer the size
  *     of the expected reply or larger, except for variably sized replies.
  */
 static uint32_t
 urndis_ctrl_query(struct urndis_softc *sc, uint32_t oid,
     struct rndis_query_req *msg, uint16_t len, const void **rbuf,
     uint16_t *rbufsz)
 {
 	struct rndis_comp_hdr *reply;
 	uint32_t payload_len, status;
 
 	/* Bytes that follow the fixed request header. */
 	payload_len = len - sizeof(*msg);
 
 	msg->rm_type = htole32(REMOTE_NDIS_QUERY_MSG);
 	msg->rm_len = htole32(len);
 	msg->rm_rid = 0;		/* XXX */
 	msg->rm_oid = htole32(oid);
 	msg->rm_infobuflen = htole32(payload_len);
 	/* The offset is left at zero when no payload follows the header. */
 	msg->rm_infobufoffset = (payload_len != 0) ?
 	    htole32(sizeof(*msg) - RNDIS_HEADER_OFFSET) : 0;
 	msg->rm_devicevchdl = 0;
 
 	DPRINTF("type %u len %u rid %u oid 0x%x "
 	    "infobuflen %u infobufoffset %u devicevchdl %u\n",
 	    le32toh(msg->rm_type),
 	    le32toh(msg->rm_len),
 	    le32toh(msg->rm_rid),
 	    le32toh(msg->rm_oid),
 	    le32toh(msg->rm_infobuflen),
 	    le32toh(msg->rm_infobufoffset),
 	    le32toh(msg->rm_devicevchdl));
 
 	status = urndis_ctrl_send(sc, msg, len);
 	if (status != RNDIS_STATUS_SUCCESS) {
 		DPRINTF("query failed\n");
 		return (status);
 	}
 
 	reply = urndis_ctrl_recv(sc);
 	if (reply == NULL) {
 		DPRINTF("unable to get query response\n");
 		return (RNDIS_STATUS_FAILURE);
 	}
 
 	return (urndis_ctrl_handle(sc, reply, rbuf, rbufsz));
 }
 
 /*
  * Send an RNDIS set request for "oid" and process the device's reply.
  *
  * "msg" is a caller-supplied buffer of "len" bytes that begins with the
  * set request header, followed by the value to set; every header field is
  * filled in here.  Returns the send status when the request cannot be
  * sent, RNDIS_STATUS_FAILURE when no reply can be read, and otherwise the
  * status reported by urndis_ctrl_handle().
  */
 static uint32_t
 urndis_ctrl_set(struct urndis_softc *sc, uint32_t oid,
     struct rndis_set_req *msg, uint16_t len)
 {
 	struct rndis_comp_hdr *reply;
 	uint32_t payload_len, status;
 
 	/* Bytes that follow the fixed request header. */
 	payload_len = len - sizeof(*msg);
 
 	msg->rm_type = htole32(REMOTE_NDIS_SET_MSG);
 	msg->rm_len = htole32(len);
 	msg->rm_rid = 0;		/* XXX */
 	msg->rm_oid = htole32(oid);
 	msg->rm_infobuflen = htole32(payload_len);
 	/* The offset is left at zero when no payload follows the header. */
 	msg->rm_infobufoffset = (payload_len != 0) ?
 	    htole32(sizeof(*msg) - RNDIS_HEADER_OFFSET) : 0;
 	msg->rm_devicevchdl = 0;
 
 	DPRINTF("type %u len %u rid %u oid 0x%x "
 	    "infobuflen %u infobufoffset %u devicevchdl %u\n",
 	    le32toh(msg->rm_type),
 	    le32toh(msg->rm_len),
 	    le32toh(msg->rm_rid),
 	    le32toh(msg->rm_oid),
 	    le32toh(msg->rm_infobuflen),
 	    le32toh(msg->rm_infobufoffset),
 	    le32toh(msg->rm_devicevchdl));
 
 	status = urndis_ctrl_send(sc, msg, len);
 	if (status != RNDIS_STATUS_SUCCESS) {
 		DPRINTF("set failed\n");
 		return (status);
 	}
 
 	reply = urndis_ctrl_recv(sc);
 	if (reply == NULL) {
 		DPRINTF("unable to get set response\n");
 		return (RNDIS_STATUS_FAILURE);
 	}
 
 	status = urndis_ctrl_handle(sc, reply, NULL, NULL);
 	if (status != RNDIS_STATUS_SUCCESS) {
 		DPRINTF("set failed 0x%x\n", status);
 	}
 
 	return (status);
 }
 
 /*
  * Bulk receive transfer callback.
  *
  * USB_ST_TRANSFERRED: the received buffer is walked as a sequence of RNDIS
  * packet messages.  Each header is copied out, converted to host byte
  * order and sanity checked against the number of bytes still unparsed.
  * A valid payload is copied into a newly allocated mbuf and queued with
  * uether_rxmbuf().  The first malformed header ends parsing of the buffer.
  * An allocation failure counts an input error and skips that packet only.
  *
  * USB_ST_TRANSFERRED and USB_ST_SETUP both end by resubmitting the transfer
  * for RNDIS_RX_MAXLEN bytes and flushing the queued mbufs.
  *
  * Error state: unless the transfer was cancelled, a stall is requested on
  * the transfer and it is resubmitted with zero frames.
  */
 static void
 urndis_bulk_read_callback(struct usb_xfer *xfer, usb_error_t error)
 {
 	struct urndis_softc *sc = usbd_xfer_softc(xfer);
 	struct usb_page_cache *pc = usbd_xfer_get_frame(xfer, 0);
 	struct ifnet *ifp = uether_getifp(&sc->sc_ue);
 	struct rndis_packet_msg msg;
 	struct mbuf *m;
 	int actlen;
 	int aframes;
 	int offset;
 
 	switch (USB_GET_STATE(xfer)) {
 	case USB_ST_TRANSFERRED:
 		usbd_xfer_status(xfer, &actlen, NULL, &aframes, NULL);
 
 		DPRINTFN(1, "received %u bytes in %u frames\n", actlen, aframes);
 
 		/* actlen counts the bytes not yet consumed from the buffer. */
 		for (offset = 0; actlen >= (uint32_t)sizeof(msg);) {
 			/* copy out header */
 			usbd_copy_out(pc, offset, &msg, sizeof(msg));
 
 			/* True only where le32toh() actually swaps bytes. */
 			if (le32toh(0x1234567U) != 0x1234567U) {
 				/* swap endianness */
 				msg.rm_type = le32toh(msg.rm_type);
 				msg.rm_len = le32toh(msg.rm_len);
 				msg.rm_dataoffset = le32toh(msg.rm_dataoffset);
 				msg.rm_datalen = le32toh(msg.rm_datalen);
 				msg.rm_oobdataoffset = le32toh(msg.rm_oobdataoffset);
 				msg.rm_oobdatalen = le32toh(msg.rm_oobdatalen);
 				msg.rm_oobdataelements = le32toh(msg.rm_oobdataelements);
 				msg.rm_pktinfooffset = le32toh(msg.rm_pktinfooffset);
 				msg.rm_pktinfolen = le32toh(msg.rm_pktinfolen);
 				msg.rm_vchandle = le32toh(msg.rm_vchandle);
 				msg.rm_reserved = le32toh(msg.rm_reserved);
 			}
 
 			DPRINTF("len %u data(off:%u len:%u) "
 			    "oobdata(off:%u len:%u nb:%u) perpacket(off:%u len:%u)\n",
 			    msg.rm_len, msg.rm_dataoffset, msg.rm_datalen,
 			    msg.rm_oobdataoffset, msg.rm_oobdatalen,
 			    msg.rm_oobdataelements, msg.rm_pktinfooffset,
 			    msg.rm_pktinfolen);
 
 			/* sanity check the RNDIS header */
 			if (msg.rm_type != REMOTE_NDIS_PACKET_MSG) {
 				DPRINTF("invalid type 0x%x != 0x%x\n",
 				    msg.rm_type, REMOTE_NDIS_PACKET_MSG);
 				goto tr_setup;
 			} else if (msg.rm_len < (uint32_t)sizeof(msg)) {
 				DPRINTF("invalid msg len %u < %u\n",
 				    msg.rm_len, (unsigned)sizeof(msg));
 				goto tr_setup;
 			} else if (msg.rm_len > (uint32_t)actlen) {
 				DPRINTF("invalid msg len %u > buffer "
 				    "len %u\n", msg.rm_len, actlen);
 				goto tr_setup;
 			} else if (msg.rm_dataoffset >= (uint32_t)actlen) {
 				DPRINTF("invalid msg dataoffset %u > buffer "
 				    "dataoffset %u\n", msg.rm_dataoffset, actlen);
 				goto tr_setup;
 			} else if (msg.rm_datalen > (uint32_t)actlen) {
 				DPRINTF("invalid msg datalen %u > buffer "
 				    "datalen %u\n", msg.rm_datalen, actlen);
 				goto tr_setup;
 			} else if ((msg.rm_dataoffset + msg.rm_datalen +
 			    (uint32_t)__offsetof(struct rndis_packet_msg,
 			    rm_dataoffset)) > (uint32_t)actlen) {
 				DPRINTF("invalid dataoffset %u larger than %u\n",
 				    msg.rm_dataoffset + msg.rm_datalen +
 				    (uint32_t)__offsetof(struct rndis_packet_msg,
 				    rm_dataoffset), actlen);
 				goto tr_setup;
 			} else if (msg.rm_datalen < (uint32_t)sizeof(struct ether_header)) {
 				if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 				DPRINTF("invalid ethernet size "
 				    "%u < %u\n", msg.rm_datalen, (unsigned)sizeof(struct ether_header));
 				goto tr_setup;
 			} else if (msg.rm_datalen > (uint32_t)(MCLBYTES - ETHER_ALIGN)) {
 				if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 				DPRINTF("invalid ethernet size "
 				    "%u > %u\n",
 				    msg.rm_datalen, (unsigned)MCLBYTES);
 				goto tr_setup;
 			} else if (msg.rm_datalen > (uint32_t)(MHLEN - ETHER_ALIGN)) {
 				/* Too large for a header mbuf: use a cluster. */
 				m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 			} else {
 				m = m_gethdr(M_NOWAIT, MT_DATA);
 			}
 
 			/* check if we have a buffer */
 			if (m != NULL) {
 				/* Reserve ETHER_ALIGN bytes, then trim them off. */
 				m->m_len = m->m_pkthdr.len = msg.rm_datalen + ETHER_ALIGN;
 				m_adj(m, ETHER_ALIGN);
 
 				/* rm_dataoffset is relative to that field itself. */
 				usbd_copy_out(pc, offset + msg.rm_dataoffset +
 				    __offsetof(struct rndis_packet_msg,
 				    rm_dataoffset), m->m_data, msg.rm_datalen);
 
 				/* enqueue */
 				uether_rxmbuf(&sc->sc_ue, m, msg.rm_datalen);
 			} else {
 				if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 			}
 			/* Advance to the next message in the buffer. */
 			offset += msg.rm_len;
 			actlen -= msg.rm_len;
 		}
 
 		/* FALLTHROUGH */
 	case USB_ST_SETUP:
 tr_setup:
 		usbd_xfer_set_frame_len(xfer, 0, RNDIS_RX_MAXLEN);
 		usbd_xfer_set_frames(xfer, 1);
 		usbd_transfer_submit(xfer);
 		uether_rxflush(&sc->sc_ue);	/* must be last */
 		break;
 
 	default:			/* Error */
 		DPRINTFN(1, "error = %s\n", usbd_errstr(error));
 
 		if (error != USB_ERR_CANCELLED) {
 			/* try to clear stall first */
 			usbd_xfer_set_stall(xfer);
 			usbd_xfer_set_frames(xfer, 0);
 			usbd_transfer_submit(xfer);
 		}
 		break;
 	}
 }
 
 /*
  * Bulk transmit transfer callback.
  *
  * USB_ST_TRANSFERRED counts one output packet and then falls into the
  * setup path.  The setup path dequeues packets from the interface send
  * queue and places at most one packet in each of up to
  * RNDIS_TX_FRAMES_MAX USB frames, each prefixed with an RNDIS packet
  * message header.  Packets that do not fit in RNDIS_TX_MAXLEN together
  * with the header are dropped and counted as output errors.  The transfer
  * is submitted only when at least one frame was filled.
  *
  * Error state: an output error is counted; unless the transfer was
  * cancelled, a stall is requested on the transfer and the setup path is
  * run again.
  */
 static void
 urndis_bulk_write_callback(struct usb_xfer *xfer, usb_error_t error)
 {
 	struct rndis_packet_msg msg;
 	struct urndis_softc *sc = usbd_xfer_softc(xfer);
 	struct ifnet *ifp = uether_getifp(&sc->sc_ue);
 	struct mbuf *m;
 	unsigned x;
 	int actlen;
 	int aframes;
 
 	usbd_xfer_status(xfer, &actlen, NULL, &aframes, NULL);
 
 	DPRINTFN(1, "\n");
 
 	switch (USB_GET_STATE(xfer)) {
 	case USB_ST_TRANSFERRED:
 		DPRINTFN(11, "%u bytes in %u frames\n", actlen, aframes);
 
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 
 		/* FALLTHROUGH */
 	case USB_ST_SETUP:
 tr_setup:
 		/* Header fields not set below stay zero for every packet. */
 		memset(&msg, 0, sizeof(msg));
 
 		for (x = 0; x != RNDIS_TX_FRAMES_MAX; x++) {
 			struct usb_page_cache *pc = usbd_xfer_get_frame(xfer, x);
 
 			/* Frame x starts x * RNDIS_TX_MAXLEN bytes in. */
 			usbd_xfer_set_frame_offset(xfer, x * RNDIS_TX_MAXLEN, x);
 
 next_pkt:
 			IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
 
 			/* Send queue is empty: submit what we have so far. */
 			if (m == NULL)
 				break;
 
 			if ((m->m_pkthdr.len + sizeof(msg)) > RNDIS_TX_MAXLEN) {
 				DPRINTF("Too big packet\n");
 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 
 				/* Free buffer */
 				m_freem(m);
 				/* Retry the same frame slot with the next packet. */
 				goto next_pkt;
 			}
 			msg.rm_type = htole32(REMOTE_NDIS_PACKET_MSG);
 			msg.rm_len = htole32(sizeof(msg) + m->m_pkthdr.len);
 
 			msg.rm_dataoffset = htole32(RNDIS_DATA_OFFSET);
 			msg.rm_datalen = htole32(m->m_pkthdr.len);
 
 			/* copy in all data */
 			usbd_copy_in(pc, 0, &msg, sizeof(msg));
 			usbd_m_copy_in(pc, sizeof(msg), m, 0, m->m_pkthdr.len);
 			usbd_xfer_set_frame_len(xfer, x, sizeof(msg) + m->m_pkthdr.len);
 
 			/*
 			 * If there's a BPF listener, bounce a copy of
 			 * this frame to it:
 			 */
 			BPF_MTAP(ifp, m);
 
 			/* Free buffer */
 			m_freem(m);
 		}
 		/* x is the number of frames that were filled. */
 		if (x != 0) {
 			usbd_xfer_set_frames(xfer, x);
 			usbd_transfer_submit(xfer);
 		}
 		break;
 
 	default:			/* Error */
 		DPRINTFN(11, "transfer error, %s\n", usbd_errstr(error));
 
 		/* count output errors */
 		if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 
 		if (error != USB_ERR_CANCELLED) {
 			/* try to clear stall first */
 			usbd_xfer_set_stall(xfer);
 			goto tr_setup;
 		}
 		break;
 	}
 }
 
 /*
  * Interrupt endpoint receive callback.
  *
  * Received data is only logged; it is not decoded.  After a completed
  * transfer, and on initial setup, the transfer is resubmitted for its
  * maximum length.  On an error other than cancellation a stall is
  * requested on the transfer before it is resubmitted.
  */
 static void
 urndis_intr_read_callback(struct usb_xfer *xfer, usb_error_t error)
 {
 	int actlen;
 
 	usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL);
 
 	switch (USB_GET_STATE(xfer)) {
 	case USB_ST_TRANSFERRED:
 
 		DPRINTF("Received %d bytes\n", actlen);
 
 		/* TODO: decode some indications */
 
 		/* FALLTHROUGH */
 	case USB_ST_SETUP:
 tr_setup:
 		usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer));
 		usbd_transfer_submit(xfer);
 		break;
 
 	default:			/* Error */
 		if (error != USB_ERR_CANCELLED) {
 			/* start clear stall */
 			usbd_xfer_set_stall(xfer);
 			goto tr_setup;
 		}
 		break;
 	}
 }
Index: projects/fuse2/sys/fs/devfs/devfs_vnops.c
===================================================================
--- projects/fuse2/sys/fs/devfs/devfs_vnops.c	(revision 350434)
+++ projects/fuse2/sys/fs/devfs/devfs_vnops.c	(revision 350435)
@@ -1,1995 +1,1996 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2000-2004
  *	Poul-Henning Kamp.  All rights reserved.
  * Copyright (c) 1989, 1992-1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software donated to Berkeley by
  * Jan-Simon Pendry.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kernfs_vnops.c	8.15 (Berkeley) 5/21/95
  * From: FreeBSD: src/sys/miscfs/kernfs/kernfs_vnops.c 1.43
  *
  * $FreeBSD$
  */
 
 /*
  * TODO:
  *	mkdir: want it ?
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/dirent.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 #include <sys/ttycom.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
 static struct vop_vector devfs_vnodeops;
 static struct vop_vector devfs_specops;
 static struct fileops devfs_ops_f;
 
 #include <fs/devfs/devfs.h>
 #include <fs/devfs/devfs_int.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 
 /* Malloc type for the struct cdev_privdata records allocated in this file. */
 static MALLOC_DEFINE(M_CDEVPDATA, "DEVFSP", "Metainfo for cdev-fp data");
 
 /* Taken around reads and updates of the de_vnode <-> v_data linkage. */
 struct mtx	devfs_de_interlock;
 MTX_SYSINIT(devfs_de_interlock, &devfs_de_interlock, "devfs interlock", MTX_DEF);
 /* NOTE(review): not used in this part of the file; see its users. */
 struct sx	clone_drain_lock;
 SX_SYSINIT(clone_drain_lock, &clone_drain_lock, "clone events drain lock");
 /* Taken around updates of fp->f_cdevpriv and the per-cdev cdp_fdpriv list. */
 struct mtx	cdevpriv_mtx;
 MTX_SYSINIT(cdevpriv_mtx, &cdevpriv_mtx, "cdevpriv lock", MTX_DEF);
 
 SYSCTL_DECL(_vfs_devfs);
 
 /*
  * vfs.devfs.dotimes: when non-zero, devfs_timestamp() uses vfs_timestamp();
  * otherwise it records whole seconds only.
  */
 static int devfs_dotimes;
 SYSCTL_INT(_vfs_devfs, OID_AUTO, dotimes, CTLFLAG_RW,
     &devfs_dotimes, 0, "Update timestamps on DEVFS with default precision");
 
 /*
  * Refresh the devfs node timestamp at *tsp.
  *
  * With the dotimes sysctl set, vfs_timestamp() supplies the value.
  * Otherwise the timestamp is set to the current second with zero
  * nanoseconds, and is left untouched when it already holds that second.
  *
  * Updates are unlocked, so stat(2) could see partially updated times.
  */
 static void
 devfs_timestamp(struct timespec *tsp)
 {
 	time_t now;
 
 	if (devfs_dotimes) {
 		vfs_timestamp(tsp);
 		return;
 	}
 
 	now = time_second;
 	if (tsp->tv_sec == now)
 		return;
 
 	tsp->tv_sec = now;
 	tsp->tv_nsec = 0;
 }
 
 /*
  * Look up the cdev and cdevsw behind an open devfs file and take a thread
  * reference on the device.
  *
  * On success returns 0 with *devp, *dswp and *ref filled in and
  * curthread->td_fpop set to fp.  Returns ENXIO, holding no thread
  * reference, when the vnode's cdev is not the one stored in fp->f_data or
  * when no cdevsw could be obtained.
  */
 static int
 devfs_fp_check(struct file *fp, struct cdev **devp, struct cdevsw **dswp,
     int *ref)
 {
 
 	*dswp = devvn_refthread(fp->f_vnode, devp, ref);
 	if (*devp != fp->f_data) {
 		/* Wrong device: give back the reference if one was taken. */
 		if (*dswp != NULL)
 			dev_relthread(*devp, *ref);
 		return (ENXIO);
 	}
 	KASSERT((*devp)->si_refcount > 0,
 	    ("devfs: un-referenced struct cdev *(%s)", devtoname(*devp)));
 	if (*dswp == NULL)
 		return (ENXIO);
 	curthread->td_fpop = fp;
 	return (0);
 }
 
 /*
  * Fetch the private data attached to the file the current thread is
  * operating on (curthread->td_fpop).
  *
  * Returns 0 and stores the data pointer in *datap on success, EBADF when
  * the thread has no current file, and ENOENT when the file has no private
  * data attached.  *datap is written only on success.
  */
 int
 devfs_get_cdevpriv(void **datap)
 {
 	struct file *fp;
 	struct cdev_privdata *priv;
 
 	fp = curthread->td_fpop;
 	if (fp == NULL)
 		return (EBADF);
 
 	priv = fp->f_cdevpriv;
 	if (priv == NULL)
 		return (ENOENT);
 
 	*datap = priv->cdpd_data;
 	return (0);
 }
 
 /*
  * Attach private data "priv", with destructor "priv_dtr", to the file the
  * current thread is operating on (curthread->td_fpop).
  *
  * Returns 0 on success, ENOENT when the thread has no current file, and
  * EBUSY when the file already has private data attached.
  */
 int
 devfs_set_cdevpriv(void *priv, d_priv_dtor_t *priv_dtr)
 {
 	struct file *fp;
 	struct cdev_priv *cdp;
 	struct cdev_privdata *rec;
 
 	fp = curthread->td_fpop;
 	if (fp == NULL)
 		return (ENOENT);
 	cdp = cdev2priv((struct cdev *)fp->f_data);
 
 	/* Allocate before taking the mutex; the record is freed if unused. */
 	rec = malloc(sizeof(struct cdev_privdata), M_CDEVPDATA, M_WAITOK);
 	rec->cdpd_data = priv;
 	rec->cdpd_dtr = priv_dtr;
 	rec->cdpd_fp = fp;
 
 	mtx_lock(&cdevpriv_mtx);
 	if (fp->f_cdevpriv != NULL) {
 		/* Private data is already attached; discard ours. */
 		mtx_unlock(&cdevpriv_mtx);
 		free(rec, M_CDEVPDATA);
 		return (EBUSY);
 	}
 	LIST_INSERT_HEAD(&cdp->cdp_fdpriv, rec, cdpd_list);
 	fp->f_cdevpriv = rec;
 	mtx_unlock(&cdevpriv_mtx);
 
 	return (0);
 }
 
 /*
  * Detach and destroy a file's private data record.
  *
  * Must be called with cdevpriv_mtx held; the mutex is RELEASED before this
  * function returns.  The record is unlinked from its file and from the
  * per-cdev list under the mutex.  The destructor then runs without the
  * mutex held, and finally the record is freed.
  */
 void
 devfs_destroy_cdevpriv(struct cdev_privdata *p)
 {
 
 	mtx_assert(&cdevpriv_mtx, MA_OWNED);
 	KASSERT(p->cdpd_fp->f_cdevpriv == p,
 	    ("devfs_destroy_cdevpriv %p != %p", p->cdpd_fp->f_cdevpriv, p));
 	p->cdpd_fp->f_cdevpriv = NULL;
 	LIST_REMOVE(p, cdpd_list);
 	mtx_unlock(&cdevpriv_mtx);
 	/* Run the destructor only after the mutex has been dropped. */
 	(p->cdpd_dtr)(p->cdpd_data);
 	free(p, M_CDEVPDATA);
 }
 
 /*
  * Destroy the private data attached to "fp", if there is any.  The
  * cdevpriv mutex is taken here and is no longer held on return.
  */
 static void
 devfs_fpdrop(struct file *fp)
 {
 	struct cdev_privdata *priv;
 
 	mtx_lock(&cdevpriv_mtx);
 	priv = fp->f_cdevpriv;
 	if (priv != NULL) {
 		/* devfs_destroy_cdevpriv() releases cdevpriv_mtx for us. */
 		devfs_destroy_cdevpriv(priv);
 	} else {
 		mtx_unlock(&cdevpriv_mtx);
 	}
 }
 
 /*
  * Drop the private data attached to the file the current thread is
  * operating on.  Does nothing when the thread has no current file.
  */
 void
 devfs_clear_cdevpriv(void)
 {
 	struct file *fp = curthread->td_fpop;
 
 	if (fp != NULL)
 		devfs_fpdrop(fp);
 }
 
 /*
  * Run devfs_populate() for the mount that "vp" belongs to.
  *
  * The vnode must be locked on entry.  Its lock is dropped around the call
  * to devfs_populate() and re-acquired afterwards in the mode it was held
  * in, so the vnode may have been reclaimed in the meantime.
  *
  * On success devfs_populate_vp() returns 0 with dmp->dm_lock held.
  * It returns ERESTART, with dmp->dm_lock released, when the mount lost its
  * last hold, the vnode was doomed, or its dirent was doomed.
  */
 static int
 devfs_populate_vp(struct vnode *vp)
 {
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	int locked;
 
 	ASSERT_VOP_LOCKED(vp, "devfs_populate_vp");
 
 	dmp = VFSTODEVFS(vp->v_mount);
 	/* Remember the lock mode so the same one can be re-taken below. */
 	locked = VOP_ISLOCKED(vp);
 
 	sx_xlock(&dmp->dm_lock);
 	/* Hold the mount structure while the vnode lock is dropped. */
 	DEVFS_DMP_HOLD(dmp);
 
 	/* Can't call devfs_populate() with the vnode lock held. */
 	VOP_UNLOCK(vp, 0);
 	devfs_populate(dmp);
 
 	/* Release dm_lock before re-locking the vnode, then take it again. */
 	sx_xunlock(&dmp->dm_lock);
 	vn_lock(vp, locked | LK_RETRY);
 	sx_xlock(&dmp->dm_lock);
 	if (DEVFS_DMP_DROP(dmp)) {
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		return (ERESTART);
 	}
 	if ((vp->v_iflag & VI_DOOMED) != 0) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ERESTART);
 	}
 	de = vp->v_data;
 	KASSERT(de != NULL,
 	    ("devfs_populate_vp: vp->v_data == NULL but vnode not doomed"));
 	if ((de->de_flags & DE_DOOMED) != 0) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ERESTART);
 	}
 
 	return (0);
 }
 
 /*
  * Vnode-to-component-name operation.
  *
  * Copies the name of "vp" into the tail of ap->a_buf, immediately before
  * offset *ap->a_buflen, lowers *ap->a_buflen by the name length, and
  * returns a referenced vnode for the parent directory in *ap->a_vpp.
  * For the devfs root directory the vnode itself is returned (referenced)
  * and the buffer is left untouched.
  *
  * Errors: whatever devfs_populate_vp() returns; ENOMEM when the name does
  * not fit in the buffer; ENOENT for vnodes that are neither character
  * devices nor directories, when no parent dirent exists, or when the
  * parent dirent has no vnode.
  */
 static int
 devfs_vptocnp(struct vop_vptocnp_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode **dvp = ap->a_vpp;
 	struct devfs_mount *dmp;
 	char *buf = ap->a_buf;
 	int *buflen = ap->a_buflen;
 	struct devfs_dirent *dd, *de;
 	int i, error;
 
 	dmp = VFSTODEVFS(vp->v_mount);
 
 	/* On success dmp->dm_lock is held; it is released at "finished". */
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	i = *buflen;
 	dd = vp->v_data;
 
 	if (vp->v_type == VCHR) {
 		/* Device node: the name is the cdev's si_name. */
 		i -= strlen(dd->de_cdp->cdp_c.si_name);
 		if (i < 0) {
 			error = ENOMEM;
 			goto finished;
 		}
 		bcopy(dd->de_cdp->cdp_c.si_name, buf + i,
 		    strlen(dd->de_cdp->cdp_c.si_name));
 		de = dd->de_dir;
 	} else if (vp->v_type == VDIR) {
 		if (dd == dmp->dm_rootdir) {
 			*dvp = vp;
 			vref(*dvp);
 			goto finished;
 		}
 		i -= dd->de_dirent->d_namlen;
 		if (i < 0) {
 			error = ENOMEM;
 			goto finished;
 		}
 		bcopy(dd->de_dirent->d_name, buf + i,
 		    dd->de_dirent->d_namlen);
 		de = dd;
 	} else {
 		error = ENOENT;
 		goto finished;
 	}
 	*buflen = i;
 	de = devfs_parent_dirent(de);
 	if (de == NULL) {
 		error = ENOENT;
 		goto finished;
 	}
 	mtx_lock(&devfs_de_interlock);
 	*dvp = de->de_vnode;
 	if (*dvp != NULL) {
 		/*
 		 * Take the vnode interlock before dropping the devfs
 		 * interlock, hold the vnode, and only then take the use
 		 * reference; the temporary hold is dropped afterwards.
 		 */
 		VI_LOCK(*dvp);
 		mtx_unlock(&devfs_de_interlock);
 		vholdl(*dvp);
 		VI_UNLOCK(*dvp);
 		vref(*dvp);
 		vdrop(*dvp);
 	} else {
 		mtx_unlock(&devfs_de_interlock);
 		error = ENOENT;
 	}
 finished:
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 /*
  * Construct the fully qualified path name relative to the mountpoint.
  *
  * The path is assembled backwards from the end of "buf", which is
  * terminated at offset SPECNAMELEN.  When "cnp" is given its name forms
  * the last component; if a NULL cnp is provided, no '/' is appended to the
  * resulting path.  The directory names from "dd" up to the mount root are
  * then prepended.
  *
  * Returns a pointer to the start of the path inside "buf", or NULL when
  * the path does not fit or a parent dirent cannot be found.  dm_lock must
  * be held by the caller.
  */
 char *
 devfs_fqpn(char *buf, struct devfs_mount *dmp, struct devfs_dirent *dd,
     struct componentname *cnp)
 {
 	struct devfs_dirent *cur;
 	int pos;
 
 	sx_assert(&dmp->dm_lock, SA_LOCKED);
 
 	pos = SPECNAMELEN;
 	buf[pos] = '\0';
 	if (cnp != NULL) {
 		pos -= cnp->cn_namelen;
 		if (pos < 0)
 			return (NULL);
 		bcopy(cnp->cn_nameptr, buf + pos, cnp->cn_namelen);
 	}
 
 	cur = dd;
 	while (cur != dmp->dm_rootdir) {
 		/* A separator is needed once anything follows this name. */
 		if (cnp != NULL || pos < SPECNAMELEN) {
 			pos--;
 			if (pos < 0)
 				return (NULL);
 			buf[pos] = '/';
 		}
 		pos -= cur->de_dirent->d_namlen;
 		if (pos < 0)
 			return (NULL);
 		bcopy(cur->de_dirent->d_name, buf + pos,
 		    cur->de_dirent->d_namlen);
 
 		cur = devfs_parent_dirent(cur);
 		if (cur == NULL)
 			return (NULL);
 	}
 	return (buf + pos);
 }
 
 /*
  * Drop the dirent and mount holds taken by devfs_allocv().
  *
  * Returns 0 when the dirent is still live, 1 when it is doomed, and 2 when
  * dropping the mount hold released the mount structure (in which case
  * dm_lock has been released and devfs_unmount_final() was called).
  *
  * dm_lock is released on return when the result is 1, or when
  * "drop_dm_lock" is set and the result is not 2.  With a result of 0 and
  * "drop_dm_lock" clear, the lock is still held.
  */
 static int
 devfs_allocv_drop_refs(int drop_dm_lock, struct devfs_mount *dmp,
 	struct devfs_dirent *de)
 {
 	int not_found;
 
 	not_found = 0;
 	/* Sample the doomed flag before the hold is dropped. */
 	if (de->de_flags & DE_DOOMED)
 		not_found = 1;
 	if (DEVFS_DE_DROP(de)) {
 		KASSERT(not_found == 1, ("DEVFS de dropped but not doomed"));
 		devfs_dirent_free(de);
 	}
 	if (DEVFS_DMP_DROP(dmp)) {
 		KASSERT(not_found == 1,
 			("DEVFS mount struct freed before dirent"));
 		not_found = 2;
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 	}
 	if (not_found == 1 || (drop_dm_lock && not_found != 2))
 		sx_unlock(&dmp->dm_lock);
 	return (not_found);
 }
 
 /*
  * Destructor handed to insmntque1() by devfs_allocv().  "arg" is the
  * dirent the vnode was being created for.  Breaks the vnode <-> dirent
  * linkage under the devfs interlock, then disposes of the vnode with
  * vgone() and vput().
  */
 static void
 devfs_insmntque_dtr(struct vnode *vp, void *arg)
 {
 	struct devfs_dirent *de;
 
 	de = (struct devfs_dirent *)arg;
 	mtx_lock(&devfs_de_interlock);
 	vp->v_data = NULL;
 	de->de_vnode = NULL;
 	mtx_unlock(&devfs_de_interlock);
 	vgone(vp);
 	vput(vp);
 }
 
 /*
  * Return a locked vnode for dirent "de" on mount "mp" in *vpp, reusing the
  * dirent's existing vnode when it has one and creating a new one
  * otherwise.  "lockmode" is used when locking an existing vnode; a newly
  * created vnode is locked exclusively.
  *
  * devfs_allocv shall be entered with dmp->dm_lock held, and it drops
  * it on return.
  *
  * Returns 0 on success, ENOENT when the dirent is or becomes doomed or its
  * character device is no longer active, or the error from getnewvnode()
  * or insmntque1().
  */
 int
 devfs_allocv(struct devfs_dirent *de, struct mount *mp, int lockmode,
     struct vnode **vpp)
 {
 	int error;
 	struct vnode *vp;
 	struct cdev *dev;
 	struct devfs_mount *dmp;
 	struct cdevsw *dsw;
 
 	dmp = VFSTODEVFS(mp);
 	if (de->de_flags & DE_DOOMED) {
 		sx_xunlock(&dmp->dm_lock);
 		return (ENOENT);
 	}
 loop:
 	/* Keep the dirent and mount alive while dm_lock may be dropped. */
 	DEVFS_DE_HOLD(de);
 	DEVFS_DMP_HOLD(dmp);
 	mtx_lock(&devfs_de_interlock);
 	vp = de->de_vnode;
 	if (vp != NULL) {
 		/* The dirent already has a vnode: lock and reuse it. */
 		VI_LOCK(vp);
 		mtx_unlock(&devfs_de_interlock);
 		sx_xunlock(&dmp->dm_lock);
 		vget(vp, lockmode | LK_INTERLOCK | LK_RETRY, curthread);
 		sx_xlock(&dmp->dm_lock);
 		if (devfs_allocv_drop_refs(0, dmp, de)) {
 			vput(vp);
 			return (ENOENT);
 		}
 		else if ((vp->v_iflag & VI_DOOMED) != 0) {
 			/*
 			 * The vnode was reclaimed while we waited for its
 			 * lock: detach it from the dirent if it is still
 			 * attached, and start over.
 			 */
 			mtx_lock(&devfs_de_interlock);
 			if (de->de_vnode == vp) {
 				de->de_vnode = NULL;
 				vp->v_data = NULL;
 			}
 			mtx_unlock(&devfs_de_interlock);
 			vput(vp);
 			goto loop;
 		}
 		sx_xunlock(&dmp->dm_lock);
 		*vpp = vp;
 		return (0);
 	}
 	mtx_unlock(&devfs_de_interlock);
 	if (de->de_dirent->d_type == DT_CHR) {
 		if (!(de->de_cdp->cdp_flags & CDP_ACTIVE)) {
 			devfs_allocv_drop_refs(1, dmp, de);
 			return (ENOENT);
 		}
 		dev = &de->de_cdp->cdp_c;
 	} else {
 		dev = NULL;
 	}
 	error = getnewvnode("devfs", mp, &devfs_vnodeops, &vp);
 	if (error != 0) {
 		devfs_allocv_drop_refs(1, dmp, de);
 		printf("devfs_allocv: failed to allocate new vnode\n");
 		return (error);
 	}
 
 	if (de->de_dirent->d_type == DT_CHR) {
 		vp->v_type = VCHR;
 		VI_LOCK(vp);
 		dev_lock();
 		dev_refl(dev);
 		/* XXX: v_rdev should be protected by the vnode lock */
 		vp->v_rdev = dev;
 		KASSERT(vp->v_usecount == 1,
 		    ("%s %d (%d)\n", __func__, __LINE__, vp->v_usecount));
 		dev->si_usecount += vp->v_usecount;
 		/* Special casing of ttys for deadfs.  Probably redundant. */
 		dsw = dev->si_devsw;
 		if (dsw != NULL && (dsw->d_flags & D_TTY) != 0)
 			vp->v_vflag |= VV_ISTTY;
 		dev_unlock();
 		VI_UNLOCK(vp);
 		if ((dev->si_flags & SI_ETERNAL) != 0)
 			vp->v_vflag |= VV_ETERNALDEV;
 		/* Character devices use the specops vector instead. */
 		vp->v_op = &devfs_specops;
 	} else if (de->de_dirent->d_type == DT_DIR) {
 		vp->v_type = VDIR;
 	} else if (de->de_dirent->d_type == DT_LNK) {
 		vp->v_type = VLNK;
 	} else {
 		vp->v_type = VBAD;
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWITNESS);
 	VN_LOCK_ASHARE(vp);
 	/* Link the vnode and the dirent to each other. */
 	mtx_lock(&devfs_de_interlock);
 	vp->v_data = de;
 	de->de_vnode = vp;
 	mtx_unlock(&devfs_de_interlock);
 	/* On failure devfs_insmntque_dtr() undoes the linkage above. */
 	error = insmntque1(vp, mp, devfs_insmntque_dtr, de);
 	if (error != 0) {
 		(void) devfs_allocv_drop_refs(1, dmp, de);
 		return (error);
 	}
 	if (devfs_allocv_drop_refs(0, dmp, de)) {
 		vput(vp);
 		return (ENOENT);
 	}
 #ifdef MAC
 	mac_devfs_vnode_associate(mp, de, vp);
 #endif
 	sx_xunlock(&dmp->dm_lock);
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Access check for a devfs vnode.
  *
  * The decision is made by vaccess() using the mode and ownership stored in
  * the dirent (for a directory, the dirent its de_dir points to).  The one
  * exception: an EACCES result is turned into success when the calling
  * process has a controlling terminal and this device is it.
  */
 static int
 devfs_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *de;
 	struct proc *p;
 	int error;
 
 	de = vp->v_data;
 	if (vp->v_type == VDIR)
 		de = de->de_dir;
 
 	error = vaccess(vp->v_type, de->de_mode, de->de_uid, de->de_gid,
 	    ap->a_accmode, ap->a_cred, NULL);
 	/* Success and every failure other than EACCES are final. */
 	if (error != EACCES)
 		return (error);
 
 	/* We do, however, allow access to the controlling terminal */
 	p = ap->a_td->td_proc;
 	PROC_LOCK(p);
 	if ((p->p_flag & P_CONTROLT) != 0 &&
 	    p->p_session->s_ttydp == de->de_cdp)
 		error = 0;
 	PROC_UNLOCK(p);
 
 	return (error);
 }
 
 /*
  * devfs_close() ORs FLASTCLOSE and FREVOKE into the fflag value it hands
  * to d_close(), so neither may share a bit with FMASK or FCNTLFLAGS.
  */
 _Static_assert(((FMASK | FCNTLFLAGS) & (FLASTCLOSE | FREVOKE)) == 0,
     "devfs-only flag reuse failed");
 
 /*
  * Close operation for devfs character device vnodes.
  *
  * First releases the session's reference to the vnode when this is the
  * closing process' controlling terminal and only this descriptor and the
  * session still reference the device.  Then calls the driver's d_close()
  * when the vnode is being revoked, when the driver tracks every close
  * (D_TRACKCLOSE), or when this is the last reference to the device.
  * FREVOKE | FNONBLOCK and FLASTCLOSE are added to the flags passed to the
  * driver as appropriate.  The vnode lock is dropped around d_close() and
  * re-taken afterwards.
  *
  * Returns 0 when there is nothing to do, ENXIO when no cdevsw can be
  * referenced, and otherwise the result of d_close().
  */
 static int
 devfs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp, *oldvp;
 	struct thread *td = ap->a_td;
 	struct proc *p;
 	struct cdev *dev = vp->v_rdev;
 	struct cdevsw *dsw;
 	int dflags, error, ref, vp_locked;
 
 	/*
 	 * XXX: Don't call d_close() if we were called because of
 	 * XXX: insmntque1() failure.
 	 */
 	if (vp->v_data == NULL)
 		return (0);
 
 	/*
 	 * Hack: a tty device that is a controlling terminal
 	 * has a reference from the session structure.
 	 * We cannot easily tell that a character device is
 	 * a controlling terminal, unless it is the closing
 	 * process' controlling terminal.  In that case,
 	 * if the reference count is 2 (this last descriptor
 	 * plus the session), release the reference from the session.
 	 */
 	if (td != NULL) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 		if (vp == p->p_session->s_ttyvp) {
 			PROC_UNLOCK(p);
 			oldvp = NULL;
 			sx_xlock(&proctree_lock);
 			/* Re-check now that proctree_lock is held. */
 			if (vp == p->p_session->s_ttyvp) {
 				SESS_LOCK(p->p_session);
 				VI_LOCK(vp);
 				if (count_dev(dev) == 2 &&
 				    (vp->v_iflag & VI_DOOMED) == 0) {
 					p->p_session->s_ttyvp = NULL;
 					p->p_session->s_ttydp = NULL;
 					oldvp = vp;
 				}
 				VI_UNLOCK(vp);
 				SESS_UNLOCK(p->p_session);
 			}
 			sx_xunlock(&proctree_lock);
 			/* Release the session's reference outside the locks. */
 			if (oldvp != NULL)
 				vrele(oldvp);
 		} else
 			PROC_UNLOCK(p);
 	}
 	/*
 	 * We do not want to really close the device if it
 	 * is still in use unless we are trying to close it
 	 * forcibly. Since every use (buffer, vnode, swap, cmap)
 	 * holds a reference to the vnode, and because we mark
 	 * any other vnodes that alias this device, when the
 	 * sum of the reference counts on all the aliased
 	 * vnodes descends to one, we are on last close.
 	 */
 	dsw = dev_refthread(dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	dflags = 0;
 	VI_LOCK(vp);
 	if (vp->v_iflag & VI_DOOMED) {
 		/* Forced close. */
 		dflags |= FREVOKE | FNONBLOCK;
 	} else if (dsw->d_flags & D_TRACKCLOSE) {
 		/* Keep device updated on status. */
 	} else if (count_dev(dev) > 1) {
 		/* Still in use elsewhere: nothing to tell the driver. */
 		VI_UNLOCK(vp);
 		dev_relthread(dev, ref);
 		return (0);
 	}
 	if (count_dev(dev) == 1)
 		dflags |= FLASTCLOSE;
 	/* Hold the vnode across the unlocked call into the driver. */
 	vholdl(vp);
 	VI_UNLOCK(vp);
 	vp_locked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 	KASSERT(dev->si_refcount > 0,
 	    ("devfs_close() on un-referenced struct cdev *(%s)", devtoname(dev)));
 	error = dsw->d_close(dev, ap->a_fflag | dflags, S_IFCHR, td);
 	dev_relthread(dev, ref);
 	/* Re-take the vnode lock in the mode it was held in on entry. */
 	vn_lock(vp, vp_locked | LK_RETRY);
 	vdrop(vp);
 	return (error);
 }
 
 /*
  * fo_close for devfs files: run the generic vnode close with curthread's
  * td_fpop temporarily pointing at fp, then drop any cdevpriv data still
  * attached to the file.  Returns the result of vnops.fo_close().
  */
 static int
 devfs_close_f(struct file *fp, struct thread *td)
 {
 	struct file *saved_fpop;
 	int rv;
 
 	/*
 	 * NB: td may be NULL if this descriptor is closed due to
 	 * garbage collection from a closed UNIX domain socket, hence
 	 * curthread rather than td is used for the td_fpop bookkeeping.
 	 */
 	saved_fpop = curthread->td_fpop;
 	curthread->td_fpop = fp;
 	rv = vnops.fo_close(fp, td);
 	curthread->td_fpop = saved_fpop;
 
 	/*
 	 * No new non-NULL f_cdevpriv can be installed while the file is
 	 * being destroyed, so an unlocked check is sufficient here.
 	 */
 	if (fp->f_cdevpriv != NULL)
 		devfs_fpdrop(fp);
 
 	return (rv);
 }
 
 /*
  * VOP_GETATTR for devfs vnodes.
  *
  * Fills *ap->a_vap from the dirent (for directories, the dirent reached
  * through de_dir).  Timestamps come from the dirent for non-VCHR vnodes and
  * from the cdev for VCHR vnodes; va_rdev is only set for VCHR.  Any stored
  * timestamp whose tv_sec is <= 3600 is first rewritten in place to the boot
  * time.
  *
  * Returns the error from devfs_populate_vp() if it fails, otherwise 0.
  */
 static int
 devfs_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	struct cdev *dev;
 	struct timeval boottime;
 	int error;
 
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	/* The lock is released right after a successful populate. */
 	dmp = VFSTODEVFS(vp->v_mount);
 	sx_xunlock(&dmp->dm_lock);
 
 	de = vp->v_data;
 	KASSERT(de != NULL, ("Null dirent in devfs_getattr vp=%p", vp));
 	if (vp->v_type == VDIR) {
 		de = de->de_dir;
 		KASSERT(de != NULL,
 		    ("Null dir dirent in devfs_getattr vp=%p", vp));
 	}
 	vap->va_uid = de->de_uid;
 	vap->va_gid = de->de_gid;
 	vap->va_mode = de->de_mode;
 	if (vp->v_type == VLNK)
 		vap->va_size = strlen(de->de_symlink);
 	else if (vp->v_type == VDIR)
 		vap->va_size = vap->va_bytes = DEV_BSIZE;
 	else
 		vap->va_size = 0;
 	if (vp->v_type != VDIR)
 		vap->va_bytes = 0;
 	vap->va_blocksize = DEV_BSIZE;
 	vap->va_type = vp->v_type;
 
 	getboottime(&boottime);
 	/*
 	 * Replace a timestamp of at most one hour past the epoch with the
 	 * boot time.  Note that this updates the stored value itself.
 	 */
 #define fix(aa)							\
 	do {							\
 		if ((aa).tv_sec <= 3600) {			\
 			(aa).tv_sec = boottime.tv_sec;		\
 			(aa).tv_nsec = boottime.tv_usec * 1000; \
 		}						\
 	} while (0)
 
 	if (vp->v_type != VCHR)  {
 		fix(de->de_atime);
 		vap->va_atime = de->de_atime;
 		fix(de->de_mtime);
 		vap->va_mtime = de->de_mtime;
 		fix(de->de_ctime);
 		vap->va_ctime = de->de_ctime;
 	} else {
 		dev = vp->v_rdev;
 		fix(dev->si_atime);
 		vap->va_atime = dev->si_atime;
 		fix(dev->si_mtime);
 		vap->va_mtime = dev->si_mtime;
 		fix(dev->si_ctime);
 		vap->va_ctime = dev->si_ctime;
 
 		vap->va_rdev = cdev2priv(dev)->cdp_inode;
 	}
 	/* Keep the helper macro from leaking into the rest of the file. */
 #undef fix
 	vap->va_gen = 0;
 	vap->va_flags = 0;
 	vap->va_filerev = 0;
 	vap->va_nlink = de->de_links;
 	vap->va_fileid = de->de_inode;
 
 	return (error);
 }
 
 /*
  * fo_ioctl for devfs files: forward to the generic vnode ioctl while
  * td->td_fpop points at fp, restoring the previous value afterwards.
  */
 /* ARGSUSED */
 static int
 devfs_ioctl_f(struct file *fp, u_long com, void *data, struct ucred *cred, struct thread *td)
 {
 	struct file *saved_fpop = td->td_fpop;
 	int rv;
 
 	td->td_fpop = fp;
 	rv = vnops.fo_ioctl(fp, com, data, cred, td);
 	td->td_fpop = saved_fpop;
 
 	return (rv);
 }
 
 /*
  * Return the user buffer pointer stored in a FIODGNAME ioctl argument.
  *
  * fgnp points at the ioctl argument and com selects its layout: the native
  * struct fiodgname_arg for FIODGNAME or, under COMPAT_FREEBSD32, struct
  * fiodgname_arg32 for FIODGNAME_32 (whose buf field is widened through
  * uintptr_t).  Any other command panics.
  */
 void *
 fiodgname_buf_get_ptr(void *fgnp, u_long com)
 {
 	union {
 		struct fiodgname_arg	fgn;
 #ifdef COMPAT_FREEBSD32
 		struct fiodgname_arg32	fgn32;
 #endif
 	} *fgnup;
 
 	fgnup = fgnp;
 	switch (com) {
 	case FIODGNAME:
 		return (fgnup->fgn.buf);
 #ifdef COMPAT_FREEBSD32
 	case FIODGNAME_32:
 		return ((void *)(uintptr_t)fgnup->fgn32.buf);
 #endif
 	default:
 		/* com is u_long, so it must be formatted as unsigned. */
 		panic("Unhandled ioctl command %lu", com);
 	}
 }
 
 /*
  * VOP_IOCTL for character-device vnodes.
  *
  * FIODTYPE and FIODGNAME (plus FIODGNAME_32 under COMPAT_FREEBSD32) are
  * answered here; every other command is forwarded to the driver's
  * d_ioctl().  ENOIOCTL is translated to ENOTTY.  After a successful
  * TIOCSCTTY the vnode is recorded as the session's controlling terminal and
  * the reference on the previous terminal vnode, if any, is released.
  *
  * Returns ENXIO when devvn_refthread() fails.
  */
 static int
 devfs_ioctl(struct vop_ioctl_args *ap)
 {
 	struct fiodgname_arg *fgn;
 	struct vnode *vpold, *vp;
 	struct cdevsw *dsw;
 	struct thread *td;
 	struct cdev *dev;
 	int error, ref, i;
 	const char *p;
 	u_long com;
 
 	vp = ap->a_vp;
 	com = ap->a_command;
 	td = ap->a_td;
 
 	dsw = devvn_refthread(vp, &dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	KASSERT(dev->si_refcount > 0,
 	    ("devfs: un-referenced struct cdev *(%s)", devtoname(dev)));
 
 	switch (com) {
 	case FIODTYPE:
 		*(int *)ap->a_data = dsw->d_flags & D_TYPEMASK;
 		error = 0;
 		break;
 	case FIODGNAME:
 #ifdef	COMPAT_FREEBSD32
 	case FIODGNAME_32:
 #endif
 		/* Copy the device name, including its NUL, to the user buffer. */
 		fgn = ap->a_data;
 		p = devtoname(dev);
 		i = strlen(p) + 1;
 		if (i > fgn->len)
 			error = EINVAL;
 		else
 			error = copyout(p, fiodgname_buf_get_ptr(fgn, com), i);
 		break;
 	default:
 		error = dsw->d_ioctl(dev, com, ap->a_data, ap->a_fflag, td);
 	}
 
 	dev_relthread(dev, ref);
 	if (error == ENOIOCTL)
 		error = ENOTTY;
 
 	if (error == 0 && com == TIOCSCTTY) {
 		/* Do nothing if reassigning same control tty */
 		sx_slock(&proctree_lock);
 		if (td->td_proc->p_session->s_ttyvp == vp) {
 			sx_sunlock(&proctree_lock);
 			return (0);
 		}
 
 		/* The session takes its own reference on the new vnode. */
 		vpold = td->td_proc->p_session->s_ttyvp;
 		VREF(vp);
 		SESS_LOCK(td->td_proc->p_session);
 		td->td_proc->p_session->s_ttyvp = vp;
 		td->td_proc->p_session->s_ttydp = cdev2priv(dev);
 		SESS_UNLOCK(td->td_proc->p_session);
 
 		sx_sunlock(&proctree_lock);
 
 		/* Get rid of reference to old control tty */
 		if (vpold)
 			vrele(vpold);
 	}
 	return (error);
 }
 
 /*
  * fo_kqfilter for devfs files: validate the file with devfs_fp_check() and
  * pass the knote to the driver's d_kqfilter().  curthread's td_fpop is saved
  * before the check and restored after the driver call.
  */
 /* ARGSUSED */
 static int
 devfs_kqfilter_f(struct file *fp, struct knote *kn)
 {
 	struct thread *self;
 	struct file *saved_fpop;
 	struct cdevsw *csw;
 	struct cdev *cdev;
 	int rv, thrref;
 
 	self = curthread;
 	saved_fpop = self->td_fpop;
 
 	rv = devfs_fp_check(fp, &cdev, &csw, &thrref);
 	if (rv != 0)
 		return (rv);
 
 	rv = csw->d_kqfilter(cdev, kn);
 	self->td_fpop = saved_fpop;
 	dev_relthread(cdev, thrref);
 
 	return (rv);
 }
 
 /*
  * Decide whether thread td may see the device behind dirent de.
  *
  * Entries without a cdev, or whose cdev has no si_cred, are always visible
  * (0 is returned).  Otherwise the result of prison_check() is returned,
  * except that a failure is turned into success when the process has
  * P_CONTROLT set and the device is its session's controlling terminal.
  */
 static inline int
 devfs_prison_check(struct devfs_dirent *de, struct thread *td)
 {
 	struct cdev_priv *priv;
 	struct ucred *devcred;
 	struct proc *proc;
 	int rv;
 
 	priv = de->de_cdp;
 	devcred = (priv != NULL) ? priv->cdp_c.si_cred : NULL;
 	if (devcred == NULL)
 		return (0);
 
 	rv = prison_check(td->td_ucred, devcred);
 	if (rv == 0)
 		return (0);
 
 	/* We do, however, allow access to the controlling terminal */
 	proc = td->td_proc;
 	PROC_LOCK(proc);
 	if ((proc->p_flag & P_CONTROLT) != 0 &&
 	    proc->p_session->s_ttydp == priv)
 		rv = 0;
 	PROC_UNLOCK(proc);
 	return (rv);
 }
 
 /*
  * Worker for devfs_lookup(): resolve the name in ap->a_cnp inside the
  * directory ap->a_dvp and return the resulting vnode through ap->a_vpp.
  *
  * devfs_lookup() releases dmp->dm_lock after this function returns only if
  * *dm_unlock is still set, so every path below that clears *dm_unlock is a
  * path on which the caller must not unlock again (presumably because
  * devfs_allocv() drops the lock itself -- confirm against its definition).
  *
  * If the name is not found, the dev_clone event handlers get a chance to
  * create the device on demand before ENOENT is reported.
  */
 static int
 devfs_lookupx(struct vop_lookup_args *ap, int *dm_unlock)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct thread *td;
 	struct devfs_dirent *de, *dd;
 	struct devfs_dirent **dde;
 	struct devfs_mount *dmp;
 	struct mount *mp;
 	struct cdev *cdev;
 	int error, flags, nameiop, dvplocked;
 	char specname[SPECNAMELEN + 1], *pname;
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dvp = ap->a_dvp;
 	pname = cnp->cn_nameptr;
 	td = cnp->cn_thread;
 	flags = cnp->cn_flags;
 	nameiop = cnp->cn_nameiop;
 	mp = dvp->v_mount;
 	dmp = VFSTODEVFS(mp);
 	dd = dvp->v_data;
 	*vpp = NULLVP;
 
 	/* Renaming within devfs is not supported. */
 	if ((flags & ISLASTCN) && nameiop == RENAME)
 		return (EOPNOTSUPP);
 
 	if (dvp->v_type != VDIR)
 		return (ENOTDIR);
 
 	if ((flags & ISDOTDOT) && (dvp->v_vflag & VV_ROOT))
 		return (EIO);
 
 	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, td);
 	if (error)
 		return (error);
 
 	/* "." resolves to the directory itself with an extra reference. */
 	if (cnp->cn_namelen == 1 && *pname == '.') {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		*vpp = dvp;
 		VREF(dvp);
 		return (0);
 	}
 
 	if (flags & ISDOTDOT) {
 		if ((flags & ISLASTCN) && nameiop != LOOKUP)
 			return (EINVAL);
 		de = devfs_parent_dirent(dd);
 		if (de == NULL)
 			return (ENOENT);
 		/* dvp is unlocked while the parent's vnode is obtained. */
 		dvplocked = VOP_ISLOCKED(dvp);
 		VOP_UNLOCK(dvp, 0);
 		error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK,
 		    vpp);
 		*dm_unlock = 0;
 		vn_lock(dvp, dvplocked | LK_RETRY);
 		return (error);
 	}
 
 	dd = dvp->v_data;
 	de = devfs_find(dd, cnp->cn_nameptr, cnp->cn_namelen, 0);
 	while (de == NULL) {	/* While(...) so we can use break */
 
 		if (nameiop == DELETE)
 			return (ENOENT);
 
 		/*
 		 * OK, we didn't have an entry for the name we were asked for
 		 * so we try to see if anybody can create it on demand.
 		 */
 		pname = devfs_fqpn(specname, dmp, dd, cnp);
 		if (pname == NULL)
 			break;
 
 		/*
 		 * Hold the mount and drop dm_lock while the dev_clone
 		 * handlers run.
 		 */
 		cdev = NULL;
 		DEVFS_DMP_HOLD(dmp);
 		sx_xunlock(&dmp->dm_lock);
 		sx_slock(&clone_drain_lock);
 		EVENTHANDLER_INVOKE(dev_clone,
 		    td->td_ucred, pname, strlen(pname), &cdev);
 		sx_sunlock(&clone_drain_lock);
 
 		if (cdev == NULL)
 			sx_xlock(&dmp->dm_lock);
 		else if (devfs_populate_vp(dvp) != 0) {
 			*dm_unlock = 0;
 			sx_xlock(&dmp->dm_lock);
 			if (DEVFS_DMP_DROP(dmp)) {
 				sx_xunlock(&dmp->dm_lock);
 				devfs_unmount_final(dmp);
 			} else
 				sx_xunlock(&dmp->dm_lock);
 			dev_rel(cdev);
 			return (ENOENT);
 		}
 		if (DEVFS_DMP_DROP(dmp)) {
 			*dm_unlock = 0;
 			sx_xunlock(&dmp->dm_lock);
 			devfs_unmount_final(dmp);
 			if (cdev != NULL)
 				dev_rel(cdev);
 			return (ENOENT);
 		}
 
 		if (cdev == NULL)
 			break;
 
 		/* Pick up this mount's dirent for the newly cloned device. */
 		dev_lock();
 		dde = &cdev2priv(cdev)->cdp_dirents[dmp->dm_idx];
 		if (dde != NULL && *dde != NULL)
 			de = *dde;
 		dev_unlock();
 		dev_rel(cdev);
 		break;
 	}
 
 	if (de == NULL || de->de_flags & DE_WHITEOUT) {
 		if ((nameiop == CREATE || nameiop == RENAME) &&
 		    (flags & (LOCKPARENT | WANTPARENT)) && (flags & ISLASTCN)) {
 			cnp->cn_flags |= SAVENAME;
 			return (EJUSTRETURN);
 		}
 		return (ENOENT);
 	}
 
 	/* Entries rejected by the prison check are reported as absent. */
 	if (devfs_prison_check(de, td))
 		return (ENOENT);
 
 	if ((cnp->cn_nameiop == DELETE) && (flags & ISLASTCN)) {
 		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
 		if (error)
 			return (error);
 		if (*vpp == dvp) {
 			VREF(dvp);
 			*vpp = dvp;
 			return (0);
 		}
 	}
 	error = devfs_allocv(de, mp, cnp->cn_lkflags & LK_TYPE_MASK, vpp);
 	*dm_unlock = 0;
 	return (error);
 }
 
 /*
  * VOP_LOOKUP entry point.  Populates the directory, lets devfs_lookupx() do
  * the work, and releases dm_lock afterwards unless devfs_lookupx() cleared
  * the flag it was handed.  Returns ENOTDIR when devfs_populate_vp() fails,
  * otherwise the result of devfs_lookupx().
  */
 static int
 devfs_lookup(struct vop_lookup_args *ap)
 {
 	struct devfs_mount *dmp;
 	int rv, still_locked;
 
 	if (devfs_populate_vp(ap->a_dvp) != 0)
 		return (ENOTDIR);
 
 	/* Look the mount up first; it is needed after the lookup returns. */
 	dmp = VFSTODEVFS(ap->a_dvp->v_mount);
 
 	still_locked = 1;
 	rv = devfs_lookupx(ap, &still_locked);
 	if (still_locked == 1)
 		sx_xunlock(&dmp->dm_lock);
 
 	return (rv);
 }
 
 /*
  * VOP_MKNOD for devfs directories.
  *
  * Nothing is actually created: the directory is searched for an entry with
  * the requested name that carries DE_WHITEOUT; the flag is cleared and a
  * vnode for that entry is returned through ap->a_vpp.
  *
  * Returns EOPNOTSUPP for anything but VCHR, ENOENT when no whited-out entry
  * with that name exists, otherwise the result of devfs_allocv().
  */
 static int
 devfs_mknod(struct vop_mknod_args *ap)
 {
 	struct componentname *cnp;
 	struct vnode *dvp, **vpp;
 	struct devfs_dirent *dd, *de;
 	struct devfs_mount *dmp;
 	int error;
 
 	/*
 	 * The only type of node we should be creating here is a
 	 * character device, for anything else return EOPNOTSUPP.
 	 */
 	if (ap->a_vap->va_type != VCHR)
 		return (EOPNOTSUPP);
 	dvp = ap->a_dvp;
 	dmp = VFSTODEVFS(dvp->v_mount);
 
 	cnp = ap->a_cnp;
 	vpp = ap->a_vpp;
 	dd = dvp->v_data;
 
 	error = ENOENT;
 	sx_xlock(&dmp->dm_lock);
 	TAILQ_FOREACH(de, &dd->de_dlist, de_list) {
 		if (cnp->cn_namelen != de->de_dirent->d_namlen)
 			continue;
 		/* Skip character devices that are no longer active. */
 		if (de->de_dirent->d_type == DT_CHR &&
 		    (de->de_cdp->cdp_flags & CDP_ACTIVE) == 0)
 			continue;
 		if (bcmp(cnp->cn_nameptr, de->de_dirent->d_name,
 		    de->de_dirent->d_namlen) != 0)
 			continue;
 		/* Name matches: only a whited-out entry can be revived. */
 		if (de->de_flags & DE_WHITEOUT)
 			break;
 		goto notfound;
 	}
 	if (de == NULL)
 		goto notfound;
 	de->de_flags &= ~DE_WHITEOUT;
 	/* NOTE(review): dm_lock is not released here; presumably devfs_allocv() does it -- confirm. */
 	error = devfs_allocv(de, dvp->v_mount, LK_EXCLUSIVE, vpp);
 	return (error);
 notfound:
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 /*
  * VOP_OPEN for character devices.
  *
  * Calls the driver's d_fdopen() if it provides one, otherwise d_open(), with
  * the vnode unlocked and td->td_fpop pointing at the file being opened.  On
  * success with a non-NULL fp, the file is switched to devfs_ops_f (if it
  * still has badfileops) and flagged FDEVFS_VNODE.
  *
  * Returns ENXIO for VBLK vnodes, a NULL v_rdev, a failed dev_refthread(), or
  * a driver with d_fdopen when no struct file was supplied.  ERESTART from
  * the driver is converted to EINTR.
  */
 /* ARGSUSED */
 static int
 devfs_open(struct vop_open_args *ap)
 {
 	struct thread *td = ap->a_td;
 	struct vnode *vp = ap->a_vp;
 	struct cdev *dev = vp->v_rdev;
 	struct file *fp = ap->a_fp;
 	int error, ref, vlocked;
 	struct cdevsw *dsw;
 	struct file *fpop;
 	struct mtx *mtxp;
 
 	if (vp->v_type == VBLK)
 		return (ENXIO);
 
 	if (dev == NULL)
 		return (ENXIO);
 
 	/* Make this field valid before any I/O in d_open. */
 	if (dev->si_iosize_max == 0)
 		dev->si_iosize_max = DFLTPHYS;
 
 	dsw = dev_refthread(dev, &ref);
 	if (dsw == NULL)
 		return (ENXIO);
 	if (fp == NULL && dsw->d_fdopen != NULL) {
 		dev_relthread(dev, ref);
 		return (ENXIO);
 	}
 
 	/* The vnode lock is dropped for the duration of the driver call. */
 	vlocked = VOP_ISLOCKED(vp);
 	VOP_UNLOCK(vp, 0);
 
 	fpop = td->td_fpop;
 	td->td_fpop = fp;
 	if (fp != NULL) {
 		fp->f_data = dev;
 		fp->f_vnode = vp;
 	}
 	if (dsw->d_fdopen != NULL)
 		error = dsw->d_fdopen(dev, ap->a_mode, td, fp);
 	else
 		error = dsw->d_open(dev, ap->a_mode, S_IFCHR, td);
 	/* Clean up any cdevpriv upon error. */
 	if (error != 0)
 		devfs_clear_cdevpriv();
 	td->td_fpop = fpop;
 
 	vn_lock(vp, vlocked | LK_RETRY);
 	dev_relthread(dev, ref);
 	if (error != 0) {
 		if (error == ERESTART)
 			error = EINTR;
 		return (error);
 	}
 
 #if 0	/* /dev/console */
 	KASSERT(fp != NULL, ("Could not vnode bypass device on NULL fp"));
 #else
 	if (fp == NULL)
 		return (error);
 #endif
 	if (fp->f_ops == &badfileops)
 		finit(fp, fp->f_flag, DTYPE_VNODE, dev, &devfs_ops_f);
 	mtxp = mtx_pool_find(mtxpool_sleep, fp);
 
 	/*
 	 * Hint to the dofilewrite() to not force the buffer draining
 	 * on the writer to the file.  Most likely, the write would
 	 * not need normal buffers.
 	 */
 	mtx_lock(mtxp);
 	fp->f_vnread_flags |= FDEVFS_VNODE;
 	mtx_unlock(mtxp);
 	return (error);
 }
 
 /*
  * VOP_PATHCONF for devfs vnodes.
  *
  * _PC_MAX_CANON, _PC_MAX_INPUT and _PC_VDISABLE are only answered for vnodes
  * flagged VV_ISTTY and yield EINVAL otherwise.  Names not handled here are
  * passed to vop_stdpathconf().
  */
 static int
 devfs_pathconf(struct vop_pathconf_args *ap)
 {
 
 	/* Reject the terminal-only names up front for non-tty vnodes. */
 	switch (ap->a_name) {
 	case _PC_MAX_CANON:
 	case _PC_MAX_INPUT:
 	case _PC_VDISABLE:
 		if ((ap->a_vp->v_vflag & VV_ISTTY) == 0)
 			return (EINVAL);
 		break;
 	default:
 		break;
 	}
 
 	switch (ap->a_name) {
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		break;
 	case _PC_LINK_MAX:
 		*ap->a_retval = INT_MAX;
 		break;
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		break;
 	case _PC_MAX_CANON:
 		*ap->a_retval = MAX_CANON;
 		break;
 	case _PC_MAX_INPUT:
 		*ap->a_retval = MAX_INPUT;
 		break;
 	case _PC_VDISABLE:
 		*ap->a_retval = _POSIX_VDISABLE;
 		break;
 	case _PC_MAC_PRESENT:
 #ifdef MAC
 		/*
 		 * If MAC is enabled, devfs automatically supports
 		 * trivial non-persistent label storage.
 		 */
 		*ap->a_retval = 1;
 #else
 		*ap->a_retval = 0;
 #endif
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		break;
 	default:
 		return (vop_stdpathconf(ap));
 	}
 	return (0);
 }
 
 /*
  * fo_poll for devfs files.  If devfs_fp_check() rejects the file, the
  * request falls back to the generic vnode poll; otherwise the driver's
  * d_poll() result is returned.  td->td_fpop is saved before the check and
  * restored after the driver call.
  */
 /* ARGSUSED */
 static int
 devfs_poll_f(struct file *fp, int events, struct ucred *cred, struct thread *td)
 {
 	struct file *saved_fpop;
 	struct cdevsw *csw;
 	struct cdev *cdev;
 	int rv, thrref;
 
 	saved_fpop = td->td_fpop;
 	if (devfs_fp_check(fp, &cdev, &csw, &thrref) != 0)
 		return (vnops.fo_poll(fp, events, cred, td));
 
 	rv = csw->d_poll(cdev, events, td);
 	td->td_fpop = saved_fpop;
 	dev_relthread(cdev, thrref);
 
 	return (rv);
 }
 
 /*
  * VOP_PRINT: emit one line naming the device behind a special vnode.
  * Always returns 0.
  */
 static int
 devfs_print(struct vop_print_args *ap)
 {
 	struct cdev *cdev = ap->a_vp->v_rdev;
 
 	printf("\tdev %s\n", devtoname(cdev));
 
 	return (0);
 }
 
 /*
  * fo_read for devfs files.
  *
  * Requests larger than DEVFS_IOSIZE_MAX fail with EINVAL.  If
  * devfs_fp_check() rejects the file the generic vnode read is used instead.
  * Otherwise the driver's d_read() is called with O_NONBLOCK/O_DIRECT taken
  * from the file flags (IO_DIRECT is added when O_DIRECT is set), and the
  * device's access time is stamped when data moved or a non-empty request
  * completed without error.
  */
 static int
 devfs_read_f(struct file *fp, struct uio *uio, struct ucred *cred,
     int flags, struct thread *td)
 {
 	struct file *saved_fpop;
 	struct cdevsw *csw;
 	struct cdev *cdev;
 	ssize_t before;
 	int devflags, rv, thrref;
 
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 
 	saved_fpop = td->td_fpop;
 	if (devfs_fp_check(fp, &cdev, &csw, &thrref) != 0)
 		return (vnops.fo_read(fp, uio, cred, flags, td));
 
 	before = uio->uio_resid;
 	devflags = fp->f_flag & (O_NONBLOCK | O_DIRECT);
 	if ((devflags & O_DIRECT) != 0)
 		devflags |= IO_DIRECT;
 
 	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 	rv = csw->d_read(cdev, uio, devflags);
 	if (uio->uio_resid != before || (rv == 0 && before != 0))
 		devfs_timestamp(&cdev->si_atime);
 	td->td_fpop = saved_fpop;
 	dev_relthread(cdev, thrref);
 
 	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
 	return (rv);
 }
 
 /*
  * VOP_READDIR for devfs directories.
  *
  * Walks the directory's entry list and copies out each entry at or beyond
  * uio_offset, skipping entries flagged DE_COVERED or DE_WHITEOUT and entries
  * rejected by devfs_prison_check().  Stops when the next entry no longer
  * fits in the caller's buffer.  No cookies are produced (see the XXX note
  * below).
  *
  * Returns ENOTDIR for non-directories, EINVAL for a negative offset, EIO
  * when devfs_populate_vp() fails, or the error from vfs_read_dirent().
  */
 static int
 devfs_readdir(struct vop_readdir_args *ap)
 {
 	int error;
 	struct uio *uio;
 	struct dirent *dp;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de;
 	struct devfs_mount *dmp;
 	off_t off;
 	int *tmp_ncookies = NULL;
 
 	if (ap->a_vp->v_type != VDIR)
 		return (ENOTDIR);
 
 	uio = ap->a_uio;
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 
 	/*
 	 * XXX: This is a temporary hack to get around this filesystem not
 	 * supporting cookies. We store the location of the ncookies pointer
 	 * in a temporary variable before calling vfs_subr.c:vfs_read_dirent()
 	 * and set the number of cookies to 0. We then set the pointer to
 	 * NULL so that vfs_read_dirent doesn't try to call realloc() on
 	 * ap->a_cookies. Later in this function, we restore the ap->a_ncookies
 	 * pointer to its original location before returning to the caller.
 	 */
 	if (ap->a_ncookies != NULL) {
 		tmp_ncookies = ap->a_ncookies;
 		*ap->a_ncookies = 0;
 		ap->a_ncookies = NULL;
 	}
 
 	dmp = VFSTODEVFS(ap->a_vp->v_mount);
 	if (devfs_populate_vp(ap->a_vp) != 0) {
 		if (tmp_ncookies != NULL)
 			ap->a_ncookies = tmp_ncookies;
 		return (EIO);
 	}
 	error = 0;
 	de = ap->a_vp->v_data;
 	off = 0;
 	/*
 	 * "de" names the list head only for loop initialization; inside the
 	 * loop it is reused for the dirent that supplies the inode number.
 	 */
 	TAILQ_FOREACH(dd, &de->de_dlist, de_list) {
 		KASSERT(dd->de_cdp != (void *)0xdeadc0de, ("%s %d\n", __func__, __LINE__));
 		if (dd->de_flags & (DE_COVERED | DE_WHITEOUT))
 			continue;
 		if (devfs_prison_check(dd, uio->uio_td))
 			continue;
 		/* For subdirectories the inode comes from the entry's de_dir. */
 		if (dd->de_dirent->d_type == DT_DIR)
 			de = dd->de_dir;
 		else
 			de = dd;
 		dp = dd->de_dirent;
 		MPASS(dp->d_reclen == GENERIC_DIRSIZ(dp));
 		if (dp->d_reclen > uio->uio_resid)
 			break;
 		dp->d_fileno = de->de_inode;
 		/* NOTE: d_off is the offset for the *next* entry. */
 		dp->d_off = off + dp->d_reclen;
 		if (off >= uio->uio_offset) {
 			error = vfs_read_dirent(ap, dp, off);
 			if (error)
 				break;
 		}
 		off += dp->d_reclen;
 	}
 	sx_xunlock(&dmp->dm_lock);
 	uio->uio_offset = off;
 
 	/*
 	 * Restore ap->a_ncookies if it wasn't originally NULL in the first
 	 * place.
 	 */
 	if (tmp_ncookies != NULL)
 		ap->a_ncookies = tmp_ncookies;
 
 	return (error);
 }
 
 /*
  * VOP_READLINK: copy the symlink target stored in the dirent (without its
  * terminating NUL) to the caller's uio.  Returns the result of uiomove().
  */
 static int
 devfs_readlink(struct vop_readlink_args *ap)
 {
 	struct devfs_dirent *link;
 
 	link = ap->a_vp->v_data;
 
 	return (uiomove(link->de_symlink, strlen(link->de_symlink),
 	    ap->a_uio));
 }
 
 /*
  * VOP_RECLAIM for non-VCHR devfs vnodes: break the link between the vnode
  * and its dirent under devfs_de_interlock, then destroy the vnode's VM
  * object.  Always returns 0.
  */
 static int
 devfs_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *node = ap->a_vp;
 	struct devfs_dirent *dirent;
 
 	mtx_lock(&devfs_de_interlock);
 	dirent = node->v_data;
 	if (dirent != NULL) {
 		/* Clear both directions of the vnode <-> dirent association. */
 		dirent->de_vnode = NULL;
 		node->v_data = NULL;
 	}
 	mtx_unlock(&devfs_de_interlock);
 
 	vnode_destroy_vobject(node);
 
 	return (0);
 }
 
 /*
  * VOP_RECLAIM for VCHR vnodes: perform the common devfs_reclaim() work, then
  * detach the vnode from its cdev, subtract the vnode's use count from the
  * device's si_usecount and release the device.  Always returns 0.
  */
 static int
 devfs_reclaim_vchr(struct vop_reclaim_args *ap)
 {
 	struct vnode *node = ap->a_vp;
 	struct cdev *cdev;
 
 	MPASS(node->v_type == VCHR);
 
 	devfs_reclaim(ap);
 
 	VI_LOCK(node);
 	dev_lock();
 	cdev = node->v_rdev;
 	node->v_rdev = NULL;
 	if (cdev != NULL)
 		cdev->si_usecount -= node->v_usecount;
 	dev_unlock();
 	VI_UNLOCK(node);
 
 	/* dev_rel() is called only after both locks have been dropped. */
 	if (cdev == NULL)
 		return (0);
 	dev_rel(cdev);
 	return (0);
 }
 
 /*
  * VOP_REMOVE for devfs.
  *
  * An entry without a backing cdev (de_cdp == NULL) is unlinked from its
  * directory and deleted; if it was a symlink, an entry of the same name
  * still found by devfs_find() has DE_COVERED cleared.  An entry with a cdev
  * is only marked DE_WHITEOUT.  Always returns 0.
  */
 static int
 devfs_remove(struct vop_remove_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode *vp = ap->a_vp;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de, *de_covered;
 	struct devfs_mount *dmp = VFSTODEVFS(vp->v_mount);
 
 	ASSERT_VOP_ELOCKED(dvp, "devfs_remove");
 	ASSERT_VOP_ELOCKED(vp, "devfs_remove");
 
 	sx_xlock(&dmp->dm_lock);
 	dd = ap->a_dvp->v_data;
 	de = vp->v_data;
 	if (de->de_cdp == NULL) {
 		TAILQ_REMOVE(&dd->de_dlist, de, de_list);
 		if (de->de_dirent->d_type == DT_LNK) {
 			/* Uncover whatever entry this symlink was shadowing. */
 			de_covered = devfs_find(dd, de->de_dirent->d_name,
 			    de->de_dirent->d_namlen, 0);
 			if (de_covered != NULL)
 				de_covered->de_flags &= ~DE_COVERED;
 		}
 		/* We need to unlock dvp because devfs_delete() may lock it. */
 		VOP_UNLOCK(vp, 0);
 		if (dvp != vp)
 			VOP_UNLOCK(dvp, 0);
 		devfs_delete(dmp, de, 0);
 		sx_xunlock(&dmp->dm_lock);
 		/* Re-take the vnode locks: dvp first, then vp. */
 		if (dvp != vp)
 			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	} else {
 		de->de_flags |= DE_WHITEOUT;
 		sx_xunlock(&dmp->dm_lock);
 	}
 	return (0);
 }
 
 /*
  * Revoke is called on a tty when a terminal session ends.  The vnode
  * is orphaned by setting v_op to deadfs so we need to let go of it
  * as well so that we create a new one next time around.
  *
  * After vp itself has been vgone()'d, every other vnode reachable through
  * the device's cdp_dirents[] array is vgone()'d as well.  cdp_inuse is
  * raised for the duration; if it drops back to zero and the device is no
  * longer CDP_ACTIVE, the cdev_priv is removed from cdevp_list and released.
  * vp is unlocked during the scan and exclusively locked again on return.
  * Always returns 0.
  */
 static int
 devfs_revoke(struct vop_revoke_args *ap)
 {
 	struct vnode *vp = ap->a_vp, *vp2;
 	struct cdev *dev;
 	struct cdev_priv *cdp;
 	struct devfs_dirent *de;
 	u_int i;
 
 	KASSERT((ap->a_flags & REVOKEALL) != 0, ("devfs_revoke !REVOKEALL"));
 
 	dev = vp->v_rdev;
 	cdp = cdev2priv(dev);
  
 	dev_lock();
 	cdp->cdp_inuse++;
 	dev_unlock();
 
 	vhold(vp);
 	vgone(vp);
 	vdrop(vp);
 
 	VOP_UNLOCK(vp,0);
  loop:
 	for (;;) {
 		mtx_lock(&devfs_de_interlock);
 		dev_lock();
 		vp2 = NULL;
 		for (i = 0; i <= cdp->cdp_maxdirent; i++) {
 			de = cdp->cdp_dirents[i];
 			if (de == NULL)
 				continue;
 
 			vp2 = de->de_vnode;
 			if (vp2 != NULL) {
 				/*
 				 * Both locks are released before the vnode is
 				 * acquired; if vget() fails the whole scan is
 				 * restarted from the top.
 				 */
 				dev_unlock();
 				VI_LOCK(vp2);
 				mtx_unlock(&devfs_de_interlock);
 				if (vget(vp2, LK_EXCLUSIVE | LK_INTERLOCK,
 				    curthread))
 					goto loop;
 				vhold(vp2);
 				vgone(vp2);
 				vdrop(vp2);
 				vput(vp2);
 				break;
 			} 
 		}
 		/* A vnode was revoked (locks already dropped): rescan. */
 		if (vp2 != NULL) {
 			continue;
 		}
 		dev_unlock();
 		mtx_unlock(&devfs_de_interlock);
 		break;
 	}
 	dev_lock();
 	cdp->cdp_inuse--;
 	if (!(cdp->cdp_flags & CDP_ACTIVE) && cdp->cdp_inuse == 0) {
 		TAILQ_REMOVE(&cdevp_list, cdp, cdp_list);
 		dev_unlock();
 		dev_rel(&cdp->cdp_c);
 	} else
 		dev_unlock();
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	return (0);
 }
 
 /*
  * VOP_IOCTL for non-device devfs vnodes: hand the command to
  * devfs_rules_ioctl() with the mount's dm_lock held exclusively.
  *
  * Returns EBADF if the vnode is doomed, ENOENT if DEVFS_DMP_DROP() returns
  * true after devfs_populate() (devfs_unmount_final() is then called), and
  * otherwise the result of devfs_rules_ioctl().
  */
 static int
 devfs_rioctl(struct vop_ioctl_args *ap)
 {
 	struct vnode *vp;
 	struct devfs_mount *dmp;
 	int error;
 
 	vp = ap->a_vp;
 	/* The vnode lock is only held long enough to take dm_lock. */
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 	if (vp->v_iflag & VI_DOOMED) {
 		VOP_UNLOCK(vp, 0);
 		return (EBADF);
 	}
 	dmp = VFSTODEVFS(vp->v_mount);
 	sx_xlock(&dmp->dm_lock);
 	VOP_UNLOCK(vp, 0);
 	/* Keep a hold on the mount across devfs_populate(). */
 	DEVFS_DMP_HOLD(dmp);
 	devfs_populate(dmp);
 	if (DEVFS_DMP_DROP(dmp)) {
 		sx_xunlock(&dmp->dm_lock);
 		devfs_unmount_final(dmp);
 		return (ENOENT);
 	}
 	error = devfs_rules_ioctl(dmp, ap->a_command, ap->a_data, ap->a_td);
 	sx_xunlock(&dmp->dm_lock);
 	return (error);
 }
 
 static int
 devfs_rread(struct vop_read_args *ap)
 {
 
 	if (ap->a_vp->v_type != VDIR)
 		return (EINVAL);
 	return (VOP_READDIR(ap->a_vp, ap->a_uio, ap->a_cred, NULL, NULL, NULL));
 }
 
 static int
 devfs_setattr(struct vop_setattr_args *ap)
 {
 	struct devfs_dirent *de;
 	struct vattr *vap;
 	struct vnode *vp;
 	struct thread *td;
 	int c, error;
 	uid_t uid;
 	gid_t gid;
 
 	vap = ap->a_vap;
 	vp = ap->a_vp;
 	td = curthread;
 	if ((vap->va_type != VNON) ||
 	    (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) ||
 	    (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) ||
 	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
 	    (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) ||
 	    (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 
 	error = devfs_populate_vp(vp);
 	if (error != 0)
 		return (error);
 
 	de = vp->v_data;
 	if (vp->v_type == VDIR)
 		de = de->de_dir;
 
 	c = 0;
 	if (vap->va_uid == (uid_t)VNOVAL)
 		uid = de->de_uid;
 	else
 		uid = vap->va_uid;
 	if (vap->va_gid == (gid_t)VNOVAL)
 		gid = de->de_gid;
 	else
 		gid = vap->va_gid;
 	if (uid != de->de_uid || gid != de->de_gid) {
 		if ((ap->a_cred->cr_uid != de->de_uid) || uid != de->de_uid ||
 		    (gid != de->de_gid && !groupmember(gid, ap->a_cred))) {
 			error = priv_check(td, PRIV_VFS_CHOWN);
 			if (error != 0)
 				goto ret;
 		}
 		de->de_uid = uid;
 		de->de_gid = gid;
 		c = 1;
 	}
 
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (ap->a_cred->cr_uid != de->de_uid) {
 			error = priv_check(td, PRIV_VFS_ADMIN);
 			if (error != 0)
 				goto ret;
 		}
 		de->de_mode = vap->va_mode;
 		c = 1;
 	}
 
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		error = vn_utimes_perm(vp, vap, ap->a_cred, td);
 		if (error != 0)
 			goto ret;
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			if (vp->v_type == VCHR)
 				vp->v_rdev->si_atime = vap->va_atime;
 			else
 				de->de_atime = vap->va_atime;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			if (vp->v_type == VCHR)
 				vp->v_rdev->si_mtime = vap->va_mtime;
 			else
 				de->de_mtime = vap->va_mtime;
 		}
 		c = 1;
 	}
 
 	if (c) {
 		if (vp->v_type == VCHR)
 			vfs_timestamp(&vp->v_rdev->si_ctime);
 		else
 			vfs_timestamp(&de->de_mtime);
 	}
 
 ret:
 	sx_xunlock(&VFSTODEVFS(vp->v_mount)->dm_lock);
 	return (error);
 }
 
 #ifdef MAC
 /*
  * VOP_SETLABEL (MAC kernels only): relabel the vnode, then let the MAC
  * framework update the devfs dirent from it.  Always returns 0.
  */
 static int
 devfs_setlabel(struct vop_setlabel_args *ap)
 {
 	struct vnode *node = ap->a_vp;
 	struct devfs_dirent *dirent = node->v_data;
 
 	mac_vnode_relabel(ap->a_cred, node, ap->a_label);
 	mac_devfs_update(node->v_mount, dirent, node);
 
 	return (0);
 }
 #endif
 
 /*
  * fo_stat for devfs files: a plain pass-through to the generic vnode
  * implementation.
  */
 static int
 devfs_stat_f(struct file *fp, struct stat *sb, struct ucred *cred, struct thread *td)
 {
 	int rv;
 
 	rv = vnops.fo_stat(fp, sb, cred, td);
 	return (rv);
 }
 
 /*
  * VOP_SYMLINK: create a user symlink in the devfs directory ap->a_dvp.
  *
  * Requires PRIV_DEVFS_SYMLINK.  If an entry of the same name already exists
  * and is itself DE_USER, the new entry is discarded and EEXIST is returned;
  * otherwise the existing entry is marked DE_COVERED.  The new entry is
  * inserted directly after ".." and a vnode for it is returned through
  * ap->a_vpp.
  *
  * Returns ENOENT when devfs_populate_vp() fails.
  */
 static int
 devfs_symlink(struct vop_symlink_args *ap)
 {
 	int i, error;
 	struct devfs_dirent *dd;
 	struct devfs_dirent *de, *de_covered, *de_dotdot;
 	struct devfs_mount *dmp;
 
 	error = priv_check(curthread, PRIV_DEVFS_SYMLINK);
 	if (error)
 		return(error);
 	dmp = VFSTODEVFS(ap->a_dvp->v_mount);
 	if (devfs_populate_vp(ap->a_dvp) != 0)
 		return (ENOENT);
 
 	dd = ap->a_dvp->v_data;
 	de = devfs_newdirent(ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen);
 	de->de_flags = DE_USER;
 	de->de_uid = 0;
 	de->de_gid = 0;
 	de->de_mode = 0755;
 	de->de_inode = alloc_unr(devfs_inos);
 	de->de_dir = dd;
 	de->de_dirent->d_type = DT_LNK;
 	/* Store a private, NUL-terminated copy of the target path. */
 	i = strlen(ap->a_target) + 1;
 	de->de_symlink = malloc(i, M_DEVFS, M_WAITOK);
 	bcopy(ap->a_target, de->de_symlink, i);
 #ifdef MAC
 	mac_devfs_create_symlink(ap->a_cnp->cn_cred, dmp->dm_mount, dd, de);
 #endif
 	de_covered = devfs_find(dd, de->de_dirent->d_name,
 	    de->de_dirent->d_namlen, 0);
 	if (de_covered != NULL) {
 		/* A user-created entry may not be shadowed by another one. */
 		if ((de_covered->de_flags & DE_USER) != 0) {
 			devfs_delete(dmp, de, DEVFS_DEL_NORECURSE);
 			sx_xunlock(&dmp->dm_lock);
 			return (EEXIST);
 		}
 		KASSERT((de_covered->de_flags & DE_COVERED) == 0,
 		    ("devfs_symlink: entry %p already covered", de_covered));
 		de_covered->de_flags |= DE_COVERED;
 	}
 
 	de_dotdot = TAILQ_FIRST(&dd->de_dlist);		/* "." */
 	de_dotdot = TAILQ_NEXT(de_dotdot, de_list);	/* ".." */
 	TAILQ_INSERT_AFTER(&dd->de_dlist, de_dotdot, de, de_list);
 	devfs_dir_ref_de(dmp, dd);
 	devfs_rules_apply(dmp, de);
 
 	return (devfs_allocv(de, ap->a_dvp->v_mount, LK_EXCLUSIVE, ap->a_vpp));
 }
 
 /*
  * fo_truncate for devfs files: a plain pass-through to the generic vnode
  * implementation.
  */
 static int
 devfs_truncate_f(struct file *fp, off_t length, struct ucred *cred, struct thread *td)
 {
 	int rv;
 
 	rv = vnops.fo_truncate(fp, length, cred, td);
 	return (rv);
 }
 
 /*
  * fo_write for devfs files.
  *
  * Requests larger than DEVFS_IOSIZE_MAX fail with EINVAL.  If
  * devfs_fp_check() rejects the file the generic vnode write is used instead.
  * Otherwise the driver's d_write() is called with O_NONBLOCK/O_DIRECT/
  * O_FSYNC taken from the file flags (IO_DIRECT is added when O_DIRECT is
  * set), and the device's ctime and mtime are stamped when data moved or a
  * non-empty request completed without error.
  */
 static int
 devfs_write_f(struct file *fp, struct uio *uio, struct ucred *cred,
     int flags, struct thread *td)
 {
 	struct file *saved_fpop;
 	struct cdevsw *csw;
 	struct cdev *cdev;
 	ssize_t before;
 	int devflags, rv, thrref;
 
 	if (uio->uio_resid > DEVFS_IOSIZE_MAX)
 		return (EINVAL);
 
 	saved_fpop = td->td_fpop;
 	if (devfs_fp_check(fp, &cdev, &csw, &thrref) != 0)
 		return (vnops.fo_write(fp, uio, cred, flags, td));
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td));
 	devflags = fp->f_flag & (O_NONBLOCK | O_DIRECT | O_FSYNC);
 	if ((devflags & O_DIRECT) != 0)
 		devflags |= IO_DIRECT;
 	foffset_lock_uio(fp, uio, flags | FOF_NOLOCK);
 
 	before = uio->uio_resid;
 
 	rv = csw->d_write(cdev, uio, devflags);
 	if (uio->uio_resid != before || (rv == 0 && before != 0)) {
 		/* mtime is set to the freshly stamped ctime. */
 		devfs_timestamp(&cdev->si_ctime);
 		cdev->si_mtime = cdev->si_ctime;
 	}
 	td->td_fpop = saved_fpop;
 	dev_relthread(cdev, thrref);
 
 	foffset_unlock_uio(fp, uio, flags | FOF_NOLOCK | FOF_NEXTOFF);
 	return (rv);
 }
 
 /*
  * fo_mmap for devfs files.
  *
  * Derives the maximum protection from the mount's MNT_NOEXEC flag, the
  * file's FREAD/FWRITE open mode and cap_maxprot.  Execute on a noexec mount,
  * read without FREAD, and shared write without FWRITE are refused with
  * EACCES.  The backing VM object is obtained from vm_mmap_cdev() and mapped
  * with vm_mmap_object(); the object is released again if that mapping step
  * fails.
  */
 static int
 devfs_mmap_f(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags, vm_ooffset_t foff,
     struct thread *td)
 {
 	struct cdev *dev;
 	struct cdevsw *dsw;
 	struct mount *mp;
 	struct vnode *vp;
 	struct file *fpop;
 	vm_object_t object;
 	vm_prot_t maxprot;
 	int error, ref;
 
 	vp = fp->f_vnode;
 
 	/*
 	 * Ensure that file and memory protections are
 	 * compatible.
 	 */
 	mp = vp->v_mount;
 	if (mp != NULL && (mp->mnt_flag & MNT_NOEXEC) != 0) {
 		maxprot = VM_PROT_NONE;
 		if ((prot & VM_PROT_EXECUTE) != 0)
 			return (EACCES);
 	} else
 		maxprot = VM_PROT_EXECUTE;
 	if ((fp->f_flag & FREAD) != 0)
 		maxprot |= VM_PROT_READ;
 	else if ((prot & VM_PROT_READ) != 0)
 		return (EACCES);
 
 	/*
 	 * If we are sharing potential changes via MAP_SHARED and we
 	 * are trying to get write permission although we opened it
 	 * without asking for it, bail out.
 	 *
 	 * Note that most character devices always share mappings.
 	 * The one exception is that D_MMAP_ANON devices
 	 * (i.e. /dev/zero) permit private writable mappings.
 	 *
 	 * Rely on vm_mmap_cdev() to fail invalid MAP_PRIVATE requests
 	 * as well as updating maxprot to permit writing for
 	 * D_MMAP_ANON devices rather than doing that here.
 	 */
 	if ((flags & MAP_SHARED) != 0) {
 		if ((fp->f_flag & FWRITE) != 0)
 			maxprot |= VM_PROT_WRITE;
 		else if ((prot & VM_PROT_WRITE) != 0)
 			return (EACCES);
 	}
 	maxprot &= cap_maxprot;
 
 	/* td_fpop is saved here and restored after vm_mmap_cdev(). */
 	fpop = td->td_fpop;
 	error = devfs_fp_check(fp, &dev, &dsw, &ref);
 	if (error != 0)
 		return (error);
 
 	error = vm_mmap_cdev(td, size, prot, &maxprot, &flags, dev, dsw, &foff,
 	    &object);
 	td->td_fpop = fpop;
 	dev_relthread(dev, ref);
 	if (error != 0)
 		return (error);
 
 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
 	    foff, FALSE, td);
 	if (error != 0)
 		vm_object_deallocate(object);
 	return (error);
 }
 
 /*
  * Map a cdev to the dev_t number kept in its cdev_priv (cdp_inode).
  * A NULL cdev maps to NODEV.
  */
 dev_t
 dev2udev(struct cdev *x)
 {
 	if (x != NULL)
 		return (cdev2priv(x)->cdp_inode);
 	return (NODEV);
 }
 
 /*
  * File operations for files opened on devfs character devices; installed by
  * devfs_open() through finit().  Operations without a devfs-specific
  * implementation use the generic vn_* routines.
  */
 static struct fileops devfs_ops_f = {
 	.fo_read =	devfs_read_f,
 	.fo_write =	devfs_write_f,
 	.fo_truncate =	devfs_truncate_f,
 	.fo_ioctl =	devfs_ioctl_f,
 	.fo_poll =	devfs_poll_f,
 	.fo_kqfilter =	devfs_kqfilter_f,
 	.fo_stat =	devfs_stat_f,
 	.fo_close =	devfs_close_f,
 	.fo_chmod =	vn_chmod,
 	.fo_chown =	vn_chown,
 	.fo_sendfile =	vn_sendfile,
 	.fo_seek =	vn_seek,
 	.fo_fill_kinfo = vn_fill_kinfo,
 	.fo_mmap =	devfs_mmap_f,
 	.fo_flags =	DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 /*
  * Vops for non-CHR vnodes in /dev.
  * Operations not listed here fall through to default_vnodeops.
  */
 static struct vop_vector devfs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		devfs_access,
 	.vop_getattr =		devfs_getattr,
 	.vop_ioctl =		devfs_rioctl,
 	.vop_lookup =		devfs_lookup,
 	.vop_mknod =		devfs_mknod,
 	.vop_pathconf =		devfs_pathconf,
 	.vop_read =		devfs_rread,
 	.vop_readdir =		devfs_readdir,
 	.vop_readlink =		devfs_readlink,
 	.vop_reclaim =		devfs_reclaim,
 	.vop_remove =		devfs_remove,
 	.vop_revoke =		devfs_revoke,
 	.vop_setattr =		devfs_setattr,
 #ifdef MAC
 	.vop_setlabel =		devfs_setlabel,
 #endif
 	.vop_symlink =		devfs_symlink,
 	.vop_vptocnp =		devfs_vptocnp,
 };
 
 /*
  * Vops for VCHR vnodes in /dev.
  * Operations that make no sense on a character device are wired to
  * VOP_PANIC, and the vnode-level read, write and poll entries point at the
  * dead_* routines.  Anything not listed falls through to default_vnodeops.
  */
 static struct vop_vector devfs_specops = {
 	.vop_default =		&default_vnodeops,
 
 	.vop_access =		devfs_access,
 	.vop_bmap =		VOP_PANIC,
 	.vop_close =		devfs_close,
 	.vop_create =		VOP_PANIC,
 	.vop_fsync =		vop_stdfsync,
 	.vop_getattr =		devfs_getattr,
 	.vop_ioctl =		devfs_ioctl,
 	.vop_link =		VOP_PANIC,
 	.vop_mkdir =		VOP_PANIC,
 	.vop_mknod =		VOP_PANIC,
 	.vop_open =		devfs_open,
 	.vop_pathconf =		devfs_pathconf,
 	.vop_poll =		dead_poll,
 	.vop_print =		devfs_print,
 	.vop_read =		dead_read,
 	.vop_readdir =		VOP_PANIC,
 	.vop_readlink =		VOP_PANIC,
 	.vop_reallocblks =	VOP_PANIC,
 	.vop_reclaim =		devfs_reclaim_vchr,
 	.vop_remove =		devfs_remove,
 	.vop_rename =		VOP_PANIC,
 	.vop_revoke =		devfs_revoke,
 	.vop_rmdir =		VOP_PANIC,
 	.vop_setattr =		devfs_setattr,
 #ifdef MAC
 	.vop_setlabel =		devfs_setlabel,
 #endif
 	.vop_strategy =		VOP_PANIC,
 	.vop_symlink =		VOP_PANIC,
 	.vop_vptocnp =		devfs_vptocnp,
 	.vop_write =		dead_write,
 };
 
 /*
  * Our calling convention to the device drivers used to be that we passed
  * vnode.h IO_* flags to read()/write(), but we're moving to fcntl.h O_*
  * flags instead, since that's what open(), close() and ioctl() take and
  * we don't really want vnode.h in device drivers.
  * We solved the source compatibility by redefining some vnode flags to
  * be the same as the fcntl ones and by sending down the bitwise OR of
  * the respective fcntl/vnode flags.  These CTASSERTs make sure nobody
  * pulls the rug out from under this.
  */
 CTASSERT(O_NONBLOCK == IO_NDELAY);
 CTASSERT(O_FSYNC == IO_SYNC);
Index: projects/fuse2/sys/fs/ext2fs/ext2_vnops.c
===================================================================
--- projects/fuse2/sys/fs/ext2fs/ext2_vnops.c	(revision 350434)
+++ projects/fuse2/sys/fs/ext2fs/ext2_vnops.c	(revision 350435)
@@ -1,2346 +1,2347 @@
 /*-
  *  modified for EXT2FS support in Lites 1.1
  *
  *  Aug 1995, Godmar Back (gback@cs.utah.edu)
  *  University of Utah, Department of Computer Science
  */
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufs_vnops.c	8.7 (Berkeley) 2/3/94
  *	@(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95
  * $FreeBSD$
  */
 
 #include "opt_suiddir.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/filio.h>
+#include <sys/limits.h>
 #include <sys/sdt.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/endian.h>
 #include <sys/priv.h>
 #include <sys/rwlock.h>
 #include <sys/mount.h>
 #include <sys/unistd.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/lockf.h>
 #include <sys/event.h>
 #include <sys/conf.h>
 #include <sys/file.h>
 #include <sys/extattr.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 
 #include "opt_directio.h"
 
 #include <ufs/ufs/dir.h>
 
 #include <fs/ext2fs/fs.h>
 #include <fs/ext2fs/inode.h>
 #include <fs/ext2fs/ext2_acl.h>
 #include <fs/ext2fs/ext2fs.h>
 #include <fs/ext2fs/ext2_extern.h>
 #include <fs/ext2fs/ext2_dinode.h>
 #include <fs/ext2fs/ext2_dir.h>
 #include <fs/ext2fs/ext2_mount.h>
 #include <fs/ext2fs/ext2_extattr.h>
 #include <fs/ext2fs/ext2_extents.h>
 
 SDT_PROVIDER_DECLARE(ext2fs);
 /*
  * ext2fs trace probe:
  * arg0: verbosity. Higher numbers give more verbose messages
  * arg1: Textual message
  */
 SDT_PROBE_DEFINE2(ext2fs, , vnops, trace, "int", "char*");
 
 /*
  * Forward declarations for the file-local helpers and for the vnode
  * operations that are installed in the vop_vector tables below.
  */
 static int ext2_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *);
 static void ext2_itimes_locked(struct vnode *);
 
 static vop_access_t	ext2_access;
 static int ext2_chmod(struct vnode *, int, struct ucred *, struct thread *);
 static int ext2_chown(struct vnode *, uid_t, gid_t, struct ucred *,
     struct thread *);
 static vop_close_t	ext2_close;
 static vop_create_t	ext2_create;
 static vop_fsync_t	ext2_fsync;
 static vop_getattr_t	ext2_getattr;
 static vop_ioctl_t	ext2_ioctl;
 static vop_link_t	ext2_link;
 static vop_mkdir_t	ext2_mkdir;
 static vop_mknod_t	ext2_mknod;
 static vop_open_t	ext2_open;
 static vop_pathconf_t	ext2_pathconf;
 static vop_print_t	ext2_print;
 static vop_read_t	ext2_read;
 static vop_readlink_t	ext2_readlink;
 static vop_remove_t	ext2_remove;
 static vop_rename_t	ext2_rename;
 static vop_rmdir_t	ext2_rmdir;
 static vop_setattr_t	ext2_setattr;
 static vop_strategy_t	ext2_strategy;
 static vop_symlink_t	ext2_symlink;
 static vop_write_t	ext2_write;
 static vop_deleteextattr_t	ext2_deleteextattr;
 static vop_getextattr_t	ext2_getextattr;
 static vop_listextattr_t	ext2_listextattr;
 static vop_setextattr_t	ext2_setextattr;
 static vop_vptofh_t	ext2_vptofh;
 static vop_close_t	ext2fifo_close;
 static vop_kqfilter_t	ext2fifo_kqfilter;
 
 /*
  * Global vfs data structures for ext2.
  *
  * ext2_vnodeops is the operations vector for ext2 vnodes; anything not
  * listed is inherited from default_vnodeops through .vop_default.
  * Lookups go through the name cache first: .vop_lookup is
  * vfs_cache_lookup and the file system's own lookup routine is installed
  * as .vop_cachedlookup.
  */
 struct vop_vector ext2_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_access =		ext2_access,
 	.vop_bmap =		ext2_bmap,
 	.vop_cachedlookup =	ext2_lookup,
 	.vop_close =		ext2_close,
 	.vop_create =		ext2_create,
 	.vop_fsync =		ext2_fsync,
 	.vop_getpages =		vnode_pager_local_getpages,
 	.vop_getpages_async =	vnode_pager_local_getpages_async,
 	.vop_getattr =		ext2_getattr,
 	.vop_inactive =		ext2_inactive,
 	.vop_ioctl =		ext2_ioctl,
 	.vop_link =		ext2_link,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_mkdir =		ext2_mkdir,
 	.vop_mknod =		ext2_mknod,
 	.vop_open =		ext2_open,
 	.vop_pathconf =		ext2_pathconf,
 	.vop_poll =		vop_stdpoll,
 	.vop_print =		ext2_print,
 	.vop_read =		ext2_read,
 	.vop_readdir =		ext2_readdir,
 	.vop_readlink =		ext2_readlink,
 	.vop_reallocblks =	ext2_reallocblks,
 	.vop_reclaim =		ext2_reclaim,
 	.vop_remove =		ext2_remove,
 	.vop_rename =		ext2_rename,
 	.vop_rmdir =		ext2_rmdir,
 	.vop_setattr =		ext2_setattr,
 	.vop_strategy =		ext2_strategy,
 	.vop_symlink =		ext2_symlink,
 	.vop_write =		ext2_write,
 	.vop_deleteextattr =	ext2_deleteextattr,
 	.vop_getextattr =	ext2_getextattr,
 	.vop_listextattr =	ext2_listextattr,
 	.vop_setextattr =	ext2_setextattr,
 #ifdef UFS_ACL
 	/* ACL operations are only available when built with UFS_ACL. */
 	.vop_getacl =		ext2_getacl,
 	.vop_setacl =		ext2_setacl,
 	.vop_aclcheck =		ext2_aclcheck,
 #endif /* UFS_ACL */
 	.vop_vptofh =		ext2_vptofh,
 };
 
 /*
  * Operations vector for fifos that live on an ext2 file system.
  * Unlisted operations are inherited from fifo_specops; the entries here
  * override the attribute, close, fsync and reclaim handling with the
  * ext2 versions.  read and write are set to VOP_PANIC in this table.
  */
 struct vop_vector ext2_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_access =		ext2_access,
 	.vop_close =		ext2fifo_close,
 	.vop_fsync =		ext2_fsync,
 	.vop_getattr =		ext2_getattr,
 	.vop_inactive =		ext2_inactive,
 	.vop_kqfilter =		ext2fifo_kqfilter,
 	.vop_pathconf =		ext2_pathconf,
 	.vop_print =		ext2_print,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		ext2_reclaim,
 	.vop_setattr =		ext2_setattr,
 	.vop_write =		VOP_PANIC,
 	.vop_vptofh =		ext2_vptofh,
 };
 
 /*
  * A virgin directory (no blushing please).
  * Note that the type and namlen fields are reversed relative to ext2.
  * Also, we don't use `struct odirtemplate', since it would just cause
  * endianness problems.
  *
  * Both templates are initialised positionally with two entries, "." and
  * "..".  Because of the reversal noted above, the values 1 and 2 that
  * land in the *_type slots are really the name lengths, and the
  * EXT2_FT_* value lands in the *_namlen slot (ext2_rename() reads the
  * name length of ".." from dotdot_type for the same reason).
  * The leading 0 of each entry is presumably the inode number, to be
  * filled in by whoever instantiates the template -- TODO confirm against
  * the users of these templates.
  */
 static struct dirtemplate mastertemplate = {
 	0, 12, 1, EXT2_FT_DIR, ".",
 	0, DIRBLKSIZ - 12, 2, EXT2_FT_DIR, ".."
 };
 /* Same layout as mastertemplate, but with the file type left unknown. */
 static struct dirtemplate omastertemplate = {
 	0, 12, 1, EXT2_FT_UNKNOWN, ".",
 	0, DIRBLKSIZ - 12, 2, EXT2_FT_UNKNOWN, ".."
 };
 
 static void
 ext2_itimes_locked(struct vnode *vp)
 {
 	struct inode *ip;
 	struct timespec ts;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 
 	ip = VTOI(vp);
 	if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
 		return;
 	if ((vp->v_type == VBLK || vp->v_type == VCHR))
 		ip->i_flag |= IN_LAZYMOD;
 	else
 		ip->i_flag |= IN_MODIFIED;
 	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 		vfs_timestamp(&ts);
 		if (ip->i_flag & IN_ACCESS) {
 			ip->i_atime = ts.tv_sec;
 			ip->i_atimensec = ts.tv_nsec;
 		}
 		if (ip->i_flag & IN_UPDATE) {
 			ip->i_mtime = ts.tv_sec;
 			ip->i_mtimensec = ts.tv_nsec;
 			ip->i_modrev++;
 		}
 		if (ip->i_flag & IN_CHANGE) {
 			ip->i_ctime = ts.tv_sec;
 			ip->i_ctimensec = ts.tv_nsec;
 		}
 	}
 	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
 }
 
 /*
  * Apply pending timestamp updates to vp's inode.  Convenience wrapper
  * that takes and drops the vnode interlock around ext2_itimes_locked().
  */
 void
 ext2_itimes(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	{
 		ext2_itimes_locked(vp);
 	}
 	VI_UNLOCK(vp);
 }
 
 /*
  * Create a regular file
  */
 static int
 ext2_create(struct vop_create_args *ap)
 {
 	int error;
 
 	error =
 	    ext2_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
 	    ap->a_dvp, ap->a_vpp, ap->a_cnp);
 	if (error != 0)
 		return (error);
 	if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp);
 	return (0);
 }
 
 static int
 ext2_open(struct vop_open_args *ap)
 {
 
 	if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR)
 		return (EOPNOTSUPP);
 
 	/*
 	 * Files marked append-only must be opened for appending.
 	 */
 	if ((VTOI(ap->a_vp)->i_flags & APPEND) &&
 	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
 		return (EPERM);
 
 	vnode_create_vobject(ap->a_vp, VTOI(ap->a_vp)->i_size, ap->a_td);
 
 	return (0);
 }
 
 /*
  * Close called.
  *
  * Bring the inode times up to date, but only while the vnode still has
  * other users (use count above one).  Always returns 0.
  */
 static int
 ext2_close(struct vop_close_args *ap)
 {
 	struct vnode *vp;
 
 	vp = ap->a_vp;
 	VI_LOCK(vp);
 	if (vp->v_usecount > 1) {
 		ext2_itimes_locked(vp);
 	}
 	VI_UNLOCK(vp);
 
 	return (0);
 }
 
 /*
  * Check whether the credential in ap may access the vnode with the
  * requested access mode.
  *
  * Returns EOPNOTSUPP for block/character devices, EROFS for a write
  * request on a directory, symlink or regular file of a read-only mount,
  * EPERM for a write request on an immutable or snapshot file, and
  * otherwise the result of vaccess().
  */
 static int
 ext2_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	accmode_t accmode = ap->a_accmode;
 
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		return (EOPNOTSUPP);
 
 	if ((accmode & VWRITE) != 0) {
 		/*
 		 * Disallow write attempts on read-only file systems;
 		 * unless the file is a socket, fifo, or a block or
 		 * character device resident on the file system.
 		 */
 		if ((vp->v_type == VDIR || vp->v_type == VLNK ||
 		    vp->v_type == VREG) &&
 		    (vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
 			return (EROFS);
 
 		/* If immutable bit set, nobody gets to write it. */
 		if ((ip->i_flags & (SF_IMMUTABLE | SF_SNAPSHOT)) != 0)
 			return (EPERM);
 	}
 
 	return (vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 	    accmode, ap->a_cred, NULL));
 }
 
 /*
  * Fill in ap->a_vap from the in-core inode of ap->a_vp.
  *
  * Pending timestamp updates are applied first so the reported times are
  * current.  Nanosecond fields are reported as 0, and the birth time is
  * left exactly as the caller passed it in, when E2DI_HAS_XTIME(ip) is
  * false.  Always returns 0.
  */
 static int
 ext2_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct vattr *vap = ap->a_vap;
 
 	ext2_itimes(vp);
 	/*
 	 * Copy from inode table
 	 */
 	vap->va_fsid = dev2udev(ip->i_devvp->v_rdev);
 	vap->va_fileid = ip->i_number;
 	/* Only the permission bits; the file type goes in va_type below. */
 	vap->va_mode = ip->i_mode & ~IFMT;
 	vap->va_nlink = ip->i_nlink;
 	vap->va_uid = ip->i_uid;
 	vap->va_gid = ip->i_gid;
 	vap->va_rdev = ip->i_rdev;
 	vap->va_size = ip->i_size;
 	vap->va_atime.tv_sec = ip->i_atime;
 	vap->va_atime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_atimensec : 0;
 	vap->va_mtime.tv_sec = ip->i_mtime;
 	vap->va_mtime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_mtimensec : 0;
 	vap->va_ctime.tv_sec = ip->i_ctime;
 	vap->va_ctime.tv_nsec = E2DI_HAS_XTIME(ip) ? ip->i_ctimensec : 0;
 	/*
 	 * The condition carries its own parentheses so that it does not
 	 * depend on how the macro happens to be written.
 	 */
 	if (E2DI_HAS_XTIME(ip)) {
 		vap->va_birthtime.tv_sec = ip->i_birthtime;
 		vap->va_birthtime.tv_nsec = ip->i_birthnsec;
 	}
 	vap->va_flags = ip->i_flags;
 	vap->va_gen = ip->i_gen;
 	vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
 	vap->va_bytes = dbtob((u_quad_t)ip->i_blocks);
 	vap->va_type = IFTOVT(ip->i_mode);
 	vap->va_filerev = ip->i_modrev;
 	return (0);
 }
 
 /*
  * Set attribute vnode op. called from several syscalls
  *
  * Every field of ap->a_vap that is not VNOVAL is applied in turn: file
  * flags, owner/group, size, access/modification/birth times and finally
  * the mode.  Returns 0 on success, EINVAL when an unsettable attribute
  * is supplied, EOPNOTSUPP for unsupported flags, EROFS on a read-only
  * mount, EPERM for immutable/append-only files or insufficient
  * privilege, EISDIR for a size change on a directory, or the error from
  * the helper that failed.  Changes applied before the failing step are
  * not rolled back.
  */
 static int
 ext2_setattr(struct vop_setattr_args *ap)
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 	int error;
 
 	/*
 	 * Check for unsettable attributes.
 	 */
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 	if (vap->va_flags != VNOVAL) {
 		/* Disallow flags not supported by ext2fs. */
 		if (vap->va_flags & ~(SF_APPEND | SF_IMMUTABLE | UF_NODUMP))
 			return (EOPNOTSUPP);
 
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/*
 		 * Callers may only modify the file flags on objects they
 		 * have VADMIN rights for.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 			return (error);
 		/*
 		 * Unprivileged processes and privileged processes in
 		 * jail() are not permitted to unset system flags, or
 		 * modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 */
 		if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) {
 			if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND)) {
 				error = securelevel_gt(cred, 0);
 				if (error)
 					return (error);
 			}
 		} else {
 			if (ip->i_flags & (SF_IMMUTABLE | SF_APPEND) ||
 			    ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE))
 				return (EPERM);
 		}
 		ip->i_flags = vap->va_flags;
 		ip->i_flag |= IN_CHANGE;
 		/*
 		 * Once the file has been made immutable or append-only
 		 * nothing else in this request may be applied.
 		 */
 		if (ip->i_flags & (IMMUTABLE | APPEND))
 			return (0);
 	}
 	if (ip->i_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 	/*
 	 * Go through the fields and update iff not VNOVAL.
 	 */
 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((error = ext2_chown(vp, vap->va_uid, vap->va_gid, cred,
 		    td)) != 0)
 			return (error);
 	}
 	if (vap->va_size != VNOVAL) {
 		/*
 		 * Disallow write attempts on read-only file systems;
 		 * unless the file is a socket, fifo, or a block or
 		 * character device resident on the file system.
 		 */
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			break;
 		default:
 			break;
 		}
 		if ((error = ext2_truncate(vp, vap->va_size, 0, cred, td)) != 0)
 			return (error);
 	}
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/*
 		 * From utimes(2):
 		 * If times is NULL, ... The caller must be the owner of
 		 * the file, have permission to write the file, or be the
 		 * super-user.
 		 * If times is non-NULL, ... The caller must be the owner of
 		 * the file or be the super-user.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)) &&
 		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
 		    (error = VOP_ACCESS(vp, VWRITE, cred, td))))
 			return (error);
 		ip->i_flag |= IN_CHANGE | IN_MODIFIED;
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			ip->i_flag &= ~IN_ACCESS;
 			ip->i_atime = vap->va_atime.tv_sec;
 			ip->i_atimensec = vap->va_atime.tv_nsec;
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			ip->i_flag &= ~IN_UPDATE;
 			ip->i_mtime = vap->va_mtime.tv_sec;
 			ip->i_mtimensec = vap->va_mtime.tv_nsec;
 		}
 		/*
 		 * Like the other two times, the birth time is only
 		 * replaced when the caller actually supplied one;
 		 * otherwise the stored creation time would be
 		 * overwritten with VNOVAL.
 		 */
 		if (vap->va_birthtime.tv_sec != VNOVAL) {
 			ip->i_birthtime = vap->va_birthtime.tv_sec;
 			ip->i_birthnsec = vap->va_birthtime.tv_nsec;
 		}
 		error = ext2_update(vp, 0);
 		if (error)
 			return (error);
 	}
 	error = 0;
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		error = ext2_chmod(vp, (int)vap->va_mode, cred, td);
 	}
 	return (error);
 }
 
 /*
  * Change the mode on a file.
  * Inode must be locked before calling.
  */
 static int
 ext2_chmod(struct vnode *vp, int mode, struct ucred *cred, struct thread *td)
 {
 	struct inode *ip = VTOI(vp);
 	int error;
 
 	/*
 	 * To modify the permissions on a file, must possess VADMIN
 	 * for that file.
 	 */
 	if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 		return (error);
 	/*
 	 * Privileged processes may set the sticky bit on non-directories,
 	 * as well as set the setgid bit on a file with a group that the
 	 * process is not a member of.
 	 */
 	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
 		error = priv_check_cred(cred, PRIV_VFS_STICKYFILE);
 		if (error)
 			return (EFTYPE);
 	}
 	if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) {
 		error = priv_check_cred(cred, PRIV_VFS_SETGID);
 		if (error)
 			return (error);
 	}
 	ip->i_mode &= ~ALLPERMS;
 	ip->i_mode |= (mode & ALLPERMS);
 	ip->i_flag |= IN_CHANGE;
 	return (0);
 }
 
 /*
  * Perform chown operation on inode ip;
  * inode must be locked prior to call.
  *
  * A uid or gid of VNOVAL means "leave unchanged".  Returns 0 on success
  * or the error from the VADMIN access check / PRIV_VFS_CHOWN privilege
  * check.  When the ownership really changes, the set-id bits are dropped
  * unless the caller holds PRIV_VFS_RETAINSUGID.
  */
 static int
 ext2_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred,
     struct thread *td)
 {
 	struct inode *ip = VTOI(vp);
 	int error, changed;
 
 	if (uid == (uid_t)VNOVAL)
 		uid = ip->i_uid;
 	if (gid == (gid_t)VNOVAL)
 		gid = ip->i_gid;
 
 	/*
 	 * To modify the ownership of a file, must possess VADMIN
 	 * for that file.
 	 */
 	error = VOP_ACCESS(vp, VADMIN, cred, td);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * To change the owner of a file, or change the group of a file
 	 * to a group of which we are not a member, the caller must
 	 * have privilege.
 	 */
 	if (uid != ip->i_uid ||
 	    (gid != ip->i_gid && !groupmember(gid, cred))) {
 		error = priv_check_cred(cred, PRIV_VFS_CHOWN);
 		if (error != 0)
 			return (error);
 	}
 
 	/* Remember whether anything differs before overwriting the ids. */
 	changed = (ip->i_uid != uid || ip->i_gid != gid);
 	ip->i_gid = gid;
 	ip->i_uid = uid;
 	ip->i_flag |= IN_CHANGE;
 
 	if ((ip->i_mode & (ISUID | ISGID)) != 0 && changed &&
 	    priv_check_cred(cred, PRIV_VFS_RETAINSUGID) != 0)
 		ip->i_mode &= ~(ISUID | ISGID);
 
 	return (0);
 }
 
 /*
  * Synch an open file.
  *
  * Flushes the vnode's dirty buffers and then writes the inode back,
  * synchronously when the caller asked for MNT_WAIT.  The value returned
  * is that of ext2_update(); the result of the buffer flush is not
  * propagated.
  */
 /* ARGSUSED */
 static int
 ext2_fsync(struct vop_fsync_args *ap)
 {
 	int waitfor;
 
 	/*
 	 * Flush all dirty buffers associated with a vnode.
 	 */
 	(void)vop_stdfsync(ap);
 
 	waitfor = (ap->a_waitfor == MNT_WAIT);
 	return (ext2_update(ap->a_vp, waitfor));
 }
 
 /*
  * Mknod vnode call
  *
  * Creates the inode, records the device number when one was supplied,
  * then discards the freshly created vnode and fetches it again through
  * VFS_VGET().  On success *ap->a_vpp is the re-fetched, exclusively
  * locked vnode; on failure of the re-fetch it is set to NULL and the
  * error is returned.
  */
 /* ARGSUSED */
 static int
 ext2_mknod(struct vop_mknod_args *ap)
 {
 	struct vnode **vpp = ap->a_vpp;
 	struct vattr *vap = ap->a_vap;
 	struct inode *ip;
 	ino_t ino;
 	int error;
 
 	error = ext2_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
 	    ap->a_dvp, vpp, ap->a_cnp);
 	if (error != 0)
 		return (error);
 
 	ip = VTOI(*vpp);
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	/*
 	 * Want to be able to use this to make badblock
 	 * inodes, so don't truncate the dev number.
 	 * The device number is not stored for extent-mapped inodes.
 	 */
 	if (vap->va_rdev != VNOVAL && (ip->i_flag & IN_E4EXTENTS) == 0)
 		ip->i_rdev = vap->va_rdev;
 
 	/*
 	 * Remove inode, then reload it through VFS_VGET so it is
 	 * checked to see if it is an alias of an existing entry in
 	 * the inode cache.  XXX I don't believe this is necessary now.
 	 */
 	(*vpp)->v_type = VNON;
 	/* Save the inode number before vgone() invalidates ip. */
 	ino = ip->i_number;
 	vgone(*vpp);
 	vput(*vpp);
 
 	error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp);
 	if (error != 0)
 		*vpp = NULL;
 	return (error);
 }
 
 /*
  * Remove a directory entry.
  *
  * Fails with EPERM when the file is marked no-unlink, immutable or
  * append-only, or when the containing directory is append-only.
  * Otherwise the entry is removed and, on success, the link count of the
  * inode is dropped by one.  Returns 0 or the error from
  * ext2_dirremove().
  */
 static int
 ext2_remove(struct vop_remove_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct inode *ip = VTOI(ap->a_vp);
 	int error;
 
 	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) != 0 ||
 	    (VTOI(dvp)->i_flags & APPEND) != 0)
 		return (EPERM);
 
 	error = ext2_dirremove(dvp, ap->a_cnp);
 	if (error != 0)
 		return (error);
 
 	ip->i_nlink--;
 	ip->i_flag |= IN_CHANGE;
 	return (0);
 }
 
 /*
  * link vnode call
  *
  * Adds a new name (ap->a_cnp in directory ap->a_tdvp) for the existing
  * vnode ap->a_vp.  Returns EMLINK when the link count is already at
  * EXT4_LINK_MAX, EPERM for immutable or append-only files, or the error
  * from ext2_update()/ext2_direnter().  The link count is bumped before
  * the directory entry is written and taken back if either step fails.
  */
 static int
 ext2_link(struct vop_link_args *ap)
 {
 	struct componentname *cnp = ap->a_cnp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip;
 	int error;
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ext2_link: no name");
 #endif
 	ip = VTOI(vp);
 	if ((nlink_t)ip->i_nlink >= EXT4_LINK_MAX)
 		return (EMLINK);
 	if ((ip->i_flags & (IMMUTABLE | APPEND)) != 0)
 		return (EPERM);
 
 	ip->i_nlink++;
 	ip->i_flag |= IN_CHANGE;
 	error = ext2_update(vp, !DOINGASYNC(vp));
 	if (error == 0)
 		error = ext2_direnter(ip, tdvp, cnp);
 	if (error != 0) {
 		/* Undo the speculative increment. */
 		ip->i_nlink--;
 		ip->i_flag |= IN_CHANGE;
 	}
 	return (error);
 }
 
 /*
  * Add one link to ip.
  *
  * For a directory on a file system with the DIR_NLINK read-only
  * compatible feature, the count is pinned at 1 once it would become 2
  * or reach EXT4_LINK_MAX (presumably 1 stands for "count not tracked"
  * -- confirm against the on-disk format notes).  In every other case an
  * increment past EXT4_LINK_MAX is undone and EMLINK is returned.
  * Returns 0 on success.
  */
 static int
 ext2_inc_nlink(struct inode *ip)
 {
 
 	ip->i_nlink++;
 
 	if (S_ISDIR(ip->i_mode) &&
 	    EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK) &&
 	    ip->i_nlink > 1) {
 		if (ip->i_nlink == 2 || ip->i_nlink >= EXT4_LINK_MAX)
 			ip->i_nlink = 1;
 		return (0);
 	}
 
 	if (ip->i_nlink > EXT4_LINK_MAX) {
 		ip->i_nlink--;
 		return (EMLINK);
 	}
 
 	return (0);
 }
 
 /*
  * Drop one link from ip.  A directory is never taken below two links;
  * a request that would do so is silently ignored.
  */
 static void
 ext2_dec_nlink(struct inode *ip)
 {
 
 	if (S_ISDIR(ip->i_mode) && ip->i_nlink <= 2)
 		return;
 	ip->i_nlink--;
 }
 
 /*
  * Rename system call.
  * 	rename("foo", "bar");
  * is essentially
  *	unlink("bar");
  *	link("foo", "bar");
  *	unlink("foo");
  * but ``atomically''.  Can't do full commit without saving state in the
  * inode on disk which isn't feasible at this time.  Best we can do is
  * always guarantee the target exists.
  *
  * Basic algorithm is:
  *
  * 1) Bump link count on source while we're linking it to the
  *    target.  This also ensure the inode won't be deleted out
  *    from underneath us while we work (it may be truncated by
  *    a concurrent `trunc' or `open' for creation).
  * 2) Link source to destination.  If destination already exists,
  *    delete it first.
  * 3) Unlink source reference to inode if still around. If a
  *    directory was moved and the parent of the destination
  *    is different from the source, patch the ".." entry in the
  *    directory.
  *
  * ap carries the source directory/vnode/name (a_fdvp, a_fvp, a_fcnp) and
  * the target directory/vnode/name (a_tdvp, a_tvp, a_tcnp); a_tvp is NULL
  * when the target name does not exist yet.  Returns 0 or an errno value.
  * All four vnodes are released on every path out of this function.
  */
 static int
 ext2_rename(struct vop_rename_args *ap)
 {
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct inode *ip, *xp, *dp;
 	struct dirtemplate *dirbuf;
 	int doingdirectory = 0, oldparent = 0, newparent = 0;
 	int error = 0;
 	u_char namlen;
 
 #ifdef INVARIANTS
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("ext2_rename: no name");
 #endif
 	/*
 	 * Check for cross-device rename.
 	 */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 abortit:
 		/*
 		 * Common early-exit path: drop every vnode handed to us.
 		 * The target side arrives locked (hence vput), the source
 		 * side only referenced (hence vrele).
 		 */
 		if (tdvp == tvp)
 			vrele(tdvp);
 		else
 			vput(tdvp);
 		if (tvp)
 			vput(tvp);
 		vrele(fdvp);
 		vrele(fvp);
 		return (error);
 	}
 
 	if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(tdvp)->i_flags & APPEND))) {
 		error = EPERM;
 		goto abortit;
 	}
 
 	/*
 	 * Renaming a file to itself has no effect.  The upper layers should
 	 * not call us in that case.  Temporarily just warn if they do.
 	 */
 	if (fvp == tvp) {
 		SDT_PROBE2(ext2fs, , vnops, trace, 1,
 		    "rename: fvp == tvp (can't happen)");
 		error = 0;
 		goto abortit;
 	}
 
 	if ((error = vn_lock(fvp, LK_EXCLUSIVE)) != 0)
 		goto abortit;
 	dp = VTOI(fdvp);
 	ip = VTOI(fvp);
 	if (ip->i_nlink >= EXT4_LINK_MAX &&
 	    !EXT2_HAS_RO_COMPAT_FEATURE(ip->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) {
 		VOP_UNLOCK(fvp, 0);
 		error = EMLINK;
 		goto abortit;
 	}
 	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
 	    || (dp->i_flags & APPEND)) {
 		VOP_UNLOCK(fvp, 0);
 		error = EPERM;
 		goto abortit;
 	}
 	if ((ip->i_mode & IFMT) == IFDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
 		    dp == ip || (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT ||
 		    (ip->i_flag & IN_RENAME)) {
 			VOP_UNLOCK(fvp, 0);
 			error = EINVAL;
 			goto abortit;
 		}
 		ip->i_flag |= IN_RENAME;
 		oldparent = dp->i_number;
 		doingdirectory++;
 	}
 	vrele(fdvp);
 
 	/*
 	 * When the target exists, both the directory
 	 * and target vnodes are returned locked.
 	 */
 	dp = VTOI(tdvp);
 	xp = NULL;
 	if (tvp)
 		xp = VTOI(tvp);
 
 	/*
 	 * 1) Bump link count while we're moving stuff
 	 *    around.  If we crash somewhere before
 	 *    completing our work, the link count
 	 *    may be wrong, but correctable.
 	 *
 	 * NOTE(review): the return value of ext2_inc_nlink() is not
 	 * checked here.  For a non-directory sitting at EXT4_LINK_MAX on
 	 * a DIR_NLINK file system the increment is refused, yet the
 	 * matching decrement in step 3 still happens -- confirm whether
 	 * that case needs handling.
 	 */
 	ext2_inc_nlink(ip);
 	ip->i_flag |= IN_CHANGE;
 	if ((error = ext2_update(fvp, !DOINGASYNC(fvp))) != 0) {
 		VOP_UNLOCK(fvp, 0);
 		goto bad;
 	}
 
 	/*
 	 * If ".." must be changed (ie the directory gets a new
 	 * parent) then the source directory must not be in the
 	 * directory hierarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
 	 * as to be able to change "..". We must repeat the call
 	 * to namei, as the parent directory is unlocked by the
 	 * call to checkpath().
 	 */
 	error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 	VOP_UNLOCK(fvp, 0);
 	if (oldparent != dp->i_number)
 		newparent = dp->i_number;
 	if (doingdirectory && newparent) {
 		if (error)	/* write access check above */
 			goto bad;
 		if (xp != NULL)
 			vput(tvp);
 		error = ext2_checkpath(ip, dp, tcnp->cn_cred);
 		if (error)
 			goto out;
 		VREF(tdvp);
 		error = relookup(tdvp, &tvp, tcnp);
 		if (error)
 			goto out;
 		vrele(tdvp);
 		dp = VTOI(tdvp);
 		xp = NULL;
 		if (tvp)
 			xp = VTOI(tvp);
 	}
 	/*
 	 * 2) If target doesn't exist, link the target
 	 *    to the source and unlink the source.
 	 *    Otherwise, rewrite the target directory
 	 *    entry to reference the source inode and
 	 *    expunge the original entry's existence.
 	 */
 	if (xp == NULL) {
 		if (dp->i_devvp != ip->i_devvp)
 			panic("ext2_rename: EXDEV");
 		/*
 		 * Account for ".." in new directory.
 		 * When source and destination have the same
 		 * parent we don't fool with the link count.
 		 */
 		if (doingdirectory && newparent) {
 			error = ext2_inc_nlink(dp);
 			if (error)
 				goto bad;
 
 			dp->i_flag |= IN_CHANGE;
 			error = ext2_update(tdvp, !DOINGASYNC(tdvp));
 			if (error)
 				goto bad;
 		}
 		error = ext2_direnter(ip, tdvp, tcnp);
 		if (error) {
 			if (doingdirectory && newparent) {
 				ext2_dec_nlink(dp);
 				dp->i_flag |= IN_CHANGE;
 				(void)ext2_update(tdvp, 1);
 			}
 			goto bad;
 		}
 		vput(tdvp);
 	} else {
 		if (xp->i_devvp != dp->i_devvp || xp->i_devvp != ip->i_devvp)
 			panic("ext2_rename: EXDEV");
 		/*
 		 * Short circuit rename(foo, foo).
 		 */
 		if (xp->i_number == ip->i_number)
 			panic("ext2_rename: same file");
 		/*
 		 * If the parent directory is "sticky", then the user must
 		 * own the parent directory, or the destination of the rename,
 		 * otherwise the destination may not be changed (except by
 		 * root). This implements append-only directories.
 		 */
 		if ((dp->i_mode & S_ISTXT) && tcnp->cn_cred->cr_uid != 0 &&
 		    tcnp->cn_cred->cr_uid != dp->i_uid &&
 		    xp->i_uid != tcnp->cn_cred->cr_uid) {
 			error = EPERM;
 			goto bad;
 		}
 		/*
 		 * Target must be empty if a directory and have no links
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
 		if ((xp->i_mode & IFMT) == IFDIR) {
 			if (!ext2_dirempty(xp, dp->i_number, tcnp->cn_cred)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
 			if (!doingdirectory) {
 				error = ENOTDIR;
 				goto bad;
 			}
 			cache_purge(tdvp);
 		} else if (doingdirectory) {
 			error = EISDIR;
 			goto bad;
 		}
 		error = ext2_dirrewrite(dp, ip, tcnp);
 		if (error)
 			goto bad;
 		/*
 		 * If the target directory is in the same
 		 * directory as the source directory,
 		 * decrement the link count on the parent
 		 * of the target directory.
 		 */
 		if (doingdirectory && !newparent) {
 			ext2_dec_nlink(dp);
 			dp->i_flag |= IN_CHANGE;
 		}
 		vput(tdvp);
 		/*
 		 * Adjust the link count of the target to
 		 * reflect the dirrewrite above.  If this is
 		 * a directory it is empty and there are
 		 * no links to it, so we can squash the inode and
 		 * any space associated with it.  We disallowed
 		 * renaming over top of a directory with links to
 		 * it above, as the remaining link would point to
 		 * a directory without "." or ".." entries.
 		 *
 		 * ext2_dec_nlink() refuses to take a directory below
 		 * two links, so it cannot be used to bring a replaced
 		 * directory down to zero; the count is checked and
 		 * cleared explicitly instead.  (The checks above
 		 * guarantee that the target is a directory exactly
 		 * when doingdirectory is set.)
 		 */
 		if (doingdirectory) {
 			if (xp->i_nlink > 2)
 				panic("ext2_rename: linked directory");
 			xp->i_nlink = 0;
 			error = ext2_truncate(tvp, (off_t)0, IO_SYNC,
 			    tcnp->cn_cred, tcnp->cn_thread);
 		} else
 			ext2_dec_nlink(xp);
 		xp->i_flag |= IN_CHANGE;
 		vput(tvp);
 		xp = NULL;
 	}
 
 	/*
 	 * 3) Unlink the source.
 	 */
 	fcnp->cn_flags &= ~MODMASK;
 	fcnp->cn_flags |= LOCKPARENT | LOCKLEAF;
 	VREF(fdvp);
 	error = relookup(fdvp, &fvp, fcnp);
 	if (error == 0)
 		vrele(fdvp);
 	if (fvp != NULL) {
 		xp = VTOI(fvp);
 		dp = VTOI(fdvp);
 	} else {
 		/*
 		 * From name has disappeared.  IN_RENAME is not sufficient
 		 * to protect against directory races due to timing windows,
 		 * so we can't panic here.
 		 */
 		vrele(ap->a_fvp);
 		return (0);
 	}
 	/*
 	 * Ensure that the directory entry still exists and has not
 	 * changed while the new name has been entered. If the source is
 	 * a file then the entry may have been unlinked or renamed. In
 	 * either case there is no further work to be done. If the source
 	 * is a directory then it cannot have been rmdir'ed; its link
 	 * count of three would cause a rmdir to fail with ENOTEMPTY.
 	 * The IN_RENAME flag ensures that it cannot be moved by another
 	 * rename.
 	 */
 	if (xp != ip) {
 		/*
 		 * From name resolves to a different inode.  IN_RENAME is
 		 * not sufficient protection against timing window races
 		 * so we can't panic here.
 		 */
 	} else {
 		/*
 		 * If the source is a directory with a
 		 * new parent, the link count of the old
 		 * parent directory must be decremented
 		 * and ".." set to point to the new parent.
 		 */
 		if (doingdirectory && newparent) {
 			ext2_dec_nlink(dp);
 			dp->i_flag |= IN_CHANGE;
 			/*
 			 * M_WAITOK allocations sleep until they succeed,
 			 * so there is no failure case to handle here.
 			 */
 			dirbuf = malloc(dp->i_e2fs->e2fs_bsize, M_TEMP,
 			    M_WAITOK | M_ZERO);
 			error = vn_rdwr(UIO_READ, fvp, (caddr_t)dirbuf,
 			    ip->i_e2fs->e2fs_bsize, (off_t)0,
 			    UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
 			    tcnp->cn_cred, NOCRED, NULL, NULL);
 			if (error == 0) {
 				/* Like ufs little-endian: */
 				namlen = dirbuf->dotdot_type;
 				if (namlen != 2 ||
 				    dirbuf->dotdot_name[0] != '.' ||
 				    dirbuf->dotdot_name[1] != '.') {
 					ext2_dirbad(xp, (doff_t)12,
 					    "rename: mangled dir");
 				} else {
 					dirbuf->dotdot_ino = newparent;
 					/*
 					 * dirblock 0 could be htree root,
 					 * try both csum update functions.
 					 */
 					ext2_dirent_csum_set(ip,
 					    (struct ext2fs_direct_2 *)dirbuf);
 					ext2_dx_csum_set(ip,
 					    (struct ext2fs_direct_2 *)dirbuf);
 					(void)vn_rdwr(UIO_WRITE, fvp,
 					    (caddr_t)dirbuf,
 					    ip->i_e2fs->e2fs_bsize,
 					    (off_t)0, UIO_SYSSPACE,
 					    IO_NODELOCKED | IO_SYNC |
 					    IO_NOMACCHECK, tcnp->cn_cred,
 					    NOCRED, NULL, NULL);
 					cache_purge(fdvp);
 				}
 			}
 			free(dirbuf, M_TEMP);
 		}
 		error = ext2_dirremove(fdvp, fcnp);
 		if (!error) {
 			ext2_dec_nlink(xp);
 			xp->i_flag |= IN_CHANGE;
 		}
 		xp->i_flag &= ~IN_RENAME;
 	}
 	if (dp)
 		vput(fdvp);
 	if (xp)
 		vput(fvp);
 	vrele(ap->a_fvp);
 	return (error);
 
 bad:
 	/* Release the (locked) target vnode and target directory. */
 	if (xp)
 		vput(ITOV(xp));
 	vput(ITOV(dp));
 out:
 	/* Take back the link added in step 1 and drop the source vnode. */
 	if (doingdirectory)
 		ip->i_flag &= ~IN_RENAME;
 	if (vn_lock(fvp, LK_EXCLUSIVE) == 0) {
 		ext2_dec_nlink(ip);
 		ip->i_flag |= IN_CHANGE;
 		ip->i_flag &= ~IN_RENAME;
 		vput(fvp);
 	} else
 		vrele(fvp);
 	return (error);
 }
 
 #ifdef UFS_ACL
 /*
  * Apply the parent directory's default POSIX.1e ACL to a newly created
  * directory.
  *
  * dvp is the parent, tvp the new directory and dmode the mode it would
  * get without ACLs.  When the parent has a non-empty default ACL, the
  * mode is merged with it and the result is installed on tvp both as the
  * access ACL and (unmodified) as the default ACL.  When the parent has
  * no default ACL, or ACLs are not supported, dmode is used as-is and 0
  * is returned.  Any other error from VOP_GETACL()/VOP_SETACL() is
  * returned to the caller.
  */
 static int
 ext2_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp,
     mode_t dmode, struct ucred *cred, struct thread *td)
 {
 	struct inode *ip = VTOI(tvp);
 	struct acl *acl, *dacl;
 	int error;
 
 	acl = acl_alloc(M_WAITOK);
 	dacl = acl_alloc(M_WAITOK);
 
 	/*
 	 * Retrieve default ACL from parent, if any.
 	 */
 	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td);
 	if (error == 0 && acl->acl_cnt != 0) {
 		/*
 		 * Retrieved a non-empty default ACL, so merge mode and
 		 * ACL.  Keep a pristine copy for the new directory's own
 		 * default ACL before the access ACL is synchronised with
 		 * the inode.
 		 */
 		dmode = acl_posix1e_newfilemode(dmode, acl);
 		ip->i_mode = dmode;
 		*dacl = *acl;
 		ext2_sync_acl_from_inode(ip, acl);
 	} else if (error == 0 || error == EOPNOTSUPP) {
 		/*
 		 * Empty ACL, or ACLs not defined or available:
 		 * just use the mode as-is.
 		 */
 		ip->i_mode = dmode;
 		error = 0;
 		goto out;
 	} else {
 		goto out;
 	}
 
 	error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td);
 	if (error == 0)
 		error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, dacl, cred, td);
 #ifdef DEBUG
 	/*
 	 * XXX: This should not happen, as EOPNOTSUPP above
 	 * was supposed to free acl.
 	 */
 	if (error == EOPNOTSUPP)
 		printf("ext2_mkdir: VOP_GETACL() but no VOP_SETACL()\n");
 #endif	/* DEBUG */
 
 out:
 	acl_free(acl);
 	acl_free(dacl);
 
 	return (error);
 }
 
 /*
  * Apply the parent's default POSIX.1e ACL, if it has one, to a newly
  * created non-directory inode.
  *
  * dvp is the parent directory, tvp the new vnode and mode the mode
  * requested for it.  The default ACL is read from dvp with VOP_GETACL().
  * When it is present and non-empty, mode is combined with it by
  * acl_posix1e_newfilemode(), stored in the inode, and the ACL is installed
  * on tvp as its access ACL.  When the ACL is empty or VOP_GETACL() reports
  * EOPNOTSUPP, mode is stored unchanged and 0 is returned.
  *
  * Returns 0 or the error from VOP_GETACL()/VOP_SETACL().  The scratch ACL
  * allocated here is released on every path.
  */
 static int
 ext2_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp,
     mode_t mode, struct ucred *cred, struct thread *td)
 {
 	int error;
 	struct inode *ip = VTOI(tvp);
 	struct acl *acl;
 
 	acl = acl_alloc(M_WAITOK);
 
 	/*
 	 * Retrieve default ACL for parent, if any.
 	 */
 	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, acl, cred, td);
 	switch (error) {
 	case 0:
 		/*
 		 * Two possible ways for default ACL to not be present.
 		 * First, the EA can be undefined, or second, the default
 		 * ACL can be blank.  If it's blank, fall through to the
 		 * "not defined" case; otherwise merge mode and ACL.
 		 */
 		if (acl->acl_cnt != 0) {
 			mode = acl_posix1e_newfilemode(mode, acl);
 			ip->i_mode = mode;
 			ext2_sync_acl_from_inode(ip, acl);
 			break;
 		}
 		/* FALLTHROUGH */
 
 	case EOPNOTSUPP:
 		/*
 		 * Just use the mode as-is.
 		 */
 		ip->i_mode = mode;
 		error = 0;
 		goto out;
 
 	default:
 		/* Any other VOP_GETACL() failure is returned to the caller. */
 		goto out;
 	}
 
 	error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, acl, cred, td);
 	switch (error) {
 	case 0:
 		break;
 
 	case EOPNOTSUPP:
 		/*
 		 * XXX: This should not happen, as EOPNOTSUPP above was
 		 * supposed to free acl.
 		 *
 		 * The message names this function (it used to name the UFS
 		 * routine this code was derived from).  error is left as
 		 * EOPNOTSUPP and is returned.
 		 */
 		printf("ext2_do_posix1e_acl_inheritance_file: VOP_GETACL() "
 		    "but no VOP_SETACL()\n");
 		break;
 
 	default:
 		goto out;
 	}
 
 out:
 	acl_free(acl);
 
 	return (error);
 }
 
 #endif /* UFS_ACL */
 
 /*
  * Mkdir system call.
  *
  * Create the directory named by ap->a_cnp inside ap->a_dvp with the mode
  * requested in ap->a_vap.  On success the new vnode is handed back through
  * ap->a_vpp and 0 is returned; otherwise an errno value is returned
  * (EMLINK when the parent cannot take another link).
  *
  * Error unwinding uses three labels:
  *   undo_nlink	the parent's link count has been bumped and is dropped
  *		again before the new inode is discarded;
  *   bad	the new inode exists but the parent is untouched;
  *   out	only the scratch buffer needs releasing.
  */
 static int
 ext2_mkdir(struct vop_mkdir_args *ap)
 {
 	struct m_ext2fs *fs;
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	struct vnode *tvp;
 	struct dirtemplate dirtemplate, *dtp;
 	char *buf = NULL;
 	int error, dmode;
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ext2_mkdir: no name");
 #endif
 	dp = VTOI(dvp);
 	if ((nlink_t)dp->i_nlink >= EXT4_LINK_MAX &&
 	    !EXT2_HAS_RO_COMPAT_FEATURE(dp->i_e2fs, EXT2F_ROCOMPAT_DIR_NLINK)) {
 		error = EMLINK;
 		goto out;
 	}
 	dmode = vap->va_mode & 0777;
 	dmode |= IFDIR;
 	/*
 	 * Must simulate part of ext2_makeinode here to acquire the inode,
 	 * but not have it entered in the parent directory. The entry is
 	 * made later after writing "." and ".." entries.
 	 */
 	error = ext2_valloc(dvp, dmode, cnp->cn_cred, &tvp);
 	if (error)
 		goto out;
 	ip = VTOI(tvp);
 	fs = ip->i_e2fs;
 	ip->i_gid = dp->i_gid;
 #ifdef SUIDDIR
 	{
 		/*
 		 * if we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TOO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * The new directory also inherits the SUID bit.
 		 * If user's UID and dir UID are the same,
 		 * 'give it away' so that the SUID is still forced on.
 		 */
 		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		    (dp->i_mode & ISUID) && dp->i_uid) {
 			dmode |= ISUID;
 			ip->i_uid = dp->i_uid;
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 		}
 	}
 #else
 	ip->i_uid = cnp->cn_cred->cr_uid;
 #endif
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = dmode;
 	tvp->v_type = VDIR;	/* Rest init'd in getnewvnode(). */
 	ip->i_nlink = 2;
 	if (cnp->cn_flags & ISWHITEOUT)
 		ip->i_flags |= UF_OPAQUE;
 	/*
 	 * Write the new inode out.  A failure here used to be silently
 	 * overwritten by the next ext2_update() call; the parent has not
 	 * been touched yet, so only the new inode needs discarding.
 	 */
 	error = ext2_update(tvp, 1);
 	if (error)
 		goto bad;
 
 	/*
 	 * Bump link count in parent directory
 	 * to reflect work done below.  Should
 	 * be done before reference is created
 	 * so reparation is possible if we crash.
 	 */
 	ext2_inc_nlink(dp);
 	dp->i_flag |= IN_CHANGE;
 	error = ext2_update(dvp, !DOINGASYNC(dvp));
 	if (error)
 		goto undo_nlink;
 
 	/* Initialize directory with "." and ".." from static template. */
 	if (EXT2_HAS_INCOMPAT_FEATURE(ip->i_e2fs,
 	    EXT2F_INCOMPAT_FTYPE))
 		dtp = &mastertemplate;
 	else
 		dtp = &omastertemplate;
 	dirtemplate = *dtp;
 	dirtemplate.dot_ino = ip->i_number;
 	dirtemplate.dotdot_ino = dp->i_number;
 	/*
 	 * note that in ext2 DIRBLKSIZ == blocksize, not DEV_BSIZE so let's
 	 * just redefine it - for this function only
 	 */
 #undef  DIRBLKSIZ
 #define DIRBLKSIZ  VTOI(dvp)->i_e2fs->e2fs_bsize
 	dirtemplate.dotdot_reclen = DIRBLKSIZ - 12;
 	/* M_WAITOK allocations do not return NULL, so no check is needed. */
 	buf = malloc(DIRBLKSIZ, M_TEMP, M_WAITOK | M_ZERO);
 	if (EXT2_HAS_RO_COMPAT_FEATURE(fs, EXT2F_ROCOMPAT_METADATA_CKSUM)) {
 		/* Leave room at the end of the block for the checksum tail. */
 		dirtemplate.dotdot_reclen -= sizeof(struct ext2fs_direct_tail);
 		ext2_init_dirent_tail(EXT2_DIRENT_TAIL(buf, DIRBLKSIZ));
 	}
 	memcpy(buf, &dirtemplate, sizeof(dirtemplate));
 	ext2_dirent_csum_set(ip, (struct ext2fs_direct_2 *)buf);
 	error = vn_rdwr(UIO_WRITE, tvp, (caddr_t)buf,
 	    DIRBLKSIZ, (off_t)0, UIO_SYSSPACE,
 	    IO_NODELOCKED | IO_SYNC | IO_NOMACCHECK, cnp->cn_cred, NOCRED,
 	    NULL, NULL);
 	if (error)
 		goto undo_nlink;
 	if (DIRBLKSIZ > VFSTOEXT2(dvp->v_mount)->um_mountp->mnt_stat.f_bsize)
 		/* XXX should grow with balloc() */
 		panic("ext2_mkdir: blksize");
 	else {
 		ip->i_size = DIRBLKSIZ;
 		ip->i_flag |= IN_CHANGE;
 	}
 
 #ifdef UFS_ACL
 	if (dvp->v_mount->mnt_flag & MNT_ACLS) {
 		error = ext2_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode,
 		    cnp->cn_cred, cnp->cn_thread);
 		/* The parent link count was bumped above; give it back. */
 		if (error)
 			goto undo_nlink;
 	}
 
 #endif /* UFS_ACL */
 
 	/* Directory set up, now install its entry in the parent directory. */
 	error = ext2_direnter(ip, dvp, cnp);
 	if (error)
 		goto undo_nlink;
 
 	*ap->a_vpp = tvp;
 	goto out;
 
 undo_nlink:
 	/* Drop the link that was added to the parent for "..". */
 	ext2_dec_nlink(dp);
 	dp->i_flag |= IN_CHANGE;
 bad:
 	/*
 	 * No need to do an explicit VOP_TRUNCATE here, vrele will do this
 	 * for us because we set the link count to 0.
 	 */
 	ip->i_nlink = 0;
 	ip->i_flag |= IN_CHANGE;
 	vput(tvp);
 out:
 	free(buf, M_TEMP);
 	return (error);
 #undef  DIRBLKSIZ
 #define DIRBLKSIZ  DEV_BSIZE
 }
 
 /*
  * Rmdir system call.
  *
  * Remove the directory ap->a_vp, named by ap->a_cnp, from its parent
  * ap->a_dvp.  Returns 0 on success, ENOTEMPTY when ext2_dirempty() rejects
  * the directory, EPERM when the flag checks below fail, or the error
  * reported by ext2_dirremove() or ext2_truncate().
  */
 static int
 ext2_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	int error;
 
 	ip = VTOI(vp);
 	dp = VTOI(dvp);
 
 	/*
 	 * Verify the directory is empty (and valid).
 	 * (Rmdir ".." won't be valid since
 	 *  ".." will contain a reference to
 	 *  the current directory and thus be
 	 *  non-empty.)
 	 */
 	if (!ext2_dirempty(ip, dp->i_number, cnp->cn_cred)) {
 		error = ENOTEMPTY;
 		goto out;
 	}
 	/*
 	 * Refuse when the parent is append-only or the directory itself is
 	 * marked no-unlink, immutable or append-only.
 	 */
 	if ((dp->i_flags & APPEND)
 	    || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
 		error = EPERM;
 		goto out;
 	}
 	/*
 	 * Delete reference to directory before purging
 	 * inode.  If we crash in between, the directory
 	 * will be reattached to lost+found.
 	 */
 	error = ext2_dirremove(dvp, cnp);
 	if (error)
 		goto out;
 	/* The removed directory's ".." no longer references the parent. */
 	ext2_dec_nlink(dp);
 	dp->i_flag |= IN_CHANGE;
 	cache_purge(dvp);
 	/* The parent is unlocked while the directory is truncated. */
 	VOP_UNLOCK(dvp, 0);
 	/*
 	 * Truncate inode.  The only stuff left
 	 * in the directory is "." and "..".
 	 */
 	ip->i_nlink = 0;
 	error = ext2_truncate(vp, (off_t)0, IO_SYNC, cnp->cn_cred,
 	    cnp->cn_thread);
 	cache_purge(ITOV(ip));
 	/*
 	 * Take the parent's lock again.  Try without sleeping first; if
 	 * that fails, release vp and then lock dvp followed by vp.
 	 */
 	if (vn_lock(dvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 		VOP_UNLOCK(vp, 0);
 		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 out:
 	return (error);
 }
 
 /*
  * symlink -- make a symbolic link
  */
 static int
 ext2_symlink(struct vop_symlink_args *ap)
 {
 	struct vnode *vp, **vpp = ap->a_vpp;
 	struct inode *ip;
 	int len, error;
 
 	error = ext2_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
 	    vpp, ap->a_cnp);
 	if (error)
 		return (error);
 	vp = *vpp;
 	len = strlen(ap->a_target);
 	if (len < vp->v_mount->mnt_maxsymlinklen) {
 		ip = VTOI(vp);
 		bcopy(ap->a_target, (char *)ip->i_shortlink, len);
 		ip->i_size = len;
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	} else
 		error = vn_rdwr(UIO_WRITE, vp, __DECONST(void *, ap->a_target),
 		    len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
 		    ap->a_cnp->cn_cred, NOCRED, NULL, NULL);
 	if (error)
 		vput(vp);
 	return (error);
 }
 
 /*
  * Return target name of a symbolic link.
  *
  * A link whose size is below the mount's mnt_maxsymlinklen is copied to
  * ap->a_uio straight from the inode's i_shortlink area; any other link is
  * read through VOP_READ().  Returns 0 or the error from uiomove() or
  * VOP_READ().
  */
 static int
 ext2_readlink(struct vop_readlink_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	int isize;
 
 	isize = ip->i_size;
 	/*
 	 * Propagate the uiomove() result so that a failed copy is not
 	 * reported to the caller as success.
 	 */
 	if (isize < vp->v_mount->mnt_maxsymlinklen)
 		return (uiomove((char *)ip->i_shortlink, isize, ap->a_uio));
 	return (VOP_READ(vp, ap->a_uio, 0, ap->a_cred));
 }
 
 /*
  * Calculate the logical to physical mapping if not done already,
  * then call the device strategy routine.
  *
  * In order to be able to swap to a file, the ext2_bmaparray() operation may not
  * deadlock on memory.  See ext2_bmap() for details.
  *
  * Always returns 0: a mapping failure is recorded in the buffer
  * (b_error and BIO_ERROR) and the buffer is completed with bufdone().
  * Panics when called on a block or character device vnode.
  */
 static int
 ext2_strategy(struct vop_strategy_args *ap)
 {
 	struct buf *bp = ap->a_bp;
 	struct vnode *vp = ap->a_vp;
 	struct bufobj *bo;
 	daddr_t blkno;
 	int error;
 
 	if (vp->v_type == VBLK || vp->v_type == VCHR)
 		panic("ext2_strategy: spec");
 	/*
 	 * b_blkno still equal to b_lblkno is taken to mean that the buffer
 	 * has not been mapped yet.
 	 */
 	if (bp->b_blkno == bp->b_lblkno) {
 
 		/* Extent-mapped inodes use the ext4 extent lookup. */
 		if (VTOI(ap->a_vp)->i_flag & IN_E4EXTENTS)
 			error = ext4_bmapext(vp, bp->b_lblkno, &blkno, NULL, NULL);
 		else
 			error = ext2_bmaparray(vp, bp->b_lblkno, &blkno, NULL, NULL);
 
 		bp->b_blkno = blkno;
 		if (error) {
 			bp->b_error = error;
 			bp->b_ioflags |= BIO_ERROR;
 			bufdone(bp);
 			return (0);
 		}
 		/*
 		 * NOTE(review): -1 appears to mean that no disk block backs
 		 * this logical block -- confirm against the bmap routines.
 		 */
 		if ((long)bp->b_blkno == -1)
 			vfs_bio_clrbuf(bp);
 	}
 	/* Nothing to transfer for an unbacked block: complete it right away. */
 	if ((long)bp->b_blkno == -1) {
 		bufdone(bp);
 		return (0);
 	}
 	/* Hand the request to the buffer object recorded in the mount. */
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bo = VFSTOEXT2(vp->v_mount)->um_bo;
 	BO_STRATEGY(bo, bp);
 	return (0);
 }
 
 /*
  * Print out the contents of an inode.
  *
  * Emits the inode number via vn_printf() on the inode's device vnode,
  * followed by fifo details when the vnode is a fifo.  Always returns 0.
  */
 static int
 ext2_print(struct vop_print_args *ap)
 {
 	struct vnode *vp;
 	struct inode *ip;
 
 	vp = ap->a_vp;
 	ip = VTOI(vp);
 
 	vn_printf(ip->i_devvp, "\tino %ju", (uintmax_t)ip->i_number);
 	if (vp->v_type == VFIFO) {
 		fifo_printinfo(vp);
 	}
 	printf("\n");
 
 	return (0);
 }
 
 /*
  * Close wrapper for fifos.
  *
  * Update the times on the inode then do device close.
  */
 static int
 ext2fifo_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	VI_LOCK(vp);
 	if (vp->v_usecount > 1)
 		ext2_itimes_locked(vp);
 	VI_UNLOCK(vp);
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Kqfilter wrapper for fifos.
  *
  * The fifo kqfilter routine is tried first; only when it reports an error
  * is the request passed on to vfs_kqfilter(), whose result is returned.
  */
 static int
 ext2fifo_kqfilter(struct vop_kqfilter_args *ap)
 {
 
 	if (fifo_specops.vop_kqfilter(ap) == 0)
 		return (0);
 	return (vfs_kqfilter(ap));
 }
 
 /*
  * Return POSIX pathconf information applicable to ext2 filesystems.
  */
 static int
 ext2_pathconf(struct vop_pathconf_args *ap)
 {
 	int error = 0;
 
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		if (EXT2_HAS_RO_COMPAT_FEATURE(VTOI(ap->a_vp)->i_e2fs,
 		    EXT2F_ROCOMPAT_DIR_NLINK))
 			*ap->a_retval = INT_MAX;
 		else
 			*ap->a_retval = EXT4_LINK_MAX;
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		break;
 	case _PC_PIPE_BUF:
 		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO)
 			*ap->a_retval = PIPE_BUF;
 		else
 			error = EINVAL;
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		break;
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		break;
 
 #ifdef UFS_ACL
 	case _PC_ACL_EXTENDED:
 		if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 		break;
 	case _PC_ACL_PATH_MAX:
 		if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS)
 			*ap->a_retval = ACL_MAX_ENTRIES;
 		else
 			*ap->a_retval = 3;
 		break;
 #endif /* UFS_ACL */
 
 	case _PC_MIN_HOLE_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_PRIO_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_SYNC_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ALLOC_SIZE_MIN:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize;
 		break;
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		break;
 	case _PC_REC_INCR_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_MAX_XFER_SIZE:
 		*ap->a_retval = -1;	/* means ``unlimited'' */
 		break;
 	case _PC_REC_MIN_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_XFER_ALIGN:
 		*ap->a_retval = PAGE_SIZE;
 		break;
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		break;
 
 	default:
 		error = vop_stdpathconf(ap);
 		break;
 	}
 	return (error);
 }
 
 /*
  * Vnode operation to remove a named attribute.
  */
 static int
 ext2_deleteextattr(struct vop_deleteextattr_args *ap)
 {
 	struct inode *ip;
 	struct m_ext2fs *fs;
 	int error;
 
 	ip = VTOI(ap->a_vp);
 	fs = ip->i_e2fs;
 
 	if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR))
 		return (EOPNOTSUPP);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error)
 		return (error);
 
 	error = ENOATTR;
 
 	if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) {
 		error = ext2_extattr_inode_delete(ip, ap->a_attrnamespace, ap->a_name);
 		if (error != ENOATTR)
 			return (error);
 	}
 
 	if (ip->i_facl)
 		error = ext2_extattr_block_delete(ip, ap->a_attrnamespace, ap->a_name);
 
 	return (error);
 }
 
 /*
  * Vnode operation to retrieve a named extended attribute.
  */
 static int
 ext2_getextattr(struct vop_getextattr_args *ap)
 {
 	struct inode *ip;
 	struct m_ext2fs *fs;
 	int error;
 
 	ip = VTOI(ap->a_vp);
 	fs = ip->i_e2fs;
 
 	if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR))
 		return (EOPNOTSUPP);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error)
 		return (error);
 
 	if (ap->a_size != NULL)
 		*ap->a_size = 0;
 
 	error = ENOATTR;
 
 	if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) {
 		error = ext2_extattr_inode_get(ip, ap->a_attrnamespace,
 		    ap->a_name, ap->a_uio, ap->a_size);
 		if (error != ENOATTR)
 			return (error);
 	}
 
 	if (ip->i_facl)
 		error = ext2_extattr_block_get(ip, ap->a_attrnamespace,
 		    ap->a_name, ap->a_uio, ap->a_size);
 
 	return (error);
 }
 
 /*
  * Vnode operation to retrieve extended attributes on a vnode.
  */
 static int
 ext2_listextattr(struct vop_listextattr_args *ap)
 {
 	struct inode *ip;
 	struct m_ext2fs *fs;
 	int error;
 
 	ip = VTOI(ap->a_vp);
 	fs = ip->i_e2fs;
 
 	if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR))
 		return (EOPNOTSUPP);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VREAD);
 	if (error)
 		return (error);
 
 	if (ap->a_size != NULL)
 		*ap->a_size = 0;
 
 	if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) {
 		error = ext2_extattr_inode_list(ip, ap->a_attrnamespace,
 		    ap->a_uio, ap->a_size);
 		if (error)
 			return (error);
 	}
 
 	if (ip->i_facl)
 		error = ext2_extattr_block_list(ip, ap->a_attrnamespace,
 		    ap->a_uio, ap->a_size);
 
 	return (error);
 }
 
 /*
  * Vnode operation to set a named attribute.
  */
 static int
 ext2_setextattr(struct vop_setextattr_args *ap)
 {
 	struct inode *ip;
 	struct m_ext2fs *fs;
 	int error;
 
 	ip = VTOI(ap->a_vp);
 	fs = ip->i_e2fs;
 
 	if (!EXT2_HAS_COMPAT_FEATURE(ip->i_e2fs, EXT2F_COMPAT_EXT_ATTR))
 		return (EOPNOTSUPP);
 
 	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
 		return (EOPNOTSUPP);
 
 	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
 	    ap->a_cred, ap->a_td, VWRITE);
 	if (error)
 		return (error);
 
 	error = ext2_extattr_valid_attrname(ap->a_attrnamespace, ap->a_name);
 	if (error)
 		return (error);
 
 	if (EXT2_INODE_SIZE(fs) != E2FS_REV0_INODE_SIZE) {
 		error = ext2_extattr_inode_set(ip, ap->a_attrnamespace,
 		    ap->a_name, ap->a_uio);
 		if (error != ENOSPC)
 			return (error);
 	}
 
 	error = ext2_extattr_block_set(ip, ap->a_attrnamespace,
 	    ap->a_name, ap->a_uio);
 
 	return (error);
 }
 
 /*
  * Vnode pointer to File handle
  *
  * Fills the handle at ap->a_fhp, viewed as a struct ufid, with the inode
  * number and generation of ap->a_vp.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 ext2_vptofh(struct vop_vptofh_args *ap)
 {
 	struct inode *ip = VTOI(ap->a_vp);
 	struct ufid *fid = (struct ufid *)ap->a_fhp;
 
 	fid->ufid_len = sizeof(*fid);
 	fid->ufid_ino = ip->i_number;
 	fid->ufid_gen = ip->i_gen;
 
 	return (0);
 }
 
 /*
  * Initialize the vnode associated with a new inode, handle aliased
  * vnodes.
  *
  * The vnode type is derived from the inode mode; fifos get the supplied
  * fifoops vector and the root inode is flagged VV_ROOT.  Returns EINVAL
  * when a non-zero mode maps to VNON, 0 otherwise.
  */
 int
 ext2_vinit(struct mount *mntp, struct vop_vector *fifoops, struct vnode **vpp)
 {
 	struct vnode *vp = *vpp;
 	struct inode *ip = VTOI(vp);
 
 	vp->v_type = IFTOVT(ip->i_mode);
 
 	/*
 	 * Only unallocated inodes should be of type VNON.
 	 */
 	if (vp->v_type == VNON && ip->i_mode != 0)
 		return (EINVAL);
 
 	if (vp->v_type == VFIFO) {
 		vp->v_op = fifoops;
 	}
 	if (ip->i_number == EXT2_ROOTINO) {
 		vp->v_vflag |= VV_ROOT;
 	}
 
 	ip->i_modrev = init_va_filerev();
 	*vpp = vp;
 
 	return (0);
 }
 
 /*
  * Allocate a new inode.
  *
  * Creates an inode of the given mode (IFREG when mode carries no type
  * bits) in directory dvp and enters it under the name in cnp.  On success
  * the new vnode is returned through *vpp and 0 is returned.  On failure
  * *vpp is left NULL and an errno value is returned; once the inode has
  * been allocated, a failure zeroes its link count and releases the vnode
  * with vput().
  */
 static int
 ext2_makeinode(int mode, struct vnode *dvp, struct vnode **vpp,
     struct componentname *cnp)
 {
 	struct inode *ip, *pdir;
 	struct vnode *tvp;
 	int error;
 
 	pdir = VTOI(dvp);
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ext2_makeinode: no name");
 #endif
 	*vpp = NULL;
 	if ((mode & IFMT) == 0)
 		mode |= IFREG;
 
 	error = ext2_valloc(dvp, mode, cnp->cn_cred, &tvp);
 	if (error) {
 		return (error);
 	}
 	ip = VTOI(tvp);
 	/* The new inode takes the group of its parent directory. */
 	ip->i_gid = pdir->i_gid;
 #ifdef SUIDDIR
 	{
 		/*
 		 * if we are
 		 * not the owner of the directory,
 		 * and we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TOO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * Note that this drops off the execute bits for security.
 		 */
 		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		    (pdir->i_mode & ISUID) &&
 		    (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) {
 			ip->i_uid = pdir->i_uid;
 			mode &= ~07111;
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 		}
 	}
 #else
 	ip->i_uid = cnp->cn_cred->cr_uid;
 #endif
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = mode;
 	tvp->v_type = IFTOVT(mode);	/* Rest init'd in getnewvnode(). */
 	ip->i_nlink = 1;
 	/*
 	 * Strip the set-group-id bit when the creator is not a member of
 	 * the inode's group and the PRIV_VFS_RETAINSUGID check fails.
 	 */
 	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred)) {
 		if (priv_check_cred(cnp->cn_cred, PRIV_VFS_RETAINSUGID))
 			ip->i_mode &= ~ISGID;
 	}
 
 	if (cnp->cn_flags & ISWHITEOUT)
 		ip->i_flags |= UF_OPAQUE;
 
 	/*
 	 * Make sure inode goes to disk before directory entry.
 	 */
 	error = ext2_update(tvp, !DOINGASYNC(tvp));
 	if (error)
 		goto bad;
 
 #ifdef UFS_ACL
 	/* Inherit the parent's default ACL on ACL-enabled mounts. */
 	if (dvp->v_mount->mnt_flag & MNT_ACLS) {
 		error = ext2_do_posix1e_acl_inheritance_file(dvp, tvp, mode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	}
 #endif /* UFS_ACL */
 
 	error = ext2_direnter(ip, dvp, cnp);
 	if (error)
 		goto bad;
 
 	*vpp = tvp;
 	return (0);
 
 bad:
 	/*
 	 * An error occurred while updating the inode, inheriting the ACL
 	 * or entering the name in the directory, so must deallocate the
 	 * inode.
 	 */
 	ip->i_nlink = 0;
 	ip->i_flag |= IN_CHANGE;
 	vput(tvp);
 	return (error);
 }
 
 /*
  * Vnode op for reading.
  *
  * Copies data from the file behind ap->a_vp into ap->a_uio, starting at
  * uio_offset, until uio_resid is exhausted, the end of the file is reached
  * or an error occurs.  Each iteration moves at most
  * e2fs_fsize - blkoffset bytes.  Returns 0 or an errno value; EOVERFLOW
  * is returned for an offset that lies inside the file but at or beyond
  * e2fs_maxfilesize.  A zero-length request returns 0 immediately.
  */
 static int
 ext2_read(struct vop_read_args *ap)
 {
 	struct vnode *vp;
 	struct inode *ip;
 	struct uio *uio;
 	struct m_ext2fs *fs;
 	struct buf *bp;
 	daddr_t lbn, nextlbn;
 	off_t bytesinfile;
 	long size, xfersize, blkoffset;
 	int error, orig_resid, seqcount;
 	int ioflag;
 
 	vp = ap->a_vp;
 	uio = ap->a_uio;
 	ioflag = ap->a_ioflag;
 
 	/* The sequential-access hint lives in the upper bits of the ioflag. */
 	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
 	ip = VTOI(vp);
 
 #ifdef INVARIANTS
 	if (uio->uio_rw != UIO_READ)
 		panic("%s: mode", "ext2_read");
 
 	if (vp->v_type == VLNK) {
 		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
 			panic("%s: short symlink", "ext2_read");
 	} else if (vp->v_type != VREG && vp->v_type != VDIR)
 		panic("%s: type %d", "ext2_read", vp->v_type);
 #endif
 	orig_resid = uio->uio_resid;
 	KASSERT(orig_resid >= 0, ("ext2_read: uio->uio_resid < 0"));
 	if (orig_resid == 0)
 		return (0);
 	KASSERT(uio->uio_offset >= 0, ("ext2_read: uio->uio_offset < 0"));
 	fs = ip->i_e2fs;
 	if (uio->uio_offset < ip->i_size &&
 	    uio->uio_offset >= fs->e2fs_maxfilesize)
 		return (EOVERFLOW);
 
 	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
 		/* Stop at end of file. */
 		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
 			break;
 		lbn = lblkno(fs, uio->uio_offset);
 		nextlbn = lbn + 1;
 		size = blksize(fs, ip, lbn);
 		blkoffset = blkoff(fs, uio->uio_offset);
 
 		/*
 		 * Limit the transfer to what is left of this block, of the
 		 * request and of the file, whichever is smallest.
 		 */
 		xfersize = fs->e2fs_fsize - blkoffset;
 		if (uio->uio_resid < xfersize)
 			xfersize = uio->uio_resid;
 		if (bytesinfile < xfersize)
 			xfersize = bytesinfile;
 
 		/*
 		 * Pick the read primitive: a plain bread() for the block
 		 * reaching the end of the file, cluster_read() unless the
 		 * mount disables read clustering, a one-block read-ahead
 		 * when the access looks sequential, and bread() otherwise.
 		 */
 		if (lblktosize(fs, nextlbn) >= ip->i_size)
 			error = bread(vp, lbn, size, NOCRED, &bp);
 		else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
 			error = cluster_read(vp, ip->i_size, lbn, size,
 			    NOCRED, blkoffset + uio->uio_resid, seqcount,
 			    0, &bp);
 		} else if (seqcount > 1) {
 			u_int nextsize = blksize(fs, ip, nextlbn);
 
 			error = breadn(vp, lbn,
 			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
 		} else
 			error = bread(vp, lbn, size, NOCRED, &bp);
 		if (error) {
 			brelse(bp);
 			bp = NULL;
 			break;
 		}
 
 		/*
 		 * We should only get non-zero b_resid when an I/O error
 		 * has occurred, which should cause us to break above.
 		 * However, if the short read did not cause an error,
 		 * then we want to ensure that we do not uiomove bad
 		 * or uninitialized data.
 		 */
 		size -= bp->b_resid;
 		if (size < xfersize) {
 			if (size == 0)
 				break;
 			xfersize = size;
 		}
 		error = uiomove((char *)bp->b_data + blkoffset,
 		    (int)xfersize, uio);
 		if (error)
 			break;
 		vfs_bio_brelse(bp, ioflag);
 	}
 
 	/*
 	 * This can only happen in the case of an error because the loop
 	 * above resets bp to NULL on each iteration and on normal
 	 * completion has not set a new value into it. so it must have come
 	 * from a 'break' statement
 	 */
 	if (bp != NULL)
 		vfs_bio_brelse(bp, ioflag);
 
 	/*
 	 * Mark the inode as accessed when the read succeeded or moved any
 	 * data, unless the mount is read-only or has access times disabled.
 	 */
 	if ((error == 0 || uio->uio_resid != orig_resid) &&
 	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
 		ip->i_flag |= IN_ACCESS;
 	return (error);
 }
 
 /*
  * Ioctl vnode op.
  *
  * Only FIOSEEKDATA and FIOSEEKHOLE are supported; both are delegated to
  * vn_bmap_seekhole().  Every other command yields ENOTTY.
  */
 static int
 ext2_ioctl(struct vop_ioctl_args *ap)
 {
 
 	if (ap->a_command == FIOSEEKDATA || ap->a_command == FIOSEEKHOLE) {
 		return (vn_bmap_seekhole(ap->a_vp, ap->a_command,
 		    (off_t *)ap->a_data, ap->a_cred));
 	}
 	return (ENOTTY);
 }
 
 /*
  * Vnode op for writing.
  *
  * Copies data from ap->a_uio into the file behind ap->a_vp, allocating
  * blocks with ext2_balloc() as needed.  Returns 0 or an errno value:
  * EPERM for a write to an append-only file at an offset other than its
  * end, EFBIG when the write would exceed e2fs_maxfilesize or
  * vn_rlimit_fsize() rejects it, or the error from block allocation,
  * uiomove() or the final inode update.  With IO_UNIT set, a failed write
  * is undone: the file is truncated back to its original size and the
  * uio offset and residual are restored.
  */
 static int
 ext2_write(struct vop_write_args *ap)
 {
 	struct vnode *vp;
 	struct uio *uio;
 	struct inode *ip;
 	struct m_ext2fs *fs;
 	struct buf *bp;
 	daddr_t lbn;
 	off_t osize;
 	int blkoffset, error, flags, ioflag, resid, size, seqcount, xfersize;
 
 	ioflag = ap->a_ioflag;
 	uio = ap->a_uio;
 	vp = ap->a_vp;
 
 	/* The sequential-access hint lives in the upper bits of the ioflag. */
 	seqcount = ioflag >> IO_SEQSHIFT;
 	ip = VTOI(vp);
 
 #ifdef INVARIANTS
 	if (uio->uio_rw != UIO_WRITE)
 		panic("%s: mode", "ext2_write");
 #endif
 
 	switch (vp->v_type) {
 	case VREG:
 		/* Appending writes always start at the current end of file. */
 		if (ioflag & IO_APPEND)
 			uio->uio_offset = ip->i_size;
 		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
 			return (EPERM);
 		/* FALLTHROUGH */
 	case VLNK:
 		break;
 	case VDIR:
 		/* XXX differs from ffs -- this is called from ext2_mkdir(). */
 		if ((ioflag & IO_SYNC) == 0)
 			panic("ext2_write: nonsync dir write");
 		break;
 	default:
 		panic("ext2_write: type %p %d (%jd,%jd)", (void *)vp,
 		    vp->v_type, (intmax_t)uio->uio_offset,
 		    (intmax_t)uio->uio_resid);
 	}
 
 	KASSERT(uio->uio_resid >= 0, ("ext2_write: uio->uio_resid < 0"));
 	KASSERT(uio->uio_offset >= 0, ("ext2_write: uio->uio_offset < 0"));
 	fs = ip->i_e2fs;
 	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->e2fs_maxfilesize)
 		return (EFBIG);
 	/*
 	 * Maybe this should be above the vnode op call, but so long as
 	 * file servers have no limits, I don't think it matters.
 	 */
 	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
 		return (EFBIG);
 
 	/* Remember the starting state so a failed write can be rolled back. */
 	resid = uio->uio_resid;
 	osize = ip->i_size;
 	/* Pass the (clamped) sequential hint on to the block allocator. */
 	if (seqcount > BA_SEQMAX)
 		flags = BA_SEQMAX << BA_SEQSHIFT;
 	else
 		flags = seqcount << BA_SEQSHIFT;
 	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
 		flags |= IO_SYNC;
 
 	for (error = 0; uio->uio_resid > 0;) {
 		lbn = lblkno(fs, uio->uio_offset);
 		blkoffset = blkoff(fs, uio->uio_offset);
 		/* Transfer at most up to the end of the current block. */
 		xfersize = fs->e2fs_fsize - blkoffset;
 		if (uio->uio_resid < xfersize)
 			xfersize = uio->uio_resid;
 		/* Tell the pager about the new size before extending the file. */
 		if (uio->uio_offset + xfersize > ip->i_size)
 			vnode_pager_setsize(vp, uio->uio_offset + xfersize);
 
 		/*
 		 * We must perform a read-before-write if the transfer size
 		 * does not cover the entire buffer.
 		 */
 		if (fs->e2fs_bsize > xfersize)
 			flags |= BA_CLRBUF;
 		else
 			flags &= ~BA_CLRBUF;
 		error = ext2_balloc(ip, lbn, blkoffset + xfersize,
 		    ap->a_cred, &bp, flags);
 		if (error != 0)
 			break;
 
 		if ((ioflag & (IO_SYNC | IO_INVAL)) == (IO_SYNC | IO_INVAL))
 			bp->b_flags |= B_NOCACHE;
 		if (uio->uio_offset + xfersize > ip->i_size)
 			ip->i_size = uio->uio_offset + xfersize;
 		/* Never copy more than the buffer actually holds. */
 		size = blksize(fs, ip, lbn) - bp->b_resid;
 		if (size < xfersize)
 			xfersize = size;
 
 		error =
 		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
 		/*
 		 * If the buffer is not already filled and we encounter an
 		 * error while trying to fill it, we have to clear out any
 		 * garbage data from the pages instantiated for the buffer.
 		 * If we do not, a failed uiomove() during a write can leave
 		 * the prior contents of the pages exposed to a userland mmap.
 		 *
 		 * Note that we need only clear buffers with a transfer size
 		 * equal to the block size because buffers with a shorter
 		 * transfer size were cleared above by the call to ext2_balloc()
 		 * with the BA_CLRBUF flag set.
 		 *
 		 * If the source region for uiomove identically mmaps the
 		 * buffer, uiomove() performed the NOP copy, and the buffer
 		 * content remains valid because the page fault handler
 		 * validated the pages.
 		 */
 		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
 		    fs->e2fs_bsize == xfersize)
 			vfs_bio_clrbuf(bp);
 
 		vfs_bio_set_flags(bp, ioflag);
 
 		/*
 		 * If IO_SYNC each buffer is written synchronously.  Otherwise
 		 * if we have a severe page deficiency write the buffer
 		 * asynchronously.  Otherwise try to cluster, and if that
 		 * doesn't do it then either do an async write (if O_DIRECT),
 		 * or a delayed write (if not).
 		 */
 		if (ioflag & IO_SYNC) {
 			(void)bwrite(bp);
 		} else if (vm_page_count_severe() ||
 			    buf_dirty_count_severe() ||
 		    (ioflag & IO_ASYNC)) {
 			bp->b_flags |= B_CLUSTEROK;
 			bawrite(bp);
 		} else if (xfersize + blkoffset == fs->e2fs_fsize) {
 			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
 				bp->b_flags |= B_CLUSTEROK;
 				cluster_write(vp, bp, ip->i_size, seqcount, 0);
 			} else {
 				bawrite(bp);
 			}
 		} else if (ioflag & IO_DIRECT) {
 			bp->b_flags |= B_CLUSTEROK;
 			bawrite(bp);
 		} else {
 			bp->b_flags |= B_CLUSTEROK;
 			bdwrite(bp);
 		}
 		/* The buffer has been handed off above; now act on the error. */
 		if (error || xfersize == 0)
 			break;
 	}
 	/*
 	 * If we successfully wrote any data, and we are not the superuser
 	 * we clear the setuid and setgid bits as a precaution against
 	 * tampering.
 	 */
 	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
 	    ap->a_cred) {
 		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID))
 			ip->i_mode &= ~(ISUID | ISGID);
 	}
 	if (error) {
 		/*
 		 * IO_UNIT asks for all-or-nothing semantics: undo the
 		 * partial write and restore the caller's uio.
 		 */
 		if (ioflag & IO_UNIT) {
 			(void)ext2_truncate(vp, osize,
 			    ioflag & IO_SYNC, ap->a_cred, uio->uio_td);
 			uio->uio_offset -= resid - uio->uio_resid;
 			uio->uio_resid = resid;
 		}
 	}
 	/* Some data was written: update the timestamps, synchronously if asked. */
 	if (uio->uio_resid != resid) {
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		if (ioflag & IO_SYNC)
 			error = ext2_update(vp, 1);
 	}
 	return (error);
 }
Index: projects/fuse2/sys/fs/fuse/fuse_vnops.c
===================================================================
--- projects/fuse2/sys/fs/fuse/fuse_vnops.c	(revision 350434)
+++ projects/fuse2/sys/fs/fuse/fuse_vnops.c	(revision 350435)
@@ -1,2470 +1,2471 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2007-2009 Google Inc. and Amit Singh
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  *
  * * Redistributions of source code must retain the above copyright
  *   notice, this list of conditions and the following disclaimer.
  * * Redistributions in binary form must reproduce the above
  *   copyright notice, this list of conditions and the following disclaimer
  *   in the documentation and/or other materials provided with the
  *   distribution.
  * * Neither the name of Google Inc. nor the names of its
  *   contributors may be used to endorse or promote products derived from
  *   this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * Copyright (C) 2005 Csaba Henk.
  * All rights reserved.
  *
  * Copyright (c) 2019 The FreeBSD Foundation
  *
  * Portions of this software were developed by BFF Storage Systems, LLC under
  * sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/queue.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
 #include <sys/namei.h>
 #include <sys/extattr.h>
 #include <sys/stat.h>
 #include <sys/unistd.h>
 #include <sys/filedesc.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
 #include <sys/dirent.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 #include <vm/vnode_pager.h>
 #include <vm/vm_object.h>
 
 #include "fuse.h"
 #include "fuse_file.h"
 #include "fuse_internal.h"
 #include "fuse_ipc.h"
 #include "fuse_node.h"
 #include "fuse_io.h"
 
 #include <sys/priv.h>
 
 /*
  * Maximum number of hardlinks to a single FUSE file.  fuse_vnop_link()
  * refuses to add a link (EMLINK) once the cached link count reaches this
  * value.
  */
 #define FUSE_LINK_MAX                      UINT32_MAX
 
 SDT_PROVIDER_DECLARE(fusefs);
 /*
  * Fuse trace probe:
  * arg0: verbosity.  Higher numbers give more verbose messages
  * arg1: Textual message
  */
 SDT_PROBE_DEFINE2(fusefs, , vnops, trace, "int", "char*");
 
 /*
  * Forward declarations of the vnode operations implemented in this file.
  * They are referenced by the fuse_fifoops and fuse_vnops vectors below.
  */
 static vop_access_t fuse_vnop_access;
 static vop_advlock_t fuse_vnop_advlock;
 static vop_bmap_t fuse_vnop_bmap;
 static vop_close_t fuse_fifo_close;
 static vop_close_t fuse_vnop_close;
 static vop_create_t fuse_vnop_create;
 static vop_deleteextattr_t fuse_vnop_deleteextattr;
 static vop_fdatasync_t fuse_vnop_fdatasync;
 static vop_fsync_t fuse_vnop_fsync;
 static vop_getattr_t fuse_vnop_getattr;
 static vop_getextattr_t fuse_vnop_getextattr;
 static vop_inactive_t fuse_vnop_inactive;
 static vop_link_t fuse_vnop_link;
 static vop_listextattr_t fuse_vnop_listextattr;
 static vop_lookup_t fuse_vnop_lookup;
 static vop_mkdir_t fuse_vnop_mkdir;
 static vop_mknod_t fuse_vnop_mknod;
 static vop_open_t fuse_vnop_open;
 static vop_pathconf_t fuse_vnop_pathconf;
 static vop_read_t fuse_vnop_read;
 static vop_readdir_t fuse_vnop_readdir;
 static vop_readlink_t fuse_vnop_readlink;
 static vop_reclaim_t fuse_vnop_reclaim;
 static vop_remove_t fuse_vnop_remove;
 static vop_rename_t fuse_vnop_rename;
 static vop_rmdir_t fuse_vnop_rmdir;
 static vop_setattr_t fuse_vnop_setattr;
 static vop_setextattr_t fuse_vnop_setextattr;
 static vop_strategy_t fuse_vnop_strategy;
 static vop_symlink_t fuse_vnop_symlink;
 static vop_write_t fuse_vnop_write;
 static vop_getpages_t fuse_vnop_getpages;
 static vop_print_t fuse_vnop_print;
 static vop_vptofh_t fuse_vnop_vptofh;
 
 /*
  * Operations vector for FIFO vnodes.  Anything not listed here is handled
  * by fifo_specops; data transfer through this vector is not expected and
  * panics.
  */
 struct vop_vector fuse_fifoops = {
 	.vop_default =		&fifo_specops,

 	/* Operations serviced by fusefs itself. */
 	.vop_access =		fuse_vnop_access,
 	.vop_close =		fuse_fifo_close,
 	.vop_fsync =		fuse_vnop_fsync,
 	.vop_getattr =		fuse_vnop_getattr,
 	.vop_inactive =		fuse_vnop_inactive,
 	.vop_pathconf =		fuse_vnop_pathconf,
 	.vop_print =		fuse_vnop_print,
 	.vop_reclaim =		fuse_vnop_reclaim,
 	.vop_setattr =		fuse_vnop_setattr,
 	.vop_vptofh =		fuse_vnop_vptofh,

 	/* Operations that must never reach this vector. */
 	.vop_read =		VOP_PANIC,
 	.vop_write =		VOP_PANIC,
 };
 
 /*
  * Operations vector for all non-FIFO fusefs vnodes.  Operations that are
  * not listed fall through to default_vnodeops.
  */
 struct vop_vector fuse_vnops = {
 	.vop_default =		&default_vnodeops,
 	.vop_allocate =		VOP_EINVAL,
 	.vop_access =		fuse_vnop_access,
 	.vop_advlock =		fuse_vnop_advlock,
 	.vop_bmap =		fuse_vnop_bmap,
 	.vop_close =		fuse_vnop_close,
 	.vop_create =		fuse_vnop_create,
 	.vop_deleteextattr =	fuse_vnop_deleteextattr,
 	.vop_fsync =		fuse_vnop_fsync,
 	.vop_fdatasync =	fuse_vnop_fdatasync,
 	.vop_getattr =		fuse_vnop_getattr,
 	.vop_getextattr =	fuse_vnop_getextattr,
 	.vop_inactive =		fuse_vnop_inactive,
 	/*
 	 * TODO: implement vop_ioctl after upgrading to protocol 7.16.
 	 * FUSE_IOCTL was added in 7.11, but 32-bit compat is broken until
 	 * 7.16.
 	 */
 	.vop_link =		fuse_vnop_link,
 	.vop_listextattr =	fuse_vnop_listextattr,
 	.vop_lookup =		fuse_vnop_lookup,
 	.vop_mkdir =		fuse_vnop_mkdir,
 	.vop_mknod =		fuse_vnop_mknod,
 	.vop_open =		fuse_vnop_open,
 	.vop_pathconf =		fuse_vnop_pathconf,
 	/*
 	 * TODO: implement vop_poll after upgrading to protocol 7.21.
 	 * FUSE_POLL was added in protocol 7.11, but it's kind of broken until
 	 * 7.21, which adds the ability for the client to choose which poll
 	 * events it wants, and for a client to deregister a file handle.
 	 */
 	.vop_read =		fuse_vnop_read,
 	.vop_readdir =		fuse_vnop_readdir,
 	.vop_readlink =		fuse_vnop_readlink,
 	.vop_reclaim =		fuse_vnop_reclaim,
 	.vop_remove =		fuse_vnop_remove,
 	.vop_rename =		fuse_vnop_rename,
 	.vop_rmdir =		fuse_vnop_rmdir,
 	.vop_setattr =		fuse_vnop_setattr,
 	.vop_setextattr =	fuse_vnop_setextattr,
 	.vop_strategy =		fuse_vnop_strategy,
 	.vop_symlink =		fuse_vnop_symlink,
 	.vop_write =		fuse_vnop_write,
 	.vop_getpages =		fuse_vnop_getpages,
 	.vop_print =		fuse_vnop_print,
 	.vop_vptofh =		fuse_vnop_vptofh,
 };
 
 /* UMA zone handle; presumably backs fusefs pbuf allocations - confirm at the allocation site. */
 uma_zone_t fuse_pbuf_zone;
 
 /*
  * Thin wrappers around the VM page lock.  The queue variants expand to
  * no-ops.
  *
  * NOTE(review): the two page-lock macros end in a semicolon, so a use such
  * as "if (x) fuse_vm_page_lock(m); else ..." would not compile.  Check the
  * call sites before dropping the trailing semicolons.
  */
 #define fuse_vm_page_lock(m)		vm_page_lock((m));
 #define fuse_vm_page_unlock(m)		vm_page_unlock((m));
 #define fuse_vm_page_lock_queues()	((void)0)
 #define fuse_vm_page_unlock_queues()	((void)0)
 
 /*
  * Permission check for extended attribute operations, modelled on
  * extattr_check_cred().
  *
  * Returns 0 when the operation is allowed, EPERM for an unrecognized
  * namespace, or whatever error the privilege or access check reports.
  */
 static int
 fuse_extattr_check_cred(struct vnode *vp, int ns, struct ucred *cred,
     struct thread *td, accmode_t accmode)
 {
 	struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp));
 
 	/* Requests issued by the kernel itself always succeed. */
 	if (cred == NOCRED)
 		return (0);
 
 	/* Only the system and user namespaces are recognized. */
 	if (ns != EXTATTR_NAMESPACE_SYSTEM && ns != EXTATTR_NAMESPACE_USER)
 		return (EPERM);
 
 	/*
 	 * Do not allow privileged processes in jail to directly manipulate
 	 * system attributes.  This is enforced only when the mount uses
 	 * default permissions; otherwise the system namespace is treated
 	 * like the user namespace.
 	 */
 	if (ns == EXTATTR_NAMESPACE_SYSTEM &&
 	    (data->dataflags & FSESS_DEFAULT_PERMISSIONS) != 0)
 		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM));
 
 	return (fuse_internal_access(vp, accmode, td, cred));
 }
 
 /*
  * Get a filehandle for a directory: try one opened for reading first and
  * fall back to one opened for execute.  Returns 0 on success, otherwise
  * the error from the FEXEC attempt.
  */
 static int
 fuse_filehandle_get_dir(struct vnode *vp, struct fuse_filehandle **fufhp,
     struct ucred *cred, pid_t pid)
 {
 	int err;
 
 	err = fuse_filehandle_get(vp, FREAD, fufhp, cred, pid);
 	if (err != 0)
 		err = fuse_filehandle_get(vp, FEXEC, fufhp, cred, pid);
 	return err;
 }
 
 /*
  * Send FUSE_FLUSH for this vnode.
  *
  * Returns 0 without contacting the daemon when the session has FUSE_FLUSH
  * marked as unimplemented.  An ENOSYS reply marks the opcode as
  * unimplemented for the mount and is reported as success.
  */
 static int
 fuse_flush(struct vnode *vp, struct ucred *cred, pid_t pid, int fflag)
 {
 	struct thread *td = curthread;
 	struct mount *mp = vnode_mount(vp);
 	struct fuse_dispatcher fdi;
 	struct fuse_filehandle *fufh;
 	struct fuse_flush_in *ffi;
 	int err;
 
 	if (!fsess_isimpl(mp, FUSE_FLUSH))
 		return 0;
 
 	err = fuse_filehandle_getrw(vp, fflag, &fufh, cred, pid);
 	if (err != 0)
 		return err;
 
 	fdisp_init(&fdi, sizeof(*ffi));
 	fdisp_make_vp(&fdi, FUSE_FLUSH, vp, td, cred);
 
 	ffi = fdi.indata;
 	ffi->fh = fufh->fh_id;
 	/*
 	 * lock_owner is only meaningful when the file has a POSIX lock and is
 	 * undefined otherwise, so it is simply always filled in.
 	 */
 	ffi->lock_owner = td->td_proc->p_pid;
 
 	err = fdisp_wait_answ(&fdi);
 	if (err == ENOSYS) {
 		/* Remember that the daemon lacks FUSE_FLUSH. */
 		fsess_set_notimpl(mp, FUSE_FLUSH);
 		err = 0;
 	}
 
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /* Close wrapper for fifos.  */
 static int
 fuse_fifo_close(struct vop_close_args *ap)
 {
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
     struct vnop_access_args {
 	struct vnode *a_vp;
 #if VOP_ACCESS_TAKES_ACCMODE_T
 	accmode_t a_accmode;
 #else
 	int a_mode;
 #endif
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 /*
  * VOP_ACCESS for fusefs.
  *
  * On a dead file system only the root vnode stays accessible (ENXIO for
  * everything else).  Before the session is initialized only the root vnode
  * may be accessed, and only by selected credentials (EBADF otherwise).
  * Symlinks are always accessible.  All other cases are decided by
  * fuse_internal_access().
  */
 static int
 fuse_vnop_access(struct vop_access_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct ucred *cred = ap->a_cred;
 	struct fuse_data *data = fuse_get_mpdata(vnode_mount(vp));
 	int accmode = ap->a_accmode;
 
 	if (fuse_isdeadfs(vp))
 		return (vnode_isvroot(vp) ? 0 : ENXIO);
 
 	if ((data->dataflags & FSESS_INITED) == 0) {
 		/*
 		 * NOTE(review): priv_check_cred() conventionally returns 0
 		 * when the privilege is held, so the first operand appears
 		 * to be true when the check *fails* - confirm the intent.
 		 */
 		if (vnode_isvroot(vp) &&
 		    (priv_check_cred(cred, PRIV_VFS_ADMIN) ||
 		    fuse_match_cred(data->daemoncred, cred) == 0))
 			return 0;
 		return EBADF;
 	}
 
 	if (vnode_islnk(vp))
 		return 0;
 
 	return fuse_internal_access(vp, accmode, ap->a_td, ap->a_cred);
 }
 
 /*
  * struct vop_advlock_args {
  *	struct vop_generic_args a_gen;
  *	struct vnode *a_vp;
  *	void *a_id;
  *	int a_op;
  *	struct flock *a_fl;
  *	int a_flags;
  * }
  */
 /*
  * VOP_ADVLOCK for fusefs.
  *
  * Locks are handled locally by vop_stdadvlock() unless the session has
  * FSESS_POSIX_LOCKS set and the request is not an flock(2)-style lock.
  * Otherwise F_GETLK, F_SETLK and F_SETLKW are forwarded to the daemon as
  * FUSE_GETLK, FUSE_SETLK and FUSE_SETLKW.
  *
  * Returns 0 on success, ENXIO on a dead file system, EINVAL for an
  * unsupported a_op, or the error from the filehandle lookup or the daemon.
  * For F_GETLK the caller's struct flock is updated from the reply.
  */
 static int
 fuse_vnop_advlock(struct vop_advlock_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct flock *fl = ap->a_fl;
 	struct thread *td = curthread;
 	struct ucred *cred = td->td_ucred;
 	pid_t pid = td->td_proc->p_pid;
 	struct fuse_filehandle *fufh;
 	struct fuse_dispatcher fdi;
 	struct fuse_lk_in *fli;
 	struct fuse_lk_out *flo;
 	enum fuse_opcode op;
 	int dataflags, err;
 	int flags = ap->a_flags;
 
 	dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags;
 
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 
 	if (!(dataflags & FSESS_POSIX_LOCKS))
 		return vop_stdadvlock(ap);
 	/* FUSE doesn't properly support flock until protocol 7.17 */
 	if (flags & F_FLOCK)
 		return vop_stdadvlock(ap);
 
 	err = fuse_filehandle_get_anyflags(vp, &fufh, cred, pid);
 	if (err)
 		return err;
 
 	/* Validate the operation before any dispatcher state is set up. */
 	switch (ap->a_op) {
 	case F_GETLK:
 		op = FUSE_GETLK;
 		break;
 	case F_SETLK:
 		op = FUSE_SETLK;
 		break;
 	case F_SETLKW:
 		op = FUSE_SETLKW;
 		break;
 	default:
 		return EINVAL;
 	}
 
 	fdisp_init(&fdi, sizeof(*fli));
 	fdisp_make_vp(&fdi, op, vp, td, cred);
 	fli = fdi.indata;
 	fli->fh = fufh->fh_id;
 	fli->owner = fl->l_pid;
 	fli->lk.start = fl->l_start;
 	/* The request carries an inclusive end; l_len == 0 means "to EOF". */
 	if (fl->l_len != 0)
 		fli->lk.end = fl->l_start + fl->l_len - 1;
 	else
 		fli->lk.end = INT64_MAX;
 	fli->lk.type = fl->l_type;
 	fli->lk.pid = fl->l_pid;
 
 	err = fdisp_wait_answ(&fdi);
 
 	/*
 	 * Consume the reply before fdisp_destroy(): fdi.answ points at data
 	 * owned by the dispatcher, so it must not be read after the
 	 * dispatcher has been destroyed.
 	 */
 	if (err == 0 && op == FUSE_GETLK) {
 		flo = fdi.answ;
 		fl->l_type = flo->lk.type;
 		fl->l_pid = flo->lk.pid;
 		if (flo->lk.type != F_UNLCK) {
 			fl->l_start = flo->lk.start;
 			if (flo->lk.end == INT64_MAX)
 				fl->l_len = 0;
 			else
 				fl->l_len = flo->lk.end - flo->lk.start + 1;
 		}
 	}
 
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /* {
 	struct vnode *a_vp;
 	daddr_t a_bn;
 	struct bufobj **a_bop;
 	daddr_t *a_bnp;
 	int *a_runp;
 	int *a_runb;
 } */
 static int
 fuse_vnop_bmap(struct vop_bmap_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct bufobj **bo = ap->a_bop;
 	struct thread *td = curthread;
 	struct mount *mp;
 	struct fuse_dispatcher fdi;
 	struct fuse_bmap_in *fbi;
 	struct fuse_bmap_out *fbo;
 	struct fuse_data *data;
 	uint64_t biosize;
 	off_t filesize;
 	daddr_t lbn = ap->a_bn;
 	daddr_t *pbn = ap->a_bnp;
 	int *runp = ap->a_runp;
 	int *runb = ap->a_runb;
 	int error = 0;
 	int maxrun;
 
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 
 	mp = vnode_mount(vp);
 	data = fuse_get_mpdata(mp);
 	biosize = fuse_iosize(vp);
 	maxrun = MIN(vp->v_mount->mnt_iosize_max / biosize - 1,
 		data->max_readahead_blocks);
 
 	if (bo != NULL)
 		*bo = &vp->v_bufobj;
 
 	/*
 	 * The FUSE_BMAP operation does not include the runp and runb
 	 * variables, so we must guess.  Report nonzero contiguous runs so
 	 * cluster_read will combine adjacent reads.  It's worthwhile to reduce
 	 * upcalls even if we don't know the true physical layout of the file.
 	 * 
 	 * FUSE file systems may opt out of read clustering in two ways:
 	 * * mounting with -onoclusterr
 	 * * Setting max_readahead <= maxbcachebuf during FUSE_INIT
 	 */
 	if (runb != NULL)
 		*runb = MIN(lbn, maxrun);
 	if (runp != NULL) {
 		error = fuse_vnode_size(vp, &filesize, td->td_ucred, td);
 		if (error == 0)
 			*runp = MIN(MAX(0, filesize / biosize - lbn - 1),
 				    maxrun);
 		else
 			*runp = 0;
 	}
 
 	if (fsess_isimpl(mp, FUSE_BMAP)) {
 		fdisp_init(&fdi, sizeof(*fbi));
 		fdisp_make_vp(&fdi, FUSE_BMAP, vp, td, td->td_ucred);
 		fbi = fdi.indata;
 		fbi->block = lbn;
 		fbi->blocksize = biosize;
 		error = fdisp_wait_answ(&fdi);
 		if (error == ENOSYS) {
 			fdisp_destroy(&fdi);
 			fsess_set_notimpl(mp, FUSE_BMAP);
 			error = 0;
 		} else {
 			fbo = fdi.answ;
 			if (error == 0 && pbn != NULL)
 				*pbn = fbo->block;
 			fdisp_destroy(&fdi);
 			return error;
 		}
 	}
 
 	/* If the daemon doesn't support BMAP, make up a sensible default */
 	if (pbn != NULL)
 		*pbn = lbn * btodb(biosize);
 	return (error);
 }
 
 /*
     struct vop_close_args {
 	struct vnode *a_vp;
 	int  a_fflag;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 /*
  * VOP_CLOSE for fusefs: flush the file to the daemon and, if the size was
  * changed locally, push the new size.  Dead file systems, directories and
  * IO_NDELAY closes are no-ops.  Returns the result of the flush.
  */
 static int
 fuse_vnop_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	pid_t pid = td->td_proc->p_pid;
 	int fflag = ap->a_fflag;
 	int err;
 
 	if (fuse_isdeadfs(vp) || vnode_isdir(vp) || (fflag & IO_NDELAY) != 0)
 		return 0;
 
 	err = fuse_flush(vp, cred, pid, fflag);
 	/* TODO: close the file handle, if we're sure it's no longer used */
 	if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0)
 		fuse_vnode_savesize(vp, cred, pid);
 
 	return err;
 }
 
 /*
  * Prepare (but do not send) a FUSE_MKNOD request for the name in cnp
  * inside the directory identified by parentnid.  Used when FUSE_CREATE is
  * not available.  *op is set to FUSE_MKNOD so the caller knows which
  * request is in flight.
  */
 static void
 fdisp_make_mknod_for_fallback(
 	struct fuse_dispatcher *fdip,
 	struct componentname *cnp,
 	struct vnode *dvp,
 	uint64_t parentnid,
 	struct thread *td,
 	struct ucred *cred,
 	mode_t mode,
 	enum fuse_opcode *op)
 {
 	struct fuse_mknod_in *fmni;
 	char *name;
 
 	/* Request layout: struct fuse_mknod_in, then the NUL-terminated name. */
 	fdisp_init(fdip, sizeof(*fmni) + cnp->cn_namelen + 1);
 	*op = FUSE_MKNOD;
 	fdisp_make(fdip, FUSE_MKNOD, vnode_mount(dvp), parentnid, td, cred);
 
 	fmni = fdip->indata;
 	fmni->mode = mode;
 	fmni->rdev = 0;
 
 	name = (char *)fdip->indata + sizeof(*fmni);
 	memcpy(name, cnp->cn_nameptr, cnp->cn_namelen);
 	name[cnp->cn_namelen] = '\0';
 }
 /*
     struct vnop_create_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
     };
 */
 /*
  * VOP_CREATE for fusefs.
  *
  * Sockets are delegated to fuse_internal_mknod().  Regular files are
  * created with FUSE_CREATE when the daemon implements it, otherwise with
  * FUSE_MKNOD followed by a separate FUSE_OPEN.  Any other vnode type is
  * rejected with EINVAL.
  *
  * On success *a_vpp holds the new vnode with an open read/write file
  * handle attached, and the parent's cached attributes and negative name
  * cache entries have been purged.  Returns 0 on success, ENXIO on a dead
  * file system, or an error from the daemon or from vnode allocation.
  */
 static int
 fuse_vnop_create(struct vop_create_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct vattr *vap = ap->a_vap;
 	struct thread *td = cnp->cn_thread;
 	struct ucred *cred = cnp->cn_cred;
 
 	struct fuse_data *data;
 	struct fuse_create_in *fci;
 	struct fuse_entry_out *feo;
 	struct fuse_open_out *foo;
 	struct fuse_dispatcher fdi, fdi2;
 	/* fdip carries the create/mknod request; fdip2 the optional open. */
 	struct fuse_dispatcher *fdip = &fdi;
 	struct fuse_dispatcher *fdip2 = NULL;
 
 	int err;
 
 	struct mount *mp = vnode_mount(dvp);
 	data = fuse_get_mpdata(mp);
 	uint64_t parentnid = VTOFUD(dvp)->nid;
 	mode_t mode = MAKEIMODE(vap->va_type, vap->va_mode);
 	enum fuse_opcode op;
 	int flags;
 
 	if (fuse_isdeadfs(dvp))
 		return ENXIO;
 
 	/* FUSE expects sockets to be created with FUSE_MKNOD */
 	if (vap->va_type == VSOCK)
 		return fuse_internal_mknod(dvp, vpp, cnp, vap);
 
 	/*
 	 * VOP_CREATE doesn't tell us the open(2) flags, so we guess.  Only a
 	 * writable mode makes sense, and we might as well include readability
 	 * too.
 	 */
 	flags = O_RDWR;
 
 	bzero(&fdi, sizeof(fdi));
 
 	if (vap->va_type != VREG)
 		return (EINVAL);
 
 	/*
 	 * The VSOCK test below cannot be true here: sockets returned above
 	 * and every other non-VREG type was just rejected.
 	 */
 	if (!fsess_isimpl(mp, FUSE_CREATE) || vap->va_type == VSOCK) {
 		/* Fallback to FUSE_MKNOD/FUSE_OPEN */
 		fdisp_make_mknod_for_fallback(fdip, cnp, dvp, parentnid, td,
 			cred, mode, &op);
 	} else {
 		/* Use FUSE_CREATE */
 		size_t insize;
 
 		op = FUSE_CREATE;
 		fdisp_init(fdip, sizeof(*fci) + cnp->cn_namelen + 1);
 		fdisp_make(fdip, op, vnode_mount(dvp), parentnid, td, cred);
 		fci = fdip->indata;
 		fci->mode = mode;
 		fci->flags = O_CREAT | flags;
 		/*
 		 * The name follows the fixed-size header.  Daemons older than
 		 * protocol 7.12 get the shorter fuse_open_in header, which
 		 * has no umask field.
 		 */
 		if (fuse_libabi_geq(data, 7, 12)) {
 			insize = sizeof(*fci);
 			fci->umask = td->td_proc->p_fd->fd_cmask;
 		} else {
 			insize = sizeof(struct fuse_open_in);
 		}
 
 		memcpy((char *)fdip->indata + insize, cnp->cn_nameptr,
 		    cnp->cn_namelen);
 		((char *)fdip->indata)[insize + cnp->cn_namelen] = '\0';
 	}
 
 	err = fdisp_wait_answ(fdip);
 
 	if (err) {
 		/*
 		 * The daemon turned out not to implement FUSE_CREATE:
 		 * remember that and retry via FUSE_MKNOD.
 		 */
 		if (err == ENOSYS && op == FUSE_CREATE) {
 			fsess_set_notimpl(mp, FUSE_CREATE);
 			fdisp_destroy(fdip);
 			fdisp_make_mknod_for_fallback(fdip, cnp, dvp,
 				parentnid, td, cred, mode, &op);
 			err = fdisp_wait_answ(fdip);
 		}
 		if (err)
 			goto out;
 	}
 
 	feo = fdip->answ;
 
 	if ((err = fuse_internal_checkentry(feo, vap->va_type))) {
 		goto out;
 	}
 
 	if (op == FUSE_CREATE) {
 		/* The open reply immediately follows the entry reply. */
 		foo = (struct fuse_open_out*)(feo + 1);
 	} else {
 		/* Issue a separate FUSE_OPEN */
 		struct fuse_open_in *foi;
 
 		fdip2 = &fdi2;
 		fdisp_init(fdip2, sizeof(*foi));
 		fdisp_make(fdip2, FUSE_OPEN, vnode_mount(dvp), feo->nodeid, td,
 			cred);
 		foi = fdip2->indata;
 		foi->flags = flags;
 		err = fdisp_wait_answ(fdip2);
 		if (err)
 			goto out;
 		foo = fdip2->answ;
 	}
 	err = fuse_vnode_get(mp, feo, feo->nodeid, dvp, vpp, cnp, vap->va_type);
 	if (err) {
 		/*
 		 * No vnode could be obtained for the new file, so queue a
 		 * FUSE_RELEASE for the handle the daemon already opened.  The
 		 * ids are copied out first because feo and foo point into the
 		 * reply of the dispatcher that is reused below.
 		 *
 		 * NOTE(review): fdip is re-initialized here without a prior
 		 * fdisp_destroy(); confirm the earlier ticket is not leaked.
 		 */
 		struct fuse_release_in *fri;
 		uint64_t nodeid = feo->nodeid;
 		uint64_t fh_id = foo->fh;
 
 		fdisp_init(fdip, sizeof(*fri));
 		fdisp_make(fdip, FUSE_RELEASE, mp, nodeid, td, cred);
 		fri = fdip->indata;
 		fri->fh = fh_id;
 		fri->flags = flags;
 		fuse_insert_callback(fdip->tick, fuse_internal_forget_callback);
 		fuse_insert_message(fdip->tick, false);
 		goto out;
 	}
 	ASSERT_VOP_ELOCKED(*vpp, "fuse_vnop_create");
 	fuse_internal_cache_attrs(*vpp, &feo->attr, feo->attr_valid,
 		feo->attr_valid_nsec, NULL);
 
 	fuse_filehandle_init(*vpp, FUFH_RDWR, NULL, td, cred, foo);
 	fuse_vnode_open(*vpp, foo->open_flags, td);
 	/*
 	 * Purge the parent's attribute cache because the daemon should've
 	 * updated its mtime and ctime
 	 */
 	fuse_vnode_clear_attr_cache(dvp);
 	cache_purge_negative(dvp);
 
 out:
 	if (fdip2)
 		fdisp_destroy(fdip2);
 	fdisp_destroy(fdip);
 	return err;
 }
 
 /*
     struct vnop_fdatasync_args {
 	struct vop_generic_args a_gen;
 	struct vnode * a_vp;
 	struct thread * a_td;
     };
 */
 /*
  * VOP_FDATASYNC for fusefs: write out dirty buffers locally, then ask the
  * daemon to sync, always waiting for completion.  A dead file system is
  * treated as already synced.
  */
 static int
 fuse_vnop_fdatasync(struct vop_fdatasync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	int err;
 
 	if (fuse_isdeadfs(vp))
 		return 0;
 
 	err = vop_stdfdatasync_buf(ap);
 	if (err != 0)
 		return err;
 
 	return fuse_internal_fsync(vp, td, MNT_WAIT, true);
 }
 
 /*
     struct vnop_fsync_args {
 	struct vop_generic_args a_gen;
 	struct vnode * a_vp;
 	int  a_waitfor;
 	struct thread * a_td;
     };
 */
 /*
  * VOP_FSYNC for fusefs: run the standard buffer sync, then ask the daemon
  * to sync with the caller's wait policy.  A dead file system is treated as
  * already synced.
  */
 static int
 fuse_vnop_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	int waitfor = ap->a_waitfor;
 	int err;
 
 	if (fuse_isdeadfs(vp))
 		return 0;
 
 	err = vop_stdfsync(ap);
 	if (err != 0)
 		return err;
 
 	return fuse_internal_fsync(vp, td, waitfor, false);
 }
 
 /*
     struct vnop_getattr_args {
 	struct vnode *a_vp;
 	struct vattr *a_vap;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_getattr(struct vop_getattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 
 	int err = 0;
 	int dataflags;
 
 	dataflags = fuse_get_mpdata(vnode_mount(vp))->dataflags;
 
 	/* Note that we are not bailing out on a dead file system just yet. */
 
 	if (!(dataflags & FSESS_INITED)) {
 		if (!vnode_isvroot(vp)) {
 			fdata_set_dead(fuse_get_mpdata(vnode_mount(vp)));
 			err = ENOTCONN;
 			return err;
 		} else {
 			goto fake;
 		}
 	}
 	err = fuse_internal_getattr(vp, vap, cred, td);
 	if (err == ENOTCONN && vnode_isvroot(vp)) {
 		/* see comment in fuse_vfsop_statfs() */
 		goto fake;
 	} else {
 		return err;
 	}
 
 fake:
 	bzero(vap, sizeof(*vap));
 	vap->va_type = vnode_vtype(vp);
 
 	return 0;
 }
 
 /*
     struct vnop_inactive_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_inactive(struct vop_inactive_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_filehandle *fufh, *fufh_tmp;
 
 	int need_flush = 1;
 
 	LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) {
 		if (need_flush && vp->v_type == VREG) {
 			if ((VTOFUD(vp)->flag & FN_SIZECHANGE) != 0) {
 				fuse_vnode_savesize(vp, NULL, 0);
 			}
 			if ((fvdat->flag & FN_REVOKED) != 0)
 				fuse_io_invalbuf(vp, td);
 			else
 				fuse_io_flushbuf(vp, MNT_WAIT, td);
 			need_flush = 0;
 		}
 		fuse_filehandle_close(vp, fufh, td, NULL);
 	}
 
 	if ((fvdat->flag & FN_REVOKED) != 0)
 		vrecycle(vp);
 
 	return 0;
 }
 
 /*
     struct vnop_link_args {
 	struct vnode *a_tdvp;
 	struct vnode *a_vp;
 	struct componentname *a_cnp;
     };
 */
 /*
  * VOP_LINK for fusefs: ask the daemon to create a hard link to a_vp named
  * a_cnp in directory a_tdvp.
  *
  * Returns 0 on success, ENXIO on a dead file system, EXDEV when the two
  * vnodes are on different mounts, EMLINK when the cached link count has
  * reached FUSE_LINK_MAX, or the error from the daemon or entry validation.
  */
 static int
 fuse_vnop_link(struct vop_link_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct vattr *cached = VTOVA(vp);
 	struct fuse_dispatcher fdi;
 	struct fuse_entry_out *feo;
 	struct fuse_link_in fli;
 	int err;
 
 	if (fuse_isdeadfs(vp))
 		return ENXIO;
 	if (vnode_mount(tdvp) != vnode_mount(vp))
 		return EXDEV;
 
 	/*
 	 * This is a seatbelt check to protect naive userspace filesystems from
 	 * themselves and the limitations of the FUSE IPC protocol.  If a
 	 * filesystem does not allow attribute caching, assume it is capable of
 	 * validating that nlink does not overflow.
 	 */
 	if (cached != NULL && cached->va_nlink >= FUSE_LINK_MAX)
 		return EMLINK;
 
 	fli.oldnodeid = VTOI(vp);
 
 	fdisp_init(&fdi, 0);
 	fuse_internal_newentry_makerequest(vnode_mount(tdvp), VTOI(tdvp), cnp,
 	    FUSE_LINK, &fli, sizeof(fli), &fdi);
 
 	err = fdisp_wait_answ(&fdi);
 	if (err == 0) {
 		feo = fdi.answ;
 		err = fuse_internal_checkentry(feo, vnode_vtype(vp));
 		if (err == 0) {
 			/*
 			 * Purge the parent's attribute cache because the
 			 * daemon should've updated its mtime and ctime
 			 */
 			fuse_vnode_clear_attr_cache(tdvp);
 			fuse_internal_cache_attrs(vp, &feo->attr,
 			    feo->attr_valid, feo->attr_valid_nsec, NULL);
 		}
 	}
 
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /*
  * Argument bundle handed to fuse_lookup_alloc() through
  * vn_vget_ino_gen(); each field is forwarded to fuse_vnode_get().
  */
 struct fuse_lookup_alloc_arg {
 	struct fuse_entry_out *feo;	/* lookup reply; NULL on the ".." path */
 	struct componentname *cnp;	/* name being looked up */
 	uint64_t nid;			/* FUSE node id of the target */
 	enum vtype vtyp;		/* vnode type of the target */
 };
 
 /*
  * Callback for vn_vget_ino_gen(): obtain the vnode described by the
  * struct fuse_lookup_alloc_arg in arg.  lkflags is not used.
  */
 static int
 fuse_lookup_alloc(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
 {
 	struct fuse_lookup_alloc_arg *flaa = arg;
 	int err;
 
 	err = fuse_vnode_get(mp, flaa->feo, flaa->nid, NULL, vpp, flaa->cnp,
 	    flaa->vtyp);
 	return err;
 }
 
 /*
  * Probe fired after every name cache query made by fuse_vnop_lookup:
  * arg0: return value of cache_lookup()
  * arg1: timeout reported by the cache entry
  * arg2: current uptime
  */
 SDT_PROBE_DEFINE3(fusefs, , vnops, cache_lookup,
 	"int", "struct timespec*", "struct timespec*");
 /*
     struct vnop_lookup_args {
 	struct vnodeop_desc *a_desc;
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
     };
 */
 /*
  * VOP_LOOKUP for fusefs.
  *
  * Resolves a_cnp inside directory a_dvp.  "." and ".." are resolved
  * without asking the daemon.  Other names are first tried against the name
  * cache, honouring the entry timeout, and on a miss are resolved with
  * FUSE_LOOKUP.
  *
  * Returns 0 with *a_vpp set when the entry exists, EJUSTRETURN when the
  * entry is missing but may be created by a CREATE/RENAME on the last
  * component, ENOENT when it is missing otherwise, and various errors for
  * dead file systems, non-directories, read-only mounts and permission
  * failures.
  */
 int
 fuse_vnop_lookup(struct vop_lookup_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct thread *td = cnp->cn_thread;
 	struct ucred *cred = cnp->cn_cred;
 
 	int nameiop = cnp->cn_nameiop;
 	int flags = cnp->cn_flags;
 	int wantparent = flags & (LOCKPARENT | WANTPARENT);
 	int islastcn = flags & ISLASTCN;
 	struct mount *mp = vnode_mount(dvp);
 
 	int err = 0;
 	int lookup_err = 0;
 	struct vnode *vp = NULL;
 
 	struct fuse_dispatcher fdi;
 	/* True once fdi has been set up and therefore needs destroying. */
 	bool did_lookup = false;
 	struct fuse_entry_out *feo = NULL;
 	enum vtype vtyp;	/* vnode type of target */
 	off_t filesize;		/* filesize of target */
 
 	uint64_t nid;
 
 	if (fuse_isdeadfs(dvp)) {
 		*vpp = NULL;
 		return ENXIO;
 	}
 	if (!vnode_isdir(dvp))
 		return ENOTDIR;
 
 	/* Anything but a plain lookup of the last component would modify. */
 	if (islastcn && vfs_isrdonly(mp) && (nameiop != LOOKUP))
 		return EROFS;
 
 	/* Searching a directory requires execute permission on it. */
 	if ((err = fuse_internal_access(dvp, VEXEC, td, cred)))
 		return err;
 
 	if (flags & ISDOTDOT) {
 		KASSERT(VTOFUD(dvp)->flag & FN_PARENT_NID,
 			("Looking up .. is TODO"));
 		nid = VTOFUD(dvp)->parent_nid;
 		if (nid == 0)
 			return ENOENT;
 		/* .. is obviously a directory */
 		vtyp = VDIR;
 		filesize = 0;
 	} else if (cnp->cn_namelen == 1 && *(cnp->cn_nameptr) == '.') {
 		nid = VTOI(dvp);
 		/* . is obviously a directory */
 		vtyp = VDIR;
 		filesize = 0;
 	} else {
 		struct timespec now, timeout;
 
 		err = cache_lookup(dvp, vpp, cnp, &timeout, NULL);
 		getnanouptime(&now);
 		SDT_PROBE3(fusefs, , vnops, cache_lookup, err, &timeout, &now);
 		switch (err) {
 		case -1:		/* positive match */
 			if (timespeccmp(&timeout, &now, >)) {
 				counter_u64_add(fuse_lookup_cache_hits, 1);
 			} else {
 				/* Cache timeout */
 				counter_u64_add(fuse_lookup_cache_misses, 1);
 				bintime_clear(
 					&VTOFUD(*vpp)->entry_cache_timeout);
 				cache_purge(*vpp);
 				/* Drop the reference cache_lookup returned. */
 				if (dvp != *vpp)
 					vput(*vpp);
 				else
 					vrele(*vpp);
 				*vpp = NULL;
 				break;
 			}
 			return 0;
 
 		case 0:		/* no match in cache */
 			counter_u64_add(fuse_lookup_cache_misses, 1);
 			break;
 
 		case ENOENT:		/* negative match */
 			getnanouptime(&now);
 			if (timespeccmp(&timeout, &now, <=)) {
 				/* Cache timeout */
 				cache_purge_negative(dvp);
 				break;
 			}
 			/* fall through */
 		default:
 			return err;
 		}
 
 		/* Cache miss or expired entry: ask the daemon. */
 		nid = VTOI(dvp);
 		fdisp_init(&fdi, cnp->cn_namelen + 1);
 		fdisp_make(&fdi, FUSE_LOOKUP, mp, nid, td, cred);
 
 		memcpy(fdi.indata, cnp->cn_nameptr, cnp->cn_namelen);
 		((char *)fdi.indata)[cnp->cn_namelen] = '\0';
 		lookup_err = fdisp_wait_answ(&fdi);
 		did_lookup = true;
 
 		if (!lookup_err) {
 			/* lookup call succeeded */
 			feo = (struct fuse_entry_out *)fdi.answ;
 			nid = feo->nodeid;
 			if (nid == 0) {
 				/* zero nodeid means ENOENT and cache it */
 				struct timespec timeout;
 
 				fdi.answ_stat = ENOENT;
 				lookup_err = ENOENT;
 				if (cnp->cn_flags & MAKEENTRY) {
 					fuse_validity_2_timespec(feo, &timeout);
 					cache_enter_time(dvp, *vpp, cnp,
 						&timeout, NULL);
 				}
 			} else if (nid == FUSE_ROOT_ID) {
 				/* Only the mount root may use the root id. */
 				lookup_err = EINVAL;
 			}
 			vtyp = IFTOVT(feo->attr.mode);
 			filesize = feo->attr.size;
 		}
 		/*
 		 * Only an ENOENT reported by the daemon (answ_stat set) is
 		 * handled below; every other failure is returned as-is.
 		 */
 		if (lookup_err && (!fdi.answ_stat || lookup_err != ENOENT)) {
 			fdisp_destroy(&fdi);
 			return lookup_err;
 		}
 	}
 	/* lookup_err, if non-zero, must be ENOENT at this point */
 
 	if (lookup_err) {
 		/* Entry not found */
 		if ((nameiop == CREATE || nameiop == RENAME) && islastcn) {
 			err = fuse_internal_access(dvp, VWRITE, td, cred);
 			if (!err) {
 				/*
 				 * Set the SAVENAME flag to hold onto the
 				 * pathname for use later in VOP_CREATE or
 				 * VOP_RENAME.
 				 */
 				cnp->cn_flags |= SAVENAME;
 
 				err = EJUSTRETURN;
 			}
 		} else {
 			err = ENOENT;
 		}
 	} else {
 		/* Entry was found */
 		if (flags & ISDOTDOT) {
 			struct fuse_lookup_alloc_arg flaa;
 
 			/* feo is NULL on this path; no daemon reply exists. */
 			flaa.nid = nid;
 			flaa.feo = feo;
 			flaa.cnp = cnp;
 			flaa.vtyp = vtyp;
 			err = vn_vget_ino_gen(dvp, fuse_lookup_alloc, &flaa, 0,
 				&vp);
 			*vpp = vp;
 		} else if (nid == VTOI(dvp)) {
 			/* The name resolves to the directory itself. */
 			vref(dvp);
 			*vpp = dvp;
 		} else {
 			struct fuse_vnode_data *fvdat;
 
 			err = fuse_vnode_get(vnode_mount(dvp), feo, nid, dvp,
 			    &vp, cnp, vtyp);
 			if (err)
 				goto out;
 			*vpp = vp;
 
 			/*
 			 * In the case where we are looking up a FUSE node
 			 * represented by an existing cached vnode, and the
 			 * true size reported by FUSE_LOOKUP doesn't match
 			 * the vnode's cached size, then any cached writes
 			 * beyond the file's current size are lost.
 			 *
 			 * We can get here:
 			 * * following attribute cache expiration, or
 			 * * due to a bug in the daemon.
 			 */
 			fvdat = VTOFUD(vp);
 			if (vnode_isreg(vp) &&
 			    filesize != fvdat->cached_attrs.va_size &&
 			    fvdat->flag & FN_SIZECHANGE) {
 				/*
 				 * The FN_SIZECHANGE flag reflects a dirty
 				 * append.  If userspace lets us know our cache
 				 * is invalid, that write was lost.  (Dirty
 				 * writes that do not cause append are also
 				 * lost, but we don't detect them here.)
 				 *
 				 * XXX: Maybe disable WB caching on this mount.
 				 */
 				printf("%s: WB cache incoherent on %s!\n",
 				    __func__,
 				    vnode_mount(vp)->mnt_stat.f_mntonname);
 
 				fvdat->flag &= ~FN_SIZECHANGE;
 			}
 
 			MPASS(feo != NULL);
 			fuse_internal_cache_attrs(*vpp, &feo->attr,
 				feo->attr_valid, feo->attr_valid_nsec, NULL);
 			fuse_validity_2_bintime(feo->entry_valid,
 				feo->entry_valid_nsec,
 				&fvdat->entry_cache_timeout);
 
 			if ((nameiop == DELETE || nameiop == RENAME) &&
 				islastcn)
 			{
 				struct vattr dvattr;
 
 				err = fuse_internal_access(dvp, VWRITE, td,
 					cred);
 				if (err != 0)
 					goto out;
 				/*
 				 * if the parent's sticky bit is set, check
 				 * whether we're allowed to remove the file.
 				 * Need to figure out the vnode locking to make
 				 * this work.
 				 *
 				 * NOTE(review): the return value of
 				 * fuse_internal_getattr() is ignored, so
 				 * dvattr may be read uninitialized if the
 				 * call fails - confirm.
 				 */
 				fuse_internal_getattr(dvp, &dvattr, cred, td);
 				if ((dvattr.va_mode & S_ISTXT) &&
 					fuse_internal_access(dvp, VADMIN, td,
 						cred) &&
 					fuse_internal_access(*vpp, VADMIN, td,
 						cred)) {
 					err = EPERM;
 					goto out;
 				}
 			}
 
 			/* Keep the name for the following remove or rename. */
 			if (islastcn && (
 				(nameiop == DELETE) ||
 				(nameiop == RENAME && wantparent))) {
 				cnp->cn_flags |= SAVENAME;
 			}
 
 		}
 	}
 out:
 	if (err) {
 		/* Release any vnode acquired above and report none. */
 		if (vp != NULL && dvp != vp)
 			vput(vp);
 		else if (vp != NULL)
 			vrele(vp);
 		*vpp = NULL;
 	}
 	if (did_lookup)
 		fdisp_destroy(&fdi);
 
 	return err;
 }
 
 /*
     struct vnop_mkdir_args {
 	struct vnode *a_dvp;
 	struct vnode **a_vpp;
 	struct componentname *a_cnp;
 	struct vattr *a_vap;
     };
 */
 static int
 fuse_vnop_mkdir(struct vop_mkdir_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vnode **vpp = ap->a_vpp;
 	struct componentname *cnp = ap->a_cnp;
 	struct vattr *vap = ap->a_vap;
 
 	struct fuse_mkdir_in fmdi;
 
 	if (fuse_isdeadfs(dvp)) {
 		return ENXIO;
 	}
 	fmdi.mode = MAKEIMODE(vap->va_type, vap->va_mode);
 	fmdi.umask = curthread->td_proc->p_fd->fd_cmask;
 
 	return (fuse_internal_newentry(dvp, vpp, cnp, FUSE_MKDIR, &fmdi,
 	    sizeof(fmdi), VDIR));
 }
 
 /*
  * Create a filesystem node by delegating to fuse_internal_mknod().
  *
  * Returns ENXIO when fuse_isdeadfs() reports the parent vnode as dead;
  * otherwise returns the result of fuse_internal_mknod().
  *
  *  struct vnop_mknod_args {
  *	struct vnode *a_dvp;
  *	struct vnode **a_vpp;
  *	struct componentname *a_cnp;
  *	struct vattr *a_vap;
  *  };
  */
 static int
 fuse_vnop_mknod(struct vop_mknod_args *ap)
 {
 	struct vnode *parent = ap->a_dvp;
 
 	if (fuse_isdeadfs(parent)) {
 		return ENXIO;
 	}
 
 	return fuse_internal_mknod(parent, ap->a_vpp, ap->a_cnp, ap->a_vap);
 }
 
 /*
  * Open a FUSE vnode.
  *
  * Returns:
  *   ENXIO       if fuse_isdeadfs() reports the vnode as dead,
  *   EOPNOTSUPP  for character devices, block devices and FIFOs,
  *   EINVAL      if none of FREAD, FWRITE or FEXEC is requested,
  *   0           if fuse_filehandle_validrw() already accepts this
  *               mode/cred/pid combination (fuse_vnode_open() is still
  *               called), otherwise the result of fuse_filehandle_open().
  *
  *  struct vop_open_args {
  *	struct vnode *a_vp;
  *	int  a_mode;
  *	struct ucred *a_cred;
  *	struct thread *a_td;
  *	int a_fdidx; / struct file *a_fp;
  *  };
  */
 static int
 fuse_vnop_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int a_mode = ap->a_mode;
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	pid_t pid = td->td_proc->p_pid;
 
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 	if (vp->v_type == VCHR || vp->v_type == VBLK || vp->v_type == VFIFO)
 		return (EOPNOTSUPP);
 	if ((a_mode & (FREAD | FWRITE | FEXEC)) == 0)
 		return (EINVAL);
 
 	/* Reuse an existing handle when one is valid for this open. */
 	if (fuse_filehandle_validrw(vp, a_mode, cred, pid)) {
 		fuse_vnode_open(vp, 0, td);
 		return (0);
 	}
 
 	return (fuse_filehandle_open(vp, a_mode, NULL, td, cred));
 }
 
 /*
  * Answer pathconf queries.  The names handled below get fixed values;
  * every other name is forwarded to vop_stdpathconf().
  */
 static int
 fuse_vnop_pathconf(struct vop_pathconf_args *ap)
 {
 
 	switch (ap->a_name) {
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = NAME_MAX;
 		break;
 	case _PC_LINK_MAX:
 		*ap->a_retval = MIN(LONG_MAX, FUSE_LINK_MAX);
 		break;
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		break;
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		break;
 	default:
 		return (vop_stdpathconf(ap));
 	}
 
 	return (0);
 }
 
 /*
  * Read from a FUSE vnode via fuse_io_dispatch().
  *
  * IO_DIRECT is added to the I/O flags when the vnode's FN_DIRECTIO flag is
  * set.  Returns ENXIO when fuse_isdeadfs() reports the vnode as dead.
  *
  *  struct vnop_read_args {
  *	struct vnode *a_vp;
  *	struct uio *a_uio;
  *	int  a_ioflag;
  *	struct ucred *a_cred;
  *  };
  */
 static int
 fuse_vnop_read(struct vop_read_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int flags = ap->a_ioflag;
 
 	if (fuse_isdeadfs(vp))
 		return ENXIO;
 
 	if ((VTOFUD(vp)->flag & FN_DIRECTIO) != 0)
 		flags |= IO_DIRECT;
 
 	return fuse_io_dispatch(vp, ap->a_uio, flags, ap->a_cred,
 	    curthread->td_proc->p_pid);
 }
 
 /*
  * Read directory entries from a FUSE directory.
  *
  * a_eofflag, a_ncookies and a_cookies are optional: each is only written
  * when the corresponding pointer is non-NULL.  When cookies are requested,
  * an array is allocated from M_TEMP and handed to the caller on success;
  * on failure it is freed here and the caller sees a NULL array and a count
  * of zero.
  *
  * Returns ENXIO when fuse_isdeadfs() reports the vnode as dead, EINVAL when
  * the uio cannot hold a single struct dirent, or the error from obtaining
  * the directory file handle or from fuse_internal_readdir().
  *
  *  struct vnop_readdir_args {
  *	struct vnode *a_vp;
  *	struct uio *a_uio;
  *	struct ucred *a_cred;
  *	int *a_eofflag;
  *	int *a_ncookies;
  *	u_long **a_cookies;
  *  };
  */
 static int
 fuse_vnop_readdir(struct vop_readdir_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct ucred *cred = ap->a_cred;
 	struct fuse_filehandle *fufh = NULL;
 	struct fuse_iov cookediov;
 	int err = 0;
 	u_long *cookies;
 	off_t startoff;
 	ssize_t tresid;
 	int ncookies;
 	bool closefufh = false;
 	pid_t pid = curthread->td_proc->p_pid;
 
 	if (ap->a_eofflag != NULL)
 		*ap->a_eofflag = 0;
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 	if (				/* XXXIP ((uio_iovcnt(uio) > 1)) || */
 	    (uio_resid(uio) < sizeof(struct dirent))) {
 		return EINVAL;
 	}
 
 	/* Remember the starting state so progress can be detected below. */
 	tresid = uio->uio_resid;
 	startoff = uio->uio_offset;
 	err = fuse_filehandle_get_dir(vp, &fufh, cred, pid);
 	if (err == EBADF && vnode_mount(vp)->mnt_flag & MNT_EXPORTED) {
 		/* 
 		 * nfsd will do VOP_READDIR without first doing VOP_OPEN.  We
 		 * must implicitly open the directory here
 		 */
 		err = fuse_filehandle_open(vp, FREAD, &fufh, curthread, cred);
 		if (err == 0) {
 			/*
 			 * When a directory is opened, it must be read from
 			 * the beginning.  Hopefully, the "startoff" still
 			 * exists as an offset cookie for the directory.
 			 * If not, it will read the entire directory without
 			 * returning any entries and just return eof.
 			 */
 			uio->uio_offset = 0;
 		}
 		closefufh = true;
 	}
 	if (err)
 		return (err);
 	if (ap->a_ncookies != NULL) {
 		ncookies = uio->uio_resid /
 			(offsetof(struct dirent, d_name) + 4) + 1;
 		cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK);
 		*ap->a_ncookies = ncookies;
 		*ap->a_cookies = cookies;
 	} else {
 		ncookies = 0;
 		cookies = NULL;
 	}
 #define DIRCOOKEDSIZE FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + MAXNAMLEN + 1)
 	fiov_init(&cookediov, DIRCOOKEDSIZE);
 
 	err = fuse_internal_readdir(vp, uio, startoff, fufh, &cookediov,
 		&ncookies, cookies);
 
 	fiov_teardown(&cookediov);
 	/* Only close the handle if it was implicitly opened above. */
 	if (closefufh)
 		fuse_filehandle_close(vp, fufh, curthread, cred);
 
 	if (ap->a_ncookies != NULL) {
 		if (err == 0) {
 			/* ncookies now holds the number of unused slots. */
 			*ap->a_ncookies -= ncookies;
 		} else {
 			free(*ap->a_cookies, M_TEMP);
 			*ap->a_ncookies = 0;
 			*ap->a_cookies = NULL;
 		}
 	}
 	/*
 	 * No data was transferred: report end-of-directory.  a_eofflag is
 	 * optional, so it must be checked before it is written.
 	 */
 	if (ap->a_eofflag != NULL && err == 0 && tresid == uio->uio_resid)
 		*ap->a_eofflag = 1;
 
 	return err;
 }
 
 /*
  * Read the target of a symbolic link with a FUSE_READLINK request.
  *
  * When the answer starts with '/' and the mount has
  * FSESS_PUSH_SYMLINKS_IN set, the mount point path (f_mntonname) is copied
  * out ahead of the answer.  Returns ENXIO when fuse_isdeadfs() reports the
  * vnode as dead and EINVAL when the vnode is not a symlink.
  *
  *  struct vnop_readlink_args {
  *	struct vnode *a_vp;
  *	struct uio *a_uio;
  *	struct ucred *a_cred;
  *  };
  */
 static int
 fuse_vnop_readlink(struct vop_readlink_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct fuse_dispatcher fdi;
 	int err;
 
 	if (fuse_isdeadfs(vp))
 		return ENXIO;
 	if (!vnode_islnk(vp))
 		return EINVAL;
 
 	fdisp_init(&fdi, 0);
 	err = fdisp_simple_putget_vp(&fdi, FUSE_READLINK, vp, curthread,
 	    ap->a_cred);
 	if (err == 0) {
 		if (((char *)fdi.answ)[0] == '/' &&
 		    fuse_get_mpdata(vnode_mount(vp))->dataflags & FSESS_PUSH_SYMLINKS_IN) {
 			char *mntpath = vnode_mount(vp)->mnt_stat.f_mntonname;
 
 			/* Prefix absolute targets with the mount point. */
 			err = uiomove(mntpath, strlen(mntpath), uio);
 		}
 		if (err == 0)
 			err = uiomove(fdi.answ, fdi.iosize, uio);
 	}
 
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /*
     struct vnop_reclaim_args {
 	struct vnode *a_vp;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct thread *td = ap->a_td;
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_filehandle *fufh, *fufh_tmp;
 
 	if (!fvdat) {
 		panic("FUSE: no vnode data during recycling");
 	}
 	LIST_FOREACH_SAFE(fufh, &fvdat->handles, next, fufh_tmp) {
 		printf("FUSE: vnode being reclaimed with open fufh "
 			"(type=%#x)", fufh->fufh_type);
 		fuse_filehandle_close(vp, fufh, td, NULL);
 	}
 
 	if ((!fuse_isdeadfs(vp)) && (fvdat->nlookup)) {
 		fuse_internal_forget_send(vnode_mount(vp), td, NULL, VTOI(vp),
 		    fvdat->nlookup);
 	}
 	fuse_vnode_setparent(vp, NULL);
 	cache_purge(vp);
 	vfs_hash_remove(vp);
 	vnode_destroy_vobject(vp);
 	fuse_vnode_destroy(vp);
 
 	return 0;
 }
 
 /*
  * Remove a non-directory entry with a FUSE_UNLINK request.
  *
  * Returns ENXIO when fuse_isdeadfs() reports the vnode as dead, EPERM when
  * the vnode is a directory, otherwise the result of fuse_internal_remove().
  *
  *  struct vnop_remove_args {
  *	struct vnode *a_dvp;
  *	struct vnode *a_vp;
  *	struct componentname *a_cnp;
  *  };
  */
 static int
 fuse_vnop_remove(struct vop_remove_args *ap)
 {
 	struct vnode *victim = ap->a_vp;
 
 	if (fuse_isdeadfs(victim))
 		return ENXIO;
 	if (vnode_isdir(victim))
 		return EPERM;
 
 	return fuse_internal_remove(ap->a_dvp, victim, ap->a_cnp, FUSE_UNLINK);
 }
 
 /*
  * Rename fvp (found in directory fdvp under name fcnp) to the name tcnp in
  * directory tdvp, replacing tvp if it is non-NULL.
  *
  * Returns EXDEV when source and target are on different mounts, the error
  * from fuse_internal_access() when the permission check below fails, or
  * the result of fuse_internal_rename().
  *
  * Every path through the "out" label releases the four vnodes: tdvp and
  * tvp via vput() (tdvp via vrele() when it equals tvp), fdvp and fvp via
  * vrele().
  *
  * NOTE(review): the early ENXIO return for a dead filesystem does not go
  * through "out" and therefore releases nothing -- confirm this is intended.
  *
  *  struct vnop_rename_args {
  *	struct vnode *a_fdvp;
  *	struct vnode *a_fvp;
  *	struct componentname *a_fcnp;
  *	struct vnode *a_tdvp;
  *	struct vnode *a_tvp;
  *	struct componentname *a_tcnp;
  *  };
  */
 static int
 fuse_vnop_rename(struct vop_rename_args *ap)
 {
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *tvp = ap->a_tvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct fuse_data *data;
 	bool newparent = fdvp != tdvp;
 	bool isdir = fvp->v_type == VDIR;
 	int err = 0;
 
 	if (fuse_isdeadfs(fdvp)) {
 		return ENXIO;
 	}
 	/* Source, target directory and (if any) target must share a mount. */
 	if (fvp->v_mount != tdvp->v_mount ||
 	    (tvp && fvp->v_mount != tvp->v_mount)) {
 		SDT_PROBE2(fusefs, , vnops, trace, 1, "cross-device rename");
 		err = EXDEV;
 		goto out;
 	}
 	cache_purge(fvp);
 
 	/*
 	 * FUSE library is expected to check if target directory is not
 	 * under the source directory in the file system tree.
 	 * Linux performs this check at VFS level.
 	 */
 	/* 
 	 * If source is a directory, and it will get a new parent, user must
 	 * have write permission to it, so ".." can be modified.
 	 */
 	data = fuse_get_mpdata(vnode_mount(tdvp));
 	if (data->dataflags & FSESS_DEFAULT_PERMISSIONS && isdir && newparent) {
 		err = fuse_internal_access(fvp, VWRITE,
 			tcnp->cn_thread, tcnp->cn_cred);
 		if (err)
 			goto out;
 	}
 	/* The rename and the parent-link updates happen under rename_lock. */
 	sx_xlock(&data->rename_lock);
 	err = fuse_internal_rename(fdvp, fcnp, tdvp, tcnp);
 	if (err == 0) {
 		if (tdvp != fdvp)
 			fuse_vnode_setparent(fvp, tdvp);
 		if (tvp != NULL)
 			fuse_vnode_setparent(tvp, NULL);
 	}
 	sx_unlock(&data->rename_lock);
 
 	/* Purge name cache entries regardless of the rename's outcome. */
 	if (tvp != NULL && tvp != fvp) {
 		cache_purge(tvp);
 	}
 	if (vnode_isdir(fvp)) {
 		if ((tvp != NULL) && vnode_isdir(tvp)) {
 			cache_purge(tdvp);
 		}
 		cache_purge(fdvp);
 	}
 out:
 	if (tdvp == tvp) {
 		vrele(tdvp);
 	} else {
 		vput(tdvp);
 	}
 	if (tvp != NULL) {
 		vput(tvp);
 	}
 	vrele(fdvp);
 	vrele(fvp);
 
 	return err;
 }
 
 /*
  * Remove a directory with a FUSE_RMDIR request.
  *
  * Returns ENXIO when fuse_isdeadfs() reports the vnode as dead, EINVAL when
  * the directory and its parent share the same FUSE vnode data, otherwise
  * the result of fuse_internal_remove().
  *
  *  struct vnop_rmdir_args {
  *	struct vnode *a_dvp;
  *	struct vnode *a_vp;
  *	struct componentname *a_cnp;
  *  } *ap;
  */
 static int
 fuse_vnop_rmdir(struct vop_rmdir_args *ap)
 {
 	struct vnode *parent = ap->a_dvp;
 	struct vnode *victim = ap->a_vp;
 
 	if (fuse_isdeadfs(victim))
 		return ENXIO;
 	if (VTOFUD(victim) == VTOFUD(parent))
 		return EINVAL;
 
 	return fuse_internal_remove(parent, victim, ap->a_cnp, FUSE_RMDIR);
 }
 
 /*
  * Change attributes of a FUSE vnode.
  *
  * The function inspects each attribute in *a_vap that is not VNOVAL,
  * rejects changes that are not permitted, and accumulates in "accmode" the
  * access rights that the final fuse_internal_access() call must grant.
  * The extra owner/group/sticky/setgid checks are only applied when the
  * mount has FSESS_DEFAULT_PERMISSIONS set ("checkperm").  If everything
  * passes, the change is sent with fuse_internal_setattr().
  *
  * Note that vap->va_mode may be modified here when set-id bits need to be
  * dropped.
  *
  *  struct vnop_setattr_args {
  *	struct vnode *a_vp;
  *	struct vattr *a_vap;
  *	struct ucred *a_cred;
  *	struct thread *a_td;
  *  };
  */
 static int
 fuse_vnop_setattr(struct vop_setattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct vattr *vap = ap->a_vap;
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 	struct mount *mp;
 	struct fuse_data *data;
 	struct vattr old_va;
 	int dataflags;
 	int err = 0, err2;
 	accmode_t accmode = 0;
 	bool checkperm;
 	bool drop_suid = false;
 	gid_t cr_gid;
 
 	mp = vnode_mount(vp);
 	data = fuse_get_mpdata(mp);
 	dataflags = data->dataflags;
 	checkperm = dataflags & FSESS_DEFAULT_PERMISSIONS;
 	/* NOTE(review): cr_gid is assigned here but not read in this function. */
 	if (cred->cr_ngroups > 0)
 		cr_gid = cred->cr_groups[0];
 	else
 		cr_gid = 0;
 
 	if (fuse_isdeadfs(vp)) {
 		return ENXIO;
 	}
 
 	/* Owner change. */
 	if (vap->va_uid != (uid_t)VNOVAL) {
 		if (checkperm) {
 			/* Only root may change a file's owner */
 			err = priv_check_cred(cred, PRIV_VFS_CHOWN);
 			if (err) {
 				/* As a special case, allow the null chown */
 				err2 = fuse_internal_getattr(vp, &old_va, cred,
 					td);
 				if (err2)
 					return (err2);
 				if (vap->va_uid != old_va.va_uid)
 					return err;
 				else
 					accmode |= VADMIN;
 				drop_suid = true;
 			} else
 				accmode |= VADMIN;
 		} else
 			accmode |= VADMIN;
 	}
 	/* Group change. */
 	if (vap->va_gid != (gid_t)VNOVAL) {
 		/* An unprivileged chgrp clears the set-id bits (see below). */
 		if (checkperm && priv_check_cred(cred, PRIV_VFS_CHOWN))
 			drop_suid = true;
 		if (checkperm && !groupmember(vap->va_gid, cred))
 		{
 			/*
 			 * Non-root users may only chgrp to one of their own
 			 * groups
 			 */
 			err = priv_check_cred(cred, PRIV_VFS_CHOWN);
 			if (err) {
 				/* As a special case, allow the null chgrp */
 				err2 = fuse_internal_getattr(vp, &old_va, cred,
 					td);
 				if (err2)
 					return (err2);
 				if (vap->va_gid != old_va.va_gid)
 					return err;
 				accmode |= VADMIN;
 			} else
 				accmode |= VADMIN;
 		} else
 			accmode |= VADMIN;
 	}
 	/* Size change (truncate/extend). */
 	if (vap->va_size != VNOVAL) {
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			if (vfs_isrdonly(mp))
 				return (EROFS);
 			break;
 		default:
 			/*
 			 * According to POSIX, the result is unspecified
 			 * for file types other than regular files,
 			 * directories and shared memory objects.  We
 			 * don't support shared memory objects in the file
 			 * system, and have dubious support for truncating
 			 * symlinks.  Just ignore the request in other cases.
 			 */
 			return (0);
 		}
 		/* Don't set accmode.  Permission to trunc is checked upstack */
 	}
 	/* Timestamp change: VA_UTIMES_NULL only needs write access. */
 	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
 		if (vap->va_vaflags & VA_UTIMES_NULL)
 			accmode |= VWRITE;
 		else
 			accmode |= VADMIN;
 	}
 	/*
 	 * Strip S_ISUID/S_ISGID, either from the requested mode or, if no
 	 * mode was requested, from the file's current mode (which turns this
 	 * call into a mode change as well).
 	 */
 	if (drop_suid) {
 		if (vap->va_mode != (mode_t)VNOVAL)
 			vap->va_mode &= ~(S_ISUID | S_ISGID);
 		else {
 			err = fuse_internal_getattr(vp, &old_va, cred, td);
 			if (err)
 				return (err);
 			vap->va_mode = old_va.va_mode & ~(S_ISUID | S_ISGID);
 		}
 	}
 	/* Mode change. */
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		/* Only root may set the sticky bit on non-directories */
 		if (checkperm && vp->v_type != VDIR && (vap->va_mode & S_ISTXT)
 		    && priv_check_cred(cred, PRIV_VFS_STICKYFILE))
 			return EFTYPE;
 		/*
 		 * Setting S_ISGID requires membership in the file's current
 		 * group or the PRIV_VFS_SETGID privilege.
 		 */
 		if (checkperm && (vap->va_mode & S_ISGID)) {
 			err = fuse_internal_getattr(vp, &old_va, cred, td);
 			if (err)
 				return (err);
 			if (!groupmember(old_va.va_gid, cred)) {
 				err = priv_check_cred(cred, PRIV_VFS_SETGID);
 				if (err)
 					return (err);
 			}
 		}
 		accmode |= VADMIN;
 	}
 
 	if (vfs_isrdonly(mp))
 		return EROFS;
 
 	/* Final access check with the rights accumulated above. */
 	err = fuse_internal_access(vp, accmode, td, cred);
 	if (err)
 		return err;
 	else
 		return fuse_internal_setattr(vp, vap, td, cred);
 }
 
 /*
     struct vnop_strategy_args {
 	struct vnode *a_vp;
 	struct buf *a_bp;
     };
 */
 static int
 fuse_vnop_strategy(struct vop_strategy_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct buf *bp = ap->a_bp;
 
 	if (!vp || fuse_isdeadfs(vp)) {
 		bp->b_ioflags |= BIO_ERROR;
 		bp->b_error = ENXIO;
 		bufdone(bp);
 		return 0;
 	}
 
 	/*
 	 * VOP_STRATEGY always returns zero and signals error via bp->b_ioflags.
 	 * fuse_io_strategy sets bp's error fields
 	 */
 	(void)fuse_io_strategy(vp, bp);
 
 	return 0;
 }
 
 
 /*
  * Create a symbolic link with a FUSE_SYMLINK request.
  *
  * The request payload is the NUL-terminated name of the new entry followed
  * by the NUL-terminated link target.  Returns ENXIO when fuse_isdeadfs()
  * reports the parent vnode as dead, otherwise the result of
  * fuse_internal_newentry_core().
  *
  *  struct vnop_symlink_args {
  *	struct vnode *a_dvp;
  *	struct vnode **a_vpp;
  *	struct componentname *a_cnp;
  *	struct vattr *a_vap;
  *	char *a_target;
  *  };
  */
 static int
 fuse_vnop_symlink(struct vop_symlink_args *ap)
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	const char *target = ap->a_target;
 	struct fuse_dispatcher fdi;
 	char *payload;
 	size_t target_size;
 	int err;
 
 	if (fuse_isdeadfs(dvp))
 		return ENXIO;
 
 	/*
 	 * Unlike the other creator type calls, the name of the new entry
 	 * comes first in this message and the data describing the entry
 	 * comes second, so fuse_internal_newentry() cannot be used.  Build
 	 * the message by hand and call the core routine directly.
 	 */
 	target_size = strlen(target) + 1;
 	fdisp_init(&fdi, target_size + cnp->cn_namelen + 1);
 	fdisp_make_vp(&fdi, FUSE_SYMLINK, dvp, curthread, NULL);
 
 	payload = (char *)fdi.indata;
 	memcpy(payload, cnp->cn_nameptr, cnp->cn_namelen);
 	payload[cnp->cn_namelen] = '\0';
 	memcpy(payload + cnp->cn_namelen + 1, target, target_size);
 
 	err = fuse_internal_newentry_core(dvp, ap->a_vpp, cnp, VLNK, &fdi);
 	fdisp_destroy(&fdi);
 	return err;
 }
 
 /*
  * Write to a FUSE vnode via fuse_io_dispatch().
  *
  * IO_DIRECT is added to the I/O flags when the vnode's FN_DIRECTIO flag is
  * set.  Returns ENXIO when fuse_isdeadfs() reports the vnode as dead.
  *
  *  struct vnop_write_args {
  *	struct vnode *a_vp;
  *	struct uio *a_uio;
  *	int  a_ioflag;
  *	struct ucred *a_cred;
  *  };
  */
 static int
 fuse_vnop_write(struct vop_write_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int flags = ap->a_ioflag;
 
 	if (fuse_isdeadfs(vp))
 		return ENXIO;
 
 	if ((VTOFUD(vp)->flag & FN_DIRECTIO) != 0)
 		flags |= IO_DIRECT;
 
 	return fuse_io_dispatch(vp, ap->a_uio, flags, ap->a_cred,
 	    curthread->td_proc->p_pid);
 }
 
 /*
  * Map a byte offset to a logical block number, using fuse_iosize() as the
  * block size.  Passed to vfs_bio_getpages() by fuse_vnop_getpages().
  */
 static daddr_t
 fuse_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
 {
 	const int blocksize = fuse_iosize(vp);
 	daddr_t blkno;
 
 	blkno = off / blocksize;
 	return (blkno);
 }
 
 /*
  * Return the number of valid bytes in logical block "lbn":
  *   0                        if the block starts at or beyond end of file,
  *   the remaining byte count if the file ends inside the block,
  *   the full I/O size        otherwise.
  * If the file size cannot be obtained the full I/O size is returned (and
  * the KASSERT fires on kernels that enable it).
  */
 static int
 fuse_gbp_getblksz(struct vnode *vp, daddr_t lbn)
 {
 	const int biosize = fuse_iosize(vp);
 	off_t filesize, blkstart;
 	int err;
 
 	err = fuse_vnode_size(vp, &filesize, NULL, NULL);
 	KASSERT(err == 0, ("vfs_bio_getpages can't handle errors here"));
 	if (err)
 		return biosize;
 
 	blkstart = (off_t)lbn * biosize;
 	if (blkstart >= filesize)
 		return (0);
 	if ((off_t)(lbn + 1) * biosize > filesize)
 		return (filesize - blkstart);
 	return (biosize);
 }
 
 /*
  * Page in data for a FUSE vnode through vfs_bio_getpages().
  *
  * Returns VM_PAGER_ERROR (after firing a trace probe) when
  * fsess_opt_mmap() is false for the mount.
  *
  * Fields of struct vop_getpages_args read here:
  *	a_vp, a_m, a_count, a_rbehind, a_rahead
  */
 static int
 fuse_vnop_getpages(struct vop_getpages_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
 	if (fsess_opt_mmap(vnode_mount(vp))) {
 		return (vfs_bio_getpages(vp, ap->a_m, ap->a_count,
 		    ap->a_rbehind, ap->a_rahead, fuse_gbp_getblkno,
 		    fuse_gbp_getblksz));
 	}
 
 	SDT_PROBE2(fusefs, , vnops, trace, 1,
 		"called on non-cacheable vnode??\n");
 	return (VM_PAGER_ERROR);
 }
 
 /*
  * Character placed between the namespace prefix and the attribute name
  * when building FUSE extended attribute names (e.g. "user.attribute_name").
  */
 static const char extattr_namespace_separator = '.';
 
 /*
     struct vop_getextattr_args {
 	struct vop_generic_args a_gen;
 	struct vnode *a_vp;
 	int a_attrnamespace;
 	const char *a_name;
 	struct uio *a_uio;
 	size_t *a_size;
 	struct ucred *a_cred;
 	struct thread *a_td;
     };
 */
 static int
 fuse_vnop_getextattr(struct vop_getextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct fuse_dispatcher fdi;
 	struct fuse_getxattr_in *get_xattr_in;
 	struct fuse_getxattr_out *get_xattr_out;
 	struct mount *mp = vnode_mount(vp);
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	char *prefix;
 	char *attr_str;
 	size_t len;
 	int err;
 
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 
 	if (!fsess_isimpl(mp, FUSE_GETXATTR))
 		return EOPNOTSUPP;
 
 	err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD);
 	if (err)
 		return err;
 
 	/* Default to looking for user attributes. */
 	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
 		prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
 	else
 		prefix = EXTATTR_NAMESPACE_USER_STRING;
 
 	len = strlen(prefix) + sizeof(extattr_namespace_separator) +
 	    strlen(ap->a_name) + 1;
 
 	fdisp_init(&fdi, len + sizeof(*get_xattr_in));
 	fdisp_make_vp(&fdi, FUSE_GETXATTR, vp, td, cred);
 
 	get_xattr_in = fdi.indata;
 	/*
 	 * Check to see whether we're querying the available size or
 	 * issuing the actual request.  If we pass in 0, we get back struct
 	 * fuse_getxattr_out.  If we pass in a non-zero size, we get back
 	 * that much data, without the struct fuse_getxattr_out header.
 	 */
 	if (uio == NULL)
 		get_xattr_in->size = 0;
 	else
 		get_xattr_in->size = uio->uio_resid;
 
 	attr_str = (char *)fdi.indata + sizeof(*get_xattr_in);
 	snprintf(attr_str, len, "%s%c%s", prefix, extattr_namespace_separator,
 	    ap->a_name);
 
 	err = fdisp_wait_answ(&fdi);
 	if (err != 0) {
 		if (err == ENOSYS) {
 			fsess_set_notimpl(mp, FUSE_GETXATTR);
 			err = EOPNOTSUPP;
 		}
 		goto out;
 	}
 
 	get_xattr_out = fdi.answ;
 
 	if (ap->a_size != NULL)
 		*ap->a_size = get_xattr_out->size;
 
 	if (uio != NULL)
 		err = uiomove(fdi.answ, fdi.iosize, uio);
 
 out:
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 /*
  * Set an extended attribute with a FUSE_SETXATTR request.
  *
  * The request consists of struct fuse_setxattr_in, the NUL-terminated
  * name "<namespace>.<a_name>", and the attribute value copied from a_uio.
  * A NULL a_uio is rejected: EOPNOTSUPP if FUSE_REMOVEXATTR is not
  * implemented for this mount, EINVAL otherwise.  ENOSYS from the daemon is
  * turned into EOPNOTSUPP and ERESTART into EINTR.
  *
  *  struct vop_setextattr_args {
  *	struct vop_generic_args a_gen;
  *	struct vnode *a_vp;
  *	int a_attrnamespace;
  *	const char *a_name;
  *	struct uio *a_uio;
  *	struct ucred *a_cred;
  *	struct thread *a_td;
  *  };
  */
 static int
 fuse_vnop_setextattr(struct vop_setextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct fuse_dispatcher fdi;
 	struct fuse_setxattr_in *request;
 	struct mount *mp = vnode_mount(vp);
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	char *nsname;
 	char *fullname;
 	size_t fullname_size;
 	int err;
 
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 
 	if (!fsess_isimpl(mp, FUSE_SETXATTR))
 		return EOPNOTSUPP;
 
 	if (vfs_isrdonly(mp))
 		return EROFS;
 
 	/* Deleting xattrs must use VOP_DELETEEXTATTR instead */
 	if (ap->a_uio == NULL) {
 		/*
 		 * If we got here as fallback from VOP_DELETEEXTATTR, then
 		 * return EOPNOTSUPP.
 		 */
 		return (fsess_isimpl(mp, FUSE_REMOVEXATTR) ?
 		    EINVAL : EOPNOTSUPP);
 	}
 
 	err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td,
 		VWRITE);
 	if (err)
 		return err;
 
 	/* Anything that is not the system namespace is treated as user. */
 	nsname = (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) ?
 	    EXTATTR_NAMESPACE_SYSTEM_STRING : EXTATTR_NAMESPACE_USER_STRING;
 
 	/* Prefix, separator, name and terminating NUL. */
 	fullname_size = strlen(nsname) + sizeof(extattr_namespace_separator) +
 	    strlen(ap->a_name) + 1;
 
 	fdisp_init(&fdi, fullname_size + sizeof(*request) + uio->uio_resid);
 	fdisp_make_vp(&fdi, FUSE_SETXATTR, vp, td, cred);
 
 	request = fdi.indata;
 	request->size = uio->uio_resid;
 
 	fullname = (char *)fdi.indata + sizeof(*request);
 	snprintf(fullname, fullname_size, "%s%c%s", nsname,
 	    extattr_namespace_separator, ap->a_name);
 
 	/* The attribute value follows the name in the request. */
 	err = uiomove((char *)fdi.indata + sizeof(*request) + fullname_size,
 	    uio->uio_resid, uio);
 	if (err == 0) {
 		err = fdisp_wait_answ(&fdi);
 		switch (err) {
 		case ENOSYS:
 			fsess_set_notimpl(mp, FUSE_SETXATTR);
 			err = EOPNOTSUPP;
 			break;
 		case ERESTART:
 			/* Can't restart after calling uiomove */
 			err = EINTR;
 			break;
 		default:
 			break;
 		}
 	}
 
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 /*
  * The Linux / FUSE extended attribute list is simply a collection of
  * NUL-terminated strings.  The FreeBSD extended attribute list is a single
  * byte length followed by a non-NUL terminated string.  So, this allows
  * conversion of the Linux / FUSE format to the FreeBSD format.
  * Linux attribute names are reported with the namespace as a prefix (e.g.
  * "user.attribute_name"), but in FreeBSD they are reported without the
  * namespace prefix (e.g. "attribute_name").  So, we're going from:
  *
  * user.attr_name1\0user.attr_name2\0
  *
  * to:
  *
  * <num>attr_name1<num>attr_name2
  *
  * Where "<num>" is a single byte number of characters in the attribute name.
  * Entries whose name does not start with "<prefix>." are skipped.
  *
  * The input list comes from the FUSE daemon, so it is never read beyond
  * list_len: an entry that is not NUL-terminated is treated as ending at
  * the end of the list, and an entry is only compared against the prefix
  * when it is long enough to contain the prefix and the separator.
  *
  * Args:
  * prefix - extattr namespace prefix string
  * list, list_len - input list with namespace prefixes
  * bsd_list, bsd_list_len - output list compatible with bsd vfs.  Each
  *	converted entry is no longer than its source entry, so a buffer of
  *	list_len bytes is sufficient.
  *
  * Returns 0 on success, or ENAMETOOLONG if a stripped attribute name is
  * EXTATTR_MAXNAMELEN characters or longer (*bsd_list_len then covers only
  * the entries converted so far).
  */
 static int
 fuse_xattrlist_convert(char *prefix, const char *list, int list_len,
     char *bsd_list, int *bsd_list_len)
 {
 	const int sep_len = sizeof(extattr_namespace_separator);
 	int len, pos, entry_len, dist_to_next, prefix_len;
 
 	pos = 0;
 	*bsd_list_len = 0;
 	prefix_len = strlen(prefix);
 
 	while (pos < list_len && list[pos] != '\0') {
 		/* Bounded replacement for strlen(): stop at list_len. */
 		entry_len = 0;
 		while (pos + entry_len < list_len &&
 		    list[pos + entry_len] != '\0')
 			entry_len++;
 		dist_to_next = entry_len + 1;
 
 		/*
 		 * Check the length first so that neither bcmp() nor the
 		 * separator test can look past the end of this entry.
 		 */
 		if (entry_len >= prefix_len + sep_len &&
 		    bcmp(&list[pos], prefix, prefix_len) == 0 &&
 		    list[pos + prefix_len] == extattr_namespace_separator) {
 			len = entry_len - (prefix_len + sep_len);
 			if (len >= EXTATTR_MAXNAMELEN)
 				return (ENAMETOOLONG);
 
 			bsd_list[*bsd_list_len] = len;
 			memcpy(&bsd_list[*bsd_list_len + 1],
 			    &list[pos + prefix_len + sep_len], len);
 
 			*bsd_list_len += len + 1;
 		}
 
 		pos += dist_to_next;
 	}
 
 	return (0);
 }
 
 /*
  * List the extended attributes of a vnode in one namespace.
  *
  * Two FUSE_LISTXATTR requests are issued with the same dispatcher: the
  * first, with size 0, asks the daemon how large the list is; the second
  * retrieves the list itself.  The Linux / FUSE formatted list is then
  * converted with fuse_xattrlist_convert() into a temporary M_TEMP buffer,
  * whose length is stored in *a_size (if non-NULL) and whose contents are
  * copied to a_uio (if non-NULL).
  *
  * If the daemon answers the first request with ENOSYS the operation is
  * marked as not implemented for this mount and EOPNOTSUPP is returned.
  *
  *  struct vop_listextattr_args {
  *	struct vop_generic_args a_gen;
  *	struct vnode *a_vp;
  *	int a_attrnamespace;
  *	struct uio *a_uio;
  *	size_t *a_size;
  *	struct ucred *a_cred;
  *	struct thread *a_td;
  *  };
  */
 static int
 fuse_vnop_listextattr(struct vop_listextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct fuse_dispatcher fdi;
 	struct fuse_listxattr_in *list_xattr_in;
 	struct fuse_listxattr_out *list_xattr_out;
 	struct mount *mp = vnode_mount(vp);
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	size_t len;
 	char *prefix;
 	char *attr_str;
 	char *bsd_list = NULL;
 	char *linux_list;
 	int bsd_list_len;
 	int linux_list_len;
 	int err;
 
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 
 	if (!fsess_isimpl(mp, FUSE_LISTXATTR))
 		return EOPNOTSUPP;
 
 	err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td, VREAD);
 	if (err)
 		return err;
 
 	/*
 	 * Add space for a NUL and the period separator if enabled.
 	 * Default to looking for user attributes.
 	 */
 	if (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM)
 		prefix = EXTATTR_NAMESPACE_SYSTEM_STRING;
 	else
 		prefix = EXTATTR_NAMESPACE_USER_STRING;
 
 	len = strlen(prefix) + sizeof(extattr_namespace_separator) + 1;
 
 	fdisp_init(&fdi, sizeof(*list_xattr_in) + len);
 	fdisp_make_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
 
 	/*
 	 * Retrieve Linux / FUSE compatible list size.
 	 */
 	list_xattr_in = fdi.indata;
 	list_xattr_in->size = 0;
 	attr_str = (char *)fdi.indata + sizeof(*list_xattr_in);
 	snprintf(attr_str, len, "%s%c", prefix, extattr_namespace_separator);
 
 	err = fdisp_wait_answ(&fdi);
 	if (err != 0) {
 		if (err == ENOSYS) {
 			fsess_set_notimpl(mp, FUSE_LISTXATTR);
 			err = EOPNOTSUPP;
 		}
 		goto out;
 	}
 
 	list_xattr_out = fdi.answ;
 	linux_list_len = list_xattr_out->size;
 	/* An empty list needs no second request and no conversion. */
 	if (linux_list_len == 0) {
 		if (ap->a_size != NULL)
 			*ap->a_size = linux_list_len;
 		goto out;
 	}
 
 	/*
 	 * Retrieve Linux / FUSE compatible list values.
 	 */
 	fdisp_refresh_vp(&fdi, FUSE_LISTXATTR, vp, td, cred);
 	list_xattr_in = fdi.indata;
 	list_xattr_in->size = linux_list_len + sizeof(*list_xattr_out);
 	attr_str = (char *)fdi.indata + sizeof(*list_xattr_in);
 	snprintf(attr_str, len, "%s%c", prefix, extattr_namespace_separator);
 
 	err = fdisp_wait_answ(&fdi);
 	if (err != 0)
 		goto out;
 
 	/* From here on the list length is the size of the actual answer. */
 	linux_list = fdi.answ;
 	linux_list_len = fdi.iosize;
 
 	/*
 	 * Retrieve the BSD compatible list values.
 	 * The Linux / FUSE attribute list format isn't the same
 	 * as FreeBSD's format. So we need to transform it into
 	 * FreeBSD's format before giving it to the user.
 	 */
 	bsd_list = malloc(linux_list_len, M_TEMP, M_WAITOK);
 	err = fuse_xattrlist_convert(prefix, linux_list, linux_list_len,
 	    bsd_list, &bsd_list_len);
 	if (err != 0)
 		goto out;
 
 	if (ap->a_size != NULL)
 		*ap->a_size = bsd_list_len;
 
 	if (uio != NULL)
 		err = uiomove(bsd_list, bsd_list_len, uio);
 
 out:
 	/* bsd_list is still NULL on the paths that jump here early. */
 	free(bsd_list, M_TEMP);
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 /*
  * Delete an extended attribute with a FUSE_REMOVEXATTR request.
  *
  * The request payload is the NUL-terminated name "<namespace>.<a_name>".
  * If the daemon answers ENOSYS the operation is marked as not implemented
  * for this mount and EOPNOTSUPP is returned.
  *
  *  struct vop_deleteextattr_args {
  *	struct vop_generic_args a_gen;
  *	struct vnode *a_vp;
  *	int a_attrnamespace;
  *	const char *a_name;
  *	struct ucred *a_cred;
  *	struct thread *a_td;
  *  };
  */
 static int
 fuse_vnop_deleteextattr(struct vop_deleteextattr_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct fuse_dispatcher fdi;
 	struct mount *mp = vnode_mount(vp);
 	struct thread *td = ap->a_td;
 	struct ucred *cred = ap->a_cred;
 	char *nsname;
 	char *fullname;
 	size_t fullname_size;
 	int err;
 
 	if (fuse_isdeadfs(vp))
 		return (ENXIO);
 
 	if (!fsess_isimpl(mp, FUSE_REMOVEXATTR))
 		return EOPNOTSUPP;
 
 	if (vfs_isrdonly(mp))
 		return EROFS;
 
 	err = fuse_extattr_check_cred(vp, ap->a_attrnamespace, cred, td,
 		VWRITE);
 	if (err)
 		return err;
 
 	/* Anything that is not the system namespace is treated as user. */
 	nsname = (ap->a_attrnamespace == EXTATTR_NAMESPACE_SYSTEM) ?
 	    EXTATTR_NAMESPACE_SYSTEM_STRING : EXTATTR_NAMESPACE_USER_STRING;
 
 	/* Prefix, separator, name and terminating NUL. */
 	fullname_size = strlen(nsname) + sizeof(extattr_namespace_separator) +
 	    strlen(ap->a_name) + 1;
 
 	fdisp_init(&fdi, fullname_size);
 	fdisp_make_vp(&fdi, FUSE_REMOVEXATTR, vp, td, cred);
 
 	fullname = fdi.indata;
 	snprintf(fullname, fullname_size, "%s%c%s", nsname,
 	    extattr_namespace_separator, ap->a_name);
 
 	err = fdisp_wait_answ(&fdi);
 	if (err == ENOSYS) {
 		fsess_set_notimpl(mp, FUSE_REMOVEXATTR);
 		err = EOPNOTSUPP;
 	}
 
 	fdisp_destroy(&fdi);
 	return (err);
 }
 
 /*
     struct vnop_print_args {
 	struct vnode *a_vp;
     };
 */
 static int
 fuse_vnop_print(struct vop_print_args *ap)
 {
 	struct fuse_vnode_data *fvdat = VTOFUD(ap->a_vp);
 
 	printf("nodeid: %ju, parent nodeid: %ju, nlookup: %ju, flag: %#x\n",
 	    (uintmax_t)VTOILLU(ap->a_vp), (uintmax_t)fvdat->parent_nid,
 	    (uintmax_t)fvdat->nlookup,
 	    fvdat->flag);
 
 	return 0;
 }
 	
 /*
  * Get an NFS filehandle for a FUSE file.
  *
  * This will only work for FUSE file systems that guarantee the uniqueness of
  * nodeid:generation, which most don't.
  *
  * Returns EOPNOTSUPP unless the mount has FSESS_EXPORT_SUPPORT set, the
  * error from fuse_internal_getattr() if that call fails, and EOVERFLOW if
  * the vnode's generation number does not fit in 32 bits.  In the EOVERFLOW
  * case the len and nid fields of the file handle have already been written.
  *
  *  vop_vptofh {
  *	IN struct vnode *a_vp;
  *	IN struct fid *a_fhp;
  *  };
  */
 static int
 fuse_vnop_vptofh(struct vop_vptofh_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct fuse_vnode_data *fvdat = VTOFUD(vp);
 	struct fuse_fid *fhp = (struct fuse_fid *)(ap->a_fhp);
 	_Static_assert(sizeof(struct fuse_fid) <= sizeof(struct fid),
 		"FUSE fid type is too big");
 	struct mount *mp = vnode_mount(vp);
 	struct fuse_data *data = fuse_get_mpdata(mp);
 	struct vattr va;
 	int err;
 
 	if ((data->dataflags & FSESS_EXPORT_SUPPORT) == 0)
 		return EOPNOTSUPP;
 
 	err = fuse_internal_getattr(vp, &va, curthread->td_ucred, curthread);
 	if (err != 0)
 		return err;
 
 	fhp->len = sizeof(struct fuse_fid);
 	fhp->nid = fvdat->nid;
 	/* The file handle only has room for a 32-bit generation number. */
 	if (fvdat->generation > UINT32_MAX)
 		return EOVERFLOW;
 	fhp->gen = fvdat->generation;
 
 	return (0);
 }
 
 
Index: projects/fuse2/sys/kern/kern_event.c
===================================================================
--- projects/fuse2/sys/kern/kern_event.c	(revision 350434)
+++ projects/fuse2/sys/kern/kern_event.c	(revision 350435)
@@ -1,2740 +1,2741 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
  * Copyright 2004 John-Mark Gurney <jmg@FreeBSD.org>
  * Copyright (c) 2009 Apple, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 #include "opt_kqueue.h"
 
 #ifdef COMPAT_FREEBSD11
 #define	_WANT_FREEBSD11_KEVENT
 #endif
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/rwlock.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/unistd.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fcntl.h>
 #include <sys/kthread.h>
 #include <sys/selinfo.h>
 #include <sys/queue.h>
 #include <sys/event.h>
 #include <sys/eventvar.h>
 #include <sys/poll.h>
 #include <sys/protosw.h>
 #include <sys/resourcevar.h>
 #include <sys/sigio.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/syscallsubr.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <sys/user.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #include <machine/atomic.h>
 
 #include <vm/uma.h>
 
 static MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
 
 /*
  * This lock is used if multiple kq locks are required.  This possibly
  * should be made into a per proc lock.
  */
 static struct mtx	kq_global;
 MTX_SYSINIT(kq_global, &kq_global, "kqueue order", MTX_DEF);
 /*
  * Acquire the mutex 'lck' unless 'haslck' says it is already held, and
  * record ownership in 'haslck'.  'haslck' must be a modifiable lvalue.
  */
 #define KQ_GLOBAL_LOCK(lck, haslck)	do {	\
 	if (!haslck)				\
 		mtx_lock(lck);			\
 	haslck = 1;				\
 } while (0)
 /*
  * Release the mutex 'lck' if 'haslck' says it is held, and clear 'haslck'.
  */
 #define KQ_GLOBAL_UNLOCK(lck, haslck)	do {	\
 	if (haslck)				\
 		mtx_unlock(lck);			\
 	haslck = 0;				\
 } while (0)
 
 /* Task queue on which kqueue_task() work is run -- presumably; confirm at the enqueue site. */
 TASKQUEUE_DEFINE_THREAD(kqueue_ctx);
 
 static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
 static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
 static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
 		    struct thread *td, int mflag);
 static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
 static void	kqueue_release(struct kqueue *kq, int locked);
 static void	kqueue_destroy(struct kqueue *kq);
 static void	kqueue_drain(struct kqueue *kq, struct thread *td);
 static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
 		    uintptr_t ident, int mflag);
 static void	kqueue_task(void *arg, int pending);
 static int	kqueue_scan(struct kqueue *kq, int maxevents,
 		    struct kevent_copyops *k_ops,
 		    const struct timespec *timeout,
 		    struct kevent *keva, struct thread *td);
 static void 	kqueue_wakeup(struct kqueue *kq);
 static struct filterops *kqueue_fo_find(int filt);
 static void	kqueue_fo_release(int filt);
 struct g_kevent_args;
 static int	kern_kevent_generic(struct thread *td,
 		    struct g_kevent_args *uap,
 		    struct kevent_copyops *k_ops, const char *struct_name);
 
 static fo_ioctl_t	kqueue_ioctl;
 static fo_poll_t	kqueue_poll;
 static fo_kqfilter_t	kqueue_kqfilter;
 static fo_stat_t	kqueue_stat;
 static fo_close_t	kqueue_close;
 static fo_fill_kinfo_t	kqueue_fill_kinfo;
 
 /*
  * File operations vector for kqueue descriptors.  Read, write, truncate,
  * chmod, chown and sendfile are routed to the generic invfo_* handlers;
  * the remaining entries are implemented by the kqueue_* functions in
  * this file.
  */
 static struct fileops kqueueops = {
 	.fo_read = invfo_rdwr,
 	.fo_write = invfo_rdwr,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = kqueue_ioctl,
 	.fo_poll = kqueue_poll,
 	.fo_kqfilter = kqueue_kqfilter,
 	.fo_stat = kqueue_stat,
 	.fo_close = kqueue_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = kqueue_fill_kinfo,
 };
 
 static int 	knote_attach(struct knote *kn, struct kqueue *kq);
 static void 	knote_drop(struct knote *kn, struct thread *td);
 static void 	knote_drop_detached(struct knote *kn, struct thread *td);
 static void 	knote_enqueue(struct knote *kn);
 static void 	knote_dequeue(struct knote *kn);
 static void 	knote_init(void);
 static struct 	knote *knote_alloc(int mflag);
 static void 	knote_free(struct knote *kn);
 
 static void	filt_kqdetach(struct knote *kn);
 static int	filt_kqueue(struct knote *kn, long hint);
 static int	filt_procattach(struct knote *kn);
 static void	filt_procdetach(struct knote *kn);
 static int	filt_proc(struct knote *kn, long hint);
 static int	filt_fileattach(struct knote *kn);
 static void	filt_timerexpire(void *knx);
 static int	filt_timerattach(struct knote *kn);
 static void	filt_timerdetach(struct knote *kn);
 static void	filt_timerstart(struct knote *kn, sbintime_t to);
 static void	filt_timertouch(struct knote *kn, struct kevent *kev,
 		    u_long type);
 static int	filt_timervalidate(struct knote *kn, sbintime_t *to);
 static int	filt_timer(struct knote *kn, long hint);
 static int	filt_userattach(struct knote *kn);
 static void	filt_userdetach(struct knote *kn);
 static int	filt_user(struct knote *kn, long hint);
 static void	filt_usertouch(struct knote *kn, struct kevent *kev,
 		    u_long type);
 
 /*
  * Generic filter for descriptor-backed events: attach simply forwards to
  * the file's own fo_kqfilter method (see filt_fileattach()).
  */
 static struct filterops file_filtops = {
 	.f_isfd = 1,
 	.f_attach = filt_fileattach,
 };
 /* Filter installed by kqueue_kqfilter() for EVFILT_READ on a kqueue. */
 static struct filterops kqread_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_kqdetach,
 	.f_event = filt_kqueue,
 };
 /* XXX - move to kern_proc.c?  */
 /* Process filter; the identifier is a pid, not a descriptor. */
 static struct filterops proc_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_procattach,
 	.f_detach = filt_procdetach,
 	.f_event = filt_proc,
 };
 /* Timer filter; f_touch handles re-registration and event delivery. */
 static struct filterops timer_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_timerattach,
 	.f_detach = filt_timerdetach,
 	.f_event = filt_timer,
 	.f_touch = filt_timertouch,
 };
 /* User-triggered filter; f_isfd is left at its zero default. */
 static struct filterops user_filtops = {
 	.f_attach = filt_userattach,
 	.f_detach = filt_userdetach,
 	.f_event = filt_user,
 	.f_touch = filt_usertouch,
 };
 
 /* UMA zone, presumably backing knote_alloc()/knote_free() -- TODO confirm. */
 static uma_zone_t	knote_zone;
 /*
  * Number of timer callouts currently allocated; incremented in
  * filt_timerattach() and decremented in filt_timerdetach().
  */
 static unsigned int	kq_ncallouts = 0;
 /* Limit on kq_ncallouts, adjustable through the kern.kq_calloutmax sysctl. */
 static unsigned int 	kq_calloutmax = 4 * 1024;
 SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
     &kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
 
 /* XXX - ensure not influx ? */
 /*
  * Mark 'kn' active and, unless it is already queued or is disabled, put
  * it on its kqueue's pending list.  If 'islock' is non-zero the caller
  * must already hold the kqueue lock (asserted); otherwise the lock is
  * taken and released here.
  */
 #define KNOTE_ACTIVATE(kn, islock) do { 				\
 	if ((islock))							\
 		mtx_assert(&(kn)->kn_kq->kq_lock, MA_OWNED);		\
 	else								\
 		KQ_LOCK((kn)->kn_kq);					\
 	(kn)->kn_status |= KN_ACTIVE;					\
 	if (((kn)->kn_status & (KN_QUEUED | KN_DISABLED)) == 0)		\
 		knote_enqueue((kn));					\
 	if (!(islock))							\
 		KQ_UNLOCK((kn)->kn_kq);					\
 } while(0)
 /* Acquire the per-kqueue mutex. */
 #define KQ_LOCK(kq) do {						\
 	mtx_lock(&(kq)->kq_lock);					\
 } while (0)
 /* If a thread flagged KQ_FLUXWAIT on 'kq', clear the flag and wake sleepers on 'kq'. */
 #define KQ_FLUX_WAKEUP(kq) do {						\
 	if (((kq)->kq_state & KQ_FLUXWAIT) == KQ_FLUXWAIT) {		\
 		(kq)->kq_state &= ~KQ_FLUXWAIT;				\
 		wakeup((kq));						\
 	}								\
 } while (0)
 /* Wake any flux waiters, then release the per-kqueue mutex. */
 #define KQ_UNLOCK_FLUX(kq) do {						\
 	KQ_FLUX_WAKEUP(kq);						\
 	mtx_unlock(&(kq)->kq_lock);					\
 } while (0)
 /* Release the per-kqueue mutex. */
 #define KQ_UNLOCK(kq) do {						\
 	mtx_unlock(&(kq)->kq_lock);					\
 } while (0)
 /* Assert that the current thread holds the per-kqueue mutex. */
 #define KQ_OWNED(kq) do {						\
 	mtx_assert(&(kq)->kq_lock, MA_OWNED);				\
 } while (0)
 /* Assert that the current thread does not hold the per-kqueue mutex. */
 #define KQ_NOTOWNED(kq) do {						\
 	mtx_assert(&(kq)->kq_lock, MA_NOTOWNED);			\
 } while (0)
 
 /*
  * Lock the knlist that 'kn' is attached to, if any, using the list's own
  * lock callback.  Returns the list (NULL when the knote is on no list) so
  * that the caller can hand it to kn_list_unlock() later.
  */
 static struct knlist *
 kn_list_lock(struct knote *kn)
 {
 	struct knlist *list = kn->kn_knlist;
 
 	if (list == NULL)
 		return (NULL);
 	list->kl_lock(list->kl_lockarg);
 	return (list);
 }
 
 /*
  * Unlock a knlist previously returned by kn_list_lock(); NULL is accepted
  * and ignored.  If the list is marked kl_autodestroy and is empty at the
  * time of the unlock, it is destroyed and its memory released afterwards.
  */
 static void
 kn_list_unlock(struct knlist *knl)
 {
 	bool reap;
 
 	if (knl != NULL) {
 		/* Decide while still locked; act once the lock is dropped. */
 		reap = knl->kl_autodestroy && knlist_empty(knl);
 		knl->kl_unlock(knl->kl_lockarg);
 		if (reap) {
 			knlist_destroy(knl);
 			free(knl, M_KQUEUE);
 		}
 	}
 }
 
 /* Report whether at least one flux reference is held on 'kn'. */
 static bool
 kn_in_flux(struct knote *kn)
 {
 	bool busy;
 
 	busy = kn->kn_influx > 0;
 	return (busy);
 }
 
 /*
  * Take a flux reference on 'kn'.  The kqueue lock must be held, and the
  * counter must not already be at INT_MAX (both asserted).
  */
 static void
 kn_enter_flux(struct knote *kn)
 {
 
 	KQ_OWNED(kn->kn_kq);
 	MPASS(kn->kn_influx < INT_MAX);
 	kn->kn_influx += 1;
 }
 
 /*
  * Drop a flux reference on 'kn'.  The kqueue lock must be held and a
  * reference must be outstanding (both asserted).  Returns true when this
  * was the last reference.
  */
 static bool
 kn_leave_flux(struct knote *kn)
 {
 
 	KQ_OWNED(kn->kn_kq);
 	MPASS(kn->kn_influx > 0);
 	return (--kn->kn_influx == 0);
 }
 
 /*
  * Assert the lock state of a knlist: locked when 'islocked' is non-zero,
  * unlocked otherwise.
  */
 #define	KNL_ASSERT_LOCK(knl, islocked) do {				\
 	if (islocked)							\
 		KNL_ASSERT_LOCKED(knl);				\
 	else								\
 		KNL_ASSERT_UNLOCKED(knl);				\
 } while (0)
 /*
  * With INVARIANTS the checks call the list's own assertion callbacks;
  * without it they expand to empty statements.
  */
 #ifdef INVARIANTS
 #define	KNL_ASSERT_LOCKED(knl) do {					\
 	knl->kl_assert_locked((knl)->kl_lockarg);			\
 } while (0)
 #define	KNL_ASSERT_UNLOCKED(knl) do {					\
 	knl->kl_assert_unlocked((knl)->kl_lockarg);			\
 } while (0)
 #else /* !INVARIANTS */
 #define	KNL_ASSERT_LOCKED(knl) do {} while(0)
 #define	KNL_ASSERT_UNLOCKED(knl) do {} while (0)
 #endif /* INVARIANTS */
 
 #ifndef	KN_HASHSIZE
 #define	KN_HASHSIZE		64		/* XXX should be tunable */
 #endif
 
 /*
  * Hash 'val' into a bucket index: fold bits 8 and up onto the low bits
  * and apply 'mask'.  Both uses of 'val' are parenthesized so that an
  * expression argument (e.g. "a | b") is shifted as a whole.
  */
 #define KN_HASH(val, mask)	(((val) ^ ((val) >> 8)) & (mask))
 
 /*
  * Attach routine for filters that have no implementation registered:
  * always refuses the attach with ENXIO.
  */
 static int
 filt_nullattach(struct knote *kn)
 {
 
 	return (ENXIO);
 }
 
 /*
  * Placeholder filterops used for table slots without a real filter; its
  * attach routine always fails (see filt_nullattach()).
  */
 struct filterops null_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_nullattach,
 };
 
 /* XXX - make SYSINIT to add these, and move into respective modules. */
 extern struct filterops sig_filtops;
 extern struct filterops fs_filtops;
 
 /*
  * Table for all system-defined filters.
  */
 /* Protects updates to sysfilt_ops[]. */
 static struct mtx	filterops_lock;
 MTX_SYSINIT(kqueue_filterops, &filterops_lock, "protect sysfilt_ops",
 	MTX_DEF);
 /*
  * One slot per system filter.  Filter numbers are negative, and slots are
  * addressed as sysfilt_ops[~filt] (see kqueue_add_filteropts()), so the
  * entries below appear in EVFILT_* order starting at index 0.
  *
  * The second initializer sets for_nolock; entries that omit it leave it
  * zero.  NOTE(review): for_nolock presumably marks built-in entries that
  * need no reference counting -- confirm against kqueue_fo_find().
  */
 static struct {
 	struct filterops *for_fop;
 	int for_nolock;
 	int for_refcnt;
 } sysfilt_ops[EVFILT_SYSCOUNT] = {
 	{ &file_filtops, 1 },			/* EVFILT_READ */
 	{ &file_filtops, 1 },			/* EVFILT_WRITE */
 	{ &null_filtops },			/* EVFILT_AIO */
 	{ &file_filtops, 1 },			/* EVFILT_VNODE */
 	{ &proc_filtops, 1 },			/* EVFILT_PROC */
 	{ &sig_filtops, 1 },			/* EVFILT_SIGNAL */
 	{ &timer_filtops, 1 },			/* EVFILT_TIMER */
 	{ &file_filtops, 1 },			/* EVFILT_PROCDESC */
 	{ &fs_filtops, 1 },			/* EVFILT_FS */
 	{ &null_filtops },			/* EVFILT_LIO */
 	{ &user_filtops, 1 },			/* EVFILT_USER */
 	{ &null_filtops },			/* EVFILT_SENDFILE */
 	{ &file_filtops, 1 },                   /* EVFILT_EMPTY */
 };
 
 /*
  * Simple redirection for all cdevsw style objects to call their fo_kqfilter
  * method.
  */
 static int
 filt_fileattach(struct knote *kn)
 {
 
 	return (fo_kqfilter(kn->kn_fp, kn));
 }
 
 /*ARGSUSED*/
 /*
  * kqfilter method for kqueue descriptors.  Only EVFILT_READ is accepted
  * (EINVAL otherwise); the knote is flagged KN_KQUEUE, switched to
  * kqread_filtops and added to the watched kqueue's note list.
  */
 static int
 kqueue_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct kqueue *watched = kn->kn_fp->f_data;
 
 	if (kn->kn_filter != EVFILT_READ)
 		return (EINVAL);
 
 	kn->kn_fop = &kqread_filtops;
 	kn->kn_status |= KN_KQUEUE;
 	knlist_add(&watched->kq_sel.si_note, kn, 0);
 	return (0);
 }
 
 /* Remove 'kn' from the note list of the kqueue it was watching. */
 static void
 filt_kqdetach(struct knote *kn)
 {
 	struct kqueue *watched;
 
 	watched = kn->kn_fp->f_data;
 	knlist_remove(&watched->kq_sel.si_note, kn, 0);
 }
 
 /*ARGSUSED*/
 /*
  * Event routine for a kqueue watched through EVFILT_READ: store the
  * watched queue's kq_count in kn_data and report the knote as ready when
  * that value is positive.  'hint' is unused.
  */
 static int
 filt_kqueue(struct knote *kn, long hint)
 {
 	struct kqueue *watched;
 
 	watched = kn->kn_fp->f_data;
 	kn->kn_data = watched->kq_count;
 	if (kn->kn_data > 0)
 		return (1);
 	return (0);
 }
 
 /* XXX - move to kern_proc.c?  */
 /*
  * Attach a knote to the process whose pid is kn->kn_id.
  *
  * Returns ESRCH if no such process is found, the error from p_cansee()
  * if the current thread may not see it, and 0 on success.  The process
  * returned by pfind()/pfind_any() is unlocked with PROC_UNLOCK() on every
  * path after the lookup succeeds.
  */
 static int
 filt_procattach(struct knote *kn)
 {
 	struct proc *p;
 	int error;
 	bool exiting, immediate;
 
 	exiting = immediate = false;
 	/* Only NOTE_EXIT watchers use pfind_any() for the lookup. */
 	if (kn->kn_sfflags & NOTE_EXIT)
 		p = pfind_any(kn->kn_id);
 	else
 		p = pfind(kn->kn_id);
 	if (p == NULL)
 		return (ESRCH);
 	if (p->p_flag & P_WEXIT)
 		exiting = true;
 
 	if ((error = p_cansee(curthread, p))) {
 		PROC_UNLOCK(p);
 		return (error);
 	}
 
 	kn->kn_ptr.p_proc = p;
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
 
 	/*
 	 * Internal flag indicating registration done by kernel for the
 	 * purposes of getting a NOTE_CHILD notification.
 	 */
 	if (kn->kn_flags & EV_FLAG2) {
 		kn->kn_flags &= ~EV_FLAG2;
 		kn->kn_data = kn->kn_sdata;		/* ppid */
 		kn->kn_fflags = NOTE_CHILD;
 		kn->kn_sfflags &= ~(NOTE_EXIT | NOTE_EXEC | NOTE_FORK);
 		immediate = true; /* Force immediate activation of child note. */
 	}
 	/*
 	 * Internal flag indicating registration done by kernel (for other than
 	 * NOTE_CHILD).
 	 */
 	if (kn->kn_flags & EV_FLAG1) {
 		kn->kn_flags &= ~EV_FLAG1;
 	}
 
 	/* Third argument 1: the list is already locked (the process lock is held here). */
 	knlist_add(p->p_klist, kn, 1);
 
 	/*
 	 * Immediately activate any child notes or, in the case of a zombie
 	 * target process, exit notes.  The latter is necessary to handle the
 	 * case where the target process, e.g. a child, dies before the kevent
 	 * is registered.
 	 */
 	if (immediate || (exiting && filt_proc(kn, NOTE_EXIT)))
 		KNOTE_ACTIVATE(kn, 0);
 
 	PROC_UNLOCK(p);
 
 	return (0);
 }
 
 /*
  * The knote may be attached to a different process, which may exit,
  * leaving nothing for the knote to be attached to.  So when the process
  * exits, the knote is marked as DETACHED and also flagged as ONESHOT so
  * it will be deleted when read out.  However, as part of the knote deletion,
  * this routine is called, so a check is needed to avoid actually performing
  * a detach, because the original process does not exist any more.
  */
 /* XXX - move to kern_proc.c?  */
 /*
  * Detach a process knote: remove it from the knlist it is recorded on
  * (kn->kn_knlist) and clear its process pointer.
  * NOTE(review): no explicit "already detached" check is visible here;
  * presumably knlist_remove() copes with that case -- confirm.
  */
 static void
 filt_procdetach(struct knote *kn)
 {
 
 	knlist_remove(kn->kn_knlist, kn, 0);
 	kn->kn_ptr.p_proc = NULL;
 }
 
 /* XXX - move to kern_proc.c?  */
 /*
  * Event routine for process knotes.  'hint' carries the NOTE_* control
  * event, possibly with extra data in the bits outside NOTE_PCTRLMASK.
  *
  * Events the user asked for (kn_sfflags) are accumulated in kn_fflags.
  * NOTE_EXIT always fires the knote: it is marked EV_EOF | EV_ONESHOT, the
  * process pointer is cleared, the exit status is stored in kn_data when
  * the user requested NOTE_EXIT, and EV_DROP is set when nothing was
  * recorded.  For any other event the knote fires only if some requested
  * event has been recorded.
  */
 static int
 filt_proc(struct knote *kn, long hint)
 {
 	struct proc *target;
 	u_int what;
 
 	target = kn->kn_ptr.p_proc;
 	if (target == NULL) /* already activated, from attach filter */
 		return (0);
 
 	/* Keep only the control bits of the hint. */
 	what = (u_int)hint & NOTE_PCTRLMASK;
 
 	/* Remember the event if the user subscribed to it. */
 	if ((kn->kn_sfflags & what) != 0)
 		kn->kn_fflags |= what;
 
 	if (what != NOTE_EXIT)
 		return (kn->kn_fflags != 0);
 
 	/* The process is going away: this is the knote's final event. */
 	kn->kn_flags |= EV_EOF | EV_ONESHOT;
 	kn->kn_ptr.p_proc = NULL;
 	if ((kn->kn_fflags & NOTE_EXIT) != 0)
 		kn->kn_data = KW_EXITCODE(target->p_xexit, target->p_xsig);
 	if (kn->kn_fflags == 0)
 		kn->kn_flags |= EV_DROP;
 	return (1);
 }
 
 /*
  * Called when the process forked. It mostly does the same as the
  * knote(), activating all knotes registered to be activated when the
  * process forked. Additionally, for each knote attached to the
  * parent, check whether user wants to track the new process. If so
  * attach a new knote to it, and immediately report an event with the
  * child's pid.
  */
 /*
  * list: knlist of the forking process; must be non-NULL and locked on
  *       entry (both asserted).  It is locked again on return, but is
  *       temporarily unlocked for each NOTE_TRACK knote.
  * pid:  pid of the new child.
  */
 void
 knote_fork(struct knlist *list, int pid)
 {
 	struct kqueue *kq;
 	struct knote *kn;
 	struct kevent kev;
 	int error;
 
 	MPASS(list != NULL);
 	KNL_ASSERT_LOCKED(list);
 	if (SLIST_EMPTY(&list->kl_list))
 		return;
 
 	memset(&kev, 0, sizeof(kev));
 	SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		/* Skip knotes that are in flux, unless the flux is due to a scan. */
 		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
 			KQ_UNLOCK(kq);
 			continue;
 		}
 
 		/*
 		 * The same as knote(), activate the event.
 		 */
 		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
 			if (kn->kn_fop->f_event(kn, NOTE_FORK))
 				KNOTE_ACTIVATE(kn, 1);
 			KQ_UNLOCK(kq);
 			continue;
 		}
 
 		/*
 		 * The NOTE_TRACK case. In addition to the activation
 		 * of the event, we need to register new events to
 		 * track the child. Drop the locks in preparation for
 		 * the call to kqueue_register().
 		 */
 		kn_enter_flux(kn);
 		KQ_UNLOCK(kq);
 		list->kl_unlock(list->kl_lockarg);
 
 		/*
 		 * Activate existing knote and register tracking knotes with
 		 * new process.
 		 *
 		 * First register a knote to get just the child notice. This
 		 * must be a separate note from a potential NOTE_EXIT
 		 * notification since both NOTE_CHILD and NOTE_EXIT are defined
 		 * to use the data field (in conflicting ways).
 		 */
 		kev.ident = pid;
 		kev.filter = kn->kn_filter;
 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT |
 		    EV_FLAG2;
 		kev.fflags = kn->kn_sfflags;
 		kev.data = kn->kn_id;		/* parent */
 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
 		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
 		if (error)
 			kn->kn_fflags |= NOTE_TRACKERR;
 
 		/*
 		 * Then register another knote to track other potential events
 		 * from the new process.
 		 */
 		kev.ident = pid;
 		kev.filter = kn->kn_filter;
 		kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
 		kev.fflags = kn->kn_sfflags;
 		kev.data = kn->kn_id;		/* parent */
 		kev.udata = kn->kn_kevent.udata;/* preserve udata */
 		error = kqueue_register(kq, &kev, NULL, M_NOWAIT);
 		if (error)
 			kn->kn_fflags |= NOTE_TRACKERR;
 		if (kn->kn_fop->f_event(kn, NOTE_FORK))
 			KNOTE_ACTIVATE(kn, 0);
 		/* Retake the list lock before the kq lock, then leave flux. */
 		list->kl_lock(list->kl_lockarg);
 		KQ_LOCK(kq);
 		kn_leave_flux(kn);
 		KQ_UNLOCK_FLUX(kq);
 	}
 }
 
 /*
  * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
  * interval timer support code.
  */
 
 #define NOTE_TIMER_PRECMASK						\
     (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
 
 /*
  * Convert a timer value 'data', expressed in the unit selected by the
  * NOTE_*SECONDS bits of 'flags' (milliseconds when none is set), to an
  * sbintime_t.
  *
  * Values of one second or more are split into whole seconds and a
  * sub-second remainder so that the fractional conversion cannot overflow.
  * On LP64 platforms a seconds count that would not fit is clamped to
  * SBT_MAX.  Returns -1 if 'flags' selects an unknown unit.
  */
 static sbintime_t
 timer2sbintime(intptr_t data, int flags)
 {
 	int64_t secs;
 
 	/*
 	 * Macros for converting to the fractional second portion of an
 	 * sbintime_t using 64bit multiplication to improve precision.
 	 */
 #define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
 #define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
 #define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
 	switch (flags & NOTE_TIMER_PRECMASK) {
 	case NOTE_SECONDS:
 #ifdef __LP64__
 		if (data > (SBT_MAX / SBT_1S))
 			return (SBT_MAX);
 #endif
 		return ((sbintime_t)data << 32);
 	case NOTE_MSECONDS: /* FALLTHROUGH */
 	case 0:
 		if (data >= 1000) {
 			secs = data / 1000;
 #ifdef __LP64__
 			if (secs > (SBT_MAX / SBT_1S))
 				return (SBT_MAX);
 #endif
 			return (secs << 32 | MS_TO_SBT(data % 1000));
 		}
 		return (MS_TO_SBT(data));
 	case NOTE_USECONDS:
 		if (data >= 1000000) {
 			secs = data / 1000000;
 #ifdef __LP64__
 			if (secs > (SBT_MAX / SBT_1S))
 				return (SBT_MAX);
 #endif
 			return (secs << 32 | US_TO_SBT(data % 1000000));
 		}
 		return (US_TO_SBT(data));
 	case NOTE_NSECONDS:
 		if (data >= 1000000000) {
 			secs = data / 1000000000;
 #ifdef __LP64__
 			if (secs > (SBT_MAX / SBT_1S))
 				return (SBT_MAX);
 #endif
 			/* The remainder is in nanoseconds, not microseconds. */
 			return (secs << 32 | NS_TO_SBT(data % 1000000000));
 		}
 		return (NS_TO_SBT(data));
 	default:
 		break;
 	}
 	return (-1);
 }
 
 /*
  * Per-timer state hung off kn->kn_ptr.p_v for EVFILT_TIMER knotes;
  * allocated in filt_timerattach() and freed in filt_timerdetach().
  */
 struct kq_timer_cb_data {
 	struct callout c;	/* callout that runs filt_timerexpire() */
 	sbintime_t next;	/* next timer event fires at */
 	sbintime_t to;		/* precalculated timer period, 0 for abs */
 };
 
 /*
  * Callout handler for EVFILT_TIMER; 'knx' is the knote.  Counts one
  * expiration in kn_data, activates the knote, and re-arms the callout for
  * the next period unless the timer is EV_ONESHOT or has no period
  * (kc->to == 0, which filt_timerstart() sets for NOTE_ABSTIME timers).
  */
 static void
 filt_timerexpire(void *knx)
 {
 	struct knote *kn;
 	struct kq_timer_cb_data *kc;
 
 	kn = knx;
 	kn->kn_data++;
 	KNOTE_ACTIVATE(kn, 0);	/* XXX - handle locking */
 
 	if ((kn->kn_flags & EV_ONESHOT) != 0)
 		return;
 	kc = kn->kn_ptr.p_v;
 	if (kc->to == 0)
 		return;
 	/* Advance from the previous deadline rather than from "now". */
 	kc->next += kc->to;
 	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
 	    PCPU_GET(cpuid), C_ABSOLUTE);
 }
 
 /*
  * data contains amount of time to sleep
  */
 /*
  * Validate the timer parameters stored in 'kn' (kn_sdata, kn_sfflags) and
  * compute the timeout into *to.
  *
  * A negative kn_sdata, fflags outside the unit bits and NOTE_ABSTIME, or
  * a resulting timeout below zero yield EINVAL.  A zero kn_sdata on a
  * timer that is not EV_ONESHOT is bumped to 1.  For NOTE_ABSTIME the
  * boot time is subtracted from the converted value.  Returns 0 on
  * success.
  */
 static int
 filt_timervalidate(struct knote *kn, sbintime_t *to)
 {
 	struct bintime boot;
 	sbintime_t when;
 
 	if (kn->kn_sdata < 0)
 		return (EINVAL);
 	if ((kn->kn_flags & EV_ONESHOT) == 0 && kn->kn_sdata == 0)
 		kn->kn_sdata = 1;
 
 	/* Only the precision bits and NOTE_ABSTIME may be set. */
 	if ((kn->kn_sfflags & ~(NOTE_TIMER_PRECMASK | NOTE_ABSTIME)) != 0)
 		return (EINVAL);
 
 	when = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
 	if ((kn->kn_sfflags & NOTE_ABSTIME) != 0) {
 		getboottimebin(&boot);
 		when -= bttosbt(boot);
 	}
 	*to = when;
 	return (when < 0 ? EINVAL : 0);
 }
 
 /*
  * Attach routine for EVFILT_TIMER.  Validates the request, reserves one
  * slot in the global callout budget, allocates the per-timer state and
  * starts the timer.
  *
  * Returns the error from filt_timervalidate(), ENOMEM when kq_calloutmax
  * callouts are already allocated, and 0 on success.
  */
 static int
 filt_timerattach(struct knote *kn)
 {
 	struct kq_timer_cb_data *kc;
 	sbintime_t to;
 	unsigned int ncallouts;
 	int error;
 
 	error = filt_timervalidate(kn, &to);
 	if (error != 0)
 		return (error);
 
 	/* Compare-and-set loop: bump kq_ncallouts only while below the limit. */
 	do {
 		ncallouts = kq_ncallouts;
 		if (ncallouts >= kq_calloutmax)
 			return (ENOMEM);
 	} while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1));
 
 	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0)
 		kn->kn_flags |= EV_CLEAR;	/* automatically set */
 	kn->kn_status &= ~KN_DETACHED;		/* knlist_add clears it */
 	kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
 	callout_init(&kc->c, 1);
 	filt_timerstart(kn, to);
 
 	return (0);
 }
 
 /*
  * Arm the callout of timer knote 'kn'.  For NOTE_ABSTIME timers 'to' is
  * used as the deadline and no period is recorded; otherwise the deadline
  * is 'to' from the current uptime and 'to' is kept as the period.
  */
 static void
 filt_timerstart(struct knote *kn, sbintime_t to)
 {
 	struct kq_timer_cb_data *kc = kn->kn_ptr.p_v;
 
 	if ((kn->kn_sfflags & NOTE_ABSTIME) == 0) {
 		kc->next = to + sbinuptime();
 		kc->to = to;
 	} else {
 		kc->next = to;
 		kc->to = 0;
 	}
 	callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
 	    PCPU_GET(cpuid), C_ABSOLUTE);
 }
 
 /*
  * Detach routine for EVFILT_TIMER: wait for the callout to finish, free
  * the per-timer state and give back one slot of the callout budget.
  */
 static void
 filt_timerdetach(struct knote *kn)
 {
 	struct kq_timer_cb_data *kc;
 	unsigned int old __unused;
 
 	kc = kn->kn_ptr.p_v;
 	callout_drain(&kc->c);
 	free(kc, M_KQUEUE);
 	/* 'old' is the pre-decrement count; it is only consumed by the KASSERT. */
 	old = atomic_fetchadd_int(&kq_ncallouts, -1);
 	KASSERT(old > 0, ("Number of callouts cannot become negative"));
 	kn->kn_status |= KN_DETACHED;	/* knlist_remove sets it */
 }
 
 /*
  * f_touch routine for EVFILT_TIMER.
  *
  * EVENT_REGISTER with EV_ADD: stop the running timer, discard any pending
  * expiration, take the new data/fflags from 'kev' and restart.  If the
  * new parameters are invalid the knote is flagged EV_ERROR with the error
  * code in kn_data and the timer is not restarted.
  *
  * EVENT_PROCESS: copy the knote's kevent out to 'kev' and, for EV_CLEAR
  * knotes, reset kn_data and kn_fflags.
  *
  * Any other 'type' panics.
  */
 static void
 filt_timertouch(struct knote *kn, struct kevent *kev, u_long type)
 {
 	struct kq_timer_cb_data *kc;	
 	struct kqueue *kq;
 	sbintime_t to;
 	int error;
 
 	switch (type) {
 	case EVENT_REGISTER:
 		/* Handle re-added timers that update data/fflags */
 		if (kev->flags & EV_ADD) {
 			kc = kn->kn_ptr.p_v;
 
 			/* Drain any existing callout. */
 			callout_drain(&kc->c);
 
 			/* Throw away any existing undelivered record
 			 * of the timer expiration. This is done under
 			 * the presumption that if a process is
 			 * re-adding this timer with new parameters,
 			 * it is no longer interested in what may have
 			 * happened under the old parameters. If it is
 			 * interested, it can wait for the expiration,
 			 * delete the old timer definition, and then
 			 * add the new one.
 			 *
 			 * This has to be done while the kq is locked:
 			 *   - if enqueued, dequeue
 			 *   - make it no longer active
 			 *   - clear the count of expiration events
 			 */
 			kq = kn->kn_kq;
 			KQ_LOCK(kq);
 			if (kn->kn_status & KN_QUEUED)
 				knote_dequeue(kn);
 
 			kn->kn_status &= ~KN_ACTIVE;
 			kn->kn_data = 0;
 			KQ_UNLOCK(kq);
 			
 			/* Reschedule timer based on new data/fflags */
 			kn->kn_sfflags = kev->fflags;
 			kn->kn_sdata = kev->data;
 			error = filt_timervalidate(kn, &to);
 			if (error != 0) {
 			  	kn->kn_flags |= EV_ERROR;
 				kn->kn_data = error;
 			} else
 			  	filt_timerstart(kn, to);
 		}
 		break;
 
         case EVENT_PROCESS:
 		*kev = kn->kn_kevent;
 		if (kn->kn_flags & EV_CLEAR) {
 			kn->kn_data = 0;
 			kn->kn_fflags = 0;
 		}
 		break;
 
 	default:
 		panic("filt_timertouch() - invalid type (%ld)", type);
 		break;
 	}
 }
 
 /*
  * Event routine for EVFILT_TIMER: the knote is ready whenever its
  * expiration counter kn_data is non-zero.  'hint' is unused.
  */
 static int
 filt_timer(struct knote *kn, long hint)
 {
 
 	if (kn->kn_data == 0)
 		return (0);
 	return (1);
 }
 
 /*
  * Attach routine for EVFILT_USER.  Nothing in the kernel is hooked up;
  * the knote's hook pointer is cleared and kn_hookid records whether the
  * event was registered already triggered (NOTE_TRIGGER).  Always
  * returns 0.
  */
 static int
 filt_userattach(struct knote *kn)
 {
 
 	kn->kn_hook = NULL;
 	kn->kn_hookid = (kn->kn_fflags & NOTE_TRIGGER) ? 1 : 0;
 	return (0);
 }
 
 /* Detach routine for EVFILT_USER; intentionally empty. */
 static void
 filt_userdetach(__unused struct knote *kn)
 {
 
 	/*
 	 * EVFILT_USER knotes are not attached to anything in the kernel.
 	 */
 }
 
 /*
  * Event routine for EVFILT_USER: returns kn_hookid, which is set to 1 by
  * NOTE_TRIGGER and reset to 0 by EV_CLEAR handling (see filt_userattach()
  * and filt_usertouch()).
  */
 static int
 filt_user(struct knote *kn, __unused long hint)
 {
 
 	return (kn->kn_hookid);
 }
 
 /*
  * f_touch routine for EVFILT_USER.
  *
  * EVENT_REGISTER: NOTE_TRIGGER in kev->fflags marks the knote triggered.
  * The control bits (NOTE_FFCTRLMASK) select how the remaining flag bits
  * of kev->fflags are combined into kn_sfflags (no-op, AND, OR or copy);
  * note that kev->fflags itself is masked down to NOTE_FFLAGSMASK in the
  * process.  kev->data is stored in kn_sdata, and EV_CLEAR in kev->flags
  * resets the triggered state.
  *
  * EVENT_PROCESS: copy the knote's kevent to 'kev', overriding fflags and
  * data with the saved kn_sfflags/kn_sdata, and reset the triggered state
  * for EV_CLEAR knotes.
  *
  * Any other 'type' panics.
  */
 static void
 filt_usertouch(struct knote *kn, struct kevent *kev, u_long type)
 {
 	u_int ffctrl;
 
 	switch (type) {
 	case EVENT_REGISTER:
 		if (kev->fflags & NOTE_TRIGGER)
 			kn->kn_hookid = 1;
 
 		ffctrl = kev->fflags & NOTE_FFCTRLMASK;
 		kev->fflags &= NOTE_FFLAGSMASK;
 		switch (ffctrl) {
 		case NOTE_FFNOP:
 			break;
 
 		case NOTE_FFAND:
 			kn->kn_sfflags &= kev->fflags;
 			break;
 
 		case NOTE_FFOR:
 			kn->kn_sfflags |= kev->fflags;
 			break;
 
 		case NOTE_FFCOPY:
 			kn->kn_sfflags = kev->fflags;
 			break;
 
 		default:
 			/* XXX Return error? */
 			break;
 		}
 		kn->kn_sdata = kev->data;
 		if (kev->flags & EV_CLEAR) {
 			kn->kn_hookid = 0;
 			kn->kn_data = 0;
 			kn->kn_fflags = 0;
 		}
 		break;
 
         case EVENT_PROCESS:
 		*kev = kn->kn_kevent;
 		kev->fflags = kn->kn_sfflags;
 		kev->data = kn->kn_sdata;
 		if (kn->kn_flags & EV_CLEAR) {
 			kn->kn_hookid = 0;
 			kn->kn_data = 0;
 			kn->kn_fflags = 0;
 		}
 		break;
 
 	default:
 		panic("filt_usertouch() - invalid type (%ld)", type);
 		break;
 	}
 }
 
 /*
  * kqueue(2) system call entry point: create a kqueue with no extra
  * descriptor flags and no capability restrictions.  'uap' is unused.
  */
 int
 sys_kqueue(struct thread *td, struct kqueue_args *uap)
 {
 
 	return (kern_kqueue(td, 0, NULL));
 }
 
 /*
  * Initialize the lock, pending-event queue, note list and task of 'kq'.
  * Other fields are left untouched; the callers in this file pass zeroed
  * storage.
  */
 static void
 kqueue_init(struct kqueue *kq)
 {
 
 	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
 	TAILQ_INIT(&kq->kq_head);
 	/* The kqueue's own note list shares the kqueue mutex. */
 	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
 	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
 }
 
 /*
  * Create a new kqueue for the calling thread's process and return its
  * descriptor in td->td_retval[0].
  *
  * flags: passed through to falloc_caps().
  * fcaps: capability rights for the new descriptor, or NULL.
  *
  * Returns ENOMEM when the per-uid kqueue count would exceed
  * RLIMIT_KQUEUES, the error from falloc_caps() if the descriptor cannot
  * be allocated (the count is given back in that case), and 0 on success.
  */
 int
 kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
 {
 	struct filedesc *fdp;
 	struct kqueue *kq;
 	struct file *fp;
 	struct ucred *cred;
 	int fd, error;
 
 	fdp = td->td_proc->p_fd;
 	cred = td->td_ucred;
 	if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td, RLIMIT_KQUEUES)))
 		return (ENOMEM);
 
 	error = falloc_caps(td, &fp, &fd, flags, fcaps);
 	if (error != 0) {
 		chgkqcnt(cred->cr_ruidinfo, -1, 0);
 		return (error);
 	}
 
 	/* An extra reference on `fp' has been held for us by falloc(). */
 	kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
 	kqueue_init(kq);
 	kq->kq_fdp = fdp;
 	kq->kq_cred = crhold(cred);
 
 	/* Link the kqueue onto the descriptor table's kqueue list. */
 	FILEDESC_XLOCK(fdp);
 	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
 	FILEDESC_XUNLOCK(fdp);
 
 	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
 	fdrop(fp, td);
 
 	td->td_retval[0] = fd;
 	return (0);
 }
 
 /*
  * ABI-neutral copy of the kevent system call arguments, filled in by
  * sys_kevent() and freebsd11_kevent() for kern_kevent_generic().  The
  * list pointers are untyped because the element layout depends on the
  * ABI.
  */
 struct g_kevent_args {
 	int	fd;
 	void	*changelist;
 	int	nchanges;
 	void	*eventlist;
 	int	nevents;
 	const struct timespec *timeout;
 };
 
 /*
  * kevent(2) system call entry point for the native struct kevent layout.
  * Packages the user arguments and the native copy-in/copy-out callbacks
  * and defers to kern_kevent_generic().
  */
 int
 sys_kevent(struct thread *td, struct kevent_args *uap)
 {
 	/* 'uap' is the callback argument: the copy routines advance its list pointers. */
 	struct kevent_copyops k_ops = {
 		.arg = uap,
 		.k_copyout = kevent_copyout,
 		.k_copyin = kevent_copyin,
 		.kevent_size = sizeof(struct kevent),
 	};
 	struct g_kevent_args gk_args = {
 		.fd = uap->fd,
 		.changelist = uap->changelist,
 		.nchanges = uap->nchanges,
 		.eventlist = uap->eventlist,
 		.nevents = uap->nevents,
 		.timeout = uap->timeout,
 	};
 
 	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent"));
 }
 
 /*
  * Common body of the kevent system calls.  Copies in the optional
  * timeout, runs kern_kevent(), and, when KTRACE is compiled in and
  * enabled for the thread, records the change list before the call and the
  * returned events after a successful one.
  *
  * struct_name labels the ktrace records.  Returns the copyin() error for
  * a bad timeout pointer, otherwise the result of kern_kevent().
  */
 static int
 kern_kevent_generic(struct thread *td, struct g_kevent_args *uap,
     struct kevent_copyops *k_ops, const char *struct_name)
 {
 	struct timespec ts, *tsp;
 #ifdef KTRACE
 	struct kevent *eventlist = uap->eventlist;
 #endif
 	int error;
 
 	tsp = NULL;
 	if (uap->timeout != NULL) {
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error != 0)
 			return (error);
 		tsp = &ts;
 	}
 
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray(struct_name, UIO_USERSPACE, uap->changelist,
 		    uap->nchanges, k_ops->kevent_size);
 #endif
 
 	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
 	    k_ops, tsp);
 
 #ifdef KTRACE
 	if (error == 0 && KTRPOINT(td, KTR_STRUCT_ARRAY))
 		ktrstructarray(struct_name, UIO_USERSPACE, eventlist,
 		    td->td_retval[0], k_ops->kevent_size);
 #endif
 
 	return (error);
 }
 
 /*
  * Copy 'count' items into the destination list pointed to by uap->eventlist.
  */
 /*
  * Native copy-out callback: write 'count' events from 'kevp' to the user
  * event list referenced by 'arg' (a struct kevent_args) and, on success,
  * advance that list pointer past them.  Returns the copyout() result.
  */
 static int
 kevent_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct kevent_args *uap = arg;
 	int error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 
 	error = copyout(kevp, uap->eventlist, count * sizeof *kevp);
 	if (error != 0)
 		return (error);
 	uap->eventlist += count;
 	return (0);
 }
 
 /*
  * Copy 'count' items from the list pointed to by uap->changelist.
  */
 /*
  * Native copy-in callback: read 'count' changes from the user change list
  * referenced by 'arg' (a struct kevent_args) into 'kevp' and, on success,
  * advance that list pointer past them.  Returns the copyin() result.
  */
 static int
 kevent_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct kevent_args *uap = arg;
 	int error;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 
 	error = copyin(uap->changelist, kevp, count * sizeof *kevp);
 	if (error != 0)
 		return (error);
 	uap->changelist += count;
 	return (0);
 }
 
 #ifdef COMPAT_FREEBSD11
 /*
  * FreeBSD 11 compat copy-out callback: convert 'count' events from 'kevp'
  * to the old struct kevent_freebsd11 layout one at a time and write them
  * to the user event list referenced by 'arg', advancing that list pointer
  * after each successful copy.
  *
  * Returns 0 on success (including count == 0) or the first copyout()
  * error; events copied before the failure stay copied.
  */
 static int
 kevent11_copyout(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd11_kevent_args *uap;
 	struct kevent_freebsd11 kev11;
 	int error, i;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd11_kevent_args *)arg;
 
 	/* Defined result even if the loop body never runs. */
 	error = 0;
 	for (i = 0; i < count; i++) {
 		kev11.ident = kevp->ident;
 		kev11.filter = kevp->filter;
 		kev11.flags = kevp->flags;
 		kev11.fflags = kevp->fflags;
 		kev11.data = kevp->data;
 		kev11.udata = kevp->udata;
 		error = copyout(&kev11, uap->eventlist, sizeof(kev11));
 		if (error != 0)
 			break;
 		uap->eventlist++;
 		kevp++;
 	}
 	return (error);
 }
 
 /*
  * Copy 'count' items from the list pointed to by uap->changelist.
  */
 /*
  * FreeBSD 11 compat copy-in callback: read 'count' changes in the old
  * struct kevent_freebsd11 layout from the user change list referenced by
  * 'arg', convert each to a native struct kevent in 'kevp' (the ext[]
  * fields, which the old layout lacks, are zeroed), and advance the list
  * pointer after each successful copy.
  *
  * Returns 0 on success (including count == 0) or the first copyin()
  * error.
  */
 static int
 kevent11_copyin(void *arg, struct kevent *kevp, int count)
 {
 	struct freebsd11_kevent_args *uap;
 	struct kevent_freebsd11 kev11;
 	int error, i;
 
 	KASSERT(count <= KQ_NEVENTS, ("count (%d) > KQ_NEVENTS", count));
 	uap = (struct freebsd11_kevent_args *)arg;
 
 	/* Defined result even if the loop body never runs. */
 	error = 0;
 	for (i = 0; i < count; i++) {
 		error = copyin(uap->changelist, &kev11, sizeof(kev11));
 		if (error != 0)
 			break;
 		kevp->ident = kev11.ident;
 		kevp->filter = kev11.filter;
 		kevp->flags = kev11.flags;
 		kevp->fflags = kev11.fflags;
 		kevp->data = (uintptr_t)kev11.data;
 		kevp->udata = kev11.udata;
 		bzero(&kevp->ext, sizeof(kevp->ext));
 		uap->changelist++;
 		kevp++;
 	}
 	return (error);
 }
 
 /*
  * Compat system call entry point for the FreeBSD 11 struct kevent layout.
  * Same as sys_kevent() but with the converting copy callbacks and the old
  * element size.
  */
 int
 freebsd11_kevent(struct thread *td, struct freebsd11_kevent_args *uap)
 {
 	struct kevent_copyops k_ops = {
 		.arg = uap,
 		.k_copyout = kevent11_copyout,
 		.k_copyin = kevent11_copyin,
 		.kevent_size = sizeof(struct kevent_freebsd11),
 	};
 	struct g_kevent_args gk_args = {
 		.fd = uap->fd,
 		.changelist = uap->changelist,
 		.nchanges = uap->nchanges,
 		.eventlist = uap->eventlist,
 		.nevents = uap->nevents,
 		.timeout = uap->timeout,
 	};
 
 	return (kern_kevent_generic(td, &gk_args, &k_ops, "kevent_freebsd11"));
 }
 #endif
 
 int
 kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
 {
 	cap_rights_t rights;
 	struct file *fp;
 	int error;
 
 	cap_rights_init(&rights);
 	if (nchanges > 0)
 		cap_rights_set(&rights, CAP_KQUEUE_CHANGE);
 	if (nevents > 0)
 		cap_rights_set(&rights, CAP_KQUEUE_EVENT);
 	error = fget(td, fd, &rights, &fp);
 	if (error != 0)
 		return (error);
 
 	error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
 	fdrop(fp, td);
 
 	return (error);
 }
 
 /*
  * Apply 'nchanges' changes to 'kq' and then collect up to 'nevents'
  * events.
  *
  * Changes are fetched through k_ops->k_copyin in batches of at most
  * KQ_NEVENTS.  Entries with a zero filter are skipped.  A change that
  * fails, or that carries EV_RECEIPT, is reported back through
  * k_ops->k_copyout as an EV_ERROR event whose data is the error code; if
  * no room is left in the event list (nevents == 0) the error is returned
  * directly instead.
  *
  * If any such receipts were written, their count is returned in
  * td->td_retval[0] and no scan is performed.  Otherwise the result of
  * kqueue_scan() is returned.
  */
 static int
 kqueue_kevent(struct kqueue *kq, struct thread *td, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
 {
 	struct kevent keva[KQ_NEVENTS];
 	struct kevent *kevp, *changes;
 	int i, n, nerrors, error;
 
 	nerrors = 0;
 	while (nchanges > 0) {
 		n = nchanges > KQ_NEVENTS ? KQ_NEVENTS : nchanges;
 		error = k_ops->k_copyin(k_ops->arg, keva, n);
 		if (error)
 			return (error);
 		changes = keva;
 		for (i = 0; i < n; i++) {
 			kevp = &changes[i];
 			if (!kevp->filter)
 				continue;
 			/* Flags in EV_SYSFLAGS may not be set from user space. */
 			kevp->flags &= ~EV_SYSFLAGS;
 			error = kqueue_register(kq, kevp, td, M_WAITOK);
 			if (error || (kevp->flags & EV_RECEIPT)) {
 				if (nevents == 0)
 					return (error);
 				kevp->flags = EV_ERROR;
 				kevp->data = error;
 				/* The copy-out result is deliberately ignored here. */
 				(void)k_ops->k_copyout(k_ops->arg, kevp, 1);
 				nevents--;
 				nerrors++;
 			}
 		}
 		nchanges -= n;
 	}
 	if (nerrors) {
 		td->td_retval[0] = nerrors;
 		return (0);
 	}
 
 	return (kqueue_scan(kq, nevents, k_ops, timeout, keva, td));
 }
 
 /*
  * Run a kevent operation on an already resolved file.  A reference on the
  * kqueue behind fp is held for the duration of the call; a failure from
  * kqueue_acquire() is passed straight back to the caller.
  */
 int
 kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
     struct kevent_copyops *k_ops, const struct timespec *timeout)
 {
 	struct kqueue *kq;
 	int rv;
 
 	rv = kqueue_acquire(fp, &kq);
 	if (rv == 0) {
 		rv = kqueue_kevent(kq, td, nchanges, nevents, k_ops,
 		    timeout);
 		kqueue_release(kq, 0);
 	}
 	return (rv);
 }
 
 /*
  * Performs a kevent() call on a temporarily created kqueue. This can be
  * used to perform one-shot polling, similar to poll() and select().
  */
 int
 kern_kevent_anonymous(struct thread *td, int nevents,
     struct kevent_copyops *k_ops)
 {
 	struct kqueue kq = {};
 	int error;
 
 	kqueue_init(&kq);
 	kq.kq_refcnt = 1;
 	error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
 	kqueue_drain(&kq, td);
 	kqueue_destroy(&kq);
 	return (error);
 }
 
 /*
  * Install filtops as the handler for system filter filt.
  *
  * Filter identifiers are negative and the table slot is ~filt, so only
  * values from -EVFILT_SYSCOUNT through -1 are accepted.  Zero is rejected
  * too: ~0 is -1, which would address the element before the start of
  * sysfilt_ops.
  *
  * Returns 0 on success, EINVAL when filt is out of range, or EEXIST when
  * the slot already holds a filter other than null_filtops.
  */
 int
 kqueue_add_filteropts(int filt, struct filterops *filtops)
 {
 	int error;
 
 	error = 0;
 	if (filt >= 0 || filt + EVFILT_SYSCOUNT < 0) {
 		printf(
 "trying to add a filterop that is out of range: %d is beyond %d\n",
 		    ~filt, EVFILT_SYSCOUNT);
 		return (EINVAL);
 	}
 	mtx_lock(&filterops_lock);
 	if (sysfilt_ops[~filt].for_fop != &null_filtops &&
 	    sysfilt_ops[~filt].for_fop != NULL)
 		error = EEXIST;
 	else {
 		/* Slot is free: claim it with a fresh reference count. */
 		sysfilt_ops[~filt].for_fop = filtops;
 		sysfilt_ops[~filt].for_refcnt = 0;
 	}
 	mtx_unlock(&filterops_lock);
 
 	return (error);
 }
 
 /*
  * Remove the handler registered for system filter filt, resetting the
  * slot to null_filtops.
  *
  * Only values from -EVFILT_SYSCOUNT through -1 are accepted.  Zero is
  * rejected as well: the slot index is ~filt and ~0 is -1, which lies
  * outside sysfilt_ops.
  *
  * Returns 0 on success, EINVAL when filt is out of range or no filter is
  * installed in the slot, or EBUSY while the slot's reference count is
  * non-zero.
  */
 int
 kqueue_del_filteropts(int filt)
 {
 	int error;
 
 	error = 0;
 	if (filt >= 0 || filt + EVFILT_SYSCOUNT < 0)
 		return (EINVAL);
 
 	mtx_lock(&filterops_lock);
 	if (sysfilt_ops[~filt].for_fop == &null_filtops ||
 	    sysfilt_ops[~filt].for_fop == NULL)
 		error = EINVAL;
 	else if (sysfilt_ops[~filt].for_refcnt != 0)
 		error = EBUSY;
 	else {
 		sysfilt_ops[~filt].for_fop = &null_filtops;
 		sysfilt_ops[~filt].for_refcnt = 0;
 	}
 	mtx_unlock(&filterops_lock);
 
 	return (error);
 }
 
 /*
  * Look up the filterops registered for filt.
  *
  * Returns NULL when filt is outside the range -EVFILT_SYSCOUNT through
  * -1.  Zero is rejected explicitly: the slot index is ~filt and ~0 is -1,
  * which would read before the start of sysfilt_ops.
  *
  * Slots flagged for_nolock are returned as they are.  For every other
  * slot the reference count is bumped under filterops_lock (undone by
  * kqueue_fo_release()) and an empty slot is pointed at null_filtops
  * first.
  */
 static struct filterops *
 kqueue_fo_find(int filt)
 {
 
 	if (filt >= 0 || filt + EVFILT_SYSCOUNT < 0)
 		return (NULL);
 
 	if (sysfilt_ops[~filt].for_nolock)
 		return (sysfilt_ops[~filt].for_fop);
 
 	mtx_lock(&filterops_lock);
 	sysfilt_ops[~filt].for_refcnt++;
 	if (sysfilt_ops[~filt].for_fop == NULL)
 		sysfilt_ops[~filt].for_fop = &null_filtops;
 	mtx_unlock(&filterops_lock);
 
 	return (sysfilt_ops[~filt].for_fop);
 }
 
 /*
  * Drop the reference taken by kqueue_fo_find() on the slot for filt.
  *
  * Out-of-range values are ignored, including zero: the slot index is
  * ~filt and ~0 is -1, which lies outside sysfilt_ops.  Slots flagged
  * for_nolock carry no reference count and are left untouched.
  */
 static void
 kqueue_fo_release(int filt)
 {
 
 	if (filt >= 0 || filt + EVFILT_SYSCOUNT < 0)
 		return;
 
 	if (sysfilt_ops[~filt].for_nolock)
 		return;
 
 	mtx_lock(&filterops_lock);
 	KASSERT(sysfilt_ops[~filt].for_refcnt > 0,
 	    ("filter object refcount not valid on release"));
 	sysfilt_ops[~filt].for_refcnt--;
 	mtx_unlock(&filterops_lock);
 }
 
 /*
  * Apply the single change described by kev to kq: find the knote matching
  * (ident, filter) and, depending on kev->flags, create it (EV_ADD),
  * remove it (EV_DELETE), or update its enable state and filter
  * parameters.
  *
  * A ref to kq (obtained via kqueue_acquire) must be held.
  *
  * mflag is handed to knote_alloc() and kqueue_expand() for their
  * allocations.  Returns 0 on success or an errno value: EINVAL, EBADF,
  * ENOMEM and ENOENT are generated here, and failures from fget(),
  * kqueue_expand(), knote_attach() and the filter's f_attach are passed
  * through.
  */
 static int
 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td,
     int mflag)
 {
 	struct filterops *fops;
 	struct file *fp;
 	struct knote *kn, *tkn;
 	struct knlist *knl;
 	int error, filt, event;
 	int haskqglobal, filedesc_unlock;
 
 	/* Asking to enable and disable in the same change is contradictory. */
 	if ((kev->flags & (EV_ENABLE | EV_DISABLE)) == (EV_ENABLE | EV_DISABLE))
 		return (EINVAL);
 
 	fp = NULL;
 	kn = NULL;
 	knl = NULL;
 	error = 0;
 	haskqglobal = 0;
 	filedesc_unlock = 0;
 
 	/*
 	 * The filter reference is released at "done" unless it is handed
 	 * over to a newly created knote (fops is set to NULL in that case).
 	 */
 	filt = kev->filter;
 	fops = kqueue_fo_find(filt);
 	if (fops == NULL)
 		return EINVAL;
 
 	if (kev->flags & EV_ADD) {
 		/*
 		 * Prevent waiting with locks.  Non-sleepable
 		 * allocation failures are handled in the loop, only
 		 * if the spare knote appears to be actually required.
 		 */
 		tkn = knote_alloc(mflag);
 	} else {
 		tkn = NULL;
 	}
 
 	/* Every retry re-enters here with no file reference held. */
 findkn:
 	if (fops->f_isfd) {
 		KASSERT(td != NULL, ("td is NULL"));
 		/* ident is used as a descriptor number for fd-based filters. */
 		if (kev->ident > INT_MAX)
 			error = EBADF;
 		else
 			error = fget(td, kev->ident, &cap_event_rights, &fp);
 		if (error)
 			goto done;
 
 		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
 		    kev->ident, M_NOWAIT) != 0) {
 			/* try again */
 			fdrop(fp, td);
 			fp = NULL;
 			error = kqueue_expand(kq, fops, kev->ident, mflag);
 			if (error)
 				goto done;
 			goto findkn;
 		}
 
 		if (fp->f_type == DTYPE_KQUEUE) {
 			/*
 			 * If we add some intelligence about what we are doing,
 			 * we should be able to support events on ourselves.
 			 * We need to know when we are doing this to prevent
 			 * getting both the knlist lock and the kq lock since
 			 * they are the same thing.
 			 */
 			if (fp->f_data == kq) {
 				error = EINVAL;
 				goto done;
 			}
 
 			/*
 			 * Pre-lock the filedesc before the global
 			 * lock mutex, see the comment in
 			 * kqueue_close().
 			 */
 			FILEDESC_XLOCK(td->td_proc->p_fd);
 			filedesc_unlock = 1;
 			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 		}
 
 		KQ_LOCK(kq);
 		if (kev->ident < kq->kq_knlistsize) {
 			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
 				if (kev->filter == kn->kn_filter)
 					break;
 		}
 	} else {
 		if ((kev->flags & EV_ADD) == EV_ADD) {
 			error = kqueue_expand(kq, fops, kev->ident, mflag);
 			if (error != 0)
 				goto done;
 		}
 
 		KQ_LOCK(kq);
 
 		/*
 		 * If possible, find an existing knote to use for this kevent.
 		 */
 		if (kev->filter == EVFILT_PROC &&
 		    (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
 			/* This is an internal creation of a process tracking
 			 * note. Don't attempt to coalesce this with an
 			 * existing note.
 			 */
 			;			
 		} else if (kq->kq_knhashmask != 0) {
 			struct klist *list;
 
 			list = &kq->kq_knhash[
 			    KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
 			SLIST_FOREACH(kn, list, kn_link)
 				if (kev->ident == kn->kn_id &&
 				    kev->filter == kn->kn_filter)
 					break;
 		}
 	}
 
 	/* knote is in the process of changing, wait for it to stabilize. */
 	if (kn != NULL && kn_in_flux(kn)) {
 		KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 		if (filedesc_unlock) {
 			FILEDESC_XUNLOCK(td->td_proc->p_fd);
 			filedesc_unlock = 0;
 		}
 		kq->kq_state |= KQ_FLUXWAIT;
 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
 		if (fp != NULL) {
 			fdrop(fp, td);
 			fp = NULL;
 		}
 		goto findkn;
 	}
 
 	/*
 	 * kn now contains the matching knote, or NULL if no match
 	 */
 	if (kn == NULL) {
 		if (kev->flags & EV_ADD) {
 			/* Consume the knote preallocated before the lookup. */
 			kn = tkn;
 			tkn = NULL;
 			if (kn == NULL) {
 				KQ_UNLOCK(kq);
 				error = ENOMEM;
 				goto done;
 			}
 			kn->kn_fp = fp;
 			kn->kn_kq = kq;
 			kn->kn_fop = fops;
 			/*
 			 * apply reference counts to knote structure, and
 			 * do not release it at the end of this routine.
 			 */
 			fops = NULL;
 			fp = NULL;
 
 			kn->kn_sfflags = kev->fflags;
 			kn->kn_sdata = kev->data;
 			kev->fflags = 0;
 			kev->data = 0;
 			kn->kn_kevent = *kev;
 			kn->kn_kevent.flags &= ~(EV_ADD | EV_DELETE |
 			    EV_ENABLE | EV_DISABLE | EV_FORCEONESHOT);
 			kn->kn_status = KN_DETACHED;
 			if ((kev->flags & EV_DISABLE) != 0)
 				kn->kn_status |= KN_DISABLED;
 			kn_enter_flux(kn);
 
 			error = knote_attach(kn, kq);
 			KQ_UNLOCK(kq);
 			if (error != 0) {
 				/* Hand the knote back so "done" frees it. */
 				tkn = kn;
 				goto done;
 			}
 
 			if ((error = kn->kn_fop->f_attach(kn)) != 0) {
 				knote_drop_detached(kn, td);
 				goto done;
 			}
 			knl = kn_list_lock(kn);
 			goto done_ev_add;
 		} else {
 			/* No matching knote and the EV_ADD flag is not set. */
 			KQ_UNLOCK(kq);
 			error = ENOENT;
 			goto done;
 		}
 	}
 	
 	if (kev->flags & EV_DELETE) {
 		kn_enter_flux(kn);
 		KQ_UNLOCK(kq);
 		knote_drop(kn, td);
 		goto done;
 	}
 
 	if (kev->flags & EV_FORCEONESHOT) {
 		kn->kn_flags |= EV_ONESHOT;
 		KNOTE_ACTIVATE(kn, 1);
 	}
 
 	if ((kev->flags & EV_ENABLE) != 0)
 		kn->kn_status &= ~KN_DISABLED;
 	else if ((kev->flags & EV_DISABLE) != 0)
 		kn->kn_status |= KN_DISABLED;
 
 	/*
 	 * The user may change some filter values after the initial EV_ADD,
 	 * but doing so will not reset any filter which has already been
 	 * triggered.
 	 */
 	kn->kn_status |= KN_SCAN;
 	kn_enter_flux(kn);
 	KQ_UNLOCK(kq);
 	knl = kn_list_lock(kn);
 	kn->kn_kevent.udata = kev->udata;
 	/* Non-fd filters with an f_touch hook apply the update themselves. */
 	if (!fops->f_isfd && fops->f_touch != NULL) {
 		fops->f_touch(kn, kev, EVENT_REGISTER);
 	} else {
 		kn->kn_sfflags = kev->fflags;
 		kn->kn_sdata = kev->data;
 	}
 
 done_ev_add:
 	/*
 	 * We can get here with kn->kn_knlist == NULL.  This can happen when
 	 * the initial attach event decides that the event is "completed"
 	 * already, e.g., filt_procattach() is called on a zombie process.  It
 	 * will call filt_proc() which will remove it from the list, and NULL
 	 * kn_knlist.
 	 *
 	 * KN_DISABLED will be stable while the knote is in flux, so the
 	 * unlocked read will not race with an update.
 	 */
 	if ((kn->kn_status & KN_DISABLED) == 0)
 		event = kn->kn_fop->f_event(kn, 0);
 	else
 		event = 0;
 
 	KQ_LOCK(kq);
 	if (event)
 		kn->kn_status |= KN_ACTIVE;
 	/* Queue only knotes that are active, enabled and not yet queued. */
 	if ((kn->kn_status & (KN_ACTIVE | KN_DISABLED | KN_QUEUED)) ==
 	    KN_ACTIVE)
 		knote_enqueue(kn);
 	kn->kn_status &= ~KN_SCAN;
 	kn_leave_flux(kn);
 	kn_list_unlock(knl);
 	KQ_UNLOCK_FLUX(kq);
 
 	/* Common exit: undo whatever is still held by this function. */
 done:
 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 	if (filedesc_unlock)
 		FILEDESC_XUNLOCK(td->td_proc->p_fd);
 	if (fp != NULL)
 		fdrop(fp, td);
 	knote_free(tkn);
 	if (fops != NULL)
 		kqueue_fo_release(filt);
 	return (error);
 }
 
 /*
  * Take a reference on the kqueue behind fp and return it through kqp.
  *
  * Returns EBADF when fp is not a kqueue file, carries no kqueue, or the
  * kqueue is already marked KQ_CLOSING; returns 0 with the reference count
  * incremented otherwise.
  */
 static int
 kqueue_acquire(struct file *fp, struct kqueue **kqp)
 {
 	struct kqueue *kq;
 	int rv;
 
 	kq = fp->f_data;
 	if (fp->f_type != DTYPE_KQUEUE || kq == NULL)
 		return (EBADF);
 	*kqp = kq;
 
 	KQ_LOCK(kq);
 	if ((kq->kq_state & KQ_CLOSING) == KQ_CLOSING) {
 		rv = EBADF;
 	} else {
 		kq->kq_refcnt++;
 		rv = 0;
 	}
 	KQ_UNLOCK(kq);
 
 	return (rv);
 }
 
 /*
  * Drop a reference on kq.  When locked is non-zero the caller already
  * holds the kq lock (asserted) and keeps holding it; otherwise the lock is
  * taken and released here.  Sleepers on &kq->kq_refcnt are woken once the
  * count reaches one.
  */
 static void
 kqueue_release(struct kqueue *kq, int locked)
 {
 	if (!locked)
 		KQ_LOCK(kq);
 	else
 		KQ_OWNED(kq);
 
 	if (--kq->kq_refcnt == 1)
 		wakeup(&kq->kq_refcnt);
 
 	if (!locked)
 		KQ_UNLOCK(kq);
 }
 
 /*
  * Queue kq's task on taskqueue_kqueue_ctx unless it is already marked as
  * scheduled.  The kq lock must be held, and the kqueue must not be
  * draining its task.
  */
 static void
 kqueue_schedtask(struct kqueue *kq)
 {
 
 	KQ_OWNED(kq);
 	KASSERT(((kq->kq_state & KQ_TASKDRAIN) != KQ_TASKDRAIN),
 	    ("scheduling kqueue task while draining"));
 
 	/* Already pending: nothing to do. */
 	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED)
 		return;
 
 	taskqueue_enqueue(taskqueue_kqueue_ctx, &kq->kq_task);
 	kq->kq_state |= KQ_TASKSCHED;
 }
 
 /*
  * Expand the kq to make sure we have storage for fops/ident pair.
  *
  * For fd-based filters the per-descriptor list array is grown in steps of
  * KQEXTENT until it covers ident; for all other filters the hash table is
  * created on first use.  New storage is allocated before the kq lock is
  * taken and installed under it; if the kqueue turns out to be closing, or
  * the table is already large enough by then, the fresh allocation is
  * freed again.
  *
  * Must be called without the kq lock held.
  *
  * Return 0 on success (or no work necessary), return errno on failure:
  * ENOMEM when the allocation fails, EBADF when the kqueue is closing.
  */
 static int
 kqueue_expand(struct kqueue *kq, struct filterops *fops, uintptr_t ident,
     int mflag)
 {
 	struct klist *list, *tmp_knhash, *to_free;
 	u_long tmp_knhashmask;
 	int error, fd, size;
 
 	KQ_NOTOWNED(kq);
 
 	error = 0;
 	to_free = NULL;
 	if (fops->f_isfd) {
 		fd = ident;
 		if (kq->kq_knlistsize <= fd) {
 			/* Round the new size up to cover fd. */
 			size = kq->kq_knlistsize;
 			while (size <= fd)
 				size += KQEXTENT;
 			list = malloc(size * sizeof(*list), M_KQUEUE, mflag);
 			if (list == NULL)
 				return ENOMEM;
 			KQ_LOCK(kq);
 			if ((kq->kq_state & KQ_CLOSING) != 0) {
 				to_free = list;
 				error = EBADF;
 			} else if (kq->kq_knlistsize > fd) {
 				/* Grown in the meantime; ours is not needed. */
 				to_free = list;
 			} else {
 				/* Copy the old heads, zero the new tail. */
 				if (kq->kq_knlist != NULL) {
 					bcopy(kq->kq_knlist, list,
 					    kq->kq_knlistsize * sizeof(*list));
 					to_free = kq->kq_knlist;
 					kq->kq_knlist = NULL;
 				}
 				bzero((caddr_t)list +
 				    kq->kq_knlistsize * sizeof(*list),
 				    (size - kq->kq_knlistsize) * sizeof(*list));
 				kq->kq_knlistsize = size;
 				kq->kq_knlist = list;
 			}
 			KQ_UNLOCK(kq);
 		}
 	} else {
 		if (kq->kq_knhashmask == 0) {
 			tmp_knhash = hashinit_flags(KN_HASHSIZE, M_KQUEUE,
 			    &tmp_knhashmask, (mflag & M_WAITOK) != 0 ?
 			    HASH_WAITOK : HASH_NOWAIT);
 			if (tmp_knhash == NULL)
 				return (ENOMEM);
 			KQ_LOCK(kq);
 			if ((kq->kq_state & KQ_CLOSING) != 0) {
 				to_free = tmp_knhash;
 				error = EBADF;
 			} else if (kq->kq_knhashmask == 0) {
 				kq->kq_knhash = tmp_knhash;
 				kq->kq_knhashmask = tmp_knhashmask;
 			} else {
 				/* A hash table was installed in the meantime. */
 				to_free = tmp_knhash;
 			}
 			KQ_UNLOCK(kq);
 		}
 	}
 	/* to_free may still be NULL here. */
 	free(to_free, M_KQUEUE);
 
 	KQ_NOTOWNED(kq);
 	return (error);
 }
 
 static void
 kqueue_task(void *arg, int pending)
 {
 	struct kqueue *kq;
 	int haskqglobal;
 
 	haskqglobal = 0;
 	kq = arg;
 
 	KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 	KQ_LOCK(kq);
 
 	KNOTE_LOCKED(&kq->kq_sel.si_note, 0);
 
 	kq->kq_state &= ~KQ_TASKSCHED;
 	if ((kq->kq_state & KQ_TASKDRAIN) == KQ_TASKDRAIN) {
 		wakeup(&kq->kq_state);
 	}
 	KQ_UNLOCK(kq);
 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 }
 
 /*
  * Scan, update kn_data (if not ONESHOT), and copyout triggered events.
  * We treat KN_MARKER knotes as if they are in flux.
  *
  * At most maxevents events are delivered.  They are staged in keva and
  * pushed out through k_ops->k_copyout whenever KQ_NEVENTS of them have
  * accumulated and once more at the end.  The number of events delivered
  * is stored in td->td_retval[0].
  *
  * tsp selects the waiting behaviour when the queue is empty: a zeroed
  * timespec never sleeps, a non-zero one is turned into the asbt/rsbt
  * values given to msleep_sbt(), and NULL passes asbt == 0 (presumably
  * "no timeout" -- confirm against msleep_sbt()).  An out-of-range
  * timespec fails with EINVAL.
  */
 static int
 kqueue_scan(struct kqueue *kq, int maxevents, struct kevent_copyops *k_ops,
     const struct timespec *tsp, struct kevent *keva, struct thread *td)
 {
 	struct kevent *kevp;
 	struct knote *kn, *marker;
 	struct knlist *knl;
 	sbintime_t asbt, rsbt;
 	int count, error, haskqglobal, influx, nkev, touch;
 
 	count = maxevents;
 	nkev = 0;
 	error = 0;
 	haskqglobal = 0;
 
 	if (maxevents == 0)
 		goto done_nl;
 
 	rsbt = 0;
 	if (tsp != NULL) {
 		if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
 		    tsp->tv_nsec >= 1000000000) {
 			error = EINVAL;
 			goto done_nl;
 		}
 		if (timespecisset(tsp)) {
 			if (tsp->tv_sec <= INT32_MAX) {
 				rsbt = tstosbt(*tsp);
 				if (TIMESEL(&asbt, rsbt))
 					asbt += tc_tick_sbt;
 				/* Guard the addition against overflow. */
 				if (asbt <= SBT_MAX - rsbt)
 					asbt += rsbt;
 				else
 					asbt = 0;
 				rsbt >>= tc_precexp;
 			} else
 				asbt = 0;
 		} else
 			/* Zero timeout: see the asbt == -1 test below. */
 			asbt = -1;
 	} else
 		asbt = 0;
 	marker = knote_alloc(M_WAITOK);
 	marker->kn_status = KN_MARKER;
 	KQ_LOCK(kq);
 
 retry:
 	kevp = keva;
 	if (kq->kq_count == 0) {
 		if (asbt == -1) {
 			error = EWOULDBLOCK;
 		} else {
 			kq->kq_state |= KQ_SLEEP;
 			error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
 			    "kqread", asbt, rsbt, C_ABSOLUTE);
 		}
 		if (error == 0)
 			goto retry;
 		/* don't restart after signals... */
 		if (error == ERESTART)
 			error = EINTR;
 		else if (error == EWOULDBLOCK)
 			error = 0;
 		goto done;
 	}
 
 	/*
 	 * The marker goes to the tail of the queue; reaching it ends this
 	 * pass, so knotes re-queued behind it are not visited again.
 	 */
 	TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
 	influx = 0;
 	while (count) {
 		KQ_OWNED(kq);
 		kn = TAILQ_FIRST(&kq->kq_head);
 
 		/* Another scan's marker or a knote in flux: wait and retry. */
 		if ((kn->kn_status == KN_MARKER && kn != marker) ||
 		    kn_in_flux(kn)) {
 			if (influx) {
 				influx = 0;
 				KQ_FLUX_WAKEUP(kq);
 			}
 			kq->kq_state |= KQ_FLUXWAIT;
 			error = msleep(kq, &kq->kq_lock, PSOCK,
 			    "kqflxwt", 0);
 			continue;
 		}
 
 		TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
 		if ((kn->kn_status & KN_DISABLED) == KN_DISABLED) {
 			kn->kn_status &= ~KN_QUEUED;
 			kq->kq_count--;
 			continue;
 		}
 		if (kn == marker) {
 			KQ_FLUX_WAKEUP(kq);
 			/* Nothing collected in this pass: start over. */
 			if (count == maxevents)
 				goto retry;
 			goto done;
 		}
 		KASSERT(!kn_in_flux(kn),
 		    ("knote %p is unexpectedly in flux", kn));
 
 		if ((kn->kn_flags & EV_DROP) == EV_DROP) {
 			kn->kn_status &= ~KN_QUEUED;
 			kn_enter_flux(kn);
 			kq->kq_count--;
 			KQ_UNLOCK(kq);
 			/*
 			 * We don't need to lock the list since we've
 			 * marked it as in flux.
 			 */
 			knote_drop(kn, td);
 			KQ_LOCK(kq);
 			continue;
 		} else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
 			kn->kn_status &= ~KN_QUEUED;
 			kn_enter_flux(kn);
 			kq->kq_count--;
 			KQ_UNLOCK(kq);
 			/*
 			 * We don't need to lock the list since we've
 			 * marked the knote as being in flux.
 			 */
 			*kevp = kn->kn_kevent;
 			knote_drop(kn, td);
 			KQ_LOCK(kq);
 			kn = NULL;
 		} else {
 			kn->kn_status |= KN_SCAN;
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
 				KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 			knl = kn_list_lock(kn);
 			/* Re-check the filter; drop the knote from the queue if idle. */
 			if (kn->kn_fop->f_event(kn, 0) == 0) {
 				KQ_LOCK(kq);
 				KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE |
 				    KN_SCAN);
 				kn_leave_flux(kn);
 				kq->kq_count--;
 				kn_list_unlock(knl);
 				influx = 1;
 				continue;
 			}
 			touch = (!kn->kn_fop->f_isfd &&
 			    kn->kn_fop->f_touch != NULL);
 			if (touch)
 				kn->kn_fop->f_touch(kn, kevp, EVENT_PROCESS);
 			else
 				*kevp = kn->kn_kevent;
 			KQ_LOCK(kq);
 			KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
 			if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
 				/*
 				 * Manually clear knotes that weren't
 				 * 'touch'ed.
 				 */
 				if (touch == 0 && kn->kn_flags & EV_CLEAR) {
 					kn->kn_data = 0;
 					kn->kn_fflags = 0;
 				}
 				if (kn->kn_flags & EV_DISPATCH)
 					kn->kn_status |= KN_DISABLED;
 				kn->kn_status &= ~(KN_QUEUED | KN_ACTIVE);
 				kq->kq_count--;
 			} else
 				TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 			
 			kn->kn_status &= ~KN_SCAN;
 			kn_leave_flux(kn);
 			kn_list_unlock(knl);
 			influx = 1;
 		}
 
 		/* we are returning a copy to the user */
 		kevp++;
 		nkev++;
 		count--;
 
 		/* Staging buffer full: flush it with the kq lock dropped. */
 		if (nkev == KQ_NEVENTS) {
 			influx = 0;
 			KQ_UNLOCK_FLUX(kq);
 			error = k_ops->k_copyout(k_ops->arg, keva, nkev);
 			nkev = 0;
 			kevp = keva;
 			KQ_LOCK(kq);
 			if (error)
 				break;
 		}
 	}
 	TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
 done:
 	KQ_OWNED(kq);
 	KQ_UNLOCK_FLUX(kq);
 	knote_free(marker);
 done_nl:
 	KQ_NOTOWNED(kq);
 	/* Flush whatever is still staged in keva. */
 	if (nkev != 0)
 		error = k_ops->k_copyout(k_ops->arg, keva, nkev);
 	td->td_retval[0] = maxevents - count;
 	return (error);
 }
 
 /*
  * ioctl handler for kqueue files.  Every request currently fails with
  * ENOTTY; the FIOASYNC/FIOSETOWN/FIOGETOWN handling below is compiled
  * out for the reasons given in the function body.
  */
 /*ARGSUSED*/
 static int
 kqueue_ioctl(struct file *fp, u_long cmd, void *data,
 	struct ucred *active_cred, struct thread *td)
 {
 	/*
 	 * Enabling sigio causes two major problems:
 	 * 1) infinite recursion:
 	 * Synopsis: kevent is being used to track signals and have FIOASYNC
 	 * set.  On receipt of a signal this will cause a kqueue to recurse
 	 * into itself over and over.  Sending the sigio causes the kqueue
 	 * to become ready, which in turn posts sigio again, forever.
 	 * Solution: this can be solved by setting a flag in the kqueue that
 	 * we have a SIGIO in progress.
 	 * 2) locking problems:
 	 * Synopsis: Kqueue is a leaf subsystem, but adding signalling puts
 	 * us above the proc and pgrp locks.
 	 * Solution: Post a signal using an async mechanism, being sure to
 	 * record a generation count in the delivery so that we do not deliver
 	 * a signal to the wrong process.
 	 *
 	 * Note, these two mechanisms are somewhat mutually exclusive!
 	 */
 #if 0
 	struct kqueue *kq;
 
 	kq = fp->f_data;
 	switch (cmd) {
 	case FIOASYNC:
 		if (*(int *)data) {
 			kq->kq_state |= KQ_ASYNC;
 		} else {
 			kq->kq_state &= ~KQ_ASYNC;
 		}
 		return (0);
 
 	case FIOSETOWN:
 		return (fsetown(*(int *)data, &kq->kq_sigio));
 
 	case FIOGETOWN:
 		*(int *)data = fgetown(&kq->kq_sigio);
 		return (0);
 	}
 #endif
 
 	return (ENOTTY);
 }
 
 /*
  * poll handler for kqueue files.  Reports the requested read events when
  * the kqueue has queued knotes; otherwise records the thread for a later
  * select wakeup.  Returns POLLERR if the kqueue cannot be acquired.
  */
 /*ARGSUSED*/
 static int
 kqueue_poll(struct file *fp, int events, struct ucred *active_cred,
 	struct thread *td)
 {
 	struct kqueue *kq;
 	int readable, revents;
 
 	if (kqueue_acquire(fp, &kq) != 0)
 		return (POLLERR);
 
 	revents = 0;
 	readable = events & (POLLIN | POLLRDNORM);
 
 	KQ_LOCK(kq);
 	if (readable != 0) {
 		if (kq->kq_count != 0) {
 			revents = readable;
 		} else {
 			selrecord(td, &kq->kq_sel);
 			if (SEL_WAITING(&kq->kq_sel))
 				kq->kq_state |= KQ_SEL;
 		}
 	}
 	/* Drop our reference while still holding the lock. */
 	kqueue_release(kq, 1);
 	KQ_UNLOCK(kq);
 
 	return (revents);
 }
 
 /*
  * stat handler for kqueue files: the result is zeroed except for st_mode,
  * which is reported as S_IFIFO.
  */
 /*ARGSUSED*/
 static int
 kqueue_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
 	struct thread *td)
 {
 
 	bzero(st, sizeof(*st));
 
 	/*
 	 * kq_count is deliberately not reported: a value read without the
 	 * lock is useless, and a caller who wants to know what is pending
 	 * is better served by calling kevent.
 	 *
 	 * XXX - This is needed for libc_r.
 	 */
 	st->st_mode = S_IFIFO;
 
 	return (0);
 }
 
 /*
  * Prepare kq for destruction: mark it KQ_CLOSING, wait until the caller
  * holds the only remaining reference, drop every knote found in the
  * per-descriptor lists and in the hash table, wait for a scheduled task
  * to finish, and wake any select/poll waiters.
  *
  * Takes and releases the kq lock itself; the lock is dropped around each
  * knote_drop() call.
  */
 static void
 kqueue_drain(struct kqueue *kq, struct thread *td)
 {
 	struct knote *kn;
 	int i;
 
 	KQ_LOCK(kq);
 
 	KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
 	    ("kqueue already closing"));
 	kq->kq_state |= KQ_CLOSING;
 	/* kqueue_release() wakes this channel when the count reaches one. */
 	if (kq->kq_refcnt > 1)
 		msleep(&kq->kq_refcnt, &kq->kq_lock, PSOCK, "kqclose", 0);
 
 	KASSERT(kq->kq_refcnt == 1, ("other refs are out there!"));
 
 	KASSERT(knlist_empty(&kq->kq_sel.si_note),
 	    ("kqueue's knlist not empty"));
 
 	/* Knotes attached to file descriptors. */
 	for (i = 0; i < kq->kq_knlistsize; i++) {
 		while ((kn = SLIST_FIRST(&kq->kq_knlist[i])) != NULL) {
 			/* Someone else is working on it; wait, then re-read. */
 			if (kn_in_flux(kn)) {
 				kq->kq_state |= KQ_FLUXWAIT;
 				msleep(kq, &kq->kq_lock, PSOCK, "kqclo1", 0);
 				continue;
 			}
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			knote_drop(kn, td);
 			KQ_LOCK(kq);
 		}
 	}
 	/* Knotes kept in the hash table, if one was ever allocated. */
 	if (kq->kq_knhashmask != 0) {
 		for (i = 0; i <= kq->kq_knhashmask; i++) {
 			while ((kn = SLIST_FIRST(&kq->kq_knhash[i])) != NULL) {
 				if (kn_in_flux(kn)) {
 					kq->kq_state |= KQ_FLUXWAIT;
 					msleep(kq, &kq->kq_lock, PSOCK,
 					       "kqclo2", 0);
 					continue;
 				}
 				kn_enter_flux(kn);
 				KQ_UNLOCK(kq);
 				knote_drop(kn, td);
 				KQ_LOCK(kq);
 			}
 		}
 	}
 
 	/* kqueue_task() wakes &kq->kq_state when it sees KQ_TASKDRAIN. */
 	if ((kq->kq_state & KQ_TASKSCHED) == KQ_TASKSCHED) {
 		kq->kq_state |= KQ_TASKDRAIN;
 		msleep(&kq->kq_state, &kq->kq_lock, PSOCK, "kqtqdr", 0);
 	}
 
 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
 		selwakeuppri(&kq->kq_sel, PSOCK);
 		if (!SEL_WAITING(&kq->kq_sel))
 			kq->kq_state &= ~KQ_SEL;
 	}
 
 	KQ_UNLOCK(kq);
 }
 
 /*
  * Release the resources embedded in kq: select state, the knote list of
  * kq_sel, the kq lock, both knote tables and the SIGIO ownership.  The
  * kqueue structure itself is not freed here.
  */
 static void
 kqueue_destroy(struct kqueue *kq)
 {
 
 	KASSERT(kq->kq_fdp == NULL,
 	    ("kqueue still attached to a file descriptor"));
 
 	/* Select/knote bookkeeping first, then the lock. */
 	seldrain(&kq->kq_sel);
 	knlist_destroy(&kq->kq_sel.si_note);
 	mtx_destroy(&kq->kq_lock);
 
 	/* Free whichever knote tables were allocated. */
 	if (kq->kq_knlist != NULL)
 		free(kq->kq_knlist, M_KQUEUE);
 	if (kq->kq_knhash != NULL)
 		free(kq->kq_knhash, M_KQUEUE);
 
 	funsetown(&kq->kq_sigio);
 }
 
 /*ARGSUSED*/
 static int
 kqueue_close(struct file *fp, struct thread *td)
 {
 	struct kqueue *kq = fp->f_data;
 	struct filedesc *fdp;
 	int error;
 	int filedesc_unlock;
 
 	if ((error = kqueue_acquire(fp, &kq)))
 		return error;
 	kqueue_drain(kq, td);
 
 	/*
 	 * We could be called due to the knote_drop() doing fdrop(),
 	 * called from kqueue_register().  In this case the global
 	 * lock is owned, and filedesc sx is locked before, to not
 	 * take the sleepable lock after non-sleepable.
 	 */
 	fdp = kq->kq_fdp;
 	kq->kq_fdp = NULL;
 	if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
 		FILEDESC_XLOCK(fdp);
 		filedesc_unlock = 1;
 	} else
 		filedesc_unlock = 0;
 	TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
 	if (filedesc_unlock)
 		FILEDESC_XUNLOCK(fdp);
 
 	kqueue_destroy(kq);
 	chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
 	crfree(kq->kq_cred);
 	free(kq, M_KQUEUE);
 	fp->f_data = NULL;
 
 	return (0);
 }
 
 /*
  * Fill in the kinfo_file record for a kqueue file.  Only the type field
  * is set; fp and fdp are not consulted.  Always returns 0.
  */
 static int
 kqueue_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	kif->kf_type = KF_TYPE_KQUEUE;
 
 	return (0);
 }
 
 /*
  * Notify everyone interested in activity on kq: threads sleeping on the
  * kqueue, select/poll waiters, knotes watching this kqueue (through the
  * kqueue task) and, if KQ_ASYNC is set, the SIGIO owner.  The kq lock
  * must be held.
  */
 static void
 kqueue_wakeup(struct kqueue *kq)
 {
 
 	KQ_OWNED(kq);
 
 	/* Threads that set KQ_SLEEP before sleeping on kq. */
 	if ((kq->kq_state & KQ_SLEEP) == KQ_SLEEP) {
 		kq->kq_state &= ~KQ_SLEEP;
 		wakeup(kq);
 	}
 
 	/* select/poll waiters recorded on kq_sel. */
 	if ((kq->kq_state & KQ_SEL) == KQ_SEL) {
 		selwakeuppri(&kq->kq_sel, PSOCK);
 		if (!SEL_WAITING(&kq->kq_sel))
 			kq->kq_state &= ~KQ_SEL;
 	}
 
 	/* Knotes registered on this kqueue itself. */
 	if (!knlist_empty(&kq->kq_sel.si_note))
 		kqueue_schedtask(kq);
 
 	if ((kq->kq_state & KQ_ASYNC) == KQ_ASYNC)
 		pgsigio(&kq->kq_sigio, SIGIO, 0);
 }
 
 /*
  * Walk down a list of knotes, activating them if their event has triggered.
  *
  * list may be NULL, in which case nothing happens.  hint is passed
  * unchanged to each filter's f_event.  lockflags may contain
  * KNF_LISTLOCKED (the caller already holds the list lock, so it is
  * neither taken nor released here) and KNF_NOKQLOCK (f_event is called
  * with the kq lock dropped and the knote marked in flux instead).
  *
  * There is a possibility to optimize in the case of one kq watching another.
  * Instead of scheduling a task to wake it up, you could pass enough state
  * down the chain to wake up the parent kqueue.  Make this code functional
  * first.
  */
 void
 knote(struct knlist *list, long hint, int lockflags)
 {
 	struct kqueue *kq;
 	struct knote *kn, *tkn;
 	int error;
 
 	if (list == NULL)
 		return;
 
 	KNL_ASSERT_LOCK(list, lockflags & KNF_LISTLOCKED);
 
 	if ((lockflags & KNF_LISTLOCKED) == 0)
 		list->kl_lock(list->kl_lockarg); 
 
 	/*
 	 * If we unlock the list lock (and enter influx), we can
 	 * eliminate the kqueue scheduling, but this will introduce
 	 * four lock/unlock's for each knote to test.  Also, marker
 	 * would be needed to keep iteration position, since filters
 	 * or other threads could remove events.
 	 */
 	SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		if (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0) {
 			/*
 			 * Do not process the influx notes, except for
 			 * the influx coming from the kq unlock in the
 			 * kqueue_scan().  In the latter case, we do
 			 * not interfere with the scan, since the code
 			 * fragment in kqueue_scan() locks the knlist,
 			 * and cannot proceed until we finished.
 			 */
 			KQ_UNLOCK(kq);
 		} else if ((lockflags & KNF_NOKQLOCK) != 0) {
 			/* Call the filter without the kq lock held. */
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			error = kn->kn_fop->f_event(kn, hint);
 			KQ_LOCK(kq);
 			kn_leave_flux(kn);
 			if (error)
 				KNOTE_ACTIVATE(kn, 1);
 			KQ_UNLOCK_FLUX(kq);
 		} else {
 			if (kn->kn_fop->f_event(kn, hint))
 				KNOTE_ACTIVATE(kn, 1);
 			KQ_UNLOCK(kq);
 		}
 	}
 	if ((lockflags & KNF_LISTLOCKED) == 0)
 		list->kl_unlock(list->kl_lockarg); 
 }
 
 /*
  * Insert a knote at the head of a knlist and clear its KN_DETACHED state.
  *
  * islocked tells whether the caller already holds the knlist lock; if
  * not, it is taken around the insertion.  The knote must be in flux and
  * detached, and the owning kqueue's lock must not be held on entry.
  */
 void
 knlist_add(struct knlist *knl, struct knote *kn, int islocked)
 {
 	struct kqueue *kq;
 
 	kq = kn->kn_kq;
 
 	KNL_ASSERT_LOCK(knl, islocked);
 	KQ_NOTOWNED(kq);
 	KASSERT(kn_in_flux(kn), ("knote %p not in flux", kn));
 	KASSERT((kn->kn_status & KN_DETACHED) != 0,
 	    ("knote %p was not detached", kn));
 
 	if (islocked) {
 		SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
 	} else {
 		knl->kl_lock(knl->kl_lockarg);
 		SLIST_INSERT_HEAD(&knl->kl_list, kn, kn_selnext);
 		knl->kl_unlock(knl->kl_lockarg);
 	}
 
 	/* Record the membership under the kq lock. */
 	KQ_LOCK(kq);
 	kn->kn_knlist = knl;
 	kn->kn_status &= ~KN_DETACHED;
 	KQ_UNLOCK(kq);
 }
 
 /*
  * Unlink kn from knl and mark it KN_DETACHED.
  *
  * knlislocked and kqislocked say whether the caller already holds the
  * knlist lock and the kq lock respectively; a lock not held by the caller
  * is taken and released here.  Holding the kq lock without the knlist
  * lock is not allowed.  Unless the kq lock is held, the knote must be in
  * flux.
  */
 static void
 knlist_remove_kq(struct knlist *knl, struct knote *kn, int knlislocked,
     int kqislocked)
 {
 
 	KASSERT(!kqislocked || knlislocked, ("kq locked w/o knl locked"));
 	KNL_ASSERT_LOCK(knl, knlislocked);
 	mtx_assert(&kn->kn_kq->kq_lock, kqislocked ? MA_OWNED : MA_NOTOWNED);
 	KASSERT(kqislocked || kn_in_flux(kn), ("knote %p not in flux", kn));
 	KASSERT((kn->kn_status & KN_DETACHED) == 0,
 	    ("knote %p was already detached", kn));
 	if (!knlislocked)
 		knl->kl_lock(knl->kl_lockarg);
 	SLIST_REMOVE(&knl->kl_list, kn, knote, kn_selnext);
 	kn->kn_knlist = NULL;
 	/* NOTE(review): unlocks via kn_list_unlock(), not knl->kl_unlock. */
 	if (!knlislocked)
 		kn_list_unlock(knl);
 	if (!kqislocked)
 		KQ_LOCK(kn->kn_kq);
 	kn->kn_status |= KN_DETACHED;
 	if (!kqislocked)
 		KQ_UNLOCK(kn->kn_kq);
 }
 
 /*
  * Remove a knote from the given knlist.  islocked tells whether the
  * caller holds the knlist lock; the kq lock is treated as not held.
  */
 void
 knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
 {
 	const int kq_not_locked = 0;
 
 	knlist_remove_kq(knl, kn, islocked, kq_not_locked);
 }
 
 /*
  * Report whether a knlist has no knotes on it.  The knlist lock must be
  * held by the caller.
  */
 int
 knlist_empty(struct knlist *knl)
 {
 	int empty;
 
 	KNL_ASSERT_LOCKED(knl);
 	empty = SLIST_EMPTY(&knl->kl_list);
 	return (empty);
 }
 
 /*
  * Fallback mutex: knlist_init() uses it as the lock argument for any
  * knlist whose creator passes a NULL lock.
  */
 static struct mtx knlist_lock;
 MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
     MTX_DEF);
 /* Default lock/unlock callbacks; both take a struct mtx * as arg. */
 static void knlist_mtx_lock(void *arg);
 static void knlist_mtx_unlock(void *arg);
 
 /* knlist lock callback for mutex-protected lists; arg is the mutex. */
 static void
 knlist_mtx_lock(void *arg)
 {
 	struct mtx *m = arg;
 
 	mtx_lock(m);
 }
 
 /* knlist unlock callback for mutex-protected lists; arg is the mutex. */
 static void
 knlist_mtx_unlock(void *arg)
 {
 	struct mtx *m = arg;
 
 	mtx_unlock(m);
 }
 
 /* Assert that the mutex passed as arg is owned by the current thread. */
 static void
 knlist_mtx_assert_locked(void *arg)
 {
 	struct mtx *m = arg;
 
 	mtx_assert(m, MA_OWNED);
 }
 
 /* Assert that the mutex passed as arg is not owned by the current thread. */
 static void
 knlist_mtx_assert_unlocked(void *arg)
 {
 	struct mtx *m = arg;
 
 	mtx_assert(m, MA_NOTOWNED);
 }
 
 /* knlist lock callback for rwlock-protected lists: take a read lock. */
 static void
 knlist_rw_rlock(void *arg)
 {
 	struct rwlock *rw = arg;
 
 	rw_rlock(rw);
 }
 
 /* knlist unlock callback for rwlock-protected lists: drop the read lock. */
 static void
 knlist_rw_runlock(void *arg)
 {
 	struct rwlock *rw = arg;
 
 	rw_runlock(rw);
 }
 
 /* Assert that the rwlock passed as arg is held (RA_LOCKED). */
 static void
 knlist_rw_assert_locked(void *arg)
 {
 	struct rwlock *rw = arg;
 
 	rw_assert(rw, RA_LOCKED);
 }
 
 /* Assert that the rwlock passed as arg is not held (RA_UNLOCKED). */
 static void
 knlist_rw_assert_unlocked(void *arg)
 {
 	struct rwlock *rw = arg;
 
 	rw_assert(rw, RA_UNLOCKED);
 }
 
 /*
  * Initialize an empty knlist.
  *
  * lock is the argument handed to the locking callbacks; NULL selects the
  * shared knlist_lock mutex.  Each callback that is NULL is replaced by
  * its mutex-based default.  The list starts out with kl_autodestroy
  * cleared.
  */
 void
 knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
     void (*kl_unlock)(void *),
     void (*kl_assert_locked)(void *), void (*kl_assert_unlocked)(void *))
 {
 
 	knl->kl_lockarg = (lock != NULL) ? lock : &knlist_lock;
 
 	knl->kl_lock = (kl_lock != NULL) ? kl_lock : knlist_mtx_lock;
 	knl->kl_unlock = (kl_unlock != NULL) ? kl_unlock : knlist_mtx_unlock;
 	knl->kl_assert_locked = (kl_assert_locked != NULL) ?
 	    kl_assert_locked : knlist_mtx_assert_locked;
 	knl->kl_assert_unlocked = (kl_assert_unlocked != NULL) ?
 	    kl_assert_unlocked : knlist_mtx_assert_unlocked;
 
 	knl->kl_autodestroy = 0;
 	SLIST_INIT(&knl->kl_list);
 }
 
 /*
  * Initialize a knlist protected by the given mutex, using the default
  * mutex-based callbacks.
  */
 void
 knlist_init_mtx(struct knlist *knl, struct mtx *lock)
 {
 	void *lockarg = lock;
 
 	knlist_init(knl, lockarg, NULL, NULL, NULL, NULL);
 }
 
 /*
  * Allocate a knlist with M_WAITOK and initialize it for use with the
  * given mutex.  Returns the new list.
  */
 struct knlist *
 knlist_alloc(struct mtx *lock)
 {
 	struct knlist *newlist;
 
 	newlist = malloc(sizeof(*newlist), M_KQUEUE, M_WAITOK);
 	knlist_init_mtx(newlist, lock);
 
 	return (newlist);
 }
 
 /*
  * Initialize a knlist protected by an rwlock; the list callbacks take and
  * drop the lock in read mode.
  */
 void
 knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
 {
 	void *lockarg = lock;
 
 	knlist_init(knl, lockarg, knlist_rw_rlock, knlist_rw_runlock,
 	    knlist_rw_assert_locked, knlist_rw_assert_unlocked);
 }
 
 /*
  * Tear down a knlist.  There is nothing to release; the function only
  * asserts that no knotes remain on the list.
  */
 void
 knlist_destroy(struct knlist *knl)
 {
 	KASSERT(KNLIST_EMPTY(knl),
 	    ("destroying knlist %p with knotes on it", knl));
 }
 
 /*
  * Mark a knlist for automatic destruction.  If it is already empty it is
  * destroyed and freed right away.  The knlist lock must be held.
  */
 void
 knlist_detach(struct knlist *knl)
 {
 
 	KNL_ASSERT_LOCKED(knl);
 	knl->kl_autodestroy = 1;
 
 	/* Knotes are still attached: leave the list alone for now. */
 	if (!knlist_empty(knl))
 		return;
 
 	knlist_destroy(knl);
 	free(knl, M_KQUEUE);
 }
 
 /*
  * Remove every knote from knl.
  *
  * islocked tells whether the caller holds the knlist lock; if not, it is
  * taken here and released before returning.  With killkn non-zero each
  * knote is dropped entirely; otherwise it is only detached from the list
  * and flagged EV_EOF | EV_ONESHOT.
  *
  * Even if we are locked, we may need to drop the lock to allow any influx
  * knotes time to "settle".
  */
 void
 knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
 {
 	struct knote *kn, *kn2;
 	struct kqueue *kq;
 
 	KASSERT(!knl->kl_autodestroy, ("cleardel for autodestroy %p", knl));
 	if (islocked)
 		KNL_ASSERT_LOCKED(knl);
 	else {
 		KNL_ASSERT_UNLOCKED(knl);
 again:		/* need to reacquire lock since we have dropped it */
 		knl->kl_lock(knl->kl_lockarg);
 	}
 
 	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		/* Skip knotes in flux; they are waited for after the loop. */
 		if (kn_in_flux(kn)) {
 			KQ_UNLOCK(kq);
 			continue;
 		}
 		knlist_remove_kq(knl, kn, 1, 1);
 		if (killkn) {
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			knote_drop_detached(kn, td);
 		} else {
 			/* Make sure cleared knotes disappear soon */
 			kn->kn_flags |= EV_EOF | EV_ONESHOT;
 			KQ_UNLOCK(kq);
 		}
 		kq = NULL;
 	}
 
 	if (!SLIST_EMPTY(&knl->kl_list)) {
 		/* there are still in flux knotes remaining */
 		kn = SLIST_FIRST(&knl->kl_list);
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		KASSERT(kn_in_flux(kn), ("knote removed w/o list lock"));
 		/* Drop the list lock, sleep on the kq, then start over. */
 		knl->kl_unlock(knl->kl_lockarg);
 		kq->kq_state |= KQ_FLUXWAIT;
 		msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqkclr", 0);
 		kq = NULL;
 		goto again;
 	}
 
 	if (islocked)
 		KNL_ASSERT_LOCKED(knl);
 	else {
 		knl->kl_unlock(knl->kl_lockarg);
 		KNL_ASSERT_UNLOCKED(knl);
 	}
 }
 
 /*
  * Remove all knotes referencing the specified fd from every kqueue on the
  * calling process's fd_kqlist.  Must be called with the FILEDESC lock
  * held exclusively (asserted).  This prevents a race where a new fd comes
  * along and occupies the entry and we attach a knote to the fd.
  */
 void
 knote_fdclose(struct thread *td, int fd)
 {
 	struct filedesc *fdp = td->td_proc->p_fd;
 	struct kqueue *kq;
 	struct knote *kn;
 	int influx;
 
 	FILEDESC_XLOCK_ASSERT(fdp);
 
 	/*
 	 * We shouldn't have to worry about new kevents appearing on fd
 	 * since filedesc is locked.
 	 */
 	TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
 		KQ_LOCK(kq);
 
 again:
 		influx = 0;
 		/* Only kqueues whose list array covers fd can hold such knotes. */
 		while (kq->kq_knlistsize > fd &&
 		    (kn = SLIST_FIRST(&kq->kq_knlist[fd])) != NULL) {
 			if (kn_in_flux(kn)) {
 				/* someone else might be waiting on our knote */
 				if (influx)
 					wakeup(kq);
 				kq->kq_state |= KQ_FLUXWAIT;
 				msleep(kq, &kq->kq_lock, PSOCK, "kqflxwt", 0);
 				goto again;
 			}
 			kn_enter_flux(kn);
 			KQ_UNLOCK(kq);
 			influx = 1;
 			knote_drop(kn, td);
 			KQ_LOCK(kq);
 		}
 		KQ_UNLOCK_FLUX(kq);
 	}
 }
 
 /*
  * Link a knote into the kqueue's lookup structure: the per-descriptor
  * list for fd-based filters, the hash table for everything else.
  *
  * The kq lock must be held and the knote must be in flux.  Returns EBADF
  * when the kqueue is closing, ENOMEM when the required table is missing
  * or too small for the knote's id, and 0 on success.
  */
 static int
 knote_attach(struct knote *kn, struct kqueue *kq)
 {
 	struct klist *bucket;
 
 	KASSERT(kn_in_flux(kn), ("knote %p not marked influx", kn));
 	KQ_OWNED(kq);
 
 	if ((kq->kq_state & KQ_CLOSING) != 0)
 		return (EBADF);
 
 	if (!kn->kn_fop->f_isfd) {
 		if (kq->kq_knhash == NULL)
 			return (ENOMEM);
 		bucket = &kq->kq_knhash[KN_HASH(kn->kn_id,
 		    kq->kq_knhashmask)];
 	} else {
 		if (kn->kn_id >= kq->kq_knlistsize)
 			return (ENOMEM);
 		bucket = &kq->kq_knlist[kn->kn_id];
 	}
 
 	SLIST_INSERT_HEAD(bucket, kn, kn_link);
 	return (0);
 }
 
 /*
  * Drop a knote: let the filter detach it first if it is not already
  * marked KN_DETACHED, then finish through knote_drop_detached().
  */
 static void
 knote_drop(struct knote *kn, struct thread *td)
 {
 	int attached;
 
 	attached = (kn->kn_status & KN_DETACHED) == 0;
 	if (attached)
 		kn->kn_fop->f_detach(kn);
 
 	knote_drop_detached(kn, td);
 }
 
 /*
  * Finish destroying a knote that is already detached from its knlist:
  * unlink it from the kqueue's lookup structure and pending queue, release
  * the file and filter references it carries, and free it.
  *
  * The kq lock must not be held on entry, and the caller's flux reference
  * must be the only one.
  */
 static void
 knote_drop_detached(struct knote *kn, struct thread *td)
 {
 	struct kqueue *kq;
 	struct klist *bucket;
 
 	kq = kn->kn_kq;
 
 	KASSERT((kn->kn_status & KN_DETACHED) != 0,
 	    ("knote %p still attached", kn));
 	KQ_NOTOWNED(kq);
 
 	KQ_LOCK(kq);
 	KASSERT(kn->kn_influx == 1,
 	    ("knote_drop called on %p with influx %d", kn, kn->kn_influx));
 
 	bucket = kn->kn_fop->f_isfd ? &kq->kq_knlist[kn->kn_id] :
 	    &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
 
 	if (!SLIST_EMPTY(bucket))
 		SLIST_REMOVE(bucket, kn, knote, kn_link);
 	if ((kn->kn_status & KN_QUEUED) != 0)
 		knote_dequeue(kn);
 	KQ_UNLOCK_FLUX(kq);
 
 	/* fd-based knotes hold a file reference. */
 	if (kn->kn_fop->f_isfd) {
 		fdrop(kn->kn_fp, td);
 		kn->kn_fp = NULL;
 	}
 	kqueue_fo_release(kn->kn_kevent.filter);
 	kn->kn_fop = NULL;
 	knote_free(kn);
 }
 
 /*
  * Append a knote to its kqueue's pending queue, mark it KN_QUEUED, bump
  * the pending count and wake up waiters.  The kq lock must be held and
  * the knote must not be queued already.
  */
 static void
 knote_enqueue(struct knote *kn)
 {
 	struct kqueue *kq;
 
 	kq = kn->kn_kq;
 	KQ_OWNED(kq);
 	KASSERT((kn->kn_status & KN_QUEUED) == 0, ("knote already queued"));
 
 	TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
 	kq->kq_count++;
 	kn->kn_status |= KN_QUEUED;
 
 	kqueue_wakeup(kq);
 }
 
 /*
  * Take a knote off its kqueue's pending queue, clear KN_QUEUED and lower
  * the pending count.  The kq lock must be held and the knote must be
  * queued.
  */
 static void
 knote_dequeue(struct knote *kn)
 {
 	struct kqueue *kq;
 
 	kq = kn->kn_kq;
 	KQ_OWNED(kq);
 	KASSERT(kn->kn_status & KN_QUEUED, ("knote not queued"));
 
 	TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
 	kq->kq_count--;
 	kn->kn_status &= ~KN_QUEUED;
 }
 
 /*
  * Create the UMA zone that knote_alloc() and knote_free() operate on.
  * Registered through SYSINIT below.
  */
 static void
 knote_init(void)
 {
 	knote_zone = uma_zcreate("KNOTE", sizeof(struct knote),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 }
 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
 
 /*
  * Allocate a zero-filled knote from knote_zone.  mflag is passed on to
  * uma_zalloc() with M_ZERO added.
  */
 static struct knote *
 knote_alloc(int mflag)
 {
 	struct knote *kn;
 
 	kn = uma_zalloc(knote_zone, mflag | M_ZERO);
 	return (kn);
 }
 
 /* Return a knote to knote_zone. */
 static void
 knote_free(struct knote *kn)
 {
 	uma_zfree(knote_zone, kn);
 }
 
 /*
  * Register the kev w/ the kq specified by fd.
  */
 int 
 kqfd_register(int fd, struct kevent *kev, struct thread *td, int mflag)
 {
 	struct kqueue *kq;
 	struct file *fp;
 	cap_rights_t rights;
 	int error;
 
 	error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
 	if (error != 0)
 		return (error);
 	if ((error = kqueue_acquire(fp, &kq)) != 0)
 		goto noacquire;
 
 	error = kqueue_register(kq, kev, td, mflag);
 	kqueue_release(kq, 0);
 
 noacquire:
 	fdrop(fp, td);
 	return (error);
 }
Index: projects/fuse2/sys/kern/kern_sendfile.c
===================================================================
--- projects/fuse2/sys/kern/kern_sendfile.c	(revision 350434)
+++ projects/fuse2/sys/kern/kern_sendfile.c	(revision 350435)
@@ -1,1194 +1,1191 @@
 /*-
  * Copyright (c) 2013-2015 Gleb Smirnoff <glebius@FreeBSD.org>
  * Copyright (c) 1998, David Greenman. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <netinet/in.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
-#include <sys/sdt.h>
 #include <sys/sf_buf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
 #include <net/vnet.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 #define	EXT_FLAG_SYNC		EXT_FLAG_VENDOR1
 #define	EXT_FLAG_NOCACHE	EXT_FLAG_VENDOR2
 #define	EXT_FLAG_CACHE_LAST	EXT_FLAG_VENDOR3
 
-SDT_PROVIDER_DECLARE(vfs);
-
 /*
  * Structure describing a single sendfile(2) I/O, which may consist of
  * several underlying pager I/Os.
  *
  * The syscall context allocates the structure and initializes 'nios'
  * to 1.  As sendfile_swapin() runs through pages and starts asynchronous
  * paging operations, it increments 'nios'.
  *
  * Every I/O completion calls sendfile_iodone(), which decrements 'nios',
  * and the syscall also calls sendfile_iodone() after allocating all mbufs,
  * linking them and sending them to the socket.  Whoever brings 'nios' to
  * zero is responsible for calling pru_ready on the socket, to notify it
  * of the readiness of the data.
  */
 struct sf_io {
 	volatile u_int	nios;		/* outstanding I/Os; refcount(9) */
 	u_int		error;		/* set when a completion reports an error */
 	int		npages;		/* page count handed to pru_ready */
 	struct socket	*so;		/* socket the data was queued on */
 	struct mbuf	*m;		/* first mbuf built for this I/O */
 	vm_page_t	pa[];		/* pages of this I/O, sized by allocator */
 };
 
 /*
  * Structure used to track requests with SF_SYNC flag.
  *
  * vn_sendfile() bumps 'count' for every mbuf it tags with EXT_FLAG_SYNC;
  * the mbuf free routines decrement it and signal 'cv' when it reaches
  * zero, which is what vn_sendfile() waits for before returning.
  */
 struct sendfile_sync {
 	struct mtx	mtx;		/* protects 'count' */
 	struct cv	cv;		/* signalled when 'count' drops to 0 */
 	unsigned	count;		/* mbufs still referencing this request */
 };
 
 /* One counter(9) per uint64_t-sized slot of struct sfstat. */
 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
 
 /*
  * Allocate the sendfile statistics counters.  Run via the SYSINIT below;
  * the argument is unused.
  */
 static void
 sfstat_init(const void *unused)
 {
 
 	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
 	    M_WAITOK);
 }
 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
 
 static int
 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct sfstat s;
 
 	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
 	if (req->newptr)
 		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
 	return (SYSCTL_OUT(req, &s, sizeof(s)));
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat, CTLTYPE_OPAQUE | CTLFLAG_RW,
     NULL, 0, sfstat_sysctl, "I", "sendfile statistics");
 
 /*
  * Detach mapped page and release resources back to the system.  Called
  * by mbuf(9) code when last reference to a page is freed.
  *
  * pg		page that was wired for sendfile; unwired here under the
  *		page lock.
  * nocache	true if the caller asked for the page not to be cached
  *		(EXT_FLAG_NOCACHE on the mbuf).
  */
 static void
 sendfile_free_page(vm_page_t pg, bool nocache)
 {
 	bool freed;
 
 	vm_page_lock(pg);
 	/*
 	 * In either case check for the object going away on us.  This can
 	 * happen since we don't hold a reference to it.  If so, we're
 	 * responsible for freeing the page.  In 'nocache' case try to free
 	 * the page, but only if it is cheap to.
 	 */
 	/* NOTE(review): presumably true only when the last wiring is dropped. */
 	if (vm_page_unwire_noq(pg)) {
 		vm_object_t obj;
 
 		if ((obj = pg->object) == NULL)
 			vm_page_free(pg);
 		else {
 			freed = false;
 			/*
 			 * "Cheap" means: the page is not exclusively busied
 			 * and the object lock can be taken without sleeping.
 			 */
 			if (nocache && !vm_page_xbusied(pg) &&
 			    VM_OBJECT_TRYWLOCK(obj)) {
 				/* Only free unmapped pages. */
 				if (obj->ref_count == 0 ||
 				    !pmap_page_is_mapped(pg))
 					/*
 					 * The busy test before the object is
 					 * locked cannot be relied upon.
 					 */
 					freed = vm_page_try_to_free(pg);
 				VM_OBJECT_WUNLOCK(obj);
 			}
 			if (!freed) {
 				/*
 				 * If we were asked to not cache the page, place
 				 * it near the head of the inactive queue so
 				 * that it is reclaimed sooner.  Otherwise,
 				 * maintain LRU.
 				 */
 				if (nocache)
 					vm_page_deactivate_noreuse(pg);
 				else if (vm_page_active(pg))
 					vm_page_reference(pg);
 				else
 					vm_page_deactivate(pg);
 			}
 		}
 	}
 	vm_page_unlock(pg);
 }
 
 /*
  * External-storage free routine for EXT_SFBUF mbufs created by
  * vn_sendfile().  Releases the sf_buf, hands the underlying page to
  * sendfile_free_page() and, for SF_SYNC requests, drops this mbuf's
  * count on the sendfile_sync, signalling the waiter when it hits zero.
  */
 static void
 sendfile_free_mext(struct mbuf *m)
 {
 	struct sendfile_sync *sync;
 	struct sf_buf *sfb;
 	vm_page_t page;
 	bool nocache;
 
 	KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_SFBUF,
 	    ("%s: m %p !M_EXT or !EXT_SFBUF", __func__, m));
 
 	sfb = m->m_ext.ext_arg1;
 	page = sf_buf_page(sfb);
 	nocache = (m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0;
 
 	sf_buf_free(sfb);
 	sendfile_free_page(page, nocache);
 
 	/* Nothing more to do unless this mbuf belongs to an SF_SYNC call. */
 	if ((m->m_ext.ext_flags & EXT_FLAG_SYNC) == 0)
 		return;
 
 	sync = m->m_ext.ext_arg2;
 	mtx_lock(&sync->mtx);
 	KASSERT(sync->count > 0, ("Sendfile sync botchup count == 0"));
 	sync->count--;
 	if (sync->count == 0)
 		cv_signal(&sync->cv);
 	mtx_unlock(&sync->mtx);
 }
 
 /*
  * External-storage free routine for EXT_PGS (unmapped) mbufs created by
  * vn_sendfile().  Every page referenced by the mbuf is passed to
  * sendfile_free_page(); with EXT_FLAG_CACHE_LAST set, the final page is
  * released as cacheable even if EXT_FLAG_NOCACHE is set.  For SF_SYNC
  * requests the sendfile_sync count is dropped afterwards.
  */
 static void
 sendfile_free_mext_pg(struct mbuf *m)
 {
 	struct sendfile_sync *sync;
 	struct mbuf_ext_pgs *pgs;
 	vm_page_t page;
 	int idx;
 	bool nocache, keep_last;
 
 	KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_PGS,
 	    ("%s: m %p !M_EXT or !EXT_PGS", __func__, m));
 
 	pgs = m->m_ext.ext_pgs;
 	keep_last = (m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST) != 0;
 	nocache = (m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0;
 
 	idx = 0;
 	while (idx < pgs->npgs) {
 		/* The last page may be exempted from the nocache policy. */
 		if (keep_last && idx == pgs->npgs - 1)
 			nocache = false;
 		page = PHYS_TO_VM_PAGE(pgs->pa[idx]);
 		sendfile_free_page(page, nocache);
 		idx++;
 	}
 
 	if ((m->m_ext.ext_flags & EXT_FLAG_SYNC) == 0)
 		return;
 
 	sync = m->m_ext.ext_arg2;
 	mtx_lock(&sync->mtx);
 	KASSERT(sync->count > 0, ("Sendfile sync botchup count == 0"));
 	sync->count--;
 	if (sync->count == 0)
 		cv_signal(&sync->cv);
 	mtx_unlock(&sync->mtx);
 }
 
 /*
  * Number of bytes of the transfer [off, off + len) that land in page
  * 'i' of 'n'.  The first page holds whatever fits between 'off' and the
  * next page boundary (capped at 'len'); the last page holds the partial
  * tail, if the transfer does not end on a page boundary; every other
  * page is full.
  */
 static inline off_t
 xfsize(int i, int n, off_t off, off_t len)
 {
 	off_t tail;
 
 	if (i == 0)
 		return (omin(PAGE_SIZE - (off & PAGE_MASK), len));
 
 	if (i == n - 1) {
 		tail = (off + len) & PAGE_MASK;
 		if (tail > 0)
 			return (tail);
 	}
 
 	return (PAGE_SIZE);
 }
 
 /*
  * Helper function to get offset within object for i page.
  */
 static inline vm_ooffset_t
 vmoff(int i, off_t off)
 {
 
 	if (i == 0)
 		return ((vm_ooffset_t)off);
 
 	return (trunc_page(off + i * PAGE_SIZE));
 }
 
 /*
  * Helper function used when allocation of a page or sf_buf failed.
  * Pretend as if we don't have enough space, subtract xfsize() of
  * all pages that failed.
  *
  * old		page count the request was planned for
  * new		page count that is actually usable (index of the first
  *		failed page); must be smaller than 'old'
  * off		file offset of the start of the request
  * space	in/out: byte count of the request, reduced in place
  */
 static inline void
 fixspace(int old, int new, off_t off, int *space)
 {
 
 	KASSERT(old > new, ("%s: old %d new %d", __func__, old, new));
 
 	/* Subtract last one. */
 	*space -= xfsize(old - 1, old, off, *space);
 	old--;
 
 	if (new == old)
 		/* There was only one page. */
 		return;
 
 	/* Subtract first one. */
 	if (new == 0) {
 		*space -= xfsize(0, old, off, *space);
 		new++;
 	}
 
 	/* Rest of pages are full sized. */
 	*space -= (old - new) * PAGE_SIZE;
 
 	KASSERT(*space >= 0, ("%s: space went backwards", __func__));
 }
 
 /*
  * I/O completion callback.
  *
  * arg		the struct sf_io of the request
  * pg, count	pages of the completed pager I/O; each one that is not
  *		bogus_page is unbusied here.  vn_sendfile() also calls
  *		this with (NULL, 0) to drop its own initial reference.
  * error	non-zero if the I/O failed; recorded in sfio->error
  *
  * Only the call that releases the last reference on sfio->nios goes on
  * to notify (or abort) the socket, drop the socket reference and free
  * the sf_io.
  */
 static void
 sendfile_iodone(void *arg, vm_page_t *pg, int count, int error)
 {
 	struct sf_io *sfio = arg;
 	struct socket *so = sfio->so;
 
 	for (int i = 0; i < count; i++)
 		if (pg[i] != bogus_page)
 			vm_page_xunbusy(pg[i]);
 
 	if (error)
 		sfio->error = error;
 
 	/* Not the last reference: other I/Os (or the syscall) are pending. */
 	if (!refcount_release(&sfio->nios))
 		return;
 
 	CURVNET_SET(so->so_vnet);
 	if (sfio->error) {
 		/*
 		 * I/O operation failed.  The state of data in the socket
 		 * is now inconsistent, and all what we can do is to tear
 		 * it down. Protocol abort method would tear down protocol
 		 * state, free all ready mbufs and detach not ready ones.
 		 * We will free the mbufs corresponding to this I/O manually.
 		 *
 		 * The socket would be marked with EIO and made available
 		 * for read, so that application receives EIO on next
 		 * syscall and eventually closes the socket.
 		 */
 		so->so_proto->pr_usrreqs->pru_abort(so);
 		so->so_error = EIO;
 
 		mb_free_notready(sfio->m, sfio->npages);
 	} else
 		(void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
 		    sfio->npages);
 
 	/*
 	 * Drop the reference vn_sendfile() took with soref().
 	 * NOTE(review): sorele() is entered with the socket lock held and
 	 * presumably releases it -- confirm against socketvar.h.
 	 */
 	SOCK_LOCK(so);
 	sorele(so);
 	CURVNET_RESTORE();
 	free(sfio, M_TEMP);
 }
 
-SDT_PROBE_DEFINE1(vfs, sendfile, swapin, pager_error, "int");
 /*
  * Iterate through pages vector and request paging for non-valid pages.
  *
  * Grabs and wires 'npages' pages of 'obj' starting at file offset 'off'
  * into sfio->pa, then walks them: pages that are already valid are
  * unbusied, pages the pager does not have are zero-filled and marked
  * valid, and runs of invalid pages are passed to
  * vm_pager_get_pages_async() with sendfile_iodone() as the completion
  * callback.  A reference on sfio->nios is taken before each pager
  * request, and '*nios' counts the requests that were started.
  *
  * With SF_NODISKIO in 'flags' the grab uses VM_ALLOC_NOWAIT and may come
  * up short; the ungrabbed tail of sfio->pa is then set to NULL and
  * readahead is disabled.  'rhpages' is the readahead allowance, used
  * only for the request that covers the last page.
  *
  * Returns 0 on success, or EIO if the pager rejects a request.
  */
 static int
 sendfile_swapin(vm_object_t obj, struct sf_io *sfio, int *nios, off_t off,
     off_t len, int npages, int rhpages, int flags)
 {
 	vm_page_t *pa = sfio->pa;
 	int grabbed;
 
 	*nios = 0;
 	flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0;
 
 	/*
 	 * First grab all the pages and wire them.  Note that we grab
 	 * only required pages.  Readahead pages are dealt with later.
 	 */
 	VM_OBJECT_WLOCK(obj);
 
 	grabbed = vm_page_grab_pages(obj, OFF_TO_IDX(off),
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | flags, pa, npages);
 	if (grabbed < npages) {
 		for (int i = grabbed; i < npages; i++)
 			pa[i] = NULL;
 		npages = grabbed;
 		rhpages = 0;
 	}
 
 	for (int i = 0; i < npages;) {
 		int j, a, count, rv;
 
 		/* Skip valid pages. */
 		if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK,
 		    xfsize(i, npages, off, len))) {
 			vm_page_xunbusy(pa[i]);
 			SFSTAT_INC(sf_pages_valid);
 			i++;
 			continue;
 		}
 
 		/*
 		 * Next page is invalid.  Check if it belongs to pager.  It
 		 * may not be there, which is a regular situation for shmem
 		 * pager.  For vnode pager this happens only in case of
 		 * a sparse file.
 		 *
 		 * Important feature of vm_pager_has_page() is the hint
 		 * stored in 'a', about how many pages we can pagein after
 		 * this page in a single I/O.
 		 */
 		if (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)), NULL,
 		    &a)) {
 			pmap_zero_page(pa[i]);
 			pa[i]->valid = VM_PAGE_BITS_ALL;
 			MPASS(pa[i]->dirty == 0);
 			vm_page_xunbusy(pa[i]);
 			i++;
 			continue;
 		}
 
 		/*
 		 * We want to pagein as many pages as possible, limited only
 		 * by the 'a' hint and actual request.
 		 */
 		count = min(a + 1, npages - i);
 
 		/*
 		 * We should not pagein into a valid page, thus we first trim
 		 * any valid pages off the end of request, and substitute
 		 * to bogus_page those, that are in the middle.
 		 */
 		for (j = i + count - 1; j > i; j--) {
 			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
 			    xfsize(j, npages, off, len))) {
 				count--;
 				rhpages = 0;
 			} else
 				break;
 		}
 		for (j = i + 1; j < i + count - 1; j++)
 			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
 			    xfsize(j, npages, off, len))) {
 				vm_page_xunbusy(pa[j]);
 				SFSTAT_INC(sf_pages_valid);
 				SFSTAT_INC(sf_pages_bogus);
 				pa[j] = bogus_page;
 			}
 
 		refcount_acquire(&sfio->nios);
 		rv = vm_pager_get_pages_async(obj, pa + i, count, NULL,
 		    i + count == npages ? &rhpages : NULL,
 		    &sendfile_iodone, sfio);
 		if (rv != VM_PAGER_OK) {
-			SDT_PROBE1(vfs, sendfile, swapin, pager_error, rv);
-			for (j = 0; j < count; j++) {
-				vm_page_lock(*(pa + i + j));
-				vm_page_unwire(*(pa + i + j), PQ_INACTIVE);
-				vm_page_unlock(*(pa + i + j));
+			for (j = i; j < i + count; j++) {
+				if (pa[j] != bogus_page) {
+					vm_page_lock(pa[j]);
+					vm_page_unwire(pa[j], PQ_INACTIVE);
+					vm_page_unlock(pa[j]);
+				}
 			}
 			VM_OBJECT_WUNLOCK(obj);
-			return EIO;
+			return (EIO);
 		}
 		KASSERT(rv == VM_PAGER_OK, ("%s: pager fail obj %p page %p",
 		    __func__, obj, pa[i]));
 
 		SFSTAT_INC(sf_iocnt);
 		SFSTAT_ADD(sf_pages_read, count);
 		if (i + count == npages)
 			SFSTAT_ADD(sf_rhpages_read, rhpages);
 
 		/*
 		 * Restore the valid page pointers.  They are already
 		 * unbusied, but still wired.
 		 */
 		for (j = i; j < i + count; j++)
 			if (pa[j] == bogus_page) {
 				pa[j] = vm_page_lookup(obj,
 				    OFF_TO_IDX(vmoff(j, off)));
 				KASSERT(pa[j], ("%s: page %p[%d] disappeared",
 				    __func__, pa, j));
 
 			}
 		i += count;
 		(*nios)++;
 	}
 
 	VM_OBJECT_WUNLOCK(obj);
 
 	if (*nios == 0 && npages != 0)
 		SFSTAT_INC(sf_noiocnt);
 
 	return (0);
 }
 
 /*
  * Resolve the file 'fp' to the VM object that sendfile will read from.
  *
  * For a vnode-backed file the vnode must be a regular file with a VM
  * object; for a POSIX shm descriptor the shm object is used.  Any other
  * descriptor type yields EINVAL, and a dead object yields EBADF.
  *
  * On success a reference is taken on the object and *obj_res, *vp_res
  * (NULL for shm), *shmfd_res (NULL for vnodes) and *obj_size are filled
  * in.  *bsize receives the mount's f_iosize for vnodes and 0 otherwise.
  * The vnode, if any, is locked only for the duration of this function.
  */
 static int
 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
     struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
     int *bsize)
 {
 	struct vattr va;
 	struct shmfd *shmfd;
 	struct vnode *vp;
 	vm_object_t obj;
 	int error;
 
 	vp = NULL;
 	shmfd = NULL;
 	obj = NULL;
 	*vp_res = NULL;
 	*shmfd_res = NULL;
 	*bsize = 0;
 
 	/*
 	 * The file descriptor must be a regular file and have a
 	 * backing VM object.
 	 */
 	switch (fp->f_type) {
 	case DTYPE_VNODE:
 		vp = fp->f_vnode;
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		if (vp->v_type != VREG) {
 			error = EINVAL;
 			goto out;
 		}
 		*bsize = vp->v_mount->mnt_stat.f_iosize;
 		error = VOP_GETATTR(vp, &va, td->td_ucred);
 		if (error != 0)
 			goto out;
 		*obj_size = va.va_size;
 		obj = vp->v_object;
 		if (obj == NULL) {
 			error = EINVAL;
 			goto out;
 		}
 		break;
 	case DTYPE_SHM:
 		error = 0;
 		shmfd = fp->f_data;
 		obj = shmfd->shm_object;
 		*obj_size = shmfd->shm_size;
 		break;
 	default:
 		error = EINVAL;
 		goto out;
 	}
 
 	VM_OBJECT_WLOCK(obj);
 	if ((obj->flags & OBJ_DEAD) != 0) {
 		VM_OBJECT_WUNLOCK(obj);
 		error = EBADF;
 		goto out;
 	}
 
 	/*
 	 * Hold an extra reference on the backing VM object while sendfile
 	 * runs, so that a forced reclamation of its vnode does not
 	 * immediately destroy it.
 	 */
 	vm_object_reference_locked(obj);
 	VM_OBJECT_WUNLOCK(obj);
 
 	*obj_res = obj;
 	*vp_res = vp;
 	*shmfd_res = shmfd;
 
 out:
 	if (vp != NULL)
 		VOP_UNLOCK(vp, 0);
 	return (error);
 }
 
 static int
 sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
     struct socket **so)
 {
 	int error;
 
 	*sock_fp = NULL;
 	*so = NULL;
 
 	/*
 	 * The socket must be a stream socket and connected.
 	 */
 	error = getsock_cap(td, s, &cap_send_rights,
 	    sock_fp, NULL, NULL);
 	if (error != 0)
 		return (error);
 	*so = (*sock_fp)->f_data;
 	if ((*so)->so_type != SOCK_STREAM)
 		return (EINVAL);
 	if (SOLISTENING(*so))
 		return (ENOTCONN);
 	return (0);
 }
 
 /*
  * Send the contents of the file or shared memory object behind 'fp' to
  * the stream socket given by descriptor 'sockfd'.
  *
  * hdr_uio	optional header data sent before the file data, or NULL
  * trl_uio	optional trailer data sent after it (via kern_writev()),
  *		or NULL
  * offset	starting offset within the object
  * nbytes	number of bytes to send; 0 means up to the end of object
  * sent		if not NULL, receives the number of bytes sent
  * flags	SF_* flags: SF_SYNC, SF_NODISKIO, SF_NOCACHE,
  *		SF_USER_READAHEAD and the SF_READAHEAD() page count
  * td		calling thread
  *
  * Returns 0 or an errno value; ERESTART is reported as EINTR.
  */
 int
 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 	struct file *sock_fp;
 	struct vnode *vp;
 	struct vm_object *obj;
 	struct socket *so;
 	struct mbuf_ext_pgs *ext_pgs;
 	struct mbuf *m, *mh, *mhtail;
 	struct sf_buf *sf;
 	struct shmfd *shmfd;
 	struct sendfile_sync *sfs;
 	struct vattr va;
 	off_t off, sbytes, rem, obj_size;
 	int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr;
 	bool use_ext_pgs;
 
 	obj = NULL;
 	so = NULL;
 	m = mh = NULL;
 	sfs = NULL;
 	hdrlen = sbytes = 0;
 	softerr = 0;
 	use_ext_pgs = false;
 
 	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
 	if (error != 0)
 		return (error);
 
 	error = sendfile_getsock(td, sockfd, &sock_fp, &so);
 	if (error != 0)
 		goto out;
 
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	SFSTAT_INC(sf_syscalls);
 	SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags));
 
 	if (flags & SF_SYNC) {
 		sfs = malloc(sizeof *sfs, M_TEMP, M_WAITOK | M_ZERO);
 		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
 		cv_init(&sfs->cv, "sendfile");
 	}
 
 	rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset;
 
 	/*
 	 * Protect against multiple writers to the socket.
 	 *
 	 * XXXRW: Historically this has assumed non-interruptibility, so now
 	 * we implement that, but possibly shouldn't.
 	 */
 	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
 
 	/*
 	 * Loop through the pages of the file, starting with the requested
 	 * offset. Get a file page (do I/O if necessary), map the file page
 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
 	 * it on the socket.
 	 * This is done in two loops.  The inner loop turns as many pages
 	 * as it can, up to available socket buffer space, without blocking
 	 * into mbufs to have it bulk delivered into the socket send buffer.
 	 * The outer loop checks the state and available space of the socket
 	 * and takes care of the overall progress.
 	 */
 	for (off = offset; rem > 0; ) {
 		struct sf_io *sfio;
 		vm_page_t *pa;
 		struct mbuf *mtail;
 		int nios, space, npages, rhpages;
 
 		mtail = NULL;
 		/*
 		 * Check the socket state for ongoing connection,
 		 * no errors and space in socket buffer.
 		 * If space is low allow for the remainder of the
 		 * file to be processed if it fits the socket buffer.
 		 * Otherwise block in waiting for sufficient space
 		 * to proceed, or if the socket is nonblocking, return
 		 * to userland with EAGAIN while reporting how far
 		 * we've come.
 		 * We wait until the socket buffer has significant free
 		 * space to do bulk sends.  This makes good use of file
 		 * system read ahead and allows packet segmentation
 		 * offloading hardware to take over lots of work.  If
 		 * we were not careful here we would send off only one
 		 * sfbuf at a time.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
 retry_space:
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			error = EPIPE;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		} else if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		}
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOTCONN;
 			goto done;
 		}
 
 		space = sbspace(&so->so_snd);
 		if (space < rem &&
 		    (space <= 0 ||
 		     space < so->so_snd.sb_lowat)) {
 			if (so->so_state & SS_NBIO) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EAGAIN;
 				goto done;
 			}
 			/*
 			 * sbwait drops the lock while sleeping.
 			 * When we loop back to retry_space the
 			 * state may have changed and we retest
 			 * for it.
 			 */
 			error = sbwait(&so->so_snd);
 			/*
 			 * An error from sbwait usually indicates that we've
 			 * been interrupted by a signal. If we've sent anything
 			 * then return bytes sent, otherwise return the error.
 			 */
 			if (error != 0) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto done;
 			}
 			goto retry_space;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 
 		/*
 		 * At the beginning of the first loop check if any headers
 		 * are specified and copy them into mbufs.  Reduce space in
 		 * the socket buffer by the size of the header mbuf chain.
 		 * Clear hdr_uio here and hdrlen at the end of the first loop.
 		 */
 		if (hdr_uio != NULL && hdr_uio->uio_resid > 0) {
 			hdr_uio->uio_td = td;
 			hdr_uio->uio_rw = UIO_WRITE;
 			mh = m_uiotombuf(hdr_uio, M_WAITOK, space, 0, 0);
 			hdrlen = m_length(mh, &mhtail);
 			space -= hdrlen;
 			/*
 			 * If header consumed all the socket buffer space,
 			 * don't waste CPU cycles and jump to the end.
 			 */
 			if (space == 0) {
 				sfio = NULL;
 				nios = 0;
 				goto prepend_header;
 			}
 			hdr_uio = NULL;
 		}
 
 		/*
 		 * Re-read the file size under the vnode lock: the file may
 		 * have changed since the previous iteration, in which case
 		 * 'rem' is recomputed.
 		 */
 		if (vp != NULL) {
 			error = vn_lock(vp, LK_SHARED);
 			if (error != 0)
 				goto done;
 			error = VOP_GETATTR(vp, &va, td->td_ucred);
 			if (error != 0 || off >= va.va_size) {
 				VOP_UNLOCK(vp, 0);
 				goto done;
 			}
 			if (va.va_size != obj_size) {
 				obj_size = va.va_size;
 				rem = nbytes ?
 				    omin(nbytes + offset, obj_size) : obj_size;
 				rem -= off;
 			}
 		}
 
 		if (space > rem)
 			space = rem;
 		else if (space > PAGE_SIZE) {
 			/*
 			 * Use page boundaries when possible for large
 			 * requests.
 			 */
 			if (off & PAGE_MASK)
 				space -= (PAGE_SIZE - (off & PAGE_MASK));
 			space = trunc_page(space);
 			if (off & PAGE_MASK)
 				space += (PAGE_SIZE - (off & PAGE_MASK));
 		}
 
 		npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE);
 
 		/*
 		 * Calculate maximum allowed number of pages for readahead
 		 * at this iteration.  If SF_USER_READAHEAD was set, we don't
 		 * do any heuristics and use exactly the value supplied by
 		 * application.  Otherwise, we allow readahead up to "rem".
 		 * If application wants more, let it be, but there is no
 		 * reason to go above MAXPHYS.  Also check against "obj_size",
 		 * since vm_pager_has_page() can hint beyond EOF.
 		 */
 		if (flags & SF_USER_READAHEAD) {
 			rhpages = SF_READAHEAD(flags);
 		} else {
 			rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) -
 			    npages;
 			rhpages += SF_READAHEAD(flags);
 		}
 		rhpages = min(howmany(MAXPHYS, PAGE_SIZE), rhpages);
 		rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) -
 		    npages, rhpages);
 
 		sfio = malloc(sizeof(struct sf_io) +
 		    npages * sizeof(vm_page_t), M_TEMP, M_WAITOK);
 		refcount_init(&sfio->nios, 1);
 		sfio->so = so;
 		sfio->error = 0;
 
 		error = sendfile_swapin(obj, sfio, &nios, off, space, npages,
 		    rhpages, flags);
-		if (error) {
-			free(sfio, M_TEMP);
+		if (error != 0) {
 			if (vp != NULL)
 				VOP_UNLOCK(vp, 0);
+			free(sfio, M_TEMP);
 			goto done;
 		}
 
 		/*
 		 * Loop and construct maximum sized mbuf chain to be bulk
 		 * dumped into socket buffer.
 		 */
 		pa = sfio->pa;
 
 		/*
 		 * Use unmapped mbufs if enabled for TCP.  Unmapped
 		 * bufs are restricted to TCP as that is what has been
 		 * tested.  In particular, unmapped mbufs have not
 		 * been tested with UNIX-domain sockets.
 		 */
 		if (mb_use_ext_pgs &&
 		    so->so_proto->pr_protocol == IPPROTO_TCP) {
 			use_ext_pgs = true;
 			max_pgs = MBUF_PEXT_MAX_PGS;
 
 			/* Start at last index, to wrap on first use. */
 			ext_pgs_idx = max_pgs - 1;
 		}
 
 		for (int i = 0; i < npages; i++) {
 			struct mbuf *m0;
 
 			/*
 			 * If a page wasn't grabbed successfully, then
 			 * trim the array. Can happen only with SF_NODISKIO.
 			 */
 			if (pa[i] == NULL) {
 				SFSTAT_INC(sf_busy);
 				fixspace(npages, i, off, &space);
 				npages = i;
 				softerr = EBUSY;
 				break;
 			}
 
 			if (use_ext_pgs) {
 				off_t xfs;
 
 				ext_pgs_idx++;
 				if (ext_pgs_idx == max_pgs) {
 					m0 = mb_alloc_ext_pgs(M_WAITOK, false,
 					    sendfile_free_mext_pg);
 
 					if (flags & SF_NOCACHE) {
 						m0->m_ext.ext_flags |=
 						    EXT_FLAG_NOCACHE;
 
 						/*
 						 * See comment below regarding
 						 * ignoring SF_NOCACHE for the
 						 * last page.
 						 */
 						if ((npages - i <= max_pgs) &&
 						    ((off + space) & PAGE_MASK) &&
 						    (rem > space || rhpages > 0))
 							m0->m_ext.ext_flags |=
 							    EXT_FLAG_CACHE_LAST;
 					}
 					if (sfs != NULL) {
 						m0->m_ext.ext_flags |=
 						    EXT_FLAG_SYNC;
 						m0->m_ext.ext_arg2 = sfs;
 						mtx_lock(&sfs->mtx);
 						sfs->count++;
 						mtx_unlock(&sfs->mtx);
 					}
 					ext_pgs = m0->m_ext.ext_pgs;
 					if (i == 0)
 						sfio->m = m0;
 					ext_pgs_idx = 0;
 
 					/* Append to mbuf chain. */
 					if (mtail != NULL)
 						mtail->m_next = m0;
 					else
 						m = m0;
 					mtail = m0;
 					ext_pgs->first_pg_off =
 					    vmoff(i, off) & PAGE_MASK;
 				}
 				if (nios) {
 					mtail->m_flags |= M_NOTREADY;
 					ext_pgs->nrdy++;
 				}
 
 				ext_pgs->pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pa[i]);
 				ext_pgs->npgs++;
 				xfs = xfsize(i, npages, off, space);
 				ext_pgs->last_pg_len = xfs;
 				MBUF_EXT_PGS_ASSERT_SANITY(ext_pgs);
 				mtail->m_len += xfs;
 				mtail->m_ext.ext_size += PAGE_SIZE;
 				continue;
 			}
 
 			/*
 			 * Get a sendfile buf.  When allocating the
 			 * first buffer for mbuf chain, we usually
 			 * wait as long as necessary, but this wait
 			 * can be interrupted.  For subsequent
 			 * buffers, do not sleep, since several
 			 * threads might exhaust the buffers and then
 			 * deadlock.
 			 */
 			sf = sf_buf_alloc(pa[i],
 			    m != NULL ? SFB_NOWAIT : SFB_CATCH);
 			if (sf == NULL) {
 				SFSTAT_INC(sf_allocfail);
 				/*
 				 * NOTE(review): this walks up to npages without
 				 * checking for the NULL entries that a short
 				 * SF_NODISKIO grab leaves at the tail of pa[]
 				 * -- confirm that cannot be reached here.
 				 */
 				for (int j = i; j < npages; j++) {
 					vm_page_lock(pa[j]);
 					vm_page_unwire(pa[j], PQ_INACTIVE);
 					vm_page_unlock(pa[j]);
 				}
 				if (m == NULL)
 					softerr = ENOBUFS;
 				fixspace(npages, i, off, &space);
 				npages = i;
 				break;
 			}
 
 			m0 = m_get(M_WAITOK, MT_DATA);
 			m0->m_ext.ext_buf = (char *)sf_buf_kva(sf);
 			m0->m_ext.ext_size = PAGE_SIZE;
 			m0->m_ext.ext_arg1 = sf;
 			m0->m_ext.ext_type = EXT_SFBUF;
 			m0->m_ext.ext_flags = EXT_FLAG_EMBREF;
 			m0->m_ext.ext_free = sendfile_free_mext;
 			/*
 			 * SF_NOCACHE sets the page as being freed upon send.
 			 * However, we ignore it for the last page in 'space',
 			 * if the page is truncated, and we got more data to
 			 * send (rem > space), or if we have readahead
 			 * configured (rhpages > 0).
 			 */
 			if ((flags & SF_NOCACHE) &&
 			    (i != npages - 1 ||
 			    !((off + space) & PAGE_MASK) ||
 			    !(rem > space || rhpages > 0)))
 				m0->m_ext.ext_flags |= EXT_FLAG_NOCACHE;
 			if (sfs != NULL) {
 				m0->m_ext.ext_flags |= EXT_FLAG_SYNC;
 				m0->m_ext.ext_arg2 = sfs;
 				mtx_lock(&sfs->mtx);
 				sfs->count++;
 				mtx_unlock(&sfs->mtx);
 			}
 			m0->m_ext.ext_count = 1;
 			m0->m_flags |= (M_EXT | M_RDONLY);
 			if (nios)
 				m0->m_flags |= M_NOTREADY;
 			m0->m_data = (char *)sf_buf_kva(sf) +
 			    (vmoff(i, off) & PAGE_MASK);
 			m0->m_len = xfsize(i, npages, off, space);
 
 			if (i == 0)
 				sfio->m = m0;
 
 			/* Append to mbuf chain. */
 			if (mtail != NULL)
 				mtail->m_next = m0;
 			else
 				m = m0;
 			mtail = m0;
 		}
 
 		if (vp != NULL)
 			VOP_UNLOCK(vp, 0);
 
 		/* Keep track of bytes processed. */
 		off += space;
 		rem -= space;
 
 		/* Prepend header, if any. */
 		if (hdrlen) {
 prepend_header:
 			mhtail->m_next = m;
 			m = mh;
 			mh = NULL;
 		}
 
 		if (m == NULL) {
 			KASSERT(softerr, ("%s: m NULL, no error", __func__));
 			error = softerr;
 			free(sfio, M_TEMP);
 			goto done;
 		}
 
 		/* Add the buffer chain to the socket buffer. */
 		KASSERT(m_length(m, NULL) == space + hdrlen,
 		    ("%s: mlen %u space %d hdrlen %d",
 		    __func__, m_length(m, NULL), space, hdrlen));
 
 		CURVNET_SET(so->so_vnet);
 		if (nios == 0) {
 			/*
 			 * If sendfile_swapin() didn't initiate any I/Os,
 			 * which happens if all data is cached in VM, then
 			 * we can send data right now without the
 			 * PRUS_NOTREADY flag.
 			 */
 			free(sfio, M_TEMP);
 			error = (*so->so_proto->pr_usrreqs->pru_send)
 			    (so, 0, m, NULL, NULL, td);
 		} else {
 			/*
 			 * Pager I/O is in flight.  Queue the data as not
 			 * ready and drop our own reference on sfio; the
 			 * socket reference taken here is released by
 			 * sendfile_iodone().
 			 */
 			sfio->npages = npages;
 			soref(so);
 			error = (*so->so_proto->pr_usrreqs->pru_send)
 			    (so, PRUS_NOTREADY, m, NULL, NULL, td);
 			sendfile_iodone(sfio, NULL, 0, 0);
 		}
 		CURVNET_RESTORE();
 
 		m = NULL;	/* pru_send always consumes */
 		if (error)
 			goto done;
 		sbytes += space + hdrlen;
 		if (hdrlen)
 			hdrlen = 0;
 		if (softerr) {
 			error = softerr;
 			goto done;
 		}
 	}
 
 	/*
 	 * Send trailers. Wimp out and use writev(2).
 	 */
 	if (trl_uio != NULL) {
 		sbunlock(&so->so_snd);
 		error = kern_writev(td, sockfd, trl_uio);
 		if (error == 0)
 			sbytes += td->td_retval[0];
 		goto out;
 	}
 
 done:
 	sbunlock(&so->so_snd);
 out:
 	/*
 	 * If there was no error we have to clear td->td_retval[0]
 	 * because it may have been set by writev.
 	 */
 	if (error == 0) {
 		td->td_retval[0] = 0;
 	}
 	if (sent != NULL) {
 		(*sent) = sbytes;
 	}
 	if (obj != NULL)
 		vm_object_deallocate(obj);
 	if (so)
 		fdrop(sock_fp, td);
 	if (m)
 		m_freem(m);
 	if (mh)
 		m_freem(mh);
 
 	/*
 	 * SF_SYNC: wait until every mbuf tagged with EXT_FLAG_SYNC has been
 	 * freed, then tear the tracking structure down.
 	 */
 	if (sfs != NULL) {
 		mtx_lock(&sfs->mtx);
 		if (sfs->count != 0)
 			cv_wait(&sfs->cv, &sfs->mtx);
 		KASSERT(sfs->count == 0, ("sendfile sync still busy"));
 		cv_destroy(&sfs->cv);
 		mtx_destroy(&sfs->mtx);
 		free(sfs, M_TEMP);
 	}
 
 	if (error == ERESTART)
 		error = EINTR;
 
 	return (error);
 }
 
 static int
 sendfile(struct thread *td, struct sendfile_args *uap, int compat)
 {
 	struct sf_hdtr hdtr;
 	struct uio *hdr_uio, *trl_uio;
 	struct file *fp;
 	off_t sbytes;
 	int error;
 
 	/*
 	 * File offset must be positive.  If it goes beyond EOF
 	 * we send only the header/trailer and no payload data.
 	 */
 	if (uap->offset < 0)
 		return (EINVAL);
 
 	sbytes = 0;
 	hdr_uio = trl_uio = NULL;
 
 	if (uap->hdtr != NULL) {
 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
 		if (error != 0)
 			goto out;
 		if (hdtr.headers != NULL) {
 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt,
 			    &hdr_uio);
 			if (error != 0)
 				goto out;
 #ifdef COMPAT_FREEBSD4
 			/*
 			 * In FreeBSD < 5.0 the nbytes to send also included
 			 * the header.  If compat is specified subtract the
 			 * header size from nbytes.
 			 */
 			if (compat) {
 				if (uap->nbytes > hdr_uio->uio_resid)
 					uap->nbytes -= hdr_uio->uio_resid;
 				else
 					uap->nbytes = 0;
 			}
 #endif
 		}
 		if (hdtr.trailers != NULL) {
 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
 			    &trl_uio);
 			if (error != 0)
 				goto out;
 		}
 	}
 
 	AUDIT_ARG_FD(uap->fd);
 
 	/*
 	 * sendfile(2) can start at any offset within a file so we require
 	 * CAP_READ+CAP_SEEK = CAP_PREAD.
 	 */
 	if ((error = fget_read(td, uap->fd, &cap_pread_rights, &fp)) != 0)
 		goto out;
 
 	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
 	    uap->nbytes, &sbytes, uap->flags, td);
 	fdrop(fp, td);
 
 	if (uap->sbytes != NULL)
 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
 
 out:
 	free(hdr_uio, M_IOV);
 	free(trl_uio, M_IOV);
 	return (error);
 }
 
 /*
  * sendfile(2)
  *
  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
  *       struct sf_hdtr *hdtr, off_t *sbytes, int flags)
  *
  * Send a file specified by 'fd' and starting at 'offset' to a socket
  * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
  * 0.  Optionally add a header and/or trailer to the socket output.  If
  * specified, write the total number of bytes sent into *sbytes.
  *
  * This is the native entry point; it calls sendfile() with compat == 0.
  */
 int
 sys_sendfile(struct thread *td, struct sendfile_args *uap)
 {
  
 	return (sendfile(td, uap, 0));
 }
 
 #ifdef COMPAT_FREEBSD4
 /*
  * FreeBSD 4 compatible sendfile(2) entry point.  Repackages the old
  * argument structure as a struct sendfile_args and calls sendfile()
  * with compat == 1, so that the header length is taken out of nbytes.
  */
 int
 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
 {
 	struct sendfile_args args = {
 		.fd = uap->fd,
 		.s = uap->s,
 		.offset = uap->offset,
 		.nbytes = uap->nbytes,
 		.hdtr = uap->hdtr,
 		.sbytes = uap->sbytes,
 		.flags = uap->flags,
 	};
 
 	return (sendfile(td, &args, 1));
 }
 #endif /* COMPAT_FREEBSD4 */
Index: projects/fuse2/sys/kern/kern_sig.c
===================================================================
--- projects/fuse2/sys/kern/kern_sig.c	(revision 350434)
+++ projects/fuse2/sys/kern/kern_sig.c	(revision 350435)
@@ -1,3858 +1,3859 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)kern_sig.c	8.7 (Berkeley) 4/18/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/ctype.h>
 #include <sys/systm.h>
 #include <sys/signalvar.h>
 #include <sys/vnode.h>
 #include <sys/acct.h>
 #include <sys/bus.h>
 #include <sys/capsicum.h>
 #include <sys/compressor.h>
 #include <sys/condvar.h>
 #include <sys/event.h>
 #include <sys/fcntl.h>
 #include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/ktrace.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/refcount.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/procdesc.h>
 #include <sys/posix4.h>
 #include <sys/pioctl.h>
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sdt.h>
 #include <sys/sbuf.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 #include <sys/timers.h>
 #include <sys/unistd.h>
 #include <sys/wait.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <sys/jail.h>
 
 #include <machine/cpu.h>
 
 #include <security/audit/audit.h>
 
 #define	ONSIG	32		/* NSIG for osig* syscalls.  XXX. */
 
 /*
  * Statically defined tracing probes in the "proc" provider for signal
  * events (send, clear, discard).  The strings give the probe argument
  * types.
  */
 SDT_PROVIDER_DECLARE(proc);
 SDT_PROBE_DEFINE3(proc, , , signal__send,
     "struct thread *", "struct proc *", "int");
 SDT_PROBE_DEFINE2(proc, , , signal__clear,
     "int", "ksiginfo_t *");
 SDT_PROBE_DEFINE3(proc, , , signal__discard,
     "struct thread *", "struct proc *", "int");
 
 /* Forward declarations of file-local helpers. */
 static int	coredump(struct thread *);
 static int	killpg1(struct thread *td, int sig, int pgid, int all,
 		    ksiginfo_t *ksi);
 static int	issignal(struct thread *td);
 static int	sigprop(int sig);
 static void	tdsigwakeup(struct thread *, int, sig_t, int);
 static int	sig_suspend_threads(struct thread *, struct proc *, int);
 static int	filt_sigattach(struct knote *kn);
 static void	filt_sigdetach(struct knote *kn);
 static int	filt_signal(struct knote *kn, long hint);
 static struct thread *sigtd(struct proc *p, int sig, int prop);
 static void	sigqueue_start(void);
 
 /*
  * UMA zone backing ksiginfo_t allocations.  It stays NULL until
  * sigqueue_start() creates it.
  */
 static uma_zone_t	ksiginfo_zone = NULL;
 /*
  * Filter operations for signal knotes: attach, detach and event callbacks.
  * f_isfd is 0.
  */
 struct filterops sig_filtops = {
 	.f_isfd = 0,
 	.f_attach = filt_sigattach,
 	.f_detach = filt_sigdetach,
 	.f_event = filt_signal,
 };
 
 /* kern.logsigexit: enabled (1) by default. */
 static int	kern_logsigexit = 1;
 SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW,
     &kern_logsigexit, 0,
     "Log processes quitting on abnormal signals to syslog(3)");
 
 /* kern.forcesigexit: enabled (1) by default. */
 static int	kern_forcesigexit = 1;
 SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
     &kern_forcesigexit, 0, "Force trap signal to be handled");
 
 /* Parent node for the kern.sigqueue.* knobs and counters below. */
 static SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0,
     "POSIX real time signal");
 
 /* Limit checked against p_pendingcnt in sigqueue_add(). */
 static int	max_pending_per_proc = 128;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW,
     &max_pending_per_proc, 0, "Max pending signals per proc");
 
 /* Count passed to uma_prealloc() for the ksiginfo zone. */
 static int	preallocate_siginfo = 1024;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RDTUN,
     &preallocate_siginfo, 0, "Preallocated signal memory size");
 
 /* Incremented in sigqueue_add() when the per-process limit is reached. */
 static int	signal_overflow = 0;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD,
     &signal_overflow, 0, "Number of signals overflew");
 
 /* Incremented in sigqueue_add() when ksiginfo_alloc() returns NULL. */
 static int	signal_alloc_fail = 0;
 SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD,
     &signal_alloc_fail, 0, "signals failed to be allocated");
 
 /* kern.lognosys: disabled (0) by default. */
 static int	kern_lognosys = 0;
 SYSCTL_INT(_kern, OID_AUTO, lognosys, CTLFLAG_RWTUN, &kern_lognosys, 0,
     "Log invalid syscalls");
 
 /* Run sigqueue_start() during system initialization. */
 SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
 
 /*
  * Policy -- Can ucred cr1 send SIGIO to process cr2?
  * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
  * in the right situations.
  *
  * Evaluates true when cr1->cr_uid is 0, or when either of cr1's
  * cr_uid/cr_ruid equals either of cr2's cr_uid/cr_ruid.
  */
 #define CANSIGIO(cr1, cr2) \
 	((cr1)->cr_uid == 0 || \
 	    (cr1)->cr_ruid == (cr2)->cr_ruid || \
 	    (cr1)->cr_uid == (cr2)->cr_ruid || \
 	    (cr1)->cr_ruid == (cr2)->cr_uid || \
 	    (cr1)->cr_uid == (cr2)->cr_uid)
 
 /* kern.sugid_coredump: off by default (zero-initialized). */
 static int	sugid_coredump;
 SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RWTUN,
     &sugid_coredump, 0, "Allow setuid and setgid processes to dump core");
 
 /* kern.capmode_coredump: off by default (zero-initialized). */
 static int	capmode_coredump;
 SYSCTL_INT(_kern, OID_AUTO, capmode_coredump, CTLFLAG_RWTUN,
     &capmode_coredump, 0, "Allow processes in capability mode to dump core");
 
 /* kern.coredump: global coredump switch, on by default. */
 static int	do_coredump = 1;
 SYSCTL_INT(_kern, OID_AUTO, coredump, CTLFLAG_RW,
 	&do_coredump, 0, "Enable/Disable coredumps");
 
 /* kern.nodump_coredump: off by default. */
 static int	set_core_nodump_flag = 0;
 SYSCTL_INT(_kern, OID_AUTO, nodump_coredump, CTLFLAG_RW, &set_core_nodump_flag,
 	0, "Enable setting the NODUMP flag on coredump files");
 
 /* kern.coredump_devctl: off by default. */
 static int	coredump_devctl = 0;
 SYSCTL_INT(_kern, OID_AUTO, coredump_devctl, CTLFLAG_RW, &coredump_devctl,
 	0, "Generate a devctl notification when processes coredump");
 
 /*
  * Signal properties and actions.
  * The array below categorizes the signals and their default actions
  * according to the following properties:
  */
 #define	SIGPROP_KILL		0x01	/* terminates process by default */
 #define	SIGPROP_CORE		0x02	/* ditto and coredumps */
 #define	SIGPROP_STOP		0x04	/* suspend process */
 #define	SIGPROP_TTYSTOP		0x08	/* ditto, from tty */
 #define	SIGPROP_IGNORE		0x10	/* ignore by default */
 #define	SIGPROP_CONT		0x20	/* continue if suspended */
 #define	SIGPROP_CANTMASK	0x40	/* non-maskable, catchable */
 
 /*
  * Property bits indexed by signal number.  Slots without an explicit
  * initializer (including index 0) are zero, i.e. carry no properties.
  * Read through sigprop(), which bounds-checks the index.
  */
 static int sigproptbl[NSIG] = {
 	[SIGHUP] =	SIGPROP_KILL,
 	[SIGINT] =	SIGPROP_KILL,
 	[SIGQUIT] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGILL] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGTRAP] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGABRT] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGEMT] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGFPE] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGKILL] =	SIGPROP_KILL,
 	[SIGBUS] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGSEGV] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGSYS] =	SIGPROP_KILL | SIGPROP_CORE,
 	[SIGPIPE] =	SIGPROP_KILL,
 	[SIGALRM] =	SIGPROP_KILL,
 	[SIGTERM] =	SIGPROP_KILL,
 	[SIGURG] =	SIGPROP_IGNORE,
 	[SIGSTOP] =	SIGPROP_STOP,
 	[SIGTSTP] =	SIGPROP_STOP | SIGPROP_TTYSTOP,
 	[SIGCONT] =	SIGPROP_IGNORE | SIGPROP_CONT,
 	[SIGCHLD] =	SIGPROP_IGNORE,
 	[SIGTTIN] =	SIGPROP_STOP | SIGPROP_TTYSTOP,
 	[SIGTTOU] =	SIGPROP_STOP | SIGPROP_TTYSTOP,
 	[SIGIO] =	SIGPROP_IGNORE,
 	[SIGXCPU] =	SIGPROP_KILL,
 	[SIGXFSZ] =	SIGPROP_KILL,
 	[SIGVTALRM] =	SIGPROP_KILL,
 	[SIGPROF] =	SIGPROP_KILL,
 	[SIGWINCH] =	SIGPROP_IGNORE,
 	[SIGINFO] =	SIGPROP_IGNORE,
 	[SIGUSR1] =	SIGPROP_KILL,
 	[SIGUSR2] =	SIGPROP_KILL,
 };
 
 static void reschedule_signals(struct proc *p, sigset_t block, int flags);
 
 /*
  * SYSINIT hook: create and pre-populate the ksiginfo UMA zone, then
  * publish the POSIX realtime-signal configuration values.
  */
 static void
 sigqueue_start(void)
 {
 	const int rtsig_count = SIGRTMAX - SIGRTMIN + 1;
 
 	ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_prealloc(ksiginfo_zone, preallocate_siginfo);
 
 	p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS);
 	p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, rtsig_count);
 	p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc);
 }
 
 /*
  * Allocate a zeroed ksiginfo from the ksiginfo zone.
  *
  * wait: when zero, the allocation is made with M_NOWAIT.
  *
  * Returns the new ksiginfo, or NULL if the zone has not been created yet
  * or the allocator returned NULL.
  */
 ksiginfo_t *
 ksiginfo_alloc(int wait)
 {
 	int mflags;
 
 	if (ksiginfo_zone == NULL)
 		return (NULL);
 
 	mflags = wait ? M_ZERO : (M_ZERO | M_NOWAIT);
 	return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, mflags));
 }
 
 /*
  * Return a ksiginfo to the ksiginfo zone unconditionally (no KSI_EXT
  * check; compare ksiginfo_tryfree()).
  */
 void
 ksiginfo_free(ksiginfo_t *ksi)
 {
 	uma_zfree(ksiginfo_zone, ksi);
 }
 
 /*
  * Free a ksiginfo unless it is flagged KSI_EXT.
  *
  * Returns 1 if the ksiginfo was returned to the zone, 0 if it was left
  * alone because KSI_EXT is set.
  */
 static __inline int
 ksiginfo_tryfree(ksiginfo_t *ksi)
 {
 	if ((ksi->ksi_flags & KSI_EXT) != 0)
 		return (0);
 
 	uma_zfree(ksiginfo_zone, ksi);
 	return (1);
 }
 
 /*
  * Initialize a signal queue: empty list, empty pending/kill/ptrace sets,
  * owned by process 'p' (which may be NULL), and marked SQ_INIT.
  */
 void
 sigqueue_init(sigqueue_t *list, struct proc *p)
 {
 	list->sq_proc = p;
 	TAILQ_INIT(&list->sq_list);
 
 	SIGEMPTYSET(list->sq_ptrace);
 	SIGEMPTYSET(list->sq_kill);
 	SIGEMPTYSET(list->sq_signals);
 
 	list->sq_flags = SQ_INIT;
 }
 
 /*
  * Get a signal's ksiginfo.
  *
  * Consumes one pending instance of 'signo' from 'sq' and describes it in
  * 'si'.  The sq_signals bit is cleared only when no further instance
  * (ptrace bit, kill bit, or queued ksiginfo) remains.
  *
  * Return:
  *	0	-	signal not found
  *	others	-	signal number
  */
 static int
 sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si)
 {
 	struct proc *p = sq->sq_proc;
 	struct ksiginfo *ksi, *next;
 	/* Number of pending instances of signo seen so far (capped at 2). */
 	int count = 0;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	if (!SIGISMEMBER(sq->sq_signals, signo))
 		return (0);
 
 	/* A ptrace-recorded instance is consumed first. */
 	if (SIGISMEMBER(sq->sq_ptrace, signo)) {
 		count++;
 		SIGDELSET(sq->sq_ptrace, signo);
 		si->ksi_flags |= KSI_PTRACE;
 	}
 	/*
 	 * The kill bit is consumed only if nothing was consumed above;
 	 * otherwise it just counts as a remaining instance.
 	 */
 	if (SIGISMEMBER(sq->sq_kill, signo)) {
 		count++;
 		if (count == 1)
 			SIGDELSET(sq->sq_kill, signo);
 	}
 
 	TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
 		if (ksi->ksi_signo == signo) {
 			/* Dequeue this entry only if nothing was consumed yet. */
 			if (count == 0) {
 				TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
 				ksi->ksi_sigq = NULL;
 				ksiginfo_copy(ksi, si);
 				if (ksiginfo_tryfree(ksi) && p != NULL)
 					p->p_pendingcnt--;
 			}
 			/* A second instance exists; no need to look further. */
 			if (++count > 1)
 				break;
 		}
 	}
 
 	/* At most the consumed instance existed: signal is no longer pending. */
 	if (count <= 1)
 		SIGDELSET(sq->sq_signals, signo);
 	si->ksi_signo = signo;
 	return (signo);
 }
 
 /*
  * Unlink 'ksi' from the signal queue it is currently on, if any.  The
  * owning process' pending count is decremented unless the ksiginfo is
  * KSI_EXT.  If no other instance of the same signal remains (queued entry,
  * kill bit or ptrace bit), the signal is removed from sq_signals.
  */
 void
 sigqueue_take(ksiginfo_t *ksi)
 {
 	struct ksiginfo *other;
 	struct proc *owner;
 	sigqueue_t *queue;
 
 	if (ksi == NULL)
 		return;
 	queue = ksi->ksi_sigq;
 	if (queue == NULL)
 		return;
 
 	owner = queue->sq_proc;
 	TAILQ_REMOVE(&queue->sq_list, ksi, ksi_link);
 	ksi->ksi_sigq = NULL;
 	if ((ksi->ksi_flags & KSI_EXT) == 0 && owner != NULL)
 		owner->p_pendingcnt--;
 
 	/* Is another instance of this signal still queued? */
 	TAILQ_FOREACH(other, &queue->sq_list, ksi_link) {
 		if (other->ksi_signo == ksi->ksi_signo)
 			break;
 	}
 	if (other != NULL)
 		return;
 	if (SIGISMEMBER(queue->sq_kill, ksi->ksi_signo) ||
 	    SIGISMEMBER(queue->sq_ptrace, ksi->ksi_signo))
 		return;
 	SIGDELSET(queue->sq_signals, ksi->ksi_signo);
 }
 
 /*
  * Record signal 'signo' as pending on queue 'sq'.
  *
  * si: optional signal information.  When NULL, or for SIGKILL/SIGSTOP,
  *     only a bit in sq_kill is recorded and no ksiginfo is queued.
  *
  * Returns 0 when the signal was recorded (sq_signals bit set), or EAGAIN
  * when a ksiginfo could not be queued and no bitmap fallback applied; in
  * that case the signal is not marked pending.
  */
 static int
 sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si)
 {
 	struct proc *p = sq->sq_proc;
 	struct ksiginfo *ksi;
 	int ret = 0;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	/*
 	 * SIGKILL/SIGSTOP cannot be caught or masked, so take the fast path
 	 * for these signals.
 	 */
 	if (signo == SIGKILL || signo == SIGSTOP || si == NULL) {
 		SIGADDSET(sq->sq_kill, signo);
 		goto out_set_bit;
 	}
 
 	/* directly insert the ksi, don't copy it */
 	if (si->ksi_flags & KSI_INS) {
 		if (si->ksi_flags & KSI_HEAD)
 			TAILQ_INSERT_HEAD(&sq->sq_list, si, ksi_link);
 		else
 			TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link);
 		si->ksi_sigq = sq;
 		goto out_set_bit;
 	}
 
 	/* The zone does not exist yet: fall back to the sq_kill bitmap. */
 	if (__predict_false(ksiginfo_zone == NULL)) {
 		SIGADDSET(sq->sq_kill, signo);
 		goto out_set_bit;
 	}
 
 	if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) {
 		/* Per-process limit on queued ksiginfos reached. */
 		signal_overflow++;
 		ret = EAGAIN;
 	} else if ((ksi = ksiginfo_alloc(0)) == NULL) {
 		/* Non-sleeping allocation failed. */
 		signal_alloc_fail++;
 		ret = EAGAIN;
 	} else {
 		/* Queue a private copy of the caller's ksiginfo. */
 		if (p != NULL)
 			p->p_pendingcnt++;
 		ksiginfo_copy(si, ksi);
 		ksi->ksi_signo = signo;
 		if (si->ksi_flags & KSI_HEAD)
 			TAILQ_INSERT_HEAD(&sq->sq_list, ksi, ksi_link);
 		else
 			TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link);
 		ksi->ksi_sigq = sq;
 	}
 
 	/*
 	 * Queueing failed.  KSI_PTRACE signals, KSI_TRAP signals, and
 	 * signals without KSI_SIGQ are still recorded through the
 	 * sq_ptrace/sq_kill bitmaps and reported as success; anything else
 	 * returns the error without marking the signal pending.
 	 */
 	if (ret != 0) {
 		if ((si->ksi_flags & KSI_PTRACE) != 0) {
 			SIGADDSET(sq->sq_ptrace, signo);
 			ret = 0;
 			goto out_set_bit;
 		} else if ((si->ksi_flags & KSI_TRAP) != 0 ||
 		    (si->ksi_flags & KSI_SIGQ) == 0) {
 			SIGADDSET(sq->sq_kill, signo);
 			ret = 0;
 			goto out_set_bit;
 		}
 		return (ret);
 	}
 
 out_set_bit:
 	SIGADDSET(sq->sq_signals, signo);
 	return (ret);
 }
 
 /*
  * Discard everything pending on 'sq': unlink every queued ksiginfo
  * (freeing those that are not KSI_EXT and adjusting the owner's pending
  * count) and clear the pending, kill and ptrace sets.  If the queue has an
  * owning process, its lock must be held.
  */
 void
 sigqueue_flush(sigqueue_t *sq)
 {
 	struct proc *owner;
 	ksiginfo_t *entry, *next;
 
 	owner = sq->sq_proc;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
 
 	if (owner != NULL)
 		PROC_LOCK_ASSERT(owner, MA_OWNED);
 
 	TAILQ_FOREACH_SAFE(entry, &sq->sq_list, ksi_link, next) {
 		TAILQ_REMOVE(&sq->sq_list, entry, ksi_link);
 		entry->ksi_sigq = NULL;
 		if (!ksiginfo_tryfree(entry))
 			continue;
 		if (owner != NULL)
 			owner->p_pendingcnt--;
 	}
 
 	SIGEMPTYSET(sq->sq_ptrace);
 	SIGEMPTYSET(sq->sq_kill);
 	SIGEMPTYSET(sq->sq_signals);
 }
 
 /*
  * Move every pending instance of the signals in '*set' from queue 'src'
  * to queue 'dst': queued ksiginfos are relinked to the tail of dst, and
  * the matching bits of sq_kill, sq_ptrace and sq_signals are transferred.
  * The pending counts of the owning processes (when non-NULL) are adjusted
  * for each ksiginfo moved.
  */
 static void
 sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, const sigset_t *set)
 {
 	sigset_t tmp;
 	struct proc *p1, *p2;
 	ksiginfo_t *ksi, *next;
 
 	KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited"));
 	KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited"));
 	p1 = src->sq_proc;
 	p2 = dst->sq_proc;
 	/* Move siginfo to target list */
 	TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) {
 		if (SIGISMEMBER(*set, ksi->ksi_signo)) {
 			TAILQ_REMOVE(&src->sq_list, ksi, ksi_link);
 			if (p1 != NULL)
 				p1->p_pendingcnt--;
 			TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link);
 			ksi->ksi_sigq = dst;
 			if (p2 != NULL)
 				p2->p_pendingcnt++;
 		}
 	}
 
 	/*
 	 * For each bitmap: tmp = src bits restricted to *set; OR them into
 	 * dst, then clear them from src.
 	 */
 	/* Move pending bits to target list */
 	tmp = src->sq_kill;
 	SIGSETAND(tmp, *set);
 	SIGSETOR(dst->sq_kill, tmp);
 	SIGSETNAND(src->sq_kill, tmp);
 
 	tmp = src->sq_ptrace;
 	SIGSETAND(tmp, *set);
 	SIGSETOR(dst->sq_ptrace, tmp);
 	SIGSETNAND(src->sq_ptrace, tmp);
 
 	tmp = src->sq_signals;
 	SIGSETAND(tmp, *set);
 	SIGSETOR(dst->sq_signals, tmp);
 	SIGSETNAND(src->sq_signals, tmp);
 }
 
 #if 0
 /*
  * Move all pending instances of a single signal from 'src' to 'dst' by
  * building a one-element set and calling sigqueue_move_set().
  */
 static void
 sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo)
 {
 	sigset_t set;
 
 	SIGEMPTYSET(set);
 	SIGADDSET(set, signo);
 	sigqueue_move_set(src, dst, &set);
 }
 #endif
 
 /*
  * Drop every pending instance of the signals in '*set' from queue 'sq':
  * matching ksiginfos are unlinked (and freed unless KSI_EXT), and the
  * corresponding bits are cleared from the kill, ptrace and pending sets.
  */
 static void
 sigqueue_delete_set(sigqueue_t *sq, const sigset_t *set)
 {
 	struct proc *owner;
 	ksiginfo_t *entry, *following;
 
 	owner = sq->sq_proc;
 
 	KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited"));
 
 	/* Remove matching entries from the siginfo queue. */
 	TAILQ_FOREACH_SAFE(entry, &sq->sq_list, ksi_link, following) {
 		if (!SIGISMEMBER(*set, entry->ksi_signo))
 			continue;
 		TAILQ_REMOVE(&sq->sq_list, entry, ksi_link);
 		entry->ksi_sigq = NULL;
 		if (!ksiginfo_tryfree(entry))
 			continue;
 		if (owner != NULL)
 			owner->p_pendingcnt--;
 	}
 
 	/* Clear the bitmap state for the whole set. */
 	SIGSETNAND(sq->sq_kill, *set);
 	SIGSETNAND(sq->sq_ptrace, *set);
 	SIGSETNAND(sq->sq_signals, *set);
 }
 
 /*
  * Drop every pending instance of a single signal from queue 'sq'.
  */
 void
 sigqueue_delete(sigqueue_t *sq, int signo)
 {
 	sigset_t single;
 
 	SIGEMPTYSET(single);
 	SIGADDSET(single, signo);
 
 	sigqueue_delete_set(sq, &single);
 }
 
 /* Remove a set of signals for a process */
 static void
 sigqueue_delete_set_proc(struct proc *p, const sigset_t *set)
 {
 	sigqueue_t worklist;
 	struct thread *td0;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	sigqueue_init(&worklist, NULL);
 	sigqueue_move_set(&p->p_sigqueue, &worklist, set);
 
 	FOREACH_THREAD_IN_PROC(p, td0)
 		sigqueue_move_set(&td0->td_sigqueue, &worklist, set);
 
 	sigqueue_flush(&worklist);
 }
 
 /*
  * Remove every pending instance of a single signal from a process and
  * all of its threads.
  */
 void
 sigqueue_delete_proc(struct proc *p, int signo)
 {
 	sigset_t single;
 
 	SIGEMPTYSET(single);
 	SIGADDSET(single, signo);
 
 	sigqueue_delete_set_proc(p, &single);
 }
 
 /*
  * Remove all pending stop signals (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU)
  * from a process and all of its threads.
  */
 static void
 sigqueue_delete_stopmask_proc(struct proc *p)
 {
 	sigset_t stopsigs;
 
 	SIGEMPTYSET(stopsigs);
 	SIGADDSET(stopsigs, SIGSTOP);
 	SIGADDSET(stopsigs, SIGTSTP);
 	SIGADDSET(stopsigs, SIGTTIN);
 	SIGADDSET(stopsigs, SIGTTOU);
 
 	sigqueue_delete_set_proc(p, &stopsigs);
 }
 
 /*
  * Return the signal that should be delivered to the current thread td,
  * or 0 if there is none.  A pending stop signal with default action makes
  * the process stop inside issignal().
  *
  * The process lock and the sigacts mutex must be held; the thread lock
  * must not be.
  */
 int
 cursig(struct thread *td)
 {
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED);
 	THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
 
 	if (!SIGPENDING(td))
 		return (0);
 	return (issignal(td));
 }
 
 /*
  * Make ast() process unmasked pending signals when the thread returns to
  * user mode.  Callers must invoke this whenever a signal is added to
  * td_sigqueue or unmasked in td_sigmask.  The process lock must be held.
  */
 void
 signotify(struct thread *td)
 {
 
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 
 	if (!SIGPENDING(td))
 		return;
 
 	thread_lock(td);
 	td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
 	thread_unlock(td);
 }
 
 /*
  * Report whether the stack address 'sp' lies on the current thread's
  * alternate signal stack.  Returns 1 (true) only if an altstack is
  * configured and sp falls within [ss_sp, ss_sp + ss_size).  In the 43
  * compat case of a zero-sized alt stack, the SS_ONSTACK flag decides.
  */
 int
 sigonstack(size_t sp)
 {
 	struct thread *td;
 	size_t base, limit;
 
 	td = curthread;
 	if ((td->td_pflags & TDP_ALTSTACK) == 0)
 		return (0);
 #if defined(COMPAT_43)
 	if (td->td_sigstk.ss_size == 0)
 		return ((td->td_sigstk.ss_flags & SS_ONSTACK) != 0);
 #endif
 	base = (size_t)td->td_sigstk.ss_sp;
 	limit = td->td_sigstk.ss_size + base;
 	return (sp >= base && sp < limit);
 }
 
 /*
  * Look up the SIGPROP_* bits for a signal.  Signal numbers outside
  * 1 .. nitems(sigproptbl) - 1 have no properties and yield 0.
  */
 static __inline int
 sigprop(int sig)
 {
 
 	if (sig <= 0 || sig >= nitems(sigproptbl))
 		return (0);
 	return (sigproptbl[sig]);
 }
 
 /*
  * Find the first set bit in a signal set.  Returns the 1-based bit
  * position counted across the set's 32-bit words, or 0 if the set is
  * empty.
  */
 int
 sig_ffs(sigset_t *set)
 {
 	int word;
 
 	for (word = 0; word < _SIG_WORDS; word++) {
 		if (set->__bits[word] == 0)
 			continue;
 		return (ffs(set->__bits[word]) + (word * 32));
 	}
 	return (0);
 }
 
 static bool
 sigact_flag_test(const struct sigaction *act, int flag)
 {
 
 	/*
 	 * SA_SIGINFO is reset when signal disposition is set to
 	 * ignore or default.  Other flags are kept according to user
 	 * settings.
 	 */
 	return ((act->sa_flags & flag) != 0 && (flag != SA_SIGINFO ||
 	    ((__sighandler_t *)act->sa_sigaction != SIG_IGN &&
 	    (__sighandler_t *)act->sa_sigaction != SIG_DFL)));
 }
 
 /*
  * kern_sigaction
  * sigaction
  * freebsd4_sigaction
  * osigaction
  *
  * Examine and/or change the disposition of signal 'sig' for td's process.
  *
  * act:   new disposition to install, or NULL to leave it unchanged.
  * oact:  if non-NULL, filled with the disposition in effect before any
  *        change is applied.
  * flags: KSA_FREEBSD4 / KSA_OSIGSET tag the handler as installed through
  *        the corresponding compatibility interface.
  *
  * Returns EINVAL for an invalid signal number, for unsupported sa_flags
  * on a real handler, or for an attempt to give SIGKILL/SIGSTOP anything
  * but SIG_DFL; returns 0 otherwise.
  */
 int
 kern_sigaction(struct thread *td, int sig, const struct sigaction *act,
     struct sigaction *oact, int flags)
 {
 	struct sigacts *ps;
 	struct proc *p = td->td_proc;
 
 	if (!_SIG_VALID(sig))
 		return (EINVAL);
 	/* Unknown sa_flags bits are rejected only for real handlers. */
 	if (act != NULL && act->sa_handler != SIG_DFL &&
 	    act->sa_handler != SIG_IGN && (act->sa_flags & ~(SA_ONSTACK |
 	    SA_RESTART | SA_RESETHAND | SA_NOCLDSTOP | SA_NODEFER |
 	    SA_NOCLDWAIT | SA_SIGINFO)) != 0)
 		return (EINVAL);
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	/* Report the current disposition, reconstructed from the sigacts sets. */
 	if (oact) {
 		memset(oact, 0, sizeof(*oact));
 		oact->sa_mask = ps->ps_catchmask[_SIG_IDX(sig)];
 		if (SIGISMEMBER(ps->ps_sigonstack, sig))
 			oact->sa_flags |= SA_ONSTACK;
 		/* ps_sigintr holds the signals that do NOT restart syscalls. */
 		if (!SIGISMEMBER(ps->ps_sigintr, sig))
 			oact->sa_flags |= SA_RESTART;
 		if (SIGISMEMBER(ps->ps_sigreset, sig))
 			oact->sa_flags |= SA_RESETHAND;
 		if (SIGISMEMBER(ps->ps_signodefer, sig))
 			oact->sa_flags |= SA_NODEFER;
 		if (SIGISMEMBER(ps->ps_siginfo, sig)) {
 			oact->sa_flags |= SA_SIGINFO;
 			oact->sa_sigaction =
 			    (__siginfohandler_t *)ps->ps_sigact[_SIG_IDX(sig)];
 		} else
 			oact->sa_handler = ps->ps_sigact[_SIG_IDX(sig)];
 		if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDSTOP)
 			oact->sa_flags |= SA_NOCLDSTOP;
 		if (sig == SIGCHLD && ps->ps_flag & PS_NOCLDWAIT)
 			oact->sa_flags |= SA_NOCLDWAIT;
 	}
 	if (act) {
 		/* SIGKILL and SIGSTOP may only be (re)set to SIG_DFL. */
 		if ((sig == SIGKILL || sig == SIGSTOP) &&
 		    act->sa_handler != SIG_DFL) {
 			mtx_unlock(&ps->ps_mtx);
 			PROC_UNLOCK(p);
 			return (EINVAL);
 		}
 
 		/*
 		 * Change setting atomically.
 		 */
 
 		ps->ps_catchmask[_SIG_IDX(sig)] = act->sa_mask;
 		SIG_CANTMASK(ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (sigact_flag_test(act, SA_SIGINFO)) {
 			ps->ps_sigact[_SIG_IDX(sig)] =
 			    (__sighandler_t *)act->sa_sigaction;
 			SIGADDSET(ps->ps_siginfo, sig);
 		} else {
 			ps->ps_sigact[_SIG_IDX(sig)] = act->sa_handler;
 			SIGDELSET(ps->ps_siginfo, sig);
 		}
 		/* Mirror each sa_flags bit into its per-signal set. */
 		if (!sigact_flag_test(act, SA_RESTART))
 			SIGADDSET(ps->ps_sigintr, sig);
 		else
 			SIGDELSET(ps->ps_sigintr, sig);
 		if (sigact_flag_test(act, SA_ONSTACK))
 			SIGADDSET(ps->ps_sigonstack, sig);
 		else
 			SIGDELSET(ps->ps_sigonstack, sig);
 		if (sigact_flag_test(act, SA_RESETHAND))
 			SIGADDSET(ps->ps_sigreset, sig);
 		else
 			SIGDELSET(ps->ps_sigreset, sig);
 		if (sigact_flag_test(act, SA_NODEFER))
 			SIGADDSET(ps->ps_signodefer, sig);
 		else
 			SIGDELSET(ps->ps_signodefer, sig);
 		/* SIGCHLD-specific flags live in ps_flag rather than in a set. */
 		if (sig == SIGCHLD) {
 			if (act->sa_flags & SA_NOCLDSTOP)
 				ps->ps_flag |= PS_NOCLDSTOP;
 			else
 				ps->ps_flag &= ~PS_NOCLDSTOP;
 			if (act->sa_flags & SA_NOCLDWAIT) {
 				/*
 				 * Paranoia: since SA_NOCLDWAIT is implemented
 				 * by reparenting the dying child to PID 1 (and
 				 * trust it to reap the zombie), PID 1 itself
 				 * is forbidden to set SA_NOCLDWAIT.
 				 */
 				if (p->p_pid == 1)
 					ps->ps_flag &= ~PS_NOCLDWAIT;
 				else
 					ps->ps_flag |= PS_NOCLDWAIT;
 			} else
 				ps->ps_flag &= ~PS_NOCLDWAIT;
 			if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
 				ps->ps_flag |= PS_CLDSIGIGN;
 			else
 				ps->ps_flag &= ~PS_CLDSIGIGN;
 		}
 		/*
 		 * Set bit in ps_sigignore for signals that are set to SIG_IGN,
 		 * and for signals set to SIG_DFL where the default is to
 		 * ignore. However, don't put SIGCONT in ps_sigignore, as we
 		 * have to restart the process.
 		 */
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    (sigprop(sig) & SIGPROP_IGNORE &&
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
 			/* never to be seen again */
 			sigqueue_delete_proc(p, sig);
 			if (sig != SIGCONT)
 				/* easier in psignal */
 				SIGADDSET(ps->ps_sigignore, sig);
 			SIGDELSET(ps->ps_sigcatch, sig);
 		} else {
 			SIGDELSET(ps->ps_sigignore, sig);
 			if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)
 				SIGDELSET(ps->ps_sigcatch, sig);
 			else
 				SIGADDSET(ps->ps_sigcatch, sig);
 		}
 		/*
 		 * Remember which compatibility interface installed a real
 		 * handler; SIG_IGN/SIG_DFL never carry the compat marking.
 		 */
 #ifdef COMPAT_FREEBSD4
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
 		    (flags & KSA_FREEBSD4) == 0)
 			SIGDELSET(ps->ps_freebsd4, sig);
 		else
 			SIGADDSET(ps->ps_freebsd4, sig);
 #endif
 #ifdef COMPAT_43
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL ||
 		    (flags & KSA_OSIGSET) == 0)
 			SIGDELSET(ps->ps_osigset, sig);
 		else
 			SIGADDSET(ps->ps_osigset, sig);
 #endif
 	}
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigaction_args {
 	int	sig;
 	struct	sigaction *act;
 	struct	sigaction *oact;
 };
 #endif
 int
 sys_sigaction(struct thread *td, struct sigaction_args *uap)
 {
 	struct sigaction act, oact;
 	struct sigaction *actp, *oactp;
 	int error;
 
 	actp = (uap->act != NULL) ? &act : NULL;
 	oactp = (uap->oact != NULL) ? &oact : NULL;
 	if (actp) {
 		error = copyin(uap->act, actp, sizeof(act));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaction(td, uap->sig, actp, oactp, 0);
 	if (oactp && !error)
 		error = copyout(oactp, uap->oact, sizeof(oact));
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD4
 #ifndef _SYS_SYSPROTO_H_
 struct freebsd4_sigaction_args {
 	int	sig;
 	struct	sigaction *act;
 	struct	sigaction *oact;
 };
 #endif
 int
 freebsd4_sigaction(struct thread *td, struct freebsd4_sigaction_args *uap)
 {
 	struct sigaction act, oact;
 	struct sigaction *actp, *oactp;
 	int error;
 
 
 	actp = (uap->act != NULL) ? &act : NULL;
 	oactp = (uap->oact != NULL) ? &oact : NULL;
 	if (actp) {
 		error = copyin(uap->act, actp, sizeof(act));
 		if (error)
 			return (error);
 	}
 	error = kern_sigaction(td, uap->sig, actp, oactp, KSA_FREEBSD4);
 	if (oactp && !error)
 		error = copyout(oactp, uap->oact, sizeof(oact));
 	return (error);
 }
 #endif	/* COMPAT_FREEBSD4 */
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigaction_args {
 	int	signum;
 	struct	osigaction *nsa;
 	struct	osigaction *osa;
 };
 #endif
 int
 osigaction(struct thread *td, struct osigaction_args *uap)
 {
 	struct osigaction sa;
 	struct sigaction nsa, osa;
 	struct sigaction *nsap, *osap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 
 	nsap = (uap->nsa != NULL) ? &nsa : NULL;
 	osap = (uap->osa != NULL) ? &osa : NULL;
 
 	if (nsap) {
 		error = copyin(uap->nsa, &sa, sizeof(sa));
 		if (error)
 			return (error);
 		nsap->sa_handler = sa.sa_handler;
 		nsap->sa_flags = sa.sa_flags;
 		OSIG2SIG(sa.sa_mask, nsap->sa_mask);
 	}
 	error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
 	if (osap && !error) {
 		sa.sa_handler = osap->sa_handler;
 		sa.sa_flags = osap->sa_flags;
 		SIG2OSIG(osap->sa_mask, sa.sa_mask);
 		error = copyout(&sa, uap->osa, sizeof(sa));
 	}
 	return (error);
 }
 
 #if !defined(__i386__)
 /*
  * Avoid replicating the same stub everywhere
  *
  * Generic osigreturn for non-i386 platforms: the call is simply forwarded
  * to nosys().
  */
 int
 osigreturn(struct thread *td, struct osigreturn_args *uap)
 {
 
 	return (nosys(td, (struct nosys_args *)uap));
 }
 #endif
 #endif /* COMPAT_43 */
 
 /*
  * Initialize signal state for process 0: every signal whose default
  * action is to be ignored is added to ps_sigignore, with the exception of
  * SIGCONT.
  */
 void
 siginit(struct proc *p)
 {
 	struct sigacts *ps;
 	int signo;
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	for (signo = 1; signo <= NSIG; signo++) {
 		if (signo == SIGCONT)
 			continue;
 		if ((sigprop(signo) & SIGPROP_IGNORE) != 0)
 			SIGADDSET(ps->ps_sigignore, signo);
 	}
 	mtx_unlock(&ps->ps_mtx);
 	PROC_UNLOCK(p);
 }
 
 /*
  * Reset the specified signal to its default disposition: the handler
  * becomes SIG_DFL, the signal is no longer caught or SA_SIGINFO, and it is
  * marked ignored if its default action is to ignore (SIGCONT excepted).
  * The sigacts mutex must be held.
  */
 static void
 sigdflt(struct sigacts *ps, int sig)
 {
 
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 
 	ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 	SIGDELSET(ps->ps_siginfo, sig);
 	SIGDELSET(ps->ps_sigcatch, sig);
 	if (sig != SIGCONT && (sigprop(sig) & SIGPROP_IGNORE) != 0)
 		SIGADDSET(ps->ps_sigignore, sig);
 }
 
 /*
  * Reset signals for an exec of the specified process.
  *
  * Caught signals revert to their default disposition, the alternate
  * signal stack of the current thread is disabled, and the SIGCHLD
  * no-zombie flags are cleared.  The process lock must be held and p must
  * be the current thread's process.
  */
 void
 execsigs(struct proc *p)
 {
 	sigset_t osigignore;
 	struct sigacts *ps;
 	int sig;
 	struct thread *td;
 
 	/*
 	 * Reset caught signals.  Held signals remain held
 	 * through td_sigmask (unless they were caught,
 	 * and are now ignored by default).
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	/* sigdflt() removes sig from ps_sigcatch, so this loop terminates. */
 	while (SIGNOTEMPTY(ps->ps_sigcatch)) {
 		sig = sig_ffs(&ps->ps_sigcatch);
 		sigdflt(ps, sig);
 		/* Now ignored by default: discard any pending instances. */
 		if ((sigprop(sig) & SIGPROP_IGNORE) != 0)
 			sigqueue_delete_proc(p, sig);
 	}
 
 	/*
 	 * As CloudABI processes cannot modify signal handlers, fully
 	 * reset all signals to their default behavior. Do ignore
 	 * SIGPIPE, as it would otherwise be impossible to recover from
 	 * writes to broken pipes and sockets.
 	 */
 	if (SV_PROC_ABI(p) == SV_ABI_CLOUDABI) {
 		/* Iterate over a snapshot of the ignore set. */
 		osigignore = ps->ps_sigignore;
 		while (SIGNOTEMPTY(osigignore)) {
 			sig = sig_ffs(&osigignore);
 			SIGDELSET(osigignore, sig);
 			if (sig != SIGPIPE)
 				sigdflt(ps, sig);
 		}
 		SIGADDSET(ps->ps_sigignore, SIGPIPE);
 	}
 
 	/*
 	 * Reset stack state to the user stack.
 	 * Clear set of signals caught on the signal stack.
 	 */
 	td = curthread;
 	MPASS(td->td_proc == p);
 	td->td_sigstk.ss_flags = SS_DISABLE;
 	td->td_sigstk.ss_size = 0;
 	td->td_sigstk.ss_sp = 0;
 	td->td_pflags &= ~TDP_ALTSTACK;
 	/*
 	 * Reset no zombies if child dies flag as Solaris does.
 	 */
 	ps->ps_flag &= ~(PS_NOCLDWAIT | PS_CLDSIGIGN);
 	if (ps->ps_sigact[_SIG_IDX(SIGCHLD)] == SIG_IGN)
 		ps->ps_sigact[_SIG_IDX(SIGCHLD)] = SIG_DFL;
 	mtx_unlock(&ps->ps_mtx);
 }
 
 /*
  * kern_sigprocmask()
  *
  *	Manipulate signal mask.
  *
  * how:   SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK; only examined when set is
  *        non-NULL.
  * set:   signals to block/unblock/install, or NULL to leave the mask
  *        unchanged.  For SIG_BLOCK and SIG_SETMASK the caller's set is
  *        modified in place by SIG_CANTMASK().
  * oset:  if non-NULL, receives the mask in effect before the change.
  * flags: SIGPROCMASK_PROC_LOCKED - the caller already holds the process
  *            lock; it is neither acquired nor released here.
  *        SIGPROCMASK_PS_LOCKED   - the caller holds the sigacts mutex.
  *        SIGPROCMASK_OLD         - for SIG_SETMASK, update the mask via
  *            SIGSETLO() instead of replacing it entirely.
  *
  * Returns 0, or EINVAL when 'how' is not one of the three operations.
  */
 int
 kern_sigprocmask(struct thread *td, int how, sigset_t *set, sigset_t *oset,
     int flags)
 {
 	sigset_t new_block, oset1;
 	struct proc *p;
 	int error;
 
 	p = td->td_proc;
 	if ((flags & SIGPROCMASK_PROC_LOCKED) != 0)
 		PROC_LOCK_ASSERT(p, MA_OWNED);
 	else
 		PROC_LOCK(p);
 	mtx_assert(&p->p_sigacts->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0
 	    ? MA_OWNED : MA_NOTOWNED);
 	if (oset != NULL)
 		*oset = td->td_sigmask;
 
 	error = 0;
 	if (set != NULL) {
 		switch (how) {
 		case SIG_BLOCK:
 			SIG_CANTMASK(*set);
 			oset1 = td->td_sigmask;
 			SIGSETOR(td->td_sigmask, *set);
 			/* new_block = signals blocked now but not before. */
 			new_block = td->td_sigmask;
 			SIGSETNAND(new_block, oset1);
 			break;
 		case SIG_UNBLOCK:
 			SIGSETNAND(td->td_sigmask, *set);
 			/* Newly unmasked signals may now be deliverable. */
 			signotify(td);
 			goto out;
 		case SIG_SETMASK:
 			SIG_CANTMASK(*set);
 			oset1 = td->td_sigmask;
 			if (flags & SIGPROCMASK_OLD)
 				SIGSETLO(td->td_sigmask, *set);
 			else
 				td->td_sigmask = *set;
 			new_block = td->td_sigmask;
 			SIGSETNAND(new_block, oset1);
 			signotify(td);
 			break;
 		default:
 			error = EINVAL;
 			goto out;
 		}
 
 		/*
 		 * The new_block set contains signals that were not previously
 		 * blocked, but are blocked now.
 		 *
 		 * In case we block any signal that was not previously blocked
 		 * for td, and process has the signal pending, try to schedule
 		 * signal delivery to some thread that does not block the
 		 * signal, possibly waking it up.
 		 */
 		if (p->p_numthreads != 1)
 			reschedule_signals(p, new_block, flags);
 	}
 
 out:
 	/* Drop the process lock only if it was taken here. */
 	if (!(flags & SIGPROCMASK_PROC_LOCKED))
 		PROC_UNLOCK(p);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigprocmask_args {
 	int	how;
 	const sigset_t *set;
 	sigset_t *oset;
 };
 #endif
 int
 sys_sigprocmask(struct thread *td, struct sigprocmask_args *uap)
 {
 	sigset_t set, oset;
 	sigset_t *setp, *osetp;
 	int error;
 
 	setp = (uap->set != NULL) ? &set : NULL;
 	osetp = (uap->oset != NULL) ? &oset : NULL;
 	if (setp) {
 		error = copyin(uap->set, setp, sizeof(set));
 		if (error)
 			return (error);
 	}
 	error = kern_sigprocmask(td, uap->how, setp, osetp, 0);
 	if (osetp && !error) {
 		error = copyout(osetp, uap->oset, sizeof(oset));
 	}
 	return (error);
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigprocmask_args {
 	int	how;
 	osigset_t mask;
 };
 #endif
 int
 osigprocmask(struct thread *td, struct osigprocmask_args *uap)
 {
 	sigset_t set, oset;
 	int error;
 
 	OSIG2SIG(uap->mask, set);
 	error = kern_sigprocmask(td, uap->how, &set, &oset, 1);
 	SIG2OSIG(oset, td->td_retval[0]);
 	return (error);
 }
 #endif /* COMPAT_43 */
 
 int
 sys_sigwait(struct thread *td, struct sigwait_args *uap)
 {
 	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error) {
 		td->td_retval[0] = error;
 		return (0);
 	}
 
 	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error) {
 		if (error == EINTR && td->td_proc->p_osrel < P_OSREL_SIGWAIT)
 			error = ERESTART;
 		if (error == ERESTART)
 			return (error);
 		td->td_retval[0] = error;
 		return (0);
 	}
 
 	error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo));
 	td->td_retval[0] = error;
 	return (0);
 }
 
 int
 sys_sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
 {
 	struct timespec ts;
 	struct timespec *timeout;
 	sigset_t set;
 	ksiginfo_t ksi;
 	int error;
 
 	if (uap->timeout) {
 		error = copyin(uap->timeout, &ts, sizeof(ts));
 		if (error)
 			return (error);
 
 		timeout = &ts;
 	} else
 		timeout = NULL;
 
 	error = copyin(uap->set, &set, sizeof(set));
 	if (error)
 		return (error);
 
 	error = kern_sigtimedwait(td, set, &ksi, timeout);
 	if (error)
 		return (error);
 
 	if (uap->info)
 		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
 
 	if (error == 0)
 		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
 /*
  * sigwaitinfo(2) system call: same as sys_sigtimedwait() but always
  * waits without a timeout.  On success the signal number is placed in
  * td_retval[0] and, if uap->info is set, the siginfo is copied out.
  */
 int
 sys_sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
 {
 	sigset_t waitset;
 	ksiginfo_t info;
 	int rv;
 
 	rv = copyin(uap->set, &waitset, sizeof(waitset));
 	if (rv == 0)
 		rv = kern_sigtimedwait(td, waitset, &info, NULL);
 	if (rv == 0 && uap->info != NULL)
 		rv = copyout(&info.ksi_info, uap->info, sizeof(siginfo_t));
 	if (rv == 0)
 		td->td_retval[0] = info.ksi_signo;
 	return (rv);
 }
 
 /*
  * Record *si in td's td_si and clear the td_si signal number of every
  * other thread in td's process.
  */
 static void
 proc_td_siginfo_capture(struct thread *td, siginfo_t *si)
 {
 	struct thread *iter;
 	struct proc *p;
 
 	p = td->td_proc;
 	FOREACH_THREAD_IN_PROC(p, iter) {
 		if (iter != td) {
 			iter->td_si.si_signo = 0;
 			continue;
 		}
 		iter->td_si = *si;
 	}
 }
 
 /*
  * Wait for one of the signals in waitset to become available to the
  * thread, optionally bounded by a relative timeout.
  *
  * While waiting, td_sigmask is modified with SIGSETNAND(td_sigmask,
  * waitset) (presumably unblocking the waited-for signals); the saved
  * mask is put back before returning.
  *
  * Returns 0 when a signal from waitset was dequeued into *ksi.
  * Otherwise returns an errno value:
  *   EINVAL  - a timeout was given whose tv_nsec is out of range (only
  *             reported after the first check for a pending signal);
  *   EAGAIN  - the timeout expired;
  *   other   - the error from msleep(); when a timeout was supplied,
  *             ERESTART is converted to EINTR.
  */
 int
 kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
 	struct timespec *timeout)
 {
 	struct sigacts *ps;
 	sigset_t saved_mask, new_block;
 	struct proc *p;
 	int error, sig, timo, timevalid = 0;
 	struct timespec rts, ets, ts;
 	struct timeval tv;
 
 	p = td->td_proc;
 	error = 0;
 	ets.tv_sec = 0;
 	ets.tv_nsec = 0;
 
 	/*
 	 * Turn the relative timeout into an absolute uptime deadline (ets).
 	 * An out-of-range tv_nsec leaves timevalid at 0; the error is
 	 * raised inside the loop below.
 	 */
 	if (timeout != NULL) {
 		if (timeout->tv_nsec >= 0 && timeout->tv_nsec < 1000000000) {
 			timevalid = 1;
 			getnanouptime(&rts);
 			timespecadd(&rts, timeout, &ets);
 		}
 	}
 	ksiginfo_init(ksi);
 	/* Some signals can not be waited for. */
 	SIG_CANTMASK(waitset);
 	ps = p->p_sigacts;
 	PROC_LOCK(p);
 	/* Remember the caller's mask so it can be restored after the wait. */
 	saved_mask = td->td_sigmask;
 	SIGSETNAND(td->td_sigmask, waitset);
 	for (;;) {
 		mtx_lock(&ps->ps_mtx);
 		sig = cursig(td);
 		mtx_unlock(&ps->ps_mtx);
 		KASSERT(sig >= 0, ("sig %d", sig));
 		/*
 		 * A waited-for signal is available: take it from the thread
 		 * queue first, then from the process queue.
 		 */
 		if (sig != 0 && SIGISMEMBER(waitset, sig)) {
 			if (sigqueue_get(&td->td_sigqueue, sig, ksi) != 0 ||
 			    sigqueue_get(&p->p_sigqueue, sig, ksi) != 0) {
 				error = 0;
 				break;
 			}
 		}
 
 		/* An error left over from the previous msleep() ends the wait. */
 		if (error != 0)
 			break;
 
 		/*
 		 * POSIX says this must be checked after looking for pending
 		 * signals.
 		 */
 		if (timeout != NULL) {
 			if (!timevalid) {
 				error = EINVAL;
 				break;
 			}
 			getnanouptime(&rts);
 			if (timespeccmp(&rts, &ets, >=)) {
 				error = EAGAIN;
 				break;
 			}
 			/* Sleep only for the time remaining until the deadline. */
 			timespecsub(&ets, &rts, &ts);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts);
 			timo = tvtohz(&tv);
 		} else {
 			timo = 0;
 		}
 
 		error = msleep(ps, &p->p_mtx, PPAUSE|PCATCH, "sigwait", timo);
 
 		if (timeout != NULL) {
 			if (error == ERESTART) {
 				/* Timeout can not be restarted. */
 				error = EINTR;
 			} else if (error == EAGAIN) {
 				/* We will calculate timeout by ourself. */
 				error = 0;
 			}
 		}
 	}
 
 	/*
 	 * Compute new_block from the saved and temporary masks, then
 	 * restore the caller's original mask.
 	 */
 	new_block = saved_mask;
 	SIGSETNAND(new_block, td->td_sigmask);
 	td->td_sigmask = saved_mask;
 	/*
 	 * Fewer signals can be delivered to us, reschedule signal
 	 * notification.
 	 */
 	if (p->p_numthreads != 1)
 		reschedule_signals(p, new_block, 0);
 
 	if (error == 0) {
 		SDT_PROBE2(proc, , , signal__clear, sig, ksi);
 
 		if (ksi->ksi_code == SI_TIMER)
 			itimer_accept(p, ksi->ksi_timerid, ksi);
 
 #ifdef KTRACE
 		if (KTRPOINT(td, KTR_PSIG)) {
 			sig_t action;
 
 			mtx_lock(&ps->ps_mtx);
 			action = ps->ps_sigact[_SIG_IDX(sig)];
 			mtx_unlock(&ps->ps_mtx);
 			ktrpsig(sig, action, &td->td_sigmask, ksi->ksi_code);
 		}
 #endif
 		/*
 		 * A dequeued SIGKILL is acted on here: record its siginfo
 		 * and call sigexit().
 		 */
 		if (sig == SIGKILL) {
 			proc_td_siginfo_capture(td, &ksi->ksi_info);
 			sigexit(td, sig);
 		}
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigpending_args {
 	sigset_t	*set;
 };
 #endif
 int
 sys_sigpending(struct thread *td, struct sigpending_args *uap)
 {
 	struct proc *p = td->td_proc;
 	sigset_t pending;
 
 	PROC_LOCK(p);
 	pending = p->p_sigqueue.sq_signals;
 	SIGSETOR(pending, td->td_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
 	return (copyout(&pending, uap->set, sizeof(sigset_t)));
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 #ifndef _SYS_SYSPROTO_H_
 struct osigpending_args {
 	int	dummy;
 };
 #endif
 /*
  * Old-style sigpending: the combined process and thread signal sets are
  * converted with SIG2OSIG() and returned in td_retval[0] instead of
  * being copied out.
  */
 int
 osigpending(struct thread *td, struct osigpending_args *uap)
 {
 	struct proc *p;
 	sigset_t pendset;
 
 	p = td->td_proc;
 
 	PROC_LOCK(p);
 	pendset = p->p_sigqueue.sq_signals;
 	SIGSETOR(pendset, td->td_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
 
 	SIG2OSIG(pendset, td->td_retval[0]);
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43)
 /*
  * Generalized interface signal handler, 4.3-compatible.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct osigvec_args {
 	int	signum;
 	struct	sigvec *nsv;
 	struct	sigvec *osv;
 };
 #endif
 /* ARGSUSED */
 int
 osigvec(struct thread *td, struct osigvec_args *uap)
 {
 	struct sigvec vec;
 	struct sigaction nsa, osa;
 	struct sigaction *nsap, *osap;
 	int error;
 
 	if (uap->signum <= 0 || uap->signum >= ONSIG)
 		return (EINVAL);
 	nsap = (uap->nsv != NULL) ? &nsa : NULL;
 	osap = (uap->osv != NULL) ? &osa : NULL;
 	if (nsap) {
 		error = copyin(uap->nsv, &vec, sizeof(vec));
 		if (error)
 			return (error);
 		nsap->sa_handler = vec.sv_handler;
 		OSIG2SIG(vec.sv_mask, nsap->sa_mask);
 		nsap->sa_flags = vec.sv_flags;
 		nsap->sa_flags ^= SA_RESTART;	/* opposite of SV_INTERRUPT */
 	}
 	error = kern_sigaction(td, uap->signum, nsap, osap, KSA_OSIGSET);
 	if (osap && !error) {
 		vec.sv_handler = osap->sa_handler;
 		SIG2OSIG(osap->sa_mask, vec.sv_mask);
 		vec.sv_flags = osap->sa_flags;
 		vec.sv_flags &= ~SA_NOCLDWAIT;
 		vec.sv_flags ^= SA_RESTART;
 		error = copyout(&vec, uap->osv, sizeof(vec));
 	}
 	return (error);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct osigblock_args {
 	int	mask;
 };
 #endif
 /*
  * Old sigblock(): add the signals in uap->mask to the thread's blocked
  * set (SIG_BLOCK) and return the previous mask in td_retval[0].  The
  * result of kern_sigprocmask() is not examined; this always returns 0.
  */
 int
 osigblock(struct thread *td, struct osigblock_args *uap)
 {
 	sigset_t addmask, prevmask;
 
 	OSIG2SIG(uap->mask, addmask);
 	(void)kern_sigprocmask(td, SIG_BLOCK, &addmask, &prevmask, 0);
 	SIG2OSIG(prevmask, td->td_retval[0]);
 
 	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct osigsetmask_args {
 	int	mask;
 };
 #endif
 /*
  * Old sigsetmask(): replace the thread's blocked set with uap->mask
  * (SIG_SETMASK) and return the previous mask in td_retval[0].  The
  * result of kern_sigprocmask() is not examined; this always returns 0.
  */
 int
 osigsetmask(struct thread *td, struct osigsetmask_args *uap)
 {
 	sigset_t newmask, prevmask;
 
 	OSIG2SIG(uap->mask, newmask);
 	(void)kern_sigprocmask(td, SIG_SETMASK, &newmask, &prevmask, 0);
 	SIG2OSIG(prevmask, td->td_retval[0]);
 
 	return (0);
 }
 #endif /* COMPAT_43 */
 
 /*
  * Suspend calling thread until signal, providing mask to be set in the
  * meantime.  The mask is copied in from user space and the wait itself
  * is performed by kern_sigsuspend().
  */
 #ifndef _SYS_SYSPROTO_H_
 struct sigsuspend_args {
 	const sigset_t *sigmask;
 };
 #endif
 /* ARGSUSED */
 int
 sys_sigsuspend(struct thread *td, struct sigsuspend_args *uap)
 {
 	sigset_t tmpmask;
 	int rv;
 
 	rv = copyin(uap->sigmask, &tmpmask, sizeof(tmpmask));
 	if (rv == 0)
 		rv = kern_sigsuspend(td, tmpmask);
 	return (rv);
 }
 
 /*
  * Install mask as the thread's signal mask and sleep until postsig()
  * returns non-zero for some signal.
  *
  * The previous mask is saved in td_oldsigmask and TDP_OLDMASK is set.
  * EINTR is recorded both through sv_set_syscall_retval() and in
  * td_errno (with TDP_NERRNO), and the function returns EJUSTRETURN.
  */
 int
 kern_sigsuspend(struct thread *td, sigset_t mask)
 {
 	struct proc *p = td->td_proc;
 	int has_sig, sig;
 
 	/*
 	 * When returning from sigsuspend, we want
 	 * the old mask to be restored after the
 	 * signal handler has finished.  Thus, we
 	 * save it here and mark the sigacts structure
 	 * to indicate this.
 	 */
 	PROC_LOCK(p);
 	kern_sigprocmask(td, SIG_SETMASK, &mask, &td->td_oldsigmask,
 	    SIGPROCMASK_PROC_LOCKED);
 	td->td_pflags |= TDP_OLDMASK;
 
 	/*
 	 * Process signals now. Otherwise, we can get spurious wakeup
 	 * due to signal entered process queue, but delivered to other
 	 * thread. But sigsuspend should return only on signal
 	 * delivery.
 	 */
 	(p->p_sysent->sv_set_syscall_retval)(td, EINTR);
 	for (has_sig = 0; !has_sig;) {
 		/* Keep sleeping for as long as msleep() returns 0. */
 		while (msleep(&p->p_sigacts, &p->p_mtx, PPAUSE|PCATCH, "pause",
 			0) == 0)
 			/* void */;
 		thread_suspend_check(0);
 		mtx_lock(&p->p_sigacts->ps_mtx);
 		/* Drain every signal cursig() reports; count what postsig() returns. */
 		while ((sig = cursig(td)) != 0) {
 			KASSERT(sig >= 0, ("sig %d", sig));
 			has_sig += postsig(sig);
 		}
 		mtx_unlock(&p->p_sigacts->ps_mtx);
 	}
 	PROC_UNLOCK(p);
 	td->td_errno = EINTR;
 	td->td_pflags |= TDP_NERRNO;
 	return (EJUSTRETURN);
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
 /*
  * Compatibility sigsuspend call for old binaries.  Note nonstandard calling
  * convention: libc stub passes mask, not pointer, to save a copyin.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct osigsuspend_args {
 	osigset_t mask;
 };
 #endif
 /* ARGSUSED */
 int
 osigsuspend(struct thread *td, struct osigsuspend_args *uap)
 {
 	sigset_t tmpmask;
 	int rv;
 
 	/* Widen the old-format mask, then share the common implementation. */
 	OSIG2SIG(uap->mask, tmpmask);
 	rv = kern_sigsuspend(td, tmpmask);
 	return (rv);
 }
 #endif /* COMPAT_43 */
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct osigstack_args {
 	struct	sigstack *nss;
 	struct	sigstack *oss;
 };
 #endif
 /*
  * Old sigstack(): optionally install a new signal stack pointer from
  * uap->nss and optionally report the previous one through uap->oss.
  */
 /* ARGSUSED */
 int
 osigstack(struct thread *td, struct osigstack_args *uap)
 {
 	struct sigstack newstk, oldstk;
 	int rv;
 
 	if (uap->nss != NULL) {
 		rv = copyin(uap->nss, &newstk, sizeof(newstk));
 		if (rv != 0)
 			return (rv);
 	}
 
 	/* Capture the current state before it is overwritten below. */
 	oldstk.ss_sp = td->td_sigstk.ss_sp;
 	oldstk.ss_onstack = sigonstack(cpu_getstack(td));
 
 	if (uap->nss != NULL) {
 		td->td_sigstk.ss_sp = newstk.ss_sp;
 		td->td_sigstk.ss_size = 0;
 		td->td_sigstk.ss_flags |= newstk.ss_onstack & SS_ONSTACK;
 		td->td_pflags |= TDP_ALTSTACK;
 	}
 
 	if (uap->oss == NULL)
 		return (0);
 	return (copyout(&oldstk, uap->oss, sizeof(oldstk)));
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigaltstack_args {
 	stack_t	*ss;
 	stack_t	*oss;
 };
 #endif
 /*
  * sigaltstack(2) system call: marshal the optional new and old stack
  * descriptions between user space and kern_sigaltstack().
  */
 /* ARGSUSED */
 int
 sys_sigaltstack(struct thread *td, struct sigaltstack_args *uap)
 {
 	stack_t newstk, oldstk;
 	stack_t *newp, *oldp;
 	int rv;
 
 	newp = NULL;
 	if (uap->ss != NULL) {
 		rv = copyin(uap->ss, &newstk, sizeof(newstk));
 		if (rv != 0)
 			return (rv);
 		newp = &newstk;
 	}
 	oldp = (uap->oss != NULL) ? &oldstk : NULL;
 
 	rv = kern_sigaltstack(td, newp, oldp);
 	if (rv != 0 || uap->oss == NULL)
 		return (rv);
 
 	return (copyout(&oldstk, uap->oss, sizeof(stack_t)));
 }
 
 /*
  * Query and/or change the thread's alternate signal stack.
  *
  * If oss is not NULL it receives the current stack description, with
  * ss_flags set to SS_DISABLE when no alternate stack is enabled,
  * SS_ONSTACK when the thread is currently running on it, and 0
  * otherwise.  If ss is not NULL the new setting is validated and
  * applied.
  *
  * Returns 0 on success, EPERM if the thread is currently on the
  * alternate stack, EINVAL for flags other than SS_DISABLE, or ENOMEM
  * if the new stack is smaller than the ABI's sv_minsigstksz.
  */
 int
 kern_sigaltstack(struct thread *td, stack_t *ss, stack_t *oss)
 {
 	int on_altstack;
 
 	on_altstack = sigonstack(cpu_getstack(td));
 
 	if (oss != NULL) {
 		*oss = td->td_sigstk;
 		if ((td->td_pflags & TDP_ALTSTACK) == 0)
 			oss->ss_flags = SS_DISABLE;
 		else if (on_altstack)
 			oss->ss_flags = SS_ONSTACK;
 		else
 			oss->ss_flags = 0;
 	}
 
 	if (ss == NULL)
 		return (0);
 
 	if (on_altstack)
 		return (EPERM);
 	if ((ss->ss_flags & ~SS_DISABLE) != 0)
 		return (EINVAL);
 
 	if ((ss->ss_flags & SS_DISABLE) != 0) {
 		td->td_pflags &= ~TDP_ALTSTACK;
 		return (0);
 	}
 
 	if (ss->ss_size < td->td_proc->p_sysent->sv_minsigstksz)
 		return (ENOMEM);
 
 	td->td_sigstk = *ss;
 	td->td_pflags |= TDP_ALTSTACK;
 	return (0);
 }
 
 /*
  * Common code for kill process group/broadcast kill.
  * td is the calling thread.
  *
  * If 'all' is non-zero the signal is broadcast to every process in the
  * system except pid <= 1, P_SYSTEM processes, the caller's own process
  * and processes still in PRS_NEW.  Otherwise it is sent to the members
  * of process group pgid (0 means the caller's own group), skipping
  * pid <= 1, P_SYSTEM and PRS_NEW processes.
  *
  * sig == 0 performs only the permission checks.  Returns 0 if at least
  * one process passed p_cansignal(); otherwise the first p_cansignal()
  * error seen, or ESRCH if no candidate was found (or the process group
  * does not exist).
  */
 static int
 killpg1(struct thread *td, int sig, int pgid, int all, ksiginfo_t *ksi)
 {
 	struct proc *p;
 	struct pgrp *pgrp;
 	int err;
 	int ret;
 
 	ret = ESRCH;
 	if (all) {
 		/*
 		 * broadcast
 		 */
 		sx_slock(&allproc_lock);
 		FOREACH_PROC_IN_SYSTEM(p) {
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
 			    p == td->td_proc || p->p_state == PRS_NEW) {
 				continue;
 			}
 			PROC_LOCK(p);
 			err = p_cansignal(td, p, sig);
 			if (err == 0) {
 				if (sig)
 					pksignal(p, sig, ksi);
 				ret = err;
 			}
 			/* Keep the first permission error unless something succeeds. */
 			else if (ret == ESRCH)
 				ret = err;
 			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
 	} else {
 		sx_slock(&proctree_lock);
 		if (pgid == 0) {
 			/*
 			 * zero pgid means send to my process group.
 			 */
 			pgrp = td->td_proc->p_pgrp;
 			PGRP_LOCK(pgrp);
 		} else {
 			/*
 			 * NOTE(review): no PGRP_LOCK here although the group
 			 * is unlocked below; presumably pgfind() returns the
 			 * group locked -- confirm.
 			 */
 			pgrp = pgfind(pgid);
 			if (pgrp == NULL) {
 				sx_sunlock(&proctree_lock);
 				return (ESRCH);
 			}
 		}
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
 			    p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
 			err = p_cansignal(td, p, sig);
 			if (err == 0) {
 				if (sig)
 					pksignal(p, sig, ksi);
 				ret = err;
 			}
 			/* Keep the first permission error unless something succeeds. */
 			else if (ret == ESRCH)
 				ret = err;
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(pgrp);
 	}
 	return (ret);
 }
 
 #ifndef _SYS_SYSPROTO_H_
 struct kill_args {
 	int	pid;
 	int	signum;
 };
 #endif
 /*
  * kill(2) system call.
  *
  * pid > 0 targets a single process, pid == -1 broadcasts, pid == 0
  * targets the caller's process group and any other negative pid targets
  * process group -pid.  signum == 0 only performs the permission check.
  */
 /* ARGSUSED */
 int
 sys_kill(struct thread *td, struct kill_args *uap)
 {
 	ksiginfo_t info;
 	struct proc *target;
 	int rv;
 
 	/*
 	 * In capability mode a process may signal only itself.  This is
 	 * permitted mainly because abort(3) is implemented as
 	 * kill(getpid(), SIGABRT).
 	 */
 	if (IN_CAPABILITY_MODE(td) && uap->pid != td->td_proc->p_pid)
 		return (ECAPMODE);
 
 	AUDIT_ARG_SIGNUM(uap->signum);
 	AUDIT_ARG_PID(uap->pid);
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	/* Describe the sender for the receiver's siginfo. */
 	ksiginfo_init(&info);
 	info.ksi_signo = uap->signum;
 	info.ksi_code = SI_USER;
 	info.ksi_pid = td->td_proc->p_pid;
 	info.ksi_uid = td->td_ucred->cr_ruid;
 
 	if (uap->pid > 0) {
 		/* kill single process */
 		target = pfind_any(uap->pid);
 		if (target == NULL)
 			return (ESRCH);
 		AUDIT_ARG_PROCESS(target);
 		rv = p_cansignal(td, target, uap->signum);
 		if (rv == 0 && uap->signum != 0)
 			pksignal(target, uap->signum, &info);
 		PROC_UNLOCK(target);
 		return (rv);
 	}
 
 	/* broadcast signal */
 	if (uap->pid == -1)
 		return (killpg1(td, uap->signum, 0, 1, &info));
 
 	/* signal own process group */
 	if (uap->pid == 0)
 		return (killpg1(td, uap->signum, 0, 0, &info));
 
 	/* negative explicit process group */
 	return (killpg1(td, uap->signum, -uap->pid, 0, &info));
 }
 
 /*
  * pdkill(2) system call: send a signal to the process referred to by
  * the process descriptor uap->fd.  signum == 0 only performs the
  * permission check.
  */
 int
 sys_pdkill(struct thread *td, struct pdkill_args *uap)
 {
 	struct proc *target;
 	int rv;
 
 	AUDIT_ARG_SIGNUM(uap->signum);
 	AUDIT_ARG_FD(uap->fd);
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	rv = procdesc_find(td, uap->fd, &cap_pdkill_rights, &target);
 	if (rv != 0)
 		return (rv);
 
 	AUDIT_ARG_PROCESS(target);
 	rv = p_cansignal(td, target, uap->signum);
 	if (rv == 0 && uap->signum != 0)
 		kern_psignal(target, uap->signum);
 	PROC_UNLOCK(target);
 
 	return (rv);
 }
 
 #if defined(COMPAT_43)
 #ifndef _SYS_SYSPROTO_H_
 struct okillpg_args {
 	int	pgid;
 	int	signum;
 };
 #endif
 /*
  * Old killpg(): send a user-originated signal to process group
  * uap->pgid through the common killpg1() code.
  */
 /* ARGSUSED */
 int
 okillpg(struct thread *td, struct okillpg_args *uap)
 {
 	ksiginfo_t info;
 	int rv;
 
 	AUDIT_ARG_SIGNUM(uap->signum);
 	AUDIT_ARG_PID(uap->pgid);
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
 	/* Describe the sender for the receivers' siginfo. */
 	ksiginfo_init(&info);
 	info.ksi_signo = uap->signum;
 	info.ksi_code = SI_USER;
 	info.ksi_pid = td->td_proc->p_pid;
 	info.ksi_uid = td->td_ucred->cr_ruid;
 
 	rv = killpg1(td, uap->signum, uap->pgid, 0, &info);
 	return (rv);
 }
 #endif /* COMPAT_43 */
 
 #ifndef _SYS_SYSPROTO_H_
 struct sigqueue_args {
 	pid_t pid;
 	int signum;
 	/* union sigval */ void *value;
 };
 #endif
 int
 sys_sigqueue(struct thread *td, struct sigqueue_args *uap)
 {
 	union sigval sv;
 
 	sv.sival_ptr = uap->value;
 
 	return (kern_sigqueue(td, uap->pid, uap->signum, &sv));
 }
 
 /*
  * Queue signal signum, carrying *value, to the single process pid.
  *
  * Returns EINVAL for an out-of-range signal number or a non-positive
  * pid, ESRCH if the process cannot be found, the p_cansignal() error if
  * the caller may not signal it, or the result of pksignal().  A signum
  * of 0 performs only the permission check.
  */
 int
 kern_sigqueue(struct thread *td, pid_t pid, int signum, union sigval *value)
 {
 	ksiginfo_t info;
 	struct proc *target;
 	int rv;
 
 	/*
 	 * Besides the range check on the signal number, the specification
 	 * says sigqueue can only send a signal to a single process, so
 	 * group and broadcast pids are rejected.
 	 */
 	if ((u_int)signum > _SIG_MAXSIG || pid <= 0)
 		return (EINVAL);
 
 	target = pfind_any(pid);
 	if (target == NULL)
 		return (ESRCH);
 
 	rv = p_cansignal(td, target, signum);
 	if (rv != 0 || signum == 0)
 		goto done;
 
 	ksiginfo_init(&info);
 	info.ksi_flags = KSI_SIGQ;
 	info.ksi_signo = signum;
 	info.ksi_code = SI_QUEUE;
 	info.ksi_pid = td->td_proc->p_pid;
 	info.ksi_uid = td->td_ucred->cr_ruid;
 	info.ksi_value = *value;
 	rv = pksignal(target, info.ksi_signo, &info);
 done:
 	PROC_UNLOCK(target);
 	return (rv);
 }
 
 /*
  * Send a signal to a process group, looked up by id.  A pgid of 0 or a
  * group that cannot be found is silently ignored.
  */
 void
 gsignal(int pgid, int sig, ksiginfo_t *ksi)
 {
 	struct pgrp *grp;
 
 	if (pgid == 0)
 		return;
 
 	sx_slock(&proctree_lock);
 	grp = pgfind(pgid);
 	sx_sunlock(&proctree_lock);
 	if (grp == NULL)
 		return;
 
 	pgsignal(grp, sig, 0, ksi);
 	PGRP_UNLOCK(grp);
 }
 
 /*
  * Send a signal to a process group.  If checkctty is non-zero,
  * limit to members which have a controlling terminal.  Only members in
  * the PRS_NORMAL state are signalled.  A NULL pgrp is a no-op;
  * otherwise the group lock must be held by the caller.
  */
 void
 pgsignal(struct pgrp *pgrp, int sig, int checkctty, ksiginfo_t *ksi)
 {
 	struct proc *member;
 
 	if (pgrp == NULL)
 		return;
 
 	PGRP_LOCK_ASSERT(pgrp, MA_OWNED);
 	LIST_FOREACH(member, &pgrp->pg_members, p_pglist) {
 		PROC_LOCK(member);
 		if (member->p_state == PRS_NORMAL &&
 		    (checkctty == 0 || (member->p_flag & P_CONTROLT) != 0))
 			pksignal(member, sig, ksi);
 		PROC_UNLOCK(member);
 	}
 }
 
 
 /*
  * Finish delivery of a caught signal once the usermode frame has been
  * built: count the signal, block the handler's catch mask (plus the
  * signal itself unless it is in ps_signodefer) and, for signals in
  * ps_sigreset, reset the disposition via sigdflt().
  *
  * Should be called after the machine-specific routine, because
  * sysent->sv_sendsig() needs correct ps_siginfo and signal mask.
  * The sigacts mutex must be held.
  */
 static void
 postsig_done(int sig, struct thread *td, struct sigacts *ps)
 {
 	sigset_t blockset;
 
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 
 	td->td_ru.ru_nsignals += 1;
 
 	blockset = ps->ps_catchmask[_SIG_IDX(sig)];
 	if (!SIGISMEMBER(ps->ps_signodefer, sig))
 		SIGADDSET(blockset, sig);
 	(void)kern_sigprocmask(td, SIG_BLOCK, &blockset, NULL,
 	    SIGPROCMASK_PROC_LOCKED | SIGPROCMASK_PS_LOCKED);
 
 	if (SIGISMEMBER(ps->ps_sigreset, sig))
 		sigdflt(ps, sig);
 }
 
 
 /*
  * Send a signal caused by a trap to the current thread.  If it will be
  * caught immediately, deliver it with correct code.  Otherwise, post it
  * normally.
  *
  * "Caught immediately" means the process is not traced, the signal is
  * in ps_sigcatch and the thread does not have it masked; in that case
  * the ABI's sv_sendsig hook is called directly.  Otherwise the signal
  * is posted with tdsendsignal().  The signal number and code are taken
  * from ksi.
  */
 void
 trapsignal(struct thread *td, ksiginfo_t *ksi)
 {
 	struct sigacts *ps;
 	struct proc *p;
 	int sig;
 	int code;
 
 	p = td->td_proc;
 	sig = ksi->ksi_signo;
 	code = ksi->ksi_code;
 	KASSERT(_SIG_VALID(sig), ("invalid signal"));
 
 	PROC_LOCK(p);
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) &&
 	    !SIGISMEMBER(td->td_sigmask, sig)) {
 #ifdef KTRACE
 		if (KTRPOINT(curthread, KTR_PSIG))
 			ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
 			    &td->td_sigmask, code);
 #endif
 		/* Fast path: call the handler set-up hook right away. */
 		(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)],
 				ksi, &td->td_sigmask);
 		postsig_done(sig, td, ps);
 		mtx_unlock(&ps->ps_mtx);
 	} else {
 		/*
 		 * Avoid a possible infinite loop if the thread
 		 * masking the signal or process is ignoring the
 		 * signal.
 		 */
 		if (kern_forcesigexit &&
 		    (SIGISMEMBER(td->td_sigmask, sig) ||
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) {
 			/* Force the default action: unmask and reset to SIG_DFL. */
 			SIGDELSET(td->td_sigmask, sig);
 			SIGDELSET(ps->ps_sigcatch, sig);
 			SIGDELSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
 		mtx_unlock(&ps->ps_mtx);
 		p->p_sig = sig;		/* XXX to verify code */
 		tdsendsignal(p, td, sig, ksi);
 	}
 	PROC_UNLOCK(p);
 }
 
 /*
  * Choose the thread of p that should be notified about signal sig.
  *
  * Preference order: the current thread if it belongs to p and does not
  * mask sig, then the first thread of p that does not mask sig, and
  * finally the first thread of p.  The process lock must be held.  The
  * prop argument is not used.
  */
 static struct thread *
 sigtd(struct proc *p, int sig, int prop)
 {
 	struct thread *cand;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	/*
 	 * Prefer the current thread when it can take the signal, so no
 	 * switch to another thread is needed.
 	 */
 	if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig))
 		return (curthread);
 
 	FOREACH_THREAD_IN_PROC(p, cand) {
 		if (!SIGISMEMBER(cand->td_sigmask, sig))
 			return (cand);
 	}
 
 	/* Every thread masks the signal; fall back to the first one. */
 	return (FIRST_THREAD_IN_PROC(p));
 }
 
 /*
  * Send the signal to the process.  If the signal has an action, the action
  * is usually performed by the target process rather than the caller; we add
  * the signal to the set of pending signals for the process.
  *
  * Exceptions:
  *   o When a stop signal is sent to a sleeping process that takes the
  *     default action, the process is stopped without awakening it.
  *   o SIGCONT restarts stopped processes (or puts them back to sleep)
  *     regardless of the signal action (eg, blocked or ignored).
  *
  * Other ignored signals are discarded immediately.
  *
  * NB: This function may be entered from the debugger via the "kill" DDB
  * command.  There is little that can be done to mitigate the possibly messy
  * side effects of this unwise possibility.
  */
 void
 kern_psignal(struct proc *p, int sig)
 {
 	ksiginfo_t info;
 
 	/*
 	 * Kernel-originated signal: only the signal number and the
 	 * SI_KERNEL code are filled in.  The result of tdsendsignal() is
 	 * intentionally dropped.
 	 */
 	ksiginfo_init(&info);
 	info.ksi_code = SI_KERNEL;
 	info.ksi_signo = sig;
 
 	(void)tdsendsignal(p, NULL, sig, &info);
 }
 
 /*
  * Send sig to process p with caller-supplied siginfo.  No particular
  * thread is requested; returns whatever tdsendsignal() returns.
  */
 int
 pksignal(struct proc *p, int sig, ksiginfo_t *ksi)
 {
 	int rv;
 
 	rv = tdsendsignal(p, NULL, sig, ksi);
 	return (rv);
 }
 
 /*
  * Utility function for finding a thread to send signal event to.
  *
  * For SIGEV_THREAD_ID notifications the thread is looked up with
  * tdfind() and stored in *ttd; ESRCH is returned if it does not exist.
  * For any other notification type *ttd is set to NULL and the process
  * is locked here.
  * NOTE(review): the thread path takes no lock itself; presumably
  * tdfind() returns with the process locked -- confirm.
  */
 int
 sigev_findtd(struct proc *p, struct sigevent *sigev, struct thread **ttd)
 {
 	struct thread *target;
 
 	if (sigev->sigev_notify != SIGEV_THREAD_ID) {
 		*ttd = NULL;
 		PROC_LOCK(p);
 		return (0);
 	}
 
 	target = tdfind(sigev->sigev_notify_thread_id, p->p_pid);
 	if (target == NULL)
 		return (ESRCH);
 	*ttd = target;
 	return (0);
 }
 
 /*
  * Send kernel-originated signal sig (code SI_KERNEL) to the specific
  * thread td.  The result of tdsendsignal() is dropped.
  */
 void
 tdsignal(struct thread *td, int sig)
 {
 	ksiginfo_t info;
 
 	ksiginfo_init(&info);
 	info.ksi_code = SI_KERNEL;
 	info.ksi_signo = sig;
 
 	(void)tdsendsignal(td->td_proc, td, sig, &info);
 }
 
 /*
  * Send sig to the specific thread td with caller-supplied siginfo.
  * The result of tdsendsignal() is dropped.
  */
 void
 tdksignal(struct thread *td, int sig, ksiginfo_t *ksi)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 	(void)tdsendsignal(p, td, sig, ksi);
 }
 
 /*
  * Post signal sig to process p, or to the specific thread td when td is
  * not NULL.
  *
  * The process lock must be held by the caller (asserted below) and sig
  * must be a valid signal number (the function panics otherwise).  When
  * td is NULL a target thread is chosen with sigtd() and the signal is
  * queued on the process-wide queue; otherwise it is queued on td's own
  * queue.  ksi, when not NULL, carries the siginfo to queue and must not
  * already be on a queue.
  *
  * Returns 0 when the signal was queued or deliberately discarded
  * (zombie target, ignored signal, default-action tty stop sent to an
  * orphaned process group, process not in PRS_NORMAL), or the non-zero
  * value returned by sigqueue_add().
  */
 int
 tdsendsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 {
 	sig_t action;
 	sigqueue_t *sigqueue;
 	int prop;
 	struct sigacts *ps;
 	int intrval;
 	int ret = 0;
 	int wakeup_swapper;
 
 	MPASS(td == NULL || p == td->td_proc);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	if (!_SIG_VALID(sig))
 		panic("%s(): invalid signal %d", __func__, sig);
 
 	KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("%s: ksi on queue", __func__));
 
 	/*
 	 * IEEE Std 1003.1-2001: return success when killing a zombie.
 	 */
 	if (p->p_state == PRS_ZOMBIE) {
 		if (ksi && (ksi->ksi_flags & KSI_INS))
 			ksiginfo_tryfree(ksi);
 		return (ret);
 	}
 
 	ps = p->p_sigacts;
 	KNOTE_LOCKED(p->p_klist, NOTE_SIGNAL | sig);
 	prop = sigprop(sig);
 
 	/*
 	 * No specific thread requested: pick one and use the process-wide
 	 * queue.  Otherwise the signal goes onto the thread's own queue.
 	 */
 	if (td == NULL) {
 		td = sigtd(p, sig, prop);
 		sigqueue = &p->p_sigqueue;
 	} else
 		sigqueue = &td->td_sigqueue;
 
 	SDT_PROBE3(proc, , , signal__send, td, p, sig);
 
 	/*
 	 * If the signal is being ignored,
 	 * then we forget about it immediately.
 	 * (Note: we don't set SIGCONT in ps_sigignore,
 	 * and if it is set to SIG_IGN,
 	 * action will be SIG_DFL here.)
 	 */
 	mtx_lock(&ps->ps_mtx);
 	if (SIGISMEMBER(ps->ps_sigignore, sig)) {
 		SDT_PROBE3(proc, , , signal__discard, td, p, sig);
 
 		mtx_unlock(&ps->ps_mtx);
 		if (ksi && (ksi->ksi_flags & KSI_INS))
 			ksiginfo_tryfree(ksi);
 		return (ret);
 	}
 	/*
 	 * Classify how the target will treat the signal (held, caught or
 	 * default) and which error an interrupted sleep should report.
 	 */
 	if (SIGISMEMBER(td->td_sigmask, sig))
 		action = SIG_HOLD;
 	else if (SIGISMEMBER(ps->ps_sigcatch, sig))
 		action = SIG_CATCH;
 	else
 		action = SIG_DFL;
 	if (SIGISMEMBER(ps->ps_sigintr, sig))
 		intrval = EINTR;
 	else
 		intrval = ERESTART;
 	mtx_unlock(&ps->ps_mtx);
 
 	if (prop & SIGPROP_CONT)
 		sigqueue_delete_stopmask_proc(p);
 	else if (prop & SIGPROP_STOP) {
 		/*
 		 * If sending a tty stop signal to a member of an orphaned
 		 * process group, discard the signal here if the action
 		 * is default; don't stop the process below if sleeping,
 		 * and don't clear any pending SIGCONT.
 		 */
 		if ((prop & SIGPROP_TTYSTOP) &&
 		    (p->p_pgrp->pg_jobc == 0) &&
 		    (action == SIG_DFL)) {
 			if (ksi && (ksi->ksi_flags & KSI_INS))
 				ksiginfo_tryfree(ksi);
 			return (ret);
 		}
 		sigqueue_delete_proc(p, SIGCONT);
 		if (p->p_flag & P_CONTINUED) {
 			p->p_flag &= ~P_CONTINUED;
 			PROC_LOCK(p->p_pptr);
 			sigqueue_take(p->p_ksi);
 			PROC_UNLOCK(p->p_pptr);
 		}
 	}
 
 	/* Queue the signal; if that fails nothing further is done. */
 	ret = sigqueue_add(sigqueue, sig, ksi);
 	if (ret != 0)
 		return (ret);
 	signotify(td);
 	/*
 	 * Defer further processing for signals which are held,
 	 * except that stopped processes must be continued by SIGCONT.
 	 */
 	if (action == SIG_HOLD &&
 	    !((prop & SIGPROP_CONT) && (p->p_flag & P_STOPPED_SIG)))
 		return (ret);
 
 	/* SIGKILL: Remove procfs STOPEVENTs. */
 	if (sig == SIGKILL) {
 		/* from procfs_ioctl.c: PIOCBIC */
 		p->p_stops = 0;
 		/* from procfs_ioctl.c: PIOCCONT */
 		p->p_step = 0;
 		wakeup(&p->p_step);
 	}
 	/*
 	 * Some signals have a process-wide effect and a per-thread
 	 * component.  Most processing occurs when the process next
 	 * tries to cross the user boundary, however there are some
 	 * times when processing needs to be done immediately, such as
 	 * waking up threads so that they can cross the user boundary.
 	 * We try to do the per-process part here.
 	 */
 	if (P_SHOULDSTOP(p)) {
 		KASSERT(!(p->p_flag & P_WEXIT),
 		    ("signal to stopped but exiting process"));
 		if (sig == SIGKILL) {
 			/*
 			 * If traced process is already stopped,
 			 * then no further action is necessary.
 			 */
 			if (p->p_flag & P_TRACED)
 				goto out;
 			/*
 			 * SIGKILL sets process running.
 			 * It will die elsewhere.
 			 * All threads must be restarted.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
 			goto runfast;
 		}
 
 		if (prop & SIGPROP_CONT) {
 			/*
 			 * If traced process is already stopped,
 			 * then no further action is necessary.
 			 */
 			if (p->p_flag & P_TRACED)
 				goto out;
 			/*
 			 * If SIGCONT is default (or ignored), we continue the
 			 * process but don't leave the signal in sigqueue as
 			 * it has no further action.  If SIGCONT is held, we
 			 * continue the process and leave the signal in
 			 * sigqueue.  If the process catches SIGCONT, let it
 			 * handle the signal itself.  If it isn't waiting on
 			 * an event, it goes back to run state.
 			 * Otherwise, process goes back to sleep state.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
 			PROC_SLOCK(p);
 			if (p->p_numthreads == p->p_suspcount) {
 				/*
 				 * The spin lock is dropped around the parent
 				 * notification and re-taken afterwards.
 				 */
 				PROC_SUNLOCK(p);
 				p->p_flag |= P_CONTINUED;
 				p->p_xsig = SIGCONT;
 				PROC_LOCK(p->p_pptr);
 				childproc_continued(p);
 				PROC_UNLOCK(p->p_pptr);
 				PROC_SLOCK(p);
 			}
 			if (action == SIG_DFL) {
 				thread_unsuspend(p);
 				PROC_SUNLOCK(p);
 				sigqueue_delete(sigqueue, sig);
 				goto out;
 			}
 			if (action == SIG_CATCH) {
 				/*
 				 * The process wants to catch it so it needs
 				 * to run at least one thread, but which one?
 				 */
 				PROC_SUNLOCK(p);
 				goto runfast;
 			}
 			/*
 			 * The signal is not ignored or caught.
 			 */
 			thread_unsuspend(p);
 			PROC_SUNLOCK(p);
 			goto out;
 		}
 
 		if (prop & SIGPROP_STOP) {
 			/*
 			 * If traced process is already stopped,
 			 * then no further action is necessary.
 			 */
 			if (p->p_flag & P_TRACED)
 				goto out;
 			/*
 			 * Already stopped, don't need to stop again
 			 * (If we did the shell could get confused).
 			 * Just make sure the signal STOP bit set.
 			 */
 			p->p_flag |= P_STOPPED_SIG;
 			sigqueue_delete(sigqueue, sig);
 			goto out;
 		}
 
 		/*
 		 * All other kinds of signals:
 		 * If a thread is sleeping interruptibly, simulate a
 		 * wakeup so that when it is continued it will be made
 		 * runnable and can look at the signal.  However, don't make
 		 * the PROCESS runnable, leave it stopped.
 		 * It may run a bit until it hits a thread_suspend_check().
 		 */
 		wakeup_swapper = 0;
 		PROC_SLOCK(p);
 		thread_lock(td);
 		if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR))
 			wakeup_swapper = sleepq_abort(td, intrval);
 		thread_unlock(td);
 		PROC_SUNLOCK(p);
 		if (wakeup_swapper)
 			kick_proc0();
 		goto out;
 		/*
 		 * Mutexes are short lived. Threads waiting on them will
 		 * hit thread_suspend_check() soon.
 		 */
 	} else if (p->p_state == PRS_NORMAL) {
 		if (p->p_flag & P_TRACED || action == SIG_CATCH) {
 			tdsigwakeup(td, sig, action, intrval);
 			goto out;
 		}
 
 		MPASS(action == SIG_DFL);
 
 		if (prop & SIGPROP_STOP) {
 			if (p->p_flag & (P_PPWAIT|P_WEXIT))
 				goto out;
 			p->p_flag |= P_STOPPED_SIG;
 			p->p_xsig = sig;
 			PROC_SLOCK(p);
 			wakeup_swapper = sig_suspend_threads(td, p, 1);
 			if (p->p_numthreads == p->p_suspcount) {
 				/*
 				 * only thread sending signal to another
 				 * process can reach here, if thread is sending
 				 * signal to its process, because thread does
 				 * not suspend itself here, p_numthreads
 				 * should never be equal to p_suspcount.
 				 */
 				thread_stopped(p);
 				PROC_SUNLOCK(p);
 				sigqueue_delete_proc(p, p->p_xsig);
 			} else
 				PROC_SUNLOCK(p);
 			if (wakeup_swapper)
 				kick_proc0();
 			goto out;
 		}
 	} else {
 		/* Not in "NORMAL" state. discard the signal. */
 		sigqueue_delete(sigqueue, sig);
 		goto out;
 	}
 
 	/*
 	 * The process is not stopped so we need to apply the signal to all the
 	 * running threads.
 	 */
 runfast:
 	tdsigwakeup(td, sig, action, intrval);
 	PROC_SLOCK(p);
 	thread_unsuspend(p);
 	PROC_SUNLOCK(p);
 out:
 	/* If we jump here, proc slock should not be owned. */
 	PROC_SLOCK_ASSERT(p, MA_NOTOWNED);
 	return (ret);
 }
 
 /*
  * The force of a signal has been directed against a single
  * thread.  We need to see what we can do about knocking it
  * out of any sleep it may be in etc.
  *
  * 'action' is the disposition computed by the caller (SIG_DFL,
  * SIG_CATCH or SIG_HOLD) and 'intrval' is the value handed to
  * sleepq_abort() if the sleep is interrupted.  The process lock must be
  * held; the process spin lock and the thread lock are taken and
  * released here.
  */
 static void
 tdsigwakeup(struct thread *td, int sig, sig_t action, int intrval)
 {
 	struct proc *p = td->td_proc;
 	int prop;
 	int wakeup_swapper;
 
 	wakeup_swapper = 0;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	prop = sigprop(sig);
 
 	PROC_SLOCK(p);
 	thread_lock(td);
 	/*
 	 * Bring the priority of a thread up if we want it to get
 	 * killed in this lifetime.  Be careful to avoid bumping the
 	 * priority of the idle thread, since we still allow to signal
 	 * kernel processes.
 	 */
 	if (action == SIG_DFL && (prop & SIGPROP_KILL) != 0 &&
 	    td->td_priority > PUSER && !TD_IS_IDLETHREAD(td))
 		sched_prio(td, PUSER);
 	if (TD_ON_SLEEPQ(td)) {
 		/*
 		 * If thread is sleeping uninterruptibly
 		 * we can't interrupt the sleep... the signal will
 		 * be noticed when the process returns through
 		 * trap() or syscall().
 		 */
 		if ((td->td_flags & TDF_SINTR) == 0)
 			goto out;
 		/*
 		 * If SIGCONT is default (or ignored) and process is
 		 * asleep, we are finished; the process should not
 		 * be awakened.
 		 */
 		if ((prop & SIGPROP_CONT) && action == SIG_DFL) {
 			/* Locks are dropped before the queues are edited. */
 			thread_unlock(td);
 			PROC_SUNLOCK(p);
 			sigqueue_delete(&p->p_sigqueue, sig);
 			/*
 			 * It may be on either list in this state.
 			 * Remove from both for now.
 			 */
 			sigqueue_delete(&td->td_sigqueue, sig);
 			return;
 		}
 
 		/*
 		 * Don't awaken a sleeping thread for SIGSTOP if the
 		 * STOP signal is deferred.
 		 */
 		if ((prop & SIGPROP_STOP) != 0 && (td->td_flags & (TDF_SBDRY |
 		    TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY)
 			goto out;
 
 		/*
 		 * Give low priority threads a better chance to run.
 		 */
 		if (td->td_priority > PUSER && !TD_IS_IDLETHREAD(td))
 			sched_prio(td, PUSER);
 
 		wakeup_swapper = sleepq_abort(td, intrval);
 	} else {
 		/*
 		 * Other states do nothing with the signal immediately,
 		 * other than kicking ourselves if we are running.
 		 * It will either never be noticed, or noticed very soon.
 		 */
 #ifdef SMP
 		if (TD_IS_RUNNING(td) && td != curthread)
 			forward_signal(td);
 #endif
 	}
 out:
 	PROC_SUNLOCK(p);
 	thread_unlock(td);
 	/* Acted on only after both locks above have been released. */
 	if (wakeup_swapper)
 		kick_proc0();
 }
 
 /*
  * Ask every thread of process p to stop.
  *
  * Each thread is flagged with TDF_ASTPENDING | TDF_NEEDSUSPCHK.  A
  * thread that is sleeping (or swapped) interruptibly is suspended in
  * place, unless it sleeps with TDF_SBDRY, in which case its sleep is
  * aborted only if TD_SBDRY_INTR() is true.  Other threads that are not
  * already suspended are, on SMP, sent forward_signal() when running
  * (except td itself).
  *
  * Both the process lock and the process spin lock must be held.  Unless
  * 'sending' is non-zero, td must be curthread.  Returns the OR of the
  * sleepq_abort() results; the caller in this file reacts to a non-zero
  * value by calling kick_proc0().
  */
 static int
 sig_suspend_threads(struct thread *td, struct proc *p, int sending)
 {
 	struct thread *td2;
 	int wakeup_swapper;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	MPASS(sending || td == curthread);
 
 	wakeup_swapper = 0;
 	FOREACH_THREAD_IN_PROC(p, td2) {
 		thread_lock(td2);
 		td2->td_flags |= TDF_ASTPENDING | TDF_NEEDSUSPCHK;
 		if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) &&
 		    (td2->td_flags & TDF_SINTR)) {
 			if (td2->td_flags & TDF_SBDRY) {
 				/*
 				 * Once a thread is asleep with
 				 * TDF_SBDRY and without TDF_SERESTART
 				 * or TDF_SEINTR set, it should never
 				 * become suspended due to this check.
 				 */
 				KASSERT(!TD_IS_SUSPENDED(td2),
 				    ("thread with deferred stops suspended"));
 				if (TD_SBDRY_INTR(td2))
 					wakeup_swapper |= sleepq_abort(td2,
 					    TD_SBDRY_ERRNO(td2));
 			} else if (!TD_IS_SUSPENDED(td2)) {
 				thread_suspend_one(td2);
 			}
 		} else if (!TD_IS_SUSPENDED(td2)) {
 			/*
 			 * NOTE(review): TDF_ASTPENDING was already set
 			 * unconditionally at the top of the loop, so this
 			 * assignment looks redundant -- confirm.
 			 */
 			if (sending || td != td2)
 				td2->td_flags |= TDF_ASTPENDING;
 #ifdef SMP
 			if (TD_IS_RUNNING(td2) && td2 != td)
 				forward_signal(td2);
 #endif
 		}
 		thread_unlock(td2);
 	}
 	return (wakeup_swapper);
 }
 
 /*
  * Stop the process for an event deemed interesting to the debugger. If si is
  * non-NULL, this is a signal exchange; the new signal requested by the
  * debugger will be returned for handling. If si is NULL, this is some other
  * type of interesting event. The debugger may request a signal be delivered in
  * that case as well, however it will be deferred until it can be handled.
  *
  * The process lock must be held on entry (asserted).  The process spin
  * lock is taken and dropped internally around the stop loop.
  *
  * Returns td->td_xsig as left after the stop, or 0 when the stop was
  * skipped because the process is single-threading for exit, or when the
  * replacement signal was queued to a thread other than td.
  */
 int
 ptracestop(struct thread *td, int sig, ksiginfo_t *si)
 {
 	struct proc *p = td->td_proc;
 	struct thread *td2;
 	ksiginfo_t ksi;
 	int prop;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(!(p->p_flag & P_WEXIT), ("Stopping exiting process"));
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
 	    &p->p_mtx.lock_object, "Stopping for traced signal");
 
 	td->td_xsig = sig;
 
 	/* Signals already tagged KSI_PTRACE do not stop again. */
 	if (si == NULL || (si->ksi_flags & KSI_PTRACE) == 0) {
 		td->td_dbgflags |= TDB_XSIG;
 		CTR4(KTR_PTRACE, "ptracestop: tid %d (pid %d) flags %#x sig %d",
 		    td->td_tid, p->p_pid, td->td_dbgflags, sig);
 		PROC_SLOCK(p);
 		while ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_XSIG)) {
 			if (P_KILLED(p)) {
 				/*
 				 * Ensure that, if we've been PT_KILLed, the
 				 * exit status reflects that. Another thread
 				 * may also be in ptracestop(), having just
 				 * received the SIGKILL, but this thread was
 				 * unsuspended first.
 				 */
 				td->td_dbgflags &= ~TDB_XSIG;
 				td->td_xsig = SIGKILL;
 				p->p_ptevents = 0;
 				break;
 			}
 			if (p->p_flag & P_SINGLE_EXIT &&
 			    !(td->td_dbgflags & TDB_EXIT)) {
 				/*
 				 * Ignore ptrace stops except for thread exit
 				 * events when the process exits.
 				 */
 				td->td_dbgflags &= ~TDB_XSIG;
 				PROC_SUNLOCK(p);
 				return (0);
 			}
 
 			/*
 			 * Make wait(2) work.  Ensure that right after the
 			 * attach, the thread which was decided to become the
 			 * leader of attach gets reported to the waiter.
 			 * Otherwise, just avoid overwriting another thread's
 			 * assignment to p_xthread.  If another thread has
 			 * already set p_xthread, the current thread will get
 			 * a chance to report itself upon the next iteration.
 			 */
 			if ((td->td_dbgflags & TDB_FSTP) != 0 ||
 			    ((p->p_flag2 & P2_PTRACE_FSTP) == 0 &&
 			    p->p_xthread == NULL)) {
 				p->p_xsig = sig;
 				p->p_xthread = td;
 
 				/*
 				 * If we are on sleepqueue already,
 				 * let sleepqueue code decide if it
 				 * needs to go sleep after attach.
 				 */
 				if (td->td_wchan == NULL)
 					td->td_dbgflags &= ~TDB_FSTP;
 
 				p->p_flag2 &= ~P2_PTRACE_FSTP;
 				p->p_flag |= P_STOPPED_SIG | P_STOPPED_TRACE;
 				/* The return value is deliberately not used here. */
 				sig_suspend_threads(td, p, 0);
 			}
 			if ((td->td_dbgflags & TDB_STOPATFORK) != 0) {
 				td->td_dbgflags &= ~TDB_STOPATFORK;
 			}
 stopme:
 			thread_suspend_switch(td, p);
 			/* Resumed: give up the reporting slot if we held it. */
 			if (p->p_xthread == td)
 				p->p_xthread = NULL;
 			if (!(p->p_flag & P_TRACED))
 				break;
 			/* Still marked for suspension: stop again unless exiting. */
 			if (td->td_dbgflags & TDB_SUSPEND) {
 				if (p->p_flag & P_SINGLE_EXIT)
 					break;
 				goto stopme;
 			}
 		}
 		PROC_SUNLOCK(p);
 	}
 
 	if (si != NULL && sig == td->td_xsig) {
 		/* Parent wants us to take the original signal unchanged. */
 		si->ksi_flags |= KSI_HEAD;
 		if (sigqueue_add(&td->td_sigqueue, sig, si) != 0)
 			si->ksi_signo = 0;
 	} else if (td->td_xsig != 0) {
 		/*
 		 * If parent wants us to take a new signal, then it will leave
 		 * it in td->td_xsig; otherwise we just look for signals again.
 		 */
 		ksiginfo_init(&ksi);
 		ksi.ksi_signo = td->td_xsig;
 		ksi.ksi_flags |= KSI_PTRACE;
 		prop = sigprop(td->td_xsig);
 		td2 = sigtd(p, td->td_xsig, prop);
 		tdsendsignal(p, td2, td->td_xsig, &ksi);
 		/* The signal went to another thread; nothing for td to take. */
 		if (td != td2)
 			return (0);
 	}
 
 	return (td->td_xsig);
 }
 
 /*
  * For each signal that is both in "block" and pending on the process
  * queue, pick a thread with sigtd(), notify it, and wake it up when the
  * process is traced or the signal is caught and not masked by that thread.
  *
  * The process lock must be held.  "flags" may contain
  * SIGPROCMASK_PS_LOCKED to say that the caller already holds ps_mtx;
  * otherwise ps_mtx is taken and released around each wakeup decision.
  */
 static void
 reschedule_signals(struct proc *p, sigset_t block, int flags)
 {
 	struct sigacts *ps;
 	struct thread *td;
 	int sig;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, (flags & SIGPROCMASK_PS_LOCKED) != 0 ?
 	    MA_OWNED : MA_NOTOWNED);
 	if (SIGISEMPTY(p->p_siglist))
 		return;
 	/* Only signals that are actually pending are of interest. */
 	SIGSETAND(block, p->p_siglist);
 	while ((sig = sig_ffs(&block)) != 0) {
 		SIGDELSET(block, sig);
 		td = sigtd(p, sig, 0);
 		signotify(td);
 		if (!(flags & SIGPROCMASK_PS_LOCKED))
 			mtx_lock(&ps->ps_mtx);
 		if (p->p_flag & P_TRACED ||
 		    (SIGISMEMBER(ps->ps_sigcatch, sig) &&
 		    !SIGISMEMBER(td->td_sigmask, sig)))
 			tdsigwakeup(td, sig, SIG_CATCH,
 			    (SIGISMEMBER(ps->ps_sigintr, sig) ? EINTR :
 			     ERESTART));
 		if (!(flags & SIGPROCMASK_PS_LOCKED))
 			mtx_unlock(&ps->ps_mtx);
 	}
 }
 
 /*
  * Signal-related teardown for a thread: flush its private signal queue
  * and, when other threads remain in the process, block everything in the
  * thread's mask and hand the signals it used to accept over to the
  * remaining threads.  The process lock must be held (asserted).
  */
 void
 tdsigcleanup(struct thread *td)
 {
 	struct proc *proc;
 	sigset_t accepted;
 
 	proc = td->td_proc;
 	PROC_LOCK_ASSERT(proc, MA_OWNED);
 
 	sigqueue_flush(&td->td_sigqueue);
 	if (proc->p_numthreads != 1) {
 		/*
 		 * This thread can no longer take signals, so advertise
 		 * that to the signal posting code by filling its mask.
 		 *
 		 * The thread may already have been chosen for delivery
 		 * and woken up, so give the other threads a chance at
 		 * every signal this one did not block.
 		 */
 		SIGFILLSET(accepted);
 		SIGSETNAND(accepted, td->td_sigmask);
 		SIGFILLSET(td->td_sigmask);
 		reschedule_signals(proc, accepted, 0);
 	}
 }
 
 /*
  * Extract the stop-deferral mode bits (TDF_SBDRY, TDF_SEINTR and
  * TDF_SERESTART) from a td_flags value.  TDF_SEINTR and TDF_SERESTART
  * are only expected together with TDF_SBDRY, which is asserted.
  */
 static int
 sigdeferstop_curr_flags(int cflags)
 {
 	int modebits;
 
 	MPASS((cflags & (TDF_SEINTR | TDF_SERESTART)) == 0 ||
 	    (cflags & TDF_SBDRY) != 0);
 	modebits = TDF_SBDRY | TDF_SEINTR | TDF_SERESTART;
 	return (cflags & modebits);
 }
 
 /*
  * Switch the current thread to the requested stop-deferral mode.
  *
  * The return value is the previous set of mode flags, to be handed back
  * to sigallowstop(), or SIGDEFERSTOP_VAL_NCHG when the thread was already
  * in the requested mode and nothing was modified.
  *
  * Only the current thread ever sets or clears TDF_SBDRY, TDF_SEINTR and
  * TDF_SERESTART, so they can be read here without taking the thread lock;
  * the lock is taken only for the update itself.
  */
 int
 sigdeferstop_impl(int mode)
 {
 	struct thread *self;
 	int before, after, tdflags;
 
 	self = curthread;
 	before = sigdeferstop_curr_flags(self->td_flags);
 	switch (mode) {
 	case SIGDEFERSTOP_NOP:
 		after = before;
 		break;
 	case SIGDEFERSTOP_OFF:
 		after = 0;
 		break;
 	case SIGDEFERSTOP_SILENT:
 		after = (before | TDF_SBDRY) & ~(TDF_SEINTR | TDF_SERESTART);
 		break;
 	case SIGDEFERSTOP_EINTR:
 		after = (before | TDF_SBDRY | TDF_SEINTR) & ~TDF_SERESTART;
 		break;
 	case SIGDEFERSTOP_ERESTART:
 		after = (before | TDF_SBDRY | TDF_SERESTART) & ~TDF_SEINTR;
 		break;
 	default:
 		panic("sigdeferstop: invalid mode %x", mode);
 		break;
 	}
 	if (after == before)
 		return (SIGDEFERSTOP_VAL_NCHG);
 
 	/* Swap the old mode bits for the new ones in a single store. */
 	thread_lock(self);
 	tdflags = self->td_flags & ~before;
 	self->td_flags = tdflags | after;
 	thread_unlock(self);
 	return (before);
 }
 
 /*
  * Put the current thread back into the stop-handling mode described by
  * "prev", the value previously returned by sigdeferstop_impl().  This
  * usually re-enables delivery of SIGSTOP.  A stop that was posted in the
  * meantime is not acted on here; the thread suspends later, either via
  * ast() or in a subsequent interruptible sleep.
  */
 void
 sigallowstop_impl(int prev)
 {
 	struct thread *self;
 	int current, tdflags;
 
 	KASSERT(prev != SIGDEFERSTOP_VAL_NCHG, ("failed sigallowstop"));
 	KASSERT((prev & ~(TDF_SBDRY | TDF_SEINTR | TDF_SERESTART)) == 0,
 	    ("sigallowstop: incorrect previous mode %x", prev));
 	self = curthread;
 	current = sigdeferstop_curr_flags(self->td_flags);
 	if (current == prev)
 		return;
 
 	thread_lock(self);
 	tdflags = self->td_flags & ~current;
 	self->td_flags = tdflags | prev;
 	thread_unlock(self);
 }
 
 /*
  * If the current process has received a signal (should be caught or cause
  * termination, should interrupt current syscall), return the signal number.
  * Stop signals with default action are processed immediately, then cleared;
  * they aren't returned.  This is checked after each entry to the system for
  * a syscall or trap (though this can usually be done without calling issignal
  * by checking the pending signal masks in cursig.) The normal call
  * sequence is
  *
  *	while (sig = cursig(curthread))
  *		postsig(sig);
  *
  * Both the process lock and ps_mtx must be held on entry (asserted); ps_mtx
  * is dropped and re-taken around stopevent(), ptracestop() and the
  * default-action stop.
  *
  * Returns 0 when no deliverable signal is pending, the signal number when
  * one should be handled by postsig(), and -1 when a default-action stop
  * signal is pending while the thread is in an interrupting deferred-stop
  * mode (TD_SBDRY_INTR).
  */
 static int
 issignal(struct thread *td)
 {
 	struct proc *p;
 	struct sigacts *ps;
 	struct sigqueue *queue;
 	sigset_t sigpending;
 	ksiginfo_t ksi;
 	int prop, sig, traced;
 
 	p = td->td_proc;
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	for (;;) {
 		traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
 
 		/* Pending = (thread queue | process queue) minus the mask. */
 		sigpending = td->td_sigqueue.sq_signals;
 		SIGSETOR(sigpending, p->p_sigqueue.sq_signals);
 		SIGSETNAND(sigpending, td->td_sigmask);
 
 		/*
 		 * Stop signals are hidden while P_PPWAIT is set or while
 		 * the thread defers stops silently (TDF_SBDRY alone).
 		 */
 		if ((p->p_flag & P_PPWAIT) != 0 || (td->td_flags &
 		    (TDF_SBDRY | TDF_SERESTART | TDF_SEINTR)) == TDF_SBDRY)
 			SIG_STOPSIGMASK(sigpending);
 		if (SIGISEMPTY(sigpending))	/* no signal to send */
 			return (0);
 		if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED &&
 		    (p->p_flag2 & P2_PTRACE_FSTP) != 0 &&
 		    SIGISMEMBER(sigpending, SIGSTOP)) {
 			/*
 			 * If debugger just attached, always consume
 			 * SIGSTOP from ptrace(PT_ATTACH) first, to
 			 * execute the debugger attach ritual in
 			 * order.
 			 */
 			sig = SIGSTOP;
 			td->td_dbgflags |= TDB_FSTP;
 		} else {
 			sig = sig_ffs(&sigpending);
 		}
 
 		if (p->p_stops & S_SIG) {
 			mtx_unlock(&ps->ps_mtx);
 			stopevent(p, S_SIG, sig);
 			mtx_lock(&ps->ps_mtx);
 		}
 
 		/*
 		 * We should see pending but ignored signals
 		 * only if P_TRACED was on when they were posted.
 		 */
 		if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
 			sigqueue_delete(&td->td_sigqueue, sig);
 			sigqueue_delete(&p->p_sigqueue, sig);
 			continue;
 		}
 		if ((p->p_flag & (P_TRACED | P_PPTRACE)) == P_TRACED) {
 			/*
 			 * If traced, always stop.
 			 * Remove old signal from queue before the stop.
 			 * XXX shrug off debugger, it causes siginfo to
 			 * be thrown away.
 			 */
 			queue = &td->td_sigqueue;
 			ksiginfo_init(&ksi);
 			/* Fall back to the process queue; remember the source. */
 			if (sigqueue_get(queue, sig, &ksi) == 0) {
 				queue = &p->p_sigqueue;
 				sigqueue_get(queue, sig, &ksi);
 			}
 			td->td_si = ksi.ksi_info;
 
 			mtx_unlock(&ps->ps_mtx);
 			sig = ptracestop(td, sig, &ksi);
 			mtx_lock(&ps->ps_mtx);
 
 			td->td_si.si_signo = 0;
 
 			/*
 			 * Keep looking if the debugger discarded or
 			 * replaced the signal.
 			 */
 			if (sig == 0)
 				continue;
 
 			/*
 			 * If the signal became masked, re-queue it.
 			 */
 			if (SIGISMEMBER(td->td_sigmask, sig)) {
 				ksi.ksi_flags |= KSI_HEAD;
 				sigqueue_add(&p->p_sigqueue, sig, &ksi);
 				continue;
 			}
 
 			/*
 			 * If the traced bit got turned off, requeue
 			 * the signal and go back up to the top to
 			 * rescan signals.  This ensures that p_sig*
 			 * and p_sigact are consistent.
 			 */
 			if ((p->p_flag & P_TRACED) == 0) {
 				ksi.ksi_flags |= KSI_HEAD;
 				sigqueue_add(queue, sig, &ksi);
 				continue;
 			}
 		}
 
 		prop = sigprop(sig);
 
 		/*
 		 * Decide whether the signal should be returned.
 		 * Return the signal's number, or fall through
 		 * to clear it from the pending mask.
 		 */
 		switch ((intptr_t)p->p_sigacts->ps_sigact[_SIG_IDX(sig)]) {
 
 		case (intptr_t)SIG_DFL:
 			/*
 			 * Don't take default actions on system processes.
 			 */
 			if (p->p_pid <= 1) {
 #ifdef DIAGNOSTIC
 				/*
 				 * Are you sure you want to ignore SIGSEGV
 				 * in init? XXX
 				 */
 				printf("Process (pid %lu) got signal %d\n",
 					(u_long)p->p_pid, sig);
 #endif
 				break;		/* == ignore */
 			}
 			/*
 			 * If there is a pending stop signal to process with
 			 * default action, stop here, then clear the signal.
 			 * Traced or exiting processes should ignore stops.
 			 * Additionally, a member of an orphaned process group
 			 * should ignore tty stops.
 			 */
 			if (prop & SIGPROP_STOP) {
 				if (p->p_flag &
 				    (P_TRACED | P_WEXIT | P_SINGLE_EXIT) ||
 				    (p->p_pgrp->pg_jobc == 0 &&
 				     prop & SIGPROP_TTYSTOP))
 					break;	/* == ignore */
 				/* Leave the signal queued and report -1 to the caller. */
 				if (TD_SBDRY_INTR(td)) {
 					KASSERT((td->td_flags & TDF_SBDRY) != 0,
 					    ("lost TDF_SBDRY"));
 					return (-1);
 				}
 				mtx_unlock(&ps->ps_mtx);
 				WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
 				    &p->p_mtx.lock_object, "Catching SIGSTOP");
 				sigqueue_delete(&td->td_sigqueue, sig);
 				sigqueue_delete(&p->p_sigqueue, sig);
 				p->p_flag |= P_STOPPED_SIG;
 				p->p_xsig = sig;
 				PROC_SLOCK(p);
 				sig_suspend_threads(td, p, 0);
 				thread_suspend_switch(td, p);
 				PROC_SUNLOCK(p);
 				mtx_lock(&ps->ps_mtx);
 				/* Already dequeued above; skip the common delete. */
 				goto next;
 			} else if (prop & SIGPROP_IGNORE) {
 				/*
 				 * Except for SIGCONT, shouldn't get here.
 				 * Default action is to ignore; drop it.
 				 */
 				break;		/* == ignore */
 			} else
 				return (sig);
 			/*NOTREACHED*/
 
 		case (intptr_t)SIG_IGN:
 			/*
 			 * Masking above should prevent us ever trying
 			 * to take action on an ignored signal other
 			 * than SIGCONT, unless process is traced.
 			 */
 			if ((prop & SIGPROP_CONT) == 0 &&
 			    (p->p_flag & P_TRACED) == 0)
 				printf("issignal\n");
 			break;		/* == ignore */
 
 		default:
 			/*
 			 * This signal has an action, let
 			 * postsig() process it.
 			 */
 			return (sig);
 		}
 		sigqueue_delete(&td->td_sigqueue, sig);	/* take the signal! */
 		sigqueue_delete(&p->p_sigqueue, sig);
 next:;
 	}
 	/* NOTREACHED */
 }
 
 /*
  * Called with the process lock and the process spin lock held (both
  * asserted).  Once every thread of a signal-stopped process is accounted
  * for as suspended (the calling thread is counted in when p is curproc),
  * clear P_WAITED and tell the parent, using CLD_TRAPPED for a traced
  * process and CLD_STOPPED otherwise.  The spin lock is dropped while the
  * parent is notified and re-taken before returning.
  */
 void
 thread_stopped(struct proc *p)
 {
 	int nstopped;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	nstopped = p->p_suspcount;
 	if (p == curproc)
 		nstopped++;
 	if ((p->p_flag & P_STOPPED_SIG) == 0 || nstopped != p->p_numthreads)
 		return;
 
 	PROC_SUNLOCK(p);
 	p->p_flag &= ~P_WAITED;
 	PROC_LOCK(p->p_pptr);
 	if (p->p_flag & P_TRACED)
 		childproc_stopped(p, CLD_TRAPPED);
 	else
 		childproc_stopped(p, CLD_STOPPED);
 	PROC_UNLOCK(p->p_pptr);
 	PROC_SLOCK(p);
 }
 
 /*
  * Take the action for the specified signal
  * from the current set of pending signals.
  *
  * Operates on curthread.  The process lock and ps_mtx must be held
  * (asserted).  Returns 0 if the signal was found on neither the thread
  * nor the process queue, 1 after a handler has been set up.  For a
  * SIG_DFL action the process exits via sigexit() and this does not return.
  */
 int
 postsig(int sig)
 {
 	struct thread *td;
 	struct proc *p;
 	struct sigacts *ps;
 	sig_t action;
 	ksiginfo_t ksi;
 	sigset_t returnmask;
 
 	KASSERT(sig != 0, ("postsig"));
 
 	td = curthread;
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 	ksiginfo_init(&ksi);
 	/* Dequeue from the thread queue first, then the process queue. */
 	if (sigqueue_get(&td->td_sigqueue, sig, &ksi) == 0 &&
 	    sigqueue_get(&p->p_sigqueue, sig, &ksi) == 0)
 		return (0);
 	ksi.ksi_signo = sig;
 	if (ksi.ksi_code == SI_TIMER)
 		itimer_accept(p, ksi.ksi_timerid, &ksi);
 	action = ps->ps_sigact[_SIG_IDX(sig)];
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_PSIG))
 		ktrpsig(sig, action, td->td_pflags & TDP_OLDMASK ?
 		    &td->td_oldsigmask : &td->td_sigmask, ksi.ksi_code);
 #endif
 	if ((p->p_stops & S_SIG) != 0) {
 		mtx_unlock(&ps->ps_mtx);
 		stopevent(p, S_SIG, sig);
 		mtx_lock(&ps->ps_mtx);
 	}
 
 	if (action == SIG_DFL) {
 		/*
 		 * Default action, where the default is to kill
 		 * the process.  (Other cases were ignored above.)
 		 */
 		mtx_unlock(&ps->ps_mtx);
 		proc_td_siginfo_capture(td, &ksi.ksi_info);
 		sigexit(td, sig);
 		/* NOTREACHED */
 	} else {
 		/*
 		 * If we get here, the signal must be caught.
 		 */
 		KASSERT(action != SIG_IGN, ("postsig action %p", action));
 		KASSERT(!SIGISMEMBER(td->td_sigmask, sig),
 		    ("postsig action: blocked sig %d", sig));
 
 		/*
 		 * Set the new mask value and also defer further
 		 * occurrences of this signal.
 		 *
 		 * Special case: user has done a sigsuspend.  Here the
 		 * current mask is not of interest, but rather the
 		 * mask from before the sigsuspend is what we want
 		 * restored after the signal processing is completed.
 		 */
 		if (td->td_pflags & TDP_OLDMASK) {
 			returnmask = td->td_oldsigmask;
 			td->td_pflags &= ~TDP_OLDMASK;
 		} else
 			returnmask = td->td_sigmask;
 
 		if (p->p_sig == sig) {
 			p->p_sig = 0;
 		}
 		/* Hand off to the ABI-specific routine that builds the frame. */
 		(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
 		postsig_done(sig, td, ps);
 	}
 	return (1);
 }
 
 /*
  * Set P_WKILLED on the process if it is not set yet.  When the process
  * is neither in memory nor being swapped in, also wake up whoever sleeps
  * on &proc0.  The process lock must be held (asserted).
  */
 void
 proc_wkilled(struct proc *p)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if ((p->p_flag & P_WKILLED) != 0)
 		return;
 	p->p_flag |= P_WKILLED;
 
 	/*
 	 * Let the swapper know that a process needs to be brought in.
 	 * This notification is racy; in the worst case the swapper
 	 * process takes 10 seconds to notice.
 	 */
 	if ((p->p_flag & (P_INMEM | P_SWAPPINGIN)) == 0)
 		wakeup(&proc0);
 }
 
 /*
  * Kill the given process for the stated reason.
  *
  * Logs the pid, command name, jail id, uid and "why" at LOG_ERR, marks
  * the process with proc_wkilled() and posts SIGKILL to it.  The process
  * lock must be held by the caller (asserted).
  */
 void
 killproc(struct proc *p, char *why)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	CTR3(KTR_PROC, "killproc: proc %p (pid %d, %s)", p, p->p_pid,
 	    p->p_comm);
 	log(LOG_ERR, "pid %d (%s), jid %d, uid %d, was killed: %s\n",
 	    p->p_pid, p->p_comm, p->p_ucred->cr_prison->pr_id,
 	    p->p_ucred->cr_uid, why);
 	proc_wkilled(p);
 	kern_psignal(p, SIGKILL);
 }
 
 /*
  * Force the current process to exit with the specified signal, dumping core
  * if appropriate.  We bypass the normal tests for masked and caught signals,
  * allowing unrecoverable failures to terminate the process without changing
  * signal state.  Mark the accounting record with the signal termination.
  * If dumping core, save the signal number for the debugger.  Calls exit and
  * does not return.
  *
  * The process lock must be held on entry (asserted).  It is released
  * here directly when no dump is attempted; otherwise coredump() drops it.
  */
 void
 sigexit(struct thread *td, int sig)
 {
 	struct proc *p = td->td_proc;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_acflag |= AXSIG;
 	/*
 	 * We must be single-threading to generate a core dump.  This
 	 * ensures that the registers in the core file are up-to-date.
 	 * Also, the ELF dump handler assumes that the thread list doesn't
 	 * change out from under it.
 	 *
 	 * XXX If another thread attempts to single-thread before us
 	 *     (e.g. via fork()), we won't get a dump at all.
 	 */
 	if ((sigprop(sig) & SIGPROP_CORE) &&
 	    thread_single(p, SINGLE_NO_EXIT) == 0) {
 		p->p_sig = sig;
 		/*
 		 * Log signals which would cause core dumps
 		 * (Log as LOG_INFO to appease those who don't want
 		 * these messages.)
 		 * XXX : Todo, as well as euid, write out ruid too
 		 * Note that coredump() drops proc lock.
 		 */
 		/* A successful dump is recorded in the exit status. */
 		if (coredump(td) == 0)
 			sig |= WCOREFLAG;
 		if (kern_logsigexit)
 			log(LOG_INFO,
 			    "pid %d (%s), jid %d, uid %d: exited on "
 			    "signal %d%s\n", p->p_pid, p->p_comm,
 			    p->p_ucred->cr_prison->pr_id,
 			    td->td_ucred->cr_uid,
 			    sig &~ WCOREFLAG,
 			    sig & WCOREFLAG ? " (core dumped)" : "");
 	} else
 		PROC_UNLOCK(p);
 	exit1(td, 0, sig);
 	/* NOTREACHED */
 }
 
 /*
  * Send queued SIGCHLD to parent when child process's state
  * is changed.
  */
 static void
 sigparent(struct proc *p, int reason, int status)
 {
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
 
 	if (p->p_ksi != NULL) {
 		p->p_ksi->ksi_signo  = SIGCHLD;
 		p->p_ksi->ksi_code   = reason;
 		p->p_ksi->ksi_status = status;
 		p->p_ksi->ksi_pid    = p->p_pid;
 		p->p_ksi->ksi_uid    = p->p_ucred->cr_ruid;
 		if (KSI_ONQ(p->p_ksi))
 			return;
 	}
 	pksignal(p->p_pptr, SIGCHLD, p->p_ksi);
 }
 
 /*
  * Report a job-control state change of child p to its parent: flag the
  * parent with P_STATCHILD, wake it up, and send SIGCHLD unless the parent
  * has PS_NOCLDSTOP set.  Both process locks must be held (asserted).
  */
 static void
 childproc_jobstate(struct proc *p, int reason, int sig)
 {
 	struct sigacts *parent_acts;
 	int want_sigchld;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
 
 	/*
 	 * The parent may be sleeping in kern_wait(), so wake it up
 	 * explicitly.  SIGCHLD alone is not enough for that, since the
 	 * parent may have the signal masked.
 	 */
 	p->p_pptr->p_flag |= P_STATCHILD;
 	wakeup(p->p_pptr);
 
 	/* Sample the flag under ps_mtx, but signal with the mutex dropped. */
 	parent_acts = p->p_pptr->p_sigacts;
 	mtx_lock(&parent_acts->ps_mtx);
 	want_sigchld = (parent_acts->ps_flag & PS_NOCLDSTOP) == 0;
 	mtx_unlock(&parent_acts->ps_mtx);
 	if (want_sigchld)
 		sigparent(p, reason, sig);
 }
 
 /*
  * Tell the parent that child p has stopped.  "reason" is passed through
  * as the SIGCHLD code and p->p_xsig is reported as the status.
  */
 void
 childproc_stopped(struct proc *p, int reason)
 {
 
 	childproc_jobstate(p, reason, p->p_xsig);
 }
 
 /*
  * Tell the parent that child p has been continued; reported with code
  * CLD_CONTINUED and status SIGCONT.
  */
 void
 childproc_continued(struct proc *p)
 {
 	childproc_jobstate(p, CLD_CONTINUED, SIGCONT);
 }
 
 /*
  * Send SIGCHLD to the parent for an exited child.  The code is
  * CLD_DUMPED or CLD_KILLED (with the terminating signal as status) when
  * p_xsig records a core dump or a signal death, and CLD_EXITED (with
  * p_xexit as status) otherwise.
  */
 void
 childproc_exited(struct proc *p)
 {
 	int code, value;
 
 	if (WCOREDUMP(p->p_xsig) || WIFSIGNALED(p->p_xsig)) {
 		code = WCOREDUMP(p->p_xsig) ? CLD_DUMPED : CLD_KILLED;
 		value = WTERMSIG(p->p_xsig);
 	} else {
 		code = CLD_EXITED;
 		value = p->p_xexit;
 	}
 
 	/*
 	 * XXX wakeup(p->p_pptr) is intentionally not called here;
 	 * exit1() takes care of it.
 	 */
 	sigparent(p, code, value);
 }
 
 #define	MAX_NUM_CORE_FILES 100000
 #ifndef NUM_CORE_FILES
 #define	NUM_CORE_FILES 5
 #endif
 CTASSERT(NUM_CORE_FILES >= 0 && NUM_CORE_FILES <= MAX_NUM_CORE_FILES);
 static int num_cores = NUM_CORE_FILES;
 
 /*
  * Handler for debug.ncores.  Reads report the current value; writes are
  * clamped into the range [0, MAX_NUM_CORE_FILES] before being stored.
  */
 static int
 sysctl_debug_num_cores_check (SYSCTL_HANDLER_ARGS)
 {
 	int requested, rv;
 
 	requested = num_cores;
 	rv = sysctl_handle_int(oidp, &requested, 0, req);
 	if (rv != 0 || req->newptr == NULL)
 		return (rv);
 
 	if (requested > MAX_NUM_CORE_FILES)
 		num_cores = MAX_NUM_CORE_FILES;
 	else if (requested < 0)
 		num_cores = 0;
 	else
 		num_cores = requested;
 	return (0);
 }
 SYSCTL_PROC(_debug, OID_AUTO, ncores, CTLTYPE_INT|CTLFLAG_RW,
 	    0, sizeof(int), sysctl_debug_num_cores_check, "I",
 	    "Maximum number of generated process corefiles while using index format");
 
 #define	GZIP_SUFFIX	".gz"
 #define	ZSTD_SUFFIX	".zst"
 
 int compress_user_cores = 0;
 
 /*
  * Handler for kern.compress_user_cores.  A new non-zero value is only
  * accepted when compressor_avail() reports that format as available;
  * otherwise EINVAL is returned and the setting is left alone.
  */
 static int
 sysctl_compress_user_cores(SYSCTL_HANDLER_ARGS)
 {
 	int format, rv;
 
 	format = compress_user_cores;
 	rv = sysctl_handle_int(oidp, &format, 0, req);
 	if (rv != 0 || req->newptr == NULL)
 		return (rv);
 	if (format == 0 || compressor_avail(format)) {
 		compress_user_cores = format;
 		return (rv);
 	}
 	return (EINVAL);
 }
 SYSCTL_PROC(_kern, OID_AUTO, compress_user_cores, CTLTYPE_INT | CTLFLAG_RWTUN,
     0, sizeof(int), sysctl_compress_user_cores, "I",
     "Enable compression of user corefiles ("
     __XSTRING(COMPRESS_GZIP) " = gzip, "
     __XSTRING(COMPRESS_ZSTD) " = zstd)");
 
 int compress_user_cores_level = 6;
 SYSCTL_INT(_kern, OID_AUTO, compress_user_cores_level, CTLFLAG_RWTUN,
     &compress_user_cores_level, 0,
     "Corefile compression level");
 
 /*
  * Protect the access to corefilename[] by allproc_lock.
  */
 #define	corefilename_lock	allproc_lock
 
 /* Core file name template; "%N.core" unless overridden. */
 static char corefilename[MAXPATHLEN] = {"%N.core"};
 TUNABLE_STR("kern.corefile", corefilename, sizeof(corefilename));
 
 /*
  * Handler for kern.corefile: reads or replaces corefilename[] while
  * holding corefilename_lock exclusively.  Returns the result of
  * sysctl_handle_string().
  */
 static int
 sysctl_kern_corefile(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 
 	sx_xlock(&corefilename_lock);
 	error = sysctl_handle_string(oidp, corefilename, sizeof(corefilename),
 	    req);
 	sx_xunlock(&corefilename_lock);
 
 	return (error);
 }
 SYSCTL_PROC(_kern, OID_AUTO, corefile, CTLTYPE_STRING | CTLFLAG_RW |
     CTLFLAG_MPSAFE, 0, 0, sysctl_kern_corefile, "A",
     "Process corefile name format string");
 
 /*
  * Unlock a vnode and then close it.  The vnode is closed as having been
  * opened with FWRITE, using the credential of thread td.
  */
 static void
 vnode_close_locked(struct thread *td, struct vnode *vp)
 {
 
 	VOP_UNLOCK(vp, 0);
 	vn_close(vp, FWRITE, td->td_ucred, td);
 }
 
 /*
  * If the core format has a %I in it, then we need to check
  * for existing corefiles before defining a name.
  * To do this we iterate over 0..ncores to find a
  * non-existing core file name to use. If all core files are
  * already used we choose the oldest one.
  */
 static int
 corefile_open_last(struct thread *td, char *name, int indexpos,
     int indexlen, int ncores, struct vnode **vpp)
 {
 	struct vnode *oldvp, *nextvp, *vp;
 	struct vattr vattr;
 	struct nameidata nd;
 	int error, i, flags, oflags, cmode;
 	char ch;
 	struct timespec lasttime;
 
 	nextvp = oldvp = NULL;
 	cmode = S_IRUSR | S_IWUSR;
 	oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
 	    (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
 
 	for (i = 0; i < ncores; i++) {
 		flags = O_CREAT | FWRITE | O_NOFOLLOW;
 
 		ch = name[indexpos + indexlen];
 		(void)snprintf(name + indexpos, indexlen + 1, "%.*u", indexlen,
 		    i);
 		name[indexpos + indexlen] = ch;
 
 		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td);
 		error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
 		    NULL);
 		if (error != 0)
 			break;
 
 		vp = nd.ni_vp;
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		if ((flags & O_CREAT) == O_CREAT) {
 			nextvp = vp;
 			break;
 		}
 
 		error = VOP_GETATTR(vp, &vattr, td->td_ucred);
 		if (error != 0) {
 			vnode_close_locked(td, vp);
 			break;
 		}
 
 		if (oldvp == NULL ||
 		    lasttime.tv_sec > vattr.va_mtime.tv_sec ||
 		    (lasttime.tv_sec == vattr.va_mtime.tv_sec &&
 		    lasttime.tv_nsec >= vattr.va_mtime.tv_nsec)) {
 			if (oldvp != NULL)
 				vnode_close_locked(td, oldvp);
 			oldvp = vp;
 			lasttime = vattr.va_mtime;
 		} else {
 			vnode_close_locked(td, vp);
 		}
 	}
 
 	if (oldvp != NULL) {
 		if (nextvp == NULL) {
 			if ((td->td_proc->p_flag & P_SUGID) != 0) {
 				error = EFAULT;
 				vnode_close_locked(td, oldvp);
 			} else {
 				nextvp = oldvp;
 			}
 		} else {
 			vnode_close_locked(td, oldvp);
 		}
 	}
 	if (error != 0) {
 		if (nextvp != NULL)
 			vnode_close_locked(td, oldvp);
 	} else {
 		*vpp = nextvp;
 	}
 
 	return (error);
 }
 
 /*
  * corefile_open(comm, uid, pid, td, compress, signum, vpp, namep)
  * Expand the name described in corefilename, using name, uid, and pid
  * and open/create core file.
  * corefilename is a printf-like string, with the following format
  * specifiers:
  *	%%	a literal '%'
  *	%H	hostname, as reported by getcredhostname()
  *	%I	autoincrementing index; only the first occurrence is
  *		expanded, later ones are copied through as "%I"
  *	%N	name of process ("name")
  *	%P	process id (pid)
  *	%S	signal number (signum)
  *	%U	user id (uid)
  * For example, "%N.core" is the default; they can be disabled completely
  * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
  * This is controlled by the sysctl variable kern.corefile (see above).
  *
  * A suffix is appended when "compress" is COMPRESS_GZIP or COMPRESS_ZSTD.
  *
  * On success returns 0, stores the opened vnode in *vpp and the expanded
  * name in *namep; the name is allocated from M_TEMP and must be freed by
  * the caller.  On failure returns an errno value (ENOMEM when the
  * expanded name does not fit in MAXPATHLEN) and frees the name itself.
  */
 static int
 corefile_open(const char *comm, uid_t uid, pid_t pid, struct thread *td,
     int compress, int signum, struct vnode **vpp, char **namep)
 {
 	struct sbuf sb;
 	struct nameidata nd;
 	const char *format;
 	char *hostname, *name;
 	int cmode, error, flags, i, indexpos, indexlen, oflags, ncores;
 
 	hostname = NULL;
 	format = corefilename;
 	name = malloc(MAXPATHLEN, M_TEMP, M_WAITOK | M_ZERO);
 	indexlen = 0;
 	indexpos = -1;
 	ncores = num_cores;
 	(void)sbuf_new(&sb, name, MAXPATHLEN, SBUF_FIXEDLEN);
 	sx_slock(&corefilename_lock);
 	for (i = 0; format[i] != '\0'; i++) {
 		switch (format[i]) {
 		case '%':	/* Format character */
 			i++;
 			switch (format[i]) {
 			case '\0':
 				/*
 				 * A lone '%' at the very end of the format.
 				 * Step back onto it so that the loop
 				 * increment lands on the terminating NUL
 				 * instead of skipping over it and scanning
 				 * whatever follows in the buffer.
 				 */
 				log(LOG_ERR,
 				    "Incomplete format specifier at end of "
 				    "corename `%s'\n", format);
 				i--;
 				break;
 			case '%':
 				sbuf_putc(&sb, '%');
 				break;
 			case 'H':	/* hostname */
 				if (hostname == NULL) {
 					hostname = malloc(MAXHOSTNAMELEN,
 					    M_TEMP, M_WAITOK);
 				}
 				getcredhostname(td->td_ucred, hostname,
 				    MAXHOSTNAMELEN);
 				sbuf_printf(&sb, "%s", hostname);
 				break;
 			case 'I':	/* autoincrementing index */
 				if (indexpos != -1) {
 					sbuf_printf(&sb, "%%I");
 					break;
 				}
 
 				/*
 				 * Reserve room for the widest index; the
 				 * real digits are filled in later by
 				 * corefile_open_last().
 				 */
 				indexpos = sbuf_len(&sb);
 				sbuf_printf(&sb, "%u", ncores - 1);
 				indexlen = sbuf_len(&sb) - indexpos;
 				break;
 			case 'N':	/* process name */
 				sbuf_printf(&sb, "%s", comm);
 				break;
 			case 'P':	/* process id */
 				sbuf_printf(&sb, "%u", pid);
 				break;
 			case 'S':	/* signal number */
 				sbuf_printf(&sb, "%i", signum);
 				break;
 			case 'U':	/* user id */
 				sbuf_printf(&sb, "%u", uid);
 				break;
 			default:
 				log(LOG_ERR,
 				    "Unknown format character %c in "
 				    "corename `%s'\n", format[i], format);
 				break;
 			}
 			break;
 		default:
 			sbuf_putc(&sb, format[i]);
 			break;
 		}
 	}
 	sx_sunlock(&corefilename_lock);
 	free(hostname, M_TEMP);
 	if (compress == COMPRESS_GZIP)
 		sbuf_printf(&sb, GZIP_SUFFIX);
 	else if (compress == COMPRESS_ZSTD)
 		sbuf_printf(&sb, ZSTD_SUFFIX);
 	if (sbuf_error(&sb) != 0) {
 		log(LOG_ERR, "pid %ld (%s), uid (%lu): corename is too "
 		    "long\n", (long)pid, comm, (u_long)uid);
 		sbuf_delete(&sb);
 		free(name, M_TEMP);
 		return (ENOMEM);
 	}
 	sbuf_finish(&sb);
 	sbuf_delete(&sb);
 
 	if (indexpos != -1) {
 		error = corefile_open_last(td, name, indexpos, indexlen, ncores,
 		    vpp);
 		if (error != 0) {
 			log(LOG_ERR,
 			    "pid %d (%s), uid (%u):  Path `%s' failed "
 			    "on initial open test, error = %d\n",
 			    pid, comm, uid, name, error);
 		}
 	} else {
 		cmode = S_IRUSR | S_IWUSR;
 		oflags = VN_OPEN_NOAUDIT | VN_OPEN_NAMECACHE |
 		    (capmode_coredump ? VN_OPEN_NOCAPCHECK : 0);
 		flags = O_CREAT | FWRITE | O_NOFOLLOW;
 		/* A P_SUGID process must never reuse an existing file. */
 		if ((td->td_proc->p_flag & P_SUGID) != 0)
 			flags |= O_EXCL;
 
 		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td);
 		error = vn_open_cred(&nd, &flags, cmode, oflags, td->td_ucred,
 		    NULL);
 		if (error == 0) {
 			*vpp = nd.ni_vp;
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 		}
 	}
 
 	if (error != 0) {
 #ifdef AUDIT
 		audit_proc_coredump(td, name, error);
 #endif
 		free(name, M_TEMP);
 		return (error);
 	}
 	*namep = name;
 	return (0);
 }
 
 /*
  * Dump a process' core.  The main routine does some
  * policy checking, and creates the name of the coredump;
  * then it passes on a vnode and a size limit to the process-specific
  * coredump routine if there is one; if there _is not_ one, it returns
  * ENOSYS; otherwise it returns the error from the process-specific routine.
  *
  * The process lock must be held on entry (asserted) and is released on
  * every path before returning.  Other return values visible here are
  * EFAULT when dumping is refused by policy or the target file is
  * unsuitable, EFBIG when the core size limit is zero, and any error from
  * corefile_open() or from closing the file.
  */
 
 static int
 coredump(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 	struct ucred *cred = td->td_ucred;
 	struct vnode *vp;
 	struct flock lf;
 	struct vattr vattr;
 	int error, error1, locked;
 	char *name;			/* name of corefile */
 	void *rl_cookie;
 	off_t limit;
 	char *fullpath, *freepath = NULL;
 	struct sbuf *sb;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
 	_STOPEVENT(p, S_CORE, 0);
 
 	/* Policy: dumps disabled, set-id process not allowed, or no-trace. */
 	if (!do_coredump || (!sugid_coredump && (p->p_flag & P_SUGID) != 0) ||
 	    (p->p_flag2 & P2_NOTRACE) != 0) {
 		PROC_UNLOCK(p);
 		return (EFAULT);
 	}
 
 	/*
 	 * Note that the bulk of limit checking is done after
 	 * the corefile is created.  The exception is if the limit
 	 * for corefiles is 0, in which case we don't bother
 	 * creating the corefile at all.  This layout means that
 	 * a corefile is truncated instead of not being created,
 	 * if it is larger than the limit.
 	 */
 	limit = (off_t)lim_cur(td, RLIMIT_CORE);
 	if (limit == 0 || racct_get_available(p, RACCT_CORE) == 0) {
 		PROC_UNLOCK(p);
 		return (EFBIG);
 	}
 	PROC_UNLOCK(p);
 
 	error = corefile_open(p->p_comm, cred->cr_uid, p->p_pid, td,
 	    compress_user_cores, p->p_sig, &vp, &name);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Don't dump to non-regular files or files with links.
 	 * Do not dump into system files. Effective user must own the corefile.
 	 */
 	if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) != 0 ||
 	    vattr.va_nlink != 1 || (vp->v_vflag & VV_SYSTEM) != 0 ||
 	    vattr.va_uid != cred->cr_uid) {
 		VOP_UNLOCK(vp, 0);
 		error = EFAULT;
 		goto out;
 	}
 
 	VOP_UNLOCK(vp, 0);
 
 	/* Postpone other writers, including core dumps of other processes. */
 	rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
 
 	/* Try for a whole-file advisory write lock; failure is tolerated. */
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
 	lf.l_type = F_WRLCK;
 	locked = (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &lf, F_FLOCK) == 0);
 
 	/* Truncate the file and optionally mark it UF_NODUMP. */
 	VATTR_NULL(&vattr);
 	vattr.va_size = 0;
 	if (set_core_nodump_flag)
 		vattr.va_flags = UF_NODUMP;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	VOP_SETATTR(vp, &vattr, cred);
 	VOP_UNLOCK(vp, 0);
 	PROC_LOCK(p);
 	p->p_acflag |= ACORE;
 	PROC_UNLOCK(p);
 
 	if (p->p_sysent->sv_coredump != NULL) {
 		error = p->p_sysent->sv_coredump(td, vp, limit, 0);
 	} else {
 		error = ENOSYS;
 	}
 
 	if (locked) {
 		lf.l_type = F_UNLCK;
 		VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 	}
 	vn_rangelock_unlock(vp, rl_cookie);
 
 	/*
 	 * Notify the userland helper that a process triggered a core dump.
 	 * This allows the helper to run an automated debugging session.
 	 */
 	if (error != 0 || coredump_devctl == 0)
 		goto out;
 	sb = sbuf_new_auto();
 	if (vn_fullpath_global(td, p->p_textvp, &fullpath, &freepath) != 0)
 		goto out2;
 	sbuf_printf(sb, "comm=\"");
 	devctl_safe_quote_sb(sb, fullpath);
 	free(freepath, M_TEMP);
 	sbuf_printf(sb, "\" core=\"");
 
 	/*
 	 * We can't lookup core file vp directly. When we're replacing a core, and
 	 * other random times, we flush the name cache, so it will fail. Instead,
 	 * if the path of the core is relative, add the current dir in front if it.
 	 */
 	if (name[0] != '/') {
 		fullpath = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 		if (kern___getcwd(td, fullpath, UIO_SYSSPACE, MAXPATHLEN, MAXPATHLEN) != 0) {
 			free(fullpath, M_TEMP);
 			goto out2;
 		}
 		devctl_safe_quote_sb(sb, fullpath);
 		free(fullpath, M_TEMP);
 		sbuf_putc(sb, '/');
 	}
 	devctl_safe_quote_sb(sb, name);
 	sbuf_printf(sb, "\"");
 	if (sbuf_finish(sb) == 0)
 		devctl_notify("kernel", "signal", "coredump", sbuf_data(sb));
 out2:
 	sbuf_delete(sb);
 out:
 	/* A close failure is reported only if nothing failed earlier. */
 	error1 = vn_close(vp, FWRITE, cred, td);
 	if (error == 0)
 		error = error1;
 #ifdef AUDIT
 	audit_proc_coredump(td, name, error);
 #endif
 	free(name, M_TEMP);
 	return (error);
 }
 
 /*
  * Nonexistent system call-- signal process (may want to handle it).  Flag
  * error in case process won't see signal immediately (blocked or ignored).
  *
  * Sends SIGSYS to the calling thread and always returns ENOSYS.
  * Depending on kern_lognosys the event is also reported through
  * uprintf() (values 1 and 3) and/or printf() (values 2 and 3).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nosys_args {
 	int	dummy;
 };
 #endif
 /* ARGSUSED */
 int
 nosys(struct thread *td, struct nosys_args *args)
 {
 	struct proc *p;
 
 	p = td->td_proc;
 
 	PROC_LOCK(p);
 	tdsignal(td, SIGSYS);
 	PROC_UNLOCK(p);
 	if (kern_lognosys == 1 || kern_lognosys == 3) {
 		uprintf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm,
 		    td->td_sa.code);
 	}
 	if (kern_lognosys == 2 || kern_lognosys == 3) {
 		printf("pid %d comm %s: nosys %d\n", p->p_pid, p->p_comm,
 		    td->td_sa.code);
 	}
 	return (ENOSYS);
 }
 
 /*
  * Send a SIGIO or SIGURG signal to a process or process group using stored
  * credentials rather than those of the current process.
  *
  * *sigiop is read under SIGIO_LOCK; nothing happens when it is NULL.  A
  * positive sio_pgid selects the single process sio_proc, a negative one
  * the members of sio_pgrp.  Each target is signalled only if CANSIGIO()
  * allows it for the stored credential.  For a process group, members must
  * also be in state PRS_NORMAL and, when checkctty is non-zero, have
  * P_CONTROLT set.
  */
 void
 pgsigio(struct sigio **sigiop, int sig, int checkctty)
 {
 	ksiginfo_t ksi;
 	struct sigio *sigio;
 
 	/* NOTE(review): ksi is filled in but not used below — confirm. */
 	ksiginfo_init(&ksi);
 	ksi.ksi_signo = sig;
 	ksi.ksi_code = SI_KERNEL;
 
 	SIGIO_LOCK();
 	sigio = *sigiop;
 	if (sigio == NULL) {
 		SIGIO_UNLOCK();
 		return;
 	}
 	if (sigio->sio_pgid > 0) {
 		PROC_LOCK(sigio->sio_proc);
 		if (CANSIGIO(sigio->sio_ucred, sigio->sio_proc->p_ucred))
 			kern_psignal(sigio->sio_proc, sig);
 		PROC_UNLOCK(sigio->sio_proc);
 	} else if (sigio->sio_pgid < 0) {
 		struct proc *p;
 
 		PGRP_LOCK(sigio->sio_pgrp);
 		LIST_FOREACH(p, &sigio->sio_pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);
 			if (p->p_state == PRS_NORMAL &&
 			    CANSIGIO(sigio->sio_ucred, p->p_ucred) &&
 			    (checkctty == 0 || (p->p_flag & P_CONTROLT)))
 				kern_psignal(p, sig);
 			PROC_UNLOCK(p);
 		}
 		PGRP_UNLOCK(sigio->sio_pgrp);
 	}
 	SIGIO_UNLOCK();
 }
 
 /*
  * Attach a signal knote to the current process: remember the process in
  * the knote, force EV_CLEAR, and add the knote to the process' knote
  * list.  Always returns 0.
  */
 static int
 filt_sigattach(struct knote *kn)
 {
 	struct proc *p = curproc;
 
 	kn->kn_ptr.p_proc = p;
 	kn->kn_flags |= EV_CLEAR;		/* automatically set */
 
 	knlist_add(p->p_klist, kn, 0);
 
 	return (0);
 }
 
 /*
  * Detach a signal knote: remove it from the knote list of the process
  * recorded in the knote at attach time.
  */
 static void
 filt_sigdetach(struct knote *kn)
 {
 	struct proc *p = kn->kn_ptr.p_proc;
 
 	knlist_remove(p->p_klist, kn, 0);
 }
 
 /*
  * Event filter for signal knotes.
  *
  * Signal knotes live on the same list as process knotes, so signal
  * hints carry the NOTE_SIGNAL bit to tell them apart from process
  * hints.  A dedicated knote list for signals would make the mask
  * unnecessary, but is probably not worth the trouble.
  *
  * A hint tagged NOTE_SIGNAL whose remaining bits equal the knote's id
  * bumps kn_data; the filter reports the knote as active whenever
  * kn_data is non-zero.
  */
 static int
 filt_signal(struct knote *kn, long hint)
 {
 
 	if ((hint & NOTE_SIGNAL) != 0) {
 		hint &= ~NOTE_SIGNAL;
 		if (kn->kn_id == hint)
 			kn->kn_data++;
 	}
 	if (kn->kn_data != 0)
 		return (1);
 	return (0);
 }
 
 /*
  * Allocate a zeroed sigacts structure holding a single reference, with
  * its mutex initialized.  The allocation uses M_WAITOK.
  */
 struct sigacts *
 sigacts_alloc(void)
 {
 	struct sigacts *acts;
 
 	acts = malloc(sizeof(*acts), M_SUBPROC, M_WAITOK | M_ZERO);
 	refcount_init(&acts->ps_refcnt, 1);
 	mtx_init(&acts->ps_mtx, "sigacts", NULL, MTX_DEF);
 	return (acts);
 }
 
 /*
  * Drop one reference on a sigacts structure.  When refcount_release()
  * reports a non-zero result, destroy the mutex and free the structure.
  */
 void
 sigacts_free(struct sigacts *ps)
 {
 
 	if (refcount_release(&ps->ps_refcnt) != 0) {
 		mtx_destroy(&ps->ps_mtx);
 		free(ps, M_SUBPROC);
 	}
 }
 
 /*
  * Take an additional reference on a sigacts structure and return the
  * same pointer for the caller's convenience.
  */
 struct sigacts *
 sigacts_hold(struct sigacts *ps)
 {
 
 	refcount_acquire(&ps->ps_refcnt);
 	return (ps);
 }
 
 /*
  * Copy the signal action state from src into dest.  Only the bytes
  * located before ps_refcnt are copied, so dest keeps its own reference
  * count and whatever follows it.  dest must not be shared (asserted);
  * src is locked for the duration of the copy.
  */
 void
 sigacts_copy(struct sigacts *dest, struct sigacts *src)
 {
 
 	KASSERT(dest->ps_refcnt == 1, ("sigacts_copy to shared dest"));
 	mtx_lock(&src->ps_mtx);
 	bcopy(src, dest, offsetof(struct sigacts, ps_refcnt));
 	mtx_unlock(&src->ps_mtx);
 }
 
 /*
  * Return non-zero when more than one reference to the sigacts structure
  * exists.  The count is read without taking any lock here.
  */
 int
 sigacts_shared(struct sigacts *ps)
 {
 
 	return (ps->ps_refcnt > 1);
 }
Index: projects/fuse2/sys/kern/sys_process.c
===================================================================
--- projects/fuse2/sys/kern/sys_process.c	(revision 350434)
+++ projects/fuse2/sys/kern/sys_process.c	(revision 350435)
@@ -1,1557 +1,1558 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1994, Sean Eric Fagan
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by Sean Eric Fagan.
  * 4. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/ktr.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/pioctl.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/ptrace.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/malloc.h>
 #include <sys/signalvar.h>
 
 #include <machine/reg.h>
 
 #include <security/audit/audit.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_param.h>
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/procfs.h>
 #include <compat/freebsd32/freebsd32_signal.h>
 
 /*
  * PT_IO descriptor as laid out for a 32-bit (ILP32) caller; kern_ptrace()
  * reads it in place of struct ptrace_io_desc when wrap32 is set.
  */
 struct ptrace_io_desc32 {
 	int		piod_op;	/* PIOD_* operation selector */
 	uint32_t	piod_offs;	/* offset within the traced process */
 	uint32_t	piod_addr;	/* caller's buffer (32-bit user address) */
 	uint32_t	piod_len;	/* requested length; reduced by the residual on return */
 };
 
 /*
  * PT_GET_SC_RET result as laid out for a 32-bit caller; filled from a
  * struct ptrace_sc_ret by ptrace_sc_ret_to32().
  */
 struct ptrace_sc_ret32 {
 	uint32_t	sr_retval[2];	/* truncated copies of the native sr_retval[] */
 	int		sr_error;	/* copy of the native sr_error */
 };
 
 /*
  * PT_VM_ENTRY descriptor as laid out for a 32-bit caller; converted to
  * and from struct ptrace_vm_entry by ptrace_vm_entry32().
  */
 struct ptrace_vm_entry32 {
 	int		pve_entry;	/* in: entry index to look up; out: next index */
 	int		pve_timestamp;	/* map timestamp at lookup time */
 	uint32_t	pve_start;	/* first address of the entry */
 	uint32_t	pve_end;	/* last address of the entry (inclusive) */
 	uint32_t	pve_offset;	/* offset into the backing object */
 	u_int		pve_prot;	/* entry protection bits */
 	u_int		pve_pathlen;	/* in: size of pve_path buffer; out: path length */
 	int32_t		pve_fileid;	/* va_fileid of the backing vnode, or VNOVAL */
 	u_int		pve_fsid;	/* va_fsid of the backing vnode, or VNOVAL */
 	uint32_t	pve_path;	/* 32-bit user address of the path buffer */
 };
 #endif
 
 /*
  * Functions implemented using PROC_ACTION():
  *
  * proc_read_regs(proc, regs)
  *	Get the current user-visible register set from the process
  *	and copy it into the regs structure (<machine/reg.h>).
  *	The process is stopped at the time read_regs is called.
  *
  * proc_write_regs(proc, regs)
  *	Update the current register set from the passed in regs
  *	structure.  Take care to avoid clobbering special CPU
  *	registers or privileged bits in the PSL.
  *	Depending on the architecture this may have fix-up work to do,
  *	especially if the IAR or PCW are modified.
  *	The process is stopped at the time write_regs is called.
  *
  * proc_read_fpregs, proc_write_fpregs
  *	deal with the floating point register set, otherwise as above.
  *
  * proc_read_dbregs, proc_write_dbregs
  *	deal with the processor debug register set, otherwise as above.
  *
  * proc_sstep(proc)
  *	Arrange for the process to trap after executing a single instruction.
  */
 
 /*
  * Complete body of the proc_* helpers below.  Requires a variable named
  * 'td' in the enclosing function.  Asserts that td's process lock is
  * owned, yields EIO when the process does not have P_INMEM set, and
  * otherwise evaluates 'action'.  Note that the macro itself executes
  * 'return', so nothing placed after it in the caller is reached.
  */
 #define	PROC_ACTION(action) do {					\
 	int error;							\
 									\
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);			\
 	if ((td->td_proc->p_flag & P_INMEM) == 0)			\
 		error = EIO;						\
 	else								\
 		error = (action);					\
 	return (error);							\
 } while(0)
 
 /*
  * Fetch td's register set into *regs with fill_regs().  Locking and the
  * EIO case are handled by PROC_ACTION; returns its result.
  */
 int
 proc_read_regs(struct thread *td, struct reg *regs)
 {
 
 	PROC_ACTION(fill_regs(td, regs));
 }
 
 /*
  * Install *regs as td's register set with set_regs().  Locking and the
  * EIO case are handled by PROC_ACTION; returns its result.
  */
 int
 proc_write_regs(struct thread *td, struct reg *regs)
 {
 
 	PROC_ACTION(set_regs(td, regs));
 }
 
 /*
  * Fetch td's debug registers into *dbregs with fill_dbregs().  Locking
  * and the EIO case are handled by PROC_ACTION; returns its result.
  */
 int
 proc_read_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 
 	PROC_ACTION(fill_dbregs(td, dbregs));
 }
 
 /*
  * Install *dbregs as td's debug registers with set_dbregs().  Locking
  * and the EIO case are handled by PROC_ACTION; returns its result.
  */
 int
 proc_write_dbregs(struct thread *td, struct dbreg *dbregs)
 {
 
 	PROC_ACTION(set_dbregs(td, dbregs));
 }
 
 /*
  * Ptrace doesn't support fpregs at all, and there are no security holes
  * or translations for fpregs, so we can just copy them.
  *
  * Fetch td's floating point registers into *fpregs with fill_fpregs().
  * Locking and the EIO case are handled by PROC_ACTION.
  */
 int
 proc_read_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	PROC_ACTION(fill_fpregs(td, fpregs));
 }
 
 /*
  * Install *fpregs as td's floating point registers with set_fpregs().
  * Locking and the EIO case are handled by PROC_ACTION.
  */
 int
 proc_write_fpregs(struct thread *td, struct fpreg *fpregs)
 {
 
 	PROC_ACTION(set_fpregs(td, fpregs));
 }
 
 #ifdef COMPAT_FREEBSD32
 /* For 32 bit binaries, we need to expose the 32 bit regs layouts. */
 /*
  * 32-bit layout counterpart of proc_read_regs(): fills *regs32 with
  * fill_regs32() under the PROC_ACTION rules.
  */
 int
 proc_read_regs32(struct thread *td, struct reg32 *regs32)
 {
 
 	PROC_ACTION(fill_regs32(td, regs32));
 }
 
 /*
  * 32-bit layout counterpart of proc_write_regs(): applies *regs32 with
  * set_regs32() under the PROC_ACTION rules.
  */
 int
 proc_write_regs32(struct thread *td, struct reg32 *regs32)
 {
 
 	PROC_ACTION(set_regs32(td, regs32));
 }
 
 /*
  * 32-bit layout counterpart of proc_read_dbregs(): fills *dbregs32 with
  * fill_dbregs32() under the PROC_ACTION rules.
  */
 int
 proc_read_dbregs32(struct thread *td, struct dbreg32 *dbregs32)
 {
 
 	PROC_ACTION(fill_dbregs32(td, dbregs32));
 }
 
 /*
  * 32-bit layout counterpart of proc_write_dbregs(): applies *dbregs32
  * with set_dbregs32() under the PROC_ACTION rules.
  */
 int
 proc_write_dbregs32(struct thread *td, struct dbreg32 *dbregs32)
 {
 
 	PROC_ACTION(set_dbregs32(td, dbregs32));
 }
 
 /*
  * 32-bit layout counterpart of proc_read_fpregs(): fills *fpregs32 with
  * fill_fpregs32() under the PROC_ACTION rules.
  */
 int
 proc_read_fpregs32(struct thread *td, struct fpreg32 *fpregs32)
 {
 
 	PROC_ACTION(fill_fpregs32(td, fpregs32));
 }
 
 /*
  * 32-bit layout counterpart of proc_write_fpregs(): applies *fpregs32
  * with set_fpregs32() under the PROC_ACTION rules.
  */
 int
 proc_write_fpregs32(struct thread *td, struct fpreg32 *fpregs32)
 {
 
 	PROC_ACTION(set_fpregs32(td, fpregs32));
 }
 #endif
 
 /*
  * Arrange for td to single-step by calling ptrace_single_step() under
  * the PROC_ACTION rules (process lock owned, EIO if not P_INMEM).
  */
 int
 proc_sstep(struct thread *td)
 {
 
 	PROC_ACTION(ptrace_single_step(td));
 }
 
 /*
  * Transfer data between the buffer described by uio and the address
  * space of process p, starting at uio->uio_offset in p.  The direction
  * is taken from uio->uio_rw.  The caller must hold p (PROC_ASSERT_HELD)
  * and must not own p's process lock.
  *
  * Returns 0 on success.  A failed page fault yields ENOMEM for
  * KERN_RESOURCE_SHORTAGE and EFAULT otherwise; an error from
  * uiomove_fromphys() is returned as is.  uio->uio_resid reflects how
  * much was left untransferred.
  */
 int
 proc_rwmem(struct proc *p, struct uio *uio)
 {
 	vm_map_t map;
 	vm_offset_t pageno;		/* page number */
 	vm_prot_t reqprot;
 	int error, fault_flags, page_offset, writing;
 
 	/*
 	 * Assert that someone has locked this vmspace.  (Should be
 	 * curthread but we can't assert that.)  This keeps the process
 	 * from exiting out from under us until this operation completes.
 	 */
 	PROC_ASSERT_HELD(p);
 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
 
 	/*
 	 * The map we want...
 	 */
 	map = &p->p_vmspace->vm_map;
 
 	/*
 	 * If we are writing, then we request vm_fault() to create a private
 	 * copy of each page.  Since these copies will not be writeable by the
 	 * process, we must explicitly request that they be dirtied.
 	 */
 	writing = uio->uio_rw == UIO_WRITE;
 	reqprot = writing ? VM_PROT_COPY | VM_PROT_READ : VM_PROT_READ;
 	fault_flags = writing ? VM_FAULT_DIRTY : VM_FAULT_NORMAL;
 
 	/*
 	 * Only map in one page at a time.  We don't have to, but it
 	 * makes things easier.  This way is trivial - right?
 	 */
 	do {
 		vm_offset_t uva;
 		u_int len;
 		vm_page_t m;
 
 		uva = (vm_offset_t)uio->uio_offset;
 
 		/*
 		 * Get the page number of this segment.
 		 */
 		pageno = trunc_page(uva);
 		page_offset = uva - pageno;
 
 		/*
 		 * How many bytes to copy
 		 */
 		len = min(PAGE_SIZE - page_offset, uio->uio_resid);
 
 		/*
 		 * Fault and hold the page on behalf of the process.
 		 */
 		error = vm_fault_hold(map, pageno, reqprot, fault_flags, &m);
 		if (error != KERN_SUCCESS) {
 			/* Translate the KERN_* status into an errno value. */
 			if (error == KERN_RESOURCE_SHORTAGE)
 				error = ENOMEM;
 			else
 				error = EFAULT;
 			break;
 		}
 
 		/*
 		 * Now do the i/o move.
 		 */
 		error = uiomove_fromphys(&m, page_offset, len, uio);
 
 		/* Make the I-cache coherent for breakpoints. */
 		if (writing && error == 0) {
 			vm_map_lock_read(map);
 			if (vm_map_check_protection(map, pageno, pageno +
 			    PAGE_SIZE, VM_PROT_EXECUTE))
 				vm_sync_icache(map, uva, len);
 			vm_map_unlock_read(map);
 		}
 
 		/*
 		 * Release the page.
 		 */
 		vm_page_lock(m);
 		if (vm_page_unwire(m, PQ_ACTIVE) && m->object == NULL)
 			vm_page_free(m);
 		vm_page_unlock(m);
 
 	} while (error == 0 && uio->uio_resid > 0);
 
 	return (error);
 }
 
 /*
  * Common helper for proc_readmem() and proc_writemem(): move len bytes
  * between the kernel buffer buf and address va in process p, in the
  * direction given by rw.
  *
  * Returns the number of bytes moved, or -1 when nothing at all was
  * transferred.  A partial transfer is reported as a short count.
  */
 static ssize_t
 proc_iop(struct thread *td, struct proc *p, vm_offset_t va, void *buf,
     size_t len, enum uio_rw rw)
 {
 	struct uio xfer;
 	struct iovec vec;
 	ssize_t done, want;
 
 	MPASS(len < SSIZE_MAX);
 	want = (ssize_t)len;
 
 	vec.iov_base = (caddr_t)buf;
 	vec.iov_len = len;
 
 	xfer.uio_td = td;
 	xfer.uio_rw = rw;
 	xfer.uio_segflg = UIO_SYSSPACE;
 	xfer.uio_offset = va;
 	xfer.uio_resid = want;
 	xfer.uio_iov = &vec;
 	xfer.uio_iovcnt = 1;
 
 	/*
 	 * The status returned by proc_rwmem() is not examined; the outcome
 	 * is judged solely by how far the residual count dropped.
 	 */
 	proc_rwmem(p, &xfer);
 	done = want - xfer.uio_resid;
 	return (done == 0 ? -1 : done);
 }
 
 /*
  * Read len bytes at address va of process p into the kernel buffer buf.
  * Returns what proc_iop() returns: the byte count moved, or -1 if
  * nothing was transferred.
  */
 ssize_t
 proc_readmem(struct thread *td, struct proc *p, vm_offset_t va, void *buf,
     size_t len)
 {
 
 	return (proc_iop(td, p, va, buf, len, UIO_READ));
 }
 
 /*
  * Write len bytes from the kernel buffer buf to address va of process p.
  * Returns what proc_iop() returns: the byte count moved, or -1 if
  * nothing was transferred.
  */
 ssize_t
 proc_writemem(struct thread *td, struct proc *p, vm_offset_t va, void *buf,
     size_t len)
 {
 
 	return (proc_iop(td, p, va, buf, len, UIO_WRITE));
 }
 
 /*
  * Implementation of PT_VM_ENTRY: describe one entry of p's VM map.
  *
  * On input pve->pve_entry is the index of the map entry to start from
  * and pve->pve_pathlen is the size of the user buffer at pve->pve_path
  * (0 when no path is wanted).  On success the entry's range, offset and
  * protection are stored in *pve and pve->pve_entry is advanced to the
  * index following the reported entry.
  *
  * Returns EINVAL when the index lies beyond the end of the map, ENOENT
  * when no further non-submap entry exists, ENAMETOOLONG when the path
  * does not fit in the supplied buffer, or an error from copyout().
  */
 static int
 ptrace_vm_entry(struct thread *td, struct proc *p, struct ptrace_vm_entry *pve)
 {
 	struct vattr vattr;
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_object_t obj, tobj, lobj;
 	struct vmspace *vm;
 	struct vnode *vp;
 	char *freepath, *fullpath;
 	u_int pathlen;
 	int error, index;
 
 	error = 0;
 	obj = NULL;
 
 	vm = vmspace_acquire_ref(p);
 	map = &vm->vm_map;
 	vm_map_lock_read(map);
 
 	/* Single-pass block; 'break' is used to bail out to the unlock. */
 	do {
 		/* Walk forward to the requested index. */
 		entry = map->header.next;
 		index = 0;
 		while (index < pve->pve_entry && entry != &map->header) {
 			entry = entry->next;
 			index++;
 		}
 		if (index != pve->pve_entry) {
 			error = EINVAL;
 			break;
 		}
 		KASSERT((map->header.eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
 		    ("Submap in map header"));
 		/*
 		 * Skip submap entries.  The walk terminates at the header,
 		 * which is asserted above not to be a submap.
 		 */
 		while ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0) {
 			entry = entry->next;
 			index++;
 		}
 		if (entry == &map->header) {
 			error = ENOENT;
 			break;
 		}
 
 		/* We got an entry. */
 		pve->pve_entry = index + 1;
 		pve->pve_timestamp = map->timestamp;
 		pve->pve_start = entry->start;
 		pve->pve_end = entry->end - 1;	/* reported end is inclusive */
 		pve->pve_offset = entry->offset;
 		pve->pve_prot = entry->protection;
 
 		/* Backing object's path needed? */
 		if (pve->pve_pathlen == 0)
 			break;
 
 		/* Remember the buffer size; the field is reused for output. */
 		pathlen = pve->pve_pathlen;
 		pve->pve_pathlen = 0;
 
 		/* Lock the object before the map lock is dropped below. */
 		obj = entry->object.vm_object;
 		if (obj != NULL)
 			VM_OBJECT_RLOCK(obj);
 	} while (0);
 
 	vm_map_unlock_read(map);
 
 	pve->pve_fsid = VNOVAL;
 	pve->pve_fileid = VNOVAL;
 
 	if (error == 0 && obj != NULL) {
 		/*
 		 * Follow the backing_object chain to its last object,
 		 * locking each next object before unlocking the previous
 		 * one.  obj itself stays locked for the whole walk.
 		 */
 		lobj = obj;
 		for (tobj = obj; tobj != NULL; tobj = tobj->backing_object) {
 			if (tobj != obj)
 				VM_OBJECT_RLOCK(tobj);
 			if (lobj != obj)
 				VM_OBJECT_RUNLOCK(lobj);
 			lobj = tobj;
 			pve->pve_offset += tobj->backing_object_offset;
 		}
 		/* Reference the vnode before the object locks are released. */
 		vp = vm_object_vnode(lobj);
 		if (vp != NULL)
 			vref(vp);
 		if (lobj != obj)
 			VM_OBJECT_RUNLOCK(lobj);
 		VM_OBJECT_RUNLOCK(obj);
 
 		if (vp != NULL) {
 			freepath = NULL;
 			fullpath = NULL;
 			/* The result is checked through fullpath below. */
 			vn_fullpath(td, vp, &fullpath, &freepath);
 			vn_lock(vp, LK_SHARED | LK_RETRY);
 			if (VOP_GETATTR(vp, &vattr, td->td_ucred) == 0) {
 				pve->pve_fileid = vattr.va_fileid;
 				pve->pve_fsid = vattr.va_fsid;
 			}
 			vput(vp);
 
 			if (fullpath != NULL) {
 				/* Length reported includes the terminating NUL. */
 				pve->pve_pathlen = strlen(fullpath) + 1;
 				if (pve->pve_pathlen <= pathlen) {
 					error = copyout(fullpath, pve->pve_path,
 					    pve->pve_pathlen);
 				} else
 					error = ENAMETOOLONG;
 			}
 			if (freepath != NULL)
 				free(freepath, M_TEMP);
 		}
 	}
 	vmspace_free(vm);
 	if (error == 0)
 		CTR3(KTR_PTRACE, "PT_VM_ENTRY: pid %d, entry %d, start %p",
 		    p->p_pid, pve->pve_entry, pve->pve_start);
 
 	return (error);
 }
 
 #ifdef COMPAT_FREEBSD32
 static int
 ptrace_vm_entry32(struct thread *td, struct proc *p,
     struct ptrace_vm_entry32 *pve32)
 {
 	struct ptrace_vm_entry pve;
 	int error;
 
 	pve.pve_entry = pve32->pve_entry;
 	pve.pve_pathlen = pve32->pve_pathlen;
 	pve.pve_path = (void *)(uintptr_t)pve32->pve_path;
 
 	error = ptrace_vm_entry(td, p, &pve);
 	if (error == 0) {
 		pve32->pve_entry = pve.pve_entry;
 		pve32->pve_timestamp = pve.pve_timestamp;
 		pve32->pve_start = pve.pve_start;
 		pve32->pve_end = pve.pve_end;
 		pve32->pve_offset = pve.pve_offset;
 		pve32->pve_prot = pve.pve_prot;
 		pve32->pve_fileid = pve.pve_fileid;
 		pve32->pve_fsid = pve.pve_fsid;
 	}
 
 	pve32->pve_pathlen = pve.pve_pathlen;
 	return (error);
 }
 
 /*
  * Convert a native struct ptrace_lwpinfo into its 32-bit layout.  The
  * destination is zeroed first, so anything not assigned below reads as
  * zero.
  */
 static void
 ptrace_lwpinfo_to32(const struct ptrace_lwpinfo *pl,
     struct ptrace_lwpinfo32 *pl32)
 {
 
 	bzero(pl32, sizeof(*pl32));
 
 	/* Thread identity and event state. */
 	pl32->pl_lwpid = pl->pl_lwpid;
 	pl32->pl_child_pid = pl->pl_child_pid;
 	pl32->pl_event = pl->pl_event;
 	pl32->pl_flags = pl->pl_flags;
 
 	/* System call information. */
 	pl32->pl_syscall_code = pl->pl_syscall_code;
 	pl32->pl_syscall_narg = pl->pl_syscall_narg;
 
 	/* Signal state; siginfo needs its own layout conversion. */
 	pl32->pl_sigmask = pl->pl_sigmask;
 	pl32->pl_siglist = pl->pl_siglist;
 	siginfo_to_siginfo32(&pl->pl_siginfo, &pl32->pl_siginfo);
 
 	strcpy(pl32->pl_tdname, pl->pl_tdname);
 }
 
 /*
  * Convert a native struct ptrace_sc_ret into its 32-bit layout.  The
  * destination is zeroed first; each return value is narrowed by the
  * assignment into a uint32_t.
  */
 static void
 ptrace_sc_ret_to32(const struct ptrace_sc_ret *psr,
     struct ptrace_sc_ret32 *psr32)
 {
 
 	bzero(psr32, sizeof(*psr32));
 	psr32->sr_error = psr->sr_error;
 	psr32->sr_retval[0] = psr->sr_retval[0];
 	psr32->sr_retval[1] = psr->sr_retval[1];
 }
 #endif /* COMPAT_FREEBSD32 */
 
 /*
  * Process debugging system call.
  */
 /* Fallback declaration, used only when <sys/sysproto.h> did not supply one. */
 #ifndef _SYS_SYSPROTO_H_
 struct ptrace_args {
 	int	req;		/* PT_* request code */
 	pid_t	pid;		/* target process id, or thread id when above PID_MAX */
 	caddr_t	addr;		/* request-specific user address */
 	int	data;		/* request-specific value (size, signal number, flag) */
 };
 #endif
 
 #ifdef COMPAT_FREEBSD32
 /*
  * This CPP subterfuge is to try and reduce the number of ifdefs in
  * the body of the code.
  *   COPYIN(uap->addr, &r.reg, sizeof r.reg);
  * becomes either:
  *   copyin(uap->addr, &r.reg, sizeof r.reg);
  * or
  *   copyin(uap->addr, &r.reg32, sizeof r.reg32);
  * .. except this is done at runtime.
  *
  * The kernel-side argument and the size argument must both END in the
  * member name, because "32" is token-pasted onto their last token.  A
  * local variable named wrap32 must be in scope at the point of use.
  *
  * Each expansion is wrapped in parentheses so that it remains a single
  * operand when used inside a larger expression, e.g.
  * "COPYIN(...) != 0"; a bare ?: would capture the comparison into its
  * third operand.
  */
 #define	BZERO(a, s)		(wrap32 ? \
 	bzero(a ## 32, s ## 32) : \
 	bzero(a, s))
 #define	COPYIN(u, k, s)		(wrap32 ? \
 	copyin(u, k ## 32, s ## 32) : \
 	copyin(u, k, s))
 #define	COPYOUT(k, u, s)	(wrap32 ? \
 	copyout(k ## 32, u, s ## 32) : \
 	copyout(k, u, s))
 #else
 #define	BZERO(a, s)		bzero(a, s)
 #define	COPYIN(u, k, s)		copyin(u, k, s)
 #define	COPYOUT(k, u, s)	copyout(k, u, s)
 #endif
 /*
  * ptrace(2) system call entry point.
  *
  * Stages request data in the kernel-side union 'r': requests that take
  * input are copied in from uap->addr, register "get" requests have their
  * buffer zeroed, and all remaining requests pass uap->addr through
  * unchanged.  The request itself is carried out by kern_ptrace(), after
  * which any result held in 'r' is copied back out to uap->addr.
  *
  * Returns 0 on success or an errno value from copyin(), kern_ptrace()
  * or copyout().
  */
 int
 sys_ptrace(struct thread *td, struct ptrace_args *uap)
 {
 	/*
 	 * XXX this obfuscation is to reduce stack usage, but the register
 	 * structs may be too large to put on the stack anyway.
 	 */
 	union {
 		struct ptrace_io_desc piod;
 		struct ptrace_lwpinfo pl;
 		struct ptrace_vm_entry pve;
 		struct dbreg dbreg;
 		struct fpreg fpreg;
 		struct reg reg;
 #ifdef COMPAT_FREEBSD32
 		struct dbreg32 dbreg32;
 		struct fpreg32 fpreg32;
 		struct reg32 reg32;
 		struct ptrace_io_desc32 piod32;
 		struct ptrace_lwpinfo32 pl32;
 		struct ptrace_vm_entry32 pve32;
 #endif
 		char args[sizeof(td->td_sa.args)];
 		struct ptrace_sc_ret psr;
 		int ptevents;
 	} r;
 	void *addr;
 	size_t psrlen;
 	int error = 0;
 #ifdef COMPAT_FREEBSD32
 	int wrap32 = 0;
 
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		wrap32 = 1;
 #endif
 	AUDIT_ARG_PID(uap->pid);
 	AUDIT_ARG_CMD(uap->req);
 	AUDIT_ARG_VALUE(uap->data);
 
 	/* Stage 1: prepare the kernel-side buffer for this request. */
 	addr = &r;
 	switch (uap->req) {
 	case PT_GET_EVENT_MASK:
 	case PT_LWPINFO:
 	case PT_GET_SC_ARGS:
 	case PT_GET_SC_RET:
 		/* Output only; kern_ptrace() fills the buffer. */
 		break;
 	case PT_GETREGS:
 		BZERO(&r.reg, sizeof r.reg);
 		break;
 	case PT_GETFPREGS:
 		BZERO(&r.fpreg, sizeof r.fpreg);
 		break;
 	case PT_GETDBREGS:
 		BZERO(&r.dbreg, sizeof r.dbreg);
 		break;
 	case PT_SETREGS:
 		error = COPYIN(uap->addr, &r.reg, sizeof r.reg);
 		break;
 	case PT_SETFPREGS:
 		error = COPYIN(uap->addr, &r.fpreg, sizeof r.fpreg);
 		break;
 	case PT_SETDBREGS:
 		error = COPYIN(uap->addr, &r.dbreg, sizeof r.dbreg);
 		break;
 	case PT_SET_EVENT_MASK:
 		if (uap->data != sizeof(r.ptevents))
 			error = EINVAL;
 		else
 			error = copyin(uap->addr, &r.ptevents, uap->data);
 		break;
 	case PT_IO:
 		error = COPYIN(uap->addr, &r.piod, sizeof r.piod);
 		break;
 	case PT_VM_ENTRY:
 		error = COPYIN(uap->addr, &r.pve, sizeof r.pve);
 		break;
 	default:
 		/* No staging: hand the user address straight through. */
 		addr = uap->addr;
 		break;
 	}
 	if (error)
 		return (error);
 
 	/* Stage 2: perform the request. */
 	error = kern_ptrace(td, uap->req, uap->pid, addr, uap->data);
 	if (error)
 		return (error);
 
 	/* Stage 3: return any staged results to the caller. */
 	switch (uap->req) {
 	case PT_VM_ENTRY:
 		error = COPYOUT(&r.pve, uap->addr, sizeof r.pve);
 		break;
 	case PT_IO:
 		error = COPYOUT(&r.piod, uap->addr, sizeof r.piod);
 		break;
 	case PT_GETREGS:
 		error = COPYOUT(&r.reg, uap->addr, sizeof r.reg);
 		break;
 	case PT_GETFPREGS:
 		error = COPYOUT(&r.fpreg, uap->addr, sizeof r.fpreg);
 		break;
 	case PT_GETDBREGS:
 		error = COPYOUT(&r.dbreg, uap->addr, sizeof r.dbreg);
 		break;
 	case PT_GET_EVENT_MASK:
 		/* NB: The size in uap->data is validated in kern_ptrace(). */
 		error = copyout(&r.ptevents, uap->addr, uap->data);
 		break;
 	case PT_LWPINFO:
 		/* NB: The size in uap->data is validated in kern_ptrace(). */
 		error = copyout(&r.pl, uap->addr, uap->data);
 		break;
 	case PT_GET_SC_ARGS:
 		error = copyout(r.args, uap->addr, MIN(uap->data,
 		    sizeof(r.args)));
 		break;
 	case PT_GET_SC_RET:
 		/*
 		 * For a 32-bit caller kern_ptrace() stores only a
 		 * struct ptrace_sc_ret32 at the start of 'r'.  The rest of
 		 * the union was never initialized, so the copy must be
 		 * bounded by the 32-bit size; using the native size would
 		 * disclose kernel stack contents to userland.
 		 */
 		psrlen = sizeof(r.psr);
 #ifdef COMPAT_FREEBSD32
 		if (wrap32)
 			psrlen = sizeof(struct ptrace_sc_ret32);
 #endif
 		error = copyout(&r.psr, uap->addr, MIN(uap->data, psrlen));
 		break;
 	}
 
 	return (error);
 }
 #undef COPYIN
 #undef COPYOUT
 #undef BZERO
 
 #ifdef COMPAT_FREEBSD32
 /*
  *   PROC_READ(regs, td2, addr);
  * becomes either:
  *   proc_read_regs(td2, addr);
  * or
  *   proc_read_regs32(td2, addr);
  * .. except this is done at runtime.  There is an additional
  * complication in that PROC_WRITE disallows 32 bit consumers
  * from writing to 64 bit address space targets.
  *
  * Local variables named wrap32 and (for PROC_WRITE) safe must be in
  * scope at the point of use.  Each expansion is wrapped in parentheses
  * so that it remains a single operand when used inside a larger
  * expression.
  */
 #define	PROC_READ(w, t, a)	(wrap32 ? \
 	proc_read_ ## w ## 32(t, a) : \
 	proc_read_ ## w (t, a))
 #define	PROC_WRITE(w, t, a)	(wrap32 ? \
 	(safe ? proc_write_ ## w ## 32(t, a) : EINVAL ) : \
 	proc_write_ ## w (t, a))
 #else
 #define	PROC_READ(w, t, a)	proc_read_ ## w (t, a)
 #define	PROC_WRITE(w, t, a)	proc_write_ ## w (t, a)
 #endif
 
 /*
  * Mark process p as traced and reset its ptrace event mask to
  * PTRACE_DEFAULT.  When 'stop' is true, P2_PTRACE_FSTP is set as well.
  *
  * The caller must hold proctree_lock exclusively and own p's process
  * lock; both are asserted.
  */
 void
 proc_set_traced(struct proc *p, bool stop)
 {
 
 	sx_assert(&proctree_lock, SX_XLOCKED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	p->p_ptevents = PTRACE_DEFAULT;
 	if (stop)
 		p->p_flag2 |= P2_PTRACE_FSTP;
 	p->p_flag |= P_TRACED;
 }
 
 int
 kern_ptrace(struct thread *td, int req, pid_t pid, void *addr, int data)
 {
 	struct iovec iov;
 	struct uio uio;
 	struct proc *curp, *p, *pp;
 	struct thread *td2 = NULL, *td3;
 	struct ptrace_io_desc *piod = NULL;
 	struct ptrace_lwpinfo *pl;
 	struct ptrace_sc_ret *psr;
 	int error, num, tmp;
 	int proctree_locked = 0;
 	lwpid_t tid = 0, *buf;
 #ifdef COMPAT_FREEBSD32
 	int wrap32 = 0, safe = 0;
 	struct ptrace_io_desc32 *piod32 = NULL;
 	struct ptrace_lwpinfo32 *pl32 = NULL;
 	struct ptrace_sc_ret32 *psr32 = NULL;
 	union {
 		struct ptrace_lwpinfo pl;
 		struct ptrace_sc_ret psr;
 	} r;
 #endif
 
 	curp = td->td_proc;
 
 	/* Lock proctree before locking the process. */
 	switch (req) {
 	case PT_TRACE_ME:
 	case PT_ATTACH:
 	case PT_STEP:
 	case PT_CONTINUE:
 	case PT_TO_SCE:
 	case PT_TO_SCX:
 	case PT_SYSCALL:
 	case PT_FOLLOW_FORK:
 	case PT_LWP_EVENTS:
 	case PT_GET_EVENT_MASK:
 	case PT_SET_EVENT_MASK:
 	case PT_DETACH:
 	case PT_GET_SC_ARGS:
 		sx_xlock(&proctree_lock);
 		proctree_locked = 1;
 		break;
 	default:
 		break;
 	}
 
 	if (req == PT_TRACE_ME) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 	} else {
 		if (pid <= PID_MAX) {
 			if ((p = pfind(pid)) == NULL) {
 				if (proctree_locked)
 					sx_xunlock(&proctree_lock);
 				return (ESRCH);
 			}
 		} else {
 			td2 = tdfind(pid, -1);
 			if (td2 == NULL) {
 				if (proctree_locked)
 					sx_xunlock(&proctree_lock);
 				return (ESRCH);
 			}
 			p = td2->td_proc;
 			tid = pid;
 			pid = p->p_pid;
 		}
 	}
 	AUDIT_ARG_PROCESS(p);
 
 	if ((p->p_flag & P_WEXIT) != 0) {
 		error = ESRCH;
 		goto fail;
 	}
 	if ((error = p_cansee(td, p)) != 0)
 		goto fail;
 
 	if ((error = p_candebug(td, p)) != 0)
 		goto fail;
 
 	/*
 	 * System processes can't be debugged.
 	 */
 	if ((p->p_flag & P_SYSTEM) != 0) {
 		error = EINVAL;
 		goto fail;
 	}
 
 	if (tid == 0) {
 		if ((p->p_flag & P_STOPPED_TRACE) != 0) {
 			KASSERT(p->p_xthread != NULL, ("NULL p_xthread"));
 			td2 = p->p_xthread;
 		} else {
 			td2 = FIRST_THREAD_IN_PROC(p);
 		}
 		tid = td2->td_tid;
 	}
 
 #ifdef COMPAT_FREEBSD32
 	/*
 	 * Test if we're a 32 bit client and what the target is.
 	 * Set the wrap controls accordingly.
 	 */
 	if (SV_CURPROC_FLAG(SV_ILP32)) {
 		if (SV_PROC_FLAG(td2->td_proc, SV_ILP32))
 			safe = 1;
 		wrap32 = 1;
 	}
 #endif
 	/*
 	 * Permissions check
 	 */
 	switch (req) {
 	case PT_TRACE_ME:
 		/*
 		 * Always legal, when there is a parent process which
 		 * could trace us.  Otherwise, reject.
 		 */
 		if ((p->p_flag & P_TRACED) != 0) {
 			error = EBUSY;
 			goto fail;
 		}
 		if (p->p_pptr == initproc) {
 			error = EPERM;
 			goto fail;
 		}
 		break;
 
 	case PT_ATTACH:
 		/* Self */
 		if (p == td->td_proc) {
 			error = EINVAL;
 			goto fail;
 		}
 
 		/* Already traced */
 		if (p->p_flag & P_TRACED) {
 			error = EBUSY;
 			goto fail;
 		}
 
 		/* Can't trace an ancestor if you're being traced. */
 		if (curp->p_flag & P_TRACED) {
 			for (pp = curp->p_pptr; pp != NULL; pp = pp->p_pptr) {
 				if (pp == p) {
 					error = EINVAL;
 					goto fail;
 				}
 			}
 		}
 
 
 		/* OK */
 		break;
 
 	case PT_CLEARSTEP:
 		/* Allow thread to clear single step for itself */
 		if (td->td_tid == tid)
 			break;
 
 		/* FALLTHROUGH */
 	default:
 		/* not being traced... */
 		if ((p->p_flag & P_TRACED) == 0) {
 			error = EPERM;
 			goto fail;
 		}
 
 		/* not being traced by YOU */
 		if (p->p_pptr != td->td_proc) {
 			error = EBUSY;
 			goto fail;
 		}
 
 		/* not currently stopped */
 		if ((p->p_flag & P_STOPPED_TRACE) == 0 ||
 		    p->p_suspcount != p->p_numthreads  ||
 		    (p->p_flag & P_WAITED) == 0) {
 			error = EBUSY;
 			goto fail;
 		}
 
 		/* OK */
 		break;
 	}
 
 	/* Keep this process around until we finish this request. */
 	_PHOLD(p);
 
 #ifdef FIX_SSTEP
 	/*
 	 * Single step fixup ala procfs
 	 */
 	FIX_SSTEP(td2);
 #endif
 
 	/*
 	 * Actually do the requests
 	 */
 
 	td->td_retval[0] = 0;
 
 	switch (req) {
 	case PT_TRACE_ME:
 		/* set my trace flag and "owner" so it can read/write me */
 		proc_set_traced(p, false);
 		if (p->p_flag & P_PPWAIT)
 			p->p_flag |= P_PPTRACE;
 		CTR1(KTR_PTRACE, "PT_TRACE_ME: pid %d", p->p_pid);
 		break;
 
 	case PT_ATTACH:
 		/* security check done above */
 		/*
 		 * It would be nice if the tracing relationship was separate
 		 * from the parent relationship but that would require
 		 * another set of links in the proc struct or for "wait"
 		 * to scan the entire proc table.  To make life easier,
 		 * we just re-parent the process we're trying to trace.
 		 * The old parent is remembered so we can put things back
 		 * on a "detach".
 		 */
 		proc_set_traced(p, true);
 		if (p->p_pptr != td->td_proc) {
 			proc_reparent(p, td->td_proc, false);
 		}
 		CTR2(KTR_PTRACE, "PT_ATTACH: pid %d, oppid %d", p->p_pid,
 		    p->p_oppid);
 
 		sx_xunlock(&proctree_lock);
 		proctree_locked = 0;
 		MPASS(p->p_xthread == NULL);
 		MPASS((p->p_flag & P_STOPPED_TRACE) == 0);
 
 		/*
 		 * If already stopped due to a stop signal, clear the
 		 * existing stop before triggering a traced SIGSTOP.
 		 */
 		if ((p->p_flag & P_STOPPED_SIG) != 0) {
 			PROC_SLOCK(p);
 			p->p_flag &= ~(P_STOPPED_SIG | P_WAITED);
 			thread_unsuspend(p);
 			PROC_SUNLOCK(p);
 		}
 
 		kern_psignal(p, SIGSTOP);
 		break;
 
 	case PT_CLEARSTEP:
 		CTR2(KTR_PTRACE, "PT_CLEARSTEP: tid %d (pid %d)", td2->td_tid,
 		    p->p_pid);
 		error = ptrace_clear_single_step(td2);
 		break;
 
 	case PT_SETSTEP:
 		CTR2(KTR_PTRACE, "PT_SETSTEP: tid %d (pid %d)", td2->td_tid,
 		    p->p_pid);
 		error = ptrace_single_step(td2);
 		break;
 
 	case PT_SUSPEND:
 		CTR2(KTR_PTRACE, "PT_SUSPEND: tid %d (pid %d)", td2->td_tid,
 		    p->p_pid);
 		td2->td_dbgflags |= TDB_SUSPEND;
 		thread_lock(td2);
 		td2->td_flags |= TDF_NEEDSUSPCHK;
 		thread_unlock(td2);
 		break;
 
 	case PT_RESUME:
 		CTR2(KTR_PTRACE, "PT_RESUME: tid %d (pid %d)", td2->td_tid,
 		    p->p_pid);
 		td2->td_dbgflags &= ~TDB_SUSPEND;
 		break;
 
 	case PT_FOLLOW_FORK:
 		CTR3(KTR_PTRACE, "PT_FOLLOW_FORK: pid %d %s -> %s", p->p_pid,
 		    p->p_ptevents & PTRACE_FORK ? "enabled" : "disabled",
 		    data ? "enabled" : "disabled");
 		if (data)
 			p->p_ptevents |= PTRACE_FORK;
 		else
 			p->p_ptevents &= ~PTRACE_FORK;
 		break;
 
 	case PT_LWP_EVENTS:
 		CTR3(KTR_PTRACE, "PT_LWP_EVENTS: pid %d %s -> %s", p->p_pid,
 		    p->p_ptevents & PTRACE_LWP ? "enabled" : "disabled",
 		    data ? "enabled" : "disabled");
 		if (data)
 			p->p_ptevents |= PTRACE_LWP;
 		else
 			p->p_ptevents &= ~PTRACE_LWP;
 		break;
 
 	case PT_GET_EVENT_MASK:
 		if (data != sizeof(p->p_ptevents)) {
 			error = EINVAL;
 			break;
 		}
 		CTR2(KTR_PTRACE, "PT_GET_EVENT_MASK: pid %d mask %#x", p->p_pid,
 		    p->p_ptevents);
 		*(int *)addr = p->p_ptevents;
 		break;
 
 	case PT_SET_EVENT_MASK:
 		if (data != sizeof(p->p_ptevents)) {
 			error = EINVAL;
 			break;
 		}
 		tmp = *(int *)addr;
 		if ((tmp & ~(PTRACE_EXEC | PTRACE_SCE | PTRACE_SCX |
 		    PTRACE_FORK | PTRACE_LWP | PTRACE_VFORK)) != 0) {
 			error = EINVAL;
 			break;
 		}
 		CTR3(KTR_PTRACE, "PT_SET_EVENT_MASK: pid %d mask %#x -> %#x",
 		    p->p_pid, p->p_ptevents, tmp);
 		p->p_ptevents = tmp;
 		break;
 
 	case PT_GET_SC_ARGS:
 		CTR1(KTR_PTRACE, "PT_GET_SC_ARGS: pid %d", p->p_pid);
 		if ((td2->td_dbgflags & (TDB_SCE | TDB_SCX)) == 0
 #ifdef COMPAT_FREEBSD32
 		    || (wrap32 && !safe)
 #endif
 		    ) {
 			error = EINVAL;
 			break;
 		}
 		bzero(addr, sizeof(td2->td_sa.args));
 #ifdef COMPAT_FREEBSD32
 		if (wrap32)
 			for (num = 0; num < nitems(td2->td_sa.args); num++)
 				((uint32_t *)addr)[num] = (uint32_t)
 				    td2->td_sa.args[num];
 		else
 #endif
 			bcopy(td2->td_sa.args, addr, td2->td_sa.narg *
 			    sizeof(register_t));
 		break;
 
 	case PT_GET_SC_RET:
 		if ((td2->td_dbgflags & (TDB_SCX)) == 0
 #ifdef COMPAT_FREEBSD32
 		    || (wrap32 && !safe)
 #endif
 		    ) {
 			error = EINVAL;
 			break;
 		}
 #ifdef COMPAT_FREEBSD32
 		if (wrap32) {
 			psr = &r.psr;
 			psr32 = addr;
 		} else
 #endif
 		psr = addr;
 		bzero(psr, sizeof(*psr));
 		psr->sr_error = td2->td_errno;
 		if (psr->sr_error == 0) {
 			psr->sr_retval[0] = td2->td_retval[0];
 			psr->sr_retval[1] = td2->td_retval[1];
 		}
 #ifdef COMPAT_FREEBSD32
 		if (wrap32)
 			ptrace_sc_ret_to32(psr, psr32);
 #endif
 		CTR4(KTR_PTRACE,
 		    "PT_GET_SC_RET: pid %d error %d retval %#lx,%#lx",
 		    p->p_pid, psr->sr_error, psr->sr_retval[0],
 		    psr->sr_retval[1]);
 		break;
 		
 	case PT_STEP:
 	case PT_CONTINUE:
 	case PT_TO_SCE:
 	case PT_TO_SCX:
 	case PT_SYSCALL:
 	case PT_DETACH:
 		/* Zero means do not send any signal */
 		if (data < 0 || data > _SIG_MAXSIG) {
 			error = EINVAL;
 			break;
 		}
 
 		switch (req) {
 		case PT_STEP:
 			CTR3(KTR_PTRACE, "PT_STEP: tid %d (pid %d), sig = %d",
 			    td2->td_tid, p->p_pid, data);
 			error = ptrace_single_step(td2);
 			if (error)
 				goto out;
 			break;
 		case PT_CONTINUE:
 		case PT_TO_SCE:
 		case PT_TO_SCX:
 		case PT_SYSCALL:
 			if (addr != (void *)1) {
 				error = ptrace_set_pc(td2,
 				    (u_long)(uintfptr_t)addr);
 				if (error)
 					goto out;
 			}
 			switch (req) {
 			case PT_TO_SCE:
 				p->p_ptevents |= PTRACE_SCE;
 				CTR4(KTR_PTRACE,
 		    "PT_TO_SCE: pid %d, events = %#x, PC = %#lx, sig = %d",
 				    p->p_pid, p->p_ptevents,
 				    (u_long)(uintfptr_t)addr, data);
 				break;
 			case PT_TO_SCX:
 				p->p_ptevents |= PTRACE_SCX;
 				CTR4(KTR_PTRACE,
 		    "PT_TO_SCX: pid %d, events = %#x, PC = %#lx, sig = %d",
 				    p->p_pid, p->p_ptevents,
 				    (u_long)(uintfptr_t)addr, data);
 				break;
 			case PT_SYSCALL:
 				p->p_ptevents |= PTRACE_SYSCALL;
 				CTR4(KTR_PTRACE,
 		    "PT_SYSCALL: pid %d, events = %#x, PC = %#lx, sig = %d",
 				    p->p_pid, p->p_ptevents,
 				    (u_long)(uintfptr_t)addr, data);
 				break;
 			case PT_CONTINUE:
 				CTR3(KTR_PTRACE,
 				    "PT_CONTINUE: pid %d, PC = %#lx, sig = %d",
 				    p->p_pid, (u_long)(uintfptr_t)addr, data);
 				break;
 			}
 			break;
 		case PT_DETACH:
 			/*
 			 * Reset the process parent.
 			 *
 			 * NB: This clears P_TRACED before reparenting
 			 * a detached process back to its original
 			 * parent.  Otherwise the debugee will be set
 			 * as an orphan of the debugger.
 			 */
 			p->p_flag &= ~(P_TRACED | P_WAITED);
 			if (p->p_oppid != p->p_pptr->p_pid) {
 				PROC_LOCK(p->p_pptr);
 				sigqueue_take(p->p_ksi);
 				PROC_UNLOCK(p->p_pptr);
 
 				pp = proc_realparent(p);
 				proc_reparent(p, pp, false);
 				if (pp == initproc)
 					p->p_sigparent = SIGCHLD;
 				CTR3(KTR_PTRACE,
 			    "PT_DETACH: pid %d reparented to pid %d, sig %d",
 				    p->p_pid, pp->p_pid, data);
 			} else
 				CTR2(KTR_PTRACE, "PT_DETACH: pid %d, sig %d",
 				    p->p_pid, data);
 			p->p_ptevents = 0;
 			FOREACH_THREAD_IN_PROC(p, td3) {
 				if ((td3->td_dbgflags & TDB_FSTP) != 0) {
 					sigqueue_delete(&td3->td_sigqueue,
 					    SIGSTOP);
 				}
 				td3->td_dbgflags &= ~(TDB_XSIG | TDB_FSTP |
 				    TDB_SUSPEND);
 			}
 
 			if ((p->p_flag2 & P2_PTRACE_FSTP) != 0) {
 				sigqueue_delete(&p->p_sigqueue, SIGSTOP);
 				p->p_flag2 &= ~P2_PTRACE_FSTP;
 			}
 
 			/* should we send SIGCHLD? */
 			/* childproc_continued(p); */
 			break;
 		}
 
 		sx_xunlock(&proctree_lock);
 		proctree_locked = 0;
 
 	sendsig:
 		MPASS(proctree_locked == 0);
 		
 		/* 
 		 * Clear the pending event for the thread that just
 		 * reported its event (p_xthread).  This may not be
 		 * the thread passed to PT_CONTINUE, PT_STEP, etc. if
 		 * the debugger is resuming a different thread.
 		 *
 		 * Deliver any pending signal via the reporting thread.
 		 */
 		MPASS(p->p_xthread != NULL);
 		p->p_xthread->td_dbgflags &= ~TDB_XSIG;
 		p->p_xthread->td_xsig = data;
 		p->p_xthread = NULL;
 		p->p_xsig = data;
 
 		/*
 		 * P_WKILLED is insurance that a PT_KILL/SIGKILL
 		 * always works immediately, even if another thread is
 		 * unsuspended first and attempts to handle a
 		 * different signal or if the POSIX.1b style signal
 		 * queue cannot accommodate any new signals.
 		 */
 		if (data == SIGKILL)
 			proc_wkilled(p);
 
 		/*
 		 * Unsuspend all threads.  To leave a thread
 		 * suspended, use PT_SUSPEND to suspend it before
 		 * continuing the process.
 		 */
 		PROC_SLOCK(p);
 		p->p_flag &= ~(P_STOPPED_TRACE | P_STOPPED_SIG | P_WAITED);
 		thread_unsuspend(p);
 		PROC_SUNLOCK(p);
 		break;
 
 	case PT_WRITE_I:
 	case PT_WRITE_D:
 		td2->td_dbgflags |= TDB_USERWR;
 		PROC_UNLOCK(p);
 		error = 0;
 		if (proc_writemem(td, p, (off_t)(uintptr_t)addr, &data,
 		    sizeof(int)) != sizeof(int))
 			error = ENOMEM;
 		else
 			CTR3(KTR_PTRACE, "PT_WRITE: pid %d: %p <= %#x",
 			    p->p_pid, addr, data);
 		PROC_LOCK(p);
 		break;
 
 	case PT_READ_I:
 	case PT_READ_D:
 		PROC_UNLOCK(p);
 		error = tmp = 0;
 		if (proc_readmem(td, p, (off_t)(uintptr_t)addr, &tmp,
 		    sizeof(int)) != sizeof(int))
 			error = ENOMEM;
 		else
 			CTR3(KTR_PTRACE, "PT_READ: pid %d: %p >= %#x",
 			    p->p_pid, addr, tmp);
 		td->td_retval[0] = tmp;
 		PROC_LOCK(p);
 		break;
 
 	case PT_IO:
 #ifdef COMPAT_FREEBSD32
 		if (wrap32) {
 			piod32 = addr;
 			iov.iov_base = (void *)(uintptr_t)piod32->piod_addr;
 			iov.iov_len = piod32->piod_len;
 			uio.uio_offset = (off_t)(uintptr_t)piod32->piod_offs;
 			uio.uio_resid = piod32->piod_len;
 		} else
 #endif
 		{
 			piod = addr;
 			iov.iov_base = piod->piod_addr;
 			iov.iov_len = piod->piod_len;
 			uio.uio_offset = (off_t)(uintptr_t)piod->piod_offs;
 			uio.uio_resid = piod->piod_len;
 		}
 		uio.uio_iov = &iov;
 		uio.uio_iovcnt = 1;
 		uio.uio_segflg = UIO_USERSPACE;
 		uio.uio_td = td;
 #ifdef COMPAT_FREEBSD32
 		tmp = wrap32 ? piod32->piod_op : piod->piod_op;
 #else
 		tmp = piod->piod_op;
 #endif
 		switch (tmp) {
 		case PIOD_READ_D:
 		case PIOD_READ_I:
 			CTR3(KTR_PTRACE, "PT_IO: pid %d: READ (%p, %#x)",
 			    p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid);
 			uio.uio_rw = UIO_READ;
 			break;
 		case PIOD_WRITE_D:
 		case PIOD_WRITE_I:
 			CTR3(KTR_PTRACE, "PT_IO: pid %d: WRITE (%p, %#x)",
 			    p->p_pid, (uintptr_t)uio.uio_offset, uio.uio_resid);
 			td2->td_dbgflags |= TDB_USERWR;
 			uio.uio_rw = UIO_WRITE;
 			break;
 		default:
 			error = EINVAL;
 			goto out;
 		}
 		PROC_UNLOCK(p);
 		error = proc_rwmem(p, &uio);
 #ifdef COMPAT_FREEBSD32
 		if (wrap32)
 			piod32->piod_len -= uio.uio_resid;
 		else
 #endif
 			piod->piod_len -= uio.uio_resid;
 		PROC_LOCK(p);
 		break;
 
 	case PT_KILL:
 		CTR1(KTR_PTRACE, "PT_KILL: pid %d", p->p_pid);
 		data = SIGKILL;
 		goto sendsig;	/* in PT_CONTINUE above */
 
 	case PT_SETREGS:
 		CTR2(KTR_PTRACE, "PT_SETREGS: tid %d (pid %d)", td2->td_tid,
 		    p->p_pid);
 		td2->td_dbgflags |= TDB_USERWR;
 		error = PROC_WRITE(regs, td2, addr);
 		break;
 
 	case PT_GETREGS:
 		CTR2(KTR_PTRACE, "PT_GETREGS: tid %d (pid %d)", td2->td_tid,
 		    p->p_pid);
 		error = PROC_READ(regs, td2, addr);
 		break;
 
 	case PT_SETFPREGS:
 		CTR2(KTR_PTRACE, "PT_SETFPREGS: tid %d (pid %d)", td2->td_tid,
 		    p->p_pid);
 		td2->td_dbgflags |= TDB_USERWR;
 		error = PROC_WRITE(fpregs, td2, addr);
 		break;
 
 	case PT_GETFPREGS:
 		CTR2(KTR_PTRACE, "PT_GETFPREGS: tid %d (pid %d)", td2->td_tid,
 		    p->p_pid);
 		error = PROC_READ(fpregs, td2, addr);
 		break;
 
 	case PT_SETDBREGS:
 		CTR2(KTR_PTRACE, "PT_SETDBREGS: tid %d (pid %d)", td2->td_tid,
 		    p->p_pid);
 		td2->td_dbgflags |= TDB_USERWR;
 		error = PROC_WRITE(dbregs, td2, addr);
 		break;
 
 	case PT_GETDBREGS:
 		CTR2(KTR_PTRACE, "PT_GETDBREGS: tid %d (pid %d)", td2->td_tid,
 		    p->p_pid);
 		error = PROC_READ(dbregs, td2, addr);
 		break;
 
 	case PT_LWPINFO:
 		if (data <= 0 ||
 #ifdef COMPAT_FREEBSD32
 		    (!wrap32 && data > sizeof(*pl)) ||
 		    (wrap32 && data > sizeof(*pl32))) {
 #else
 		    data > sizeof(*pl)) {
 #endif
 			error = EINVAL;
 			break;
 		}
 #ifdef COMPAT_FREEBSD32
 		if (wrap32) {
 			pl = &r.pl;
 			pl32 = addr;
 		} else
 #endif
 		pl = addr;
 		bzero(pl, sizeof(*pl));
 		pl->pl_lwpid = td2->td_tid;
 		pl->pl_event = PL_EVENT_NONE;
 		pl->pl_flags = 0;
 		if (td2->td_dbgflags & TDB_XSIG) {
 			pl->pl_event = PL_EVENT_SIGNAL;
 			if (td2->td_si.si_signo != 0 &&
 #ifdef COMPAT_FREEBSD32
 			    ((!wrap32 && data >= offsetof(struct ptrace_lwpinfo,
 			    pl_siginfo) + sizeof(pl->pl_siginfo)) ||
 			    (wrap32 && data >= offsetof(struct ptrace_lwpinfo32,
 			    pl_siginfo) + sizeof(struct siginfo32)))
 #else
 			    data >= offsetof(struct ptrace_lwpinfo, pl_siginfo)
 			    + sizeof(pl->pl_siginfo)
 #endif
 			){
 				pl->pl_flags |= PL_FLAG_SI;
 				pl->pl_siginfo = td2->td_si;
 			}
 		}
 		if (td2->td_dbgflags & TDB_SCE)
 			pl->pl_flags |= PL_FLAG_SCE;
 		else if (td2->td_dbgflags & TDB_SCX)
 			pl->pl_flags |= PL_FLAG_SCX;
 		if (td2->td_dbgflags & TDB_EXEC)
 			pl->pl_flags |= PL_FLAG_EXEC;
 		if (td2->td_dbgflags & TDB_FORK) {
 			pl->pl_flags |= PL_FLAG_FORKED;
 			pl->pl_child_pid = td2->td_dbg_forked;
 			if (td2->td_dbgflags & TDB_VFORK)
 				pl->pl_flags |= PL_FLAG_VFORKED;
 		} else if ((td2->td_dbgflags & (TDB_SCX | TDB_VFORK)) ==
 		    TDB_VFORK)
 			pl->pl_flags |= PL_FLAG_VFORK_DONE;
 		if (td2->td_dbgflags & TDB_CHILD)
 			pl->pl_flags |= PL_FLAG_CHILD;
 		if (td2->td_dbgflags & TDB_BORN)
 			pl->pl_flags |= PL_FLAG_BORN;
 		if (td2->td_dbgflags & TDB_EXIT)
 			pl->pl_flags |= PL_FLAG_EXITED;
 		pl->pl_sigmask = td2->td_sigmask;
 		pl->pl_siglist = td2->td_siglist;
 		strcpy(pl->pl_tdname, td2->td_name);
 		if ((td2->td_dbgflags & (TDB_SCE | TDB_SCX)) != 0) {
 			pl->pl_syscall_code = td2->td_sa.code;
 			pl->pl_syscall_narg = td2->td_sa.narg;
 		} else {
 			pl->pl_syscall_code = 0;
 			pl->pl_syscall_narg = 0;
 		}
 #ifdef COMPAT_FREEBSD32
 		if (wrap32)
 			ptrace_lwpinfo_to32(pl, pl32);
 #endif
 		CTR6(KTR_PTRACE,
     "PT_LWPINFO: tid %d (pid %d) event %d flags %#x child pid %d syscall %d",
 		    td2->td_tid, p->p_pid, pl->pl_event, pl->pl_flags,
 		    pl->pl_child_pid, pl->pl_syscall_code);
 		break;
 
 	case PT_GETNUMLWPS:
 		CTR2(KTR_PTRACE, "PT_GETNUMLWPS: pid %d: %d threads", p->p_pid,
 		    p->p_numthreads);
 		td->td_retval[0] = p->p_numthreads;
 		break;
 
 	case PT_GETLWPLIST:
 		CTR3(KTR_PTRACE, "PT_GETLWPLIST: pid %d: data %d, actual %d",
 		    p->p_pid, data, p->p_numthreads);
 		if (data <= 0) {
 			error = EINVAL;
 			break;
 		}
 		num = imin(p->p_numthreads, data);
 		PROC_UNLOCK(p);
 		buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK);
 		tmp = 0;
 		PROC_LOCK(p);
 		FOREACH_THREAD_IN_PROC(p, td2) {
 			if (tmp >= num)
 				break;
 			buf[tmp++] = td2->td_tid;
 		}
 		PROC_UNLOCK(p);
 		error = copyout(buf, addr, tmp * sizeof(lwpid_t));
 		free(buf, M_TEMP);
 		if (!error)
 			td->td_retval[0] = tmp;
 		PROC_LOCK(p);
 		break;
 
 	case PT_VM_TIMESTAMP:
 		CTR2(KTR_PTRACE, "PT_VM_TIMESTAMP: pid %d: timestamp %d",
 		    p->p_pid, p->p_vmspace->vm_map.timestamp);
 		td->td_retval[0] = p->p_vmspace->vm_map.timestamp;
 		break;
 
 	case PT_VM_ENTRY:
 		PROC_UNLOCK(p);
 #ifdef COMPAT_FREEBSD32
 		if (wrap32)
 			error = ptrace_vm_entry32(td, p, addr);
 		else
 #endif
 		error = ptrace_vm_entry(td, p, addr);
 		PROC_LOCK(p);
 		break;
 
 	default:
 #ifdef __HAVE_PTRACE_MACHDEP
 		if (req >= PT_FIRSTMACH) {
 			PROC_UNLOCK(p);
 			error = cpu_ptrace(td2, req, addr, data);
 			PROC_LOCK(p);
 		} else
 #endif
 			/* Unknown request. */
 			error = EINVAL;
 		break;
 	}
 
 out:
 	/* Drop our hold on this process now that the request has completed. */
 	_PRELE(p);
 fail:
 	PROC_UNLOCK(p);
 	if (proctree_locked)
 		sx_xunlock(&proctree_lock);
 	return (error);
 }
 #undef PROC_READ
 #undef PROC_WRITE
 
 /*
  * Park the process for a debugging event.  The event is published in
  * p_stype, sleepers on p_stype (PIOCWAIT'ing procs) are woken, and the
  * caller then sleeps on p_step until it reads back as zero (cleared by
  * PIOCCONT in procfs).  The process lock must be held on entry; it is
  * the mutex handed to msleep().
  */
 void
 stopevent(struct proc *p, unsigned int event, unsigned int val)
 {
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_step = 1;
 	CTR3(KTR_PTRACE, "stopevent: pid %d event %u val %u", p->p_pid, event,
 	    val);
 	for (;;) {
 		p->p_xthread = NULL;
 		/* An exit event leaves the recorded p_xsig untouched. */
 		if (event != S_EXIT)
 			p->p_xsig = val;
 		/* Record which event caused the stop, then announce it. */
 		p->p_stype = event;
 		wakeup(&p->p_stype);
 		msleep(&p->p_step, &p->p_mtx, PWAIT, "stopevent", 0);
 		if (p->p_step == 0)
 			break;
 	}
 }
Index: projects/fuse2/sys/kern/uipc_shm.c
===================================================================
--- projects/fuse2/sys/kern/uipc_shm.c	(revision 350434)
+++ projects/fuse2/sys/kern/uipc_shm.c	(revision 350435)
@@ -1,1180 +1,1181 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2006, 2011, 2016-2017 Robert N. M. Watson
  * All rights reserved.
  *
  * Portions of this software were developed by BAE Systems, the University of
  * Cambridge Computer Laboratory, and Memorial University under DARPA/AFRL
  * contract FA8650-15-C-7558 ("CADETS"), as part of the DARPA Transparent
  * Computing (TC) research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * Support for shared swap-backed anonymous memory objects via
  * shm_open(2) and shm_unlink(2).  While most of the implementation is
  * here, vm_mmap.c contains mapping logic changes.
  *
  * posixshmcontrol(1) allows users to inspect the state of the memory
  * objects.  The per-uid swap resource limit controls the total amount
  * of memory that a user can consume for anonymous objects, including
  * shared ones.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fnv_hash.h>
 #include <sys/kernel.h>
+#include <sys/limits.h>
 #include <sys/uio.h>
 #include <sys/signal.h>
 #include <sys/jail.h>
 #include <sys/ktrace.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/refcount.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/stat.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/time.h>
 #include <sys/vnode.h>
 #include <sys/unistd.h>
 #include <sys/user.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
 /*
  * One entry of the path -> shmfd dictionary.  Entries are chained on the
  * bucket selected by SHM_HASH(sm_fnv).
  */
 struct shm_mapping {
 	char		*sm_path;	/* path string; freed by shm_remove() */
 	Fnv32_t		sm_fnv;		/* hash of sm_path, compared first */
 	struct shmfd	*sm_shmfd;	/* reference taken via shm_hold() */
 	LIST_ENTRY(shm_mapping) sm_link; /* bucket list linkage */
 };
 
 static MALLOC_DEFINE(M_SHMFD, "shmfd", "shared memory file descriptor");
 /* Hash table of shm_mapping chains, allocated by hashinit() in shm_init(). */
 static LIST_HEAD(, shm_mapping) *shm_dictionary;
 /* Held exclusively by kern_shm_open() and sys_shm_unlink() around dictionary use. */
 static struct sx shm_dict_lock;
 /*
  * Taken by shm_stat(), shm_access(), shm_chmod(), shm_chown() and others
  * around the shmfd timestamps, mode, uid and gid.
  */
 static struct mtx shm_timestamp_lock;
 /* Bucket mask filled in by hashinit(). */
 static u_long shm_hash;
 /* Allocator for the shm_ino value given to each new shmfd. */
 static struct unrhdr64 shm_ino_unr;
 /* Value reported as st_dev by shm_stat(). */
 static dev_t shm_dev_ino;
 
 /* Map an FNV hash value to the head of its dictionary bucket. */
 #define	SHM_HASH(fnv)	(&shm_dictionary[(fnv) & shm_hash])
 
 static void	shm_init(void *arg);
 static void	shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd);
 static struct shmfd *shm_lookup(char *path, Fnv32_t fnv);
 static int	shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred);
 
 static fo_rdwr_t	shm_read;
 static fo_rdwr_t	shm_write;
 static fo_truncate_t	shm_truncate;
 static fo_ioctl_t	shm_ioctl;
 static fo_stat_t	shm_stat;
 static fo_close_t	shm_close;
 static fo_chmod_t	shm_chmod;
 static fo_chown_t	shm_chown;
 static fo_seek_t	shm_seek;
 static fo_fill_kinfo_t	shm_fill_kinfo;
 static fo_mmap_t	shm_mmap;
 
 /*
  * File descriptor operations.  Poll and kqfilter use the generic invfo_*
  * handlers and sendfile uses vn_sendfile; every other operation is
  * implemented by a shm_* function in this file.
  */
 struct fileops shm_ops = {
 	.fo_read = shm_read,
 	.fo_write = shm_write,
 	.fo_truncate = shm_truncate,
 	.fo_ioctl = shm_ioctl,
 	.fo_poll = invfo_poll,
 	.fo_kqfilter = invfo_kqfilter,
 	.fo_stat = shm_stat,
 	.fo_close = shm_close,
 	.fo_chmod = shm_chmod,
 	.fo_chown = shm_chown,
 	.fo_sendfile = vn_sendfile,
 	.fo_seek = shm_seek,
 	.fo_fill_kinfo = shm_fill_kinfo,
 	.fo_mmap = shm_mmap,
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
 FEATURE(posix_shm, "POSIX shared memory");
 
 /*
  * Copy data between "uio" and the single page of "obj" that contains
  * uio->uio_offset.  The transfer is limited to "len" bytes and to the
  * end of that page.  Returns the uiomove()/uiomove_fromphys() result, or
  * EIO if the pager fails to bring the page in.
  */
 static int
 uiomove_object_page(vm_object_t obj, size_t len, struct uio *uio)
 {
 	vm_page_t m;
 	vm_pindex_t idx;
 	size_t tlen;
 	int error, offset, rv;
 
 	/* Split the offset into a page index and an offset within the page. */
 	idx = OFF_TO_IDX(uio->uio_offset);
 	offset = uio->uio_offset & PAGE_MASK;
 	tlen = MIN(PAGE_SIZE - offset, len);
 
 	VM_OBJECT_WLOCK(obj);
 
 	/*
 	 * Read I/O without either a corresponding resident page or swap
 	 * page: use zero_region.  This is intended to avoid instantiating
 	 * pages on read from a sparse region.
 	 */
 	if (uio->uio_rw == UIO_READ && vm_page_lookup(obj, idx) == NULL &&
 	    !vm_pager_has_page(obj, idx, NULL, NULL)) {
 		VM_OBJECT_WUNLOCK(obj);
 		return (uiomove(__DECONST(void *, zero_region), tlen, uio));
 	}
 
 	/*
 	 * Parallel reads of the page content from disk are prevented
 	 * by exclusive busy.
 	 *
 	 * Although the tmpfs vnode lock is held here, it is
 	 * nonetheless safe to sleep waiting for a free page.  The
 	 * pageout daemon does not need to acquire the tmpfs vnode
 	 * lock to page out tobj's pages because tobj is a OBJT_SWAP
 	 * type object.
 	 *
 	 * NOTE(review): nothing in this function references a tmpfs vnode
 	 * or a variable named "tobj"; the paragraph above presumably came
 	 * along from tmpfs code -- confirm and reword for "obj".
 	 */
 	m = vm_page_grab(obj, idx, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
 	if (m->valid != VM_PAGE_BITS_ALL) {
 		vm_page_xbusy(m);
 		if (vm_pager_has_page(obj, idx, NULL, NULL)) {
 			/* Backing store exists: page the contents in. */
 			rv = vm_pager_get_pages(obj, &m, 1, NULL, NULL);
 			if (rv != VM_PAGER_OK) {
 				printf(
 	    "uiomove_object: vm_obj %p idx %jd valid %x pager error %d\n",
 				    obj, idx, m->valid, rv);
 				vm_page_lock(m);
 				vm_page_free(m);
 				vm_page_unlock(m);
 				VM_OBJECT_WUNLOCK(obj);
 				return (EIO);
 			}
 		} else
 			/* No backing store: zero the invalid parts instead. */
 			vm_page_zero_invalid(m, TRUE);
 		vm_page_xunbusy(m);
 	}
 	/* Wire the page for the duration of the copy, done without the object lock. */
 	vm_page_lock(m);
 	vm_page_wire(m);
 	vm_page_unlock(m);
 	VM_OBJECT_WUNLOCK(obj);
 	error = uiomove_fromphys(&m, offset, tlen, uio);
 	/* A successful write dirties the page and drops its swap copy. */
 	if (uio->uio_rw == UIO_WRITE && error == 0) {
 		VM_OBJECT_WLOCK(obj);
 		vm_page_dirty(m);
 		vm_pager_page_unswapped(m);
 		VM_OBJECT_WUNLOCK(obj);
 	}
 	vm_page_lock(m);
 	vm_page_unwire(m, PQ_ACTIVE);
 	vm_page_unlock(m);
 
 	return (error);
 }
 
 /*
  * Transfer data between "uio" and "obj" one page at a time, never going
  * past "obj_size".  The loop ends when the request is satisfied, the
  * offset reaches the object size, a page transfer fails, or a pass
  * moves no data.  Returns 0 or the error from the last page transfer.
  */
 int
 uiomove_object(vm_object_t obj, off_t obj_size, struct uio *uio)
 {
 	ssize_t before;
 	size_t chunk;
 	int error;
 
 	error = 0;
 	for (;;) {
 		before = uio->uio_resid;
 		if (before <= 0)
 			break;
 		if (uio->uio_offset >= obj_size)
 			break;
 		chunk = MIN(obj_size - uio->uio_offset, before);
 		if (chunk == 0)
 			break;
 		error = uiomove_object_page(obj, chunk, uio);
 		if (error != 0)
 			break;
 		/* No forward progress: stop rather than spin. */
 		if (uio->uio_resid == before)
 			break;
 	}
 	return (error);
 }
 
 /*
  * Seek handler.  Computes the new offset from "whence", rejecting
  * additions that would exceed OFF_MAX with EOVERFLOW and results outside
  * [0, shm_size] with EINVAL.  On success the new offset is stored in
  * td_uretoff and committed by foffset_unlock(); on failure the file
  * offset is left alone (FOF_NOUPDATE).
  */
 static int
 shm_seek(struct file *fp, off_t offset, int whence, struct thread *td)
 {
 	struct shmfd *shmfd;
 	off_t cur;
 	int error, unlock_flags;
 
 	shmfd = fp->f_data;
 	cur = foffset_lock(fp, 0);
 	error = 0;
 	if (whence == L_INCR) {
 		/* Relative to the current file offset. */
 		if (cur < 0 || (offset > 0 && cur > OFF_MAX - offset))
 			error = EOVERFLOW;
 		else
 			offset += cur;
 	} else if (whence == L_XTND) {
 		/* Relative to the end of the object. */
 		if (offset > 0 && shmfd->shm_size > OFF_MAX - offset)
 			error = EOVERFLOW;
 		else
 			offset += shmfd->shm_size;
 	} else if (whence != L_SET) {
 		error = EINVAL;
 	}
 
 	if (error == 0) {
 		if (offset >= 0 && offset <= shmfd->shm_size)
 			td->td_uretoff.tdu_off = offset;
 		else
 			error = EINVAL;
 	}
 	unlock_flags = (error == 0) ? 0 : FOF_NOUPDATE;
 	foffset_unlock(fp, offset, unlock_flags);
 	return (error);
 }
 
 static int
 shm_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct shmfd *shmfd;
 	void *rl_cookie;
 	int error;
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_read(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	foffset_lock_uio(fp, uio, flags);
 	rl_cookie = rangelock_rlock(&shmfd->shm_rl, uio->uio_offset,
 	    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
 	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 static int
 shm_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct shmfd *shmfd;
 	void *rl_cookie;
 	int error;
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_write(active_cred, fp->f_cred, shmfd);
 	if (error)
 		return (error);
 #endif
 	foffset_lock_uio(fp, uio, flags);
 	if ((flags & FOF_OFFSET) == 0) {
 		rl_cookie = rangelock_wlock(&shmfd->shm_rl, 0, OFF_MAX,
 		    &shmfd->shm_mtx);
 	} else {
 		rl_cookie = rangelock_wlock(&shmfd->shm_rl, uio->uio_offset,
 		    uio->uio_offset + uio->uio_resid, &shmfd->shm_mtx);
 	}
 
 	error = uiomove_object(shmfd->shm_object, shmfd->shm_size, uio);
 	rangelock_unlock(&shmfd->shm_rl, rl_cookie, &shmfd->shm_mtx);
 	foffset_unlock_uio(fp, uio, flags);
 	return (error);
 }
 
 /*
  * Truncate handler: run the MAC truncate check when MAC is compiled in,
  * then hand the resize to shm_dotruncate().
  */
 static int
 shm_truncate(struct file *fp, off_t length, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *target;
 #ifdef MAC
 	int rc;
 #endif
 
 	target = fp->f_data;
 #ifdef MAC
 	rc = mac_posixshm_check_truncate(active_cred, fp->f_cred, target);
 	if (rc != 0)
 		return (rc);
 #endif
 	return (shm_dotruncate(target, length));
 }
 
 /*
  * ioctl handler.  FIONBIO and FIOASYNC are accepted and ignored; every
  * other command fails with ENOTTY.
  *
  * The definition carries "static" to match the "static fo_ioctl_t
  * shm_ioctl;" forward declaration earlier in this file.
  */
 static int
 shm_ioctl(struct file *fp, u_long com, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 
 	switch (com) {
 	case FIONBIO:
 	case FIOASYNC:
 		/*
 		 * Allow fcntl(fd, F_SETFL, O_NONBLOCK) to work,
 		 * just like it would on an unlinked regular file
 		 */
 		return (0);
 	default:
 		return (ENOTTY);
 	}
 }
 
 /*
  * fstat handler.  Zeroes "sb" and fills it from the shmfd: size and
  * block counts, device and inode numbers, the object's reference count
  * as st_nlink, and (under shm_timestamp_lock) the timestamps, mode and
  * ownership.
  */
 static int
 shm_stat(struct file *fp, struct stat *sb, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 #ifdef MAC
 	int error;
 #endif
 
 	shmfd = fp->f_data;
 #ifdef MAC
 	error = mac_posixshm_check_stat(active_cred, fp->f_cred, shmfd);
 	if (error != 0)
 		return (error);
 #endif
 
 	/*
 	 * Attempt to return sanish values for fstat() on a memory file
 	 * descriptor.
 	 */
 	bzero(sb, sizeof(*sb));
 	sb->st_dev = shm_dev_ino;
 	sb->st_ino = shmfd->shm_ino;
 	sb->st_nlink = shmfd->shm_object->ref_count;
 	sb->st_size = shmfd->shm_size;
 	sb->st_blksize = PAGE_SIZE;
 	sb->st_blocks = howmany(sb->st_size, sb->st_blksize);
 
 	mtx_lock(&shm_timestamp_lock);
 	sb->st_mode = S_IFREG | shmfd->shm_mode;		/* XXX */
 	sb->st_uid = shmfd->shm_uid;
 	sb->st_gid = shmfd->shm_gid;
 	sb->st_birthtim = shmfd->shm_birthtime;
 	sb->st_atim = shmfd->shm_atime;
 	sb->st_mtim = shmfd->shm_mtime;
 	sb->st_ctim = shmfd->shm_ctime;
 	mtx_unlock(&shm_timestamp_lock);
 
 	return (0);
 }
 
 /*
  * Close handler: detach the shmfd from the file and drop the reference
  * the file held on it.  Always returns 0.
  */
 static int
 shm_close(struct file *fp, struct thread *td)
 {
 	struct shmfd *released;
 
 	released = fp->f_data;
 	fp->f_data = NULL;
 	shm_drop(released);
 	return (0);
 }
 
 /*
  * Resize the object backing "shmfd" to "length" bytes.
  *
  * Returns 0 on success (including when the size is already "length"),
  * EBUSY when asked to shrink while shm_kmappings is non-zero, EIO when
  * the pager cannot read the page that holds the new end, and ENOMEM when
  * the additional swap for growing cannot be reserved.
  */
 int
 shm_dotruncate(struct shmfd *shmfd, off_t length)
 {
 	vm_object_t object;
 	vm_page_t m;
 	vm_pindex_t idx, nobjsize;
 	vm_ooffset_t delta;
 	int base, rv;
 
 	KASSERT(length >= 0, ("shm_dotruncate: length < 0"));
 	object = shmfd->shm_object;
 	VM_OBJECT_WLOCK(object);
 	if (length == shmfd->shm_size) {
 		VM_OBJECT_WUNLOCK(object);
 		return (0);
 	}
 	/* New object size in pages, rounding a partial last page up. */
 	nobjsize = OFF_TO_IDX(length + PAGE_MASK);
 
 	/* Are we shrinking?  If so, trim the end. */
 	if (length < shmfd->shm_size) {
 		/*
 		 * Disallow any requests to shrink the size if this
 		 * object is mapped into the kernel.
 		 */
 		if (shmfd->shm_kmappings > 0) {
 			VM_OBJECT_WUNLOCK(object);
 			return (EBUSY);
 		}
 
 		/*
 		 * Zero the truncated part of the last page.
 		 */
 		base = length & PAGE_MASK;
 		if (base != 0) {
 			idx = OFF_TO_IDX(length);
 retry:
 			m = vm_page_lookup(object, idx);
 			if (m != NULL) {
 				/* Resident but busy: wait, then look it up again. */
 				if (vm_page_sleep_if_busy(m, "shmtrc"))
 					goto retry;
 			} else if (vm_pager_has_page(object, idx, NULL, NULL)) {
 				/* Not resident but backed by the pager: bring it in. */
 				m = vm_page_alloc(object, idx,
 				    VM_ALLOC_NORMAL | VM_ALLOC_WAITFAIL);
 				if (m == NULL)
 					goto retry;
 				rv = vm_pager_get_pages(object, &m, 1, NULL,
 				    NULL);
 				vm_page_lock(m);
 				if (rv == VM_PAGER_OK) {
 					/*
 					 * Since the page was not resident,
 					 * and therefore not recently
 					 * accessed, immediately enqueue it
 					 * for asynchronous laundering.  The
 					 * current operation is not regarded
 					 * as an access.
 					 */
 					vm_page_launder(m);
 					vm_page_unlock(m);
 					vm_page_xunbusy(m);
 				} else {
 					vm_page_free(m);
 					vm_page_unlock(m);
 					VM_OBJECT_WUNLOCK(object);
 					return (EIO);
 				}
 			}
 			/* m stays NULL when the page exists nowhere: nothing to zero. */
 			if (m != NULL) {
 				pmap_zero_page_area(m, base, PAGE_SIZE - base);
 				KASSERT(m->valid == VM_PAGE_BITS_ALL,
 				    ("shm_dotruncate: page %p is invalid", m));
 				vm_page_dirty(m);
 				vm_pager_page_unswapped(m);
 			}
 		}
 		/* Bytes covered by the whole pages being given up. */
 		delta = IDX_TO_OFF(object->size - nobjsize);
 
 		/* Toss in memory pages. */
 		if (nobjsize < object->size)
 			vm_object_page_remove(object, nobjsize, object->size,
 			    0);
 
 		/* Toss pages from swap. */
 		if (object->type == OBJT_SWAP)
 			swap_pager_freespace(object, nobjsize, delta);
 
 		/* Free the swap accounted for shm */
 		swap_release_by_cred(delta, object->cred);
 		object->charge -= delta;
 	} else {
 		/* Try to reserve additional swap space. */
 		delta = IDX_TO_OFF(nobjsize - object->size);
 		if (!swap_reserve_by_cred(delta, object->cred)) {
 			VM_OBJECT_WUNLOCK(object);
 			return (ENOMEM);
 		}
 		object->charge += delta;
 	}
 	/* Commit the new size and stamp ctime/mtime with the current time. */
 	shmfd->shm_size = length;
 	mtx_lock(&shm_timestamp_lock);
 	vfs_timestamp(&shmfd->shm_ctime);
 	shmfd->shm_mtime = shmfd->shm_ctime;
 	mtx_unlock(&shm_timestamp_lock);
 	object->size = nobjsize;
 	VM_OBJECT_WUNLOCK(object);
 	return (0);
 }
 
 /*
  * shmfd object management including creation and reference counting
  * routines.
  */
 /*
  * Allocate and initialise a zero-length shmfd owned by "ucred" with
  * permission bits "mode".  The returned structure carries one reference
  * and a freshly allocated backing VM object.
  */
 struct shmfd *
 shm_alloc(struct ucred *ucred, mode_t mode)
 {
 	struct shmfd *shmfd;
 	vm_object_t object;
 
 	shmfd = malloc(sizeof(*shmfd), M_SHMFD, M_WAITOK | M_ZERO);
 	shmfd->shm_size = 0;
 	shmfd->shm_mode = mode;
 	shmfd->shm_uid = ucred->cr_uid;
 	shmfd->shm_gid = ucred->cr_gid;
 
 	/* Create the backing object and adjust its flags. */
 	object = vm_pager_allocate(OBJT_DEFAULT, NULL, shmfd->shm_size,
 	    VM_PROT_DEFAULT, 0, ucred);
 	shmfd->shm_object = object;
 	KASSERT(object != NULL, ("shm_create: vm_pager_allocate"));
 	object->pg_color = 0;
 	VM_OBJECT_WLOCK(object);
 	vm_object_clear_flag(object, OBJ_ONEMAPPING);
 	vm_object_set_flag(object, OBJ_COLORED | OBJ_NOSPLIT);
 	VM_OBJECT_WUNLOCK(object);
 
 	/* All four timestamps start out equal to the creation time. */
 	vfs_timestamp(&shmfd->shm_birthtime);
 	shmfd->shm_ctime = shmfd->shm_birthtime;
 	shmfd->shm_mtime = shmfd->shm_birthtime;
 	shmfd->shm_atime = shmfd->shm_birthtime;
 
 	shmfd->shm_ino = alloc_unr64(&shm_ino_unr);
 	refcount_init(&shmfd->shm_refs, 1);
 	mtx_init(&shmfd->shm_mtx, "shmrl", NULL, MTX_DEF);
 	rangelock_init(&shmfd->shm_rl);
 #ifdef MAC
 	mac_posixshm_init(shmfd);
 	mac_posixshm_create(ucred, shmfd);
 #endif
 
 	return (shmfd);
 }
 
 /*
  * Take an additional reference on "shmfd" and return it, so the call can
  * be used directly in an assignment (as shm_insert() does).
  */
 struct shmfd *
 shm_hold(struct shmfd *shmfd)
 {
 
 	refcount_acquire(&shmfd->shm_refs);
 	return (shmfd);
 }
 
 /*
  * Release one reference on "shmfd".  When the last reference goes away
  * the MAC label, range lock, mutex and backing object are torn down and
  * the structure is freed.
  */
 void
 shm_drop(struct shmfd *shmfd)
 {
 
 	/* Other references remain: nothing more to do. */
 	if (!refcount_release(&shmfd->shm_refs))
 		return;
 
 #ifdef MAC
 	mac_posixshm_destroy(shmfd);
 #endif
 	rangelock_destroy(&shmfd->shm_rl);
 	mtx_destroy(&shmfd->shm_mtx);
 	vm_object_deallocate(shmfd->shm_object);
 	free(shmfd, M_SHMFD);
 }
 
 /*
  * Determine if the credentials have sufficient permissions for a
  * specified combination of FREAD and FWRITE.
  */
 /*
  * Translate FREAD/FWRITE in "flags" into VREAD/VWRITE and ask vaccess()
  * whether "ucred" may access the object with its current mode, uid and
  * gid.  Returns the vaccess() result.
  */
 int
 shm_access(struct shmfd *shmfd, struct ucred *ucred, int flags)
 {
 	accmode_t wanted;
 	int error;
 
 	wanted = ((flags & FREAD) != 0 ? VREAD : 0) |
 	    ((flags & FWRITE) != 0 ? VWRITE : 0);
 
 	/* Mode and ownership are read under the same lock that chmod/chown take. */
 	mtx_lock(&shm_timestamp_lock);
 	error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid, shmfd->shm_gid,
 	    wanted, ucred, NULL);
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 /*
  * Dictionary management.  We maintain an in-kernel dictionary to map
  * paths to shmfd objects.  We use the FNV hash on the path to store
  * the mappings in a hash table.
  */
 /*
  * One-time setup run through SYSINIT: allocate the path dictionary,
  * initialise the two global locks, and set up the device and inode
  * number sources used by shm_stat() and shm_alloc().
  */
 static void
 shm_init(void *arg)
 {
 
 	shm_dictionary = hashinit(1024, M_SHMFD, &shm_hash);
 	sx_init(&shm_dict_lock, "shm dictionary");
 	mtx_init(&shm_timestamp_lock, "shm timestamps", NULL, MTX_DEF);
 
 	shm_dev_ino = devfs_alloc_cdp_inode();
 	KASSERT(shm_dev_ino > 0, ("shm dev inode not initialized"));
 	new_unrhdr64(&shm_ino_unr, 1);
 }
 SYSINIT(shm_init, SI_SUB_SYSV_SHM, SI_ORDER_ANY, shm_init, NULL);
 
 /*
  * Find the shmfd registered under "path" (whose hash is "fnv").
  * Returns the shmfd without taking a new reference, or NULL when no
  * entry matches.
  */
 static struct shmfd *
 shm_lookup(char *path, Fnv32_t fnv)
 {
 	struct shm_mapping *entry;
 
 	LIST_FOREACH(entry, SHM_HASH(fnv), sm_link) {
 		/* Compare the hash first so strcmp() only runs on candidates. */
 		if (entry->sm_fnv == fnv && strcmp(entry->sm_path, path) == 0)
 			return (entry->sm_shmfd);
 	}
 
 	return (NULL);
 }
 
 /*
  * Register "shmfd" in the dictionary under "path".  The new entry keeps
  * the "path" pointer itself (no copy is made) and takes its own
  * reference on the shmfd.
  */
 static void
 shm_insert(char *path, Fnv32_t fnv, struct shmfd *shmfd)
 {
 	struct shm_mapping *entry;
 
 	entry = malloc(sizeof(*entry), M_SHMFD, M_WAITOK);
 	entry->sm_fnv = fnv;
 	entry->sm_path = path;
 	shm_hold(shmfd);
 	entry->sm_shmfd = shmfd;
 	shmfd->shm_path = path;
 	LIST_INSERT_HEAD(SHM_HASH(fnv), entry, sm_link);
 }
 
 /*
  * Remove the dictionary entry for "path" on behalf of "ucred".
  * Returns ENOENT when there is no such entry, or the error from the MAC
  * unlink check or from shm_access(FREAD | FWRITE) when either denies
  * the request.  On success the entry's reference on the shmfd is
  * dropped and the entry and its path string are freed.
  */
 static int
 shm_remove(char *path, Fnv32_t fnv, struct ucred *ucred)
 {
 	struct shm_mapping *entry;
 	int error;
 
 	/* Locate the entry; LIST_FOREACH leaves entry NULL if none matches. */
 	LIST_FOREACH(entry, SHM_HASH(fnv), sm_link) {
 		if (entry->sm_fnv == fnv && strcmp(entry->sm_path, path) == 0)
 			break;
 	}
 	if (entry == NULL)
 		return (ENOENT);
 
 #ifdef MAC
 	error = mac_posixshm_check_unlink(ucred, entry->sm_shmfd);
 	if (error != 0)
 		return (error);
 #endif
 	error = shm_access(entry->sm_shmfd, ucred, FREAD | FWRITE);
 	if (error != 0)
 		return (error);
 
 	entry->sm_shmfd->shm_path = NULL;
 	LIST_REMOVE(entry, sm_link);
 	shm_drop(entry->sm_shmfd);
 	free(entry->sm_path, M_SHMFD);
 	free(entry, M_SHMFD);
 	return (0);
 }
 
 /*
  * Kernel side of shm_open(2).
  *
  * Allocates a file descriptor and binds it to either a new anonymous
  * object (userpath == SHM_ANON) or the named object found or created in
  * the path dictionary.  For jailed callers the prison path is prefixed
  * to the user-supplied path.  On success the descriptor number is placed
  * in td_retval[0] and 0 is returned; otherwise an errno value is
  * returned and the descriptor is closed again.
  *
  * Ownership of "path": it is handed to the dictionary by shm_insert()
  * when a new object is created and must be freed on every other route.
  */
 int
 kern_shm_open(struct thread *td, const char *userpath, int flags, mode_t mode,
     struct filecaps *fcaps)
 {
 	struct filedesc *fdp;
 	struct shmfd *shmfd;
 	struct file *fp;
 	char *path;
 	const char *pr_path;
 	size_t pr_pathlen;
 	Fnv32_t fnv;
 	mode_t cmode;
 	int fd, error;
 
 #ifdef CAPABILITY_MODE
 	/*
 	 * shm_open(2) is only allowed for anonymous objects.
 	 */
 	if (IN_CAPABILITY_MODE(td) && (userpath != SHM_ANON))
 		return (ECAPMODE);
 #endif
 
 	AUDIT_ARG_FFLAGS(flags);
 	AUDIT_ARG_MODE(mode);
 
 	if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR)
 		return (EINVAL);
 
 	if ((flags & ~(O_ACCMODE | O_CREAT | O_EXCL | O_TRUNC | O_CLOEXEC)) != 0)
 		return (EINVAL);
 
 	/* Apply the process umask to the requested permission bits. */
 	fdp = td->td_proc->p_fd;
 	cmode = (mode & ~fdp->fd_cmask) & ACCESSPERMS;
 
 	error = falloc_caps(td, &fp, &fd, O_CLOEXEC, fcaps);
 	if (error)
 		return (error);
 
 	/* A SHM_ANON path pointer creates an anonymous object. */
 	if (userpath == SHM_ANON) {
 		/* A read-only anonymous object is pointless. */
 		if ((flags & O_ACCMODE) == O_RDONLY) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			return (EINVAL);
 		}
 		shmfd = shm_alloc(td->td_ucred, cmode);
 	} else {
 		path = malloc(MAXPATHLEN, M_SHMFD, M_WAITOK);
 		pr_path = td->td_ucred->cr_prison->pr_path;
 
 		/* Construct a full pathname for jailed callers. */
 		pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
 		    : strlcpy(path, pr_path, MAXPATHLEN);
 		error = copyinstr(userpath, path + pr_pathlen,
 		    MAXPATHLEN - pr_pathlen, NULL);
 #ifdef KTRACE
 		if (error == 0 && KTRPOINT(curthread, KTR_NAMEI))
 			ktrnamei(path);
 #endif
 		/* Require paths to start with a '/' character. */
 		if (error == 0 && path[pr_pathlen] != '/')
 			error = EINVAL;
 		if (error) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			free(path, M_SHMFD);
 			return (error);
 		}
 
 		AUDIT_ARG_UPATH1_CANON(path);
 		fnv = fnv_32_str(path, FNV1_32_INIT);
 		sx_xlock(&shm_dict_lock);
 		shmfd = shm_lookup(path, fnv);
 		if (shmfd == NULL) {
 			/* Object does not yet exist, create it if requested. */
 			if (flags & O_CREAT) {
 #ifdef MAC
 				error = mac_posixshm_check_create(td->td_ucred,
 				    path);
 				if (error == 0) {
 #endif
 					shmfd = shm_alloc(td->td_ucred, cmode);
 					shm_insert(path, fnv, shmfd);
 #ifdef MAC
 				} else {
 					/*
 					 * The dictionary never took
 					 * ownership of the path, so it
 					 * has to be released here.
 					 */
 					free(path, M_SHMFD);
 				}
 #endif
 			} else {
 				free(path, M_SHMFD);
 				error = ENOENT;
 			}
 		} else {
 			/*
 			 * Object already exists, obtain a new
 			 * reference if requested and permitted.
 			 */
 			free(path, M_SHMFD);
 			if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
 				error = EEXIST;
 			else {
 #ifdef MAC
 				error = mac_posixshm_check_open(td->td_ucred,
 				    shmfd, FFLAGS(flags & O_ACCMODE));
 				if (error == 0)
 #endif
 				error = shm_access(shmfd, td->td_ucred,
 				    FFLAGS(flags & O_ACCMODE));
 			}
 
 			/*
 			 * Truncate the file back to zero length if
 			 * O_TRUNC was specified and the object was
 			 * opened with read/write.
 			 *
 			 * NOTE(review): the shm_dotruncate() return value
 			 * is not propagated here; confirm whether a failed
 			 * truncate should fail the open.
 			 */
 			if (error == 0 &&
 			    (flags & (O_ACCMODE | O_TRUNC)) ==
 			    (O_RDWR | O_TRUNC)) {
 #ifdef MAC
 				error = mac_posixshm_check_truncate(
 					td->td_ucred, fp->f_cred, shmfd);
 				if (error == 0)
 #endif
 					shm_dotruncate(shmfd, 0);
 			}
 			/* This reference belongs to the new descriptor. */
 			if (error == 0)
 				shm_hold(shmfd);
 		}
 		sx_xunlock(&shm_dict_lock);
 
 		if (error) {
 			fdclose(td, fp, fd);
 			fdrop(fp, td);
 			return (error);
 		}
 	}
 
 	finit(fp, FFLAGS(flags & O_ACCMODE), DTYPE_SHM, shmfd, &shm_ops);
 
 	td->td_retval[0] = fd;
 	fdrop(fp, td);
 
 	return (0);
 }
 
 /* System calls. */
 /*
  * shm_open(2) system call entry: forwards the user arguments to
  * kern_shm_open() with no file capabilities (fcaps == NULL).
  */
 int
 sys_shm_open(struct thread *td, struct shm_open_args *uap)
 {
 
 	return (kern_shm_open(td, uap->path, uap->flags, uap->mode, NULL));
 }
 
 int
 sys_shm_unlink(struct thread *td, struct shm_unlink_args *uap)
 {
 	char *path;
 	const char *pr_path;
 	size_t pr_pathlen;
 	Fnv32_t fnv;
 	int error;
 
 	path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	pr_path = td->td_ucred->cr_prison->pr_path;
 	pr_pathlen = strcmp(pr_path, "/") == 0 ? 0
 	    : strlcpy(path, pr_path, MAXPATHLEN);
 	error = copyinstr(uap->path, path + pr_pathlen, MAXPATHLEN - pr_pathlen,
 	    NULL);
 	if (error) {
 		free(path, M_TEMP);
 		return (error);
 	}
 #ifdef KTRACE
 	if (KTRPOINT(curthread, KTR_NAMEI))
 		ktrnamei(path);
 #endif
 	AUDIT_ARG_UPATH1_CANON(path);
 	fnv = fnv_32_str(path, FNV1_32_INIT);
 	sx_xlock(&shm_dict_lock);
 	error = shm_remove(path, fnv, td->td_ucred);
 	sx_xunlock(&shm_dict_lock);
 	free(path, M_TEMP);
 
 	return (error);
 }
 
 /*
  * mmap handler.  Derives the maximum protection from the descriptor's
  * open mode (further limited by cap_maxprot), rejects shared writable
  * mappings of read-only descriptors with EACCES and out-of-range
  * offset/size combinations with EINVAL, updates the access time, and
  * maps the backing object with vm_mmap_object().  The object reference
  * taken for the mapping is dropped again if the mapping fails.
  *
  * The definition carries "static" to match the "static fo_mmap_t
  * shm_mmap;" forward declaration earlier in this file.
  */
 static int
 shm_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t objsize,
     vm_prot_t prot, vm_prot_t cap_maxprot, int flags,
     vm_ooffset_t foff, struct thread *td)
 {
 	struct shmfd *shmfd;
 	vm_prot_t maxprot;
 	int error;
 
 	shmfd = fp->f_data;
 	maxprot = VM_PROT_NONE;
 
 	/* FREAD should always be set. */
 	if ((fp->f_flag & FREAD) != 0)
 		maxprot |= VM_PROT_EXECUTE | VM_PROT_READ;
 	if ((fp->f_flag & FWRITE) != 0)
 		maxprot |= VM_PROT_WRITE;
 
 	/* Don't permit shared writable mappings on read-only descriptors. */
 	if ((flags & MAP_SHARED) != 0 &&
 	    (maxprot & VM_PROT_WRITE) == 0 &&
 	    (prot & VM_PROT_WRITE) != 0)
 		return (EACCES);
 	maxprot &= cap_maxprot;
 
 	/* See comment in vn_mmap(). */
 	if (
 #ifdef _LP64
 	    objsize > OFF_MAX ||
 #endif
 	    foff < 0 || foff > OFF_MAX - objsize)
 		return (EINVAL);
 
 #ifdef MAC
 	error = mac_posixshm_check_mmap(td->td_ucred, shmfd, prot, flags);
 	if (error != 0)
 		return (error);
 #endif
 
 	mtx_lock(&shm_timestamp_lock);
 	vfs_timestamp(&shmfd->shm_atime);
 	mtx_unlock(&shm_timestamp_lock);
 	/* Reference for the mapping; released below if mapping fails. */
 	vm_object_reference(shmfd->shm_object);
 
 	error = vm_mmap_object(map, addr, objsize, prot, maxprot, flags,
 	    shmfd->shm_object, foff, FALSE, td);
 	if (error != 0)
 		vm_object_deallocate(shmfd->shm_object);
 	return (error);
 }
 
 /*
  * fchmod handler.  Under shm_timestamp_lock, runs the MAC setmode check
  * (when compiled in) and a VADMIN vaccess() check; if both pass, the
  * mode is replaced with the ACCESSPERMS bits of "mode".
  */
 static int
 shm_chmod(struct file *fp, mode_t mode, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 	int error;
 
 	shmfd = fp->f_data;
 	mtx_lock(&shm_timestamp_lock);
 #ifdef MAC
 	error = mac_posixshm_check_setmode(active_cred, shmfd, mode);
 	if (error == 0)
 #endif
 		error = vaccess(VREG, shmfd->shm_mode, shmfd->shm_uid,
 		    shmfd->shm_gid, VADMIN, active_cred, NULL);
 	/*
 	 * SUSv4 says that x bits of permission need not be affected.
 	 * Be consistent with our shm_open there.
 	 */
 	if (error == 0)
 		shmfd->shm_mode = mode & ACCESSPERMS;
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 /*
  * fchown handler.  An id of -1 keeps the current value.  Changing the
  * uid to something other than the caller's own uid, or the gid to a
  * group the caller is not a member of, requires PRIV_VFS_CHOWN.  The
  * whole update happens under shm_timestamp_lock.
  */
 static int
 shm_chown(struct file *fp, uid_t uid, gid_t gid, struct ucred *active_cred,
     struct thread *td)
 {
 	struct shmfd *shmfd;
 	int error;
 
 	shmfd = fp->f_data;
 	error = 0;
 	mtx_lock(&shm_timestamp_lock);
 #ifdef MAC
 	error = mac_posixshm_check_setowner(active_cred, shmfd, uid, gid);
 #endif
 	if (error == 0) {
 		if (uid == (uid_t)-1)
 			uid = shmfd->shm_uid;
 		if (gid == (gid_t)-1)
 			gid = shmfd->shm_gid;
 		if ((uid != shmfd->shm_uid && uid != active_cred->cr_uid) ||
 		    (gid != shmfd->shm_gid && !groupmember(gid, active_cred)))
 			error = priv_check_cred(active_cred, PRIV_VFS_CHOWN);
 		if (error == 0) {
 			shmfd->shm_uid = uid;
 			shmfd->shm_gid = gid;
 		}
 	}
 	mtx_unlock(&shm_timestamp_lock);
 	return (error);
 }
 
 /*
  * Helper routines to allow the backing object of a shared memory file
  * descriptor to be mapped in the kernel.
  */
 int
 shm_map(struct file *fp, size_t size, off_t offset, void **memp)
 {
 	struct shmfd *shmfd;
 	vm_offset_t kva, ofs;
 	vm_object_t obj;
 	int rv;
 
 	if (fp->f_type != DTYPE_SHM)
 		return (EINVAL);
 	shmfd = fp->f_data;
 	obj = shmfd->shm_object;
 	VM_OBJECT_WLOCK(obj);
 	/*
 	 * XXXRW: This validation is probably insufficient, and subject to
 	 * sign errors.  It should be fixed.
 	 */
 	if (offset >= shmfd->shm_size ||
 	    offset + size > round_page(shmfd->shm_size)) {
 		VM_OBJECT_WUNLOCK(obj);
 		return (EINVAL);
 	}
 
 	shmfd->shm_kmappings++;
 	vm_object_reference_locked(obj);
 	VM_OBJECT_WUNLOCK(obj);
 
 	/* Map the object into the kernel_map and wire it. */
 	kva = vm_map_min(kernel_map);
 	ofs = offset & PAGE_MASK;
 	offset = trunc_page(offset);
 	size = round_page(size + ofs);
 	rv = vm_map_find(kernel_map, obj, offset, &kva, size, 0,
 	    VMFS_OPTIMAL_SPACE, VM_PROT_READ | VM_PROT_WRITE,
 	    VM_PROT_READ | VM_PROT_WRITE, 0);
 	if (rv == KERN_SUCCESS) {
 		rv = vm_map_wire(kernel_map, kva, kva + size,
 		    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
 		if (rv == KERN_SUCCESS) {
 			*memp = (void *)(kva + ofs);
 			return (0);
 		}
 		vm_map_remove(kernel_map, kva, kva + size);
 	} else
 		vm_object_deallocate(obj);
 
 	/* On failure, drop our mapping reference. */
 	VM_OBJECT_WLOCK(obj);
 	shmfd->shm_kmappings--;
 	VM_OBJECT_WUNLOCK(obj);
 
 	return (vm_mmap_to_errno(rv));
 }
 
 /*
  * We require the caller to unmap the entire entry.  This allows us to
  * safely decrement shm_kmappings when a mapping is removed.
  */
 int
 shm_unmap(struct file *fp, void *mem, size_t size)
 {
 	struct shmfd *shmfd;
 	vm_map_entry_t entry;
 	vm_offset_t kva, ofs;
 	vm_object_t obj;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	boolean_t wired;
 	vm_map_t map;
 	int rv;
 
 	if (fp->f_type != DTYPE_SHM)
 		return (EINVAL);
 	shmfd = fp->f_data;
 	kva = (vm_offset_t)mem;
 	ofs = kva & PAGE_MASK;
 	kva = trunc_page(kva);
 	size = round_page(size + ofs);
 	map = kernel_map;
 	rv = vm_map_lookup(&map, kva, VM_PROT_READ | VM_PROT_WRITE, &entry,
 	    &obj, &pindex, &prot, &wired);
 	if (rv != KERN_SUCCESS)
 		return (EINVAL);
 	if (entry->start != kva || entry->end != kva + size) {
 		vm_map_lookup_done(map, entry);
 		return (EINVAL);
 	}
 	vm_map_lookup_done(map, entry);
 	if (obj != shmfd->shm_object)
 		return (EINVAL);
 	vm_map_remove(map, kva, kva + size);
 	VM_OBJECT_WLOCK(obj);
 	KASSERT(shmfd->shm_kmappings > 0, ("shm_unmap: object not mapped"));
 	shmfd->shm_kmappings--;
 	VM_OBJECT_WUNLOCK(obj);
 	return (0);
 }
 
 static int
 shm_fill_kinfo_locked(struct shmfd *shmfd, struct kinfo_file *kif, bool list)
 {
 	const char *path, *pr_path;
 	size_t pr_pathlen;
 	bool visible;
 
 	sx_assert(&shm_dict_lock, SA_LOCKED);
 	kif->kf_type = KF_TYPE_SHM;
 	kif->kf_un.kf_file.kf_file_mode = S_IFREG | shmfd->shm_mode;
 	kif->kf_un.kf_file.kf_file_size = shmfd->shm_size;
 	if (shmfd->shm_path != NULL) {
 		if (shmfd->shm_path != NULL) {
 			path = shmfd->shm_path;
 			pr_path = curthread->td_ucred->cr_prison->pr_path;
 			if (strcmp(pr_path, "/") != 0) {
 				/* Return the jail-rooted pathname. */
 				pr_pathlen = strlen(pr_path);
 				visible = strncmp(path, pr_path, pr_pathlen)
 				    == 0 && path[pr_pathlen] == '/';
 				if (list && !visible)
 					return (EPERM);
 				if (visible)
 					path += pr_pathlen;
 			}
 			strlcpy(kif->kf_path, path, sizeof(kif->kf_path));
 		}
 	}
 	return (0);
 }
 
 static int
 shm_fill_kinfo(struct file *fp, struct kinfo_file *kif,
     struct filedesc *fdp __unused)
 {
 	int res;
 
 	sx_slock(&shm_dict_lock);
 	res = shm_fill_kinfo_locked(fp->f_data, kif, false);
 	sx_sunlock(&shm_dict_lock);
 	return (res);
 }
 
 static int
 sysctl_posix_shm_list(SYSCTL_HANDLER_ARGS)
 {
 	struct shm_mapping *shmm;
 	struct sbuf sb;
 	struct kinfo_file kif;
 	u_long i;
 	ssize_t curlen;
 	int error, error2;
 
 	sbuf_new_for_sysctl(&sb, NULL, sizeof(struct kinfo_file) * 5, req);
 	sbuf_clear_flags(&sb, SBUF_INCLUDENUL);
 	curlen = 0;
 	error = 0;
 	sx_slock(&shm_dict_lock);
 	for (i = 0; i < shm_hash + 1; i++) {
 		LIST_FOREACH(shmm, &shm_dictionary[i], sm_link) {
 			error = shm_fill_kinfo_locked(shmm->sm_shmfd,
 			    &kif, true);
 			if (error == EPERM)
 				continue;
 			if (error != 0)
 				break;
 			pack_kinfo(&kif);
 			if (req->oldptr != NULL &&
 			    kif.kf_structsize + curlen > req->oldlen)
 				break;
 			error = sbuf_bcat(&sb, &kif, kif.kf_structsize) == 0 ?
 			    0 : ENOMEM;
 			if (error != 0)
 				break;
 			curlen += kif.kf_structsize;
 		}
 	}
 	sx_sunlock(&shm_dict_lock);
 	error2 = sbuf_finish(&sb);
 	sbuf_delete(&sb);
 	return (error != 0 ? error : error2);
 }
 
 SYSCTL_PROC(_kern_ipc, OID_AUTO, posix_shm_list,
     CTLFLAG_RD | CTLFLAG_MPSAFE | CTLTYPE_OPAQUE,
     NULL, 0, sysctl_posix_shm_list, "",
     "POSIX SHM list");
Index: projects/fuse2/sys/mips/broadcom/bhnd_nexus.c
===================================================================
--- projects/fuse2/sys/mips/broadcom/bhnd_nexus.c	(revision 350434)
+++ projects/fuse2/sys/mips/broadcom/bhnd_nexus.c	(revision 350435)
@@ -1,281 +1,282 @@
 /*-
  * Copyright (c) 2015-2016 Landon Fuller <landon@freebsd.org>
  * Copyright (c) 2017 The FreeBSD Foundation
  * All rights reserved.
  *
  * Portions of this software were developed by Landon Fuller
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    similar to the "NO WARRANTY" disclaimer below ("Disclaimer") and any
  *    redistribution must be conditioned upon including a substantially
  *    similar Disclaimer requirement for further binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF NONINFRINGEMENT, MERCHANTIBILITY
  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
  * THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY,
  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
  * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGES.
  * 
  * $FreeBSD$
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * bhnd(4) driver mix-in providing shared common methods for
  * bhnd bus devices attached via a MIPS root nexus.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/intr.h>
+#include <sys/limits.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/rman.h>
 #include <sys/malloc.h>
 
 #include <machine/bus.h>
 
 #include <dev/bhnd/bhndvar.h>
 #include <dev/bhnd/bhnd_ids.h>
 
 #include <dev/bhnd/cores/chipc/chipcreg.h>
 
 #include "bcm_machdep.h"
 #include "bcm_mipsvar.h"
 
 #include "bhnd_nexusvar.h"
 
 
 /**
  * Default bhnd_nexus implementation of BHND_BUS_GET_SERVICE_REGISTRY().
  */
 static struct bhnd_service_registry *
 bhnd_nexus_get_service_registry(device_t dev, device_t child)
 {
 	struct bcm_platform *bp = bcm_get_platform();
 	return (&bp->services);
 }
 
 /**
  * Default bhnd_nexus implementation of BHND_BUS_ACTIVATE_RESOURCE().
  */
 static int
 bhnd_nexus_activate_resource(device_t dev, device_t child, int type, int rid,
     struct bhnd_resource *r)
 {
 	int error;
 
 	/* Always direct */
 	if ((error = bus_activate_resource(child, type, rid, r->res)))
 		return (error);
 
 	r->direct = true;
 	return (0);
 }
 
 /**
  * Default bhnd_nexus implementation of BHND_BUS_DEACTIVATE_RESOURCE().
  */
 static int
 bhnd_nexus_deactivate_resource(device_t dev, device_t child,
     int type, int rid, struct bhnd_resource *r)
 {
 	int error;
 
 	/* Always direct */
 	KASSERT(r->direct, ("indirect resource delegated to bhnd_nexus\n"));
 
 	if ((error = bus_deactivate_resource(child, type, rid, r->res)))
 		return (error);
 
 	r->direct = false;
 	return (0);
 }
 
 /**
  * Default bhnd_nexus implementation of BHND_BUS_IS_HW_DISABLED().
  */
 static bool
 bhnd_nexus_is_hw_disabled(device_t dev, device_t child)
 {
 	struct bcm_platform	*bp;
 	struct bhnd_chipid	*cid;
 
 	bp = bcm_get_platform();
 	cid = &bp->cid;
 
 	/* The BCM4706 low-cost package leaves secondary GMAC cores
 	 * floating */
 	if (cid->chip_id == BHND_CHIPID_BCM4706 &&
 	    cid->chip_pkg == BHND_PKGID_BCM4706L &&
 	    bhnd_get_device(child) == BHND_COREID_4706_GMAC &&
 	    bhnd_get_core_unit(child) != 0)
 	{
 		return (true);
 	}
 
 	return (false);
 }
 
 /**
  * Default bhnd_nexus implementation of BHND_BUS_AGET_ATTACH_TYPE().
  */
 static bhnd_attach_type
 bhnd_nexus_get_attach_type(device_t dev, device_t child)
 {
 	return (BHND_ATTACH_NATIVE);
 }
 
 /**
  * Default bhnd_nexus implementation of BHND_BUS_GET_CHIPID().
  */
 static const struct bhnd_chipid *
 bhnd_nexus_get_chipid(device_t dev, device_t child)
 {
 	return (&bcm_get_platform()->cid);
 }
 
 /**
  * Default bhnd_nexus implementation of BHND_BUS_READ_BOARD_INFO().
  */
 static int
 bhnd_nexus_read_board_info(device_t dev, device_t child,
     struct bhnd_board_info *info)
 {
 	int error;
 
 	/* Initialize with NVRAM-derived values */
 	if ((error = bhnd_bus_generic_read_board_info(dev, child, info)))
 		return (error);
 
 	/* The board vendor should default to PCI_VENDOR_BROADCOM if not
 	 * otherwise specified */
 	if (info->board_vendor == 0)
 		info->board_vendor = PCI_VENDOR_BROADCOM;
 
 	return (0);
 }
 
 /**
  * Default bhnd_nexus implementation of BHND_BUS_MAP_INTR().
  */
 static int
 bhnd_nexus_map_intr(device_t dev, device_t child, u_int intr, rman_res_t *irq)
 {
 	struct bcm_mips_intr_map_data	*imd;
 	u_int				 ivec;
 	uintptr_t			 xref;
 	int				 error;
 
 	/* Fetch the backplane interrupt vector */
 	if ((error = bhnd_get_intr_ivec(child, intr, &ivec))) {
 		device_printf(dev, "error fetching ivec for intr %u: %d\n",
 		    intr, error);
 		return (error);
 	}
 
 	/* Determine our interrupt domain */
 	xref = BHND_BUS_GET_INTR_DOMAIN(dev, child, false);
 	KASSERT(xref != 0, ("missing interrupt domain"));
 
 	/* Allocate our map data */
 	imd = (struct bcm_mips_intr_map_data *)intr_alloc_map_data(
 	    INTR_MAP_DATA_BCM_MIPS, sizeof(*imd), M_WAITOK | M_ZERO);
 	imd->ivec = ivec;
 
 	/* Map the IRQ */
 	*irq = intr_map_irq(NULL, xref, &imd->mdata);
 	return (0);
 }
 
 /**
  * Default bhnd_nexus implementation of BHND_BUS_UNMAP_INTR().
  */
 static void
 bhnd_nexus_unmap_intr(device_t dev, device_t child, rman_res_t irq)
 {
 	if (irq > UINT_MAX)
 		panic("invalid irq: %ju", (uintmax_t)irq);
 
 	intr_unmap_irq(irq);
 }
 
 /**
  * Default bhnd_nexus implementation of BHND_BUS_GET_DMA_TRANSLATION().
  */
 static int
 bhnd_nexus_get_dma_translation(device_t dev, device_t child,
     u_int width, uint32_t flags, bus_dma_tag_t *dmat,
     struct bhnd_dma_translation *translation)
 {
 	struct bcm_platform *bp = bcm_get_platform();
 
 	/* We don't (currently) support any flags */
 	if (flags != 0x0)
 		return (ENOENT);
 
 	KASSERT(width > 0 && width <= BHND_DMA_ADDR_64BIT,
 	    ("invalid width %u", width));
 
 	/* Is the requested width supported? */
 	if (width > BHND_DMA_ADDR_32BIT) {
 		/* Backplane must support 64-bit addressing */
 		if (!(bp->cid.chip_caps & BHND_CAP_BP64))
 			width = BHND_DMA_ADDR_32BIT;
 	}
 
 	/* No DMA address translation required */
 	if (dmat != NULL)
 		*dmat = bus_get_dma_tag(dev);
 
 	if (translation != NULL) {
 		*translation = (struct bhnd_dma_translation) {
 			.base_addr	= 0x0,
 			.addr_mask	= BHND_DMA_ADDR_BITMASK(width),
 			.addrext_mask	= 0
 		};
 	}
 
 	return (0);
 }
 
 static device_method_t bhnd_nexus_methods[] = {
 	/* bhnd interface */
 	DEVMETHOD(bhnd_bus_get_service_registry,bhnd_nexus_get_service_registry),
 	DEVMETHOD(bhnd_bus_register_provider,	bhnd_bus_generic_sr_register_provider),
 	DEVMETHOD(bhnd_bus_deregister_provider,	bhnd_bus_generic_sr_deregister_provider),
 	DEVMETHOD(bhnd_bus_retain_provider,	bhnd_bus_generic_sr_retain_provider),
 	DEVMETHOD(bhnd_bus_release_provider,	bhnd_bus_generic_sr_release_provider),
 	DEVMETHOD(bhnd_bus_activate_resource,	bhnd_nexus_activate_resource),
 	DEVMETHOD(bhnd_bus_deactivate_resource, bhnd_nexus_deactivate_resource),
 	DEVMETHOD(bhnd_bus_is_hw_disabled,	bhnd_nexus_is_hw_disabled),
 	DEVMETHOD(bhnd_bus_get_attach_type,	bhnd_nexus_get_attach_type),
 	DEVMETHOD(bhnd_bus_get_chipid,		bhnd_nexus_get_chipid),
 	DEVMETHOD(bhnd_bus_get_dma_translation,	bhnd_nexus_get_dma_translation),
 	DEVMETHOD(bhnd_bus_get_intr_domain,	bhnd_bus_generic_get_intr_domain),
 	DEVMETHOD(bhnd_bus_map_intr,		bhnd_nexus_map_intr),
 	DEVMETHOD(bhnd_bus_read_board_info,	bhnd_nexus_read_board_info),
 	DEVMETHOD(bhnd_bus_unmap_intr,		bhnd_nexus_unmap_intr),
 
 	DEVMETHOD_END
 };
 
 DEFINE_CLASS_0(bhnd, bhnd_nexus_driver, bhnd_nexus_methods,
     sizeof(struct bhnd_softc));
Index: projects/fuse2/sys/netinet/cc/cc_dctcp.c
===================================================================
--- projects/fuse2/sys/netinet/cc/cc_dctcp.c	(revision 350434)
+++ projects/fuse2/sys/netinet/cc/cc_dctcp.c	(revision 350435)
@@ -1,469 +1,471 @@
 /*-
  * Copyright (c) 2007-2008
  *	Swinburne University of Technology, Melbourne, Australia
  * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
  * Copyright (c) 2014 Midori Kato <katoon@sfc.wide.ad.jp>
  * Copyright (c) 2014 The FreeBSD Foundation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * An implementation of the DCTCP algorithm for FreeBSD, based on
  * "Data Center TCP (DCTCP)" by M. Alizadeh, A. Greenberg, D. A. Maltz,
  * J. Padhye, P. Patel, B. Prabhakar, S. Sengupta, and M. Sridharan.,
  * in ACM Conference on SIGCOMM 2010, New York, USA,
  * Originally released as the contribution of Microsoft Research project.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/vnet.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_var.h>
 #include <netinet/cc/cc.h>
 #include <netinet/cc/cc_module.h>
 
-#define MAX_ALPHA_VALUE 1024
-VNET_DEFINE_STATIC(uint32_t, dctcp_alpha) = 0;
+#define DCTCP_SHIFT 10
+#define MAX_ALPHA_VALUE (1<<DCTCP_SHIFT)
+VNET_DEFINE_STATIC(uint32_t, dctcp_alpha) = MAX_ALPHA_VALUE;
 #define V_dctcp_alpha	    VNET(dctcp_alpha)
 VNET_DEFINE_STATIC(uint32_t, dctcp_shift_g) = 4;
 #define	V_dctcp_shift_g	    VNET(dctcp_shift_g)
 VNET_DEFINE_STATIC(uint32_t, dctcp_slowstart) = 0;
 #define	V_dctcp_slowstart   VNET(dctcp_slowstart)
 
 struct dctcp {
-	int     bytes_ecn;	/* # of marked bytes during a RTT */
-	int     bytes_total;	/* # of acked bytes during a RTT */
-	int     alpha;		/* the fraction of marked bytes */
-	int     ce_prev;	/* CE state of the last segment */
-	int     save_sndnxt;	/* end sequence number of the current window */
-	int	ece_curr;	/* ECE flag in this segment */
-	int	ece_prev;	/* ECE flag in the last segment */
-	uint32_t    num_cong_events; /* # of congestion events */
+	uint32_t bytes_ecn;	  /* # of marked bytes during a RTT */
+	uint32_t bytes_total;	  /* # of acked bytes during a RTT */
+	int      alpha;		  /* the fraction of marked bytes */
+	int      ce_prev;	  /* CE state of the last segment */
+	tcp_seq  save_sndnxt;	  /* end sequence number of the current window */
+	int      ece_curr;	  /* ECE flag in this segment */
+	int      ece_prev;	  /* ECE flag in the last segment */
+	uint32_t num_cong_events; /* # of congestion events */
 };
 
 static MALLOC_DEFINE(M_dctcp, "dctcp data",
     "Per connection data required for the dctcp algorithm");
 
 static void	dctcp_ack_received(struct cc_var *ccv, uint16_t type);
 static void	dctcp_after_idle(struct cc_var *ccv);
 static void	dctcp_cb_destroy(struct cc_var *ccv);
 static int	dctcp_cb_init(struct cc_var *ccv);
 static void	dctcp_cong_signal(struct cc_var *ccv, uint32_t type);
 static void	dctcp_conn_init(struct cc_var *ccv);
 static void	dctcp_post_recovery(struct cc_var *ccv);
 static void	dctcp_ecnpkt_handler(struct cc_var *ccv);
 static void	dctcp_update_alpha(struct cc_var *ccv);
 
 struct cc_algo dctcp_cc_algo = {
 	.name = "dctcp",
 	.ack_received = dctcp_ack_received,
 	.cb_destroy = dctcp_cb_destroy,
 	.cb_init = dctcp_cb_init,
 	.cong_signal = dctcp_cong_signal,
 	.conn_init = dctcp_conn_init,
 	.post_recovery = dctcp_post_recovery,
 	.ecnpkt_handler = dctcp_ecnpkt_handler,
 	.after_idle = dctcp_after_idle,
 };
 
 static void
 dctcp_ack_received(struct cc_var *ccv, uint16_t type)
 {
 	struct dctcp *dctcp_data;
 	int bytes_acked = 0;
 
 	dctcp_data = ccv->cc_data;
 
 	if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
 		/*
 		 * DCTCP doesn't treat receipt of ECN marked packet as a
 		 * congestion event. Thus, DCTCP always executes the ACK
 		 * processing out of congestion recovery.
 		 */
 		if (IN_CONGRECOVERY(CCV(ccv, t_flags))) {
 			EXIT_CONGRECOVERY(CCV(ccv, t_flags));
 			newreno_cc_algo.ack_received(ccv, type);
 			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
 		} else
 			newreno_cc_algo.ack_received(ccv, type);
 
 		if (type == CC_DUPACK)
-			bytes_acked = CCV(ccv, t_maxseg);
+			bytes_acked = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg));
 
 		if (type == CC_ACK)
 			bytes_acked = ccv->bytes_this_ack;
 
 		/* Update total bytes. */
 		dctcp_data->bytes_total += bytes_acked;
 
 		/* Update total marked bytes. */
 		if (dctcp_data->ece_curr) {
+			//XXRMS: For fluid-model DCTCP, update
+			//cwnd here for RTT fairness
 			if (!dctcp_data->ece_prev
 			    && bytes_acked > CCV(ccv, t_maxseg)) {
 				dctcp_data->bytes_ecn +=
 				    (bytes_acked - CCV(ccv, t_maxseg));
 			} else
 				dctcp_data->bytes_ecn += bytes_acked;
 			dctcp_data->ece_prev = 1;
 		} else {
 			if (dctcp_data->ece_prev
 			    && bytes_acked > CCV(ccv, t_maxseg))
 				dctcp_data->bytes_ecn += CCV(ccv, t_maxseg);
 			dctcp_data->ece_prev = 0;
 		}
 		dctcp_data->ece_curr = 0;
 
 		/*
 		 * Update the fraction of marked bytes at the end of
 		 * current window size.
 		 */
 		if ((IN_FASTRECOVERY(CCV(ccv, t_flags)) &&
 		    SEQ_GEQ(ccv->curack, CCV(ccv, snd_recover))) ||
 		    (!IN_FASTRECOVERY(CCV(ccv, t_flags)) &&
 		    SEQ_GT(ccv->curack, dctcp_data->save_sndnxt)))
 			dctcp_update_alpha(ccv);
 	} else
 		newreno_cc_algo.ack_received(ccv, type);
 }
 
 static void
 dctcp_after_idle(struct cc_var *ccv)
 {
 	struct dctcp *dctcp_data;
 
-	dctcp_data = ccv->cc_data;
+	if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
+		dctcp_data = ccv->cc_data;
 
-	/* Initialize internal parameters after idle time */
-	dctcp_data->bytes_ecn = 0;
-	dctcp_data->bytes_total = 0;
-	dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
-	dctcp_data->alpha = V_dctcp_alpha;
-	dctcp_data->ece_curr = 0;
-	dctcp_data->ece_prev = 0;
-	dctcp_data->num_cong_events = 0;
+		/* Initialize internal parameters after idle time */
+		dctcp_data->bytes_ecn = 0;
+		dctcp_data->bytes_total = 0;
+		dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
+		dctcp_data->alpha = V_dctcp_alpha;
+		dctcp_data->ece_curr = 0;
+		dctcp_data->ece_prev = 0;
+		dctcp_data->num_cong_events = 0;
+	}
 
-	dctcp_cc_algo.after_idle = newreno_cc_algo.after_idle;
+	newreno_cc_algo.after_idle(ccv);
 }
 
 static void
 dctcp_cb_destroy(struct cc_var *ccv)
 {
 	free(ccv->cc_data, M_dctcp);
 }
 
 static int
 dctcp_cb_init(struct cc_var *ccv)
 {
 	struct dctcp *dctcp_data;
 
 	dctcp_data = malloc(sizeof(struct dctcp), M_dctcp, M_NOWAIT|M_ZERO);
 
 	if (dctcp_data == NULL)
 		return (ENOMEM);
 
 	/* Initialize some key variables with sensible defaults. */
 	dctcp_data->bytes_ecn = 0;
 	dctcp_data->bytes_total = 0;
 	/*
 	 * When alpha is set to 0 in the beginning, DCTCP sender transfers as
 	 * much data as possible until the value converges which may expand the
 	 * queueing delay at the switch. When alpha is set to 1, queueing delay
 	 * is kept small.
 	 * Throughput-sensitive applications should have alpha = 0
 	 * Latency-sensitive applications should have alpha = 1
 	 *
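-	 * Note: DCTCP draft suggests initial alpha to be 1 but we've decided to
-	 * keep it 0 as default.
+	 * Note: DCTCP draft suggests initial alpha to be 1, so V_dctcp_alpha
+	 * defaults to MAX_ALPHA_VALUE (i.e. 1); it can be changed via sysctl.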
 	 */
 	dctcp_data->alpha = V_dctcp_alpha;
 	dctcp_data->save_sndnxt = 0;
 	dctcp_data->ce_prev = 0;
 	dctcp_data->ece_curr = 0;
 	dctcp_data->ece_prev = 0;
 	dctcp_data->num_cong_events = 0;
 
 	ccv->cc_data = dctcp_data;
 	return (0);
 }
 
 /*
  * Perform any necessary tasks before we enter congestion recovery.
  */
 static void
 dctcp_cong_signal(struct cc_var *ccv, uint32_t type)
 {
 	struct dctcp *dctcp_data;
-	u_int win, mss;
+	u_int cwin, mss;
 
-	dctcp_data = ccv->cc_data;
-	win = CCV(ccv, snd_cwnd);
-	mss = CCV(ccv, t_maxseg);
+	if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
+		dctcp_data = ccv->cc_data;
+		cwin = CCV(ccv, snd_cwnd);
+		mss = CCV(ccv, t_maxseg);
 
-	switch (type) {
-	case CC_NDUPACK:
-		if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
+		switch (type) {
+		case CC_NDUPACK:
+			if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
+				if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
+					CCV(ccv, snd_ssthresh) =
+					    max(cwin / 2, 2 * mss);
+					dctcp_data->num_cong_events++;
+				} else {
+					/* cwnd has already updated as congestion
+					 * recovery. Reverse cwnd value using
+					 * snd_cwnd_prev and recalculate snd_ssthresh
+					 */
+					cwin = CCV(ccv, snd_cwnd_prev);
+					CCV(ccv, snd_ssthresh) =
+					    max(cwin / 2, 2 * mss);
+				}
+				ENTER_RECOVERY(CCV(ccv, t_flags));
+			}
+			break;
+		case CC_ECN:
+			/*
+			 * Save current snd_cwnd when the host encounters both
+			 * congestion recovery and fast recovery.
+			 */
+			CCV(ccv, snd_cwnd_prev) = cwin;
 			if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
-				CCV(ccv, snd_ssthresh) = mss *
-				    max(win / 2 / mss, 2);
-				dctcp_data->num_cong_events++;
-			} else {
-				/* cwnd has already updated as congestion
-				 * recovery. Reverse cwnd value using
-				 * snd_cwnd_prev and recalculate snd_ssthresh
-				 */
-				win = CCV(ccv, snd_cwnd_prev);
-				CCV(ccv, snd_ssthresh) =
-				    max(win / 2 / mss, 2) * mss;
+				if (V_dctcp_slowstart &&
+				    dctcp_data->num_cong_events++ == 0) {
+					CCV(ccv, snd_ssthresh) =
+					    max(cwin / 2, 2 * mss);
+					dctcp_data->alpha = MAX_ALPHA_VALUE;
+					dctcp_data->bytes_ecn = 0;
+					dctcp_data->bytes_total = 0;
+					dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
+				} else
+					CCV(ccv, snd_ssthresh) = 
+					    max((cwin - (((uint64_t)cwin *
+					    dctcp_data->alpha) >> (DCTCP_SHIFT+1))), 
+					    2 * mss);
+				CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
+				ENTER_CONGRECOVERY(CCV(ccv, t_flags));
 			}
-			ENTER_RECOVERY(CCV(ccv, t_flags));
-		}
-		break;
-	case CC_ECN:
-		/*
-		 * Save current snd_cwnd when the host encounters both
-		 * congestion recovery and fast recovery.
-		 */
-		CCV(ccv, snd_cwnd_prev) = win;
-		if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
-			if (V_dctcp_slowstart &&
-			    dctcp_data->num_cong_events++ == 0) {
-				CCV(ccv, snd_ssthresh) =
-				    mss * max(win / 2 / mss, 2);
-				dctcp_data->alpha = MAX_ALPHA_VALUE;
-				dctcp_data->bytes_ecn = 0;
-				dctcp_data->bytes_total = 0;
-				dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
-			} else
-				CCV(ccv, snd_ssthresh) = max((win - ((win *
-				    dctcp_data->alpha) >> 11)) / mss, 2) * mss;
-			CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
-			ENTER_CONGRECOVERY(CCV(ccv, t_flags));
-		}
-		dctcp_data->ece_curr = 1;
-		break;
-	case CC_RTO:
-		if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
+			dctcp_data->ece_curr = 1;
+			break;
+		case CC_RTO:
 			CCV(ccv, t_flags) |= TF_ECN_SND_CWR;
 			dctcp_update_alpha(ccv);
 			dctcp_data->save_sndnxt += CCV(ccv, t_maxseg);
 			dctcp_data->num_cong_events++;
+			break;
 		}
-		break;
-	}
+	} else
+		newreno_cc_algo.cong_signal(ccv, type);
 }
 
 static void
 dctcp_conn_init(struct cc_var *ccv)
 {
 	struct dctcp *dctcp_data;
 
 	dctcp_data = ccv->cc_data;
 
 	if (CCV(ccv, t_flags) & TF_ECN_PERMIT)
 		dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
 }
 
 /*
  * Perform any necessary tasks before we exit congestion recovery.
  */
 static void
 dctcp_post_recovery(struct cc_var *ccv)
 {
-	dctcp_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
+	newreno_cc_algo.post_recovery(ccv);
 
 	if (CCV(ccv, t_flags) & TF_ECN_PERMIT)
 		dctcp_update_alpha(ccv);
 }
 
 /*
  * Execute an additional ECN processing using ECN field in IP header and the CWR
  * bit in TCP header.
  *
  * delay_ack == 0 - Delayed ACK disabled
  * delay_ack == 1 - Delayed ACK enabled
  */
 
 static void
 dctcp_ecnpkt_handler(struct cc_var *ccv)
 {
 	struct dctcp *dctcp_data;
 	uint32_t ccflag;
 	int delay_ack;
 
 	dctcp_data = ccv->cc_data;
 	ccflag = ccv->flags;
 	delay_ack = 1;
 
 	/*
-	 * DCTCP responses an ACK immediately when the CE state
-	 * in between this segment and the last segment is not same.
+	 * DCTCP responds with an ACK immediately when the CE state
+	 * in between this segment and the last segment has changed.
 	 */
 	if (ccflag & CCF_IPHDR_CE) {
 		if (!dctcp_data->ce_prev && (ccflag & CCF_DELACK))
 			delay_ack = 0;
 		dctcp_data->ce_prev = 1;
 		CCV(ccv, t_flags) |= TF_ECN_SND_ECE;
 	} else {
 		if (dctcp_data->ce_prev && (ccflag & CCF_DELACK))
 			delay_ack = 0;
 		dctcp_data->ce_prev = 0;
 		CCV(ccv, t_flags) &= ~TF_ECN_SND_ECE;
 	}
 
 	/* DCTCP sets delayed ack when this segment sets the CWR flag. */
 	if ((ccflag & CCF_DELACK) && (ccflag & CCF_TCPHDR_CWR))
 		delay_ack = 1;
 
 	if (delay_ack == 0)
 		ccv->flags |= CCF_ACKNOW;
-	else
-		ccv->flags &= ~CCF_ACKNOW;
 }
 
 /*
  * Update the fraction of marked bytes represented as 'alpha'.
  * Also initialize several internal parameters at the end of this function.
  */
 static void
 dctcp_update_alpha(struct cc_var *ccv)
 {
 	struct dctcp *dctcp_data;
 	int alpha_prev;
 
 	dctcp_data = ccv->cc_data;
 	alpha_prev = dctcp_data->alpha;
 	dctcp_data->bytes_total = max(dctcp_data->bytes_total, 1);
 
 	/*
-	 * Update alpha: alpha = (1 - g) * alpha + g * F.
+	 * Update alpha: alpha = (1 - g) * alpha + g * M.
 	 * Here:
 	 * g is weight factor
 	 *	recommaded to be set to 1/16
 	 *	small g = slow convergence between competitive DCTCP flows
 	 *	large g = impacts low utilization of bandwidth at switches
-	 * F is fraction of marked segments in last RTT
+	 * M is fraction of marked segments in last RTT
 	 *	updated every RTT
 	 * Alpha must be round to 0 - MAX_ALPHA_VALUE.
 	 */
-	dctcp_data->alpha = min(alpha_prev - (alpha_prev >> V_dctcp_shift_g) +
-	    (dctcp_data->bytes_ecn << (10 - V_dctcp_shift_g)) /
+	dctcp_data->alpha = ulmin(alpha_prev - (alpha_prev >> V_dctcp_shift_g) +
+	    ((uint64_t)dctcp_data->bytes_ecn << (DCTCP_SHIFT - V_dctcp_shift_g)) /
 	    dctcp_data->bytes_total, MAX_ALPHA_VALUE);
 
 	/* Initialize internal parameters for next alpha calculation */
 	dctcp_data->bytes_ecn = 0;
 	dctcp_data->bytes_total = 0;
 	dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
 }
 
 static int
 dctcp_alpha_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t new;
 	int error;
 
 	new = V_dctcp_alpha;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
-		if (new > 1)
+		if (new > MAX_ALPHA_VALUE)
 			error = EINVAL;
-		else {
-			if (new > MAX_ALPHA_VALUE)
-				V_dctcp_alpha = MAX_ALPHA_VALUE;
-			else
-				V_dctcp_alpha = new;
-		}
+		else
+			V_dctcp_alpha = new;
 	}
 
 	return (error);
 }
 
 static int
 dctcp_shift_g_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t new;
 	int error;
 
 	new = V_dctcp_shift_g;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
-		if (new > 1)
+		if (new > DCTCP_SHIFT)
 			error = EINVAL;
 		else
 			V_dctcp_shift_g = new;
 	}
 
 	return (error);
 }
 
 static int
 dctcp_slowstart_handler(SYSCTL_HANDLER_ARGS)
 {
 	uint32_t new;
 	int error;
 
 	new = V_dctcp_slowstart;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr != NULL) {
 		if (new > 1)
 			error = EINVAL;
 		else
 			V_dctcp_slowstart = new;
 	}
 
 	return (error);
 }
 
 SYSCTL_DECL(_net_inet_tcp_cc_dctcp);
 SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, dctcp, CTLFLAG_RW, NULL,
     "dctcp congestion control related settings");
 
 SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, alpha,
     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_alpha), 0,
     &dctcp_alpha_handler,
-    "IU", "dctcp alpha parameter");
+    "IU", "dctcp alpha parameter at start of session");
 
 SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, shift_g,
     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_shift_g), 4,
     &dctcp_shift_g_handler,
     "IU", "dctcp shift parameter");
 
 SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, slowstart,
     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_slowstart), 0,
     &dctcp_slowstart_handler,
     "IU", "half CWND reduction after the first slow start");
 
 DECLARE_CC_MODULE(dctcp, &dctcp_cc_algo);
Index: projects/fuse2/sys/netpfil/ipfw/ip_fw2.c
===================================================================
--- projects/fuse2/sys/netpfil/ipfw/ip_fw2.c	(revision 350434)
+++ projects/fuse2/sys/netpfil/ipfw/ip_fw2.c	(revision 350435)
@@ -1,3529 +1,3536 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * The FreeBSD IP packet firewall, main file
  */
 
 #include "opt_ipfw.h"
 #include "opt_ipdivert.h"
 #include "opt_inet.h"
 #ifndef INET
 #error "IPFIREWALL requires INET"
 #endif /* INET */
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/condvar.h>
 #include <sys/counter.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/jail.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/ucred.h>
 #include <net/ethernet.h> /* for ETHERTYPE_IP */
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/pfil.h>
 #include <net/vnet.h>
 
 #include <netpfil/pf/pf_mtag.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_carp.h>
 #include <netinet/pim.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #include <netinet/sctp_header.h>
 
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet/in_fib.h>
 #ifdef INET6
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/ip6_var.h>
 #endif
 
 #include <net/if_gre.h> /* for struct grehdr */
 
 #include <netpfil/ipfw/ip_fw_private.h>
 
 #include <machine/in_cksum.h>	/* XXX for in_cksum */
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 
 /*
  * static variables followed by global ones.
  * All ipfw global variables are here.
  */
 
 /* Backs the "deny_unknown_exthdrs" sysctl registered under INET6 below. */
 VNET_DEFINE_STATIC(int, fw_deny_unknown_exthdrs);
 #define	V_fw_deny_unknown_exthdrs	VNET(fw_deny_unknown_exthdrs)
 
 /* Backs the "permit_single_frag6" sysctl; enabled (1) by default. */
 VNET_DEFINE_STATIC(int, fw_permit_single_frag6) = 1;
 #define	V_fw_permit_single_frag6	VNET(fw_permit_single_frag6)
 
 /*
  * Compile-time default for the "default_to_accept" knob: 1 when the kernel
  * is built with IPFIREWALL_DEFAULT_TO_ACCEPT, otherwise 0.
  */
 #ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
 static int default_to_accept = 1;
 #else
 static int default_to_accept;
 #endif
 
 /* Back the "autoinc_step" and "one_pass" sysctls respectively. */
 VNET_DEFINE(int, autoinc_step);
 VNET_DEFINE(int, fw_one_pass) = 1;
 
 VNET_DEFINE(unsigned int, fw_tables_max);
 VNET_DEFINE(unsigned int, fw_tables_sets) = 0;	/* Don't use set-aware tables */
 /* Use 128 tables by default */
 static unsigned int default_fw_tables = IPFW_TABLES_DEFAULT;
 
 /*
  * JUMP() resolves a skipto-style target to a position in the rule map.
  * The implementation is selected at build time: jump_fast() unless
  * LINEAR_SKIPTO is defined, in which case jump_linear() is used.
  */
 #ifndef LINEAR_SKIPTO
 static int jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards);
 #define	JUMP(ch, f, num, targ, back)	jump_fast(ch, f, num, targ, back)
 #else
 static int jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards);
 #define	JUMP(ch, f, num, targ, back)	jump_linear(ch, f, num, targ, back)
 #endif
 
 /*
  * Each rule belongs to one of 32 different sets (0..31).
  * The variable set_disable contains one bit per set.
  * If the bit is set, all rules in the corresponding set
  * are disabled. Set RESVD_SET(31) is reserved for the default rule
  * and rules that are not deleted by the flush command,
  * and CANNOT be disabled.
  * Rules in set RESVD_SET can only be deleted individually.
  */
 VNET_DEFINE(u_int32_t, set_disable);
 #define	V_set_disable			VNET(set_disable)
 
 /* Backs the "verbose" sysctl ("Log matches to ipfw rules"). */
 VNET_DEFINE(int, fw_verbose);
 /* counter for ipfw_log(NULL...) */
 VNET_DEFINE(u_int64_t, norule_counter);
 /* Backs the "verbose_limit" sysctl (upper limit of logged matches). */
 VNET_DEFINE(int, verbose_limit);
 
 /* layer3_chain contains the list of rules for layer 3 */
 VNET_DEFINE(struct ip_fw_chain, layer3_chain);
 
 /* ipfw_vnet_ready controls when we are open for business */
 VNET_DEFINE(int, ipfw_vnet_ready) = 0;
 
 VNET_DEFINE(int, ipfw_nat_ready) = 0;
 
 /*
  * NAT entry points, held as function pointers.  Nothing in this part of
  * the file assigns them; presumably a separate NAT module fills them in
  * when it loads -- TODO confirm.
  */
 ipfw_nat_t *ipfw_nat_ptr = NULL;
 struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
 ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
 ipfw_nat_cfg_t *ipfw_nat_del_ptr;
 ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
 ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
 
 #ifdef SYSCTL_NODE
 uint32_t dummy_def = IPFW_DEFAULT_RULE;
 static int sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS);
 static int sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS);
 
 SYSBEGIN(f3)
 
 SYSCTL_NODE(_net_inet_ip, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, one_pass,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_one_pass), 0,
     "Only do a single pass through ipfw when using dummynet(4)");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, autoinc_step,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(autoinc_step), 0,
     "Rule number auto-increment step");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE3, &VNET_NAME(fw_verbose), 0,
     "Log matches to ipfw rules");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, verbose_limit,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(verbose_limit), 0,
     "Set upper limit of matches of ipfw rules logged");
 SYSCTL_UINT(_net_inet_ip_fw, OID_AUTO, default_rule, CTLFLAG_RD,
     &dummy_def, 0,
     "The default/max possible rule number.");
 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_max,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW, 0, 0, sysctl_ipfw_table_num, "IU",
     "Maximum number of concurrently used tables");
 SYSCTL_PROC(_net_inet_ip_fw, OID_AUTO, tables_sets,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW,
     0, 0, sysctl_ipfw_tables_sets, "IU",
     "Use per-set namespace for tables");
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, default_to_accept, CTLFLAG_RDTUN,
     &default_to_accept, 0,
     "Make the default rule accept all packets.");
 TUNABLE_INT("net.inet.ip.fw.tables_max", (int *)&default_fw_tables);
 SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, static_count,
     CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(layer3_chain.n_rules), 0,
     "Number of static rules");
 
 #ifdef INET6
 SYSCTL_DECL(_net_inet6_ip6);
 SYSCTL_NODE(_net_inet6_ip6, OID_AUTO, fw, CTLFLAG_RW, 0, "Firewall");
 SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, deny_unknown_exthdrs,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
     &VNET_NAME(fw_deny_unknown_exthdrs), 0,
     "Deny packets with unknown IPv6 Extension Headers");
 SYSCTL_INT(_net_inet6_ip6_fw, OID_AUTO, permit_single_frag6,
     CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
     &VNET_NAME(fw_permit_single_frag6), 0,
     "Permit single packet IPv6 fragments");
 #endif /* INET6 */
 
 SYSEND
 
 #endif /* SYSCTL_NODE */
 
 
 /*
  * Some macros used in the various matching options.
  * L3HDR maps an ipv4 pointer into a layer3 header pointer of type T
  * Other macros just cast void * into the appropriate type
  */
 /* Skip ip_hl 32-bit words past the start of the IPv4 header, cast to T *. */
 #define	L3HDR(T, ip)	((T *)((u_int32_t *)(ip) + (ip)->ip_hl))
 /* Plain pointer casts to the respective upper-layer header types. */
 #define	TCP(p)		((struct tcphdr *)(p))
 #define	SCTP(p)		((struct sctphdr *)(p))
 #define	UDP(p)		((struct udphdr *)(p))
 #define	ICMP(p)		((struct icmphdr *)(p))
 #define	ICMP6(p)	((struct icmp6_hdr *)(p))
 
 /*
  * Test whether the ICMP type of a packet is selected by the bitmask in
  * cmd->d[0], where bit N set means "type N matches".
  *
  * icmp: ICMP header of the packet being examined.
  * cmd:  instruction carrying the type bitmask in d[0].
  *
  * Returns 1 on match, 0 otherwise.
  */
 static __inline int
 icmptype_match(struct icmphdr *icmp, ipfw_insn_u32 *cmd)
 {
 	int type = icmp->icmp_type;
 
 	/*
 	 * The mask is one 32-bit word, so only types 0..31 are representable.
 	 * Shifting by 32 or more is undefined behaviour (on common CPUs the
 	 * count wraps, aliasing a high type onto a low bit), so such types
 	 * never match even when they are <= ICMP_MAXTYPE.
 	 */
 	if (type < 0 || type > ICMP_MAXTYPE || type >= 32)
 		return (0);
 
 	/* Unsigned shift: bit 31 must not overflow a signed int. */
 	return ((cmd->d[0] & (1U << type)) != 0);
 }
 
 /* Bitmask of the ICMP types that are treated as queries. */
 #define TT	( (1 << ICMP_ECHO) | (1 << ICMP_ROUTERSOLICIT) | \
     (1 << ICMP_TSTAMP) | (1 << ICMP_IREQ) | (1 << ICMP_MASKREQ) )
 
 /*
  * Return 1 if the ICMP header carries one of the query types listed in
  * TT, 0 otherwise.
  */
 static int
 is_icmp_query(struct icmphdr *icmp)
 {
 	int type = icmp->icmp_type;
 
 	/*
 	 * TT fits in one 32-bit word.  A shift count of 32 or more is
 	 * undefined behaviour and may wrap onto one of the query bits, so
 	 * types outside 0..31 are never queries.
 	 */
 	if (type < 0 || type > ICMP_MAXTYPE || type >= 32)
 		return (0);
 
 	return ((TT & (1U << type)) != 0);
 }
 #undef TT
 
 /*
  * The following checks use two arrays of 8 or 16 bits to store the
  * bits that we want set or clear, respectively. They are in the
  * low and high half of cmd->arg1 or cmd->d[0].
  *
  * We scan options and store the bits we find set. We succeed if
  *
  *	(want_set & ~bits) == 0 && (want_clear & ~bits) == want_clear
  *
  * The code is sometimes optimized not to store additional variables.
  */
 
 /*
  * Compare the option/flag bits found in a packet against an instruction.
  *
  * cmd->arg1 holds two 8-bit masks: the low byte lists bits that must be
  * present in 'bits', the high byte lists bits that must be absent.
  *
  * Returns 1 when both conditions hold, 0 otherwise.
  */
 static int
 flags_match(ipfw_insn *cmd, u_int8_t bits)
 {
 	const u_int8_t absent = ~bits;	/* bits NOT present in the packet */
 	const u_int8_t must_set = cmd->arg1 & 0xff;
 	const u_int8_t must_clear = (cmd->arg1 >> 8) & 0xff;
 
 	/*
 	 * Every required bit must be present (none of them absent), and
 	 * every forbidden bit must be among the absent ones.
 	 */
 	return ((must_set & absent) == 0 &&
 	    (must_clear & absent) == must_clear);
 }
 
 /*
  * Walk the IPv4 options that follow the fixed header, collect an
  * IP_FW_IPOPT_* bit for each recognised option (LSRR, SSRR, RR, TS) and
  * compare the result with the instruction via flags_match().
  *
  * ip:  IPv4 header; the options are read from directly behind it, for
  *      (ip_hl << 2) - sizeof(struct ip) bytes.
  * cmd: instruction holding the wanted set/clear masks in arg1.
  *
  * Returns 1 on match, 0 on mismatch or when the option list is malformed.
  */
 static int
 ipopts_match(struct ip *ip, ipfw_insn *cmd)
 {
 	int optlen, bits = 0;
 	u_char *cp = (u_char *)(ip + 1);
 	int x = (ip->ip_hl << 2) - sizeof (struct ip);
 
 	for (; x > 0; x -= optlen, cp += optlen) {
 		int opt = cp[IPOPT_OPTVAL];
 
 		if (opt == IPOPT_EOL)
 			break;
 		if (opt == IPOPT_NOP)
 			optlen = 1;
 		else {
 			/*
 			 * The length byte must itself lie inside the option
 			 * area before it may be read.
 			 */
 			if (x <= IPOPT_OLEN)
 				return 0; /* truncated */
 			optlen = cp[IPOPT_OLEN];
 			/*
 			 * A multi-byte option covers at least its type and
 			 * length bytes and must not run past the header.
 			 */
 			if (optlen <= IPOPT_OLEN || optlen > x)
 				return 0; /* invalid or truncated */
 		}
 		switch (opt) {
 
 		default:
 			break;
 
 		case IPOPT_LSRR:
 			bits |= IP_FW_IPOPT_LSRR;
 			break;
 
 		case IPOPT_SSRR:
 			bits |= IP_FW_IPOPT_SSRR;
 			break;
 
 		case IPOPT_RR:
 			bits |= IP_FW_IPOPT_RR;
 			break;
 
 		case IPOPT_TS:
 			bits |= IP_FW_IPOPT_TS;
 			break;
 		}
 	}
 	return (flags_match(cmd, bits));
 }
 
 /*
  * Walk the TCP options that follow the fixed TCP header and return a
  * bitmask of IP_FW_TCPOPT_* flags for the options found (MSS, window
  * scale, SACK / SACK-permitted, timestamp).
  *
  * tcp: TCP header; th_off gives the header length in 32-bit words.
  * mss: if not NULL, receives the value of a well-formed MSS option.
  *
  * Parsing stops at the end-of-list option or at the first malformed
  * option; flags collected up to that point are still returned.
  */
 static int
 tcpopts_parse(struct tcphdr *tcp, uint16_t *mss)
 {
 	u_char *cp = (u_char *)(tcp + 1);
 	int optlen, bits = 0;
 	int x = (tcp->th_off << 2) - sizeof(struct tcphdr);
 
 	for (; x > 0; x -= optlen, cp += optlen) {
 		int opt = cp[0];
 		if (opt == TCPOPT_EOL)
 			break;
 		if (opt == TCPOPT_NOP)
 			optlen = 1;
 		else {
 			/* The length byte must lie inside the option area. */
 			if (x < 2)
 				break;
 			optlen = cp[1];
 			/*
 			 * An option covers at least its kind and length
 			 * bytes and must not extend past the header.
 			 */
 			if (optlen < 2 || optlen > x)
 				break;
 		}
 
 		switch (opt) {
 		default:
 			break;
 
 		case TCPOPT_MAXSEG:
 			/*
 			 * kind + length + 16-bit value; anything else is
 			 * malformed and cp + 2 may not be ours to read.
 			 */
 			if (optlen != 2 + (int)sizeof(uint16_t))
 				break;
 			bits |= IP_FW_TCPOPT_MSS;
 			if (mss != NULL)
 				*mss = be16dec(cp + 2);
 			break;
 
 		case TCPOPT_WINDOW:
 			bits |= IP_FW_TCPOPT_WINDOW;
 			break;
 
 		case TCPOPT_SACK_PERMITTED:
 		case TCPOPT_SACK:
 			bits |= IP_FW_TCPOPT_SACK;
 			break;
 
 		case TCPOPT_TIMESTAMP:
 			bits |= IP_FW_TCPOPT_TS;
 			break;
 		}
 	}
 	return (bits);
 }
 
 /*
  * Match the TCP options present in a segment against the set/clear masks
  * of an instruction.  Returns 1 on match, 0 otherwise.
  */
 static int
 tcpopts_match(struct tcphdr *tcp, ipfw_insn *cmd)
 {
 	int present;
 
 	/* Only the option bitmask is needed here, not the MSS value. */
 	present = tcpopts_parse(tcp, NULL);
 	return (flags_match(cmd, present));
 }
 
 /*
  * Decide whether interface 'ifp' satisfies the interface instruction 'cmd'.
  *
  * The instruction selects one of four comparisons:
  *   - name[0] == '\0': compare cmd->p.ip with the interface's AF_INET
  *     addresses (only built for the FreeBSD kernel);
  *   - name[0] == '\1': look up ifp->if_index in table cmd->p.kidx, which
  *     may also fill in *tablearg;
  *   - cmd->p.glob set: fnmatch() the name pattern against if_xname;
  *   - otherwise: compare the name with strncmp() over IFNAMSIZ bytes.
  *
  * Returns non-zero on match, 0 otherwise (including when ifp is NULL).
  */
 static int
 iface_match(struct ifnet *ifp, ipfw_insn_if *cmd, struct ip_fw_chain *chain,
     uint32_t *tablearg)
 {
 
 	/* A packet without an interface can never match. */
 	if (ifp == NULL)
 		return (0);
 
 	if (cmd->name[0] == '\0') {
 		/* No name given: match by IPv4 address. */
 #if !defined(USERSPACE) && defined(__FreeBSD__)	/* and OSX too ? */
 		struct ifaddr *ifa;
 		int found = 0;
 
 		if_addr_rlock(ifp);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family == AF_INET &&
 			    cmd->p.ip.s_addr == ((struct sockaddr_in *)
 			    (ifa->ifa_addr))->sin_addr.s_addr) {
 				found = 1;
 				break;
 			}
 		}
 		if_addr_runlock(ifp);
 		return (found);
 #else
 		return (0);
 #endif /* __FreeBSD__ */
 	}
 
 	/* '\1' as first character means: consult a table instead. */
 	if (cmd->name[0] == '\1')
 		return ipfw_lookup_table(chain, cmd->p.kidx, 0,
 		    &ifp->if_index, tablearg);
 
 	/* Match by name, either as a glob pattern or literally. */
 	if (cmd->p.glob)
 		return (fnmatch(cmd->name, ifp->if_xname, 0) == 0);
 	return (strncmp(ifp->if_xname, cmd->name, IFNAMSIZ) == 0);
 }
 
 /*
  * The verify_path function checks if a route to the src exists and
  * if it is reachable via ifp (when provided).
  * 
  * The 'verrevpath' option checks that the interface that an IP packet
  * arrives on is the same interface that traffic destined for the
  * packet's source address would be routed out of.
  * The 'versrcreach' option just checks that the source address is
  * reachable via any route (except default) in the routing table.
  * These two are a measure to block forged packets. This is also
  * commonly known as "anti-spoofing" or Unicast Reverse Path
  * Forwarding (Unicast RPF) in Cisco-ese. The name of the knobs
  * is purposely reminiscent of the Cisco IOS command,
  *
  *   ip verify unicast reverse-path
  *   ip verify unicast source reachable-via any
  *
  * which implements the same functionality. But note that the syntax
  * is misleading, and the check may be performed on all IP packets
  * whether unicast, multicast, or broadcast.
  */
 static int
 verify_path(struct in_addr src, struct ifnet *ifp, u_int fib)
 {
 #if defined(USERSPACE) || !defined(__FreeBSD__)
 	return 0;
 #else
 	struct nhop4_basic nh4;
 
 	/* No route back to the source in this FIB: the check fails. */
 	if (fib4_lookup_nh_basic(fib, src, NHR_IFAIF, 0, &nh4) != 0)
 		return (0);
 
 	/*
 	 * With an interface supplied, the route towards the source has to
 	 * use exactly that interface.  The lookup is done with NHR_IFAIF;
 	 * the original comment explains this as tolerating the routing
 	 * asymmetry of packets re-injected by if_simloop(), whose route for
 	 * our own address may point at lo0.
 	 */
 	if (ifp != NULL)
 		return (ifp == nh4.nh_ifp);
 
 	/*
 	 * Without an interface, any route will do except the default
 	 * route and blackhole/reject routes.
 	 */
 	if ((nh4.nh_flags &
 	    (NHF_DEFAULT | NHF_REJECT | NHF_BLACKHOLE)) != 0)
 		return (0);
 
 	/* found valid route */
 	return (1);
 #endif /* __FreeBSD__ */
 }
 
 /*
  * Generate an SCTP packet containing an ABORT chunk. The verification tag
  * is given by vtag. The T-bit is set in the ABORT chunk if and only if
  * reflected is not 0.
  *
  * replyto: packet being answered; only used for MAC labelling.
  * id:      flow of the offending packet.  Addresses and ports are swapped
  *          when the reply is built, so it travels back to the sender.
  *
  * Returns a newly allocated mbuf holding IP/IPv6 header + SCTP common
  * header + one chunk header, or NULL if no mbuf could be allocated or
  * id->addr_type is not a supported family.  The packet is not sent here.
  */
 
 static struct mbuf *
 ipfw_send_abort(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t vtag,
     int reflected)
 {
 	struct mbuf *m;
 	struct ip *ip;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	struct sctphdr *sctp;
 	struct sctp_chunkhdr *chunk;
 	u_int16_t hlen, plen, tlen;
 
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	M_SETFIB(m, id->fib);
 #ifdef MAC
 	if (replyto != NULL)
 		mac_netinet_firewall_reply(replyto, m);
 	else
 		mac_netinet_firewall_send(m);
 #else
 	(void)replyto;		/* don't warn about unused arg */
 #endif
 
 	/* hlen: size of the network-layer header for this address family. */
 	switch (id->addr_type) {
 	case 4:
 		hlen = sizeof(struct ip);
 		break;
 #ifdef INET6
 	case 6:
 		hlen = sizeof(struct ip6_hdr);
 		break;
 #endif
 	default:
 		/* XXX: log me?!? */
 		FREE_PKT(m);
 		return (NULL);
 	}
 	/* plen: SCTP payload (common header + chunk header); tlen: total. */
 	plen = sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
 	tlen = hlen + plen;
 	/* Leave max_linkhdr bytes free in front of the packet data. */
 	m->m_data += max_linkhdr;
 	m->m_flags |= M_SKIP_FIREWALL;
 	m->m_pkthdr.len = m->m_len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 	bzero(m->m_data, tlen);
 
 	/* Fill in the network header; note source and destination swap. */
 	switch (id->addr_type) {
 	case 4:
 		ip = mtod(m, struct ip *);
 
 		ip->ip_v = 4;
 		ip->ip_hl = sizeof(struct ip) >> 2;
 		ip->ip_tos = IPTOS_LOWDELAY;
 		ip->ip_len = htons(tlen);
 		ip->ip_id = htons(0);
 		ip->ip_off = htons(0);
 		ip->ip_ttl = V_ip_defttl;
 		ip->ip_p = IPPROTO_SCTP;
 		ip->ip_sum = 0;
 		ip->ip_src.s_addr = htonl(id->dst_ip);
 		ip->ip_dst.s_addr = htonl(id->src_ip);
 
 		sctp = (struct sctphdr *)(ip + 1);
 		break;
 #ifdef INET6
 	case 6:
 		ip6 = mtod(m, struct ip6_hdr *);
 
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_plen = htons(plen);
 		ip6->ip6_nxt = IPPROTO_SCTP;
 		ip6->ip6_hlim = IPV6_DEFHLIM;
 		ip6->ip6_src = id->dst_ip6;
 		ip6->ip6_dst = id->src_ip6;
 
 		sctp = (struct sctphdr *)(ip6 + 1);
 		break;
 #endif
 	}
 
 	/* SCTP common header; the checksum is computed last, over zero. */
 	sctp->src_port = htons(id->dst_port);
 	sctp->dest_port = htons(id->src_port);
 	sctp->v_tag = htonl(vtag);
 	sctp->checksum = htonl(0);
 
 	/* A bare ABORT chunk: header only, no error causes. */
 	chunk = (struct sctp_chunkhdr *)(sctp + 1);
 	chunk->chunk_type = SCTP_ABORT_ASSOCIATION;
 	chunk->chunk_flags = 0;
 	if (reflected != 0) {
 		chunk->chunk_flags |= SCTP_HAD_NO_TCB;
 	}
 	chunk->chunk_length = htons(sizeof(struct sctp_chunkhdr));
 
 	sctp->checksum = sctp_calculate_cksum(m, hlen);
 
 	return (m);
 }
 
 /*
  * Generate a TCP packet, containing either a RST or a keepalive.
  * When flags & TH_RST, we are sending a RST packet, because of a
  * "reset" action matched the packet.
  * Otherwise we are sending a keepalive, and flags & TH_SYN selects the
  * direction: with TH_SYN set (and TH_RST clear) the packet is built in the
  * direction of the flow id, otherwise addresses and ports are swapped.
  * The 'replyto' mbuf is the mbuf being replied to, if any, and is required
  * so that MAC can label the reply appropriately.
  *
  * Returns a newly allocated mbuf holding the IP/IPv6 and TCP headers, or
  * NULL if no mbuf could be allocated or id->addr_type is not a supported
  * family.  The packet is not sent here.
  */
 struct mbuf *
 ipfw_send_pkt(struct mbuf *replyto, struct ipfw_flow_id *id, u_int32_t seq,
     u_int32_t ack, int flags)
 {
 	struct mbuf *m = NULL;		/* stupid compiler */
 	struct ip *h = NULL;		/* stupid compiler */
 #ifdef INET6
 	struct ip6_hdr *h6 = NULL;
 #endif
 	struct tcphdr *th = NULL;
 	int len, dir;
 
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (NULL);
 
 	M_SETFIB(m, id->fib);
 #ifdef MAC
 	if (replyto != NULL)
 		mac_netinet_firewall_reply(replyto, m);
 	else
 		mac_netinet_firewall_send(m);
 #else
 	(void)replyto;		/* don't warn about unused arg */
 #endif
 
 	/* len: network header plus a TCP header without options. */
 	switch (id->addr_type) {
 	case 4:
 		len = sizeof(struct ip) + sizeof(struct tcphdr);
 		break;
 #ifdef INET6
 	case 6:
 		len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 		break;
 #endif
 	default:
 		/* XXX: log me?!? */
 		FREE_PKT(m);
 		return (NULL);
 	}
 	/* dir != 0: same direction as the flow id; 0: reversed. */
 	dir = ((flags & (TH_SYN | TH_RST)) == TH_SYN);
 
 	/* Leave max_linkhdr bytes free in front of the packet data. */
 	m->m_data += max_linkhdr;
 	m->m_flags |= M_SKIP_FIREWALL;
 	m->m_pkthdr.len = m->m_len = len;
 	m->m_pkthdr.rcvif = NULL;
 	bzero(m->m_data, len);
 
 	switch (id->addr_type) {
 	case 4:
 		h = mtod(m, struct ip *);
 
 		/* prepare for checksum */
 		/*
 		 * Only protocol, TCP length and addresses are set for now;
 		 * the remaining IP fields are filled in after the checksum.
 		 */
 		h->ip_p = IPPROTO_TCP;
 		h->ip_len = htons(sizeof(struct tcphdr));
 		if (dir) {
 			h->ip_src.s_addr = htonl(id->src_ip);
 			h->ip_dst.s_addr = htonl(id->dst_ip);
 		} else {
 			h->ip_src.s_addr = htonl(id->dst_ip);
 			h->ip_dst.s_addr = htonl(id->src_ip);
 		}
 
 		th = (struct tcphdr *)(h + 1);
 		break;
 #ifdef INET6
 	case 6:
 		h6 = mtod(m, struct ip6_hdr *);
 
 		/* prepare for checksum */
 		h6->ip6_nxt = IPPROTO_TCP;
 		h6->ip6_plen = htons(sizeof(struct tcphdr));
 		if (dir) {
 			h6->ip6_src = id->src_ip6;
 			h6->ip6_dst = id->dst_ip6;
 		} else {
 			h6->ip6_src = id->dst_ip6;
 			h6->ip6_dst = id->src_ip6;
 		}
 
 		th = (struct tcphdr *)(h6 + 1);
 		break;
 #endif
 	}
 
 	if (dir) {
 		th->th_sport = htons(id->src_port);
 		th->th_dport = htons(id->dst_port);
 	} else {
 		th->th_sport = htons(id->dst_port);
 		th->th_dport = htons(id->src_port);
 	}
 	th->th_off = sizeof(struct tcphdr) >> 2;
 
 	if (flags & TH_RST) {
 		if (flags & TH_ACK) {
 			/* Offending segment had ACK: RST uses its ack as seq. */
 			th->th_seq = htonl(ack);
 			th->th_flags = TH_RST;
 		} else {
 			/* No ACK: send RST|ACK; a SYN counts as one in seq. */
 			if (flags & TH_SYN)
 				seq++;
 			th->th_ack = htonl(seq);
 			th->th_flags = TH_RST | TH_ACK;
 		}
 	} else {
 		/*
 		 * Keepalive - use caller provided sequence numbers
 		 */
 		th->th_seq = htonl(seq);
 		th->th_ack = htonl(ack);
 		th->th_flags = TH_ACK;
 	}
 
 	switch (id->addr_type) {
 	case 4:
 		th->th_sum = in_cksum(m, len);
 
 		/* finish the ip header */
 		h->ip_v = 4;
 		h->ip_hl = sizeof(*h) >> 2;
 		h->ip_tos = IPTOS_LOWDELAY;
 		h->ip_off = htons(0);
 		h->ip_len = htons(len);
 		h->ip_ttl = V_ip_defttl;
 		h->ip_sum = 0;
 		break;
 #ifdef INET6
 	case 6:
 		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(*h6),
 		    sizeof(struct tcphdr));
 
 		/* finish the ip6 header */
 		h6->ip6_vfc |= IPV6_VERSION;
 		h6->ip6_hlim = IPV6_DEFHLIM;
 		break;
 #endif
 	}
 
 	return (m);
 }
 
 #ifdef INET6
 /*
  * ipv6 specific rules here...
  */
 /*
  * Test whether ICMPv6 type 'type' is selected by the bitmap stored in
  * cmd->d[], where type N lives in bit N % 32 of word N / 32.
  *
  * Returns 1 on match, 0 otherwise (including out-of-range types).
  */
 static __inline int
 icmp6type_match (int type, ipfw_insn_u32 *cmd)
 {
 	/* A negative type would index before the start of d[]. */
 	if (type < 0 || type > ICMP6_MAXTYPE)
 		return (0);
 
 	/* Unsigned shift: 1 << 31 overflows a signed int. */
 	return ((cmd->d[type / 32] & (1U << (type % 32))) != 0);
 }
 
 /*
  * Return 1 if curr_flow equals one of the flow ids listed in the
  * instruction, 0 otherwise.  Entries d[0] through d[arg1] are scanned;
  * note that the upper bound is inclusive.
  */
 static int
 flow6id_match( int curr_flow, ipfw_insn_u32 *cmd )
 {
 	int idx = 0;
 
 	while (idx <= cmd->o.arg1) {
 		if (curr_flow == cmd->d[idx])
 			return (1);
 		++idx;
 	}
 	return (0);
 }
 
 /* support for IP6_*_ME opcodes */
 /*
  * Mask used by ipfw_localip6() when comparing link-local addresses: all
  * bits set except bytes 2 and 3, which are therefore ignored.
  * NOTE(review): presumably those bytes hold the embedded scope/zone id of
  * link-local addresses inside the kernel -- confirm.
  */
 static const struct in6_addr lla_mask = {{{
 	0xff, 0xff, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,
 	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
 }}};
 
 /*
  * Return non-zero if 'in6' is one of this host's IPv6 addresses.
  *
  * Multicast addresses never match.  Non-link-local addresses are handed
  * to in6_localip().  Link-local addresses are compared against every
  * configured link-local address under lla_mask, with the address list
  * read-locked for the duration of the walk.
  */
 static int
 ipfw_localip6(struct in6_addr *in6)
 {
 	struct rm_priotracker in6_ifa_tracker;
 	struct in6_ifaddr *ifa6;
 	int is_local = 0;
 
 	if (IN6_IS_ADDR_MULTICAST(in6))
 		return (0);
 
 	if (!IN6_IS_ADDR_LINKLOCAL(in6))
 		return (in6_localip(in6));
 
 	IN6_IFADDR_RLOCK(&in6_ifa_tracker);
 	CK_STAILQ_FOREACH(ifa6, &V_in6_ifaddrhead, ia_link) {
 		if (IN6_IS_ADDR_LINKLOCAL(&ifa6->ia_addr.sin6_addr) &&
 		    IN6_ARE_MASKED_ADDR_EQUAL(&ifa6->ia_addr.sin6_addr,
 		    in6, &lla_mask)) {
 			is_local = 1;
 			break;
 		}
 	}
 	IN6_IFADDR_RUNLOCK(&in6_ifa_tracker);
 	return (is_local);
 }
 
 /*
  * IPv6 counterpart of verify_path(): check that a route back to 'src'
  * exists in FIB 'fib' and, when 'ifp' is given, that it uses 'ifp'.
  *
  * Link-local scoped sources are always accepted.  Returns 1 when the
  * path is acceptable, 0 otherwise.
  */
 static int
 verify_path6(struct in6_addr *src, struct ifnet *ifp, u_int fib)
 {
 	struct nhop6_basic nh6;
 
 	if (IN6_IS_SCOPE_LINKLOCAL(src))
 		return (1);
 
 	/* No route back to the source: fail. */
 	if (fib6_lookup_nh_basic(fib, src, 0, NHR_IFAIF, 0, &nh6) != 0)
 		return (0);
 
 	/* With an interface supplied, the route has to go through it. */
 	if (ifp != NULL)
 		return (ifp == nh6.nh_ifp);
 
 	/*
 	 * Without an interface, accept any route except the default
 	 * route and blackhole/reject routes.
 	 */
 	if ((nh6.nh_flags &
 	    (NHF_DEFAULT | NHF_REJECT | NHF_BLACKHOLE)) != 0)
 		return (0);
 
 	/* found valid route */
 	return (1);
 }
 
 /*
  * Return 1 if icmp6_type is one of the ICMPv6 query types, 0 otherwise.
  *
  * The check is written as a chain of comparisons rather than a switch:
  * case labels must be distinct, and nothing here guarantees that all of
  * these constants have different values.
  */
 static int
 is_icmp6_query(int icmp6_type)
 {
 	const int is_query =
 	    icmp6_type == ICMP6_ECHO_REQUEST ||
 	    icmp6_type == ICMP6_MEMBERSHIP_QUERY ||
 	    icmp6_type == ICMP6_WRUREQUEST ||
 	    icmp6_type == ICMP6_FQDN_QUERY ||
 	    icmp6_type == ICMP6_NI_QUERY;
 
 	return (icmp6_type <= ICMP6_MAXTYPE && is_query);
 }
 
 /*
  * Translate an ICMPv4 "destination unreachable" code into the ICMPv6
  * destination-unreachable code to use instead (RFC 7915 p4.2).
  */
 static int
 map_icmp_unreach(int code)
 {
 	int code6;
 
 	switch (code) {
 	case ICMP_UNREACH_PORT:
 		code6 = ICMP6_DST_UNREACH_NOPORT;
 		break;
 	case ICMP_UNREACH_NET:
 	case ICMP_UNREACH_HOST:
 	case ICMP_UNREACH_SRCFAIL:
 	case ICMP_UNREACH_NET_UNKNOWN:
 	case ICMP_UNREACH_HOST_UNKNOWN:
 	case ICMP_UNREACH_TOSNET:
 	case ICMP_UNREACH_TOSHOST:
 		code6 = ICMP6_DST_UNREACH_NOROUTE;
 		break;
 	default:
 		/*
 		 * Everything else becomes "administratively prohibited".
 		 * XXX: unreach proto should be mapped into ICMPv6
 		 * parameter problem, but we use only unreach type.
 		 */
 		code6 = ICMP6_DST_UNREACH_ADMIN;
 		break;
 	}
 	return (code6);
 }
 
 /*
  * Reject an IPv6 packet, consuming args->m (it is set to NULL on return).
  *
  * code: ICMP6_UNREACH_RST answers TCP with a RST, ICMP6_UNREACH_ABORT
  *       answers SCTP with an ABORT; any other value is sent as the code
  *       of an ICMPv6 destination-unreachable error.
  * hlen: offset from ip6 to the transport header.
  * ip6:  IPv6 header of the packet.
  *
  * RST/ABORT requested for a packet of a different protocol just drops it.
  */
 static void
 send_reject6(struct ip_fw_args *args, int code, u_int hlen, struct ip6_hdr *ip6)
 {
 	struct mbuf *m;
 
 	m = args->m;
 	if (code == ICMP6_UNREACH_RST && args->f_id.proto == IPPROTO_TCP) {
 		struct tcphdr *tcp;
 		tcp = (struct tcphdr *)((char *)ip6 + hlen);
 
 		/* Never answer a RST with another RST. */
 		if ((tcp->th_flags & TH_RST) == 0) {
 			struct mbuf *m0;
 			m0 = ipfw_send_pkt(args->m, &(args->f_id),
 			    ntohl(tcp->th_seq), ntohl(tcp->th_ack),
 			    tcp->th_flags | TH_RST);
 			if (m0 != NULL)
 				ip6_output(m0, NULL, NULL, 0, NULL, NULL,
 				    NULL);
 		}
 		FREE_PKT(m);
 	} else if (code == ICMP6_UNREACH_ABORT &&
 	    args->f_id.proto == IPPROTO_SCTP) {
 		struct mbuf *m0;
 		struct sctphdr *sctp;
 		u_int32_t v_tag;
 		int reflected;
 
 		sctp = (struct sctphdr *)((char *)ip6 + hlen);
 		/* Default: reflect the packet's own tag (T-bit set). */
 		reflected = 1;
 		v_tag = ntohl(sctp->v_tag);
 		/* Investigate the first chunk header if available */
 		if (m->m_len >= hlen + sizeof(struct sctphdr) +
 		    sizeof(struct sctp_chunkhdr)) {
 			struct sctp_chunkhdr *chunk;
 
 			chunk = (struct sctp_chunkhdr *)(sctp + 1);
 			switch (chunk->chunk_type) {
 			case SCTP_INITIATION:
 				/*
 				 * Packets containing an INIT chunk MUST have
 				 * a zero v-tag.
 				 */
 				if (v_tag != 0) {
 					/* v_tag 0 means: send no reply. */
 					v_tag = 0;
 					break;
 				}
 				/* INIT chunk MUST NOT be bundled */
 				if (m->m_pkthdr.len >
 				    hlen + sizeof(struct sctphdr) +
 				    ntohs(chunk->chunk_length) + 3) {
 					break;
 				}
 				/* Use the initiate tag if available */
 				if ((m->m_len >= hlen + sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd))) {
 					struct sctp_init *init;
 
 					init = (struct sctp_init *)(chunk + 1);
 					v_tag = ntohl(init->initiate_tag);
 					reflected = 0;
 				}
 				break;
 			case SCTP_ABORT_ASSOCIATION:
 				/*
 				 * If the packet contains an ABORT chunk, don't
 				 * reply.
 				 * XXX: We should search through all chunks,
 				 *      but don't do to avoid attacks.
 				 */
 				v_tag = 0;
 				break;
 			}
 		}
 		/* A zero tag at this point suppresses the ABORT. */
 		if (v_tag == 0) {
 			m0 = NULL;
 		} else {
 			m0 = ipfw_send_abort(args->m, &(args->f_id), v_tag,
 			    reflected);
 		}
 		if (m0 != NULL)
 			ip6_output(m0, NULL, NULL, 0, NULL, NULL, NULL);
 		FREE_PKT(m);
 	} else if (code != ICMP6_UNREACH_RST && code != ICMP6_UNREACH_ABORT) {
 		/* Send an ICMPv6 unreach. */
 #if 0
 		/*
 		 * Unlike above, the mbufs need to line up with the ip6 hdr,
 		 * as the contents are read. We need to m_adj() the
 		 * needed amount.
 		 * The mbuf will however be thrown away so we can adjust it.
 		 * Remember we did an m_pullup on it already so we
 		 * can make some assumptions about contiguousness.
 		 */
 		if (args->L3offset)
 			m_adj(m, args->L3offset);
 #endif
 		/* The mbuf is handed to icmp6_error(); it is not freed here. */
 		icmp6_error(m, ICMP6_DST_UNREACH, code, 0);
 	} else
 		FREE_PKT(m);
 
 	args->m = NULL;
 }
 
 #endif /* INET6 */
 
 
 /*
  * sends a reject message, consuming the mbuf passed as an argument.
  *
  * code:  ICMP_REJECT_RST answers TCP with a RST, ICMP_REJECT_ABORT answers
  *        SCTP with an ABORT; any other value is sent as the code of an
  *        ICMP unreachable error.
  * iplen: length of the IP packet, used to bound the SCTP chunk inspection.
  * ip:    IPv4 header of the packet.
  *
  * RST/ABORT requested for a packet of a different protocol just drops it.
  * args->m is NULL on return in every case.
  */
 static void
 send_reject(struct ip_fw_args *args, int code, int iplen, struct ip *ip)
 {
 
 	/*
 	 * NOTE(review): the disabled block below contains a comment that is
 	 * closed early, and refers to a variable 'm' that is not declared at
 	 * this scope; it would need repair before being re-enabled.
 	 */
 #if 0
 	/* XXX When ip is not guaranteed to be at mtod() we will
 	 * need to account for this */
 	 * The mbuf will however be thrown away so we can adjust it.
 	 * Remember we did an m_pullup on it already so we
 	 * can make some assumptions about contiguousness.
 	 */
 	if (args->L3offset)
 		m_adj(m, args->L3offset);
 #endif
 	if (code != ICMP_REJECT_RST && code != ICMP_REJECT_ABORT) {
 		/* Send an ICMP unreach */
 		/* The mbuf is handed to icmp_error(); it is not freed here. */
 		icmp_error(args->m, ICMP_UNREACH, code, 0L, 0);
 	} else if (code == ICMP_REJECT_RST && args->f_id.proto == IPPROTO_TCP) {
 		struct tcphdr *const tcp =
 		    L3HDR(struct tcphdr, mtod(args->m, struct ip *));
 		/* Never answer a RST with another RST. */
 		if ( (tcp->th_flags & TH_RST) == 0) {
 			struct mbuf *m;
 			m = ipfw_send_pkt(args->m, &(args->f_id),
 				ntohl(tcp->th_seq), ntohl(tcp->th_ack),
 				tcp->th_flags | TH_RST);
 			if (m != NULL)
 				ip_output(m, NULL, NULL, 0, NULL, NULL);
 		}
 		FREE_PKT(args->m);
 	} else if (code == ICMP_REJECT_ABORT &&
 	    args->f_id.proto == IPPROTO_SCTP) {
 		struct mbuf *m;
 		struct sctphdr *sctp;
 		struct sctp_chunkhdr *chunk;
 		struct sctp_init *init;
 		u_int32_t v_tag;
 		int reflected;
 
 		sctp = L3HDR(struct sctphdr, mtod(args->m, struct ip *));
 		/* Default: reflect the packet's own tag (T-bit set). */
 		reflected = 1;
 		v_tag = ntohl(sctp->v_tag);
 		if (iplen >= (ip->ip_hl << 2) + sizeof(struct sctphdr) +
 		    sizeof(struct sctp_chunkhdr)) {
 			/* Look at the first chunk header if available */
 			chunk = (struct sctp_chunkhdr *)(sctp + 1);
 			switch (chunk->chunk_type) {
 			case SCTP_INITIATION:
 				/*
 				 * Packets containing an INIT chunk MUST have
 				 * a zero v-tag.
 				 */
 				if (v_tag != 0) {
 					/* v_tag 0 means: send no reply. */
 					v_tag = 0;
 					break;
 				}
 				/* INIT chunk MUST NOT be bundled */
 				if (iplen >
 				    (ip->ip_hl << 2) + sizeof(struct sctphdr) +
 				    ntohs(chunk->chunk_length) + 3) {
 					break;
 				}
 				/* Use the initiate tag if available */
 				if ((iplen >= (ip->ip_hl << 2) +
 				    sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd))) {
 					init = (struct sctp_init *)(chunk + 1);
 					v_tag = ntohl(init->initiate_tag);
 					reflected = 0;
 				}
 				break;
 			case SCTP_ABORT_ASSOCIATION:
 				/*
 				 * If the packet contains an ABORT chunk, don't
 				 * reply.
 				 * XXX: We should search through all chunks,
 				 * but don't do to avoid attacks.
 				 */
 				v_tag = 0;
 				break;
 			}
 		}
 		/* A zero tag at this point suppresses the ABORT. */
 		if (v_tag == 0) {
 			m = NULL;
 		} else {
 			m = ipfw_send_abort(args->m, &(args->f_id), v_tag,
 			    reflected);
 		}
 		if (m != NULL)
 			ip_output(m, NULL, NULL, 0, NULL, NULL);
 		FREE_PKT(args->m);
 	} else
 		FREE_PKT(args->m);
 	args->m = NULL;
 }
 
 /*
  * Support for uid/gid/jail lookup. These tests are expensive
  * (because we may need to look into the list of active sockets)
  * so we cache the results. ugid_lookupp is 0 if we have not
  * yet done a lookup, 1 if we succeeded, and -1 if we tried
  * and failed. The function always returns the match value.
  * We could actually spare the variable and use *uc, setting
  * it to '(void *)check_uidgid if we have no info, NULL if
  * we tried and failed, or any other value if successful.
  *
  * insn: O_UID, O_GID or O_JAIL instruction; d[0] is the id to compare.
  * args: packet state (flow id, optional inp, direction flags, mbuf).
  * uc:   receives a held credential (crhold()) on a successful lookup.
  *       NOTE(review): the reference is not dropped here; presumably the
  *       caller releases it -- confirm.
  */
 static int
 check_uidgid(ipfw_insn_u32 *insn, struct ip_fw_args *args, int *ugid_lookupp,
     struct ucred **uc)
 {
 #if defined(USERSPACE)
 	return 0;	// not supported in userspace
 #else
 #ifndef __FreeBSD__
 	/* XXX */
 	/*
 	 * NOTE(review): this branch uses names (proto, oif, dst_ip, inp, ...)
 	 * that are not declared in this function; it looks stale.
 	 */
 	return cred_check(insn, proto, oif,
 	    dst_ip, dst_port, src_ip, src_port,
 	    (struct bsd_ucred *)uc, ugid_lookupp, ((struct mbuf *)inp)->m_skb);
 #else  /* FreeBSD */
 	struct in_addr src_ip, dst_ip;
 	struct inpcbinfo *pi;
 	struct ipfw_flow_id *id;
 	struct inpcb *pcb, *inp;
 	int lookupflags;
 	int match;
 
 	id = &args->f_id;
 	inp = args->inp;
 
 	/*
 	 * Check to see if the UDP or TCP stack supplied us with
 	 * the PCB. If so, rather then holding a lock and looking
 	 * up the PCB, we can use the one that was supplied.
 	 */
 	if (inp && *ugid_lookupp == 0) {
 		INP_LOCK_ASSERT(inp);
 		if (inp->inp_socket != NULL) {
 			*uc = crhold(inp->inp_cred);
 			*ugid_lookupp = 1;
 		} else
 			*ugid_lookupp = -1;
 	}
 	/*
 	 * If we have already been here and the packet has no
 	 * PCB entry associated with it, then we can safely
 	 * assume that this is a no match.
 	 */
 	if (*ugid_lookupp == -1)
 		return (0);
 	/* Pick the PCB table for the protocol; others cannot match. */
 	if (id->proto == IPPROTO_TCP) {
 		lookupflags = 0;
 		pi = &V_tcbinfo;
 	} else if (id->proto == IPPROTO_UDP) {
 		lookupflags = INPLOOKUP_WILDCARD;
 		pi = &V_udbinfo;
 	} else if (id->proto == IPPROTO_UDPLITE) {
 		lookupflags = INPLOOKUP_WILDCARD;
 		pi = &V_ulitecbinfo;
 	} else
 		return 0;
 	lookupflags |= INPLOOKUP_RLOCKPCB;
 	match = 0;
 	if (*ugid_lookupp == 0) {
 		/*
 		 * Inbound packets are looked up with the flow's source as the
 		 * first address/port pair; outbound ones with the pairs
 		 * swapped and args->ifp passed along.
 		 */
 		if (id->addr_type == 6) {
 #ifdef INET6
 			if (args->flags & IPFW_ARGS_IN)
 				pcb = in6_pcblookup_mbuf(pi,
 				    &id->src_ip6, htons(id->src_port),
 				    &id->dst_ip6, htons(id->dst_port),
 				    lookupflags, NULL, args->m);
 			else
 				pcb = in6_pcblookup_mbuf(pi,
 				    &id->dst_ip6, htons(id->dst_port),
 				    &id->src_ip6, htons(id->src_port),
 				    lookupflags, args->ifp, args->m);
 #else
 			*ugid_lookupp = -1;
 			return (0);
 #endif
 		} else {
 			src_ip.s_addr = htonl(id->src_ip);
 			dst_ip.s_addr = htonl(id->dst_ip);
 			if (args->flags & IPFW_ARGS_IN)
 				pcb = in_pcblookup_mbuf(pi,
 				    src_ip, htons(id->src_port),
 				    dst_ip, htons(id->dst_port),
 				    lookupflags, NULL, args->m);
 			else
 				pcb = in_pcblookup_mbuf(pi,
 				    dst_ip, htons(id->dst_port),
 				    src_ip, htons(id->src_port),
 				    lookupflags, args->ifp, args->m);
 		}
 		if (pcb != NULL) {
 			/* Found: take the credential, then drop the PCB lock. */
 			INP_RLOCK_ASSERT(pcb);
 			*uc = crhold(pcb->inp_cred);
 			*ugid_lookupp = 1;
 			INP_RUNLOCK(pcb);
 		}
 		if (*ugid_lookupp == 0) {
 			/*
 			 * We tried and failed, set the variable to -1
 			 * so we will not try again on this packet.
 			 */
 			*ugid_lookupp = -1;
 			return (0);
 		}
 	}
 	/* A credential is available in *uc; apply the requested test. */
 	if (insn->o.opcode == O_UID)
 		match = ((*uc)->cr_uid == (uid_t)insn->d[0]);
 	else if (insn->o.opcode == O_GID)
 		match = groupmember((gid_t)insn->d[0], *uc);
 	else if (insn->o.opcode == O_JAIL)
 		match = ((*uc)->cr_prison->pr_id == (int)insn->d[0]);
 	return (match);
 #endif /* __FreeBSD__ */
 #endif /* not supported in userspace */
 }
 
 /*
  * Helper function to set args with info on the rule after the matching
  * one. slot is precise, whereas we guess rule_id as they are
  * assigned sequentially.
  */
 static inline void
 set_match(struct ip_fw_args *args, int slot,
 	struct ip_fw_chain *chain)
 {
 	args->rule.chain_id = chain->id;
 	args->rule.slot = slot + 1; /* we use 0 as a marker */
 	args->rule.rule_id = 1 + chain->map[slot]->id;
 	args->rule.rulenum = chain->map[slot]->rulenum;
 	args->flags |= IPFW_ARGS_REF;
 }
 
 #ifndef LINEAR_SKIPTO
 /*
  * Resolve the position (index into the rule array) that a jump from
  * rule f to rule number num should land on, using the per-rule cache
  * kept in f->cached_id / f->cached_pos whenever it is usable.
  */
 static int
 jump_fast(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards)
 {
 	int pos, target;
 
 	/*
 	 * Fast path: the position cached in f->cached_pos is valid as
 	 * long as the version stored in f->cached_id still equals
 	 * chain->id (horrible hacks to avoid changing the ABI).  A
 	 * tablearg-driven jump is never served from the cache.
 	 */
 	if (num != IP_FW_TARG && f->cached_id == chain->id)
 		return (f->cached_pos);
 
 	/*
 	 * NOTE(review): IP_FW_ARG_TABLEARG appears to pick up the
 	 * 'tablearg' parameter implicitly; keep that name in scope.
 	 */
 	target = IP_FW_ARG_TABLEARG(chain, num, skipto);
 
 	/* Never go backward unless the caller explicitly allows it. */
 	if (jump_backwards == 0 && target <= f->rulenum)
 		target = f->rulenum + 1;
 
 	/* Use the index map when there is one, else search the chain. */
 	if (chain->idxmap != NULL)
 		pos = chain->idxmap[target];
 	else
 		pos = ipfw_find_rule(chain, target, 0);
 
 	/* Remember the result, except for tablearg-driven jumps. */
 	if (num != IP_FW_TARG) {
 		f->cached_id = chain->id;
 		f->cached_pos = pos;
 	}
 
 	return (pos);
 }
 #else
 /*
  * Resolve the position (index into the rule array) that a jump from
  * rule f to rule number num should land on, by a direct lookup in
  * chain->idxmap.
  */
 static int
 jump_linear(struct ip_fw_chain *chain, struct ip_fw *f, int num,
     int tablearg, int jump_backwards)
 {
 	int target;
 
 	/*
 	 * NOTE(review): IP_FW_ARG_TABLEARG appears to pick up the
 	 * 'tablearg' parameter implicitly; keep that name in scope.
 	 */
 	target = IP_FW_ARG_TABLEARG(chain, num, skipto);
 
 	/* Never go backward unless the caller explicitly allows it. */
 	if (jump_backwards == 0 && target <= f->rulenum)
 		target = f->rulenum + 1;
 
 	return (chain->idxmap[target]);
 }
 #endif
 
 #define	TARG(k, f)	IP_FW_ARG_TABLEARG(chain, k, f)
 /*
  * The main check routine for the firewall.
  *
  * All arguments are in args so we can modify them and return them
  * back to the caller.
  *
  * Parameters:
  *
  *	args->m	(in/out) The packet; we set to NULL when/if we nuke it.
  *		Starts with the IP header.
  *	args->L3offset	Number of bytes bypassed if we came from L2.
  *			e.g. often sizeof(eh)  ** NOTYET **
  *	args->ifp	Incoming or outgoing interface.
  *	args->divert_rule (in/out)
  *		Skip up to the first rule past this rule number;
  *		upon return, non-zero port number for divert or tee.
  *
  *	args->rule	Pointer to the last matching rule (in/out)
  *	args->next_hop	Socket we are forwarding to (out).
  *	args->next_hop6	IPv6 next hop we are forwarding to (out).
  *	args->f_id	Addresses grabbed from the packet (out)
  * 	args->rule.info	a cookie depending on rule action
  *
  * Return value:
  *
  *	IP_FW_PASS	the packet must be accepted
  *	IP_FW_DENY	the packet must be dropped
  *	IP_FW_DIVERT	divert packet, port in m_tag
  *	IP_FW_TEE	tee packet, port in m_tag
  *	IP_FW_DUMMYNET	to dummynet, pipe in args->cookie
  *	IP_FW_NETGRAPH	into netgraph, cookie args->cookie
  *		args->rule contains the matching rule,
  *		args->rule.info has additional information.
  *
  */
 int
 ipfw_chk(struct ip_fw_args *args)
 {
 
 	/*
 	 * Local variables holding state while processing a packet:
 	 *
 	 * IMPORTANT NOTE: to speed up the processing of rules, there
 	 * are some assumption on the values of the variables, which
 	 * are documented here. Should you change them, please check
 	 * the implementation of the various instructions to make sure
 	 * that they still work.
 	 *
 	 * m | args->m	Pointer to the mbuf, as received from the caller.
 	 *	It may change if ipfw_chk() does an m_pullup, or if it
 	 *	consumes the packet because it calls send_reject().
 	 *	XXX This has to change, so that ipfw_chk() never modifies
 	 *	or consumes the buffer.
 	 *	OR
 	 * args->mem	Pointer to contiguous memory chunk.
 	 * ip	Is the beginning of the ip(4 or 6) header.
 	 * eh	Ethernet header in case if input is Layer2.
 	 */
 	struct mbuf *m;
 	struct ip *ip;
 	struct ether_header *eh;
 
 	/*
 	 * For rules which contain uid/gid or jail constraints, cache
 	 * a copy of the users credentials after the pcb lookup has been
 	 * executed. This will speed up the processing of rules with
 	 * these types of constraints, as well as decrease contention
 	 * on pcb related locks.
 	 */
 #ifndef __FreeBSD__
 	struct bsd_ucred ucred_cache;
 #else
 	struct ucred *ucred_cache = NULL;
 #endif
 	int ucred_lookup = 0;
 	int f_pos = 0;		/* index of current rule in the array */
 	int retval = 0;
 	struct ifnet *oif, *iif;
 
 	/*
 	 * hlen	The length of the IP header.
 	 */
 	u_int hlen = 0;		/* hlen >0 means we have an IP pkt */
 
 	/*
 	 * offset	The offset of a fragment. offset != 0 means that
 	 *	we have a fragment at this offset of an IPv4 packet.
 	 *	offset == 0 means that (if this is an IPv4 packet)
 	 *	this is the first or only fragment.
 	 *	For IPv6 offset|ip6f_mf == 0 means there is no Fragment Header
 	 *	or there is a single packet fragment (fragment header added
 	 *	without needed).  We will treat a single packet fragment as if
 	 *	there was no fragment header (or log/block depending on the
 	 *	V_fw_permit_single_frag6 sysctl setting).
 	 */
 	u_short offset = 0;
 	u_short ip6f_mf = 0;
 
 	/*
 	 * Local copies of addresses. They are only valid if we have
 	 * an IP packet.
 	 *
 	 * proto	The protocol. Set to 0 for non-ip packets,
 	 *	or to the protocol read from the packet otherwise.
 	 *	proto != 0 means that we have an IPv4 packet.
 	 *
 	 * src_port, dst_port	port numbers, in HOST format. Only
 	 *	valid for TCP and UDP packets.
 	 *
 	 * src_ip, dst_ip	ip addresses, in NETWORK format.
 	 *	Only valid for IPv4 packets.
 	 */
 	uint8_t proto;
 	uint16_t src_port, dst_port;		/* NOTE: host format	*/
 	struct in_addr src_ip, dst_ip;		/* NOTE: network format	*/
 	int iplen = 0;
 	int pktlen;
 
 	struct ipfw_dyn_info dyn_info;
 	struct ip_fw *q = NULL;
 	struct ip_fw_chain *chain = &V_layer3_chain;
 
 	/*
 	 * We store in ulp a pointer to the upper layer protocol header.
 	 * In the ipv4 case this is easy to determine from the header,
 	 * but for ipv6 we might have some additional headers in the middle.
 	 * ulp is NULL if not found.
 	 */
 	void *ulp = NULL;		/* upper layer protocol pointer. */
 
 	/* XXX ipv6 variables */
 	int is_ipv6 = 0;
 	uint8_t	icmp6_type = 0;
 	uint16_t ext_hd = 0;	/* bits vector for extension header filtering */
 	/* end of ipv6 variables */
 
 	int is_ipv4 = 0;
 
 	int done = 0;		/* flag to exit the outer loop */
 	IPFW_RLOCK_TRACKER;
 	bool mem;
 
 	if ((mem = (args->flags & IPFW_ARGS_LENMASK))) {
 		if (args->flags & IPFW_ARGS_ETHER) {
 			eh = (struct ether_header *)args->mem;
 			if (eh->ether_type == htons(ETHERTYPE_VLAN))
 				ip = (struct ip *)
 				    ((struct ether_vlan_header *)eh + 1);
 			else
 				ip = (struct ip *)(eh + 1);
 		} else {
 			eh = NULL;
 			ip = (struct ip *)args->mem;
 		}
 		pktlen = IPFW_ARGS_LENGTH(args->flags);
 		args->f_id.fib = args->ifp->if_fib;	/* best guess */
 	} else {
 		m = args->m;
 		if (m->m_flags & M_SKIP_FIREWALL || (! V_ipfw_vnet_ready))
 			return (IP_FW_PASS);	/* accept */
 		if (args->flags & IPFW_ARGS_ETHER) {
 	                /* We need some amount of data to be contiguous. */
 			if (m->m_len < min(m->m_pkthdr.len, max_protohdr) &&
 			    (args->m = m = m_pullup(m, min(m->m_pkthdr.len,
 			    max_protohdr))) == NULL)
 				goto pullup_failed;
 			eh = mtod(m, struct ether_header *);
 			ip = (struct ip *)(eh + 1);
 		} else {
 			eh = NULL;
 			ip = mtod(m, struct ip *);
 		}
 		pktlen = m->m_pkthdr.len;
 		args->f_id.fib = M_GETFIB(m); /* mbuf not altered */
 	}
 
 	dst_ip.s_addr = 0;		/* make sure it is initialized */
 	src_ip.s_addr = 0;		/* make sure it is initialized */
 	src_port = dst_port = 0;
 
 	DYN_INFO_INIT(&dyn_info);
 /*
  * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous,
  * then it sets p to point at the offset "len" in the mbuf. WARNING: the
  * pointer might become stale after other pullups (but we never use it
  * this way).
  */
-#define PULLUP_TO(_len, p, T)	PULLUP_LEN(_len, p, sizeof(T))
+#define	PULLUP_TO(_len, p, T)	PULLUP_LEN(_len, p, sizeof(T))
 #define	EHLEN	(eh != NULL ? ((char *)ip - (char *)eh) : 0)
-#define PULLUP_LEN(_len, p, T)					\
+#define	_PULLUP_LOCKED(_len, p, T, unlock)			\
 do {								\
 	int x = (_len) + T + EHLEN;				\
 	if (mem) {						\
 		MPASS(pktlen >= x);				\
 		p = (char *)args->mem + (_len) + EHLEN;		\
 	} else {						\
 		if (__predict_false((m)->m_len < x)) {		\
 			args->m = m = m_pullup(m, x);		\
-			if (m == NULL)				\
+			if (m == NULL) {			\
+				unlock;				\
 				goto pullup_failed;		\
+			}					\
 		}						\
 		p = mtod(m, char *) + (_len) + EHLEN;		\
 	}							\
 } while (0)
+
+#define	PULLUP_LEN(_len, p, T)	_PULLUP_LOCKED(_len, p, T, )
+#define	PULLUP_LEN_LOCKED(_len, p, T)	\
+    _PULLUP_LOCKED(_len, p, T, IPFW_PF_RUNLOCK(chain))
 /*
  * In case pointers got stale after pullups, update them.
  */
 #define	UPDATE_POINTERS()					\
 do {								\
 	if (!mem) {						\
 		if (eh != NULL) {				\
 			eh = mtod(m, struct ether_header *);	\
 			ip = (struct ip *)(eh + 1);		\
 		} else						\
 			ip = mtod(m, struct ip *);		\
 		args->m = m;					\
 	}							\
 } while (0)
 
 	/* Identify IP packets and fill up variables. */
 	if (pktlen >= sizeof(struct ip6_hdr) &&
 	    (eh == NULL || eh->ether_type == htons(ETHERTYPE_IPV6)) &&
 	    ip->ip_v == 6) {
 		struct ip6_hdr *ip6 = (struct ip6_hdr *)ip;
 
 		is_ipv6 = 1;
 		args->flags |= IPFW_ARGS_IP6;
 		hlen = sizeof(struct ip6_hdr);
 		proto = ip6->ip6_nxt;
 		/* Search extension headers to find upper layer protocols */
 		while (ulp == NULL && offset == 0) {
 			switch (proto) {
 			case IPPROTO_ICMPV6:
 				PULLUP_TO(hlen, ulp, struct icmp6_hdr);
 				icmp6_type = ICMP6(ulp)->icmp6_type;
 				break;
 
 			case IPPROTO_TCP:
 				PULLUP_TO(hlen, ulp, struct tcphdr);
 				dst_port = TCP(ulp)->th_dport;
 				src_port = TCP(ulp)->th_sport;
 				/* save flags for dynamic rules */
 				args->f_id._flags = TCP(ulp)->th_flags;
 				break;
 
 			case IPPROTO_SCTP:
 				if (pktlen >= hlen + sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd))
 					PULLUP_LEN(hlen, ulp,
 					    sizeof(struct sctphdr) +
 					    sizeof(struct sctp_chunkhdr) +
 					    offsetof(struct sctp_init, a_rwnd));
 				else if (pktlen >= hlen + sizeof(struct sctphdr))
 					PULLUP_LEN(hlen, ulp, pktlen - hlen);
 				else
 					PULLUP_LEN(hlen, ulp,
 					    sizeof(struct sctphdr));
 				src_port = SCTP(ulp)->src_port;
 				dst_port = SCTP(ulp)->dest_port;
 				break;
 
 			case IPPROTO_UDP:
 			case IPPROTO_UDPLITE:
 				PULLUP_TO(hlen, ulp, struct udphdr);
 				dst_port = UDP(ulp)->uh_dport;
 				src_port = UDP(ulp)->uh_sport;
 				break;
 
 			case IPPROTO_HOPOPTS:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_hbh);
 				ext_hd |= EXT_HOPOPTS;
 				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
 				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_ROUTING:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_rthdr);
 				switch (((struct ip6_rthdr *)ulp)->ip6r_type) {
 				case 0:
 					ext_hd |= EXT_RTHDR0;
 					break;
 				case 2:
 					ext_hd |= EXT_RTHDR2;
 					break;
 				default:
 					if (V_fw_verbose)
 						printf("IPFW2: IPV6 - Unknown "
 						    "Routing Header type(%d)\n",
 						    ((struct ip6_rthdr *)
 						    ulp)->ip6r_type);
 					if (V_fw_deny_unknown_exthdrs)
 					    return (IP_FW_DENY);
 					break;
 				}
 				ext_hd |= EXT_ROUTING;
 				hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
 				proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_FRAGMENT:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_frag);
 				ext_hd |= EXT_FRAGMENT;
 				hlen += sizeof (struct ip6_frag);
 				proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
 				offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
 					IP6F_OFF_MASK;
 				ip6f_mf = ((struct ip6_frag *)ulp)->ip6f_offlg &
 					IP6F_MORE_FRAG;
 				if (V_fw_permit_single_frag6 == 0 &&
 				    offset == 0 && ip6f_mf == 0) {
 					if (V_fw_verbose)
 						printf("IPFW2: IPV6 - Invalid "
 						    "Fragment Header\n");
 					if (V_fw_deny_unknown_exthdrs)
 					    return (IP_FW_DENY);
 					break;
 				}
 				args->f_id.extra =
 				    ntohl(((struct ip6_frag *)ulp)->ip6f_ident);
 				ulp = NULL;
 				break;
 
 			case IPPROTO_DSTOPTS:	/* RFC 2460 */
 				PULLUP_TO(hlen, ulp, struct ip6_hbh);
 				ext_hd |= EXT_DSTOPTS;
 				hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
 				proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_AH:	/* RFC 2402 */
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				ext_hd |= EXT_AH;
 				hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
 				proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
 				ulp = NULL;
 				break;
 
 			case IPPROTO_ESP:	/* RFC 2406 */
 				PULLUP_TO(hlen, ulp, uint32_t);	/* SPI, Seq# */
 				/* Anything past Seq# is variable length and
 				 * data past this ext. header is encrypted. */
 				ext_hd |= EXT_ESP;
 				break;
 
 			case IPPROTO_NONE:	/* RFC 2460 */
 				/*
 				 * Packet ends here, and IPv6 header has
 				 * already been pulled up. If ip6e_len!=0
 				 * then octets must be ignored.
 				 */
 				ulp = ip; /* non-NULL to get out of loop. */
 				break;
 
 			case IPPROTO_OSPFIGP:
 				/* XXX OSPF header check? */
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				break;
 
 			case IPPROTO_PIM:
 				/* XXX PIM header check? */
 				PULLUP_TO(hlen, ulp, struct pim);
 				break;
 
 			case IPPROTO_GRE:	/* RFC 1701 */
 				/* XXX GRE header check? */
 				PULLUP_TO(hlen, ulp, struct grehdr);
 				break;
 
 			case IPPROTO_CARP:
 				PULLUP_TO(hlen, ulp, offsetof(
 				    struct carp_header, carp_counter));
 				if (CARP_ADVERTISEMENT !=
 				    ((struct carp_header *)ulp)->carp_type)
 					return (IP_FW_DENY);
 				break;
 
 			case IPPROTO_IPV6:	/* RFC 2893 */
 				PULLUP_TO(hlen, ulp, struct ip6_hdr);
 				break;
 
 			case IPPROTO_IPV4:	/* RFC 2893 */
 				PULLUP_TO(hlen, ulp, struct ip);
 				break;
 
 			default:
 				if (V_fw_verbose)
 					printf("IPFW2: IPV6 - Unknown "
 					    "Extension Header(%d), ext_hd=%x\n",
 					     proto, ext_hd);
 				if (V_fw_deny_unknown_exthdrs)
 				    return (IP_FW_DENY);
 				PULLUP_TO(hlen, ulp, struct ip6_ext);
 				break;
 			} /*switch */
 		}
 		UPDATE_POINTERS();
 		ip6 = (struct ip6_hdr *)ip;
 		args->f_id.addr_type = 6;
 		args->f_id.src_ip6 = ip6->ip6_src;
 		args->f_id.dst_ip6 = ip6->ip6_dst;
 		args->f_id.flow_id6 = ntohl(ip6->ip6_flow);
 		iplen = ntohs(ip6->ip6_plen) + sizeof(*ip6);
 	} else if (pktlen >= sizeof(struct ip) &&
 	    (eh == NULL || eh->ether_type == htons(ETHERTYPE_IP)) &&
 	    ip->ip_v == 4) {
 		is_ipv4 = 1;
 		args->flags |= IPFW_ARGS_IP4;
 		hlen = ip->ip_hl << 2;
 		/*
 		 * Collect parameters into local variables for faster
 		 * matching.
 		 */
 		proto = ip->ip_p;
 		src_ip = ip->ip_src;
 		dst_ip = ip->ip_dst;
 		offset = ntohs(ip->ip_off) & IP_OFFMASK;
 		iplen = ntohs(ip->ip_len);
 
 		if (offset == 0) {
 			switch (proto) {
 			case IPPROTO_TCP:
 				PULLUP_TO(hlen, ulp, struct tcphdr);
 				dst_port = TCP(ulp)->th_dport;
 				src_port = TCP(ulp)->th_sport;
 				/* save flags for dynamic rules */
 				args->f_id._flags = TCP(ulp)->th_flags;
 				break;
 
 			case IPPROTO_SCTP:
 				if (pktlen >= hlen + sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd))
 					PULLUP_LEN(hlen, ulp,
 					    sizeof(struct sctphdr) +
 					    sizeof(struct sctp_chunkhdr) +
 					    offsetof(struct sctp_init, a_rwnd));
 				else if (pktlen >= hlen + sizeof(struct sctphdr))
 					PULLUP_LEN(hlen, ulp, pktlen - hlen);
 				else
 					PULLUP_LEN(hlen, ulp,
 					    sizeof(struct sctphdr));
 				src_port = SCTP(ulp)->src_port;
 				dst_port = SCTP(ulp)->dest_port;
 				break;
 
 			case IPPROTO_UDP:
 			case IPPROTO_UDPLITE:
 				PULLUP_TO(hlen, ulp, struct udphdr);
 				dst_port = UDP(ulp)->uh_dport;
 				src_port = UDP(ulp)->uh_sport;
 				break;
 
 			case IPPROTO_ICMP:
 				PULLUP_TO(hlen, ulp, struct icmphdr);
 				//args->f_id.flags = ICMP(ulp)->icmp_type;
 				break;
 
 			default:
 				break;
 			}
 		} else {
 			if (offset == 1 && proto == IPPROTO_TCP) {
 				/* RFC 3128 */
 				goto pullup_failed;
 			}
 		}
 
 		UPDATE_POINTERS();
 		args->f_id.addr_type = 4;
 		args->f_id.src_ip = ntohl(src_ip.s_addr);
 		args->f_id.dst_ip = ntohl(dst_ip.s_addr);
 	} else {
 		proto = 0;
 		dst_ip.s_addr = src_ip.s_addr = 0;
 
 		args->f_id.addr_type = 1; /* XXX */
 	}
 #undef PULLUP_TO
 	pktlen = iplen < pktlen ? iplen: pktlen;
 
 	/* Properly initialize the rest of f_id */
 	args->f_id.proto = proto;
 	args->f_id.src_port = src_port = ntohs(src_port);
 	args->f_id.dst_port = dst_port = ntohs(dst_port);
 
 	IPFW_PF_RLOCK(chain);
 	if (! V_ipfw_vnet_ready) { /* shutting down, leave NOW. */
 		IPFW_PF_RUNLOCK(chain);
 		return (IP_FW_PASS);	/* accept */
 	}
 	if (args->flags & IPFW_ARGS_REF) {
 		/*
 		 * Packet has already been tagged as a result of a previous
 		 * match on rule args->rule aka args->rule_id (PIPE, QUEUE,
 		 * REASS, NETGRAPH, DIVERT/TEE...)
 		 * Validate the slot and continue from the next one
 		 * if still present, otherwise do a lookup.
 		 */
 		f_pos = (args->rule.chain_id == chain->id) ?
 		    args->rule.slot :
 		    ipfw_find_rule(chain, args->rule.rulenum,
 			args->rule.rule_id);
 	} else {
 		f_pos = 0;
 	}
 
 	if (args->flags & IPFW_ARGS_IN) {
 		iif = args->ifp;
 		oif = NULL;
 	} else {
 		MPASS(args->flags & IPFW_ARGS_OUT);
 		iif = mem ? NULL : m_rcvif(m);
 		oif = args->ifp;
 	}
 
 	/*
 	 * Now scan the rules, and parse microinstructions for each rule.
 	 * We have two nested loops and an inner switch. Sometimes we
 	 * need to break out of one or both loops, or re-enter one of
 	 * the loops with updated variables. Loop variables are:
 	 *
 	 *	f_pos (outer loop) points to the current rule.
 	 *		On output it points to the matching rule.
 	 *	done (outer loop) is used as a flag to break the loop.
 	 *	l (inner loop)	residual length of current rule.
 	 *		cmd points to the current microinstruction.
 	 *
 	 * We break the inner loop by setting l=0 and possibly
 	 * cmdlen=0 if we don't want to advance cmd.
 	 * We break the outer loop by setting done=1
 	 * We can restart the inner loop by setting l>0 and f_pos, f, cmd
 	 * as needed.
 	 */
 	for (; f_pos < chain->n_rules; f_pos++) {
 		ipfw_insn *cmd;
 		uint32_t tablearg = 0;
 		int l, cmdlen, skip_or; /* skip rest of OR block */
 		struct ip_fw *f;
 
 		f = chain->map[f_pos];
 		if (V_set_disable & (1 << f->set) )
 			continue;
 
 		skip_or = 0;
 		for (l = f->cmd_len, cmd = f->cmd ; l > 0 ;
 		    l -= cmdlen, cmd += cmdlen) {
 			int match;
 
 			/*
 			 * check_body is a jump target used when we find a
 			 * CHECK_STATE, and need to jump to the body of
 			 * the target rule.
 			 */
 
 /* check_body: */
 			cmdlen = F_LEN(cmd);
 			/*
 			 * An OR block (insn_1 || .. || insn_n) has the
 			 * F_OR bit set in all but the last instruction.
 			 * The first match will set "skip_or", and cause
 			 * the following instructions to be skipped until
 			 * past the one with the F_OR bit clear.
 			 */
 			if (skip_or) {		/* skip this instruction */
 				if ((cmd->len & F_OR) == 0)
 					skip_or = 0;	/* next one is good */
 				continue;
 			}
 			match = 0; /* set to 1 if we succeed */
 
 			switch (cmd->opcode) {
 			/*
 			 * The first set of opcodes compares the packet's
 			 * fields with some pattern, setting 'match' if a
 			 * match is found. At the end of the loop there is
 			 * logic to deal with F_NOT and F_OR flags associated
 			 * with the opcode.
 			 */
 			case O_NOP:
 				match = 1;
 				break;
 
 			case O_FORWARD_MAC:
 				printf("ipfw: opcode %d unimplemented\n",
 				    cmd->opcode);
 				break;
 
 			case O_GID:
 			case O_UID:
 			case O_JAIL:
 				/*
 				 * We only check offset == 0 && proto != 0,
 				 * as this ensures that we have a
 				 * packet with the ports info.
 				 */
 				if (offset != 0)
 					break;
 				if (proto == IPPROTO_TCP ||
 				    proto == IPPROTO_UDP ||
 				    proto == IPPROTO_UDPLITE)
 					match = check_uidgid(
 						    (ipfw_insn_u32 *)cmd,
 						    args, &ucred_lookup,
 #ifdef __FreeBSD__
 						    &ucred_cache);
 #else
 						    (void *)&ucred_cache);
 #endif
 				break;
 
 			case O_RECV:
 				match = iface_match(iif, (ipfw_insn_if *)cmd,
 				    chain, &tablearg);
 				break;
 
 			case O_XMIT:
 				match = iface_match(oif, (ipfw_insn_if *)cmd,
 				    chain, &tablearg);
 				break;
 
 			case O_VIA:
 				match = iface_match(args->ifp,
 				    (ipfw_insn_if *)cmd, chain, &tablearg);
 				break;
 
 			case O_MACADDR2:
 				if (args->flags & IPFW_ARGS_ETHER) {
 					u_int32_t *want = (u_int32_t *)
 						((ipfw_insn_mac *)cmd)->addr;
 					u_int32_t *mask = (u_int32_t *)
 						((ipfw_insn_mac *)cmd)->mask;
 					u_int32_t *hdr = (u_int32_t *)eh;
 
 					match =
 					    ( want[0] == (hdr[0] & mask[0]) &&
 					      want[1] == (hdr[1] & mask[1]) &&
 					      want[2] == (hdr[2] & mask[2]) );
 				}
 				break;
 
 			case O_MAC_TYPE:
 				if (args->flags & IPFW_ARGS_ETHER) {
 					u_int16_t *p =
 					    ((ipfw_insn_u16 *)cmd)->ports;
 					int i;
 
 					for (i = cmdlen - 1; !match && i>0;
 					    i--, p += 2)
 						match =
 						    (ntohs(eh->ether_type) >=
 						    p[0] &&
 						    ntohs(eh->ether_type) <=
 						    p[1]);
 				}
 				break;
 
 			case O_FRAG:
 				match = (offset != 0);
 				break;
 
 			case O_IN:	/* "out" is "not in" */
 				match = (oif == NULL);
 				break;
 
 			case O_LAYER2:
 				match = (args->flags & IPFW_ARGS_ETHER);
 				break;
 
 			case O_DIVERTED:
 				if ((args->flags & IPFW_ARGS_REF) == 0)
 					break;
 				/*
 				 * For diverted packets, args->rule.info
 				 * contains the divert port (in host format)
 				 * reason and direction.
 				 */
 				match = ((args->rule.info & IPFW_IS_MASK) ==
 				    IPFW_IS_DIVERT) && (
 				    ((args->rule.info & IPFW_INFO_IN) ?
 					1: 2) & cmd->arg1);
 				break;
 
 			case O_PROTO:
 				/*
 				 * We do not allow an arg of 0 so the
 				 * check of "proto" only suffices.
 				 */
 				match = (proto == cmd->arg1);
 				break;
 
 			case O_IP_SRC:
 				match = is_ipv4 &&
 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
 				    src_ip.s_addr);
 				break;
 
 			case O_IP_DST_LOOKUP:
 			{
 				void *pkey;
 				uint32_t vidx, key;
 				uint16_t keylen;
 
 				if (cmdlen > F_INSN_SIZE(ipfw_insn_u32)) {
 					/* Determine lookup key type */
 					vidx = ((ipfw_insn_u32 *)cmd)->d[1];
 					if (vidx != 4 /* uid */ &&
 					    vidx != 5 /* jail */ &&
 					    is_ipv6 == 0 && is_ipv4 == 0)
 						break;
 					/* Determine key length */
 					if (vidx == 0 /* dst-ip */ ||
 					    vidx == 1 /* src-ip */)
 						keylen = is_ipv6 ?
 						    sizeof(struct in6_addr):
 						    sizeof(in_addr_t);
 					else {
 						keylen = sizeof(key);
 						pkey = &key;
 					}
 					if (vidx == 0 /* dst-ip */)
 						pkey = is_ipv4 ? (void *)&dst_ip:
 						    (void *)&args->f_id.dst_ip6;
 					else if (vidx == 1 /* src-ip */)
 						pkey = is_ipv4 ? (void *)&src_ip:
 						    (void *)&args->f_id.src_ip6;
 					else if (vidx == 6 /* dscp */) {
 						if (is_ipv4)
 							key = ip->ip_tos >> 2;
 						else {
 							key = args->f_id.flow_id6;
 							key = (key & 0x0f) << 2 |
 							    (key & 0xf000) >> 14;
 						}
 						key &= 0x3f;
 					} else if (vidx == 2 /* dst-port */ ||
 					    vidx == 3 /* src-port */) {
 						/* Skip fragments */
 						if (offset != 0)
 							break;
 						/* Skip proto without ports */
 						if (proto != IPPROTO_TCP &&
 						    proto != IPPROTO_UDP &&
 						    proto != IPPROTO_UDPLITE &&
 						    proto != IPPROTO_SCTP)
 							break;
 						if (vidx == 2 /* dst-port */)
 							key = dst_port;
 						else
 							key = src_port;
 					}
 #ifndef USERSPACE
 					else if (vidx == 4 /* uid */ ||
 					    vidx == 5 /* jail */) {
 						check_uidgid(
 						    (ipfw_insn_u32 *)cmd,
 						    args, &ucred_lookup,
 #ifdef __FreeBSD__
 						    &ucred_cache);
 						if (vidx == 4 /* uid */)
 							key = ucred_cache->cr_uid;
 						else if (vidx == 5 /* jail */)
 							key = ucred_cache->cr_prison->pr_id;
 #else /* !__FreeBSD__ */
 						    (void *)&ucred_cache);
 						if (vidx == 4 /* uid */)
 							key = ucred_cache.uid;
 						else if (vidx == 5 /* jail */)
 							key = ucred_cache.xid;
 #endif /* !__FreeBSD__ */
 					}
 #endif /* !USERSPACE */
 					else
 						break;
 					match = ipfw_lookup_table(chain,
 					    cmd->arg1, keylen, pkey, &vidx);
 					if (!match)
 						break;
 					tablearg = vidx;
 					break;
 				}
 				/* cmdlen =< F_INSN_SIZE(ipfw_insn_u32) */
 				/* FALLTHROUGH */
 			}
 			case O_IP_SRC_LOOKUP:
 			{
 				void *pkey;
 				uint32_t vidx;
 				uint16_t keylen;
 
 				if (is_ipv4) {
 					keylen = sizeof(in_addr_t);
 					if (cmd->opcode == O_IP_DST_LOOKUP)
 						pkey = &dst_ip;
 					else
 						pkey = &src_ip;
 				} else if (is_ipv6) {
 					keylen = sizeof(struct in6_addr);
 					if (cmd->opcode == O_IP_DST_LOOKUP)
 						pkey = &args->f_id.dst_ip6;
 					else
 						pkey = &args->f_id.src_ip6;
 				} else
 					break;
 				match = ipfw_lookup_table(chain, cmd->arg1,
 				    keylen, pkey, &vidx);
 				if (!match)
 					break;
 				if (cmdlen == F_INSN_SIZE(ipfw_insn_u32)) {
 					match = ((ipfw_insn_u32 *)cmd)->d[0] ==
 					    TARG_VAL(chain, vidx, tag);
 					if (!match)
 						break;
 				}
 				tablearg = vidx;
 				break;
 			}
 
 			case O_IP_FLOW_LOOKUP:
 				{
 					uint32_t v = 0;
 					match = ipfw_lookup_table(chain,
 					    cmd->arg1, 0, &args->f_id, &v);
 					if (cmdlen == F_INSN_SIZE(ipfw_insn_u32))
 						match = ((ipfw_insn_u32 *)cmd)->d[0] ==
 						    TARG_VAL(chain, v, tag);
 					if (match)
 						tablearg = v;
 				}
 				break;
 			case O_IP_SRC_MASK:
 			case O_IP_DST_MASK:
 				if (is_ipv4) {
 				    uint32_t a =
 					(cmd->opcode == O_IP_DST_MASK) ?
 					    dst_ip.s_addr : src_ip.s_addr;
 				    uint32_t *p = ((ipfw_insn_u32 *)cmd)->d;
 				    int i = cmdlen-1;
 
 				    for (; !match && i>0; i-= 2, p+= 2)
 					match = (p[0] == (a & p[1]));
 				}
 				break;
 
 			case O_IP_SRC_ME:
 				if (is_ipv4) {
 					match = in_localip(src_ip);
 					break;
 				}
 #ifdef INET6
 				/* FALLTHROUGH */
 			case O_IP6_SRC_ME:
 				match = is_ipv6 &&
 				    ipfw_localip6(&args->f_id.src_ip6);
 #endif
 				break;
 
 			case O_IP_DST_SET:
 			case O_IP_SRC_SET:
 				if (is_ipv4) {
 					u_int32_t *d = (u_int32_t *)(cmd+1);
 					u_int32_t addr =
 					    cmd->opcode == O_IP_DST_SET ?
 						args->f_id.dst_ip :
 						args->f_id.src_ip;
 
 					    if (addr < d[0])
 						    break;
 					    addr -= d[0]; /* subtract base */
 					    match = (addr < cmd->arg1) &&
 						( d[ 1 + (addr>>5)] &
 						  (1<<(addr & 0x1f)) );
 				}
 				break;
 
 			case O_IP_DST:
 				match = is_ipv4 &&
 				    (((ipfw_insn_ip *)cmd)->addr.s_addr ==
 				    dst_ip.s_addr);
 				break;
 
 			case O_IP_DST_ME:
 				if (is_ipv4) {
 					match = in_localip(dst_ip);
 					break;
 				}
 #ifdef INET6
 				/* FALLTHROUGH */
 			case O_IP6_DST_ME:
 				match = is_ipv6 &&
 				    ipfw_localip6(&args->f_id.dst_ip6);
 #endif
 				break;
 
 
 			case O_IP_SRCPORT:
 			case O_IP_DSTPORT:
 				/*
 				 * offset == 0 && proto != 0 is enough
 				 * to guarantee that we have a
 				 * packet with port info.
 				 */
 				if ((proto == IPPROTO_UDP ||
 				    proto == IPPROTO_UDPLITE ||
 				    proto == IPPROTO_TCP ||
 				    proto == IPPROTO_SCTP) && offset == 0) {
 					u_int16_t x =
 					    (cmd->opcode == O_IP_SRCPORT) ?
 						src_port : dst_port ;
 					u_int16_t *p =
 					    ((ipfw_insn_u16 *)cmd)->ports;
 					int i;
 
 					for (i = cmdlen - 1; !match && i>0;
 					    i--, p += 2)
 						match = (x>=p[0] && x<=p[1]);
 				}
 				break;
 
 			case O_ICMPTYPE:
 				match = (offset == 0 && proto==IPPROTO_ICMP &&
 				    icmptype_match(ICMP(ulp), (ipfw_insn_u32 *)cmd) );
 				break;
 
 #ifdef INET6
 			case O_ICMP6TYPE:
 				match = is_ipv6 && offset == 0 &&
 				    proto==IPPROTO_ICMPV6 &&
 				    icmp6type_match(
 					ICMP6(ulp)->icmp6_type,
 					(ipfw_insn_u32 *)cmd);
 				break;
 #endif /* INET6 */
 
 			case O_IPOPT:
 				match = (is_ipv4 &&
 				    ipopts_match(ip, cmd) );
 				break;
 
 			case O_IPVER:
 				match = (is_ipv4 &&
 				    cmd->arg1 == ip->ip_v);
 				break;
 
 			case O_IPID:
 			case O_IPTTL:
 				if (!is_ipv4)
 					break;
 			case O_IPLEN:
 				{	/* only for IP packets */
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 
 				    if (cmd->opcode == O_IPLEN)
 					x = iplen;
 				    else if (cmd->opcode == O_IPTTL)
 					x = ip->ip_ttl;
 				    else /* must be IPID */
 					x = ntohs(ip->ip_id);
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* otherwise we have ranges */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i>0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_IPPRECEDENCE:
 				match = (is_ipv4 &&
 				    (cmd->arg1 == (ip->ip_tos & 0xe0)) );
 				break;
 
 			case O_IPTOS:
 				match = (is_ipv4 &&
 				    flags_match(cmd, ip->ip_tos));
 				break;
 
 			case O_DSCP:
 			    {
 				uint32_t *p;
 				uint16_t x;
 
 				p = ((ipfw_insn_u32 *)cmd)->d;
 
 				if (is_ipv4)
 					x = ip->ip_tos >> 2;
 				else if (is_ipv6) {
 					uint8_t *v;
 					v = &((struct ip6_hdr *)ip)->ip6_vfc;
 					x = (*v & 0x0F) << 2;
 					v++;
 					x |= *v >> 6;
 				} else
 					break;
 
 				/* DSCP bitmask is stored as low_u32 high_u32 */
 				if (x >= 32)
 					match = *(p + 1) & (1 << (x - 32));
 				else
 					match = *p & (1 << x);
 			    }
 				break;
 
 			case O_TCPDATALEN:
 				if (proto == IPPROTO_TCP && offset == 0) {
 				    struct tcphdr *tcp;
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 #ifdef INET6
 				    if (is_ipv6) {
 					    struct ip6_hdr *ip6;
 
 					    ip6 = (struct ip6_hdr *)ip;
 					    if (ip6->ip6_plen == 0) {
 						    /*
 						     * Jumbo payload is not
 						     * supported by this
 						     * opcode.
 						     */
 						    break;
 					    }
 					    x = iplen - hlen;
 				    } else
 #endif /* INET6 */
 					    x = iplen - (ip->ip_hl << 2);
 				    tcp = TCP(ulp);
 				    x -= tcp->th_off << 2;
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* otherwise we have ranges */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i>0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_TCPFLAGS:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    flags_match(cmd, TCP(ulp)->th_flags));
 				break;
 
 			case O_TCPOPTS:
 				if (proto == IPPROTO_TCP && offset == 0 && ulp){
-					PULLUP_LEN(hlen, ulp,
+					PULLUP_LEN_LOCKED(hlen, ulp,
 					    (TCP(ulp)->th_off << 2));
 					match = tcpopts_match(TCP(ulp), cmd);
 				}
 				break;
 
 			case O_TCPSEQ:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
 					TCP(ulp)->th_seq);
 				break;
 
 			case O_TCPACK:
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    ((ipfw_insn_u32 *)cmd)->d[0] ==
 					TCP(ulp)->th_ack);
 				break;
 
 			case O_TCPMSS:
 				if (proto == IPPROTO_TCP &&
 				    (args->f_id._flags & TH_SYN) != 0 &&
 				    ulp != NULL) {
 					uint16_t mss, *p;
 					int i;
 
-					PULLUP_LEN(hlen, ulp,
+					PULLUP_LEN_LOCKED(hlen, ulp,
 					    (TCP(ulp)->th_off << 2));
 					if ((tcpopts_parse(TCP(ulp), &mss) &
 					    IP_FW_TCPOPT_MSS) == 0)
 						break;
 					if (cmdlen == 1) {
 						match = (cmd->arg1 == mss);
 						break;
 					}
 					/* Otherwise we have ranges. */
 					p = ((ipfw_insn_u16 *)cmd)->ports;
 					i = cmdlen - 1;
 					for (; !match && i > 0; i--, p += 2)
 						match = (mss >= p[0] &&
 						    mss <= p[1]);
 				}
 				break;
 
 			case O_TCPWIN:
 				if (proto == IPPROTO_TCP && offset == 0) {
 				    uint16_t x;
 				    uint16_t *p;
 				    int i;
 
 				    x = ntohs(TCP(ulp)->th_win);
 				    if (cmdlen == 1) {
 					match = (cmd->arg1 == x);
 					break;
 				    }
 				    /* Otherwise we have ranges. */
 				    p = ((ipfw_insn_u16 *)cmd)->ports;
 				    i = cmdlen - 1;
 				    for (; !match && i > 0; i--, p += 2)
 					match = (x >= p[0] && x <= p[1]);
 				}
 				break;
 
 			case O_ESTAB:
 				/* reject packets which have SYN only */
 				/* XXX should i also check for TH_ACK ? */
 				match = (proto == IPPROTO_TCP && offset == 0 &&
 				    (TCP(ulp)->th_flags &
 				     (TH_RST | TH_ACK | TH_SYN)) != TH_SYN);
 				break;
 
 			case O_ALTQ: {
 				struct pf_mtag *at;
 				struct m_tag *mtag;
 				ipfw_insn_altq *altq = (ipfw_insn_altq *)cmd;
 
 				/*
 				 * ALTQ uses mbuf tags from another
 				 * packet filtering system - pf(4).
 				 * We allocate a tag in its format
 				 * and fill it in, pretending to be pf(4).
 				 */
 				match = 1;
 				at = pf_find_mtag(m);
 				if (at != NULL && at->qid != 0)
 					break;
 				mtag = m_tag_get(PACKET_TAG_PF,
 				    sizeof(struct pf_mtag), M_NOWAIT | M_ZERO);
 				if (mtag == NULL) {
 					/*
 					 * Let the packet fall back to the
 					 * default ALTQ.
 					 */
 					break;
 				}
 				m_tag_prepend(m, mtag);
 				at = (struct pf_mtag *)(mtag + 1);
 				at->qid = altq->qid;
 				at->hdr = ip;
 				break;
 			}
 
 			case O_LOG:
 				ipfw_log(chain, f, hlen, args,
 				    offset | ip6f_mf, tablearg, ip);
 				match = 1;
 				break;
 
 			case O_PROB:
 				match = (random()<((ipfw_insn_u32 *)cmd)->d[0]);
 				break;
 
 			case O_VERREVPATH:
 				/* Outgoing packets automatically pass/match */
 				match = (args->flags & IPFW_ARGS_OUT ||
 				    (
 #ifdef INET6
 				    is_ipv6 ?
 					verify_path6(&(args->f_id.src_ip6),
 					    iif, args->f_id.fib) :
 #endif
 				    verify_path(src_ip, iif, args->f_id.fib)));
 				break;
 
 			case O_VERSRCREACH:
 				/* Outgoing packets automatically pass/match */
 				match = (hlen > 0 && ((oif != NULL) || (
 #ifdef INET6
 				    is_ipv6 ?
 				        verify_path6(&(args->f_id.src_ip6),
 				            NULL, args->f_id.fib) :
 #endif
 				    verify_path(src_ip, NULL, args->f_id.fib))));
 				break;
 
 			case O_ANTISPOOF:
 				/* Outgoing packets automatically pass/match */
 				if (oif == NULL && hlen > 0 &&
 				    (  (is_ipv4 && in_localaddr(src_ip))
 #ifdef INET6
 				    || (is_ipv6 &&
 				        in6_localaddr(&(args->f_id.src_ip6)))
 #endif
 				    ))
 					match =
 #ifdef INET6
 					    is_ipv6 ? verify_path6(
 					        &(args->f_id.src_ip6), iif,
 						args->f_id.fib) :
 #endif
 					    verify_path(src_ip, iif,
 					        args->f_id.fib);
 				else
 					match = 1;
 				break;
 
 			case O_IPSEC:
 				match = (m_tag_find(m,
 				    PACKET_TAG_IPSEC_IN_DONE, NULL) != NULL);
 				/* otherwise no match */
 				break;
 
 #ifdef INET6
 			case O_IP6_SRC:
 				match = is_ipv6 &&
 				    IN6_ARE_ADDR_EQUAL(&args->f_id.src_ip6,
 				    &((ipfw_insn_ip6 *)cmd)->addr6);
 				break;
 
 			case O_IP6_DST:
 				match = is_ipv6 &&
 				IN6_ARE_ADDR_EQUAL(&args->f_id.dst_ip6,
 				    &((ipfw_insn_ip6 *)cmd)->addr6);
 				break;
 			case O_IP6_SRC_MASK:
 			case O_IP6_DST_MASK:
 				if (is_ipv6) {
 					int i = cmdlen - 1;
 					struct in6_addr p;
 					struct in6_addr *d =
 					    &((ipfw_insn_ip6 *)cmd)->addr6;
 
 					for (; !match && i > 0; d += 2,
 					    i -= F_INSN_SIZE(struct in6_addr)
 					    * 2) {
 						p = (cmd->opcode ==
 						    O_IP6_SRC_MASK) ?
 						    args->f_id.src_ip6:
 						    args->f_id.dst_ip6;
 						APPLY_MASK(&p, &d[1]);
 						match =
 						    IN6_ARE_ADDR_EQUAL(&d[0],
 						    &p);
 					}
 				}
 				break;
 
 			case O_FLOW6ID:
 				match = is_ipv6 &&
 				    flow6id_match(args->f_id.flow_id6,
 				    (ipfw_insn_u32 *) cmd);
 				break;
 
 			case O_EXT_HDR:
 				match = is_ipv6 &&
 				    (ext_hd & ((ipfw_insn *) cmd)->arg1);
 				break;
 
 			case O_IP6:
 				match = is_ipv6;
 				break;
 #endif
 
 			case O_IP4:
 				match = is_ipv4;
 				break;
 
 			case O_TAG: {
 				struct m_tag *mtag;
 				uint32_t tag = TARG(cmd->arg1, tag);
 
 				/* Packet is already tagged with this tag? */
 				mtag = m_tag_locate(m, MTAG_IPFW, tag, NULL);
 
 				/* We have `untag' action when F_NOT flag is
 				 * present. And we must remove this mtag from
 				 * mbuf and reset `match' to zero (`match' will
 				 * be inversed later).
 				 * Otherwise we should allocate new mtag and
 				 * push it into mbuf.
 				 */
 				if (cmd->len & F_NOT) { /* `untag' action */
 					if (mtag != NULL)
 						m_tag_delete(m, mtag);
 					match = 0;
 				} else {
 					if (mtag == NULL) {
 						mtag = m_tag_alloc( MTAG_IPFW,
 						    tag, 0, M_NOWAIT);
 						if (mtag != NULL)
 							m_tag_prepend(m, mtag);
 					}
 					match = 1;
 				}
 				break;
 			}
 
 			case O_FIB: /* try match the specified fib */
 				if (args->f_id.fib == cmd->arg1)
 					match = 1;
 				break;
 
 			case O_SOCKARG:	{
 #ifndef USERSPACE	/* not supported in userspace */
 				struct inpcb *inp = args->inp;
 				struct inpcbinfo *pi;
 				
 				if (is_ipv6) /* XXX can we remove this ? */
 					break;
 
 				if (proto == IPPROTO_TCP)
 					pi = &V_tcbinfo;
 				else if (proto == IPPROTO_UDP)
 					pi = &V_udbinfo;
 				else if (proto == IPPROTO_UDPLITE)
 					pi = &V_ulitecbinfo;
 				else
 					break;
 
 				/*
 				 * XXXRW: so_user_cookie should almost
 				 * certainly be inp_user_cookie?
 				 */
 
 				/* For an incoming packet, look up the
 				inpcb using the src/dest ip/port tuple */
 				if (inp == NULL) {
 					inp = in_pcblookup(pi, 
 						src_ip, htons(src_port),
 						dst_ip, htons(dst_port),
 						INPLOOKUP_RLOCKPCB, NULL);
 					if (inp != NULL) {
 						tablearg =
 						    inp->inp_socket->so_user_cookie;
 						if (tablearg)
 							match = 1;
 						INP_RUNLOCK(inp);
 					}
 				} else {
 					if (inp->inp_socket) {
 						tablearg =
 						    inp->inp_socket->so_user_cookie;
 						if (tablearg)
 							match = 1;
 					}
 				}
 #endif /* !USERSPACE */
 				break;
 			}
 
 			case O_TAGGED: {
 				struct m_tag *mtag;
 				uint32_t tag = TARG(cmd->arg1, tag);
 
 				if (cmdlen == 1) {
 					match = m_tag_locate(m, MTAG_IPFW,
 					    tag, NULL) != NULL;
 					break;
 				}
 
 				/* we have ranges */
 				for (mtag = m_tag_first(m);
 				    mtag != NULL && !match;
 				    mtag = m_tag_next(m, mtag)) {
 					uint16_t *p;
 					int i;
 
 					if (mtag->m_tag_cookie != MTAG_IPFW)
 						continue;
 
 					p = ((ipfw_insn_u16 *)cmd)->ports;
 					i = cmdlen - 1;
 					for(; !match && i > 0; i--, p += 2)
 						match =
 						    mtag->m_tag_id >= p[0] &&
 						    mtag->m_tag_id <= p[1];
 				}
 				break;
 			}
 				
 			/*
 			 * The second set of opcodes represents 'actions',
 			 * i.e. the terminal part of a rule once the packet
 			 * matches all previous patterns.
 			 * Typically there is only one action for each rule,
 			 * and the opcode is stored at the end of the rule
 			 * (but there are exceptions -- see below).
 			 *
 			 * In general, here we set retval and terminate the
 			 * outer loop (would be a 'break 3' in some language,
 			 * but we need to set l=0, done=1)
 			 *
 			 * Exceptions:
 			 * O_COUNT and O_SKIPTO actions:
 			 *   instead of terminating, we jump to the next rule
 			 *   (setting l=0), or to the SKIPTO target (setting
 			 *   f/f_len, cmd and l as needed), respectively.
 			 *
 			 * O_TAG, O_LOG and O_ALTQ action parameters:
 			 *   perform some action and set match = 1;
 			 *
 			 * O_LIMIT and O_KEEP_STATE: these opcodes are
 			 *   not real 'actions', and are stored right
 			 *   before the 'action' part of the rule (one
 			 *   exception is O_SKIP_ACTION which could be
 			 *   between these opcodes and 'action' one).
 			 *   These opcodes try to install an entry in the
 			 *   state tables; if successful, we continue with
 			 *   the next opcode (match=1; break;), otherwise
 			 *   the packet must be dropped (set retval,
 			 *   break loops with l=0, done=1)
 			 *
 			 * O_PROBE_STATE and O_CHECK_STATE: these opcodes
 			 *   cause a lookup of the state table, and a jump
 			 *   to the 'action' part of the parent rule
 			 *   if an entry is found, or
 			 *   (CHECK_STATE only) a jump to the next rule if
 			 *   the entry is not found.
 			 *   The result of the lookup is cached so that
 			 *   further instances of these opcodes become NOPs.
 			 *   The jump to the next rule is done by setting
 			 *   l=0, cmdlen=0.
 			 *
 			 * O_SKIP_ACTION: this opcode is not a real 'action'
 			 *  either, and is stored right before the 'action'
 			 *  part of the rule, right after the O_KEEP_STATE
 			 *  opcode. It causes match failure so the real
 			 *  'action' could be executed only if the rule
 			 *  is checked via dynamic rule from the state
 			 *  table, as in such case execution starts
 			 *  from the true 'action' opcode directly.
 			 *   
 			 */
 			case O_LIMIT:
 			case O_KEEP_STATE:
 				if (ipfw_dyn_install_state(chain, f,
 				    (ipfw_insn_limit *)cmd, args, ulp,
 				    pktlen, &dyn_info, tablearg)) {
 					/* error or limit violation */
 					retval = IP_FW_DENY;
 					l = 0;	/* exit inner loop */
 					done = 1; /* exit outer loop */
 				}
 				match = 1;
 				break;
 
 			case O_PROBE_STATE:
 			case O_CHECK_STATE:
 				/*
 				 * dynamic rules are checked at the first
 				 * keep-state or check-state occurrence,
 				 * with the result being stored in dyn_info.
 				 * The compiler introduces a PROBE_STATE
 				 * instruction for us when we have a
 				 * KEEP_STATE (because PROBE_STATE needs
 				 * to be run first).
 				 */
 				if (DYN_LOOKUP_NEEDED(&dyn_info, cmd) &&
 				    (q = ipfw_dyn_lookup_state(args, ulp,
 				    pktlen, cmd, &dyn_info)) != NULL) {
 					/*
 					 * Found dynamic entry, jump to the
 					 * 'action' part of the parent rule
 					 * by setting f, cmd, l and clearing
 					 * cmdlen.
 					 */
 					f = q;
 					f_pos = dyn_info.f_pos;
 					cmd = ACTION_PTR(f);
 					l = f->cmd_len - f->act_ofs;
 					cmdlen = 0;
 					match = 1;
 					break;
 				}
 				/*
 				 * Dynamic entry not found. If CHECK_STATE,
 				 * skip to next rule, if PROBE_STATE just
 				 * ignore and continue with next opcode.
 				 */
 				if (cmd->opcode == O_CHECK_STATE)
 					l = 0;	/* exit inner loop */
 				match = 1;
 				break;
 
 			case O_SKIP_ACTION:
 				match = 0;	/* skip to the next rule */
 				l = 0;		/* exit inner loop */
 				break;
 
 			case O_ACCEPT:
 				retval = 0;	/* accept */
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				break;
 
 			case O_PIPE:
 			case O_QUEUE:
 				set_match(args, f_pos, chain);
 				args->rule.info = TARG(cmd->arg1, pipe);
 				if (cmd->opcode == O_PIPE)
 					args->rule.info |= IPFW_IS_PIPE;
 				if (V_fw_one_pass)
 					args->rule.info |= IPFW_ONEPASS;
 				retval = IP_FW_DUMMYNET;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				break;
 
 			case O_DIVERT:
 			case O_TEE:
 				if (args->flags & IPFW_ARGS_ETHER)
 					break;	/* not on layer 2 */
 				/* otherwise this is terminal */
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				retval = (cmd->opcode == O_DIVERT) ?
 					IP_FW_DIVERT : IP_FW_TEE;
 				set_match(args, f_pos, chain);
 				args->rule.info = TARG(cmd->arg1, divert);
 				break;
 
 			case O_COUNT:
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				l = 0;		/* exit inner loop */
 				break;
 
 			case O_SKIPTO:
 			    IPFW_INC_RULE_COUNTER(f, pktlen);
 			    f_pos = JUMP(chain, f, cmd->arg1, tablearg, 0);
 			    /*
 			     * Skip disabled rules, and re-enter
 			     * the inner loop with the correct
 			     * f_pos, f, l and cmd.
 			     * Also clear cmdlen and skip_or
 			     */
 			    for (; f_pos < chain->n_rules - 1 &&
 				    (V_set_disable &
 				     (1 << chain->map[f_pos]->set));
 				    f_pos++)
 				;
 			    /* Re-enter the inner loop at the skipto rule. */
 			    f = chain->map[f_pos];
 			    l = f->cmd_len;
 			    cmd = f->cmd;
 			    match = 1;
 			    cmdlen = 0;
 			    skip_or = 0;
 			    continue;
 			    break;	/* not reached */
 
 			case O_CALLRETURN: {
 				/*
 				 * Implementation of `subroutine' call/return,
 				 * in the stack carried in an mbuf tag. This
 				 * is different from `skipto' in that any call
 				 * address is possible (`skipto' must prevent
 				 * backward jumps to avoid endless loops).
 				 * We have `return' action when F_NOT flag is
 				 * present. The `m_tag_id' field is used as
 				 * stack pointer.
 				 */
 				struct m_tag *mtag;
 				uint16_t jmpto, *stack;
 
 #define	IS_CALL		((cmd->len & F_NOT) == 0)
 #define	IS_RETURN	((cmd->len & F_NOT) != 0)
 				/*
 				 * Hand-rolled version of m_tag_locate() with
 				 * wildcard `type'.
 				 * If not already tagged, allocate new tag.
 				 */
 				mtag = m_tag_first(m);
 				while (mtag != NULL) {
 					if (mtag->m_tag_cookie ==
 					    MTAG_IPFW_CALL)
 						break;
 					mtag = m_tag_next(m, mtag);
 				}
 				if (mtag == NULL && IS_CALL) {
 					mtag = m_tag_alloc(MTAG_IPFW_CALL, 0,
 					    IPFW_CALLSTACK_SIZE *
 					    sizeof(uint16_t), M_NOWAIT);
 					if (mtag != NULL)
 						m_tag_prepend(m, mtag);
 				}
 
 				/*
 				 * On error both `call' and `return' just
 				 * continue with next rule.
 				 */
 				if (IS_RETURN && (mtag == NULL ||
 				    mtag->m_tag_id == 0)) {
 					l = 0;		/* exit inner loop */
 					break;
 				}
 				if (IS_CALL && (mtag == NULL ||
 				    mtag->m_tag_id >= IPFW_CALLSTACK_SIZE)) {
 					printf("ipfw: call stack error, "
 					    "go to next rule\n");
 					l = 0;		/* exit inner loop */
 					break;
 				}
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				stack = (uint16_t *)(mtag + 1);
 
 				/*
 				 * The `call' action may use cached f_pos
 				 * (in f->next_rule), whose version is written
 				 * in f->next_rule.
 				 * The `return' action, however, doesn't have
 				 * fixed jump address in cmd->arg1 and can't use
 				 * cache.
 				 */
 				if (IS_CALL) {
 					stack[mtag->m_tag_id] = f->rulenum;
 					mtag->m_tag_id++;
 			    		f_pos = JUMP(chain, f, cmd->arg1,
 					    tablearg, 1);
 				} else {	/* `return' action */
 					mtag->m_tag_id--;
 					jmpto = stack[mtag->m_tag_id] + 1;
 					f_pos = ipfw_find_rule(chain, jmpto, 0);
 				}
 
 				/*
 				 * Skip disabled rules, and re-enter
 				 * the inner loop with the correct
 				 * f_pos, f, l and cmd.
 				 * Also clear cmdlen and skip_or
 				 */
 				for (; f_pos < chain->n_rules - 1 &&
 				    (V_set_disable &
 				    (1 << chain->map[f_pos]->set)); f_pos++)
 					;
 				/* Re-enter the inner loop at the dest rule. */
 				f = chain->map[f_pos];
 				l = f->cmd_len;
 				cmd = f->cmd;
 				cmdlen = 0;
 				skip_or = 0;
 				continue;
 				break;	/* NOTREACHED */
 			}
 #undef IS_CALL
 #undef IS_RETURN
 
 			case O_REJECT:
 				/*
 				 * Drop the packet and send a reject notice
 				 * if the packet is not ICMP (or is an ICMP
 				 * query), and it is not multicast/broadcast.
 				 */
 				if (hlen > 0 && is_ipv4 && offset == 0 &&
 				    (proto != IPPROTO_ICMP ||
 				     is_icmp_query(ICMP(ulp))) &&
 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
 				    !IN_MULTICAST(ntohl(dst_ip.s_addr))) {
 					send_reject(args, cmd->arg1, iplen, ip);
 					m = args->m;
 				}
 				/* FALLTHROUGH */
 #ifdef INET6
 			case O_UNREACH6:
 				if (hlen > 0 && is_ipv6 &&
 				    ((offset & IP6F_OFF_MASK) == 0) &&
 				    (proto != IPPROTO_ICMPV6 ||
 				     (is_icmp6_query(icmp6_type) == 1)) &&
 				    !(m->m_flags & (M_BCAST|M_MCAST)) &&
 				    !IN6_IS_ADDR_MULTICAST(
 					&args->f_id.dst_ip6)) {
 					send_reject6(args,
 					    cmd->opcode == O_REJECT ?
 					    map_icmp_unreach(cmd->arg1):
 					    cmd->arg1, hlen,
 					    (struct ip6_hdr *)ip);
 					m = args->m;
 				}
 				/* FALLTHROUGH */
 #endif
 			case O_DENY:
 				retval = IP_FW_DENY;
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				break;
 
 			case O_FORWARD_IP:
 				if (args->flags & IPFW_ARGS_ETHER)
 					break;	/* not valid on layer2 pkts */
 				if (q != f ||
 				    dyn_info.direction == MATCH_FORWARD) {
 				    struct sockaddr_in *sa;
 
 				    sa = &(((ipfw_insn_sa *)cmd)->sa);
 				    if (sa->sin_addr.s_addr == INADDR_ANY) {
 #ifdef INET6
 					/*
 					 * We use O_FORWARD_IP opcode for
 					 * fwd rule with tablearg, but tables
 					 * now support IPv6 addresses. And
 					 * when we are inspecting IPv6 packet,
 					 * we can use nh6 field from
 					 * table_value as next_hop6 address.
 					 */
 					if (is_ipv6) {
 						struct ip_fw_nh6 *nh6;
 
 						args->flags |= IPFW_ARGS_NH6;
 						nh6 = &args->hopstore6;
 						nh6->sin6_addr = TARG_VAL(
 						    chain, tablearg, nh6);
 						nh6->sin6_port = sa->sin_port;
 						nh6->sin6_scope_id = TARG_VAL(
 						    chain, tablearg, zoneid);
 					} else
 #endif
 					{
 						args->flags |= IPFW_ARGS_NH4;
 						args->hopstore.sin_port =
 						    sa->sin_port;
 						sa = &args->hopstore;
 						sa->sin_family = AF_INET;
 						sa->sin_len = sizeof(*sa);
 						sa->sin_addr.s_addr = htonl(
 						    TARG_VAL(chain, tablearg,
 						    nh4));
 					}
 				    } else {
 					    args->flags |= IPFW_ARGS_NH4PTR;
 					    args->next_hop = sa;
 				    }
 				}
 				retval = IP_FW_PASS;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				break;
 
 #ifdef INET6
 			case O_FORWARD_IP6:
 				if (args->flags & IPFW_ARGS_ETHER)
 					break;	/* not valid on layer2 pkts */
 				if (q != f ||
 				    dyn_info.direction == MATCH_FORWARD) {
 					struct sockaddr_in6 *sin6;
 
 					sin6 = &(((ipfw_insn_sa6 *)cmd)->sa);
 					args->flags |= IPFW_ARGS_NH6PTR;
 					args->next_hop6 = sin6;
 				}
 				retval = IP_FW_PASS;
 				l = 0;		/* exit inner loop */
 				done = 1;	/* exit outer loop */
 				break;
 #endif
 
 			case O_NETGRAPH:
 			case O_NGTEE:
 				set_match(args, f_pos, chain);
 				args->rule.info = TARG(cmd->arg1, netgraph);
 				if (V_fw_one_pass)
 					args->rule.info |= IPFW_ONEPASS;
 				retval = (cmd->opcode == O_NETGRAPH) ?
 				    IP_FW_NETGRAPH : IP_FW_NGTEE;
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				break;
 
 			case O_SETFIB: {
 				uint32_t fib;
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				fib = TARG(cmd->arg1, fib) & 0x7FFF;
 				if (fib >= rt_numfibs)
 					fib = 0;
 				M_SETFIB(m, fib);
 				args->f_id.fib = fib; /* XXX */
 				l = 0;		/* exit inner loop */
 				break;
 		        }
 
 			case O_SETDSCP: {
 				uint16_t code;
 
 				code = TARG(cmd->arg1, dscp) & 0x3F;
 				l = 0;		/* exit inner loop */
 				if (is_ipv4) {
 					uint16_t old;
 
 					old = *(uint16_t *)ip;
 					ip->ip_tos = (code << 2) |
 					    (ip->ip_tos & 0x03);
 					ip->ip_sum = cksum_adjust(ip->ip_sum,
 					    old, *(uint16_t *)ip);
 				} else if (is_ipv6) {
 					uint8_t *v;
 
 					v = &((struct ip6_hdr *)ip)->ip6_vfc;
 					*v = (*v & 0xF0) | (code >> 2);
 					v++;
 					*v = (*v & 0x3F) | ((code & 0x03) << 6);
 				} else
 					break;
 
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				break;
 			}
 
 			case O_NAT:
 				l = 0;          /* exit inner loop */
 				done = 1;       /* exit outer loop */
 				/*
 				 * Ensure that we do not invoke NAT handler for
 				 * non IPv4 packets. Libalias expects only IPv4.
 				 */
 				if (!is_ipv4 || !IPFW_NAT_LOADED) {
 				    retval = IP_FW_DENY;
 				    break;
 				}
 
 				struct cfg_nat *t;
 				int nat_id;
 
 				args->rule.info = 0;
 				set_match(args, f_pos, chain);
 				/* Check if this is 'global' nat rule */
 				if (cmd->arg1 == IP_FW_NAT44_GLOBAL) {
 					retval = ipfw_nat_ptr(args, NULL, m);
 					break;
 				}
 				t = ((ipfw_insn_nat *)cmd)->nat;
 				if (t == NULL) {
 					nat_id = TARG(cmd->arg1, nat);
 					t = (*lookup_nat_ptr)(&chain->nat, nat_id);
 
 					if (t == NULL) {
 					    retval = IP_FW_DENY;
 					    break;
 					}
 					if (cmd->arg1 != IP_FW_TARG)
 					    ((ipfw_insn_nat *)cmd)->nat = t;
 				}
 				retval = ipfw_nat_ptr(args, t, m);
 				break;
 
 			case O_REASS: {
 				int ip_off;
 
 				l = 0;	/* in any case exit inner loop */
 				if (is_ipv6) /* IPv6 is not supported yet */
 					break;
 				IPFW_INC_RULE_COUNTER(f, pktlen);
 				ip_off = ntohs(ip->ip_off);
 
 				/* if not fragmented, go to next rule */
 				if ((ip_off & (IP_MF | IP_OFFMASK)) == 0)
 				    break;
 
 				args->m = m = ip_reass(m);
 
 				/*
 				 * do IP header checksum fixup.
 				 */
 				if (m == NULL) { /* fragment got swallowed */
 				    retval = IP_FW_DENY;
 				} else { /* good, packet complete */
 				    int hlen;
 
 				    ip = mtod(m, struct ip *);
 				    hlen = ip->ip_hl << 2;
 				    ip->ip_sum = 0;
 				    if (hlen == sizeof(struct ip))
 					ip->ip_sum = in_cksum_hdr(ip);
 				    else
 					ip->ip_sum = in_cksum(m, hlen);
 				    retval = IP_FW_REASS;
 				    args->rule.info = 0;
 				    set_match(args, f_pos, chain);
 				}
 				done = 1;	/* exit outer loop */
 				break;
 			}
 			case O_EXTERNAL_ACTION:
 				l = 0; /* in any case exit inner loop */
 				retval = ipfw_run_eaction(chain, args,
 				    cmd, &done);
 				/*
 				 * If both @retval and @done are zero,
 				 * consider this as rule matching and
 				 * update counters.
 				 */
 				if (retval == 0 && done == 0) {
 					IPFW_INC_RULE_COUNTER(f, pktlen);
 					/*
 					 * Reset the result of the last
 					 * dynamic state lookup.
 					 * External action can change
 					 * @args content, and it may be
 					 * used for new state lookup later.
 					 */
 					DYN_INFO_INIT(&dyn_info);
 				}
 				break;
 
 			default:
 				panic("-- unknown opcode %d\n", cmd->opcode);
 			} /* end of switch() on opcodes */
 			/*
 			 * if we get here with l=0, then match is irrelevant.
 			 */
 
 			if (cmd->len & F_NOT)
 				match = !match;
 
 			if (match) {
 				if (cmd->len & F_OR)
 					skip_or = 1;
 			} else {
 				if (!(cmd->len & F_OR)) /* not an OR block, */
 					break;		/* try next rule    */
 			}
 
 		}	/* end of inner loop, scan opcodes */
 #undef PULLUP_LEN
+#undef PULLUP_LEN_LOCKED
 
 		if (done)
 			break;
 
 /* next_rule:; */	/* try next rule		*/
 
 	}		/* end of outer for, scan rules */
 
 	if (done) {
 		struct ip_fw *rule = chain->map[f_pos];
 		/* Update statistics */
 		IPFW_INC_RULE_COUNTER(rule, pktlen);
 	} else {
 		retval = IP_FW_DENY;
 		printf("ipfw: ouch!, skip past end of rules, denying packet\n");
 	}
 	IPFW_PF_RUNLOCK(chain);
 #ifdef __FreeBSD__
 	if (ucred_cache != NULL)
 		crfree(ucred_cache);
 #endif
 	return (retval);
 
 pullup_failed:
 	if (V_fw_verbose)
 		printf("ipfw: pullup failed\n");
 	return (IP_FW_DENY);
 }
 
 /*
  * Set maximum number of tables that can be used in given VNET ipfw instance.
  */
 #ifdef SYSCTL_NODE
 static int
 sysctl_ipfw_table_num(SYSCTL_HANDLER_ARGS)
 {
 	unsigned int requested;
 	int rc;
 
 	/*
 	 * Run the generic integer handler on a local copy of the
 	 * current limit, so the per-vnet variable itself is not
 	 * modified by this call.
 	 */
 	requested = V_fw_tables_max;
 	rc = sysctl_handle_int(oidp, &requested, 0, req);
 
 	/*
 	 * A handler error or a pure read (no new value supplied) ends
 	 * here; only a successful write goes on to resize the tables.
 	 */
 	if (rc == 0 && req->newptr != NULL)
 		rc = ipfw_resize_tables(&V_layer3_chain, requested);
 
 	return (rc);
 }
 
 /*
  * Switches table namespace between global and per-set.
  */
 static int
 sysctl_ipfw_tables_sets(SYSCTL_HANDLER_ARGS)
 {
 	unsigned int requested;
 	int rc;
 
 	/* Work on a local copy of the current per-vnet setting. */
 	requested = V_fw_tables_sets;
 	rc = sysctl_handle_int(oidp, &requested, 0, req);
 
 	/*
 	 * Switch the namespace only for a successful write; a read or
 	 * a handler error just reports rc.
 	 */
 	if (rc == 0 && req->newptr != NULL)
 		rc = ipfw_switch_tables_namespace(&V_layer3_chain,
 		    requested);
 
 	return (rc);
 }
 #endif
 
 /*
  * Module and VNET glue
  */
 
 /*
  * Stuff that must be initialised only on boot or module load
  */
 static int
 ipfw_init(void)
 {
 	const char *divert_mode, *nat_mode, *policy;
 
 	/* Divert and NAT are reported as compiled in or as loadable. */
 #ifdef IPDIVERT
 	divert_mode = "enabled";
 #else
 	divert_mode = "loadable";
 #endif
 #ifdef IPFIREWALL_NAT
 	nat_mode = "enabled";
 #else
 	nat_mode = "loadable";
 #endif
 	policy = default_to_accept ? "accept" : "deny";
 
 	/*
 	 * This banner is only printed the first time around, when
 	 * called from the sysinit code.  The line is completed by the
 	 * logging state printed right below.
 	 */
 	printf("ipfw2 "
 #ifdef INET6
 		"(+ipv6) "
 #endif
 		"initialized, divert %s, nat %s, "
 		"default to %s, logging ",
 		divert_mode, nat_mode, policy);
 
 	/*
 	 * Note: V_xxx variables can be accessed here but the vnet specific
 	 * initializer may not have been called yet for the VIMAGE case.
 	 * Tuneables will have been processed. We will print out values for
 	 * the default vnet.
 	 * XXX This should all be rationalized AFTER 8.0
 	 */
 	if (V_fw_verbose != 0 && V_verbose_limit != 0)
 		printf("limited to %d packets/entry by default\n",
 		    V_verbose_limit);
 	else if (V_fw_verbose != 0)
 		printf("unlimited\n");
 	else
 		printf("disabled\n");
 
 	/* Clamp a user-supplied table count to the supported maximum. */
 	if (default_fw_tables > IPFW_TABLES_MAX)
 		default_fw_tables = IPFW_TABLES_MAX;
 
 	/* Global (not per-vnet) services; ipfw_destroy() undoes them. */
 	ipfw_init_sopt_handler();
 	ipfw_init_obj_rewriter();
 	ipfw_iface_init();
 
 	/* Nothing above reports a failure, so this always succeeds. */
 	return (0);
 }
 
 /*
  * Called for the removal of the last instance only on module unload.
  */
 static void
 ipfw_destroy(void)
 {
 
 	/*
 	 * Tear down the global services that ipfw_init() set up
 	 * (interface tracking, sockopt handler, object rewriter),
 	 * then announce the unload on the console.
 	 */
 	ipfw_iface_destroy();
 	ipfw_destroy_sopt_handler();
 	ipfw_destroy_obj_rewriter();
 	printf("IP firewall unloaded\n");
 }
 
 /*
  * Stuff that must be initialized for every instance
  * (including the first of course).
  */
 static int
 vnet_ipfw_init(const void *unused)
 {
 	int error, first;
 	struct ip_fw *rule = NULL;
 	struct ip_fw_chain *chain;
 
 	/* Everything below operates on this vnet's layer3 rule chain. */
 	chain = &V_layer3_chain;
 
 	/* 'first' is 1 only while initialising the default vnet. */
 	first = IS_DEFAULT_VNET(curvnet) ? 1 : 0;
 
 	/* First set up some values that are compile time options */
 	V_autoinc_step = 100;	/* bounded to 1..1000 in add_rule() */
 	V_fw_deny_unknown_exthdrs = 1;
 #ifdef IPFIREWALL_VERBOSE
 	V_fw_verbose = 1;
 #endif
 #ifdef IPFIREWALL_VERBOSE_LIMIT
 	V_verbose_limit = IPFIREWALL_VERBOSE_LIMIT;
 #endif
 #ifdef IPFIREWALL_NAT
 	LIST_INIT(&chain->nat);
 #endif
 
 	/* Init shared services hash table */
 	ipfw_init_srv(chain);
 
 	ipfw_init_counters();
 	/* Set initial number of tables */
 	V_fw_tables_max = default_fw_tables;
 	error = ipfw_init_tables(chain, first);
 	if (error) {
 		/*
 		 * The specific error from ipfw_init_tables() is not
 		 * propagated; ENOSPC is returned instead.  'rule' has
 		 * not been allocated yet (it is still NULL here).
 		 * NOTE(review): chain->map is not assigned anywhere
 		 * above in this function - confirm what, if anything,
 		 * this free releases.
 		 */
 		printf("ipfw2: setting up tables failed\n");
 		free(chain->map, M_IPFW);
 		free(rule, M_IPFW);
 		return (ENOSPC);
 	}
 
 	IPFW_LOCK_INIT(chain);
 
 	/*
 	 * fill and insert the default rule: a single-opcode rule that
 	 * accepts or denies depending on default_to_accept.
 	 */
 	rule = ipfw_alloc_rule(chain, sizeof(struct ip_fw));
 	rule->flags |= IPFW_RULE_NOOPT;
 	rule->cmd_len = 1;
 	rule->cmd[0].len = 1;
 	rule->cmd[0].opcode = default_to_accept ? O_ACCEPT : O_DENY;
 	chain->default_rule = rule;
 	ipfw_add_protected_rule(chain, rule, 0);
 
 	ipfw_dyn_init(chain);
 	ipfw_eaction_init(chain, first);
 #ifdef LINEAR_SKIPTO
 	ipfw_init_skipto_cache(chain);
 #endif
 	ipfw_bpf_init(first);
 
 	/* Per-vnet state is set up; mark this instance as ready. */
 	V_ipfw_vnet_ready = 1;		/* Open for business */
 
 	/*
 	 * Hook the sockopt handler and pfil hooks for ipv4 and ipv6.
 	 * Even if the latter two fail we still keep the module alive
 	 * because the sockopt and layer2 paths are still useful.
 	 * ipfw[6]_hook return 0 on success, ENOENT on failure,
 	 * so we can ignore the exact return value and just set a flag.
 	 *
 	 * Note that V_fw[6]_enable are manipulated by a SYSCTL_PROC so
 	 * changes in the underlying (per-vnet) variables trigger
 	 * immediate hook()/unhook() calls.
 	 * In layer2 we have the same behaviour, except that V_ether_ipfw
 	 * is checked on each packet because there are no pfil hooks.
 	 */
 	V_ip_fw_ctl_ptr = ipfw_ctl3;
 	error = ipfw_attach_hooks();
 	/* The result of attaching the hooks is this function's result. */
 	return (error);
 }
 
 /*
  * Called for the removal of each instance.
  */
 static int
 vnet_ipfw_uninit(const void *unused)
 {
 	struct ip_fw *reap;
 	struct ip_fw_chain *chain = &V_layer3_chain;
 	int i, last;
 
 	V_ipfw_vnet_ready = 0; /* tell new callers to go away */
 	/*
 	 * disconnect from ipv4, ipv6, layer2 and sockopt.
 	 * Then grab, release and grab again the WLOCK so we make
 	 * sure the update is propagated and nobody will be in.
 	 */
 	ipfw_detach_hooks();
 	V_ip_fw_ctl_ptr = NULL;
 
 	/* 'last' is 1 only while tearing down the default vnet. */
 	last = IS_DEFAULT_VNET(curvnet) ? 1 : 0;
 
 	IPFW_UH_WLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 
 	ipfw_dyn_uninit(0);	/* run the callout_drain */
 
 	IPFW_UH_WLOCK(chain);
 
 	/*
 	 * With both write locks held, queue every rule in the map on
 	 * the 'reap' list and release the map itself.  The queued rules
 	 * are only handed to ipfw_reap_rules() after the locks have
 	 * been dropped.
 	 */
 	reap = NULL;
 	IPFW_WLOCK(chain);
 	for (i = 0; i < chain->n_rules; i++)
 		ipfw_reap_add(chain, &reap, chain->map[i]);
 	free(chain->map, M_IPFW);
 #ifdef LINEAR_SKIPTO
 	ipfw_destroy_skipto_cache(chain);
 #endif
 	IPFW_WUNLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 	ipfw_destroy_tables(chain, last);
 	ipfw_eaction_uninit(chain, last);
 	if (reap != NULL)
 		ipfw_reap_rules(reap);
 	vnet_ipfw_iface_destroy(chain);
 	ipfw_destroy_srv(chain);
 	IPFW_LOCK_DESTROY(chain);
 	ipfw_dyn_uninit(1);	/* free the remaining parts */
 	ipfw_destroy_counters();
 	ipfw_bpf_uninit(last);
 	/* Teardown reports no failure; always returns 0. */
 	return (0);
 }
 
 /*
  * Module event handler.
  * In general we have the choice of handling most of these events by the
  * event handler or by the (VNET_)SYS(UN)INIT handlers. I have chosen to
  * use the SYSINIT handlers as they are more capable of expressing the
  * flow of control during module and vnet operations, so this is just
  * a skeleton. Note there is no SYSINIT equivalent of the module
  * SHUTDOWN handler, but we don't have anything to do in that case anyhow.
  */
 static int
 ipfw_modevent(module_t mod, int type, void *unused)
 {
 
 	switch (type) {
 	case MOD_LOAD:		/* module load, or boot if compiled in */
 	case MOD_QUIESCE:	/* before unload; may veto unloading */
 	case MOD_UNLOAD:	/* during unload */
 	case MOD_SHUTDOWN:	/* during system shutdown */
 		/*
 		 * Known events need no work in this skeleton; the
 		 * SYSINIT handlers do the real setup and teardown.
 		 */
 		return (0);
 	default:
 		/* Any other event type is rejected. */
 		return (EOPNOTSUPP);
 	}
 }
 
 /* Module descriptor handed to DECLARE_MODULE() below. */
 static moduledata_t ipfwmod = {
 	"ipfw",			/* module name */
 	ipfw_modevent,		/* module event handler */
 	0			/* no extra data (presumably the private argument - confirm) */
 };
 
 /*
  * Define startup order.
  * Everything is registered in the same subsystem; the three order
  * values are consecutive and are used, respectively, for the module
  * declaration, for ipfw_init()/ipfw_destroy() and for the per-vnet
  * vnet_ipfw_init()/vnet_ipfw_uninit() handlers.
  */
 #define	IPFW_SI_SUB_FIREWALL	SI_SUB_PROTO_FIREWALL
 #define	IPFW_MODEVENT_ORDER	(SI_ORDER_ANY - 255) /* On boot slot in here. */
 #define	IPFW_MODULE_ORDER	(IPFW_MODEVENT_ORDER + 1) /* A little later. */
 #define	IPFW_VNET_ORDER		(IPFW_MODEVENT_ORDER + 2) /* Later still. */
 
 /* Declare the module, advertise the ipfw_ctl3 feature, version 3. */
 DECLARE_MODULE(ipfw, ipfwmod, IPFW_SI_SUB_FIREWALL, IPFW_MODEVENT_ORDER);
 FEATURE(ipfw_ctl3, "ipfw new sockopt calls");
 MODULE_VERSION(ipfw, 3);
 /* should declare some dependencies here */
 
 /*
  * Starting up. Done in order after the module event handler
  * (ipfw_modevent(), registered through ipfwmod) has been called:
  * ipfw_init() is registered at IPFW_MODULE_ORDER and vnet_ipfw_init()
  * at IPFW_VNET_ORDER.
  * VNET_SYSINIT is also called for each existing vnet and each new vnet.
  */
 SYSINIT(ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
 	    ipfw_init, NULL);
 VNET_SYSINIT(vnet_ipfw_init, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
 	    vnet_ipfw_init, NULL);
  
 /*
  * Closing up shop. These are done in REVERSE ORDER, but still
  * after the module event handler (ipfw_modevent()) has been called.
  * Not called on reboot.
  * VNET_SYSUNINIT is also called for each exiting vnet as it exits,
  * or when the module is unloaded.
  */
 SYSUNINIT(ipfw_destroy, IPFW_SI_SUB_FIREWALL, IPFW_MODULE_ORDER,
 	    ipfw_destroy, NULL);
 VNET_SYSUNINIT(vnet_ipfw_uninit, IPFW_SI_SUB_FIREWALL, IPFW_VNET_ORDER,
 	    vnet_ipfw_uninit, NULL);
 /* end of file */
Index: projects/fuse2/sys/netpfil/ipfw/ip_fw_eaction.c
===================================================================
--- projects/fuse2/sys/netpfil/ipfw/ip_fw_eaction.c	(revision 350434)
+++ projects/fuse2/sys/netpfil/ipfw/ip_fw_eaction.c	(revision 350435)
@@ -1,457 +1,455 @@
 /*-
  * Copyright (c) 2016-2017 Yandex LLC
  * Copyright (c) 2016-2017 Andrey V. Elsukov <ae@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/kernel.h>
 #include <sys/hash.h>
 #include <sys/lock.h>
 #include <sys/rwlock.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/queue.h>
 
 #include <net/if.h>	/* ip_fw.h requires IFNAMSIZ */
 #include <net/pfil.h>
 #include <netinet/in.h>
 #include <netinet/ip_var.h>	/* struct ipfw_rule_ref */
 #include <netinet/ip_fw.h>
 
 #include <netpfil/ipfw/ip_fw_private.h>
 
 #include "opt_ipfw.h"
 
 /*
  * External actions support for ipfw.
  *
  * This code provides KPI for implementing loadable modules, that
  * can provide handlers for external action opcodes in the ipfw's
  * rules.
  * Module should implement opcode handler with type ipfw_eaction_t.
  * This handler will be called by ipfw_chk() function when
  * O_EXTERNAL_ACTION opcode is matched. The handler must return
  * value used as return value in ipfw_chk(), i.e. IP_FW_PASS,
  * IP_FW_DENY (see ip_fw_private.h).
  * Also the last argument must be set by handler. If it is zero,
  * the search continues to the next rule. If it has non zero value,
  * the search terminates.
  *
  * The module that implements external action should register its
  * handler and name with ipfw_add_eaction() function.
  * This function will return eaction_id, that can be used by module.
  *
  * It is possible to pass some additional information to external
  * action handler using O_EXTERNAL_INSTANCE and O_EXTERNAL_DATA opcodes.
  * Such opcodes should be next after the O_EXTERNAL_ACTION opcode.
  * For the O_EXTERNAL_INSTANCE opcode the cmd->arg1 contains index of named
  * object related to an instance of external action.
  * For the O_EXTERNAL_DATA opcode the cmd contains the data that can be used
  * by external action handler without needing to create named instance.
  *
  * In case when eaction module uses named instances, it should register
  * opcode rewriting routines for O_EXTERNAL_INSTANCE opcode. The
  * classifier callback can look back into O_EXTERNAL_ACTION opcode (it
  * must be in the (ipfw_insn *)(cmd - 1)). By arg1 from O_EXTERNAL_ACTION
  * it can determine eaction_id and compare it with its own.
  * The macro IPFW_TLV_EACTION_NAME(eaction_id) can be used to determine
  * the type of named_object related to external action instance.
  *
  * On module unload handler should be deregistered with ipfw_del_eaction()
  * function using known eaction_id.
  */
 
 /*
  * A registered external action: the generic named object, the handler
  * to run for it and the storage backing the object's name.
  */
 struct eaction_obj {
 	struct named_object	no;	/* no.name is pointed at name[] on creation */
 	ipfw_eaction_t		*handler; /* called through ipfw_run_eaction() */
 	char			name[64]; /* action type name (filled with strlcpy) */
 };
 
 /* Fetch the eaction_obj kept in the chain's SRV_OBJECT slot cmd->arg1. */
 #define	EACTION_OBJ(ch, cmd)			\
     ((struct eaction_obj *)SRV_OBJECT((ch), (cmd)->arg1))
 
 /*
  * Debug tracing. With the condition below enabled EACTION_DEBUG() prints
  * its message prefixed with the name of the calling function; as shipped
  * (condition 0) the macro expands to nothing.
  */
 #if 0
 #define	EACTION_DEBUG(fmt, ...)	do {			\
 	printf("%s: " fmt "\n", __func__, ## __VA_ARGS__);	\
 } while (0)
 #else
 #define	EACTION_DEBUG(fmt, ...)
 #endif
 
 /* Type name under which the built-in fallback action is registered. */
 const char *default_eaction_typename = "drop";
 
 /*
  * Built-in fallback handler: ends the rule search and denies the packet.
  * None of ch, args or cmd is consulted.
  */
 static int
 default_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args,
     ipfw_insn *cmd, int *done)
 {
 	const int verdict = IP_FW_DENY;
 
 	/* A non-zero *done terminates the search. */
 	*done = 1;
 	return (verdict);
 }
 
 /*
  * Opcode rewriting callbacks.
  */
 /*
  * Classifier callback: reports the index carried in cmd->arg1 through
  * puidx and a zero type through ptype. Always returns 0.
  */
 static int
 eaction_classify(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
 {
 	uint16_t uidx;
 
 	EACTION_DEBUG("opcode %d, arg1 %d", cmd->opcode, cmd->arg1);
 	uidx = cmd->arg1;
 	*puidx = uidx;
 	*ptype = 0;
 	return (0);
 }
 
 /*
  * Update callback: stores the new index idx into the opcode's arg1.
  */
 static void
 eaction_update(ipfw_insn *cmd, uint16_t idx)
 {
 	/* Overwrite the index kept in the instruction. */
 	cmd->arg1 = idx;
 	EACTION_DEBUG("opcode %d, arg1 -> %d", cmd->opcode, cmd->arg1);
 }
 
 /*
  * Lookup-by-name callback.
  *
  * Finds the name TLV of type IPFW_TLV_EACTION with index ti->uidx in the
  * user-supplied TLV buffer, then looks the name up among the chain's
  * named objects and stores the result in *pno.
  *
  * Returns EINVAL when there is no TLV buffer or no matching TLV, ESRCH
  * when no object has that name (*pno is then NULL), and 0 on success.
  */
 static int
 eaction_findbyname(struct ip_fw_chain *ch, struct tid_info *ti,
     struct named_object **pno)
 {
 	struct named_object *found;
 	ipfw_obj_ntlv *name_tlv;
 
 	/* Nothing to search without a TLV buffer. */
 	if (ti->tlvs == NULL)
 		return (EINVAL);
 
 	/* Locate the name TLV in the buffer provided by the user. */
 	name_tlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx,
 	    IPFW_TLV_EACTION);
 	if (name_tlv == NULL)
 		return (EINVAL);
 	EACTION_DEBUG("name %s, uidx %u, type %u", name_tlv->name,
 	    ti->uidx, ti->type);
 
 	/*
 	 * eaction objects are global, so the set value is ignored and
 	 * zero is used for the lookup instead.
 	 */
 	found = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch),
 	    0, IPFW_TLV_EACTION, name_tlv->name);
 	*pno = found;
 	return (found != NULL ? 0 : ESRCH);
 }
 
 /*
  * Lookup-by-index callback: returns whatever the chain's object hash
  * holds for kernel index idx.
  */
 static struct named_object *
 eaction_findbykidx(struct ip_fw_chain *ch, uint16_t idx)
 {
 	struct named_object *found;
 
 	EACTION_DEBUG("kidx %u", idx);
 	found = ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), idx);
 	return (found);
 }
 
 /*
  * Rewriter table for the O_EXTERNAL_ACTION opcode, wiring up the
  * callbacks defined in this file. It is passed to IPFW_ADD_OBJ_REWRITER()
  * in ipfw_eaction_init() and to IPFW_DEL_OBJ_REWRITER() in
  * ipfw_eaction_uninit().
  */
 static struct opcode_obj_rewrite eaction_opcodes[] = {
 	{
 		.opcode = O_EXTERNAL_ACTION,
 		.etlv = IPFW_TLV_EACTION,
 		.classifier = eaction_classify,
 		.update = eaction_update,
 		.find_byname = eaction_findbyname,
 		.find_bykidx = eaction_findbykidx,
 	},
 };
 
 /*
  * Allocate an external action object and register it under the given
  * name.
  *
  * Must be called without the UH lock held (asserted); the lock is taken
  * here. On success the object is added to the name hash, published in
  * its SRV_OBJECT slot, given one reference, and its kernel index is
  * stored in *eaction_id when that pointer is not NULL.
  *
  * Returns 0 on success, EEXIST when an eaction with this name already
  * exists, or ENOSPC when no index could be allocated.
  */
 static int
 create_eaction_obj(struct ip_fw_chain *ch, ipfw_eaction_t handler,
     const char *name, uint16_t *eaction_id)
 {
 	struct namedobj_instance *srv;
 	struct eaction_obj *eobj;
 
 	IPFW_UH_UNLOCK_ASSERT(ch);
 
 	srv = CHAIN_TO_SRV(ch);
 
 	/* Build the object before taking the lock. */
 	eobj = malloc(sizeof(*eobj), M_IPFW, M_WAITOK | M_ZERO);
 	strlcpy(eobj->name, name, sizeof(eobj->name));
 	eobj->handler = handler;
 	eobj->no.etlv = IPFW_TLV_EACTION;
 	eobj->no.name = eobj->name;
 
 	IPFW_UH_WLOCK(ch);
 
 	/* Two eactions must never share a name. */
 	if (ipfw_objhash_lookup_name_type(srv, 0, IPFW_TLV_EACTION,
 	    name) != NULL) {
 		IPFW_UH_WUNLOCK(ch);
 		free(eobj, M_IPFW);
 		EACTION_DEBUG("External action with typename "
 		    "'%s' already exists", name);
 		return (EEXIST);
 	}
 
 	/* Reserve a kernel index for the new object. */
 	if (ipfw_objhash_alloc_idx(srv, &eobj->no.kidx) != 0) {
 		IPFW_UH_WUNLOCK(ch);
 		free(eobj, M_IPFW);
 		EACTION_DEBUG("alloc_idx failed");
 		return (ENOSPC);
 	}
 
 	/* Register the name, then publish the object under IPFW_WLOCK. */
 	ipfw_objhash_add(srv, &eobj->no);
 	IPFW_WLOCK(ch);
 	SRV_OBJECT(ch, eobj->no.kidx) = eobj;
 	IPFW_WUNLOCK(ch);
 	eobj->no.refcnt++;
 	IPFW_UH_WUNLOCK(ch);
 
 	if (eaction_id != NULL)
 		*eaction_id = eobj->no.kidx;
 	return (0);
 }
 
 /*
  * Unregister and free the external action object behind named object no.
  *
  * The caller must hold the UH write lock (asserted). The SRV_OBJECT slot
  * is cleared under IPFW_WLOCK first; after that the object is removed
  * from the name hash, its index is released and the memory is freed.
  */
 static void
 destroy_eaction_obj(struct ip_fw_chain *ch, struct named_object *no)
 {
 	struct namedobj_instance *ni;
 	struct eaction_obj *obj;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	ni = CHAIN_TO_SRV(ch);
 	/* Unpublish the object so the slot no longer refers to it. */
 	IPFW_WLOCK(ch);
 	obj = SRV_OBJECT(ch, no->kidx);
 	SRV_OBJECT(ch, no->kidx) = NULL;
 	IPFW_WUNLOCK(ch);
 	ipfw_objhash_del(ni, no);
 	ipfw_objhash_free_idx(ni, no->kidx);
 	free(obj, M_IPFW);
 }
 
 /*
  * Resets all eaction opcodes to default handlers.
  *
  * Repoints users of external action eaction_id to the default eaction
  * (looked up by default_eaction_typename). instance_id is passed through
  * to ipfw_reset_eaction() and ipfw_dyn_reset_eaction(). When reset_rules
  * is true every rule of the chain is visited; dynamic states are always
  * processed.
  *
  * The caller must hold the UH write lock (asserted); IPFW_WLOCK is taken
  * here. Panics when the default eaction is not registered or when
  * eaction_id is the default eaction itself.
  */
 static void
 reset_eaction_rules(struct ip_fw_chain *ch, uint16_t eaction_id,
     uint16_t instance_id, bool reset_rules)
 {
 	struct named_object *no;
 	int i;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	no = ipfw_objhash_lookup_name_type(CHAIN_TO_SRV(ch), 0,
 	    IPFW_TLV_EACTION, default_eaction_typename);
 	if (no == NULL)
 		panic("Default external action handler is not found");
 	if (eaction_id == no->kidx)
 		panic("Wrong eaction_id");
 
 	EACTION_DEBUG("Going to replace id %u with %u", eaction_id, no->kidx);
 	IPFW_WLOCK(ch);
 	/*
 	 * Reset eaction objects only if it is referenced by rules.
 	 * But always reset objects for orphaned dynamic states.
 	 */
 	if (reset_rules) {
 		for (i = 0; i < ch->n_rules; i++) {
 			/*
 			 * Refcount on the original object will be just
 			 * ignored on destroy. Refcount on default_eaction
 			 * will be decremented on rule deletion, thus we
 			 * need to reference default_eaction object.
 			 */
 			if (ipfw_reset_eaction(ch, ch->map[i], eaction_id,
 			    no->kidx, instance_id) != 0)
 				no->refcnt++;
 		}
 	}
 	/*
 	 * Reset eaction opcodes for orphaned dynamic states.
 	 * Since parent rules are already deleted, we don't need to
 	 * reference named object of default_eaction.
 	 */
 	ipfw_dyn_reset_eaction(ch, eaction_id, no->kidx, instance_id);
 	IPFW_WUNLOCK(ch);
 }
 
 /*
  * Set up the external actions framework for a chain: register the
  * default "drop" eaction and, on success, hand the opcode rewriter table
  * to IPFW_ADD_OBJ_REWRITER() together with the first flag.
  *
  * Returns 0 on success or the error from create_eaction_obj().
  */
 int
 ipfw_eaction_init(struct ip_fw_chain *ch, int first)
 {
 	int rc;
 
 	rc = create_eaction_obj(ch, default_eaction,
 	    default_eaction_typename, NULL);
 	if (rc == 0) {
 		IPFW_ADD_OBJ_REWRITER(first, eaction_opcodes);
 		EACTION_DEBUG("External actions support initialized");
 	}
 	return (rc);
 }
 
 /*
  * Tear down the external actions framework for a chain: destroy the
  * default eaction object if it is still registered, then hand the opcode
  * rewriter table to IPFW_DEL_OBJ_REWRITER() together with the last flag.
  */
 void
 ipfw_eaction_uninit(struct ip_fw_chain *ch, int last)
 {
 	struct namedobj_instance *srv;
 	struct named_object *dflt;
 
 	srv = CHAIN_TO_SRV(ch);
 
 	/* The lookup and the destruction both run under the UH write lock. */
 	IPFW_UH_WLOCK(ch);
 	dflt = ipfw_objhash_lookup_name_type(srv, 0, IPFW_TLV_EACTION,
 	    default_eaction_typename);
 	if (dflt != NULL)
 		destroy_eaction_obj(ch, dflt);
 	IPFW_UH_WUNLOCK(ch);
 
 	IPFW_DEL_OBJ_REWRITER(last, eaction_opcodes);
 	EACTION_DEBUG("External actions support uninitialized");
 }
 
 /*
  * Register an external action handler under the given name.
  *
  * Returns the eaction id on success. Zero is returned when the name is
  * rejected by ipfw_check_object_name_generic() or when the object could
  * not be created (the id is only written on success).
  */
 uint16_t
 ipfw_add_eaction(struct ip_fw_chain *ch, ipfw_eaction_t handler,
     const char *name)
 {
 	uint16_t id = 0;
 
 	/* Refuse names that do not pass the generic check. */
 	if (ipfw_check_object_name_generic(name) != 0)
 		return (id);
 
 	create_eaction_obj(ch, handler, name, &id);
 	EACTION_DEBUG("Registered external action '%s' with id %u",
 	    name, id);
 	return (id);
 }
 
 /*
  * Deregister the external action with id eaction_id.
  *
  * Users of the action are switched to the default eaction first; rules
  * are only walked when the object holds more than one reference. The
  * object is then destroyed. Everything runs under the UH write lock.
  *
  * Returns 0 on success, or EINVAL when eaction_id does not name an
  * eaction object.
  */
 int
 ipfw_del_eaction(struct ip_fw_chain *ch, uint16_t eaction_id)
 {
 	struct named_object *no;
 	bool in_use;
 
 	IPFW_UH_WLOCK(ch);
 	no = ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), eaction_id);
 	if (no != NULL && no->etlv == IPFW_TLV_EACTION) {
 		in_use = (no->refcnt > 1);
 		reset_eaction_rules(ch, eaction_id, 0, in_use);
 		EACTION_DEBUG("External action '%s' with id %u unregistered",
 		    no->name, eaction_id);
 		destroy_eaction_obj(ch, no);
 		IPFW_UH_WUNLOCK(ch);
 		return (0);
 	}
 	IPFW_UH_WUNLOCK(ch);
 	return (EINVAL);
 }
 
 /*
  * Point one rule's external action at the default eaction.
  *
  * Nothing is changed and 0 is returned when the rule's action is not
  * O_EXTERNAL_ACTION with arg1 == eaction_id, or when instance_id is
  * non-zero, an opcode follows the action, and that opcode is not
  * O_EXTERNAL_INSTANCE with arg1 == instance_id. Otherwise the action's
  * arg1 is set to default_id (after shortening rule->cmd_len in the
  * instance case) and 1 is returned.
  *
  * Both the UH write lock and IPFW_WLOCK must be held (asserted).
  */
 int
 ipfw_reset_eaction(struct ip_fw_chain *ch, struct ip_fw *rule,
     uint16_t eaction_id, uint16_t default_id, uint16_t instance_id)
 {
 	ipfw_insn *cmd, *icmd;
-	int l, cmdlen;
+	int l;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 	IPFW_WLOCK_ASSERT(ch);
 
-	cmd = ACTION_PTR(rule);
-	l = rule->cmd_len - rule->act_ofs;
-	while (l > 0) {
-		cmdlen = F_LEN(cmd);
-		l -= cmdlen;
-		if (cmd->opcode == O_EXTERNAL_ACTION || l <= 0)
-			break;
-		cmd += cmdlen;
-	}
 	/*
 	 * Return if there is not O_EXTERNAL_ACTION or its id is
 	 * different.
 	 */
+	cmd = ipfw_get_action(rule);
 	if (cmd->opcode != O_EXTERNAL_ACTION ||
 	    cmd->arg1 != eaction_id)
 		return (0);
 	/*
 	 * If instance_id is specified, we need to truncate the
 	 * rule length. Check if there is O_EXTERNAL_INSTANCE opcode.
+	 *
+	 * NOTE: F_LEN(cmd) must be 1 for O_EXTERNAL_ACTION opcode,
+	 *  and rule length should be enough to keep O_EXTERNAL_INSTANCE
+	 *  opcode, thus we do check for l > 1.
 	 */
-	if (instance_id != 0 && l > 0) {
-		MPASS(cmdlen == 1);
+	l = rule->cmd + rule->cmd_len - cmd;
+	if (instance_id != 0 && l > 1) {
+		MPASS(F_LEN(cmd) == 1);
 		icmd = cmd + 1;
 		if (icmd->opcode != O_EXTERNAL_INSTANCE ||
 		    icmd->arg1 != instance_id)
 			return (0);
 		/*
 		 * Since named_object related to this instance will be
 		 * destroyed, truncate the chain of opcodes to remove
 		 * the rest of cmd chain just after O_EXTERNAL_ACTION
 		 * opcode.
 		 */
 		EACTION_DEBUG("truncate rule %d: len %u -> %u",
-		    rule->rulenum, rule->cmd_len, rule->cmd_len - l);
-		rule->cmd_len -= l;
+		    rule->rulenum, rule->cmd_len,
+		    rule->cmd_len - F_LEN(icmd));
+		rule->cmd_len -= F_LEN(icmd);
 		MPASS(((uint32_t *)icmd -
 		    (uint32_t *)rule->cmd) == rule->cmd_len);
 	}
 
 	cmd->arg1 = default_id; /* Set to default id */
 	/*
 	 * Return 1 when reset successfully happened.
 	 */
 	return (1);
 }
 
 /*
  * To be called before an external action instance is destroyed.
  * Rules whose eaction eaction_id carries the instance with id kidx get
  * their action switched to the default eaction.
  *
  * The UH write lock must be held (asserted). Returns 0 on success, or
  * EINVAL when eaction_id does not name an eaction object.
  */
 int
 ipfw_reset_eaction_instance(struct ip_fw_chain *ch, uint16_t eaction_id,
     uint16_t kidx)
 {
 	struct named_object *no;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	no = ipfw_objhash_lookup_kidx(CHAIN_TO_SRV(ch), eaction_id);
 	if (no != NULL && no->etlv == IPFW_TLV_EACTION) {
 		reset_eaction_rules(ch, eaction_id, kidx, 0);
 		return (0);
 	}
 	return (EINVAL);
 }
 
 /*
  * Run the handler of the external action referenced by cmd->arg1 and
  * return the handler's result.
  */
 int
 ipfw_run_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args,
     ipfw_insn *cmd, int *done)
 {
 	struct eaction_obj *eobj;
 
 	eobj = EACTION_OBJ(ch, cmd);
 	return (eobj->handler(ch, args, cmd, done));
 }
Index: projects/fuse2/sys/netpfil/ipfw/ip_fw_nat.c
===================================================================
--- projects/fuse2/sys/netpfil/ipfw/ip_fw_nat.c	(revision 350434)
+++ projects/fuse2/sys/netpfil/ipfw/ip_fw_nat.c	(revision 350435)
@@ -1,1243 +1,1242 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2008 Paolo Pisati
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/rwlock.h>
 #include <sys/rmlock.h>
 
 #include <netinet/libalias/alias.h>
 #include <netinet/libalias/alias_local.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 
 #include <netpfil/ipfw/ip_fw_private.h>
 
 #include <machine/in_cksum.h>	/* XXX for in_cksum */
 
 /* One LSNAT server (address and port) attached to a redirect entry. */
 struct cfg_spool {
 	LIST_ENTRY(cfg_spool)   _next;          /* chain of spool instances */
 	struct in_addr          addr;		/* server address */
 	uint16_t		port;		/* server port; htons() is applied when used */
 };
 
 /* Nat redirect configuration. */
 struct cfg_redir {
 	LIST_ENTRY(cfg_redir)	_next;	/* chain of redir instances */
 	uint16_t		mode;	/* type of redirect mode */
 	uint16_t		proto;	/* protocol: tcp/udp */
 	struct in_addr		laddr;	/* local ip address */
 	struct in_addr		paddr;	/* public ip address */
 	struct in_addr		raddr;	/* remote ip address */
 	uint16_t		lport;	/* local port */
 	uint16_t		pport;	/* public port */
 	uint16_t		rport;	/* remote port	*/
 	uint16_t		pport_cnt;	/* number of public ports */
 	uint16_t		rport_cnt;	/* number of remote ports */
 	/* libalias links of this redirect; the array is sized from pport_cnt */
 	struct alias_link	**alink;	
 	u_int16_t		spool_cnt; /* num of entry in spool chain */
 	/* chain of spool instances */
 	LIST_HEAD(spool_chain, cfg_spool) spool_chain;
 };
 
 /* Nat configuration data struct. */
 struct cfg_nat {
 	/* chain of nat instances */
 	LIST_ENTRY(cfg_nat)	_next;
 	int			id;		/* nat id  */
 	struct in_addr		ip;		/* nat ip address */
 	struct libalias		*lib;		/* libalias instance */
 	int			mode;		/* aliasing mode */
 	int			redir_cnt; /* number of entries in redir chain */
 	/* chain of redir instances */
 	LIST_HEAD(redir_chain, cfg_redir) redir_chain;  
 	char			if_name[IF_NAMESIZE];	/* interface name */
 };
 
 static eventhandler_tag ifaddr_event_tag;
 
 /*
  * Interface address change handler.
  *
  * For every nat instance whose if_name matches the interface ifp, the
  * interface's address list is walked and each AF_INET address found is
  * programmed as the instance's alias address, so the last AF_INET address
  * on the list is the one that remains in effect.
  *
  * Returns early while V_ipfw_vnet_ready or V_ipfw_nat_ready is zero.
  */
 static void
 ifaddr_change(void *arg __unused, struct ifnet *ifp)
 {
 	struct cfg_nat *ptr;
 	struct ifaddr *ifa;
 	struct ip_fw_chain *chain;
 
 	KASSERT(curvnet == ifp->if_vnet,
 	    ("curvnet(%p) differs from iface vnet(%p)", curvnet, ifp->if_vnet));
 
 	if (V_ipfw_vnet_ready == 0 || V_ipfw_nat_ready == 0)
 		return;
 
 	chain = &V_layer3_chain;
 	IPFW_UH_WLOCK(chain);
 	/* Check every nat entry... */
 	LIST_FOREACH(ptr, &chain->nat, _next) {
 		/* ...using nic 'ifp->if_xname' as dynamic alias address. */
 		if (strncmp(ptr->if_name, ifp->if_xname, IF_NAMESIZE) != 0)
 			continue;
 		if_addr_rlock(ifp);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr == NULL)
 				continue;
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			/* The alias address is updated under IPFW_WLOCK. */
 			IPFW_WLOCK(chain);
 			ptr->ip = ((struct sockaddr_in *)
 			    (ifa->ifa_addr))->sin_addr;
 			LibAliasSetAddress(ptr->lib, ptr->ip);
 			IPFW_WUNLOCK(chain);
 		}
 		if_addr_runlock(ifp);
 	}
 	IPFW_UH_WUNLOCK(chain);
 }
 
 /*
  * Clear the nat pointer held in the O_NAT action of every rule of the
  * chain that refers to nat entry ix, or in all O_NAT actions when ix < 0.
  * IPFW_WLOCK must be held (asserted).
  */
 static void
 flush_nat_ptrs(struct ip_fw_chain *chain, const int ix)
 {
-	int i;
 	ipfw_insn_nat *cmd;
+	int i;
 
 	IPFW_WLOCK_ASSERT(chain);
 	for (i = 0; i < chain->n_rules; i++) {
-		cmd = (ipfw_insn_nat *)ACTION_PTR(chain->map[i]);
-		/* XXX skip log and the like ? */
+		cmd = (ipfw_insn_nat *)ipfw_get_action(chain->map[i]);
 		if (cmd->o.opcode == O_NAT && cmd->nat != NULL &&
 			    (ix < 0 || cmd->nat->id == ix))
 			cmd->nat = NULL;
 	}
 }
 
 /*
  * Remove every redirect entry on the list head from nat instance n:
  * the libalias links are deleted, the spool entries and the link array
  * are freed, and the entry itself is unlinked and freed.
  *
  * An entry with an unknown mode is only reported with printf(); it is
  * neither unlinked nor freed.
  */
 static void
 del_redir_spool_cfg(struct cfg_nat *n, struct redir_chain *head)
 {
 	struct cfg_redir *r, *tmp_r;
 	struct cfg_spool *s, *tmp_s;
 	int i, num;
 
 	LIST_FOREACH_SAFE(r, head, _next, tmp_r) {
 		num = 1; /* Number of alias_link to delete. */
 		switch (r->mode) {
 		case NAT44_REDIR_PORT:
 			/* Port redirects own one link per public port. */
 			num = r->pport_cnt;
 			/* FALLTHROUGH */
 		case NAT44_REDIR_ADDR:
 		case NAT44_REDIR_PROTO:
 			/* Delete all libalias redirect entry. */
 			for (i = 0; i < num; i++)
 				LibAliasRedirectDelete(n->lib, r->alink[i]);
 			/* Del spool cfg if any. */
 			LIST_FOREACH_SAFE(s, &r->spool_chain, _next, tmp_s) {
 				LIST_REMOVE(s, _next);
 				free(s, M_IPFW);
 			}
 			free(r->alink, M_IPFW);
 			LIST_REMOVE(r, _next);
 			free(r, M_IPFW);
 			break;
 		default:
 			printf("unknown redirect mode: %u\n", r->mode);
 			/* XXX - panic?!?!? */
 			break;
 		}
 	}
 }
 
 /*
  * Build the redirect (and LSNAT spool) configuration of nat instance ptr
  * from a serialized buffer.
  *
  * buf holds ptr->redir_cnt records, each a struct nat44_cfg_redir
  * immediately followed by its spool_cnt struct nat44_cfg_spool entries.
  * Every record is registered with libalias and hooked onto
  * ptr->redir_chain.
  *
  * Returns 0 on success. EINVAL is returned as soon as a redirect cannot
  * be created (libalias returned NULL or the mode is unknown); entries
  * hooked before the failure stay on the chain.
  *
  * NOTE(review): the buffer is walked without a length argument, so the
  * caller is presumably expected to have validated its size -- confirm.
  */
 static int
 add_redir_spool_cfg(char *buf, struct cfg_nat *ptr)
 {
 	struct cfg_redir *r;
 	struct cfg_spool *s;
 	struct nat44_cfg_redir *ser_r;
 	struct nat44_cfg_spool *ser_s;
 	int cnt, off, i, j, nlinks;
 
 	for (cnt = 0, off = 0; cnt < ptr->redir_cnt; cnt++) {
 		ser_r = (struct nat44_cfg_redir *)&buf[off];
 		r = malloc(sizeof(*r), M_IPFW, M_WAITOK | M_ZERO);
 		r->mode = ser_r->mode;
 		r->laddr = ser_r->laddr;
 		r->paddr = ser_r->paddr;
 		r->raddr = ser_r->raddr;
 		r->lport = ser_r->lport;
 		r->pport = ser_r->pport;
 		r->rport = ser_r->rport;
 		r->pport_cnt = ser_r->pport_cnt;
 		r->rport_cnt = ser_r->rport_cnt;
 		r->proto = ser_r->proto;
 		r->spool_cnt = ser_r->spool_cnt;
 		LIST_INIT(&r->spool_chain);
 		off += sizeof(struct nat44_cfg_redir);
 		/*
 		 * alink[0] is used unconditionally below: address and proto
 		 * redirects store their single link there and the failure
 		 * check reads it. Always allocate at least one slot so a
 		 * zero pport_cnt cannot lead to an out-of-bounds access.
 		 */
 		nlinks = (r->pport_cnt > 0) ? r->pport_cnt : 1;
 		r->alink = malloc(sizeof(struct alias_link *) * nlinks,
 		    M_IPFW, M_WAITOK | M_ZERO);
 		switch (r->mode) {
 		case NAT44_REDIR_ADDR:
 			r->alink[0] = LibAliasRedirectAddr(ptr->lib, r->laddr,
 			    r->paddr);
 			break;
 		case NAT44_REDIR_PORT:
 			for (i = 0 ; i < r->pport_cnt; i++) {
 				/* If remotePort is all ports, set it to 0. */
 				u_short remotePortCopy = r->rport + i;
 				if (r->rport_cnt == 1 && r->rport == 0)
 					remotePortCopy = 0;
 				r->alink[i] = LibAliasRedirectPort(ptr->lib,
 				    r->laddr, htons(r->lport + i), r->raddr,
 				    htons(remotePortCopy), r->paddr,
 				    htons(r->pport + i), r->proto);
 				if (r->alink[i] == NULL) {
 					/*
 					 * Undo the part of the range that was
 					 * already created, otherwise those
 					 * links stay active in libalias with
 					 * nothing left to track them.
 					 */
 					for (j = 0; j < i; j++)
 						LibAliasRedirectDelete(ptr->lib,
 						    r->alink[j]);
 					/* Flag the failure for the check below. */
 					r->alink[0] = NULL;
 					break;
 				}
 			}
 			break;
 		case NAT44_REDIR_PROTO:
 			r->alink[0] = LibAliasRedirectProto(ptr->lib ,r->laddr,
 			    r->raddr, r->paddr, r->proto);
 			break;
 		default:
 			printf("unknown redirect mode: %u\n", r->mode);
 			break;
 		}
 		/* A NULL first link means the redirect was not created. */
 		if (r->alink[0] == NULL) {
 			printf("LibAliasRedirect* returned NULL\n");
 			free(r->alink, M_IPFW);
 			free(r, M_IPFW);
 			return (EINVAL);
 		}
 		/* LSNAT handling. */
 		for (i = 0; i < r->spool_cnt; i++) {
 			ser_s = (struct nat44_cfg_spool *)&buf[off];
 			s = malloc(sizeof(*s), M_IPFW, M_WAITOK | M_ZERO);
 			s->addr = ser_s->addr;
 			s->port = ser_s->port;
 			LibAliasAddServer(ptr->lib, r->alink[0],
 			    s->addr, htons(s->port));
 			off += sizeof(struct nat44_cfg_spool);
 			/* Hook spool entry. */
 			LIST_INSERT_HEAD(&r->spool_chain, s, _next);
 		}
 		/* And finally hook this redir entry. */
 		LIST_INSERT_HEAD(&ptr->redir_chain, r, _next);
 	}
 
 	return (0);
 }
 
 /*
  * Release a nat instance: drop its redirect/spool configuration, shut
  * down its libalias instance and free the structure itself.
  */
 static void
 free_nat_instance(struct cfg_nat *ptr)
 {
 	struct redir_chain *redirs = &ptr->redir_chain;
 
 	del_redir_spool_cfg(ptr, redirs);
 	LibAliasUninit(ptr->lib);
 	free(ptr, M_IPFW);
 }
 
 
 /*
  * ipfw_nat - perform mbuf header translation.
  *
  * Note V_layer3_chain has to be locked while calling ipfw_nat() in
  * 'global' operation mode (t == NULL).
  *
  * The packet m is first pulled up into a single buffer. With t == NULL
  * every nat instance without PKT_ALIAS_SKIP_GLOBAL is tried for outgoing
  * packets; otherwise instance t translates the packet in the direction
  * given by args->flags.
  *
  * Returns IP_FW_DENY with args->m set to NULL when the pullup fails or
  * the packet is dropped, otherwise IP_FW_NAT with args->m pointing to
  * the (possibly translated) packet.
  */
 static int
 ipfw_nat(struct ip_fw_args *args, struct cfg_nat *t, struct mbuf *m)
 {
 	struct mbuf *mcl;
 	struct ip *ip;
 	/* XXX - libalias duct tape */
 	int ldt, retval, found;
 	struct ip_fw_chain *chain;
 	char *c;
 
 	ldt = 0;
 	retval = 0;
 	mcl = m_megapullup(m, m->m_pkthdr.len);
 	if (mcl == NULL) {
 		args->m = NULL;
 		return (IP_FW_DENY);
 	}
 	ip = mtod(mcl, struct ip *);
 
 	/*
 	 * XXX - Libalias checksum offload 'duct tape':
 	 *
 	 * locally generated packets have only pseudo-header checksum
 	 * calculated and libalias will break it[1], so mark them for
 	 * later fix.  Moreover there are cases when libalias modifies
 	 * tcp packet data[2], mark them for later fix too.
 	 *
 	 * [1] libalias was never meant to run in kernel, so it does
 	 * not have any knowledge about checksum offloading, and
 	 * expects a packet with a full internet checksum.
 	 * Unfortunately, packets generated locally will have just the
 	 * pseudo header calculated, and when libalias tries to adjust
 	 * the checksum it will actually compute a wrong value.
 	 *
 	 * [2] when libalias modifies tcp's data content, full TCP
 	 * checksum has to be recomputed: the problem is that
 	 * libalias does not have any idea about checksum offloading.
 	 * To work around this, we do not do checksumming in LibAlias,
 	 * but only mark the packets in th_x2 field. If we receive a
 	 * marked packet, we calculate correct checksum for it
 	 * aware of offloading.  Why such a terrible hack instead of
 	 * recalculating checksum for each packet?
 	 * Because the previous checksum was not checked!
 	 * Recalculating checksums for EVERY packet will hide ALL
 	 * transmission errors. Yes, marked packets still suffer from
 	 * this problem. But, sigh, natd(8) has this problem, too.
 	 *
 	 * TODO: -make libalias mbuf aware (so
 	 * it can handle delayed checksum and tso)
 	 */
 
 	if (mcl->m_pkthdr.rcvif == NULL &&
 	    mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
 		ldt = 1;
 
 	c = mtod(mcl, char *);
 
 	/* Check if this is 'global' instance */
 	if (t == NULL) {
 		if (args->flags & IPFW_ARGS_IN) {
 			/* Wrong direction, skip processing */
 			args->m = mcl;
 			return (IP_FW_NAT);
 		}
 
 		found = 0;
 		chain = &V_layer3_chain;
 		IPFW_RLOCK_ASSERT(chain);
 		/* Check every nat entry... */
 		LIST_FOREACH(t, &chain->nat, _next) {
 			if ((t->mode & PKT_ALIAS_SKIP_GLOBAL) != 0)
 				continue;
 			retval = LibAliasOutTry(t->lib, c,
 			    mcl->m_len + M_TRAILINGSPACE(mcl), 0);
 			if (retval == PKT_ALIAS_OK) {
 				/* Nat instance recognises state */
 				found = 1;
 				break;
 			}
 		}
 		if (found != 1) {
 			/* No instance found, return ignore */
 			args->m = mcl;
 			return (IP_FW_NAT);
 		}
 	} else {
 		if (args->flags & IPFW_ARGS_IN)
 			retval = LibAliasIn(t->lib, c,
 				mcl->m_len + M_TRAILINGSPACE(mcl));
 		else
 			retval = LibAliasOut(t->lib, c,
 				mcl->m_len + M_TRAILINGSPACE(mcl));
 	}
 
 	/*
 	 * We drop packet when:
 	 * 1. libalias returns PKT_ALIAS_ERROR;
 	 * 2. For incoming packets:
 	 *	a) for unresolved fragments;
 	 *	b) libalias returns PKT_ALIAS_IGNORED and
 	 *		PKT_ALIAS_DENY_INCOMING flag is set.
 	 */
 	if (retval == PKT_ALIAS_ERROR ||
 	    ((args->flags & IPFW_ARGS_IN) &&
 	    (retval == PKT_ALIAS_UNRESOLVED_FRAGMENT ||
 	    (retval == PKT_ALIAS_IGNORED &&
 	    (t->mode & PKT_ALIAS_DENY_INCOMING) != 0)))) {
 		/* XXX - should i add some logging? */
 		m_free(mcl);
 		args->m = NULL;
 		return (IP_FW_DENY);
 	}
 
 	if (retval == PKT_ALIAS_RESPOND)
 		mcl->m_flags |= M_SKIP_FIREWALL;
 	/* Resync the mbuf lengths with the IP header's total length. */
 	mcl->m_pkthdr.len = mcl->m_len = ntohs(ip->ip_len);
 
 	/*
 	 * XXX - libalias checksum offload
 	 * 'duct tape' (see above)
 	 */
 
 	if ((ip->ip_off & htons(IP_OFFMASK)) == 0 &&
 	    ip->ip_p == IPPROTO_TCP) {
 		struct tcphdr 	*th;
 
 		th = (struct tcphdr *)(ip + 1);
 		if (th->th_x2)
 			ldt = 1;
 	}
 
 	if (ldt) {
 		struct tcphdr 	*th;
 		struct udphdr 	*uh;
 		uint16_t ip_len, cksum;
 
 		ip_len = ntohs(ip->ip_len);
 		/* Pseudo-header sum; stored in the transport checksum field. */
 		cksum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 		    htons(ip->ip_p + ip_len - (ip->ip_hl << 2)));
 
 		switch (ip->ip_p) {
 		case IPPROTO_TCP:
 			th = (struct tcphdr *)(ip + 1);
 			/*
 			 * Maybe it was set in
 			 * libalias...
 			 */
 			th->th_x2 = 0;
 			th->th_sum = cksum;
 			mcl->m_pkthdr.csum_data =
 			    offsetof(struct tcphdr, th_sum);
 			break;
 		case IPPROTO_UDP:
 			uh = (struct udphdr *)(ip + 1);
 			uh->uh_sum = cksum;
 			mcl->m_pkthdr.csum_data =
 			    offsetof(struct udphdr, uh_sum);
 			break;
 		}
 		/* No hw checksum offloading: do it ourselves */
 		if ((mcl->m_pkthdr.csum_flags & CSUM_DELAY_DATA) == 0) {
 			in_delayed_cksum(mcl);
 			mcl->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 		}
 	}
 	args->m = mcl;
 	return (IP_FW_NAT);
 }
 
 static struct cfg_nat *
 lookup_nat(struct nat_list *l, int nat_id)
 {
 	struct cfg_nat *res;
 
 	LIST_FOREACH(res, l, _next) {
 		if (res->id == nat_id)
 			break;
 	}
 	return res;
 }
 
 /*
  * Find a nat instance by its textual name, which must be a decimal
  * number other than zero with no trailing characters.
  * Returns the instance, or NULL when the name is malformed or no
  * instance with that id is on list l.
  */
 static struct cfg_nat *
 lookup_nat_name(struct nat_list *l, char *name)
 {
 	char *endp;
 	int wanted;
 
 	wanted = strtol(name, &endp, 10);
 	if (wanted == 0 || *endp != '\0')
 		return (NULL);
 
 	/* The id-based helper performs the same list walk. */
 	return (lookup_nat(l, wanted));
 }
 
 /* IP_FW3 configuration routines */
 
 /*
  * Create a nat instance, or reconfigure an existing one, from the
  * user-supplied description ucfg; the serialized redirect records are
  * expected directly after ucfg in memory.
  *
  * An existing instance is unhooked from chain->nat (and the rule
  * pointers to it are flushed) while it is being reconfigured; a new one
  * is allocated with the locks dropped. The instance is then (re)inserted
  * under the locks. chain->gencnt is used to notice that the list changed
  * while the locks were dropped: an instance with the same name that
  * appeared in the meantime is replaced and released.
  *
  * NOTE(review): the result of add_redir_spool_cfg() is ignored, so a
  * failing redirect leaves a partially configured instance in place.
  */
 static void
 nat44_config(struct ip_fw_chain *chain, struct nat44_cfg_nat *ucfg)
 {
 	struct cfg_nat *ptr, *tcfg;
 	int gencnt;
 
 	/*
 	 * Find/create nat rule.
 	 */
 	IPFW_UH_WLOCK(chain);
 	gencnt = chain->gencnt;
 	ptr = lookup_nat_name(&chain->nat, ucfg->name);
 	if (ptr == NULL) {
 		IPFW_UH_WUNLOCK(chain);
 		/* New rule: allocate and init new instance. */
 		ptr = malloc(sizeof(struct cfg_nat), M_IPFW, M_WAITOK | M_ZERO);
 		ptr->lib = LibAliasInit(NULL);
 		LIST_INIT(&ptr->redir_chain);
 	} else {
 		/* Entry already present: temporarily unhook it. */
 		IPFW_WLOCK(chain);
 		LIST_REMOVE(ptr, _next);
 		flush_nat_ptrs(chain, ptr->id);
 		IPFW_WUNLOCK(chain);
 		IPFW_UH_WUNLOCK(chain);
 	}
 
 	/*
 	 * Basic nat (re)configuration.
 	 */
 	ptr->id = strtol(ucfg->name, NULL, 10);
 	/*
 	 * XXX - what if this rule doesn't nat any ip and just
 	 * redirect?
 	 * do we set aliasaddress to 0.0.0.0?
 	 */
 	ptr->ip = ucfg->ip;
 	ptr->redir_cnt = ucfg->redir_cnt;
 	ptr->mode = ucfg->mode;
 	strlcpy(ptr->if_name, ucfg->if_name, sizeof(ptr->if_name));
 	LibAliasSetMode(ptr->lib, ptr->mode, ~0);
 	LibAliasSetAddress(ptr->lib, ptr->ip);
 
 	/*
 	 * Redir and LSNAT configuration.
 	 */
 	/* Delete old cfgs. */
 	del_redir_spool_cfg(ptr, &ptr->redir_chain);
 	/* Add new entries. */
 	add_redir_spool_cfg((char *)(ucfg + 1), ptr);
 	IPFW_UH_WLOCK(chain);
 
 	/* Extra check to avoid race with another ipfw_nat_cfg() */
 	tcfg = NULL;
 	if (gencnt != chain->gencnt)
 		tcfg = lookup_nat_name(&chain->nat, ucfg->name);
 	IPFW_WLOCK(chain);
 	if (tcfg != NULL) {
 		/*
 		 * Another configuration request inserted an instance with
 		 * the same name meanwhile: take it off the list and clear
 		 * the rule pointers cached for it before it is released.
 		 */
 		LIST_REMOVE(tcfg, _next);
 		flush_nat_ptrs(chain, tcfg->id);
 	}
 	LIST_INSERT_HEAD(&chain->nat, ptr, _next);
 	IPFW_WUNLOCK(chain);
 	chain->gencnt++;
 
 	IPFW_UH_WUNLOCK(chain);
 
 	/*
 	 * Release the displaced instance. ptr must not be freed here: it
 	 * has just been linked into chain->nat.
 	 */
 	if (tcfg != NULL)
 		free_nat_instance(tcfg);
 }
 
 /*
  * Creates or reconfigures a nat44 instance.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header nat44_cfg_nat .. ]
  *
  * The request is validated (sizes, name TLV length, a terminated name
  * that parses as a non-zero decimal number, room for redir_cnt redirect
  * records) before being handed to nat44_config().
  *
  * Returns 0 on success, EINVAL when any of the checks fails.
  */
 static int
 nat44_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	struct nat44_cfg_nat *ucfg;
 	ipfw_obj_header *oh;
 	size_t hdr_len;
 	char *endp;
 	int nat_id;
 
 	/* The fixed part of the request must be present. */
 	hdr_len = sizeof(*oh) + sizeof(*ucfg);
 	if (sd->valsize < hdr_len)
 		return (EINVAL);
 
 	oh = (ipfw_obj_header *)sd->kbuf;
 
 	/* Basic length check for the name TLV. */
 	if (oh->ntlv.head.length != sizeof(oh->ntlv))
 		return (EINVAL);
 
 	ucfg = (struct nat44_cfg_nat *)(oh + 1);
 
 	/* The name has to be terminated and has to look like a number. */
 	if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name))
 		return (EINVAL);
 	nat_id = strtol(ucfg->name, &endp, 10);
 	if (nat_id == 0 || *endp != '\0')
 		return (EINVAL);
 
 	/* All announced redirect records must fit into the request. */
 	if (sd->valsize <
 	    hdr_len + ucfg->redir_cnt*sizeof(struct nat44_cfg_redir))
 		return (EINVAL);
 
 	nat44_config(chain, ucfg);
 	return (0);
 }
 
 /*
  * Destroys the nat instance named in the request.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ]
  *
  * Returns 0 on success, EINVAL for a malformed request, or ESRCH when no
  * instance with that name exists.
  */
 static int
 nat44_destroy(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_header *oh;
 	ipfw_obj_ntlv *ntlv;
 	struct cfg_nat *victim;
 
 	/* The header must be complete. */
 	if (sd->valsize < sizeof(*oh))
 		return (EINVAL);
 
 	oh = (ipfw_obj_header *)sd->kbuf;
 	ntlv = &oh->ntlv;
 
 	/* Basic length check for the TLV and a terminated name. */
 	if (ntlv->head.length != sizeof(*ntlv))
 		return (EINVAL);
 	if (strnlen(ntlv->name, sizeof(ntlv->name)) == sizeof(ntlv->name))
 		return (EINVAL);
 
 	/* Unhook the instance and drop the rule pointers to it. */
 	IPFW_UH_WLOCK(chain);
 	victim = lookup_nat_name(&chain->nat, ntlv->name);
 	if (victim != NULL) {
 		IPFW_WLOCK(chain);
 		LIST_REMOVE(victim, _next);
 		flush_nat_ptrs(chain, victim->id);
 		IPFW_WUNLOCK(chain);
 	}
 	IPFW_UH_WUNLOCK(chain);
 
 	if (victim == NULL)
 		return (ESRCH);
 
 	/* The instance is released with the locks dropped. */
 	free_nat_instance(victim);
 	return (0);
 }
 
 /*
  * Fills the userland-visible @ucfg from the kernel nat instance @ptr:
  * the instance id is rendered as a decimal string into the name field,
  * the remaining settings are copied over.
  */
 static void
 export_nat_cfg(struct cfg_nat *ptr, struct nat44_cfg_nat *ucfg)
 {
 
 	/* Plain settings are copied as-is. */
 	ucfg->mode = ptr->mode;
 	ucfg->redir_cnt = ptr->redir_cnt;
 	ucfg->ip = ptr->ip;
 
 	/* String fields: bounded copy of if_name, id printed as the name. */
 	strlcpy(ucfg->if_name, ptr->if_name, sizeof(ucfg->if_name));
 	snprintf(ucfg->name, sizeof(ucfg->name), "%d", ptr->id);
 }
 
 /*
  * Gets config for given nat instance
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header nat44_cfg_nat .. ]
  *
  * Returns 0 on success
  */
 static int
 nat44_get_cfg(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_header *oh;
 	struct nat44_cfg_nat *ucfg;
 	struct cfg_nat *ptr;
 	struct cfg_redir *r;
 	struct cfg_spool *s;
 	struct nat44_cfg_redir *ser_r;
 	struct nat44_cfg_spool *ser_s;
 	size_t sz;
 
 	sz = sizeof(*oh) + sizeof(*ucfg);
 	/* Check minimum header size */
 	if (sd->valsize < sz)
 		return (EINVAL);
 
 	/* Header and nat44_cfg_nat are obtained as one contiguous chunk. */
 	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
 
 	/* Basic length checks for TLVs */
 	if (oh->ntlv.head.length != sizeof(oh->ntlv))
 		return (EINVAL);
 
 	/* The nat44 configuration immediately follows the object header. */
 	ucfg = (struct nat44_cfg_nat *)(oh + 1);
 
 	/* Check if name is properly terminated */
 	if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name))
 		return (EINVAL);
 
 	/* The UH read lock is held from lookup until serialization ends. */
 	IPFW_UH_RLOCK(chain);
 	ptr = lookup_nat_name(&chain->nat, ucfg->name);
 	if (ptr == NULL) {
 		IPFW_UH_RUNLOCK(chain);
 		return (ESRCH);
 	}
 
 	/* Overwrite the request's ucfg with the instance settings. */
 	export_nat_cfg(ptr, ucfg);
 	
 	/* Estimate memory amount */
 	sz = sizeof(ipfw_obj_header) + sizeof(struct nat44_cfg_nat);
 	LIST_FOREACH(r, &ptr->redir_chain, _next) {
 		sz += sizeof(struct nat44_cfg_redir);
 		LIST_FOREACH(s, &r->spool_chain, _next)
 			sz += sizeof(struct nat44_cfg_spool);
 	}
 
 	/* Report the full reply size, even if the buffer is too small. */
 	ucfg->size = sz;
 	if (sd->valsize < sz) {
 
 		/*
 		 * Submitted buffer size is not enough.
 		 * We've already filled in @ucfg structure with
 		 * relevant info including size, so we
 		 * can return. Buffer will be flushed automatically.
 		 */
 		IPFW_UH_RUNLOCK(chain);
 		return (ENOMEM);
 	}
 
 	/*
 	 * Size OK, let's copy data.
 	 * Each redirect record is followed by the records of its spools.
 	 */
 	LIST_FOREACH(r, &ptr->redir_chain, _next) {
 		ser_r = (struct nat44_cfg_redir *)ipfw_get_sopt_space(sd,
 		    sizeof(*ser_r));
 		ser_r->mode = r->mode;
 		ser_r->laddr = r->laddr;
 		ser_r->paddr = r->paddr;
 		ser_r->raddr = r->raddr;
 		ser_r->lport = r->lport;
 		ser_r->pport = r->pport;
 		ser_r->rport = r->rport;
 		ser_r->pport_cnt = r->pport_cnt;
 		ser_r->rport_cnt = r->rport_cnt;
 		ser_r->proto = r->proto;
 		ser_r->spool_cnt = r->spool_cnt;
 
 		LIST_FOREACH(s, &r->spool_chain, _next) {
 			ser_s = (struct nat44_cfg_spool *)ipfw_get_sopt_space(
 			    sd, sizeof(*ser_s));
 
 			ser_s->addr = s->addr;
 			ser_s->port = s->port;
 		}
 	}
 
 	IPFW_UH_RUNLOCK(chain);
 
 	return (0);
 }
 
 /*
  * Lists all nat44 instances currently available in kernel.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_lheader ]
  * Reply: [ ipfw_obj_lheader nat44_cfg_nat x N ]
  *
  * Returns 0 on success
  */
 static int
 nat44_list_nat(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_lheader *lhdr;
 	struct nat44_cfg_nat *slot;
 	struct cfg_nat *nat;
 	int total;
 
 	/* The request must at least hold the list header. */
 	if (sd->valsize < sizeof(ipfw_obj_lheader))
 		return (EINVAL);
 
 	lhdr = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*lhdr));
 
 	IPFW_UH_RLOCK(chain);
 
 	/* First pass: count the instances to compute the reply size. */
 	total = 0;
 	LIST_FOREACH(nat, &chain->nat, _next)
 		total++;
 
 	lhdr->count = total;
 	lhdr->objsize = sizeof(struct nat44_cfg_nat);
 	lhdr->size = sizeof(*lhdr) + lhdr->count * lhdr->objsize;
 
 	/* The header already reports the needed size to the caller. */
 	if (sd->valsize < lhdr->size) {
 		IPFW_UH_RUNLOCK(chain);
 		return (ENOMEM);
 	}
 
 	/* Second pass: export one nat44_cfg_nat record per instance. */
 	LIST_FOREACH(nat, &chain->nat, _next) {
 		slot = (struct nat44_cfg_nat *)ipfw_get_sopt_space(sd,
 		    sizeof(*slot));
 		export_nat_cfg(nat, slot);
 	}
 
 	IPFW_UH_RUNLOCK(chain);
 
 	return (0);
 }
 
 /*
  * Gets log for given nat instance
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header nat44_cfg_nat ]
  * Reply: [ ipfw_obj_header nat44_cfg_nat LOGBUFFER ]
  *
  * Returns 0 on success
  */
 static int
 nat44_get_log(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_header *oh;
 	struct nat44_cfg_nat *ucfg;
 	struct cfg_nat *ptr;
 	void *pbuf;
 	size_t sz;
 
 	sz = sizeof(*oh) + sizeof(*ucfg);
 	/* Check minimum header size */
 	if (sd->valsize < sz)
 		return (EINVAL);
 
 	oh = (struct _ipfw_obj_header *)ipfw_get_sopt_header(sd, sz);
 
 	/* Basic length checks for TLVs */
 	if (oh->ntlv.head.length != sizeof(oh->ntlv))
 		return (EINVAL);
 
 	/* The nat44 configuration immediately follows the object header. */
 	ucfg = (struct nat44_cfg_nat *)(oh + 1);
 
 	/* Check if name is properly terminated */
 	if (strnlen(ucfg->name, sizeof(ucfg->name)) == sizeof(ucfg->name))
 		return (EINVAL);
 
 	IPFW_UH_RLOCK(chain);
 	ptr = lookup_nat_name(&chain->nat, ucfg->name);
 	if (ptr == NULL) {
 		IPFW_UH_RUNLOCK(chain);
 		return (ESRCH);
 	}
 
 	/* Instances without a log buffer have nothing to report. */
 	if (ptr->lib->logDesc == NULL) {
 		IPFW_UH_RUNLOCK(chain);
 		return (ENOENT);
 	}
 
 	export_nat_cfg(ptr, ucfg);
 
 	/* Estimate memory amount */
 	ucfg->size = sizeof(struct nat44_cfg_nat) + LIBALIAS_BUF_SIZE;
 
 	/*
 	 * The reply is [ ipfw_obj_header nat44_cfg_nat LOGBUFFER ], so the
 	 * caller's buffer has to hold the already consumed @sz bytes plus
 	 * the whole log buffer (equivalently sizeof(*oh) + ucfg->size).
 	 * The previous test compared against sz + sizeof(*oh), which let
 	 * buffers too small for the log pass the check.
 	 */
 	if (sd->valsize < sz + LIBALIAS_BUF_SIZE) {
 
 		/*
 		 * Submitted buffer size is not enough.
 		 * We've already filled in @ucfg structure with
 		 * relevant info including size, so we
 		 * can return. Buffer will be flushed automatically.
 		 */
 		IPFW_UH_RUNLOCK(chain);
 		return (ENOMEM);
 	}
 
 	pbuf = (void *)ipfw_get_sopt_space(sd, LIBALIAS_BUF_SIZE);
 	memcpy(pbuf, ptr->lib->logDesc, LIBALIAS_BUF_SIZE);
 
 	IPFW_UH_RUNLOCK(chain);
 
 	return (0);
 }
 
 /*
  * Socket option handlers implemented in this file.  The table is
  * registered with IPFW_ADD_SOPT_HANDLER() in ipfw_nat_init() and removed
  * with IPFW_DEL_SOPT_HANDLER() in ipfw_nat_destroy().
  * Columns: opcode, a numeric field that is 0 for every entry (presumably
  * the handler version -- TODO confirm against struct ipfw_sopt_handler),
  * direction (HDIR_SET / HDIR_GET) and the handler function.
  */
 static struct ipfw_sopt_handler	scodes[] = {
 	{ IP_FW_NAT44_XCONFIG,	0,	HDIR_SET,	nat44_cfg },
 	{ IP_FW_NAT44_DESTROY,	0,	HDIR_SET,	nat44_destroy },
 	{ IP_FW_NAT44_XGETCONFIG,	0,	HDIR_GET,	nat44_get_cfg },
 	{ IP_FW_NAT44_LIST_NAT,	0,	HDIR_GET,	nat44_list_nat },
 	{ IP_FW_NAT44_XGETLOG,	0,	HDIR_GET,	nat44_get_log },
 };
 
 
 /*
  * Legacy configuration routines
  */
 
 /*
  * Legacy layout of a spool entry as written to the userland buffer by
  * ipfw_nat_get_cfg(); only addr and port are filled in there.
  */
 struct cfg_spool_legacy {
 	LIST_ENTRY(cfg_spool_legacy)	_next;
 	struct in_addr			addr;
 	u_short				port;
 };
 
 /*
  * Legacy layout of a redirect entry.  ipfw_nat_cfg() converts arrays of
  * these into struct nat44_cfg_redir and ipfw_nat_get_cfg() serializes
  * kernel redirects back into this layout.
  * NOTE(review): _next is declared with struct cfg_redir rather than
  * cfg_redir_legacy; neither function in this file reads or writes the
  * _next and alink members.
  */
 struct cfg_redir_legacy {
 	LIST_ENTRY(cfg_redir)   _next;
 	u_int16_t               mode;
 	struct in_addr	        laddr;
 	struct in_addr	        paddr;
 	struct in_addr	        raddr;
 	u_short                 lport;
 	u_short                 pport;
 	u_short                 rport;
 	u_short                 pport_cnt;
 	u_short                 rport_cnt;
 	int                     proto;
 	struct alias_link       **alink;
 	u_int16_t               spool_cnt;
 	LIST_HEAD(, cfg_spool_legacy) spool_chain;
 };
 
 /*
  * Legacy layout of a nat instance header.  It is the first structure in
  * the buffer accepted by ipfw_nat_cfg() (followed by redir_cnt
  * cfg_redir_legacy records) and the per-instance header emitted by
  * ipfw_nat_get_cfg().  Those functions use id, ip, if_name, mode and
  * redir_cnt only.
  */
 struct cfg_nat_legacy {
 	LIST_ENTRY(cfg_nat_legacy)	_next;
 	int				id;
 	struct in_addr			ip;
 	char				if_name[IF_NAMESIZE];
 	int				mode;
 	struct libalias			*lib;
 	int				redir_cnt;
 	LIST_HEAD(, cfg_redir_legacy)	redir_chain;
 };
 
 static int
 ipfw_nat_cfg(struct sockopt *sopt)
 {
 	struct cfg_nat_legacy *cfg;
 	struct nat44_cfg_nat *ucfg;
 	struct cfg_redir_legacy *rdir;
 	struct nat44_cfg_redir *urdir;
 	char *buf;
 	size_t len, len2;
 	int error, i;
 
 	len = sopt->sopt_valsize;
 	len2 = len + 128;
 
 	/*
 	 * Allocate 2x buffer to store converted structures.
 	 * new redir_cfg has shrunk, so we're sure that
 	 * new buffer size is enough.
 	 */
 	buf = malloc(roundup2(len, 8) + len2, M_TEMP, M_WAITOK | M_ZERO);
 	error = sooptcopyin(sopt, buf, len, sizeof(struct cfg_nat_legacy));
 	if (error != 0)
 		goto out;
 
 	cfg = (struct cfg_nat_legacy *)buf;
 	if (cfg->id < 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	ucfg = (struct nat44_cfg_nat *)&buf[roundup2(len, 8)];
 	snprintf(ucfg->name, sizeof(ucfg->name), "%d", cfg->id);
 	strlcpy(ucfg->if_name, cfg->if_name, sizeof(ucfg->if_name));
 	ucfg->ip = cfg->ip;
 	ucfg->mode = cfg->mode;
 	ucfg->redir_cnt = cfg->redir_cnt;
 
 	if (len < sizeof(*cfg) + cfg->redir_cnt * sizeof(*rdir)) {
 		error = EINVAL;
 		goto out;
 	}
 
 	urdir = (struct nat44_cfg_redir *)(ucfg + 1);
 	rdir = (struct cfg_redir_legacy *)(cfg + 1);
 	for (i = 0; i < cfg->redir_cnt; i++) {
 		urdir->mode = rdir->mode;
 		urdir->laddr = rdir->laddr;
 		urdir->paddr = rdir->paddr;
 		urdir->raddr = rdir->raddr;
 		urdir->lport = rdir->lport;
 		urdir->pport = rdir->pport;
 		urdir->rport = rdir->rport;
 		urdir->pport_cnt = rdir->pport_cnt;
 		urdir->rport_cnt = rdir->rport_cnt;
 		urdir->proto = rdir->proto;
 		urdir->spool_cnt = rdir->spool_cnt;
 
 		urdir++;
 		rdir++;
 	}
 
 	nat44_config(&V_layer3_chain, ucfg);
 
 out:
 	free(buf, M_TEMP);
 	return (error);
 }
 
 static int
 ipfw_nat_del(struct sockopt *sopt)
 {
 	struct cfg_nat *ptr;
 	struct ip_fw_chain *chain = &V_layer3_chain;
 	int i;
 
 	sooptcopyin(sopt, &i, sizeof i, sizeof i);
 	/* XXX validate i */
 	IPFW_UH_WLOCK(chain);
 	ptr = lookup_nat(&chain->nat, i);
 	if (ptr == NULL) {
 		IPFW_UH_WUNLOCK(chain);
 		return (EINVAL);
 	}
 	IPFW_WLOCK(chain);
 	LIST_REMOVE(ptr, _next);
 	flush_nat_ptrs(chain, i);
 	IPFW_WUNLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 	free_nat_instance(ptr);
 	return (0);
 }
 
 /*
  * Legacy "get configuration" entry point.
  * Reply layout: [ int nat_cnt ] followed, for every nat instance, by
  * [ cfg_nat_legacy ] [ cfg_redir_legacy [ cfg_spool_legacy x M ] ] x N.
  *
  * The needed size is computed under the UH read lock, the lock is dropped
  * for the M_WAITOK allocation, and chain->gencnt is compared afterwards:
  * if it changed in between, the buffer is freed and sizing starts over.
  *
  * Returns the result of sooptcopyout().
  */
 static int
 ipfw_nat_get_cfg(struct sockopt *sopt)
 {
 	struct ip_fw_chain *chain = &V_layer3_chain;
 	struct cfg_nat *n;
 	struct cfg_nat_legacy *ucfg;
 	struct cfg_redir *r;
 	struct cfg_spool *s;
 	struct cfg_redir_legacy *ser_r;
 	struct cfg_spool_legacy *ser_s;
 	char *data;
 	int gencnt, nat_cnt, len, error;
 
 	nat_cnt = 0;
 	len = sizeof(nat_cnt);
 
 	IPFW_UH_RLOCK(chain);
 retry:
 	/* Reached with the UH read lock held, nat_cnt == 0, len reset. */
 	gencnt = chain->gencnt;
 	/* Estimate memory amount */
 	LIST_FOREACH(n, &chain->nat, _next) {
 		nat_cnt++;
 		len += sizeof(struct cfg_nat_legacy);
 		LIST_FOREACH(r, &n->redir_chain, _next) {
 			len += sizeof(struct cfg_redir_legacy);
 			LIST_FOREACH(s, &r->spool_chain, _next)
 				len += sizeof(struct cfg_spool_legacy);
 		}
 	}
 	IPFW_UH_RUNLOCK(chain);
 
 	/* Allocate without the lock held; the buffer starts with the count. */
 	data = malloc(len, M_TEMP, M_WAITOK | M_ZERO);
 	bcopy(&nat_cnt, data, sizeof(nat_cnt));
 
 	/*
 	 * Reset the counters: len is reused as the write offset below and
 	 * both must be back at their initial values if we jump to retry.
 	 */
 	nat_cnt = 0;
 	len = sizeof(nat_cnt);
 
 	IPFW_UH_RLOCK(chain);
 	if (gencnt != chain->gencnt) {
 		/* Configuration changed while unlocked; size it again. */
 		free(data, M_TEMP);
 		goto retry;
 	}
 	/* Serialize all the data. */
 	LIST_FOREACH(n, &chain->nat, _next) {
 		ucfg = (struct cfg_nat_legacy *)&data[len];
 		ucfg->id = n->id;
 		ucfg->ip = n->ip;
 		ucfg->redir_cnt = n->redir_cnt;
 		ucfg->mode = n->mode;
 		strlcpy(ucfg->if_name, n->if_name, sizeof(ucfg->if_name));
 		len += sizeof(struct cfg_nat_legacy);
 		LIST_FOREACH(r, &n->redir_chain, _next) {
 			ser_r = (struct cfg_redir_legacy *)&data[len];
 			ser_r->mode = r->mode;
 			ser_r->laddr = r->laddr;
 			ser_r->paddr = r->paddr;
 			ser_r->raddr = r->raddr;
 			ser_r->lport = r->lport;
 			ser_r->pport = r->pport;
 			ser_r->rport = r->rport;
 			ser_r->pport_cnt = r->pport_cnt;
 			ser_r->rport_cnt = r->rport_cnt;
 			ser_r->proto = r->proto;
 			ser_r->spool_cnt = r->spool_cnt;
 			len += sizeof(struct cfg_redir_legacy);
 			LIST_FOREACH(s, &r->spool_chain, _next) {
 				ser_s = (struct cfg_spool_legacy *)&data[len];
 				ser_s->addr = s->addr;
 				ser_s->port = s->port;
 				len += sizeof(struct cfg_spool_legacy);
 			}
 		}
 	}
 	IPFW_UH_RUNLOCK(chain);
 
 	/* len now equals the number of bytes written into data. */
 	error = sooptcopyout(sopt, data, len);
 	free(data, M_TEMP);
 
 	return (error);
 }
 
 static int
 ipfw_nat_get_log(struct sockopt *sopt)
 {
 	uint8_t *data;
 	struct cfg_nat *ptr;
 	int i, size;
 	struct ip_fw_chain *chain;
 	IPFW_RLOCK_TRACKER;
 
 	chain = &V_layer3_chain;
 
 	IPFW_RLOCK(chain);
 	/* one pass to count, one to copy the data */
 	i = 0;
 	LIST_FOREACH(ptr, &chain->nat, _next) {
 		if (ptr->lib->logDesc == NULL)
 			continue;
 		i++;
 	}
 	size = i * (LIBALIAS_BUF_SIZE + sizeof(int));
 	data = malloc(size, M_IPFW, M_NOWAIT | M_ZERO);
 	if (data == NULL) {
 		IPFW_RUNLOCK(chain);
 		return (ENOSPC);
 	}
 	i = 0;
 	LIST_FOREACH(ptr, &chain->nat, _next) {
 		if (ptr->lib->logDesc == NULL)
 			continue;
 		bcopy(&ptr->id, &data[i], sizeof(int));
 		i += sizeof(int);
 		bcopy(ptr->lib->logDesc, &data[i], LIBALIAS_BUF_SIZE);
 		i += LIBALIAS_BUF_SIZE;
 	}
 	IPFW_RUNLOCK(chain);
 	sooptcopyout(sopt, data, size);
 	free(data, M_IPFW);
 	return(0);
 }
 
 /*
  * Per-vnet initialization: flags the nat module as ready in this vnet.
  * Always returns 0.
  */
 static int
 vnet_ipfw_nat_init(const void *arg __unused)
 {
 	V_ipfw_nat_ready = 1;
 
 	return (0);
 }
 
 /*
  * Per-vnet teardown: clears the ready flag, destroys every nat instance
  * on the vnet's layer3 chain and flushes all references to them, all
  * under the chain write lock.  Always returns 0.
  */
 static int
 vnet_ipfw_nat_uninit(const void *arg __unused)
 {
 	struct ip_fw_chain *ch = &V_layer3_chain;
 	struct cfg_nat *nat, *next_nat;
 
 	IPFW_WLOCK(ch);
 	V_ipfw_nat_ready = 0;
 	/* The _SAFE variant is required: entries are freed while iterating. */
 	LIST_FOREACH_SAFE(nat, &ch->nat, _next, next_nat) {
 		LIST_REMOVE(nat, _next);
 		free_nat_instance(nat);
 	}
 	flush_nat_ptrs(ch, -1 /* flush all */);
 	IPFW_WUNLOCK(ch);
 
 	return (0);
 }
 
 /*
  * Module-wide initialization: publishes this file's entry points through
  * the ipfw hook pointers, registers the sockopt handler table and
  * subscribes ifaddr_change() to ifaddr_event notifications.
  */
 static void
 ipfw_nat_init(void)
 {
 
 	/* init ipfw hooks */
 	ipfw_nat_ptr = ipfw_nat;
 	lookup_nat_ptr = lookup_nat;
 	ipfw_nat_cfg_ptr = ipfw_nat_cfg;
 	ipfw_nat_del_ptr = ipfw_nat_del;
 	ipfw_nat_get_cfg_ptr = ipfw_nat_get_cfg;
 	ipfw_nat_get_log_ptr = ipfw_nat_get_log;
 	/* NOTE(review): meaning of the first argument (1) is not visible here. */
 	IPFW_ADD_SOPT_HANDLER(1, scodes);
 
 	/* The tag is kept so ipfw_nat_destroy() can deregister the handler. */
 	ifaddr_event_tag = EVENTHANDLER_REGISTER(ifaddr_event, ifaddr_change,
 	    NULL, EVENTHANDLER_PRI_ANY);
 }
 
 /*
  * Module-wide teardown, the counterpart of ipfw_nat_init(): deregisters
  * the ifaddr_event handler and the sockopt handler table, then clears
  * all ipfw hook pointers.
  */
 static void
 ipfw_nat_destroy(void)
 {
 
 	EVENTHANDLER_DEREGISTER(ifaddr_event, ifaddr_event_tag);
 	/* deregister ipfw_nat */
 	IPFW_DEL_SOPT_HANDLER(1, scodes);
 	ipfw_nat_ptr = NULL;
 	lookup_nat_ptr = NULL;
 	ipfw_nat_cfg_ptr = NULL;
 	ipfw_nat_del_ptr = NULL;
 	ipfw_nat_get_cfg_ptr = NULL;
 	ipfw_nat_get_log_ptr = NULL;
 }
 
 /*
  * Module event handler.  Load and unload need no work here and succeed;
  * every other event type is rejected with EOPNOTSUPP.
  */
 static int
 ipfw_nat_modevent(module_t mod, int type, void *unused)
 {
 
 	if (type == MOD_LOAD || type == MOD_UNLOAD)
 		return (0);
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Module description handed to DECLARE_MODULE(): module name, event
  * handler and a third field set to 0.
  */
 static moduledata_t ipfw_nat_mod = {
 	"ipfw_nat",
 	ipfw_nat_modevent,
 	0
 };
 
 /*
  * Define startup order.
  * Everything runs in the SI_SUB_PROTO_FIREWALL subsystem; the module-wide
  * init is ordered one step after IPFW_NAT_MODEVENT_ORDER and the per-vnet
  * init one step after that.
  */
 #define	IPFW_NAT_SI_SUB_FIREWALL	SI_SUB_PROTO_FIREWALL
 #define	IPFW_NAT_MODEVENT_ORDER		(SI_ORDER_ANY - 128) /* after ipfw */
 #define	IPFW_NAT_MODULE_ORDER		(IPFW_NAT_MODEVENT_ORDER + 1)
 #define	IPFW_NAT_VNET_ORDER		(IPFW_NAT_MODEVENT_ORDER + 2)
 
 /*
  * NOTE(review): the module itself is declared with SI_ORDER_ANY, not
  * with IPFW_NAT_MODEVENT_ORDER -- confirm this is intended.
  */
 DECLARE_MODULE(ipfw_nat, ipfw_nat_mod, IPFW_NAT_SI_SUB_FIREWALL, SI_ORDER_ANY);
 MODULE_DEPEND(ipfw_nat, libalias, 1, 1, 1);
 MODULE_DEPEND(ipfw_nat, ipfw, 3, 3, 3);
 MODULE_VERSION(ipfw_nat, 1);
 
 /* Module-wide init first, then the per-vnet init. */
 SYSINIT(ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER,
     ipfw_nat_init, NULL);
 VNET_SYSINIT(vnet_ipfw_nat_init, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_VNET_ORDER,
     vnet_ipfw_nat_init, NULL);
 
 /* Teardown counterparts, registered with the same ordering constants. */
 SYSUNINIT(ipfw_nat_destroy, IPFW_NAT_SI_SUB_FIREWALL, IPFW_NAT_MODULE_ORDER,
     ipfw_nat_destroy, NULL);
 VNET_SYSUNINIT(vnet_ipfw_nat_uninit, IPFW_NAT_SI_SUB_FIREWALL,
     IPFW_NAT_VNET_ORDER, vnet_ipfw_nat_uninit, NULL);
 
 /* end of file */
Index: projects/fuse2/sys/netpfil/ipfw/ip_fw_private.h
===================================================================
--- projects/fuse2/sys/netpfil/ipfw/ip_fw_private.h	(revision 350434)
+++ projects/fuse2/sys/netpfil/ipfw/ip_fw_private.h	(revision 350435)
@@ -1,823 +1,824 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _IPFW2_PRIVATE_H
 #define _IPFW2_PRIVATE_H
 
 /*
  * Internal constants and data structures used by ipfw components
  * and not meant to be exported outside the kernel.
  */
 
 #ifdef _KERNEL
 
 /*
  * For platforms that do not have SYSCTL support, we wrap the
  * SYSCTL_* into a function (one per file) to collect the values
  * into an array at module initialization. The wrapping macros,
  * SYSBEGIN() and SYSEND, are empty in the default case.
  */
 /* Default case: both wrappers expand to nothing unless already defined. */
 #ifndef SYSBEGIN
 #define SYSBEGIN(x)
 #endif
 #ifndef SYSEND
 #define SYSEND
 #endif
 
 /*
  * Return values from ipfw_chk().
  * IP_FW_PASS is explicitly 0; the remaining values follow sequentially.
  */
 enum {
 	IP_FW_PASS = 0,
 	IP_FW_DENY,
 	IP_FW_DIVERT,
 	IP_FW_TEE,
 	IP_FW_DUMMYNET,
 	IP_FW_NETGRAPH,
 	IP_FW_NGTEE,
 	IP_FW_NAT,
 	IP_FW_REASS,
 	IP_FW_NAT64,
 };
 
 /*
  * Structure for collecting parameters to dummynet for ip6_output forwarding
  * NOTE(review): every member carries an "_or" suffix, presumably meaning
  * the original value saved for later use -- confirm against the users.
  */
 struct _ip6dn_args {
        struct ip6_pktopts *opt_or;
        int flags_or;
        struct ip6_moptions *im6o_or;
        struct ifnet *origifp_or;
        struct ifnet *ifp_or;
        struct sockaddr_in6 dst_or;
        u_long mtu_or;
 };
 
 
 /*
  * Arguments for calling ipfw_chk() and dummynet_io(). We put them
  * all into a structure because this way it is easier and more
  * efficient to pass variables around and extend the interface.
  *
  * The flags word is split: the low 16 bits (IPFW_ARGS_LENMASK) hold the
  * length of the data in *mem, the upper bits hold the IPFW_ARGS_* flags.
  */
 struct ip_fw_args {
 	uint32_t		flags;
 #define	IPFW_ARGS_ETHER		0x00010000	/* valid ethernet header */
 #define	IPFW_ARGS_NH4		0x00020000	/* IPv4 next hop in hopstore */
 #define	IPFW_ARGS_NH6		0x00040000	/* IPv6 next hop in hopstore */
 #define	IPFW_ARGS_NH4PTR	0x00080000	/* IPv4 next hop in next_hop */
 #define	IPFW_ARGS_NH6PTR	0x00100000	/* IPv6 next hop in next_hop6 */
 #define	IPFW_ARGS_REF		0x00200000	/* valid ipfw_rule_ref	*/
 #define	IPFW_ARGS_IN		0x00400000	/* called on input */
 #define	IPFW_ARGS_OUT		0x00800000	/* called on output */
 #define	IPFW_ARGS_IP4		0x01000000	/* belongs to v4 ISR */
 #define	IPFW_ARGS_IP6		0x02000000	/* belongs to v6 ISR */
 #define	IPFW_ARGS_DROP		0x04000000	/* drop it (dummynet) */
 #define	IPFW_ARGS_LENMASK	0x0000ffff	/* length of data in *mem */
 #define	IPFW_ARGS_LENGTH(f)	((f) & IPFW_ARGS_LENMASK)
 	/*
 	 * On return, it points to the matching rule.
 	 * On entry, rule.slot > 0 means the info is valid and
 	 * contains the starting rule for an ipfw search.
 	 * If chain_id == chain->id && slot >0 then jump to that slot.
 	 * Otherwise, we locate the first rule >= rulenum:rule_id
 	 */
 	struct ipfw_rule_ref	rule;	/* match/restart info		*/
 
 	struct ifnet		*ifp;	/* input/output interface	*/
 	struct inpcb		*inp;
 	/* Next hop: the IPFW_ARGS_NH* flags tell which member is valid. */
 	union {
 		/*
 		 * next_hop[6] pointers can be used to point to next hop
 		 * stored in rule's opcode to avoid copying into hopstore.
 		 * Also, it is expected that all 0x1-0x10 flags are mutually
 		 * exclusive.
 		 */
 		struct sockaddr_in	*next_hop;
 		struct sockaddr_in6	*next_hop6;
 		/* ipfw next hop storage */
 		struct sockaddr_in	hopstore;
 		struct ip_fw_nh6 {
 			struct in6_addr sin6_addr;
 			uint32_t	sin6_scope_id;
 			uint16_t	sin6_port;
 		} hopstore6;
 	};
 	/* The packet itself: either an mbuf chain or a raw memory pointer. */
 	union {
 		struct mbuf	*m;	/* the mbuf chain		*/
 		void		*mem;	/* or memory pointer		*/
 	};
 	struct ipfw_flow_id	f_id;	/* grabbed from IP header	*/
 };
 
 /* malloc(9) type used for ipfw allocations (e.g. M_IPFW buffers). */
 MALLOC_DECLARE(M_IPFW);
 
 /*
  * wrapper for freeing a packet, in case we need to do more work.
  * Can be overridden by defining FREE_PKT earlier; Linux and Windows
  * builds hand the mbuf to netisr_dispatch(), others to m_freem().
  */
 #ifndef FREE_PKT
 #if defined(__linux__) || defined(_WIN32)
 #define FREE_PKT(m)	netisr_dispatch(-1, m)
 #else
 #define FREE_PKT(m)	m_freem(m)
 #endif
 #endif /* !FREE_PKT */
 
 /*
  * Function definitions.
  */
 /* Takes the packet description in @args; results are the IP_FW_* enum. */
 int ipfw_chk(struct ip_fw_args *args);
 struct mbuf *ipfw_send_pkt(struct mbuf *, struct ipfw_flow_id *,
     u_int32_t, u_int32_t, int);
 
 int ipfw_attach_hooks(void);
 void ipfw_detach_hooks(void);
 /* Only declared when NOTYET is defined. */
 #ifdef NOTYET
 void ipfw_nat_destroy(void);
 #endif
 
 /* In ip_fw_log.c */
 struct ip;
 struct ip_fw_chain;
 
 void ipfw_bpf_init(int);
 void ipfw_bpf_uninit(int);
 void ipfw_bpf_tap(u_char *, u_int);
 void ipfw_bpf_mtap(struct mbuf *);
 void ipfw_bpf_mtap2(void *, u_int, struct mbuf *);
 void ipfw_log(struct ip_fw_chain *chain, struct ip_fw *f, u_int hlen,
     struct ip_fw_args *args, u_short offset, uint32_t tablearg, struct ip *ip);
 /* Per-vnet variables, accessed through the V_* wrappers. */
 VNET_DECLARE(u_int64_t, norule_counter);
 #define	V_norule_counter	VNET(norule_counter)
 VNET_DECLARE(int, verbose_limit);
 #define	V_verbose_limit		VNET(verbose_limit)
 
 /* In ip_fw_dynamic.c */
 struct sockopt_data;
 
 /*
  * MATCH_UNKNOWN doubles as the "no lookup done yet" marker stored in
  * ipfw_dyn_info.direction by DYN_INFO_INIT().
  */
 enum { /* result for matching dynamic rules */
 	MATCH_REVERSE = 0,
 	MATCH_FORWARD,
 	MATCH_NONE,
 	MATCH_UNKNOWN,
 };
 
 /*
  * Macro to determine that we need to do or redo dynamic state lookup.
  * direction == MATCH_UNKNOWN means that this is first lookup, then we need
  * to do lookup.
  * Otherwise check the state name, if previous lookup was for "any" name,
  * this means there is no state with specific name. Thus no need to do
  * lookup. If previous name was not "any", redo lookup for specific name.
  */
 #define	DYN_LOOKUP_NEEDED(p, cmd)	\
     ((p)->direction == MATCH_UNKNOWN ||	\
 	((p)->kidx != 0 && (p)->kidx != (cmd)->arg1))
 #define	DYN_INFO_INIT(p)	do {	\
 	(p)->direction = MATCH_UNKNOWN;	\
 	(p)->kidx = 0;			\
 } while (0)
 /*
  * Dynamic state lookup context passed to ipfw_dyn_lookup_state() and
  * ipfw_dyn_install_state(); initialized with DYN_INFO_INIT().
  */
 struct ipfw_dyn_info {
 	uint16_t	direction;	/* match direction */
 	uint16_t	kidx;		/* state name kidx */
 	uint32_t	hashval;	/* hash value */
 	uint32_t	version;	/* bucket version */
 	uint32_t	f_pos;
 };
 /* Dynamic state API (the surrounding section is labelled ip_fw_dynamic.c). */
 int ipfw_dyn_install_state(struct ip_fw_chain *chain, struct ip_fw *rule,
     const ipfw_insn_limit *cmd, const struct ip_fw_args *args,
     const void *ulp, int pktlen, struct ipfw_dyn_info *info,
     uint32_t tablearg);
 struct ip_fw *ipfw_dyn_lookup_state(const struct ip_fw_args *args,
     const void *ulp, int pktlen, const ipfw_insn *cmd,
     struct ipfw_dyn_info *info);
 
 int ipfw_is_dyn_rule(struct ip_fw *rule);
 void ipfw_expire_dyn_states(struct ip_fw_chain *, ipfw_range_tlv *);
 void ipfw_get_dynamic(struct ip_fw_chain *chain, char **bp, const char *ep);
 int ipfw_dump_states(struct ip_fw_chain *chain, struct sockopt_data *sd);
 
 void ipfw_dyn_init(struct ip_fw_chain *);	/* per-vnet initialization */
 void ipfw_dyn_uninit(int);	/* per-vnet deinitialization */
 int ipfw_dyn_len(void);
 uint32_t ipfw_dyn_get_count(uint32_t *, int *);
 void ipfw_dyn_reset_eaction(struct ip_fw_chain *ch, uint16_t eaction_id,
     uint16_t default_id, uint16_t instance_id);
 
 /*
  * common variables
  * Each is declared per-vnet with VNET_DECLARE() and accessed through the
  * matching V_* wrapper macro.
  */
 VNET_DECLARE(int, fw_one_pass);
 #define	V_fw_one_pass		VNET(fw_one_pass)
 
 VNET_DECLARE(int, fw_verbose);
 #define	V_fw_verbose		VNET(fw_verbose)
 
 VNET_DECLARE(struct ip_fw_chain, layer3_chain);
 #define	V_layer3_chain		VNET(layer3_chain)
 
 VNET_DECLARE(int, ipfw_vnet_ready);
 #define	V_ipfw_vnet_ready	VNET(ipfw_vnet_ready)
 
 VNET_DECLARE(u_int32_t, set_disable);
 #define	V_set_disable		VNET(set_disable)
 
 VNET_DECLARE(int, autoinc_step);
 #define V_autoinc_step		VNET(autoinc_step)
 
 VNET_DECLARE(unsigned int, fw_tables_max);
 #define V_fw_tables_max		VNET(fw_tables_max)
 
 VNET_DECLARE(unsigned int, fw_tables_sets);
 #define V_fw_tables_sets	VNET(fw_tables_sets)
 
 /* Opaque here; referenced by ip_fw_chain.tblcfg. */
 struct tables_config;
 
 #ifdef _KERNEL
 /*
  * Here we have the structure representing an ipfw rule.
  *
  * It starts with a general area
  * followed by an array of one or more instructions, which the code
  * accesses as an array of 32-bit values.
  *
  * Given a rule pointer  r:
  *
  *  r->cmd		is the start of the first instruction.
  *  ACTION_PTR(r)	is the start of the first action (things to do
  *			once a rule matched).
  */
 
 struct ip_fw {
 	uint16_t	act_ofs;	/* offset of action in 32-bit units */
 	uint16_t	cmd_len;	/* # of 32-bit words in cmd	*/
 	uint16_t	rulenum;	/* rule number			*/
 	uint8_t		set;		/* rule set (0..31)		*/
 	uint8_t		flags;		/* currently unused		*/
 	counter_u64_t	cntr;		/* Pointer to rule counters	*/
 	uint32_t	timestamp;	/* tv_sec of last match		*/
 	uint32_t	id;		/* rule id			*/
 	uint32_t	cached_id;	/* used by jump_fast		*/
 	uint32_t	cached_pos;	/* used by jump_fast		*/
 	uint32_t	refcnt;		/* number of references		*/
 
 	struct ip_fw	*next;		/* linked list of deleted rules */
 	/* Variable-length: cmd_len words; the size macros subtract this one. */
 	ipfw_insn	cmd[1];		/* storage for commands		*/
 };
 
 /* Two 64-bit counters per rule; see IPFW_INC_RULE_COUNTER(). */
 #define	IPFW_RULE_CNTR_SIZE	(2 * sizeof(uint64_t))
 
 #endif
 
 /*
  * Per-ruleset state.  Two locks protect it: rwmtx (IPFW_RLOCK/IPFW_WLOCK
  * macros) and uh_lock (IPFW_UH_* macros); on Linux and Windows builds
  * both are declared as spinlock_t.
  */
 struct ip_fw_chain {
 	struct ip_fw	**map;		/* array of rule ptrs to ease lookup */
 	uint32_t	id;		/* ruleset id */
 	int		n_rules;	/* number of static rules */
 	void		*tablestate;	/* runtime table info */
 	void		*valuestate;	/* runtime table value info */
 	int		*idxmap;	/* skipto array of rules */
 	void		**srvstate;	/* runtime service mappings */
 #if defined( __linux__ ) || defined( _WIN32 )
 	spinlock_t rwmtx;
 #else
 	struct rmlock	rwmtx;
 #endif
 	int		static_len;	/* total len of static rules (v0) */
 	uint32_t	gencnt;		/* NAT generation count */
 	LIST_HEAD(nat_list, cfg_nat) nat;       /* list of nat entries */
 	struct ip_fw	*default_rule;
 	struct tables_config *tblcfg;	/* tables module data */
 	void		*ifcfg;		/* interface module data */
 	int		*idxmap_back;	/* standby skipto array of rules */
 	struct namedobj_instance	*srvmap; /* cfg name->number mappings */
 #if defined( __linux__ ) || defined( _WIN32 )
 	spinlock_t uh_lock;
 #else
 	struct rwlock	uh_lock;	/* lock for upper half */
 #endif
 };
 
 /*
  * 64-byte structure representing multi-field table value.
  * The individual fields are read through TARG_VAL(); valuestate in
  * struct ip_fw_chain is treated as an array of these.
  */
 struct table_value {
 	uint32_t	tag;		/* O_TAG/O_TAGGED */
 	uint32_t	pipe;		/* O_PIPE/O_QUEUE */
 	uint16_t	divert;		/* O_DIVERT/O_TEE */
 	uint16_t	skipto;		/* skipto, CALLRET */
 	uint32_t	netgraph;	/* O_NETGRAPH/O_NGTEE */
 	uint32_t	fib;		/* O_SETFIB */
 	uint32_t	nat;		/* O_NAT */
 	uint32_t	nh4;
 	uint8_t		dscp;
 	uint8_t		spare0;
 	uint16_t	spare1;
 	/* -- 32 bytes -- */
 	struct in6_addr	nh6;
 	uint32_t	limit;		/* O_LIMIT */
 	uint32_t	zoneid;		/* scope zone id for nh6 */
 	uint64_t	refcnt;		/* Number of references */
 };
 
 
 /*
  * Common header of a named kernel object.  Each object can be linked on
  * two tail queues at once: one keyed by name, one keyed by value.
  */
 struct named_object {
 	TAILQ_ENTRY(named_object)	nn_next;	/* namehash */
 	TAILQ_ENTRY(named_object)	nv_next;	/* valuehash */
 	char			*name;	/* object name */
 	uint16_t		etlv;	/* Export TLV id */
 	uint8_t			subtype;/* object subtype within class */
 	uint8_t			set;	/* set object belongs to */
 	uint16_t		kidx;	/* object kernel index */
 	uint16_t		spare;
 	uint32_t		ocnt;	/* object counter for internal use */
 	uint32_t		refcnt;	/* number of references */
 };
 /* List head type for queues of named objects. */
 TAILQ_HEAD(namedobjects_head, named_object);
 
 struct sockopt;	/* used by tcp_var.h */
 /*
  * Buffer state shared between the sockopt dispatcher and the handlers.
  * Handlers obtain space with ipfw_get_sopt_header()/ipfw_get_sopt_space()
  * and compare their needs against valsize.
  */
 struct sockopt_data {
 	caddr_t		kbuf;		/* allocated buffer */
 	size_t		ksize;		/* given buffer size */
 	size_t		koff;		/* data already used */
 	size_t		kavail;		/* number of bytes available */
 	size_t		ktotal;		/* total bytes pushed */
 	struct sockopt	*sopt;		/* socket data */
 	caddr_t		sopt_val;	/* sopt user buffer */
 	size_t		valsize;	/* original data size */
 };
 
 struct ipfw_ifc;
 
 /*
  * Interface consumer callback; receives the chain, the consumer's
  * cbdata pointer and an interface index.
  */
 typedef void (ipfw_ifc_cb)(struct ip_fw_chain *ch, void *cbdata,
     uint16_t ifindex);
 
 /*
  * Tracked interface: a named object plus the interface name, its index
  * and the queue of consumers (struct ipfw_ifc) attached to it.
  */
 struct ipfw_iface {
 	struct named_object	no;
 	char ifname[64];
 	int resolved;
 	uint16_t ifindex;
 	uint16_t spare;
 	uint64_t gencnt;
 	TAILQ_HEAD(, ipfw_ifc)	consumers;
 };
 
 /*
  * One consumer of a tracked interface: links into ipfw_iface.consumers
  * and carries the callback with its private data.
  */
 struct ipfw_ifc {
 	TAILQ_ENTRY(ipfw_ifc)	next;
 	struct ipfw_iface	*iface;
 	ipfw_ifc_cb		*cb;
 	void			*cbdata;
 };
 
 /*
  * Macro for working with various counters.
  * Rule counters live behind (_cntr)->cntr as two consecutive counters:
  * the first is bumped by 1 per call, the second by _bytes.  Dynamic
  * counters are the plain pcnt/bcnt members.
  */
 #define	IPFW_INC_RULE_COUNTER(_cntr, _bytes)	do {	\
 	counter_u64_add((_cntr)->cntr, 1);		\
 	counter_u64_add((_cntr)->cntr + 1, _bytes);	\
 	if ((_cntr)->timestamp != time_uptime)		\
 		(_cntr)->timestamp = time_uptime;	\
 	} while (0)
 
 #define	IPFW_INC_DYN_COUNTER(_cntr, _bytes)	do {		\
 	(_cntr)->pcnt++;				\
 	(_cntr)->bcnt += _bytes;			\
 	} while (0)
 
 #define	IPFW_ZERO_RULE_COUNTER(_cntr) do {		\
 	counter_u64_zero((_cntr)->cntr);		\
 	counter_u64_zero((_cntr)->cntr + 1);		\
 	(_cntr)->timestamp = 0;				\
 	} while (0)
 
 #define	IPFW_ZERO_DYN_COUNTER(_cntr) do {		\
 	(_cntr)->pcnt = 0;				\
 	(_cntr)->bcnt = 0;				\
 	} while (0)
 
 /*
  * TARG_VAL() reads field @f of entry @k in the chain's table value array.
  * IP_FW_ARG_TABLEARG() returns @a unless it equals IP_FW_TARG, in which
  * case the field is fetched via TARG_VAL(); note that it uses a variable
  * named `tablearg' that must exist at the expansion site.
  */
 #define	TARG_VAL(ch, k, f)	((struct table_value *)((ch)->valuestate))[k].f
 #define	IP_FW_ARG_TABLEARG(ch, a, f)	\
 	(((a) == IP_FW_TARG) ? TARG_VAL(ch, tablearg, f) : (a))
 /*
  * The lock is heavily used by ip_fw2.c (the main file) and ip_fw_nat.c
  * so the variable and the macros must be here.
  */
 
 #if defined( __linux__ ) || defined( _WIN32 )
 #define	IPFW_LOCK_INIT(_chain) do {			\
 	rw_init(&(_chain)->rwmtx, "IPFW static rules");	\
 	rw_init(&(_chain)->uh_lock, "IPFW UH lock");	\
 	} while (0)
 
 #define	IPFW_LOCK_DESTROY(_chain) do {			\
 	rw_destroy(&(_chain)->rwmtx);			\
 	rw_destroy(&(_chain)->uh_lock);			\
 	} while (0)
 
 #define	IPFW_RLOCK_ASSERT(_chain)	rw_assert(&(_chain)->rwmtx, RA_RLOCKED)
 #define	IPFW_WLOCK_ASSERT(_chain)	rw_assert(&(_chain)->rwmtx, RA_WLOCKED)
 
 #define	IPFW_RLOCK_TRACKER
 #define	IPFW_RLOCK(p)			rw_rlock(&(p)->rwmtx)
 #define	IPFW_RUNLOCK(p)			rw_runlock(&(p)->rwmtx)
 #define	IPFW_WLOCK(p)			rw_wlock(&(p)->rwmtx)
 #define	IPFW_WUNLOCK(p)			rw_wunlock(&(p)->rwmtx)
 #define	IPFW_PF_RLOCK(p)		IPFW_RLOCK(p)
 #define	IPFW_PF_RUNLOCK(p)		IPFW_RUNLOCK(p)
 #else /* FreeBSD */
 #define	IPFW_LOCK_INIT(_chain) do {			\
 	rm_init_flags(&(_chain)->rwmtx, "IPFW static rules", RM_RECURSE); \
 	rw_init(&(_chain)->uh_lock, "IPFW UH lock");	\
 	} while (0)
 
 #define	IPFW_LOCK_DESTROY(_chain) do {			\
 	rm_destroy(&(_chain)->rwmtx);			\
 	rw_destroy(&(_chain)->uh_lock);			\
 	} while (0)
 
 #define	IPFW_RLOCK_ASSERT(_chain)	rm_assert(&(_chain)->rwmtx, RA_RLOCKED)
 #define	IPFW_WLOCK_ASSERT(_chain)	rm_assert(&(_chain)->rwmtx, RA_WLOCKED)
 
 #define	IPFW_RLOCK_TRACKER		struct rm_priotracker _tracker
 #define	IPFW_RLOCK(p)			rm_rlock(&(p)->rwmtx, &_tracker)
 #define	IPFW_RUNLOCK(p)			rm_runlock(&(p)->rwmtx, &_tracker)
 #define	IPFW_WLOCK(p)			rm_wlock(&(p)->rwmtx)
 #define	IPFW_WUNLOCK(p)			rm_wunlock(&(p)->rwmtx)
 #define	IPFW_PF_RLOCK(p)		IPFW_RLOCK(p)
 #define	IPFW_PF_RUNLOCK(p)		IPFW_RUNLOCK(p)
 #endif
 
 #define	IPFW_UH_RLOCK_ASSERT(_chain)	rw_assert(&(_chain)->uh_lock, RA_RLOCKED)
 #define	IPFW_UH_WLOCK_ASSERT(_chain)	rw_assert(&(_chain)->uh_lock, RA_WLOCKED)
 #define	IPFW_UH_UNLOCK_ASSERT(_chain)	rw_assert(&(_chain)->uh_lock, RA_UNLOCKED)
 
 #define IPFW_UH_RLOCK(p) rw_rlock(&(p)->uh_lock)
 #define IPFW_UH_RUNLOCK(p) rw_runlock(&(p)->uh_lock)
 #define IPFW_UH_WLOCK(p) rw_wlock(&(p)->uh_lock)
 #define IPFW_UH_WUNLOCK(p) rw_wunlock(&(p)->uh_lock)
 
 /*
  * Pairs the index userland supplied for an object with the kernel index;
  * struct rule_check_info keeps an array of these in obuf.
  */
 struct obj_idx {
 	uint16_t	uidx;	/* internal index supplied by userland */
 	uint16_t	kidx;	/* kernel object index */
 	uint16_t	off;	/* tlv offset from rule end in 4-byte words */
 	uint8_t		spare;
 	uint8_t		type;	/* object type within its category */
 };
 
 /*
  * Per-rule bookkeeping: ties the original userland rule (urule) to the
  * resulting kernel rule (krule) together with its object references.
  */
 struct rule_check_info {
 	uint16_t	flags;		/* rule-specific check flags */
 	uint16_t	object_opcodes;	/* num of opcodes referencing objects */
 	uint16_t	urule_numoff;	/* offset of rulenum in bytes */
 	uint8_t		version;	/* rule version */
 	uint8_t		spare;
 	ipfw_obj_ctlv	*ctlv;		/* name TLV container */
 	struct ip_fw	*krule;		/* resulting rule pointer */
 	caddr_t		urule;		/* original rule pointer */
 	struct obj_idx	obuf[8];	/* table references storage */
 };
 
 /* Legacy interface support */
 /*
  * FreeBSD 8 export rule format.
  * Unlike struct ip_fw, the counters are embedded (pcnt, bcnt, timestamp)
  * and the structure starts with two rule pointers.
  */
 struct ip_fw_rule0 {
 	struct ip_fw	*x_next;	/* linked list of rules		*/
 	struct ip_fw	*next_rule;	/* ptr to next [skipto] rule	*/
 	/* 'next_rule' is used to pass up 'set_disable' status		*/
 
 	uint16_t	act_ofs;	/* offset of action in 32-bit units */
 	uint16_t	cmd_len;	/* # of 32-bit words in cmd	*/
 	uint16_t	rulenum;	/* rule number			*/
 	uint8_t		set;		/* rule set (0..31)		*/
 	uint8_t		_pad;		/* padding			*/
 	uint32_t	id;		/* rule id */
 
 	/* These fields are present in all rules.			*/
 	uint64_t	pcnt;		/* Packet counter		*/
 	uint64_t	bcnt;		/* Byte counter			*/
 	uint32_t	timestamp;	/* tv_sec of last match		*/
 
 	ipfw_insn	cmd[1];		/* storage for commands		*/
 };
 
 /*
  * Counter triple of the FreeBSD 8 (v0) rule format.
  * Same member types and order as the pcnt/bcnt/timestamp fields of
  * struct ip_fw_rule0.
  */
 struct ip_fw_bcounter0 {
 	uint64_t	pcnt;		/* Packet counter		*/
 	uint64_t	bcnt;		/* Byte counter			*/
 	uint32_t	timestamp;	/* tv_sec of last match		*/
 };
 
 /* Kernel rule length */
 /*
  * RULE _K_ SIZE _V_ ->
  * get kernel size from userland rule version _V_.
  * RULE _U_ SIZE _V_ ->
  * get user size version _V_ from kernel rule
  * RULESIZE _V_ ->
  * get user size rule length
  *
  * All macros take a pointer to a rule with a cmd_len member (number of
  * 32-bit command words).  The "- 4" presumably compensates for the one
  * command word already counted by sizeof() -- confirm that
  * sizeof(ipfw_insn) is 4.  Kernel sizes and the v1 user size are
  * rounded up to a multiple of 8 bytes.
  */
 /* FreeBSD8 <> current kernel format */
 #define	RULEUSIZE0(r)	(sizeof(struct ip_fw_rule0) + (r)->cmd_len * 4 - 4)
 #define	RULEKSIZE0(r)	roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8)
 /* FreeBSD11 <> current kernel format */
 #define	RULEUSIZE1(r)	(roundup2(sizeof(struct ip_fw_rule) + \
     (r)->cmd_len * 4 - 4, 8))
 #define	RULEKSIZE1(r)	roundup2((sizeof(struct ip_fw) + (r)->cmd_len*4 - 4), 8)
 
 /*
  * Tables/Objects index rewriting code
  */
 
 /* Default and maximum number of ipfw tables/objects. */
 #define	IPFW_TABLES_MAX		65536
 #define	IPFW_TABLES_DEFAULT	128
 #define	IPFW_OBJECTS_MAX	65536
 #define	IPFW_OBJECTS_DEFAULT	1024
 
 /* Accessors: the chain's srvmap member and one slot of its srvstate[] */
 #define	CHAIN_TO_SRV(ch)	((ch)->srvmap)
 #define	SRV_OBJECT(ch, idx)	((ch)->srvstate[(idx)])
 
 /*
  * Object lookup key.  Passed to the finder and creator callbacks
  * (ipfw_obj_fname_cb, ipfw_obj_create_cb) and to
  * ipfw_objhash_find_type().
  */
 struct tid_info {
 	uint32_t	set;	/* table set */
 	uint16_t	uidx;	/* table index */
 	uint8_t		type;	/* table type */
 	uint8_t		atype;
 	uint8_t		spare;
 	int		tlen;	/* Total TLV size block */
 	void		*tlvs;	/* Pointer to first TLV */
 };
 
 /*
  * Classifier callback. Checks if @cmd opcode contains kernel object reference.
  * If true, returns its index and type.
  * Returns 0 if match is found, 1 otherwise.
  */
 typedef int (ipfw_obj_rw_cl)(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype);
 /*
  * Updater callback. Sets kernel object reference index to @puidx
  */
 typedef void (ipfw_obj_rw_upd)(ipfw_insn *cmd, uint16_t puidx);
 /*
  * Finder callback. Tries to find named object by name (specified via @ti).
  * Stores found named object pointer in @pno.
  * If object was not found, NULL is stored.
  *
  * Return 0 if input data was valid.
  */
 typedef int (ipfw_obj_fname_cb)(struct ip_fw_chain *ch,
     struct tid_info *ti, struct named_object **pno);
 /*
  * Another finder callback. Tries to find named object by kernel index.
  *
  * Returns pointer to named object or NULL.
  */
 typedef struct named_object *(ipfw_obj_fidx_cb)(struct ip_fw_chain *ch,
     uint16_t kidx);
 /*
  * Object creator callback. Tries to create object specified by @ti.
  * Stores newly-allocated object index in @pkidx.
  *
  * Returns 0 on success.
  */
 typedef int (ipfw_obj_create_cb)(struct ip_fw_chain *ch, struct tid_info *ti,
     uint16_t *pkidx);
 /*
  * Object destroy callback. Intended to free resources allocated by
  * create_object callback.
  */
 typedef void (ipfw_obj_destroy_cb)(struct ip_fw_chain *ch,
     struct named_object *no);
 /*
  * Sets handler callback. Handles moving and swapping sets of named objects.
  *  SWAP_ALL moves all named objects from set `set' to `new_set' and vice versa;
  *  TEST_ALL checks that there aren't any named objects with conflicting names;
  *  MOVE_ALL moves all named objects from set `set' to `new_set';
  *  COUNT_ONE used to count number of references used by object with kidx `set';
  *  TEST_ONE checks that named object with kidx `set' can be moved to `new_set';
  *  MOVE_ONE moves named object with kidx `set' to set `new_set'.
  *
  * Note that for the *_ONE commands the `set' argument carries a kernel
  * object index rather than a set number.
  */
 enum ipfw_sets_cmd {
 	SWAP_ALL = 0, TEST_ALL, MOVE_ALL, COUNT_ONE, TEST_ONE, MOVE_ONE
 };
 typedef int (ipfw_obj_sets_cb)(struct ip_fw_chain *ch,
     uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd);
 
 
 /*
  * Descriptor of one opcode that references a kernel object: the opcode
  * itself, its export TLV id and the set of callbacks used to classify,
  * update, look up, create, destroy and move the referenced object.
  * Arrays of these are registered with ipfw_add_obj_rewriter() and
  * removed with ipfw_del_obj_rewriter().
  */
 struct opcode_obj_rewrite {
 	uint32_t		opcode;		/* Opcode to act upon */
 	uint32_t		etlv;		/* Relevant export TLV id  */
 	ipfw_obj_rw_cl		*classifier;	/* Check if rewrite is needed */
 	ipfw_obj_rw_upd		*update;	/* update cmd with new value */
 	ipfw_obj_fname_cb	*find_byname;	/* Find named object by name */
 	ipfw_obj_fidx_cb	*find_bykidx;	/* Find named object by kidx */
 	ipfw_obj_create_cb	*create_object;	/* Create named object */
 	ipfw_obj_destroy_cb	*destroy_object;/* Destroy named object */
 	ipfw_obj_sets_cb	*manage_sets;	/* Swap or move sets */
 };
 
 /*
  * Register (or unregister) the rewriter array @c when the flag @f (@l)
  * is non-zero.  The element count is derived with sizeof, so @c must be
  * a real array, not a pointer.
  */
 #define	IPFW_ADD_OBJ_REWRITER(f, c)	do {	\
 	if ((f) != 0) 				\
 		ipfw_add_obj_rewriter(c,	\
 		    sizeof(c) / sizeof(c[0]));	\
 	} while(0)
 #define	IPFW_DEL_OBJ_REWRITER(l, c)	do {	\
 	if ((l) != 0) 				\
 		ipfw_del_obj_rewriter(c,	\
 		    sizeof(c) / sizeof(c[0]));	\
 	} while(0)
 
 /* In ip_fw_iface.c */
 int ipfw_iface_init(void);
 void ipfw_iface_destroy(void);
 void vnet_ipfw_iface_destroy(struct ip_fw_chain *ch);
 int ipfw_iface_ref(struct ip_fw_chain *ch, char *name,
     struct ipfw_ifc *ic);
 void ipfw_iface_unref(struct ip_fw_chain *ch, struct ipfw_ifc *ic);
 void ipfw_iface_add_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic);
 void ipfw_iface_del_notify(struct ip_fw_chain *ch, struct ipfw_ifc *ic);
 
 /* In ip_fw_sockopt.c */
 void ipfw_init_skipto_cache(struct ip_fw_chain *chain);
 void ipfw_destroy_skipto_cache(struct ip_fw_chain *chain);
 int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id);
 int ipfw_ctl3(struct sockopt *sopt);
 int ipfw_add_protected_rule(struct ip_fw_chain *chain, struct ip_fw *rule,
     int locked);
 void ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head,
     struct ip_fw *rule);
 void ipfw_reap_rules(struct ip_fw *head);
 void ipfw_init_counters(void);
 void ipfw_destroy_counters(void);
 struct ip_fw *ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize);
 void ipfw_free_rule(struct ip_fw *rule);
 int ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt);
 int ipfw_mark_object_kidx(uint32_t *bmask, uint16_t etlv, uint16_t kidx);
+ipfw_insn *ipfw_get_action(struct ip_fw *);
 
 /*
  * Socket option handler: receives the chain, the request header and the
  * sockopt data descriptor, and returns an int status.
  */
 typedef int (sopt_handler_f)(struct ip_fw_chain *ch,
     ip_fw3_opheader *op3, struct sockopt_data *sd);
 /*
  * One entry of a handler table (see ipfw_add_sopt_handler()).
  * @dir holds HDIR_* flags.
  */
 struct ipfw_sopt_handler {
 	uint16_t	opcode;
 	uint8_t		version;
 	uint8_t		dir;
 	sopt_handler_f	*handler;
 	uint64_t	refcnt;
 };
 /*
  * Direction flags stored in the dir member of struct ipfw_sopt_handler.
  * HDIR_BOTH is parenthesized so that it expands safely inside larger
  * expressions (e.g. next to `&' or `==').
  */
 #define	HDIR_SET	0x01	/* Handler is used to set some data */
 #define	HDIR_GET	0x02	/* Handler is used to retrieve data */
 #define	HDIR_BOTH	(HDIR_GET|HDIR_SET)
 
 void ipfw_init_sopt_handler(void);
 void ipfw_destroy_sopt_handler(void);
 void ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count);
 int ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count);
 caddr_t ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed);
 caddr_t ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed);
 /*
  * Register (or unregister) the handler array @c when the flag @f (@l)
  * is non-zero.  The element count is derived with sizeof, so @c must be
  * a real array, not a pointer.
  */
 #define	IPFW_ADD_SOPT_HANDLER(f, c)	do {	\
 	if ((f) != 0) 				\
 		ipfw_add_sopt_handler(c,	\
 		    sizeof(c) / sizeof(c[0]));	\
 	} while(0)
 #define	IPFW_DEL_SOPT_HANDLER(l, c)	do {	\
 	if ((l) != 0) 				\
 		ipfw_del_sopt_handler(c,	\
 		    sizeof(c) / sizeof(c[0]));	\
 	} while(0)
 
 struct namedobj_instance;
 typedef int (objhash_cb_t)(struct namedobj_instance *ni, struct named_object *,
     void *arg);
 typedef uint32_t (objhash_hash_f)(struct namedobj_instance *ni, const void *key,
     uint32_t kopt);
 typedef int (objhash_cmp_f)(struct named_object *no, const void *key,
     uint32_t kopt);
 struct namedobj_instance *ipfw_objhash_create(uint32_t items);
 void ipfw_objhash_destroy(struct namedobj_instance *);
 void ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks);
 void ipfw_objhash_bitmap_merge(struct namedobj_instance *ni,
     void **idx, int *blocks);
 void ipfw_objhash_bitmap_swap(struct namedobj_instance *ni,
     void **idx, int *blocks);
 void ipfw_objhash_bitmap_free(void *idx, int blocks);
 void ipfw_objhash_set_hashf(struct namedobj_instance *ni, objhash_hash_f *f);
 struct named_object *ipfw_objhash_lookup_name(struct namedobj_instance *ni,
     uint32_t set, char *name);
 struct named_object *ipfw_objhash_lookup_name_type(struct namedobj_instance *ni,
     uint32_t set, uint32_t type, const char *name);
 struct named_object *ipfw_objhash_lookup_kidx(struct namedobj_instance *ni,
     uint16_t idx);
 int ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a,
     struct named_object *b);
 void ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no);
 void ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no);
 uint32_t ipfw_objhash_count(struct namedobj_instance *ni);
 uint32_t ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type);
 int ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f,
     void *arg);
 int ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f,
     void *arg, uint16_t type);
 int ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx);
 int ipfw_objhash_alloc_idx(void *n, uint16_t *pidx);
 void ipfw_objhash_set_funcs(struct namedobj_instance *ni,
     objhash_hash_f *hash_f, objhash_cmp_f *cmp_f);
 int ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti,
     uint32_t etlv, struct named_object **pno);
 void ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv);
 ipfw_obj_ntlv *ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx,
     uint32_t etlv);
 void ipfw_init_obj_rewriter(void);
 void ipfw_destroy_obj_rewriter(void);
 void ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count);
 int ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count);
 
 int create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd,
     struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti);
 void update_opcode_kidx(ipfw_insn *cmd, uint16_t idx);
 int classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx);
 void ipfw_init_srv(struct ip_fw_chain *ch);
 void ipfw_destroy_srv(struct ip_fw_chain *ch);
 int ipfw_check_object_name_generic(const char *name);
 int ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type,
     uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd);
 
 /* In ip_fw_eaction.c */
 typedef int (ipfw_eaction_t)(struct ip_fw_chain *ch, struct ip_fw_args *args,
     ipfw_insn *cmd, int *done);
 int ipfw_eaction_init(struct ip_fw_chain *ch, int first);
 void ipfw_eaction_uninit(struct ip_fw_chain *ch, int last);
 
 uint16_t ipfw_add_eaction(struct ip_fw_chain *ch, ipfw_eaction_t handler,
     const char *name);
 int ipfw_del_eaction(struct ip_fw_chain *ch, uint16_t eaction_id);
 int ipfw_run_eaction(struct ip_fw_chain *ch, struct ip_fw_args *args,
     ipfw_insn *cmd, int *done);
 int ipfw_reset_eaction(struct ip_fw_chain *ch, struct ip_fw *rule,
     uint16_t eaction_id, uint16_t default_id, uint16_t instance_id);
 int ipfw_reset_eaction_instance(struct ip_fw_chain *ch, uint16_t eaction_id,
     uint16_t instance_id);
 
 /* In ip_fw_table.c */
 struct table_info;
 
 typedef int (table_lookup_t)(struct table_info *ti, void *key, uint32_t keylen,
     uint32_t *val);
 
 int ipfw_lookup_table(struct ip_fw_chain *ch, uint16_t tbl, uint16_t plen,
     void *paddr, uint32_t *val);
 struct named_object *ipfw_objhash_lookup_table_kidx(struct ip_fw_chain *ch,
     uint16_t kidx);
 int ipfw_ref_table(struct ip_fw_chain *ch, ipfw_obj_ntlv *ntlv, uint16_t *kidx);
 void ipfw_unref_table(struct ip_fw_chain *ch, uint16_t kidx);
 int ipfw_init_tables(struct ip_fw_chain *ch, int first);
 int ipfw_resize_tables(struct ip_fw_chain *ch, unsigned int ntables);
 int ipfw_switch_tables_namespace(struct ip_fw_chain *ch, unsigned int nsets);
 void ipfw_destroy_tables(struct ip_fw_chain *ch, int last);
 
 /* In ip_fw_nat.c -- XXX to be moved to ip_var.h */
 
 extern struct cfg_nat *(*lookup_nat_ptr)(struct nat_list *, int);
 
 typedef int ipfw_nat_t(struct ip_fw_args *, struct cfg_nat *, struct mbuf *);
 typedef int ipfw_nat_cfg_t(struct sockopt *);
 
 VNET_DECLARE(int, ipfw_nat_ready);
 #define	V_ipfw_nat_ready	VNET(ipfw_nat_ready)
 #define	IPFW_NAT_LOADED	(V_ipfw_nat_ready)
 
 extern ipfw_nat_t *ipfw_nat_ptr;
 extern ipfw_nat_cfg_t *ipfw_nat_cfg_ptr;
 extern ipfw_nat_cfg_t *ipfw_nat_del_ptr;
 extern ipfw_nat_cfg_t *ipfw_nat_get_cfg_ptr;
 extern ipfw_nat_cfg_t *ipfw_nat_get_log_ptr;
 
 /* Helper functions for IP checksum adjustment */
 /*
  * Add two 16-bit words with end-around carry: a carry out of bit 15 is
  * added back into the low 16 bits of the result.
  */
 static __inline uint16_t
 cksum_add(uint16_t sum, uint16_t a)
 {
 	uint32_t wide;
 
 	/* Sum in 32 bits so the carry-out is visible in bit 16. */
 	wide = (uint32_t)sum + (uint32_t)a;
 	return ((uint16_t)((wide & 0xffff) + (wide >> 16)));
 }
 
 /*
  * Recompute a checksum after one 16-bit word changed from @old to @new:
  * returns ~(~oldsum + ~old + new), with both additions done by
  * cksum_add().
  */
 static __inline uint16_t
 cksum_adjust(uint16_t oldsum, uint16_t old, uint16_t new)
 {
 	uint16_t acc;
 
 	acc = cksum_add((uint16_t)~oldsum, (uint16_t)~old);
 	acc = cksum_add(acc, new);
 	return ((uint16_t)~acc);
 }
 
 #endif /* _KERNEL */
 #endif /* _IPFW2_PRIVATE_H */
Index: projects/fuse2/sys/netpfil/ipfw/ip_fw_sockopt.c
===================================================================
--- projects/fuse2/sys/netpfil/ipfw/ip_fw_sockopt.c	(revision 350434)
+++ projects/fuse2/sys/netpfil/ipfw/ip_fw_sockopt.c	(revision 350435)
@@ -1,4686 +1,4715 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2002-2009 Luigi Rizzo, Universita` di Pisa
  * Copyright (c) 2014 Yandex LLC
  * Copyright (c) 2014 Alexander V. Chernikov
  *
  * Supported by: Valeria Paoli
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * Control socket and rule management routines for ipfw.
  * Control is currently implemented via IP_FW3 setsockopt() code.
  */
 
 #include "opt_ipfw.h"
 #include "opt_inet.h"
 #ifndef INET
 #error IPFIREWALL requires INET.
 #endif /* INET */
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>	/* struct m_tag used by nested headers */
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/fnv_hash.h>
 #include <net/if.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <netinet/in.h>
 #include <netinet/ip_var.h> /* hooks */
 #include <netinet/ip_fw.h>
 
 #include <netpfil/ipfw/ip_fw_private.h>
 #include <netpfil/ipfw/ip_fw_table.h>
 
 #ifdef MAC
 #include <security/mac/mac_framework.h>
 #endif
 
 static int ipfw_ctl(struct sockopt *sopt);
 static int check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len,
     struct rule_check_info *ci);
 static int check_ipfw_rule1(struct ip_fw_rule *rule, int size,
     struct rule_check_info *ci);
 static int check_ipfw_rule0(struct ip_fw_rule0 *rule, int size,
     struct rule_check_info *ci);
 static int rewrite_rule_uidx(struct ip_fw_chain *chain,
     struct rule_check_info *ci);
 
 #define	NAMEDOBJ_HASH_SIZE	32
 
 /*
  * Container for named objects.
  * @names and @values are two bucket arrays whose sizes are kept in
  * nn_size and nv_size (presumably keyed by object name and by kernel
  * index respectively -- confirm against ipfw_objhash_create()).
  * @idx_mask is a bitmask of u_long blocks tracking used items.
  * @hash_f and @cmp_f are the hash and compare callbacks, settable via
  * ipfw_objhash_set_funcs().
  */
 struct namedobj_instance {
 	struct namedobjects_head	*names;
 	struct namedobjects_head	*values;
 	uint32_t nn_size;		/* names hash size */
 	uint32_t nv_size;		/* number hash size */
 	u_long *idx_mask;		/* used items bitmask */
 	uint32_t max_blocks;		/* number of "long" blocks in bitmask */
 	uint32_t count;			/* number of items */
 	uint16_t free_off[IPFW_MAX_SETS];	/* first possible free offset */
 	objhash_hash_f	*hash_f;
 	objhash_cmp_f	*cmp_f;
 };
 /* Bits in one u_long, i.e. items covered by one bitmask block. */
 #define	BLOCK_ITEMS	(8 * sizeof(u_long))	/* Number of items for ffsl() */
 
 static uint32_t objhash_hash_name(struct namedobj_instance *ni,
     const void *key, uint32_t kopt);
 static uint32_t objhash_hash_idx(struct namedobj_instance *ni, uint32_t val);
 static int objhash_cmp_name(struct named_object *no, const void *name,
     uint32_t set);
 
 MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
 
 static int dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 static int dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd);
 
 /* ctl3 handler data */
 /* Default mutex (MTX_DEF) wrapped by the CTL3_LOCK*() macros below. */
 struct mtx ctl3_lock;
 #define	CTL3_LOCK_INIT()	mtx_init(&ctl3_lock, "ctl3_lock", NULL, MTX_DEF)
 #define	CTL3_LOCK_DESTROY()	mtx_destroy(&ctl3_lock)
 #define	CTL3_LOCK()		mtx_lock(&ctl3_lock)
 #define	CTL3_UNLOCK()		mtx_unlock(&ctl3_lock)
 
 /* Handler array and its size; presumably guarded by ctl3_lock -- confirm. */
 static struct ipfw_sopt_handler *ctl3_handlers;
 static size_t ctl3_hsize;
 static uint64_t ctl3_refct, ctl3_gencnt;
 /*
  * Buffer sizes for ctl3 requests.  CTL3_LARGEBUF is parenthesized so
  * that the product survives being used next to `/', `%' or a cast.
  */
 #define	CTL3_SMALLBUF	4096			/* small page-size write buffer */
 #define	CTL3_LARGEBUF	(16 * 1024 * 1024)	/* handle large rulesets */
 
 static int ipfw_flush_sopt_data(struct sockopt_data *sd);
 
 /*
  * Handlers implemented in this file.
  * Columns follow struct ipfw_sopt_handler: opcode, version, dir
  * (HDIR_* flags), handler.  The trailing refcnt member is left
  * zero-initialized.
  */
 static struct ipfw_sopt_handler	scodes[] = {
 	{ IP_FW_XGET,		0,	HDIR_GET,	dump_config },
 	{ IP_FW_XADD,		0,	HDIR_BOTH,	add_rules },
 	{ IP_FW_XDEL,		0,	HDIR_BOTH,	del_rules },
 	{ IP_FW_XZERO,		0,	HDIR_SET,	clear_rules },
 	{ IP_FW_XRESETLOG,	0,	HDIR_SET,	clear_rules },
 	{ IP_FW_XMOVE,		0,	HDIR_SET,	move_rules },
 	{ IP_FW_SET_SWAP,	0,	HDIR_SET,	manage_sets },
 	{ IP_FW_SET_MOVE,	0,	HDIR_SET,	manage_sets },
 	{ IP_FW_SET_ENABLE,	0,	HDIR_SET,	manage_sets },
 	{ IP_FW_DUMP_SOPTCODES,	0,	HDIR_GET,	dump_soptcodes },
 	{ IP_FW_DUMP_SRVOBJECTS,0,	HDIR_GET,	dump_srvobjects },
 };
 
 static int
 set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule);
 static struct opcode_obj_rewrite *find_op_rw(ipfw_insn *cmd,
     uint16_t *puidx, uint8_t *ptype);
 static int ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule,
     struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti);
 static int ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd,
     struct tid_info *ti, struct obj_idx *pidx, int *unresolved);
 static void unref_rule_objects(struct ip_fw_chain *chain, struct ip_fw *rule);
 static void unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd,
     struct obj_idx *oib, struct obj_idx *end);
 static int export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx,
     struct sockopt_data *sd);
 
 /*
  * Opcode object rewriter variables
  */
 struct opcode_obj_rewrite *ctl3_rewriters;
 static size_t ctl3_rsize;
 
 /*
  * static variables followed by global ones
  */
 
 VNET_DEFINE_STATIC(uma_zone_t, ipfw_cntr_zone);
 #define	V_ipfw_cntr_zone		VNET(ipfw_cntr_zone)
 
 /*
  * Create the UMA zone used for rule counters and store its handle in
  * V_ipfw_cntr_zone.  The zone is named "IPFW counters", uses
  * IPFW_RULE_CNTR_SIZE as item size and is created with UMA_ZONE_PCPU.
  */
 void
 ipfw_init_counters(void)
 {
 
 	V_ipfw_cntr_zone = uma_zcreate("IPFW counters",
 	    IPFW_RULE_CNTR_SIZE, NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, UMA_ZONE_PCPU);
 }
 
 /*
  * Destroy the counter zone created by ipfw_init_counters().
  */
 void
 ipfw_destroy_counters(void)
 {
 
 	uma_zdestroy(V_ipfw_cntr_zone);
 }
 
 /*
  * Allocate a zeroed kernel rule of @rulesize bytes together with its
  * per-CPU counter block.  Both allocations use M_WAITOK.  The new rule
  * starts with a reference count of 1.  @chain is not used here.
  */
 struct ip_fw *
 ipfw_alloc_rule(struct ip_fw_chain *chain, size_t rulesize)
 {
 	struct ip_fw *krule;
 
 	krule = malloc(rulesize, M_IPFW, M_WAITOK | M_ZERO);
 	krule->refcnt = 1;
 	krule->cntr = uma_zalloc_pcpu(V_ipfw_cntr_zone, M_WAITOK | M_ZERO);
 
 	return (krule);
 }
 
 /*
  * Release @rule and its counter block, but only when nobody else holds
  * a reference (refcnt is not greater than 1).
  */
 void
 ipfw_free_rule(struct ip_fw *rule)
 {
 
 	/*
 	 * The reference count is not dropped here because this function
 	 * may run without any locks held.  The caller is expected to
 	 * release its reference under IPFW_UH_WLOCK and to call us once
 	 * the count is down to 1.
 	 */
 	if (rule->refcnt <= 1) {
 		uma_zfree_pcpu(V_ipfw_cntr_zone, rule->cntr);
 		free(rule, M_IPFW);
 	}
 }
 
 
 /*
  * Binary search in chain->map for the first rule that is not smaller
  * than the pair (@key, @id), ordering rules by rulenum and then by id.
  * Returns the index found; when the loop body never runs the result is
  * chain->n_rules - 1.
  */
 int
 ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, uint32_t id)
 {
 	struct ip_fw *cur;
 	int low, high, mid;
 
 	low = 0;
 	high = chain->n_rules - 1;
 	while (low < high) {
 		mid = (low + high) / 2;
 		cur = chain->map[mid];
 		if (cur->rulenum < key ||
 		    (cur->rulenum == key && cur->id < id))
 			low = mid + 1;	/* answer lies strictly above mid */
 		else
 			high = mid;	/* mid is still a candidate */
 	}
 	return high;
 }
 
 /*
  * Builds skipto cache on rule set @map.
  *
  * Fills chain->idxmap_back so that entry i holds the index in @map of
  * the first rule whose number is >= i.  Does nothing when the backup
  * array has not been allocated.  The caller must hold IPFW_UH_WLOCK
  * (asserted below).
  */
 static void
 update_skipto_cache(struct ip_fw_chain *chain, struct ip_fw **map)
 {
 	int *smap, rulenum;
 	int i, mi;
 
 	IPFW_UH_WLOCK_ASSERT(chain);
 
 	mi = 0;
 	/* Note: map[0] is read even when the cache is not allocated. */
 	rulenum = map[mi]->rulenum;
 	smap = chain->idxmap_back;
 
 	if (smap == NULL)
 		return;
 
 	for (i = 0; i < 65536; i++) {
 		smap[i] = mi;
 		/* Use the same rule index until i < rulenum */
 		/* i == 65535 is never advanced past, so map[] is not overrun */
 		if (i != rulenum || i == 65535)
 			continue;
 		/* Find next rule with num > i */
 		rulenum = map[++mi]->rulenum;
 		while (rulenum == i)
 			rulenum = map[++mi]->rulenum;
 	}
 }
 
 /*
  * Exchange the active skipto index (chain->idxmap) with the prepared
  * backup (chain->idxmap_back).  Asserts that both IPFW_UH_WLOCK and
  * IPFW_WLOCK are held.
  */
 static void
 swap_skipto_cache(struct ip_fw_chain *chain)
 {
 	int *prepared;
 
 	IPFW_UH_WLOCK_ASSERT(chain);
 	IPFW_WLOCK_ASSERT(chain);
 
 	prepared = chain->idxmap_back;
 	chain->idxmap_back = chain->idxmap;
 	chain->idxmap = prepared;
 }
 
 /*
  * Allocate and initialize skipto cache.
  *
  * Both arrays (65536 ints each) are allocated with M_WAITOK before any
  * lock is taken.  If the chain already has a cache, the new arrays are
  * freed and nothing else changes.
  */
 void
 ipfw_init_skipto_cache(struct ip_fw_chain *chain)
 {
 	int *idxmap, *idxmap_back;
 
 	idxmap = malloc(65536 * sizeof(int), M_IPFW, M_WAITOK | M_ZERO);
 	idxmap_back = malloc(65536 * sizeof(int), M_IPFW, M_WAITOK);
 
 	/*
 	 * Note we may be called at any time after initialization,
 	 * for example, on first skipto rule, so we need to
 	 * provide valid chain->idxmap on return
 	 */
 
 	IPFW_UH_WLOCK(chain);
 	if (chain->idxmap != NULL) {
 		/* Cache already present: drop our allocations. */
 		IPFW_UH_WUNLOCK(chain);
 		free(idxmap, M_IPFW);
 		free(idxmap_back, M_IPFW);
 		return;
 	}
 
 	/* Set backup pointer first to permit building cache */
 	chain->idxmap_back = idxmap_back;
 	update_skipto_cache(chain, chain->map);
 	IPFW_WLOCK(chain);
 	/* It is now safe to set chain->idxmap ptr */
 	chain->idxmap = idxmap;
 	/* After the swap the freshly built array is the active one. */
 	swap_skipto_cache(chain);
 	IPFW_WUNLOCK(chain);
 	IPFW_UH_WUNLOCK(chain);
 }
 
 /*
  * Destroys skipto cache.
  *
  * Frees the active index (chain->idxmap) and the backup index
  * (chain->idxmap_back).  Each pointer is tested on its own, so the
  * backup array is released even if the active one is not set.
  * The pointers in @chain are left untouched.
  */
 void
 ipfw_destroy_skipto_cache(struct ip_fw_chain *chain)
 {
 
 	if (chain->idxmap != NULL)
 		free(chain->idxmap, M_IPFW);
 	if (chain->idxmap_back != NULL)
 		free(chain->idxmap_back, M_IPFW);
 }
 
 
 /*
  * allocate a new map, returns the chain locked. extra is the number
  * of entries to add or delete.
  *
  * When @locked is zero the allocation uses M_WAITOK and, on success,
  * IPFW_UH_WLOCK is taken here and is still held on return.
  * When @locked is non-zero the allocation uses M_NOWAIT and this
  * function neither takes nor releases the lock.
  * Returns the zeroed map (n_rules + extra pointers), or NULL if the
  * allocation failed; no lock is taken here on the failure path.
  */
 static struct ip_fw **
 get_map(struct ip_fw_chain *chain, int extra, int locked)
 {
 
 	for (;;) {
 		struct ip_fw **map;
 		u_int i, mflags;
 
 		mflags = M_ZERO | ((locked != 0) ? M_NOWAIT : M_WAITOK);
 
 		/* Size is sampled before the lock is taken (when !locked). */
 		i = chain->n_rules + extra;
 		map = malloc(i * sizeof(struct ip_fw *), M_IPFW, mflags);
 		if (map == NULL) {
 			printf("%s: cannot allocate map\n", __FUNCTION__);
 			return NULL;
 		}
 		if (!locked)
 			IPFW_UH_WLOCK(chain);
 		/* Re-check the needed size now that the lock is held. */
 		if (i >= chain->n_rules + extra) /* good */
 			return map;
 		/* otherwise we lost the race, free and retry */
 		if (!locked)
 			IPFW_UH_WUNLOCK(chain);
 		free(map, M_IPFW);
 	}
 }
 
 /*
  * Install @new_map (holding @new_len rules) as the chain's rule map and
  * hand the previous map back to the caller.  The chain id is bumped and
  * the skipto caches are swapped in the same IPFW_WLOCK section.
  * Expected to be called with IPFW_UH_WLOCK held.
  */
 static struct ip_fw **
 swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
 {
 	struct ip_fw **prev;
 
 	IPFW_WLOCK(chain);
 	prev = chain->map;
 	chain->map = new_map;
 	chain->n_rules = new_len;
 	chain->id++;
 	swap_skipto_cache(chain);
 	IPFW_WUNLOCK(chain);
 
 	return prev;
 }
 
 
 static void
 export_cntr1_base(struct ip_fw *krule, struct ip_fw_bcounter *cntr)
 {
 	struct timeval boottime;
 
 	cntr->size = sizeof(*cntr);
 
 	if (krule->cntr != NULL) {
 		cntr->pcnt = counter_u64_fetch(krule->cntr);
 		cntr->bcnt = counter_u64_fetch(krule->cntr + 1);
 		cntr->timestamp = krule->timestamp;
 	}
 	if (cntr->timestamp > 0) {
 		getboottime(&boottime);
 		cntr->timestamp += boottime.tv_sec;
 	}
 }
 
 static void
 export_cntr0_base(struct ip_fw *krule, struct ip_fw_bcounter0 *cntr)
 {
 	struct timeval boottime;
 
 	if (krule->cntr != NULL) {
 		cntr->pcnt = counter_u64_fetch(krule->cntr);
 		cntr->bcnt = counter_u64_fetch(krule->cntr + 1);
 		cntr->timestamp = krule->timestamp;
 	}
 	if (cntr->timestamp > 0) {
 		getboottime(&boottime);
 		cntr->timestamp += boottime.tv_sec;
 	}
 }
 
 /*
  * Import a rule given in the current (v1) userland format.
  * Source is ci->urule, destination is ci->krule, which is assumed to be
  * zeroed.  Header fields and cmd_len 32-bit command words are copied;
  * the offset of rulenum inside the userland layout is saved in @ci.
  */
 static void
 import_rule1(struct rule_check_info *ci)
 {
 	struct ip_fw_rule *src;
 	struct ip_fw *dst;
 
 	src = (struct ip_fw_rule *)ci->urule;
 	dst = (struct ip_fw *)ci->krule;
 
 	/* Remember where rulenum lives in the userland image */
 	ci->urule_numoff = offsetof(struct ip_fw_rule, rulenum);
 
 	/* Header */
 	dst->flags = src->flags;
 	dst->set = src->set;
 	dst->rulenum = src->rulenum;
 	dst->cmd_len = src->cmd_len;
 	dst->act_ofs = src->act_ofs;
 
 	/* Opcodes */
 	memcpy(dst->cmd, src->cmd, dst->cmd_len * sizeof(uint32_t));
 }
 
 /*
  * Write @krule to @data in the current (v1) export format:
  *
  *   ipfw_obj_tlv (type IPFW_TLV_RULE_ENT, length @len)
  *   [ ip_fw_bcounter ]   -- only when @rcntrs is non-zero
  *   ip_fw_rule
  *
  * @data is assumed to be zeroed.
  */
 static void
 export_rule1(struct ip_fw *krule, caddr_t data, int len, int rcntrs)
 {
 	struct ip_fw_bcounter *counters;
 	struct ip_fw_rule *dst;
 	ipfw_obj_tlv *hdr;
 
 	/* TLV header comes first */
 	hdr = (ipfw_obj_tlv *)data;
 	hdr->type = IPFW_TLV_RULE_ENT;
 	hdr->length = len;
 
 	/* The rule follows the header, or the counters when requested */
 	dst = (struct ip_fw_rule *)(hdr + 1);
 	if (rcntrs != 0) {
 		counters = (struct ip_fw_bcounter *)(hdr + 1);
 		export_cntr1_base(krule, counters);
 		dst = (struct ip_fw_rule *)(counters + 1);
 	}
 
 	/* Header */
 	dst->id = krule->id;
 	dst->flags = krule->flags;
 	dst->set = krule->set;
 	dst->rulenum = krule->rulenum;
 	dst->cmd_len = krule->cmd_len;
 	dst->act_ofs = krule->act_ofs;
 
 	/* Opcodes */
 	memcpy(dst->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t));
 }
 
 
 /*
  * Copies rule @urule from FreeBSD8 userland format (v0)
  * to kernel @krule.
  * Assume @krule is zeroed.
  *
  * Source is ci->urule, destination is ci->krule.  Bit 0 of the v0 _pad
  * byte is translated into IPFW_RULE_NOOPT.  After the opcodes are
  * copied they are rewritten in place in the kernel rule (see the list
  * inside the function).
  */
 static void
 import_rule0(struct rule_check_info *ci)
 {
 	struct ip_fw_rule0 *urule;
 	struct ip_fw *krule;
 	int cmdlen, l;
 	ipfw_insn *cmd;
 	ipfw_insn_limit *lcmd;
 	ipfw_insn_if *cmdif;
 
 	urule = (struct ip_fw_rule0 *)ci->urule;
 	krule = (struct ip_fw *)ci->krule;
 
 	/* copy header */
 	krule->act_ofs = urule->act_ofs;
 	krule->cmd_len = urule->cmd_len;
 	krule->rulenum = urule->rulenum;
 	krule->set = urule->set;
 	if ((urule->_pad & 1) != 0)
 		krule->flags |= IPFW_RULE_NOOPT;
 
 	/* Save rulenum offset */
 	ci->urule_numoff = offsetof(struct ip_fw_rule0, rulenum);
 
 	/* Copy opcodes */
 	memcpy(krule->cmd, urule->cmd, krule->cmd_len * sizeof(uint32_t));
 
 	/*
 	 * Alter opcodes:
 	 * 1) convert tablearg value from 65535 to 0
 	 * 2) Add high bit to O_SETFIB/O_SETDSCP values (to make room
 	 *    for targ).
 	 * 3) convert table number in iface opcodes to u16
 	 * 4) convert old `nat global` into new 65535
 	 */
 	l = krule->cmd_len;
 	cmd = krule->cmd;
 	cmdlen = 0;
 
 	/* Walk the opcodes; F_LEN() gives each one's length in words. */
 	for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 
 		switch (cmd->opcode) {
 		/* Opcodes supporting tablearg */
 		case O_TAG:
 		case O_TAGGED:
 		case O_PIPE:
 		case O_QUEUE:
 		case O_DIVERT:
 		case O_TEE:
 		case O_SKIPTO:
 		case O_CALLRETURN:
 		case O_NETGRAPH:
 		case O_NGTEE:
 		case O_NAT:
 			/*
 			 * NOTE(review): the arg1 == 0 conversion below runs
 			 * for every opcode of this group, not only O_NAT --
 			 * confirm that this is intended.
 			 */
 			if (cmd->arg1 == IP_FW_TABLEARG)
 				cmd->arg1 = IP_FW_TARG;
 			else if (cmd->arg1 == 0)
 				cmd->arg1 = IP_FW_NAT44_GLOBAL;
 			break;
 		case O_SETFIB:
 		case O_SETDSCP:
 			if (cmd->arg1 == IP_FW_TABLEARG)
 				cmd->arg1 = IP_FW_TARG;
 			else
 				cmd->arg1 |= 0x8000;
 			break;
 		case O_LIMIT:
 			lcmd = (ipfw_insn_limit *)cmd;
 			if (lcmd->conn_limit == IP_FW_TABLEARG)
 				lcmd->conn_limit = IP_FW_TARG;
 			break;
 		/* Interface tables */
 		case O_XMIT:
 		case O_RECV:
 		case O_VIA:
 			/* Interface table, possibly */
 			cmdif = (ipfw_insn_if *)cmd;
 			/* Only names starting with '\1' are converted. */
 			if (cmdif->name[0] != '\1')
 				break;
 
 			cmdif->p.kidx = (uint16_t)cmdif->p.glob;
 			break;
 		}
 	}
 }
 
 /*
  * Copies rule @krule from kernel to FreeBSD8 userland format (v0)
  *
  * The first @len bytes of @urule are zeroed before anything is written.
  * IPFW_RULE_NOOPT is exported as bit 0 of the v0 _pad byte.  Counters
  * are written through the pcnt/bcnt/timestamp fields.  After the
  * opcodes are copied they are rewritten in place in the userland copy
  * (see the list inside the function).
  */
 static void
 export_rule0(struct ip_fw *krule, struct ip_fw_rule0 *urule, int len)
 {
 	int cmdlen, l;
 	ipfw_insn *cmd;
 	ipfw_insn_limit *lcmd;
 	ipfw_insn_if *cmdif;
 
 	/* copy header */
 	memset(urule, 0, len);
 	urule->act_ofs = krule->act_ofs;
 	urule->cmd_len = krule->cmd_len;
 	urule->rulenum = krule->rulenum;
 	urule->set = krule->set;
 	if ((krule->flags & IPFW_RULE_NOOPT) != 0)
 		urule->_pad |= 1;
 
 	/* Copy opcodes */
 	memcpy(urule->cmd, krule->cmd, krule->cmd_len * sizeof(uint32_t));
 
 	/* Export counters */
 	/* The three counter fields are addressed as an ip_fw_bcounter0. */
 	export_cntr0_base(krule, (struct ip_fw_bcounter0 *)&urule->pcnt);
 
 	/*
 	 * Alter opcodes:
 	 * 1) convert tablearg value from 0 to 65535
 	 * 2) Remove highest bit from O_SETFIB/O_SETDSCP values.
 	 * 3) convert table number in iface opcodes to int
 	 */
 	l = urule->cmd_len;
 	cmd = urule->cmd;
 	cmdlen = 0;
 
 	/* Walk the opcodes; F_LEN() gives each one's length in words. */
 	for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 
 		switch (cmd->opcode) {
 		/* Opcodes supporting tablearg */
 		case O_TAG:
 		case O_TAGGED:
 		case O_PIPE:
 		case O_QUEUE:
 		case O_DIVERT:
 		case O_TEE:
 		case O_SKIPTO:
 		case O_CALLRETURN:
 		case O_NETGRAPH:
 		case O_NGTEE:
 		case O_NAT:
 			/*
 			 * NOTE(review): the IP_FW_NAT44_GLOBAL conversion
 			 * below runs for every opcode of this group, not
 			 * only O_NAT -- confirm that this is intended.
 			 */
 			if (cmd->arg1 == IP_FW_TARG)
 				cmd->arg1 = IP_FW_TABLEARG;
 			else if (cmd->arg1 == IP_FW_NAT44_GLOBAL)
 				cmd->arg1 = 0;
 			break;
 		case O_SETFIB:
 		case O_SETDSCP:
 			if (cmd->arg1 == IP_FW_TARG)
 				cmd->arg1 = IP_FW_TABLEARG;
 			else
 				cmd->arg1 &= ~0x8000;
 			break;
 		case O_LIMIT:
 			lcmd = (ipfw_insn_limit *)cmd;
 			if (lcmd->conn_limit == IP_FW_TARG)
 				lcmd->conn_limit = IP_FW_TABLEARG;
 			break;
 		/* Interface tables */
 		case O_XMIT:
 		case O_RECV:
 		case O_VIA:
 			/* Interface table, possibly */
 			cmdif = (ipfw_insn_if *)cmd;
 			/* Only names starting with '\1' are converted. */
 			if (cmdif->name[0] != '\1')
 				break;
 
 			cmdif->p.glob = cmdif->p.kidx;
 			break;
 		}
 	}
 }
 
 /*
  * Add new rule(s) to the list possibly creating rule number for each.
  * Update the rule_number in the input struct so the caller knows it as well.
  * Must be called without IPFW_UH held
  *
  * @rci points to an array of @count rule_check_info entries.  Entries with
  * non-zero object_opcodes are first passed to rewrite_rule_uidx(); if that
  * fails, or if the new map can't be obtained, objects referenced by the
  * already processed entries are released with unref_rule_objects().
  * Only the first entry of @rci is actually inserted (see FIXME below).
  *
  * Returns 0 on success, the error reported by rewrite_rule_uidx(),
  * or ENOSPC when get_map() fails.
  */
 static int
 commit_rules(struct ip_fw_chain *chain, struct rule_check_info *rci, int count)
 {
 	int error, i, insert_before, tcount;
 	uint16_t rulenum, *pnum;
 	struct rule_check_info *ci;
 	struct ip_fw *krule;
 	struct ip_fw **map;	/* the new array of pointers */
 
 	/* Check if we need to do table/obj index remap */
 	tcount = 0;	/* number of entries successfully rewritten so far */
 	for (ci = rci, i = 0; i < count; ci++, i++) {
 		if (ci->object_opcodes == 0)
 			continue;
 
 		/*
 		 * Rule has some object opcodes.
 		 * We need to find (and create non-existing)
 		 * kernel objects, and reference existing ones.
 		 */
 		error = rewrite_rule_uidx(chain, ci);
 		if (error != 0) {
 
 			/*
 			 * rewrite failed, state for current rule
 			 * has been reverted. Check if we need to
 			 * revert more.
 			 */
 			if (tcount > 0) {
 
 				/*
 				 * We have some more table rules
 				 * we need to rollback.
 				 */
 
 				IPFW_UH_WLOCK(chain);
 				/* Walk back over the entries preceding the failed one */
 				while (ci != rci) {
 					ci--;
 					if (ci->object_opcodes == 0)
 						continue;
 					unref_rule_objects(chain,ci->krule);
 
 				}
 				IPFW_UH_WUNLOCK(chain);
 
 			}
 
 			return (error);
 		}
 
 		tcount++;
 	}
 
 	/* get_map returns with IPFW_UH_WLOCK if successful */
 	map = get_map(chain, count, 0 /* not locked */);
 	if (map == NULL) {
 		if (tcount > 0) {
 			/* Unbind tables */
 			IPFW_UH_WLOCK(chain);
 			for (ci = rci, i = 0; i < count; ci++, i++) {
 				if (ci->object_opcodes == 0)
 					continue;
 
 				unref_rule_objects(chain, ci->krule);
 			}
 			IPFW_UH_WUNLOCK(chain);
 		}
 
 		return (ENOSPC);
 	}
 
 	/* Clamp the auto-increment step to [1, 1000] */
 	if (V_autoinc_step < 1)
 		V_autoinc_step = 1;
 	else if (V_autoinc_step > 1000)
 		V_autoinc_step = 1000;
 
 	/* FIXME: Handle count > 1 */
 	ci = rci;
 	krule = ci->krule;
 	rulenum = krule->rulenum;
 
 	/* find the insertion point, we will insert before */
 	/* rulenum == 0 means "not assigned": insert before the default rule */
 	insert_before = rulenum ? rulenum + 1 : IPFW_DEFAULT_RULE;
 	i = ipfw_find_rule(chain, insert_before, 0);
 	/* duplicate first part */
 	if (i > 0)
 		bcopy(chain->map, map, i * sizeof(struct ip_fw *));
 	map[i] = krule;
 	/* duplicate remaining part, we always have the default rule */
 	bcopy(chain->map + i, map + i + 1,
 		sizeof(struct ip_fw *) *(chain->n_rules - i));
 	if (rulenum == 0) {
 		/* Compute rule number and write it back */
 		rulenum = i > 0 ? map[i-1]->rulenum : 0;
 		/* Add the step only if the result stays below IPFW_DEFAULT_RULE */
 		if (rulenum < IPFW_DEFAULT_RULE - V_autoinc_step)
 			rulenum += V_autoinc_step;
 		krule->rulenum = rulenum;
 		/* Save number to userland rule */
 		pnum = (uint16_t *)((caddr_t)ci->urule + ci->urule_numoff);
 		*pnum = rulenum;
 	}
 
 	krule->id = chain->id + 1;
 	update_skipto_cache(chain, map);
 	/* swap_map() hands back a map pointer that is freed after unlocking */
 	map = swap_map(chain, map, chain->n_rules + 1);
 	chain->static_len += RULEUSIZE0(krule);
 	IPFW_UH_WUNLOCK(chain);
 	if (map)
 		free(map, M_IPFW);
 	return (0);
 }
 
 /*
  * Appends @rule to the end of the rules map.
  * The rule gets number IPFW_DEFAULT_RULE and is put into RESVD_SET.
  * @locked is handed over to get_map(); IPFW_UH_WUNLOCK() is issued
  * before a successful return.
  *
  * Returns 0 on success, ENOMEM if get_map() fails.
  */
 int
 ipfw_add_protected_rule(struct ip_fw_chain *chain, struct ip_fw *rule,
     int locked)
 {
 	struct ip_fw **newmap, **oldmap;
 
 	newmap = get_map(chain, 1, locked);
 	if (newmap == NULL)
 		return (ENOMEM);
 
 	/* Fill in the rule header */
 	rule->rulenum = IPFW_DEFAULT_RULE;
 	rule->set = RESVD_SET;
 	rule->id = chain->id + 1;
 
 	/* Existing entries go first, the new rule becomes the last one */
 	if (chain->n_rules > 0)
 		bcopy(chain->map, newmap,
 		    chain->n_rules * sizeof(struct ip_fw *));
 	newmap[chain->n_rules] = rule;
 
 	/* We add rule in the end of chain, no need to update skipto cache */
 	oldmap = swap_map(chain, newmap, chain->n_rules + 1);
 	chain->static_len += RULEUSIZE0(rule);
 	IPFW_UH_WUNLOCK(chain);
 	free(oldmap, M_IPFW);
 	return (0);
 }
 
 /*
  * Puts @rule on top of the list of rules to reap pointed to by @head.
  * Must be called with IPFW_UH_WLOCK held.
  */
 void
 ipfw_reap_add(struct ip_fw_chain *chain, struct ip_fw **head,
     struct ip_fw *rule)
 {
 	struct ip_fw *first;
 
 	IPFW_UH_WLOCK_ASSERT(chain);
 
 	/* Unlink rule from everywhere */
 	unref_rule_objects(chain, rule);
 
 	/* Old list head follows the new element */
 	first = *head;
 	rule->next = first;
 	*head = rule;
 }
 
 /*
  * Frees every rule of the list starting at @head, e.g. a list
  * built with ipfw_reap_add().
  * A NULL pointer on input is handled correctly.
  */
 void
 ipfw_reap_rules(struct ip_fw *head)
 {
 	struct ip_fw *next;
 
 	for (; head != NULL; head = next) {
 		/* Fetch the link before the rule is gone */
 		next = head->next;
 		ipfw_free_rule(head);
 	}
 }
 
 /*
  * Checks if @rule is selected by the range request @rt.
  *
  * A rule is NOT matched when any of the following holds:
  *  - it is the default rule (rulenum == IPFW_DEFAULT_RULE) and
  *    IPFW_RCFLAG_DEFAULT is not set;
  *  - IPFW_RCFLAG_ALL is set and the rule belongs to RESVD_SET;
  *  - IPFW_RCFLAG_SET is set and the rule's set differs from @rt->set;
  *  - IPFW_RCFLAG_RANGE is set and the rule number is outside of
  *    [@rt->start_rule, @rt->end_rule].
  *
  * Returns 1 if the rule matches, 0 otherwise.
  */
 int
 ipfw_match_range(struct ip_fw *rule, ipfw_range_tlv *rt)
 {
 	int in_range;
 
 	/* Default rule is matched only on explicit request */
 	if ((rt->flags & IPFW_RCFLAG_DEFAULT) == 0 &&
 	    rule->rulenum == IPFW_DEFAULT_RULE)
 		return (0);
 
 	/* Flush requests never touch rules from the reserved set */
 	if (rule->set == RESVD_SET && (rt->flags & IPFW_RCFLAG_ALL) != 0)
 		return (0);
 
 	/* Set filter */
 	if ((rt->flags & IPFW_RCFLAG_SET) != 0 && rule->set != rt->set)
 		return (0);
 
 	/* Rule number filter, both bounds are inclusive */
 	if ((rt->flags & IPFW_RCFLAG_RANGE) != 0) {
 		in_range = rule->rulenum >= rt->start_rule &&
 		    rule->rulenum <= rt->end_rule;
 		if (!in_range)
 			return (0);
 	}
 
 	return (1);
 }
 
 /* Arguments handed to the *_sets_cb() callbacks through their @arg pointer */
 struct manage_sets_args {
 	uint16_t	set;		/* set to match; callbacks compare it as uint8_t */
 	uint8_t		new_set;	/* the other set of the operation */
 };
 
 static int
 swap_sets_cb(struct namedobj_instance *ni, struct named_object *no,
     void *arg)
 {
 	struct manage_sets_args *args;
 
 	args = (struct manage_sets_args *)arg;
 	if (no->set == (uint8_t)args->set)
 		no->set = args->new_set;
 	else if (no->set == args->new_set)
 		no->set = (uint8_t)args->set;
 	return (0);
 }
 
 static int
 move_sets_cb(struct namedobj_instance *ni, struct named_object *no,
     void *arg)
 {
 	struct manage_sets_args *args;
 
 	args = (struct manage_sets_args *)arg;
 	if (no->set == (uint8_t)args->set)
 		no->set = args->new_set;
 	return (0);
 }
 
 static int
 test_sets_cb(struct namedobj_instance *ni, struct named_object *no,
     void *arg)
 {
 	struct manage_sets_args *args;
 
 	args = (struct manage_sets_args *)arg;
 	if (no->set != (uint8_t)args->set)
 		return (0);
 	if (ipfw_objhash_lookup_name_type(ni, args->new_set,
 	    no->etlv, no->name) != NULL)
 		return (EEXIST);
 	return (0);
 }
 
 /*
  * Generic function to handle moving and swapping sets of named objects.
  *
  * SWAP_ALL, TEST_ALL and MOVE_ALL are applied to all objects of
  * given @type in @ni via ipfw_objhash_foreach_type(), whose result
  * is returned.
  * For COUNT_ONE, TEST_ONE and MOVE_ONE the @set argument is used to
  * pass kidx of the object to operate on.
  *
  * Returns 0 on success, EBUSY or EEXIST when TEST_ONE fails,
  * EINVAL for unknown @cmd.
  */
 int
 ipfw_obj_manage_sets(struct namedobj_instance *ni, uint16_t type,
     uint16_t set, uint8_t new_set, enum ipfw_sets_cmd cmd)
 {
 	struct manage_sets_args args;
 	struct named_object *no;
 
 	args.set = set;
 	args.new_set = new_set;
 
 	/* Commands applied to every object of @type */
 	if (cmd == SWAP_ALL)
 		return (ipfw_objhash_foreach_type(ni, swap_sets_cb,
 		    &args, type));
 	if (cmd == TEST_ALL)
 		return (ipfw_objhash_foreach_type(ni, test_sets_cb,
 		    &args, type));
 	if (cmd == MOVE_ALL)
 		return (ipfw_objhash_foreach_type(ni, move_sets_cb,
 		    &args, type));
 
 	if (cmd != COUNT_ONE && cmd != TEST_ONE && cmd != MOVE_ONE)
 		return (EINVAL);
 
 	/* Single object commands: @set used to pass kidx */
 	no = ipfw_objhash_lookup_kidx(ni, set);
 
 	if (cmd == COUNT_ONE) {
 		/*
 		 * When @new_set is zero - reset object counter,
 		 * otherwise increment it.
 		 */
 		if (new_set == 0)
 			no->ocnt = 0;
 		else
 			no->ocnt++;
 		return (0);
 	}
 
 	if (cmd == MOVE_ONE) {
 		no->set = new_set;
 		return (0);
 	}
 
 	/*
 	 * TEST_ONE.
 	 * First check number of references:
 	 * when it differs, this means other rules are holding
 	 * reference to given object, so it is not possible to
 	 * change its set. Note that refcnt may account references
 	 * to some going-to-be-added rules. Since we don't know
 	 * their numbers (and even if they will be added) it is
 	 * perfectly OK to return error here.
 	 */
 	if (no->ocnt != no->refcnt)
 		return (EBUSY);
 	/* Then look for an object with the same name in the target set */
 	if (ipfw_objhash_lookup_name_type(ni, new_set, type,
 	    no->name) != NULL)
 		return (EEXIST);
 	return (0);
 }
 
 /*
  * Delete rules matching range @rt.
  * Saves number of deleted rules in @ndel.
  *
  * When IPFW_RCFLAG_DYNAMIC is set in @rt->flags only
  * ipfw_expire_dyn_states() is called and @ndel is set to 0.
  * @rt->end_rule may be clamped to IPFW_DEFAULT_RULE - 1.
  * Takes and releases IPFW_UH_WLOCK itself.
  *
  * Returns 0 on success, ENOMEM if the new map can't be allocated.
  */
 static int
 delete_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int *ndel)
 {
 	struct ip_fw *reap, *rule, **map;
 	int end, start;
 	int i, n, ndyn, ofs;
 
 	reap = NULL;
 	IPFW_UH_WLOCK(chain);	/* arbitrate writers */
 
 	/*
 	 * Stage 1: Determine range to inspect.
 	 * Range is half-inclusive, e.g [start, end).
 	 */
 	start = 0;
 	/* By default the last map entry is left out of the range */
 	end = chain->n_rules - 1;
 
 	if ((rt->flags & IPFW_RCFLAG_RANGE) != 0) {
 		start = ipfw_find_rule(chain, rt->start_rule, 0);
 
 		if (rt->end_rule >= IPFW_DEFAULT_RULE)
 			rt->end_rule = IPFW_DEFAULT_RULE - 1;
 		end = ipfw_find_rule(chain, rt->end_rule, UINT32_MAX);
 	}
 
 	if (rt->flags & IPFW_RCFLAG_DYNAMIC) {
 		/*
 		 * Requested deleting only for dynamic states.
 		 */
 		*ndel = 0;
 		ipfw_expire_dyn_states(chain, rt);
 		IPFW_UH_WUNLOCK(chain);
 		return (0);
 	}
 
 	/* Allocate new map of the same size */
 	map = get_map(chain, 0, 1 /* locked */);
 	if (map == NULL) {
 		IPFW_UH_WUNLOCK(chain);
 		return (ENOMEM);
 	}
 
 	n = 0;		/* number of rules to delete */
 	ndyn = 0;	/* how many of them ipfw_is_dyn_rule() reports */
 	ofs = start;	/* next free slot in the new map */
 	/* 1. bcopy the initial part of the map */
 	if (start > 0)
 		bcopy(chain->map, map, start * sizeof(struct ip_fw *));
 	/* 2. copy active rules between start and end */
 	for (i = start; i < end; i++) {
 		rule = chain->map[i];
 		if (ipfw_match_range(rule, rt) == 0) {
 			map[ofs++] = rule;
 			continue;
 		}
 
 		n++;
 		if (ipfw_is_dyn_rule(rule) != 0)
 			ndyn++;
 	}
 	/* 3. copy the final part of the map */
 	bcopy(chain->map + end, map + ofs,
 		(chain->n_rules - end) * sizeof(struct ip_fw *));
 	/* 4. recalculate skipto cache */
 	update_skipto_cache(chain, map);
 	/* 5. swap the maps (under UH_WLOCK + WHLOCK) */
 	map = swap_map(chain, map, chain->n_rules - n);
 	/* 6. Remove all dynamic states originated by deleted rules */
 	if (ndyn > 0)
 		ipfw_expire_dyn_states(chain, rt);
 	/* 7. now remove the rules deleted from the old map */
 	/* From here on @map is what swap_map() returned, indexed as in step 2 */
 	for (i = start; i < end; i++) {
 		rule = map[i];
 		if (ipfw_match_range(rule, rt) == 0)
 			continue;
 		chain->static_len -= RULEUSIZE0(rule);
 		ipfw_reap_add(chain, &reap, rule);
 	}
 	IPFW_UH_WUNLOCK(chain);
 
 	/* Rules are freed after the lock is dropped */
 	ipfw_reap_rules(reap);
 	if (map != NULL)
 		free(map, M_IPFW);
 	*ndel = n;
 	return (0);
 }
 
 /*
  * Moves named objects referenced by rules matching @rt to the
  * set @rt->new_set.  Works in three stages using the manage_sets()
  * handlers of opcode rewriters:
  *  1) COUNT_ONE (increment) is issued for every object reference
  *     found in the matching rules;
  *  2) TEST_ONE is issued for these references until the first
  *     non-zero result;
  *  3) counters are reset with COUNT_ONE and, if stage 2 succeeded,
  *     objects are moved with MOVE_ONE.
  * Rules already in @rt->new_set are skipped, and the last map entry
  * is never inspected.
  * Must be called with IPFW_UH_WLOCK held.
  *
  * Returns 0 on success or the first non-zero TEST_ONE result.
  */
 static int
 move_objects(struct ip_fw_chain *ch, ipfw_range_tlv *rt)
 {
 	struct opcode_obj_rewrite *rw;
 	struct ip_fw *rule;
 	ipfw_insn *cmd;
 	int cmdlen, i, l, c;
 	uint16_t kidx;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	/* Stage 1: count number of references by given rules */
 	for (c = 0, i = 0; i < ch->n_rules - 1; i++) {
 		rule = ch->map[i];
 		if (ipfw_match_range(rule, rt) == 0)
 			continue;
 		if (rule->set == rt->new_set) /* nothing to do */
 			continue;
 		/* Search opcodes with named objects */
 		for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd;
 		    l > 0; l -= cmdlen, cmd += cmdlen) {
 			cmdlen = F_LEN(cmd);
 			rw = find_op_rw(cmd, &kidx, NULL);
 			if (rw == NULL || rw->manage_sets == NULL)
 				continue;
 			/*
 			 * When manage_sets() returns non-zero value to
 			 * COUNT_ONE command, consider this as an object
 			 * doesn't support sets (e.g. disabled with sysctl).
 			 * So, skip checks for this object.
 			 */
 			if (rw->manage_sets(ch, kidx, 1, COUNT_ONE) != 0)
 				continue;
 			c++;
 		}
 	}
 	if (c == 0) /* No objects found */
 		return (0);
 	/* Stage 2: verify "ownership" */
 	/* @c now holds the test result; both loops stop once it is non-zero */
 	for (c = 0, i = 0; (i < ch->n_rules - 1) && c == 0; i++) {
 		rule = ch->map[i];
 		if (ipfw_match_range(rule, rt) == 0)
 			continue;
 		if (rule->set == rt->new_set) /* nothing to do */
 			continue;
 		/* Search opcodes with named objects */
 		for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd;
 		    l > 0 && c == 0; l -= cmdlen, cmd += cmdlen) {
 			cmdlen = F_LEN(cmd);
 			rw = find_op_rw(cmd, &kidx, NULL);
 			if (rw == NULL || rw->manage_sets == NULL)
 				continue;
 			/* Test for ownership and conflicting names */
 			c = rw->manage_sets(ch, kidx,
 			    (uint8_t)rt->new_set, TEST_ONE);
 		}
 	}
 	/* Stage 3: change set and cleanup */
 	/* Runs even when stage 2 failed, so that the counters get reset */
 	for (i = 0; i < ch->n_rules - 1; i++) {
 		rule = ch->map[i];
 		if (ipfw_match_range(rule, rt) == 0)
 			continue;
 		if (rule->set == rt->new_set) /* nothing to do */
 			continue;
 		/* Search opcodes with named objects */
 		for (l = rule->cmd_len, cmdlen = 0, cmd = rule->cmd;
 		    l > 0; l -= cmdlen, cmd += cmdlen) {
 			cmdlen = F_LEN(cmd);
 			rw = find_op_rw(cmd, &kidx, NULL);
 			if (rw == NULL || rw->manage_sets == NULL)
 				continue;
 			/* cleanup object counter */
 			rw->manage_sets(ch, kidx,
 			    0 /* reset counter */, COUNT_ONE);
 			if (c != 0)
 				continue;
 			/* change set */
 			rw->manage_sets(ch, kidx,
 			    (uint8_t)rt->new_set, MOVE_ONE);
 		}
 	}
 	return (c);
 }
 
 /*
  * Moves rules matching range @rt to the set @rt->new_set.
  * Takes and releases IPFW_UH_WLOCK itself.
  *
  * Returns 0 on success, or the error reported by move_objects().
  */
 static int
 move_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
 {
 	struct ip_fw *rule;
 	int error, i;
 
 	IPFW_UH_WLOCK(chain);
 
 	/*
 	 * Move rules with matching parameters to a new set.
 	 * This one is much more complex. We have to ensure
 	 * that all referenced tables (if any) are referenced
 	 * by given rule subset only. Otherwise, we can't move
 	 * them to new set and have to return error.
 	 */
 	error = move_objects(chain, rt);
 	if (error == 0) {
 		/* XXX: We have to do swap holding WLOCK */
 		for (i = 0; i < chain->n_rules; i++) {
 			rule = chain->map[i];
 			if (ipfw_match_range(rule, rt) != 0)
 				rule->set = rt->new_set;
 		}
 	}
 
 	IPFW_UH_WUNLOCK(chain);
 
 	return (error);
 }
 
 /*
+ * Returns pointer to the first instruction of the action part of
+ * @rule which is not a rule modifier (O_ALTQ, O_LOG or O_TAG).
+ * Panics if the action part contains only modifiers.
+ */
+ipfw_insn *
+ipfw_get_action(struct ip_fw *rule)
+{
+	ipfw_insn *insn;
+	int left, len;
+
+	insn = ACTION_PTR(rule);
+	left = rule->cmd_len - rule->act_ofs;
+	while (left > 0) {
+		/* Anything except a modifier is the action itself */
+		if (insn->opcode != O_ALTQ && insn->opcode != O_LOG &&
+		    insn->opcode != O_TAG)
+			return (insn);
+
+		/* Step over the modifier */
+		len = F_LEN(insn);
+		left -= len;
+		insn += len;
+	}
+	panic("%s: rule (%p) has not action opcode", __func__, rule);
+	return (NULL);
+}
+
+/*
  * Clear counters for a specific rule.
  * With @log_only == 0 rule counters are zeroed as well; the logging
  * limit is restored only when the instruction at ACTION_PTR() is O_LOG.
  * Normally run under IPFW_UH_RLOCK, but these are idempotent ops
  * so we only care that rules do not disappear.
  */
 static void
 clear_counters(struct ip_fw *rule, int log_only)
 {
 	ipfw_insn_log *logcmd;
 
 	if (log_only == 0)
 		IPFW_ZERO_RULE_COUNTER(rule);
 
 	logcmd = (ipfw_insn_log *)ACTION_PTR(rule);
 	if (logcmd->o.opcode == O_LOG)
 		logcmd->log_left = logcmd->max_log;
 }
 
 /*
  * Flushes rules counters and/or log values on matching range.
  * IPFW_RCFLAG_DEFAULT is added to @rt->flags, so the default rule
  * takes part in matching too.
  *
  * Returns number of items cleared.
  */
 static int
 clear_range(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int log_only)
 {
 	struct ip_fw *rule;
 	int cleared, i;
 
 	rt->flags |= IPFW_RCFLAG_DEFAULT;
 	cleared = 0;
 
 	IPFW_UH_WLOCK(chain);	/* arbitrate writers */
 	for (i = 0; i < chain->n_rules; i++) {
 		rule = chain->map[i];
 		if (ipfw_match_range(rule, rt) != 0) {
 			clear_counters(rule, log_only);
 			cleared++;
 		}
 	}
 	IPFW_UH_WUNLOCK(chain);
 
 	return (cleared);
 }
 
 /*
  * Validates range TLV supplied by userland: TLV length, order of
  * rule numbers, set numbers and flags (only bits from
  * IPFW_RCFLAG_USER are accepted).
  *
  * Returns 0 if @rt is valid, 1 otherwise.
  */
 static int
 check_range_tlv(ipfw_range_tlv *rt)
 {
 
 	if (rt->head.length != sizeof(*rt) ||
 	    rt->start_rule > rt->end_rule ||
 	    rt->set >= IPFW_MAX_SETS || rt->new_set >= IPFW_MAX_SETS)
 		return (1);
 
 	/* Any flag outside of the user-settable mask is an error */
 	if ((rt->flags & IPFW_RCFLAG_USER) != rt->flags)
 		return (1);
 
 	return (0);
 }
 
 /*
  * Delete rules matching specified parameters
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ipfw_range_tlv ]
  * Reply: [ ipfw_obj_header ipfw_range_tlv ]
  *
  * Saves number of deleted rules in ipfw_range_tlv->new_set.
  *
  * Returns 0 on success, EINVAL on malformed request or
  * the error reported by delete_range().
  */
 static int
 del_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_range_header *rh;
 	int error, ndel;
 
 	if (sd->valsize != sizeof(*rh))
 		return (EINVAL);
 
 	rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize);
 	if (check_range_tlv(&rh->range) != 0)
 		return (EINVAL);
 
 	ndel = 0;
 	error = delete_range(chain, &rh->range, &ndel);
 	if (error == 0) {
 		/* Save number of rules deleted */
 		rh->range.new_set = ndel;
 	}
 	return (error);
 }
 
 /*
  * Move rules/sets matching specified parameters
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ipfw_range_tlv ]
  *
  * Returns 0 on success, EINVAL on malformed request or
  * the error reported by move_range().
  */
 static int
 move_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_range_header *rh;
 
 	if (sd->valsize != sizeof(*rh))
 		return (EINVAL);
 
 	rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize);
 	if (check_range_tlv(&rh->range) == 0)
 		return (move_range(chain, &rh->range));
 
 	return (EINVAL);
 }
 
 /*
  * Clear rule accounting data matching specified parameters
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ipfw_range_tlv ]
  * Reply: [ ipfw_obj_header ipfw_range_tlv ]
  *
  * Only logging limits are reset when the opcode is IP_FW_XRESETLOG.
  * Saves number of cleared rules in ipfw_range_tlv->new_set.
  *
  * Returns 0 on success, EINVAL on malformed request.
  */
 static int
 clear_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_range_header *rh;
 	int log_only, num;
 	char *msg;
 
 	if (sd->valsize != sizeof(*rh))
 		return (EINVAL);
 
 	rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize);
 	if (check_range_tlv(&rh->range) != 0)
 		return (EINVAL);
 
 	log_only = (op3->opcode == IP_FW_XRESETLOG);
 	num = clear_range(chain, &rh->range, log_only);
 
 	/* Pick the message for the (optional) log record */
 	if ((rh->range.flags & IPFW_RCFLAG_ALL) != 0) {
 		if (log_only)
 			msg = "All logging counts reset";
 		else
 			msg = "Accounting cleared";
 	} else {
 		if (log_only)
 			msg = "logging count reset";
 		else
 			msg = "cleared";
 	}
 
 	if (V_fw_verbose)
 		log(LOG_SECURITY | LOG_NOTICE, "ipfw: %s.\n", msg);
 
 	/* Save number of rules cleared */
 	rh->range.new_set = num;
 	return (0);
 }
 
 /*
  * Updates the mask of disabled sets: bits from @rt->set are added to
  * the mask, bits from @rt->new_set are removed from it.
  * The bit of RESVD_SET is always cleared, so this set stays enabled.
  * Must be called with IPFW_UH_WLOCK held; IPFW_WLOCK is taken
  * around the update of V_set_disable.
  */
 static void
 enable_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt)
 {
 	uint32_t v_set;
 
 	IPFW_UH_WLOCK_ASSERT(chain);
 
 	/* Change enabled/disabled sets mask */
 	v_set = (V_set_disable | rt->set) & ~rt->new_set;
 	/*
 	 * Use an unsigned constant: RESVD_SET is the topmost bit and
 	 * shifting 1 into the sign bit of an int is undefined.
 	 */
 	v_set &= ~(1U << RESVD_SET); /* set RESVD_SET always enabled */
 	IPFW_WLOCK(chain);
 	V_set_disable = v_set;
 	IPFW_WUNLOCK(chain);
 }
 
 /*
  * Swaps sets @rt->set and @rt->new_set (@mv == 0) or moves everything
  * from @rt->set to @rt->new_set (@mv != 0).  Sets of rules are changed
  * here, named objects are handled by manage_sets() handlers of
  * opcode rewriters.  The last map entry is never touched.
  * Must be called with IPFW_UH_WLOCK held.
  *
  * Returns 0 on success, EEXIST if TEST_ALL fails for some rewriter.
  */
 static int
 swap_sets(struct ip_fw_chain *chain, ipfw_range_tlv *rt, int mv)
 {
 	struct opcode_obj_rewrite *rw;
 	struct ip_fw *rule;
 	enum ipfw_sets_cmd cmd;
 	uint8_t from, to;
 	int i;
 
 	IPFW_UH_WLOCK_ASSERT(chain);
 
 	if (rt->set == rt->new_set) /* nothing to do */
 		return (0);
 
 	from = (uint8_t)rt->set;
 	to = (uint8_t)rt->new_set;
 
 	/*
 	 * Before moving the rules we need to check that
 	 * there aren't any conflicting named objects.
 	 */
 	if (mv != 0) {
 		for (rw = ctl3_rewriters;
 		    rw < ctl3_rewriters + ctl3_rsize; rw++) {
 			if (rw->manage_sets != NULL &&
 			    rw->manage_sets(chain, from, to, TEST_ALL) != 0)
 				return (EEXIST);
 		}
 	}
 
 	/* Swap or move two sets */
 	for (i = 0; i < chain->n_rules - 1; i++) {
 		rule = chain->map[i];
 		if (rule->set == from)
 			rule->set = to;
 		else if (mv == 0 && rule->set == to)
 			rule->set = from;
 	}
 
 	/* Let the rewriters update their named objects */
 	cmd = (mv != 0) ? MOVE_ALL : SWAP_ALL;
 	for (rw = ctl3_rewriters; rw < ctl3_rewriters + ctl3_rsize; rw++) {
 		if (rw->manage_sets != NULL)
 			rw->manage_sets(chain, from, to, cmd);
 	}
 	return (0);
 }
 
 /*
  * Swaps or moves set
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_header ipfw_range_tlv ]
  *
  * Handles IP_FW_SET_SWAP, IP_FW_SET_MOVE and IP_FW_SET_ENABLE.
  * For IP_FW_SET_ENABLE the set/new_set fields are bitmasks and are
  * not range-checked; for other opcodes they must be below
  * IPFW_MAX_SETS.
  *
  * Returns 0 on success, EINVAL on malformed request or
  * the error reported by swap_sets().
  */
 static int
 manage_sets(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_range_header *rh;
 	int ret;
 
 	if (sd->valsize != sizeof(*rh))
 		return (EINVAL);
 
 	rh = (ipfw_range_header *)ipfw_get_sopt_space(sd, sd->valsize);
 
 	/*
 	 * The result is handed back as an errno value, so report
 	 * a malformed TLV as EINVAL instead of a bare 1.
 	 */
 	if (rh->range.head.length != sizeof(ipfw_range_tlv))
 		return (EINVAL);
 	/* enable_sets() expects bitmasks. */
 	if (op3->opcode != IP_FW_SET_ENABLE &&
 	    (rh->range.set >= IPFW_MAX_SETS ||
 	    rh->range.new_set >= IPFW_MAX_SETS))
 		return (EINVAL);
 
 	ret = 0;
 	IPFW_UH_WLOCK(chain);
 	switch (op3->opcode) {
 	case IP_FW_SET_SWAP:
 	case IP_FW_SET_MOVE:
 		ret = swap_sets(chain, &rh->range,
 		    op3->opcode == IP_FW_SET_MOVE);
 		break;
 	case IP_FW_SET_ENABLE:
 		enable_sets(chain, &rh->range);
 		break;
 	}
 	IPFW_UH_WUNLOCK(chain);
 
 	return (ret);
 }
 
 /**
  * Remove all rules with given number, or do set manipulation.
  * Assumes chain != NULL && *chain != NULL.
  *
  * The argument is an uint32_t. The low 16 bit are the rule or set number;
  * the next 8 bits are the new set; the top 8 bits indicate the command:
  *
  *	0	delete rules numbered "rulenum"
  *	1	delete rules in set "rulenum"
  *	2	move rules "rulenum" to set "new_set"
  *	3	move rules from set "rulenum" to set "new_set"
  *	4	swap sets "rulenum" and "new_set"
  *	5	delete rules "rulenum" and set "new_set"
  *
  * The request is converted into ipfw_range_tlv and handed over to
  * delete_range(), move_range() or swap_sets().
  */
 static int
 del_entry(struct ip_fw_chain *chain, uint32_t arg)
 {
 	ipfw_range_tlv rt;
 	uint32_t num;	/* rule number or old_set */
 	uint8_t cmd, new_set;
 	int error, ndel;
 
 	num = arg & 0xffff;
 	new_set = (arg >> 16) & 0xff;
 	cmd = (arg >> 24) & 0xff;
 
 	if (cmd > 5 || new_set > RESVD_SET)
 		return (EINVAL);
 	switch (cmd) {
 	case 0:
 	case 2:
 	case 5:
 		/* @num is a rule number */
 		if (num >= IPFW_DEFAULT_RULE)
 			return (EINVAL);
 		break;
 	default:
 		/* @num is old_set */
 		if (num > RESVD_SET)
 			return (EINVAL);
 		break;
 	}
 
 	/* Convert old requests into new representation */
 	memset(&rt, 0, sizeof(rt));
 	rt.start_rule = num;
 	rt.end_rule = num;
 	rt.set = num;
 	rt.new_set = new_set;
 
 	switch (cmd) {
 	case 0: /* delete rules numbered "rulenum" */
 		rt.flags |= (num == 0) ? IPFW_RCFLAG_ALL : IPFW_RCFLAG_RANGE;
 		break;
 	case 1: /* delete rules in set "rulenum" */
 		rt.flags |= IPFW_RCFLAG_SET;
 		break;
 	case 5: /* delete rules "rulenum" and set "new_set" */
 		rt.flags |= IPFW_RCFLAG_RANGE | IPFW_RCFLAG_SET;
 		rt.set = new_set;
 		rt.new_set = 0;
 		break;
 	case 2: /* move rules "rulenum" to set "new_set" */
 		rt.flags |= IPFW_RCFLAG_RANGE;
 		return (move_range(chain, &rt));
 	case 3: /* move rules from set "rulenum" to set "new_set" */
 	case 4: /* swap sets "rulenum" and "new_set" */
 		IPFW_UH_WLOCK(chain);
 		error = swap_sets(chain, &rt, cmd == 3);
 		IPFW_UH_WUNLOCK(chain);
 		return (error);
 	default:
 		return (ENOTSUP);
 	}
 
 	/* Only delete requests (0, 1 and 5) get here */
 	error = delete_range(chain, &rt, &ndel);
 	if (error != 0)
 		return (error);
 
 	/* Nothing deleted for a request naming a specific rule */
 	if (ndel == 0 && cmd != 1 && num != 0)
 		return (EINVAL);
 
 	return (0);
 }
 
 /**
  * Reset some or all counters on firewall rules.
  * The argument `arg' is an u_int32_t. The low 16 bit are the rule number,
  * the next 8 bits are the set number, the top 8 bits are the command:
  *	0	work with rules from all set's;
  *	1	work with rules only from specified set.
  * Specified rule number is zero if we want to clear all entries.
  * log_only is 1 if we only want to reset logs, zero otherwise.
  *
  * Returns 0 on success, EINVAL for a bad command/set or when no rule
  * with the given number exists.
  */
 static int
 zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int log_only)
 {
 	struct ip_fw *rule;
 	char *msg;
 	uint16_t rulenum;
 	uint8_t cmd, set;
 	int found, i;
 
 	rulenum = arg & 0xffff;
 	set = (arg >> 16) & 0xff;
 	cmd = (arg >> 24) & 0xff;
 
 	if (cmd > 1 || (cmd == 1 && set > RESVD_SET))
 		return (EINVAL);
 
 	IPFW_UH_RLOCK(chain);
 	if (rulenum != 0) {
 		/* Single rule number requested */
 		found = 0;
 		for (i = 0; i < chain->n_rules; i++) {
 			rule = chain->map[i];
 			if (rule->rulenum > rulenum)
 				break;
 			if (rule->rulenum != rulenum)
 				continue;
 			if (cmd == 0 || rule->set == set)
 				clear_counters(rule, log_only);
 			found = 1;
 		}
 		if (found == 0) {	/* we did not find any matching rules */
 			IPFW_UH_RUNLOCK(chain);
 			return (EINVAL);
 		}
 		msg = log_only ? "logging count reset" : "cleared";
 	} else {
 		/* All rules, optionally limited to one set */
 		V_norule_counter = 0;
 		for (i = 0; i < chain->n_rules; i++) {
 			rule = chain->map[i];
 			if (cmd != 1 || rule->set == set)
 				clear_counters(rule, log_only);
 		}
 		msg = log_only ? "All logging counts reset" :
 		    "Accounting cleared";
 	}
 	IPFW_UH_RUNLOCK(chain);
 
 	if (V_fw_verbose) {
 		if (rulenum != 0)
 			log(LOG_SECURITY | LOG_NOTICE,
 			    "ipfw: Entry %d %s.\n", rulenum, msg);
 		else
 			log(LOG_SECURITY | LOG_NOTICE, "ipfw: %s.\n", msg);
 	}
 	return (0);
 }
 
 
 /*
  * Check rule head in FreeBSD11 format
  *
  * Verifies that @size agrees with the size derived from the rule
  * itself, that the action offset lies inside of the opcode array
  * and that the rule number is below IPFW_DEFAULT_RULE.  Opcodes
  * are then checked by check_ipfw_rule_body().
  *
  * Returns 0 on success, an error code otherwise.
  */
 static int
 check_ipfw_rule1(struct ip_fw_rule *rule, int size,
     struct rule_check_info *ci)
 {
 	int expected;
 
 	if (size < sizeof(*rule)) {
 		printf("ipfw: rule too short\n");
 		return (EINVAL);
 	}
 
 	/* Check for valid cmd_len */
 	expected = roundup2(RULESIZE(rule), sizeof(uint64_t));
 	if (size != expected) {
 		printf("ipfw: size mismatch (have %d want %d)\n",
 		    size, expected);
 		return (EINVAL);
 	}
 
 	/* Action has to start inside of the opcode array */
 	if (rule->act_ofs >= rule->cmd_len) {
 		printf("ipfw: bogus action offset (%u > %u)\n",
 		    rule->act_ofs, rule->cmd_len - 1);
 		return (EINVAL);
 	}
 
 	/* Number of the default rule can't be used */
 	if (rule->rulenum >= IPFW_DEFAULT_RULE)
 		return (EINVAL);
 
 	return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci));
 }
 
 /*
  * Check rule head in FreeBSD8 format
  *
  * Verifies that @size agrees with the size derived from cmd_len,
  * that the action offset lies inside of the opcode array and that
  * the rule number is below IPFW_DEFAULT_RULE.  Opcodes are then
  * checked by check_ipfw_rule_body().
  *
  * Returns 0 on success, an error code otherwise.
  */
 static int
 check_ipfw_rule0(struct ip_fw_rule0 *rule, int size,
     struct rule_check_info *ci)
 {
 	int expected;
 
 	if (size < sizeof(*rule)) {
 		printf("ipfw: rule too short\n");
 		return (EINVAL);
 	}
 
 	/* Check for valid cmd_len */
 	expected = sizeof(*rule) + rule->cmd_len * 4 - 4;
 	if (size != expected) {
 		printf("ipfw: size mismatch (have %d want %d)\n",
 		    size, expected);
 		return (EINVAL);
 	}
 
 	/* Action has to start inside of the opcode array */
 	if (rule->act_ofs >= rule->cmd_len) {
 		printf("ipfw: bogus action offset (%u > %u)\n",
 		    rule->act_ofs, rule->cmd_len - 1);
 		return (EINVAL);
 	}
 
 	/* Number of the default rule can't be used */
 	if (rule->rulenum >= IPFW_DEFAULT_RULE)
 		return (EINVAL);
 
 	return (check_ipfw_rule_body(rule->cmd, rule->cmd_len, ci));
 }
 
 /*
  * Validates the opcode array of a rule.
  * Walks @cmd_len words starting at @cmd and checks the size (and for
  * some opcodes the arguments) of each instruction.
  * Exactly one action opcode is required and it must be the last one;
  * O_EXTERNAL_ACTION may be followed by one O_EXTERNAL_INSTANCE or
  * O_EXTERNAL_DATA instruction.
  * Several opcodes (state, limit, table lookup, interface and external
  * action ones) increment @ci->object_opcodes.
  * As a side effect log_left of O_LOG instructions is set to max_log.
  *
  * Returns 0 on success, EINVAL on invalid rule, EPROTONOSUPPORT for
  * IPv6 opcodes when the kernel is built without INET6.
  */
 static int
 check_ipfw_rule_body(ipfw_insn *cmd, int cmd_len, struct rule_check_info *ci)
 {
 	int cmdlen, l;
 	int have_action;
 
 	have_action = 0;
 
 	/*
 	 * Now go for the individual checks. Very simple ones, basically only
 	 * instruction sizes.
 	 */
 	for (l = cmd_len; l > 0 ; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 		/* Instruction must fit into the rest of the array */
 		if (cmdlen > l) {
 			printf("ipfw: opcode %d size truncated\n",
 			    cmd->opcode);
 			return EINVAL;
 		}
 		switch (cmd->opcode) {
 		case O_PROBE_STATE:
 		case O_KEEP_STATE:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			ci->object_opcodes++;
 			break;
 		case O_PROTO:
 		case O_IP_SRC_ME:
 		case O_IP_DST_ME:
 		case O_LAYER2:
 		case O_IN:
 		case O_FRAG:
 		case O_DIVERTED:
 		case O_IPOPT:
 		case O_IPTOS:
 		case O_IPPRECEDENCE:
 		case O_IPVER:
 		case O_SOCKARG:
 		case O_TCPFLAGS:
 		case O_TCPOPTS:
 		case O_ESTAB:
 		case O_VERREVPATH:
 		case O_VERSRCREACH:
 		case O_ANTISPOOF:
 		case O_IPSEC:
 #ifdef INET6
 		case O_IP6_SRC_ME:
 		case O_IP6_DST_ME:
 		case O_EXT_HDR:
 		case O_IP6:
 #endif
 		case O_IP4:
 		case O_TAG:
 		case O_SKIP_ACTION:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			break;
 
 		case O_EXTERNAL_ACTION:
 			if (cmd->arg1 == 0 ||
 			    cmdlen != F_INSN_SIZE(ipfw_insn)) {
 				printf("ipfw: invalid external "
 				    "action opcode\n");
 				return (EINVAL);
 			}
 			ci->object_opcodes++;
 			/*
 			 * Do we have O_EXTERNAL_INSTANCE or O_EXTERNAL_DATA
 			 * opcode?
 			 */
 			if (l != cmdlen) {
 				/* Step to the next instruction by hand */
 				l -= cmdlen;
 				cmd += cmdlen;
 				cmdlen = F_LEN(cmd);
 				if (cmd->opcode == O_EXTERNAL_DATA)
 					goto check_action;
 				if (cmd->opcode != O_EXTERNAL_INSTANCE) {
 					printf("ipfw: invalid opcode "
 					    "next to external action %u\n",
 					    cmd->opcode);
 					return (EINVAL);
 				}
 				if (cmd->arg1 == 0 ||
 				    cmdlen != F_INSN_SIZE(ipfw_insn)) {
 					printf("ipfw: invalid external "
 					    "action instance opcode\n");
 					return (EINVAL);
 				}
 				ci->object_opcodes++;
 			}
 			goto check_action;
 
 		case O_FIB:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			if (cmd->arg1 >= rt_numfibs) {
 				printf("ipfw: invalid fib number %d\n",
 					cmd->arg1);
 				return EINVAL;
 			}
 			break;
 
 		case O_SETFIB:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			/* Only the low 15 bits of arg1 are checked as fib number */
 			if ((cmd->arg1 != IP_FW_TARG) &&
 			    ((cmd->arg1 & 0x7FFF) >= rt_numfibs)) {
 				printf("ipfw: invalid fib number %d\n",
 					cmd->arg1 & 0x7FFF);
 				return EINVAL;
 			}
 			goto check_action;
 
 		case O_UID:
 		case O_GID:
 		case O_JAIL:
 		case O_IP_SRC:
 		case O_IP_DST:
 		case O_TCPSEQ:
 		case O_TCPACK:
 		case O_PROB:
 		case O_ICMPTYPE:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32))
 				goto bad_size;
 			break;
 
 		case O_LIMIT:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_limit))
 				goto bad_size;
 			ci->object_opcodes++;
 			break;
 
 		case O_LOG:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_log))
 				goto bad_size;
 
 			/* Start with the full logging budget */
 			((ipfw_insn_log *)cmd)->log_left =
 			    ((ipfw_insn_log *)cmd)->max_log;
 
 			break;
 
 		case O_IP_SRC_MASK:
 		case O_IP_DST_MASK:
 			/* only odd command lengths */
 			if ((cmdlen & 1) == 0)
 				goto bad_size;
 			break;
 
 		case O_IP_SRC_SET:
 		case O_IP_DST_SET:
 			if (cmd->arg1 == 0 || cmd->arg1 > 256) {
 				printf("ipfw: invalid set size %d\n",
 					cmd->arg1);
 				return EINVAL;
 			}
 			/* One extra word per 32 set members */
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
 			    (cmd->arg1+31)/32 )
 				goto bad_size;
 			break;
 
 		case O_IP_SRC_LOOKUP:
 			if (cmdlen > F_INSN_SIZE(ipfw_insn_u32))
 				goto bad_size;
 			/* FALLTHROUGH */
 		case O_IP_DST_LOOKUP:
 			if (cmd->arg1 >= V_fw_tables_max) {
 				printf("ipfw: invalid table number %d\n",
 				    cmd->arg1);
 				return (EINVAL);
 			}
 			if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
 			    cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1 &&
 			    cmdlen != F_INSN_SIZE(ipfw_insn_u32))
 				goto bad_size;
 			ci->object_opcodes++;
 			break;
 		case O_IP_FLOW_LOOKUP:
 			if (cmd->arg1 >= V_fw_tables_max) {
 				printf("ipfw: invalid table number %d\n",
 				    cmd->arg1);
 				return (EINVAL);
 			}
 			if (cmdlen != F_INSN_SIZE(ipfw_insn) &&
 			    cmdlen != F_INSN_SIZE(ipfw_insn_u32))
 				goto bad_size;
 			ci->object_opcodes++;
 			break;
 		case O_MACADDR2:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_mac))
 				goto bad_size;
 			break;
 
 		case O_NOP:
 		case O_IPID:
 		case O_IPTTL:
 		case O_IPLEN:
 		case O_TCPDATALEN:
 		case O_TCPMSS:
 		case O_TCPWIN:
 		case O_TAGGED:
 			if (cmdlen < 1 || cmdlen > 31)
 				goto bad_size;
 			break;
 
 		case O_DSCP:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) + 1)
 				goto bad_size;
 			break;
 
 		case O_MAC_TYPE:
 		case O_IP_SRCPORT:
 		case O_IP_DSTPORT: /* XXX artificial limit, 30 port pairs */
 			if (cmdlen < 2 || cmdlen > 31)
 				goto bad_size;
 			break;
 
 		case O_RECV:
 		case O_XMIT:
 		case O_VIA:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_if))
 				goto bad_size;
 			ci->object_opcodes++;
 			break;
 
 		case O_ALTQ:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_altq))
 				goto bad_size;
 			break;
 
 		case O_PIPE:
 		case O_QUEUE:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			goto check_action;
 
 		case O_FORWARD_IP:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa))
 				goto bad_size;
 			goto check_action;
 #ifdef INET6
 		case O_FORWARD_IP6:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_sa6))
 				goto bad_size;
 			goto check_action;
 #endif /* INET6 */
 
 		case O_DIVERT:
 		case O_TEE:
 			/* Rejected while the divert hook is not set */
 			if (ip_divert_ptr == NULL)
 				return EINVAL;
 			else
 				goto check_size;
 		case O_NETGRAPH:
 		case O_NGTEE:
 			/* Rejected while the netgraph hook is not set */
 			if (ng_ipfw_input_p == NULL)
 				return EINVAL;
 			else
 				goto check_size;
 		case O_NAT:
 			if (!IPFW_NAT_LOADED)
 				return EINVAL;
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_nat))
  				goto bad_size;		
  			goto check_action;
 		case O_CHECK_STATE:
 			ci->object_opcodes++;
 			/* FALLTHROUGH */
 		case O_FORWARD_MAC: /* XXX not implemented yet */
 		case O_COUNT:
 		case O_ACCEPT:
 		case O_DENY:
 		case O_REJECT:
 		case O_SETDSCP:
 #ifdef INET6
 		case O_UNREACH6:
 #endif
 		case O_SKIPTO:
 		case O_REASS:
 		case O_CALLRETURN:
 check_size:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 check_action:
 			/* Common tail for all action opcodes */
 			if (have_action) {
 				printf("ipfw: opcode %d, multiple actions"
 					" not allowed\n",
 					cmd->opcode);
 				return (EINVAL);
 			}
 			have_action = 1;
 			if (l != cmdlen) {
 				printf("ipfw: opcode %d, action must be"
 					" last opcode\n",
 					cmd->opcode);
 				return (EINVAL);
 			}
 			break;
 #ifdef INET6
 		case O_IP6_SRC:
 		case O_IP6_DST:
 			if (cmdlen != F_INSN_SIZE(struct in6_addr) +
 			    F_INSN_SIZE(ipfw_insn))
 				goto bad_size;
 			break;
 
 		case O_FLOW6ID:
 			if (cmdlen != F_INSN_SIZE(ipfw_insn_u32) +
 			    ((ipfw_insn_u32 *)cmd)->o.arg1)
 				goto bad_size;
 			break;
 
 		case O_IP6_SRC_MASK:
 		case O_IP6_DST_MASK:
 			if ( !(cmdlen & 1) || cmdlen > 127)
 				goto bad_size;
 			break;
 		case O_ICMP6TYPE:
 			if( cmdlen != F_INSN_SIZE( ipfw_insn_icmp6 ) )
 				goto bad_size;
 			break;
 #endif
 
 		default:
 			/* Tell missing IPv6 support apart from unknown opcodes */
 			switch (cmd->opcode) {
 #ifndef INET6
 			case O_IP6_SRC_ME:
 			case O_IP6_DST_ME:
 			case O_EXT_HDR:
 			case O_IP6:
 			case O_UNREACH6:
 			case O_IP6_SRC:
 			case O_IP6_DST:
 			case O_FLOW6ID:
 			case O_IP6_SRC_MASK:
 			case O_IP6_DST_MASK:
 			case O_ICMP6TYPE:
 				printf("ipfw: no IPv6 support in kernel\n");
 				return (EPROTONOSUPPORT);
 #endif
 			default:
 				printf("ipfw: opcode %d, unknown opcode\n",
 					cmd->opcode);
 				return (EINVAL);
 			}
 		}
 	}
 	if (have_action == 0) {
 		printf("ipfw: missing action\n");
 		return (EINVAL);
 	}
 	return 0;
 
 bad_size:
 	/* Shared exit for every instruction size mismatch */
 	printf("ipfw: opcode %d size %d wrong\n",
 		cmd->opcode, cmdlen);
 	return (EINVAL);
 }
 
 
 /*
  * Translation of requests for compatibility with FreeBSD 7.2/8.
  * A static variable tells us if we have an old client from userland,
  * and if necessary we translate requests and responses between the
  * two formats.
  */
 static int is7 = 0;
 
 /*
  * Rule header layout used when talking to FreeBSD 7.2 clients.
  * The instruction stream follows the fixed part, starting at 'cmd'.
  */
 struct ip_fw7 {
 	struct ip_fw7	*next;		/* linked list of rules     */
 	struct ip_fw7	*next_rule;	/* ptr to next [skipto] rule    */
 	/* 'next_rule' is used to pass up 'set_disable' status      */
 
 	uint16_t	act_ofs;	/* offset of action in 32-bit units */
 	uint16_t	cmd_len;	/* # of 32-bit words in cmd */
 	uint16_t	rulenum;	/* rule number          */
 	uint8_t		set;		/* rule set (0..31)     */
 	/* RESVD_SET (31) is the set for default and persistent rules */
 	uint8_t		_pad;		/* padding          */
 	/* No 'id' field in this layout: the rule id exists only in v.8 */
 	/* These fields are present in all rules.           */
 	uint64_t	pcnt;		/* Packet counter       */
 	uint64_t	bcnt;		/* Byte counter         */
 	uint32_t	timestamp;	/* tv_sec of last match     */
 
 	ipfw_insn	cmd[1];		/* storage for commands     */
 };
 
 /* Converters between the current rule0 layout and the 7.2/8 layouts. */
 static int convert_rule_to_7(struct ip_fw_rule0 *rule);
 static int convert_rule_to_8(struct ip_fw_rule0 *rule);
 
 /*
  * Size in bytes of a rule in 7.2 format: the fixed header plus cmd_len
  * 32-bit words of instructions. 4 bytes are subtracted, presumably because
  * sizeof(struct ip_fw7) already accounts for the first cmd[] slot
  * (NOTE(review): assumes sizeof(ipfw_insn) == 4 -- confirm).
  */
 #ifndef RULESIZE7
 #define RULESIZE7(rule)  (sizeof(struct ip_fw7) + \
 	((struct ip_fw7 *)(rule))->cmd_len * 4 - 4)
 #endif
 
 
 /*
  * Copy the static and dynamic rules to the supplied buffer
  * and return the amount of space actually used.
  * Must be run under IPFW_UH_RLOCK
  *
  * @chain - ruleset to export (every entry of chain->map is visited).
  * @buf   - destination buffer.
  * @space - size of @buf in bytes.
  *
  * When the legacy flag 'is7' is set, rules are written in FreeBSD 7.2
  * format; a rule that does not fit is silently skipped and any conversion
  * failure makes the function return 0. Otherwise rules are written in
  * ip_fw_rule0 format and export stops at the first rule that does not fit
  * or fails with a fatal index-rewrite error. Dynamic states are appended
  * after the static rules in both cases.
  */
 static size_t
 ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t space)
 {
 	char *bp = buf;
 	char *ep = bp + space;
 	struct ip_fw *rule;
 	struct ip_fw_rule0 *dst;
 	struct timeval boottime;
 	int error, i, l, warnflag;
 	time_t	boot_seconds;
 
 	warnflag = 0;
 
 	/* Boot time is added to every non-zero rule timestamp below. */
 	getboottime(&boottime);
         boot_seconds = boottime.tv_sec;
 	for (i = 0; i < chain->n_rules; i++) {
 		rule = chain->map[i];
 
 		if (is7) {
 		    /* Convert rule to FreeBSD 7.2 format */
 		    l = RULESIZE7(rule);
 		    if (bp + l + sizeof(uint32_t) <= ep) {
 			bcopy(rule, bp, l + sizeof(uint32_t));
 			/* Replace kernel object indexes with legacy ones. */
 			error = set_legacy_obj_kidx(chain,
 			    (struct ip_fw_rule0 *)bp);
 			if (error != 0)
 				return (0);
 			error = convert_rule_to_7((struct ip_fw_rule0 *) bp);
 			if (error)
 				return 0; /*XXX correct? */
 			/*
 			 * XXX HACK. Store the disable mask in the "next"
 			 * pointer in a wild attempt to keep the ABI the same.
 			 * Why do we do this on EVERY rule?
 			 */
 			bcopy(&V_set_disable,
 				&(((struct ip_fw7 *)bp)->next_rule),
 				sizeof(V_set_disable));
 			if (((struct ip_fw7 *)bp)->timestamp)
 			    ((struct ip_fw7 *)bp)->timestamp += boot_seconds;
 			bp += l;
 		    }
 		    continue; /* go to next rule */
 		}
 
 		l = RULEUSIZE0(rule);
 		if (bp + l > ep) { /* should not happen */
 			printf("overflow dumping static rules\n");
 			break;
 		}
 		dst = (struct ip_fw_rule0 *)bp;
 		export_rule0(rule, dst, l);
 		/* The result is examined only after the mask has been stored. */
 		error = set_legacy_obj_kidx(chain, dst);
 
 		/*
 		 * XXX HACK. Store the disable mask in the "next"
 		 * pointer in a wild attempt to keep the ABI the same.
 		 * Why do we do this on EVERY rule?
 		 *
 		 * XXX: "ipfw set show" (ab)uses IP_FW_GET to read disabled mask
 		 * so we need to fail _after_ saving at least one mask.
 		 */
 		bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
 		if (dst->timestamp)
 			dst->timestamp += boot_seconds;
 		bp += l;
 
 		if (error != 0) {
 			if (error == 2) {
 				/* Non-fatal table rewrite error. */
 				warnflag = 1;
 				continue;
 			}
 			printf("Stop on rule %d. Fail to convert table\n",
 			    rule->rulenum);
 			break;
 		}
 	}
 	/* NOTE(review): the process name argument is an empty string. */
 	if (warnflag != 0)
 		printf("ipfw: process %s is using legacy interfaces,"
 		    " consider rebuilding\n", "");
 	ipfw_get_dynamic(chain, &bp, ep); /* protected by the dynamic lock */
 	return (bp - (char *)buf);
 }
 
 
 /*
  * Working state shared by the configuration dump helpers.
  */
 struct dump_args {
 	uint32_t	b;	/* start rule (index into chain->map) */
 	uint32_t	e;	/* end rule (index into chain->map, exclusive) */
 	uint32_t	rcount;	/* number of rules */
 	uint32_t	rsize;	/* rules size, in bytes */
 	uint32_t	tcount;	/* number of named objects (tables and others) */
 	int		rcounters;	/* non-zero if counters are exported */
 	/*
 	 * Index bitmask of used named objects: table bits come first,
 	 * bits for other objects start at word IPFW_TABLES_MAX / 32.
 	 */
 	uint32_t	*bmask;
 };
 
 /*
  * Fills @ntlv with the exported form of named object @no:
  * TLV type and fixed length, kernel index and object name.
  */
 void
 ipfw_export_obj_ntlv(struct named_object *no, ipfw_obj_ntlv *ntlv)
 {
 
 	/* Object identity: name (always NUL-terminated) and kernel index. */
 	strlcpy(ntlv->name, no->name, sizeof(ntlv->name));
 	ntlv->idx = no->kidx;
 
 	/* TLV header: type is taken from the object, length is fixed. */
 	ntlv->head.length = sizeof(*ntlv);
 	ntlv->head.type = no->etlv;
 }
 
 /*
  * Looks up the named object with kernel index @kidx in instance @ni and
  * exports it as ipfw_obj_ntlv. Space for the TLV is taken from @sd.
  *
  * Returns 0 on success, ENOMEM if @sd has no room for the TLV.
  */
 static int
 export_objhash_ntlv(struct namedobj_instance *ni, uint16_t kidx,
     struct sockopt_data *sd)
 {
 	struct named_object *obj;
 	ipfw_obj_ntlv *out;
 
 	obj = ipfw_objhash_lookup_kidx(ni, kidx);
 	KASSERT(obj != NULL, ("invalid object kernel index passed"));
 
 	out = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*out));
 	if (out != NULL) {
 		ipfw_export_obj_ntlv(obj, out);
 		return (0);
 	}
 
 	return (ENOMEM);
 }
 
 /*
  * Exports every object of instance @ni whose kernel index is marked in
  * da->bmask. Scans at most IPFW_TABLES_MAX indexes and stops early once
  * da->tcount, decremented for each exported object, reaches zero.
  *
  * Returns 0 on success or the error reported by export_objhash_ntlv().
  */
 static int
 export_named_objects(struct namedobj_instance *ni, struct dump_args *da,
     struct sockopt_data *sd)
 {
 	int error, i;
 
 	for (i = 0; i < IPFW_TABLES_MAX && da->tcount > 0; i++) {
 		/* Unsigned constant: shifting a signed 1 into bit 31 is UB. */
 		if ((da->bmask[i / 32] & (1U << (i % 32))) == 0)
 			continue;
 		if ((error = export_objhash_ntlv(ni, i, sd)) != 0)
 			return (error);
 		da->tcount--;
 	}
 	return (0);
 }
 
 /*
  * Writes the IPFW_TLV_TBLNAME_LIST container into @sd: a ctlv header
  * followed by one ipfw_obj_ntlv per object marked in da->bmask, tables
  * first and other named objects after them.
  *
  * Note that da->bmask is advanced to the non-table half of the bitmask.
  *
  * Returns 0 on success.
  */
 static int
 dump_named_objects(struct ip_fw_chain *ch, struct dump_args *da,
     struct sockopt_data *sd)
 {
 	ipfw_obj_ctlv *hdr;
 	int rc;
 
 	MPASS(da->tcount > 0);
 
 	/* Container header goes first */
 	hdr = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*hdr));
 	if (hdr == NULL)
 		return (ENOMEM);
 	hdr->objsize = sizeof(ipfw_obj_ntlv);
 	hdr->count = da->tcount;
 	hdr->head.length = sizeof(*hdr) +
 	    da->tcount * sizeof(ipfw_obj_ntlv);
 	hdr->head.type = IPFW_TLV_TBLNAME_LIST;
 
 	/* Table names (if any) are dumped before everything else */
 	rc = export_named_objects(ipfw_get_table_objhash(ch), da, sd);
 	if (rc == 0) {
 		/* Switch to the second half of the mask: other objects */
 		da->bmask += IPFW_TABLES_MAX / 32;
 		rc = export_named_objects(CHAIN_TO_SRV(ch), da, sd);
 	}
 
 	return (rc);
 }
 
 /*
  * Writes the IPFW_TLV_RULE_LIST container into @sd: a ctlv header
  * followed by the rules chain->map[da->b] .. chain->map[da->e - 1].
  * Each rule is preceded by its counters when da->rcounters is set.
  *
  * Returns 0 on success, ENOMEM if @sd runs out of space.
  */
 static int
 dump_static_rules(struct ip_fw_chain *chain, struct dump_args *da,
     struct sockopt_data *sd)
 {
 	ipfw_obj_ctlv *hdr;
 	struct ip_fw *krule;
 	caddr_t out;
 	int idx, len;
 
 	/* Container header */
 	hdr = (ipfw_obj_ctlv *)ipfw_get_sopt_space(sd, sizeof(*hdr));
 	if (hdr == NULL)
 		return (ENOMEM);
 	hdr->count = da->rcount;
 	hdr->head.length = da->rsize + sizeof(*hdr);
 	hdr->head.type = IPFW_TLV_RULE_LIST;
 
 	/* One TLV per rule in the requested range */
 	idx = da->b;
 	while (idx < da->e) {
 		krule = chain->map[idx];
 
 		len = RULEUSIZE1(krule) + sizeof(ipfw_obj_tlv);
 		if (da->rcounters != 0)
 			len += sizeof(struct ip_fw_bcounter);
 
 		out = (caddr_t)ipfw_get_sopt_space(sd, len);
 		if (out == NULL)
 			return (ENOMEM);
 		export_rule1(krule, out, len, da->rcounters);
 
 		idx++;
 	}
 
 	return (0);
 }
 
 /*
  * Marks object index @kidx of type @etlv as used in bitmask @bmask.
  *
  * Returns 1 if the bit was newly set, 0 if it had been set already.
  */
 int
 ipfw_mark_object_kidx(uint32_t *bmask, uint16_t etlv, uint16_t kidx)
 {
 	uint32_t bidx, bit;
 
 	/*
 	 * Maintain separate bitmasks for table and non-table objects.
 	 */
 	bidx = (etlv == IPFW_TLV_TBL_NAME) ? 0: IPFW_TABLES_MAX / 32;
 	bidx += kidx / 32;
 	/* Unsigned constant: shifting a signed 1 into bit 31 is UB. */
 	bit = 1U << (kidx % 32);
 	if ((bmask[bidx] & bit) != 0)
 		return (0);
 
 	bmask[bidx] |= bit;
 	return (1);
 }
 
 /*
  * Walks the opcodes of @rule and sets a bit in da->bmask for every
  * named object index they refer to. da->tcount is bumped for each
  * index marked for the first time.
  * Used to build the bitmask of tables/objects referenced by a ruleset
  * or its part.
  */
 static void
 mark_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule,
     struct dump_args *da)
 {
 	struct opcode_obj_rewrite *rw;
 	ipfw_insn *insn;
 	int ilen, left;
 	uint16_t kidx;
 	uint8_t subtype;
 
 	insn = rule->cmd;
 	left = rule->cmd_len;
 	while (left > 0) {
 		ilen = F_LEN(insn);
 
 		/* Only opcodes with a registered rewriter carry an index */
 		rw = find_op_rw(insn, &kidx, &subtype);
 		if (rw != NULL &&
 		    ipfw_mark_object_kidx(da->bmask, rw->etlv, kidx))
 			da->tcount++;
 
 		left -= ilen;
 		insn += ilen;
 	}
 }
 
 /*
  * Dumps requested objects data
  * Data layout (version 0)(current):
  * Request: [ ipfw_cfg_lheader ] + IPFW_CFG_GET_* flags
  *   size = ipfw_cfg_lheader.size
  * Reply: [ ipfw_cfg_lheader 
  *   [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional)
  *   [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST)
  *     ipfw_obj_tlv(IPFW_TLV_RULE_ENT) [ ip_fw_bcounter (optional) ip_fw_rule ]
  *   ] (optional)
  *   [ ipfw_obj_ctlv(IPFW_TLV_STATE_LIST) ipfw_obj_dyntlv x N ] (optional)
  * ]
  * * NOTE IPFW_TLV_STATE_LIST has the single valid field: objsize.
  * The rest (size, count) are set to zero and needs to be ignored.
  *
  * The required reply size is always stored in the header; if the supplied
  * buffer is smaller than that, ENOMEM is returned and no data is dumped.
  *
  * Returns 0 on success.
  */
 static int
 dump_config(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	struct dump_args da;
 	ipfw_cfg_lheader *hdr;
 	struct ip_fw *rule;
 	size_t sz, rnum;
 	uint32_t hdr_flags, *bmask;
 	int error, i;
 
 	hdr = (ipfw_cfg_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr));
 	if (hdr == NULL)
 		return (EINVAL);
 
 	error = 0;
 	bmask = NULL;
 	memset(&da, 0, sizeof(da));
 	/*
 	 * Allocate needed state.
 	 * Note we allocate 2xspace mask, for table & srv
 	 * The allocation may sleep (M_WAITOK) and is done before the
 	 * lock is taken.
 	 */
 	if (hdr->flags & (IPFW_CFG_GET_STATIC | IPFW_CFG_GET_STATES))
 		da.bmask = bmask = malloc(
 		    sizeof(uint32_t) * IPFW_TABLES_MAX * 2 / 32, M_TEMP,
 		    M_WAITOK | M_ZERO);
 	IPFW_UH_RLOCK(chain);
 
 	/*
 	 * STAGE 1: Determine size/count for objects in range.
 	 * Prepare used tables bitmask.
 	 */
 	sz = sizeof(ipfw_cfg_lheader);
 	/* Default range is the whole ruleset: da.b is 0 after memset(). */
 	da.e = chain->n_rules;
 
 	if (hdr->end_rule != 0) {
 		/* Handle custom range */
 		if ((rnum = hdr->start_rule) > IPFW_DEFAULT_RULE)
 			rnum = IPFW_DEFAULT_RULE;
 		da.b = ipfw_find_rule(chain, rnum, 0);
 		rnum = (hdr->end_rule < IPFW_DEFAULT_RULE) ?
 		    hdr->end_rule + 1: IPFW_DEFAULT_RULE;
 		da.e = ipfw_find_rule(chain, rnum, UINT32_MAX) + 1;
 	}
 
 	if (hdr->flags & IPFW_CFG_GET_STATIC) {
 		for (i = da.b; i < da.e; i++) {
 			rule = chain->map[i];
 			da.rsize += RULEUSIZE1(rule) + sizeof(ipfw_obj_tlv);
 			da.rcount++;
 			/* Update bitmask of used objects for given range */
 			mark_rule_objects(chain, rule, &da);
 		}
 		/* Add counters if requested */
 		if (hdr->flags & IPFW_CFG_GET_COUNTERS) {
 			da.rsize += sizeof(struct ip_fw_bcounter) * da.rcount;
 			da.rcounters = 1;
 		}
 		sz += da.rsize + sizeof(ipfw_obj_ctlv);
 	}
 
 	if (hdr->flags & IPFW_CFG_GET_STATES) {
 		/* 'i' receives a count that is added to the object total. */
 		sz += sizeof(ipfw_obj_ctlv) +
 		    ipfw_dyn_get_count(bmask, &i) * sizeof(ipfw_obj_dyntlv);
 		da.tcount += i;
 	}
 
 	if (da.tcount > 0)
 		sz += da.tcount * sizeof(ipfw_obj_ntlv) +
 		    sizeof(ipfw_obj_ctlv);
 
 	/*
 	 * Fill header anyway.
 	 * Note we have to save header fields to stable storage
 	 * buffer inside @sd can be flushed after dumping rules
 	 */
 	hdr->size = sz;
 	hdr->set_mask = ~V_set_disable;
 	hdr_flags = hdr->flags;
 	/* The header pointer must not be used past this point. */
 	hdr = NULL;
 
 	if (sd->valsize < sz) {
 		error = ENOMEM;
 		goto cleanup;
 	}
 
 	/* STAGE2: Store actual data */
 	if (da.tcount > 0) {
 		error = dump_named_objects(chain, &da, sd);
 		if (error != 0)
 			goto cleanup;
 	}
 
 	if (hdr_flags & IPFW_CFG_GET_STATIC) {
 		error = dump_static_rules(chain, &da, sd);
 		if (error != 0)
 			goto cleanup;
 	}
 
 	if (hdr_flags & IPFW_CFG_GET_STATES)
 		error = ipfw_dump_states(chain, sd);
 
 cleanup:
 	IPFW_UH_RUNLOCK(chain);
 
 	/*
 	 * Free through the local copy: da.bmask may have been advanced
 	 * by dump_named_objects().
 	 */
 	if (bmask != NULL)
 		free(bmask, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Generic sanity check for a named object @name: it has to be
  * non-empty and NUL-terminated within the size of ipfw_obj_ntlv.name.
  *
  * Returns 0 if the name is acceptable, EINVAL otherwise.
  */
 int
 ipfw_check_object_name_generic(const char *name)
 {
 	int nsize;
 
 	nsize = sizeof(((ipfw_obj_ntlv *)0)->name);
 	/* No terminator inside the field, or an empty string */
 	if (strnlen(name, nsize) == nsize || name[0] == '\0')
 		return (EINVAL);
 
 	return (0);
 }
 
 /*
  * Creates the objects referenced by a rule that do not exist yet.
  * @cmd is the start of the rule's opcodes; [@oib, @pidx) is the array of
  * object references, where entries with zero kidx still need an object.
  * @ti is scratch space filled in for every object being created.
  *
  * On failure all references recorded in the array are dropped.
  *
  * Return 0 on success.
  */
 int
 create_objects_compat(struct ip_fw_chain *ch, ipfw_insn *cmd,
     struct obj_idx *oib, struct obj_idx *pidx, struct tid_info *ti)
 {
 	struct opcode_obj_rewrite *rw;
 	struct obj_idx *cur;
 	uint16_t kidx;
 	int error;
 
 	/*
 	 * Compatibility stuff: do actual creation for non-existing,
 	 * but referenced objects.
 	 */
 	for (cur = oib; cur < pidx; cur++) {
 		/* Already resolved */
 		if (cur->kidx != 0)
 			continue;
 
 		ti->uidx = cur->uidx;
 		ti->type = cur->type;
 		ti->atype = 0;
 
 		rw = find_op_rw(cmd + cur->off, NULL, NULL);
 		KASSERT(rw != NULL, ("Unable to find handler for op %d",
 		    (cmd + cur->off)->opcode));
 
 		error = (rw->create_object != NULL) ?
 		    rw->create_object(ch, ti, &kidx) : EOPNOTSUPP;
 		if (error != 0) {
 			/*
 			 * Creation failed. Roll everything back:
 			 * drop all references acquired so far.
 			 */
 			IPFW_UH_WLOCK(ch);
 			unref_oib_objects(ch, cmd, oib, pidx);
 			IPFW_UH_WUNLOCK(ch);
 
 			return (error);
 		}
 
 		cur->kidx = kidx;
 	}
 
 	return (0);
 }
 
 /*
  * Compatibility function for old ipfw(8) binaries.
  * Rewrites table/nat kernel indices with userland ones.
  * Convert tables matching '/^\d+$/' to their atoi() value.
  * Use number 65535 for other tables.
  *
  * Opcodes without a rewriter, and opcodes whose object cannot be found,
  * are left untouched.
  *
  * Returns 0 on success, 2 if at least one object name could not be
  * represented as a number and was replaced with 65535.
  */
 static int
 set_legacy_obj_kidx(struct ip_fw_chain *ch, struct ip_fw_rule0 *rule)
 {
 	struct opcode_obj_rewrite *rw;
 	struct named_object *no;
 	ipfw_insn *cmd;
 	char *end;
 	long val;
 	int cmdlen, error, l;
 	uint16_t kidx, uidx;
 	uint8_t subtype;
 
 	error = 0;
 
 	l = rule->cmd_len;
 	cmd = rule->cmd;
 	cmdlen = 0;
 	for ( ;	l > 0 ; l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 
 		/* Check if is index in given opcode */
 		rw = find_op_rw(cmd, &kidx, &subtype);
 		if (rw == NULL)
 			continue;
 
 		/* Try to find referenced kernel object */
 		no = rw->find_bykidx(ch, kidx);
 		if (no == NULL)
 			continue;
 
 		/*
 		 * strtol() also accepts leading whitespace and a sign, so
 		 * require the name to start with a digit. Otherwise a name
 		 * such as "-5" would pass the range check below and wrap
 		 * around when stored in the 16-bit index.
 		 */
 		val = strtol(no->name, &end, 10);
 		if (no->name[0] >= '0' && no->name[0] <= '9' &&
 		    *end == '\0' && val < 65535) {
 			uidx = val;
 		} else {
 
 			/*
 			 * We are called via legacy opcode.
 			 * Save error and show table as fake number
 			 * not to make ipfw(8) hang.
 			 */
 			uidx = 65535;
 			error = 2;
 		}
 
 		rw->update(cmd, uidx);
 	}
 
 	return (error);
 }
 
 
 /*
  * Drops one reference from every object recorded with a non-zero kidx
  * in the array [@oib, @end). @cmd is the start of the rule's opcodes
  * the array offsets refer to.
  *
  * Used to rollback partially converted rule on error.
  * The UH write lock has to be held.
  */
 static void
 unref_oib_objects(struct ip_fw_chain *ch, ipfw_insn *cmd, struct obj_idx *oib,
     struct obj_idx *end)
 {
 	struct opcode_obj_rewrite *rw;
 	struct named_object *obj;
 	struct obj_idx *cur;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	cur = oib;
 	while (cur < end) {
 		if (cur->kidx != 0) {
 			rw = find_op_rw(cmd + cur->off, NULL, NULL);
 			KASSERT(rw != NULL,
 			    ("Unable to find handler for op %d",
 			    (cmd + cur->off)->opcode));
 
 			/* Locate the object by its kernel index and unref */
 			obj = rw->find_bykidx(ch, cur->kidx);
 			KASSERT(obj != NULL,
 			    ("Ref'd object %d disappeared", cur->kidx));
 			obj->refcnt--;
 		}
 		cur++;
 	}
 }
 
 /*
  * Releases the reference @rule holds on every named object used by its
  * opcodes. An object losing its last reference is handed to the
  * rewriter's destroy_object callback when one is provided.
  * Used at rule removal code, with the UH write lock held.
  */
 static void
 unref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule)
 {
 	struct opcode_obj_rewrite *rw;
 	struct named_object *obj;
 	ipfw_insn *insn;
 	int ilen, left;
 	uint16_t kidx;
 	uint8_t subtype;
 
 	IPFW_UH_WLOCK_ASSERT(ch);
 
 	insn = rule->cmd;
 	left = rule->cmd_len;
 	while (left > 0) {
 		ilen = F_LEN(insn);
 
 		rw = find_op_rw(insn, &kidx, &subtype);
 		if (rw != NULL) {
 			obj = rw->find_bykidx(ch, kidx);
 
 			KASSERT(obj != NULL, ("object id %d not found", kidx));
 			KASSERT(obj->subtype == subtype,
 			    ("wrong type %d (%d) for object id %d",
 			    obj->subtype, subtype, kidx));
 			KASSERT(obj->refcnt > 0,
 			    ("refcount for object %d is %d",
 			    kidx, obj->refcnt));
 
 			if (obj->refcnt != 1 || rw->destroy_object == NULL)
 				obj->refcnt--;
 			else
 				rw->destroy_object(ch, obj);
 		}
 
 		left -= ilen;
 		insn += ilen;
 	}
 }
 
 
 /*
  * Finds and references the object (if any) used by instruction @cmd.
  *
  * The userland index and type of the object are stored in @ti and @pidx.
  * If the object exists, its refcount is bumped and @cmd is rewritten to
  * carry the kernel index. If it does not, @unresolved is set to 1 so
  * that the caller may create it automatically.
  *
  * Returns non-zero value in case of error.
  */
 static int
 ref_opcode_object(struct ip_fw_chain *ch, ipfw_insn *cmd, struct tid_info *ti,
     struct obj_idx *pidx, int *unresolved)
 {
 	struct opcode_obj_rewrite *rw;
 	struct named_object *obj;
 	int error;
 
 	/* Nothing to do unless this opcode is subject to rewrite */
 	rw = find_op_rw(cmd, &ti->uidx, &ti->type);
 	if (rw == NULL)
 		return (0);
 
 	/* Remember what userland has asked for */
 	pidx->type = ti->type;
 	pidx->uidx = ti->uidx;
 
 	/* Look the kernel object up by name */
 	error = rw->find_byname(ch, ti, &obj);
 	if (error != 0)
 		return (error);
 
 	if (obj != NULL) {
 		/* Existing object has to be of the expected subtype */
 		if (obj->subtype != ti->type)
 			return (EINVAL);
 
 		/* Take a reference and switch opcode to kernel index */
 		obj->refcnt++;
 		rw->update(cmd, obj->kidx);
 	} else {
 		/* Report unresolved object for automatic creation */
 		*unresolved = 1;
 	}
 
 	return (0);
 }
 
 /*
  * References every object used by the opcodes of @rule and
  * auto-creates the ones that do not exist yet.
  * Unresolved references are collected in the @oib array together with
  * the offset of their opcode; ci->object_opcodes is set to their number.
  *
  * Returns 0 on success.
  */
 static int
 ref_rule_objects(struct ip_fw_chain *ch, struct ip_fw *rule,
     struct rule_check_info *ci, struct obj_idx *oib, struct tid_info *ti)
 {
 	struct obj_idx *pidx;
 	ipfw_insn *cmd;
 	int cmdlen, error, l, unresolved;
 
 	pidx = oib;
 	error = 0;
 	cmdlen = 0;
 
 	IPFW_UH_WLOCK(ch);
 
 	/* Bump the refcount of each object that already exists */
 	for (l = rule->cmd_len, cmd = rule->cmd; l > 0;
 	    l -= cmdlen, cmd += cmdlen) {
 		cmdlen = F_LEN(cmd);
 		unresolved = 0;
 
 		error = ref_opcode_object(ch, cmd, ti, pidx, &unresolved);
 		if (error != 0)
 			break;
 
 		/*
 		 * Compatibility stuff for old clients:
 		 * prepare to automatically create non-existing objects.
 		 */
 		if (unresolved != 0) {
 			pidx->off = rule->cmd_len - l;
 			pidx++;
 		}
 	}
 
 	/* On failure undo what has been done so far, still under lock */
 	if (error != 0)
 		unref_oib_objects(ch, rule->cmd, oib, pidx);
 	IPFW_UH_WUNLOCK(ch);
 	if (error != 0)
 		return (error);
 
 	/* Auto-create objects which were not found */
 	if (pidx != oib)
 		error = create_objects_compat(ch, rule->cmd, oib, pidx, ti);
 
 	/* Store the real number of collected entries */
 	ci->object_opcodes = (uint16_t)(pidx - oib);
 
 	return (error);
 }
 
 /*
  * Checks is opcode is referencing table of appropriate type.
  * Adds reference count for found table if true.
  * Rewrites user-supplied opcode values with kernel ones.
  *
  * Returns 0 on success and appropriate error code otherwise.
  */
 static int
 rewrite_rule_uidx(struct ip_fw_chain *chain, struct rule_check_info *ci)
 {
 	int error;
 	ipfw_insn *cmd;
 	uint8_t type;
 	struct obj_idx *p, *pidx_first, *pidx_last;
 	struct tid_info ti;
 
 	/*
 	 * Prepare an array for storing opcode indices.
 	 * Use stack allocation by default.
 	 */
 	if (ci->object_opcodes <= (sizeof(ci->obuf)/sizeof(ci->obuf[0]))) {
 		/* Stack */
 		pidx_first = ci->obuf;
 	} else
 		pidx_first = malloc(
 		    ci->object_opcodes * sizeof(struct obj_idx),
 		    M_IPFW, M_WAITOK | M_ZERO);
 
 	error = 0;
 	type = 0;
 	memset(&ti, 0, sizeof(ti));
 
 	/* Use set rule is assigned to. */
 	ti.set = ci->krule->set;
 	if (ci->ctlv != NULL) {
 		ti.tlvs = (void *)(ci->ctlv + 1);
 		ti.tlen = ci->ctlv->head.length - sizeof(ipfw_obj_ctlv);
 	}
 
 	/* Reference all used tables and other objects */
 	error = ref_rule_objects(chain, ci->krule, ci, pidx_first, &ti);
 	if (error != 0)
 		goto free;
 	/*
 	 * Note that ref_rule_objects() might have updated ci->object_opcodes
 	 * to reflect actual number of object opcodes.
 	 */
 
 	/* Perform rewrite of remaining opcodes */
 	p = pidx_first;
 	pidx_last = pidx_first + ci->object_opcodes;
 	for (p = pidx_first; p < pidx_last; p++) {
 		cmd = ci->krule->cmd + p->off;
 		update_opcode_kidx(cmd, p->kidx);
 	}
 
 free:
 	if (pidx_first != ci->obuf)
 		free(pidx_first, M_IPFW);
 
 	return (error);
 }
 
 /*
  * Adds one or more rules to ipfw @chain.
  * Data layout (version 0)(current):
  * Request:
  * [
  *   ip_fw3_opheader
  *   [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional *1)
  *   [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ] (*2) (*3)
  * ]
  * Reply:
  * [
  *   ip_fw3_opheader
  *   [ ipfw_obj_ctlv(IPFW_TLV_TBL_LIST) ipfw_obj_ntlv x N ] (optional)
  *   [ ipfw_obj_ctlv(IPFW_TLV_RULE_LIST) ip_fw x N ]
  * ]
  *
  * Rules in reply are modified to store their actual ruleset number.
  *
  * (*1) TLVs inside IPFW_TLV_TBL_LIST needs to be sorted ascending
  * according to their idx field and there has to be no duplicates.
  * (*2) Numbered rules inside IPFW_TLV_RULE_LIST needs to be sorted ascending.
  * (*3) Each ip_fw structure needs to be aligned to u64 boundary.
  *
  * Currently exactly one rule per request is accepted; any other count
  * yields ENOTSUP. Malformed requests are rejected with EINVAL (the
  * specific error of a failed rule check is not propagated).
  *
  * Returns 0 on success.
  */
 static int
 add_rules(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_ctlv *ctlv, *rtlv, *tstate;
 	ipfw_obj_ntlv *ntlv;
 	int clen, error, idx;
 	uint32_t count, read;
 	struct ip_fw_rule *r;
 	struct rule_check_info rci, *ci, *cbuf;
 	int i, rsize;
 
 	/* Request the whole buffer: the request is parsed in place. */
 	op3 = (ip_fw3_opheader *)ipfw_get_sopt_space(sd, sd->valsize);
 	ctlv = (ipfw_obj_ctlv *)(op3 + 1);
 
 	/* 'read' tracks how many bytes of the request have been consumed. */
 	read = sizeof(ip_fw3_opheader);
 	rtlv = NULL;
 	tstate = NULL;
 	cbuf = NULL;
 	memset(&rci, 0, sizeof(struct rule_check_info));
 
 	if (read + sizeof(*ctlv) > sd->valsize)
 		return (EINVAL);
 
 	if (ctlv->head.type == IPFW_TLV_TBLNAME_LIST) {
 		clen = ctlv->head.length;
 		/*
 		 * Check size and alignment.
 		 * The container starts 'read' bytes into the buffer, so
 		 * account for that offset: otherwise the name checks below
 		 * could run past the end of the request.
 		 */
 		if (clen + read > sd->valsize || clen < sizeof(*ctlv))
 			return (EINVAL);
 		if ((clen % sizeof(uint64_t)) != 0)
 			return (EINVAL);
 
 		/*
 		 * Some table names or other named objects.
 		 * Check for validness.
 		 */
 		count = (ctlv->head.length - sizeof(*ctlv)) / sizeof(*ntlv);
 		if (ctlv->count != count || ctlv->objsize != sizeof(*ntlv))
 			return (EINVAL);
 
 		/*
 		 * Check each TLV.
 		 * Ensure TLVs are sorted ascending and
 		 * there are no duplicates.
 		 */
 		idx = -1;
 		ntlv = (ipfw_obj_ntlv *)(ctlv + 1);
 		while (count > 0) {
 			if (ntlv->head.length != sizeof(ipfw_obj_ntlv))
 				return (EINVAL);
 
 			error = ipfw_check_object_name_generic(ntlv->name);
 			if (error != 0)
 				return (error);
 
 			if (ntlv->idx <= idx)
 				return (EINVAL);
 
 			idx = ntlv->idx;
 			count--;
 			ntlv++;
 		}
 
 		tstate = ctlv;
 		read += ctlv->head.length;
 		ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length);
 	}
 
 	if (read + sizeof(*ctlv) > sd->valsize)
 		return (EINVAL);
 
 	if (ctlv->head.type == IPFW_TLV_RULE_LIST) {
 		clen = ctlv->head.length;
 		if (clen + read > sd->valsize || clen < sizeof(*ctlv))
 			return (EINVAL);
 		if ((clen % sizeof(uint64_t)) != 0)
 			return (EINVAL);
 
 		/*
 		 * TODO: Permit adding multiple rules at once
 		 */
 		if (ctlv->count != 1)
 			return (ENOTSUP);
 
 		clen -= sizeof(*ctlv);
 
 		if (ctlv->count > clen / sizeof(struct ip_fw_rule))
 			return (EINVAL);
 
 		/* Allocate state for each rule or use stack */
 		if (ctlv->count == 1) {
 			memset(&rci, 0, sizeof(struct rule_check_info));
 			cbuf = &rci;
 		} else
 			cbuf = malloc(ctlv->count * sizeof(*ci), M_TEMP,
 			    M_WAITOK | M_ZERO);
 		ci = cbuf;
 
 		/*
 		 * Check each rule for validness.
 		 * Ensure numbered rules are sorted ascending
 		 * and properly aligned
 		 */
 		idx = 0;
 		r = (struct ip_fw_rule *)(ctlv + 1);
 		count = 0;
 		error = 0;
 		while (clen > 0) {
 			rsize = roundup2(RULESIZE(r), sizeof(uint64_t));
 			if (rsize > clen || ctlv->count <= count) {
 				error = EINVAL;
 				break;
 			}
 
 			ci->ctlv = tstate;
 			error = check_ipfw_rule1(r, rsize, ci);
 			if (error != 0)
 				break;
 
 			/* Check sorting */
 			if (r->rulenum != 0 && r->rulenum < idx) {
 				printf("rulenum %d idx %d\n", r->rulenum, idx);
 				error = EINVAL;
 				break;
 			}
 			idx = r->rulenum;
 
 			ci->urule = (caddr_t)r;
 
 			rsize = roundup2(rsize, sizeof(uint64_t));
 			clen -= rsize;
 			r = (struct ip_fw_rule *)((caddr_t)r + rsize);
 			count++;
 			ci++;
 		}
 
 		if (ctlv->count != count || error != 0) {
 			if (cbuf != &rci)
 				free(cbuf, M_TEMP);
 			return (EINVAL);
 		}
 
 		rtlv = ctlv;
 		read += ctlv->head.length;
 		ctlv = (ipfw_obj_ctlv *)((caddr_t)ctlv + ctlv->head.length);
 	}
 
 	/* The request must be consumed entirely and carry a rule list. */
 	if (read != sd->valsize || rtlv == NULL || rtlv->count == 0) {
 		if (cbuf != NULL && cbuf != &rci)
 			free(cbuf, M_TEMP);
 		return (EINVAL);
 	}
 
 	/*
 	 * Passed rules seems to be valid.
 	 * Allocate storage and try to add them to chain.
 	 */
 	for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++) {
 		clen = RULEKSIZE1((struct ip_fw_rule *)ci->urule);
 		ci->krule = ipfw_alloc_rule(chain, clen);
 		import_rule1(ci);
 	}
 
 	if ((error = commit_rules(chain, cbuf, rtlv->count)) != 0) {
 		/* Free allocated krules */
 		for (i = 0, ci = cbuf; i < rtlv->count; i++, ci++)
 			ipfw_free_rule(ci->krule);
 	}
 
 	if (cbuf != NULL && cbuf != &rci)
 		free(cbuf, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Lists all sopts currently registered.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_lheader ], size = ipfw_obj_lheader.size
  * Reply: [ ipfw_obj_lheader ipfw_sopt_info x N ]
  *
  * The header is always updated with the number of handlers and the size
  * needed for the full reply. If the size given in the request is too
  * small for that, ENOMEM is returned and no handler is listed.
  *
  * Returns 0 on success
  */
 static int
 dump_soptcodes(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	struct _ipfw_obj_lheader *olh;
 	ipfw_sopt_info *i;
 	struct ipfw_sopt_handler *sh;
 	uint32_t count, n, size;
 
 	olh = (struct _ipfw_obj_lheader *)ipfw_get_sopt_header(sd,sizeof(*olh));
 	if (olh == NULL)
 		return (EINVAL);
 	if (sd->valsize < olh->size)
 		return (EINVAL);
 
 	CTL3_LOCK();
 	count = ctl3_hsize;
 	size = count * sizeof(ipfw_sopt_info) + sizeof(ipfw_obj_lheader);
 
 	/* Fill in header regardless of buffer size */
 	olh->count = count;
 	olh->objsize = sizeof(ipfw_sopt_info);
 
 	if (size > olh->size) {
 		olh->size = size;
 		CTL3_UNLOCK();
 		return (ENOMEM);
 	}
 	olh->size = size;
 
 	/*
 	 * The handlers array holds 'count' entries indexed from zero.
 	 * Starting at 1 would skip the first handler and read one
 	 * element past the end of the array.
 	 */
 	for (n = 0; n < count; n++) {
 		i = (ipfw_sopt_info *)ipfw_get_sopt_space(sd, sizeof(*i));
 		KASSERT(i != NULL, ("previously checked buffer is not enough"));
 		sh = &ctl3_handlers[n];
 		i->opcode = sh->opcode;
 		i->version = sh->version;
 		i->refcnt = sh->refcnt;
 	}
 	CTL3_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Orders two opcode rewriters by their opcode value.
  * Serves as comparison callback for both qsort() and bsearch().
  *
  * Returns 0 if the opcodes match, -1 or 1 otherwise.
  */
 static int
 compare_opcodes(const void *_a, const void *_b)
 {
 	const struct opcode_obj_rewrite *lhs, *rhs;
 
 	lhs = (const struct opcode_obj_rewrite *)_a;
 	rhs = (const struct opcode_obj_rewrite *)_b;
 
 	if (lhs->opcode != rhs->opcode)
 		return ((lhs->opcode < rhs->opcode) ? -1 : 1);
 
 	return (0);
 }
 
 /*
  * XXX: Rewrite bsearch()
  */
 static int
 find_op_rw_range(uint16_t op, struct opcode_obj_rewrite **plo,
     struct opcode_obj_rewrite **phi)
 {
 	struct opcode_obj_rewrite *ctl3_max, *lo, *hi, h, *rw;
 
 	memset(&h, 0, sizeof(h));
 	h.opcode = op;
 
 	rw = (struct opcode_obj_rewrite *)bsearch(&h, ctl3_rewriters,
 	    ctl3_rsize, sizeof(h), compare_opcodes);
 	if (rw == NULL)
 		return (1);
 
 	/* Find the first element matching the same opcode */
 	lo = rw;
 	for ( ; lo > ctl3_rewriters && (lo - 1)->opcode == op; lo--)
 		;
 
 	/* Find the last element matching the same opcode */
 	hi = rw;
 	ctl3_max = ctl3_rewriters + ctl3_rsize;
 	for ( ; (hi + 1) < ctl3_max && (hi + 1)->opcode == op; hi++)
 		;
 
 	*plo = lo;
 	*phi = hi;
 
 	return (0);
 }
 
 /*
  * Finds the object rewriter responsible for instruction @cmd.
  * Every rewriter registered for the opcode is tried in turn; the first
  * one whose classifier accepts @cmd wins. The object index and subtype
  * it reports are stored in @puidx and @ptype unless those are NULL.
  *
  * Returns pointer to handler or NULL.
  */
 static struct opcode_obj_rewrite *
 find_op_rw(ipfw_insn *cmd, uint16_t *puidx, uint8_t *ptype)
 {
 	struct opcode_obj_rewrite *cand, *first, *last;
 	uint16_t uidx;
 	uint8_t subtype;
 
 	if (find_op_rw_range(cmd->opcode, &first, &last) != 0)
 		return (NULL);
 
 	for (cand = first; cand <= last; cand++) {
 		/* Non-zero means this rewriter does not handle @cmd */
 		if (cand->classifier(cmd, &uidx, &subtype) != 0)
 			continue;
 
 		if (puidx != NULL)
 			*puidx = uidx;
 		if (ptype != NULL)
 			*ptype = subtype;
 		return (cand);
 	}
 
 	return (NULL);
 }
 /*
  * Checks whether instruction @cmd refers to a named object and, if so,
  * stores the object index in @puidx.
  *
  * Returns 0 if a rewriter for @cmd is found, 1 otherwise.
  */
 int
 classify_opcode_kidx(ipfw_insn *cmd, uint16_t *puidx)
 {
 
 	return ((find_op_rw(cmd, puidx, NULL) == NULL) ? 1 : 0);
 }
 
 /*
  * Stores object index @idx in instruction @cmd using the update
  * callback of the rewriter responsible for it. A rewriter for @cmd
  * is expected to exist.
  */
 void
 update_opcode_kidx(ipfw_insn *cmd, uint16_t idx)
 {
 	struct opcode_obj_rewrite *handler;
 
 	handler = find_op_rw(cmd, NULL, NULL);
 	KASSERT(handler != NULL,
 	    ("No handler to update opcode %d", cmd->opcode));
 	handler->update(cmd, idx);
 }
 
 /*
  * Resets the global array of opcode object rewriters to the empty state.
  */
 void
 ipfw_init_obj_rewriter(void)
 {
 
 	ctl3_rewriters = NULL;
 	ctl3_rsize = 0;
 }
 
 /*
  * Frees the global array of opcode object rewriters (if allocated) and
  * resets it to the empty state.
  */
 void
 ipfw_destroy_obj_rewriter(void)
 {
 
 	if (ctl3_rewriters != NULL)
 		free(ctl3_rewriters, M_IPFW);
 	ctl3_rewriters = NULL;
 	ctl3_rsize = 0;
 }
 
 /*
  * Registers @count opcode object rewrite handlers from array @rw.
  * A new global array holding old and new handlers is built, sorted by
  * opcode and swapped in place of the old one.
  * Function may sleep.
  */
 void
 ipfw_add_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count)
 {
 	struct opcode_obj_rewrite *merged;
 	size_t total;
 
 	CTL3_LOCK();
 
 	/*
 	 * The lock is dropped around the allocation. If the array has
 	 * grown meanwhile and the new storage is too small, try again.
 	 */
 	for (;;) {
 		total = ctl3_rsize + count;
 		CTL3_UNLOCK();
 		merged = malloc(sizeof(*rw) * total, M_IPFW,
 		    M_WAITOK | M_ZERO);
 		CTL3_LOCK();
 		if (ctl3_rsize + count > total) {
 			/* Retry */
 			free(merged, M_IPFW);
 			continue;
 		}
 		break;
 	}
 
 	/* Old handlers first, new ones after them, then sort */
 	total = ctl3_rsize + count;
 	memcpy(merged, ctl3_rewriters, ctl3_rsize * sizeof(*rw));
 	memcpy(merged + ctl3_rsize, rw, count * sizeof(*rw));
 	qsort(merged, total, sizeof(*rw), compare_opcodes);
 
 	/* Release the old array and publish the new one */
 	if (ctl3_rewriters != NULL)
 		free(ctl3_rewriters, M_IPFW);
 	ctl3_rsize = total;
 	ctl3_rewriters = merged;
 
 	CTL3_UNLOCK();
 }
 
 /*
  * Unregisters @count object rewrite handlers listed in array @rw.
  * A registered handler matches when both its opcode and its classifier
  * are the same; handlers which are not found are skipped. The global
  * array is freed once it becomes empty.
  *
  * Always returns 0.
  */
 int
 ipfw_del_obj_rewriter(struct opcode_obj_rewrite *rw, size_t count)
 {
 	struct opcode_obj_rewrite *cur, *first, *last, *limit;
 	size_t tail;
 	int i;
 
 	CTL3_LOCK();
 
 	for (i = 0; i < count; i++) {
 		if (find_op_rw_range(rw[i].opcode, &first, &last) != 0)
 			continue;
 
 		for (cur = first; cur <= last; cur++) {
 			if (cur->classifier == rw[i].classifier) {
 				/* Close the gap left by removed element */
 				limit = ctl3_rewriters + ctl3_rsize;
 				tail = (limit - (cur + 1)) * sizeof(*cur);
 				memmove(cur, cur + 1, tail);
 				ctl3_rsize--;
 				break;
 			}
 		}
 	}
 
 	if (ctl3_rsize == 0) {
 		if (ctl3_rewriters != NULL)
 			free(ctl3_rewriters, M_IPFW);
 		ctl3_rewriters = NULL;
 	}
 
 	CTL3_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Object hash iteration callback: exports named object @no as
  * ipfw_obj_ntlv into the sockopt buffer passed via @arg.
  *
  * Returns 0 on success, ENOMEM if the buffer has no room left.
  */
 static int
 export_objhash_ntlv_internal(struct namedobj_instance *ni,
     struct named_object *no, void *arg)
 {
 	struct sockopt_data *sd;
 	ipfw_obj_ntlv *out;
 
 	sd = (struct sockopt_data *)arg;
 	out = (ipfw_obj_ntlv *)ipfw_get_sopt_space(sd, sizeof(*out));
 	if (out != NULL) {
 		ipfw_export_obj_ntlv(no, out);
 		return (0);
 	}
 	return (ENOMEM);
 }
 
 /*
  * Lists all service objects.
  * Data layout (v0)(current):
  * Request: [ ipfw_obj_lheader ] size = ipfw_obj_lheader.size
  * Reply: [ ipfw_obj_lheader [ ipfw_obj_ntlv x N ] (optional) ]
  *
  * The size needed for the full reply is always stored in the header;
  * ENOMEM is returned when the supplied buffer is smaller than that.
  *
  * Returns 0 on success
  */
 static int
 dump_srvobjects(struct ip_fw_chain *chain, ip_fw3_opheader *op3,
     struct sockopt_data *sd)
 {
 	ipfw_obj_lheader *hdr;
 	int error, nobjs;
 
 	hdr = (ipfw_obj_lheader *)ipfw_get_sopt_header(sd, sizeof(*hdr));
 	if (hdr == NULL)
 		return (EINVAL);
 
 	error = 0;
 	IPFW_UH_RLOCK(chain);
 	nobjs = ipfw_objhash_count(CHAIN_TO_SRV(chain));
 	hdr->size = sizeof(ipfw_obj_lheader) + nobjs * sizeof(ipfw_obj_ntlv);
 	if (sd->valsize >= hdr->size) {
 		hdr->count = nobjs;
 		hdr->objsize = sizeof(ipfw_obj_ntlv);
 		if (nobjs > 0)
 			ipfw_objhash_foreach(CHAIN_TO_SRV(chain),
 			    export_objhash_ntlv_internal, sd);
 	} else
 		error = ENOMEM;
 	IPFW_UH_RUNLOCK(chain);
 
 	return (error);
 }
 
 /*
  * Orders two sopt handlers by opcode, then version, then handler address.
  * Used as the comparator for both qsort() and bsearch().
  * A first argument with a NULL handler (the bsearch() key) matches any
  * entry carrying the same opcode and version.
  *
  * Returns a negative value, 0 or a positive value, 0 meaning a match.
  */
 static int
 compare_sh(const void *_a, const void *_b)
 {
 	const struct ipfw_sopt_handler *lhs = _a;
 	const struct ipfw_sopt_handler *rhs = _b;
 	uintptr_t lfn, rfn;
 
 	if (lhs->opcode != rhs->opcode)
 		return (lhs->opcode < rhs->opcode ? -1 : 1);
 
 	if (lhs->version != rhs->version)
 		return (lhs->version < rhs->version ? -1 : 1);
 
 	/* bsearch helper: a key without handler matches any handler */
 	if (lhs->handler == NULL)
 		return (0);
 
 	lfn = (uintptr_t)lhs->handler;
 	rfn = (uintptr_t)rhs->handler;
 	if (lfn == rfn)
 		return (0);
 
 	return (lfn < rfn ? -1 : 1);
 }
 
 /*
  * Finds sopt handler based on @code and @version.
  * When @handler is not NULL the entry must carry that handler as well;
  * a NULL @handler matches any handler (see compare_sh()).
  *
  * Returns pointer to handler or NULL.
  */
 static struct ipfw_sopt_handler *
 find_sh(uint16_t code, uint8_t version, sopt_handler_f *handler)
 {
 	struct ipfw_sopt_handler key;
 
 	/* Build a search key holding only the compared fields. */
 	memset(&key, 0, sizeof(key));
 	key.handler = handler;
 	key.version = version;
 	key.opcode = code;
 
 	return ((struct ipfw_sopt_handler *)bsearch(&key, ctl3_handlers,
 	    ctl3_hsize, sizeof(key), compare_sh));
 }
 
 /*
  * Looks up the handler registered for @opcode/@version, takes a reference
  * on it and copies its description to @psh.
  *
  * Returns 0 on success or EINVAL (after logging) if no handler exists.
  */
 static int
 find_ref_sh(uint16_t opcode, uint8_t version, struct ipfw_sopt_handler *psh)
 {
 	struct ipfw_sopt_handler *found;
 
 	CTL3_LOCK();
 	found = find_sh(opcode, version, NULL);
 	if (found != NULL) {
 		found->refcnt++;
 		ctl3_refct++;
 		/* Hand a private copy of the handler data to the caller */
 		*psh = *found;
 		CTL3_UNLOCK();
 		return (0);
 	}
 	CTL3_UNLOCK();
 
 	printf("ipfw: ipfw_ctl3 invalid option %d""v""%d\n",
 	    opcode, version);
 
 	return (EINVAL);
 }
 
 /*
  * Drops the reference taken by find_ref_sh() on the handler described
  * by @psh.  The handler is looked up again by opcode and version.
  */
 static void
 find_unref_sh(struct ipfw_sopt_handler *psh)
 {
 	struct ipfw_sopt_handler *live;
 
 	CTL3_LOCK();
 	live = find_sh(psh->opcode, psh->version, NULL);
 	KASSERT(live != NULL, ("ctl3 handler disappeared"));
 	ctl3_refct--;
 	live->refcnt--;
 	CTL3_UNLOCK();
 }
 
 /*
  * Initializes the ctl3 lock and registers the sockopt handlers
  * listed in scodes.
  */
 void
 ipfw_init_sopt_handler(void)
 {
 
 	CTL3_LOCK_INIT();
 	IPFW_ADD_SOPT_HANDLER(1, scodes);
 }
 
 /*
  * Unregisters the sockopt handlers listed in scodes and destroys
  * the ctl3 lock.
  */
 void
 ipfw_destroy_sopt_handler(void)
 {
 
 	IPFW_DEL_SOPT_HANDLER(1, scodes);
 	CTL3_LOCK_DESTROY();
 }
 
 /*
  * Adds one or more sockopt handlers to the global array.
  * Function may sleep.
  *
  * The new array is allocated with the ctl3 lock dropped, so the required
  * size is re-checked after the lock is re-taken and the allocation is
  * retried if the array has grown in the meantime.
  */
 void
 ipfw_add_sopt_handler(struct ipfw_sopt_handler *sh, size_t count)
 {
 	struct ipfw_sopt_handler *merged;
 	size_t total;
 
 	CTL3_LOCK();
 
 	for (;;) {
 		total = ctl3_hsize + count;
 		CTL3_UNLOCK();
 		merged = malloc(total * sizeof(*sh), M_IPFW, M_WAITOK | M_ZERO);
 		CTL3_LOCK();
 		if (total >= ctl3_hsize + count)
 			break;
 
 		/* Array grew while we slept: try again */
 		free(merged, M_IPFW);
 	}
 
 	/* Old entries first, new ones appended, then restore ordering */
 	total = ctl3_hsize + count;
 	memcpy(merged, ctl3_handlers, ctl3_hsize * sizeof(*sh));
 	memcpy(merged + ctl3_hsize, sh, count * sizeof(*sh));
 	qsort(merged, total, sizeof(*sh), compare_sh);
 
 	/* Publish the new array and release the previous one */
 	if (ctl3_handlers != NULL)
 		free(ctl3_handlers, M_IPFW);
 	ctl3_handlers = merged;
 	ctl3_hsize = total;
 	ctl3_gencnt++;
 
 	CTL3_UNLOCK();
 }
 
 /*
  * Removes one or more sockopt handlers from the global array.
  */
 int
 ipfw_del_sopt_handler(struct ipfw_sopt_handler *sh, size_t count)
 {
 	size_t sz;
 	struct ipfw_sopt_handler *tmp, *h;
 	int i;
 
 	CTL3_LOCK();
 
 	for (i = 0; i < count; i++) {
 		tmp = &sh[i];
 		h = find_sh(tmp->opcode, tmp->version, tmp->handler);
 		if (h == NULL)
 			continue;
 
 		sz = (ctl3_handlers + ctl3_hsize - (h + 1)) * sizeof(*h);
 		memmove(h, h + 1, sz);
 		ctl3_hsize--;
 	}
 
 	if (ctl3_hsize == 0) {
 		if (ctl3_handlers != NULL)
 			free(ctl3_handlers, M_IPFW);
 		ctl3_handlers = NULL;
 	}
 
 	ctl3_gencnt++;
 
 	CTL3_UNLOCK();
 
 	return (0);
 }
 
 /*
  * Writes data accumulated in @sd to sockopt buffer.
  * Zeroes internal @sd buffer.
  *
  * Data is copied out only for SOPT_GET requests.  Afterwards the window
  * bookkeeping (@sd->ktotal, koff, kavail) and the user pointer/size stored
  * in the sockopt are advanced past the flushed bytes.
  *
  * Returns 0 on success (including when there is nothing to flush) or the
  * copyout() error.
  */
 static int
 ipfw_flush_sopt_data(struct sockopt_data *sd)
 {
 	struct sockopt *so;
 	size_t used;
 	int rc;
 
 	used = sd->koff;
 	if (used == 0)
 		return (0);
 
 	so = sd->sopt;
 
 	if (so->sopt_dir == SOPT_GET) {
 		rc = copyout(sd->kbuf, so->sopt_val, used);
 		if (rc != 0)
 			return (rc);
 	}
 
 	memset(sd->kbuf, 0, sd->ksize);
 	sd->koff = 0;
 	sd->ktotal += used;
 	/* The next window is capped by what is left of the user buffer */
 	sd->kavail = (sd->ktotal + sd->ksize < sd->valsize) ?
 	    sd->ksize : sd->valsize - sd->ktotal;
 
 	/* Update sopt buffer data */
 	so->sopt_val = sd->sopt_val + sd->ktotal;
 	so->sopt_valsize = sd->ktotal;
 
 	return (0);
 }
 
 /*
  * Ensures that @sd buffer has contiguous @needed number of
  * bytes.  If the current window is too small it is flushed once
  * and the request is retried.
  *
  * Returns pointer to requested space or NULL.
  */
 caddr_t
 ipfw_get_sopt_space(struct sockopt_data *sd, size_t needed)
 {
 	caddr_t chunk;
 	int rc;
 
 	if (needed > sd->kavail) {
 		/*
 		 * Not enough room: flush what we have and look again.
 		 */
 		rc = ipfw_flush_sopt_data(sd);
 
 		if (rc != 0 || needed > sd->kavail)
 			return (NULL);
 	}
 
 	chunk = sd->kbuf + sd->koff;
 	sd->kavail -= needed;
 	sd->koff += needed;
 
 	return (chunk);
 }
 
 /*
  * Requests @needed contiguous bytes from @sd buffer.
  * Function is used to notify subsystem that we are
  * interested in first @needed bytes (request header)
  * and the rest of the buffer can be safely zeroed.
  *
  * Returns pointer to requested space or NULL.
  */
 caddr_t
 ipfw_get_sopt_header(struct sockopt_data *sd, size_t needed)
 {
 	caddr_t hdr;
 
 	hdr = ipfw_get_sopt_space(sd, needed);
 
 	/* Wipe everything that follows the header we just claimed. */
 	if (hdr != NULL && sd->kavail > 0)
 		memset(sd->kbuf + sd->koff, 0, sd->kavail);
 
 	return (hdr);
 }
 
 /*
  * New sockopt handler.
  */
 int
 ipfw_ctl3(struct sockopt *sopt)
 {
 	int error, locked;
 	size_t size, valsize;
 	struct ip_fw_chain *chain;
 	char xbuf[256];
 	struct sockopt_data sdata;
 	struct ipfw_sopt_handler h;
 	ip_fw3_opheader *op3 = NULL;
 
 	error = priv_check(sopt->sopt_td, PRIV_NETINET_IPFW);
 	if (error != 0)
 		return (error);
 
 	if (sopt->sopt_name != IP_FW3)
 		return (ipfw_ctl(sopt));
 
 	chain = &V_layer3_chain;
 	error = 0;
 
 	/* Save original valsize before it is altered via sooptcopyin() */
 	valsize = sopt->sopt_valsize;
 	memset(&sdata, 0, sizeof(sdata));
 	/* Read op3 header first to determine actual operation */
 	op3 = (ip_fw3_opheader *)xbuf;
 	error = sooptcopyin(sopt, op3, sizeof(*op3), sizeof(*op3));
 	if (error != 0)
 		return (error);
 	sopt->sopt_valsize = valsize;
 
 	/*
 	 * Find and reference command.
 	 */
 	error = find_ref_sh(op3->opcode, op3->version, &h);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Disallow modifications in really-really secure mode, but still allow
 	 * the logging counters to be reset.
 	 */
 	if ((h.dir & HDIR_SET) != 0 && h.opcode != IP_FW_XRESETLOG) {
 		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
 		if (error != 0) {
 			find_unref_sh(&h);
 			return (error);
 		}
 	}
 
 	/*
 	 * Fill in sockopt_data structure that may be useful for
 	 * IP_FW3 get requests.
 	 */
 	locked = 0;
 	if (valsize <= sizeof(xbuf)) {
 		/* use on-stack buffer */
 		sdata.kbuf = xbuf;
 		sdata.ksize = sizeof(xbuf);
 		sdata.kavail = valsize;
 	} else {
 
 		/*
 		 * Determine opcode type/buffer size:
 		 * allocate sliding-window buf for data export or
 		 * contiguous buffer for special ops.
 		 */
 		if ((h.dir & HDIR_SET) != 0) {
 			/* Set request. Allocate contigous buffer. */
 			if (valsize > CTL3_LARGEBUF) {
 				find_unref_sh(&h);
 				return (EFBIG);
 			}
 
 			size = valsize;
 		} else {
 			/* Get request. Allocate sliding window buffer */
 			size = (valsize<CTL3_SMALLBUF) ? valsize:CTL3_SMALLBUF;
 
 			if (size < valsize) {
 				/* We have to wire user buffer */
 				error = vslock(sopt->sopt_val, valsize);
 				if (error != 0)
 					return (error);
 				locked = 1;
 			}
 		}
 
 		sdata.kbuf = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
 		sdata.ksize = size;
 		sdata.kavail = size;
 	}
 
 	sdata.sopt = sopt;
 	sdata.sopt_val = sopt->sopt_val;
 	sdata.valsize = valsize;
 
 	/*
 	 * Copy either all request (if valsize < bsize_max)
 	 * or first bsize_max bytes to guarantee most consumers
 	 * that all necessary data has been copied).
 	 * Anyway, copy not less than sizeof(ip_fw3_opheader).
 	 */
 	if ((error = sooptcopyin(sopt, sdata.kbuf, sdata.ksize,
 	    sizeof(ip_fw3_opheader))) != 0)
 		return (error);
 	op3 = (ip_fw3_opheader *)sdata.kbuf;
 
 	/* Finally, run handler */
 	error = h.handler(chain, op3, &sdata);
 	find_unref_sh(&h);
 
 	/* Flush state and free buffers */
 	if (error == 0)
 		error = ipfw_flush_sopt_data(&sdata);
 	else
 		ipfw_flush_sopt_data(&sdata);
 
 	if (locked != 0)
 		vsunlock(sdata.sopt_val, valsize);
 
 	/* Restore original pointer and set number of bytes written */
 	sopt->sopt_val = sdata.sopt_val;
 	sopt->sopt_valsize = sdata.ktotal;
 	if (sdata.kbuf != xbuf)
 		free(sdata.kbuf, M_TEMP);
 
 	return (error);
 }
 
 /**
  * {set|get}sockopt parser.
  *
  * Handles the legacy socket options: rule get/flush/add/delete, counter
  * zero/resetlog, the legacy table operations and the NAT operations
  * (the latter are forwarded to the ipfw_nat hooks when that module is
  * loaded).  Unknown options are logged and rejected with EINVAL.
  *
  * Returns 0 on success or an errno value.
  */
 int
 ipfw_ctl(struct sockopt *sopt)
 {
 #define	RULE_MAXSIZE	(512*sizeof(u_int32_t))
 	int error;
 	size_t size, valsize;
 	struct ip_fw *buf;
 	struct ip_fw_rule0 *rule;
 	struct ip_fw_chain *chain;
 	u_int32_t rulenum[2];
 	uint32_t opt;
 	struct rule_check_info ci;
 	IPFW_RLOCK_TRACKER;
 
 	chain = &V_layer3_chain;
 	error = 0;
 
 	/* Save original valsize before it is altered via sooptcopyin() */
 	valsize = sopt->sopt_valsize;
 	opt = sopt->sopt_name;
 
 	/*
 	 * Disallow modifications in really-really secure mode, but still allow
 	 * the logging counters to be reset.
 	 */
 	if (opt == IP_FW_ADD ||
 	    (sopt->sopt_dir == SOPT_SET && opt != IP_FW_RESETLOG)) {
 		error = securelevel_ge(sopt->sopt_td->td_ucred, 3);
 		if (error != 0)
 			return (error);
 	}
 
 	switch (opt) {
 	case IP_FW_GET:
 		/*
 		 * pass up a copy of the current rules. Static rules
 		 * come first (the last of which has number IPFW_DEFAULT_RULE),
 		 * followed by a possibly empty list of dynamic rule.
 		 * The last dynamic rule has NULL in the "next" field.
 		 *
 		 * Note that the calculated size is used to bound the
 		 * amount of data returned to the user.  The rule set may
 		 * change between calculating the size and returning the
 		 * data in which case we'll just return what fits.
 		 */
 		for (;;) {
 			int len = 0, want;
 
 			size = chain->static_len;
 			size += ipfw_dyn_len();
 			if (size >= sopt->sopt_valsize)
 				break;
 			buf = malloc(size, M_TEMP, M_WAITOK | M_ZERO);
 			IPFW_UH_RLOCK(chain);
 			/* check again how much space we need */
 			want = chain->static_len + ipfw_dyn_len();
 			if (size >= want)
 				len = ipfw_getrules(chain, buf, size);
 			IPFW_UH_RUNLOCK(chain);
 			if (size >= want)
 				error = sooptcopyout(sopt, buf, len);
 			free(buf, M_TEMP);
 			if (size >= want)
 				break;
 		}
 		break;
 
 	case IP_FW_FLUSH:
 		/* locking is done within del_entry() */
 		error = del_entry(chain, 0); /* special case, rule=0, cmd=0 means all */
 		break;
 
 	case IP_FW_ADD:
 		rule = malloc(RULE_MAXSIZE, M_TEMP, M_WAITOK);
 		error = sooptcopyin(sopt, rule, RULE_MAXSIZE,
 			sizeof(struct ip_fw7) );
 		if (error != 0) {
 			/*
 			 * The buffer was not filled in: do not inspect it
 			 * and do not let a later step overwrite the error.
 			 */
 			free(rule, M_TEMP);
 			break;
 		}
 
 		memset(&ci, 0, sizeof(struct rule_check_info));
 
 		/*
 		 * If the size of commands equals RULESIZE7 then we assume
 		 * a FreeBSD7.2 binary is talking to us (set is7=1).
 		 * is7 is persistent so the next 'ipfw list' command
 		 * will use this format.
 		 * NOTE: If wrong version is guessed (this can happen if
 		 *       the first ipfw command is 'ipfw [pipe] list')
 		 *       the ipfw binary may crash or loop infinitely...
 		 */
 		size = sopt->sopt_valsize;
 		if (size == RULESIZE7(rule)) {
 		    is7 = 1;
 		    error = convert_rule_to_8(rule);
 		    if (error) {
 			free(rule, M_TEMP);
 			return error;
 		    }
 		    size = RULESIZE(rule);
 		} else
 		    is7 = 0;
 		if (error == 0)
 			error = check_ipfw_rule0(rule, size, &ci);
 		if (error == 0) {
 			/* locking is done within add_rule() */
 			struct ip_fw *krule;
 			krule = ipfw_alloc_rule(chain, RULEKSIZE0(rule));
 			ci.urule = (caddr_t)rule;
 			ci.krule = krule;
 			import_rule0(&ci);
 			error = commit_rules(chain, &ci, 1);
 			if (error != 0)
 				ipfw_free_rule(ci.krule);
 			else if (sopt->sopt_dir == SOPT_GET) {
 				if (is7) {
 					error = convert_rule_to_7(rule);
 					size = RULESIZE7(rule);
 					if (error) {
 						free(rule, M_TEMP);
 						return error;
 					}
 				}
 				error = sooptcopyout(sopt, rule, size);
 			}
 		}
 		free(rule, M_TEMP);
 		break;
 
 	case IP_FW_DEL:
 		/*
 		 * IP_FW_DEL is used for deleting single rules or sets,
 		 * and (ab)used to atomically manipulate sets. Argument size
 		 * is used to distinguish between the two:
 		 *    sizeof(u_int32_t)
 		 *	delete single rule or set of rules,
 		 *	or reassign rules (or sets) to a different set.
 		 *    2*sizeof(u_int32_t)
 		 *	atomic disable/enable sets.
 		 *	first u_int32_t contains sets to be disabled,
 		 *	second u_int32_t contains sets to be enabled.
 		 */
 		error = sooptcopyin(sopt, rulenum,
 			2*sizeof(u_int32_t), sizeof(u_int32_t));
 		if (error)
 			break;
 		size = sopt->sopt_valsize;
 		if (size == sizeof(u_int32_t) && rulenum[0] != 0) {
 			/* delete or reassign, locking done in del_entry() */
 			error = del_entry(chain, rulenum[0]);
 		} else if (size == 2*sizeof(u_int32_t)) { /* set enable/disable */
 			IPFW_UH_WLOCK(chain);
 			/* Unsigned shift: avoids shifting into the sign bit */
 			V_set_disable =
 			    (V_set_disable | rulenum[0]) & ~rulenum[1] &
 			    ~(1U<<RESVD_SET); /* set RESVD_SET always enabled */
 			IPFW_UH_WUNLOCK(chain);
 		} else
 			error = EINVAL;
 		break;
 
 	case IP_FW_ZERO:
 	case IP_FW_RESETLOG: /* argument is a u_int32_t, the rule number */
 		rulenum[0] = 0;
 		if (sopt->sopt_val != 0) {
 		    error = sooptcopyin(sopt, rulenum,
 			    sizeof(u_int32_t), sizeof(u_int32_t));
 		    if (error)
 			break;
 		}
 		error = zero_entry(chain, rulenum[0],
 			sopt->sopt_name == IP_FW_RESETLOG);
 		break;
 
 	/*--- TABLE opcodes ---*/
 	case IP_FW_TABLE_ADD:
 	case IP_FW_TABLE_DEL:
 		{
 			ipfw_table_entry ent;
 			struct tentry_info tei;
 			struct tid_info ti;
 			struct table_value v;
 
 			error = sooptcopyin(sopt, &ent,
 			    sizeof(ent), sizeof(ent));
 			if (error)
 				break;
 
 			/* Translate the legacy entry to the current format */
 			memset(&tei, 0, sizeof(tei));
 			tei.paddr = &ent.addr;
 			tei.subtype = AF_INET;
 			tei.masklen = ent.masklen;
 			ipfw_import_table_value_legacy(ent.value, &v);
 			tei.pvalue = &v;
 			memset(&ti, 0, sizeof(ti));
 			ti.uidx = ent.tbl;
 			ti.type = IPFW_TABLE_CIDR;
 
 			error = (opt == IP_FW_TABLE_ADD) ?
 			    add_table_entry(chain, &ti, &tei, 0, 1) :
 			    del_table_entry(chain, &ti, &tei, 0, 1);
 		}
 		break;
 
 
 	case IP_FW_TABLE_FLUSH:
 		{
 			u_int16_t tbl;
 			struct tid_info ti;
 
 			error = sooptcopyin(sopt, &tbl,
 			    sizeof(tbl), sizeof(tbl));
 			if (error)
 				break;
 			memset(&ti, 0, sizeof(ti));
 			ti.uidx = tbl;
 			error = flush_table(chain, &ti);
 		}
 		break;
 
 	case IP_FW_TABLE_GETSIZE:
 		{
 			u_int32_t tbl, cnt;
 			struct tid_info ti;
 
 			if ((error = sooptcopyin(sopt, &tbl, sizeof(tbl),
 			    sizeof(tbl))))
 				break;
 			memset(&ti, 0, sizeof(ti));
 			ti.uidx = tbl;
 			IPFW_RLOCK(chain);
 			error = ipfw_count_table(chain, &ti, &cnt);
 			IPFW_RUNLOCK(chain);
 			if (error)
 				break;
 			error = sooptcopyout(sopt, &cnt, sizeof(cnt));
 		}
 		break;
 
 	case IP_FW_TABLE_LIST:
 		{
 			ipfw_table *tbl;
 			struct tid_info ti;
 
 			if (sopt->sopt_valsize < sizeof(*tbl)) {
 				error = EINVAL;
 				break;
 			}
 			size = sopt->sopt_valsize;
 			tbl = malloc(size, M_TEMP, M_WAITOK);
 			error = sooptcopyin(sopt, tbl, size, sizeof(*tbl));
 			if (error) {
 				free(tbl, M_TEMP);
 				break;
 			}
 			/* Number of entries that fit after the header */
 			tbl->size = (size - sizeof(*tbl)) /
 			    sizeof(ipfw_table_entry);
 			memset(&ti, 0, sizeof(ti));
 			ti.uidx = tbl->tbl;
 			IPFW_RLOCK(chain);
 			error = ipfw_dump_table_legacy(chain, &ti, tbl);
 			IPFW_RUNLOCK(chain);
 			if (error) {
 				free(tbl, M_TEMP);
 				break;
 			}
 			error = sooptcopyout(sopt, tbl, size);
 			free(tbl, M_TEMP);
 		}
 		break;
 
 	/*--- NAT operations are protected by the IPFW_LOCK ---*/
 	case IP_FW_NAT_CFG:
 		if (IPFW_NAT_LOADED)
 			error = ipfw_nat_cfg_ptr(sopt);
 		else {
 			printf("IP_FW_NAT_CFG: %s\n",
 			    "ipfw_nat not present, please load it");
 			error = EINVAL;
 		}
 		break;
 
 	case IP_FW_NAT_DEL:
 		if (IPFW_NAT_LOADED)
 			error = ipfw_nat_del_ptr(sopt);
 		else {
 			printf("IP_FW_NAT_DEL: %s\n",
 			    "ipfw_nat not present, please load it");
 			error = EINVAL;
 		}
 		break;
 
 	case IP_FW_NAT_GET_CONFIG:
 		if (IPFW_NAT_LOADED)
 			error = ipfw_nat_get_cfg_ptr(sopt);
 		else {
 			printf("IP_FW_NAT_GET_CFG: %s\n",
 			    "ipfw_nat not present, please load it");
 			error = EINVAL;
 		}
 		break;
 
 	case IP_FW_NAT_GET_LOG:
 		if (IPFW_NAT_LOADED)
 			error = ipfw_nat_get_log_ptr(sopt);
 		else {
 			printf("IP_FW_NAT_GET_LOG: %s\n",
 			    "ipfw_nat not present, please load it");
 			error = EINVAL;
 		}
 		break;
 
 	default:
 		printf("ipfw: ipfw_ctl invalid option %d\n", sopt->sopt_name);
 		error = EINVAL;
 	}
 
 	return (error);
 #undef RULE_MAXSIZE
 }
 /*
  * Size of the scratch copy used by the 7.2 <==> 8.0 rule converters below.
  * Note this redefinition (256 words) is smaller than the 512-word value
  * ipfw_ctl() uses for its own rule buffer.
  */
 #define	RULE_MAXSIZE	(256*sizeof(u_int32_t))
 
 /* Functions to convert rules 7.2 <==> 8.0 */
 /*
  * Converts @rule in place from the version 8 layout (struct ip_fw_rule0)
  * to the 7.2 layout (struct ip_fw7).  A scratch copy of the original is
  * taken first so that fields and instructions can be rewritten over the
  * same memory.  Opcodes above O_NAT are decremented because O_REASS does
  * not exist in 7.2.
  *
  * Returns 0 on success, ENOMEM if the scratch copy cannot be allocated
  * and EINVAL if an instruction has zero length or runs past cmd_len.
  * The scratch copy is released on every path.
  */
 static int
 convert_rule_to_7(struct ip_fw_rule0 *rule)
 {
 	/* Used to modify original rule */
 	struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
 	/* copy of original rule, version 8 */
 	struct ip_fw_rule0 *tmp;
 
 	/* Used to copy commands */
 	ipfw_insn *ccmd, *dst;
 	int ll = 0, ccmdlen = 0;
 	int error = 0;
 
 	tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
 	if (tmp == NULL)
 		return (ENOMEM);
 	bcopy(rule, tmp, RULE_MAXSIZE);
 
 	/* Copy fields (_pad is not copied) */
 	rule7->set = tmp->set;
 	rule7->rulenum = tmp->rulenum;
 	rule7->cmd_len = tmp->cmd_len;
 	rule7->act_ofs = tmp->act_ofs;
 	rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
 	rule7->pcnt = tmp->pcnt;
 	rule7->bcnt = tmp->bcnt;
 	rule7->timestamp = tmp->timestamp;
 
 	/* Copy commands */
 	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
 			ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
 		ccmdlen = F_LEN(ccmd);
 
 		/*
 		 * Validate before copying: a zero length would never
 		 * terminate the loop and an oversized one would copy
 		 * past the declared end of the instruction stream.
 		 */
 		if (ccmdlen == 0 || ccmdlen > ll) {
 			printf("ipfw: opcode %d size truncated\n",
 				ccmd->opcode);
 			error = EINVAL;
 			break;
 		}
 
 		bcopy(ccmd, dst, ccmdlen * sizeof(uint32_t));
 
 		if (dst->opcode > O_NAT)
 			/* O_REASS doesn't exists in 7.2 version, so
 			 * decrement opcode if it is after O_REASS
 			 */
 			dst->opcode--;
 	}
 	free(tmp, M_TEMP);
 
 	return (error);
 }
 
 /*
  * Converts @rule in place from the 7.2 layout (struct ip_fw7) to the
  * version 8 layout (struct ip_fw_rule0).  A scratch copy of the original
  * is taken first; instructions are rewritten from it, then the header
  * fields.  Opcodes above O_NAT are incremented because O_REASS does not
  * exist in 7.2.
  *
  * Returns 0 on success, ENOMEM if the scratch copy cannot be allocated
  * and EINVAL if an instruction has zero length or runs past cmd_len (the
  * header fields are not converted in that case).  The scratch copy is
  * released on every path.
  */
 static int
 convert_rule_to_8(struct ip_fw_rule0 *rule)
 {
 	/* Used to modify original rule */
 	struct ip_fw7 *rule7 = (struct ip_fw7 *) rule;
 	/* Copy of original rule */
 	struct ip_fw7 *tmp;
 
 	/* Used to copy commands */
 	ipfw_insn *ccmd, *dst;
 	int ll = 0, ccmdlen = 0;
 	int error = 0;
 
 	tmp = malloc(RULE_MAXSIZE, M_TEMP, M_NOWAIT | M_ZERO);
 	if (tmp == NULL)
 		return (ENOMEM);
 
 	bcopy(rule7, tmp, RULE_MAXSIZE);
 
 	for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule->cmd ;
 			ll > 0 ; ll -= ccmdlen, ccmd += ccmdlen, dst += ccmdlen) {
 		ccmdlen = F_LEN(ccmd);
 
 		/*
 		 * Validate before copying: a zero length would never
 		 * terminate the loop and an oversized one would copy
 		 * past the declared end of the instruction stream.
 		 */
 		if (ccmdlen == 0 || ccmdlen > ll) {
 			printf("ipfw: opcode %d size truncated\n",
 			    ccmd->opcode);
 			error = EINVAL;
 			goto out;
 		}
 
 		bcopy(ccmd, dst, ccmdlen * sizeof(uint32_t));
 
 		if (dst->opcode > O_NAT)
 			/* O_REASS doesn't exists in 7.2 version, so
 			 * increment opcode if it is after O_REASS
 			 */
 			dst->opcode++;
 	}
 
 	rule->_pad = tmp->_pad;
 	rule->set = tmp->set;
 	rule->rulenum = tmp->rulenum;
 	rule->cmd_len = tmp->cmd_len;
 	rule->act_ofs = tmp->act_ofs;
 	rule->next_rule = (struct ip_fw *)tmp->next_rule;
 	rule->id = 0; /* XXX see if is ok = 0 */
 	rule->pcnt = tmp->pcnt;
 	rule->bcnt = tmp->bcnt;
 	rule->timestamp = tmp->timestamp;
 
 out:
 	free(tmp, M_TEMP);
 	return (error);
 }
 
 /*
  * Named object api
  *
  */
 
 /*
  * Sets up the per-chain service object storage: a named object hash and
  * a zeroed array of IPFW_OBJECTS_DEFAULT state pointers.
  */
 void
 ipfw_init_srv(struct ip_fw_chain *ch)
 {
 	size_t statesz;
 
 	statesz = sizeof(void *) * IPFW_OBJECTS_DEFAULT;
 
 	ch->srvmap = ipfw_objhash_create(IPFW_OBJECTS_DEFAULT);
 	ch->srvstate = malloc(statesz, M_IPFW, M_WAITOK | M_ZERO);
 }
 
 /*
  * Releases what ipfw_init_srv() allocated for @ch: first the state
  * pointer array, then the named object hash.
  */
 void
 ipfw_destroy_srv(struct ip_fw_chain *ch)
 {
 
 	/* State array first ... */
 	free(ch->srvstate, M_IPFW);
 	/* ... then the hash instance itself. */
 	ipfw_objhash_destroy(ch->srvmap);
 }
 
 /*
  * Allocate new bitmask which can be used to enlarge/shrink
  * named instance index.
  *
  * The bitmask holds @items bits for each of the IPFW_MAX_SETS sets; a set
  * bit means "free".  Bit 0 of the first word is cleared so that index 0
  * is never handed out.  The bitmask is stored in @idx and the number of
  * u_long blocks per set in @pblocks.
  */
 void
 ipfw_objhash_bitmap_alloc(uint32_t items, void **idx, int *pblocks)
 {
 	u_long *bitmap;
 	size_t total;
 
 	KASSERT((items % BLOCK_ITEMS) == 0,
 	   ("bitmask size needs to power of 2 and greater or equal to %zu",
 	    BLOCK_ITEMS));
 
 	/* Bytes per set, times the number of sets */
 	total = items / 8;
 	total *= IPFW_MAX_SETS;
 
 	bitmap = malloc(total, M_IPFW, M_WAITOK);
 	/* Mark all as free */
 	memset(bitmap, 0xFF, total);
 	bitmap[0] &= ~(u_long)1; /* Skip index 0 */
 
 	*idx = bitmap;
 	*pblocks = items / BLOCK_ITEMS;
 }
 
 /*
  * Copy current bitmask index to new one.
  *
  * For every set the blocks currently used by @ni are copied to the start
  * of that set's region in the new bitmask (*@idx, *@blocks blocks per
  * set).
  */
 void
 ipfw_objhash_bitmap_merge(struct namedobj_instance *ni, void **idx, int *blocks)
 {
 	u_long *dst, *src;
 	size_t chunk;
 	int dst_stride, src_stride, set;
 
 	src = ni->idx_mask;
 	src_stride = ni->max_blocks;
 	dst = *idx;
 	dst_stride = *blocks;
 	chunk = src_stride * sizeof(u_long);
 
 	for (set = 0; set < IPFW_MAX_SETS; set++) {
 		memcpy(dst, src, chunk);
 		dst += dst_stride;
 		src += src_stride;
 	}
 }
 
 /*
  * Swaps current @ni index with new one.
  *
  * On return @idx and @blocks hold the bitmask and block count that @ni
  * used before the call.
  */
 void
 ipfw_objhash_bitmap_swap(struct namedobj_instance *ni, void **idx, int *blocks)
 {
 	u_long *prev_idx = ni->idx_mask;
 	int prev_blocks = ni->max_blocks;
 
 	/* Install the new index */
 	ni->idx_mask = *idx;
 	ni->max_blocks = *blocks;
 
 	/* Hand the previous one back to the caller */
 	*idx = prev_idx;
 	*blocks = prev_blocks;
 }
 
 /*
  * Releases a bitmask index @idx.  @blocks is not used.
  */
 void
 ipfw_objhash_bitmap_free(void *idx, int blocks)
 {
 
 	(void)blocks;
 	free(idx, M_IPFW);
 }
 
 /*
  * Creates named hash instance.
  * Must be called without holding any locks.
  *
  * The instance header and both bucket arrays (by name and by value) live
  * in a single allocation; the index bitmask for @items indices is
  * allocated separately.
  *
  * Return pointer to new instance.
  */
 struct namedobj_instance *
 ipfw_objhash_create(uint32_t items)
 {
 	struct namedobj_instance *inst;
 	size_t total;
 	int b;
 
 	/* Header followed by the name buckets and the value buckets */
 	total = sizeof(struct namedobj_instance);
 	total += sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE;
 	total += sizeof(struct namedobjects_head) * NAMEDOBJ_HASH_SIZE;
 
 	inst = malloc(total, M_IPFW, M_WAITOK | M_ZERO);
 	inst->nn_size = NAMEDOBJ_HASH_SIZE;
 	inst->nv_size = NAMEDOBJ_HASH_SIZE;
 
 	inst->names = (struct namedobjects_head *)(inst + 1);
 	inst->values = &inst->names[inst->nn_size];
 
 	for (b = 0; b < inst->nn_size; b++)
 		TAILQ_INIT(&inst->names[b]);
 
 	for (b = 0; b < inst->nv_size; b++)
 		TAILQ_INIT(&inst->values[b]);
 
 	/* Set default hashing/comparison functions */
 	inst->hash_f = objhash_hash_name;
 	inst->cmp_f = objhash_cmp_name;
 
 	/* Allocate bitmask separately due to possible resize */
 	ipfw_objhash_bitmap_alloc(items, (void*)&inst->idx_mask,
 	    &inst->max_blocks);
 
 	return (inst);
 }
 
 /*
  * Frees a named hash instance together with its index bitmask.
  */
 void
 ipfw_objhash_destroy(struct namedobj_instance *ni)
 {
 
 	/* The bitmask is a separate allocation: release it first. */
 	free(ni->idx_mask, M_IPFW);
 	free(ni, M_IPFW);
 }
 
 /*
  * Replaces the hashing and comparison callbacks used for name lookups
  * in @ni.
  */
 void
 ipfw_objhash_set_funcs(struct namedobj_instance *ni, objhash_hash_f *hash_f,
     objhash_cmp_f *cmp_f)
 {
 
 	ni->cmp_f = cmp_f;
 	ni->hash_f = hash_f;
 }
 
 /*
  * Default hash callback: FNV-1 32-bit hash of the string @name.
  * @ni and @set do not take part in the hash.
  */
 static uint32_t
 objhash_hash_name(struct namedobj_instance *ni, const void *name, uint32_t set)
 {
 	const char *str = name;
 
 	return (fnv_32_str(str, FNV1_32_INIT));
 }
 
 /*
  * Default comparison callback.
  *
  * Returns 0 when @no has both the given @name and the given @set,
  * 1 otherwise.
  */
 static int
 objhash_cmp_name(struct named_object *no, const void *name, uint32_t set)
 {
 
 	if (no->set != set)
 		return (1);
 
 	return (strcmp(no->name, (const char *)name) == 0 ? 0 : 1);
 }
 
 /*
  * Maps kernel index @val to a bucket of the by-value hash of @ni.
  *
  * The modulus is the bucket count itself, matching how the by-name hash
  * is reduced with nn_size: using (nv_size - 1) left the last bucket
  * permanently unused and divided by zero for a one-bucket table.
  */
 static uint32_t
 objhash_hash_idx(struct namedobj_instance *ni, uint32_t val)
 {
 
 	return (val % ni->nv_size);
 }
 
 /*
  * Finds a named object in @ni by @name within @set using the instance's
  * hash and comparison callbacks.
  *
  * Returns pointer to the object or NULL.
  */
 struct named_object *
 ipfw_objhash_lookup_name(struct namedobj_instance *ni, uint32_t set, char *name)
 {
 	struct named_object *cur;
 	uint32_t bucket;
 
 	bucket = ni->hash_f(ni, name, set) % ni->nn_size;
 
 	TAILQ_FOREACH(cur, &ni->names[bucket], nn_next) {
 		if (ni->cmp_f(cur, name, set) != 0)
 			continue;
 		return (cur);
 	}
 
 	return (NULL);
 }
 
 /*
  * Find name TLV by @uidx in the @len bytes starting at @tlvs.
  * Check @tlvs for valid data inside: every record must be exactly
  * sizeof(ipfw_obj_ntlv) long and must fit entirely inside the buffer,
  * and the name of the matching record must pass
  * ipfw_check_object_name_generic().
  *
  * Returns pointer to found TLV or NULL.
  */
 ipfw_obj_ntlv *
 ipfw_find_name_tlv_type(void *tlvs, int len, uint16_t uidx, uint32_t etlv)
 {
 	ipfw_obj_ntlv *ntlv;
 	uintptr_t pa, pe;
 	int l;
 
 	pa = (uintptr_t)tlvs;
 	pe = pa + len;
 	l = 0;
 	for (; pa < pe; pa += l) {
 		/*
 		 * Do not touch a trailing fragment that is too short to
 		 * hold a whole TLV: reading its fields would run past
 		 * the end of the buffer.
 		 */
 		if (pe - pa < sizeof(*ntlv))
 			return (NULL);
 
 		ntlv = (ipfw_obj_ntlv *)pa;
 		l = ntlv->head.length;
 
 		if (l != sizeof(*ntlv))
 			return (NULL);
 
 		if (ntlv->idx != uidx)
 			continue;
 		/*
 		 * When userland has specified zero TLV type, do
 		 * not compare it with etlv. In some cases userland
 		 * doesn't know what type should it have. Use only
 		 * uidx and name for search named_object.
 		 */
 		if (ntlv->head.type != 0 &&
 		    ntlv->head.type != (uint16_t)etlv)
 			continue;
 
 		if (ipfw_check_object_name_generic(ntlv->name) != 0)
 			return (NULL);
 
 		return (ntlv);
 	}
 
 	return (NULL);
 }
 
 /*
  * Finds object config based on the name found in the ntlv that
  * matches @ti->uidx and @etlv.
  * Note @ti structure contains unchecked data from userland.
  *
  * Returns 0 on success and fills in @pno with found config.
  * Returns EINVAL if @ti carries no TLVs or no valid matching TLV, and
  * ESRCH (with *@pno set to NULL) if no object has that name.
  */
 int
 ipfw_objhash_find_type(struct namedobj_instance *ni, struct tid_info *ti,
     uint32_t etlv, struct named_object **pno)
 {
 	ipfw_obj_ntlv *ntlv;
 
 	if (ti->tlvs == NULL)
 		return (EINVAL);
 
 	ntlv = ipfw_find_name_tlv_type(ti->tlvs, ti->tlen, ti->uidx, etlv);
 	if (ntlv == NULL)
 		return (EINVAL);
 
 	/*
 	 * Use set provided by @ti instead of @ntlv one.
 	 * This is needed due to different sets behavior
 	 * controlled by V_fw_tables_sets.
 	 */
 	*pno = ipfw_objhash_lookup_name(ni, ti->set, ntlv->name);
 
 	return (*pno != NULL ? 0 : ESRCH);
 }
 
 /*
  * Find named object by name, considering also its TLV type.
  * Only the low 16 bits of @type are compared with the object's etlv.
  *
  * Returns pointer to the object or NULL.
  */
 struct named_object *
 ipfw_objhash_lookup_name_type(struct namedobj_instance *ni, uint32_t set,
     uint32_t type, const char *name)
 {
 	struct named_object *cur;
 	uint32_t bucket;
 
 	bucket = ni->hash_f(ni, name, set) % ni->nn_size;
 
 	TAILQ_FOREACH(cur, &ni->names[bucket], nn_next) {
 		if (ni->cmp_f(cur, name, set) != 0)
 			continue;
 		if (cur->etlv == (uint16_t)type)
 			return (cur);
 	}
 
 	return (NULL);
 }
 
 /*
  * Finds a named object in @ni by its kernel index @kidx.
  *
  * Returns pointer to the object or NULL.
  */
 struct named_object *
 ipfw_objhash_lookup_kidx(struct namedobj_instance *ni, uint16_t kidx)
 {
 	struct named_object *cur;
 	uint32_t bucket;
 
 	bucket = objhash_hash_idx(ni, kidx);
 
 	TAILQ_FOREACH(cur, &ni->values[bucket], nv_next) {
 		if (cur->kidx != kidx)
 			continue;
 		return (cur);
 	}
 
 	return (NULL);
 }
 
 /*
  * Tells whether objects @a and @b carry the same name in the same set.
  *
  * Returns 1 if they do, 0 otherwise.  @ni is not used.
  */
 int
 ipfw_objhash_same_name(struct namedobj_instance *ni, struct named_object *a,
     struct named_object *b)
 {
 
 	if (strcmp(a->name, b->name) != 0)
 		return (0);
 
 	return (a->set == b->set ? 1 : 0);
 }
 
 /*
  * Links @no into both hashes of @ni (by name and by kernel index) and
  * bumps the object count.
  */
 void
 ipfw_objhash_add(struct namedobj_instance *ni, struct named_object *no)
 {
 	uint32_t nbucket, vbucket;
 
 	nbucket = ni->hash_f(ni, no->name, no->set) % ni->nn_size;
 	vbucket = objhash_hash_idx(ni, no->kidx);
 
 	TAILQ_INSERT_HEAD(&ni->names[nbucket], no, nn_next);
 	TAILQ_INSERT_HEAD(&ni->values[vbucket], no, nv_next);
 
 	ni->count++;
 }
 
 /*
  * Unlinks @no from both hashes of @ni (by name and by kernel index) and
  * decrements the object count.
  */
 void
 ipfw_objhash_del(struct namedobj_instance *ni, struct named_object *no)
 {
 	uint32_t nbucket, vbucket;
 
 	nbucket = ni->hash_f(ni, no->name, no->set) % ni->nn_size;
 	vbucket = objhash_hash_idx(ni, no->kidx);
 
 	TAILQ_REMOVE(&ni->names[nbucket], no, nn_next);
 	TAILQ_REMOVE(&ni->values[vbucket], no, nv_next);
 
 	ni->count--;
 }
 
 /*
  * Returns the number of objects currently linked into @ni.
  */
 uint32_t
 ipfw_objhash_count(struct namedobj_instance *ni)
 {
 	uint32_t total;
 
 	total = ni->count;
 
 	return (total);
 }
 
 /*
  * Returns the number of objects in @ni whose etlv equals @type.
  * Walks every bucket of the by-name hash.
  */
 uint32_t
 ipfw_objhash_count_type(struct namedobj_instance *ni, uint16_t type)
 {
 	struct named_object *cur;
 	uint32_t matches = 0;
 	int bucket;
 
 	for (bucket = 0; bucket < ni->nn_size; bucket++) {
 		TAILQ_FOREACH(cur, &ni->names[bucket], nn_next) {
 			if (cur->etlv != type)
 				continue;
 			matches++;
 		}
 	}
 
 	return (matches);
 }
 
 /*
  * Runs @f for each found named object, passing @arg through.
  * It is safe to delete objects from callback.
  *
  * Iteration stops at the first non-zero value returned by @f and that
  * value is returned; otherwise returns 0.
  */
 int
 ipfw_objhash_foreach(struct namedobj_instance *ni, objhash_cb_t *f, void *arg)
 {
 	struct named_object *cur, *next;
 	int bucket, rc;
 
 	for (bucket = 0; bucket < ni->nn_size; bucket++) {
 		TAILQ_FOREACH_SAFE(cur, &ni->names[bucket], nn_next, next) {
 			if ((rc = f(ni, cur, arg)) != 0)
 				return (rc);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Runs @f for each found named object with type @type, passing @arg
  * through.
  * It is safe to delete objects from callback.
  *
  * Iteration stops at the first non-zero value returned by @f and that
  * value is returned; otherwise returns 0.
  */
 int
 ipfw_objhash_foreach_type(struct namedobj_instance *ni, objhash_cb_t *f,
     void *arg, uint16_t type)
 {
 	struct named_object *cur, *next;
 	int bucket, rc;
 
 	for (bucket = 0; bucket < ni->nn_size; bucket++) {
 		TAILQ_FOREACH_SAFE(cur, &ni->names[bucket], nn_next, next) {
 			if (cur->etlv == type) {
 				rc = f(ni, cur, arg);
 				if (rc != 0)
 					return (rc);
 			}
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Marks index @idx as free again in the bitmask of @ni.
  *
  * Returns 0 on success, 1 if @idx lies beyond the bitmask or is already
  * marked free.
  */
 int
 ipfw_objhash_free_idx(struct namedobj_instance *ni, uint16_t idx)
 {
 	u_long flag;
 	int bit, blk;
 
 	blk = idx / BLOCK_ITEMS;
 	bit = idx % BLOCK_ITEMS;
 
 	if (blk >= ni->max_blocks)
 		return (1);
 
 	flag = (u_long)1 << bit;
 
 	/* A set bit means the index is not in use */
 	if ((ni->idx_mask[blk] & flag) != 0)
 		return (1);
 
 	/* Mark as free */
 	ni->idx_mask[blk] |= flag;
 
 	/* Let the next allocation start no later than this block */
 	if (blk < ni->free_off[0])
 		ni->free_off[0] = blk;
 
 	return (0);
 }
 
 /*
  * Allocates a new index in the given instance and stores it in @pidx.
  *
  * The search starts at the block remembered in free_off[0] and takes the
  * lowest free bit of the first block that has one.
  *
  * Returns 0 on success, 1 if no free index was found.
  */
 int
 ipfw_objhash_alloc_idx(void *n, uint16_t *pidx)
 {
 	struct namedobj_instance *ni = n;
 	int bit, blk;
 
 	for (blk = ni->free_off[0]; blk < ni->max_blocks; blk++) {
 		/* ffsl() is 1-based; 0 means the block is fully busy */
 		bit = ffsl(ni->idx_mask[blk]);
 		if (bit == 0)
 			continue;
 
 		/* Mark as busy */
 		ni->idx_mask[blk] &= ~((u_long)1 << (bit - 1));
 
 		ni->free_off[0] = blk;
 
 		*pidx = BLOCK_ITEMS * blk + bit - 1;
 		return (0);
 	}
 
 	return (1);
 }
 
 /* end of file */
Index: projects/fuse2/sys/netpfil/pf/pf.c
===================================================================
--- projects/fuse2/sys/netpfil/pf/pf.c	(revision 350434)
+++ projects/fuse2/sys/netpfil/pf/pf.c	(revision 350435)
@@ -1,6706 +1,6704 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2001 Daniel Hartmeier
  * Copyright (c) 2002 - 2008 Henning Brauer
  * Copyright (c) 2012 Gleb Smirnoff <glebius@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  *    - Redistributions of source code must retain the above copyright
  *      notice, this list of conditions and the following disclaimer.
  *    - Redistributions in binary form must reproduce the above
  *      copyright notice, this list of conditions and the following
  *      disclaimer in the documentation and/or other materials provided
  *      with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGE.
  *
  * Effort sponsored in part by the Defense Advanced Research Projects
  * Agency (DARPA) and Air Force Research Laboratory, Air Force
  * Materiel Command, USAF, under agreement number F30602-01-2-0537.
  *
  *	$OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_bpf.h"
 #include "opt_pf.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/endian.h>
 #include <sys/gsb_crc32.h>
 #include <sys/hash.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/limits.h>
 #include <sys/mbuf.h>
 #include <sys/md5.h>
 #include <sys/random.h>
 #include <sys/refcount.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/ucred.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <net/route.h>
 #include <net/radix_mpath.h>
 #include <net/vnet.h>
 
 #include <net/pfil.h>
 #include <net/pfvar.h>
 #include <net/if_pflog.h>
 #include <net/if_pfsync.h>
 
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/in_fib.h>
 #include <netinet/ip.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/icmp_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet6/nd6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/scope6_var.h>
 #endif /* INET6 */
 
 #include <machine/in_cksum.h>
 #include <security/mac/mac_framework.h>
 
 #define	DPFPRINTF(n, x)	if (V_pf_status.debug >= (n)) printf x
 
 /*
  * Global variables
  */
 
 /* state tables */
 VNET_DEFINE(struct pf_altqqueue,	 pf_altqs[4]);
 VNET_DEFINE(struct pf_palist,		 pf_pabuf);
 VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_active);
 VNET_DEFINE(struct pf_altqqueue *,	 pf_altq_ifs_active);
 VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_inactive);
 VNET_DEFINE(struct pf_altqqueue *,	 pf_altq_ifs_inactive);
 VNET_DEFINE(struct pf_kstatus,		 pf_status);
 
 VNET_DEFINE(u_int32_t,			 ticket_altqs_active);
 VNET_DEFINE(u_int32_t,			 ticket_altqs_inactive);
 VNET_DEFINE(int,			 altqs_inactive_open);
 VNET_DEFINE(u_int32_t,			 ticket_pabuf);
 
 VNET_DEFINE(MD5_CTX,			 pf_tcp_secret_ctx);
 #define	V_pf_tcp_secret_ctx		 VNET(pf_tcp_secret_ctx)
 VNET_DEFINE(u_char,			 pf_tcp_secret[16]);
 #define	V_pf_tcp_secret			 VNET(pf_tcp_secret)
 VNET_DEFINE(int,			 pf_tcp_secret_init);
 #define	V_pf_tcp_secret_init		 VNET(pf_tcp_secret_init)
 VNET_DEFINE(int,			 pf_tcp_iss_off);
 #define	V_pf_tcp_iss_off		 VNET(pf_tcp_iss_off)
 VNET_DECLARE(int,			 pf_vnet_active);
 #define	V_pf_vnet_active		 VNET(pf_vnet_active)
 
 VNET_DEFINE_STATIC(uint32_t, pf_purge_idx);
 #define V_pf_purge_idx	VNET(pf_purge_idx)
 
 /*
  * Queue for pf_intr() sends.
  */
 static MALLOC_DEFINE(M_PFTEMP, "pf_temp", "pf(4) temporary allocations");
 /* One element of a struct pf_send_head queue (see V_pf_sendqueue). */
 struct pf_send_entry {
 	/* Linkage within the STAILQ. */
 	STAILQ_ENTRY(pf_send_entry)	pfse_next;
 	/* The packet; pf_cleanup() releases it with m_freem(). */
 	struct mbuf			*pfse_m;
 	/* Discriminates what kind of send this entry describes. */
 	enum {
 		PFSE_IP,
 		PFSE_IP6,
 		PFSE_ICMP,
 		PFSE_ICMP6,
 	}				pfse_type;
 	/*
 	 * NOTE(review): presumably only meaningful for the PFSE_ICMP and
 	 * PFSE_ICMP6 types -- confirm against the consumer of the queue.
 	 */
 	struct {
 		int		type;
 		int		code;
 		int		mtu;
 	} icmpopts;
 };
 
 STAILQ_HEAD(pf_send_head, pf_send_entry);
 VNET_DEFINE_STATIC(struct pf_send_head, pf_sendqueue);
 #define	V_pf_sendqueue	VNET(pf_sendqueue)
 
 static struct mtx pf_sendqueue_mtx;
 MTX_SYSINIT(pf_sendqueue_mtx, &pf_sendqueue_mtx, "pf send queue", MTX_DEF);
 #define	PF_SENDQ_LOCK()		mtx_lock(&pf_sendqueue_mtx)
 #define	PF_SENDQ_UNLOCK()	mtx_unlock(&pf_sendqueue_mtx)
 
 /*
  * Queue for pf_overload_task() tasks.
  */
 /*
  * One pending overload request: filled in by pf_src_connlimit() and
  * consumed (and freed) by pf_overload_task().
  */
 struct pf_overload_entry {
 	/* Linkage within the SLIST. */
 	SLIST_ENTRY(pf_overload_entry)	next;
 	/* Copy of the offending source node's address. */
 	struct pf_addr  		addr;
 	/* Address family, taken from the state's wire key. */
 	sa_family_t			af;
 	/* Direction of the state that tripped the limit. */
 	uint8_t				dir;
 	/* Rule whose overload_tbl and flush settings apply. */
 	struct pf_rule  		*rule;
 };
 
 SLIST_HEAD(pf_overload_head, pf_overload_entry);
 VNET_DEFINE_STATIC(struct pf_overload_head, pf_overloadqueue);
 #define V_pf_overloadqueue	VNET(pf_overloadqueue)
 VNET_DEFINE_STATIC(struct task, pf_overloadtask);
 #define	V_pf_overloadtask	VNET(pf_overloadtask)
 
 static struct mtx pf_overloadqueue_mtx;
 MTX_SYSINIT(pf_overloadqueue_mtx, &pf_overloadqueue_mtx,
     "pf overload/flush queue", MTX_DEF);
 #define	PF_OVERLOADQ_LOCK()	mtx_lock(&pf_overloadqueue_mtx)
 #define	PF_OVERLOADQ_UNLOCK()	mtx_unlock(&pf_overloadqueue_mtx)
 
 VNET_DEFINE(struct pf_rulequeue, pf_unlinked_rules);
 struct mtx pf_unlnkdrules_mtx;
 MTX_SYSINIT(pf_unlnkdrules_mtx, &pf_unlnkdrules_mtx, "pf unlinked rules",
     MTX_DEF);
 
 VNET_DEFINE_STATIC(uma_zone_t,	pf_sources_z);
 #define	V_pf_sources_z	VNET(pf_sources_z)
 uma_zone_t		pf_mtag_z;
 VNET_DEFINE(uma_zone_t,	 pf_state_z);
 VNET_DEFINE(uma_zone_t,	 pf_state_key_z);
 
 VNET_DEFINE(uint64_t, pf_stateid[MAXCPU]);
 #define	PFID_CPUBITS	8
 #define	PFID_CPUSHIFT	(sizeof(uint64_t) * NBBY - PFID_CPUBITS)
 #define	PFID_CPUMASK	((uint64_t)((1 << PFID_CPUBITS) - 1) <<	PFID_CPUSHIFT)
 #define	PFID_MAXID	(~PFID_CPUMASK)
 CTASSERT((1 << PFID_CPUBITS) >= MAXCPU);
 
 static void		 pf_src_tree_remove_state(struct pf_state *);
 static void		 pf_init_threshold(struct pf_threshold *, u_int32_t,
 			    u_int32_t);
 static void		 pf_add_threshold(struct pf_threshold *);
 static int		 pf_check_threshold(struct pf_threshold *);
 
 static void		 pf_change_ap(struct mbuf *, struct pf_addr *, u_int16_t *,
 			    u_int16_t *, u_int16_t *, struct pf_addr *,
 			    u_int16_t, u_int8_t, sa_family_t);
 static int		 pf_modulate_sack(struct mbuf *, int, struct pf_pdesc *,
 			    struct tcphdr *, struct pf_state_peer *);
 static void		 pf_change_icmp(struct pf_addr *, u_int16_t *,
 			    struct pf_addr *, struct pf_addr *, u_int16_t,
 			    u_int16_t *, u_int16_t *, u_int16_t *,
 			    u_int16_t *, u_int8_t, sa_family_t);
 static void		 pf_send_tcp(struct mbuf *,
 			    const struct pf_rule *, sa_family_t,
 			    const struct pf_addr *, const struct pf_addr *,
 			    u_int16_t, u_int16_t, u_int32_t, u_int32_t,
 			    u_int8_t, u_int16_t, u_int16_t, u_int8_t, int,
 			    u_int16_t, struct ifnet *);
 static void		 pf_send_icmp(struct mbuf *, u_int8_t, u_int8_t,
 			    sa_family_t, struct pf_rule *);
 static void		 pf_detach_state(struct pf_state *);
 static int		 pf_state_key_attach(struct pf_state_key *,
 			    struct pf_state_key *, struct pf_state *);
 static void		 pf_state_key_detach(struct pf_state *, int);
 static int		 pf_state_key_ctor(void *, int, void *, int);
 static u_int32_t	 pf_tcp_iss(struct pf_pdesc *);
 static int		 pf_test_rule(struct pf_rule **, struct pf_state **,
 			    int, struct pfi_kif *, struct mbuf *, int,
 			    struct pf_pdesc *, struct pf_rule **,
 			    struct pf_ruleset **, struct inpcb *);
 static int		 pf_create_state(struct pf_rule *, struct pf_rule *,
 			    struct pf_rule *, struct pf_pdesc *,
 			    struct pf_src_node *, struct pf_state_key *,
 			    struct pf_state_key *, struct mbuf *, int,
 			    u_int16_t, u_int16_t, int *, struct pfi_kif *,
 			    struct pf_state **, int, u_int16_t, u_int16_t,
 			    int);
 static int		 pf_test_fragment(struct pf_rule **, int,
 			    struct pfi_kif *, struct mbuf *, void *,
 			    struct pf_pdesc *, struct pf_rule **,
 			    struct pf_ruleset **);
 static int		 pf_tcp_track_full(struct pf_state_peer *,
 			    struct pf_state_peer *, struct pf_state **,
 			    struct pfi_kif *, struct mbuf *, int,
 			    struct pf_pdesc *, u_short *, int *);
 static int		 pf_tcp_track_sloppy(struct pf_state_peer *,
 			    struct pf_state_peer *, struct pf_state **,
 			    struct pf_pdesc *, u_short *);
 static int		 pf_test_state_tcp(struct pf_state **, int,
 			    struct pfi_kif *, struct mbuf *, int,
 			    void *, struct pf_pdesc *, u_short *);
 static int		 pf_test_state_udp(struct pf_state **, int,
 			    struct pfi_kif *, struct mbuf *, int,
 			    void *, struct pf_pdesc *);
 static int		 pf_test_state_icmp(struct pf_state **, int,
 			    struct pfi_kif *, struct mbuf *, int,
 			    void *, struct pf_pdesc *, u_short *);
 static int		 pf_test_state_other(struct pf_state **, int,
 			    struct pfi_kif *, struct mbuf *, struct pf_pdesc *);
 static u_int8_t		 pf_get_wscale(struct mbuf *, int, u_int16_t,
 			    sa_family_t);
 static u_int16_t	 pf_get_mss(struct mbuf *, int, u_int16_t,
 			    sa_family_t);
 static u_int16_t	 pf_calc_mss(struct pf_addr *, sa_family_t,
 				int, u_int16_t);
 static int		 pf_check_proto_cksum(struct mbuf *, int, int,
 			    u_int8_t, sa_family_t);
 static void		 pf_print_state_parts(struct pf_state *,
 			    struct pf_state_key *, struct pf_state_key *);
 static int		 pf_addr_wrap_neq(struct pf_addr_wrap *,
 			    struct pf_addr_wrap *);
 static struct pf_state	*pf_find_state(struct pfi_kif *,
 			    struct pf_state_key_cmp *, u_int);
 static int		 pf_src_connlimit(struct pf_state **);
 static void		 pf_overload_task(void *v, int pending);
 static int		 pf_insert_src_node(struct pf_src_node **,
 			    struct pf_rule *, struct pf_addr *, sa_family_t);
 static u_int		 pf_purge_expired_states(u_int, int);
 static void		 pf_purge_unlinked_rules(void);
 static int		 pf_mtag_uminit(void *, int, int);
 static void		 pf_mtag_free(struct m_tag *);
 #ifdef INET
 static void		 pf_route(struct mbuf **, struct pf_rule *, int,
 			    struct ifnet *, struct pf_state *,
 			    struct pf_pdesc *, struct inpcb *);
 #endif /* INET */
 #ifdef INET6
 static void		 pf_change_a6(struct pf_addr *, u_int16_t *,
 			    struct pf_addr *, u_int8_t);
 static void		 pf_route6(struct mbuf **, struct pf_rule *, int,
 			    struct ifnet *, struct pf_state *,
 			    struct pf_pdesc *, struct inpcb *);
 #endif /* INET6 */
 
 int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);
 
 extern int pf_end_threads;
 extern struct proc *pf_purge_proc;
 
 VNET_DEFINE(struct pf_limit, pf_limits[PF_LIMIT_MAX]);
 
 /*
  * Non-zero when packet descriptor @pd carries a pf mtag with the
  * PF_PACKET_LOOPED flag set.  @pd is evaluated twice.
  */
 #define	PACKET_LOOPED(pd)	((pd)->pf_mtag &&			\
 				 (pd)->pf_mtag->flags & PF_PACKET_LOOPED)
 
 /*
  * Looks up the state for key @k on interface @i in direction @d and
  * stores it in @s.
  *
  * WARNING: this macro contains return statements that leave the
  * *enclosing function*:
  *   - PF_DROP when no state is found;
  *   - PF_PASS when the packet is marked as looped;
  *   - PF_PASS for an outbound packet whose state was created by an
  *     outbound route-to or inbound reply-to rule and whose rt_kif is set
  *     and differs from @i.
  * Arguments are evaluated more than once.
  */
 #define	STATE_LOOKUP(i, k, d, s, pd)					\
 	do {								\
 		(s) = pf_find_state((i), (k), (d));			\
 		if ((s) == NULL)					\
 			return (PF_DROP);				\
 		if (PACKET_LOOPED(pd))					\
 			return (PF_PASS);				\
 		if ((d) == PF_OUT &&					\
 		    (((s)->rule.ptr->rt == PF_ROUTETO &&		\
 		    (s)->rule.ptr->direction == PF_OUT) ||		\
 		    ((s)->rule.ptr->rt == PF_REPLYTO &&			\
 		    (s)->rule.ptr->direction == PF_IN)) &&		\
 		    (s)->rt_kif != NULL &&				\
 		    (s)->rt_kif != (i))					\
 			return (PF_PASS);				\
 	} while (0)
 
 /*
  * Yields @k when rule @r has PFRULE_IFBOUND set, V_pfi_all otherwise.
  * The whole conditional is parenthesized so the macro can be embedded in
  * a larger expression without operator-precedence surprises.
  */
 #define	BOUND_IFACE(r, k) \
 	(((r)->rule_flag & PFRULE_IFBOUND) ? (k) : V_pfi_all)
 
 /*
  * Adds 1 to the states_cur and states_tot counters of the rule attached
  * to state @s, and to those of its anchor and NAT rule when they are set.
  * @s is parenthesized on every use so any pointer expression may be
  * passed; it is evaluated several times and must have no side effects.
  */
 #define	STATE_INC_COUNTERS(s)						\
 	do {								\
 		counter_u64_add((s)->rule.ptr->states_cur, 1);		\
 		counter_u64_add((s)->rule.ptr->states_tot, 1);		\
 		if ((s)->anchor.ptr != NULL) {				\
 			counter_u64_add((s)->anchor.ptr->states_cur, 1);\
 			counter_u64_add((s)->anchor.ptr->states_tot, 1);\
 		}							\
 		if ((s)->nat_rule.ptr != NULL) {			\
 			counter_u64_add((s)->nat_rule.ptr->states_cur, 1);\
 			counter_u64_add((s)->nat_rule.ptr->states_tot, 1);\
 		}							\
 	} while (0)
 
 /*
  * Subtracts 1 from the states_cur counter of the NAT rule and anchor of
  * state @s (when set) and of its rule.  states_tot is left untouched.
  * @s is parenthesized on every use so any pointer expression may be
  * passed; it is evaluated several times and must have no side effects.
  */
 #define	STATE_DEC_COUNTERS(s)						\
 	do {								\
 		if ((s)->nat_rule.ptr != NULL)				\
 			counter_u64_add((s)->nat_rule.ptr->states_cur, -1);\
 		if ((s)->anchor.ptr != NULL)				\
 			counter_u64_add((s)->anchor.ptr->states_cur, -1);\
 		counter_u64_add((s)->rule.ptr->states_cur, -1);		\
 	} while (0)
 
 MALLOC_DEFINE(M_PFHASH, "pf_hash", "pf(4) hash header structures");
 VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
 VNET_DEFINE(struct pf_idhash *, pf_idhash);
 VNET_DEFINE(struct pf_srchash *, pf_srchash);
 
 SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW, 0, "pf(4)");
 
 u_long	pf_hashmask;
 u_long	pf_srchashmask;
 static u_long	pf_hashsize;
 static u_long	pf_srchashsize;
 u_long	pf_ioctl_maxcount = 65535;
 
 SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_RDTUN,
     &pf_hashsize, 0, "Size of pf(4) states hashtable");
 SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_RDTUN,
     &pf_srchashsize, 0, "Size of pf(4) source nodes hashtable");
 SYSCTL_ULONG(_net_pf, OID_AUTO, request_maxcount, CTLFLAG_RDTUN,
     &pf_ioctl_maxcount, 0, "Maximum number of tables, addresses, ... in a single ioctl() call");
 
 VNET_DEFINE(void *, pf_swi_cookie);
 
 VNET_DEFINE(uint32_t, pf_hashseed);
 #define	V_pf_hashseed	VNET(pf_hashseed)
 
 /*
  * Three-way comparison of two addresses of family @af.
  * For AF_INET only addr32[0] is compared; for AF_INET6 the four 32-bit
  * words are compared from addr32[3] down to addr32[0].
  * Returns 1 if @a > @b, -1 if @a < @b and 0 if they are equal.
  * Panics on any other address family.
  */
 int
 pf_addr_cmp(struct pf_addr *a, struct pf_addr *b, sa_family_t af)
 {
 #ifdef INET6
 	int w;
 #endif /* INET6 */
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		if (a->addr32[0] != b->addr32[0])
 			return (a->addr32[0] > b->addr32[0] ? 1 : -1);
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		for (w = 3; w >= 0; w--) {
 			if (a->addr32[w] != b->addr32[w])
 				return (a->addr32[w] > b->addr32[w] ? 1 : -1);
 		}
 		break;
 #endif /* INET6 */
 	default:
 		panic("%s: unknown address family %u", __func__, af);
 	}
 	return (0);
 }
 
 /*
  * Hashes the leading sizeof(struct pf_state_key_cmp) bytes of @sk with
  * murmur3 (seeded by V_pf_hashseed) and masks the result with
  * pf_hashmask.
  */
 static __inline uint32_t
 pf_hashkey(struct pf_state_key *sk)
 {
 	const size_t nwords =
 	    sizeof(struct pf_state_key_cmp) / sizeof(uint32_t);
 
 	return (murmur3_32_hash32((uint32_t *)sk, nwords, V_pf_hashseed) &
 	    pf_hashmask);
 }
 
 /*
  * Hashes address @addr of family @af with murmur3 (seeded by
  * V_pf_hashseed) and masks the result with pf_srchashmask.
  * Panics on an address family other than AF_INET or AF_INET6.
  */
 static __inline uint32_t
 pf_hashsrc(struct pf_addr *addr, sa_family_t af)
 {
 	uint32_t *words;
 	size_t nwords;
 
 	/* Select which member of the address is fed to the hash. */
 	switch (af) {
 	case AF_INET:
 		words = (uint32_t *)&addr->v4;
 		nwords = sizeof(addr->v4) / sizeof(uint32_t);
 		break;
 	case AF_INET6:
 		words = (uint32_t *)&addr->v6;
 		nwords = sizeof(addr->v6) / sizeof(uint32_t);
 		break;
 	default:
 		panic("%s: unknown address family %u", __func__, af);
 	}
 
 	return (murmur3_32_hash32(words, nwords, V_pf_hashseed) &
 	    pf_srchashmask);
 }
 
 #ifdef ALTQ
 /*
  * Derives a hash value for state @s from its address and the CRC32 of
  * its src and dst members.  The result is never 0: a computed 0 is
  * replaced by 1.
  */
 static int
 pf_state_hash(struct pf_state *s)
 {
 	u_int32_t hv;
 
 	hv = (intptr_t)s / sizeof(*s);
 	hv ^= crc32(&s->src, sizeof(s->src));
 	hv ^= crc32(&s->dst, sizeof(s->dst));
 
 	return (hv != 0 ? hv : 1);
 }
 #endif
 
 #ifdef INET6
 /*
  * Copies the address in @src to @dst: one 32-bit word for AF_INET, all
  * four words for AF_INET6.  Any other @af leaves @dst untouched.
  */
 void
 pf_addrcpy(struct pf_addr *dst, struct pf_addr *src, sa_family_t af)
 {
 	int w;
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		dst->addr32[0] = src->addr32[0];
 		break;
 #endif /* INET */
 	case AF_INET6:
 		for (w = 0; w < 4; w++)
 			dst->addr32[w] = src->addr32[w];
 		break;
 	}
 }
 #endif /* INET6 */
 
 /*
  * Resets @threshold: the count starts at 0, the last-update time is now
  * (time_uptime), and @limit is stored scaled by PF_THRESHOLD_MULT
  * alongside the @seconds window.
  */
 static void
 pf_init_threshold(struct pf_threshold *threshold,
     u_int32_t limit, u_int32_t seconds)
 {
 	threshold->count = 0;
 	threshold->last = time_uptime;
 	threshold->seconds = seconds;
 	threshold->limit = limit * PF_THRESHOLD_MULT;
 }
 
 /*
  * Records one event in @threshold.  The stored count is first reduced in
  * proportion to the time elapsed since the last update (or zeroed once
  * a whole window has passed), then PF_THRESHOLD_MULT is added.
  */
 static void
 pf_add_threshold(struct pf_threshold *threshold)
 {
 	u_int32_t now, elapsed;
 
 	now = time_uptime;
 	elapsed = now - threshold->last;
 
 	if (elapsed < threshold->seconds)
 		threshold->count -= threshold->count * elapsed /
 		    threshold->seconds;
 	else
 		threshold->count = 0;
 
 	threshold->count += PF_THRESHOLD_MULT;
 	threshold->last = now;
 }
 
 /* Returns 1 when @threshold's count exceeds its limit, 0 otherwise. */
 static int
 pf_check_threshold(struct pf_threshold *threshold)
 {
 	if (threshold->count > threshold->limit)
 		return (1);
 	return (0);
 }
 
 static int
 pf_src_connlimit(struct pf_state **state)
 {
 	struct pf_overload_entry *pfoe;
 	int bad = 0;
 
 	PF_STATE_LOCK_ASSERT(*state);
 
 	(*state)->src_node->conn++;
 	(*state)->src.tcp_est = 1;
 	pf_add_threshold(&(*state)->src_node->conn_rate);
 
 	if ((*state)->rule.ptr->max_src_conn &&
 	    (*state)->rule.ptr->max_src_conn <
 	    (*state)->src_node->conn) {
 		counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONN], 1);
 		bad++;
 	}
 
 	if ((*state)->rule.ptr->max_src_conn_rate.limit &&
 	    pf_check_threshold(&(*state)->src_node->conn_rate)) {
 		counter_u64_add(V_pf_status.lcounters[LCNT_SRCCONNRATE], 1);
 		bad++;
 	}
 
 	if (!bad)
 		return (0);
 
 	/* Kill this state. */
 	(*state)->timeout = PFTM_PURGE;
 	(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
 
 	if ((*state)->rule.ptr->overload_tbl == NULL)
 		return (1);
 
 	/* Schedule overloading and flushing task. */
 	pfoe = malloc(sizeof(*pfoe), M_PFTEMP, M_NOWAIT);
 	if (pfoe == NULL)
 		return (1);	/* too bad :( */
 
 	bcopy(&(*state)->src_node->addr, &pfoe->addr, sizeof(pfoe->addr));
 	pfoe->af = (*state)->key[PF_SK_WIRE]->af;
 	pfoe->rule = (*state)->rule.ptr;
 	pfoe->dir = (*state)->direction;
 	PF_OVERLOADQ_LOCK();
 	SLIST_INSERT_HEAD(&V_pf_overloadqueue, pfoe, next);
 	PF_OVERLOADQ_UNLOCK();
 	taskqueue_enqueue(taskqueue_swi, &V_pf_overloadtask);
 
 	return (1);
 }
 
 static void
 pf_overload_task(void *v, int pending)
 {
 	struct pf_overload_head queue;
 	struct pfr_addr p;
 	struct pf_overload_entry *pfoe, *pfoe1;
 	uint32_t killed = 0;
 
 	CURVNET_SET((struct vnet *)v);
 
 	PF_OVERLOADQ_LOCK();
 	queue = V_pf_overloadqueue;
 	SLIST_INIT(&V_pf_overloadqueue);
 	PF_OVERLOADQ_UNLOCK();
 
 	bzero(&p, sizeof(p));
 	SLIST_FOREACH(pfoe, &queue, next) {
 		counter_u64_add(V_pf_status.lcounters[LCNT_OVERLOAD_TABLE], 1);
 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
 			printf("%s: blocking address ", __func__);
 			pf_print_host(&pfoe->addr, 0, pfoe->af);
 			printf("\n");
 		}
 
 		p.pfra_af = pfoe->af;
 		switch (pfoe->af) {
 #ifdef INET
 		case AF_INET:
 			p.pfra_net = 32;
 			p.pfra_ip4addr = pfoe->addr.v4;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			p.pfra_net = 128;
 			p.pfra_ip6addr = pfoe->addr.v6;
 			break;
 #endif
 		}
 
 		PF_RULES_WLOCK();
 		pfr_insert_kentry(pfoe->rule->overload_tbl, &p, time_second);
 		PF_RULES_WUNLOCK();
 	}
 
 	/*
 	 * Remove those entries, that don't need flushing.
 	 */
 	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
 		if (pfoe->rule->flush == 0) {
 			SLIST_REMOVE(&queue, pfoe, pf_overload_entry, next);
 			free(pfoe, M_PFTEMP);
 		} else
 			counter_u64_add(
 			    V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH], 1);
 
 	/* If nothing to flush, return. */
 	if (SLIST_EMPTY(&queue)) {
 		CURVNET_RESTORE();
 		return;
 	}
 
 	for (int i = 0; i <= pf_hashmask; i++) {
 		struct pf_idhash *ih = &V_pf_idhash[i];
 		struct pf_state_key *sk;
 		struct pf_state *s;
 
 		PF_HASHROW_LOCK(ih);
 		LIST_FOREACH(s, &ih->states, entry) {
 		    sk = s->key[PF_SK_WIRE];
 		    SLIST_FOREACH(pfoe, &queue, next)
 			if (sk->af == pfoe->af &&
 			    ((pfoe->rule->flush & PF_FLUSH_GLOBAL) ||
 			    pfoe->rule == s->rule.ptr) &&
 			    ((pfoe->dir == PF_OUT &&
 			    PF_AEQ(&pfoe->addr, &sk->addr[1], sk->af)) ||
 			    (pfoe->dir == PF_IN &&
 			    PF_AEQ(&pfoe->addr, &sk->addr[0], sk->af)))) {
 				s->timeout = PFTM_PURGE;
 				s->src.state = s->dst.state = TCPS_CLOSED;
 				killed++;
 			}
 		}
 		PF_HASHROW_UNLOCK(ih);
 	}
 	SLIST_FOREACH_SAFE(pfoe, &queue, next, pfoe1)
 		free(pfoe, M_PFTEMP);
 	if (V_pf_status.debug >= PF_DEBUG_MISC)
 		printf("%s: %u states killed", __func__, killed);
 
 	CURVNET_RESTORE();
 }
 
 /*
  * Can return locked on failure, so that we can consistently
  * allocate and insert a new one.
  */
 struct pf_src_node *
 pf_find_src_node(struct pf_addr *src, struct pf_rule *rule, sa_family_t af,
 	int returnlocked)
 {
 	struct pf_srchash *sh;
 	struct pf_src_node *n;
 
 	counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_SEARCH], 1);
 
 	sh = &V_pf_srchash[pf_hashsrc(src, af)];
 	PF_HASHROW_LOCK(sh);
 	LIST_FOREACH(n, &sh->nodes, entry)
 		if (n->rule.ptr == rule && n->af == af &&
 		    ((af == AF_INET && n->addr.v4.s_addr == src->v4.s_addr) ||
 		    (af == AF_INET6 && bcmp(&n->addr, src, sizeof(*src)) == 0)))
 			break;
 	if (n != NULL) {
 		n->states++;
 		PF_HASHROW_UNLOCK(sh);
 	} else if (returnlocked == 0)
 		PF_HASHROW_UNLOCK(sh);
 
 	return (n);
 }
 
 /*
  * Ensures *@sn refers to a source node for @src/@af under @rule.
  *
  * If *@sn is NULL on entry the node is looked up and, when missing,
  * allocated, initialized with a state count of 1 and linked into the
  * source hash.
  *
  * Returns 0 on success.  Returns -1 when a new node is needed but the
  * rule's max_src_nodes limit is reached or the allocation fails, or when
  * an existing node already has max_src_states states.
  */
 static int
 pf_insert_src_node(struct pf_src_node **sn, struct pf_rule *rule,
     struct pf_addr *src, sa_family_t af)
 {
 
 	KASSERT((rule->rule_flag & PFRULE_RULESRCTRACK ||
 	    rule->rpool.opts & PF_POOL_STICKYADDR),
 	    ("%s for non-tracking rule %p", __func__, rule));
 
 	/*
 	 * returnlocked == 1: on a miss the hash row stays locked, which the
 	 * insertion path below relies on (see PF_HASHROW_ASSERT).
 	 */
 	if (*sn == NULL)
 		*sn = pf_find_src_node(src, rule, af, 1);
 
 	if (*sn == NULL) {
 		struct pf_srchash *sh = &V_pf_srchash[pf_hashsrc(src, af)];
 
 		PF_HASHROW_ASSERT(sh);
 
 		/* Allocate only while under the per-rule node limit, if any. */
 		if (!rule->max_src_nodes ||
 		    counter_u64_fetch(rule->src_nodes) < rule->max_src_nodes)
 			(*sn) = uma_zalloc(V_pf_sources_z, M_NOWAIT | M_ZERO);
 		else
 			counter_u64_add(V_pf_status.lcounters[LCNT_SRCNODES],
 			    1);
 		/* Limit hit or allocation failed: drop the row lock and fail. */
 		if ((*sn) == NULL) {
 			PF_HASHROW_UNLOCK(sh);
 			return (-1);
 		}
 
 		pf_init_threshold(&(*sn)->conn_rate,
 		    rule->max_src_conn_rate.limit,
 		    rule->max_src_conn_rate.seconds);
 
 		(*sn)->af = af;
 		(*sn)->rule.ptr = rule;
 		PF_ACPY(&(*sn)->addr, src, af);
 		LIST_INSERT_HEAD(&sh->nodes, *sn, entry);
 		(*sn)->creation = time_uptime;
 		(*sn)->ruletype = rule->action;
 		(*sn)->states = 1;
 		if ((*sn)->rule.ptr != NULL)
 			counter_u64_add((*sn)->rule.ptr->src_nodes, 1);
 		PF_HASHROW_UNLOCK(sh);
 		counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_INSERT], 1);
 	} else {
 		/*
 		 * Existing node: enforce the per-source state limit.
 		 * NOTE(review): when the node came from pf_find_src_node()
 		 * its states count was already incremented there and is not
 		 * rolled back on this failure path -- confirm callers
 		 * account for that.
 		 */
 		if (rule->max_src_states &&
 		    (*sn)->states >= rule->max_src_states) {
 			counter_u64_add(V_pf_status.lcounters[LCNT_SRCSTATES],
 			    1);
 			return (-1);
 		}
 	}
 	return (0);
 }
 
 /*
  * Removes @src from its hash list and decrements the src_nodes counter
  * of the rule it references, if any.  The node itself is not freed.
  * The caller must hold the lock of the node's source hash row (asserted).
  */
 void
 pf_unlink_src_node(struct pf_src_node *src)
 {
 
 	PF_HASHROW_ASSERT(&V_pf_srchash[pf_hashsrc(&src->addr, src->af)]);
 	LIST_REMOVE(src, entry);
 	if (src->rule.ptr)
 		counter_u64_add(src->rule.ptr->src_nodes, -1);
 }
 
 /*
  * Returns every node on list @head to the source node zone, adds the
  * number freed to the SCNT_SRC_NODE_REMOVALS counter and returns that
  * number.  The list head itself is not modified.
  */
 u_int
 pf_free_src_nodes(struct pf_src_node_list *head)
 {
 	struct pf_src_node *node, *nxt;
 	u_int freed;
 
 	freed = 0;
 	LIST_FOREACH_SAFE(node, head, entry, nxt) {
 		uma_zfree(V_pf_sources_z, node);
 		freed++;
 	}
 
 	counter_u64_add(V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], freed);
 
 	return (freed);
 }
 
 /*
  * Creates the UMA zone backing pf mbuf tags.  Each item holds a
  * struct m_tag header immediately followed by a struct pf_mtag;
  * pf_mtag_uminit() is installed as the zone's item initializer.
  */
 void
 pf_mtag_initialize(void)
 {
 
 	pf_mtag_z = uma_zcreate("pf mtags", sizeof(struct m_tag) +
 	    sizeof(struct pf_mtag), NULL, NULL, pf_mtag_uminit, NULL,
 	    UMA_ALIGN_PTR, 0);
 }
 
 /*
  * Per-vnet data storage structures initialization.
  *
  * Sets up the state and state-key zones, the key/id hash tables, the
  * source node zone and hash table, the ALTQ queues, the send and overload
  * queues and the unlinked-rules list.  Hash sizes that are zero or not a
  * power of two are replaced by the compiled-in defaults.
  */
 void
 pf_initialize(void)
 {
 	struct pf_keyhash	*kh;
 	struct pf_idhash	*ih;
 	struct pf_srchash	*sh;
 	u_int i;
 
 	if (pf_hashsize == 0 || !powerof2(pf_hashsize))
 		pf_hashsize = PF_HASHSIZ;
 	if (pf_srchashsize == 0 || !powerof2(pf_srchashsize))
 		pf_srchashsize = PF_SRCHASHSIZ;
 
 	V_pf_hashseed = arc4random();
 
 	/* States and state keys storage. */
 	V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	V_pf_limits[PF_LIMIT_STATES].zone = V_pf_state_z;
 	uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
 	uma_zone_set_warning(V_pf_state_z, "PF states limit reached");
 
 	V_pf_state_key_z = uma_zcreate("pf state keys",
 	    sizeof(struct pf_state_key), pf_state_key_ctor, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 
 	/*
 	 * Try the requested hash size without sleeping; if either table
 	 * cannot be allocated, fall back to the default size and wait.
 	 */
 	V_pf_keyhash = mallocarray(pf_hashsize, sizeof(struct pf_keyhash),
 	    M_PFHASH, M_NOWAIT | M_ZERO);
 	V_pf_idhash = mallocarray(pf_hashsize, sizeof(struct pf_idhash),
 	    M_PFHASH, M_NOWAIT | M_ZERO);
 	if (V_pf_keyhash == NULL || V_pf_idhash == NULL) {
 		printf("pf: Unable to allocate memory for "
 		    "state_hashsize %lu.\n", pf_hashsize);
 
 		free(V_pf_keyhash, M_PFHASH);
 		free(V_pf_idhash, M_PFHASH);
 
 		pf_hashsize = PF_HASHSIZ;
 		V_pf_keyhash = mallocarray(pf_hashsize,
 		    sizeof(struct pf_keyhash), M_PFHASH, M_WAITOK | M_ZERO);
 		V_pf_idhash = mallocarray(pf_hashsize,
 		    sizeof(struct pf_idhash), M_PFHASH, M_WAITOK | M_ZERO);
 	}
 
 	/* The size is a power of two, so size - 1 serves as the index mask. */
 	pf_hashmask = pf_hashsize - 1;
 	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
 	    i++, kh++, ih++) {
 		mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF | MTX_DUPOK);
 		mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
 	}
 
 	/* Source nodes. */
 	V_pf_sources_z = uma_zcreate("pf source nodes",
 	    sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
 	    0);
 	V_pf_limits[PF_LIMIT_SRC_NODES].zone = V_pf_sources_z;
 	uma_zone_set_max(V_pf_sources_z, PFSNODE_HIWAT);
 	uma_zone_set_warning(V_pf_sources_z, "PF source nodes limit reached");
 
 	/* Same two-step allocation strategy as for the state hashes. */
 	V_pf_srchash = mallocarray(pf_srchashsize,
 	    sizeof(struct pf_srchash), M_PFHASH, M_NOWAIT | M_ZERO);
 	if (V_pf_srchash == NULL) {
 		printf("pf: Unable to allocate memory for "
 		    "source_hashsize %lu.\n", pf_srchashsize);
 
 		pf_srchashsize = PF_SRCHASHSIZ;
 		V_pf_srchash = mallocarray(pf_srchashsize,
 		    sizeof(struct pf_srchash), M_PFHASH, M_WAITOK | M_ZERO);
 	}
 
 	pf_srchashmask = pf_srchashsize - 1;
 	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++)
 		mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);
 
 	/* ALTQ */
 	TAILQ_INIT(&V_pf_altqs[0]);
 	TAILQ_INIT(&V_pf_altqs[1]);
 	TAILQ_INIT(&V_pf_altqs[2]);
 	TAILQ_INIT(&V_pf_altqs[3]);
 	TAILQ_INIT(&V_pf_pabuf);
 	V_pf_altqs_active = &V_pf_altqs[0];
 	V_pf_altq_ifs_active = &V_pf_altqs[1];
 	V_pf_altqs_inactive = &V_pf_altqs[2];
 	V_pf_altq_ifs_inactive = &V_pf_altqs[3];
 
 	/* Send & overload+flush queues. */
 	STAILQ_INIT(&V_pf_sendqueue);
 	SLIST_INIT(&V_pf_overloadqueue);
 	TASK_INIT(&V_pf_overloadtask, 0, pf_overload_task, curvnet);
 
 	/* Unlinked, but may be referenced rules. */
 	TAILQ_INIT(&V_pf_unlinked_rules);
 }
 
 /* Destroys the pf mbuf tag zone created by pf_mtag_initialize(). */
 void
 pf_mtag_cleanup(void)
 {
 
 	uma_zdestroy(pf_mtag_z);
 }
 
 /*
  * Releases the per-vnet storage set up by pf_initialize(): destroys the
  * hash row mutexes, frees the key, id and source hash tables, frees any
  * entries still on the send queue together with their mbufs, and
  * destroys the source node, state and state key zones.
  * All hash rows are expected to be empty (asserted).
  */
 void
 pf_cleanup(void)
 {
 	struct pf_keyhash	*kh;
 	struct pf_idhash	*ih;
 	struct pf_srchash	*sh;
 	struct pf_send_entry	*pfse, *next;
 	u_int i;
 
 	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= pf_hashmask;
 	    i++, kh++, ih++) {
 		KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
 		    __func__));
 		KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
 		    __func__));
 		mtx_destroy(&kh->lock);
 		mtx_destroy(&ih->lock);
 	}
 	free(V_pf_keyhash, M_PFHASH);
 	free(V_pf_idhash, M_PFHASH);
 
 	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) {
 		KASSERT(LIST_EMPTY(&sh->nodes),
 		    ("%s: source node hash not empty", __func__));
 		mtx_destroy(&sh->lock);
 	}
 	free(V_pf_srchash, M_PFHASH);
 
 	/* Drop packets that were queued for sending but never sent. */
 	STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
 		m_freem(pfse->pfse_m);
 		free(pfse, M_PFTEMP);
 	}
 
 	uma_zdestroy(V_pf_sources_z);
 	uma_zdestroy(V_pf_state_z);
 	uma_zdestroy(V_pf_state_key_z);
 }
 
 /*
  * Zone item initializer for pf mbuf tags: fills in the struct m_tag
  * header at the start of @mem.  @size and @how are unused.
  * Always returns 0.
  */
 static int
 pf_mtag_uminit(void *mem, int size, int how)
 {
 	struct m_tag *tag = mem;
 
 	tag->m_tag_cookie = MTAG_ABI_COMPAT;
 	tag->m_tag_id = PACKET_TAG_PF;
 	tag->m_tag_len = sizeof(struct pf_mtag);
 	tag->m_tag_free = pf_mtag_free;
 
 	return (0);
 }
 
 /*
  * m_tag_free callback installed by pf_mtag_uminit(): returns tag @t to
  * the pf mtag zone.
  */
 static void
 pf_mtag_free(struct m_tag *t)
 {
 
 	uma_zfree(pf_mtag_z, t);
 }
 
 /*
  * Returns the pf tag attached to mbuf @m, attaching a new zero-filled
  * one first when @m has none.  The pf_mtag payload sits directly behind
  * the struct m_tag header.
  * Returns NULL if a new tag is needed but cannot be allocated.
  */
 struct pf_mtag *
 pf_get_mtag(struct mbuf *m)
 {
 	struct m_tag *mtag;
 
 	mtag = m_tag_find(m, PACKET_TAG_PF, NULL);
 	if (mtag == NULL) {
 		/* No tag yet: allocate one without sleeping. */
 		mtag = uma_zalloc(pf_mtag_z, M_NOWAIT);
 		if (mtag == NULL)
 			return (NULL);
 		bzero(mtag + 1, sizeof(struct pf_mtag));
 		m_tag_prepend(m, mtag);
 	}
 
 	return ((struct pf_mtag *)(mtag + 1));
 }
 
 /*
  * Attaches state @s to wire key @skw and stack key @sks (which may be
  * the same key).
  *
  * For each key, an equal key already present in the key hash is reused
  * and the passed-in key is freed; otherwise the passed-in key is linked
  * into the hash.
  *
  * Returns 0 on success, with the state locked (PF_STATE_LOCK) and the key
  * hash rows unlocked.  Returns EEXIST when a state with the same kif and
  * direction already hangs off the key and cannot be superseded; the key
  * being attached is freed on that path, and pf_detach_state() is called
  * when the failure happens on the stack key.
  */
 static int
 pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
     struct pf_state *s)
 {
 	struct pf_keyhash	*khs, *khw, *kh;
 	struct pf_state_key	*sk, *cur;
 	struct pf_state		*si, *olds = NULL;
 	int idx;
 
 	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
 	KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
 	KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));
 
 	/*
 	 * We need to lock hash slots of both keys. To avoid deadlock
 	 * we always lock the slot with lower address first. Unlock order
 	 * isn't important.
 	 *
 	 * We also need to lock ID hash slot before dropping key
 	 * locks. On success we return with ID hash slot locked.
 	 */
 
 	if (skw == sks) {
 		khs = khw = &V_pf_keyhash[pf_hashkey(skw)];
 		PF_HASHROW_LOCK(khs);
 	} else {
 		khs = &V_pf_keyhash[pf_hashkey(sks)];
 		khw = &V_pf_keyhash[pf_hashkey(skw)];
 		if (khs == khw) {
 			PF_HASHROW_LOCK(khs);
 		} else if (khs < khw) {
 			PF_HASHROW_LOCK(khs);
 			PF_HASHROW_LOCK(khw);
 		} else {
 			PF_HASHROW_LOCK(khw);
 			PF_HASHROW_LOCK(khs);
 		}
 	}
 
 /* Releases whichever key hash rows were taken above (one or two). */
 #define	KEYS_UNLOCK()	do {			\
 	if (khs != khw) {			\
 		PF_HASHROW_UNLOCK(khs);		\
 		PF_HASHROW_UNLOCK(khw);		\
 	} else					\
 		PF_HASHROW_UNLOCK(khs);		\
 } while (0)
 
 	/*
 	 * First run: start with wire key.
 	 */
 	sk = skw;
 	kh = khw;
 	idx = PF_SK_WIRE;
 
 keyattach:
 	/* Look for an already-hashed key equal to sk in its comparable part. */
 	LIST_FOREACH(cur, &kh->keys, entry)
 		if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
 			break;
 
 	if (cur != NULL) {
 		/* Key exists. Check for same kif, if none, add to key. */
 		TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
 			struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];
 
 			PF_HASHROW_LOCK(ih);
 			if (si->kif == s->kif &&
 			    si->direction == s->direction) {
 				if (sk->proto == IPPROTO_TCP &&
 				    si->src.state >= TCPS_FIN_WAIT_2 &&
 				    si->dst.state >= TCPS_FIN_WAIT_2) {
 					/*
 					 * New state matches an old >FIN_WAIT_2
 					 * state. We can't drop key hash locks,
 					 * thus we can't unlink it properly.
 					 *
 					 * As a workaround we drop it into
 					 * TCPS_CLOSED state, schedule purge
 					 * ASAP and push it into the very end
 					 * of the slot TAILQ, so that it won't
 					 * conflict with our new state.
 					 */
 					si->src.state = si->dst.state =
 					    TCPS_CLOSED;
 					si->timeout = PFTM_PURGE;
 					olds = si;
 				} else {
 					if (V_pf_status.debug >= PF_DEBUG_MISC) {
 						printf("pf: %s key attach "
 						    "failed on %s: ",
 						    (idx == PF_SK_WIRE) ?
 						    "wire" : "stack",
 						    s->kif->pfik_name);
 						pf_print_state_parts(s,
 						    (idx == PF_SK_WIRE) ?
 						    sk : NULL,
 						    (idx == PF_SK_STACK) ?
 						    sk : NULL);
 						printf(", existing: ");
 						pf_print_state_parts(si,
 						    (idx == PF_SK_WIRE) ?
 						    sk : NULL,
 						    (idx == PF_SK_STACK) ?
 						    sk : NULL);
 						printf("\n");
 					}
 					/*
 					 * Unwind: drop all locks, free the key
 					 * we were attaching and, if the wire
 					 * key was already attached, detach it.
 					 */
 					PF_HASHROW_UNLOCK(ih);
 					KEYS_UNLOCK();
 					uma_zfree(V_pf_state_key_z, sk);
 					if (idx == PF_SK_STACK)
 						pf_detach_state(s);
 					return (EEXIST); /* collision! */
 				}
 			}
 			PF_HASHROW_UNLOCK(ih);
 		}
 		/* Reuse the existing key; ours is redundant. */
 		uma_zfree(V_pf_state_key_z, sk);
 		s->key[idx] = cur;
 	} else {
 		LIST_INSERT_HEAD(&kh->keys, sk, entry);
 		s->key[idx] = sk;
 	}
 
 stateattach:
 	/* List is sorted, if-bound states before floating. */
 	if (s->kif == V_pfi_all)
 		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
 	else
 		TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);
 
 	/* Move a superseded old state behind the one just inserted. */
 	if (olds) {
 		TAILQ_REMOVE(&s->key[idx]->states[idx], olds, key_list[idx]);
 		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], olds,
 		    key_list[idx]);
 		olds = NULL;
 	}
 
 	/*
 	 * Attach done. See how should we (or should not?)
 	 * attach a second key.
 	 */
 	if (sks == skw) {
 		/* Single shared key: only link the state on the stack list. */
 		s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
 		idx = PF_SK_STACK;
 		sks = NULL;
 		goto stateattach;
 	} else if (sks != NULL) {
 		/*
 		 * Continue attaching with stack key.
 		 */
 		sk = sks;
 		kh = khs;
 		idx = PF_SK_STACK;
 		sks = NULL;
 		goto keyattach;
 	}
 
 	/* Take the state lock before the key rows are released. */
 	PF_STATE_LOCK(s);
 	KEYS_UNLOCK();
 
 	KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
 	    ("%s failure", __func__));
 
 	return (0);
 #undef	KEYS_UNLOCK
 }
 
 /*
  * Detach state 's' from every state key it is still attached to.
  *
  * The stack key is handled first, under the lock of the key hash row
  * selected by pf_hashkey().  If the wire key is the very same key object it
  * is detached under that same row lock; otherwise the wire key is detached
  * afterwards under the lock of its own row.
  */
 static void
 pf_detach_state(struct pf_state *s)
 {
 	struct pf_state_key *sks = s->key[PF_SK_STACK];
 	struct pf_keyhash *kh;
 
 	if (sks != NULL) {
 		kh = &V_pf_keyhash[pf_hashkey(sks)];
 		PF_HASHROW_LOCK(kh);
 		/* The pointer was sampled unlocked; look again under the lock. */
 		if (s->key[PF_SK_STACK] != NULL)
 			pf_state_key_detach(s, PF_SK_STACK);
 		/*
 		 * If both point to same key, then we are done.
 		 * 'sks' is only compared here, never dereferenced.
 		 */
 		if (sks == s->key[PF_SK_WIRE]) {
 			pf_state_key_detach(s, PF_SK_WIRE);
 			PF_HASHROW_UNLOCK(kh);
 			return;
 		}
 		PF_HASHROW_UNLOCK(kh);
 	}
 
 	if (s->key[PF_SK_WIRE] != NULL) {
 		kh = &V_pf_keyhash[pf_hashkey(s->key[PF_SK_WIRE])];
 		PF_HASHROW_LOCK(kh);
 		/* Same unlocked-read/locked-recheck pattern as above. */
 		if (s->key[PF_SK_WIRE] != NULL)
 			pf_state_key_detach(s, PF_SK_WIRE);
 		PF_HASHROW_UNLOCK(kh);
 	}
 }
 
 /*
  * Take state 's' off the state list with index 'idx' of the key it is
  * attached to and clear s->key[idx].  When neither of the key's two state
  * lists has any entry left, the key is removed from its hash list and
  * returned to V_pf_state_key_z.
  *
  * With INVARIANTS the key hash row for the key is asserted via
  * PF_HASHROW_ASSERT().
  */
 static void
 pf_state_key_detach(struct pf_state *s, int idx)
 {
 	struct pf_state_key *key = s->key[idx];
 #ifdef INVARIANTS
 	struct pf_keyhash *row = &V_pf_keyhash[pf_hashkey(key)];
 
 	PF_HASHROW_ASSERT(row);
 #endif
 	TAILQ_REMOVE(&key->states[idx], s, key_list[idx]);
 	s->key[idx] = NULL;
 
 	/* Some state still hangs off this key: leave the key alone. */
 	if (!TAILQ_EMPTY(&key->states[0]) || !TAILQ_EMPTY(&key->states[1]))
 		return;
 
 	LIST_REMOVE(key, entry);
 	uma_zfree(V_pf_state_key_z, key);
 }
 
 /*
  * Constructor for state key objects: zeroes the leading
  * sizeof(struct pf_state_key_cmp) bytes of 'mem' and initialises both
  * per-key state lists.  'size', 'arg' and 'flags' are not used.
  * Always returns 0.
  */
 static int
 pf_state_key_ctor(void *mem, int size, void *arg, int flags)
 {
 	struct pf_state_key *key;
 
 	key = mem;
 	bzero(key, sizeof(struct pf_state_key_cmp));
 
 	TAILQ_INIT(&key->states[PF_SK_WIRE]);
 	TAILQ_INIT(&key->states[PF_SK_STACK]);
 
 	return (0);
 }
 
 /*
  * Allocate a state key (M_NOWAIT) and fill it from the packet descriptor:
  * 'saddr'/'sport' go into slot pd->sidx, 'daddr'/'dport' into slot
  * pd->didx, and proto/af are copied from 'pd'.
  *
  * Returns the new key, or NULL if the allocation failed.
  */
 struct pf_state_key *
 pf_state_key_setup(struct pf_pdesc *pd, struct pf_addr *saddr,
 	struct pf_addr *daddr, u_int16_t sport, u_int16_t dport)
 {
 	struct pf_state_key *key;
 
 	key = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
 	if (key != NULL) {
 		/* Source side. */
 		PF_ACPY(&key->addr[pd->sidx], saddr, pd->af);
 		key->port[pd->sidx] = sport;
 
 		/* Destination side. */
 		PF_ACPY(&key->addr[pd->didx], daddr, pd->af);
 		key->port[pd->didx] = dport;
 
 		key->proto = pd->proto;
 		key->af = pd->af;
 	}
 
 	return (key);
 }
 
 /*
  * Allocate a new state key (M_NOWAIT) and copy the leading
  * sizeof(struct pf_state_key_cmp) bytes of 'orig' into it.
  *
  * Returns the copy, or NULL if the allocation failed.
  */
 struct pf_state_key *
 pf_state_key_clone(struct pf_state_key *orig)
 {
 	struct pf_state_key *copy;
 
 	copy = uma_zalloc(V_pf_state_key_z, M_NOWAIT);
 	if (copy != NULL)
 		bcopy(orig, copy, sizeof(struct pf_state_key_cmp));
 
 	return (copy);
 }
 
 /*
  * Insert state 's' into the state table, binding it to interface 'kif'
  * and attaching it to the wire key 'skw' and the stack key 'sks'.
  *
  * Returns 0 on success (see the "Returns locked" notes below), the error
  * reported by pf_state_key_attach() if the keys could not be attached, or
  * EEXIST if a state with the same (id, creatorid) pair is already present
  * in the ID hash row.
  */
 int
 pf_state_insert(struct pfi_kif *kif, struct pf_state_key *skw,
     struct pf_state_key *sks, struct pf_state *s)
 {
 	struct pf_idhash *ih;
 	struct pf_state *cur;
 	int error;
 
 	/* Both keys and the state must not be linked anywhere yet. */
 	KASSERT(TAILQ_EMPTY(&sks->states[0]) && TAILQ_EMPTY(&sks->states[1]),
 	    ("%s: sks not pristine", __func__));
 	KASSERT(TAILQ_EMPTY(&skw->states[0]) && TAILQ_EMPTY(&skw->states[1]),
 	    ("%s: skw not pristine", __func__));
 	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
 
 	s->kif = kif;
 
 	/*
 	 * Only generate an ID when neither id nor creatorid has been set.
 	 * The ID is built from a per-CPU counter (reset to 1 once it reaches
 	 * PFID_MAXID) with the CPU number placed at PFID_CPUSHIFT, and is
 	 * stored in big-endian byte order.
 	 */
 	if (s->id == 0 && s->creatorid == 0) {
 		/* XXX: should be atomic, but probability of collision low */
 		if ((s->id = V_pf_stateid[curcpu]++) == PFID_MAXID)
 			V_pf_stateid[curcpu] = 1;
 		s->id |= (uint64_t )curcpu << PFID_CPUSHIFT;
 		s->id = htobe64(s->id);
 		s->creatorid = V_pf_status.hostid;
 	}
 
 	/* Returns with ID locked on success. */
 	if ((error = pf_state_key_attach(skw, sks, s)) != 0)
 		return (error);
 
 	/* Look for an existing state with the same (id, creatorid). */
 	ih = &V_pf_idhash[PF_IDHASH(s)];
 	PF_HASHROW_ASSERT(ih);
 	LIST_FOREACH(cur, &ih->states, entry)
 		if (cur->id == s->id && cur->creatorid == s->creatorid)
 			break;
 
 	if (cur != NULL) {
 		/* Collision: drop the row lock, then undo the key attach. */
 		PF_HASHROW_UNLOCK(ih);
 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
 			printf("pf: state ID collision: "
 			    "id: %016llx creatorid: %08x\n",
 			    (unsigned long long)be64toh(s->id),
 			    ntohl(s->creatorid));
 		}
 		pf_detach_state(s);
 		return (EEXIST);
 	}
 	LIST_INSERT_HEAD(&ih->states, s, entry);
 	/* One for keys, one for ID hash. */
 	refcount_init(&s->refs, 2);
 
 	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_INSERT], 1);
 	/* Notify pfsync, if its insert hook is installed. */
 	if (V_pfsync_insert_state_ptr != NULL)
 		V_pfsync_insert_state_ptr(s);
 
 	/* Returns locked. */
 	return (0);
 }
 
 /*
  * Look a state up by its (id, creatorid) pair.
  *
  * On a hit the state is returned and the ID hash row that holds it is
  * left locked.  On a miss the row is unlocked again and NULL is returned.
  */
 struct pf_state *
 pf_find_state_byid(uint64_t id, uint32_t creatorid)
 {
 	struct pf_idhash *row;
 	struct pf_state *st;
 
 	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
 
 	row = &V_pf_idhash[(be64toh(id) % (pf_hashmask + 1))];
 
 	PF_HASHROW_LOCK(row);
 	LIST_FOREACH(st, &row->states, entry) {
 		if (st->id == id && st->creatorid == creatorid)
 			return (st);	/* row stays locked for the caller */
 	}
 	PF_HASHROW_UNLOCK(row);
 
 	return (NULL);
 }
 
 /*
  * Find state by key.
  * Returns with ID hash slot locked on success.
  *
  * 'dir' selects which per-key state list is searched: PF_IN uses the wire
  * list, any other value the stack list.  A state matches when it is bound
  * to 'kif' or to V_pfi_all; only the first matching state is considered.
  * NULL is returned, with the locks taken here released again, if no key
  * or state matches or if the matching state has timeout >= PFTM_MAX.
  */
 static struct pf_state *
 pf_find_state(struct pfi_kif *kif, struct pf_state_key_cmp *key, u_int dir)
 {
 	struct pf_keyhash	*kh;
 	struct pf_state_key	*sk;
 	struct pf_state		*s;
 	int idx;
 
 	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
 
 	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
 
 	PF_HASHROW_LOCK(kh);
 	/* Keys are compared over their pf_state_key_cmp-sized prefix only. */
 	LIST_FOREACH(sk, &kh->keys, entry)
 		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
 			break;
 	if (sk == NULL) {
 		PF_HASHROW_UNLOCK(kh);
 		return (NULL);
 	}
 
 	idx = (dir == PF_IN ? PF_SK_WIRE : PF_SK_STACK);
 
 	/* List is sorted, if-bound states before floating ones. */
 	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx])
 		if (s->kif == V_pfi_all || s->kif == kif) {
 			/* Take the state lock before letting go of the key row. */
 			PF_STATE_LOCK(s);
 			PF_HASHROW_UNLOCK(kh);
 			if (s->timeout >= PFTM_MAX) {
 				/*
 				 * State is either being processed by
 				 * pf_unlink_state() in another thread, or
 				 * is scheduled for immediate expiry.
 				 */
 				PF_STATE_UNLOCK(s);
 				return (NULL);
 			}
 			return (s);
 		}
 	PF_HASHROW_UNLOCK(kh);
 
 	return (NULL);
 }
 
 /*
  * Find a state by key without filtering on the interface.
  *
  * 'dir' must be PF_IN (wire list), PF_OUT (stack list) or PF_INOUT (wire
  * list first, then stack list); any other value panics.  If 'more' is NULL
  * the first state found is returned immediately.  Otherwise the first
  * state found is returned and *more is incremented once for every
  * additional state seen; *more is not reset by this function.
  *
  * The key hash row is unlocked before returning and no state lock is
  * taken here.  Returns NULL if no key or no state is found.
  */
 struct pf_state *
 pf_find_state_all(struct pf_state_key_cmp *key, u_int dir, int *more)
 {
 	struct pf_keyhash	*kh;
 	struct pf_state_key	*sk;
 	struct pf_state		*s, *ret = NULL;
 	int			 idx, inout = 0;
 
 	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_SEARCH], 1);
 
 	kh = &V_pf_keyhash[pf_hashkey((struct pf_state_key *)key)];
 
 	PF_HASHROW_LOCK(kh);
 	LIST_FOREACH(sk, &kh->keys, entry)
 		if (bcmp(sk, key, sizeof(struct pf_state_key_cmp)) == 0)
 			break;
 	if (sk == NULL) {
 		PF_HASHROW_UNLOCK(kh);
 		return (NULL);
 	}
 	switch (dir) {
 	case PF_IN:
 		idx = PF_SK_WIRE;
 		break;
 	case PF_OUT:
 		idx = PF_SK_STACK;
 		break;
 	case PF_INOUT:
 		/* Start with the wire list; 'inout' requests a second pass. */
 		idx = PF_SK_WIRE;
 		inout = 1;
 		break;
 	default:
 		panic("%s: dir %u", __func__, dir);
 	}
 second_run:
 	TAILQ_FOREACH(s, &sk->states[idx], key_list[idx]) {
 		/* Caller is not counting: the first state is the answer. */
 		if (more == NULL) {
 			PF_HASHROW_UNLOCK(kh);
 			return (s);
 		}
 
 		/* Remember the first state, count every one after it. */
 		if (ret)
 			(*more)++;
 		else
 			ret = s;
 	}
 	if (inout == 1) {
 		/* Second pass of PF_INOUT: repeat the scan on the stack list. */
 		inout = 0;
 		idx = PF_SK_STACK;
 		goto second_run;
 	}
 	PF_HASHROW_UNLOCK(kh);
 
 	return (ret);
 }
 
 /* END state table stuff */
 
 /*
  * Queue 'pfse' for deferred transmission: append it to the tail of
  * V_pf_sendqueue under the send queue lock, then schedule the software
  * interrupt identified by V_pf_swi_cookie.
  */
 static void
 pf_send(struct pf_send_entry *pfse)
 {
 
 	PF_SENDQ_LOCK();
 	STAILQ_INSERT_TAIL(&V_pf_sendqueue, pfse, pfse_next);
 	PF_SENDQ_UNLOCK();
 	swi_sched(V_pf_swi_cookie, 0);
 }
 
 /*
  * Drain the pf send queue.  'v' is the vnet to operate in.
  *
  * The whole queue is taken over under the send queue lock; each entry is
  * then handed to the output or ICMP error routine matching its type and
  * the entry itself is freed.  An unknown entry type panics.
  */
 void
 pf_intr(void *v)
 {
 	struct pf_send_head queue;
 	struct pf_send_entry *pfse, *next;
 
 	CURVNET_SET((struct vnet *)v);
 
 	/*
 	 * Copy the queue head and reset the shared queue, so the entries
 	 * can be processed below without holding the send queue lock.
 	 */
 	PF_SENDQ_LOCK();
 	queue = V_pf_sendqueue;
 	STAILQ_INIT(&V_pf_sendqueue);
 	PF_SENDQ_UNLOCK();
 
 	/* _SAFE variant: each entry is freed inside the loop body. */
 	STAILQ_FOREACH_SAFE(pfse, &queue, pfse_next, next) {
 		switch (pfse->pfse_type) {
 #ifdef INET
 		case PFSE_IP:
 			ip_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL);
 			break;
 		case PFSE_ICMP:
 			icmp_error(pfse->pfse_m, pfse->icmpopts.type,
 			    pfse->icmpopts.code, 0, pfse->icmpopts.mtu);
 			break;
 #endif /* INET */
 #ifdef INET6
 		case PFSE_IP6:
 			ip6_output(pfse->pfse_m, NULL, NULL, 0, NULL, NULL,
 			    NULL);
 			break;
 		case PFSE_ICMP6:
 			icmp6_error(pfse->pfse_m, pfse->icmpopts.type,
 			    pfse->icmpopts.code, pfse->icmpopts.mtu);
 			break;
 #endif /* INET6 */
 		default:
 			panic("%s: unknown type", __func__);
 		}
 		free(pfse, M_PFTEMP);
 	}
 	CURVNET_RESTORE();
 }
 
 /*
  * Main loop of the purge kernel process.
  *
  * Until pf_end_threads becomes non-zero, the loop sleeps on pf_end_lock
  * with a timeout of hz / 10 ticks and then visits every vnet, expiring a
  * slice of the state table and, each time the state-table index wraps
  * back to 0, the other expirable object types.  On exit it increments
  * pf_end_threads, drops pf_end_lock and terminates via kproc_exit().
  */
 void
 pf_purge_thread(void *unused __unused)
 {
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	sx_xlock(&pf_end_lock);
 	while (pf_end_threads == 0) {
 		sx_sleep(pf_purge_thread, &pf_end_lock, 0, "pftm", hz / 10);
 
 		VNET_LIST_RLOCK();
 		VNET_FOREACH(vnet_iter) {
 			CURVNET_SET(vnet_iter);
 
 
 			/* Wait until V_pf_default_rule is initialized. */
 			if (V_pf_vnet_active == 0) {
 				CURVNET_RESTORE();
 				continue;
 			}
 
 			/*
 			 * Process 1/interval fraction of the state
 			 * table every run.  The divisor is the purge
 			 * interval timeout times 10.
 			 * NOTE(review): assumes timeout[PFTM_INTERVAL] is
 			 * never 0 here -- confirm it is enforced where
 			 * the timeout is set.
 			 */
 			V_pf_purge_idx =
 			    pf_purge_expired_states(V_pf_purge_idx, pf_hashmask /
 			    (V_pf_default_rule.timeout[PFTM_INTERVAL] * 10));
 
 			/*
 			 * Purge other expired types every
 			 * PFTM_INTERVAL seconds.
 			 */
 			if (V_pf_purge_idx == 0) {
 				/*
 				 * Order is important:
 				 * - states and src nodes reference rules
 				 * - states and rules reference kifs
 				 */
 				pf_purge_expired_fragments();
 				pf_purge_expired_src_nodes();
 				pf_purge_unlinked_rules();
 				pfi_kif_purge();
 			}
 			CURVNET_RESTORE();
 		}
 		VNET_LIST_RUNLOCK();
 	}
 
 	/* Record that this thread has left its loop, then exit. */
 	pf_end_threads++;
 	sx_xunlock(&pf_end_lock);
 	kproc_exit(0);
 }
 
 /*
  * Run the purge routines in the order needed to release all states,
  * fragments, source nodes, unlinked rules and kifs in one go.
  */
 void
 pf_unload_vnet_purge(void)
 {
 
 	/*
 	 * To clean up all kifs and rules we need
 	 * two runs: first one clears reference flags,
 	 * then pf_purge_expired_states() doesn't
 	 * raise them, and then second run frees.
 	 */
 	pf_purge_unlinked_rules();
 	pfi_kif_purge();
 
 	/*
 	 * Now purge everything.  The state purge is asked to scan the
 	 * ID hash from row 0 for pf_hashmask rows.
 	 */
 	pf_purge_expired_states(0, pf_hashmask);
 	pf_purge_fragments(UINT_MAX);
 	pf_purge_expired_src_nodes();
 
 	/*
 	 * Now all kifs & rules should be unreferenced,
 	 * thus should be successfully freed.
 	 */
 	pf_purge_unlinked_rules();
 	pfi_kif_purge();
 }
 
 
 u_int32_t
 pf_state_expires(const struct pf_state *state)
 {
 	u_int32_t	timeout;
 	u_int32_t	start;
 	u_int32_t	end;
 	u_int32_t	states;
 
 	/* handle all PFTM_* > PFTM_MAX here */
 	if (state->timeout == PFTM_PURGE)
 		return (time_uptime);
 	KASSERT(state->timeout != PFTM_UNLINKED,
 	    ("pf_state_expires: timeout == PFTM_UNLINKED"));
 	KASSERT((state->timeout < PFTM_MAX),
 	    ("pf_state_expires: timeout > PFTM_MAX"));
 	timeout = state->rule.ptr->timeout[state->timeout];
 	if (!timeout)
 		timeout = V_pf_default_rule.timeout[state->timeout];
 	start = state->rule.ptr->timeout[PFTM_ADAPTIVE_START];
 	if (start && state->rule.ptr != &V_pf_default_rule) {
 		end = state->rule.ptr->timeout[PFTM_ADAPTIVE_END];
 		states = counter_u64_fetch(state->rule.ptr->states_cur);
 	} else {
 		start = V_pf_default_rule.timeout[PFTM_ADAPTIVE_START];
 		end = V_pf_default_rule.timeout[PFTM_ADAPTIVE_END];
 		states = V_pf_status.states;
 	}
 	if (end && states > start && start < end) {
 		if (states < end) {
 			timeout = (u_int64_t)timeout * (end - states) /
 			    (end - start);
 			return (state->expire + timeout);
 		}
 		else
 			return (time_uptime);
 	}
 	return (state->expire + timeout);
 }
 
 void
 pf_purge_expired_src_nodes()
 {
 	struct pf_src_node_list	 freelist;
 	struct pf_srchash	*sh;
 	struct pf_src_node	*cur, *next;
 	int i;
 
 	LIST_INIT(&freelist);
 	for (i = 0, sh = V_pf_srchash; i <= pf_srchashmask; i++, sh++) {
 	    PF_HASHROW_LOCK(sh);
 	    LIST_FOREACH_SAFE(cur, &sh->nodes, entry, next)
 		if (cur->states == 0 && cur->expire <= time_uptime) {
 			pf_unlink_src_node(cur);
 			LIST_INSERT_HEAD(&freelist, cur, entry);
 		} else if (cur->rule.ptr != NULL)
 			cur->rule.ptr->rule_flag |= PFRULE_REFS;
 	    PF_HASHROW_UNLOCK(sh);
 	}
 
 	pf_free_src_nodes(&freelist);
 
 	V_pf_status.src_nodes = uma_zone_get_cur(V_pf_sources_z);
 }
 
 /*
  * Drop the references state 's' holds on its source nodes.
  *
  * For s->src_node, and for s->nat_src_node when it is a different node,
  * the node's state count is decremented under the lock of its source hash
  * row; when the count reaches 0 the node's expire time is set to now plus
  * the PFTM_SRC_NODE timeout (taken from the state's rule, or from the
  * default rule when the rule's value is 0).  For s->src_node the
  * connection count is decremented too if s->src.tcp_est is set.  Both
  * pointers in the state are cleared afterwards.
  */
 static void
 pf_src_tree_remove_state(struct pf_state *s)
 {
 	struct pf_src_node *node;
 	struct pf_srchash *row;
 	uint32_t ttl;
 
 	ttl = s->rule.ptr->timeout[PFTM_SRC_NODE];
 	if (ttl == 0)
 		ttl = V_pf_default_rule.timeout[PFTM_SRC_NODE];
 
 	node = s->src_node;
 	if (node != NULL) {
 		row = &V_pf_srchash[pf_hashsrc(&node->addr, node->af)];
 		PF_HASHROW_LOCK(row);
 		if (s->src.tcp_est)
 			--node->conn;
 		if (--node->states == 0)
 			node->expire = time_uptime + ttl;
 		PF_HASHROW_UNLOCK(row);
 	}
 
 	node = s->nat_src_node;
 	if (node != NULL && node != s->src_node) {
 		row = &V_pf_srchash[pf_hashsrc(&node->addr, node->af)];
 		PF_HASHROW_LOCK(row);
 		if (--node->states == 0)
 			node->expire = time_uptime + ttl;
 		PF_HASHROW_UNLOCK(row);
 	}
 
 	s->src_node = s->nat_src_node = NULL;
 }
 
 /*
  * Unlink and potentially free a state. Function may be
  * called with ID hash row locked, but always returns
  * unlocked, since it needs to go through key hash locking.
  *
  * 'flags' may contain PF_ENTER_LOCKED to say that the caller already
  * holds the ID hash row lock; otherwise the lock is taken here.
  *
  * Returns the value of pf_release_state(), or 0 if the state was already
  * marked PFTM_UNLINKED on entry.
  */
 int
 pf_unlink_state(struct pf_state *s, u_int flags)
 {
 	struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(s)];
 
 	if ((flags & PF_ENTER_LOCKED) == 0)
 		PF_HASHROW_LOCK(ih);
 	else
 		PF_HASHROW_ASSERT(ih);
 
 	if (s->timeout == PFTM_UNLINKED) {
 		/*
 		 * State is being processed
 		 * by pf_unlink_state() in
 		 * another thread.
 		 */
 		PF_HASHROW_UNLOCK(ih);
 		return (0);	/* XXXGL: undefined actually */
 	}
 
 	/*
 	 * Source peer is in PF_TCPS_PROXY_DST: send a TCP segment with
 	 * RST|ACK built from the wire key, with slot 1 as source and
 	 * slot 0 as destination.
 	 */
 	if (s->src.state == PF_TCPS_PROXY_DST) {
 		/* XXX wire key the right one? */
 		pf_send_tcp(NULL, s->rule.ptr, s->key[PF_SK_WIRE]->af,
 		    &s->key[PF_SK_WIRE]->addr[1],
 		    &s->key[PF_SK_WIRE]->addr[0],
 		    s->key[PF_SK_WIRE]->port[1],
 		    s->key[PF_SK_WIRE]->port[0],
 		    s->src.seqhi, s->src.seqlo + 1,
 		    TH_RST|TH_ACK, 0, 0, 0, 1, s->tag, NULL);
 	}
 
 	/* Remove from the ID hash row and release source node references. */
 	LIST_REMOVE(s, entry);
 	pf_src_tree_remove_state(s);
 
 	/* Notify pfsync, if its delete hook is installed. */
 	if (V_pfsync_delete_state_ptr != NULL)
 		V_pfsync_delete_state_ptr(s);
 
 	STATE_DEC_COUNTERS(s);
 
 	/* Mark the state so that other threads see it is being unlinked. */
 	s->timeout = PFTM_UNLINKED;
 
 	PF_HASHROW_UNLOCK(ih);
 
 	/* Key detaching takes key hash row locks, hence after the unlock. */
 	pf_detach_state(s);
 	/* pf_state_insert() initialises refs to 2, so we can never release the
 	 * last reference here, only in pf_release_state(). */
 	(void)refcount_release(&s->refs);
 
 	return (pf_release_state(s));
 }
 
 /*
  * Final destruction of a state.  Asserts that no references remain and
  * that the state has been marked PFTM_UNLINKED, runs
  * pf_normalize_tcp_cleanup() on it, returns it to V_pf_state_z and bumps
  * the FCNT_STATE_REMOVALS counter.
  */
 void
 pf_free_state(struct pf_state *cur)
 {
 
 	KASSERT(cur->refs == 0, ("%s: %p has refs", __func__, cur));
 	KASSERT(cur->timeout == PFTM_UNLINKED, ("%s: timeout %u", __func__,
 	    cur->timeout));
 
 	pf_normalize_tcp_cleanup(cur);
 	uma_zfree(V_pf_state_z, cur);
 	counter_u64_add(V_pf_status.fcounters[FCNT_STATE_REMOVALS], 1);
 }
 
 /*
  * Scan up to 'maxcheck' rows of the ID hash starting at row 'i'.  Every
  * state whose expiry time has been reached is unlinked; for the states
  * that remain, the reference flags are raised on the rules and interfaces
  * they use.
  *
  * Returns the index of the next row to scan, or 0 once the end of the
  * hash has been passed.  V_pf_status.states is refreshed from the state
  * zone's current item count on entry and before returning.
  *
  * The original note here read "Called only from pf_purge_thread(), thus
  * serialized."  NOTE(review): pf_unload_vnet_purge() calls this function
  * as well -- confirm the two callers cannot run concurrently.
  */
 static u_int
 pf_purge_expired_states(u_int i, int maxcheck)
 {
 	struct pf_idhash *ih;
 	struct pf_state *s;
 
 	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
 
 	/*
 	 * Go through hash and unlink states that expire now.
 	 */
 	while (maxcheck > 0) {
 
 		ih = &V_pf_idhash[i];
 
 		/* only take the lock if we expect to do work */
 		if (!LIST_EMPTY(&ih->states)) {
 relock:
 			PF_HASHROW_LOCK(ih);
 			LIST_FOREACH(s, &ih->states, entry) {
 				if (pf_state_expires(s) <= time_uptime) {
 					/*
 					 * pf_unlink_state() returns with the
 					 * row unlocked and has removed 's'
 					 * from the list, so retake the lock
 					 * and restart the row scan.
 					 */
 					V_pf_status.states -=
 					    pf_unlink_state(s, PF_ENTER_LOCKED);
 					goto relock;
 				}
 				/* State stays: mark what it references. */
 				s->rule.ptr->rule_flag |= PFRULE_REFS;
 				if (s->nat_rule.ptr != NULL)
 					s->nat_rule.ptr->rule_flag |= PFRULE_REFS;
 				if (s->anchor.ptr != NULL)
 					s->anchor.ptr->rule_flag |= PFRULE_REFS;
 				s->kif->pfik_flags |= PFI_IFLAG_REFS;
 				if (s->rt_kif)
 					s->rt_kif->pfik_flags |= PFI_IFLAG_REFS;
 			}
 			PF_HASHROW_UNLOCK(ih);
 		}
 
 		/* Return when we hit end of hash. */
 		if (++i > pf_hashmask) {
 			V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
 			return (0);
 		}
 
 		maxcheck--;
 	}
 
 	V_pf_status.states = uma_zone_get_cur(V_pf_state_z);
 
 	return (i);
 }
 
 /*
  * Free unlinked rules that are no longer referenced.
  *
  * Rules on V_pf_unlinked_rules whose PFRULE_REFS flag is clear are moved
  * to a temporary queue and freed under the rules write lock; rules whose
  * flag is set merely get the flag cleared, so that they are freed on a
  * later run unless something raises the flag again in between.  Nothing
  * is done while the overload queue is non-empty.
  */
 static void
 pf_purge_unlinked_rules(void)
 {
 	struct pf_rulequeue tmpq;
 	struct pf_rule *r, *r1;
 
 	/*
 	 * If we have overloading task pending, then we'd
 	 * better skip purging this time. There is a tiny
 	 * probability that overloading task references
 	 * an already unlinked rule.
 	 */
 	PF_OVERLOADQ_LOCK();
 	if (!SLIST_EMPTY(&V_pf_overloadqueue)) {
 		PF_OVERLOADQ_UNLOCK();
 		return;
 	}
 	PF_OVERLOADQ_UNLOCK();
 
 	/*
 	 * Do naive mark-and-sweep garbage collecting of old rules.
 	 * Reference flag is raised by pf_purge_expired_states()
 	 * and pf_purge_expired_src_nodes().
 	 *
 	 * To avoid LOR between PF_UNLNKDRULES_LOCK/PF_RULES_WLOCK,
 	 * use a temporary queue.
 	 */
 	TAILQ_INIT(&tmpq);
 	PF_UNLNKDRULES_LOCK();
 	TAILQ_FOREACH_SAFE(r, &V_pf_unlinked_rules, entries, r1) {
 		if (!(r->rule_flag & PFRULE_REFS)) {
 			/* Unmarked: sweep it onto the private queue. */
 			TAILQ_REMOVE(&V_pf_unlinked_rules, r, entries);
 			TAILQ_INSERT_TAIL(&tmpq, r, entries);
 		} else
 			r->rule_flag &= ~PFRULE_REFS;
 	}
 	PF_UNLNKDRULES_UNLOCK();
 
 	if (!TAILQ_EMPTY(&tmpq)) {
 		PF_RULES_WLOCK();
 		TAILQ_FOREACH_SAFE(r, &tmpq, entries, r1) {
 			TAILQ_REMOVE(&tmpq, r, entries);
 			pf_free_rule(r);
 		}
 		PF_RULES_WUNLOCK();
 	}
 }
 
 /*
  * Print an address and optional port with printf().
  *
  * 'addr' is the address, 'p' the port (passed through ntohs() before
  * printing; 0 means "no port") and 'af' the address family.
  *
  * AF_INET is printed as a dotted quad followed by ":port".  AF_INET6 is
  * printed as colon-separated hex groups with the longest run of at least
  * two all-zero groups collapsed to "::" (the first such run wins on a
  * tie), followed by "[port]".  Any other 'af' prints nothing.
  */
 void
 pf_print_host(struct pf_addr *addr, u_int16_t p, sa_family_t af)
 {
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		u_int32_t a = ntohl(addr->addr32[0]);
 		printf("%u.%u.%u.%u", (a>>24)&255, (a>>16)&255,
 		    (a>>8)&255, a&255);
 		if (p) {
 			p = ntohs(p);
 			printf(":%u", p);
 		}
 		break;
 	}
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6: {
 		u_int16_t b;
 		u_int8_t i, curstart, curend, maxstart, maxend;
 		/* 255 is the "no run" sentinel for all four indices. */
 		curstart = curend = maxstart = maxend = 255;
 		/* Find the longest run of zero groups: [maxstart, maxend]. */
 		for (i = 0; i < 8; i++) {
 			if (!addr->addr16[i]) {
 				if (curstart == 255)
 					curstart = i;
 				curend = i;
 			} else {
 				/* A run just ended; keep it if strictly longer. */
 				if ((curend - curstart) >
 				    (maxend - maxstart)) {
 					maxstart = curstart;
 					maxend = curend;
 				}
 				curstart = curend = 255;
 			}
 		}
 		/* Account for a run that extends to the last group. */
 		if ((curend - curstart) >
 		    (maxend - maxstart)) {
 			maxstart = curstart;
 			maxend = curend;
 		}
 		for (i = 0; i < 8; i++) {
 			if (i >= maxstart && i <= maxend) {
 				/*
 				 * Inside the collapsed run: emit one ':' if the
 				 * run starts the address and one ':' at its end.
 				 */
 				if (i == 0)
 					printf(":");
 				if (i == maxend)
 					printf(":");
 			} else {
 				b = ntohs(addr->addr16[i]);
 				printf("%x", b);
 				if (i < 7)
 					printf(":");
 			}
 		}
 		if (p) {
 			p = ntohs(p);
 			printf("[%u]", p);
 		}
 		break;
 	}
 #endif /* INET6 */
 	}
 }
 
 /*
  * Print state 's' using the keys attached to the state itself; this is
  * pf_print_state_parts() with no key overrides.
  */
 void
 pf_print_state(struct pf_state *s)
 {
 	pf_print_state_parts(s, NULL, NULL);
 }
 
 /*
  * Print a one-line description of a state with printf().
  *
  * 's' may be NULL.  'skwp' and 'sksp', when non-NULL, are used as wire
  * and stack key in place of the keys attached to 's'.  The output is the
  * protocol name (or number), the direction, the wire and stack address
  * pairs where a key is available and, if 's' is given, the peer states;
  * for TCP the sequence tracking values of both peers are printed too.
  */
 static void
 pf_print_state_parts(struct pf_state *s,
     struct pf_state_key *skwp, struct pf_state_key *sksp)
 {
 	struct pf_state_key *skw, *sks;
 	u_int8_t proto, dir;
 
 	/* Do our best to fill these, but they're skipped if NULL */
 	skw = skwp ? skwp : (s ? s->key[PF_SK_WIRE] : NULL);
 	sks = sksp ? sksp : (s ? s->key[PF_SK_STACK] : NULL);
 	proto = skw ? skw->proto : (sks ? sks->proto : 0);
 	dir = s ? s->direction : 0;
 
 	switch (proto) {
 	case IPPROTO_IPV4:
 		printf("IPv4");
 		break;
 	case IPPROTO_IPV6:
 		printf("IPv6");
 		break;
 	case IPPROTO_TCP:
 		printf("TCP");
 		break;
 	case IPPROTO_UDP:
 		printf("UDP");
 		break;
 	case IPPROTO_ICMP:
 		printf("ICMP");
 		break;
 	case IPPROTO_ICMPV6:
 		printf("ICMPv6");
 		break;
 	default:
 		printf("%u", proto);
 		break;
 	}
 	/* Any direction other than PF_IN/PF_OUT prints nothing. */
 	switch (dir) {
 	case PF_IN:
 		printf(" in");
 		break;
 	case PF_OUT:
 		printf(" out");
 		break;
 	}
 	if (skw) {
 		printf(" wire: ");
 		pf_print_host(&skw->addr[0], skw->port[0], skw->af);
 		printf(" ");
 		pf_print_host(&skw->addr[1], skw->port[1], skw->af);
 	}
 	if (sks) {
 		printf(" stack: ");
 		/* A stack key identical to the wire key is shown as "-". */
 		if (sks != skw) {
 			pf_print_host(&sks->addr[0], sks->port[0], sks->af);
 			printf(" ");
 			pf_print_host(&sks->addr[1], sks->port[1], sks->af);
 		} else
 			printf("-");
 	}
 	if (s) {
 		if (proto == IPPROTO_TCP) {
 			printf(" [lo=%u high=%u win=%u modulator=%u",
 			    s->src.seqlo, s->src.seqhi,
 			    s->src.max_win, s->src.seqdiff);
 			/* wscale is shown only when both peers have it set. */
 			if (s->src.wscale && s->dst.wscale)
 				printf(" wscale=%u",
 				    s->src.wscale & PF_WSCALE_MASK);
 			printf("]");
 			printf(" [lo=%u high=%u win=%u modulator=%u",
 			    s->dst.seqlo, s->dst.seqhi,
 			    s->dst.max_win, s->dst.seqdiff);
 			if (s->src.wscale && s->dst.wscale)
 				printf(" wscale=%u",
 				s->dst.wscale & PF_WSCALE_MASK);
 			printf("]");
 		}
 		printf(" %u:%u", s->src.state, s->dst.state);
 	}
 }
 
 /*
  * Print the TCP flags in 'f' as a leading space followed by one letter
  * per flag that is set, in the fixed order F, S, R, P, A, U, E, W.
  * Nothing at all is printed when 'f' is 0.
  */
 void
 pf_print_flags(u_int8_t f)
 {
 	if (f == 0)
 		return;
 
 	printf(" ");
 
 	if ((f & TH_FIN) != 0)
 		printf("F");
 	if ((f & TH_SYN) != 0)
 		printf("S");
 	if ((f & TH_RST) != 0)
 		printf("R");
 	if ((f & TH_PUSH) != 0)
 		printf("P");
 	if ((f & TH_ACK) != 0)
 		printf("A");
 	if ((f & TH_URG) != 0)
 		printf("U");
 	if ((f & TH_ECE) != 0)
 		printf("E");
 	if ((f & TH_CWR) != 0)
 		printf("W");
 }
 
 /*
  * For skip criterion 'i', point the skip pointer of every rule from
  * head[i] up to, but not including, 'cur' at 'cur', leaving head[i] equal
  * to 'cur'.  Uses the locals 'head' and 'cur' of the enclosing function.
  */
 #define	PF_SET_SKIP_STEPS(i)					\
 	do {							\
 		while (head[i] != cur) {			\
 			head[i]->skip[i].ptr = cur;		\
 			head[i] = TAILQ_NEXT(head[i], entries);	\
 		}						\
 	} while (0)
 
 /*
  * Compute the skip pointers for every rule on the queue 'rules'.
  *
  * For each of the PF_SKIP_COUNT criteria, every rule's skip[i].ptr is set
  * to the next rule in the queue whose value for that criterion differs
  * from the rule before it, or to NULL if there is no such rule.
  */
 void
 pf_calc_skip_steps(struct pf_rulequeue *rules)
 {
 	struct pf_rule *cur, *prev, *head[PF_SKIP_COUNT];
 	int i;
 
 	cur = TAILQ_FIRST(rules);
 	prev = cur;
 	/* head[i]: first rule of the current run of equal values. */
 	for (i = 0; i < PF_SKIP_COUNT; ++i)
 		head[i] = cur;
 	while (cur != NULL) {
 
 		/* Whenever a criterion changes, close its current run. */
 		if (cur->kif != prev->kif || cur->ifnot != prev->ifnot)
 			PF_SET_SKIP_STEPS(PF_SKIP_IFP);
 		if (cur->direction != prev->direction)
 			PF_SET_SKIP_STEPS(PF_SKIP_DIR);
 		if (cur->af != prev->af)
 			PF_SET_SKIP_STEPS(PF_SKIP_AF);
 		if (cur->proto != prev->proto)
 			PF_SET_SKIP_STEPS(PF_SKIP_PROTO);
 		if (cur->src.neg != prev->src.neg ||
 		    pf_addr_wrap_neq(&cur->src.addr, &prev->src.addr))
 			PF_SET_SKIP_STEPS(PF_SKIP_SRC_ADDR);
 		if (cur->src.port[0] != prev->src.port[0] ||
 		    cur->src.port[1] != prev->src.port[1] ||
 		    cur->src.port_op != prev->src.port_op)
 			PF_SET_SKIP_STEPS(PF_SKIP_SRC_PORT);
 		if (cur->dst.neg != prev->dst.neg ||
 		    pf_addr_wrap_neq(&cur->dst.addr, &prev->dst.addr))
 			PF_SET_SKIP_STEPS(PF_SKIP_DST_ADDR);
 		if (cur->dst.port[0] != prev->dst.port[0] ||
 		    cur->dst.port[1] != prev->dst.port[1] ||
 		    cur->dst.port_op != prev->dst.port_op)
 			PF_SET_SKIP_STEPS(PF_SKIP_DST_PORT);
 
 		prev = cur;
 		cur = TAILQ_NEXT(cur, entries);
 	}
 	/* cur is NULL here: the trailing runs get a NULL skip pointer. */
 	for (i = 0; i < PF_SKIP_COUNT; ++i)
 		PF_SET_SKIP_STEPS(i);
 }
 
 /*
  * Compare two address wrappers.  Returns 1 if they differ and 0 if they
  * are considered equal.
  *
  * Wrappers of different type always differ.  Address/mask and range
  * wrappers are compared on address and mask, dynamic-interface wrappers
  * on their pfid_kt pointer, table wrappers on their table pointer.
  * No-route and urpf-failed wrappers of the same type are always equal.
  * An unknown type is reported with printf() and treated as different.
  */
 static int
 pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
 {
 	int differ;
 
 	if (aw1->type != aw2->type)
 		return (1);
 
 	switch (aw1->type) {
 	case PF_ADDR_ADDRMASK:
 	case PF_ADDR_RANGE:
 		differ = 0;
 		if (PF_ANEQ(&aw1->v.a.addr, &aw2->v.a.addr, AF_INET6))
 			differ = 1;
 		else if (PF_ANEQ(&aw1->v.a.mask, &aw2->v.a.mask, AF_INET6))
 			differ = 1;
 		break;
 	case PF_ADDR_DYNIFTL:
 		differ = (aw1->p.dyn->pfid_kt != aw2->p.dyn->pfid_kt);
 		break;
 	case PF_ADDR_TABLE:
 		differ = (aw1->p.tbl != aw2->p.tbl);
 		break;
 	case PF_ADDR_NOROUTE:
 	case PF_ADDR_URPFFAILED:
 		differ = 0;
 		break;
 	default:
 		printf("invalid address type: %d\n", aw1->type);
 		differ = 1;
 		break;
 	}
 
 	return (differ);
 }
 
 /**
  * Checksum updates are a little complicated because the checksum in the
  * TCP/UDP header isn't always a full checksum.  In some cases (i.e. output)
  * it's a pseudo-header checksum, which is a partial checksum over src/dst
  * IP addresses, protocol number and length.
  *
  * That means we have the following cases:
  *  * Input or forwarding: we don't have TSO, the checksum fields are full
  *	checksums, we need to update the checksum whenever we change anything.
  *  * Output (i.e. the checksum is a pseudo-header checksum):
  *	x The field being updated is src/dst address or affects the length of
  *	the packet.  We need to update the pseudo-header checksum (note that
  *	this checksum is not ones' complement).
  *	x Some other field is being modified (e.g. src/dst port numbers): We
  *	don't have to update anything.
  *
  * pf_cksum_fixup() adjusts 'cksum' for one 16-bit word changing from 'old'
  * to 'new' and returns the adjusted value.  With 'udp' set, a 'cksum' of
  * 0 is returned unchanged and a result of 0 is returned as 0xFFFF.
  **/
 u_int16_t
 pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
 {
 	u_int32_t	sum;
 
 	if (udp && cksum == 0)
 		return (0x0000);
 
 	sum = cksum + old - new;
 	/* Add the upper half onto the lower half, keep 16 bits. */
 	sum = ((sum >> 16) + (sum & 0xffff)) & 0xffff;
 
 	if (sum == 0 && udp)
 		return (0xFFFF);
 	return (sum);
 }
 
 /*
  * Protocol checksum variant of pf_cksum_fixup(): when mbuf 'm' has either
  * CSUM_DELAY_DATA or CSUM_DELAY_DATA_IPV6 set in its checksum flags,
  * 'cksum' is returned untouched; otherwise the result of
  * pf_cksum_fixup() is returned.
  */
 u_int16_t
 pf_proto_cksum_fixup(struct mbuf *m, u_int16_t cksum, u_int16_t old,
         u_int16_t new, u_int8_t udp)
 {
 	if (!(m->m_pkthdr.csum_flags &
 	    (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6)))
 		return (pf_cksum_fixup(cksum, old, new, udp));
 
 	return (cksum);
 }
 
 /*
  * Rewrite an address and port and fix up the affected checksums.
  *
  * 'a' and 'p' point at the address and port to replace, 'an' and 'pn' are
  * the new values.  'ic' is a checksum that is only updated for AF_INET,
  * for the address change (presumably the IP header checksum -- confirm
  * against callers).  'pc' is the protocol checksum; 'u' is passed to the
  * checksum helpers as their 'udp' argument.
  *
  * If 'm' has CSUM_DELAY_DATA or CSUM_DELAY_DATA_IPV6 set, *pc is
  * complemented before the address fixups and complemented back afterwards
  * (a resulting 0 is stored as 0xffff); in that case the port change itself
  * is not folded in, because pf_proto_cksum_fixup() leaves *pc alone.
  */
 static void
 pf_change_ap(struct mbuf *m, struct pf_addr *a, u_int16_t *p, u_int16_t *ic,
         u_int16_t *pc, struct pf_addr *an, u_int16_t pn, u_int8_t u,
         sa_family_t af)
 {
 	struct pf_addr	ao;
 	u_int16_t	po = *p;
 
 	/* Save the old address, then install the new one. */
 	PF_ACPY(&ao, a, af);
 	PF_ACPY(a, an, af);
 
 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_DELAY_DATA_IPV6))
 		*pc = ~*pc;
 
 	*p = pn;
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
 		    ao.addr16[0], an->addr16[0], 0),
 		    ao.addr16[1], an->addr16[1], 0);
 		/* NOTE(review): repeats the store done before the switch. */
 		*p = pn;
 
 		*pc = pf_cksum_fixup(pf_cksum_fixup(*pc,
 		    ao.addr16[0], an->addr16[0], u),
 		    ao.addr16[1], an->addr16[1], u);
 
 		*pc = pf_proto_cksum_fixup(m, *pc, po, pn, u);
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		/* One fixup per 16-bit word of the address, innermost first. */
 		*pc = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
 		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
 		    pf_cksum_fixup(pf_cksum_fixup(*pc,
 		    ao.addr16[0], an->addr16[0], u),
 		    ao.addr16[1], an->addr16[1], u),
 		    ao.addr16[2], an->addr16[2], u),
 		    ao.addr16[3], an->addr16[3], u),
 		    ao.addr16[4], an->addr16[4], u),
 		    ao.addr16[5], an->addr16[5], u),
 		    ao.addr16[6], an->addr16[6], u),
 		    ao.addr16[7], an->addr16[7], u);
 
 		*pc = pf_proto_cksum_fixup(m, *pc, po, pn, u);
 		break;
 #endif /* INET6 */
 	}
 
 	if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | 
 	    CSUM_DELAY_DATA_IPV6)) {
 		*pc = ~*pc;
 		if (! *pc)
 			*pc = 0xffff;
 	}
 }
 
 /*
  * Replace the u_int32_t stored at 'a' with 'an' and adjust the checksum
  * at 'c' for the change, one 16-bit half at a time.  'a' is a void * and
  * is accessed with memcpy(), so it carries no alignment restriction.
  * 'u' is passed to pf_cksum_fixup() as its 'udp' argument.
  */
 void
 pf_change_a(void *a, u_int16_t *c, u_int32_t an, u_int8_t u)
 {
 	u_int32_t	prev;
 	u_int16_t	sum;
 
 	memcpy(&prev, a, sizeof(prev));
 	memcpy(a, &an, sizeof(u_int32_t));
 
 	/* Upper halves first, then lower halves. */
 	sum = pf_cksum_fixup(*c, prev / 65536, an / 65536, u);
 	sum = pf_cksum_fixup(sum, prev % 65536, an % 65536, u);
 	*c = sum;
 }
 
 /*
  * Like pf_change_a(), but for a protocol checksum: the u_int32_t at 'a' is
  * replaced with 'an' and the checksum at 'c' is adjusted through
  * pf_proto_cksum_fixup(), which takes mbuf 'm' into account.
  */
 void
 pf_change_proto_a(struct mbuf *m, void *a, u_int16_t *c, u_int32_t an, u_int8_t udp)
 {
 	u_int32_t	prev;
 	u_int16_t	sum;
 
 	memcpy(&prev, a, sizeof(prev));
 	memcpy(a, &an, sizeof(u_int32_t));
 
 	/* Upper halves first, then lower halves. */
 	sum = pf_proto_cksum_fixup(m, *c, prev / 65536, an / 65536, udp);
 	sum = pf_proto_cksum_fixup(m, sum, prev % 65536, an % 65536, udp);
 	*c = sum;
 }
 
 #ifdef INET6
 /*
  * Replace the IPv6 address at 'a' with 'an' and adjust the checksum at
  * 'c' for each of the eight 16-bit words of the address, in order.
  * 'u' is passed to pf_cksum_fixup() as its 'udp' argument.
  */
 static void
 pf_change_a6(struct pf_addr *a, u_int16_t *c, struct pf_addr *an, u_int8_t u)
 {
 	struct pf_addr	ao;
 	u_int16_t	sum;
 	int		w;
 
 	PF_ACPY(&ao, a, AF_INET6);
 	PF_ACPY(a, an, AF_INET6);
 
 	sum = *c;
 	for (w = 0; w < 8; w++)
 		sum = pf_cksum_fixup(sum, ao.addr16[w], an->addr16[w], u);
 	*c = sum;
 }
 #endif /* INET6 */
 
 /*
  * Rewrite the address (and optionally the port) quoted inside an ICMP
  * message, plus optionally the outer address, fixing up every checksum
  * that covers the changed bytes.
  *
  *	ia	inner address to replace
  *	ip	inner port to replace, or NULL to leave the port alone
  *	oa	outer address to replace, or NULL to leave it alone
  *	na	new address, written to 'ia' and, if given, to 'oa'
  *	np	new port, written to *ip
  *	pc	inner protocol checksum, or NULL if there is none to fix
  *	h2c	inner IP header checksum (AF_INET only)
  *	ic	ICMP checksum
  *	hc	outer IP header checksum (AF_INET only)
  *	u	passed to pf_cksum_fixup() as its 'udp' argument
  *	af	address family selecting the AF_INET or AF_INET6 handling
  */
 static void
 pf_change_icmp(struct pf_addr *ia, u_int16_t *ip, struct pf_addr *oa,
     struct pf_addr *na, u_int16_t np, u_int16_t *pc, u_int16_t *h2c,
     u_int16_t *ic, u_int16_t *hc, u_int8_t u, sa_family_t af)
 {
 	struct pf_addr	oia, ooa;
 
 	/* Remember the old values; they are needed for the fixups. */
 	PF_ACPY(&oia, ia, af);
 	if (oa)
 		PF_ACPY(&ooa, oa, af);
 
 	/* Change inner protocol port, fix inner protocol checksum. */
 	if (ip != NULL) {
 		u_int16_t	oip = *ip;
 		u_int32_t	opc;
 
 		/* opc is only set, and only read, when pc != NULL. */
 		if (pc != NULL)
 			opc = *pc;
 		*ip = np;
 		if (pc != NULL)
 			*pc = pf_cksum_fixup(*pc, oip, *ip, u);
 		/* The ICMP checksum covers the port ... */
 		*ic = pf_cksum_fixup(*ic, oip, *ip, 0);
 		/* ... and the inner protocol checksum that just changed. */
 		if (pc != NULL)
 			*ic = pf_cksum_fixup(*ic, opc, *pc, 0);
 	}
 	/* Change inner ip address, fix inner ip and icmp checksums. */
 	PF_ACPY(ia, na, af);
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		u_int32_t	 oh2c = *h2c;
 
 		*h2c = pf_cksum_fixup(pf_cksum_fixup(*h2c,
 		    oia.addr16[0], ia->addr16[0], 0),
 		    oia.addr16[1], ia->addr16[1], 0);
 		*ic = pf_cksum_fixup(pf_cksum_fixup(*ic,
 		    oia.addr16[0], ia->addr16[0], 0),
 		    oia.addr16[1], ia->addr16[1], 0);
 		/* The inner header checksum changed too; fold that in. */
 		*ic = pf_cksum_fixup(*ic, oh2c, *h2c, 0);
 		break;
 	}
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
 		    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
 		    pf_cksum_fixup(pf_cksum_fixup(*ic,
 		    oia.addr16[0], ia->addr16[0], u),
 		    oia.addr16[1], ia->addr16[1], u),
 		    oia.addr16[2], ia->addr16[2], u),
 		    oia.addr16[3], ia->addr16[3], u),
 		    oia.addr16[4], ia->addr16[4], u),
 		    oia.addr16[5], ia->addr16[5], u),
 		    oia.addr16[6], ia->addr16[6], u),
 		    oia.addr16[7], ia->addr16[7], u);
 		break;
 #endif /* INET6 */
 	}
 	/* Outer ip address, fix outer ip or icmpv6 checksum, if necessary. */
 	if (oa) {
 		PF_ACPY(oa, na, af);
 		switch (af) {
 #ifdef INET
 		case AF_INET:
 			*hc = pf_cksum_fixup(pf_cksum_fixup(*hc,
 			    ooa.addr16[0], oa->addr16[0], 0),
 			    ooa.addr16[1], oa->addr16[1], 0);
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 			*ic = pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
 			    pf_cksum_fixup(pf_cksum_fixup(pf_cksum_fixup(
 			    pf_cksum_fixup(pf_cksum_fixup(*ic,
 			    ooa.addr16[0], oa->addr16[0], u),
 			    ooa.addr16[1], oa->addr16[1], u),
 			    ooa.addr16[2], oa->addr16[2], u),
 			    ooa.addr16[3], oa->addr16[3], u),
 			    ooa.addr16[4], oa->addr16[4], u),
 			    ooa.addr16[5], oa->addr16[5], u),
 			    ooa.addr16[6], oa->addr16[6], u),
 			    ooa.addr16[7], oa->addr16[7], u);
 			break;
 #endif /* INET6 */
 		}
 	}
 }
 
 
 /*
  * Need to modulate the sequence numbers in the TCP SACK option
  * (credits to Krzysztof Pfaff for report and patch)
  *
  * The TCP options of the segment at offset 'off' in 'm' are copied out,
  * every SACK block edge has dst->seqdiff subtracted from it (with the TCP
  * checksum in 'th' adjusted accordingly), and the options are copied back
  * into the mbuf if a SACK option was processed.
  *
  * Returns 1 if the options were written back to the mbuf, 0 otherwise
  * (including when the options are too short or cannot be pulled).
  */
 static int
 pf_modulate_sack(struct mbuf *m, int off, struct pf_pdesc *pd,
     struct tcphdr *th, struct pf_state_peer *dst)
 {
 	/* hlen: option bytes still to parse; thoptlen: total option bytes. */
 	int hlen = (th->th_off << 2) - sizeof(*th), thoptlen = hlen;
 	u_int8_t opts[TCP_MAXOLEN], *opt = opts;
 	int copyback = 0, i, olen;
 	struct sackblk sack;
 
 	/* Smallest SACK option worth looking at: one block plus 2 bytes. */
 #define	TCPOLEN_SACKLEN	(TCPOLEN_SACK + 2)
 	if (hlen < TCPOLEN_SACKLEN ||
 	    !pf_pull_hdr(m, off + sizeof(*th), opts, hlen, NULL, NULL, pd->af))
 		return 0;
 
 	while (hlen >= TCPOLEN_SACKLEN) {
 		/* Length byte; not used for the one-byte EOL/NOP options. */
 		olen = opt[1];
 		switch (*opt) {
 		case TCPOPT_EOL:	/* FALLTHROUGH */
 		case TCPOPT_NOP:
 			opt++;
 			hlen--;
 			break;
 		case TCPOPT_SACK:
 			/* Never walk past the end of the copied options. */
 			if (olen > hlen)
 				olen = hlen;
 			if (olen >= TCPOLEN_SACKLEN) {
 				for (i = 2; i + TCPOLEN_SACK <= olen;
 				    i += TCPOLEN_SACK) {
 					/* Work on a copy via memcpy(). */
 					memcpy(&sack, &opt[i], sizeof(sack));
 					pf_change_proto_a(m, &sack.start, &th->th_sum,
 					    htonl(ntohl(sack.start) - dst->seqdiff), 0);
 					pf_change_proto_a(m, &sack.end, &th->th_sum,
 					    htonl(ntohl(sack.end) - dst->seqdiff), 0);
 					memcpy(&opt[i], &sack, sizeof(sack));
 				}
 				copyback = 1;
 			}
 			/* FALLTHROUGH */
 		default:
 			/* Always advance by at least 2 so the loop ends. */
 			if (olen < 2)
 				olen = 2;
 			hlen -= olen;
 			opt += olen;
 		}
 	}
 
 	if (copyback)
 		m_copyback(m, off + sizeof(*th), thoptlen, (caddr_t)opts);
 	return (copyback);
 }
 
 /*
  * Build a TCP segment without payload and queue it through pf_send().
  *
  * The segment is assembled in a freshly allocated mbuf: an IPv4 or IPv6
  * header chosen by "af", followed by a TCP header carrying the given
  * ports, sequence/ack numbers, flags and window, plus a 4-byte MSS option
  * when "mss" is non-zero.  sport and dport are stored exactly as passed;
  * seq, ack, win and mss are converted with htonl()/htons()/HTONS().
  *
  * r may be NULL.  When it is set, a non-negative r->rtableid selects the
  * FIB and, under ALTQ, a non-zero r->qid is copied into the pf mbuf tag.
  * A non-zero "tag" marks the mbuf M_SKIP_FIREWALL and "rtag" is stored as
  * the pf tag.  For IPv4 a "ttl" of 0 selects V_ip_defttl; the IPv6 path
  * always uses IPV6_DEFHLIM.  "replyto" and "ifp" are not referenced in
  * this body.
  *
  * Allocation failures are silent (the function just returns); an
  * unsupported "af" panics.
  */
 static void
 pf_send_tcp(struct mbuf *replyto, const struct pf_rule *r, sa_family_t af,
     const struct pf_addr *saddr, const struct pf_addr *daddr,
     u_int16_t sport, u_int16_t dport, u_int32_t seq, u_int32_t ack,
     u_int8_t flags, u_int16_t win, u_int16_t mss, u_int8_t ttl, int tag,
     u_int16_t rtag, struct ifnet *ifp)
 {
 	struct pf_send_entry *pfse;
 	struct mbuf	*m;
 	int		 len, tlen;
 #ifdef INET
 	struct ip	*h = NULL;
 #endif /* INET */
 #ifdef INET6
 	struct ip6_hdr	*h6 = NULL;
 #endif /* INET6 */
 	struct tcphdr	*th;
 	char		*opt;
 	struct pf_mtag  *pf_mtag;
 
 	len = 0;
 	th = NULL;
 
 	/* maximum segment size tcp option */
 	tlen = sizeof(struct tcphdr);
 	if (mss)
 		tlen += 4;
 
 	/* len = IP header + TCP header (+ MSS option); tlen = TCP part only */
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		len = sizeof(struct ip) + tlen;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		len = sizeof(struct ip6_hdr) + tlen;
 		break;
 #endif /* INET6 */
 	default:
 		panic("%s: unsupported af %d", __func__, af);
 	}
 
 	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
 	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
 	if (pfse == NULL)
 		return;
 	m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL) {
 		free(pfse, M_PFTEMP);
 		return;
 	}
 #ifdef MAC
 	mac_netinet_firewall_send(m);
 #endif
 	if ((pf_mtag = pf_get_mtag(m)) == NULL) {
 		/* Undo both allocations made so far. */
 		free(pfse, M_PFTEMP);
 		m_freem(m);
 		return;
 	}
 	if (tag)
 		m->m_flags |= M_SKIP_FIREWALL;
 	pf_mtag->tag = rtag;
 
 	if (r != NULL && r->rtableid >= 0)
 		M_SETFIB(m, r->rtableid);
 
 #ifdef ALTQ
 	if (r != NULL && r->qid) {
 		pf_mtag->qid = r->qid;
 
 		/* add hints for ecn */
 		pf_mtag->hdr = mtod(m, struct ip *);
 	}
 #endif /* ALTQ */
 	/* Skip max_linkhdr bytes, then zero the len bytes of header space. */
 	m->m_data += max_linkhdr;
 	m->m_pkthdr.len = m->m_len = len;
 	m->m_pkthdr.rcvif = NULL;
 	bzero(m->m_data, len);
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		h = mtod(m, struct ip *);
 
 		/* IP header fields included in the TCP checksum */
 		h->ip_p = IPPROTO_TCP;
 		h->ip_len = htons(tlen);
 		h->ip_src.s_addr = saddr->v4.s_addr;
 		h->ip_dst.s_addr = daddr->v4.s_addr;
 
 		th = (struct tcphdr *)((caddr_t)h + sizeof(struct ip));
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		h6 = mtod(m, struct ip6_hdr *);
 
 		/* IP header fields included in the TCP checksum */
 		h6->ip6_nxt = IPPROTO_TCP;
 		h6->ip6_plen = htons(tlen);
 		memcpy(&h6->ip6_src, &saddr->v6, sizeof(struct in6_addr));
 		memcpy(&h6->ip6_dst, &daddr->v6, sizeof(struct in6_addr));
 
 		th = (struct tcphdr *)((caddr_t)h6 + sizeof(struct ip6_hdr));
 		break;
 #endif /* INET6 */
 	}
 
 	/* TCP header */
 	th->th_sport = sport;
 	th->th_dport = dport;
 	th->th_seq = htonl(seq);
 	th->th_ack = htonl(ack);
 	th->th_off = tlen >> 2;
 	th->th_flags = flags;
 	th->th_win = htons(win);
 
 	if (mss) {
 		/* MSS option: kind, length 4, 16-bit value in network order */
 		opt = (char *)(th + 1);
 		opt[0] = TCPOPT_MAXSEG;
 		opt[1] = 4;
 		HTONS(mss);
 		bcopy((caddr_t)&mss, (caddr_t)(opt + 2), 2);
 	}
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		/* TCP checksum */
 		th->th_sum = in_cksum(m, len);
 
 		/* Finish the IP header */
 		h->ip_v = 4;
 		h->ip_hl = sizeof(*h) >> 2;
 		h->ip_tos = IPTOS_LOWDELAY;
 		h->ip_off = htons(V_path_mtu_discovery ? IP_DF : 0);
 		/* ip_len held tlen for the checksum above; now the full length */
 		h->ip_len = htons(len);
 		h->ip_ttl = ttl ? ttl : V_ip_defttl;
 		h->ip_sum = 0;
 
 		pfse->pfse_type = PFSE_IP;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		/* TCP checksum */
 		th->th_sum = in6_cksum(m, IPPROTO_TCP,
 		    sizeof(struct ip6_hdr), tlen);
 
 		h6->ip6_vfc |= IPV6_VERSION;
 		h6->ip6_hlim = IPV6_DEFHLIM;
 
 		pfse->pfse_type = PFSE_IP6;
 		break;
 #endif /* INET6 */
 	}
 	/* Hand the finished packet to the pf send queue. */
 	pfse->pfse_m = m;
 	pf_send(pfse);
 }
 
 /*
  * React to a packet according to the return options of rule r.
  *
  * If a translation rule nr was applied, the addresses, ports and checksums
  * recorded before translation (sk, bproto_sum, bip_sum) are restored and
  * the protocol header is copied back into the mbuf first.
  *
  * A TCP packet without RST that hit a rule flagged PFRULE_RETURNRST or
  * PFRULE_RETURN is answered with RST|ACK through pf_send_tcp(), unless
  * pf_check_proto_cksum() reports a problem, in which case only *reason is
  * set to PFRES_PROTCKSUM.  Otherwise an ICMP or ICMPv6 error is sent when
  * the rule carries return_icmp / return_icmp6 for the address family and
  * the packet is not itself ICMP / ICMPv6.
  */
 static void
 pf_return(struct pf_rule *r, struct pf_rule *nr, struct pf_pdesc *pd,
     struct pf_state_key *sk, int off, struct mbuf *m, struct tcphdr *th,
     struct pfi_kif *kif, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen,
     u_short *reason)
 {
 	struct pf_addr	* const saddr = pd->src;
 	struct pf_addr	* const daddr = pd->dst;
 	sa_family_t	 af = pd->af;
 	u_int32_t	 ack;
 	int		 len = 0;
 
 	/* undo NAT changes, if they have taken place */
 	if (nr != NULL) {
 		PF_ACPY(saddr, &sk->addr[pd->sidx], af);
 		PF_ACPY(daddr, &sk->addr[pd->didx], af);
 		if (pd->sport)
 			*pd->sport = sk->port[pd->sidx];
 		if (pd->dport)
 			*pd->dport = sk->port[pd->didx];
 		if (pd->proto_sum)
 			*pd->proto_sum = bproto_sum;
 		if (pd->ip_sum)
 			*pd->ip_sum = bip_sum;
 		m_copyback(m, off, hdrlen, pd->hdr.any);
 	}
 
 	if (pd->proto == IPPROTO_TCP &&
 	    (r->rule_flag & (PFRULE_RETURNRST | PFRULE_RETURN)) &&
 	    !(th->th_flags & TH_RST)) {
 		ack = ntohl(th->th_seq) + pd->p_len;
 
 		/* Bytes from the TCP header to the end of the packet. */
 		switch (af) {
 #ifdef INET
 		case AF_INET:
 			len = ntohs(mtod(m, struct ip *)->ip_len) - off;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			len = ntohs(mtod(m, struct ip6_hdr *)->ip6_plen) -
 			    (off - sizeof(struct ip6_hdr));
 			break;
 #endif
 		}
 
 		if (pf_check_proto_cksum(m, off, len, IPPROTO_TCP, af)) {
 			REASON_SET(reason, PFRES_PROTCKSUM);
 			return;
 		}
 
 		/* SYN and FIN each occupy one sequence number. */
 		if (th->th_flags & TH_SYN)
 			ack++;
 		if (th->th_flags & TH_FIN)
 			ack++;
 		/* Reply travels the other way: swap addresses and ports. */
 		pf_send_tcp(m, r, af, daddr, saddr,
 		    th->th_dport, th->th_sport,
 		    ntohl(th->th_ack), ack, TH_RST|TH_ACK, 0, 0,
 		    r->return_ttl, 1, 0, kif->pfik_ifp);
 		return;
 	}
 
 	if (pd->proto != IPPROTO_ICMP && af == AF_INET && r->return_icmp) {
 		/* return_icmp packs the type in the high byte, code in the low */
 		pf_send_icmp(m, r->return_icmp >> 8,
 		    r->return_icmp & 255, af, r);
 		return;
 	}
 
 	if (pd->proto != IPPROTO_ICMPV6 && af == AF_INET6 && r->return_icmp6)
 		pf_send_icmp(m, r->return_icmp6 >> 8,
 		    r->return_icmp6 & 255, af, r);
 }
 
 
 /*
  * Record prio in the MTAG_8021Q_PCP_OUT tag of mbuf m, allocating and
  * prepending the tag when the mbuf does not carry one yet.
  * Returns 0 on success or ENOMEM if the tag cannot be allocated.
  */
 static int
 pf_ieee8021q_setpcp(struct mbuf *m, u_int8_t prio)
 {
 	struct m_tag *pcp_tag;
 	uint8_t *pcp;
 
 	KASSERT(prio <= PF_PRIO_MAX, ("%s with invalid pcp", __func__));
 
 	pcp_tag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_OUT, NULL);
 	if (pcp_tag == NULL) {
 		pcp_tag = m_tag_alloc(MTAG_8021Q, MTAG_8021Q_PCP_OUT,
 		    sizeof(uint8_t), M_NOWAIT);
 		if (pcp_tag == NULL)
 			return (ENOMEM);
 		m_tag_prepend(m, pcp_tag);
 	}
 
 	/* The one-byte payload sits right behind the tag header. */
 	pcp = (uint8_t *)(pcp_tag + 1);
 	*pcp = prio;
 	return (0);
 }
 
 /*
  * Compare prio against the value stored in the MTAG_8021Q_PCP_IN tag of
  * mbuf m.  PF_PRIO_ZERO stands for priority 0.  Returns 1 on a match and
  * 0 on a mismatch or when the mbuf carries no such tag.
  */
 static int
 pf_match_ieee8021q_pcp(u_int8_t prio, struct mbuf *m)
 {
 	struct m_tag *pcp_tag;
 	u_int8_t wanted;
 
 	pcp_tag = m_tag_locate(m, MTAG_8021Q, MTAG_8021Q_PCP_IN, NULL);
 	if (pcp_tag == NULL)
 		return (0);
 
 	wanted = (prio == PF_PRIO_ZERO) ? 0 : prio;
 
 	/* The one-byte payload sits right behind the tag header. */
 	return (*(uint8_t *)(pcp_tag + 1) == wanted);
 }
 
 /*
  * Queue an ICMP (AF_INET) or ICMPv6 (AF_INET6) error with the given type
  * and code through pf_send().
  *
  * The queue entry carries a copy of packet m made with m_copypacket(); m
  * itself is left to the caller.  The copy is marked M_SKIP_FIREWALL, gets
  * the FIB from r->rtableid when that is non-negative and, under ALTQ, the
  * queue id from r->qid.
  *
  * Allocation failures are silent: everything allocated so far is released
  * and the function returns without sending anything.
  */
 static void
 pf_send_icmp(struct mbuf *m, u_int8_t type, u_int8_t code, sa_family_t af,
     struct pf_rule *r)
 {
 	struct pf_send_entry *pfse;
 	struct mbuf *m0;
 	struct pf_mtag *pf_mtag;
 
 	/* Allocate outgoing queue entry, mbuf and mbuf tag. */
 	pfse = malloc(sizeof(*pfse), M_PFTEMP, M_NOWAIT);
 	if (pfse == NULL)
 		return;
 
 	if ((m0 = m_copypacket(m, M_NOWAIT)) == NULL) {
 		free(pfse, M_PFTEMP);
 		return;
 	}
 
 	if ((pf_mtag = pf_get_mtag(m0)) == NULL) {
 		/*
 		 * m0 is our private copy and has not been handed to anyone
 		 * yet, so it must be released here along with the entry.
 		 */
 		free(pfse, M_PFTEMP);
 		m_freem(m0);
 		return;
 	}
 	/* XXX: revisit */
 	m0->m_flags |= M_SKIP_FIREWALL;
 
 	if (r->rtableid >= 0)
 		M_SETFIB(m0, r->rtableid);
 
 #ifdef ALTQ
 	if (r->qid) {
 		pf_mtag->qid = r->qid;
 		/* add hints for ecn */
 		pf_mtag->hdr = mtod(m0, struct ip *);
 	}
 #endif /* ALTQ */
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		pfse->pfse_type = PFSE_ICMP;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		pfse->pfse_type = PFSE_ICMP6;
 		break;
 #endif /* INET6 */
 	}
 	pfse->pfse_m = m0;
 	pfse->icmpopts.type = type;
 	pfse->icmpopts.code = code;
 	pf_send(pfse);
 }
 
 /*
  * Compare addresses a and b under mask m for address family af.
  * With n == 0 the result is 1 when the masked addresses are equal and 0
  * when they differ; a non-zero n inverts that result.  An address family
  * not handled below counts as "not equal".
  */
 int
 pf_match_addr(u_int8_t n, struct pf_addr *a, struct pf_addr *m,
     struct pf_addr *b, sa_family_t af)
 {
 	int	equal = 0;
 #ifdef INET6
 	int	i;
 #endif /* INET6 */
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		equal = ((a->addr32[0] & m->addr32[0]) ==
 		    (b->addr32[0] & m->addr32[0]));
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		/* Stop at the first masked 32-bit word that differs. */
 		for (i = 0; i < 4; i++)
 			if ((a->addr32[i] & m->addr32[i]) !=
 			    (b->addr32[i] & m->addr32[i]))
 				break;
 		equal = (i == 4);
 		break;
 #endif /* INET6 */
 	}
 
 	return (n ? !equal : equal);
 }
 
 /*
  * Return 1 if b <= a <= e, otherwise return 0.
  * Addresses are compared as host-order 32-bit words, most significant
  * word first.  An address family not handled below yields 1.
  */
 int
 pf_match_addr_range(struct pf_addr *b, struct pf_addr *e,
     struct pf_addr *a, sa_family_t af)
 {
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		u_int32_t	addr = ntohl(a->addr32[0]);
 
 		if (addr < ntohl(b->addr32[0]))
 			return (0);
 		if (addr > ntohl(e->addr32[0]))
 			return (0);
 		break;
 	}
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6: {
 		u_int32_t	word, bound;
 		int		i;
 
 		/* check a >= b */
 		for (i = 0; i < 4; ++i) {
 			word = ntohl(a->addr32[i]);
 			bound = ntohl(b->addr32[i]);
 			if (word < bound)
 				return (0);
 			if (word > bound)
 				break;
 		}
 		/* check a <= e */
 		for (i = 0; i < 4; ++i) {
 			word = ntohl(a->addr32[i]);
 			bound = ntohl(e->addr32[i]);
 			if (word > bound)
 				return (0);
 			if (word < bound)
 				break;
 		}
 		break;
 	}
 #endif /* INET6 */
 	}
 	return (1);
 }
 
 /*
  * Evaluate comparison operator op for value p against a1 (and a2 for the
  * range operators).  Returns non-zero when the comparison holds; an
  * operator not listed below yields 0.
  */
 static int
 pf_match(u_int8_t op, u_int32_t a1, u_int32_t a2, u_int32_t p)
 {
 	int	hit = 0;
 
 	switch (op) {
 	case PF_OP_IRG:		/* strictly inside (a1, a2) */
 		hit = (p > a1) && (p < a2);
 		break;
 	case PF_OP_XRG:		/* strictly outside [a1, a2] */
 		hit = (p < a1) || (p > a2);
 		break;
 	case PF_OP_RRG:		/* inside [a1, a2], bounds included */
 		hit = (p >= a1) && (p <= a2);
 		break;
 	case PF_OP_EQ:
 		hit = (p == a1);
 		break;
 	case PF_OP_NE:
 		hit = (p != a1);
 		break;
 	case PF_OP_LT:
 		hit = (p < a1);
 		break;
 	case PF_OP_LE:
 		hit = (p <= a1);
 		break;
 	case PF_OP_GT:
 		hit = (p > a1);
 		break;
 	case PF_OP_GE:
 		hit = (p >= a1);
 		break;
 	}
 	return (hit);
 }
 
 /*
  * Port flavour of pf_match(): a1, a2 and p are converted from network to
  * host byte order before the comparison is made.
  */
 int
 pf_match_port(u_int8_t op, u_int16_t a1, u_int16_t a2, u_int16_t p)
 {
 	return (pf_match(op, ntohs(a1), ntohs(a2), ntohs(p)));
 }
 
 /*
  * uid flavour of pf_match().  When u is UID_MAX only PF_OP_EQ and
  * PF_OP_NE are evaluated; every other operator yields 0.
  */
 static int
 pf_match_uid(u_int8_t op, uid_t a1, uid_t a2, uid_t u)
 {
 	if (u != UID_MAX || op == PF_OP_EQ || op == PF_OP_NE)
 		return (pf_match(op, a1, a2, u));
 	return (0);
 }
 
 /*
  * gid flavour of pf_match().  When g is GID_MAX only PF_OP_EQ and
  * PF_OP_NE are evaluated; every other operator yields 0.
  */
 static int
 pf_match_gid(u_int8_t op, gid_t a1, gid_t a2, gid_t g)
 {
 	if (g != GID_MAX || op == PF_OP_EQ || op == PF_OP_NE)
 		return (pf_match(op, a1, a2, g));
 	return (0);
 }
 
 /*
  * Check the rule's match_tag against the packet tag.  *tag caches the tag
  * under consideration: while it is still -1 it is loaded from mtag.
  * Returns 1 when the tags are equal, or when they differ and the rule has
  * match_tag_not set; 0 otherwise.  The mbuf argument is not used here.
  */
 int
 pf_match_tag(struct mbuf *m, struct pf_rule *r, int *tag, int mtag)
 {
 	int	same;
 
 	if (*tag == -1)
 		*tag = mtag;
 
 	same = (r->match_tag == *tag);
 	return (r->match_tag_not ? !same : same);
 }
 
 /*
  * Store tag in the pf mbuf tag of the packet, fetching the mbuf tag with
  * pf_get_mtag() and caching it in pd->pf_mtag if that has not happened
  * yet.  Returns 0 on success or ENOMEM when no mbuf tag can be obtained.
  */
 int
 pf_tag_packet(struct mbuf *m, struct pf_pdesc *pd, int tag)
 {
 
 	KASSERT(tag > 0, ("%s: tag %d", __func__, tag));
 
 	if (pd->pf_mtag == NULL) {
 		pd->pf_mtag = pf_get_mtag(m);
 		if (pd->pf_mtag == NULL)
 			return (ENOMEM);
 	}
 
 	pd->pf_mtag->tag = tag;
 	return (0);
 }
 
 /* Number of frames in an anchor stack (see pf_step_into_anchor()). */
 #define	PF_ANCHOR_STACKSIZE	32
 /*
  * One frame per anchor currently being evaluated: the ruleset and rule to
  * resume from when stepping out, and the current child while iterating a
  * wildcard anchor (NULL otherwise).
  */
 struct pf_anchor_stackframe {
 	struct pf_ruleset	*rs;
 	struct pf_rule		*r;	/* XXX: + match bit */
 	struct pf_anchor	*child;
 };
 
 /*
  * XXX: We rely on malloc(9) returning pointer aligned addresses.
  *
  * The least significant bit of the frame's rule pointer doubles as a
  * "matched" flag, so the pointer must be read through PF_ANCHOR_RULE(),
  * which masks the flag off again.
  */
 #define	PF_ANCHORSTACK_MATCH	0x00000001
 #define	PF_ANCHORSTACK_MASK	(PF_ANCHORSTACK_MATCH)
 
 /* Non-zero if the match flag is set in frame f. */
 #define	PF_ANCHOR_MATCH(f)	((uintptr_t)(f)->r & PF_ANCHORSTACK_MATCH)
 /* The rule pointer of frame f with the flag bit cleared. */
 #define	PF_ANCHOR_RULE(f)	(struct pf_rule *)			\
 				((uintptr_t)(f)->r & ~PF_ANCHORSTACK_MASK)
 /* Set the match flag in frame f. */
 #define	PF_ANCHOR_SET_MATCH(f)	do { (f)->r = (void *) 			\
 				((uintptr_t)(f)->r | PF_ANCHORSTACK_MATCH);  \
 } while (0)
 
 /*
  * Descend into the anchor referenced by rule *r.
  *
  * The current ruleset and rule are pushed onto the anchor stack and *rs /
  * *r are redirected to the first active rule of list n in the anchor's
  * ruleset; for a wildcard anchor that is the ruleset of its first child.
  * A wildcard anchor without children leaves *r NULL with the frame still
  * pushed.  *match, when given, is reset to 0.  At depth 0 the anchor rule
  * is also stored in *a when a is non-NULL.
  *
  * If the stack is already full, a message is printed and *r simply moves
  * on to the next rule without entering the anchor.
  */
 void
 pf_step_into_anchor(struct pf_anchor_stackframe *stack, int *depth,
     struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
     int *match)
 {
 	struct pf_anchor_stackframe	*frame;
 	struct pf_rule			*anchor_rule;
 
 	PF_RULES_RASSERT();
 
 	anchor_rule = *r;
 
 	if (match != NULL)
 		*match = 0;
 
 	if (*depth >= PF_ANCHOR_STACKSIZE) {
 		printf("%s: anchor stack overflow on %s\n",
 		    __func__, anchor_rule->anchor->name);
 		*r = TAILQ_NEXT(anchor_rule, entries);
 		return;
 	}
 	if (*depth == 0 && a != NULL)
 		*a = anchor_rule;
 
 	/* Push a frame describing where evaluation resumes afterwards. */
 	frame = &stack[*depth];
 	(*depth)++;
 	frame->rs = *rs;
 	frame->r = anchor_rule;
 
 	if (!anchor_rule->anchor_wildcard) {
 		frame->child = NULL;
 		*rs = &anchor_rule->anchor->ruleset;
 	} else {
 		frame->child = RB_MIN(pf_anchor_node,
 		    &anchor_rule->anchor->children);
 		if (frame->child == NULL) {
 			*r = NULL;
 			return;
 		}
 		*rs = &frame->child->ruleset;
 	}
 	*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
 }
 
 /*
  * Leave the anchor on top of the stack once its rules are exhausted.
  *
  * For a wildcard anchor the next child is tried first; only when no child
  * is left is the frame popped and *rs / *r restored to the rule following
  * the anchor rule.  This repeats while *r is NULL and frames remain, so
  * several nested anchors may be unwound by one call.  When the stack
  * reaches depth 0, *a is cleared if a is non-NULL.
  *
  * Returns the "quick" field of a popped anchor rule whose frame was
  * flagged as matched or for which *match is set at the time it is popped;
  * 0 if no popped frame qualified.
  */
 int
 pf_step_out_of_anchor(struct pf_anchor_stackframe *stack, int *depth,
     struct pf_ruleset **rs, int n, struct pf_rule **r, struct pf_rule **a,
     int *match)
 {
 	struct pf_anchor_stackframe	*f;
 	struct pf_rule *fr;
 	int quick = 0;
 
 	PF_RULES_RASSERT();
 
 	do {
 		if (*depth <= 0)
 			break;
 		f = stack + *depth - 1;
 		/* f->r may carry the match bit; fr is the clean pointer. */
 		fr = PF_ANCHOR_RULE(f);
 		if (f->child != NULL) {
 			struct pf_anchor_node *parent;
 
 			/*
 			 * This block traverses through
 			 * a wildcard anchor.
 			 */
 			parent = &fr->anchor->children;
 			if (match != NULL && *match) {
 				/*
 				 * If any of "*" matched, then
 				 * "foo/ *" matched, mark frame
 				 * appropriately.
 				 */
 				PF_ANCHOR_SET_MATCH(f);
 				*match = 0;
 			}
 			f->child = RB_NEXT(pf_anchor_node, parent, f->child);
 			if (f->child != NULL) {
 				*rs = &f->child->ruleset;
 				*r = TAILQ_FIRST((*rs)->rules[n].active.ptr);
 				/*
 				 * An empty child ruleset: "continue" goes to
 				 * the loop condition, which is true here, so
 				 * the next child is tried.
 				 */
 				if (*r == NULL)
 					continue;
 				else
 					break;
 			}
 		}
 		/* No (more) children: pop the frame and resume in the parent. */
 		(*depth)--;
 		if (*depth == 0 && a != NULL)
 			*a = NULL;
 		*rs = f->rs;
 		if (PF_ANCHOR_MATCH(f) || (match != NULL && *match))
 			quick = fr->quick;
 		*r = TAILQ_NEXT(fr, entries);
 	} while (*r == NULL);
 
 	return (quick);
 }
 
 #ifdef INET6
 /*
  * Combine two addresses under a mask: every bit set in rmask is taken
  * from raddr, every bit clear in rmask is taken from saddr.  The result is
  * stored in naddr.  One 32-bit word is processed for AF_INET, four for
  * AF_INET6; any other address family leaves naddr untouched.
  */
 void
 pf_poolmask(struct pf_addr *naddr, struct pf_addr *raddr,
     struct pf_addr *rmask, struct pf_addr *saddr, sa_family_t af)
 {
 	int	i, words;
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		words = 1;
 		break;
 #endif /* INET */
 	case AF_INET6:
 		words = 4;
 		break;
 	default:
 		return;
 	}
 
 	for (i = 0; i < words; i++)
 		naddr->addr32[i] = (raddr->addr32[i] & rmask->addr32[i]) |
 		    (~rmask->addr32[i] & saddr->addr32[i]);
 }
 
 /*
  * Increment the address in place by one.  An AF_INET address is the
  * single word addr32[0]; an AF_INET6 address is treated as four 32-bit
  * words with addr32[3] least significant, carrying into the next word
  * whenever a word is all ones.  Other address families are left alone.
  */
 void
 pf_addr_inc(struct pf_addr *addr, sa_family_t af)
 {
 	int	i;
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		addr->addr32[0] = htonl(ntohl(addr->addr32[0]) + 1);
 		break;
 #endif /* INET */
 	case AF_INET6:
 		/*
 		 * Zero every trailing all-ones word (the carry), then bump
 		 * the first word that is not all ones, or word 0 if the
 		 * carry ran through words 3..1.
 		 */
 		for (i = 3; i > 0; i--) {
 			if (addr->addr32[i] != 0xffffffff)
 				break;
 			addr->addr32[i] = 0;
 		}
 		addr->addr32[i] = htonl(ntohl(addr->addr32[i]) + 1);
 		break;
 	}
 }
 #endif /* INET6 */
 
 /*
  * Find the local TCP or UDP socket belonging to the packet described by pd
  * and record its owner in pd->lookup.uid / pd->lookup.gid.
  *
  * Both fields are preset to UID_MAX / GID_MAX.  For direction PF_IN the
  * packet's source and destination are used as they are; otherwise they
  * are swapped before the lookup.  An exact PCB lookup is tried first,
  * then one with INPLOOKUP_WILDCARD.
  *
  * Returns 1 when a socket was found, -1 otherwise (protocol other than
  * TCP/UDP, missing header pointer, unsupported address family, or no
  * matching PCB).
  */
 int
 pf_socket_lookup(int direction, struct pf_pdesc *pd, struct mbuf *m)
 {
 	struct pf_addr		*saddr, *daddr;
 	u_int16_t		 sport, dport;
 	struct inpcbinfo	*pi;
 	struct inpcb		*inp = NULL;
 
 	pd->lookup.uid = UID_MAX;
 	pd->lookup.gid = GID_MAX;
 
 	if (pd->proto == IPPROTO_TCP) {
 		if (pd->hdr.tcp == NULL)
 			return (-1);
 		sport = pd->hdr.tcp->th_sport;
 		dport = pd->hdr.tcp->th_dport;
 		pi = &V_tcbinfo;
 	} else if (pd->proto == IPPROTO_UDP) {
 		if (pd->hdr.udp == NULL)
 			return (-1);
 		sport = pd->hdr.udp->uh_sport;
 		dport = pd->hdr.udp->uh_dport;
 		pi = &V_udbinfo;
 	} else
 		return (-1);
 
 	if (direction == PF_IN) {
 		saddr = pd->src;
 		daddr = pd->dst;
 	} else {
 		/* Swap both the addresses and the ports. */
 		u_int16_t	swap = sport;
 
 		sport = dport;
 		dport = swap;
 		saddr = pd->dst;
 		daddr = pd->src;
 	}
 
 	switch (pd->af) {
 #ifdef INET
 	case AF_INET:
 		inp = in_pcblookup_mbuf(pi, saddr->v4, sport, daddr->v4,
 		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
 		if (inp == NULL)
 			inp = in_pcblookup_mbuf(pi, saddr->v4, sport,
 			    daddr->v4, dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_RLOCKPCB, NULL, m);
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport, &daddr->v6,
 		    dport, INPLOOKUP_RLOCKPCB, NULL, m);
 		if (inp == NULL)
 			inp = in6_pcblookup_mbuf(pi, &saddr->v6, sport,
 			    &daddr->v6, dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_RLOCKPCB, NULL, m);
 		break;
 #endif /* INET6 */
 
 	default:
 		return (-1);
 	}
 	if (inp == NULL)
 		return (-1);
 
 	/* The lookup hands the PCB back read-locked; drop it once copied. */
 	INP_RLOCK_ASSERT(inp);
 	pd->lookup.uid = inp->inp_cred->cr_uid;
 	pd->lookup.gid = inp->inp_cred->cr_groups[0];
 	INP_RUNLOCK(inp);
 
 	return (1);
 }
 
 /*
  * Extract the window scale option from the TCP header located at off.
  * th_off is the header's data offset in 32-bit words.
  *
  * Returns the shift count, capped at TCP_MAX_WINSHIFT and or'ed with
  * PF_WSCALE_FLAG, taken from the last window scale option seen; 0 when
  * the header has no options, cannot be pulled, or has no such option.
  */
 static u_int8_t
 pf_get_wscale(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
 {
 	u_int8_t	 hdr[60];
 	u_int8_t	*cur, optlen;
 	u_int8_t	 wscale = 0;
 	int		 left;
 
 	left = th_off << 2;		/* left <= sizeof(hdr) */
 	if (left <= sizeof(struct tcphdr))
 		return (0);
 	if (!pf_pull_hdr(m, off, hdr, left, NULL, NULL, af))
 		return (0);
 
 	/* Only the option bytes behind the fixed header are of interest. */
 	cur = hdr + sizeof(struct tcphdr);
 	left -= sizeof(struct tcphdr);
 
 	while (left >= 3) {
 		/* EOL and NOP are stepped over one byte at a time. */
 		if (*cur == TCPOPT_EOL || *cur == TCPOPT_NOP) {
 			++cur;
 			--left;
 			continue;
 		}
 
 		if (*cur == TCPOPT_WINDOW) {
 			wscale = cur[2];
 			if (wscale > TCP_MAX_WINSHIFT)
 				wscale = TCP_MAX_WINSHIFT;
 			wscale |= PF_WSCALE_FLAG;
 		}
 
 		/* Advance to the next option; a length below 2 counts as 2. */
 		optlen = cur[1];
 		if (optlen < 2)
 			optlen = 2;
 		left -= optlen;
 		cur += optlen;
 	}
 	return (wscale);
 }
 
 /*
  * Extract the maximum segment size option from the TCP header located at
  * off.  th_off is the header's data offset in 32-bit words.
  *
  * Returns the value of the last MSS option seen in host byte order, or
  * V_tcp_mssdflt when the options hold none.  Returns 0 when the header
  * has no options at all or cannot be pulled.
  */
 static u_int16_t
 pf_get_mss(struct mbuf *m, int off, u_int16_t th_off, sa_family_t af)
 {
 	u_int8_t	 hdr[60];
 	u_int8_t	*cur, optlen;
 	u_int16_t	 mss = V_tcp_mssdflt;
 	int		 left;
 
 	left = th_off << 2;	/* left <= sizeof(hdr) */
 	if (left <= sizeof(struct tcphdr))
 		return (0);
 	if (!pf_pull_hdr(m, off, hdr, left, NULL, NULL, af))
 		return (0);
 
 	/* Only the option bytes behind the fixed header are of interest. */
 	cur = hdr + sizeof(struct tcphdr);
 	left -= sizeof(struct tcphdr);
 
 	while (left >= TCPOLEN_MAXSEG) {
 		/* EOL and NOP are stepped over one byte at a time. */
 		if (*cur == TCPOPT_EOL || *cur == TCPOPT_NOP) {
 			++cur;
 			--left;
 			continue;
 		}
 
 		if (*cur == TCPOPT_MAXSEG) {
 			/* The value may be unaligned, hence the byte copy. */
 			bcopy((caddr_t)(cur + 2), (caddr_t)&mss, 2);
 			mss = ntohs(mss);
 		}
 
 		/* Advance to the next option; a length below 2 counts as 2. */
 		optlen = cur[1];
 		if (optlen < 2)
 			optlen = 2;
 		left -= optlen;
 		cur += optlen;
 	}
 	return (mss);
 }
 
 /*
  * Work out the MSS to use towards addr.
  *
  * The MTU of the next hop found in FIB rtableid, minus the IP and TCP
  * header sizes, is the starting point (0 if the lookup fails).  The
  * result is raised to at least V_tcp_mssdflt, limited to "offer", and
  * finally raised to at least 64.
  */
 static u_int16_t
 pf_calc_mss(struct pf_addr *addr, sa_family_t af, int rtableid, u_int16_t offer)
 {
 #ifdef INET
 	struct nhop4_basic	nh4;
 #endif /* INET */
 #ifdef INET6
 	struct nhop6_basic	nh6;
 	struct in6_addr		dst6;
 	uint32_t		scopeid;
 #endif /* INET6 */
 	uint16_t		 mss = 0;
 
 	switch (af) {
 #ifdef INET
 	case AF_INET:
 		if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) != 0)
 			break;
 		mss = nh4.nh_mtu - sizeof(struct ip) - sizeof(struct tcphdr);
 		break;
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6:
 		in6_splitscope(&addr->v6, &dst6, &scopeid);
 		if (fib6_lookup_nh_basic(rtableid, &dst6, scopeid, 0, 0,
 		    &nh6) != 0)
 			break;
 		mss = nh6.nh_mtu - sizeof(struct ip6_hdr) -
 		    sizeof(struct tcphdr);
 		break;
 #endif /* INET6 */
 	}
 
 	mss = max(V_tcp_mssdflt, mss);
 	mss = min(mss, offer);
 	mss = max(mss, 64);		/* sanity - at least max opt space */
 	return (mss);
 }
 
 /*
  * Produce an initial sequence number for the TCP connection in pd.
  *
  * On first use a random secret is generated and an MD5 context primed
  * with it is kept.  Each call hashes the ports and addresses of the
  * packet on top of a copy of that context, advances V_pf_tcp_iss_off by
  * 4096 and returns the first digest word plus a random value below 4096
  * plus the offset.
  */
 static u_int32_t
 pf_tcp_iss(struct pf_pdesc *pd)
 {
 	MD5_CTX hash;
 	u_int32_t md[4];
 	u_int32_t jitter;
 
 	if (V_pf_tcp_secret_init == 0) {
 		arc4random_buf(&V_pf_tcp_secret, sizeof(V_pf_tcp_secret));
 		MD5Init(&V_pf_tcp_secret_ctx);
 		MD5Update(&V_pf_tcp_secret_ctx, V_pf_tcp_secret,
 		    sizeof(V_pf_tcp_secret));
 		V_pf_tcp_secret_init = 1;
 	}
 
 	/* Start from the context that already absorbed the secret. */
 	hash = V_pf_tcp_secret_ctx;
 
 	MD5Update(&hash, (char *)&pd->hdr.tcp->th_sport, sizeof(u_short));
 	MD5Update(&hash, (char *)&pd->hdr.tcp->th_dport, sizeof(u_short));
 	if (pd->af != AF_INET6) {
 		MD5Update(&hash, (char *)&pd->src->v4, sizeof(struct in_addr));
 		MD5Update(&hash, (char *)&pd->dst->v4, sizeof(struct in_addr));
 	} else {
 		MD5Update(&hash, (char *)&pd->src->v6, sizeof(struct in6_addr));
 		MD5Update(&hash, (char *)&pd->dst->v6, sizeof(struct in6_addr));
 	}
 	MD5Final((u_char *)md, &hash);
 
 	V_pf_tcp_iss_off += 4096;
 #define	ISN_RANDOM_INCREMENT (4096 - 1)
 	jitter = arc4random() & ISN_RANDOM_INCREMENT;
 #undef	ISN_RANDOM_INCREMENT
 	return (md[0] + jitter + V_pf_tcp_iss_off);
 }
 
 /*
  * Evaluate the filter ruleset for a packet that has no state yet.
  *
  * The packet is first offered to pf_get_translation(); if a translation
  * rule applies, addresses, ports and checksums in the header copy held in
  * pd are rewritten in place.  Then the active filter rules are walked
  * (descending into anchors through pf_step_into_anchor() and
  * pf_step_out_of_anchor()) and the last matching rule, or the first
  * matching "quick" rule, is reported through *rm, with its anchor rule in
  * *am and its ruleset in *rsm.
  *
  * Depending on that rule the packet is logged, answered through
  * pf_return(), tagged, assigned a FIB, and a state is created through
  * pf_create_state(), which receives sm.
  *
  * Returns PF_PASS, PF_DROP, PF_DEFER (pfsync asked to hold the packet),
  * or whatever non-PF_PASS action pf_create_state() reported.
  *
  * NOTE(review): *rm, *am and *rsm are read back after the rule loop even
  * when no rule matched, so the caller presumably preloads them (e.g. with
  * a default rule) - confirm against the call sites.
  */
 static int
 pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
     struct pfi_kif *kif, struct mbuf *m, int off, struct pf_pdesc *pd,
     struct pf_rule **am, struct pf_ruleset **rsm, struct inpcb *inp)
 {
 	struct pf_rule		*nr = NULL;
 	struct pf_addr		* const saddr = pd->src;
 	struct pf_addr		* const daddr = pd->dst;
 	sa_family_t		 af = pd->af;
 	struct pf_rule		*r, *a = NULL;
 	struct pf_ruleset	*ruleset = NULL;
 	struct pf_src_node	*nsn = NULL;
 	struct tcphdr		*th = pd->hdr.tcp;
 	struct pf_state_key	*sk = NULL, *nk = NULL;
 	u_short			 reason;
 	int			 rewrite = 0, hdrlen = 0;
 	int			 tag = -1, rtableid = -1;
 	int			 asd = 0;
 	int			 match = 0;
 	int			 state_icmp = 0;
 	u_int16_t		 sport = 0, dport = 0;
 	u_int16_t		 bproto_sum = 0, bip_sum = 0;
 	u_int8_t		 icmptype = 0, icmpcode = 0;
 	struct pf_anchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
 
 	PF_RULES_RASSERT();
 
 	/* A PCB supplied by the caller makes a later socket lookup needless. */
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		pd->lookup.uid = inp->inp_cred->cr_uid;
 		pd->lookup.gid = inp->inp_cred->cr_groups[0];
 		pd->lookup.done = 1;
 	}
 
 	/*
 	 * Collect the per-protocol match keys.  For ICMP/ICMPv6 the id
 	 * stands in for both ports, and error messages set state_icmp so
 	 * that no state is created for them further down.
 	 */
 	switch (pd->proto) {
 	case IPPROTO_TCP:
 		sport = th->th_sport;
 		dport = th->th_dport;
 		hdrlen = sizeof(*th);
 		break;
 	case IPPROTO_UDP:
 		sport = pd->hdr.udp->uh_sport;
 		dport = pd->hdr.udp->uh_dport;
 		hdrlen = sizeof(*pd->hdr.udp);
 		break;
 #ifdef INET
 	case IPPROTO_ICMP:
 		if (pd->af != AF_INET)
 			break;
 		sport = dport = pd->hdr.icmp->icmp_id;
 		hdrlen = sizeof(*pd->hdr.icmp);
 		icmptype = pd->hdr.icmp->icmp_type;
 		icmpcode = pd->hdr.icmp->icmp_code;
 
 		if (icmptype == ICMP_UNREACH ||
 		    icmptype == ICMP_SOURCEQUENCH ||
 		    icmptype == ICMP_REDIRECT ||
 		    icmptype == ICMP_TIMXCEED ||
 		    icmptype == ICMP_PARAMPROB)
 			state_icmp++;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case IPPROTO_ICMPV6:
 		if (af != AF_INET6)
 			break;
 		sport = dport = pd->hdr.icmp6->icmp6_id;
 		hdrlen = sizeof(*pd->hdr.icmp6);
 		icmptype = pd->hdr.icmp6->icmp6_type;
 		icmpcode = pd->hdr.icmp6->icmp6_code;
 
 		if (icmptype == ICMP6_DST_UNREACH ||
 		    icmptype == ICMP6_PACKET_TOO_BIG ||
 		    icmptype == ICMP6_TIME_EXCEEDED ||
 		    icmptype == ICMP6_PARAM_PROB)
 			state_icmp++;
 		break;
 #endif /* INET6 */
 	default:
 		sport = dport = hdrlen = 0;
 		break;
 	}
 
 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
 
 	/* check packet for BINAT/NAT/RDR */
 	if ((nr = pf_get_translation(pd, m, off, direction, kif, &nsn, &sk,
 	    &nk, saddr, daddr, sport, dport, anchor_stack)) != NULL) {
 		KASSERT(sk != NULL, ("%s: null sk", __func__));
 		KASSERT(nk != NULL, ("%s: null nk", __func__));
 
 		/* Keep the pre-translation checksums so pf_return() can undo. */
 		if (pd->ip_sum)
 			bip_sum = *pd->ip_sum;
 
 		switch (pd->proto) {
 		case IPPROTO_TCP:
 			bproto_sum = th->th_sum;
 			pd->proto_sum = &th->th_sum;
 
 			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
 			    nk->port[pd->sidx] != sport) {
 				pf_change_ap(m, saddr, &th->th_sport, pd->ip_sum,
 				    &th->th_sum, &nk->addr[pd->sidx],
 				    nk->port[pd->sidx], 0, af);
 				pd->sport = &th->th_sport;
 				sport = th->th_sport;
 			}
 
 			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
 			    nk->port[pd->didx] != dport) {
 				pf_change_ap(m, daddr, &th->th_dport, pd->ip_sum,
 				    &th->th_sum, &nk->addr[pd->didx],
 				    nk->port[pd->didx], 0, af);
 				dport = th->th_dport;
 				pd->dport = &th->th_dport;
 			}
 			rewrite++;
 			break;
 		case IPPROTO_UDP:
 			bproto_sum = pd->hdr.udp->uh_sum;
 			pd->proto_sum = &pd->hdr.udp->uh_sum;
 
 			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], af) ||
 			    nk->port[pd->sidx] != sport) {
 				pf_change_ap(m, saddr, &pd->hdr.udp->uh_sport,
 				    pd->ip_sum, &pd->hdr.udp->uh_sum,
 				    &nk->addr[pd->sidx],
 				    nk->port[pd->sidx], 1, af);
 				sport = pd->hdr.udp->uh_sport;
 				pd->sport = &pd->hdr.udp->uh_sport;
 			}
 
 			if (PF_ANEQ(daddr, &nk->addr[pd->didx], af) ||
 			    nk->port[pd->didx] != dport) {
 				pf_change_ap(m, daddr, &pd->hdr.udp->uh_dport,
 				    pd->ip_sum, &pd->hdr.udp->uh_sum,
 				    &nk->addr[pd->didx],
 				    nk->port[pd->didx], 1, af);
 				dport = pd->hdr.udp->uh_dport;
 				pd->dport = &pd->hdr.udp->uh_dport;
 			}
 			rewrite++;
 			break;
 #ifdef INET
 		case IPPROTO_ICMP:
 			nk->port[0] = nk->port[1];
 			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET))
 				pf_change_a(&saddr->v4.s_addr, pd->ip_sum,
 				    nk->addr[pd->sidx].v4.s_addr, 0);
 
 			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET))
 				pf_change_a(&daddr->v4.s_addr, pd->ip_sum,
 				    nk->addr[pd->didx].v4.s_addr, 0);
 
 			if (nk->port[1] != pd->hdr.icmp->icmp_id) {
 				pd->hdr.icmp->icmp_cksum = pf_cksum_fixup(
 				    pd->hdr.icmp->icmp_cksum, sport,
 				    nk->port[1], 0);
 				pd->hdr.icmp->icmp_id = nk->port[1];
 				pd->sport = &pd->hdr.icmp->icmp_id;
 			}
 			/* Written back right here; "rewrite" stays untouched. */
 			m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
 			break;
 #endif /* INET */
 #ifdef INET6
 		case IPPROTO_ICMPV6:
 			nk->port[0] = nk->port[1];
 			if (PF_ANEQ(saddr, &nk->addr[pd->sidx], AF_INET6))
 				pf_change_a6(saddr, &pd->hdr.icmp6->icmp6_cksum,
 				    &nk->addr[pd->sidx], 0);
 
 			if (PF_ANEQ(daddr, &nk->addr[pd->didx], AF_INET6))
 				pf_change_a6(daddr, &pd->hdr.icmp6->icmp6_cksum,
 				    &nk->addr[pd->didx], 0);
 			rewrite++;
 			break;
 #endif /* INET6 */
 		default:
 			switch (af) {
 #ifdef INET
 			case AF_INET:
 				if (PF_ANEQ(saddr,
 				    &nk->addr[pd->sidx], AF_INET))
 					pf_change_a(&saddr->v4.s_addr,
 					    pd->ip_sum,
 					    nk->addr[pd->sidx].v4.s_addr, 0);
 
 				if (PF_ANEQ(daddr,
 				    &nk->addr[pd->didx], AF_INET))
 					pf_change_a(&daddr->v4.s_addr,
 					    pd->ip_sum,
 					    nk->addr[pd->didx].v4.s_addr, 0);
 				break;
 #endif /* INET */
 #ifdef INET6
 			case AF_INET6:
 				if (PF_ANEQ(saddr,
 				    &nk->addr[pd->sidx], AF_INET6))
 					PF_ACPY(saddr, &nk->addr[pd->sidx], af);
 
 				if (PF_ANEQ(daddr,
 				    &nk->addr[pd->didx], AF_INET6))
 					PF_ACPY(daddr, &nk->addr[pd->didx], af);
 				break;
 #endif /* INET6 */
 			}
 			break;
 		}
 		/* natpass: skip filter rule evaluation entirely. */
 		if (nr->natpass)
 			r = NULL;
 		pd->nat_rule = nr;
 	}
 
 	/*
 	 * Walk the filter rules.  A failed criterion that has a skip step
 	 * jumps along r->skip[]; the remaining criteria advance one rule at
 	 * a time.
 	 */
 	while (r != NULL) {
 		r->evaluations++;
 		if (pfi_kif_match(r->kif, kif) == r->ifnot)
 			r = r->skip[PF_SKIP_IFP].ptr;
 		else if (r->direction && r->direction != direction)
 			r = r->skip[PF_SKIP_DIR].ptr;
 		else if (r->af && r->af != af)
 			r = r->skip[PF_SKIP_AF].ptr;
 		else if (r->proto && r->proto != pd->proto)
 			r = r->skip[PF_SKIP_PROTO].ptr;
 		else if (PF_MISMATCHAW(&r->src.addr, saddr, af,
 		    r->src.neg, kif, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
 		/* tcp/udp only. port_op always 0 in other cases */
 		else if (r->src.port_op && !pf_match_port(r->src.port_op,
 		    r->src.port[0], r->src.port[1], sport))
 			r = r->skip[PF_SKIP_SRC_PORT].ptr;
 		else if (PF_MISMATCHAW(&r->dst.addr, daddr, af,
 		    r->dst.neg, NULL, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
 		/* tcp/udp only. port_op always 0 in other cases */
 		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
 		    r->dst.port[0], r->dst.port[1], dport))
 			r = r->skip[PF_SKIP_DST_PORT].ptr;
 		/* icmp only. type always 0 in other cases */
 		/* (compared against icmptype + 1, so 0 means "not set") */
 		else if (r->type && r->type != icmptype + 1)
 			r = TAILQ_NEXT(r, entries);
 		/* icmp only. type always 0 in other cases */
 		/* (compared against icmpcode + 1, so 0 means "not set") */
 		else if (r->code && r->code != icmpcode + 1)
 			r = TAILQ_NEXT(r, entries);
 		else if (r->tos && !(r->tos == pd->tos))
 			r = TAILQ_NEXT(r, entries);
 		else if (r->rule_flag & PFRULE_FRAGMENT)
 			r = TAILQ_NEXT(r, entries);
 		else if (pd->proto == IPPROTO_TCP &&
 		    (r->flagset & th->th_flags) != r->flags)
 			r = TAILQ_NEXT(r, entries);
 		/* tcp/udp only. uid.op always 0 in other cases */
 		/*
 		 * The comma expression runs pf_socket_lookup() only if it has
 		 * not been done yet and is always true, whatever it returns.
 		 */
 		else if (r->uid.op && (pd->lookup.done || (pd->lookup.done =
 		    pf_socket_lookup(direction, pd, m), 1)) &&
 		    !pf_match_uid(r->uid.op, r->uid.uid[0], r->uid.uid[1],
 		    pd->lookup.uid))
 			r = TAILQ_NEXT(r, entries);
 		/* tcp/udp only. gid.op always 0 in other cases */
 		else if (r->gid.op && (pd->lookup.done || (pd->lookup.done =
 		    pf_socket_lookup(direction, pd, m), 1)) &&
 		    !pf_match_gid(r->gid.op, r->gid.gid[0], r->gid.gid[1],
 		    pd->lookup.gid))
 			r = TAILQ_NEXT(r, entries);
 		else if (r->prio &&
 		    !pf_match_ieee8021q_pcp(r->prio, m))
 			r = TAILQ_NEXT(r, entries);
 		else if (r->prob &&
 		    r->prob <= arc4random())
 			r = TAILQ_NEXT(r, entries);
 		else if (r->match_tag && !pf_match_tag(m, r, &tag,
 		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
 			r = TAILQ_NEXT(r, entries);
 		else if (r->os_fingerprint != PF_OSFP_ANY &&
 		    (pd->proto != IPPROTO_TCP || !pf_osfp_match(
 		    pf_osfp_fingerprint(pd, m, off, th),
 		    r->os_fingerprint)))
 			r = TAILQ_NEXT(r, entries);
 		else {
 			/* Every criterion passed: the rule matches. */
 			if (r->tag)
 				tag = r->tag;
 			if (r->rtableid >= 0)
 				rtableid = r->rtableid;
 			if (r->anchor == NULL) {
 				match = 1;
 				*rm = r;
 				*am = a;
 				*rsm = ruleset;
 				if ((*rm)->quick)
 					break;
 				r = TAILQ_NEXT(r, entries);
 			} else
 				pf_step_into_anchor(anchor_stack, &asd,
 				    &ruleset, PF_RULESET_FILTER, &r, &a,
 				    &match);
 		}
 		/* End of a ruleset: climb out; a "quick" anchor ends the walk. */
 		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
 		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
 			break;
 	}
 	/* From here on r, a and ruleset describe the rule that decides. */
 	r = *rm;
 	a = *am;
 	ruleset = *rsm;
 
 	REASON_SET(&reason, PFRES_MATCH);
 
 	if (r->log || (nr != NULL && nr->log)) {
 		/* Make the logged packet show the translated header. */
 		if (rewrite)
 			m_copyback(m, off, hdrlen, pd->hdr.any);
 		PFLOG_PACKET(kif, m, af, direction, reason, r->log ? r : nr, a,
 		    ruleset, pd, 1);
 	}
 
 	if ((r->action == PF_DROP) &&
 	    ((r->rule_flag & PFRULE_RETURNRST) ||
 	    (r->rule_flag & PFRULE_RETURNICMP) ||
 	    (r->rule_flag & PFRULE_RETURN))) {
 		pf_return(r, nr, pd, sk, off, m, th, kif, bproto_sum,
 		    bip_sum, hdrlen, &reason);
 	}
 
 	if (r->action == PF_DROP)
 		goto cleanup;
 
 	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
 		REASON_SET(&reason, PFRES_MEMORY);
 		goto cleanup;
 	}
 	if (rtableid >= 0)
 		M_SETFIB(m, rtableid);
 
 	if (!state_icmp && (r->keep_state || nr != NULL ||
 	    (pd->flags & PFDESC_TCP_NORM))) {
 		int action;
 		action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
 		    sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
 		    hdrlen);
 		if (action != PF_PASS) {
 			if (action == PF_DROP &&
 			    (r->rule_flag & PFRULE_RETURN))
 				pf_return(r, nr, pd, sk, off, m, th, kif,
 				    bproto_sum, bip_sum, hdrlen, &reason);
 			/*
 			 * NOTE(review): sk/nk are not freed on this path;
 			 * presumably pf_create_state() disposes of them on
 			 * failure - confirm.
 			 */
 			return (action);
 		}
 	} else {
 		/* No state is created, so the state keys are not needed. */
 		if (sk != NULL)
 			uma_zfree(V_pf_state_key_z, sk);
 		if (nk != NULL)
 			uma_zfree(V_pf_state_key_z, nk);
 	}
 
 	/* copy back packet headers if we performed NAT operations */
 	if (rewrite)
 		m_copyback(m, off, hdrlen, pd->hdr.any);
 
 	if (*sm != NULL && !((*sm)->state_flags & PFSTATE_NOSYNC) &&
 	    direction == PF_OUT &&
 	    V_pfsync_defer_ptr != NULL && V_pfsync_defer_ptr(*sm, m))
 		/*
 		 * We want the state created, but we dont
 		 * want to send this in case a partner
 		 * firewall has to know about it to allow
 		 * replies through it.
 		 */
 		return (PF_DEFER);
 
 	return (PF_PASS);
 
 cleanup:
 	/* Dropping: release the state keys from pf_get_translation(). */
 	if (sk != NULL)
 		uma_zfree(V_pf_state_key_z, sk);
 	if (nk != NULL)
 		uma_zfree(V_pf_state_key_z, nk);
 	return (PF_DROP);
 }
 
 /*
  * Allocate, initialise and insert a new state entry for the packet
  * described by pd, on behalf of filter rule r (and optional translation
  * rule nr / anchor rule a).
  *
  * Steps, in order: enforce r->max_states, obtain source-tracking nodes,
  * allocate the state, seed the per-protocol peer state and timeout, resolve
  * the route-to address when r->rt is set, run TCP normalisation when
  * PFDESC_TCP_NORM is set, build the state key when no translation rule
  * supplied one, and insert the state.  For a bare SYN matching a synproxy
  * rule the function answers with SYN|ACK itself.
  *
  * Outputs:
  *   *rewrite  set to 1 when the TCP sequence number in th was modulated.
  *   *sm       set to the new state once it has been inserted.
  *
  * Returns PF_PASS on success, PF_SYNPROXY_DROP when the SYN was answered by
  * the synproxy code, and PF_DROP on any failure.
  *
  * Ownership on failure: the caller returns immediately without touching
  * sk/nk, so every failure path that still owns the keys must free them
  * here, and every path that has allocated s must release it.
  * NOTE(review): the pf_state_insert() failure path does not free sk/nk;
  * presumably pf_state_insert() consumes them -- confirm against its
  * definition.
  */
 static int
 pf_create_state(struct pf_rule *r, struct pf_rule *nr, struct pf_rule *a,
     struct pf_pdesc *pd, struct pf_src_node *nsn, struct pf_state_key *nk,
     struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
     u_int16_t dport, int *rewrite, struct pfi_kif *kif, struct pf_state **sm,
     int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen)
 {
 	struct pf_state		*s = NULL;
 	struct pf_src_node	*sn = NULL;
 	struct tcphdr		*th = pd->hdr.tcp;
 	u_int16_t		 mss = V_tcp_mssdflt;
 	u_short			 reason;
 
 	/* check maximums */
 	if (r->max_states &&
 	    (counter_u64_fetch(r->states_cur) >= r->max_states)) {
 		counter_u64_add(V_pf_status.lcounters[LCNT_STATES], 1);
 		REASON_SET(&reason, PFRES_MAXSTATES);
 		goto csfailed;
 	}
 	/* src node for filter rule */
 	if ((r->rule_flag & PFRULE_SRCTRACK ||
 	    r->rpool.opts & PF_POOL_STICKYADDR) &&
 	    pf_insert_src_node(&sn, r, pd->src, pd->af) != 0) {
 		REASON_SET(&reason, PFRES_SRCLIMIT);
 		goto csfailed;
 	}
 	/* src node for translation rule */
 	if (nr != NULL && (nr->rpool.opts & PF_POOL_STICKYADDR) &&
 	    pf_insert_src_node(&nsn, nr, &sk->addr[pd->sidx], pd->af)) {
 		REASON_SET(&reason, PFRES_SRCLIMIT);
 		goto csfailed;
 	}
 	s = uma_zalloc(V_pf_state_z, M_NOWAIT | M_ZERO);
 	if (s == NULL) {
 		REASON_SET(&reason, PFRES_MEMORY);
 		goto csfailed;
 	}
 	s->rule.ptr = r;
 	s->nat_rule.ptr = nr;
 	s->anchor.ptr = a;
 	STATE_INC_COUNTERS(s);
 	if (r->allow_opts)
 		s->state_flags |= PFSTATE_ALLOWOPTS;
 	if (r->rule_flag & PFRULE_STATESLOPPY)
 		s->state_flags |= PFSTATE_SLOPPY;
 	s->log = r->log & PF_LOG_ALL;
 	s->sync_state = PFSYNC_S_NONE;
 	if (nr != NULL)
 		s->log |= nr->log & PF_LOG_ALL;
 	switch (pd->proto) {
 	case IPPROTO_TCP:
 		s->src.seqlo = ntohl(th->th_seq);
 		s->src.seqhi = s->src.seqlo + pd->p_len + 1;
 		if ((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN &&
 		    r->keep_state == PF_STATE_MODULATE) {
 			/* Generate sequence number modulator */
 			if ((s->src.seqdiff = pf_tcp_iss(pd) - s->src.seqlo) ==
 			    0)
 				s->src.seqdiff = 1;
 			pf_change_proto_a(m, &th->th_seq, &th->th_sum,
 			    htonl(s->src.seqlo + s->src.seqdiff), 0);
 			*rewrite = 1;
 		} else
 			s->src.seqdiff = 0;
 		if (th->th_flags & TH_SYN) {
 			s->src.seqhi++;
 			s->src.wscale = pf_get_wscale(m, off,
 			    th->th_off, pd->af);
 		}
 		s->src.max_win = MAX(ntohs(th->th_win), 1);
 		if (s->src.wscale & PF_WSCALE_MASK) {
 			/* Remove scale factor from initial window */
 			int win = s->src.max_win;
 			win += 1 << (s->src.wscale & PF_WSCALE_MASK);
 			s->src.max_win = (win - 1) >>
 			    (s->src.wscale & PF_WSCALE_MASK);
 		}
 		if (th->th_flags & TH_FIN)
 			s->src.seqhi++;
 		s->dst.seqhi = 1;
 		s->dst.max_win = 1;
 		s->src.state = TCPS_SYN_SENT;
 		s->dst.state = TCPS_CLOSED;
 		s->timeout = PFTM_TCP_FIRST_PACKET;
 		break;
 	case IPPROTO_UDP:
 		s->src.state = PFUDPS_SINGLE;
 		s->dst.state = PFUDPS_NO_TRAFFIC;
 		s->timeout = PFTM_UDP_FIRST_PACKET;
 		break;
 	case IPPROTO_ICMP:
 #ifdef INET6
 	case IPPROTO_ICMPV6:
 #endif
 		s->timeout = PFTM_ICMP_FIRST_PACKET;
 		break;
 	default:
 		s->src.state = PFOTHERS_SINGLE;
 		s->dst.state = PFOTHERS_NO_TRAFFIC;
 		s->timeout = PFTM_OTHER_FIRST_PACKET;
 	}
 
 	if (r->rt) {
 		if (pf_map_addr(pd->af, r, pd->src, &s->rt_addr, NULL, &sn)) {
 			REASON_SET(&reason, PFRES_MAPFAILED);
 			/*
 			 * s->src_node is not linked yet, so the source node
 			 * references are dropped by csfailed below.
 			 */
 			pf_src_tree_remove_state(s);
 			STATE_DEC_COUNTERS(s);
 			uma_zfree(V_pf_state_z, s);
 			goto csfailed;
 		}
 		s->rt_kif = r->rpool.cur->kif;
 	}
 
 	s->creation = time_uptime;
 	s->expire = time_uptime;
 
 	/*
 	 * From here on the source nodes are linked into the state, so failure
 	 * paths release them through pf_src_tree_remove_state() (drop_state)
 	 * rather than through csfailed.
 	 */
 	if (sn != NULL)
 		s->src_node = sn;
 	if (nsn != NULL) {
 		/* XXX We only modify one side for now. */
 		PF_ACPY(&nsn->raddr, &nk->addr[1], pd->af);
 		s->nat_src_node = nsn;
 	}
 	if (pd->proto == IPPROTO_TCP) {
 		if ((pd->flags & PFDESC_TCP_NORM) && pf_normalize_tcp_init(m,
 		    off, pd, th, &s->src, &s->dst)) {
 			REASON_SET(&reason, PFRES_MEMORY);
 			goto drop_state;
 		}
 		if ((pd->flags & PFDESC_TCP_NORM) && s->src.scrub &&
 		    pf_normalize_tcp_stateful(m, off, pd, &reason, th, s,
 		    &s->src, &s->dst, rewrite)) {
 			/* This really shouldn't happen!!! */
 			DPFPRINTF(PF_DEBUG_URGENT,
 			    ("pf_normalize_tcp_stateful failed on first pkt"));
 			pf_normalize_tcp_cleanup(s);
 			goto drop_state;
 		}
 	}
 	s->direction = pd->dir;
 
 	/*
 	 * sk/nk could already have been set up by pf_get_translation().
 	 */
 	if (nr == NULL) {
 		KASSERT((sk == NULL && nk == NULL), ("%s: nr %p sk %p, nk %p",
 		    __func__, nr, sk, nk));
 		sk = pf_state_key_setup(pd, pd->src, pd->dst, sport, dport);
 		if (sk == NULL) {
 			/*
 			 * The state already exists at this point; it must be
 			 * released here, because csfailed never frees s.
 			 * NOTE(review): PFRES_MEMORY assumes a NULL return
 			 * means allocation failure -- confirm.
 			 */
 			REASON_SET(&reason, PFRES_MEMORY);
 			if (pd->proto == IPPROTO_TCP)
 				pf_normalize_tcp_cleanup(s);
 			goto drop_state;
 		}
 		nk = sk;
 	} else
 		KASSERT((sk != NULL && nk != NULL), ("%s: nr %p sk %p, nk %p",
 		    __func__, nr, sk, nk));
 
 	/* Swap sk/nk for PF_OUT. */
 	if (pf_state_insert(BOUND_IFACE(r, kif),
 	    (pd->dir == PF_IN) ? sk : nk,
 	    (pd->dir == PF_IN) ? nk : sk, s)) {
 		if (pd->proto == IPPROTO_TCP)
 			pf_normalize_tcp_cleanup(s);
 		REASON_SET(&reason, PFRES_STATEINS);
 		pf_src_tree_remove_state(s);
 		STATE_DEC_COUNTERS(s);
 		uma_zfree(V_pf_state_z, s);
 		return (PF_DROP);
 	} else
 		*sm = s;
 
 	if (tag > 0)
 		s->tag = tag;
 	if (pd->proto == IPPROTO_TCP && (th->th_flags & (TH_SYN|TH_ACK)) ==
 	    TH_SYN && r->keep_state == PF_STATE_SYNPROXY) {
 		s->src.state = PF_TCPS_PROXY_SRC;
 		/* undo NAT changes, if they have taken place */
 		if (nr != NULL) {
 			struct pf_state_key *skt = s->key[PF_SK_WIRE];
 			if (pd->dir == PF_OUT)
 				skt = s->key[PF_SK_STACK];
 			PF_ACPY(pd->src, &skt->addr[pd->sidx], pd->af);
 			PF_ACPY(pd->dst, &skt->addr[pd->didx], pd->af);
 			if (pd->sport)
 				*pd->sport = skt->port[pd->sidx];
 			if (pd->dport)
 				*pd->dport = skt->port[pd->didx];
 			if (pd->proto_sum)
 				*pd->proto_sum = bproto_sum;
 			if (pd->ip_sum)
 				*pd->ip_sum = bip_sum;
 			m_copyback(m, off, hdrlen, pd->hdr.any);
 		}
 		s->src.seqhi = htonl(arc4random());
 		/* Find mss option */
 		int rtid = M_GETFIB(m);
 		mss = pf_get_mss(m, off, th->th_off, pd->af);
 		mss = pf_calc_mss(pd->src, pd->af, rtid, mss);
 		mss = pf_calc_mss(pd->dst, pd->af, rtid, mss);
 		s->src.mss = mss;
 		pf_send_tcp(NULL, r, pd->af, pd->dst, pd->src, th->th_dport,
 		    th->th_sport, s->src.seqhi, ntohl(th->th_seq) + 1,
 		    TH_SYN|TH_ACK, 0, s->src.mss, 0, 1, 0, NULL);
 		REASON_SET(&reason, PFRES_SYNPROXY);
 		return (PF_SYNPROXY_DROP);
 	}
 
 	return (PF_PASS);
 
 drop_state:
 	/*
 	 * The state was allocated and has its source nodes linked, but was
 	 * never inserted.  Release the state (which also drops the source
 	 * node references) and any state keys we still own.
 	 */
 	pf_src_tree_remove_state(s);
 	STATE_DEC_COUNTERS(s);
 	uma_zfree(V_pf_state_z, s);
 	if (sk != NULL)
 		uma_zfree(V_pf_state_key_z, sk);
 	if (nk != NULL)
 		uma_zfree(V_pf_state_key_z, nk);
 	return (PF_DROP);
 
 csfailed:
 	/*
 	 * Failure before the source nodes were linked into a state: free the
 	 * keys and drop the source node references by hand.
 	 */
 	if (sk != NULL)
 		uma_zfree(V_pf_state_key_z, sk);
 	if (nk != NULL)
 		uma_zfree(V_pf_state_key_z, nk);
 
 	if (sn != NULL) {
 		struct pf_srchash *sh;
 
 		sh = &V_pf_srchash[pf_hashsrc(&sn->addr, sn->af)];
 		PF_HASHROW_LOCK(sh);
 		if (--sn->states == 0 && sn->expire == 0) {
 			pf_unlink_src_node(sn);
 			uma_zfree(V_pf_sources_z, sn);
 			counter_u64_add(
 			    V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
 		}
 		PF_HASHROW_UNLOCK(sh);
 	}
 
 	if (nsn != sn && nsn != NULL) {
 		struct pf_srchash *sh;
 
 		sh = &V_pf_srchash[pf_hashsrc(&nsn->addr, nsn->af)];
 		PF_HASHROW_LOCK(sh);
 		if (--nsn->states == 0 && nsn->expire == 0) {
 			pf_unlink_src_node(nsn);
 			uma_zfree(V_pf_sources_z, nsn);
 			counter_u64_add(
 			    V_pf_status.scounters[SCNT_SRC_NODE_REMOVALS], 1);
 		}
 		PF_HASHROW_UNLOCK(sh);
 	}
 
 	return (PF_DROP);
 }
 
 /*
  * Walk the active filter ruleset of pf_main_ruleset for the packet
  * described by pd and decide whether it may pass.
  *
  * Rules that specify an OS fingerprint, UDP/TCP port operators, TCP flags
  * or an ICMP type/code are stepped over without matching (presumably
  * because a fragment cannot be checked against transport-layer criteria --
  * confirm against the caller).
  *
  * On every match *rm, *am and *rsm are updated to the matching rule, its
  * anchor rule and its ruleset; a "quick" rule ends the walk.
  * NOTE(review): *rm, *am and *rsm are read back after the loop even when
  * nothing matched, so the caller must pass them in already pointing at
  * valid defaults.
  *
  * Returns PF_PASS only when the final rule's action is PF_PASS and any
  * requested tag could be applied; PF_DROP otherwise.
  */
 static int
 pf_test_fragment(struct pf_rule **rm, int direction, struct pfi_kif *kif,
     struct mbuf *m, void *h, struct pf_pdesc *pd, struct pf_rule **am,
     struct pf_ruleset **rsm)
 {
 	struct pf_rule		*r, *a = NULL;
 	struct pf_ruleset	*ruleset = NULL;
 	sa_family_t		 af = pd->af;
 	u_short			 reason;
 	int			 tag = -1;
 	int			 asd = 0;
 	int			 match = 0;
 	struct pf_anchor_stackframe	anchor_stack[PF_ANCHOR_STACKSIZE];
 
 	PF_RULES_RASSERT();
 
 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_FILTER].active.ptr);
 	while (r != NULL) {
 		r->evaluations++;
 		/*
 		 * The first six tests jump via the rule's skip pointers;
 		 * the remaining tests simply advance to the next rule.
 		 */
 		if (pfi_kif_match(r->kif, kif) == r->ifnot)
 			r = r->skip[PF_SKIP_IFP].ptr;
 		else if (r->direction && r->direction != direction)
 			r = r->skip[PF_SKIP_DIR].ptr;
 		else if (r->af && r->af != af)
 			r = r->skip[PF_SKIP_AF].ptr;
 		else if (r->proto && r->proto != pd->proto)
 			r = r->skip[PF_SKIP_PROTO].ptr;
 		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
 		    r->src.neg, kif, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
 		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
 		    r->dst.neg, NULL, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
 		else if (r->tos && !(r->tos == pd->tos))
 			r = TAILQ_NEXT(r, entries);
 		else if (r->os_fingerprint != PF_OSFP_ANY)
 			r = TAILQ_NEXT(r, entries);
 		else if (pd->proto == IPPROTO_UDP &&
 		    (r->src.port_op || r->dst.port_op))
 			r = TAILQ_NEXT(r, entries);
 		else if (pd->proto == IPPROTO_TCP &&
 		    (r->src.port_op || r->dst.port_op || r->flagset))
 			r = TAILQ_NEXT(r, entries);
 		else if ((pd->proto == IPPROTO_ICMP ||
 		    pd->proto == IPPROTO_ICMPV6) &&
 		    (r->type || r->code))
 			r = TAILQ_NEXT(r, entries);
 		else if (r->prio &&
 		    !pf_match_ieee8021q_pcp(r->prio, m))
 			r = TAILQ_NEXT(r, entries);
 		/* probabilistic rule: skip unless the random draw is below prob */
 		else if (r->prob && r->prob <=
 		    (arc4random() % (UINT_MAX - 1) + 1))
 			r = TAILQ_NEXT(r, entries);
 		else if (r->match_tag && !pf_match_tag(m, r, &tag,
 		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
 			r = TAILQ_NEXT(r, entries);
 		else {
 			if (r->anchor == NULL) {
 				/* plain rule matched: record it for the caller */
 				match = 1;
 				*rm = r;
 				*am = a;
 				*rsm = ruleset;
 				if ((*rm)->quick)
 					break;
 				r = TAILQ_NEXT(r, entries);
 			} else
 				pf_step_into_anchor(anchor_stack, &asd,
 				    &ruleset, PF_RULESET_FILTER, &r, &a,
 				    &match);
 		}
 		/* end of a ruleset: return to the enclosing anchor, if any */
 		if (r == NULL && pf_step_out_of_anchor(anchor_stack, &asd,
 		    &ruleset, PF_RULESET_FILTER, &r, &a, &match))
 			break;
 	}
 	/* continue with the last rule recorded (or the caller's default) */
 	r = *rm;
 	a = *am;
 	ruleset = *rsm;
 
 	REASON_SET(&reason, PFRES_MATCH);
 
 	if (r->log)
 		PFLOG_PACKET(kif, m, af, direction, reason, r, a, ruleset, pd,
 		    1);
 
 	if (r->action != PF_PASS)
 		return (PF_DROP);
 
 	if (tag > 0 && pf_tag_packet(m, pd, tag)) {
 		REASON_SET(&reason, PFRES_MEMORY);
 		return (PF_DROP);
 	}
 
 	return (PF_PASS);
 }
 
 /*
  * Strict TCP state tracking: validate the segment in pd against the
  * sequence/ack windows recorded for the two peers of *state and update
  * those windows, the peer states and the state timeout.
  *
  *   src, dst   peers of *state for the sender and the receiver of this
  *              segment (chosen by the caller)
  *   kif        interface, used only when a RST is sent back
  *   m, off     mbuf and offset handed to the helpers that read or rewrite
  *              the TCP header/options
  *   reason     receives the drop reason on failure
  *   copyback   set to 1 when the header in th was modified and must be
  *              copied back into the mbuf by the caller
  *
  * Returns PF_PASS when the segment is accepted (strict or "loose" match)
  * and PF_DROP otherwise.
  */
 static int
 pf_tcp_track_full(struct pf_state_peer *src, struct pf_state_peer *dst,
 	struct pf_state **state, struct pfi_kif *kif, struct mbuf *m, int off,
 	struct pf_pdesc *pd, u_short *reason, int *copyback)
 {
 	struct tcphdr		*th = pd->hdr.tcp;
 	u_int16_t		 win = ntohs(th->th_win);
 	u_int32_t		 ack, end, seq, orig_seq;
 	u_int8_t		 sws, dws;
 	int			 ackskew;
 
 	/* window scaling applies only if both peers set it and this is no SYN */
 	if (src->wscale && dst->wscale && !(th->th_flags & TH_SYN)) {
 		sws = src->wscale & PF_WSCALE_MASK;
 		dws = dst->wscale & PF_WSCALE_MASK;
 	} else
 		sws = dws = 0;
 
 	/*
 	 * Sequence tracking algorithm from Guido van Rooij's paper:
 	 *   http://www.madison-gurkha.com/publications/tcp_filtering/
 	 *	tcp_filtering.ps
 	 */
 
 	orig_seq = seq = ntohl(th->th_seq);
 	if (src->seqlo == 0) {
 		/* First packet from this end. Set its state */
 
 		if ((pd->flags & PFDESC_TCP_NORM || dst->scrub) &&
 		    src->scrub == NULL) {
 			if (pf_normalize_tcp_init(m, off, pd, th, src, dst)) {
 				REASON_SET(reason, PFRES_MEMORY);
 				return (PF_DROP);
 			}
 		}
 
 		/* Deferred generation of sequence number modulator */
 		if (dst->seqdiff && !src->seqdiff) {
 			/* use random iss for the TCP server */
 			while ((src->seqdiff = arc4random() - seq) == 0)
 				;
 			ack = ntohl(th->th_ack) - dst->seqdiff;
 			pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq +
 			    src->seqdiff), 0);
 			pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0);
 			*copyback = 1;
 		} else {
 			ack = ntohl(th->th_ack);
 		}
 
 		/* SYN and FIN each occupy one sequence number */
 		end = seq + pd->p_len;
 		if (th->th_flags & TH_SYN) {
 			end++;
 			if (dst->wscale & PF_WSCALE_FLAG) {
 				src->wscale = pf_get_wscale(m, off, th->th_off,
 				    pd->af);
 				if (src->wscale & PF_WSCALE_FLAG) {
 					/* Remove scale factor from initial
 					 * window */
 					sws = src->wscale & PF_WSCALE_MASK;
 					win = ((u_int32_t)win + (1 << sws) - 1)
 					    >> sws;
 					dws = dst->wscale & PF_WSCALE_MASK;
 				} else {
 					/* fixup other window */
 					dst->max_win <<= dst->wscale &
 					    PF_WSCALE_MASK;
 					/* in case of a retrans SYN|ACK */
 					dst->wscale = 0;
 				}
 			}
 		}
 		if (th->th_flags & TH_FIN)
 			end++;
 
 		src->seqlo = seq;
 		if (src->state < TCPS_SYN_SENT)
 			src->state = TCPS_SYN_SENT;
 
 		/*
 		 * May need to slide the window (seqhi may have been set by
 		 * the crappy stack check or if we picked up the connection
 		 * after establishment)
 		 */
 		if (src->seqhi == 1 ||
 		    SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi))
 			src->seqhi = end + MAX(1, dst->max_win << dws);
 		if (win > src->max_win)
 			src->max_win = win;
 
 	} else {
 		/* peer already seen: undo the receiver's modulation on the ack */
 		ack = ntohl(th->th_ack) - dst->seqdiff;
 		if (src->seqdiff) {
 			/* Modulate sequence numbers */
 			pf_change_proto_a(m, &th->th_seq, &th->th_sum, htonl(seq +
 			    src->seqdiff), 0);
 			pf_change_proto_a(m, &th->th_ack, &th->th_sum, htonl(ack), 0);
 			*copyback = 1;
 		}
 		end = seq + pd->p_len;
 		if (th->th_flags & TH_SYN)
 			end++;
 		if (th->th_flags & TH_FIN)
 			end++;
 	}
 
 	if ((th->th_flags & TH_ACK) == 0) {
 		/* Let it pass through the ack skew check */
 		ack = dst->seqlo;
 	} else if ((ack == 0 &&
 	    (th->th_flags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) ||
 	    /* broken tcp stacks do not set ack */
 	    (dst->state < TCPS_SYN_SENT)) {
 		/*
 		 * Many stacks (ours included) will set the ACK number in an
 		 * FIN|ACK if the SYN times out -- no sequence to ACK.
 		 */
 		ack = dst->seqlo;
 	}
 
 	if (seq == end) {
 		/* Ease sequencing restrictions on no data packets */
 		seq = src->seqlo;
 		end = seq;
 	}
 
 	/* signed distance between what the peer has sent and what is acked */
 	ackskew = dst->seqlo - ack;
 
 
 	/*
 	 * Need to demodulate the sequence numbers in any TCP SACK options
 	 * (Selective ACK). We could optionally validate the SACK values
 	 * against the current ACK window, either forwards or backwards, but
 	 * I'm not confident that SACK has been implemented properly
 	 * everywhere. It wouldn't surprise me if several stacks accidentally
 	 * SACK too far backwards of previously ACKed data. There really aren't
 	 * any security implications of bad SACKing unless the target stack
 	 * doesn't validate the option length correctly. Someone trying to
 	 * spoof into a TCP connection won't bother blindly sending SACK
 	 * options anyway.
 	 */
 	if (dst->seqdiff && (th->th_off << 2) > sizeof(struct tcphdr)) {
 		if (pf_modulate_sack(m, off, pd, th, dst))
 			*copyback = 1;
 	}
 
 
 	/*
 	 * In the condition below each explanatory comment FOLLOWS the
 	 * sub-expression it describes.
 	 */
 #define	MAXACKWINDOW (0xffff + 1500)	/* 1500 is an arbitrary fudge factor */
 	if (SEQ_GEQ(src->seqhi, end) &&
 	    /* Last octet inside other's window space */
 	    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) &&
 	    /* Retrans: not more than one window back */
 	    (ackskew >= -MAXACKWINDOW) &&
 	    /* Acking not more than one reassembled fragment backwards */
 	    (ackskew <= (MAXACKWINDOW << sws)) &&
 	    /* Acking not more than one window forward */
 	    ((th->th_flags & TH_RST) == 0 || orig_seq == src->seqlo ||
 	    (orig_seq == src->seqlo + 1) || (orig_seq + 1 == src->seqlo) ||
 	    (pd->flags & PFDESC_IP_REAS) == 0)) {
 	    /* Require an exact/+1 sequence match on resets when possible */
 
 		if (dst->scrub || src->scrub) {
 			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
 			    *state, src, dst, copyback))
 				return (PF_DROP);
 		}
 
 		/* update max window */
 		if (src->max_win < win)
 			src->max_win = win;
 		/* synchronize sequencing */
 		if (SEQ_GT(end, src->seqlo))
 			src->seqlo = end;
 		/* slide the window of what the other end can send */
 		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
 			dst->seqhi = ack + MAX((win << sws), 1);
 
 
 		/* update states */
 		if (th->th_flags & TH_SYN)
 			if (src->state < TCPS_SYN_SENT)
 				src->state = TCPS_SYN_SENT;
 		if (th->th_flags & TH_FIN)
 			if (src->state < TCPS_CLOSING)
 				src->state = TCPS_CLOSING;
 		if (th->th_flags & TH_ACK) {
 			if (dst->state == TCPS_SYN_SENT) {
 				dst->state = TCPS_ESTABLISHED;
 				/* both ends established: enforce source limits */
 				if (src->state == TCPS_ESTABLISHED &&
 				    (*state)->src_node != NULL &&
 				    pf_src_connlimit(state)) {
 					REASON_SET(reason, PFRES_SRCLIMIT);
 					return (PF_DROP);
 				}
 			} else if (dst->state == TCPS_CLOSING)
 				dst->state = TCPS_FIN_WAIT_2;
 		}
 		if (th->th_flags & TH_RST)
 			src->state = dst->state = TCPS_TIME_WAIT;
 
 		/* update expire time */
 		(*state)->expire = time_uptime;
 		if (src->state >= TCPS_FIN_WAIT_2 &&
 		    dst->state >= TCPS_FIN_WAIT_2)
 			(*state)->timeout = PFTM_TCP_CLOSED;
 		else if (src->state >= TCPS_CLOSING &&
 		    dst->state >= TCPS_CLOSING)
 			(*state)->timeout = PFTM_TCP_FIN_WAIT;
 		else if (src->state < TCPS_ESTABLISHED ||
 		    dst->state < TCPS_ESTABLISHED)
 			(*state)->timeout = PFTM_TCP_OPENING;
 		else if (src->state >= TCPS_CLOSING ||
 		    dst->state >= TCPS_CLOSING)
 			(*state)->timeout = PFTM_TCP_CLOSING;
 		else
 			(*state)->timeout = PFTM_TCP_ESTABLISHED;
 
 		/* Fall through to PASS packet */
 
 	} else if ((dst->state < TCPS_SYN_SENT ||
 		dst->state >= TCPS_FIN_WAIT_2 ||
 		src->state >= TCPS_FIN_WAIT_2) &&
 	    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) &&
 	    /* Within a window forward of the originating packet */
 	    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) {
 	    /* Within a window backward of the originating packet */
 
 		/*
 		 * This currently handles three situations:
 		 *  1) Stupid stacks will shotgun SYNs before their peer
 		 *     replies.
 		 *  2) When PF catches an already established stream (the
 		 *     firewall rebooted, the state table was flushed, routes
 		 *     changed...)
 		 *  3) Packets get funky immediately after the connection
 		 *     closes (this should catch Solaris spurious ACK|FINs
 		 *     that web servers like to spew after a close)
 		 *
 		 * This must be a little more careful than the above code
 		 * since packet floods will also be caught here. We don't
 		 * update the TTL here to mitigate the damage of a packet
 		 * flood and so the same code can handle awkward establishment
 		 * and a loosened connection close.
 		 * In the establishment case, a correct peer response will
 		 * validate the connection, go through the normal state code
 		 * and keep updating the state TTL.
 		 */
 
 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
 			printf("pf: loose state match: ");
 			pf_print_state(*state);
 			pf_print_flags(th->th_flags);
 			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
 			    "pkts=%llu:%llu dir=%s,%s\n", seq, orig_seq, ack,
 			    pd->p_len, ackskew, (unsigned long long)(*state)->packets[0],
 			    (unsigned long long)(*state)->packets[1],
 			    pd->dir == PF_IN ? "in" : "out",
 			    pd->dir == (*state)->direction ? "fwd" : "rev");
 		}
 
 		if (dst->scrub || src->scrub) {
 			if (pf_normalize_tcp_stateful(m, off, pd, reason, th,
 			    *state, src, dst, copyback))
 				return (PF_DROP);
 		}
 
 		/* update max window */
 		if (src->max_win < win)
 			src->max_win = win;
 		/* synchronize sequencing */
 		if (SEQ_GT(end, src->seqlo))
 			src->seqlo = end;
 		/* slide the window of what the other end can send */
 		if (SEQ_GEQ(ack + (win << sws), dst->seqhi))
 			dst->seqhi = ack + MAX((win << sws), 1);
 
 		/*
 		 * Cannot set dst->seqhi here since this could be a shotgunned
 		 * SYN and not an already established connection.
 		 *
 		 * NOTE(review): the statement just above does assign
 		 * dst->seqhi, so this remark and the code disagree; one of
 		 * them is stale.  Confirm which is intended.
 		 */
 
 		if (th->th_flags & TH_FIN)
 			if (src->state < TCPS_CLOSING)
 				src->state = TCPS_CLOSING;
 		if (th->th_flags & TH_RST)
 			src->state = dst->state = TCPS_TIME_WAIT;
 
 		/* Fall through to PASS packet */
 
 	} else {
 		if ((*state)->dst.state == TCPS_SYN_SENT &&
 		    (*state)->src.state == TCPS_SYN_SENT) {
 			/* Send RST for state mismatches during handshake */
 			if (!(th->th_flags & TH_RST))
 				pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
 				    pd->dst, pd->src, th->th_dport,
 				    th->th_sport, ntohl(th->th_ack), 0,
 				    TH_RST, 0, 0,
 				    (*state)->rule.ptr->return_ttl, 1, 0,
 				    kif->pfik_ifp);
 			/* reset this peer so its next packet counts as the first */
 			src->seqlo = 0;
 			src->seqhi = 1;
 			src->max_win = 1;
 		} else if (V_pf_status.debug >= PF_DEBUG_MISC) {
 			printf("pf: BAD state: ");
 			pf_print_state(*state);
 			pf_print_flags(th->th_flags);
 			printf(" seq=%u (%u) ack=%u len=%u ackskew=%d "
 			    "pkts=%llu:%llu dir=%s,%s\n",
 			    seq, orig_seq, ack, pd->p_len, ackskew,
 			    (unsigned long long)(*state)->packets[0],
 			    (unsigned long long)(*state)->packets[1],
 			    pd->dir == PF_IN ? "in" : "out",
 			    pd->dir == (*state)->direction ? "fwd" : "rev");
 			/* digits 1-6 flag which of the window tests failed */
 			printf("pf: State failure on: %c %c %c %c | %c %c\n",
 			    SEQ_GEQ(src->seqhi, end) ? ' ' : '1',
 			    SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) ?
 			    ' ': '2',
 			    (ackskew >= -MAXACKWINDOW) ? ' ' : '3',
 			    (ackskew <= (MAXACKWINDOW << sws)) ? ' ' : '4',
 			    SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) ?' ' :'5',
 			    SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW) ?' ' :'6');
 		}
 		REASON_SET(reason, PFRES_BADSTATE);
 		return (PF_DROP);
 	}
 
 	return (PF_PASS);
 }
 
 /*
  * Sloppy TCP state tracking: advance the peer states of *state from the
  * TCP flags of the segment in pd alone, without any sequence-number
  * checks, then refresh the state's expiry time and timeout class.
  *
  * Returns PF_DROP (with *reason = PFRES_SRCLIMIT) when establishing the
  * connection trips pf_src_connlimit(); PF_PASS otherwise.
  */
 static int
 pf_tcp_track_sloppy(struct pf_state_peer *src, struct pf_state_peer *dst,
 	struct pf_state **state, struct pf_pdesc *pd, u_short *reason)
 {
 	struct tcphdr	*tcp = pd->hdr.tcp;
 	int		 tmo;
 
 	/* SYN and FIN can only move the sender's state forward. */
 	if ((tcp->th_flags & TH_SYN) && src->state < TCPS_SYN_SENT)
 		src->state = TCPS_SYN_SENT;
 	if ((tcp->th_flags & TH_FIN) && src->state < TCPS_CLOSING)
 		src->state = TCPS_CLOSING;
 
 	if (tcp->th_flags & TH_ACK) {
 		if (dst->state == TCPS_SYN_SENT) {
 			/* The receiver's SYN is being acknowledged. */
 			dst->state = TCPS_ESTABLISHED;
 			if (src->state == TCPS_ESTABLISHED &&
 			    (*state)->src_node != NULL &&
 			    pf_src_connlimit(state)) {
 				REASON_SET(reason, PFRES_SRCLIMIT);
 				return (PF_DROP);
 			}
 		} else if (dst->state == TCPS_CLOSING) {
 			/* The receiver's FIN is being acknowledged. */
 			dst->state = TCPS_FIN_WAIT_2;
 		} else if (src->state == TCPS_SYN_SENT &&
 		    dst->state < TCPS_SYN_SENT) {
 			/*
 			 * Only one half of the connection is visible: an ACK
 			 * following the initial SYN, with nothing ever seen
 			 * from the destination, establishes both peers.
 			 */
 			src->state = TCPS_ESTABLISHED;
 			dst->state = TCPS_ESTABLISHED;
 			if ((*state)->src_node != NULL &&
 			    pf_src_connlimit(state)) {
 				REASON_SET(reason, PFRES_SRCLIMIT);
 				return (PF_DROP);
 			}
 		} else if (src->state == TCPS_CLOSING &&
 		    dst->state == TCPS_ESTABLISHED &&
 		    dst->seqlo == 0) {
 			/*
 			 * Half connection being closed: the complete
 			 * bidirectional FIN/ACK+ACK exchange is never seen.
 			 */
 			dst->state = TCPS_CLOSING;
 		}
 	}
 
 	/* A reset tears down both directions at once. */
 	if (tcp->th_flags & TH_RST) {
 		dst->state = TCPS_TIME_WAIT;
 		src->state = TCPS_TIME_WAIT;
 	}
 
 	/* Pick the timeout class from the resulting pair of peer states. */
 	if (src->state >= TCPS_FIN_WAIT_2 && dst->state >= TCPS_FIN_WAIT_2)
 		tmo = PFTM_TCP_CLOSED;
 	else if (src->state >= TCPS_CLOSING && dst->state >= TCPS_CLOSING)
 		tmo = PFTM_TCP_FIN_WAIT;
 	else if (src->state < TCPS_ESTABLISHED ||
 	    dst->state < TCPS_ESTABLISHED)
 		tmo = PFTM_TCP_OPENING;
 	else if (src->state >= TCPS_CLOSING || dst->state >= TCPS_CLOSING)
 		tmo = PFTM_TCP_CLOSING;
 	else
 		tmo = PFTM_TCP_ESTABLISHED;
 
 	(*state)->expire = time_uptime;
 	(*state)->timeout = tmo;
 
 	return (PF_PASS);
 }
 
 /*
  * Match a TCP segment against the state table and, when a state is found,
  * run it through synproxy handling, state tracking and address/port
  * translation.
  *
  *   state      receives the state found by STATE_LOOKUP; set to NULL when
  *              a closed state is unlinked for reuse
  *   direction  PF_IN or PF_OUT; selects how the lookup key is laid out
  *   m, off     mbuf and offset of the TCP header inside it
  *   pd         packet descriptor; pd->hdr.tcp is the header that is
  *              inspected and rewritten
  *   reason     receives the drop reason
  *
  * Returns PF_PASS, PF_DROP or PF_SYNPROXY_DROP.
  * NOTE(review): STATE_LOOKUP is a macro defined elsewhere; it presumably
  * returns from this function when no state matches -- confirm.
  */
 static int
 pf_test_state_tcp(struct pf_state **state, int direction, struct pfi_kif *kif,
     struct mbuf *m, int off, void *h, struct pf_pdesc *pd,
     u_short *reason)
 {
 	struct pf_state_key_cmp	 key;
 	struct tcphdr		*th = pd->hdr.tcp;
 	int			 copyback = 0;
 	struct pf_state_peer	*src, *dst;
 	struct pf_state_key	*sk;
 
 	bzero(&key, sizeof(key));
 	key.af = pd->af;
 	key.proto = IPPROTO_TCP;
 	if (direction == PF_IN)	{	/* wire side, straight */
 		PF_ACPY(&key.addr[0], pd->src, key.af);
 		PF_ACPY(&key.addr[1], pd->dst, key.af);
 		key.port[0] = th->th_sport;
 		key.port[1] = th->th_dport;
 	} else {			/* stack side, reverse */
 		PF_ACPY(&key.addr[1], pd->src, key.af);
 		PF_ACPY(&key.addr[0], pd->dst, key.af);
 		key.port[1] = th->th_sport;
 		key.port[0] = th->th_dport;
 	}
 
 	STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 	/* src is the peer sending this segment, dst the peer receiving it */
 	if (direction == (*state)->direction) {
 		src = &(*state)->src;
 		dst = &(*state)->dst;
 	} else {
 		src = &(*state)->dst;
 		dst = &(*state)->src;
 	}
 
 	sk = (*state)->key[pd->didx];
 
 	/* synproxy, first half: handshake between the initiator and pf */
 	if ((*state)->src.state == PF_TCPS_PROXY_SRC) {
 		if (direction != (*state)->direction) {
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_SYNPROXY_DROP);
 		}
 		if (th->th_flags & TH_SYN) {
 			/* retransmitted SYN: answer with the same SYN|ACK */
 			if (ntohl(th->th_seq) != (*state)->src.seqlo) {
 				REASON_SET(reason, PFRES_SYNPROXY);
 				return (PF_DROP);
 			}
 			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
 			    pd->src, th->th_dport, th->th_sport,
 			    (*state)->src.seqhi, ntohl(th->th_seq) + 1,
 			    TH_SYN|TH_ACK, 0, (*state)->src.mss, 0, 1, 0, NULL);
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_SYNPROXY_DROP);
 		} else if ((th->th_flags & (TH_ACK|TH_RST|TH_FIN)) != TH_ACK ||
 		    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
 		    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_DROP);
 		} else if ((*state)->src_node != NULL &&
 		    pf_src_connlimit(state)) {
 			REASON_SET(reason, PFRES_SRCLIMIT);
 			return (PF_DROP);
 		} else
 			(*state)->src.state = PF_TCPS_PROXY_DST;
 	}
 	/* synproxy, second half: handshake between pf and the destination */
 	if ((*state)->src.state == PF_TCPS_PROXY_DST) {
 		if (direction == (*state)->direction) {
 			if (((th->th_flags & (TH_SYN|TH_ACK)) != TH_ACK) ||
 			    (ntohl(th->th_ack) != (*state)->src.seqhi + 1) ||
 			    (ntohl(th->th_seq) != (*state)->src.seqlo + 1)) {
 				REASON_SET(reason, PFRES_SYNPROXY);
 				return (PF_DROP);
 			}
 			(*state)->src.max_win = MAX(ntohs(th->th_win), 1);
 			/* seqhi == 1 means no ISS was chosen for this side yet */
 			if ((*state)->dst.seqhi == 1)
 				(*state)->dst.seqhi = htonl(arc4random());
 			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
 			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
 			    sk->port[pd->sidx], sk->port[pd->didx],
 			    (*state)->dst.seqhi, 0, TH_SYN, 0,
 			    (*state)->src.mss, 0, 0, (*state)->tag, NULL);
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_SYNPROXY_DROP);
 		} else if (((th->th_flags & (TH_SYN|TH_ACK)) !=
 		    (TH_SYN|TH_ACK)) ||
 		    (ntohl(th->th_ack) != (*state)->dst.seqhi + 1)) {
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_DROP);
 		} else {
 			/*
 			 * SYN|ACK from the destination: ACK it, send the
 			 * final ACK towards the destination side, then
 			 * derive the sequence offsets between both halves.
 			 */
 			(*state)->dst.max_win = MAX(ntohs(th->th_win), 1);
 			(*state)->dst.seqlo = ntohl(th->th_seq);
 			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af, pd->dst,
 			    pd->src, th->th_dport, th->th_sport,
 			    ntohl(th->th_ack), ntohl(th->th_seq) + 1,
 			    TH_ACK, (*state)->src.max_win, 0, 0, 0,
 			    (*state)->tag, NULL);
 			pf_send_tcp(NULL, (*state)->rule.ptr, pd->af,
 			    &sk->addr[pd->sidx], &sk->addr[pd->didx],
 			    sk->port[pd->sidx], sk->port[pd->didx],
 			    (*state)->src.seqhi + 1, (*state)->src.seqlo + 1,
 			    TH_ACK, (*state)->dst.max_win, 0, 0, 1, 0, NULL);
 			(*state)->src.seqdiff = (*state)->dst.seqhi -
 			    (*state)->src.seqlo;
 			(*state)->dst.seqdiff = (*state)->src.seqhi -
 			    (*state)->dst.seqlo;
 			(*state)->src.seqhi = (*state)->src.seqlo +
 			    (*state)->dst.max_win;
 			(*state)->dst.seqhi = (*state)->dst.seqlo +
 			    (*state)->src.max_win;
 			(*state)->src.wscale = (*state)->dst.wscale = 0;
 			(*state)->src.state = (*state)->dst.state =
 			    TCPS_ESTABLISHED;
 			REASON_SET(reason, PFRES_SYNPROXY);
 			return (PF_SYNPROXY_DROP);
 		}
 	}
 
 	/*
 	 * A bare SYN on a state whose both peers are already closed: unlink
 	 * the old state and drop this packet.
 	 */
 	if (((th->th_flags & (TH_SYN|TH_ACK)) == TH_SYN) &&
 	    dst->state >= TCPS_FIN_WAIT_2 &&
 	    src->state >= TCPS_FIN_WAIT_2) {
 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
 			printf("pf: state reuse ");
 			pf_print_state(*state);
 			pf_print_flags(th->th_flags);
 			printf("\n");
 		}
 		/* XXX make sure it's the same direction ?? */
 		(*state)->src.state = (*state)->dst.state = TCPS_CLOSED;
 		pf_unlink_state(*state, PF_ENTER_LOCKED);
 		*state = NULL;
 		return (PF_DROP);
 	}
 
 	/* sloppy states skip the sequence-window checks */
 	if ((*state)->state_flags & PFSTATE_SLOPPY) {
 		if (pf_tcp_track_sloppy(src, dst, state, pd, reason) == PF_DROP)
 			return (PF_DROP);
 	} else {
 		if (pf_tcp_track_full(src, dst, state, kif, m, off, pd, reason,
 		    &copyback) == PF_DROP)
 			return (PF_DROP);
 	}
 
 	/* translate source/destination address, if necessary */
 	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
 		struct pf_state_key *nk = (*state)->key[pd->didx];
 
 		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
 		    nk->port[pd->sidx] != th->th_sport)
 			pf_change_ap(m, pd->src, &th->th_sport,
 			    pd->ip_sum, &th->th_sum, &nk->addr[pd->sidx],
 			    nk->port[pd->sidx], 0, pd->af);
 
 		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
 		    nk->port[pd->didx] != th->th_dport)
 			pf_change_ap(m, pd->dst, &th->th_dport,
 			    pd->ip_sum, &th->th_sum, &nk->addr[pd->didx],
 			    nk->port[pd->didx], 0, pd->af);
 		copyback = 1;
 	}
 
 	/* Copyback sequence modulation or stateful scrub changes if needed */
 	if (copyback)
 		m_copyback(m, off, sizeof(*th), (caddr_t)th);
 
 	return (PF_PASS);
 }
 
 static int
 pf_test_state_udp(struct pf_state **state, int direction, struct pfi_kif *kif,
     struct mbuf *m, int off, void *h, struct pf_pdesc *pd)
 {
 	struct pf_state_peer	*src, *dst;
 	struct pf_state_key_cmp	 key;
 	struct udphdr		*uh = pd->hdr.udp;
 
 	bzero(&key, sizeof(key));
 	key.af = pd->af;
 	key.proto = IPPROTO_UDP;
 	if (direction == PF_IN)	{	/* wire side, straight */
 		PF_ACPY(&key.addr[0], pd->src, key.af);
 		PF_ACPY(&key.addr[1], pd->dst, key.af);
 		key.port[0] = uh->uh_sport;
 		key.port[1] = uh->uh_dport;
 	} else {			/* stack side, reverse */
 		PF_ACPY(&key.addr[1], pd->src, key.af);
 		PF_ACPY(&key.addr[0], pd->dst, key.af);
 		key.port[1] = uh->uh_sport;
 		key.port[0] = uh->uh_dport;
 	}
 
 	STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 	if (direction == (*state)->direction) {
 		src = &(*state)->src;
 		dst = &(*state)->dst;
 	} else {
 		src = &(*state)->dst;
 		dst = &(*state)->src;
 	}
 
 	/* update states */
 	if (src->state < PFUDPS_SINGLE)
 		src->state = PFUDPS_SINGLE;
 	if (dst->state == PFUDPS_SINGLE)
 		dst->state = PFUDPS_MULTIPLE;
 
 	/* update expire time */
 	(*state)->expire = time_uptime;
 	if (src->state == PFUDPS_MULTIPLE && dst->state == PFUDPS_MULTIPLE)
 		(*state)->timeout = PFTM_UDP_MULTIPLE;
 	else
 		(*state)->timeout = PFTM_UDP_SINGLE;
 
 	/* translate source/destination address, if necessary */
 	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
 		struct pf_state_key *nk = (*state)->key[pd->didx];
 
 		if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], pd->af) ||
 		    nk->port[pd->sidx] != uh->uh_sport)
 			pf_change_ap(m, pd->src, &uh->uh_sport, pd->ip_sum,
 			    &uh->uh_sum, &nk->addr[pd->sidx],
 			    nk->port[pd->sidx], 1, pd->af);
 
 		if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], pd->af) ||
 		    nk->port[pd->didx] != uh->uh_dport)
 			pf_change_ap(m, pd->dst, &uh->uh_dport, pd->ip_sum,
 			    &uh->uh_sum, &nk->addr[pd->didx],
 			    nk->port[pd->didx], 1, pd->af);
 		m_copyback(m, off, sizeof(*uh), (caddr_t)uh);
 	}
 
 	return (PF_PASS);
 }
 
 /*
  * State lookup for ICMP and ICMPv6 packets.
  *
  * Messages that are not ICMP errors are matched against a state keyed on the
  * outer addresses and the ICMP id.  ICMP error messages are matched against
  * the state of the packet quoted in their payload: the embedded IP/IPv6
  * header and the leading bytes of the embedded TCP, UDP, ICMP or other
  * protocol header are pulled out of the mbuf and the key is built from
  * them, with the source/destination indices swapped relative to the outer
  * packet.
  *
  * When the matched state's wire and stack keys differ, the affected
  * addresses/ports/ids are rewritten and the modified headers are copied back
  * into the mbuf with m_copyback().
  *
  * state     - out: the matched state.
  * direction - PF_IN or PF_OUT, direction of the outer packet.
  * kif       - interface passed to the state lookup.
  * m, off    - mbuf chain and offset of the ICMP header within it.
  * pd        - packet description of the outer packet.
  * reason    - out: drop reason, set on the PF_DROP paths below.
  *
  * Returns PF_PASS once a state has been matched, PF_DROP when the quoted
  * packet is too short or inconsistent.  STATE_LOOKUP is a macro defined
  * elsewhere; presumably it returns from this function on its own when no
  * state matches -- see its definition.
  */
 static int
 pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
     struct mbuf *m, int off, void *h, struct pf_pdesc *pd, u_short *reason)
 {
 	struct pf_addr  *saddr = pd->src, *daddr = pd->dst;
 	u_int16_t	 icmpid = 0, *icmpsum;
 	u_int8_t	 icmptype, icmpcode;
 	int		 state_icmp = 0;
 	struct pf_state_key_cmp key;
 
 	bzero(&key, sizeof(key));
 	/* Classify the message: state_icmp != 0 marks an ICMP error. */
 	switch (pd->proto) {
 #ifdef INET
 	case IPPROTO_ICMP:
 		icmptype = pd->hdr.icmp->icmp_type;
 		icmpcode = pd->hdr.icmp->icmp_code;
 		icmpid = pd->hdr.icmp->icmp_id;
 		icmpsum = &pd->hdr.icmp->icmp_cksum;
 
 		if (icmptype == ICMP_UNREACH ||
 		    icmptype == ICMP_SOURCEQUENCH ||
 		    icmptype == ICMP_REDIRECT ||
 		    icmptype == ICMP_TIMXCEED ||
 		    icmptype == ICMP_PARAMPROB)
 			state_icmp++;
 		break;
 #endif /* INET */
 #ifdef INET6
 	case IPPROTO_ICMPV6:
 		icmptype = pd->hdr.icmp6->icmp6_type;
 		icmpcode = pd->hdr.icmp6->icmp6_code;
 		icmpid = pd->hdr.icmp6->icmp6_id;
 		icmpsum = &pd->hdr.icmp6->icmp6_cksum;
 
 		if (icmptype == ICMP6_DST_UNREACH ||
 		    icmptype == ICMP6_PACKET_TOO_BIG ||
 		    icmptype == ICMP6_TIME_EXCEEDED ||
 		    icmptype == ICMP6_PARAM_PROB)
 			state_icmp++;
 		break;
 #endif /* INET6 */
 	}
 
 	if (!state_icmp) {
 
 		/*
 		 * ICMP query/reply message not related to a TCP/UDP packet.
 		 * Search for an ICMP state.
 		 */
 		key.af = pd->af;
 		key.proto = pd->proto;
 		key.port[0] = key.port[1] = icmpid;
 		if (direction == PF_IN)	{	/* wire side, straight */
 			PF_ACPY(&key.addr[0], pd->src, key.af);
 			PF_ACPY(&key.addr[1], pd->dst, key.af);
 		} else {			/* stack side, reverse */
 			PF_ACPY(&key.addr[1], pd->src, key.af);
 			PF_ACPY(&key.addr[0], pd->dst, key.af);
 		}
 
 		STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 		(*state)->expire = time_uptime;
 		(*state)->timeout = PFTM_ICMP_ERROR_REPLY;
 
 		/* translate source/destination address, if necessary */
 		if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
 			struct pf_state_key *nk = (*state)->key[pd->didx];
 
 			switch (pd->af) {
 #ifdef INET
 			case AF_INET:
 				if (PF_ANEQ(pd->src,
 				    &nk->addr[pd->sidx], AF_INET))
 					pf_change_a(&saddr->v4.s_addr,
 					    pd->ip_sum,
 					    nk->addr[pd->sidx].v4.s_addr, 0);
 
 				if (PF_ANEQ(pd->dst, &nk->addr[pd->didx],
 				    AF_INET))
 					pf_change_a(&daddr->v4.s_addr,
 					    pd->ip_sum,
 					    nk->addr[pd->didx].v4.s_addr, 0);
 
 				/* Rewrite the ICMP id and patch the checksum. */
 				if (nk->port[0] !=
 				    pd->hdr.icmp->icmp_id) {
 					pd->hdr.icmp->icmp_cksum =
 					    pf_cksum_fixup(
 					    pd->hdr.icmp->icmp_cksum, icmpid,
 					    nk->port[pd->sidx], 0);
 					pd->hdr.icmp->icmp_id =
 					    nk->port[pd->sidx];
 				}
 
 				m_copyback(m, off, ICMP_MINLEN,
 				    (caddr_t )pd->hdr.icmp);
 				break;
 #endif /* INET */
 #ifdef INET6
 			case AF_INET6:
 				if (PF_ANEQ(pd->src,
 				    &nk->addr[pd->sidx], AF_INET6))
 					pf_change_a6(saddr,
 					    &pd->hdr.icmp6->icmp6_cksum,
 					    &nk->addr[pd->sidx], 0);
 
 				if (PF_ANEQ(pd->dst,
 				    &nk->addr[pd->didx], AF_INET6))
 					pf_change_a6(daddr,
 					    &pd->hdr.icmp6->icmp6_cksum,
 					    &nk->addr[pd->didx], 0);
 
 				m_copyback(m, off, sizeof(struct icmp6_hdr),
 				    (caddr_t )pd->hdr.icmp6);
 				break;
 #endif /* INET6 */
 			}
 		}
 		return (PF_PASS);
 
 	} else {
 		/*
 		 * ICMP error message in response to a TCP/UDP packet.
 		 * Extract the inner TCP/UDP header and search for that state.
 		 */
 
 		struct pf_pdesc	pd2;
 		bzero(&pd2, sizeof pd2);
 #ifdef INET
 		struct ip	h2;
 #endif /* INET */
 #ifdef INET6
 		struct ip6_hdr	h2_6;
 		int		terminal = 0;
 #endif /* INET6 */
 		int		ipoff2 = 0;
 		int		off2 = 0;
 
 		pd2.af = pd->af;
 		/* Payload packet is from the opposite direction. */
 		pd2.sidx = (direction == PF_IN) ? 1 : 0;
 		pd2.didx = (direction == PF_IN) ? 0 : 1;
 		switch (pd->af) {
 #ifdef INET
 		case AF_INET:
 			/* offset of h2 in mbuf chain */
 			ipoff2 = off + ICMP_MINLEN;
 
 			if (!pf_pull_hdr(m, ipoff2, &h2, sizeof(h2),
 			    NULL, reason, pd2.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: ICMP error message too short "
 				    "(ip)\n"));
 				return (PF_DROP);
 			}
 			/*
 			 * ICMP error messages don't refer to non-first
 			 * fragments
 			 */
 			if (h2.ip_off & htons(IP_OFFMASK)) {
 				REASON_SET(reason, PFRES_FRAG);
 				return (PF_DROP);
 			}
 
 			/* offset of protocol header that follows h2 */
 			off2 = ipoff2 + (h2.ip_hl << 2);
 
 			pd2.proto = h2.ip_p;
 			pd2.src = (struct pf_addr *)&h2.ip_src;
 			pd2.dst = (struct pf_addr *)&h2.ip_dst;
 			pd2.ip_sum = &h2.ip_sum;
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 			ipoff2 = off + sizeof(struct icmp6_hdr);
 
 			if (!pf_pull_hdr(m, ipoff2, &h2_6, sizeof(h2_6),
 			    NULL, reason, pd2.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: ICMP error message too short "
 				    "(ip6)\n"));
 				return (PF_DROP);
 			}
 			pd2.proto = h2_6.ip6_nxt;
 			pd2.src = (struct pf_addr *)&h2_6.ip6_src;
 			pd2.dst = (struct pf_addr *)&h2_6.ip6_dst;
 			pd2.ip_sum = NULL;
 			off2 = ipoff2 + sizeof(h2_6);
 			/*
 			 * Walk the quoted packet's extension headers until a
 			 * protocol that is not an extension header is found.
 			 */
 			do {
 				switch (pd2.proto) {
 				case IPPROTO_FRAGMENT:
 					/*
 					 * ICMPv6 error messages for
 					 * non-first fragments
 					 */
 					REASON_SET(reason, PFRES_FRAG);
 					return (PF_DROP);
 				case IPPROTO_AH:
 				case IPPROTO_HOPOPTS:
 				case IPPROTO_ROUTING:
 				case IPPROTO_DSTOPTS: {
 					/* get next header and header length */
 					struct ip6_ext opt6;
 
 					if (!pf_pull_hdr(m, off2, &opt6,
 					    sizeof(opt6), NULL, reason,
 					    pd2.af)) {
 						DPFPRINTF(PF_DEBUG_MISC,
 						    ("pf: ICMPv6 short opt\n"));
 						return (PF_DROP);
 					}
 					if (pd2.proto == IPPROTO_AH)
 						off2 += (opt6.ip6e_len + 2) * 4;
 					else
 						off2 += (opt6.ip6e_len + 1) * 8;
 					pd2.proto = opt6.ip6e_nxt;
 					/* goto the next header */
 					break;
 				}
 				default:
 					terminal++;
 					break;
 				}
 			} while (!terminal);
 			break;
 #endif /* INET6 */
 		}
 
 		/*
 		 * The error must be addressed to the sender of the quoted
 		 * packet; otherwise it is dropped as bogus.
 		 */
 		if (PF_ANEQ(pd->dst, pd2.src, pd->af)) {
 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
 				printf("pf: BAD ICMP %d:%d outer dst: ",
 				    icmptype, icmpcode);
 				pf_print_host(pd->src, 0, pd->af);
 				printf(" -> ");
 				pf_print_host(pd->dst, 0, pd->af);
 				printf(" inner src: ");
 				pf_print_host(pd2.src, 0, pd2.af);
 				printf(" -> ");
 				pf_print_host(pd2.dst, 0, pd2.af);
 				printf("\n");
 			}
 			REASON_SET(reason, PFRES_BADSTATE);
 			return (PF_DROP);
 		}
 
 		switch (pd2.proto) {
 		case IPPROTO_TCP: {
 			struct tcphdr		 th;
 			u_int32_t		 seq;
 			struct pf_state_peer	*src, *dst;
 			u_int8_t		 dws;
 			int			 copyback = 0;
 
 			/*
 			 * Only the first 8 bytes of the TCP header can be
 			 * expected. Don't access any TCP header fields after
 			 * th_seq, an ackskew test is not possible.
 			 */
 			if (!pf_pull_hdr(m, off2, &th, 8, NULL, reason,
 			    pd2.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: ICMP error message too short "
 				    "(tcp)\n"));
 				return (PF_DROP);
 			}
 
 			key.af = pd2.af;
 			key.proto = IPPROTO_TCP;
 			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
 			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
 			key.port[pd2.sidx] = th.th_sport;
 			key.port[pd2.didx] = th.th_dport;
 
 			STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 			/*
 			 * The quoted packet travelled opposite to the ICMP
 			 * error, so the peers are swapped relative to the
 			 * usual direction test.
 			 */
 			if (direction == (*state)->direction) {
 				src = &(*state)->dst;
 				dst = &(*state)->src;
 			} else {
 				src = &(*state)->src;
 				dst = &(*state)->dst;
 			}
 
 			if (src->wscale && dst->wscale)
 				dws = dst->wscale & PF_WSCALE_MASK;
 			else
 				dws = 0;
 
 			/* Demodulate sequence number */
 			seq = ntohl(th.th_seq) - src->seqdiff;
 			if (src->seqdiff) {
 				pf_change_a(&th.th_seq, icmpsum,
 				    htonl(seq), 0);
 				copyback = 1;
 			}
 
 			/*
 			 * Unless the state is sloppy, the quoted sequence
 			 * number must fall inside the window tracked for the
 			 * sender.
 			 */
 			if (!((*state)->state_flags & PFSTATE_SLOPPY) &&
 			    (!SEQ_GEQ(src->seqhi, seq) ||
 			    !SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)))) {
 				if (V_pf_status.debug >= PF_DEBUG_MISC) {
 					printf("pf: BAD ICMP %d:%d ",
 					    icmptype, icmpcode);
 					pf_print_host(pd->src, 0, pd->af);
 					printf(" -> ");
 					pf_print_host(pd->dst, 0, pd->af);
 					printf(" state: ");
 					pf_print_state(*state);
 					printf(" seq=%u\n", seq);
 				}
 				REASON_SET(reason, PFRES_BADSTATE);
 				return (PF_DROP);
 			} else {
 				if (V_pf_status.debug >= PF_DEBUG_MISC) {
 					printf("pf: OK ICMP %d:%d ",
 					    icmptype, icmpcode);
 					pf_print_host(pd->src, 0, pd->af);
 					printf(" -> ");
 					pf_print_host(pd->dst, 0, pd->af);
 					printf(" state: ");
 					pf_print_state(*state);
 					printf(" seq=%u\n", seq);
 				}
 			}
 
 			/* translate source/destination address, if necessary */
 			if ((*state)->key[PF_SK_WIRE] !=
 			    (*state)->key[PF_SK_STACK]) {
 				struct pf_state_key *nk =
 				    (*state)->key[pd->didx];
 
 				if (PF_ANEQ(pd2.src,
 				    &nk->addr[pd2.sidx], pd2.af) ||
 				    nk->port[pd2.sidx] != th.th_sport)
 					pf_change_icmp(pd2.src, &th.th_sport,
 					    daddr, &nk->addr[pd2.sidx],
 					    nk->port[pd2.sidx], NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, pd2.af);
 
 				if (PF_ANEQ(pd2.dst,
 				    &nk->addr[pd2.didx], pd2.af) ||
 				    nk->port[pd2.didx] != th.th_dport)
 					pf_change_icmp(pd2.dst, &th.th_dport,
 					    saddr, &nk->addr[pd2.didx],
 					    nk->port[pd2.didx], NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, pd2.af);
 				copyback = 1;
 			}
 
 			if (copyback) {
 				switch (pd2.af) {
 #ifdef INET
 				case AF_INET:
 					m_copyback(m, off, ICMP_MINLEN,
 					    (caddr_t )pd->hdr.icmp);
 					m_copyback(m, ipoff2, sizeof(h2),
 					    (caddr_t )&h2);
 					break;
 #endif /* INET */
 #ifdef INET6
 				case AF_INET6:
 					m_copyback(m, off,
 					    sizeof(struct icmp6_hdr),
 					    (caddr_t )pd->hdr.icmp6);
 					m_copyback(m, ipoff2, sizeof(h2_6),
 					    (caddr_t )&h2_6);
 					break;
 #endif /* INET6 */
 				}
 				m_copyback(m, off2, 8, (caddr_t)&th);
 			}
 
 			return (PF_PASS);
 			break;
 		}
 		case IPPROTO_UDP: {
 			struct udphdr		uh;
 
 			if (!pf_pull_hdr(m, off2, &uh, sizeof(uh),
 			    NULL, reason, pd2.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: ICMP error message too short "
 				    "(udp)\n"));
 				return (PF_DROP);
 			}
 
 			key.af = pd2.af;
 			key.proto = IPPROTO_UDP;
 			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
 			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
 			key.port[pd2.sidx] = uh.uh_sport;
 			key.port[pd2.didx] = uh.uh_dport;
 
 			STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 			/* translate source/destination address, if necessary */
 			if ((*state)->key[PF_SK_WIRE] !=
 			    (*state)->key[PF_SK_STACK]) {
 				struct pf_state_key *nk =
 				    (*state)->key[pd->didx];
 
 				if (PF_ANEQ(pd2.src,
 				    &nk->addr[pd2.sidx], pd2.af) ||
 				    nk->port[pd2.sidx] != uh.uh_sport)
 					pf_change_icmp(pd2.src, &uh.uh_sport,
 					    daddr, &nk->addr[pd2.sidx],
 					    nk->port[pd2.sidx], &uh.uh_sum,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 1, pd2.af);
 
 				if (PF_ANEQ(pd2.dst,
 				    &nk->addr[pd2.didx], pd2.af) ||
 				    nk->port[pd2.didx] != uh.uh_dport)
 					pf_change_icmp(pd2.dst, &uh.uh_dport,
 					    saddr, &nk->addr[pd2.didx],
 					    nk->port[pd2.didx], &uh.uh_sum,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 1, pd2.af);
 
 				switch (pd2.af) {
 #ifdef INET
 				case AF_INET:
 					m_copyback(m, off, ICMP_MINLEN,
 					    (caddr_t )pd->hdr.icmp);
 					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
 					break;
 #endif /* INET */
 #ifdef INET6
 				case AF_INET6:
 					m_copyback(m, off,
 					    sizeof(struct icmp6_hdr),
 					    (caddr_t )pd->hdr.icmp6);
 					m_copyback(m, ipoff2, sizeof(h2_6),
 					    (caddr_t )&h2_6);
 					break;
 #endif /* INET6 */
 				}
 				m_copyback(m, off2, sizeof(uh), (caddr_t)&uh);
 			}
 			return (PF_PASS);
 			break;
 		}
 #ifdef INET
 		case IPPROTO_ICMP: {
 			struct icmp		iih;
 
 			if (!pf_pull_hdr(m, off2, &iih, ICMP_MINLEN,
 			    NULL, reason, pd2.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: ICMP error message too short "
 				    "(icmp)\n"));
 				return (PF_DROP);
 			}
 
 			key.af = pd2.af;
 			key.proto = IPPROTO_ICMP;
 			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
 			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
 			key.port[0] = key.port[1] = iih.icmp_id;
 
 			STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 			/* translate source/destination address, if necessary */
 			if ((*state)->key[PF_SK_WIRE] !=
 			    (*state)->key[PF_SK_STACK]) {
 				struct pf_state_key *nk =
 				    (*state)->key[pd->didx];
 
 				if (PF_ANEQ(pd2.src,
 				    &nk->addr[pd2.sidx], pd2.af) ||
 				    nk->port[pd2.sidx] != iih.icmp_id)
 					pf_change_icmp(pd2.src, &iih.icmp_id,
 					    daddr, &nk->addr[pd2.sidx],
 					    nk->port[pd2.sidx], NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, AF_INET);
 
 				if (PF_ANEQ(pd2.dst,
 				    &nk->addr[pd2.didx], pd2.af) ||
 				    nk->port[pd2.didx] != iih.icmp_id)
 					pf_change_icmp(pd2.dst, &iih.icmp_id,
 					    saddr, &nk->addr[pd2.didx],
 					    nk->port[pd2.didx], NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, AF_INET);
 
 				m_copyback(m, off, ICMP_MINLEN, (caddr_t)pd->hdr.icmp);
 				m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
 				m_copyback(m, off2, ICMP_MINLEN, (caddr_t)&iih);
 			}
 			return (PF_PASS);
 			break;
 		}
 #endif /* INET */
 #ifdef INET6
 		case IPPROTO_ICMPV6: {
 			struct icmp6_hdr	iih;
 
 			if (!pf_pull_hdr(m, off2, &iih,
 			    sizeof(struct icmp6_hdr), NULL, reason, pd2.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: ICMP error message too short "
 				    "(icmp6)\n"));
 				return (PF_DROP);
 			}
 
 			key.af = pd2.af;
 			key.proto = IPPROTO_ICMPV6;
 			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
 			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
 			key.port[0] = key.port[1] = iih.icmp6_id;
 
 			STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 			/* translate source/destination address, if necessary */
 			if ((*state)->key[PF_SK_WIRE] !=
 			    (*state)->key[PF_SK_STACK]) {
 				struct pf_state_key *nk =
 				    (*state)->key[pd->didx];
 
 				if (PF_ANEQ(pd2.src,
 				    &nk->addr[pd2.sidx], pd2.af) ||
 				    nk->port[pd2.sidx] != iih.icmp6_id)
 					pf_change_icmp(pd2.src, &iih.icmp6_id,
 					    daddr, &nk->addr[pd2.sidx],
 					    nk->port[pd2.sidx], NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, AF_INET6);
 
 				if (PF_ANEQ(pd2.dst,
 				    &nk->addr[pd2.didx], pd2.af) ||
 				    nk->port[pd2.didx] != iih.icmp6_id)
 					pf_change_icmp(pd2.dst, &iih.icmp6_id,
 					    saddr, &nk->addr[pd2.didx],
 					    nk->port[pd2.didx], NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, AF_INET6);
 
 				m_copyback(m, off, sizeof(struct icmp6_hdr),
 				    (caddr_t)pd->hdr.icmp6);
 				m_copyback(m, ipoff2, sizeof(h2_6), (caddr_t)&h2_6);
 				m_copyback(m, off2, sizeof(struct icmp6_hdr),
 				    (caddr_t)&iih);
 			}
 			return (PF_PASS);
 			break;
 		}
 #endif /* INET6 */
 		default: {
 			/* Any other quoted protocol: key on addresses only. */
 			key.af = pd2.af;
 			key.proto = pd2.proto;
 			PF_ACPY(&key.addr[pd2.sidx], pd2.src, key.af);
 			PF_ACPY(&key.addr[pd2.didx], pd2.dst, key.af);
 			key.port[0] = key.port[1] = 0;
 
 			STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 			/* translate source/destination address, if necessary */
 			if ((*state)->key[PF_SK_WIRE] !=
 			    (*state)->key[PF_SK_STACK]) {
 				struct pf_state_key *nk =
 				    (*state)->key[pd->didx];
 
 				if (PF_ANEQ(pd2.src,
 				    &nk->addr[pd2.sidx], pd2.af))
 					pf_change_icmp(pd2.src, NULL, daddr,
 					    &nk->addr[pd2.sidx], 0, NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, pd2.af);
 
 				if (PF_ANEQ(pd2.dst,
 				    &nk->addr[pd2.didx], pd2.af))
 					pf_change_icmp(pd2.dst, NULL, saddr,
 					    &nk->addr[pd2.didx], 0, NULL,
 					    pd2.ip_sum, icmpsum,
 					    pd->ip_sum, 0, pd2.af);
 
 				switch (pd2.af) {
 #ifdef INET
 				case AF_INET:
 					m_copyback(m, off, ICMP_MINLEN,
 					    (caddr_t)pd->hdr.icmp);
 					m_copyback(m, ipoff2, sizeof(h2), (caddr_t)&h2);
 					break;
 #endif /* INET */
 #ifdef INET6
 				case AF_INET6:
 					m_copyback(m, off,
 					    sizeof(struct icmp6_hdr),
 					    (caddr_t )pd->hdr.icmp6);
 					m_copyback(m, ipoff2, sizeof(h2_6),
 					    (caddr_t )&h2_6);
 					break;
 #endif /* INET6 */
 				}
 			}
 			return (PF_PASS);
 			break;
 		}
 		}
 	}
 }
 
 static int
 pf_test_state_other(struct pf_state **state, int direction, struct pfi_kif *kif,
     struct mbuf *m, struct pf_pdesc *pd)
 {
 	struct pf_state_peer	*src, *dst;
 	struct pf_state_key_cmp	 key;
 
 	bzero(&key, sizeof(key));
 	key.af = pd->af;
 	key.proto = pd->proto;
 	if (direction == PF_IN)	{
 		PF_ACPY(&key.addr[0], pd->src, key.af);
 		PF_ACPY(&key.addr[1], pd->dst, key.af);
 		key.port[0] = key.port[1] = 0;
 	} else {
 		PF_ACPY(&key.addr[1], pd->src, key.af);
 		PF_ACPY(&key.addr[0], pd->dst, key.af);
 		key.port[1] = key.port[0] = 0;
 	}
 
 	STATE_LOOKUP(kif, &key, direction, *state, pd);
 
 	if (direction == (*state)->direction) {
 		src = &(*state)->src;
 		dst = &(*state)->dst;
 	} else {
 		src = &(*state)->dst;
 		dst = &(*state)->src;
 	}
 
 	/* update states */
 	if (src->state < PFOTHERS_SINGLE)
 		src->state = PFOTHERS_SINGLE;
 	if (dst->state == PFOTHERS_SINGLE)
 		dst->state = PFOTHERS_MULTIPLE;
 
 	/* update expire time */
 	(*state)->expire = time_uptime;
 	if (src->state == PFOTHERS_MULTIPLE && dst->state == PFOTHERS_MULTIPLE)
 		(*state)->timeout = PFTM_OTHER_MULTIPLE;
 	else
 		(*state)->timeout = PFTM_OTHER_SINGLE;
 
 	/* translate source/destination address, if necessary */
 	if ((*state)->key[PF_SK_WIRE] != (*state)->key[PF_SK_STACK]) {
 		struct pf_state_key *nk = (*state)->key[pd->didx];
 
 		KASSERT(nk, ("%s: nk is null", __func__));
 		KASSERT(pd, ("%s: pd is null", __func__));
 		KASSERT(pd->src, ("%s: pd->src is null", __func__));
 		KASSERT(pd->dst, ("%s: pd->dst is null", __func__));
 		switch (pd->af) {
 #ifdef INET
 		case AF_INET:
 			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
 				pf_change_a(&pd->src->v4.s_addr,
 				    pd->ip_sum,
 				    nk->addr[pd->sidx].v4.s_addr,
 				    0);
 
 
 			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
 				pf_change_a(&pd->dst->v4.s_addr,
 				    pd->ip_sum,
 				    nk->addr[pd->didx].v4.s_addr,
 				    0);
 
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 			if (PF_ANEQ(pd->src, &nk->addr[pd->sidx], AF_INET))
 				PF_ACPY(pd->src, &nk->addr[pd->sidx], pd->af);
 
 			if (PF_ANEQ(pd->dst, &nk->addr[pd->didx], AF_INET))
 				PF_ACPY(pd->dst, &nk->addr[pd->didx], pd->af);
 #endif /* INET6 */
 		}
 	}
 	return (PF_PASS);
 }
 
 /*
  * Copy "len" bytes located at offset "off" of the mbuf chain "m" into the
  * buffer "p".  "off" is measured from the start of the mbuf chain, whose
  * first bytes are read as the IPv4 or IPv6 header selected by "af".
  *
  * Before copying, the request is validated against the packet:
  *  - AF_INET: a packet with a non-zero fragment offset is never copied;
  *    the action is PF_PASS when the fragment offset is at least "len" and
  *    PF_DROP (reason PFRES_FRAG) otherwise.
  *  - AF_INET and AF_INET6: the requested range must fit both in the mbuf
  *    packet length and in the length announced by the IP header, else the
  *    action is PF_DROP with reason PFRES_SHORT.
  * For any other "af" the data is copied without these checks.
  *
  * "actionp" and "reasonp" receive the verdict through ACTION_SET and
  * REASON_SET.  Returns "p" on success and NULL when nothing was copied.
  */
 void *
 pf_pull_hdr(struct mbuf *m, int off, void *p, int len,
     u_short *actionp, u_short *reasonp, sa_family_t af)
 {
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		struct ip	*ip4 = mtod(m, struct ip *);
 		u_int16_t	 fragoff;
 		int		 end;
 
 		fragoff = (ntohs(ip4->ip_off) & IP_OFFMASK) << 3;
 		if (fragoff != 0) {
 			if (fragoff < len) {
 				ACTION_SET(actionp, PF_DROP);
 				REASON_SET(reasonp, PFRES_FRAG);
 			} else {
 				ACTION_SET(actionp, PF_PASS);
 			}
 			return (NULL);
 		}
 
 		end = off + len;
 		if (m->m_pkthdr.len < end || ntohs(ip4->ip_len) < end) {
 			ACTION_SET(actionp, PF_DROP);
 			REASON_SET(reasonp, PFRES_SHORT);
 			return (NULL);
 		}
 		break;
 	}
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6: {
 		struct ip6_hdr	*ip6 = mtod(m, struct ip6_hdr *);
 		int		 end = off + len;
 
 		if (m->m_pkthdr.len < end ||
 		    (ntohs(ip6->ip6_plen) + sizeof(struct ip6_hdr)) <
 		    (unsigned)end) {
 			ACTION_SET(actionp, PF_DROP);
 			REASON_SET(reasonp, PFRES_SHORT);
 			return (NULL);
 		}
 		break;
 	}
 #endif /* INET6 */
 	}
 
 	/* All checks passed (or none apply): hand the bytes to the caller. */
 	m_copydata(m, off, len, p);
 	return (p);
 }
 
 #ifdef RADIX_MPATH
 /*
  * Routability check used by pf_routable() when the routing table reports
  * multipath capability.
  *
  * addr     - address to look up.
  * af       - AF_INET or AF_INET6; any other family yields 0.
  * kif      - input interface for the uRPF check, or NULL for a plain
  *            "is there a route" check.
  * rtableid - routing table passed to the route allocation call.
  *
  * Returns 1 when the address is considered routable (and, if kif is given,
  * reachable through kif's interface), 0 otherwise.  Any route obtained here
  * is released before returning.
  */
 static int
 pf_routable_oldmpath(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
     int rtableid)
 {
 	struct radix_node_head	*rnh;
 	struct sockaddr_in	*dst;
 	int			 ret = 1;
 	int			 check_mpath;
 #ifdef INET6
 	struct sockaddr_in6	*dst6;
 	struct route_in6	 ro;
 #else
 	struct route		 ro;
 #endif
 	struct radix_node	*rn;
 	struct rtentry		*rt;
 	struct ifnet		*ifp;
 
 	check_mpath = 0;
 	/* XXX: stick to table 0 for now */
 	rnh = rt_tables_get_rnh(0, af);
 	if (rnh != NULL && rn_mpath_capable(rnh))
 		check_mpath = 1;
 	/* Zeroing ro also makes ro.ro_rt NULL for the early "goto out" paths. */
 	bzero(&ro, sizeof(ro));
 	switch (af) {
 	case AF_INET:
 		dst = satosin(&ro.ro_dst);
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = addr->v4;
 		break;
 #ifdef INET6
 	case AF_INET6:
 		/*
 		 * Skip check for addresses with embedded interface scope,
 		 * as they would always match anyway.
 		 */
 		if (IN6_IS_SCOPE_EMBED(&addr->v6))
 			goto out;
 		dst6 = (struct sockaddr_in6 *)&ro.ro_dst;
 		dst6->sin6_family = AF_INET6;
 		dst6->sin6_len = sizeof(*dst6);
 		dst6->sin6_addr = addr->v6;
 		break;
 #endif /* INET6 */
 	default:
 		return (0);
 	}
 
 	/*
 	 * Skip checks for ipsec interfaces.  kif->pfik_ifp may be NULL (it is
 	 * tested for that further down), so guard the dereference here too.
 	 */
 	if (kif != NULL && kif->pfik_ifp != NULL &&
 	    kif->pfik_ifp->if_type == IFT_ENC)
 		goto out;
 
 	switch (af) {
 #ifdef INET6
 	case AF_INET6:
 		in6_rtalloc_ign(&ro, 0, rtableid);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		in_rtalloc_ign((struct route *)&ro, 0, rtableid);
 		break;
 #endif
 	}
 
 	if (ro.ro_rt != NULL) {
 		/* No interface given, this is a no-route check */
 		if (kif == NULL)
 			goto out;
 
 		if (kif->pfik_ifp == NULL) {
 			ret = 0;
 			goto out;
 		}
 
 		/* Perform uRPF check if passed input interface */
 		ret = 0;
 		rn = (struct radix_node *)ro.ro_rt;
 		/*
 		 * Look for a route entry whose interface is the input
 		 * interface; additional paths are only walked when the
 		 * table is multipath capable.
 		 */
 		do {
 			rt = (struct rtentry *)rn;
 			ifp = rt->rt_ifp;
 
 			if (kif->pfik_ifp == ifp)
 				ret = 1;
 			rn = rn_mpath_next(rn);
 		} while (check_mpath == 1 && rn != NULL && ret == 0);
 	} else
 		ret = 0;
 out:
 	if (ro.ro_rt != NULL)
 		RTFREE(ro.ro_rt);
 	return (ret);
 }
 #endif
 
 /*
  * Check whether "addr" is routable in routing table "rtableid".
  *
  * addr     - address to look up.
  * af       - AF_INET or AF_INET6; any other family yields 0.
  * kif      - input interface for the uRPF check, or NULL for a plain
  *            "is there a route" check.
  * rtableid - FIB used for the next-hop lookup.
  *
  * Returns 1 when a next hop exists and, if kif is given, that next hop
  * leaves through kif's interface; 0 otherwise.  IPv6 addresses with an
  * embedded interface scope and packets on IFT_ENC interfaces are always
  * reported as routable.  With RADIX_MPATH and a multipath capable table the
  * decision is delegated to pf_routable_oldmpath().
  */
 int
 pf_routable(struct pf_addr *addr, sa_family_t af, struct pfi_kif *kif,
     int rtableid)
 {
 #ifdef INET
 	struct nhop4_basic	nh4;
 #endif
 #ifdef INET6
 	struct nhop6_basic	nh6;
 #endif
 	struct ifnet		*ifp;
 #ifdef RADIX_MPATH
 	struct radix_node_head	*rnh;
 
 	/* XXX: stick to table 0 for now */
 	rnh = rt_tables_get_rnh(0, af);
 	if (rnh != NULL && rn_mpath_capable(rnh))
 		return (pf_routable_oldmpath(addr, af, kif, rtableid));
 #endif
 	/*
 	 * Skip check for addresses with embedded interface scope,
 	 * as they would always match anyway.
 	 */
 	if (af == AF_INET6 && IN6_IS_SCOPE_EMBED(&addr->v6))
 		return (1);
 
 	if (af != AF_INET && af != AF_INET6)
 		return (0);
 
 	/*
 	 * Skip checks for ipsec interfaces.  kif->pfik_ifp may be NULL (it is
 	 * tested for that further down), so guard the dereference here too.
 	 */
 	if (kif != NULL && kif->pfik_ifp != NULL &&
 	    kif->pfik_ifp->if_type == IFT_ENC)
 		return (1);
 
 	ifp = NULL;
 
 	/* Resolve the next hop; a failed lookup means "not routable". */
 	switch (af) {
 #ifdef INET6
 	case AF_INET6:
 		if (fib6_lookup_nh_basic(rtableid, &addr->v6, 0, 0, 0, &nh6)!=0)
 			return (0);
 		ifp = nh6.nh_ifp;
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		if (fib4_lookup_nh_basic(rtableid, addr->v4, 0, 0, &nh4) != 0)
 			return (0);
 		ifp = nh4.nh_ifp;
 		break;
 #endif
 	}
 
 	/* No interface given, this is a no-route check */
 	if (kif == NULL)
 		return (1);
 
 	if (kif->pfik_ifp == NULL)
 		return (0);
 
 	/* Perform uRPF check if passed input interface */
 	if (kif->pfik_ifp == ifp)
 		return (1);
 	return (0);
 }
 
 #ifdef INET
 /*
  * Send an IPv4 packet out of the interface selected by rule "r" (or by the
  * state "s", when one is given) instead of letting the stack route it.
  *
  * m    - in/out: packet.  For PF_DUPTO rules a duplicate is sent and *m is
  *        left to the caller; otherwise the packet itself is consumed and *m
  *        is set to NULL.  The two early returns (duplicate allocation
  *        failure, direction not applicable) leave *m untouched.
  * r    - rule carrying the routing option (r->rt) and the address pool.
  * dir  - PF_IN or PF_OUT, direction the packet is being filtered in.
  * oifp - interface the packet is currently being processed on.
  * s    - state, or NULL.  If non-NULL it is unlocked with PF_STATE_UNLOCK
  *        on every path through this function.
  * pd   - packet description; pd->pf_mtag is looked up or allocated here and
  *        its "routed" counter limits how often a packet may be re-routed.
  * inp  - passed through to pf_test() when the output interface changes.
  *
  * Packets larger than the interface MTU are fragmented unless DF is set or
  * TSO was requested, in which case an ICMP "need fragmentation" error is
  * generated (not for PF_DUPTO copies, which are simply freed).
  */
 static void
 pf_route(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
     struct pf_state *s, struct pf_pdesc *pd, struct inpcb *inp)
 {
 	struct mbuf		*m0, *m1;
 	struct sockaddr_in	dst;
 	struct ip		*ip;
 	struct ifnet		*ifp = NULL;
 	struct pf_addr		 naddr;
 	struct pf_src_node	*sn = NULL;
 	int			 error = 0;
 	uint16_t		 ip_len, ip_off;
 
 	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
 	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
 	    __func__));
 
 	/* Drop packets without a tag or that were re-routed too often. */
 	if ((pd->pf_mtag == NULL &&
 	    ((pd->pf_mtag = pf_get_mtag(*m)) == NULL)) ||
 	    pd->pf_mtag->routed++ > 3) {
 		m0 = *m;
 		*m = NULL;
 		goto bad_locked;
 	}
 
 	if (r->rt == PF_DUPTO) {
 		if ((m0 = m_dup(*m, M_NOWAIT)) == NULL) {
 			if (s)
 				PF_STATE_UNLOCK(s);
 			return;
 		}
 	} else {
 		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
 			if (s)
 				PF_STATE_UNLOCK(s);
 			return;
 		}
 		m0 = *m;
 	}
 
 	ip = mtod(m0, struct ip *);
 
 	bzero(&dst, sizeof(dst));
 	dst.sin_family = AF_INET;
 	dst.sin_len = sizeof(dst);
 	dst.sin_addr = ip->ip_dst;
 
 	bzero(&naddr, sizeof(naddr));
 
 	if (TAILQ_EMPTY(&r->rpool.list)) {
 		DPFPRINTF(PF_DEBUG_URGENT,
 		    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
 		goto bad_locked;
 	}
 	/*
 	 * Pick next hop and output interface: from the rule's pool when there
 	 * is no state, otherwise from what the state recorded.
 	 */
 	if (s == NULL) {
 		pf_map_addr(AF_INET, r, (struct pf_addr *)&ip->ip_src,
 		    &naddr, NULL, &sn);
 		if (!PF_AZERO(&naddr, AF_INET))
 			dst.sin_addr.s_addr = naddr.v4.s_addr;
 		ifp = r->rpool.cur->kif ?
 		    r->rpool.cur->kif->pfik_ifp : NULL;
 	} else {
 		if (!PF_AZERO(&s->rt_addr, AF_INET))
 			dst.sin_addr.s_addr =
 			    s->rt_addr.v4.s_addr;
 		ifp = s->rt_kif ? s->rt_kif->pfik_ifp : NULL;
 		PF_STATE_UNLOCK(s);
 	}
 	if (ifp == NULL)
 		goto bad;
 
 	/* Leaving through a different interface: filter outbound there. */
 	if (oifp != ifp) {
 		if (pf_test(PF_OUT, 0, ifp, &m0, inp) != PF_PASS)
 			goto bad;
 		else if (m0 == NULL)
 			goto done;
 		if (m0->m_len < sizeof(struct ip)) {
 			DPFPRINTF(PF_DEBUG_URGENT,
 			    ("%s: m0->m_len < sizeof(struct ip)\n", __func__));
 			goto bad;
 		}
 		ip = mtod(m0, struct ip *);
 	}
 
 	if (ifp->if_flags & IFF_LOOPBACK)
 		m0->m_flags |= M_SKIP_FIREWALL;
 
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	/* Copied from FreeBSD 10.0-CURRENT ip_output. */
 	m0->m_pkthdr.csum_flags |= CSUM_IP;
 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
 		in_delayed_cksum(m0);
 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #ifdef SCTP
 	if (m0->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
 		/* Checksum the packet being sent (m0), not the caller's m. */
 		sctp_delayed_cksum(m0, (uint32_t)(ip->ip_hl << 2));
 		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 
 	/*
 	 * If small enough for interface, or the interface will take
 	 * care of the fragmentation for us, we can just send directly.
 	 */
 	if (ip_len <= ifp->if_mtu ||
 	    (m0->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
 		ip->ip_sum = 0;
 		if (m0->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
 			ip->ip_sum = in_cksum(m0, ip->ip_hl << 2);
 			m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 		m_clrprotoflags(m0);	/* Avoid confusing lower layers. */
 		error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
 		goto done;
 	}
 
 	/* Balk when DF bit is set or the interface didn't support TSO. */
 	if ((ip_off & IP_DF) || (m0->m_pkthdr.csum_flags & CSUM_TSO)) {
 		error = EMSGSIZE;
 		KMOD_IPSTAT_INC(ips_cantfrag);
 		if (r->rt != PF_DUPTO) {
 			icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
 			    ifp->if_mtu);
 			goto done;
 		} else
 			goto bad;
 	}
 
 	error = ip_fragment(ip, &m0, ifp->if_mtu, ifp->if_hwassist);
 	if (error)
 		goto bad;
 
 	/* Send the fragments; after the first failure free the rest. */
 	for (; m0; m0 = m1) {
 		m1 = m0->m_nextpkt;
 		m0->m_nextpkt = NULL;
 		if (error == 0) {
 			m_clrprotoflags(m0);
 			error = (*ifp->if_output)(ifp, m0, sintosa(&dst), NULL);
 		} else
 			m_freem(m0);
 	}
 
 	if (error == 0)
 		KMOD_IPSTAT_INC(ips_fragmented);
 
 done:
 	if (r->rt != PF_DUPTO)
 		*m = NULL;
 	return;
 
 bad_locked:
 	if (s)
 		PF_STATE_UNLOCK(s);
 bad:
 	m_freem(m0);
 	goto done;
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * Send an IPv6 packet out of the interface selected by rule "r" (or by the
  * state "s", when one is given) instead of letting the stack route it.
  *
  * m    - in/out: packet.  For PF_DUPTO rules a duplicate is sent and *m is
  *        left to the caller; otherwise the packet itself is consumed and *m
  *        is set to NULL.  The two early returns (duplicate allocation
  *        failure, direction not applicable) leave *m untouched.
  * r    - rule carrying the routing option (r->rt) and the address pool.
  * dir  - PF_IN or PF_OUT, direction the packet is being filtered in.
  * oifp - interface the packet is currently being processed on.
  * s    - state, or NULL.  If non-NULL it is unlocked with PF_STATE_UNLOCK
  *        on every path through this function.
  * pd   - packet description; pd->pf_mtag is looked up or allocated here and
  *        its "routed" counter limits how often a packet may be re-routed.
  * inp  - passed through to pf_test6() when the output interface changes.
  */
 static void
 pf_route6(struct mbuf **m, struct pf_rule *r, int dir, struct ifnet *oifp,
     struct pf_state *s, struct pf_pdesc *pd, struct inpcb *inp)
 {
 	struct pf_src_node	*sn = NULL;
 	struct ifnet		*ifp = NULL;
 	struct ip6_hdr		*ip6;
 	struct sockaddr_in6	 dst;
 	struct pf_addr		 naddr;
 	struct mbuf		*m0;
 
 	KASSERT(m && *m && r && oifp, ("%s: invalid parameters", __func__));
 	KASSERT(dir == PF_IN || dir == PF_OUT, ("%s: invalid direction",
 	    __func__));
 
 	/* Make sure the packet carries a pf tag, then bound the re-routing. */
 	if (pd->pf_mtag == NULL)
 		pd->pf_mtag = pf_get_mtag(*m);
 	if (pd->pf_mtag == NULL || pd->pf_mtag->routed++ > 3) {
 		m0 = *m;
 		*m = NULL;
 		goto bad_locked;
 	}
 
 	if (r->rt != PF_DUPTO) {
 		/* Nothing to do when the rule does not apply this way. */
 		if ((r->rt == PF_REPLYTO) == (r->direction == dir)) {
 			if (s)
 				PF_STATE_UNLOCK(s);
 			return;
 		}
 		m0 = *m;
 	} else {
 		/* dup-to works on a private copy of the packet. */
 		m0 = m_dup(*m, M_NOWAIT);
 		if (m0 == NULL) {
 			if (s)
 				PF_STATE_UNLOCK(s);
 			return;
 		}
 	}
 
 	ip6 = mtod(m0, struct ip6_hdr *);
 
 	bzero(&naddr, sizeof(naddr));
 	bzero(&dst, sizeof(dst));
 	dst.sin6_len = sizeof(dst);
 	dst.sin6_family = AF_INET6;
 	dst.sin6_addr = ip6->ip6_dst;
 
 	if (TAILQ_EMPTY(&r->rpool.list)) {
 		DPFPRINTF(PF_DEBUG_URGENT,
 		    ("%s: TAILQ_EMPTY(&r->rpool.list)\n", __func__));
 		goto bad_locked;
 	}
 
 	/*
 	 * Next hop and output interface come from the state when there is
 	 * one, otherwise from the rule's address pool.
 	 */
 	if (s != NULL) {
 		if (!PF_AZERO(&s->rt_addr, AF_INET6)) {
 			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
 			    &s->rt_addr, AF_INET6);
 		}
 		if (s->rt_kif != NULL)
 			ifp = s->rt_kif->pfik_ifp;
 		PF_STATE_UNLOCK(s);
 	} else {
 		pf_map_addr(AF_INET6, r, (struct pf_addr *)&ip6->ip6_src,
 		    &naddr, NULL, &sn);
 		if (!PF_AZERO(&naddr, AF_INET6)) {
 			PF_ACPY((struct pf_addr *)&dst.sin6_addr,
 			    &naddr, AF_INET6);
 		}
 		if (r->rpool.cur->kif != NULL)
 			ifp = r->rpool.cur->kif->pfik_ifp;
 	}
 
 	if (ifp == NULL)
 		goto bad;
 
 	/* Leaving through a different interface: filter outbound there. */
 	if (oifp != ifp) {
 		if (pf_test6(PF_OUT, PFIL_FWD, ifp, &m0, inp) != PF_PASS)
 			goto bad;
 		if (m0 == NULL)
 			goto done;
 		if (m0->m_len < sizeof(struct ip6_hdr)) {
 			DPFPRINTF(PF_DEBUG_URGENT,
 			    ("%s: m0->m_len < sizeof(struct ip6_hdr)\n",
 			    __func__));
 			goto bad;
 		}
 		ip6 = mtod(m0, struct ip6_hdr *);
 	}
 
 	if (ifp->if_flags & IFF_LOOPBACK)
 		m0->m_flags |= M_SKIP_FIREWALL;
 
 	/* Finish a delayed checksum the output interface cannot do itself. */
 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 &
 	    ~ifp->if_hwassist) {
 		uint32_t paylen = m0->m_pkthdr.len - sizeof(*ip6);
 
 		in6_delayed_cksum(m0, paylen, sizeof(struct ip6_hdr));
 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 	}
 
 	if (IN6_IS_SCOPE_EMBED(&dst.sin6_addr))
 		dst.sin6_addr.s6_addr16[1] = htons(ifp->if_index);
 
 	/*
 	 * If the packet is too large for the outgoing interface,
 	 * send back an icmp6 error (dup-to copies are just freed).
 	 */
 	if ((u_long)m0->m_pkthdr.len > ifp->if_mtu) {
 		in6_ifstat_inc(ifp, ifs6_in_toobig);
 		if (r->rt == PF_DUPTO)
 			goto bad;
 		icmp6_error(m0, ICMP6_PACKET_TOO_BIG, 0, ifp->if_mtu);
 	} else {
 		nd6_output_ifp(ifp, ifp, m0, &dst, NULL);
 	}
 	goto done;
 
 bad_locked:
 	if (s)
 		PF_STATE_UNLOCK(s);
 bad:
 	m_freem(m0);
 done:
 	if (r->rt != PF_DUPTO)
 		*m = NULL;
 	return;
 }
 #endif /* INET6 */
 
 /*
  * Verify the transport checksum of the "len" bytes of protocol "p" that
  * start at offset "off" in "m".  Returns 0 when the checksum is good and 1
  * when it is bad or cannot be checked (bad offsets/lengths, unsupported
  * protocol or address family).
  *
  * FreeBSD supports cksum offloads for the following drivers.
  *  em(4), fxp(4), lge(4), ndis(4), nge(4), re(4), ti(4), txp(4), xl(4)
  *
  * CSUM_DATA_VALID | CSUM_PSEUDO_HDR :
  *  the network driver computed the checksum including the pseudo header;
  *  only csum_data needs to be verified.
  * CSUM_DATA_VALID :
  *  the network driver computed a partial checksum without the pseudo
  *  header (no H/W support for it, for instance hme(4), sk(4) and possibly
  *  gem(4)); the pseudo header has to be added to csum_data here.
  *
  * After a TCP/UDP checksum has been validated, both CSUM_DATA_VALID and
  * CSUM_PSEUDO_HDR are set and csum_data is set to 0xffff, so that the upper
  * TCP/UDP layer does not recompute the checksum.
  */
 static int
 pf_check_proto_cksum(struct mbuf *m, int off, int len, u_int8_t p, sa_family_t af)
 {
 	struct ip	*ip;
 	u_int16_t	 sum = 0;
 	int		 hw_assist = 0;
 
 	if (off < sizeof(struct ip) || len < sizeof(struct udphdr))
 		return (1);
 	if (m->m_pkthdr.len < off + len)
 		return (1);
 
 	switch (p) {
 	case IPPROTO_TCP:
 	case IPPROTO_UDP:
 		/* Use the result delivered by the hardware, if there is one. */
 		if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) == 0)
 			break;
 		if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
 			sum = m->m_pkthdr.csum_data;
 		} else {
 			/* Fold the pseudo header into the partial sum. */
 			ip = mtod(m, struct ip *);
 			sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 			    htonl((u_short)len + m->m_pkthdr.csum_data + p));
 		}
 		sum ^= 0xffff;
 		hw_assist = 1;
 		break;
 	case IPPROTO_ICMP:
 #ifdef INET6
 	case IPPROTO_ICMPV6:
 #endif /* INET6 */
 		break;
 	default:
 		return (1);
 	}
 
 	/* No usable hardware result: compute the checksum in software. */
 	if (!hw_assist) {
 		if (af == AF_INET) {
 			if (p != IPPROTO_ICMP) {
 				if (m->m_len < sizeof(struct ip))
 					return (1);
 				sum = in4_cksum(m, p, off, len);
 			} else {
 				if (m->m_len < off)
 					return (1);
 				/*
 				 * Checksum from the ICMP header on, then put
 				 * the mbuf data pointer and length back.
 				 */
 				m->m_data += off;
 				m->m_len -= off;
 				sum = in_cksum(m, len);
 				m->m_data -= off;
 				m->m_len += off;
 			}
 #ifdef INET6
 		} else if (af == AF_INET6) {
 			if (m->m_len < sizeof(struct ip6_hdr))
 				return (1);
 			sum = in6_cksum(m, p, off, len);
 #endif /* INET6 */
 		} else {
 			return (1);
 		}
 	}
 
 	if (sum == 0) {
 		/* Good checksum: let the upper layer skip its own check. */
 		if (p == IPPROTO_TCP || p == IPPROTO_UDP) {
 			m->m_pkthdr.csum_flags |=
 			    (CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 		return (0);
 	}
 
 	/* Bad checksum: account for it in the per-protocol statistics. */
 	switch (p) {
 	case IPPROTO_TCP: {
 		KMOD_TCPSTAT_INC(tcps_rcvbadsum);
 		break;
 	}
 	case IPPROTO_UDP: {
 		KMOD_UDPSTAT_INC(udps_badsum);
 		break;
 	}
 #ifdef INET
 	case IPPROTO_ICMP: {
 		KMOD_ICMPSTAT_INC(icps_checksum);
 		break;
 	}
 #endif
 #ifdef INET6
 	case IPPROTO_ICMPV6: {
 		KMOD_ICMP6STAT_INC(icp6s_checksum);
 		break;
 	}
 #endif /* INET6 */
 	}
 	return (1);
 }
 
 
 #ifdef INET
 /*
  * pf_test: IPv4 filter entry point.  Checks one packet against the state
  * table and, when no state matches, against the ruleset.
  *
  *	dir	PF_IN or PF_OUT
  *	pflags	not referenced in this function
  *	ifp	interface; its if_pf_kif supplies the pf interface descriptor
  *	m0	in/out packet pointer; set to NULL when the mbuf is freed,
  *		deferred or handed to divert
  *	inp	passed through to pf_test_rule() and pf_route()
  *
  * Returns the verdict.  PF_SYNPROXY_DROP and PF_DEFER are reported to the
  * caller as PF_PASS with *m0 == NULL.
  */
 int
 pf_test(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
 {
 	struct pfi_kif		*kif;
 	u_short			 action, reason = 0, log = 0;
 	struct mbuf		*m = *m0;
 	struct ip		*h = NULL;
 	struct m_tag		*ipfwtag;
 	struct pf_rule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
 	struct pf_state		*s = NULL;
 	struct pf_ruleset	*ruleset = NULL;
 	struct pf_pdesc		 pd;
 	int			 off, dirndx, pqid = 0;
 
 	PF_RULES_RLOCK_TRACKER;
 
 	M_ASSERTPKTHDR(m);
 
 	/* Pass everything while pf is not running. */
 	if (!V_pf_status.running)
 		return (PF_PASS);
 
 	memset(&pd, 0, sizeof(pd));
 
 	kif = (struct pfi_kif *)ifp->if_pf_kif;
 
 	if (kif == NULL) {
 		DPFPRINTF(PF_DEBUG_URGENT,
 		    ("pf_test: kif == NULL, if_xname %s\n", ifp->if_xname));
 		return (PF_DROP);
 	}
 	/* Interfaces flagged PFI_IFLAG_SKIP are not filtered. */
 	if (kif->pfik_flags & PFI_IFLAG_SKIP)
 		return (PF_PASS);
 
 	if (m->m_flags & M_SKIP_FIREWALL)
 		return (PF_PASS);
 
 	pd.pf_mtag = pf_find_mtag(m);
 
 	PF_RULES_RLOCK();
 
 	/*
 	 * A packet carrying an MTAG_IPFW_RULE tag (while divert is loaded)
 	 * skips pf_normalize_ip().  If the tag is marked IPFW_IS_DIVERT with
 	 * rulenum 0 the packet is flagged PF_PACKET_LOOPED and the tag is
 	 * removed.
 	 */
 	if (ip_divert_ptr != NULL &&
 	    ((ipfwtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL)) != NULL)) {
 		struct ipfw_rule_ref *rr = (struct ipfw_rule_ref *)(ipfwtag+1);
 		if (rr->info & IPFW_IS_DIVERT && rr->rulenum == 0) {
 			if (pd.pf_mtag == NULL &&
 			    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
 				action = PF_DROP;
 				goto done;
 			}
 			pd.pf_mtag->flags |= PF_PACKET_LOOPED;
 			m_tag_delete(m, ipfwtag);
 		}
 		/* Restore M_FASTFWD_OURS that was parked in the pf mtag. */
 		if (pd.pf_mtag && pd.pf_mtag->flags & PF_FASTFWD_OURS_PRESENT) {
 			m->m_flags |= M_FASTFWD_OURS;
 			pd.pf_mtag->flags &= ~PF_FASTFWD_OURS_PRESENT;
 		}
 	} else if (pf_normalize_ip(m0, dir, kif, &reason, &pd) != PF_PASS) {
 		/* We do IP header normalization and packet reassembly here */
 		action = PF_DROP;
 		goto done;
 	}
 	m = *m0;	/* pf_normalize messes with m0 */
 	h = mtod(m, struct ip *);
 
 	/* ip_hl counts 32-bit words; off is the IP header length in bytes. */
 	off = h->ip_hl << 2;
 	if (off < (int)sizeof(struct ip)) {
 		action = PF_DROP;
 		REASON_SET(&reason, PFRES_SHORT);
 		log = 1;
 		goto done;
 	}
 
 	/* Fill in the packet descriptor from the IP header. */
 	pd.src = (struct pf_addr *)&h->ip_src;
 	pd.dst = (struct pf_addr *)&h->ip_dst;
 	pd.sport = pd.dport = NULL;
 	pd.ip_sum = &h->ip_sum;
 	pd.proto_sum = NULL;
 	pd.proto = h->ip_p;
 	pd.dir = dir;
 	pd.sidx = (dir == PF_IN) ? 0 : 1;
 	pd.didx = (dir == PF_IN) ? 1 : 0;
 	pd.af = AF_INET;
 	pd.tos = h->ip_tos & ~IPTOS_ECN_MASK;
 	pd.tot_len = ntohs(h->ip_len);
 
 	/* handle fragments that didn't get reassembled by normalization */
 	if (h->ip_off & htons(IP_MF | IP_OFFMASK)) {
 		action = pf_test_fragment(&r, dir, kif, m, h,
 		    &pd, &a, &ruleset);
 		goto done;
 	}
 
 	/*
 	 * Per-protocol handling.  Each case tries the state table first and
 	 * evaluates the ruleset only when no state was found (s == NULL).
 	 */
 	switch (h->ip_p) {
 
 	case IPPROTO_TCP: {
 		struct tcphdr	th;
 
 		pd.hdr.tcp = &th;
 		if (!pf_pull_hdr(m, off, &th, sizeof(th),
 		    &action, &reason, AF_INET)) {
 			log = action != PF_PASS;
 			goto done;
 		}
 		pd.p_len = pd.tot_len - off - (th.th_off << 2);
 		/* ACK without payload: select the priority queue/prio slot */
 		if ((th.th_flags & TH_ACK) && pd.p_len == 0)
 			pqid = 1;
 		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
 		if (action == PF_DROP)
 			goto done;
 		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
 		    &reason);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 	case IPPROTO_UDP: {
 		struct udphdr	uh;
 
 		pd.hdr.udp = &uh;
 		if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
 		    &action, &reason, AF_INET)) {
 			log = action != PF_PASS;
 			goto done;
 		}
 		/* Reject port 0 and UDP lengths that do not fit the packet. */
 		if (uh.uh_dport == 0 ||
 		    ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
 		    ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_SHORT);
 			goto done;
 		}
 		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 	case IPPROTO_ICMP: {
 		struct icmp	ih;
 
 		pd.hdr.icmp = &ih;
 		if (!pf_pull_hdr(m, off, &ih, ICMP_MINLEN,
 		    &action, &reason, AF_INET)) {
 			log = action != PF_PASS;
 			goto done;
 		}
 		action = pf_test_state_icmp(&s, dir, kif, m, off, h, &pd,
 		    &reason);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 #ifdef INET6
 	case IPPROTO_ICMPV6: {
 		action = PF_DROP;
 		DPFPRINTF(PF_DEBUG_MISC,
 		    ("pf: dropping IPv4 packet with ICMPv6 payload\n"));
 		goto done;
 	}
 #endif
 
 	default:
 		action = pf_test_state_other(&s, dir, kif, m, &pd);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 done:
 	/* The rules read lock is released; a matched state s stays locked. */
 	PF_RULES_RUNLOCK();
 	/*
 	 * ip_hl > 5 means the header is longer than 20 bytes, i.e. it has
 	 * options.  Drop unless the state or the rule allows them.
 	 */
 	if (action == PF_PASS && h->ip_hl > 5 &&
 	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
 		action = PF_DROP;
 		REASON_SET(&reason, PFRES_IPOPTIONS);
 		log = r->log;
 		DPFPRINTF(PF_DEBUG_MISC,
 		    ("pf: dropping packet with ip options\n"));
 	}
 
 	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
 		action = PF_DROP;
 		REASON_SET(&reason, PFRES_MEMORY);
 	}
 	if (r->rtableid >= 0)
 		M_SETFIB(m, r->rtableid);
 
 	/* set_prio[1] is used for low-delay TOS and for empty TCP ACKs. */
 	if (r->scrub_flags & PFSTATE_SETPRIO) {
 		if (pd.tos & IPTOS_LOWDELAY)
 			pqid = 1;
 		if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_MEMORY);
 			log = 1;
 			DPFPRINTF(PF_DEBUG_MISC,
 			    ("pf: failed to allocate 802.1q mtag\n"));
 		}
 	}
 
 #ifdef ALTQ
 	/* Record the queue id chosen by the rule in the pf mbuf tag. */
 	if (action == PF_PASS && r->qid) {
 		if (pd.pf_mtag == NULL &&
 		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_MEMORY);
 		} else {
 			if (s != NULL)
 				pd.pf_mtag->qid_hash = pf_state_hash(s);
 			if (pqid || (pd.tos & IPTOS_LOWDELAY))
 				pd.pf_mtag->qid = r->pqid;
 			else
 				pd.pf_mtag->qid = r->qid;
 			/* Add hints for ecn. */
 			pd.pf_mtag->hdr = h;
 		}
 
 	}
 #endif /* ALTQ */
 
 	/*
 	 * connections redirected to loopback should not match sockets
 	 * bound specifically to loopback due to security implications,
 	 * see tcp_input() and in_pcblookup_listen().
 	 */
 	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
 	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
 	    (s->nat_rule.ptr->action == PF_RDR ||
 	    s->nat_rule.ptr->action == PF_BINAT) &&
 	    IN_LOOPBACK(ntohl(pd.dst->v4.s_addr)))
 		m->m_flags |= M_SKIP_FIREWALL;
 
 	/*
 	 * Divert: tag the packet and hand it to ip_divert_ptr().  On success
 	 * this path returns immediately, so the logging and counter updates
 	 * below are skipped.
 	 */
 	if (action == PF_PASS && r->divert.port && ip_divert_ptr != NULL &&
 	    !PACKET_LOOPED(&pd)) {
 
 		ipfwtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
 		    sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
 		if (ipfwtag != NULL) {
 			((struct ipfw_rule_ref *)(ipfwtag+1))->info =
 			    ntohs(r->divert.port);
 			((struct ipfw_rule_ref *)(ipfwtag+1))->rulenum = dir;
 
 			if (s)
 				PF_STATE_UNLOCK(s);
 
 			m_tag_prepend(m, ipfwtag);
 			/* Park M_FASTFWD_OURS in the pf mtag across divert. */
 			if (m->m_flags & M_FASTFWD_OURS) {
 				if (pd.pf_mtag == NULL &&
 				    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
 					action = PF_DROP;
 					REASON_SET(&reason, PFRES_MEMORY);
 					log = 1;
 					DPFPRINTF(PF_DEBUG_MISC,
 					    ("pf: failed to allocate tag\n"));
 				} else {
 					pd.pf_mtag->flags |=
 					    PF_FASTFWD_OURS_PRESENT;
 					m->m_flags &= ~M_FASTFWD_OURS;
 				}
 			}
 			ip_divert_ptr(*m0, dir == PF_IN);
 			*m0 = NULL;
 
 			return (action);
 		} else {
 			/* XXX: ipfw has the same behaviour! */
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_MEMORY);
 			log = 1;
 			DPFPRINTF(PF_DEBUG_MISC,
 			    ("pf: failed to allocate divert tag\n"));
 		}
 	}
 
 	if (log) {
 		struct pf_rule *lr;
 
 		/* Log against the NAT rule when it asks for PF_LOG_ALL. */
 		if (s != NULL && s->nat_rule.ptr != NULL &&
 		    s->nat_rule.ptr->log & PF_LOG_ALL)
 			lr = s->nat_rule.ptr;
 		else
 			lr = r;
 		PFLOG_PACKET(kif, m, AF_INET, dir, reason, lr, a, ruleset, &pd,
 		    (s == NULL));
 	}
 
 	/* Interface counters: [0 here, 1 in pf_test6][out?][not passed?] */
 	kif->pfik_bytes[0][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
 	kif->pfik_packets[0][dir == PF_OUT][action != PF_PASS]++;
 
 	/* Rule, anchor, state, source-node and table counters. */
 	if (action == PF_PASS || r->action == PF_DROP) {
 		dirndx = (dir == PF_OUT);
 		r->packets[dirndx]++;
 		r->bytes[dirndx] += pd.tot_len;
 		if (a != NULL) {
 			a->packets[dirndx]++;
 			a->bytes[dirndx] += pd.tot_len;
 		}
 		if (s != NULL) {
 			if (s->nat_rule.ptr != NULL) {
 				s->nat_rule.ptr->packets[dirndx]++;
 				s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
 			}
 			if (s->src_node != NULL) {
 				s->src_node->packets[dirndx]++;
 				s->src_node->bytes[dirndx] += pd.tot_len;
 			}
 			if (s->nat_src_node != NULL) {
 				s->nat_src_node->packets[dirndx]++;
 				s->nat_src_node->bytes[dirndx] += pd.tot_len;
 			}
 			/* State counters are relative to the state's direction. */
 			dirndx = (dir == s->direction) ? 0 : 1;
 			s->packets[dirndx]++;
 			s->bytes[dirndx] += pd.tot_len;
 		}
 		/* Use the NAT rule's tables when only the default rule matched. */
 		tr = r;
 		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
 		if (nr != NULL && r == &V_pf_default_rule)
 			tr = nr;
 		if (tr->src.addr.type == PF_ADDR_TABLE)
 			pfr_update_stats(tr->src.addr.p.tbl,
 			    (s == NULL) ? pd.src :
 			    &s->key[(s->direction == PF_IN)]->
 				addr[(s->direction == PF_OUT)],
 			    pd.af, pd.tot_len, dir == PF_OUT,
 			    r->action == PF_PASS, tr->src.neg);
 		if (tr->dst.addr.type == PF_ADDR_TABLE)
 			pfr_update_stats(tr->dst.addr.p.tbl,
 			    (s == NULL) ? pd.dst :
 			    &s->key[(s->direction == PF_IN)]->
 				addr[(s->direction == PF_IN)],
 			    pd.af, pd.tot_len, dir == PF_OUT,
 			    r->action == PF_PASS, tr->dst.neg);
 	}
 
 	/* Dispose of the mbuf according to the final verdict. */
 	switch (action) {
 	case PF_SYNPROXY_DROP:
 		m_freem(*m0);
 		/* FALLTHROUGH: reported to the caller like PF_DEFER */
 	case PF_DEFER:
 		*m0 = NULL;
 		action = PF_PASS;
 		break;
 	case PF_DROP:
 		m_freem(*m0);
 		*m0 = NULL;
 		break;
 	default:
 		/* pf_route() returns unlocked. */
 		if (r->rt) {
 			pf_route(m0, r, dir, kif->pfik_ifp, s, &pd, inp);
 			return (action);
 		}
 		break;
 	}
 	if (s)
 		PF_STATE_UNLOCK(s);
 
 	return (action);
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * pf_test6: IPv6 filter entry point.  Walks the extension header chain,
  * then checks the packet against the state table and, when no state
  * matches, against the ruleset.
  *
  *	dir	PF_IN or PF_OUT
  *	pflags	only PFIL_FWD is examined (refragmentation at the end)
  *	ifp	interface; its if_pf_kif supplies the pf interface descriptor
  *	m0	in/out packet pointer; set to NULL when the mbuf is freed or
  *		deferred
  *	inp	passed through to pf_test_rule() and pf_route6()
  *
  * Returns the verdict.  PF_SYNPROXY_DROP and PF_DEFER are reported to the
  * caller as PF_PASS with *m0 == NULL.
  */
 int
 pf_test6(int dir, int pflags, struct ifnet *ifp, struct mbuf **m0, struct inpcb *inp)
 {
 	struct pfi_kif		*kif;
 	u_short			 action, reason = 0, log = 0;
 	struct mbuf		*m = *m0, *n = NULL;
 	struct m_tag		*mtag;
 	struct ip6_hdr		*h = NULL;
 	struct pf_rule		*a = NULL, *r = &V_pf_default_rule, *tr, *nr;
 	struct pf_state		*s = NULL;
 	struct pf_ruleset	*ruleset = NULL;
 	struct pf_pdesc		 pd;
 	int			 off, terminal = 0, dirndx, rh_cnt = 0, pqid = 0;
 
 	PF_RULES_RLOCK_TRACKER;
 	M_ASSERTPKTHDR(m);
 
 	/* Pass everything while pf is not running. */
 	if (!V_pf_status.running)
 		return (PF_PASS);
 
 	memset(&pd, 0, sizeof(pd));
 	pd.pf_mtag = pf_find_mtag(m);
 
 	/* Packets tagged PF_TAG_GENERATED are passed without filtering. */
 	if (pd.pf_mtag && pd.pf_mtag->flags & PF_TAG_GENERATED)
 		return (PF_PASS);
 
 	kif = (struct pfi_kif *)ifp->if_pf_kif;
 	if (kif == NULL) {
 		DPFPRINTF(PF_DEBUG_URGENT,
 		    ("pf_test6: kif == NULL, if_xname %s\n", ifp->if_xname));
 		return (PF_DROP);
 	}
 	/* Interfaces flagged PFI_IFLAG_SKIP are not filtered. */
 	if (kif->pfik_flags & PFI_IFLAG_SKIP)
 		return (PF_PASS);
 
 	if (m->m_flags & M_SKIP_FIREWALL)
 		return (PF_PASS);
 
 	PF_RULES_RLOCK();
 
 	/* We do IP header normalization and packet reassembly here */
 	if (pf_normalize_ip6(m0, dir, kif, &reason, &pd) != PF_PASS) {
 		action = PF_DROP;
 		goto done;
 	}
 	m = *m0;	/* pf_normalize messes with m0 */
 	h = mtod(m, struct ip6_hdr *);
 
-#if 1
 	/*
-	 * we do not support jumbogram yet.  if we keep going, zero ip6_plen
+	 * we do not support jumbogram.  if we keep going, zero ip6_plen
 	 * will do something bad, so drop the packet for now.
 	 */
 	if (htons(h->ip6_plen) == 0) {
 		action = PF_DROP;
 		REASON_SET(&reason, PFRES_NORM);	/*XXX*/
 		goto done;
 	}
-#endif
 
 	/* Fill in the packet descriptor from the IPv6 header. */
 	pd.src = (struct pf_addr *)&h->ip6_src;
 	pd.dst = (struct pf_addr *)&h->ip6_dst;
 	pd.sport = pd.dport = NULL;
 	pd.ip_sum = NULL;
 	pd.proto_sum = NULL;
 	pd.dir = dir;
 	pd.sidx = (dir == PF_IN) ? 0 : 1;
 	pd.didx = (dir == PF_IN) ? 1 : 0;
 	pd.af = AF_INET6;
 	pd.tos = 0;
 	pd.tot_len = ntohs(h->ip6_plen) + sizeof(struct ip6_hdr);
 
 	/*
 	 * Walk the extension header chain until a protocol that is not one
 	 * of the handled extension headers is reached.  On exit off is the
 	 * offset of that header and pd.proto its protocol number.
 	 */
 	off = ((caddr_t)h - m->m_data) + sizeof(struct ip6_hdr);
 	pd.proto = h->ip6_nxt;
 	do {
 		switch (pd.proto) {
 		case IPPROTO_FRAGMENT:
 			action = pf_test_fragment(&r, dir, kif, m, h,
 			    &pd, &a, &ruleset);
 			if (action == PF_DROP)
 				REASON_SET(&reason, PFRES_FRAG);
 			goto done;
 		case IPPROTO_ROUTING: {
 			struct ip6_rthdr rthdr;
 
 			/* Only one routing header is tolerated per packet. */
 			if (rh_cnt++) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: IPv6 more than one rthdr\n"));
 				action = PF_DROP;
 				REASON_SET(&reason, PFRES_IPOPTIONS);
 				log = 1;
 				goto done;
 			}
 			if (!pf_pull_hdr(m, off, &rthdr, sizeof(rthdr), NULL,
 			    &reason, pd.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: IPv6 short rthdr\n"));
 				action = PF_DROP;
 				REASON_SET(&reason, PFRES_SHORT);
 				log = 1;
 				goto done;
 			}
 			/* Type 0 routing headers are always dropped. */
 			if (rthdr.ip6r_type == IPV6_RTHDR_TYPE_0) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: IPv6 rthdr0\n"));
 				action = PF_DROP;
 				REASON_SET(&reason, PFRES_IPOPTIONS);
 				log = 1;
 				goto done;
 			}
 			/* FALLTHROUGH */
 		}
 		case IPPROTO_AH:
 		case IPPROTO_HOPOPTS:
 		case IPPROTO_DSTOPTS: {
 			/* get next header and header length */
 			struct ip6_ext	opt6;
 
 			if (!pf_pull_hdr(m, off, &opt6, sizeof(opt6),
 			    NULL, &reason, pd.af)) {
 				DPFPRINTF(PF_DEBUG_MISC,
 				    ("pf: IPv6 short opt\n"));
 				action = PF_DROP;
 				log = 1;
 				goto done;
 			}
 			/*
 			 * AH is skipped as (ip6e_len + 2) * 4 bytes, the
 			 * other extension headers as (ip6e_len + 1) * 8.
 			 */
 			if (pd.proto == IPPROTO_AH)
 				off += (opt6.ip6e_len + 2) * 4;
 			else
 				off += (opt6.ip6e_len + 1) * 8;
 			pd.proto = opt6.ip6e_nxt;
 			/* goto the next header */
 			break;
 		}
 		default:
 			terminal++;
 			break;
 		}
 	} while (!terminal);
 
 	/* if there's no routing header, use unmodified mbuf for checksumming */
 	/* NOTE(review): nothing else in this function assigns n. */
 	if (!n)
 		n = m;
 
 	/*
 	 * Per-protocol handling.  Each case tries the state table first and
 	 * evaluates the ruleset only when no state was found (s == NULL).
 	 */
 	switch (pd.proto) {
 
 	case IPPROTO_TCP: {
 		struct tcphdr	th;
 
 		pd.hdr.tcp = &th;
 		if (!pf_pull_hdr(m, off, &th, sizeof(th),
 		    &action, &reason, AF_INET6)) {
 			log = action != PF_PASS;
 			goto done;
 		}
 		pd.p_len = pd.tot_len - off - (th.th_off << 2);
 		action = pf_normalize_tcp(dir, kif, m, 0, off, h, &pd);
 		if (action == PF_DROP)
 			goto done;
 		action = pf_test_state_tcp(&s, dir, kif, m, off, h, &pd,
 		    &reason);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 	case IPPROTO_UDP: {
 		struct udphdr	uh;
 
 		pd.hdr.udp = &uh;
 		if (!pf_pull_hdr(m, off, &uh, sizeof(uh),
 		    &action, &reason, AF_INET6)) {
 			log = action != PF_PASS;
 			goto done;
 		}
 		/* Reject port 0 and UDP lengths that do not fit the packet. */
 		if (uh.uh_dport == 0 ||
 		    ntohs(uh.uh_ulen) > m->m_pkthdr.len - off ||
 		    ntohs(uh.uh_ulen) < sizeof(struct udphdr)) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_SHORT);
 			goto done;
 		}
 		action = pf_test_state_udp(&s, dir, kif, m, off, h, &pd);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 	case IPPROTO_ICMP: {
 		action = PF_DROP;
 		DPFPRINTF(PF_DEBUG_MISC,
 		    ("pf: dropping IPv6 packet with ICMPv4 payload\n"));
 		goto done;
 	}
 
 	case IPPROTO_ICMPV6: {
 		struct icmp6_hdr	ih;
 
 		pd.hdr.icmp6 = &ih;
 		if (!pf_pull_hdr(m, off, &ih, sizeof(ih),
 		    &action, &reason, AF_INET6)) {
 			log = action != PF_PASS;
 			goto done;
 		}
 		action = pf_test_state_icmp(&s, dir, kif,
 		    m, off, h, &pd, &reason);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 	default:
 		action = pf_test_state_other(&s, dir, kif, m, &pd);
 		if (action == PF_PASS) {
 			if (V_pfsync_update_state_ptr != NULL)
 				V_pfsync_update_state_ptr(s);
 			r = s->rule.ptr;
 			a = s->anchor.ptr;
 			log = s->log;
 		} else if (s == NULL)
 			action = pf_test_rule(&r, &s, dir, kif, m, off, &pd,
 			    &a, &ruleset, inp);
 		break;
 	}
 
 done:
 	/* The rules read lock is released; a matched state s stays locked. */
 	PF_RULES_RUNLOCK();
 	/* n is either NULL (early goto done) or equal to m at this point. */
 	if (n != m) {
 		m_freem(n);
 		n = NULL;
 	}
 
 	/* handle dangerous IPv6 extension headers. */
 	/* rh_cnt != 0 means a routing header was seen in the walk above. */
 	if (action == PF_PASS && rh_cnt &&
 	    !((s && s->state_flags & PFSTATE_ALLOWOPTS) || r->allow_opts)) {
 		action = PF_DROP;
 		REASON_SET(&reason, PFRES_IPOPTIONS);
 		log = r->log;
 		DPFPRINTF(PF_DEBUG_MISC,
 		    ("pf: dropping packet with dangerous v6 headers\n"));
 	}
 
 	if (s && s->tag > 0 && pf_tag_packet(m, &pd, s->tag)) {
 		action = PF_DROP;
 		REASON_SET(&reason, PFRES_MEMORY);
 	}
 	if (r->rtableid >= 0)
 		M_SETFIB(m, r->rtableid);
 
 	/* pd.tos is set to 0 above, so pqid stays 0 in this function. */
 	if (r->scrub_flags & PFSTATE_SETPRIO) {
 		if (pd.tos & IPTOS_LOWDELAY)
 			pqid = 1;
 		if (pf_ieee8021q_setpcp(m, r->set_prio[pqid])) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_MEMORY);
 			log = 1;
 			DPFPRINTF(PF_DEBUG_MISC,
 			    ("pf: failed to allocate 802.1q mtag\n"));
 		}
 	}
 
 #ifdef ALTQ
 	/* Record the queue id chosen by the rule in the pf mbuf tag. */
 	if (action == PF_PASS && r->qid) {
 		if (pd.pf_mtag == NULL &&
 		    ((pd.pf_mtag = pf_get_mtag(m)) == NULL)) {
 			action = PF_DROP;
 			REASON_SET(&reason, PFRES_MEMORY);
 		} else {
 			if (s != NULL)
 				pd.pf_mtag->qid_hash = pf_state_hash(s);
 			if (pd.tos & IPTOS_LOWDELAY)
 				pd.pf_mtag->qid = r->pqid;
 			else
 				pd.pf_mtag->qid = r->qid;
 			/* Add hints for ecn. */
 			pd.pf_mtag->hdr = h;
 		}
 	}
 #endif /* ALTQ */
 
 	/*
 	 * Inbound TCP/UDP redirected (rdr/binat) to the loopback address:
 	 * mark the mbuf M_SKIP_FIREWALL, as pf_test() does for IPv4.
 	 */
 	if (dir == PF_IN && action == PF_PASS && (pd.proto == IPPROTO_TCP ||
 	    pd.proto == IPPROTO_UDP) && s != NULL && s->nat_rule.ptr != NULL &&
 	    (s->nat_rule.ptr->action == PF_RDR ||
 	    s->nat_rule.ptr->action == PF_BINAT) &&
 	    IN6_IS_ADDR_LOOPBACK(&pd.dst->v6))
 		m->m_flags |= M_SKIP_FIREWALL;
 
 	/* XXX: Anybody working on it?! */
 	if (r->divert.port)
 		printf("pf: divert(9) is not supported for IPv6\n");
 
 	if (log) {
 		struct pf_rule *lr;
 
 		/* Log against the NAT rule when it asks for PF_LOG_ALL. */
 		if (s != NULL && s->nat_rule.ptr != NULL &&
 		    s->nat_rule.ptr->log & PF_LOG_ALL)
 			lr = s->nat_rule.ptr;
 		else
 			lr = r;
 		PFLOG_PACKET(kif, m, AF_INET6, dir, reason, lr, a, ruleset,
 		    &pd, (s == NULL));
 	}
 
 	/* Interface counters: [1 here, 0 in pf_test][out?][not passed?] */
 	kif->pfik_bytes[1][dir == PF_OUT][action != PF_PASS] += pd.tot_len;
 	kif->pfik_packets[1][dir == PF_OUT][action != PF_PASS]++;
 
 	/* Rule, anchor, state, source-node and table counters. */
 	if (action == PF_PASS || r->action == PF_DROP) {
 		dirndx = (dir == PF_OUT);
 		r->packets[dirndx]++;
 		r->bytes[dirndx] += pd.tot_len;
 		if (a != NULL) {
 			a->packets[dirndx]++;
 			a->bytes[dirndx] += pd.tot_len;
 		}
 		if (s != NULL) {
 			if (s->nat_rule.ptr != NULL) {
 				s->nat_rule.ptr->packets[dirndx]++;
 				s->nat_rule.ptr->bytes[dirndx] += pd.tot_len;
 			}
 			if (s->src_node != NULL) {
 				s->src_node->packets[dirndx]++;
 				s->src_node->bytes[dirndx] += pd.tot_len;
 			}
 			if (s->nat_src_node != NULL) {
 				s->nat_src_node->packets[dirndx]++;
 				s->nat_src_node->bytes[dirndx] += pd.tot_len;
 			}
 			/* State counters are relative to the state's direction. */
 			dirndx = (dir == s->direction) ? 0 : 1;
 			s->packets[dirndx]++;
 			s->bytes[dirndx] += pd.tot_len;
 		}
 		/* Use the NAT rule's tables when only the default rule matched. */
 		tr = r;
 		nr = (s != NULL) ? s->nat_rule.ptr : pd.nat_rule;
 		if (nr != NULL && r == &V_pf_default_rule)
 			tr = nr;
 		if (tr->src.addr.type == PF_ADDR_TABLE)
 			pfr_update_stats(tr->src.addr.p.tbl,
 			    (s == NULL) ? pd.src :
 			    &s->key[(s->direction == PF_IN)]->addr[0],
 			    pd.af, pd.tot_len, dir == PF_OUT,
 			    r->action == PF_PASS, tr->src.neg);
 		if (tr->dst.addr.type == PF_ADDR_TABLE)
 			pfr_update_stats(tr->dst.addr.p.tbl,
 			    (s == NULL) ? pd.dst :
 			    &s->key[(s->direction == PF_IN)]->addr[1],
 			    pd.af, pd.tot_len, dir == PF_OUT,
 			    r->action == PF_PASS, tr->dst.neg);
 	}
 
 	/* Dispose of the mbuf according to the final verdict. */
 	switch (action) {
 	case PF_SYNPROXY_DROP:
 		m_freem(*m0);
 		/* FALLTHROUGH: reported to the caller like PF_DEFER */
 	case PF_DEFER:
 		*m0 = NULL;
 		action = PF_PASS;
 		break;
 	case PF_DROP:
 		m_freem(*m0);
 		*m0 = NULL;
 		break;
 	default:
 		/* pf_route6() returns unlocked. */
 		if (r->rt) {
 			pf_route6(m0, r, dir, kif->pfik_ifp, s, &pd, inp);
 			return (action);
 		}
 		break;
 	}
 
 	if (s)
 		PF_STATE_UNLOCK(s);
 
 	/* If reassembled packet passed, create new fragments. */
 	if (action == PF_PASS && *m0 && (pflags & PFIL_FWD) &&
 	    (mtag = m_tag_find(m, PF_REASSEMBLED, NULL)) != NULL)
 		action = pf_refragment6(ifp, m0, mtag);
 
 	return (action);
 }
 #endif /* INET6 */
Index: projects/fuse2/sys/netpfil/pf/pf_norm.c
===================================================================
--- projects/fuse2/sys/netpfil/pf/pf_norm.c	(revision 350434)
+++ projects/fuse2/sys/netpfil/pf/pf_norm.c	(revision 350435)
@@ -1,2029 +1,2004 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright 2001 Niels Provos <provos@citi.umich.edu>
  * Copyright 2011-2018 Alexander Bluhm <bluhm@openbsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  *	$OpenBSD: pf_norm.c,v 1.114 2009/01/29 14:11:45 henning Exp $
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_pf.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/refcount.h>
 #include <sys/socket.h>
 
 #include <net/if.h>
 #include <net/vnet.h>
 #include <net/pfvar.h>
 #include <net/if_pflog.h>
 
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif /* INET6 */
 
 /*
  * One queued fragment of a packet under reassembly.  Entries are linked
  * through fr_next into the fr_queue of a struct pf_fragment.
  */
 struct pf_frent {
 	TAILQ_ENTRY(pf_frent)	fr_next;
 	struct mbuf	*fe_m;		/* mbuf chain holding this fragment */
 	uint16_t	fe_hdrlen;	/* ipv4 header length with ip options
 					   ipv6, extension, fragment header */
 	uint16_t	fe_extoff;	/* last extension header offset or 0 */
 	uint16_t	fe_len;		/* fragment length */
 	uint16_t	fe_off;		/* fragment offset */
 	uint16_t	fe_mff;		/* more fragment flag */
 };
 
 /*
  * Lookup key identifying the packet a fragment belongs to.
  * pf_find_fragment() casts a pointer to this structure to
  * struct pf_fragment *, which relies on it being the first member
  * (fr_key) of struct pf_fragment.
  */
 struct pf_fragment_cmp {
 	struct pf_addr	frc_src;	/* source address */
 	struct pf_addr	frc_dst;	/* destination address */
 	uint32_t	frc_id;		/* fragment id */
 	sa_family_t	frc_af;		/* address family */
 	uint8_t		frc_proto;	/* protocol number */
 };
 
 /*
  * Reassembly state for one fragmented packet.  Each instance sits in the
  * RB tree (fr_entry) for lookup by key and in the V_pf_fragqueue TAILQ
  * (frag_next), which pf_find_fragment() keeps ordered by most recent use.
  */
 struct pf_fragment {
 	/* must stay first: lookups pass a pf_fragment_cmp cast to this type */
 	struct pf_fragment_cmp	fr_key;
 #define fr_src	fr_key.frc_src
 #define fr_dst	fr_key.frc_dst
 #define fr_id	fr_key.frc_id
 #define fr_af	fr_key.frc_af
 #define fr_proto	fr_key.frc_proto
 
 	/* pointers to queue element */
 	struct pf_frent	*fr_firstoff[PF_FRAG_ENTRY_POINTS];
 	/* count entries between pointers */
 	uint8_t	fr_entries[PF_FRAG_ENTRY_POINTS];
 	RB_ENTRY(pf_fragment) fr_entry;
 	TAILQ_ENTRY(pf_fragment) frag_next;
 	uint32_t	fr_timeout;	/* time_uptime of the last lookup hit */
 	uint16_t	fr_maxlen;	/* maximum length of single fragment */
 	u_int16_t	fr_holes;	/* number of holes in the queue */
 	TAILQ_HEAD(pf_fragq, pf_frent) fr_queue;	/* queued fragments */
 };
 
 /*
  * Per-packet record describing a reassembled packet.
  * NOTE(review): presumably the payload of the PF_REASSEMBLED mbuf tag
  * consumed when refragmenting - confirm against the code that uses it.
  */
 struct pf_fragment_tag {
 	uint16_t	ft_hdrlen;	/* header length of reassembled pkt */
 	uint16_t	ft_extoff;	/* last extension header offset or 0 */
 	uint16_t	ft_maxlen;	/* maximum fragment payload length */
 	uint32_t	ft_id;		/* fragment id */
 };
 
 /*
  * Mutex for the fragment reassembly data.  The helpers in this file that
  * touch the fragment queue or tree assert it with PF_FRAG_ASSERT().
  */
 static struct mtx pf_frag_mtx;
 MTX_SYSINIT(pf_frag_mtx, &pf_frag_mtx, "pf fragments", MTX_DEF);
 #define PF_FRAG_LOCK()		mtx_lock(&pf_frag_mtx)
 #define PF_FRAG_UNLOCK()	mtx_unlock(&pf_frag_mtx)
 #define PF_FRAG_ASSERT()	mtx_assert(&pf_frag_mtx, MA_OWNED)
 
 /* UMA zone for struct pf_state_scrub, created in pf_normalize_init(). */
 VNET_DEFINE(uma_zone_t, pf_state_scrub_z);	/* XXX: shared with pfsync */
 
 /* UMA zones for struct pf_frent and struct pf_fragment respectively. */
 VNET_DEFINE_STATIC(uma_zone_t, pf_frent_z);
 #define	V_pf_frent_z	VNET(pf_frent_z)
 VNET_DEFINE_STATIC(uma_zone_t, pf_frag_z);
 #define	V_pf_frag_z	VNET(pf_frag_z)
 
 /*
  * All packets under reassembly: a TAILQ that pf_find_fragment() keeps in
  * most-recently-used order, and an RB tree keyed by pf_frag_compare().
  */
 TAILQ_HEAD(pf_fragqueue, pf_fragment);
 TAILQ_HEAD(pf_cachequeue, pf_fragment);
 VNET_DEFINE_STATIC(struct pf_fragqueue,	pf_fragqueue);
 #define	V_pf_fragqueue			VNET(pf_fragqueue)
 RB_HEAD(pf_frag_tree, pf_fragment);
 VNET_DEFINE_STATIC(struct pf_frag_tree,	pf_frag_tree);
 #define	V_pf_frag_tree			VNET(pf_frag_tree)
 static int		 pf_frag_compare(struct pf_fragment *,
 			    struct pf_fragment *);
 static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
 static RB_GENERATE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
 
 /* Forward declarations for the fragment handling helpers in this file. */
 static void	pf_flush_fragments(void);
 static void	pf_free_fragment(struct pf_fragment *);
 static void	pf_remove_fragment(struct pf_fragment *);
 static int	pf_normalize_tcpopt(struct pf_rule *, struct mbuf *,
 		    struct tcphdr *, int, sa_family_t);
 static struct pf_frent *pf_create_fragment(u_short *);
 static int	pf_frent_holes(struct pf_frent *frent);
 static struct pf_fragment *pf_find_fragment(struct pf_fragment_cmp *key,
 		    struct pf_frag_tree *tree);
 static inline int	pf_frent_index(struct pf_frent *);
 static int	pf_frent_insert(struct pf_fragment *,
 			    struct pf_frent *, struct pf_frent *);
 /* NOTE(review): the next two are declared without static, unlike the rest. */
 void			pf_frent_remove(struct pf_fragment *,
 			    struct pf_frent *);
 struct pf_frent		*pf_frent_previous(struct pf_fragment *,
 			    struct pf_frent *);
 static struct pf_fragment *pf_fillup_fragment(struct pf_fragment_cmp *,
 		    struct pf_frent *, u_short *);
 static struct mbuf *pf_join_fragment(struct pf_fragment *);
 #ifdef INET
 static void	pf_scrub_ip(struct mbuf **, uint32_t, uint8_t, uint8_t);
 static int	pf_reassemble(struct mbuf **, struct ip *, int, u_short *);
 #endif	/* INET */
 #ifdef INET6
 static int	pf_reassemble6(struct mbuf **, struct ip6_hdr *,
 		    struct ip6_frag *, uint16_t, uint16_t, u_short *);
 static void	pf_scrub_ip6(struct mbuf **, uint8_t);
 #endif	/* INET6 */
 
 /*
  * Debug printf for this file: prints the calling function's name followed
  * by the message when the pf debug level is at least PF_DEBUG_MISC.
  * x must be a complete, parenthesized printf argument list, e.g.
  * DPFPRINTF(("value %d\n", v)).
  */
 #define	DPFPRINTF(x) do {				\
 	if (V_pf_status.debug >= PF_DEBUG_MISC) {	\
 		printf("%s: ", __func__);		\
 		printf x ;				\
 	}						\
 } while(0)
 
 #ifdef INET
 /*
  * Build the reassembly lookup key for an IPv4 header: address family,
  * protocol, IP id and the source/destination addresses.  The dir argument
  * is accepted but not used.
  */
 static void
 pf_ip2key(struct ip *ip, int dir, struct pf_fragment_cmp *key)
 {
 
 	key->frc_af = AF_INET;
 	key->frc_proto = ip->ip_p;
 	key->frc_id = ip->ip_id;
 	key->frc_src.v4 = ip->ip_src;
 	key->frc_dst.v4 = ip->ip_dst;
 }
 #endif	/* INET */
 
 void
 pf_normalize_init(void)
 {
 
 	V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	V_pf_state_scrub_z = uma_zcreate("pf state scrubs",
 	    sizeof(struct pf_state_scrub),  NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
 
 	V_pf_limits[PF_LIMIT_FRAGS].zone = V_pf_frent_z;
 	V_pf_limits[PF_LIMIT_FRAGS].limit = PFFRAG_FRENT_HIWAT;
 	uma_zone_set_max(V_pf_frent_z, PFFRAG_FRENT_HIWAT);
 	uma_zone_set_warning(V_pf_frent_z, "PF frag entries limit reached");
 
 	TAILQ_INIT(&V_pf_fragqueue);
 }
 
 /*
  * Destroy the three UMA zones created by pf_normalize_init(), in reverse
  * order of their creation.
  */
 void
 pf_normalize_cleanup(void)
 {
 
 	uma_zdestroy(V_pf_state_scrub_z);
 	uma_zdestroy(V_pf_frent_z);
 	uma_zdestroy(V_pf_frag_z);
 }
 
 /*
  * RB tree comparator for struct pf_fragment.  Orders by fragment id,
  * then protocol, address family, source address and destination address.
  * Returns a negative, zero or positive value when a sorts before, equal
  * to or after b.
  */
 static int
 pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
 {
 	int	diff;
 
 	/*
 	 * fr_id is a uint32_t.  Truncating the unsigned difference to int
 	 * would not be a transitive ordering for ids more than 2^31 apart,
 	 * so compare explicitly.
 	 */
 	if (a->fr_id != b->fr_id)
 		return (a->fr_id < b->fr_id ? -1 : 1);
 	/* The narrower fields promote to int, so subtraction is safe. */
 	if ((diff = a->fr_proto - b->fr_proto) != 0)
 		return (diff);
 	if ((diff = a->fr_af - b->fr_af) != 0)
 		return (diff);
 	if ((diff = pf_addr_cmp(&a->fr_src, &b->fr_src, a->fr_af)) != 0)
 		return (diff);
 	if ((diff = pf_addr_cmp(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0)
 		return (diff);
 	return (0);
 }
 
 /*
  * Purge every reassembly queue that has been idle for longer than the
  * default rule's PFTM_FRAG timeout.
  */
 void
 pf_purge_expired_fragments(void)
 {
 	u_int32_t	cutoff;
 
 	cutoff = time_uptime - V_pf_default_rule.timeout[PFTM_FRAG];
 	pf_purge_fragments(cutoff);
 }
 
 /*
  * Free reassembly queues from the tail of V_pf_fragqueue for as long as
  * their fr_timeout is not later than expire.  Takes and releases the
  * fragment lock itself.
  */
 void
 pf_purge_fragments(uint32_t expire)
 {
 	struct pf_fragment	*oldest;
 
 	PF_FRAG_LOCK();
 	for (;;) {
 		oldest = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue);
 		/* Stop at an empty queue or at the first unexpired entry. */
 		if (oldest == NULL || oldest->fr_timeout > expire)
 			break;
 
 		DPFPRINTF(("expiring %d(%p)\n", oldest->fr_id, oldest));
 		pf_free_fragment(oldest);
 	}
 	PF_FRAG_UNLOCK();
 }
 
 /*
  * Try to flush old fragments to make space for new ones
  */
 static void
 pf_flush_fragments(void)
 {
 	struct pf_fragment	*frag;
 	int			 goal;
 
 	PF_FRAG_ASSERT();
 
 	goal = uma_zone_get_cur(V_pf_frent_z) * 9 / 10;
 	DPFPRINTF(("trying to free %d frag entriess\n", goal));
 	while (goal < uma_zone_get_cur(V_pf_frent_z)) {
 		frag = TAILQ_LAST(&V_pf_fragqueue, pf_fragqueue);
 		if (frag)
 			pf_free_fragment(frag);
 		else
 			break;
 	}
 }
 
 /* Frees the fragments and all associated entries */
 static void
 pf_free_fragment(struct pf_fragment *frag)
 {
 	struct pf_frent		*frent;
 
 	PF_FRAG_ASSERT();
 
 	/* Free all fragments */
 	for (frent = TAILQ_FIRST(&frag->fr_queue); frent;
 	    frent = TAILQ_FIRST(&frag->fr_queue)) {
 		TAILQ_REMOVE(&frag->fr_queue, frent, fr_next);
 
 		m_freem(frent->fe_m);
 		uma_zfree(V_pf_frent_z, frent);
 	}
 
 	pf_remove_fragment(frag);
 }
 
 /*
  * Look up the reassembly queue matching key in tree.  On a hit the
  * queue's timeout is refreshed and it is moved to the head of
  * V_pf_fragqueue.  Returns NULL when nothing matches.  The fragment lock
  * must be held.
  */
 static struct pf_fragment *
 pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree)
 {
 	struct pf_fragment	*hit;
 
 	PF_FRAG_ASSERT();
 
 	hit = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key);
 	if (hit == NULL)
 		return (NULL);
 
 	/* XXX Are we sure we want to update the timeout? */
 	hit->fr_timeout = time_uptime;
 	TAILQ_REMOVE(&V_pf_fragqueue, hit, frag_next);
 	TAILQ_INSERT_HEAD(&V_pf_fragqueue, hit, frag_next);
 
 	return (hit);
 }
 
 /*
  * Unlink frag from both the fragment queue and the fragment tree, then
  * return it to its zone.  Queued entries are not touched here.  The
  * fragment lock must be held.
  */
 static void
 pf_remove_fragment(struct pf_fragment *frag)
 {
 
 	PF_FRAG_ASSERT();
 	KASSERT(frag, ("frag != NULL"));
 
 	TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
 	RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag);
 	uma_zfree(V_pf_frag_z, frag);
 }
 
 /*
  * Allocate a fragment entry without sleeping.  If the first attempt
  * fails, old fragments are flushed and the allocation is retried once.
  * On final failure *reason is set to PFRES_MEMORY and NULL is returned.
  * The fragment lock must be held.
  */
 static struct pf_frent *
 pf_create_fragment(u_short *reason)
 {
 	struct pf_frent *entry;
 
 	PF_FRAG_ASSERT();
 
 	entry = uma_zalloc(V_pf_frent_z, M_NOWAIT);
 	if (entry != NULL)
 		return (entry);
 
 	/* Make room by flushing old fragments, then try once more. */
 	pf_flush_fragments();
 	entry = uma_zalloc(V_pf_frent_z, M_NOWAIT);
 	if (entry == NULL) {
 		REASON_SET(reason, PFRES_MEMORY);
 		return (NULL);
 	}
 
 	return (entry);
 }
 
 /*
  * Return the change in the number of holes that this fragment entry
  * causes in its queue.  An entry in the middle of nowhere splits a hole
  * in two (+1).  Each side on which the entry touches either a neighbor
  * or the corresponding end of the packet closes one hole (-1).
  * The fragment entry must be linked into the queue when this is called.
  */
 static int
 pf_frent_holes(struct pf_frent *frent)
 {
 	struct pf_frent *before = TAILQ_PREV(frent, pf_fragq, fr_next);
 	struct pf_frent *after = TAILQ_NEXT(frent, fr_next);
 	int delta = 1;
 
 	/* Left side: start of packet, or directly adjacent predecessor. */
 	if (before != NULL) {
 		KASSERT(frent->fe_off != 0, ("frent->fe_off != 0"));
 		if (before->fe_off + before->fe_len == frent->fe_off)
 			delta--;
 	} else if (frent->fe_off == 0) {
 		delta--;
 	}
 
 	/* Right side: final fragment, or directly adjacent successor. */
 	if (after != NULL) {
 		KASSERT(frent->fe_mff, ("frent->fe_mff"));
 		if (frent->fe_off + frent->fe_len == after->fe_off)
 			delta--;
 	} else if (!frent->fe_mff) {
 		delta--;
 	}
 
 	return delta;
 }
 
 /*
  * Map a fragment entry to the index of its queue entry point, derived
  * from the entry's offset.
  */
 static inline int
 pf_frent_index(struct pf_frent *frent)
 {
 	/* Number of octets of offset space covered by one entry point. */
 	const int span = 0x10000 / PF_FRAG_ENTRY_POINTS;
 
 	/*
 	 * We have an array of 16 entry points to the queue.  A full size
 	 * 65535 octet IP packet can have 8192 fragments.  So the queue
 	 * traversal length is at most 512 and at most 16 entry points are
 	 * checked.  We need 128 additional bytes on a 64 bit architecture.
 	 */
 	CTASSERT(((u_int16_t)0xffff &~ 7) / (0x10000 / PF_FRAG_ENTRY_POINTS) ==
 	    16 - 1);
 	CTASSERT(((u_int16_t)0xffff >> 3) / PF_FRAG_ENTRY_POINTS == 512 - 1);
 
 	return frent->fe_off / span;
 }
 
 /*
  * Link frent into frag's queue directly after prev, or at the head when
  * prev is NULL, and update the entry point bookkeeping and hole count.
  * Returns ENOBUFS, leaving the queue untouched, if the entry point for
  * frent already holds PF_FRAG_ENTRY_LIMIT entries; returns 0 otherwise.
  */
 static int
 pf_frent_insert(struct pf_fragment *frag, struct pf_frent *frent,
     struct pf_frent *prev)
 {
 	struct pf_frent *first;
 	int slot;
 
 	CTASSERT(PF_FRAG_ENTRY_LIMIT <= 0xff);
 
 	/*
 	 * A packet has at most 65536 octets.  With 16 entry points, each one
 	 * spans 4096 octets.  We limit these to 64 fragments each, which
 	 * means on average every fragment must have at least 64 octets.
 	 */
 	slot = pf_frent_index(frent);
 	if (frag->fr_entries[slot] >= PF_FRAG_ENTRY_LIMIT)
 		return ENOBUFS;
 	frag->fr_entries[slot]++;
 
 	if (prev != NULL) {
 		KASSERT(prev->fe_off + prev->fe_len <= frent->fe_off,
 		    ("overlapping fragment"));
 		TAILQ_INSERT_AFTER(&frag->fr_queue, prev, frent, fr_next);
 	} else {
 		TAILQ_INSERT_HEAD(&frag->fr_queue, frent, fr_next);
 	}
 
 	/* frent becomes the entry point if the slot is empty or it is lower. */
 	first = frag->fr_firstoff[slot];
 	if (first == NULL || frent->fe_off < first->fe_off) {
 		KASSERT(prev == NULL || pf_frent_index(prev) < slot,
 		    ("prev == NULL || pf_frent_index(pref) < index"));
 		frag->fr_firstoff[slot] = frent;
 	} else {
 		KASSERT(prev != NULL, ("prev != NULL"));
 		KASSERT(pf_frent_index(prev) == slot,
 		    ("pf_frent_index(prev) == index"));
 	}
 
 	frag->fr_holes += pf_frent_holes(frent);
 
 	return 0;
 }
 
 /*
  * Unlink frent from frag's queue and undo the bookkeeping done at insert
  * time: the hole count, the entry point pointer and the per-entry-point
  * counter.  The entry itself and its mbuf are not freed here.
  */
 void
 pf_frent_remove(struct pf_fragment *frag, struct pf_frent *frent)
 {
 #ifdef INVARIANTS
 	struct pf_frent *prev = TAILQ_PREV(frent, pf_fragq, fr_next);
 #endif
 	struct pf_frent *next = TAILQ_NEXT(frent, fr_next);
 	struct pf_frent *first;
 	int slot;
 
 	/* Must run while frent is still linked. */
 	frag->fr_holes -= pf_frent_holes(frent);
 
 	slot = pf_frent_index(frent);
 	KASSERT(frag->fr_firstoff[slot] != NULL, ("frent not found"));
 	if (frag->fr_firstoff[slot]->fe_off != frent->fe_off) {
 		KASSERT(frag->fr_firstoff[slot]->fe_off < frent->fe_off,
 		    ("frag->fr_firstoff[index]->fe_off < frent->fe_off"));
 		KASSERT(prev != NULL, ("prev != NULL"));
 		KASSERT(prev->fe_off + prev->fe_len <= frent->fe_off,
 		    ("overlapping fragment"));
 		KASSERT(pf_frent_index(prev) == slot,
 		    ("pf_frent_index(prev) == index"));
 	} else {
 		/*
 		 * frent is the entry point of its slot.  Hand the role to
 		 * the successor if that one lives in the same slot.
 		 */
 		first = NULL;
 		if (next != NULL) {
 			KASSERT(frent->fe_off + frent->fe_len <= next->fe_off,
 			    ("overlapping fragment"));
 			if (pf_frent_index(next) == slot)
 				first = next;
 		}
 		frag->fr_firstoff[slot] = first;
 	}
 
 	TAILQ_REMOVE(&frag->fr_queue, frent, fr_next);
 
 	KASSERT(frag->fr_entries[slot] > 0, ("No fragments remaining"));
 	frag->fr_entries[slot]--;
 }
 
 /*
  * Find the queued entry after which frent would have to be inserted,
  * i.e. the last entry whose offset is not greater than frent's offset.
  * Returns NULL if frent would become the first entry of the queue.
  */
 struct pf_frent *
 pf_frent_previous(struct pf_fragment *frag, struct pf_frent *frent)
 {
 	struct pf_frent *prev, *next;
 	int index;
 
 	/*
 	 * If no queued entry lies behind frent, the final entry is the
 	 * answer.  The queue is assumed to be non-empty.
 	 */
 	prev = TAILQ_LAST(&frag->fr_queue, pf_fragq);
 	KASSERT(prev != NULL, ("prev != NULL"));
 	if (prev->fe_off <= frent->fe_off)
 		return prev;
 
 	/*
 	 * Look for an entry close to frent: the first entry of frent's own
 	 * entry point, or of the nearest non-empty entry point after it.
 	 * Since an entry behind frent is known to exist, this succeeds.
 	 */
 	index = pf_frent_index(frent);
 	while (index < PF_FRAG_ENTRY_POINTS) {
 		prev = frag->fr_firstoff[index];
 		if (prev != NULL)
 			break;
 		index++;
 	}
 	KASSERT(prev != NULL, ("prev != NULL"));
 
 	/*
 	 * The entry found lies behind frent: step back once to reach the
 	 * predecessor, which does not exist if frent will be the first.
 	 */
 	if (prev->fe_off > frent->fe_off) {
 		prev = TAILQ_PREV(prev, pf_fragq, fr_next);
 		KASSERT(prev == NULL || prev->fe_off <= frent->fe_off,
 		    ("prev->fe_off <= frent->fe_off"));
 		return prev;
 	}
 
 	/*
 	 * The entry found is the first of the entry point and lies at or
 	 * before frent.  Walk forward to the closest preceding entry.
 	 */
 	while ((next = TAILQ_NEXT(prev, fr_next)) != NULL &&
 	    next->fe_off <= frent->fe_off)
 		prev = next;
 
 	return prev;
 }
 
 /*
  * Insert the fragment entry frent into the reassembly queue identified by
  * key, creating the queue if none exists yet.
  *
  * The entry is validated first (non-empty, length a multiple of 8 when more
  * fragments follow, end not beyond IP_MAXPACKET) and then checked against
  * the data already queued.  Data overlapping the preceding entry is cut
  * from the front of frent; following entries that frent overlaps are
  * trimmed or removed.
  *
  * Returns the reassembly queue on success.  On failure frent is returned
  * to its zone, *reason is set and NULL is returned; frent->fe_m is not
  * freed here.
  */
 static struct pf_fragment *
 pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent,
     u_short *reason)
 {
 	struct pf_frent		*after, *next, *prev;
 	struct pf_fragment	*frag;
 	uint16_t		total;
 
 	PF_FRAG_ASSERT();
 
 	/* No empty fragments. */
 	if (frent->fe_len == 0) {
 		DPFPRINTF(("bad fragment: len 0"));
 		goto bad_fragment;
 	}
 
 	/* All fragments are 8 byte aligned. */
 	if (frent->fe_mff && (frent->fe_len & 0x7)) {
 		DPFPRINTF(("bad fragment: mff and len %d", frent->fe_len));
 		goto bad_fragment;
 	}
 
 	/* Respect maximum length, IP_MAXPACKET == IPV6_MAXPACKET. */
 	if (frent->fe_off + frent->fe_len > IP_MAXPACKET) {
 		DPFPRINTF(("bad fragment: max packet %d",
 		    frent->fe_off + frent->fe_len));
 		goto bad_fragment;
 	}
 
 	DPFPRINTF((key->frc_af == AF_INET ?
 	    "reass frag %d @ %d-%d" : "reass frag %#08x @ %d-%d",
 	    key->frc_id, frent->fe_off, frent->fe_off + frent->fe_len));
 
 	/* Fully buffer all of the fragments in this fragment queue. */
 	frag = pf_find_fragment(key, &V_pf_frag_tree);
 
 	/* Create a new reassembly queue for this packet. */
 	if (frag == NULL) {
 		frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
 		if (frag == NULL) {
 			/* Flush old fragments and retry the allocation once. */
 			pf_flush_fragments();
 			frag = uma_zalloc(V_pf_frag_z, M_NOWAIT);
 			if (frag == NULL) {
 				REASON_SET(reason, PFRES_MEMORY);
 				goto drop_fragment;
 			}
 		}
 
 		/* The key is copied over the leading part of the new state. */
 		*(struct pf_fragment_cmp *)frag = *key;
 		memset(frag->fr_firstoff, 0, sizeof(frag->fr_firstoff));
 		memset(frag->fr_entries, 0, sizeof(frag->fr_entries));
 		frag->fr_timeout = time_uptime;
 		frag->fr_maxlen = frent->fe_len;
 		frag->fr_holes = 1;
 		TAILQ_INIT(&frag->fr_queue);
 
 		RB_INSERT(pf_frag_tree, &V_pf_frag_tree, frag);
 		TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
 
 		/* We do not have a previous fragment, cannot fail. */
 		pf_frent_insert(frag, frent, NULL);
 
 		return (frag);
 	}
 
 	KASSERT(!TAILQ_EMPTY(&frag->fr_queue), ("!TAILQ_EMPTY()->fr_queue"));
 
 	/* Remember maximum fragment len for refragmentation. */
 	if (frent->fe_len > frag->fr_maxlen)
 		frag->fr_maxlen = frent->fe_len;
 
 	/* Maximum data we have seen already. */
 	total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
 		TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
 
 	/* Non terminal fragments must have more fragments flag. */
 	if (frent->fe_off + frent->fe_len < total && !frent->fe_mff)
 		goto bad_fragment;
 
 	/* Check if we saw the last fragment already. */
 	if (!TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_mff) {
 		if (frent->fe_off + frent->fe_len > total ||
 		    (frent->fe_off + frent->fe_len == total && frent->fe_mff))
 			goto bad_fragment;
 	} else {
 		if (frent->fe_off + frent->fe_len == total && !frent->fe_mff)
 			goto bad_fragment;
 	}
 
 	/* Find neighbors for newly inserted fragment */
 	prev = pf_frent_previous(frag, frent);
 	if (prev == NULL) {
 		after = TAILQ_FIRST(&frag->fr_queue);
 		KASSERT(after != NULL, ("after != NULL"));
 	} else {
 		after = TAILQ_NEXT(prev, fr_next);
 	}
 
 	/* Cut the part of frent that its predecessor already covers. */
 	if (prev != NULL && prev->fe_off + prev->fe_len > frent->fe_off) {
 		uint16_t precut;
 
 		precut = prev->fe_off + prev->fe_len - frent->fe_off;
 		/* Nothing new would remain of frent. */
 		if (precut >= frent->fe_len)
 			goto bad_fragment;
 		DPFPRINTF(("overlap -%d", precut));
 		m_adj(frent->fe_m, precut);
 		frent->fe_off += precut;
 		frent->fe_len -= precut;
 	}
 
 	/* Trim or drop every following entry that frent overlaps. */
 	for (; after != NULL && frent->fe_off + frent->fe_len > after->fe_off;
 	    after = next) {
 		uint16_t aftercut;
 
 		aftercut = frent->fe_off + frent->fe_len - after->fe_off;
 		DPFPRINTF(("adjust overlap %d", aftercut));
 		if (aftercut < after->fe_len) {
 			/* Partial overlap: shorten the entry from its front. */
 			m_adj(after->fe_m, aftercut);
 			after->fe_off += aftercut;
 			after->fe_len -= aftercut;
 			break;
 		}
 
 		/* This fragment is completely overlapped, lose it. */
 		next = TAILQ_NEXT(after, fr_next);
 		pf_frent_remove(frag, after);
 		m_freem(after->fe_m);
 		uma_zfree(V_pf_frent_z, after);
 	}
 
 	/* If part of the queue gets too long, there is no way to recover. */
 	if (pf_frent_insert(frag, frent, prev)) {
 		DPFPRINTF(("fragment queue limit exceeded"));
 		goto bad_fragment;
 	}
 
 	return (frag);
 
 bad_fragment:
 	REASON_SET(reason, PFRES_FRAG);
 drop_fragment:
 	/* Only the entry is released; its mbuf is left alone. */
 	uma_zfree(V_pf_frent_z, frent);
 	return (NULL);
 }
 
 static struct mbuf *
 pf_join_fragment(struct pf_fragment *frag)
 {
 	struct mbuf *m, *m2;
 	struct pf_frent	*frent, *next;
 
 	frent = TAILQ_FIRST(&frag->fr_queue);
 	next = TAILQ_NEXT(frent, fr_next);
 
 	m = frent->fe_m;
 	m_adj(m, (frent->fe_hdrlen + frent->fe_len) - m->m_pkthdr.len);
 	uma_zfree(V_pf_frent_z, frent);
 	for (frent = next; frent != NULL; frent = next) {
 		next = TAILQ_NEXT(frent, fr_next);
 
 		m2 = frent->fe_m;
 		/* Strip off ip header. */
 		m_adj(m2, frent->fe_hdrlen);
 		/* Strip off any trailing bytes. */
 		m_adj(m2, frent->fe_len - m2->m_pkthdr.len);
 
 		uma_zfree(V_pf_frent_z, frent);
 		m_cat(m, m2);
 	}
 
 	/* Remove from fragment queue. */
 	pf_remove_fragment(frag);
 
 	return (m);
 }
 
 #ifdef INET
 /*
  * Queue one IPv4 fragment and, once the queue has no holes left, join
  * all fragments into a single packet.
  *
  * Returns PF_DROP if the fragment is rejected or the reassembled packet
  * would exceed IP_MAXPACKET.  Returns PF_PASS with *m0 == NULL while the
  * packet is still incomplete, or PF_PASS with *m0 pointing at the
  * reassembled packet.
  */
 static int
 pf_reassemble(struct mbuf **m0, struct ip *ip, int dir, u_short *reason)
 {
 	struct mbuf		*m = *m0;
 	struct pf_frent		*frent;
 	struct pf_fragment	*frag;
 	struct pf_fragment_cmp	key;
 	uint16_t		total, hdrlen;
 
 	/* Get an entry for the fragment queue */
 	if ((frent = pf_create_fragment(reason)) == NULL)
 		return (PF_DROP);
 
 	frent->fe_m = m;
 	frent->fe_hdrlen = ip->ip_hl << 2;
 	frent->fe_extoff = 0;
 	frent->fe_len = ntohs(ip->ip_len) - (ip->ip_hl << 2);
 	frent->fe_off = (ntohs(ip->ip_off) & IP_OFFMASK) << 3;
 	frent->fe_mff = ntohs(ip->ip_off) & IP_MF;
 
 	pf_ip2key(ip, dir, &key);
 
 	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
 		return (PF_DROP);
 
 	/* The mbuf is part of the fragment entry, no direct free or access */
 	m = *m0 = NULL;
 
 	if (frag->fr_holes) {
 		DPFPRINTF(("frag %d, holes %d", frag->fr_id, frag->fr_holes));
 		return (PF_PASS);  /* drop because *m0 is NULL, no error */
 	}
 
 	/* We have all the data */
 	frent = TAILQ_FIRST(&frag->fr_queue);
 	KASSERT(frent != NULL, ("frent != NULL"));
 	total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
 		TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
 	hdrlen = frent->fe_hdrlen;
 
 	/* Joining frees the queue; frag must not be used afterwards. */
 	m = *m0 = pf_join_fragment(frag);
 	frag = NULL;
 
 	/* Recompute the packet header length from the joined chain. */
 	if (m->m_flags & M_PKTHDR) {
 		int plen = 0;
 		for (m = *m0; m; m = m->m_next)
 			plen += m->m_len;
 		m = *m0;
 		m->m_pkthdr.len = plen;
 	}
 
 	ip = mtod(m, struct ip *);
 	ip->ip_len = htons(hdrlen + total);
 	/*
 	 * ip_off is held in network byte order (it is read with ntohs()
 	 * above), so the mask clearing IP_MF and the offset must be
 	 * converted as well.
 	 */
 	ip->ip_off &= ~htons(IP_MF|IP_OFFMASK);
 
 	if (hdrlen + total > IP_MAXPACKET) {
 		DPFPRINTF(("drop: too big: %d", total));
 		ip->ip_len = 0;
 		REASON_SET(reason, PFRES_SHORT);
 		/* PF_DROP requires a valid mbuf *m0 in pf_test() */
 		return (PF_DROP);
 	}
 
 	DPFPRINTF(("complete: %p(%d)\n", m, ntohs(ip->ip_len)));
 	return (PF_PASS);
 }
 #endif	/* INET */
 
 #ifdef INET6
 /*
  * Queue one IPv6 fragment and, once the queue has no holes left, join
  * all fragments into a single packet and remove the fragment header.
  *
  * hdrlen is stored as the entry's fe_hdrlen and extoff as its fe_extoff.
  * The fragment lock is taken on entry and released on every return path;
  * post-processing of the joined packet happens after it was released.
  *
  * Returns PF_DROP on failure.  Returns PF_PASS with *m0 == NULL while the
  * packet is still incomplete, or PF_PASS with *m0 pointing at the
  * reassembled packet, to which a PF_REASSEMBLED tag holding the header
  * length, extension offset, largest fragment length and fragment id has
  * been prepended.
  */
 static int
 pf_reassemble6(struct mbuf **m0, struct ip6_hdr *ip6, struct ip6_frag *fraghdr,
     uint16_t hdrlen, uint16_t extoff, u_short *reason)
 {
 	struct mbuf		*m = *m0;
 	struct pf_frent		*frent;
 	struct pf_fragment	*frag;
 	struct pf_fragment_cmp	 key;
 	struct m_tag		*mtag;
 	struct pf_fragment_tag	*ftag;
 	int			 off;
 	uint32_t		 frag_id;
 	uint16_t		 total, maxlen;
 	uint8_t			 proto;
 
 	PF_FRAG_LOCK();
 
 	/* Get an entry for the fragment queue. */
 	if ((frent = pf_create_fragment(reason)) == NULL) {
 		PF_FRAG_UNLOCK();
 		return (PF_DROP);
 	}
 
 	frent->fe_m = m;
 	frent->fe_hdrlen = hdrlen;
 	frent->fe_extoff = extoff;
 	frent->fe_len = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - hdrlen;
 	frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK);
 	frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG;
 
 	key.frc_src.v6 = ip6->ip6_src;
 	key.frc_dst.v6 = ip6->ip6_dst;
 	key.frc_af = AF_INET6;
 	/* Only the first fragment's protocol is relevant. */
 	key.frc_proto = 0;
 	key.frc_id = fraghdr->ip6f_ident;
 
 	if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
 		PF_FRAG_UNLOCK();
 		return (PF_DROP);
 	}
 
 	/* The mbuf is part of the fragment entry, no direct free or access. */
 	m = *m0 = NULL;
 
 	if (frag->fr_holes) {
 		DPFPRINTF(("frag %d, holes %d", frag->fr_id, frag->fr_holes));
 		PF_FRAG_UNLOCK();
 		return (PF_PASS);  /* Drop because *m0 is NULL, no error. */
 	}
 
 	/* We have all the data. */
 	frent = TAILQ_FIRST(&frag->fr_queue);
 	KASSERT(frent != NULL, ("frent != NULL"));
 	/* Save everything needed later; joining frees the queue. */
 	extoff = frent->fe_extoff;
 	maxlen = frag->fr_maxlen;
 	frag_id = frag->fr_id;
 	total = TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_off +
 		TAILQ_LAST(&frag->fr_queue, pf_fragq)->fe_len;
 	/* From here on hdrlen no longer includes the fragment header. */
 	hdrlen = frent->fe_hdrlen - sizeof(struct ip6_frag);
 
 	m = *m0 = pf_join_fragment(frag);
 	frag = NULL;
 
 	PF_FRAG_UNLOCK();
 
 	/* Take protocol from first fragment header. */
 	m = m_getptr(m, hdrlen + offsetof(struct ip6_frag, ip6f_nxt), &off);
 	KASSERT(m, ("%s: short mbuf chain", __func__));
 	proto = *(mtod(m, caddr_t) + off);
 	m = *m0;
 
 	/* Delete frag6 header */
 	if (ip6_deletefraghdr(m, hdrlen, M_NOWAIT) != 0)
 		goto fail;
 
 	/* Recompute the packet header length from the joined chain. */
 	if (m->m_flags & M_PKTHDR) {
 		int plen = 0;
 		for (m = *m0; m; m = m->m_next)
 			plen += m->m_len;
 		m = *m0;
 		m->m_pkthdr.len = plen;
 	}
 
 	/* Record the fragmentation parameters in a tag on the packet. */
 	if ((mtag = m_tag_get(PF_REASSEMBLED, sizeof(struct pf_fragment_tag),
 	    M_NOWAIT)) == NULL)
 		goto fail;
 	ftag = (struct pf_fragment_tag *)(mtag + 1);
 	ftag->ft_hdrlen = hdrlen;
 	ftag->ft_extoff = extoff;
 	ftag->ft_maxlen = maxlen;
 	ftag->ft_id = frag_id;
 	m_tag_prepend(m, mtag);
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_plen = htons(hdrlen - sizeof(struct ip6_hdr) + total);
 	if (extoff) {
 		/* Write protocol into next field of last extension header. */
 		m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt),
 		    &off);
 		KASSERT(m, ("%s: short mbuf chain", __func__));
 		*(mtod(m, char *) + off) = proto;
 		m = *m0;
 	} else
 		ip6->ip6_nxt = proto;
 
 	if (hdrlen - sizeof(struct ip6_hdr) + total > IPV6_MAXPACKET) {
 		DPFPRINTF(("drop: too big: %d", total));
 		ip6->ip6_plen = 0;
 		REASON_SET(reason, PFRES_SHORT);
 		/* PF_DROP requires a valid mbuf *m0 in pf_test6(). */
 		return (PF_DROP);
 	}
 
 	DPFPRINTF(("complete: %p(%d)", m, ntohs(ip6->ip6_plen)));
 	return (PF_PASS);
 
 fail:
 	REASON_SET(reason, PFRES_MEMORY);
 	/* PF_DROP requires a valid mbuf *m0 in pf_test6(), will free later. */
 	return (PF_DROP);
 }
 #endif	/* INET6 */
 
 #ifdef INET6
 /*
  * Fragment a previously reassembled IPv6 packet again, using the
  * parameters stored in its reassembly tag, and forward the resulting
  * fragments.
  *
  * The tag is deleted from the packet.  Returns PF_PASS with *m0 set to
  * NULL when fragmentation succeeded, or PF_DROP with *m0 left in place
  * when ip6_fragment() reported an error.
  */
 int
 pf_refragment6(struct ifnet *ifp, struct mbuf **m0, struct m_tag *mtag)
 {
 	struct mbuf		*m = *m0, *t;
 	struct pf_fragment_tag	*ftag = (struct pf_fragment_tag *)(mtag + 1);
 	struct pf_pdesc		 pd;
 	uint32_t		 frag_id;
 	uint16_t		 hdrlen, extoff, maxlen;
 	uint8_t			 proto;
 	int			 error, action;
 
 	/* Copy the tag contents out before the tag is destroyed. */
 	frag_id = ftag->ft_id;
 	maxlen = ftag->ft_maxlen;
 	extoff = ftag->ft_extoff;
 	hdrlen = ftag->ft_hdrlen;
 	m_tag_delete(m, mtag);
 	mtag = NULL;
 	ftag = NULL;
 
 	if (extoff == 0) {
 		struct ip6_hdr *hdr;
 
 		/* No extension header: the protocol sits in the IPv6 header. */
 		hdr = mtod(m, struct ip6_hdr *);
 		proto = hdr->ip6_nxt;
 		hdr->ip6_nxt = IPPROTO_FRAGMENT;
 	} else {
 		int off;
 
 		/* Use protocol from next field of last extension header */
 		m = m_getptr(m, extoff + offsetof(struct ip6_ext, ip6e_nxt),
 		    &off);
 		KASSERT((m != NULL), ("pf_refragment6: short mbuf chain"));
 		proto = *(mtod(m, caddr_t) + off);
 		*(mtod(m, char *) + off) = IPPROTO_FRAGMENT;
 		m = *m0;
 	}
 
 	/*
 	 * The MTU must be a multiple of 8 bytes, or we risk doing the
 	 * fragmentation wrong.
 	 */
 	maxlen &= ~7;
 
 	/*
 	 * Maxlen may be less than 8 if there was only a single
 	 * fragment.  As it was fragmented before, add a fragment
 	 * header also for a single fragment.  If total or maxlen
 	 * is less than 8, ip6_fragment() will return EMSGSIZE and
 	 * we drop the packet.
 	 */
 	error = ip6_fragment(ifp, m, hdrlen, proto, maxlen, frag_id);
 	/* Detach the list of fragments from the original packet. */
 	m = (*m0)->m_nextpkt;
 	(*m0)->m_nextpkt = NULL;
 	if (error != 0) {
 		/* Drop expects an mbuf to free. */
 		DPFPRINTF(("refragment error %d", error));
 		action = PF_DROP;
 	} else {
 		/* The first mbuf contains the unfragmented packet. */
 		m_freem(*m0);
 		*m0 = NULL;
 		action = PF_PASS;
 	}
 
 	/* Forward every fragment on success, free every fragment on error. */
 	while (m != NULL) {
 		t = m->m_nextpkt;
 		m->m_nextpkt = NULL;
 		m->m_flags |= M_SKIP_FIREWALL;
 		memset(&pd, 0, sizeof(pd));
 		pd.pf_mtag = pf_find_mtag(m);
 		if (error == 0)
 			ip6_forward(m, 0);
 		else
 			m_freem(m);
 		m = t;
 	}
 
 	return (action);
 }
 #endif /* INET6 */
 
 #ifdef INET
 /*
  * Apply the first matching scrub rule to an IPv4 packet.
  *
  * Returns PF_PASS right away if no scrub rule matches or the matching
  * rule has action PF_NOSCRUB.  Otherwise the header lengths are checked,
  * IP_DF is cleared if the rule asks for it, fragments are handed to
  * pf_reassemble() and the resulting packet is passed to pf_scrub_ip().
  *
  * Returns PF_DROP if the packet is rejected, and also when a fragment
  * was queued for reassembly and *m0 is NULL afterwards.
  */
 int
 pf_normalize_ip(struct mbuf **m0, int dir, struct pfi_kif *kif, u_short *reason,
     struct pf_pdesc *pd)
 {
 	struct mbuf		*m = *m0;
 	struct pf_rule		*r;
 	struct ip		*h = mtod(m, struct ip *);
 	int			 mff = (ntohs(h->ip_off) & IP_MF);
 	int			 hlen = h->ip_hl << 2;
 	u_int16_t		 fragoff = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
 	u_int16_t		 max;
 	int			 ip_len;
 	int			 ip_off;
 	int			 tag = -1;
 	int			 verdict;
 
 	PF_RULES_RASSERT();
 
 	/*
 	 * Walk the active scrub rules.  A mismatch on a criterion follows
 	 * the rule's skip pointer for that criterion; the loop ends at the
 	 * first rule that matches on every criterion, or with r == NULL.
 	 */
 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
 	while (r != NULL) {
 		r->evaluations++;
 		if (pfi_kif_match(r->kif, kif) == r->ifnot)
 			r = r->skip[PF_SKIP_IFP].ptr;
 		else if (r->direction && r->direction != dir)
 			r = r->skip[PF_SKIP_DIR].ptr;
 		else if (r->af && r->af != AF_INET)
 			r = r->skip[PF_SKIP_AF].ptr;
 		else if (r->proto && r->proto != h->ip_p)
 			r = r->skip[PF_SKIP_PROTO].ptr;
 		else if (PF_MISMATCHAW(&r->src.addr,
 		    (struct pf_addr *)&h->ip_src.s_addr, AF_INET,
 		    r->src.neg, kif, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
 		else if (PF_MISMATCHAW(&r->dst.addr,
 		    (struct pf_addr *)&h->ip_dst.s_addr, AF_INET,
 		    r->dst.neg, NULL, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
 		else if (r->match_tag && !pf_match_tag(m, r, &tag,
 		    pd->pf_mtag ? pd->pf_mtag->tag : 0))
 			r = TAILQ_NEXT(r, entries);
 		else
 			break;
 	}
 
 	if (r == NULL || r->action == PF_NOSCRUB)
 		return (PF_PASS);
 	else {
 		/* Account the packet to the matching rule. */
 		r->packets[dir == PF_OUT]++;
 		r->bytes[dir == PF_OUT] += pd->tot_len;
 	}
 
 	/* Check for illegal packets */
 	if (hlen < (int)sizeof(struct ip)) {
 		REASON_SET(reason, PFRES_NORM);
 		goto drop;
 	}
 
 	if (hlen > ntohs(h->ip_len)) {
 		REASON_SET(reason, PFRES_NORM);
 		goto drop;
 	}
 
 	/* Clear IP_DF if the rule uses the no-df option */
 	if (r->rule_flag & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
 		u_int16_t ip_off = h->ip_off;
 
 		h->ip_off &= htons(~IP_DF);
 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
 	}
 
 	/* We will need other tests here */
 	if (!fragoff && !mff)
 		goto no_fragment;
 
 	/* We're dealing with a fragment now. Don't allow fragments
 	 * with IP_DF to enter the cache. If the flag was cleared by
 	 * no-df above, fine. Otherwise drop it.
 	 */
 	if (h->ip_off & htons(IP_DF)) {
 		DPFPRINTF(("IP_DF\n"));
 		goto bad;
 	}
 
 	ip_len = ntohs(h->ip_len) - hlen;
 	ip_off = (ntohs(h->ip_off) & IP_OFFMASK) << 3;
 
 	/* All fragments are 8 byte aligned */
 	if (mff && (ip_len & 0x7)) {
 		DPFPRINTF(("mff and %d\n", ip_len));
 		goto bad;
 	}
 
 	/* Respect maximum length */
 	if (fragoff + ip_len > IP_MAXPACKET) {
 		DPFPRINTF(("max packet %d\n", fragoff + ip_len));
 		goto bad;
 	}
 	max = fragoff + ip_len;
 
 	/* Fully buffer all of the fragments
 	 * Might return a completely reassembled mbuf, or NULL */
 	PF_FRAG_LOCK();
 	DPFPRINTF(("reass frag %d @ %d-%d\n", h->ip_id, fragoff, max));
 	verdict = pf_reassemble(m0, h, dir, reason);
 	PF_FRAG_UNLOCK();
 
 	if (verdict != PF_PASS)
 		return (PF_DROP);
 
 	/* A NULL *m0 means the fragment was queued; nothing to pass on yet. */
 	m = *m0;
 	if (m == NULL)
 		return (PF_DROP);
 
 	/* Reassembly may have produced a different mbuf; reload the header. */
 	h = mtod(m, struct ip *);
 
  no_fragment:
 	/* At this point, only IP_DF is allowed in ip_off */
 	if (h->ip_off & ~htons(IP_DF)) {
 		u_int16_t ip_off = h->ip_off;
 
 		h->ip_off &= htons(IP_DF);
 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
 	}
 
 	pf_scrub_ip(&m, r->rule_flag, r->min_ttl, r->set_tos);
 
 	return (PF_PASS);
 
  bad:
 	DPFPRINTF(("dropping bad fragment\n"));
 	REASON_SET(reason, PFRES_FRAG);
  drop:
 	if (r != NULL && r->log)
 		PFLOG_PACKET(kif, m, AF_INET, dir, *reason, r, NULL, NULL, pd,
 		    1);
 
 	return (PF_DROP);
 }
 #endif
 
 #ifdef INET6
 /*
  * Apply the first matching scrub rule to an IPv6 packet.
  *
  * Returns PF_PASS right away if no scrub rule matches or the matching
  * rule has action PF_NOSCRUB.  Otherwise the extension header chain is
  * walked: a fragment header hands the packet to pf_reassemble6(), any
  * other terminal header leads to a payload length check followed by
  * pf_scrub_ip6().
  *
  * With this change applied, a zero payload length or a jumbo payload
  * option makes the packet be dropped.
  *
  * Returns PF_DROP if the packet is rejected, and also when a fragment
  * was queued for reassembly and *m0 is NULL afterwards.
  */
 int
 pf_normalize_ip6(struct mbuf **m0, int dir, struct pfi_kif *kif,
     u_short *reason, struct pf_pdesc *pd)
 {
 	struct mbuf		*m = *m0;
 	struct pf_rule		*r;
 	struct ip6_hdr		*h = mtod(m, struct ip6_hdr *);
 	int			 extoff;
 	int			 off;
 	struct ip6_ext		 ext;
 	struct ip6_opt		 opt;
-	struct ip6_opt_jumbo	 jumbo;
 	struct ip6_frag		 frag;
-	u_int32_t		 jumbolen = 0, plen;
+	u_int32_t		 plen;
 	int			 optend;
 	int			 ooff;
 	u_int8_t		 proto;
 	int			 terminal;
 
 	PF_RULES_RASSERT();
 
 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
 	while (r != NULL) {
 		r->evaluations++;
 		if (pfi_kif_match(r->kif, kif) == r->ifnot)
 			r = r->skip[PF_SKIP_IFP].ptr;
 		else if (r->direction && r->direction != dir)
 			r = r->skip[PF_SKIP_DIR].ptr;
 		else if (r->af && r->af != AF_INET6)
 			r = r->skip[PF_SKIP_AF].ptr;
 #if 0 /* header chain! */
 		else if (r->proto && r->proto != h->ip6_nxt)
 			r = r->skip[PF_SKIP_PROTO].ptr;
 #endif
 		else if (PF_MISMATCHAW(&r->src.addr,
 		    (struct pf_addr *)&h->ip6_src, AF_INET6,
 		    r->src.neg, kif, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
 		else if (PF_MISMATCHAW(&r->dst.addr,
 		    (struct pf_addr *)&h->ip6_dst, AF_INET6,
 		    r->dst.neg, NULL, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
 		else
 			break;
 	}
 
 	if (r == NULL || r->action == PF_NOSCRUB)
 		return (PF_PASS);
 	else {
 		r->packets[dir == PF_OUT]++;
 		r->bytes[dir == PF_OUT] += pd->tot_len;
 	}
 
 	/* Check for illegal packets */
 	if (sizeof(struct ip6_hdr) + IPV6_MAXPACKET < m->m_pkthdr.len)
 		goto drop;
 
+	plen = ntohs(h->ip6_plen);
+	/* jumbo payload option not supported */
+	if (plen == 0)
+		goto drop;
+
 	extoff = 0;
 	off = sizeof(struct ip6_hdr);
 	proto = h->ip6_nxt;
 	terminal = 0;
 	do {
 		switch (proto) {
 		case IPPROTO_FRAGMENT:
 			goto fragment;
 			break;
 		case IPPROTO_AH:
 		case IPPROTO_ROUTING:
 		case IPPROTO_DSTOPTS:
 			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
 			    NULL, AF_INET6))
 				goto shortpkt;
 			extoff = off;
 			if (proto == IPPROTO_AH)
 				off += (ext.ip6e_len + 2) * 4;
 			else
 				off += (ext.ip6e_len + 1) * 8;
 			proto = ext.ip6e_nxt;
 			break;
 		case IPPROTO_HOPOPTS:
 			if (!pf_pull_hdr(m, off, &ext, sizeof(ext), NULL,
 			    NULL, AF_INET6))
 				goto shortpkt;
 			extoff = off;
 			optend = off + (ext.ip6e_len + 1) * 8;
 			ooff = off + sizeof(ext);
 			do {
 				if (!pf_pull_hdr(m, ooff, &opt.ip6o_type,
 				    sizeof(opt.ip6o_type), NULL, NULL,
 				    AF_INET6))
 					goto shortpkt;
 				if (opt.ip6o_type == IP6OPT_PAD1) {
 					ooff++;
 					continue;
 				}
 				if (!pf_pull_hdr(m, ooff, &opt, sizeof(opt),
 				    NULL, NULL, AF_INET6))
 					goto shortpkt;
 				if (ooff + sizeof(opt) + opt.ip6o_len > optend)
 					goto drop;
-				switch (opt.ip6o_type) {
-				case IP6OPT_JUMBO:
-					if (h->ip6_plen != 0)
-						goto drop;
-					if (!pf_pull_hdr(m, ooff, &jumbo,
-					    sizeof(jumbo), NULL, NULL,
-					    AF_INET6))
-						goto shortpkt;
-					memcpy(&jumbolen, jumbo.ip6oj_jumbo_len,
-					    sizeof(jumbolen));
-					jumbolen = ntohl(jumbolen);
-					if (jumbolen <= IPV6_MAXPACKET)
-						goto drop;
-					if (sizeof(struct ip6_hdr) + jumbolen !=
-					    m->m_pkthdr.len)
-						goto drop;
-					break;
-				default:
-					break;
-				}
+				if (opt.ip6o_type == IP6OPT_JUMBO)
+					goto drop;
 				ooff += sizeof(opt) + opt.ip6o_len;
 			} while (ooff < optend);
 
 			off = optend;
 			proto = ext.ip6e_nxt;
 			break;
 		default:
 			terminal = 1;
 			break;
 		}
 	} while (!terminal);
 
-	/* jumbo payload option must be present, or plen > 0 */
-	if (ntohs(h->ip6_plen) == 0)
-		plen = jumbolen;
-	else
-		plen = ntohs(h->ip6_plen);
-	if (plen == 0)
-		goto drop;
 	if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
 		goto shortpkt;
 
 	pf_scrub_ip6(&m, r->min_ttl);
 
 	return (PF_PASS);
 
  fragment:
-	/* Jumbo payload packets cannot be fragmented. */
-	plen = ntohs(h->ip6_plen);
-	if (plen == 0 || jumbolen)
-		goto drop;
 	if (sizeof(struct ip6_hdr) + plen > m->m_pkthdr.len)
 		goto shortpkt;
 
 	if (!pf_pull_hdr(m, off, &frag, sizeof(frag), NULL, NULL, AF_INET6))
 		goto shortpkt;
 
 	/* Offset now points to data portion. */
 	off += sizeof(frag);
 
 	/* Returns PF_DROP or *m0 is NULL or completely reassembled mbuf. */
 	if (pf_reassemble6(m0, h, &frag, off, extoff, reason) != PF_PASS)
 		return (PF_DROP);
 	m = *m0;
 	if (m == NULL)
 		return (PF_DROP);
 
 	pd->flags |= PFDESC_IP_REAS;
 	return (PF_PASS);
 
  shortpkt:
 	REASON_SET(reason, PFRES_SHORT);
 	if (r != NULL && r->log)
 		PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
 		    1);
 	return (PF_DROP);
 
  drop:
 	REASON_SET(reason, PFRES_NORM);
 	if (r != NULL && r->log)
 		PFLOG_PACKET(kif, m, AF_INET6, dir, *reason, r, NULL, NULL, pd,
 		    1);
 	return (PF_DROP);
 }
 #endif /* INET6 */
 
 /*
  * Apply the first matching scrub rule to a TCP segment.
  *
  * Returns PF_PASS right away if no scrub rule matches or the matching
  * rule has action PF_NOSCRUB.  Otherwise segments with illegal flag
  * combinations or a too short header length are dropped, reserved bits
  * and a stray urgent pointer are cleared, and the TCP options are
  * normalized if the rule sets max_mss.  A modified header is copied back
  * into the mbuf at offset off.
  *
  * Returns PF_PASS or PF_DROP.  Dropped segments are logged with the
  * address family of the packet when the matching rule requests logging.
  */
 int
 pf_normalize_tcp(int dir, struct pfi_kif *kif, struct mbuf *m, int ipoff,
     int off, void *h, struct pf_pdesc *pd)
 {
 	struct pf_rule	*r, *rm = NULL;
 	struct tcphdr	*th = pd->hdr.tcp;
 	int		 rewrite = 0;
 	u_short		 reason;
 	u_int8_t	 flags;
 	sa_family_t	 af = pd->af;
 
 	PF_RULES_RASSERT();
 
 	/*
 	 * Walk the active scrub rules.  A mismatch on a criterion follows
 	 * the rule's skip pointer for that criterion; the first rule that
 	 * matches on every criterion is remembered in rm.
 	 */
 	r = TAILQ_FIRST(pf_main_ruleset.rules[PF_RULESET_SCRUB].active.ptr);
 	while (r != NULL) {
 		r->evaluations++;
 		if (pfi_kif_match(r->kif, kif) == r->ifnot)
 			r = r->skip[PF_SKIP_IFP].ptr;
 		else if (r->direction && r->direction != dir)
 			r = r->skip[PF_SKIP_DIR].ptr;
 		else if (r->af && r->af != af)
 			r = r->skip[PF_SKIP_AF].ptr;
 		else if (r->proto && r->proto != pd->proto)
 			r = r->skip[PF_SKIP_PROTO].ptr;
 		else if (PF_MISMATCHAW(&r->src.addr, pd->src, af,
 		    r->src.neg, kif, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_SRC_ADDR].ptr;
 		else if (r->src.port_op && !pf_match_port(r->src.port_op,
 			    r->src.port[0], r->src.port[1], th->th_sport))
 			r = r->skip[PF_SKIP_SRC_PORT].ptr;
 		else if (PF_MISMATCHAW(&r->dst.addr, pd->dst, af,
 		    r->dst.neg, NULL, M_GETFIB(m)))
 			r = r->skip[PF_SKIP_DST_ADDR].ptr;
 		else if (r->dst.port_op && !pf_match_port(r->dst.port_op,
 			    r->dst.port[0], r->dst.port[1], th->th_dport))
 			r = r->skip[PF_SKIP_DST_PORT].ptr;
 		else if (r->os_fingerprint != PF_OSFP_ANY && !pf_osfp_match(
 			    pf_osfp_fingerprint(pd, m, off, th),
 			    r->os_fingerprint))
 			r = TAILQ_NEXT(r, entries);
 		else {
 			rm = r;
 			break;
 		}
 	}
 
 	if (rm == NULL || rm->action == PF_NOSCRUB)
 		return (PF_PASS);
 	else {
 		/* Account the segment to the matching rule. */
 		rm->packets[dir == PF_OUT]++;
 		rm->bytes[dir == PF_OUT] += pd->tot_len;
 	}
 
 	if (rm->rule_flag & PFRULE_REASSEMBLE_TCP)
 		pd->flags |= PFDESC_TCP_NORM;
 
 	flags = th->th_flags;
 	if (flags & TH_SYN) {
 		/* Illegal packet */
 		if (flags & TH_RST)
 			goto tcp_drop;
 
 		if (flags & TH_FIN)
 			goto tcp_drop;
 	} else {
 		/* Illegal packet */
 		if (!(flags & (TH_ACK|TH_RST)))
 			goto tcp_drop;
 	}
 
 	if (!(flags & TH_ACK)) {
 		/* These flags are only valid if ACK is set */
 		if ((flags & TH_FIN) || (flags & TH_PUSH) || (flags & TH_URG))
 			goto tcp_drop;
 	}
 
 	/* Check for illegal header length */
 	if (th->th_off < (sizeof(struct tcphdr) >> 2))
 		goto tcp_drop;
 
 	/* If flags changed, or reserved data set, then adjust */
 	if (flags != th->th_flags || th->th_x2 != 0) {
 		u_int16_t	ov, nv;
 
 		/* Old and new value of the 16 bit word following th_ack. */
 		ov = *(u_int16_t *)(&th->th_ack + 1);
 		th->th_flags = flags;
 		th->th_x2 = 0;
 		nv = *(u_int16_t *)(&th->th_ack + 1);
 
 		th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, ov, nv, 0);
 		rewrite = 1;
 	}
 
 	/* Remove urgent pointer, if TH_URG is not set */
 	if (!(flags & TH_URG) && th->th_urp) {
 		th->th_sum = pf_proto_cksum_fixup(m, th->th_sum, th->th_urp,
 		    0, 0);
 		th->th_urp = 0;
 		rewrite = 1;
 	}
 
 	/* Process options */
 	if (rm->max_mss && pf_normalize_tcpopt(rm, m, th, off, pd->af))
 		rewrite = 1;
 
 	/* copy back packet headers if we sanitized */
 	if (rewrite)
 		m_copyback(m, off, sizeof(*th), (caddr_t)th);
 
 	return (PF_PASS);
 
  tcp_drop:
 	REASON_SET(&reason, PFRES_NORM);
 	/* Log with the packet's own address family, not a fixed AF_INET. */
 	if (rm != NULL && rm->log)
 		PFLOG_PACKET(kif, m, af, dir, reason, rm, NULL, NULL, pd,
 		    1);
 	return (PF_DROP);
 }
 
 /*
  * Allocate and initialise the scrub state of src for a TCP connection.
  *
  * The TTL (IPv4) or hop limit (IPv6) of the packet is recorded.  If the
  * segment has SYN set and carries a timestamp option of at least
  * TCPOLEN_TIMESTAMP bytes, PFSS_TIMESTAMP is set, a random timestamp
  * modulator is chosen and the option's values are recorded.
  *
  * dst is not referenced in this function.
  *
  * Returns 1 if the scrub state cannot be allocated, 0 otherwise.
  */
 int
 pf_normalize_tcp_init(struct mbuf *m, int off, struct pf_pdesc *pd,
     struct tcphdr *th, struct pf_state_peer *src, struct pf_state_peer *dst)
 {
 	u_int32_t tsval, tsecr;
 	u_int8_t hdr[60];
 	u_int8_t *opt;
 
 	KASSERT((src->scrub == NULL),
 	    ("pf_normalize_tcp_init: src->scrub != NULL"));
 
 	src->scrub = uma_zalloc(V_pf_state_scrub_z, M_ZERO | M_NOWAIT);
 	if (src->scrub == NULL)
 		return (1);
 
 	switch (pd->af) {
 #ifdef INET
 	case AF_INET: {
 		struct ip *h = mtod(m, struct ip *);
 		src->scrub->pfss_ttl = h->ip_ttl;
 		break;
 	}
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6: {
 		struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
 		src->scrub->pfss_ttl = h->ip6_hlim;
 		break;
 	}
 #endif /* INET6 */
 	}
 
 
 	/*
 	 * All normalizations below are only begun if we see the start of
 	 * the connections.  They must all set an enabled bit in pfss_flags
 	 */
 	if ((th->th_flags & TH_SYN) == 0)
 		return (0);
 
 
 	/* Only look further if the header is longer than a bare tcphdr. */
 	if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
 	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
 		/* Diddle with TCP options */
 		int hlen;
 		opt = hdr + sizeof(struct tcphdr);
 		hlen = (th->th_off << 2) - sizeof(struct tcphdr);
 		/* Stop once too few bytes remain for a timestamp option. */
 		while (hlen >= TCPOLEN_TIMESTAMP) {
 			switch (*opt) {
 			case TCPOPT_EOL:	/* FALLTHROUGH */
 			case TCPOPT_NOP:
 				/* Single byte options without a length field. */
 				opt++;
 				hlen--;
 				break;
 			case TCPOPT_TIMESTAMP:
 				if (opt[1] >= TCPOLEN_TIMESTAMP) {
 					src->scrub->pfss_flags |=
 					    PFSS_TIMESTAMP;
 					src->scrub->pfss_ts_mod =
 					    htonl(arc4random());
 
 					/* note PFSS_PAWS not set yet */
 					memcpy(&tsval, &opt[2],
 					    sizeof(u_int32_t));
 					memcpy(&tsecr, &opt[6],
 					    sizeof(u_int32_t));
 					src->scrub->pfss_tsval0 = ntohl(tsval);
 					src->scrub->pfss_tsval = ntohl(tsval);
 					src->scrub->pfss_tsecr = ntohl(tsecr);
 					getmicrouptime(&src->scrub->pfss_last);
 				}
 				/* FALLTHROUGH */
 			default:
 				/*
 				 * Advance by the option's length field, but
 				 * by at least 2 so the loop always progresses.
 				 */
 				hlen -= MAX(opt[1], 2);
 				opt += MAX(opt[1], 2);
 				break;
 			}
 		}
 	}
 
 	return (0);
 }
 
 void
 pf_normalize_tcp_cleanup(struct pf_state *state)
 {
 	if (state->src.scrub)
 		uma_zfree(V_pf_state_scrub_z, state->src.scrub);
 	if (state->dst.scrub)
 		uma_zfree(V_pf_state_scrub_z, state->dst.scrub);
 
 	/* Someday... flush the TCP segment reassembly descriptors. */
 }
 
 /*
  * Stateful TCP normalization of one packet belonging to an existing state.
  *
  * In order, the function:
  *  - rewrites the packet's TTL / hop limit to the largest value seen so far
  *    from this peer,
  *  - modulates the TCP timestamp option (tsval of src, tsecr of dst) and
  *    copies the modified options back into the mbuf,
  *  - disables PAWS checking for peers that have been idle too long,
  *  - validates the timestamps against the values recorded for both peers,
  *  - records whether the peer timestamps its data packets,
  *  - updates the recorded timestamp values.
  *
  * *writeback is set to 1 when the options were modified; the caller is
  * expected to copy the TCP header itself back.  Returns PF_DROP with
  * *reason set to PFRES_TS when a timestamp check fails, 0 otherwise.
  *
  * tsval and tsecr are only assigned when a timestamp option of sufficient
  * length is parsed, which is also the only place got_ts is set; every later
  * use of them is guarded by got_ts.
  */
 int
 pf_normalize_tcp_stateful(struct mbuf *m, int off, struct pf_pdesc *pd,
     u_short *reason, struct tcphdr *th, struct pf_state *state,
     struct pf_state_peer *src, struct pf_state_peer *dst, int *writeback)
 {
 	struct timeval uptime;
 	u_int32_t tsval, tsecr;
 	u_int tsval_from_last;
 	u_int8_t hdr[60];
 	u_int8_t *opt;
 	int copyback = 0;
 	int got_ts = 0;
 
 	KASSERT((src->scrub || dst->scrub),
 	    ("%s: src->scrub && dst->scrub!", __func__));
 
 	/*
 	 * Do not let a peer lower its TTL during the connection: pfss_ttl
 	 * tracks the largest TTL seen from this peer and every packet is
 	 * rewritten to carry that value.  This negates a common technique
 	 * to evade an intrusion detection system and confuse firewall
 	 * state code.
 	 */
 	switch (pd->af) {
 #ifdef INET
 	case AF_INET: {
 		if (src->scrub) {
 			struct ip *h = mtod(m, struct ip *);
 			if (h->ip_ttl > src->scrub->pfss_ttl)
 				src->scrub->pfss_ttl = h->ip_ttl;
 			h->ip_ttl = src->scrub->pfss_ttl;
 		}
 		break;
 	}
 #endif /* INET */
 #ifdef INET6
 	case AF_INET6: {
 		if (src->scrub) {
 			struct ip6_hdr *h = mtod(m, struct ip6_hdr *);
 			if (h->ip6_hlim > src->scrub->pfss_ttl)
 				src->scrub->pfss_ttl = h->ip6_hlim;
 			h->ip6_hlim = src->scrub->pfss_ttl;
 		}
 		break;
 	}
 #endif /* INET6 */
 	}
 
 	/*
 	 * Only parse the options when the header has some and at least one
 	 * peer has timestamp handling enabled.
 	 */
 	if (th->th_off > (sizeof(struct tcphdr) >> 2) &&
 	    ((src->scrub && (src->scrub->pfss_flags & PFSS_TIMESTAMP)) ||
 	    (dst->scrub && (dst->scrub->pfss_flags & PFSS_TIMESTAMP))) &&
 	    pf_pull_hdr(m, off, hdr, th->th_off << 2, NULL, NULL, pd->af)) {
 		/* Diddle with TCP options */
 		int hlen;
 		opt = hdr + sizeof(struct tcphdr);
 		hlen = (th->th_off << 2) - sizeof(struct tcphdr);
 		while (hlen >= TCPOLEN_TIMESTAMP) {
 			switch (*opt) {
 			case TCPOPT_EOL:	/* FALLTHROUGH */
 			case TCPOPT_NOP:
 				opt++;
 				hlen--;
 				break;
 			case TCPOPT_TIMESTAMP:
 				/* Modulate the timestamps.  Can be used for
 				 * NAT detection, OS uptime determination or
 				 * reboot detection.
 				 */
 
 				if (got_ts) {
 					/* Huh?  Multiple timestamps!? */
 					if (V_pf_status.debug >= PF_DEBUG_MISC) {
 						DPFPRINTF(("multiple TS??"));
 						pf_print_state(state);
 						printf("\n");
 					}
 					REASON_SET(reason, PFRES_TS);
 					return (PF_DROP);
 				}
 				if (opt[1] >= TCPOLEN_TIMESTAMP) {
 					/* Add our offset to a non-zero TS value */
 					memcpy(&tsval, &opt[2],
 					    sizeof(u_int32_t));
 					if (tsval && src->scrub &&
 					    (src->scrub->pfss_flags &
 					    PFSS_TIMESTAMP)) {
 						tsval = ntohl(tsval);
 						pf_change_proto_a(m, &opt[2],
 						    &th->th_sum,
 						    htonl(tsval +
 						    src->scrub->pfss_ts_mod),
 						    0);
 						copyback = 1;
 					}
 
 					/* Modulate TS reply iff valid (!0) */
 					memcpy(&tsecr, &opt[6],
 					    sizeof(u_int32_t));
 					if (tsecr && dst->scrub &&
 					    (dst->scrub->pfss_flags &
 					    PFSS_TIMESTAMP)) {
 						tsecr = ntohl(tsecr)
 						    - dst->scrub->pfss_ts_mod;
 						pf_change_proto_a(m, &opt[6],
 						    &th->th_sum, htonl(tsecr),
 						    0);
 						copyback = 1;
 					}
 					got_ts = 1;
 				}
 				/* FALLTHROUGH */
 			default:
 				/* Skip the option, at least 2 bytes at a time */
 				hlen -= MAX(opt[1], 2);
 				opt += MAX(opt[1], 2);
 				break;
 			}
 		}
 		if (copyback) {
 			/* Copy back the options, caller copies back header */
 			*writeback = 1;
 			m_copyback(m, off + sizeof(struct tcphdr),
 			    (th->th_off << 2) - sizeof(struct tcphdr), hdr +
 			    sizeof(struct tcphdr));
 		}
 	}
 
 
 	/*
 	 * Must invalidate PAWS checks on connections idle for too long.
 	 * The fastest allowed timestamp clock is 1ms.  That turns out to
 	 * be about 24 days before it wraps.  XXX Right now our lowerbound
 	 * TS echo check only works for the first 12 days of a connection
 	 * when the TS has exhausted half its 32bit space
 	 */
 	/* Both limits are expressed in seconds (24 days and 12 days). */
 #define TS_MAX_IDLE	(24*24*60*60)
 #define TS_MAX_CONN	(12*24*60*60)	/* XXX remove when better tsecr check */
 
 	getmicrouptime(&uptime);
 	if (src->scrub && (src->scrub->pfss_flags & PFSS_PAWS) &&
 	    (uptime.tv_sec - src->scrub->pfss_last.tv_sec > TS_MAX_IDLE ||
 	    time_uptime - state->creation > TS_MAX_CONN))  {
 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
 			DPFPRINTF(("src idled out of PAWS\n"));
 			pf_print_state(state);
 			printf("\n");
 		}
 		src->scrub->pfss_flags = (src->scrub->pfss_flags & ~PFSS_PAWS)
 		    | PFSS_PAWS_IDLED;
 	}
 	if (dst->scrub && (dst->scrub->pfss_flags & PFSS_PAWS) &&
 	    uptime.tv_sec - dst->scrub->pfss_last.tv_sec > TS_MAX_IDLE) {
 		if (V_pf_status.debug >= PF_DEBUG_MISC) {
 			DPFPRINTF(("dst idled out of PAWS\n"));
 			pf_print_state(state);
 			printf("\n");
 		}
 		dst->scrub->pfss_flags = (dst->scrub->pfss_flags & ~PFSS_PAWS)
 		    | PFSS_PAWS_IDLED;
 	}
 
 	if (got_ts && src->scrub && dst->scrub &&
 	    (src->scrub->pfss_flags & PFSS_PAWS) &&
 	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
 		/* Validate that the timestamps are "in-window".
 		 * RFC1323 describes TCP Timestamp options that allow
 		 * measurement of RTT (round trip time) and PAWS
 		 * (protection against wrapped sequence numbers).  PAWS
 		 * gives us a set of rules for rejecting packets on
 		 * long fat pipes (packets that were somehow delayed
 		 * in transit longer than the time it took to send the
 		 * full TCP sequence space of 4Gb).  We can use these
 		 * rules and infer a few others that will let us treat
 		 * the 32bit timestamp and the 32bit echoed timestamp
 		 * as sequence numbers to prevent a blind attacker from
 		 * inserting packets into a connection.
 		 *
 		 * RFC1323 tells us:
 		 *  - The timestamp on this packet must be greater than
 		 *    or equal to the last value echoed by the other
 		 *    endpoint.  The RFC says those will be discarded
 		 *    since it is a dup that has already been acked.
 		 *    This gives us a lowerbound on the timestamp.
 		 *        timestamp >= other last echoed timestamp
 		 *  - The timestamp will be less than or equal to
 		 *    the last timestamp plus the time between the
 		 *    last packet and now.  The RFC defines the max
 		 *    clock rate as 1ms.  We will allow clocks to be
 		 *    up to 10% fast and will allow a total difference
 		 *    of 30 seconds due to a route change.  And this
 		 *    gives us an upperbound on the timestamp.
 		 *        timestamp <= last timestamp + max ticks
 		 *    We have to be careful here.  Windows will send an
 		 *    initial timestamp of zero and then initialize it
 		 *    to a random value after the 3whs; presumably to
 		 *    avoid a DoS by having to call an expensive RNG
 		 *    during a SYN flood.  Proof MS has at least one
 		 *    good security geek.
 		 *
 		 *  - The TCP timestamp option must also echo the other
 		 *    endpoints timestamp.  The timestamp echoed is the
 		 *    one carried on the earliest unacknowledged segment
 		 *    on the left edge of the sequence window.  The RFC
 		 *    states that the host will reject any echoed
 		 *    timestamps that were larger than any ever sent.
 		 *    This gives us an upperbound on the TS echo.
 		 *        tsecr <= largest_tsval
 		 *  - The lowerbound on the TS echo is a little more
 		 *    tricky to determine.  The other endpoint's echoed
 		 *    values will not decrease.  But there may be
 		 *    network conditions that re-order packets and
 		 *    cause our view of them to decrease.  For now the
 		 *    only lowerbound we can safely determine is that
 		 *    the TS echo will never be less than the original
 		 *    TS.  XXX There is probably a better lowerbound.
 		 *    Remove TS_MAX_CONN with better lowerbound check.
 		 *        tsecr >= other original TS
 		 *
 		 * It is also important to note that the fastest
 		 * timestamp clock of 1ms will wrap its 32bit space in
 		 * 24 days.  So we just disable TS checking after 24
 		 * days of idle time.  We actually must use a 12d
 		 * connection limit until we can come up with a better
 		 * lowerbound to the TS echo check.
 		 */
 		struct timeval delta_ts;
 		int ts_fudge;
 
 
 		/*
 		 * PFTM_TS_DIFF is how many seconds of leeway to allow
 		 * a host's timestamp.  This can happen if the previous
 		 * packet got delayed in transit for much longer than
 		 * this packet.
 		 */
 		/* A rule timeout of 0 falls back to the default rule's. */
 		if ((ts_fudge = state->rule.ptr->timeout[PFTM_TS_DIFF]) == 0)
 			ts_fudge = V_pf_default_rule.timeout[PFTM_TS_DIFF];
 
 		/* Calculate max ticks since the last timestamp */
 #define TS_MAXFREQ	1100		/* RFC max TS freq of 1Khz + 10% skew */
 #define TS_MICROSECS	1000000		/* microseconds per second */
 		delta_ts = uptime;
 		timevalsub(&delta_ts, &src->scrub->pfss_last);
 		tsval_from_last = (delta_ts.tv_sec + ts_fudge) * TS_MAXFREQ;
 		tsval_from_last += delta_ts.tv_usec / (TS_MICROSECS/TS_MAXFREQ);
 
 		/*
 		 * The four bounds described above are only enforced once
 		 * both peers have reached at least TCPS_ESTABLISHED.
 		 */
 		if ((src->state >= TCPS_ESTABLISHED &&
 		    dst->state >= TCPS_ESTABLISHED) &&
 		    (SEQ_LT(tsval, dst->scrub->pfss_tsecr) ||
 		    SEQ_GT(tsval, src->scrub->pfss_tsval + tsval_from_last) ||
 		    (tsecr && (SEQ_GT(tsecr, dst->scrub->pfss_tsval) ||
 		    SEQ_LT(tsecr, dst->scrub->pfss_tsval0))))) {
 			/* Bad RFC1323 implementation or an insertion attack.
 			 *
 			 * - Solaris 2.6 and 2.7 are known to send another ACK
 			 *   after the FIN,FIN|ACK,ACK closing that carries
 			 *   an old timestamp.
 			 */
 
 			DPFPRINTF(("Timestamp failed %c%c%c%c\n",
 			    SEQ_LT(tsval, dst->scrub->pfss_tsecr) ? '0' : ' ',
 			    SEQ_GT(tsval, src->scrub->pfss_tsval +
 			    tsval_from_last) ? '1' : ' ',
 			    SEQ_GT(tsecr, dst->scrub->pfss_tsval) ? '2' : ' ',
 			    SEQ_LT(tsecr, dst->scrub->pfss_tsval0)? '3' : ' '));
 			DPFPRINTF((" tsval: %u  tsecr: %u  +ticks: %u  "
 			    "idle: %jus %lums\n",
 			    tsval, tsecr, tsval_from_last,
 			    (uintmax_t)delta_ts.tv_sec,
 			    delta_ts.tv_usec / 1000));
 			DPFPRINTF((" src->tsval: %u  tsecr: %u\n",
 			    src->scrub->pfss_tsval, src->scrub->pfss_tsecr));
 			DPFPRINTF((" dst->tsval: %u  tsecr: %u  tsval0: %u"
 			    "\n", dst->scrub->pfss_tsval,
 			    dst->scrub->pfss_tsecr, dst->scrub->pfss_tsval0));
 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
 				pf_print_state(state);
 				pf_print_flags(th->th_flags);
 				printf("\n");
 			}
 			REASON_SET(reason, PFRES_TS);
 			return (PF_DROP);
 		}
 
 		/* XXX I'd really like to require tsecr but it's optional */
 
 	} else if (!got_ts && (th->th_flags & TH_RST) == 0 &&
 	    ((src->state == TCPS_ESTABLISHED && dst->state == TCPS_ESTABLISHED)
 	    || pd->p_len > 0 || (th->th_flags & TH_SYN)) &&
 	    src->scrub && dst->scrub &&
 	    (src->scrub->pfss_flags & PFSS_PAWS) &&
 	    (dst->scrub->pfss_flags & PFSS_PAWS)) {
 		/* Didn't send a timestamp.  Timestamps aren't really useful
 		 * when:
 		 *  - connection opening or closing (often not even sent).
 		 *    but we must not let an attacker put a FIN on a
 		 *    data packet to sneak it through our ESTABLISHED check.
 		 *  - on a TCP reset.  RFC suggests not even looking at TS.
 		 *  - on an empty ACK.  The TS will not be echoed so it will
 		 *    probably not help keep the RTT calculation in sync and
 		 *    there isn't as much danger when the sequence numbers
 		 *    got wrapped.  So some stacks don't include TS on empty
 		 *    ACKs :-(
 		 *
 		 * To minimize the disruption to mostly RFC1323 conformant
 		 * stacks, we will only require timestamps on data packets.
 		 *
 		 * And what do ya know, we cannot require timestamps on data
 		 * packets.  There appear to be devices that do legitimate
 		 * TCP connection hijacking.  There are HTTP devices that allow
 		 * a 3whs (with timestamps) and then buffer the HTTP request.
 		 * If the intermediate device has the HTTP response cache, it
 		 * will spoof the response but not bother timestamping its
 		 * packets.  So we can look for the presence of a timestamp in
 		 * the first data packet and if there, require it in all future
 		 * packets.
 		 */
 
 		if (pd->p_len > 0 && (src->scrub->pfss_flags & PFSS_DATA_TS)) {
 			/*
 			 * Hey!  Someone tried to sneak a packet in.  Or the
 			 * stack changed its RFC1323 behavior?!?!
 			 */
 			if (V_pf_status.debug >= PF_DEBUG_MISC) {
 				DPFPRINTF(("Did not receive expected RFC1323 "
 				    "timestamp\n"));
 				pf_print_state(state);
 				pf_print_flags(th->th_flags);
 				printf("\n");
 			}
 			REASON_SET(reason, PFRES_TS);
 			return (PF_DROP);
 		}
 	}
 
 
 	/*
 	 * We will note if a host sends his data packets with or without
 	 * timestamps.  And require all data packets to contain a timestamp
 	 * if the first does.  PAWS implicitly requires that all data packets be
 	 * timestamped.  But I think there are middle-man devices that hijack
 	 * TCP streams immediately after the 3whs and don't timestamp their
 	 * packets (seen in a WWW accelerator or cache).
 	 */
 	/* Runs once per peer: only while neither DATA_TS nor DATA_NOTS is set */
 	if (pd->p_len > 0 && src->scrub && (src->scrub->pfss_flags &
 	    (PFSS_TIMESTAMP|PFSS_DATA_TS|PFSS_DATA_NOTS)) == PFSS_TIMESTAMP) {
 		if (got_ts)
 			src->scrub->pfss_flags |= PFSS_DATA_TS;
 		else {
 			src->scrub->pfss_flags |= PFSS_DATA_NOTS;
 			if (V_pf_status.debug >= PF_DEBUG_MISC && dst->scrub &&
 			    (dst->scrub->pfss_flags & PFSS_TIMESTAMP)) {
 				/* Don't warn if other host rejected RFC1323 */
 				DPFPRINTF(("Broken RFC1323 stack did not "
 				    "timestamp data packet. Disabled PAWS "
 				    "security.\n"));
 				pf_print_state(state);
 				pf_print_flags(th->th_flags);
 				printf("\n");
 			}
 		}
 	}
 
 
 	/*
 	 * Update PAWS values.  Skipped for a peer that idled out of PAWS
 	 * (PFSS_PAWS_IDLED set).
 	 */
 	if (got_ts && src->scrub && PFSS_TIMESTAMP == (src->scrub->pfss_flags &
 	    (PFSS_PAWS_IDLED|PFSS_TIMESTAMP))) {
 		getmicrouptime(&src->scrub->pfss_last);
 		if (SEQ_GEQ(tsval, src->scrub->pfss_tsval) ||
 		    (src->scrub->pfss_flags & PFSS_PAWS) == 0)
 			src->scrub->pfss_tsval = tsval;
 
 		if (tsecr) {
 			if (SEQ_GEQ(tsecr, src->scrub->pfss_tsecr) ||
 			    (src->scrub->pfss_flags & PFSS_PAWS) == 0)
 				src->scrub->pfss_tsecr = tsecr;
 
 			if ((src->scrub->pfss_flags & PFSS_PAWS) == 0 &&
 			    (SEQ_LT(tsval, src->scrub->pfss_tsval0) ||
 			    src->scrub->pfss_tsval0 == 0)) {
 				/* tsval0 MUST be the lowest timestamp */
 				src->scrub->pfss_tsval0 = tsval;
 			}
 
 			/* Only fully initialized after a TS gets echoed */
 			if ((src->scrub->pfss_flags & PFSS_PAWS) == 0)
 				src->scrub->pfss_flags |= PFSS_PAWS;
 		}
 	}
 
 	/* I have a dream....  TCP segment reassembly.... */
 	return (0);
 }
 
 /*
  * Clamp the TCP maximum segment size option to the rule's max_mss.
  *
  * The options that follow the fixed TCP header at offset 'off' are pulled
  * into a local buffer and walked.  Every MSS option whose value exceeds
  * r->max_mss is rewritten to r->max_mss, th->th_sum is adjusted to match,
  * and the modified options are copied back into the mbuf.
  *
  * Returns 1 when the options were rewritten (th->th_sum has then been
  * updated in 'th'), 0 otherwise.
  */
 static int
 pf_normalize_tcpopt(struct pf_rule *r, struct mbuf *m, struct tcphdr *th,
     int off, sa_family_t af)
 {
 	u_int16_t	 mss, newmss;
 	int		 thoff;
 	int		 opt, cnt, optlen = 0;
 	int		 rewrite = 0;
 	u_char		 opts[TCP_MAXOLEN];
 	u_char		*optp = opts;
 
 	thoff = th->th_off << 2;
 	cnt = thoff - sizeof(struct tcphdr);
 
 	if (cnt > 0 && !pf_pull_hdr(m, off + sizeof(*th), opts, cnt,
 	    NULL, NULL, af))
 		return (rewrite);
 
 	for (; cnt > 0; cnt -= optlen, optp += optlen) {
 		opt = optp[0];
 		if (opt == TCPOPT_EOL)
 			break;
 		if (opt == TCPOPT_NOP)
 			optlen = 1;
 		else {
 			/* Stop on a truncated or malformed length field. */
 			if (cnt < 2)
 				break;
 			optlen = optp[1];
 			if (optlen < 2 || optlen > cnt)
 				break;
 		}
 		switch (opt) {
 		case TCPOPT_MAXSEG:
 			/*
 			 * The option must be long enough to hold kind,
 			 * length and the 16-bit value; otherwise the value
 			 * bytes would lie outside the option and possibly
 			 * outside opts[].
 			 */
 			if (optlen < 2 + (int)sizeof(mss))
 				break;
 			/* optp + 2 may be misaligned, so copy the value. */
 			memcpy(&mss, optp + 2, sizeof(mss));
 			if ((ntohs(mss)) > r->max_mss) {
 				newmss = htons(r->max_mss);
 				th->th_sum = pf_proto_cksum_fixup(m,
 				    th->th_sum, mss, newmss, 0);
 				memcpy(optp + 2, &newmss, sizeof(newmss));
 				rewrite = 1;
 			}
 			break;
 		default:
 			break;
 		}
 	}
 
 	if (rewrite)
 		m_copyback(m, off + sizeof(*th), thoff - sizeof(*th), opts);
 
 	return (rewrite);
 }
 
 #ifdef INET
 /*
  * Apply the IPv4 header scrub options to the packet in *m0.
  *
  * flags   - PFRULE_NODF clears IP_DF, PFRULE_SET_TOS rewrites the TOS
  *           (keeping the ECN bits), PFRULE_RANDOMID assigns a new IP id
  *           to packets that are not fragments.
  * min_ttl - when non-zero, TTLs below this value are raised to it.
  * tos     - TOS value used with PFRULE_SET_TOS.
  *
  * ip_sum is updated incrementally after every modification.
  */
 static void
 pf_scrub_ip(struct mbuf **m0, u_int32_t flags, u_int8_t min_ttl, u_int8_t tos)
 {
 	struct mbuf		*m = *m0;
 	struct ip		*h = mtod(m, struct ip *);
 
 	/* Clear IP_DF if no-df was requested */
 	if (flags & PFRULE_NODF && h->ip_off & htons(IP_DF)) {
 		u_int16_t ip_off = h->ip_off;
 
 		h->ip_off &= htons(~IP_DF);
 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_off, h->ip_off, 0);
 	}
 
 	/* Enforce a minimum ttl, may cause endless packet loops */
 	if (min_ttl && h->ip_ttl < min_ttl) {
 		u_int16_t ip_ttl = h->ip_ttl;
 
 		h->ip_ttl = min_ttl;
 		/*
 		 * The fixup works on 16-bit header words as they sit in
 		 * memory (compare the TOS case below).  The TTL is the
 		 * first byte of the word it shares with the protocol
 		 * field, so express old and new values as that word in
 		 * network byte order.  Passing the bare byte is only
 		 * correct on little-endian hosts.
 		 */
 		h->ip_sum = pf_cksum_fixup(h->ip_sum, htons(ip_ttl << 8),
 		    htons(h->ip_ttl << 8), 0);
 	}
 
 	/* Enforce tos */
 	if (flags & PFRULE_SET_TOS) {
 		u_int16_t	ov, nv;
 
 		/* First header word: version/header length and TOS. */
 		ov = *(u_int16_t *)h;
 		h->ip_tos = tos | (h->ip_tos & IPTOS_ECN_MASK);
 		nv = *(u_int16_t *)h;
 
 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ov, nv, 0);
 	}
 
 	/* random-id, but not for fragments */
 	if (flags & PFRULE_RANDOMID && !(h->ip_off & ~htons(IP_DF))) {
 		uint16_t ip_id = h->ip_id;
 
 		ip_fillid(h);
 		h->ip_sum = pf_cksum_fixup(h->ip_sum, ip_id, h->ip_id, 0);
 	}
 }
 #endif /* INET */
 
 #ifdef INET6
 /*
  * Raise the hop limit of the IPv6 packet in *m0 to min_ttl when min_ttl is
  * non-zero and the current hop limit is lower.
  */
 static void
 pf_scrub_ip6(struct mbuf **m0, u_int8_t min_ttl)
 {
 	struct ip6_hdr		*ip6 = mtod(*m0, struct ip6_hdr *);
 
 	/* Nothing configured, or the hop limit is already high enough. */
 	if (min_ttl == 0 || ip6->ip6_hlim >= min_ttl)
 		return;
 
 	/* Enforce a minimum ttl, may cause endless packet loops */
 	ip6->ip6_hlim = min_ttl;
 }
 #endif
Index: projects/fuse2/sys/riscv/riscv/copyinout.S
===================================================================
--- projects/fuse2/sys/riscv/riscv/copyinout.S	(revision 350434)
+++ projects/fuse2/sys/riscv/riscv/copyinout.S	(revision 350435)
@@ -1,183 +1,184 @@
 /*-
  * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
  * Copyright (c) 2019 Mitchell Horne
  * All rights reserved.
  *
  * Portions of this software were developed by SRI International and the
  * University of Cambridge Computer Laboratory under DARPA/AFRL contract
  * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Portions of this software were developed by the University of Cambridge
  * Computer Laboratory as part of the CTSRD Project, with support from the
  * UK Higher Education Innovation Fund (HEIF).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <machine/asm.h>
 __FBSDID("$FreeBSD$");
 
 #include <machine/riscvreg.h>
 #include <sys/errno.h>
 
 #include "assym.inc"
 
 /*
  * Fault handler for the copy{in,out} functions below.
  */
 ENTRY(copyio_fault)
 	SET_FAULT_HANDLER(x0, a1) /* Clear the handler */
 	EXIT_USER_ACCESS(a1)
 /*
  * Secondary entry: copyin/copyout branch here when their range check
  * fails, which happens before copycommon installs the fault handler.
  */
 copyio_fault_nopcb:
 	li	a0, EFAULT	/* Return value: EFAULT */
 	ret
 END(copyio_fault)
 
 /*
  * copycommon - common copy routine
  *
  * a0 - Source address
  * a1 - Destination address
  * a2 - Size of copy
  */
 	.macro copycommon
 	/*
 	 * Register use below: a6 holds the handler address, a7 is the
 	 * scratch register handed to the handler/user-access macros, t2
 	 * holds XLEN_BYTES, t0/t1 hold the low address bits of src/dest
 	 * and a4 carries the byte or word being copied.
 	 */
 	la	a6, copyio_fault	/* Get the handler address */
 	SET_FAULT_HANDLER(a6, a7)	/* Set the handler */
 	ENTER_USER_ACCESS(a7)
 
 	li	t2, XLEN_BYTES
-	blt	a2, t2, 3f		/* Byte-copy if len < XLEN_BYTES */
+	blt	a2, t2, 4f		/* Byte-copy if len < XLEN_BYTES */
 
 	/*
 	 * Compare lower bits of src and dest.
 	 * If they are aligned with each other, we can do word copy.
 	 */
 	andi	t0, a0, (XLEN_BYTES-1)	/* Low bits of src */
 	andi	t1, a1, (XLEN_BYTES-1)	/* Low bits of dest */
-	bne	t0, t1, 3f		/* Misaligned. Go to byte copy */
+	bne	t0, t1, 4f		/* Misaligned. Go to byte copy */
 	beqz	t0, 2f			/* Already word-aligned, skip ahead */
 
 	/* Byte copy until the first word-aligned address */
 1:	lb	a4, 0(a0)		/* Load byte from src */
 	addi	a0, a0, 1
 	sb	a4, 0(a1)		/* Store byte in dest */
 	addi	a1, a1, 1
 	addi	a2, a2, -1		/* len-- */
 	andi	t0, a0, (XLEN_BYTES-1)
 	bnez	t0, 1b
 	/*
 	 * The added jump enters the word loop at its length test (label
 	 * 3), so no word is copied when fewer than XLEN_BYTES remain
 	 * after the alignment bytes.
 	 */
+	j	3f
 
 	/* Copy words */
 2:	ld	a4, 0(a0)		/* Load word from src */
 	addi	a0, a0, XLEN_BYTES
 	sd	a4, 0(a1)		/* Store word in dest */
 	addi	a1, a1, XLEN_BYTES
 	addi	a2, a2, -XLEN_BYTES	/* len -= XLEN_BYTES */
-	bgeu	a2, t2, 2b		/* Again if len >= XLEN_BYTES */
+3:	bgeu	a2, t2, 2b		/* Again if len >= XLEN_BYTES */
 
 	/* Check if we're finished */
-	beqz	a2, 4f
+	beqz	a2, 5f
 
 	/* Copy any remaining bytes */
-3:	lb	a4, 0(a0)		/* Load byte from src */
+4:	lb	a4, 0(a0)		/* Load byte from src */
 	addi	a0, a0, 1
 	sb	a4, 0(a1)		/* Store byte in dest */
 	addi	a1, a1, 1
 	addi	a2, a2, -1		/* len-- */
-	bnez	a2, 3b
+	bnez	a2, 4b
 
-4:	EXIT_USER_ACCESS(a7)
+5:	EXIT_USER_ACCESS(a7)
 	SET_FAULT_HANDLER(x0, a7)	/* Clear the handler */
 	.endm
 
 /*
  * Copies from a kernel to user address
  *
  * int copyout(const void *kaddr, void *udaddr, size_t len)
  */
 ENTRY(copyout)
 	beqz	a2, copyout_end	/* If len == 0 then skip loop */
 	add	a3, a1, a2		/* a3 = udaddr + len */
 	bltu	a3, a1, copyio_fault_nopcb /* udaddr + len wrapped around */
 	li	a4, VM_MAXUSER_ADDRESS
 	/*
 	 * Addresses are unsigned.  With a signed compare an end address
 	 * that has its top bit set is negative and passes the limit check.
 	 */
 	bgtu	a3, a4, copyio_fault_nopcb
 
 	copycommon
 
 copyout_end:
 	li	a0, 0		/* return 0 */
 	ret
 END(copyout)
 
 /*
  * Copies from a user to kernel address
  *
  * int copyin(const void *uaddr, void *kaddr, size_t len)
  */
 ENTRY(copyin)
 	beqz	a2, copyin_end	/* If len == 0 then skip loop */
 	add	a3, a0, a2		/* a3 = uaddr + len */
 	bltu	a3, a0, copyio_fault_nopcb /* uaddr + len wrapped around */
 	li	a4, VM_MAXUSER_ADDRESS
 	/*
 	 * Addresses are unsigned.  With a signed compare an end address
 	 * that has its top bit set is negative and passes the limit check.
 	 */
 	bgtu	a3, a4, copyio_fault_nopcb
 
 	copycommon
 
 copyin_end:
 	li	a0, 0		/* return 0 */
 	ret
 END(copyin)
 
 /*
  * Copies a string from a user to kernel address
  *
  * int copyinstr(const void *udaddr, void *kaddr, size_t len, size_t *done)
  */
 ENTRY(copyinstr)
 	mv	a5, x0		/* count = 0 */
 	/*
 	 * a4 holds the last byte copied and selects the return value at
 	 * label 4.  Preset it to non-zero so that len == 0, where nothing
 	 * is copied, reports ENAMETOOLONG instead of testing a stale
 	 * register.
 	 */
 	li	a4, 1
 	beqz	a2, 3f		/* If len == 0 then skip loop */
 
 	la	a6, copyio_fault /* Get the handler address */
 	SET_FAULT_HANDLER(a6, a7) /* Set the handler */
 	ENTER_USER_ACCESS(a7)
 
 	li	a7, VM_MAXUSER_ADDRESS
 	/* Unsigned compare: an address with the top bit set must not pass */
 1:	bgtu	a0, a7, copyio_fault
 	lb	a4, 0(a0)	/* Load from uaddr */
 	addi	a0, a0, 1
 	sb	a4, 0(a1)	/* Store in kaddr */
 	addi	a1, a1, 1
 	addi	a5, a5, 1	/* count++, the NUL is counted too */
 	beqz	a4, 2f		/* Stop after copying the NUL */
 	addi	a2, a2, -1	/* len-- */
 	bnez	a2, 1b
 
 2:	EXIT_USER_ACCESS(a7)
 	SET_FAULT_HANDLER(x0, a7) /* Clear the handler */
 
 3:	beqz	a3, 4f		/* Check if done != NULL */
 	sd	a5, 0(a3)	/* done = count, the number of bytes copied */
 
 4:	mv	a0, x0		/* return 0 */
 	beqz	a4, 5f		/* Success if the last byte copied was NUL */
 	li	a0, ENAMETOOLONG
 5:
 	ret
 END(copyinstr)
Index: projects/fuse2/sys/rpc/svc_vc.c
===================================================================
--- projects/fuse2/sys/rpc/svc_vc.c	(revision 350434)
+++ projects/fuse2/sys/rpc/svc_vc.c	(revision 350435)
@@ -1,994 +1,995 @@
 /*	$NetBSD: svc_vc.c,v 1.7 2000/08/03 00:01:53 fvdl Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2009, Sun Microsystems, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without 
  * modification, are permitted provided that the following conditions are met:
  * - Redistributions of source code must retain the above copyright notice, 
  *   this list of conditions and the following disclaimer.
  * - Redistributions in binary form must reproduce the above copyright notice, 
  *   this list of conditions and the following disclaimer in the documentation 
  *   and/or other materials provided with the distribution.
  * - Neither the name of Sun Microsystems, Inc. nor the names of its 
  *   contributors may be used to endorse or promote products derived 
  *   from this software without specific prior written permission.
  * 
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
 static char *sccsid2 = "@(#)svc_tcp.c 1.21 87/08/11 Copyr 1984 Sun Micro";
 static char *sccsid = "@(#)svc_tcp.c	2.2 88/08/01 4.0 RPCSRC";
 #endif
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * svc_vc.c, Server side for Connection Oriented based RPC. 
  *
  * Actually implements two flavors of transporter -
  * a tcp rendezvouser (a listener and connection establisher)
  * and a record/tcp stream.
  */
 
 #include <sys/param.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/systm.h>
 #include <sys/uio.h>
 
 #include <net/vnet.h>
 
 #include <netinet/tcp.h>
 
 #include <rpc/rpc.h>
 
 #include <rpc/krpc.h>
 #include <rpc/rpc_com.h>
 
 #include <security/mac/mac_framework.h>
 
 static bool_t svc_vc_rendezvous_recv(SVCXPRT *, struct rpc_msg *,
     struct sockaddr **, struct mbuf **);
 static enum xprt_stat svc_vc_rendezvous_stat(SVCXPRT *);
 static void svc_vc_rendezvous_destroy(SVCXPRT *);
 static bool_t svc_vc_null(void);
 static void svc_vc_destroy(SVCXPRT *);
 static enum xprt_stat svc_vc_stat(SVCXPRT *);
 static bool_t svc_vc_ack(SVCXPRT *, uint32_t *);
 static bool_t svc_vc_recv(SVCXPRT *, struct rpc_msg *,
     struct sockaddr **, struct mbuf **);
 static bool_t svc_vc_reply(SVCXPRT *, struct rpc_msg *,
     struct sockaddr *, struct mbuf *, uint32_t *seq);
 static bool_t svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in);
 static bool_t svc_vc_rendezvous_control (SVCXPRT *xprt, const u_int rq,
     void *in);
 static void svc_vc_backchannel_destroy(SVCXPRT *);
 static enum xprt_stat svc_vc_backchannel_stat(SVCXPRT *);
 static bool_t svc_vc_backchannel_recv(SVCXPRT *, struct rpc_msg *,
     struct sockaddr **, struct mbuf **);
 static bool_t svc_vc_backchannel_reply(SVCXPRT *, struct rpc_msg *,
     struct sockaddr *, struct mbuf *, uint32_t *);
 static bool_t svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq,
     void *in);
 static SVCXPRT *svc_vc_create_conn(SVCPOOL *pool, struct socket *so,
     struct sockaddr *raddr);
 static int svc_vc_accept(struct socket *head, struct socket **sop);
 static int svc_vc_soupcall(struct socket *so, void *arg, int waitflag);
 static int svc_vc_rendezvous_soupcall(struct socket *, void *, int);
 
 /*
  * Method table installed by svc_vc_create() on transports for sockets that
  * are not connected.  No xp_ack method is set, and xp_reply is svc_vc_null
  * cast to the reply signature.
  */
 static struct xp_ops svc_vc_rendezvous_ops = {
 	.xp_recv =	svc_vc_rendezvous_recv,
 	.xp_stat =	svc_vc_rendezvous_stat,
 	.xp_reply =	(bool_t (*)(SVCXPRT *, struct rpc_msg *,
 		struct sockaddr *, struct mbuf *, uint32_t *))svc_vc_null,
 	.xp_destroy =	svc_vc_rendezvous_destroy,
 	.xp_control =	svc_vc_rendezvous_control
 };
 
 /*
  * Method table installed by svc_vc_create_conn() on transports for
  * connected sockets.
  */
 static struct xp_ops svc_vc_ops = {
 	.xp_recv =	svc_vc_recv,
 	.xp_stat =	svc_vc_stat,
 	.xp_ack =	svc_vc_ack,
 	.xp_reply =	svc_vc_reply,
 	.xp_destroy =	svc_vc_destroy,
 	.xp_control =	svc_vc_control
 };
 
 /*
  * Method table installed by svc_vc_create_backchannel().  No xp_ack method
  * is set.
  */
 static struct xp_ops svc_vc_backchannel_ops = {
 	.xp_recv =	svc_vc_backchannel_recv,
 	.xp_stat =	svc_vc_backchannel_stat,
 	.xp_reply =	svc_vc_backchannel_reply,
 	.xp_destroy =	svc_vc_backchannel_destroy,
 	.xp_control =	svc_vc_backchannel_control
 };
 
 /*
  * Usage:
  *	xprt = svc_vc_create(sock, send_buf_size, recv_buf_size);
  *
  * Creates, registers, and returns a (rpc) tcp based transporter.
  * Once *xprt is initialized, it is registered as a transporter
  * see (svc.h, xprt_register).  This routine returns
  * a NULL if a problem occurred.
  *
  * The filedescriptor passed in is expected to refer to a bound, but
  * not yet connected socket.
  *
  * Since streams do buffered io similar to stdio, the caller can specify
  * how big the send and receive buffers are via the second and third parms;
  * 0 => use the system default.
  */
 SVCXPRT *
 svc_vc_create(SVCPOOL *pool, struct socket *so, size_t sendsize,
     size_t recvsize)
 {
 	struct sockaddr *addr;
 	SVCXPRT *xprt;
 	int connected;
 	int error;
 
 	/* Sample the connection state under the socket lock. */
 	SOCK_LOCK(so);
 	connected = (so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED)) != 0;
 	SOCK_UNLOCK(so);
 
 	if (connected) {
 		/*
 		 * The socket is, or was, connected: build a connection
 		 * transport for its peer instead of a rendezvous one.
 		 */
 		CURVNET_SET(so->so_vnet);
 		error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &addr);
 		CURVNET_RESTORE();
 		if (error != 0)
 			return (NULL);
 		xprt = svc_vc_create_conn(pool, so, addr);
 		free(addr, M_SONAME);
 		return (xprt);
 	}
 
 	/* Otherwise set up a rendezvous transport on the socket. */
 	xprt = svc_xprt_alloc();
 	sx_init(&xprt->xp_lock, "xprt->xp_lock");
 	xprt->xp_ops = &svc_vc_rendezvous_ops;
 	xprt->xp_pool = pool;
 	xprt->xp_socket = so;
 	xprt->xp_p1 = NULL;
 	xprt->xp_p2 = NULL;
 
 	/* Record the local address of the socket. */
 	CURVNET_SET(so->so_vnet);
 	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &addr);
 	CURVNET_RESTORE();
 	if (error != 0) {
 		/* Undo the allocation above; nothing was registered yet. */
 		sx_destroy(&xprt->xp_lock);
 		svc_xprt_free(xprt);
 		return (NULL);
 	}
 	memcpy(&xprt->xp_ltaddr, addr, addr->sa_len);
 	free(addr, M_SONAME);
 
 	xprt_register(xprt);
 
 	solisten(so, -1, curthread);
 
 	/* Install the upcall under the listen lock. */
 	SOLISTEN_LOCK(so);
 	xprt->xp_upcallset = 1;
 	solisten_upcall_set(so, svc_vc_rendezvous_soupcall, xprt);
 	SOLISTEN_UNLOCK(so);
 
 	return (xprt);
 }
 
 /*
  * Create a new transport for a socket obtained via soaccept().
  */
 SVCXPRT *
 svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr)
 {
 	struct sockaddr *laddr = NULL;
 	struct cf_conn *conn;
 	struct sockopt sopt;
 	SVCXPRT *xprt;
 	int enable = 1;
 	int error;
 
 	/* Enable SO_KEEPALIVE on the socket. */
 	bzero(&sopt, sizeof(struct sockopt));
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_level = SOL_SOCKET;
 	sopt.sopt_name = SO_KEEPALIVE;
 	sopt.sopt_val = &enable;
 	sopt.sopt_valsize = sizeof(enable);
 	error = sosetopt(so, &sopt);
 	if (error != 0)
 		return (NULL);
 
 	/* On TCP sockets also enable TCP_NODELAY. */
 	if (so->so_proto->pr_protocol == IPPROTO_TCP) {
 		bzero(&sopt, sizeof(struct sockopt));
 		sopt.sopt_dir = SOPT_SET;
 		sopt.sopt_level = IPPROTO_TCP;
 		sopt.sopt_name = TCP_NODELAY;
 		sopt.sopt_val = &enable;
 		sopt.sopt_valsize = sizeof(enable);
 		error = sosetopt(so, &sopt);
 		if (error != 0)
 			return (NULL);
 	}
 
 	/* Per-connection stream state, hung off xp_p1 below. */
 	conn = mem_alloc(sizeof(*conn));
 	conn->strm_stat = XPRT_IDLE;
 
 	xprt = svc_xprt_alloc();
 	sx_init(&xprt->xp_lock, "xprt->xp_lock");
 	xprt->xp_ops = &svc_vc_ops;
 	xprt->xp_pool = pool;
 	xprt->xp_socket = so;
 	xprt->xp_p1 = conn;
 	xprt->xp_p2 = NULL;
 
 	/*
 	 * See http://www.connectathon.org/talks96/nfstcp.pdf - client
 	 * has a 5 minute timer, server has a 6 minute timer.
 	 */
 	xprt->xp_idletimeout = 6 * 60;
 
 	/* Remote address comes from the caller, local from the socket. */
 	memcpy(&xprt->xp_rtaddr, raddr, raddr->sa_len);
 
 	CURVNET_SET(so->so_vnet);
 	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &laddr);
 	CURVNET_RESTORE();
 	if (error != 0) {
 		/* Release everything allocated above. */
 		sx_destroy(&xprt->xp_lock);
 		svc_xprt_free(xprt);
 		mem_free(conn, sizeof(*conn));
 		return (NULL);
 	}
 	memcpy(&xprt->xp_ltaddr, laddr, laddr->sa_len);
 	free(laddr, M_SONAME);
 
 	xprt_register(xprt);
 
 	/* Install the receive upcall under the receive buffer lock. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	xprt->xp_upcallset = 1;
 	soupcall_set(so, SO_RCV, svc_vc_soupcall, xprt);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/*
 	 * Throw the transport into the active list in case it already
 	 * has some data buffered.
 	 */
 	sx_xlock(&xprt->xp_lock);
 	xprt_active(xprt);
 	sx_xunlock(&xprt->xp_lock);
 
 	return (xprt);
 }
 
 /*
  * Create a new transport for a backchannel on a clnt_vc socket.
  *
  * The transport is returned with no socket and no xp_p2 attached; only
  * the idle per-connection state, the pool and the backchannel method
  * table are filled in.
  */
 SVCXPRT *
 svc_vc_create_backchannel(SVCPOOL *pool)
 {
 	struct cf_conn *cd;
 	SVCXPRT *xprt;
 
 	cd = mem_alloc(sizeof(*cd));
 	cd->strm_stat = XPRT_IDLE;
 
 	xprt = svc_xprt_alloc();
 	sx_init(&xprt->xp_lock, "xprt->xp_lock");
 	xprt->xp_ops = &svc_vc_backchannel_ops;
 	xprt->xp_pool = pool;
 	xprt->xp_socket = NULL;
 	xprt->xp_p1 = cd;
 	xprt->xp_p2 = NULL;
 
 	return (xprt);
 }
 
 /*
  * This does all of the accept except the final call to soaccept. The
  * caller will call soaccept after dropping its locks (soaccept may
  * call malloc).
  *
  * On success 0 is returned and the dequeued connection is stored in
  * *sop.  Otherwise an errno value is returned and *sop is not written:
  * EINVAL if 'head' is not marked SO_ACCEPTCONN, the result of
  * mac_socket_check_accept() when MAC is compiled in, or whatever
  * solisten_dequeue() reports.
  */
 int
 svc_vc_accept(struct socket *head, struct socket **sop)
 {
 	struct socket *so;
 	int error = 0;
 	short nbio;
 
 	/* XXXGL: shouldn't that be an assertion? */
 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
 		error = EINVAL;
 		goto done;
 	}
 #ifdef MAC
 	error = mac_socket_check_accept(curthread->td_ucred, head);
 	if (error != 0)
 		goto done;
 #endif
 	/*
 	 * XXXGL: we want non-blocking semantics.  The socket could be a
 	 * socket created by kernel as well as socket shared with userland,
 	 * so we can't be sure about presence of SS_NBIO.  We also shall not
 	 * toggle it on the socket, since that may surprise userland.  So we
 	 * set SS_NBIO only temporarily.
 	 *
 	 * NOTE(review): there is no explicit SOLISTEN_UNLOCK() below, so
 	 * solisten_dequeue() presumably releases the listen lock - confirm.
 	 */
 	SOLISTEN_LOCK(head);
 	nbio = head->so_state & SS_NBIO;
 	head->so_state |= SS_NBIO;
 	error = solisten_dequeue(head, &so, 0);
 	/*
 	 * Put SS_NBIO back the way we found it and leave every other state
 	 * bit alone.  'nbio' is either 0 or SS_NBIO, so masking with
 	 * (nbio & ~SS_NBIO) would always be a mask of 0 and wipe so_state.
 	 */
 	head->so_state = (head->so_state & ~SS_NBIO) | nbio;
 	if (error)
 		goto done;
 
 	/* The new connection inherits the listener's original setting. */
 	so->so_state |= nbio;
 	*sop = so;
 
 	/* connection has been removed from the listen queue */
 	KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0);
 done:
 	return (error);
 }
 
 /*
  * Receive method for the listening (rendezvous) transport.
  *
  * Takes at most one completed connection off the listening socket and
  * wraps it in a new connection transport with svc_vc_create_conn().
  * 'msg', 'addrp' and 'mp' are not used.
  *
  * Always returns FALSE: there is never an rpc msg to be processed.
  */
 /*ARGSUSED*/
 static bool_t
 svc_vc_rendezvous_recv(SVCXPRT *xprt, struct rpc_msg *msg,
     struct sockaddr **addrp, struct mbuf **mp)
 {
 	struct socket *so = NULL;
 	struct sockaddr *sa = NULL;
 	int error;
 	SVCXPRT *new_xprt;
 
 	/*
 	 * The socket upcall calls xprt_active() which will eventually
 	 * cause the server to call us here. We attempt to accept a
 	 * connection from the socket and turn it into a new
 	 * transport. If the accept fails, we have drained all pending
 	 * connections so we call xprt_inactive().
 	 */
 	sx_xlock(&xprt->xp_lock);
 
 	error = svc_vc_accept(xprt->xp_socket, &so);
 
 	if (error == EWOULDBLOCK) {
 		/*
 		 * We must re-test for new connections after taking
 		 * the lock to protect us in the case where a new
 		 * connection arrives after our call to accept fails
 		 * with EWOULDBLOCK.
 		 */
 		SOLISTEN_LOCK(xprt->xp_socket);
 		if (TAILQ_EMPTY(&xprt->xp_socket->sol_comp))
 			xprt_inactive_self(xprt);
 		SOLISTEN_UNLOCK(xprt->xp_socket);
 		sx_xunlock(&xprt->xp_lock);
 		return (FALSE);
 	}
 
 	if (error) {
 		/*
 		 * Hard failure: stop taking upcalls from this socket.  The
 		 * upcall on the listening socket is installed with
 		 * solisten_upcall_set(), so it is removed the same way
 		 * rather than through the SO_RCV socket-buffer upcall.
 		 */
 		SOLISTEN_LOCK(xprt->xp_socket);
 		if (xprt->xp_upcallset) {
 			xprt->xp_upcallset = 0;
 			solisten_upcall_set(xprt->xp_socket, NULL, NULL);
 		}
 		SOLISTEN_UNLOCK(xprt->xp_socket);
 		xprt_inactive_self(xprt);
 		sx_xunlock(&xprt->xp_lock);
 		return (FALSE);
 	}
 
 	/* soaccept() is called without xp_lock held. */
 	sx_xunlock(&xprt->xp_lock);
 
 	sa = NULL;
 	error = soaccept(so, &sa);
 
 	if (error) {
 		/*
 		 * XXX not sure if I need to call sofree or soclose here.
 		 */
 		if (sa)
 			free(sa, M_SONAME);
 		return (FALSE);
 	}
 
 	/*
 	 * svc_vc_create_conn will call xprt_register - we don't need
 	 * to do anything with the new connection except dereference it.
 	 */
 	new_xprt = svc_vc_create_conn(xprt->xp_pool, so, sa);
 	if (!new_xprt) {
 		soclose(so);
 	} else {
 		SVC_RELEASE(new_xprt);
 	}
 
 	free(sa, M_SONAME);
 
 	return (FALSE); /* there is never an rpc msg to be processed */
 }
 
 /*
  * Status method for the rendezvous transport; unconditionally reports
  * XPRT_IDLE regardless of the transport passed in.
  */
 /*ARGSUSED*/
 static enum xprt_stat
 svc_vc_rendezvous_stat(SVCXPRT *xprt)
 {
 	const enum xprt_stat status = XPRT_IDLE;
 
 	return (status);
 }
 
 /*
  * Teardown steps shared by the destroy methods: close the socket if the
  * transport has one, release the netid string if it is set, and free
  * the transport.  'xprt' must not be touched after this returns.
  */
 static void
 svc_vc_destroy_common(SVCXPRT *xprt)
 {
 	struct socket *so = xprt->xp_socket;
 
 	if (so != NULL)
 		(void)soclose(so);
 
 	if (xprt->xp_netid != NULL) {
 		/* The string was sized to include its terminating NUL. */
 		(void) mem_free(xprt->xp_netid, strlen(xprt->xp_netid) + 1);
 	}
 
 	svc_xprt_free(xprt);
 }
 
 /*
  * Destroy method for the rendezvous transport: detach the listen upcall
  * (if one is installed) under the listen lock, then run the common
  * teardown.
  */
 static void
 svc_vc_rendezvous_destroy(SVCXPRT *xprt)
 {
 	struct socket *head = xprt->xp_socket;
 
 	SOLISTEN_LOCK(head);
 	if (xprt->xp_upcallset != 0) {
 		xprt->xp_upcallset = 0;
 		solisten_upcall_set(head, NULL, NULL);
 	}
 	SOLISTEN_UNLOCK(head);
 
 	svc_vc_destroy_common(xprt);
 }
 
 /*
  * Destroy method for a connected transport: detach the receive upcall,
  * run the common teardown, then release any buffered request data and
  * the per-connection state.
  */
 static void
 svc_vc_destroy(SVCXPRT *xprt)
 {
 	struct cf_conn *cd;
 	struct socket *so;
 
 	/* Fetch these first; svc_vc_destroy_common() frees the transport. */
 	cd = (struct cf_conn *)xprt->xp_p1;
 	so = xprt->xp_socket;
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (xprt->xp_upcallset != 0) {
 		xprt->xp_upcallset = 0;
 		soupcall_clear(so, SO_RCV);
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	svc_vc_destroy_common(xprt);
 
 	if (cd->mreq != NULL)
 		m_freem(cd->mreq);
 	if (cd->mpending != NULL)
 		m_freem(cd->mpending);
 	mem_free(cd, sizeof(*cd));
 }
 
 /*
  * Destroy method for a backchannel transport: free the transport, then
  * every request still queued on cd->mreq (linked through m_nextpkt),
  * then the per-connection state.
  */
 static void
 svc_vc_backchannel_destroy(SVCXPRT *xprt)
 {
 	struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1;
 	struct mbuf *pkt, *next;
 
 	svc_xprt_free(xprt);
 
 	for (pkt = cd->mreq; pkt != NULL; pkt = next) {
 		/* Remember the successor before the packet is freed. */
 		next = pkt->m_nextpkt;
 		m_freem(pkt);
 	}
 	mem_free(cd, sizeof(*cd));
 }
 
 /*
  * Control method for a connected transport.  No request is handled;
  * every call reports FALSE.
  */
 /*ARGSUSED*/
 static bool_t
 svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in)
 {
 	const bool_t handled = FALSE;
 
 	return (handled);
 }
 
 /*
  * Control method for the rendezvous transport.  No request is handled;
  * every call reports FALSE.
  */
 static bool_t
 svc_vc_rendezvous_control(SVCXPRT *xprt, const u_int rq, void *in)
 {
 	const bool_t handled = FALSE;
 
 	return (handled);
 }
 
 /*
  * Control method for a backchannel transport.  No request is handled;
  * every call reports FALSE.
  */
 static bool_t
 svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq, void *in)
 {
 	const bool_t handled = FALSE;
 
 	return (handled);
 }
 
 /*
  * Status method for a connected transport.
  *
  * Reports XPRT_DIED once the stream has been marked dead, XPRT_MOREREQS
  * when a complete record is already parsed or the socket is readable,
  * and XPRT_IDLE otherwise.
  */
 static enum xprt_stat
 svc_vc_stat(SVCXPRT *xprt)
 {
 	struct cf_conn *cd = (struct cf_conn *)(xprt->xp_p1);
 	enum xprt_stat status;
 
 	if (cd->strm_stat == XPRT_DIED)
 		status = XPRT_DIED;
 	else if ((cd->mreq != NULL && cd->resid == 0 && cd->eor) ||
 	    soreadable(xprt->xp_socket))
 		status = XPRT_MOREREQS;
 	else
 		status = XPRT_IDLE;
 
 	return (status);
 }
 
 /*
  * Store in *ack the transport's sent-byte counter (xp_snt_cnt) minus the
  * number of bytes currently held in the socket's send buffer.  Always
  * returns TRUE.
  */
 static bool_t
 svc_vc_ack(SVCXPRT *xprt, uint32_t *ack)
 {
 	uint32_t sent;
 
 	sent = atomic_load_acq_32(&xprt->xp_snt_cnt);
 	*ack = sent - sbused(&xprt->xp_socket->so_snd);
 
 	return (TRUE);
 }
 
 /*
  * Status method for a backchannel transport: XPRT_MOREREQS while a
  * request is queued on cd->mreq, XPRT_IDLE otherwise.
  */
 static enum xprt_stat
 svc_vc_backchannel_stat(SVCXPRT *xprt)
 {
 	struct cf_conn *cd = (struct cf_conn *)(xprt->xp_p1);
 
 	return ((cd->mreq != NULL) ? XPRT_MOREREQS : XPRT_IDLE);
 }
 
 /*
  * If we have an mbuf chain in cd->mpending, try to parse a record from it,
  * leaving the result in cd->mreq. If we don't have a complete record, leave
  * the partial result in cd->mreq and try to read more from the socket.
  *
  * Returns FALSE only when a new record marker is expected but fewer than
  * sizeof(uint32_t) bytes are buffered; returns TRUE in every other case.
  * On every path the receive low-water mark of the socket (sb_lowat) is
  * updated to describe how much more data is wanted.
  */
 static int
 svc_vc_process_pending(SVCXPRT *xprt)
 {
 	struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1;
 	struct socket *so = xprt->xp_socket;
 	struct mbuf *m;
 
 	/*
 	 * If cd->resid is non-zero, we have part of the
 	 * record already, otherwise we are expecting a record
 	 * marker.
 	 */
 	if (!cd->resid && cd->mpending) {
 		/*
 		 * See if there is enough data buffered to
 		 * make up a record marker. Make sure we can
 		 * handle the case where the record marker is
 		 * split across more than one mbuf.
 		 */
 		size_t n = 0;
 		uint32_t header;
 
 		/* Count buffered bytes, stopping once a marker's worth is seen. */
 		m = cd->mpending;
 		while (n < sizeof(uint32_t) && m) {
 			n += m->m_len;
 			m = m->m_next;
 		}
 		if (n < sizeof(uint32_t)) {
 			/* Ask to be woken once the rest of the marker is in. */
 			so->so_rcv.sb_lowat = sizeof(uint32_t) - n;
 			return (FALSE);
 		}
 		m_copydata(cd->mpending, 0, sizeof(header),
 		    (char *)&header);
 		header = ntohl(header);
 		/*
 		 * The top bit of the marker is kept in cd->eor; the low
 		 * 31 bits are the number of bytes still to come.
 		 */
 		cd->eor = (header & 0x80000000) != 0;
 		cd->resid = header & 0x7fffffff;
 		/* Strip the marker itself from the pending data. */
 		m_adj(cd->mpending, sizeof(uint32_t));
 	}
 
 	/*
 	 * Start pulling off mbufs from cd->mpending
 	 * until we either have a complete record or
 	 * we run out of data. We use m_split to pull
 	 * data - it will pull as much as possible and
 	 * split the last mbuf if necessary.
 	 */
 	while (cd->mpending && cd->resid) {
 		m = cd->mpending;
 		if (cd->mpending->m_next
 		    || cd->mpending->m_len > cd->resid)
 			cd->mpending = m_split(cd->mpending,
 			    cd->resid, M_WAITOK);
 		else
 			cd->mpending = NULL;
 		/* Append what was pulled off to the record being assembled. */
 		if (cd->mreq)
 			m_last(cd->mreq)->m_next = m;
 		else
 			cd->mreq = m;
 		/* Account for every byte just moved into cd->mreq. */
 		while (m) {
 			cd->resid -= m->m_len;
 			m = m->m_next;
 		}
 	}
 
 	/*
 	 * Block receive upcalls if we have more data pending,
 	 * otherwise report our need.
 	 */
 	if (cd->mpending)
 		so->so_rcv.sb_lowat = INT_MAX;
 	else
 		so->so_rcv.sb_lowat =
 		    imax(1, imin(cd->resid, so->so_rcv.sb_hiwat / 2));
 	return (TRUE);
 }
 
 /*
  * Receive method for a connected transport.
  *
  * Loops with xp_lock held, alternating between parsing buffered data
  * with svc_vc_process_pending() and pulling more data off the socket
  * with a MSG_DONTWAIT soreceive(), until a complete record is available
  * or the socket has nothing more to give.  xp_lock is released on every
  * return path.
  *
  * Returns TRUE with the decoded call header in *msg, *addrp set to NULL
  * and the mbufs obtained from xdrmbuf_getall() in *mp.  Returns FALSE if
  * the socket would block, if the call header fails to decode, or if the
  * socket reports an error or EOF (the stream is then marked XPRT_DIED).
  * Records recognised as backchannel replies are passed to
  * clnt_bck_svccall() and are never returned to the caller.
  */
 static bool_t
 svc_vc_recv(SVCXPRT *xprt, struct rpc_msg *msg,
     struct sockaddr **addrp, struct mbuf **mp)
 {
 	struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1;
 	struct uio uio;
 	struct mbuf *m;
 	struct socket* so = xprt->xp_socket;
 	XDR xdrs;
 	int error, rcvflag;
 	uint32_t xid_plus_direction[2];
 
 	/*
 	 * Serialise access to the socket and our own record parsing
 	 * state.
 	 */
 	sx_xlock(&xprt->xp_lock);
 
 	for (;;) {
 		/* If we have no request ready, check pending queue. */
 		while (cd->mpending &&
 		    (cd->mreq == NULL || cd->resid != 0 || !cd->eor)) {
 			if (!svc_vc_process_pending(xprt))
 				break;
 		}
 
 		/* Process and return complete request in cd->mreq. */
 		if (cd->mreq != NULL && cd->resid == 0 && cd->eor) {
 
 			/*
 			 * Now, check for a backchannel reply.
 			 * The XID is in the first uint32_t of the reply
 			 * and the message direction is the second one.
 			 * This is only done when xp_p2 is set and the record
 			 * is long enough to hold both words.
 			 */
 			if ((cd->mreq->m_len >= sizeof(xid_plus_direction) ||
 			    m_length(cd->mreq, NULL) >=
 			    sizeof(xid_plus_direction)) &&
 			    xprt->xp_p2 != NULL) {
 				m_copydata(cd->mreq, 0,
 				    sizeof(xid_plus_direction),
 				    (char *)xid_plus_direction);
 				xid_plus_direction[0] =
 				    ntohl(xid_plus_direction[0]);
 				xid_plus_direction[1] =
 				    ntohl(xid_plus_direction[1]);
 				/* Check message direction. */
 				if (xid_plus_direction[1] == REPLY) {
 					clnt_bck_svccall(xprt->xp_p2,
 					    cd->mreq,
 					    xid_plus_direction[0]);
 					/* Record handed off; look for the next one. */
 					cd->mreq = NULL;
 					continue;
 				}
 			}
 
 			/* The XDR stream now refers to the record's mbufs. */
 			xdrmbuf_create(&xdrs, cd->mreq, XDR_DECODE);
 			cd->mreq = NULL;
 
 			/* Check for next request in a pending queue. */
 			svc_vc_process_pending(xprt);
 			if (cd->mreq == NULL || cd->resid != 0) {
 				/*
 				 * No further complete record is buffered; go
 				 * inactive unless the socket is readable.
 				 */
 				SOCKBUF_LOCK(&so->so_rcv);
 				if (!soreadable(so))
 					xprt_inactive_self(xprt);
 				SOCKBUF_UNLOCK(&so->so_rcv);
 			}
 
 			/* Decoding happens without xp_lock held. */
 			sx_xunlock(&xprt->xp_lock);
 
 			if (! xdr_callmsg(&xdrs, msg)) {
 				XDR_DESTROY(&xdrs);
 				return (FALSE);
 			}
 
 			*addrp = NULL;
 			*mp = xdrmbuf_getall(&xdrs);
 			XDR_DESTROY(&xdrs);
 
 			return (TRUE);
 		}
 
 		/*
 		 * The socket upcall calls xprt_active() which will eventually
 		 * cause the server to call us here. We attempt to
 		 * read as much as possible from the socket and put
 		 * the result in cd->mpending. If the read fails,
 		 * we have drained both cd->mpending and the socket so
 		 * we can call xprt_inactive().
 		 */
 		uio.uio_resid = 1000000000;
 		uio.uio_td = curthread;
 		m = NULL;
 		rcvflag = MSG_DONTWAIT;
 		error = soreceive(so, NULL, &uio, &m, NULL, &rcvflag);
 
 		if (error == EWOULDBLOCK) {
 			/*
 			 * We must re-test for readability after
 			 * taking the lock to protect us in the case
 			 * where a new packet arrives on the socket
 			 * after our call to soreceive fails with
 			 * EWOULDBLOCK.
 			 */
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (!soreadable(so))
 				xprt_inactive_self(xprt);
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			sx_xunlock(&xprt->xp_lock);
 			return (FALSE);
 		}
 
 		if (error) {
 			/*
 			 * Any other error: detach the receive upcall and
 			 * mark the stream dead.
 			 */
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (xprt->xp_upcallset) {
 				xprt->xp_upcallset = 0;
 				soupcall_clear(so, SO_RCV);
 			}
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			xprt_inactive_self(xprt);
 			cd->strm_stat = XPRT_DIED;
 			sx_xunlock(&xprt->xp_lock);
 			return (FALSE);
 		}
 
 		if (!m) {
 			/*
 			 * EOF - the other end has closed the socket.
 			 */
 			xprt_inactive_self(xprt);
 			cd->strm_stat = XPRT_DIED;
 			sx_xunlock(&xprt->xp_lock);
 			return (FALSE);
 		}
 
 		/* Queue the new data behind whatever is already pending. */
 		if (cd->mpending)
 			m_last(cd->mpending)->m_next = m;
 		else
 			cd->mpending = m;
 	}
 }
 
 /*
  * Receive method for a backchannel transport.
  *
  * Takes the first request off cd->mreq (requests are linked through
  * m_nextpkt) while holding xp_lock and the client's ct_lock, then
  * decodes its call header with both locks dropped.
  *
  * Returns TRUE with the call header in *msg, *addrp set to NULL and the
  * mbufs from xdrmbuf_getall() in *mp.  Returns FALSE if no client is
  * attached (xp_p2 is NULL), if no request is queued (the transport is
  * then marked inactive), or if the header fails to decode.
  */
 static bool_t
 svc_vc_backchannel_recv(SVCXPRT *xprt, struct rpc_msg *msg,
     struct sockaddr **addrp, struct mbuf **mp)
 {
 	struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1;
 	struct ct_data *ct;
 	struct mbuf *req;
 	XDR xdrs;
 
 	sx_xlock(&xprt->xp_lock);
 	ct = (struct ct_data *)xprt->xp_p2;
 	if (ct == NULL) {
 		sx_xunlock(&xprt->xp_lock);
 		return (FALSE);
 	}
 
 	mtx_lock(&ct->ct_lock);
 	req = cd->mreq;
 	if (req == NULL)
 		xprt_inactive_self(xprt);
 	else
 		cd->mreq = req->m_nextpkt;
 	mtx_unlock(&ct->ct_lock);
 	sx_xunlock(&xprt->xp_lock);
 
 	if (req == NULL)
 		return (FALSE);
 
 	xdrmbuf_create(&xdrs, req, XDR_DECODE);
 	if (xdr_callmsg(&xdrs, msg)) {
 		*addrp = NULL;
 		*mp = xdrmbuf_getall(&xdrs);
 		XDR_DESTROY(&xdrs);
 		return (TRUE);
 	}
 
 	XDR_DESTROY(&xdrs);
 	return (FALSE);
 }
 
 /*
  * Reply method for a connected transport.
  *
  * Encodes the reply header from 'msg' into a fresh mbuf, appends the
  * result body 'm' when the reply is MSG_ACCEPTED/SUCCESS, prepends the
  * record marker and sends the chain on the transport's socket.  'addr'
  * is not used.  If 'seq' is non-NULL and the send succeeds, *seq is set
  * to xp_snd_cnt.
  *
  * Returns FALSE if the reply header could not be encoded.
  * NOTE(review): when sosend() fails, 'stat' keeps the value produced by
  * the encode step, so the function still returns non-FALSE - confirm
  * that callers expect this.
  */
 static bool_t
 svc_vc_reply(SVCXPRT *xprt, struct rpc_msg *msg,
     struct sockaddr *addr, struct mbuf *m, uint32_t *seq)
 {
 	XDR xdrs;
 	struct mbuf *mrep;
 	bool_t stat = TRUE;
 	int error, len;
 
 	/*
 	 * Leave space for record mark.
 	 */
 	mrep = m_gethdr(M_WAITOK, MT_DATA);
 	mrep->m_data += sizeof(uint32_t);
 
 	xdrmbuf_create(&xdrs, mrep, XDR_ENCODE);
 
 	/* Only accepted, successful replies carry the result body 'm'. */
 	if (msg->rm_reply.rp_stat == MSG_ACCEPTED &&
 	    msg->rm_reply.rp_acpt.ar_stat == SUCCESS) {
 		if (!xdr_replymsg(&xdrs, msg))
 			stat = FALSE;
 		else
 			xdrmbuf_append(&xdrs, m);
 	} else {
 		stat = xdr_replymsg(&xdrs, msg);
 	}
 
 	if (stat) {
 		m_fixhdr(mrep);
 
 		/*
 		 * Prepend a record marker containing the reply length.
 		 * The top bit is set in the marker; the length excludes
 		 * the marker itself.
 		 */
 		M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK);
 		len = mrep->m_pkthdr.len;
 		*mtod(mrep, uint32_t *) =
 			htonl(0x80000000 | (len - sizeof(uint32_t)));
 		/*
 		 * xp_snd_cnt is bumped before the send and rolled back if
 		 * the send fails; xp_snt_cnt only advances on success.
 		 */
 		atomic_add_32(&xprt->xp_snd_cnt, len);
 		error = sosend(xprt->xp_socket, NULL, NULL, mrep, NULL,
 		    0, curthread);
 		if (!error) {
 			atomic_add_rel_32(&xprt->xp_snt_cnt, len);
 			if (seq)
 				*seq = xprt->xp_snd_cnt;
 			stat = TRUE;
 		} else
 			atomic_subtract_32(&xprt->xp_snd_cnt, len);
 	} else {
 		/* Nothing was sent; drop the partially built reply. */
 		m_freem(mrep);
 	}
 
 	XDR_DESTROY(&xdrs);
 
 	return (stat);
 }
 
 /*
  * Reply method for a backchannel transport.
  *
  * Encodes the reply header from 'msg' into a fresh mbuf, appends the
  * result body 'm' when the reply is MSG_ACCEPTED/SUCCESS, prepends the
  * record marker and sends the chain on the socket of the client handle
  * found in xp_p2 (EPIPE is assumed when no client is attached).  'addr'
  * and 'seq' are not used.
  *
  * Returns FALSE if the reply header could not be encoded; otherwise the
  * value left in 'stat' by the encode step, which a send error does not
  * change.
  */
 static bool_t
 svc_vc_backchannel_reply(SVCXPRT *xprt, struct rpc_msg *msg,
     struct sockaddr *addr, struct mbuf *m, uint32_t *seq)
 {
 	struct ct_data *ct;
 	struct mbuf *mrep;
 	XDR xdrs;
 	bool_t stat = TRUE;
 	int error;
 
 	/* Reserve room in front of the data for the record mark. */
 	mrep = m_gethdr(M_WAITOK, MT_DATA);
 	mrep->m_data += sizeof(uint32_t);
 
 	xdrmbuf_create(&xdrs, mrep, XDR_ENCODE);
 
 	if (msg->rm_reply.rp_stat != MSG_ACCEPTED ||
 	    msg->rm_reply.rp_acpt.ar_stat != SUCCESS) {
 		stat = xdr_replymsg(&xdrs, msg);
 	} else if (xdr_replymsg(&xdrs, msg)) {
 		/* Accepted and successful: the result body follows. */
 		xdrmbuf_append(&xdrs, m);
 	} else {
 		stat = FALSE;
 	}
 
 	if (!stat) {
 		/* Encoding failed; nothing is sent. */
 		m_freem(mrep);
 		XDR_DESTROY(&xdrs);
 		return (stat);
 	}
 
 	m_fixhdr(mrep);
 
 	/* Put the record marker, carrying the reply length, in front. */
 	M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK);
 	*mtod(mrep, uint32_t *) =
 	    htonl(0x80000000 | (mrep->m_pkthdr.len - sizeof(uint32_t)));
 
 	/* xp_p2 is read and used for the send under xp_lock. */
 	sx_xlock(&xprt->xp_lock);
 	ct = (struct ct_data *)xprt->xp_p2;
 	if (ct == NULL)
 		error = EPIPE;
 	else
 		error = sosend(ct->ct_socket, NULL, NULL, mrep, NULL,
 		    0, curthread);
 	sx_xunlock(&xprt->xp_lock);
 	if (error == 0)
 		stat = TRUE;
 
 	XDR_DESTROY(&xdrs);
 
 	return (stat);
 }
 
 /*
  * Do-nothing method; takes no action and reports FALSE.
  */
 static bool_t
 svc_vc_null()
 {
 	const bool_t result = FALSE;
 
 	return (result);
 }
 
 /*
  * Receive upcall for a connected transport ('arg' is the SVCXPRT): mark
  * the transport active when its socket is readable.  Always returns
  * SU_OK.  The 'so' and 'waitflag' arguments are not consulted.
  */
 static int
 svc_vc_soupcall(struct socket *so, void *arg, int waitflag)
 {
 	SVCXPRT *xprt = arg;
 
 	if (!soreadable(xprt->xp_socket))
 		return (SU_OK);
 
 	xprt_active(xprt);
 	return (SU_OK);
 }
 
 /*
  * Listen upcall for the rendezvous transport ('arg' is the SVCXPRT):
  * mark the transport active when the listening socket 'head' has at
  * least one entry on its sol_comp queue.  Always returns SU_OK.
  */
 static int
 svc_vc_rendezvous_soupcall(struct socket *head, void *arg, int waitflag)
 {
 	SVCXPRT *xprt = arg;
 
 	if (TAILQ_EMPTY(&head->sol_comp))
 		return (SU_OK);
 
 	xprt_active(xprt);
 	return (SU_OK);
 }
 
 #if 0
 /*
  * Get the effective UID of the sending process. Used by rpcbind, keyserv
  * and rpc.yppasswdd on AF_LOCAL.
  *
  * Returns -1 if the transport's remote address is not AF_LOCAL;
  * otherwise returns the result of getpeereid() and, when that is 0,
  * stores the peer's effective uid in *uid.
  */
 int
 __rpc_get_local_uid(SVCXPRT *transp, uid_t *uid)
 {
 	struct sockaddr *sa;
 	uid_t euid;
 	gid_t egid;
 	int sock, ret;
 
 	sock = transp->xp_fd;
 	sa = (struct sockaddr *)transp->xp_rtaddr;
 
 	/* Peer credentials are only looked up for local-domain sockets. */
 	if (sa->sa_family != AF_LOCAL)
 		return (-1);
 
 	ret = getpeereid(sock, &euid, &egid);
 	if (ret == 0)
 		*uid = euid;
 	return (ret);
 }
 #endif
Index: projects/fuse2/sys/sys/ata.h
===================================================================
--- projects/fuse2/sys/sys/ata.h	(revision 350434)
+++ projects/fuse2/sys/sys/ata.h	(revision 350435)
@@ -1,1056 +1,1059 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_ATA_H_
 #define _SYS_ATA_H_
 
 #include <sys/ioccom.h>
 
 /*
  * ATA/ATAPI device parameters.
  *
  * The numeric tag in the left margin of a field is its position counted
  * in 16-bit words (byte-array fields such as serial[] span several
  * words).  The #define lines are grouped under the field they follow.
  * The structure is packed; its last field sits at word 255.
  */
 struct ata_params {
 /*000*/ u_int16_t       config;         /* configuration info */
 #define ATA_PROTO_MASK                  0x8003
 #define ATA_PROTO_ATAPI                 0x8000
 #define ATA_PROTO_ATAPI_12              0x8000
 #define ATA_PROTO_ATAPI_16              0x8001
 #define ATA_PROTO_CFA                   0x848a
 #define ATA_ATAPI_TYPE_MASK             0x1f00
 #define ATA_ATAPI_TYPE_DIRECT           0x0000  /* disk/floppy */
 #define ATA_ATAPI_TYPE_TAPE             0x0100  /* streaming tape */
 #define ATA_ATAPI_TYPE_CDROM            0x0500  /* CD-ROM device */
 #define ATA_ATAPI_TYPE_OPTICAL          0x0700  /* optical disk */
 #define ATA_DRQ_MASK                    0x0060
 #define ATA_DRQ_SLOW                    0x0000  /* cpu 3 ms delay */
 #define ATA_DRQ_INTR                    0x0020  /* interrupt 10 ms delay */
 #define ATA_DRQ_FAST                    0x0040  /* accel 50 us delay */
 #define ATA_RESP_INCOMPLETE             0x0004
 
 /*001*/ u_int16_t       cylinders;              /* # of cylinders */
 /*002*/ u_int16_t       specconf;		/* specific configuration */
 /*003*/ u_int16_t       heads;                  /* # heads */
 	u_int16_t       obsolete4;
 	u_int16_t       obsolete5;
 /*006*/ u_int16_t       sectors;                /* # sectors/track */
 /*007*/ u_int16_t       vendor7[3];
 /*010*/ u_int8_t        serial[20];             /* serial number */
 /*020*/ u_int16_t       retired20;
 	u_int16_t       retired21;
 	u_int16_t       obsolete22;
 /*023*/ u_int8_t        revision[8];            /* firmware revision */
 /*027*/ u_int8_t        model[40];              /* model name */
 /*047*/ u_int16_t       sectors_intr;           /* sectors per interrupt */
 /*048*/ u_int16_t       tcg;                    /* Trusted Computing Group */
 #define ATA_SUPPORT_TCG                 0x0001
 /*049*/ u_int16_t       capabilities1;
 #define ATA_SUPPORT_DMA                 0x0100
 #define ATA_SUPPORT_LBA                 0x0200
 #define ATA_SUPPORT_IORDYDIS            0x0400
 #define ATA_SUPPORT_IORDY               0x0800
 #define ATA_SUPPORT_OVERLAP             0x4000
 
 /*050*/ u_int16_t       capabilities2;
 /*051*/ u_int16_t       retired_piomode;        /* PIO modes 0-2 */
 #define ATA_RETIRED_PIO_MASK            0x0300
 
 /*052*/ u_int16_t       retired_dmamode;        /* DMA modes */
 #define ATA_RETIRED_DMA_MASK            0x0003
 
 /*053*/ u_int16_t       atavalid;               /* fields valid */
 #define ATA_FLAG_54_58                  0x0001  /* words 54-58 valid */
 #define ATA_FLAG_64_70                  0x0002  /* words 64-70 valid */
 #define ATA_FLAG_88                     0x0004  /* word 88 valid */
 
 /*054*/ u_int16_t       current_cylinders;
 /*055*/ u_int16_t       current_heads;
 /*056*/ u_int16_t       current_sectors;
 /*057*/ u_int16_t       current_size_1;
 /*058*/ u_int16_t       current_size_2;
 /*059*/ u_int16_t       multi;
 #define ATA_SUPPORT_BLOCK_ERASE_EXT     0x8000
 #define ATA_SUPPORT_OVERWRITE_EXT       0x4000
 #define ATA_SUPPORT_CRYPTO_SCRAMBLE_EXT 0x2000
 #define ATA_SUPPORT_SANITIZE            0x1000
 #define	ATA_SUPPORT_SANITIZE_ALLOWED	0x0800
 #define	ATA_SUPPORT_ANTIFREEZE_LOCK_EXT	0x0400
 #define ATA_MULTI_VALID                 0x0100
 
 /*060*/ u_int16_t       lba_size_1;
 	u_int16_t       lba_size_2;
 	u_int16_t       obsolete62;
 /*063*/ u_int16_t       mwdmamodes;             /* multiword DMA modes */
 /*064*/ u_int16_t       apiomodes;              /* advanced PIO modes */
 
 /*065*/ u_int16_t       mwdmamin;               /* min. M/W DMA time/word ns */
 /*066*/ u_int16_t       mwdmarec;               /* rec. M/W DMA time ns */
 /*067*/ u_int16_t       pioblind;               /* min. PIO cycle w/o flow */
 /*068*/ u_int16_t       pioiordy;               /* min. PIO cycle IORDY flow */
 /*069*/ u_int16_t       support3;
 #define ATA_SUPPORT_RZAT                0x0020
 #define ATA_SUPPORT_DRAT                0x4000
 #define ATA_ENCRYPTS_ALL_USER_DATA      0x0010  /* Self-encrypting drive */
 #define	ATA_SUPPORT_ZONE_MASK		0x0003
 #define	ATA_SUPPORT_ZONE_NR		0x0000
 #define	ATA_SUPPORT_ZONE_HOST_AWARE	0x0001
 #define	ATA_SUPPORT_ZONE_DEV_MANAGED	0x0002
 	u_int16_t       reserved70;
 /*071*/ u_int16_t       rlsovlap;               /* rel time (us) for overlap */
 /*072*/ u_int16_t       rlsservice;             /* rel time (us) for service */
 	u_int16_t       reserved73;
 	u_int16_t       reserved74;
 /*075*/ u_int16_t       queue;
 #define ATA_QUEUE_LEN(x)                ((x) & 0x001f)
 
 /*76*/  u_int16_t       satacapabilities;
 #define ATA_SATA_GEN1                   0x0002
 #define ATA_SATA_GEN2                   0x0004
 #define ATA_SATA_GEN3                   0x0008
 #define ATA_SUPPORT_NCQ                 0x0100
 #define ATA_SUPPORT_IFPWRMNGTRCV        0x0200
 #define ATA_SUPPORT_PHYEVENTCNT         0x0400
 #define ATA_SUPPORT_NCQ_UNLOAD          0x0800
 #define ATA_SUPPORT_NCQ_PRIO            0x1000
 #define ATA_SUPPORT_HAPST               0x2000
 #define ATA_SUPPORT_DAPST               0x4000
 #define ATA_SUPPORT_READLOGDMAEXT       0x8000
 
 /*77*/  u_int16_t       satacapabilities2;
 #define ATA_SATA_CURR_GEN_MASK          0x0006
 #define ATA_SUPPORT_NCQ_STREAM          0x0010
 /* ATA_SUPPORT_NCQ_QMANAGEMENT is kept as an alias of ATA_SUPPORT_NCQ_NON_DATA. */
-#define ATA_SUPPORT_NCQ_QMANAGEMENT     0x0020
+#define ATA_SUPPORT_NCQ_NON_DATA        0x0020
+#define ATA_SUPPORT_NCQ_QMANAGEMENT     ATA_SUPPORT_NCQ_NON_DATA
 #define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040
 /*78*/  u_int16_t       satasupport;
 #define ATA_SUPPORT_NONZERO             0x0002
 #define ATA_SUPPORT_AUTOACTIVATE        0x0004
 #define ATA_SUPPORT_IFPWRMNGT           0x0008
 #define ATA_SUPPORT_INORDERDATA         0x0010
 #define ATA_SUPPORT_ASYNCNOTIF          0x0020
 #define ATA_SUPPORT_SOFTSETPRESERVE     0x0040
+#define ATA_SUPPORT_NCQ_AUTOSENSE       0x0080
 /*79*/  u_int16_t       sataenabled;
 #define ATA_ENABLED_DAPST               0x0080
 
 /*080*/ u_int16_t       version_major;
 /*081*/ u_int16_t       version_minor;
 
 	/*
 	 * The same three-word layout is instantiated twice: 'support'
 	 * at words 82-84 and 'enabled' at words 85-87.
 	 */
 	struct {
 /*082/085*/ u_int16_t   command1;
 #define ATA_SUPPORT_SMART               0x0001
 #define ATA_SUPPORT_SECURITY            0x0002
 #define ATA_SUPPORT_REMOVABLE           0x0004
 #define ATA_SUPPORT_POWERMGT            0x0008
 #define ATA_SUPPORT_PACKET              0x0010
 #define ATA_SUPPORT_WRITECACHE          0x0020
 #define ATA_SUPPORT_LOOKAHEAD           0x0040
 #define ATA_SUPPORT_RELEASEIRQ          0x0080
 #define ATA_SUPPORT_SERVICEIRQ          0x0100
 #define ATA_SUPPORT_RESET               0x0200
 #define ATA_SUPPORT_PROTECTED           0x0400
 #define ATA_SUPPORT_WRITEBUFFER         0x1000
 #define ATA_SUPPORT_READBUFFER          0x2000
 #define ATA_SUPPORT_NOP                 0x4000
 
 /*083/086*/ u_int16_t   command2;
 #define ATA_SUPPORT_MICROCODE           0x0001
 #define ATA_SUPPORT_QUEUED              0x0002
 #define ATA_SUPPORT_CFA                 0x0004
 #define ATA_SUPPORT_APM                 0x0008
 #define ATA_SUPPORT_NOTIFY              0x0010
 #define ATA_SUPPORT_STANDBY             0x0020
 #define ATA_SUPPORT_SPINUP              0x0040
 #define ATA_SUPPORT_MAXSECURITY         0x0100
 #define ATA_SUPPORT_AUTOACOUSTIC        0x0200
 #define ATA_SUPPORT_ADDRESS48           0x0400
 #define ATA_SUPPORT_OVERLAY             0x0800
 #define ATA_SUPPORT_FLUSHCACHE          0x1000
 #define ATA_SUPPORT_FLUSHCACHE48        0x2000
 
 /*084/087*/ u_int16_t   extension;
 #define ATA_SUPPORT_SMARTLOG		0x0001
 #define ATA_SUPPORT_SMARTTEST		0x0002
 #define ATA_SUPPORT_MEDIASN		0x0004
 #define ATA_SUPPORT_MEDIAPASS		0x0008
 #define ATA_SUPPORT_STREAMING		0x0010
 #define ATA_SUPPORT_GENLOG		0x0020
 #define ATA_SUPPORT_WRITEDMAFUAEXT	0x0040
 #define ATA_SUPPORT_WRITEDMAQFUAEXT	0x0080
 #define ATA_SUPPORT_64BITWWN		0x0100
 #define ATA_SUPPORT_UNLOAD		0x2000
 	} __packed support, enabled;
 
 /*088*/ u_int16_t       udmamodes;              /* UltraDMA modes */
 /*089*/ u_int16_t       erase_time;             /* time req'd in 2min units */
 /*090*/ u_int16_t       enhanced_erase_time;    /* time req'd in 2min units */
 /*091*/ u_int16_t       apm_value;
 /*092*/ u_int16_t       master_passwd_revision; /* password revision code */
 /*093*/ u_int16_t       hwres;
 #define ATA_CABLE_ID                    0x2000
 
 /*094*/ u_int16_t       acoustic;
 #define ATA_ACOUSTIC_CURRENT(x)         ((x) & 0x00ff)
 #define ATA_ACOUSTIC_VENDOR(x)          (((x) & 0xff00) >> 8)
 
 /*095*/ u_int16_t       stream_min_req_size;
 /*096*/ u_int16_t       stream_transfer_time;
 /*097*/ u_int16_t       stream_access_latency;
 /*098*/ u_int32_t       stream_granularity;
 /*100*/ u_int16_t       lba_size48_1;
 	u_int16_t       lba_size48_2;
 	u_int16_t       lba_size48_3;
 	u_int16_t       lba_size48_4;
 	u_int16_t       reserved104;
 /*105*/	u_int16_t       max_dsm_blocks;
 /*106*/	u_int16_t       pss;
 #define ATA_PSS_LSPPS			0x000F
 #define ATA_PSS_LSSABOVE512		0x1000
 #define ATA_PSS_MULTLS			0x2000
 #define ATA_PSS_VALID_MASK		0xC000
 #define ATA_PSS_VALID_VALUE		0x4000
 /*107*/ u_int16_t       isd;
 /*108*/ u_int16_t       wwn[4];
 	u_int16_t       reserved112[5];
 /*117*/ u_int16_t       lss_1;
 /*118*/ u_int16_t       lss_2;
 /*119*/ u_int16_t       support2;
 #define ATA_SUPPORT_WRITEREADVERIFY	0x0002
 #define ATA_SUPPORT_WRITEUNCORREXT	0x0004
 #define ATA_SUPPORT_RWLOGDMAEXT		0x0008
 #define ATA_SUPPORT_MICROCODE3		0x0010
 #define ATA_SUPPORT_FREEFALL		0x0020
 #define ATA_SUPPORT_SENSE_REPORT	0x0040
 #define ATA_SUPPORT_EPC			0x0080
 #define ATA_SUPPORT_AMAX_ADDR		0x0100
 #define ATA_SUPPORT_DSN			0x0200
 /*120*/ u_int16_t       enabled2;
 #define ATA_ENABLED_WRITEREADVERIFY	0x0002
 #define ATA_ENABLED_WRITEUNCORREXT	0x0004
 #define ATA_ENABLED_FREEFALL		0x0020
 #define ATA_ENABLED_SENSE_REPORT	0x0040
 #define ATA_ENABLED_EPC			0x0080
 #define ATA_ENABLED_DSN			0x0200
 	u_int16_t       reserved121[6];
 /*127*/ u_int16_t       removable_status;
 /*128*/ u_int16_t       security_status;
 #define ATA_SECURITY_LEVEL		0x0100	/* 0: high, 1: maximum */
 #define ATA_SECURITY_ENH_SUPP		0x0020	/* enhanced erase supported */
 #define ATA_SECURITY_COUNT_EXP		0x0010	/* count expired */
 #define ATA_SECURITY_FROZEN		0x0008	/* security config is frozen */
 #define ATA_SECURITY_LOCKED		0x0004	/* drive is locked */
 #define ATA_SECURITY_ENABLED		0x0002	/* ATA Security is enabled */
 #define ATA_SECURITY_SUPPORTED		0x0001	/* ATA Security is supported */
 
 	u_int16_t       reserved129[31];
 /*160*/ u_int16_t       cfa_powermode1;
 	u_int16_t       reserved161;
 /*162*/ u_int16_t       cfa_kms_support;
 /*163*/ u_int16_t       cfa_trueide_modes;
 /*164*/ u_int16_t       cfa_memory_modes;
 	u_int16_t       reserved165[3];
 /*168*/ u_int16_t       form_factor;
 #define ATA_FORM_FACTOR_MASK		0x000f
 #define ATA_FORM_FACTOR_NOT_REPORTED	0x0000
 #define ATA_FORM_FACTOR_5_25		0x0001
 #define ATA_FORM_FACTOR_3_5		0x0002
 #define ATA_FORM_FACTOR_2_5		0x0003
 #define ATA_FORM_FACTOR_1_8		0x0004
 #define ATA_FORM_FACTOR_SUB_1_8		0x0005
 #define ATA_FORM_FACTOR_MSATA		0x0006
 #define ATA_FORM_FACTOR_M_2		0x0007
 #define ATA_FORM_FACTOR_MICRO_SSD	0x0008
 #define ATA_FORM_FACTOR_C_FAST		0x0009
 /*169*/	u_int16_t       support_dsm;
 #define ATA_SUPPORT_DSM_TRIM		0x0001
 /*
  * Words 170-175: the former six reserved words are split into an 8-byte
  * product_id (words 170-173) and two remaining reserved words (174-175).
  */
-	u_int16_t       reserved170[6];
+/*170*/ u_int8_t        product_id[8];	/* Additional Product Identifier */
+	u_int16_t       reserved174[2];
 /*176*/ u_int8_t        media_serial[60];
 /*206*/ u_int16_t       sct;
 	u_int16_t       reserved207[2];
 /*209*/ u_int16_t       lsalign;
 /*210*/ u_int16_t       wrv_sectors_m3_1;
 	u_int16_t       wrv_sectors_m3_2;
 /*212*/ u_int16_t       wrv_sectors_m2_1;
 	u_int16_t       wrv_sectors_m2_2;
 /*214*/ u_int16_t       nv_cache_caps;
 /*215*/ u_int16_t       nv_cache_size_1;
 	u_int16_t       nv_cache_size_2;
 /*217*/ u_int16_t       media_rotation_rate;
 #define ATA_RATE_NOT_REPORTED		0x0000
 #define ATA_RATE_NON_ROTATING		0x0001
 	u_int16_t       reserved218;
 /*219*/ u_int16_t       nv_cache_opt;
 /*220*/ u_int16_t       wrv_mode;
 	u_int16_t       reserved221;
 /*222*/ u_int16_t       transport_major;
 /*223*/ u_int16_t       transport_minor;
 	u_int16_t       reserved224[31];
 /*255*/ u_int16_t       integrity;
 } __packed;
 
 /* ATA Dataset Management */
 #define ATA_DSM_BLK_SIZE	512
 #define ATA_DSM_BLK_RANGES	64
 #define ATA_DSM_RANGE_SIZE	8
 #define ATA_DSM_RANGE_MAX	65535
 
 /*
  * ATA Device Register
  *
  * bit 7    Obsolete (was 1 in early ATA specs)
  * bit 6    Sets LBA/CHS mode: 1 = LBA, 0 = CHS
  * bit 5    Obsolete (was 1 in early ATA specs)
  * bit 4    1 = Slave Drive, 0 = Master Drive
  * bits 3-0 In LBA mode, bits 27-24 of the address; in CHS mode, the
  *          head number
  */
 
 #define ATA_DEV_MASTER		0x00
 #define ATA_DEV_SLAVE		0x10
 #define ATA_DEV_LBA		0x40
 
 /* ATA limits */
 #define ATA_MAX_28BIT_LBA	268435455UL
 
 /* ATA Status Register */
 #define ATA_STATUS_ERROR		0x01
 #define ATA_STATUS_SENSE_AVAIL		0x02
 #define ATA_STATUS_ALIGN_ERR		0x04
 #define ATA_STATUS_DATA_REQ		0x08
 #define ATA_STATUS_DEF_WRITE_ERR	0x10
 #define ATA_STATUS_DEVICE_FAULT		0x20
 #define ATA_STATUS_DEVICE_READY		0x40
 #define ATA_STATUS_BUSY			0x80
 
 /* ATA Error Register */
 #define ATA_ERROR_ABORT		0x04
 #define ATA_ERROR_ID_NOT_FOUND	0x10
 
 /* ATA HPA Features */
 #define ATA_HPA_FEAT_MAX_ADDR	0x00
 #define ATA_HPA_FEAT_SET_PWD	0x01
 #define ATA_HPA_FEAT_LOCK	0x02
 #define ATA_HPA_FEAT_UNLOCK	0x03
 #define ATA_HPA_FEAT_FREEZE	0x04
 
 /*
  * ATA transfer modes.
  * ATA_MODE_MASK covers the low nibble of a mode value and ATA_DMA_MASK
  * the high nibble.
  */
 #define	ATA_MODE_MASK		0x0f
 #define	ATA_DMA_MASK		0xf0
 /* PIO modes */
 #define	ATA_PIO			0x00
 #define	ATA_PIO0		0x08
 #define	ATA_PIO1		0x09
 #define	ATA_PIO2		0x0a
 #define	ATA_PIO3		0x0b
 #define	ATA_PIO4		0x0c
 #define	ATA_PIO_MAX		0x0f
 /* DMA modes: generic, WDMA, UDMA and the SA150/SA300/SA600 values */
 #define	ATA_DMA			0x10
 #define	ATA_WDMA0		0x20
 #define	ATA_WDMA1		0x21
 #define	ATA_WDMA2		0x22
 #define	ATA_UDMA0		0x40
 #define	ATA_UDMA1		0x41
 #define	ATA_UDMA2		0x42
 #define	ATA_UDMA3		0x43
 #define	ATA_UDMA4		0x44
 #define	ATA_UDMA5		0x45
 #define	ATA_UDMA6		0x46
 #define	ATA_SA150		0x47
 #define	ATA_SA300		0x48
 #define	ATA_SA600		0x49
 #define	ATA_DMA_MAX		0x4f
 
 
 /*
  * ATA commands.
  *
  * Unindented names are command opcodes, kept in ascending numeric order.
  * The indented names listed after a command appear to be the values used
  * together with that command (for example the ATA_SF_* feature codes
  * listed under ATA_SETFEATURES).  Each name is defined exactly once.
  */
 #define ATA_NOP                         0x00    /* NOP */
 #define         ATA_NF_FLUSHQUEUE       0x00    /* flush queued cmd's */
 #define         ATA_NF_AUTOPOLL         0x01    /* start autopoll function */
 #define ATA_DATA_SET_MANAGEMENT		0x06	/* data set management */
 #define 	ATA_DSM_TRIM		0x01
 #define ATA_DEVICE_RESET                0x08    /* reset device */
 #define ATA_READ                        0x20    /* read */
 #define ATA_READ48                      0x24    /* read 48bit LBA */
 #define ATA_READ_DMA48                  0x25    /* read DMA 48bit LBA */
 #define ATA_READ_DMA_QUEUED48           0x26    /* read DMA QUEUED 48bit LBA */
 #define ATA_READ_NATIVE_MAX_ADDRESS48   0x27    /* read native max addr 48bit */
 #define ATA_READ_MUL48                  0x29    /* read multi 48bit LBA */
 #define ATA_READ_STREAM_DMA48           0x2a    /* read DMA stream 48bit LBA */
 #define ATA_READ_STREAM48               0x2b    /* read stream 48bit LBA */
 #define ATA_READ_LOG_EXT                0x2f    /* read log ext - PIO Data-In */
 #define ATA_WRITE                       0x30    /* write */
 #define ATA_WRITE48                     0x34    /* write 48bit LBA */
 #define ATA_WRITE_DMA48                 0x35    /* write DMA 48bit LBA */
 #define ATA_WRITE_DMA_QUEUED48          0x36    /* write DMA QUEUED 48bit LBA*/
 #define ATA_SET_MAX_ADDRESS48           0x37    /* set max address 48bit */
 #define ATA_WRITE_MUL48                 0x39    /* write multi 48bit LBA */
 #define ATA_WRITE_STREAM_DMA48          0x3a    /* write DMA stream 48bit LBA */
 #define ATA_WRITE_STREAM48              0x3b    /* write stream 48bit LBA */
 #define ATA_WRITE_DMA_FUA48             0x3d    /* write DMA FUA 48bit LBA */
 #define ATA_WRITE_DMA_QUEUED_FUA48      0x3e    /* write DMA QUEUED FUA 48bit */
 #define ATA_WRITE_LOG_EXT               0x3f    /* write log ext */
 #define ATA_READ_VERIFY                 0x40    /* read verify */
 #define ATA_READ_VERIFY48               0x42    /* read verify 48bit LBA */
 #define ATA_WRITE_UNCORRECTABLE48       0x45    /* write uncorrectable 48bit LBA */
 #define         ATA_WU_PSEUDO           0x55    /* pseudo-uncorrectable error */
 #define         ATA_WU_FLAGGED          0xaa    /* flagged-uncorrectable error */
 #define ATA_READ_LOG_DMA_EXT            0x47    /* read log DMA ext */
 #define	ATA_ZAC_MANAGEMENT_IN		0x4a	/* ZAC management in */
 #define		ATA_ZM_REPORT_ZONES	0x00	/* report zones */
 #define	ATA_WRITE_LOG_DMA_EXT		0x57	/* WRITE LOG DMA EXT */
 #define	ATA_TRUSTED_NON_DATA		0x5b	/* TRUSTED NON-DATA */
 #define	ATA_TRUSTED_RECEIVE		0x5c	/* TRUSTED RECEIVE */
 #define	ATA_TRUSTED_RECEIVE_DMA		0x5d	/* TRUSTED RECEIVE DMA */
 #define	ATA_TRUSTED_SEND		0x5e	/* TRUSTED SEND */
 #define	ATA_TRUSTED_SEND_DMA		0x5f	/* TRUSTED SEND DMA */
 #define ATA_READ_FPDMA_QUEUED           0x60    /* read DMA NCQ */
 #define ATA_WRITE_FPDMA_QUEUED          0x61    /* write DMA NCQ */
 #define ATA_NCQ_NON_DATA		0x63	/* NCQ non-data command */
 #define		ATA_ABORT_NCQ_QUEUE	0x00	/* abort NCQ queue */
 #define		ATA_DEADLINE_HANDLING	0x01	/* deadline handling */
 #define		ATA_SET_FEATURES	0x05	/* set features */
 #define		ATA_ZERO_EXT		0x06	/* zero ext */
 #define		ATA_NCQ_ZAC_MGMT_OUT	0x07	/* NCQ ZAC mgmt out no data */
 #define ATA_SEND_FPDMA_QUEUED           0x64    /* send DMA NCQ */
 #define		ATA_SFPDMA_DSM		0x00	/* Data set management */
 #define			ATA_SFPDMA_DSM_TRIM	0x01	/* Set trim bit in auxiliary */
 #define		ATA_SFPDMA_HYBRID_EVICT	0x01	/* Hybrid Evict */
 #define		ATA_SFPDMA_WLDMA	0x02	/* Write Log DMA EXT */
 #define		ATA_SFPDMA_ZAC_MGMT_OUT	0x03	/* NCQ ZAC mgmt out w/data */
 #define ATA_RECV_FPDMA_QUEUED           0x65    /* receive DMA NCQ */
 #define		ATA_RFPDMA_RL_DMA_EXT	0x00	/* Read Log DMA EXT */
 #define		ATA_RFPDMA_ZAC_MGMT_IN	0x02	/* NCQ ZAC mgmt in w/data */
 #define ATA_SEP_ATTN                    0x67    /* SEP request */
 #define ATA_SEEK                        0x70    /* seek */
 #define	ATA_AMAX_ADDR			0x78	/* Accessible Max Address */
 #define		ATA_AMAX_ADDR_GET	0x00	/* GET NATIVE MAX ADDRESS EXT */
 #define		ATA_AMAX_ADDR_SET	0x01	/* SET ACCESSIBLE MAX ADDRESS EXT */
 #define		ATA_AMAX_ADDR_FREEZE	0x02	/* FREEZE ACCESSIBLE MAX ADDRESS EXT */
 #define	ATA_DOWNLOAD_MICROCODE		0x92	/* DOWNLOAD MICROCODE */
 #define	ATA_DOWNLOAD_MICROCODE_DMA	0x93	/* DOWNLOAD MICROCODE DMA */
 #define	ATA_ZAC_MANAGEMENT_OUT		0x9f	/* ZAC management out */
 #define		ATA_ZM_CLOSE_ZONE	0x01	/* close zone */
 #define		ATA_ZM_FINISH_ZONE	0x02	/* finish zone */
 #define		ATA_ZM_OPEN_ZONE	0x03	/* open zone */
 #define		ATA_ZM_RWP		0x04	/* reset write pointer */
 #define ATA_PACKET_CMD                  0xa0    /* packet command */
 #define ATA_ATAPI_IDENTIFY              0xa1    /* get ATAPI params*/
 #define ATA_SERVICE                     0xa2    /* service command */
 #define ATA_SMART_CMD                   0xb0    /* SMART command */
 #define	ATA_SANITIZE			0xb4	/* sanitize device */
 #define ATA_CFA_ERASE                   0xc0    /* CFA erase */
 #define ATA_READ_MUL                    0xc4    /* read multi */
 #define ATA_WRITE_MUL                   0xc5    /* write multi */
 #define ATA_SET_MULTI                   0xc6    /* set multi size */
 #define ATA_READ_DMA_QUEUED             0xc7    /* read DMA QUEUED */
 #define ATA_READ_DMA                    0xc8    /* read DMA */
 #define ATA_WRITE_DMA                   0xca    /* write DMA */
 #define ATA_WRITE_DMA_QUEUED            0xcc    /* write DMA QUEUED */
 #define ATA_WRITE_MUL_FUA48             0xce    /* write multi FUA 48bit LBA */
 #define ATA_STANDBY_IMMEDIATE           0xe0    /* standby immediate */
 #define ATA_IDLE_IMMEDIATE              0xe1    /* idle immediate */
 #define ATA_STANDBY_CMD                 0xe2    /* standby */
 #define ATA_IDLE_CMD                    0xe3    /* idle */
 #define ATA_READ_BUFFER                 0xe4    /* read buffer */
 #define ATA_READ_PM                     0xe4    /* read portmultiplier */
 #define ATA_CHECK_POWER_MODE            0xe5    /* device power mode */
 #define ATA_SLEEP                       0xe6    /* sleep */
 #define ATA_FLUSHCACHE                  0xe7    /* flush cache to disk */
 #define	ATA_WRITE_BUFFER		0xe8    /* write buffer */
 #define ATA_WRITE_PM                    0xe8    /* write portmultiplier */
 #define	ATA_READ_BUFFER_DMA		0xe9    /* read buffer DMA */
 #define ATA_FLUSHCACHE48                0xea    /* flush cache to disk */
 #define	ATA_WRITE_BUFFER_DMA		0xeb    /* write buffer DMA */
 #define ATA_ATA_IDENTIFY                0xec    /* get ATA params */
 #define ATA_SETFEATURES                 0xef    /* features command */
 #define         ATA_SF_ENAB_WCACHE      0x02    /* enable write cache */
 #define         ATA_SF_DIS_WCACHE       0x82    /* disable write cache */
 #define         ATA_SF_SETXFER          0x03    /* set transfer mode */
 #define		ATA_SF_APM		0x05	/* Enable APM feature set */
 #define         ATA_SF_ENAB_PUIS        0x06    /* enable PUIS */
 #define         ATA_SF_DIS_PUIS         0x86    /* disable PUIS */
 #define         ATA_SF_PUIS_SPINUP      0x07    /* PUIS spin-up */
 #define		ATA_SF_WRV		0x0b	/* Enable Write-Read-Verify */
 #define 	ATA_SF_DLC		0x0c	/* Enable device life control */
 #define 	ATA_SF_SATA		0x10	/* Enable use of SATA feature */
 #define 	ATA_SF_FFC		0x41	/* Free-fall Control */
 #define 	ATA_SF_MHIST		0x43	/* Set Max Host Sect. Times */
 #define 	ATA_SF_RATE		0x45	/* Set Rate Basis */
 #define 	ATA_SF_EPC		0x4A	/* Extended Power Conditions */
 #define         ATA_SF_ENAB_RCACHE      0xaa    /* enable readahead cache */
 #define         ATA_SF_DIS_RCACHE       0x55    /* disable readahead cache */
 #define         ATA_SF_ENAB_RELIRQ      0x5d    /* enable release interrupt */
 #define         ATA_SF_DIS_RELIRQ       0xdd    /* disable release interrupt */
 #define         ATA_SF_ENAB_SRVIRQ      0x5e    /* enable service interrupt */
 #define         ATA_SF_DIS_SRVIRQ       0xde    /* disable service interrupt */
 #define 	ATA_SF_LPSAERC		0x62	/* Long Phys Sect Align ErrRep*/
 #define 	ATA_SF_DSN		0x63	/* Device Stats Notification */
 #define ATA_SECURITY_SET_PASSWORD       0xf1    /* set drive password */
 #define ATA_SECURITY_UNLOCK             0xf2    /* unlock drive using passwd */
 #define ATA_SECURITY_ERASE_PREPARE      0xf3    /* prepare to erase drive */
 #define ATA_SECURITY_ERASE_UNIT         0xf4    /* erase all blocks on drive */
 #define ATA_SECURITY_FREEZE_LOCK        0xf5    /* freeze security config */
 #define ATA_SECURITY_DISABLE_PASSWORD   0xf6    /* disable drive password */
 #define ATA_READ_NATIVE_MAX_ADDRESS     0xf8    /* read native max address */
 #define ATA_SET_MAX_ADDRESS             0xf9    /* set max address */
 
 
 /*
  * ATAPI commands.
  * Indented names are values listed under the command they follow.
  */
 #define	ATAPI_TEST_UNIT_READY		0x00	/* check if device is ready */
 #define	ATAPI_REZERO			0x01	/* rewind */
 #define	ATAPI_REQUEST_SENSE		0x03	/* get sense data */
 #define	ATAPI_FORMAT			0x04	/* format unit */
 #define	ATAPI_READ			0x08	/* read data */
 #define	ATAPI_WRITE			0x0a	/* write data */
 #define	ATAPI_WEOF			0x10	/* write filemark */
 #define		ATAPI_WF_WRITE		0x01
 #define	ATAPI_SPACE			0x11	/* space command */
 #define		ATAPI_SP_FM		0x01
 #define		ATAPI_SP_EOD		0x03
 #define	ATAPI_INQUIRY			0x12	/* get inquiry data */
 #define	ATAPI_MODE_SELECT		0x15	/* mode select */
 #define	ATAPI_ERASE			0x19	/* erase */
 #define	ATAPI_MODE_SENSE		0x1a	/* mode sense */
 #define	ATAPI_START_STOP		0x1b	/* start/stop unit */
 #define		ATAPI_SS_LOAD		0x01
 #define		ATAPI_SS_RETENSION	0x02
 #define		ATAPI_SS_EJECT		0x04
 #define	ATAPI_PREVENT_ALLOW		0x1e	/* media removal */
 #define	ATAPI_READ_FORMAT_CAPACITIES	0x23	/* get format capacities */
 #define	ATAPI_READ_CAPACITY		0x25	/* get volume capacity */
 #define	ATAPI_READ_BIG			0x28	/* read data */
 #define	ATAPI_WRITE_BIG			0x2a	/* write data */
 #define	ATAPI_LOCATE			0x2b	/* locate to position */
 #define	ATAPI_READ_POSITION		0x34	/* read position */
 #define	ATAPI_SYNCHRONIZE_CACHE		0x35	/* flush buf, close channel */
 #define	ATAPI_WRITE_BUFFER		0x3b	/* write device buffer */
 #define	ATAPI_READ_BUFFER		0x3c	/* read device buffer */
 #define	ATAPI_READ_SUBCHANNEL		0x42	/* get subchannel info */
 #define	ATAPI_READ_TOC			0x43	/* get table of contents */
 #define	ATAPI_PLAY_10			0x45	/* play by lba */
 #define	ATAPI_PLAY_MSF			0x47	/* play by MSF address */
 #define	ATAPI_PLAY_TRACK		0x48	/* play by track number */
 #define	ATAPI_PAUSE			0x4b	/* pause audio operation */
 #define	ATAPI_READ_DISK_INFO		0x51	/* get disk info structure */
 #define	ATAPI_READ_TRACK_INFO		0x52	/* get track info structure */
 #define	ATAPI_RESERVE_TRACK		0x53	/* reserve track */
 #define	ATAPI_SEND_OPC_INFO		0x54	/* send OPC structure */
 #define	ATAPI_MODE_SELECT_BIG		0x55	/* set device parameters */
 #define	ATAPI_REPAIR_TRACK		0x58	/* repair track */
 #define	ATAPI_READ_MASTER_CUE		0x59	/* read master CUE info */
 #define	ATAPI_MODE_SENSE_BIG		0x5a	/* get device parameters */
 #define	ATAPI_CLOSE_TRACK		0x5b	/* close track/session */
 #define	ATAPI_READ_BUFFER_CAPACITY	0x5c	/* get buffer capacity */
 #define	ATAPI_SEND_CUE_SHEET		0x5d	/* send CUE sheet */
 #define	ATAPI_SERVICE_ACTION_IN		0x96	/* get service data */
 #define	ATAPI_BLANK			0xa1	/* blank the media */
 #define	ATAPI_SEND_KEY			0xa3	/* send DVD key structure */
 #define	ATAPI_REPORT_KEY		0xa4	/* get DVD key structure */
 #define	ATAPI_PLAY_12			0xa5	/* play by lba */
 #define	ATAPI_LOAD_UNLOAD		0xa6	/* changer control command */
 #define	ATAPI_READ_STRUCTURE		0xad	/* get DVD structure */
 #define	ATAPI_PLAY_CD			0xb4	/* universal play command */
 #define	ATAPI_SET_SPEED			0xbb	/* set drive speed */
 #define	ATAPI_MECH_STATUS		0xbd	/* get changer status */
 #define	ATAPI_READ_CD			0xbe	/* read data */
 #define	ATAPI_POLL_DSC			0xff	/* poll DSC status bit */
 
 
 /*
  * Argument structure of the IOCATADEVICES ioctl: a channel number plus
  * two entries, each with a 32-byte name and a struct ata_params.
  */
 struct ata_ioc_devices {
 	int			channel;
 	char			name[2][32];
 	struct ata_params	params[2];
 };
 
 /* Per-channel ATA ioctl calls (ioctl group 'a', numbers 1-5) */
 #define	IOCATAGMAXCHANNEL	_IOR('a',  1, int)
 #define	IOCATAREINIT		_IOW('a',  2, int)
 #define	IOCATAATTACH		_IOW('a',  3, int)
 #define	IOCATADETACH		_IOW('a',  4, int)
 #define	IOCATADEVICES		_IOWR('a',  5, struct ata_ioc_devices)
 
 /*
  * ATAPI request sense structure.
  *
  * The fields add up to 18 bytes when packed.  The ATA_SENSE_* constants
  * placed after a field are the masks and values used with that field.
  */
 struct atapi_sense {
     u_int8_t	error;				/* current or deferred errors */
 #define	ATA_SENSE_VALID			0x80
     u_int8_t	segment;			/* segment number */
     u_int8_t	key;				/* sense key */
 #define ATA_SENSE_KEY_MASK		0x0f    /* sense key mask */
 #define ATA_SENSE_NO_SENSE		0x00    /* no specific sense key info */
 #define ATA_SENSE_RECOVERED_ERROR	0x01    /* command OK, data recovered */
 #define ATA_SENSE_NOT_READY		0x02    /* no access to drive */
 #define ATA_SENSE_MEDIUM_ERROR		0x03    /* non-recovered data error */
 #define ATA_SENSE_HARDWARE_ERROR	0x04    /* non-recoverable HW failure */
 #define ATA_SENSE_ILLEGAL_REQUEST	0x05    /* invalid command param(s) */
 #define ATA_SENSE_UNIT_ATTENTION	0x06    /* media changed */
 #define ATA_SENSE_DATA_PROTECT		0x07    /* write protect */
 #define ATA_SENSE_BLANK_CHECK		0x08    /* blank check */
 #define ATA_SENSE_VENDOR_SPECIFIC	0x09    /* vendor specific skey */
 #define ATA_SENSE_COPY_ABORTED		0x0a    /* copy aborted */
 #define ATA_SENSE_ABORTED_COMMAND	0x0b    /* command aborted, try again */
 #define ATA_SENSE_EQUAL			0x0c    /* equal */
 #define ATA_SENSE_VOLUME_OVERFLOW	0x0d    /* volume overflow */
 #define ATA_SENSE_MISCOMPARE		0x0e    /* data dont match the medium */
 #define ATA_SENSE_RESERVED		0x0f
 /*
  * Bits outside ATA_SENSE_KEY_MASK (presumably flags of the same byte).
  * These must expand to bare constants, with no trailing ';', so that
  * they can be used inside expressions such as (key & ATA_SENSE_ILI).
  */
 #define	ATA_SENSE_ILI			0x20
 #define	ATA_SENSE_EOM			0x40
 #define	ATA_SENSE_FILEMARK		0x80
     u_int32_t   cmd_info;		/* cmd information */
     u_int8_t	sense_length;		/* additional sense len (n-7) */
     u_int32_t   cmd_specific_info;	/* additional cmd spec info */
     u_int8_t    asc;			/* additional sense code */
     u_int8_t    ascq;			/* additional sense code qual */
     u_int8_t    replaceable_unit_code;	/* replaceable unit code */
     u_int8_t	specific;		/* sense key specific */
 #define	ATA_SENSE_SPEC_VALID	0x80
 #define	ATA_SENSE_SPEC_MASK	0x7f
     u_int8_t	specific1;		/* sense key specific */
     u_int8_t	specific2;		/* sense key specific */
 } __packed;
 
 /*
  * SET FEATURES subcommands.
  *
  * Extended Power Conditions (EPC) subcommand -- ATA_SF_EPC (0x4A).
  * The values below select the EPC operation and go in LBA bits 3:0.
  */
 #define	ATA_SF_EPC_RESTORE	0x00	/* Restore Power Condition Settings */
 #define	ATA_SF_EPC_GOTO		0x01	/* Go To Power Condition */
 #define	ATA_SF_EPC_SET_TIMER	0x02	/* Set Power Condition Timer */
 #define	ATA_SF_EPC_SET_STATE	0x03	/* Set Power Condition State */
 #define	ATA_SF_EPC_ENABLE	0x04	/* Enable the EPC feature set */
 #define	ATA_SF_EPC_DISABLE	0x05	/* Disable the EPC feature set */
 #define	ATA_SF_EPC_SET_SOURCE	0x06	/* Set EPC Power Source */
 
 /*
  * SET FEATURES command, Extended Power Conditions subcommand
  * (ATA_SF_EPC, 0x4A): Power Condition ID field.
  * These values go in the count register.
  */
 #define	ATA_EPC_STANDBY_Z	0x00	/* Substate of PM2:Standby */
 #define	ATA_EPC_STANDBY_Y	0x01	/* Substate of PM2:Standby */
 #define	ATA_EPC_IDLE_A		0x81	/* Substate of PM1:Idle */
 #define	ATA_EPC_IDLE_B		0x82	/* Substate of PM1:Idle */
 #define	ATA_EPC_IDLE_C		0x83	/* Substate of PM1:Idle */
 #define	ATA_EPC_ALL		0xff	/* All supported power conditions */
 
 /*
  * SET FEATURES command, Extended Power Conditions subcommand
  * (ATA_SF_EPC, 0x4A): Restore Power Condition Settings subcommand.
  * These values go in the LBA register.
  */
 #define	ATA_SF_EPC_RST_DFLT	0x40	/* 1=Rst from Default, 0= from Saved */
 #define	ATA_SF_EPC_RST_SAVE	0x10	/* 1=Save on completion */
 
 /*
  * SET FEATURES command, Extended Power Conditions subcommand
  * (ATA_SF_EPC, 0x4A): Go To Power Condition subcommand.
  * These values go in the LBA register.
  */
 #define	ATA_SF_EPC_GOTO_DELAY	0x02000000	/* Delayed entry bit */
 #define	ATA_SF_EPC_GOTO_HOLD	0x01000000	/* Hold Power Cond bit */
 
 /*
  * SET FEATURES command, Extended Power Conditions subcommand
  * (ATA_SF_EPC, 0x4A): Set Power Condition Timer subcommand.
  * These values go in the LBA register; the timer field is the 16 bits
  * selected by ATA_SF_EPC_TIMER_MASK, starting at ATA_SF_EPC_TIMER_SHIFT.
  */
 #define	ATA_SF_EPC_TIMER_MASK	0x00ffff00	/* Timer field */
 #define	ATA_SF_EPC_TIMER_SHIFT	8
 #define	ATA_SF_EPC_TIMER_SEC	0x00000080	/* Timer units, 1=sec, 0=.1s */
 #define	ATA_SF_EPC_TIMER_EN	0x00000020	/* Enable/disable cond. */
 #define	ATA_SF_EPC_TIMER_SAVE	0x00000010	/* Save settings on comp.  */
 
 /*
  * SET FEATURES command, Extended Power Conditions subcommand
  * (ATA_SF_EPC, 0x4A): Set Power Condition State subcommand.
  * These values go in the LBA register.
  */
 #define	ATA_SF_EPC_SETCON_EN	0x00000020	/* Enable power cond. */
 #define	ATA_SF_EPC_SETCON_SAVE	0x00000010	/* Save settings on comp */
 
 /*
  * SET FEATURES command, Extended Power Conditions subcommand
  * (ATA_SF_EPC, 0x4A): Set EPC Power Source subcommand.
  * These values go in the count register.
  */
 #define	ATA_SF_EPC_SRC_UNKNOWN	0x0000	/* Unknown source */
 #define	ATA_SF_EPC_SRC_BAT	0x0001	/* battery source */
 #define	ATA_SF_EPC_SRC_NOT_BAT	0x0002	/* not battery source */
 
 /*
  * Log addresses (unindented) and, indented beneath each log, the page
  * numbers defined for it.
  */
 #define	ATA_LOG_DIRECTORY		0x00	/* Directory of all logs */
 #define	ATA_POWER_COND_LOG		0x08	/* Power Conditions Log */
 #define		ATA_PCL_IDLE		0x00	/* Idle Power Conditions Page */
 #define		ATA_PCL_STANDBY		0x01	/* Standby Power Conditions Page */
 #define	ATA_IDENTIFY_DATA_LOG		0x30	/* Identify Device Data Log */
 #define		ATA_IDL_PAGE_LIST	0x00	/* List of supported pages */
 #define		ATA_IDL_IDENTIFY_DATA	0x01	/* Copy of Identify Device data */
 #define		ATA_IDL_CAPACITY	0x02	/* Capacity */
 #define		ATA_IDL_SUP_CAP		0x03	/* Supported Capabilities */
 #define		ATA_IDL_CUR_SETTINGS	0x04	/* Current Settings */
 #define		ATA_IDL_ATA_STRINGS	0x05	/* ATA Strings */
 #define		ATA_IDL_SECURITY	0x06	/* Security */
 #define		ATA_IDL_PARALLEL_ATA	0x07	/* Parallel ATA */
 #define		ATA_IDL_SERIAL_ATA	0x08	/* Serial ATA */
 #define		ATA_IDL_ZDI		0x09	/* Zoned Device Information */
 
 /*
  * Log directory page: a 2-byte header followed by 255 two-byte entries,
  * 512 bytes in total.
  */
 struct ata_gp_log_dir {
 	uint8_t	header[2];
 #define	ATA_GP_LOG_DIR_VERSION		0x0001
 	uint8_t	num_pages[255*2];	/* Number of log pages at address */
 };
 
 /*
  * ATA Power Conditions log descriptor.
  * One 64-byte entry; the ATA_PCL_* masks apply to the flags byte.
  */
 struct ata_power_cond_log_desc {
 	uint8_t	reserved1;
 	uint8_t	flags;
 #define	ATA_PCL_COND_SUPPORTED		0x80
 #define	ATA_PCL_COND_SAVEABLE		0x40
 #define	ATA_PCL_COND_CHANGEABLE		0x20
 #define	ATA_PCL_DEFAULT_TIMER_EN	0x10
 #define	ATA_PCL_SAVED_TIMER_EN		0x08
 #define	ATA_PCL_CURRENT_TIMER_EN	0x04
 #define	ATA_PCL_HOLD_PC_NOT_SUP		0x02
 	uint8_t	reserved2[2];
 	uint8_t	default_timer[4];
 	uint8_t	saved_timer[4];
 	uint8_t	current_timer[4];
 	uint8_t	nom_time_to_active[4];
 	uint8_t	min_timer[4];
 	uint8_t	max_timer[4];
 	uint8_t	num_transitions_to_pc[4];
 	uint8_t	hours_in_pc[4];
 	uint8_t	reserved3[28];
 };
 
 /*
  * ATA Power Conditions Log (0x08), Idle power conditions page (0x00):
  * descriptors for Idle_a, Idle_b and Idle_c followed by reserved bytes.
  */
 struct ata_power_cond_log_idle {
 	struct ata_power_cond_log_desc	idle_a_desc;
 	struct ata_power_cond_log_desc	idle_b_desc;
 	struct ata_power_cond_log_desc	idle_c_desc;
 	uint8_t				reserved[320];
 };
 
 /*
  * ATA Power Conditions Log (0x08), Standby power conditions page (0x01):
  * reserved bytes followed by the Standby_y and Standby_z descriptors.
  */
 struct ata_power_cond_log_standby {
 	uint8_t				reserved[384];
 	struct ata_power_cond_log_desc	standby_y_desc;
 	struct ata_power_cond_log_desc	standby_z_desc;
 };
 
 /*
  * ATA IDENTIFY DEVICE data log (0x30), page 0x00:
  * list of supported IDENTIFY DEVICE data pages (512 bytes in total).
  */
 struct ata_identify_log_pages {
 	uint8_t	header[8];
 #define	ATA_IDLOG_REVISION	0x0000000000000001
 	uint8_t	entry_count;
 	uint8_t	entries[503];
 };
 
 /*
  * ATA IDENTIFY DEVICE data log (0x30), Capacity page (0x02).
  * Five 8-byte fields followed by reserved bytes, 512 bytes in total.
  * The masks after each field apply to that field's 64-bit value.
  */
 struct ata_identify_log_capacity {
 	uint8_t	header[8];
 #define	ATA_CAP_HEADER_VALID	0x8000000000000000
 #define	ATA_CAP_PAGE_NUM_MASK	0x0000000000ff0000
 #define	ATA_CAP_PAGE_NUM_SHIFT	16
 #define	ATA_CAP_REV_MASK	0x00000000000000ff
 	uint8_t	capacity[8];
 #define	ATA_CAP_CAPACITY_VALID	0x8000000000000000
 #define	ATA_CAP_ACCESSIBLE_CAP	0x0000ffffffffffff
 	uint8_t	phys_logical_sect_size[8];
 #define	ATA_CAP_PL_VALID	0x8000000000000000
 #define	ATA_CAP_LTOP_REL_SUP	0x4000000000000000
 #define	ATA_CAP_LOG_SECT_SUP	0x2000000000000000
 #define	ATA_CAP_ALIGN_ERR_MASK	0x0000000000300000
 #define	ATA_CAP_LTOP_MASK	0x00000000000f0000
 #define	ATA_CAP_LOG_SECT_OFF	0x000000000000ffff
 	uint8_t	logical_sect_size[8];
 #define	ATA_CAP_LOG_SECT_VALID	0x8000000000000000
 #define	ATA_CAP_LOG_SECT_SIZE	0x00000000ffffffff
 	uint8_t	nominal_buffer_size[8];
 #define	ATA_CAP_NOM_BUF_VALID	0x8000000000000000
 #define	ATA_CAP_NOM_BUF_SIZE	0x7fffffffffffffff
 	uint8_t	reserved[472];
 };
 
 /*
  * ATA IDENTIFY DEVICE data log (0x30), Supported Capabilities page (0x03).
  * The page is 512 bytes in total.  The masks after each field apply to
  * that field's value; the two 16-byte fields have no masks defined.
  */
 struct ata_identify_log_sup_cap {
 	uint8_t	header[8];
 #define	ATA_SUP_CAP_HEADER_VALID	0x8000000000000000
 #define	ATA_SUP_CAP_PAGE_NUM_MASK	0x0000000000ff0000
 #define	ATA_SUP_CAP_PAGE_NUM_SHIFT	16
 #define	ATA_SUP_CAP_REV_MASK		0x00000000000000ff
 	uint8_t	sup_cap[8];
 #define	ATA_SUP_CAP_VALID		0x8000000000000000
 #define	ATA_SC_SET_SECT_CONFIG_SUP	0x0002000000000000 /* Set Sect Conf*/
 #define	ATA_SC_ZERO_EXT_SUP		0x0001000000000000 /* Zero EXT */
 #define	ATA_SC_SUCC_NCQ_SENSE_SUP	0x0000800000000000 /* Succ. NCQ Sns */
 #define	ATA_SC_DLC_SUP			0x0000400000000000 /* DLC */
 #define	ATA_SC_RQSN_DEV_FAULT_SUP	0x0000200000000000 /* Req Sns Dev Flt*/
 #define	ATA_SC_DSN_SUP			0x0000100000000000 /* DSN */
 #define	ATA_SC_LP_STANDBY_SUP		0x0000080000000000 /* LP Standby */
 #define	ATA_SC_SET_EPC_PS_SUP		0x0000040000000000 /* Set EPC PS */
 #define	ATA_SC_AMAX_ADDR_SUP		0x0000020000000000 /* AMAX Addr */
 #define	ATA_SC_DRAT_SUP			0x0000008000000000 /* DRAT */
 #define	ATA_SC_LPS_MISALGN_SUP		0x0000004000000000 /* LPS Misalign */
 #define	ATA_SC_RB_DMA_SUP		0x0000001000000000 /* Read Buf DMA */
 #define	ATA_SC_WB_DMA_SUP		0x0000000800000000 /* Write Buf DMA */
 #define	ATA_SC_DNLD_MC_DMA_SUP		0x0000000200000000 /* DL MCode DMA */
 #define	ATA_SC_28BIT_SUP		0x0000000100000000 /* 28-bit */
 #define	ATA_SC_RZAT_SUP			0x0000000080000000 /* RZAT */
 #define	ATA_SC_NOP_SUP			0x0000000020000000 /* NOP */
 #define	ATA_SC_READ_BUFFER_SUP		0x0000000010000000 /* Read Buffer */
 #define	ATA_SC_WRITE_BUFFER_SUP		0x0000000008000000 /* Write Buffer */
 #define	ATA_SC_READ_LOOK_AHEAD_SUP	0x0000000002000000 /* Read Look-Ahead*/
 #define	ATA_SC_VOLATILE_WC_SUP		0x0000000001000000 /* Volatile WC */
 #define	ATA_SC_SMART_SUP		0x0000000000800000 /* SMART */
 #define	ATA_SC_FLUSH_CACHE_EXT_SUP	0x0000000000400000 /* Flush Cache Ext */
 #define	ATA_SC_48BIT_SUP		0x0000000000100000 /* 48-Bit */
 #define	ATA_SC_SPINUP_SUP		0x0000000000040000 /* Spin-Up */
 #define	ATA_SC_PUIS_SUP			0x0000000000020000 /* PUIS */
 #define	ATA_SC_APM_SUP			0x0000000000010000 /* APM */
 #define	ATA_SC_DL_MICROCODE_SUP		0x0000000000004000 /* DL Microcode */
 #define	ATA_SC_UNLOAD_SUP		0x0000000000002000 /* Unload */
 #define	ATA_SC_WRITE_FUA_EXT_SUP	0x0000000000001000 /* Write FUA EXT */
 #define	ATA_SC_GPL_SUP			0x0000000000000800 /* GPL */
 #define	ATA_SC_STREAMING_SUP		0x0000000000000400 /* Streaming */
 #define	ATA_SC_SMART_SELFTEST_SUP	0x0000000000000100 /* SMART self-test */
 #define	ATA_SC_SMART_ERR_LOG_SUP	0x0000000000000080 /* SMART Err Log */
 #define	ATA_SC_EPC_SUP			0x0000000000000040 /* EPC */
 #define	ATA_SC_SENSE_SUP		0x0000000000000020 /* Sense data */
 #define	ATA_SC_FREEFALL_SUP		0x0000000000000010 /* Free-Fall */
 #define	ATA_SC_DM_MODE3_SUP		0x0000000000000008 /* DM Mode 3 */
 #define	ATA_SC_GPL_DMA_SUP		0x0000000000000004 /* GPL DMA */
 #define	ATA_SC_WRITE_UNCOR_SUP		0x0000000000000002 /* Write uncorr.  */
 #define	ATA_SC_WRV_SUP			0x0000000000000001 /* WRV */
 	uint8_t	download_code_cap[8];
 #define	ATA_DL_CODE_VALID		0x8000000000000000
 #define	ATA_DLC_DM_OFFSETS_DEFER_SUP	0x0000000400000000
 #define	ATA_DLC_DM_IMMED_SUP		0x0000000200000000
 #define	ATA_DLC_DM_OFF_IMMED_SUP	0x0000000100000000
 #define	ATA_DLC_DM_MAX_XFER_SIZE_MASK	0x00000000ffff0000
 #define	ATA_DLC_DM_MAX_XFER_SIZE_SHIFT	16
 #define	ATA_DLC_DM_MIN_XFER_SIZE_MASK	0x000000000000ffff
 	uint8_t	nom_media_rotation_rate[8];
 #define	ATA_NOM_MEDIA_ROTATION_VALID	0x8000000000000000
 #define	ATA_ROTATION_MASK		0x000000000000ffff
 	uint8_t	form_factor[8];
 #define	ATA_FORM_FACTOR_VALID		0x8000000000000000
 #define	ATA_FF_MASK			0x000000000000000f
 #define	ATA_FF_NOT_REPORTED		0x0000000000000000 /* Not reported */
 #define	ATA_FF_525_IN			0x0000000000000001 /* 5.25 inch */
 #define	ATA_FF_35_IN			0x0000000000000002 /* 3.5 inch */
 #define	ATA_FF_25_IN			0x0000000000000003 /* 2.5 inch */
 #define	ATA_FF_18_IN			0x0000000000000004 /* 1.8 inch */
 #define	ATA_FF_LT_18_IN			0x0000000000000005 /* < 1.8 inch */
 #define	ATA_FF_MSATA			0x0000000000000006 /* mSATA */
 #define	ATA_FF_M2			0x0000000000000007 /* M.2 */
 #define	ATA_FF_MICROSSD			0x0000000000000008 /* MicroSSD */
 #define	ATA_FF_CFAST			0x0000000000000009 /* CFast */
 	uint8_t	wrv_sec_cnt_mode3[8];
 #define	ATA_WRV_MODE3_VALID		0x8000000000000000
 #define	ATA_WRV_MODE3_COUNT		0x00000000ffffffff
 	uint8_t	wrv_sec_cnt_mode2[8];
 #define	ATA_WRV_MODE2_VALID		0x8000000000000000
 #define	ATA_WRV_MODE2_COUNT		0x00000000ffffffff
 	uint8_t	wwn[16];
 	/* XXX KDM need to figure out how to handle 128-bit fields */
 	uint8_t	dsm[8];
 #define	ATA_DSM_VALID			0x8000000000000000
 #define	ATA_LB_MARKUP_SUP		0x000000000000ff00
 #define	ATA_TRIM_SUP			0x0000000000000001
 	uint8_t	util_per_unit_time[16];
 	/* XXX KDM need to figure out how to handle 128-bit fields */
 	uint8_t	util_usage_rate_sup[8];
 #define	ATA_UTIL_USAGE_RATE_VALID	0x8000000000000000
 #define	ATA_SETTING_RATE_SUP		0x0000000000800000
 #define	ATA_SINCE_POWERON_SUP		0x0000000000000100
 #define	ATA_POH_RATE_SUP		0x0000000000000010
 #define	ATA_DATE_TIME_RATE_SUP		0x0000000000000001
 	uint8_t	zoned_cap[8];
 #define	ATA_ZONED_VALID			0x8000000000000000
 #define	ATA_ZONED_MASK			0x0000000000000003
 	uint8_t	sup_zac_cap[8];
 #define	ATA_SUP_ZAC_CAP_VALID		0x8000000000000000
 #define	ATA_ND_RWP_SUP			0x0000000000000010 /* Reset Write Ptr*/
 #define	ATA_ND_FINISH_ZONE_SUP		0x0000000000000008 /* Finish Zone */
 #define	ATA_ND_CLOSE_ZONE_SUP		0x0000000000000004 /* Close Zone */
 #define	ATA_ND_OPEN_ZONE_SUP		0x0000000000000002 /* Open Zone */
 #define	ATA_REPORT_ZONES_SUP		0x0000000000000001 /* Report Zones */
 	uint8_t	reserved[392];
 };
 
 /*
  * ATA Identify Device Data Log, Zoned Device Information page (0x09).
  * Current as of ZAC r04a, August 25, 2015.
  * Seven 8-byte fields followed by reserved bytes, 512 bytes in total.
  */
 struct ata_zoned_info_log {
 	uint8_t	header[8];
 #define	ATA_ZDI_HEADER_VALID	0x8000000000000000
 #define	ATA_ZDI_PAGE_NUM_MASK	0x0000000000ff0000
 #define	ATA_ZDI_PAGE_NUM_SHIFT	16
 #define	ATA_ZDI_REV_MASK	0x00000000000000ff
 	uint8_t	zoned_cap[8];
 #define	ATA_ZDI_CAP_VALID	0x8000000000000000
 #define	ATA_ZDI_CAP_URSWRZ	0x0000000000000001
 	uint8_t	zoned_settings[8];
 #define	ATA_ZDI_SETTINGS_VALID	0x8000000000000000
 	uint8_t	optimal_seq_zones[8];
 #define	ATA_ZDI_OPT_SEQ_VALID	0x8000000000000000
 #define	ATA_ZDI_OPT_SEQ_MASK	0x00000000ffffffff
 	uint8_t	optimal_nonseq_zones[8];
 #define	ATA_ZDI_OPT_NS_VALID	0x8000000000000000
 #define	ATA_ZDI_OPT_NS_MASK	0x00000000ffffffff
 	uint8_t	max_seq_req_zones[8];
 #define	ATA_ZDI_MAX_SEQ_VALID	0x8000000000000000
 #define	ATA_ZDI_MAX_SEQ_MASK	0x00000000ffffffff
 	uint8_t	version_info[8];
 #define	ATA_ZDI_VER_VALID	0x8000000000000000
 #define	ATA_ZDI_VER_ZAC_SUP	0x0100000000000000
 #define	ATA_ZDI_VER_ZAC_MASK	0x00000000000000ff
 	uint8_t	reserved[456];
 };
 
 /*
  * Argument structure of the IOCATAREQUEST ioctl.
  * The union holds either the ATA form of a request (command, feature,
  * lba, count) or the ATAPI form (a 16-byte ccb plus sense data); the
  * ATA_CMD_* values are defined for the flags member.
  */
 struct ata_ioc_request {
 	union {
 		struct {
 			u_int8_t		command;
 			u_int8_t		feature;
 			u_int64_t		lba;
 			u_int16_t		count;
 		} ata;
 		struct {
 			char			ccb[16];
 			struct atapi_sense	sense;
 		} atapi;
 	} u;
 	caddr_t		data;
 	int		count;
 	int		flags;
 #define	ATA_CMD_CONTROL			0x01
 #define	ATA_CMD_READ			0x02
 #define	ATA_CMD_WRITE			0x04
 #define	ATA_CMD_ATAPI			0x08
 	int		timeout;
 	int		error;
 };
 
 /*
  * Security password data block.
  * The ATA_SECURITY_* values are defined for the ctrl word; with 16-bit
  * words the members add up to 512 bytes.
  */
 struct ata_security_password {
 	u_int16_t	ctrl;
 #define	ATA_SECURITY_PASSWORD_USER	0x0000
 #define	ATA_SECURITY_PASSWORD_MASTER	0x0001
 #define	ATA_SECURITY_ERASE_NORMAL	0x0000
 #define	ATA_SECURITY_ERASE_ENHANCED	0x0002
 #define	ATA_SECURITY_LEVEL_HIGH		0x0000
 #define	ATA_SECURITY_LEVEL_MAXIMUM	0x0100
 	u_int8_t	password[32];
 	u_int16_t	revision;
 	u_int16_t	reserved[238];
 };
 
 /* Per-device ATA ioctl calls (ioctl group 'a', numbers 100-105) */
 #define	IOCATAREQUEST		_IOWR('a', 100, struct ata_ioc_request)
 #define	IOCATAGPARM		_IOR('a', 101, struct ata_params)
 #define	IOCATAGMODE		_IOR('a', 102, int)
 #define	IOCATASMODE		_IOW('a', 103, int)
 #define	IOCATAGSPINDOWN		_IOR('a', 104, int)
 #define	IOCATASSPINDOWN		_IOW('a', 105, int)
 
 
 /*
  * RAID configuration argument used by the IOCATARAIDCREATE and
  * IOCATARAIDADDSPARE ioctls.  The AR_* values following the type and
  * status members are the constants defined for them.
  */
 struct ata_ioc_raid_config {
 	int	lun;
 	int	type;
 #define	AR_JBOD				0x0001
 #define	AR_SPAN				0x0002
 #define	AR_RAID0			0x0004
 #define	AR_RAID1			0x0008
 #define	AR_RAID01			0x0010
 #define	AR_RAID3			0x0020
 #define	AR_RAID4			0x0040
 #define	AR_RAID5			0x0080
 	int	interleave;
 	int	status;
 #define	AR_READY			1
 #define	AR_DEGRADED			2
 #define	AR_REBUILDING			4
 	int	progress;
 	int	total_disks;
 	int	disks[16];
 };
 
 /*
  * RAID status argument used by the IOCATARAIDSTATUS ioctl.
  * Each of the 16 disks entries carries a state (AR_DISK_* values) and
  * a lun.
  */
 struct ata_ioc_raid_status {
 	int	lun;
 	int	type;
 	int	interleave;
 	int	status;
 	int	progress;
 	int	total_disks;
 	struct {
 		int	state;
 #define	AR_DISK_ONLINE			0x01
 #define	AR_DISK_PRESENT			0x02
 #define	AR_DISK_SPARE			0x04
 		int	lun;
 	} disks[16];
 };
 
 /* ATA RAID ioctl calls (ioctl group 'a', numbers 200-204) */
 #define	IOCATARAIDCREATE	_IOWR('a', 200, struct ata_ioc_raid_config)
 #define	IOCATARAIDDELETE	_IOW('a', 201, int)
 #define	IOCATARAIDSTATUS	_IOWR('a', 202, struct ata_ioc_raid_status)
 #define	IOCATARAIDADDSPARE	_IOW('a', 203, struct ata_ioc_raid_config)
 #define	IOCATARAIDREBUILD	_IOW('a', 204, int)
 
 #endif /* _SYS_ATA_H_ */
Index: projects/fuse2/sys/vm/swap_pager.c
===================================================================
--- projects/fuse2/sys/vm/swap_pager.c	(revision 350434)
+++ projects/fuse2/sys/vm/swap_pager.c	(revision 350435)
@@ -1,3004 +1,2991 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1998 Matthew Dillon,
  * Copyright (c) 1994 John S. Dyson
  * Copyright (c) 1990 University of Utah.
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *				New Swap System
  *				Matthew Dillon
  *
  * Radix Bitmap 'blists'.
  *
  *	- The new swapper uses the new radix bitmap code.  This should scale
  *	  to arbitrarily small or arbitrarily large swap spaces and an almost
  *	  arbitrary degree of fragmentation.
  *
  * Features:
  *
  *	- on the fly reallocation of swap during putpages.  The new system
  *	  does not try to keep previously allocated swap blocks for dirty
  *	  pages.
  *
  *	- on the fly deallocation of swap
  *
  *	- No more garbage collection required.  Unnecessarily allocated swap
  *	  blocks only exist for dirty vm_page_t's now and these are already
  *	  cycled (in a high-load system) by the pager.  We also do on-the-fly
  *	  removal of invalidated swap blocks when a page is destroyed
  *	  or renamed.
  *
  * from: Utah $Hdr: swap_pager.c 1.4 91/04/30$
  *
  *	@(#)swap_pager.c	8.9 (Berkeley) 3/21/94
  *	@(#)vm_swap.c	8.5 (Berkeley) 2/17/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/bio.h>
 #include <sys/blist.h>
 #include <sys/buf.h>
 #include <sys/conf.h>
 #include <sys/disk.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
 #include <sys/pctrie.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/racct.h>
 #include <sys/resource.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/sbuf.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_param.h>
 #include <vm/swap_pager.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
 
 #include <geom/geom.h>
 
 /*
  * MAX_PAGEOUT_CLUSTER must be a power of 2 between 1 and 64.
  * The 64-page limit is due to the radix code (kern/subr_blist.c).
  */
 /* Default pageout cluster size, in pages; may be overridden at build time. */
 #ifndef MAX_PAGEOUT_CLUSTER
 #define	MAX_PAGEOUT_CLUSTER	32
 #endif
 
 /*
  * Upper bound on the contiguous-run scan in swap_pager_haspage();
  * defaults to MAX_PAGEOUT_CLUSTER unless already defined.
  */
 #if !defined(SWB_NPAGES)
 #define SWB_NPAGES	MAX_PAGEOUT_CLUSTER
 #endif
 
 /* Number of page slots tracked by one struct swblk. */
 #define	SWAP_META_PAGES		PCTRIE_COUNT
 
 /*
  * A swblk structure maps each page index within a
  * SWAP_META_PAGES-aligned and sized range to the address of an
  * on-disk swap block (or SWAPBLK_NONE). The collection of these
  * mappings for an entire vm object is implemented as a pc-trie.
  */
 struct swblk {
 	vm_pindex_t	p;			/* pc-trie key (see PCTRIE_DEFINE) */
 	daddr_t		d[SWAP_META_PAGES];	/* one swap block address per page slot */
 };
 
 /* malloc(9) type for the swap pager's private allocations. */
 static MALLOC_DEFINE(M_VMPGDATA, "vm_pgdata", "swap pager private data");
 /* Held while walking swtailq and updating per-device/avail accounting. */
 static struct mtx sw_dev_mtx;
 /* List of configured swap devices. */
 static TAILQ_HEAD(, swdevt) swtailq = TAILQ_HEAD_INITIALIZER(swtailq);
 static struct swdevt *swdevhd;	/* Allocate from here next */
 static int nswapdev;		/* Number of swap devices */
 /* Free swap, in pages; adjusted by the get/free swap space routines. */
 int swap_pager_avail;
 static struct sx swdev_syscall_lock;	/* serialize swap(on|off) */
 
 /*
  * Global swap accounting, kept in pages: swap_reserved is the amount
  * currently reserved, swap_total the amount configured.  Both are
  * exported in bytes through sysctl_page_shift(), which hands a 64-bit
  * value to sysctl_handle_64(); the format string is therefore "QU"
  * (unsigned 64-bit integer) rather than "A" (string).
  */
 static u_long swap_reserved;
 static u_long swap_total;
 static int sysctl_page_shift(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, swap_reserved, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &swap_reserved, 0, sysctl_page_shift, "QU",
     "Amount of swap storage needed to back all allocated anonymous memory.");
 SYSCTL_PROC(_vm, OID_AUTO, swap_total, CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
     &swap_total, 0, sysctl_page_shift, "QU",
     "Total amount of available swap storage.");
 
 /* Bit mask of SWAP_RESERVE_* policy flags, consulted by swap_reserve_by_cred(). */
 static int overcommit = 0;
 SYSCTL_INT(_vm, VM_OVERCOMMIT, overcommit, CTLFLAG_RW, &overcommit, 0,
     "Configure virtual memory overcommit behavior. See tuning(7) "
     "for details.");
 /* Bytes of swblk metadata actually reserved; set in swap_pager_swap_init(). */
 static unsigned long swzone;
 SYSCTL_ULONG(_vm, OID_AUTO, swzone, CTLFLAG_RD, &swzone, 0,
     "Actual size of swap metadata zone");
 /* Pages of swap the metadata zone can describe; set in swap_pager_swap_init(). */
 static unsigned long swap_maxpages;
 SYSCTL_ULONG(_vm, OID_AUTO, swap_maxpages, CTLFLAG_RD, &swap_maxpages, 0,
     "Maximum amount of swap supported");
 
 /*
  * bits from overcommit
  *
  * FORCE_ON:       refuse reservations that exceed the computed limit.
  * RLIMIT_ON:      enforce RLIMIT_SWAP on the per-uid reservation.
  * ALLOW_NONWIRED: add non-wired, non-reserved physical pages to the limit.
  */
 #define	SWAP_RESERVE_FORCE_ON		(1 << 0)
 #define	SWAP_RESERVE_RLIMIT_ON		(1 << 1)
 #define	SWAP_RESERVE_ALLOW_NONWIRED	(1 << 2)
 
 static int
 sysctl_page_shift(SYSCTL_HANDLER_ARGS)
 {
 	uint64_t newval;
 	u_long value = *(u_long *)arg1;
 
 	newval = ((uint64_t)value) << PAGE_SHIFT;
 	return (sysctl_handle_64(oidp, &newval, 0, req));
 }
 
 /*
  * Reserve "incr" bytes of swap on behalf of the current thread's
  * credential.  Returns whatever swap_reserve_by_cred() returns.
  */
 int
 swap_reserve(vm_ooffset_t incr)
 {
 	struct ucred *cred;
 
 	cred = curthread->td_ucred;
 	return (swap_reserve_by_cred(incr, cred));
 }
 
 /*
  * Charge "incr" bytes of swap reservation to the global swap_reserved
  * counter and to the real uid of "cred".
  *
  * "incr" must be a multiple of the page size (asserted); all counters
  * touched here are kept in pages.  The request is refused when:
  *  - RACCT is enabled and racct_add() rejects it;
  *  - SWAP_RESERVE_FORCE_ON is set, the new global total exceeds the
  *    limit (swap_total, plus non-wired memory if
  *    SWAP_RESERVE_ALLOW_NONWIRED is set) and the current thread lacks
  *    PRIV_VM_SWAP_NOQUOTA;
  *  - SWAP_RESERVE_RLIMIT_ON is set, the new per-uid total exceeds
  *    RLIMIT_SWAP and the current thread lacks PRIV_VM_SWAP_NORLIMIT.
  *
  * Returns 1 if the reservation was made, 0 if it was refused.  On
  * refusal every counter adjusted by this function is rolled back and a
  * rate-limited message is printed.
  */
 int
 swap_reserve_by_cred(vm_ooffset_t incr, struct ucred *cred)
 {
 	u_long r, s, prev, pincr;
 	int res, error;
 	static int curfail;
 	static struct timeval lastfail;
 	struct uidinfo *uip;
 
 	uip = cred->cr_ruidinfo;
 
 	KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__,
 	    (uintmax_t)incr));
 
 #ifdef RACCT
 	if (racct_enable) {
 		PROC_LOCK(curproc);
 		error = racct_add(curproc, RACCT_SWAP, incr);
 		PROC_UNLOCK(curproc);
 		if (error != 0)
 			return (0);
 	}
 #endif
 
 	/* Optimistically add to the global counter, then validate. */
 	pincr = atop(incr);
 	res = 0;
 	prev = atomic_fetchadd_long(&swap_reserved, pincr);
 	r = prev + pincr;
 	if (overcommit & SWAP_RESERVE_ALLOW_NONWIRED) {
 		s = vm_cnt.v_page_count - vm_cnt.v_free_reserved -
 		    vm_wire_count();
 	} else
 		s = 0;
 	s += swap_total;
 	if ((overcommit & SWAP_RESERVE_FORCE_ON) == 0 || r <= s ||
 	    (error = priv_check(curthread, PRIV_VM_SWAP_NOQUOTA)) == 0) {
 		res = 1;
 	} else {
 		prev = atomic_fetchadd_long(&swap_reserved, -pincr);
 		if (prev < pincr)
 			panic("swap_reserved < incr on overcommit fail");
 	}
 	if (res) {
 		prev = atomic_fetchadd_long(&uip->ui_vmsize, pincr);
 		/*
 		 * ui_vmsize is counted in pages while the resource limit is
 		 * a byte count, so convert the limit before comparing.
 		 */
 		if ((overcommit & SWAP_RESERVE_RLIMIT_ON) != 0 &&
 		    prev + pincr > atop(lim_cur(curthread, RLIMIT_SWAP)) &&
 		    priv_check(curthread, PRIV_VM_SWAP_NORLIMIT)) {
 			res = 0;
 			prev = atomic_fetchadd_long(&uip->ui_vmsize, -pincr);
 			if (prev < pincr)
 				panic("uip->ui_vmsize < incr on overcommit fail");
 			/*
 			 * The global counter was already charged above; undo
 			 * that as well, otherwise a refused request would
 			 * stay accounted in swap_reserved forever.
 			 */
 			prev = atomic_fetchadd_long(&swap_reserved, -pincr);
 			if (prev < pincr)
 				panic("swap_reserved < incr on overcommit fail");
 		}
 	}
 	if (!res && ppsratecheck(&lastfail, &curfail, 1)) {
 		printf("uid %d, pid %d: swap reservation for %jd bytes failed\n",
 		    uip->ui_uid, curproc->p_pid, (intmax_t)incr);
 	}
 
 #ifdef RACCT
 	/* Return the RACCT charge taken at the top if we ended up refusing. */
 	if (racct_enable && !res) {
 		PROC_LOCK(curproc);
 		racct_sub(curproc, RACCT_SWAP, incr);
 		PROC_UNLOCK(curproc);
 	}
 #endif
 
 	return (res);
 }
 
 /*
  * Unconditionally charge "incr" bytes of swap reservation to the current
  * process: no limit is consulted and nothing is returned.  "incr" must
  * be a multiple of the page size (asserted).  The counters are updated
  * with the process lock held.
  */
 void
 swap_reserve_force(vm_ooffset_t incr)
 {
 	u_long npages;
 
 	KASSERT((incr & PAGE_MASK) == 0, ("%s: incr: %ju & PAGE_MASK", __func__,
 	    (uintmax_t)incr));
 
 	npages = atop(incr);
 
 	PROC_LOCK(curproc);
 #ifdef RACCT
 	if (racct_enable)
 		racct_add_force(curproc, RACCT_SWAP, incr);
 #endif
 	atomic_add_long(&swap_reserved, npages);
 	atomic_add_long(&curproc->p_ucred->cr_ruidinfo->ui_vmsize, npages);
 	PROC_UNLOCK(curproc);
 }
 
 /*
  * Release "decr" bytes of swap reservation charged to the current
  * process's credential.  The process lock is held across the call to
  * swap_release_by_cred().
  */
 void
 swap_release(vm_ooffset_t decr)
 {
 
 	PROC_LOCK(curproc);
 	swap_release_by_cred(decr, curproc->p_ucred);
 	PROC_UNLOCK(curproc);
 }
 
 /*
  * Release "decr" bytes of swap reservation from the global counter and
  * from the real uid of "cred".  "decr" must be a multiple of the page
  * size (asserted).  Underflow of the global counter panics; underflow of
  * the per-uid counter is only reported.
  */
 void
 swap_release_by_cred(vm_ooffset_t decr, struct ucred *cred)
 {
 	struct uidinfo *uip;
 	u_long npages, old;
 
 	KASSERT((decr & PAGE_MASK) == 0, ("%s: decr: %ju & PAGE_MASK", __func__,
 	    (uintmax_t)decr));
 
 	uip = cred->cr_ruidinfo;
 	npages = atop(decr);
 
 	/* Global reservation. */
 	old = atomic_fetchadd_long(&swap_reserved, -npages);
 	if (old < npages)
 		panic("swap_reserved < decr");
 
 	/* Per-uid reservation. */
 	old = atomic_fetchadd_long(&uip->ui_vmsize, -npages);
 	if (old < npages)
 		printf("negative vmsize for uid = %d\n", uip->ui_uid);
 #ifdef RACCT
 	if (racct_enable)
 		racct_sub_cred(cred, RACCT_SWAP, decr);
 #endif
 }
 
 /*
  * Flag for swp_pager_meta_ctl(): callers that pass it (swap_pager_copy(),
  * swap_pager_unswapped()) take ownership of the returned block.
  */
 #define SWM_POP		0x01	/* pop out			*/
 
 static int swap_pager_full = 2;	/* swap space exhaustion (task killing) */
 static int swap_pager_almost_full = 1; /* swap space exhaustion (w/hysteresis)*/
 static struct mtx swbuf_mtx;	/* to sync nsw_wcount_async */
 static int nsw_wcount_async;	/* limit async write buffers */
 static int nsw_wcount_async_max;/* assigned maximum			*/
 static int nsw_cluster_max;	/* maximum VOP I/O allowed		*/
 
 /* vm.swap_async_max: read/write integer served by sysctl_swap_async_max(). */
 static int sysctl_swap_async_max(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, swap_async_max, CTLTYPE_INT | CTLFLAG_RW |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_async_max, "I",
     "Maximum running async swap ops");
 /* vm.swap_fragmentation: read-only text report, one section per device. */
 static int sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS);
 SYSCTL_PROC(_vm, OID_AUTO, swap_fragmentation, CTLTYPE_STRING | CTLFLAG_RD |
     CTLFLAG_MPSAFE, NULL, 0, sysctl_swap_fragmentation, "A",
     "Swap Fragmentation Info");
 
 /* Taken around lookups, inserts and removals on swap_pager_object_list[]. */
 static struct sx sw_alloc_sx;
 
 /*
  * "named" and "unnamed" anon region objects.  Try to reduce the overhead
  * of searching a named list by hashing it just a little.
  */
 
 /* Number of hash buckets for named objects; must be a power of two. */
 #define NOBJLISTS		8
 
 /*
  * Map a handle to its bucket using address bits 4 and up.  The argument
  * is parenthesized so that any expression may be passed, and the shift
  * is done on an unsigned value; the selected bucket is unchanged.
  */
 #define NOBJLIST(handle)	\
 	(&swap_pager_object_list[((uintptr_t)(handle) >> 4) & (NOBJLISTS-1)])
 
 /* Hash buckets of named swap objects, indexed through NOBJLIST(). */
 static struct pagerlst	swap_pager_object_list[NOBJLISTS];
 /* pbuf zones created in swap_pager_swap_init() ("swwbuf" / "swrbuf"). */
 static uma_zone_t swwbuf_zone;
 static uma_zone_t swrbuf_zone;
 /* Zone of struct swblk metadata blocks. */
 static uma_zone_t swblk_zone;
 /* Zone of pc-trie nodes; used by swblk_trie_alloc()/swblk_trie_free(). */
 static uma_zone_t swpctrie_zone;
 
 /*
  * pagerops for OBJT_SWAP - "swap pager".  Some ops are also global procedure
  * calls hooked from other parts of the VM system and do not appear here.
  * (see vm/swap_pager.h).
  */
 /* Forward declarations of the pager operations installed in swappagerops. */
 static vm_object_t
 		swap_pager_alloc(void *handle, vm_ooffset_t size,
 		    vm_prot_t prot, vm_ooffset_t offset, struct ucred *);
 static void	swap_pager_dealloc(vm_object_t object);
 static int	swap_pager_getpages(vm_object_t, vm_page_t *, int, int *,
     int *);
 static int	swap_pager_getpages_async(vm_object_t, vm_page_t *, int, int *,
     int *, pgo_getpages_iodone_t, void *);
 static void	swap_pager_putpages(vm_object_t, vm_page_t *, int, boolean_t, int *);
 static boolean_t
 		swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before, int *after);
 static void	swap_pager_init(void);
 static void	swap_pager_unswapped(vm_page_t);
 /* Not a pager op; declared here for use later in the file. */
 static void	swap_pager_swapoff(struct swdevt *sp);
 
 /* Operations vector for the swap pager. */
 struct pagerops swappagerops = {
 	.pgo_init =	swap_pager_init,	/* early system initialization of pager	*/
 	.pgo_alloc =	swap_pager_alloc,	/* allocate an OBJT_SWAP object		*/
 	.pgo_dealloc =	swap_pager_dealloc,	/* deallocate an OBJT_SWAP object	*/
 	.pgo_getpages =	swap_pager_getpages,	/* pagein				*/
 	.pgo_getpages_async = swap_pager_getpages_async, /* pagein (async)		*/
 	.pgo_putpages =	swap_pager_putpages,	/* pageout				*/
 	.pgo_haspage =	swap_pager_haspage,	/* get backing store status for page	*/
 	.pgo_pageunswapped = swap_pager_unswapped,	/* remove swap related to page		*/
 };
 
 /*
  * swap_*() routines are externally accessible.  swp_*() routines are
  * internal.
  */
 static int nswap_lowat = 128;	/* in pages, below this swap_pager_almost_full is set */
 static int nswap_hiwat = 512;	/* in pages, above this swap_pager_almost_full is cleared */
 
 SYSCTL_INT(_vm, OID_AUTO, dmmax, CTLFLAG_RD, &nsw_cluster_max, 0,
     "Maximum size of a swap block in pages");
 
 /* Internal helpers defined later in this file. */
 static void	swp_sizecheck(void);
 static void	swp_pager_async_iodone(struct buf *bp);
 static bool	swp_pager_swblk_empty(struct swblk *sb, int start, int limit);
 static int	swapongeom(struct vnode *);
 static int	swaponvp(struct thread *, struct vnode *, u_long);
 static int	swapoff_one(struct swdevt *sp, struct ucred *cred);
 
 /*
  * Swap bitmap functions
  */
 static void	swp_pager_freeswapspace(daddr_t blk, daddr_t npages);
 static daddr_t	swp_pager_getswapspace(int *npages, int limit);
 
 /*
  * Metadata functions
  */
 static daddr_t swp_pager_meta_build(vm_object_t, vm_pindex_t, daddr_t);
 static void swp_pager_meta_free(vm_object_t, vm_pindex_t, vm_pindex_t);
 static void swp_pager_meta_free_all(vm_object_t);
 static daddr_t swp_pager_meta_ctl(vm_object_t, vm_pindex_t, int);
 
 /*
  * Reset a pending free range to "empty": zero blocks starting at
  * SWAPBLK_NONE.  Used together with swp_pager_update_freerange().
  */
 static void
 swp_pager_init_freerange(daddr_t *start, daddr_t *num)
 {
 
 	*num = 0;
 	*start = SWAPBLK_NONE;
 }
 
 /*
  * Add block "addr" to the pending free range [*start, *start + *num).
  * If "addr" directly follows the range it is appended; otherwise the
  * pending range is handed to swp_pager_freeswapspace() and a new
  * one-block range is started at "addr".
  */
 static void
 swp_pager_update_freerange(daddr_t *start, daddr_t *num, daddr_t addr)
 {
 
 	if (*start + *num != addr) {
 		/* Not contiguous: flush what we have and start over. */
 		swp_pager_freeswapspace(*start, *num);
 		*start = addr;
 		*num = 1;
 		return;
 	}
 	(*num)++;
 }
 
 /*
  * Node allocator handed to PCTRIE_DEFINE().  Never sleeps (M_NOWAIT);
  * when called from the page daemon process it additionally passes
  * M_USE_RESERVE.  "ptree" is not used.
  */
 static void *
 swblk_trie_alloc(struct pctrie *ptree)
 {
 	int flags;
 
 	flags = M_NOWAIT;
 	if (curproc == pageproc)
 		flags |= M_USE_RESERVE;
 	return (uma_zalloc(swpctrie_zone, flags));
 }
 
 /*
  * Node deallocator handed to PCTRIE_DEFINE(): returns "node" to
  * swpctrie_zone.  "ptree" is not used.
  */
 static void
 swblk_trie_free(struct pctrie *ptree, void *node)
 {
 
 	uma_zfree(swpctrie_zone, node);
 }
 
 /*
  * Instantiate the SWAP pc-trie over struct swblk, keyed by its "p"
  * member, using the allocator and deallocator above.
  */
 PCTRIE_DEFINE(SWAP, swblk, p, swblk_trie_alloc, swblk_trie_free);
 
 /*
  * SWP_SIZECHECK() -	update swap_pager_full indication
  *
  *	update the swap_pager_almost_full indication and warn when we are
  *	about to run out of swap space, using lowat/hiwat hysteresis.
  *
  *	Clear swap_pager_full ( task killing ) indication when lowat is met.
  *
  *	No restrictions on call
  *	This routine may not block.
  */
 static void
 swp_sizecheck(void)
 {
 
 	if (swap_pager_avail >= nswap_lowat) {
 		/*
 		 * At or above the low watermark: stop task killing, and
 		 * drop the almost-full state only once the high watermark
 		 * has been passed as well.
 		 */
 		swap_pager_full = 0;
 		if (swap_pager_avail > nswap_hiwat)
 			swap_pager_almost_full = 0;
 		return;
 	}
 
 	/* Below the low watermark: warn once per transition. */
 	if (swap_pager_almost_full == 0) {
 		printf("swap_pager: out of swap space\n");
 		swap_pager_almost_full = 1;
 	}
 }
 
 /*
  * SWAP_PAGER_INIT() -	initialize the swap pager!
  *
  *	Expected to be started from system init.  NOTE:  This code is run
  *	before much else so be careful what you depend on.  Most of the VM
  *	system has yet to be initialized at this point.
  */
 static void
 swap_pager_init(void)
 {
 	int bucket;
 
 	/* Start with every named-object hash bucket empty. */
 	for (bucket = 0; bucket < NOBJLISTS; bucket++)
 		TAILQ_INIT(&swap_pager_object_list[bucket]);
 
 	/* Create the device-list mutex and the two sx locks. */
 	mtx_init(&sw_dev_mtx, "swapdev", NULL, MTX_DEF);
 	sx_init(&sw_alloc_sx, "swspsx");
 	sx_init(&swdev_syscall_lock, "swsysc");
 }
 
 /*
  * SWAP_PAGER_SWAP_INIT() - swap pager initialization from pageout process
  *
  *	Expected to be started from pageout process once, prior to entering
  *	its main loop.
  */
 /*
  * Set the I/O clustering and async-write limits, create the pbuf,
  * pc-trie and swblk zones, reserve KVA for the metadata zones, and
  * publish the resulting capacity in swap_maxpages and swzone.
  */
 void
 swap_pager_swap_init(void)
 {
 	unsigned long n, n2;
 
 	/*
 	 * Number of in-transit swap bp operations.  Don't
 	 * exhaust the pbufs completely.  Make sure we
 	 * initialize workable values (0 will work for hysteresis
 	 * but it isn't very efficient).
 	 *
 	 * The nsw_cluster_max is constrained by the bp->b_pages[]
 	 * array, which has MAXPHYS / PAGE_SIZE entries, and our locally
 	 * defined MAX_PAGEOUT_CLUSTER.   Also be aware that swap ops are
 	 * constrained by the swap device interleave stripe size.
 	 *
 	 * Currently we hardwire nsw_wcount_async to 4.  This limit is
 	 * designed to prevent other I/O from having high latencies due to
 	 * our pageout I/O.  The value 4 works well for one or two active swap
 	 * devices but is probably a little low if you have more.  Even so,
 	 * a higher value would probably generate only a limited improvement
 	 * with three or four active swap devices since the system does not
 	 * typically have to pageout at extreme bandwidths.   We will want
 	 * at least 2 per swap devices, and 4 is a pretty good value if you
 	 * have one NFS swap device due to the command/ack latency over NFS.
 	 * So it all works out pretty well.
 	 */
 	nsw_cluster_max = min(MAXPHYS / PAGE_SIZE, MAX_PAGEOUT_CLUSTER);
 
 	nsw_wcount_async = 4;
 	nsw_wcount_async_max = nsw_wcount_async;
 	mtx_init(&swbuf_mtx, "async swbuf mutex", NULL, MTX_DEF);
 
 	/* Write buffers get a quarter of nswbuf, read buffers half. */
 	swwbuf_zone = pbuf_zsecond_create("swwbuf", nswbuf / 4);
 	swrbuf_zone = pbuf_zsecond_create("swrbuf", nswbuf / 2);
 
 	/*
 	 * Initialize our zone, taking the user's requested size or
 	 * estimating the number we need based on the number of pages
 	 * in the system.
 	 */
 	n = maxswzone != 0 ? maxswzone / sizeof(struct swblk) :
 	    vm_cnt.v_page_count / 2;
 	swpctrie_zone = uma_zcreate("swpctrie", pctrie_node_size(), NULL, NULL,
 	    pctrie_zone_init, NULL, UMA_ALIGN_PTR, UMA_ZONE_VM);
 	if (swpctrie_zone == NULL)
 		panic("failed to create swap pctrie zone.");
 	swblk_zone = uma_zcreate("swblk", sizeof(struct swblk), NULL, NULL,
 	    NULL, NULL, _Alignof(struct swblk) - 1, UMA_ZONE_VM);
 	if (swblk_zone == NULL)
 		panic("failed to create swap blk zone.");
 	/* Remember the original request so a reduction can be reported. */
 	n2 = n;
 	do {
 		if (uma_zone_reserve_kva(swblk_zone, n))
 			break;
 		/*
 		 * if the allocation failed, try a zone two thirds the
 		 * size of the previous attempt.
 		 */
 		n -= ((n + 2) / 3);
 	} while (n > 0);
 
 	/*
 	 * Often uma_zone_reserve_kva() cannot reserve exactly the
 	 * requested size.  Account for the difference when
 	 * calculating swap_maxpages.
 	 */
 	n = uma_zone_get_max(swblk_zone);
 
 	if (n < n2)
 		printf("Swap blk zone entries changed from %lu to %lu.\n",
 		    n2, n);
 	/* Each swblk describes SWAP_META_PAGES pages. */
 	swap_maxpages = n * SWAP_META_PAGES;
 	swzone = n * sizeof(struct swblk);
 	/* A failure here is only reported; initialization continues. */
 	if (!uma_zone_reserve_kva(swpctrie_zone, n))
 		printf("Cannot reserve swap pctrie zone, "
 		    "reduce kern.maxswzone.\n");
 }
 
 /*
  * Create a new OBJT_SWAP object large enough for "size" bytes starting
  * at "offset" and record "handle" in it.  When "cred" is not NULL the
  * swap for "size" bytes is first reserved against it; if that fails
  * NULL is returned, otherwise a reference on "cred" is taken and the
  * credential and charge are stored in the object.
  */
 static vm_object_t
 swap_pager_alloc_init(void *handle, struct ucred *cred, vm_ooffset_t size,
     vm_ooffset_t offset)
 {
 	vm_object_t obj;
 
 	if (cred != NULL) {
 		if (swap_reserve_by_cred(size, cred) == 0)
 			return (NULL);
 		crhold(cred);
 	}
 
 	/*
 	 * vm_object_allocate() sets up the un_pager.swp.swp_blks trie
 	 * itself so that other threads observe it in the correct order.
 	 */
 	obj = vm_object_allocate(OBJT_SWAP,
 	    OFF_TO_IDX(offset + PAGE_MASK + size));
 	obj->handle = handle;
 
 	if (cred == NULL)
 		return (obj);
 
 	obj->cred = cred;
 	obj->charge = size;
 	return (obj);
 }
 
 /*
  * SWAP_PAGER_ALLOC() -	allocate a new OBJT_SWAP VM object and instantiate
  *			its metadata structures.
  *
  *	This routine is called from the mmap and fork code to create a new
  *	OBJT_SWAP object.
  *
  *	This routine must ensure that no live duplicate is created for
  *	the named object request, which is protected against by
  *	holding the sw_alloc_sx lock in case handle != NULL.
  */
 static vm_object_t
 swap_pager_alloc(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t offset, struct ucred *cred)
 {
 	vm_object_t obj;
 
 	/* Anonymous request: nothing to look up, no list to maintain. */
 	if (handle == NULL)
 		return (swap_pager_alloc_init(handle, cred, size, offset));
 
 	/*
 	 * Named request: reuse the existing object for this handle or
 	 * create and register a new one.  sw_alloc_sx is held across the
 	 * lookup and the insertion so no duplicate can appear.  There
 	 * should not be a race here against swp_pager_meta_build() as
 	 * called from vm_page_remove() in regards to the lookup of the
 	 * handle.
 	 */
 	sx_xlock(&sw_alloc_sx);
 	obj = vm_pager_object_lookup(NOBJLIST(handle), handle);
 	if (obj == NULL) {
 		obj = swap_pager_alloc_init(handle, cred, size, offset);
 		if (obj != NULL) {
 			TAILQ_INSERT_TAIL(NOBJLIST(obj->handle), obj,
 			    pager_object_list);
 		}
 	}
 	sx_xunlock(&sw_alloc_sx);
 	return (obj);
 }
 
 /*
  * SWAP_PAGER_DEALLOC() -	remove swap metadata from object
  *
  *	The swap backing for the object is destroyed.  The code is
  *	designed such that we can reinstantiate it later, but this
  *	routine is typically called only when the entire object is
  *	about to be destroyed.
  *
  *	The object must be locked.
  */
 static void
 swap_pager_dealloc(vm_object_t object)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT((object->flags & OBJ_DEAD) != 0, ("dealloc of reachable obj"));
 
 	/*
 	 * Remove from list right away so lookups will fail if we block for
 	 * pageout completion.
 	 */
 	if (object->handle != NULL) {
 		/*
 		 * The object lock is dropped before sw_alloc_sx is taken
 		 * and reacquired only after it has been released.
 		 */
 		VM_OBJECT_WUNLOCK(object);
 		sx_xlock(&sw_alloc_sx);
 		TAILQ_REMOVE(NOBJLIST(object->handle), object,
 		    pager_object_list);
 		sx_xunlock(&sw_alloc_sx);
 		VM_OBJECT_WLOCK(object);
 	}
 
 	/* Wait for paging-in-progress on the object ("swpdea" wait message). */
 	vm_object_pip_wait(object, "swpdea");
 
 	/*
 	 * Free all remaining metadata.  We only bother to free it from
 	 * the swap meta data.  We do not attempt to free swapblk's still
 	 * associated with vm_page_t's for this object.  We do not care
 	 * if paging is still in progress on some objects.
 	 */
 	swp_pager_meta_free_all(object);
 	object->handle = NULL;
 	object->type = OBJT_DEAD;
 }
 
 /************************************************************************
  *			SWAP PAGER BITMAP ROUTINES			*
  ************************************************************************/
 
 /*
  * SWP_PAGER_GETSWAPSPACE() -	allocate raw swap space
  *
  *	Allocate swap for up to the requested number of pages, and at
  *	least a minimum number of pages.  The starting swap block number
  *	(a page index) is returned or SWAPBLK_NONE if the allocation
  *	failed.
  *
  *	Also has the side effect of advising that somebody made a mistake
  *	when they configured swap and didn't configure enough.
  *
  *	This routine may not sleep.
  *
  *	We allocate in round-robin fashion from the configured devices.
  */
 /*
  * io_npages: in, the number of pages wanted; out, on success, the number
  *            of pages actually allocated.
  * limit:     once a full pass over the devices fails with the current
  *            request size at or below this value, the search gives up.
  */
 static daddr_t
 swp_pager_getswapspace(int *io_npages, int limit)
 {
 	daddr_t blk;
 	struct swdevt *sp;
 	int mpages, npages;
 
 	blk = SWAPBLK_NONE;
 	mpages = *io_npages;
 	npages = imin(BLIST_MAX_ALLOC, mpages);
 	mtx_lock(&sw_dev_mtx);
 	/* Start at the device after the one used by the last success. */
 	sp = swdevhd;
 	while (!TAILQ_EMPTY(&swtailq)) {
 		if (sp == NULL)
 			sp = TAILQ_FIRST(&swtailq);
 		/* Devices being closed are skipped. */
 		if ((sp->sw_flags & SW_CLOSING) == 0)
 			blk = blist_alloc(sp->sw_blist, &npages, mpages);
 		if (blk != SWAPBLK_NONE)
 			break;
 		sp = TAILQ_NEXT(sp, sw_list);
 		if (swdevhd == sp) {
 			/*
 			 * Back at the starting point: every device was
 			 * tried.  Give up at the limit, otherwise halve
 			 * the request and go around again.
 			 */
 			if (npages <= limit)
 				break;
 			mpages = npages - 1;
 			npages >>= 1;
 		}
 	}
 	if (blk != SWAPBLK_NONE) {
 		*io_npages = npages;
 		/* Convert the device-relative block to a global one. */
 		blk += sp->sw_first;
 		sp->sw_used += npages;
 		swap_pager_avail -= npages;
 		swp_sizecheck();
 		swdevhd = TAILQ_NEXT(sp, sw_list);
 	} else {
 		/* Report the failure only on the transition to "full". */
 		if (swap_pager_full != 2) {
 			printf("swp_pager_getswapspace(%d): failed\n",
 			    *io_npages);
 			swap_pager_full = 2;
 			swap_pager_almost_full = 1;
 		}
 		swdevhd = NULL;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	return (blk);
 }
 
 /*
  * Report whether global swap block "blk" lies within the block range
  * [sw_first, sw_end) of device "sp".
  */
 static bool
 swp_pager_isondev(daddr_t blk, struct swdevt *sp)
 {
 
 	return (sp->sw_first <= blk && sp->sw_end > blk);
 }
 
 /*
  * Hand buffer "bp" to the strategy routine of the swap device that
  * contains bp->b_blkno.  Before dispatch the buffer is either marked
  * unmapped (device has SW_UNMAPPED and unmapped buffers are allowed) or
  * its pages are entered at bp->b_data with pmap_qenter().  Panics if no
  * device contains the block.
  */
 static void
 swp_pager_strategy(struct buf *bp)
 {
 	struct swdevt *dev;
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(dev, &swtailq, sw_list) {
 		if (!swp_pager_isondev(bp->b_blkno, dev))
 			continue;
 
 		/* Found the device; the list lock is no longer needed. */
 		mtx_unlock(&sw_dev_mtx);
 		if ((dev->sw_flags & SW_UNMAPPED) == 0 ||
 		    !unmapped_buf_allowed) {
 			pmap_qenter((vm_offset_t)bp->b_data,
 			    &bp->b_pages[0], bp->b_bcount / PAGE_SIZE);
 		} else {
 			bp->b_data = unmapped_buf;
 			bp->b_offset = 0;
 		}
 		dev->sw_strategy(bp, dev);
 		return;
 	}
 	panic("Swapdev not found");
 }
 
 
 /*
  * SWP_PAGER_FREESWAPSPACE() -	free raw swap space
  *
  *	This routine returns the specified swap blocks back to the bitmap.
  *
  *	This routine may not sleep.
  */
 static void
 swp_pager_freeswapspace(daddr_t blk, daddr_t npages)
 {
 	struct swdevt *dev;
 
 	/* An empty range is a no-op. */
 	if (npages == 0)
 		return;
 
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(dev, &swtailq, sw_list) {
 		if (!swp_pager_isondev(blk, dev))
 			continue;
 
 		dev->sw_used -= npages;
 		/*
 		 * While swapping is being stopped on this device the
 		 * blocks are not marked free, lest they be reused.
 		 */
 		if ((dev->sw_flags & SW_CLOSING) == 0) {
 			blist_free(dev->sw_blist, blk - dev->sw_first,
 			    npages);
 			swap_pager_avail += npages;
 			swp_sizecheck();
 		}
 		mtx_unlock(&sw_dev_mtx);
 		return;
 	}
 	panic("Swapdev not found");
 }
 
 /*
  * SYSCTL_SWAP_FRAGMENTATION() -	produce raw swap space stats
  */
 static int
 sysctl_swap_fragmentation(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf sbuf;
 	struct swdevt *sp;
 	const char *devname;
 	int error;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
 	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (vn_isdisk(sp->sw_vp, NULL))
 			devname = devtoname(sp->sw_vp->v_rdev);
 		else
 			devname = "[file]";
 		sbuf_printf(&sbuf, "\nFree space on device %s:\n", devname);
 		blist_stats(sp->sw_blist, &sbuf);
 	}
 	mtx_unlock(&sw_dev_mtx);
 	error = sbuf_finish(&sbuf);
 	sbuf_delete(&sbuf);
 	return (error);
 }
 
 /*
  * SWAP_PAGER_FREESPACE() -	frees swap blocks associated with a page
  *				range within an object.
  *
  *	This is a globally accessible routine.
  *
  *	This routine removes swapblk assignments from swap metadata.
  *
  *	The external callers of this routine typically have already destroyed
  *	or renamed vm_page_t's associated with this range in the object so
  *	we should be ok.
  *
  *	The object must be locked.
  */
 /*
  * Thin public wrapper: forwards "object", "start" and "size" unchanged
  * to swp_pager_meta_free().
  */
 void
 swap_pager_freespace(vm_object_t object, vm_pindex_t start, vm_size_t size)
 {
 
 	swp_pager_meta_free(object, start, size);
 }
 
 /*
  * SWAP_PAGER_RESERVE() - reserve swap blocks in object
  *
  *	Assigns swap blocks to the specified range within the object.  The
  *	swap blocks are not zeroed.  Any previous swap assignment is destroyed.
  *
  *	Returns 0 on success, -1 on failure.
  */
 /*
  * Assign freshly allocated swap blocks to the "size" pages of "object"
  * beginning at page index "start".  Swap is obtained in chunks from
  * swp_pager_getswapspace(); any block previously assigned to a page is
  * displaced and freed.  The object is write-locked for the duration.
  *
  * Returns 0 on success.  If swap runs out, the blocks assigned so far
  * are released again, the displaced blocks still pending are freed, and
  * -1 is returned.
  */
 int
 swap_pager_reserve(vm_object_t object, vm_pindex_t start, vm_size_t size)
 {
 	daddr_t addr, blk, n_free, s_free;
 	vm_size_t i, j;
 	int n;
 
 	swp_pager_init_freerange(&s_free, &n_free);
 	VM_OBJECT_WLOCK(object);
 	for (i = 0; i < size; i += n) {
 		/*
 		 * The allocator takes an int; clamp the remainder so a
 		 * large request cannot be truncated into a bogus count.
 		 */
 		n = MIN(size - i, INT_MAX);
 		blk = swp_pager_getswapspace(&n, 1);
 		if (blk == SWAPBLK_NONE) {
 			swp_pager_meta_free(object, start, i);
 			/*
 			 * Displaced blocks collected so far are no longer
 			 * referenced by the metadata; free them here or
 			 * they would be lost.
 			 */
 			swp_pager_freeswapspace(s_free, n_free);
 			VM_OBJECT_WUNLOCK(object);
 			return (-1);
 		}
 		for (j = 0; j < n; ++j) {
 			addr = swp_pager_meta_build(object,
 			    start + i + j, blk + j);
 			if (addr != SWAPBLK_NONE)
 				swp_pager_update_freerange(&s_free, &n_free,
 				    addr);
 		}
 	}
 	swp_pager_freeswapspace(s_free, n_free);
 	VM_OBJECT_WUNLOCK(object);
 	return (0);
 }
 
 /*
  * SWAP_PAGER_COPY() -  copy blocks from source pager to destination pager
  *			and destroy the source.
  *
  *	Copy any valid swapblks from the source to the destination.  In
  *	cases where both the source and destination have a valid swapblk,
  *	we keep the destination's.
  *
  *	This routine is allowed to sleep.  It may sleep allocating metadata
  *	indirectly through swp_pager_meta_build() or if paging is still in
  *	progress on the source.
  *
  *	The source object contains no vm_page_t's (which is just as well)
  *
  *	The source object is of type OBJT_SWAP.
  *
  *	The source and destination objects must be locked.
  *	Both object locks may temporarily be released.
  */
 void
 swap_pager_copy(vm_object_t srcobject, vm_object_t dstobject,
     vm_pindex_t offset, int destroysource)
 {
 	vm_pindex_t i;
 	daddr_t dstaddr, n_free, s_free, srcaddr;
 
 	VM_OBJECT_ASSERT_WLOCKED(srcobject);
 	VM_OBJECT_ASSERT_WLOCKED(dstobject);
 
 	/*
 	 * If destroysource is set, we remove the source object from the
 	 * swap_pager internal queue now.  Both object locks are dropped
 	 * while sw_alloc_sx is held; a paging-in-progress reference is
 	 * held on each object across the unlocked window.
 	 */
 	if (destroysource && srcobject->handle != NULL) {
 		vm_object_pip_add(srcobject, 1);
 		VM_OBJECT_WUNLOCK(srcobject);
 		vm_object_pip_add(dstobject, 1);
 		VM_OBJECT_WUNLOCK(dstobject);
 		sx_xlock(&sw_alloc_sx);
 		TAILQ_REMOVE(NOBJLIST(srcobject->handle), srcobject,
 		    pager_object_list);
 		sx_xunlock(&sw_alloc_sx);
 		VM_OBJECT_WLOCK(dstobject);
 		vm_object_pip_wakeup(dstobject);
 		VM_OBJECT_WLOCK(srcobject);
 		vm_object_pip_wakeup(srcobject);
 	}
 
 	/*
 	 * Transfer source to destination.  Destination page i corresponds
 	 * to source page i + offset.
 	 */
 	swp_pager_init_freerange(&s_free, &n_free);
 	for (i = 0; i < dstobject->size; ++i) {
 		/* Take the block out of the source's metadata. */
 		srcaddr = swp_pager_meta_ctl(srcobject, i + offset, SWM_POP);
 		if (srcaddr == SWAPBLK_NONE)
 			continue;
 		dstaddr = swp_pager_meta_ctl(dstobject, i, 0);
 		if (dstaddr != SWAPBLK_NONE) {
 			/*
 			 * Destination has valid swapblk or it is represented
 			 * by a resident page.  We destroy the source block.
 			 */
 			swp_pager_update_freerange(&s_free, &n_free, srcaddr);
 			continue;
 		}
 
 		/*
 		 * Destination has no swapblk and is not resident,
 		 * copy source.
 		 *
 		 * swp_pager_meta_build() can sleep.
 		 */
 		vm_object_pip_add(srcobject, 1);
 		VM_OBJECT_WUNLOCK(srcobject);
 		vm_object_pip_add(dstobject, 1);
 		dstaddr = swp_pager_meta_build(dstobject, i, srcaddr);
 		KASSERT(dstaddr == SWAPBLK_NONE,
 		    ("Unexpected destination swapblk"));
 		vm_object_pip_wakeup(dstobject);
 		VM_OBJECT_WLOCK(srcobject);
 		vm_object_pip_wakeup(srcobject);
 	}
 	/* Flush the last pending run of destroyed source blocks. */
 	swp_pager_freeswapspace(s_free, n_free);
 
 	/*
 	 * Free left over swap blocks in source and revert its type to
 	 * OBJT_DEFAULT.
 	 */
 	if (destroysource) {
 		swp_pager_meta_free_all(srcobject);
 		/*
 		 * Reverting the type is not strictly necessary because the
 		 * caller is going to destroy srcobject directly.  It is done
 		 * anyway for consistency, since the object has been removed
 		 * from its queues and must not be removed from them again.
 		 */
 		srcobject->type = OBJT_DEFAULT;
 	}
 }
 
 /*
  * SWAP_PAGER_HASPAGE() -	determine if we have good backing store for
  *				the requested page.
  *
  *	We determine whether good backing store exists for the requested
  *	page and return TRUE if it does, FALSE if it doesn't.
  *
  *	If TRUE, we also try to determine how much valid, contiguous backing
  *	store exists before and after the requested page.
  */
 static boolean_t
 swap_pager_haspage(vm_object_t object, vm_pindex_t pindex, int *before,
     int *after)
 {
 	daddr_t base;
 	int n;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 
 	/*
 	 * Without a swap block at the requested index there is nothing
 	 * to report; both optional counts are zeroed.
 	 */
 	base = swp_pager_meta_ctl(object, pindex, 0);
 	if (base == SWAPBLK_NONE) {
 		if (before != NULL)
 			*before = 0;
 		if (after != NULL)
 			*after = 0;
 		return (FALSE);
 	}
 
 	/*
 	 * Count preceding pages whose swap blocks are contiguous with
 	 * ours, stopping at index 0 and at SWB_NPAGES - 1 pages.
 	 */
 	if (before != NULL) {
 		n = 1;
 		while (n < SWB_NPAGES && n <= pindex &&
 		    swp_pager_meta_ctl(object, pindex - n, 0) == base - n)
 			n++;
 		*before = n - 1;
 	}
 
 	/*
 	 * Count following pages whose swap blocks are contiguous with
 	 * ours, stopping at SWB_NPAGES - 1 pages.
 	 */
 	if (after != NULL) {
 		n = 1;
 		while (n < SWB_NPAGES &&
 		    swp_pager_meta_ctl(object, pindex + n, 0) == base + n)
 			n++;
 		*after = n - 1;
 	}
 	return (TRUE);
 }
 
 /*
  * SWAP_PAGER_UNSWAPPED() - remove swap backing store related to page
  *
  *	This removes any associated swap backing store, whether valid or
  *	not, from the page.
  *
  *	This routine is typically called when a page is made dirty, at
  *	which point any associated swap can be freed.  MADV_FREE also
  *	calls us in a special-case situation
  *
  *	NOTE!!!  If the page is clean and the swap was valid, the caller
  *	should make the page dirty before calling this routine.  This routine
  *	does NOT change the m->dirty status of the page.  Also: MADV_FREE
  *	depends on it.
  *
  *	This routine may not sleep.
  *
  *	The object containing the page must be locked.
  */
 static void
 swap_pager_unswapped(vm_page_t m)
 {
 	daddr_t srcaddr;
 
 	srcaddr = swp_pager_meta_ctl(m->object, m->pindex, SWM_POP);
 	if (srcaddr != SWAPBLK_NONE)
 		swp_pager_freeswapspace(srcaddr, 1);
 }
 
 /*
  * swap_pager_getpages() - bring pages in from swap
  *
  *	Attempt to page in the pages in array "ma" of length "count".  The
  *	caller may optionally specify that additional pages preceding and
  *	succeeding the specified range be paged in.  The number of such pages
  *	is returned in the "rbehind" and "rahead" parameters, and they will
  *	be in the inactive queue upon return.
  *
  *	The pages in "ma" must be busied and will remain busied upon return.
  */
 static int
 swap_pager_getpages(vm_object_t object, vm_page_t *ma, int count, int *rbehind,
     int *rahead)
 {
 	struct buf *bp;
 	vm_page_t bm, mpred, msucc, p;
 	vm_pindex_t pindex;
 	daddr_t blk;
 	int i, maxahead, maxbehind, reqcount;
 
 	reqcount = count;
 
 	/*
 	 * Determine the final number of read-behind pages and
 	 * allocate them BEFORE releasing the object lock.  Otherwise,
 	 * there can be a problematic race with vm_object_split().
 	 * Specifically, vm_object_split() might first transfer pages
 	 * that precede ma[0] in the current object to a new object,
 	 * and then this function incorrectly recreates those pages as
 	 * read-behind pages in the current object.
 	 */
 	if (!swap_pager_haspage(object, ma[0]->pindex, &maxbehind, &maxahead))
 		return (VM_PAGER_FAIL);
 
 	/*
 	 * Clip the readahead and readbehind ranges to exclude resident pages.
 	 */
 	if (rahead != NULL) {
 		KASSERT(reqcount - 1 <= maxahead,
 		    ("page count %d extends beyond swap block", reqcount));
 		*rahead = imin(*rahead, maxahead - (reqcount - 1));
 		pindex = ma[reqcount - 1]->pindex;
 		msucc = TAILQ_NEXT(ma[reqcount - 1], listq);
 		if (msucc != NULL && msucc->pindex - pindex - 1 < *rahead)
 			*rahead = msucc->pindex - pindex - 1;
 	}
 	if (rbehind != NULL) {
 		*rbehind = imin(*rbehind, maxbehind);
 		pindex = ma[0]->pindex;
 		mpred = TAILQ_PREV(ma[0], pglist, listq);
 		if (mpred != NULL && pindex - mpred->pindex - 1 < *rbehind)
 			*rbehind = pindex - mpred->pindex - 1;
 	}
 
 	bm = ma[0];
 	for (i = 0; i < count; i++)
 		ma[i]->oflags |= VPO_SWAPINPROG;
 
 	/*
 	 * Allocate readahead and readbehind pages.
 	 */
 	if (rbehind != NULL) {
 		for (i = 1; i <= *rbehind; i++) {
 			p = vm_page_alloc(object, ma[0]->pindex - i,
 			    VM_ALLOC_NORMAL);
 			if (p == NULL)
 				break;
 			p->oflags |= VPO_SWAPINPROG;
 			bm = p;
 		}
 		*rbehind = i - 1;
 	}
 	if (rahead != NULL) {
 		for (i = 0; i < *rahead; i++) {
 			p = vm_page_alloc(object,
 			    ma[reqcount - 1]->pindex + i + 1, VM_ALLOC_NORMAL);
 			if (p == NULL)
 				break;
 			p->oflags |= VPO_SWAPINPROG;
 		}
 		*rahead = i;
 	}
 	if (rbehind != NULL)
 		count += *rbehind;
 	if (rahead != NULL)
 		count += *rahead;
 
 	vm_object_pip_add(object, count);
 
 	pindex = bm->pindex;
 	blk = swp_pager_meta_ctl(object, pindex, 0);
 	KASSERT(blk != SWAPBLK_NONE,
 	    ("no swap blocking containing %p(%jx)", object, (uintmax_t)pindex));
 
 	VM_OBJECT_WUNLOCK(object);
 	bp = uma_zalloc(swrbuf_zone, M_WAITOK);
 	/* Pages cannot leave the object while busy. */
 	for (i = 0, p = bm; i < count; i++, p = TAILQ_NEXT(p, listq)) {
 		MPASS(p->pindex == bm->pindex + i);
 		bp->b_pages[i] = p;
 	}
 
 	bp->b_flags |= B_PAGING;
 	bp->b_iocmd = BIO_READ;
 	bp->b_iodone = swp_pager_async_iodone;
 	bp->b_rcred = crhold(thread0.td_ucred);
 	bp->b_wcred = crhold(thread0.td_ucred);
 	bp->b_blkno = blk;
 	bp->b_bcount = PAGE_SIZE * count;
 	bp->b_bufsize = PAGE_SIZE * count;
 	bp->b_npages = count;
 	bp->b_pgbefore = rbehind != NULL ? *rbehind : 0;
 	bp->b_pgafter = rahead != NULL ? *rahead : 0;
 
 	VM_CNT_INC(v_swapin);
 	VM_CNT_ADD(v_swappgsin, count);
 
 	/*
 	 * perform the I/O.  NOTE!!!  bp cannot be considered valid after
 	 * this point because we automatically release it on completion.
 	 * Instead, we look at the one page we are interested in which we
 	 * still hold a lock on even through the I/O completion.
 	 *
 	 * The other pages in our ma[] array are also released on completion,
 	 * so we cannot assume they are valid anymore either.
 	 *
 	 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
 	 */
 	BUF_KERNPROC(bp);
 	swp_pager_strategy(bp);
 
 	/*
 	 * Wait for the pages we want to complete.  VPO_SWAPINPROG is always
 	 * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
 	 * is set in the metadata for each page in the request.
 	 */
 	VM_OBJECT_WLOCK(object);
 	while ((ma[0]->oflags & VPO_SWAPINPROG) != 0) {
 		ma[0]->oflags |= VPO_SWAPSLEEP;
 		VM_CNT_INC(v_intrans);
 		if (VM_OBJECT_SLEEP(object, &object->paging_in_progress, PSWP,
 		    "swread", hz * 20)) {
 			printf(
 "swap_pager: indefinite wait buffer: bufobj: %p, blkno: %jd, size: %ld\n",
 			    bp->b_bufobj, (intmax_t)bp->b_blkno, bp->b_bcount);
 		}
 	}
 
 	/*
 	 * If we had an unrecoverable read error pages will not be valid.
 	 */
 	for (i = 0; i < reqcount; i++)
 		if (ma[i]->valid != VM_PAGE_BITS_ALL)
 			return (VM_PAGER_ERROR);
 
 	return (VM_PAGER_OK);
 
 	/*
 	 * A final note: in a low swap situation, we cannot deallocate swap
 	 * and mark a page dirty here because the caller is likely to mark
 	 * the page clean when we return, causing the page to possibly revert
 	 * to all-zero's later.
 	 */
 }
 
 /*
  * 	swap_pager_getpages_async():
  *
  *	Right now this is emulation of asynchronous operation on top of
  *	swap_pager_getpages().
  */
 static int
 swap_pager_getpages_async(vm_object_t object, vm_page_t *ma, int count,
     int *rbehind, int *rahead, pgo_getpages_iodone_t iodone, void *arg)
 {
 	int r, error;
 
 	r = swap_pager_getpages(object, ma, count, rbehind, rahead);
 	VM_OBJECT_WUNLOCK(object);
 	switch (r) {
 	case VM_PAGER_OK:
 		error = 0;
 		break;
 	case VM_PAGER_ERROR:
 		error = EIO;
 		break;
 	case VM_PAGER_FAIL:
 		error = EINVAL;
 		break;
 	default:
 		panic("unhandled swap_pager_getpages() error %d", r);
 	}
 	(iodone)(arg, ma, count, error);
 	VM_OBJECT_WLOCK(object);
 
 	return (r);
 }
 
 /*
  *	swap_pager_putpages:
  *
  *	Assign swap (if necessary) and initiate I/O on the specified pages.
  *
  *	We support both OBJT_DEFAULT and OBJT_SWAP objects.  DEFAULT objects
  *	are automatically converted to SWAP objects.
  *
  *	In a low memory situation we may block in VOP_STRATEGY(), but the new
  *	vm_page reservation system coupled with properly written VFS devices
  *	should ensure that no low-memory deadlock occurs.  This is an area
  *	which needs work.
  *
  *	The parent has N vm_object_pip_add() references prior to
  *	calling us and will remove references for rtvals[] that are
  *	not set to VM_PAGER_PEND.  We need to remove the rest on I/O
  *	completion.
  *
  *	The parent has soft-busy'd the pages it passes us and will unbusy
- *	those whos rtvals[] entry is not set to VM_PAGER_PEND on return.
+ *	those whose rtvals[] entry is not set to VM_PAGER_PEND on return.
  *	We need to unbusy the rest on I/O completion.
  */
 static void
 swap_pager_putpages(vm_object_t object, vm_page_t *ma, int count,
     int flags, int *rtvals)
 {
-	int i, n;
-	boolean_t sync;
-	daddr_t addr, n_free, s_free;
+	struct buf *bp;
+	daddr_t addr, blk, n_free, s_free;
+	vm_page_t mreq;
+	int i, j, n;
+	bool async;
 
-	swp_pager_init_freerange(&s_free, &n_free);
-	if (count && ma[0]->object != object) {
-		panic("swap_pager_putpages: object mismatch %p/%p",
-		    object,
-		    ma[0]->object
-		);
-	}
+	KASSERT(count == 0 || ma[0]->object == object,
+	    ("%s: object mismatch %p/%p",
+	    __func__, object, ma[0]->object));
 
 	/*
 	 * Step 1
 	 *
-	 * Turn object into OBJT_SWAP
-	 * check for bogus sysops
-	 * force sync if not pageout process
+	 * Turn object into OBJT_SWAP.  Force sync if not a pageout process.
 	 */
 	if (object->type != OBJT_SWAP) {
 		addr = swp_pager_meta_build(object, 0, SWAPBLK_NONE);
 		KASSERT(addr == SWAPBLK_NONE,
 		    ("unexpected object swap block"));
 	}
 	VM_OBJECT_WUNLOCK(object);
+	async = curproc == pageproc && (flags & VM_PAGER_PUT_SYNC) == 0;
+	swp_pager_init_freerange(&s_free, &n_free);
 
-	n = 0;
-	if (curproc != pageproc)
-		sync = TRUE;
-	else
-		sync = (flags & VM_PAGER_PUT_SYNC) != 0;
-
 	/*
 	 * Step 2
 	 *
 	 * Assign swap blocks and issue I/O.  We reallocate swap on the fly.
 	 * The page is left dirty until the pageout operation completes
 	 * successfully.
 	 */
 	for (i = 0; i < count; i += n) {
-		int j;
-		struct buf *bp;
-		daddr_t blk;
-
 		/* Maximum I/O size is limited by maximum swap block size. */
 		n = min(count - i, nsw_cluster_max);
 
 		/* Get a block of swap of size up to size n. */
 		blk = swp_pager_getswapspace(&n, 4);
 		if (blk == SWAPBLK_NONE) {
 			for (j = 0; j < n; ++j)
-				rtvals[i+j] = VM_PAGER_FAIL;
+				rtvals[i + j] = VM_PAGER_FAIL;
 			continue;
 		}
 
 		/*
-		 * All I/O parameters have been satisfied, build the I/O
+		 * All I/O parameters have been satisfied.  Build the I/O
 		 * request and assign the swap space.
 		 */
-		if (sync != TRUE) {
+		if (async) {
 			mtx_lock(&swbuf_mtx);
 			while (nsw_wcount_async == 0)
 				msleep(&nsw_wcount_async, &swbuf_mtx, PVM,
 				    "swbufa", 0);
 			nsw_wcount_async--;
 			mtx_unlock(&swbuf_mtx);
 		}
 		bp = uma_zalloc(swwbuf_zone, M_WAITOK);
-		if (sync != TRUE)
+		if (async)
 			bp->b_flags = B_ASYNC;
 		bp->b_flags |= B_PAGING;
 		bp->b_iocmd = BIO_WRITE;
 
 		bp->b_rcred = crhold(thread0.td_ucred);
 		bp->b_wcred = crhold(thread0.td_ucred);
 		bp->b_bcount = PAGE_SIZE * n;
 		bp->b_bufsize = PAGE_SIZE * n;
 		bp->b_blkno = blk;
 
 		VM_OBJECT_WLOCK(object);
 		for (j = 0; j < n; ++j) {
-			vm_page_t mreq = ma[i+j];
-
+			mreq = ma[i + j];
 			addr = swp_pager_meta_build(mreq->object, mreq->pindex,
 			    blk + j);
 			if (addr != SWAPBLK_NONE)
 				swp_pager_update_freerange(&s_free, &n_free,
 				    addr);
 			MPASS(mreq->dirty == VM_PAGE_BITS_ALL);
 			mreq->oflags |= VPO_SWAPINPROG;
 			bp->b_pages[j] = mreq;
 		}
 		VM_OBJECT_WUNLOCK(object);
 		bp->b_npages = n;
 		/*
 		 * Must set dirty range for NFS to work.
 		 */
 		bp->b_dirtyoff = 0;
 		bp->b_dirtyend = bp->b_bcount;
 
 		VM_CNT_INC(v_swapout);
 		VM_CNT_ADD(v_swappgsout, bp->b_npages);
 
 		/*
 		 * We unconditionally set rtvals[] to VM_PAGER_PEND so that we
 		 * can call the async completion routine at the end of a
 		 * synchronous I/O operation.  Otherwise, our caller would
 		 * perform duplicate unbusy and wakeup operations on the page
 		 * and object, respectively.
 		 */
 		for (j = 0; j < n; j++)
 			rtvals[i + j] = VM_PAGER_PEND;
 
 		/*
 		 * asynchronous
 		 *
-		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
+		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy.
 		 */
-		if (sync == FALSE) {
+		if (async) {
 			bp->b_iodone = swp_pager_async_iodone;
 			BUF_KERNPROC(bp);
 			swp_pager_strategy(bp);
 			continue;
 		}
 
 		/*
 		 * synchronous
 		 *
-		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy
+		 * NOTE: b_blkno is destroyed by the call to swapdev_strategy.
 		 */
 		bp->b_iodone = bdone;
 		swp_pager_strategy(bp);
 
 		/*
 		 * Wait for the sync I/O to complete.
 		 */
 		bwait(bp, PVM, "swwrt");
 
 		/*
 		 * Now that we are through with the bp, we can call the
 		 * normal async completion, which frees everything up.
 		 */
 		swp_pager_async_iodone(bp);
 	}
-	VM_OBJECT_WLOCK(object);
 	swp_pager_freeswapspace(s_free, n_free);
+	VM_OBJECT_WLOCK(object);
 }
 
 /*
  *	swp_pager_async_iodone:
  *
  *	Completion routine for asynchronous reads and writes from/to swap.
  *	Also called manually by synchronous code to finish up a bp.
  *
  *	This routine may not sleep.
  */
 static void
 swp_pager_async_iodone(struct buf *bp)
 {
 	int i;
 	vm_object_t object = NULL;
 
 	/*
 	 * Report error - unless we ran out of memory, in which case
 	 * we've already logged it in swapgeom_strategy().
 	 */
 	if (bp->b_ioflags & BIO_ERROR && bp->b_error != ENOMEM) {
 		printf(
 		    "swap_pager: I/O error - %s failed; blkno %ld,"
 			"size %ld, error %d\n",
 		    ((bp->b_iocmd == BIO_READ) ? "pagein" : "pageout"),
 		    (long)bp->b_blkno,
 		    (long)bp->b_bcount,
 		    bp->b_error
 		);
 	}
 
 	/*
 	 * remove the mapping for kernel virtual
 	 */
 	if (buf_mapped(bp))
 		pmap_qremove((vm_offset_t)bp->b_data, bp->b_npages);
 	else
 		bp->b_data = bp->b_kvabase;
 
 	if (bp->b_npages) {
 		object = bp->b_pages[0]->object;
 		VM_OBJECT_WLOCK(object);
 	}
 
 	/*
 	 * cleanup pages.  If an error occurs writing to swap, we are in
 	 * very serious trouble.  If it happens to be a disk error, though,
 	 * we may be able to recover by reassigning the swap later on.  So
 	 * in this case we remove the m->swapblk assignment for the page
 	 * but do not free it in the rlist.  The errornous block(s) are thus
 	 * never reallocated as swap.  Redirty the page and continue.
 	 */
 	for (i = 0; i < bp->b_npages; ++i) {
 		vm_page_t m = bp->b_pages[i];
 
 		m->oflags &= ~VPO_SWAPINPROG;
 		if (m->oflags & VPO_SWAPSLEEP) {
 			m->oflags &= ~VPO_SWAPSLEEP;
 			wakeup(&object->paging_in_progress);
 		}
 
 		if (bp->b_ioflags & BIO_ERROR) {
 			/*
 			 * If an error occurs I'd love to throw the swapblk
 			 * away without freeing it back to swapspace, so it
 			 * can never be used again.  But I can't from an
 			 * interrupt.
 			 */
 			if (bp->b_iocmd == BIO_READ) {
 				/*
 				 * NOTE: for reads, m->dirty will probably
 				 * be overridden by the original caller of
 				 * getpages so don't play cute tricks here.
 				 */
 				m->valid = 0;
 			} else {
 				/*
 				 * If a write error occurs, reactivate page
 				 * so it doesn't clog the inactive list,
 				 * then finish the I/O.
 				 */
 				MPASS(m->dirty == VM_PAGE_BITS_ALL);
 				vm_page_lock(m);
 				vm_page_activate(m);
 				vm_page_unlock(m);
 				vm_page_sunbusy(m);
 			}
 		} else if (bp->b_iocmd == BIO_READ) {
 			/*
 			 * NOTE: for reads, m->dirty will probably be
 			 * overridden by the original caller of getpages so
 			 * we cannot set them in order to free the underlying
 			 * swap in a low-swap situation.  I don't think we'd
 			 * want to do that anyway, but it was an optimization
 			 * that existed in the old swapper for a time before
 			 * it got ripped out due to precisely this problem.
 			 */
 			KASSERT(!pmap_page_is_mapped(m),
 			    ("swp_pager_async_iodone: page %p is mapped", m));
 			KASSERT(m->dirty == 0,
 			    ("swp_pager_async_iodone: page %p is dirty", m));
 
 			m->valid = VM_PAGE_BITS_ALL;
 			if (i < bp->b_pgbefore ||
 			    i >= bp->b_npages - bp->b_pgafter)
 				vm_page_readahead_finish(m);
 		} else {
 			/*
 			 * For write success, clear the dirty
 			 * status, then finish the I/O ( which decrements the
 			 * busy count and possibly wakes waiter's up ).
 			 * A page is only written to swap after a period of
 			 * inactivity.  Therefore, we do not expect it to be
 			 * reused.
 			 */
 			KASSERT(!pmap_page_is_write_mapped(m),
 			    ("swp_pager_async_iodone: page %p is not write"
 			    " protected", m));
 			vm_page_undirty(m);
 			vm_page_lock(m);
 			vm_page_deactivate_noreuse(m);
 			vm_page_unlock(m);
 			vm_page_sunbusy(m);
 		}
 	}
 
 	/*
 	 * adjust pip.  NOTE: the original parent may still have its own
 	 * pip refs on the object.
 	 */
 	if (object != NULL) {
 		vm_object_pip_wakeupn(object, bp->b_npages);
 		VM_OBJECT_WUNLOCK(object);
 	}
 
 	/*
 	 * swapdev_strategy() manually sets b_vp and b_bufobj before calling
 	 * bstrategy(). Set them back to NULL now we're done with it, or we'll
 	 * trigger a KASSERT in relpbuf().
 	 */
 	if (bp->b_vp) {
 		    bp->b_vp = NULL;
 		    bp->b_bufobj = NULL;
 	}
 	/*
 	 * release the physical I/O buffer
 	 */
 	if (bp->b_flags & B_ASYNC) {
 		mtx_lock(&swbuf_mtx);
 		if (++nsw_wcount_async == 1)
 			wakeup(&nsw_wcount_async);
 		mtx_unlock(&swbuf_mtx);
 	}
 	uma_zfree((bp->b_iocmd == BIO_READ) ? swrbuf_zone : swwbuf_zone, bp);
 }
 
 int
 swap_pager_nswapdev(void)
 {
 
 	return (nswapdev);
 }
 
 static void
 swp_pager_force_dirty(vm_page_t m)
 {
 
 	vm_page_dirty(m);
 #ifdef INVARIANTS
 	vm_page_lock(m);
 	if (!vm_page_wired(m) && m->queue == PQ_NONE)
 		panic("page %p is neither wired nor queued", m);
 	vm_page_unlock(m);
 #endif
 	vm_page_xunbusy(m);
 	swap_pager_unswapped(m);
 }
 
 static void
 swp_pager_force_launder(vm_page_t m)
 {
 
 	vm_page_dirty(m);
 	vm_page_lock(m);
 	vm_page_launder(m);
 	vm_page_unlock(m);
 	vm_page_xunbusy(m);
 	swap_pager_unswapped(m);
 }
 
 /*
  * SWP_PAGER_FORCE_PAGEIN() - force swap blocks to be paged in
  *
  *	This routine dissociates pages starting at the given index within an
  *	object from their backing store, paging them in if they do not reside
  *	in memory.  Pages that are paged in are marked dirty and placed in the
  *	laundry queue.  Pages are marked dirty because they no longer have
  *	backing store.  They are placed in the laundry queue because they have
  *	not been accessed recently.  Otherwise, they would already reside in
  *	memory.
  */
 static void
 swp_pager_force_pagein(vm_object_t object, vm_pindex_t pindex, int npages)
 {
 	vm_page_t ma[npages];
 	int i, j;
 
 	KASSERT(npages > 0, ("%s: No pages", __func__));
 	KASSERT(npages <= MAXPHYS / PAGE_SIZE,
 	    ("%s: Too many pages: %d", __func__, npages));
 	vm_object_pip_add(object, npages);
 	vm_page_grab_pages(object, pindex, VM_ALLOC_NORMAL, ma, npages);
 	for (i = j = 0;; i++) {
 		/* Count nonresident pages, to page-in all at once. */
 		if (i < npages && ma[i]->valid != VM_PAGE_BITS_ALL)
 			continue;
 		if (j < i) {
 			/* Page-in nonresident pages. Mark for laundering. */
 			if (swap_pager_getpages(object, &ma[j], i - j, NULL,
 			    NULL) != VM_PAGER_OK)
 				panic("%s: read from swap failed", __func__);
 			do {
 				swp_pager_force_launder(ma[j]);
 			} while (++j < i);
 		}
 		if (i == npages)
 			break;
 		/* Mark dirty a resident page. */
 		swp_pager_force_dirty(ma[j++]);
 	}
 	vm_object_pip_wakeupn(object, npages);
 }
 
 /*
  *	swap_pager_swapoff_object:
  *
  *	Page in all of the pages that have been paged out for an object
  *	to a swap device.
  */
 static void
 swap_pager_swapoff_object(struct swdevt *sp, vm_object_t object)
 {
 	struct swblk *sb;
 	vm_pindex_t pi, s_pindex;
 	daddr_t blk, n_blks, s_blk;
 	int i;
 
 	n_blks = 0;
 	for (pi = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
 	    &object->un_pager.swp.swp_blks, pi)) != NULL; ) {
 		for (i = 0; i < SWAP_META_PAGES; i++) {
 			blk = sb->d[i];
 			if (!swp_pager_isondev(blk, sp))
 				blk = SWAPBLK_NONE;
 
 			/*
 			 * If there are no blocks/pages accumulated, start a new
 			 * accumulation here.
 			 */
 			if (n_blks == 0) {
 				if (blk != SWAPBLK_NONE) {
 					s_blk = blk;
 					s_pindex = sb->p + i;
 					n_blks = 1;
 				}
 				continue;
 			}
 
 			/*
 			 * If the accumulation can be extended without breaking
 			 * the sequence of consecutive blocks and pages that
 			 * swp_pager_force_pagein() depends on, do so.
 			 */
 			if (n_blks < MAXPHYS / PAGE_SIZE &&
 			    s_blk + n_blks == blk &&
 			    s_pindex + n_blks == sb->p + i) {
 				++n_blks;
 				continue;
 			}
 
 			/*
 			 * The sequence of consecutive blocks and pages cannot
 			 * be extended, so page them all in here.  Then,
 			 * because doing so involves releasing and reacquiring
 			 * a lock that protects the swap block pctrie, do not
 			 * rely on the current swap block.  Break this loop and
 			 * re-fetch the same pindex from the pctrie again.
 			 */
 			swp_pager_force_pagein(object, s_pindex, n_blks);
 			n_blks = 0;
 			break;
 		}
 		if (i == SWAP_META_PAGES)
 			pi = sb->p + SWAP_META_PAGES;
 	}
 	if (n_blks > 0)
 		swp_pager_force_pagein(object, s_pindex, n_blks);
 }
 
 /*
  *	swap_pager_swapoff:
  *
  *	Page in all of the pages that have been paged out to the
  *	given device.  The corresponding blocks in the bitmap must be
  *	marked as allocated and the device must be flagged SW_CLOSING.
  *	There may be no processes swapped out to the device.
  *
  *	This routine may block.
  */
 static void
 swap_pager_swapoff(struct swdevt *sp)
 {
 	vm_object_t object;
 	int retries;
 
 	sx_assert(&swdev_syscall_lock, SA_XLOCKED);
 
 	retries = 0;
 full_rescan:
 	mtx_lock(&vm_object_list_mtx);
 	TAILQ_FOREACH(object, &vm_object_list, object_list) {
 		if (object->type != OBJT_SWAP)
 			continue;
 		mtx_unlock(&vm_object_list_mtx);
 		/* Depends on type-stability. */
 		VM_OBJECT_WLOCK(object);
 
 		/*
 		 * Dead objects are eventually terminated on their own.
 		 */
 		if ((object->flags & OBJ_DEAD) != 0)
 			goto next_obj;
 
 		/*
 		 * Sync with fences placed after pctrie
 		 * initialization.  We must not access pctrie below
 		 * unless we checked that our object is swap and not
 		 * dead.
 		 */
 		atomic_thread_fence_acq();
 		if (object->type != OBJT_SWAP)
 			goto next_obj;
 
 		swap_pager_swapoff_object(sp, object);
 next_obj:
 		VM_OBJECT_WUNLOCK(object);
 		mtx_lock(&vm_object_list_mtx);
 	}
 	mtx_unlock(&vm_object_list_mtx);
 
 	if (sp->sw_used) {
 		/*
 		 * Objects may be locked or paging to the device being
 		 * removed, so we will miss their pages and need to
 		 * make another pass.  We have marked this device as
 		 * SW_CLOSING, so the activity should finish soon.
 		 */
 		retries++;
 		if (retries > 100) {
 			panic("swapoff: failed to locate %d swap blocks",
 			    sp->sw_used);
 		}
 		pause("swpoff", hz / 20);
 		goto full_rescan;
 	}
 	EVENTHANDLER_INVOKE(swapoff, sp);
 }
 
 /************************************************************************
  *				SWAP META DATA 				*
  ************************************************************************
  *
  *	These routines manipulate the swap metadata stored in the
  *	OBJT_SWAP object.
  *
  *	Swap metadata is implemented with a global hash and not directly
  *	linked into the object.  Instead the object simply contains
  *	appropriate tracking counters.
  */
 
 /*
  * SWP_PAGER_SWBLK_EMPTY() - is a range of blocks free?
  */
 static bool
 swp_pager_swblk_empty(struct swblk *sb, int start, int limit)
 {
 	int i;
 
 	MPASS(0 <= start && start <= limit && limit <= SWAP_META_PAGES);
 	for (i = start; i < limit; i++) {
 		if (sb->d[i] != SWAPBLK_NONE)
 			return (false);
 	}
 	return (true);
 }
    
 /*
  * SWP_PAGER_META_BUILD() -	add swap block to swap meta data for object
  *
  *	We first convert the object to a swap object if it is a default
  *	object.
  *
  *	The specified swapblk is added to the object's swap metadata.  If
  *	the swapblk is not valid, it is freed instead.  Any previously
  *	assigned swapblk is returned.
  */
 static daddr_t
 swp_pager_meta_build(vm_object_t object, vm_pindex_t pindex, daddr_t swapblk)
 {
 	static volatile int swblk_zone_exhausted, swpctrie_zone_exhausted;
 	struct swblk *sb, *sb1;
 	vm_pindex_t modpi, rdpi;
 	daddr_t prev_swapblk;
 	int error, i;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 
 	/*
 	 * Convert default object to swap object if necessary
 	 */
 	if (object->type != OBJT_SWAP) {
 		pctrie_init(&object->un_pager.swp.swp_blks);
 
 		/*
 		 * Ensure that swap_pager_swapoff()'s iteration over
 		 * object_list does not see a garbage pctrie.
 		 */
 		atomic_thread_fence_rel();
 
 		object->type = OBJT_SWAP;
 		KASSERT(object->handle == NULL, ("default pager with handle"));
 	}
 
 	rdpi = rounddown(pindex, SWAP_META_PAGES);
 	sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks, rdpi);
 	if (sb == NULL) {
 		if (swapblk == SWAPBLK_NONE)
 			return (SWAPBLK_NONE);
 		for (;;) {
 			sb = uma_zalloc(swblk_zone, M_NOWAIT | (curproc ==
 			    pageproc ? M_USE_RESERVE : 0));
 			if (sb != NULL) {
 				sb->p = rdpi;
 				for (i = 0; i < SWAP_META_PAGES; i++)
 					sb->d[i] = SWAPBLK_NONE;
 				if (atomic_cmpset_int(&swblk_zone_exhausted,
 				    1, 0))
 					printf("swblk zone ok\n");
 				break;
 			}
 			VM_OBJECT_WUNLOCK(object);
 			if (uma_zone_exhausted(swblk_zone)) {
 				if (atomic_cmpset_int(&swblk_zone_exhausted,
 				    0, 1))
 					printf("swap blk zone exhausted, "
 					    "increase kern.maxswzone\n");
 				vm_pageout_oom(VM_OOM_SWAPZ);
 				pause("swzonxb", 10);
 			} else
 				uma_zwait(swblk_zone);
 			VM_OBJECT_WLOCK(object);
 			sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
 			    rdpi);
 			if (sb != NULL)
 				/*
 				 * Somebody swapped out a nearby page,
 				 * allocating swblk at the rdpi index,
 				 * while we dropped the object lock.
 				 */
 				goto allocated;
 		}
 		for (;;) {
 			error = SWAP_PCTRIE_INSERT(
 			    &object->un_pager.swp.swp_blks, sb);
 			if (error == 0) {
 				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
 				    1, 0))
 					printf("swpctrie zone ok\n");
 				break;
 			}
 			VM_OBJECT_WUNLOCK(object);
 			if (uma_zone_exhausted(swpctrie_zone)) {
 				if (atomic_cmpset_int(&swpctrie_zone_exhausted,
 				    0, 1))
 					printf("swap pctrie zone exhausted, "
 					    "increase kern.maxswzone\n");
 				vm_pageout_oom(VM_OOM_SWAPZ);
 				pause("swzonxp", 10);
 			} else
 				uma_zwait(swpctrie_zone);
 			VM_OBJECT_WLOCK(object);
 			sb1 = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
 			    rdpi);
 			if (sb1 != NULL) {
 				uma_zfree(swblk_zone, sb);
 				sb = sb1;
 				goto allocated;
 			}
 		}
 	}
 allocated:
 	MPASS(sb->p == rdpi);
 
 	modpi = pindex % SWAP_META_PAGES;
 	/* Return prior contents of metadata. */
 	prev_swapblk = sb->d[modpi];
 	/* Enter block into metadata. */
 	sb->d[modpi] = swapblk;
 
 	/*
 	 * Free the swblk if we end up with the empty page run.
 	 */
 	if (swapblk == SWAPBLK_NONE &&
 	    swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) {
 		SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, rdpi);
 		uma_zfree(swblk_zone, sb);
 	}
 	return (prev_swapblk);
 }
 
 /*
  * SWP_PAGER_META_FREE() - free a range of blocks in the object's swap metadata
  *
  *	The requested range of blocks is freed, with any associated swap
  *	returned to the swap bitmap.
  *
  *	This routine will free swap metadata structures as they are cleaned
  *	out.  This routine does *NOT* operate on swap metadata associated
  *	with resident pages.
  */
 static void
 swp_pager_meta_free(vm_object_t object, vm_pindex_t pindex, vm_pindex_t count)
 {
 	struct swblk *sb;
 	daddr_t n_free, s_free;
 	vm_pindex_t last;
 	int i, limit, start;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (object->type != OBJT_SWAP || count == 0)
 		return;
 
 	swp_pager_init_freerange(&s_free, &n_free);
 	last = pindex + count;
 	for (;;) {
 		sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
 		    rounddown(pindex, SWAP_META_PAGES));
 		if (sb == NULL || sb->p >= last)
 			break;
 		start = pindex > sb->p ? pindex - sb->p : 0;
 		limit = last - sb->p < SWAP_META_PAGES ? last - sb->p :
 		    SWAP_META_PAGES;
 		for (i = start; i < limit; i++) {
 			if (sb->d[i] == SWAPBLK_NONE)
 				continue;
 			swp_pager_update_freerange(&s_free, &n_free, sb->d[i]);
 			sb->d[i] = SWAPBLK_NONE;
 		}
 		pindex = sb->p + SWAP_META_PAGES;
 		if (swp_pager_swblk_empty(sb, 0, start) &&
 		    swp_pager_swblk_empty(sb, limit, SWAP_META_PAGES)) {
 			SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
 			    sb->p);
 			uma_zfree(swblk_zone, sb);
 		}
 	}
 	swp_pager_freeswapspace(s_free, n_free);
 }
 
 /*
  * SWP_PAGER_META_FREE_ALL() - destroy all swap metadata associated with object
  *
  *	This routine locates and destroys all swap metadata associated with
  *	an object.
  */
 static void
 swp_pager_meta_free_all(vm_object_t object)
 {
 	struct swblk *sb;
 	daddr_t n_free, s_free;
 	vm_pindex_t pindex;
 	int i;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	if (object->type != OBJT_SWAP)
 		return;
 
 	swp_pager_init_freerange(&s_free, &n_free);
 	for (pindex = 0; (sb = SWAP_PCTRIE_LOOKUP_GE(
 	    &object->un_pager.swp.swp_blks, pindex)) != NULL;) {
 		pindex = sb->p + SWAP_META_PAGES;
 		for (i = 0; i < SWAP_META_PAGES; i++) {
 			if (sb->d[i] == SWAPBLK_NONE)
 				continue;
 			swp_pager_update_freerange(&s_free, &n_free, sb->d[i]);
 		}
 		SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks, sb->p);
 		uma_zfree(swblk_zone, sb);
 	}
 	swp_pager_freeswapspace(s_free, n_free);
 }
 
 /*
  * SWP_PAGER_METACTL() -  misc control of swap meta data.
  *
  *	This routine is capable of looking up, or removing swapblk
  *	assignments in the swap meta data.  It returns the swapblk being
  *	looked-up, popped, or SWAPBLK_NONE if the block was invalid.
  *
  *	When acting on a busy resident page and paging is in progress, we
  *	have to wait until paging is complete but otherwise can act on the
  *	busy page.
  *
  *	SWM_POP		remove from meta data but do not free it
  */
 static daddr_t
 swp_pager_meta_ctl(vm_object_t object, vm_pindex_t pindex, int flags)
 {
 	struct swblk *sb;
 	daddr_t r1;
 
 	if ((flags & SWM_POP) != 0)
 		VM_OBJECT_ASSERT_WLOCKED(object);
 	else
 		VM_OBJECT_ASSERT_LOCKED(object);
 
 	/*
 	 * The meta data only exists if the object is OBJT_SWAP
 	 * and even then might not be allocated yet.
 	 */
 	if (object->type != OBJT_SWAP)
 		return (SWAPBLK_NONE);
 
 	sb = SWAP_PCTRIE_LOOKUP(&object->un_pager.swp.swp_blks,
 	    rounddown(pindex, SWAP_META_PAGES));
 	if (sb == NULL)
 		return (SWAPBLK_NONE);
 	r1 = sb->d[pindex % SWAP_META_PAGES];
 	if (r1 == SWAPBLK_NONE)
 		return (SWAPBLK_NONE);
 	if ((flags & SWM_POP) != 0) {
 		sb->d[pindex % SWAP_META_PAGES] = SWAPBLK_NONE;
 		if (swp_pager_swblk_empty(sb, 0, SWAP_META_PAGES)) {
 			SWAP_PCTRIE_REMOVE(&object->un_pager.swp.swp_blks,
 			    rounddown(pindex, SWAP_META_PAGES));
 			uma_zfree(swblk_zone, sb);
 		}
 	}
 	return (r1);
 }
 
 /*
  * Returns the least page index which is greater than or equal to the
  * parameter pindex and for which there is a swap block allocated.
  * Returns object's size if the object's type is not swap or if there
  * are no allocated swap blocks for the object after the requested
  * pindex.
  */
 vm_pindex_t
 swap_pager_find_least(vm_object_t object, vm_pindex_t pindex)
 {
 	struct swblk *sb;
 	int i;
 
 	VM_OBJECT_ASSERT_LOCKED(object);
 	if (object->type != OBJT_SWAP)
 		return (object->size);
 
 	sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
 	    rounddown(pindex, SWAP_META_PAGES));
 	if (sb == NULL)
 		return (object->size);
 	if (sb->p < pindex) {
 		for (i = pindex % SWAP_META_PAGES; i < SWAP_META_PAGES; i++) {
 			if (sb->d[i] != SWAPBLK_NONE)
 				return (sb->p + i);
 		}
 		sb = SWAP_PCTRIE_LOOKUP_GE(&object->un_pager.swp.swp_blks,
 		    roundup(pindex, SWAP_META_PAGES));
 		if (sb == NULL)
 			return (object->size);
 	}
 	for (i = 0; i < SWAP_META_PAGES; i++) {
 		if (sb->d[i] != SWAPBLK_NONE)
 			return (sb->p + i);
 	}
 
 	/*
 	 * We get here if a swblk is present in the trie but it
 	 * doesn't map any blocks.
 	 */
 	MPASS(0);
 	return (object->size);
 }
 
 /*
  * System call swapon(name) enables swapping on device name,
  * which must be in the swdevsw.  Return EBUSY
  * if already swapping on this device.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct swapon_args {
 	char *name;
 };
 #endif
 
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 sys_swapon(struct thread *td, struct swapon_args *uap)
 {
 	struct vattr attr;
 	struct vnode *vp;
 	struct nameidata nd;
 	int error;
 
 	/* The caller needs the PRIV_SWAPON privilege. */
 	error = priv_check(td, PRIV_SWAPON);
 	if (error)
 		return (error);
 
 	/* Held exclusively until the "done" label below. */
 	sx_xlock(&swdev_syscall_lock);
 
 	/*
 	 * Swap metadata may not fit in the KVM if we have physical
 	 * memory of >1GB.
 	 */
 	if (swblk_zone == NULL) {
 		error = ENOMEM;
 		goto done;
 	}
 
 	/* Resolve the user-supplied path; symbolic links are followed. */
 	NDINIT(&nd, LOOKUP, ISOPEN | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
 	    uap->name, td);
 	error = namei(&nd);
 	if (error)
 		goto done;
 
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/*
 	 * Disk devices go through the GEOM backend.  When vn_isdisk()
 	 * rejects the vnode and the regular-file branch below is not
 	 * taken either, "error" keeps whatever vn_isdisk() stored in it.
 	 */
 	if (vn_isdisk(vp, &error)) {
 		error = swapongeom(vp);
 	} else if (vp->v_type == VREG &&
 	    (vp->v_mount->mnt_vfc->vfc_flags & VFCF_NETWORK) != 0 &&
 	    (error = VOP_GETATTR(vp, &attr, td->td_ucred)) == 0) {
 		/*
 		 * Allow direct swapping to NFS regular files in the same
 		 * way that nfs_mountroot() sets up diskless swapping.
 		 */
 		error = swaponvp(td, vp, attr.va_size / DEV_BSIZE);
 	}
 
 	/* The vnode is released only on failure. */
 	if (error)
 		vrele(vp);
 done:
 	sx_xunlock(&swdev_syscall_lock);
 	return (error);
 }
 
 /*
  * Warn on the console when the total configured swap is larger than
  * half of what the swblk zone could describe at 100% efficiency.
  */
 static void
 swapon_check_swzone(void)
 {
 	unsigned long configured, limit, recommended;
 
 	/* Absolute maximum we can handle, assuming 100% efficiency. */
 	limit = uma_zone_get_max(swblk_zone) * SWAP_META_PAGES;
 	configured = swap_total;
 
 	/* Recommend using no more than half of that amount. */
 	recommended = limit / 2;
 	if (configured <= recommended)
 		return;
 
 	printf("warning: total configured swap (%lu pages) "
 	    "exceeds maximum recommended amount (%lu pages).\n",
 	    configured, recommended);
 	printf("warning: increase kern.maxswzone "
 	    "or reduce amount of swap.\n");
 }
 
 /*
  * Common tail of swapon for all backends: allocate and fill in a
  * struct swdevt for the device, create its block allocator, assign it
  * a range of the swap address space and append it to swtailq.
  *
  * nblks is the device size in DEV_BSIZE units; strategy and close are
  * the backend callbacks stored in the new swdevt.
  */
 static void
 swaponsomething(struct vnode *vp, void *id, u_long nblks,
     sw_strategy_t *strategy, sw_close_t *close, dev_t dev, int flags)
 {
 	struct swdevt *sp, *tsp;
 	swblk_t dvbase;
 	u_long mblocks;
 
 	/*
 	 * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks.
 	 * First chop nblks off to page-align it, then convert.
 	 *
 	 * sw->sw_nblks is in page-sized chunks now too.
 	 */
 	nblks &= ~(ctodb(1) - 1);
 	nblks = dbtoc(nblks);
 
 	/*
 	 * If we go beyond this, we get overflows in the radix
 	 * tree bitmap code.
 	 */
 	mblocks = 0x40000000 / BLIST_META_RADIX;
 	if (nblks > mblocks) {
 		printf(
     "WARNING: reducing swap size to maximum of %luMB per unit\n",
 		    mblocks / 1024 / 1024 * PAGE_SIZE);
 		nblks = mblocks;
 	}
 
 	sp = malloc(sizeof *sp, M_VMPGDATA, M_WAITOK | M_ZERO);
 	sp->sw_vp = vp;
 	sp->sw_id = id;
 	sp->sw_dev = dev;
 	sp->sw_nblks = nblks;
 	sp->sw_used = 0;
 	sp->sw_strategy = strategy;
 	sp->sw_close = close;
 	sp->sw_flags = flags;
 
 	sp->sw_blist = blist_create(nblks, M_WAITOK);
 	/*
 	 * Do not free the first two blocks in order to avoid overwriting
 	 * any bsd label at the front of the partition
 	 *
 	 * NOTE(review): nblks is unsigned, so "nblks - 2" would wrap if
 	 * the page count computed above is below 2; nothing visible here
 	 * rejects such a device -- confirm the callers guarantee it.
 	 */
 	blist_free(sp->sw_blist, 2, nblks - 2);
 
 	/* Place the new device after the highest sw_end already in use. */
 	dvbase = 0;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(tsp, &swtailq, sw_list) {
 		if (tsp->sw_end >= dvbase) {
 			/*
 			 * We put one uncovered page between the devices
 			 * in order to definitively prevent any cross-device
 			 * I/O requests
 			 */
 			dvbase = tsp->sw_end + 1;
 		}
 	}
 	sp->sw_first = dvbase;
 	sp->sw_end = dvbase + nblks;
 	TAILQ_INSERT_TAIL(&swtailq, sp, sw_list);
 	nswapdev++;
 	/* The two blocks kept allocated above are not counted as available. */
 	swap_pager_avail += nblks - 2;
 	swap_total += nblks;
 	swapon_check_swzone();
 	swp_sizecheck();
 	mtx_unlock(&sw_dev_mtx);
 	/* Handlers run after sw_dev_mtx has been dropped. */
 	EVENTHANDLER_INVOKE(swapon, sp);
 }
 
 /*
  * SYSCALL: swapoff(devname)
  *
  * Disable swapping on the given device.
  *
  * XXX: Badly designed system call: it should use a device index
  * rather than filename as specification.  We keep sw_vp around
  * only to make this work.
  */
 #ifndef _SYS_SYSPROTO_H_
 /* Argument structure for sys_swapoff(); only declared here when
  * _SYS_SYSPROTO_H_ has not been defined. */
 struct swapoff_args {
 	char *name;	/* path handed to namei() with UIO_USERSPACE */
 };
 #endif
 
 /*
  * MPSAFE
  */
 /* ARGSUSED */
 int
 sys_swapoff(struct thread *td, struct swapoff_args *uap)
 {
 	struct vnode *vp;
 	struct nameidata nd;
 	struct swdevt *sp;
 	int error;
 
 	/* The caller needs the PRIV_SWAPOFF privilege. */
 	error = priv_check(td, PRIV_SWAPOFF);
 	if (error)
 		return (error);
 
 	sx_xlock(&swdev_syscall_lock);
 
 	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->name,
 	    td);
 	error = namei(&nd);
 	if (error)
 		goto done;
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
 	/* Find the swap device that was configured on this vnode. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_vp == vp)
 			break;
 	}
 	mtx_unlock(&sw_dev_mtx);
 
 	/*
 	 * The vnode was needed only for the pointer comparison above.
 	 * Drop the reference obtained by namei() so that it is not
 	 * leaked, whether or not a matching device was found.  This is
 	 * done after sw_dev_mtx has been released.
 	 */
 	vrele(vp);
 
 	if (sp == NULL) {
 		error = EINVAL;
 		goto done;
 	}
 	error = swapoff_one(sp, td->td_ucred);
 done:
 	sx_xunlock(&swdev_syscall_lock);
 	return (error);
 }
 
 /*
  * Disable swapping on the device described by sp and free sp.
  *
  * The caller must hold swdev_syscall_lock exclusively.  Returns 0 on
  * success, ENOMEM when there is not enough free memory plus remaining
  * swap to take the device away, or (with MAC) the error returned by
  * mac_system_check_swapoff().  On success sp has been freed and must
  * not be used by the caller any more.
  */
 static int
 swapoff_one(struct swdevt *sp, struct ucred *cred)
 {
 	u_long nblks;
 #ifdef MAC
 	int error;
 #endif
 
 	sx_assert(&swdev_syscall_lock, SA_XLOCKED);
 #ifdef MAC
 	/* The MAC check is made with the vnode locked exclusively. */
 	(void) vn_lock(sp->sw_vp, LK_EXCLUSIVE | LK_RETRY);
 	error = mac_system_check_swapoff(cred, sp->sw_vp);
 	(void) VOP_UNLOCK(sp->sw_vp, 0);
 	if (error != 0)
 		return (error);
 #endif
 	nblks = sp->sw_nblks;
 
 	/*
 	 * We can turn off this swap device safely only if the
 	 * available virtual memory in the system will fit the amount
 	 * of data we will have to page back in, plus an epsilon so
 	 * the system doesn't become critically low on swap space.
 	 */
 	if (vm_free_count() + swap_pager_avail < nblks + nswap_lowat)
 		return (ENOMEM);
 
 	/*
 	 * Prevent further allocations on this device.
 	 */
 	mtx_lock(&sw_dev_mtx);
 	sp->sw_flags |= SW_CLOSING;
 	/* The blist_fill() return value is removed from the available count. */
 	swap_pager_avail -= blist_fill(sp->sw_blist, 0, nblks);
 	swap_total -= nblks;
 	mtx_unlock(&sw_dev_mtx);
 
 	/*
 	 * Page in the contents of the device and close it.
 	 */
 	swap_pager_swapoff(sp);
 
 	sp->sw_close(curthread, sp);
 	mtx_lock(&sw_dev_mtx);
 	sp->sw_id = NULL;
 	TAILQ_REMOVE(&swtailq, sp, sw_list);
 	nswapdev--;
 	/* With no swap device left, mark the swap pager as full. */
 	if (nswapdev == 0) {
 		swap_pager_full = 2;
 		swap_pager_almost_full = 1;
 	}
 	/* Do not leave swdevhd pointing at the device being freed. */
 	if (swdevhd == sp)
 		swdevhd = NULL;
 	mtx_unlock(&sw_dev_mtx);
 	blist_destroy(sp->sw_blist);
 	free(sp, M_VMPGDATA);
 	return (0);
 }
 
 /*
  * Try to disable every configured swap device, using thread0's
  * credentials.  A device that cannot be removed is reported on the
  * console and skipped.
  */
 void
 swapoff_all(void)
 {
 	struct swdevt *sp, *spt;
 	const char *devname;
 	int error;
 
 	sx_xlock(&swdev_syscall_lock);
 
 	mtx_lock(&sw_dev_mtx);
 	/* The _SAFE variant is needed: swapoff_one() unlinks and frees sp. */
 	TAILQ_FOREACH_SAFE(sp, &swtailq, sw_list, spt) {
 		/* swapoff_one() acquires sw_dev_mtx itself, so drop it here. */
 		mtx_unlock(&sw_dev_mtx);
 		/* Fetch the name first; sp is gone once swapoff_one() succeeds. */
 		if (vn_isdisk(sp->sw_vp, NULL))
 			devname = devtoname(sp->sw_vp->v_rdev);
 		else
 			devname = "[file]";
 		error = swapoff_one(sp, thread0.td_ucred);
 		if (error != 0) {
 			printf("Cannot remove swap device %s (error=%d), "
 			    "skipping.\n", devname, error);
 		} else if (bootverbose) {
 			printf("Swap device %s removed.\n", devname);
 		}
 		mtx_lock(&sw_dev_mtx);
 	}
 	mtx_unlock(&sw_dev_mtx);
 
 	sx_xunlock(&swdev_syscall_lock);
 }
 
 void
 swap_pager_status(int *total, int *used)
 {
 	struct swdevt *sp;
 
 	*total = 0;
 	*used = 0;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		*total += sp->sw_nblks;
 		*used += sp->sw_used;
 	}
 	mtx_unlock(&sw_dev_mtx);
 }
 
 /*
  * Fill in *xs with information about the swap device at position
  * "name" (counted from 0) on swtailq.
  *
  * When devname is not NULL, the device name ("[file]" for anything
  * that is not a disk) is copied into it; at most len bytes are
  * written and, for len > 0, the result is always NUL-terminated
  * (a name that does not fit is truncated).
  *
  * Returns 0 on success, ENOENT when there is no device at that
  * position.
  */
 int
 swap_dev_info(int name, struct xswdev *xs, char *devname, size_t len)
 {
 	struct swdevt *sp;
 	const char *tmp_devname;
 	int error, n;
 
 	n = 0;
 	error = ENOENT;
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		/* Skip entries until the requested position is reached. */
 		if (n != name) {
 			n++;
 			continue;
 		}
 		xs->xsw_version = XSWDEV_VERSION;
 		xs->xsw_dev = sp->sw_dev;
 		xs->xsw_flags = sp->sw_flags;
 		xs->xsw_nblks = sp->sw_nblks;
 		xs->xsw_used = sp->sw_used;
 		if (devname != NULL) {
 			if (vn_isdisk(sp->sw_vp, NULL))
 				tmp_devname = devtoname(sp->sw_vp->v_rdev);
 			else
 				tmp_devname = "[file]";
 			strncpy(devname, tmp_devname, len);
 			/*
 			 * strncpy() does not terminate the destination
 			 * when the source has len or more characters.
 			 */
 			if (len > 0)
 				devname[len - 1] = '\0';
 		}
 		error = 0;
 		break;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	return (error);
 }
 
 #if defined(COMPAT_FREEBSD11)
 /*
  * Layout emitted by sysctl_vm_swap_info() when the caller's buffer
  * length equals sizeof(struct xswdev11); xsw_dev is only 32 bits wide
  * here, so the device number is truncated on conversion.
  */
 #define XSWDEV_VERSION_11	1
 struct xswdev11 {
 	u_int	xsw_version;
 	uint32_t xsw_dev;
 	int	xsw_flags;
 	int	xsw_nblks;
 	int     xsw_used;
 };
 #endif
 
 #if defined(__amd64__) && defined(COMPAT_FREEBSD32)
 /*
  * Layout emitted by sysctl_vm_swap_info() when the caller's buffer
  * length equals sizeof(struct xswdev32); the device number is split
  * into its low (xsw_dev1) and high (xsw_dev2) 32 bits.
  */
 struct xswdev32 {
 	u_int	xsw_version;
 	u_int	xsw_dev1, xsw_dev2;
 	int	xsw_flags;
 	int	xsw_nblks;
 	int     xsw_used;
 };
 #endif
 
 /*
  * Handler for the vm.swap_info node.  The single name component is
  * the index of the swap device to describe.  The layout copied out is
  * chosen from the size of the caller's buffer: the 32-bit compat or
  * FreeBSD 11 compat structure when the size matches one of them
  * (and the respective option is compiled in), struct xswdev otherwise.
  */
 static int
 sysctl_vm_swap_info(SYSCTL_HANDLER_ARGS)
 {
 	struct xswdev xs;
 #if defined(__amd64__) && defined(COMPAT_FREEBSD32)
 	struct xswdev32 xs32;
 #endif
 #if defined(COMPAT_FREEBSD11)
 	struct xswdev11 xs11;
 #endif
 	int error;
 
 	/* Exactly one name component (the device index) is expected. */
 	if (arg2 != 1)
 		return (EINVAL);
 
 	error = swap_dev_info(*(int *)arg1, &xs, NULL, 0);
 	if (error != 0)
 		return (error);
 
 #if defined(__amd64__) && defined(COMPAT_FREEBSD32)
 	if (req->oldlen == sizeof(xs32)) {
 		xs32.xsw_version = XSWDEV_VERSION;
 		xs32.xsw_dev1 = xs.xsw_dev;
 		xs32.xsw_dev2 = xs.xsw_dev >> 32;
 		xs32.xsw_flags = xs.xsw_flags;
 		xs32.xsw_nblks = xs.xsw_nblks;
 		xs32.xsw_used = xs.xsw_used;
 		return (SYSCTL_OUT(req, &xs32, sizeof(xs32)));
 	}
 #endif
 #if defined(COMPAT_FREEBSD11)
 	if (req->oldlen == sizeof(xs11)) {
 		xs11.xsw_version = XSWDEV_VERSION_11;
 		xs11.xsw_dev = xs.xsw_dev; /* truncation */
 		xs11.xsw_flags = xs.xsw_flags;
 		xs11.xsw_nblks = xs.xsw_nblks;
 		xs11.xsw_used = xs.xsw_used;
 		return (SYSCTL_OUT(req, &xs11, sizeof(xs11)));
 	}
 #endif
 	return (SYSCTL_OUT(req, &xs, sizeof(xs)));
 }
 
 /* vm.nswapdev: read-only view of the nswapdev counter. */
 SYSCTL_INT(_vm, OID_AUTO, nswapdev, CTLFLAG_RD, &nswapdev, 0,
     "Number of swap devices");
 /* vm.swap_info: read-only node served by sysctl_vm_swap_info(). */
 SYSCTL_NODE(_vm, OID_AUTO, swap_info, CTLFLAG_RD | CTLFLAG_MPSAFE,
     sysctl_vm_swap_info,
     "Swap statistics by device");
 
 /*
  * Return the approximate number of pages of swap in use by a vmspace.
  * Swap blocks that are shadowed or not yet copied on write are not
  * accounted.  The map must be locked by the caller.
  */
 long
 vmspace_swap_count(struct vmspace *vmspace)
 {
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_object_t obj;
 	struct swblk *blk;
 	vm_pindex_t first, limit;
 	long npages;
 	int slot;
 
 	npages = 0;
 	map = &vmspace->vm_map;
 
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
 		if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) != 0)
 			continue;
 		obj = entry->object.vm_object;
 		if (obj == NULL || obj->type != OBJT_SWAP)
 			continue;
 
 		VM_OBJECT_RLOCK(obj);
 		/* Re-check the type now that the object lock is held. */
 		if (obj->type == OBJT_SWAP) {
 			/* Page index range [first, limit) of this entry. */
 			first = OFF_TO_IDX(entry->offset);
 			limit = first + OFF_TO_IDX(entry->end - entry->start);
 			for (;;) {
 				blk = SWAP_PCTRIE_LOOKUP_GE(
 				    &obj->un_pager.swp.swp_blks, first);
 				if (blk == NULL || blk->p >= limit)
 					break;
 				for (slot = 0; slot < SWAP_META_PAGES; slot++) {
 					if (blk->p + slot >= limit)
 						continue;
 					if (blk->d[slot] != SWAPBLK_NONE)
 						npages++;
 				}
 				first = blk->p + SWAP_META_PAGES;
 			}
 		}
 		VM_OBJECT_RUNLOCK(obj);
 	}
 	return (npages);
 }
 
 /*
  * GEOM backend
  *
  * Swapping onto disk devices.
  *
  */
 
 /* Forward declaration; needed by the class initializer below. */
 static g_orphan_t swapgeom_orphan;
 
 /* GEOM class "SWAP"; its only method set here is the orphan handler. */
 static struct g_class g_swap_class = {
 	.name = "SWAP",
 	.version = G_VERSION,
 	.orphan = swapgeom_orphan,
 };
 
 DECLARE_GEOM_CLASS(g_swap_class, g_class);
 
 
 /*
  * Event handler that tears down the consumer passed in arg: drop the
  * read and write access counts, detach it and destroy it.
  */
 static void
 swapgeom_close_ev(void *arg, int flags)
 {
 	struct g_consumer *consumer = arg;
 
 	g_access(consumer, -1, -1, 0);
 	g_detach(consumer);
 	g_destroy_consumer(consumer);
 }
 
 /*
  * Take a reference on the g_consumer on behalf of an inflight
  * transaction.  sw_dev_mtx must be held.
  */
 static void
 swapgeom_acquire(struct g_consumer *cp)
 {
 
 	mtx_assert(&sw_dev_mtx, MA_OWNED);
 	cp->index += 1;
 }
 
 /*
  * Drop a reference on the g_consumer.  sw_dev_mtx must be held.  When
  * the last reference goes away the close is not performed here but
  * posted as an event, since this function might be called from the
  * biodone context.  sw_id is cleared only if posting succeeded.
  */
 static void
 swapgeom_release(struct g_consumer *cp, struct swdevt *sp)
 {
 
 	mtx_assert(&sw_dev_mtx, MA_OWNED);
 	if (--cp->index != 0)
 		return;
 	if (g_post_event(swapgeom_close_ev, cp, M_NOWAIT, NULL) == 0)
 		sp->sw_id = NULL;
 }
 
 /*
  * bio_done callback installed by swapgeom_strategy(): copy the
  * completion status of the bio into the originating buf, complete the
  * buf, then drop the consumer reference and destroy the bio.
  */
 static void
 swapgeom_done(struct bio *bp2)
 {
 	struct swdevt *sp;
 	struct buf *bp;
 	struct g_consumer *cp;
 
 	/* bio_caller2 and bio_caller1 were set up by swapgeom_strategy(). */
 	bp = bp2->bio_caller2;
 	cp = bp2->bio_from;
 	bp->b_ioflags = bp2->bio_flags;
 	if (bp2->bio_error)
 		bp->b_ioflags |= BIO_ERROR;
 	/* Residual is whatever part of the request was not completed. */
 	bp->b_resid = bp->b_bcount - bp2->bio_completed;
 	bp->b_error = bp2->bio_error;
 	bp->b_caller1 = NULL;
 	bufdone(bp);
 	sp = bp2->bio_caller1;
 	mtx_lock(&sw_dev_mtx);
 	swapgeom_release(cp, sp);
 	mtx_unlock(&sw_dev_mtx);
 	g_destroy_bio(bp2);
 }
 
 /*
  * Strategy routine of the GEOM backend: wrap the buf in a bio and
  * send it to the consumer stored in sp->sw_id.  The buf is completed
  * immediately with ENXIO when the consumer is gone and with ENOMEM
  * when no bio can be allocated.
  */
 static void
 swapgeom_strategy(struct buf *bp, struct swdevt *sp)
 {
 	struct bio *bio;
 	struct g_consumer *cp;
 
 	mtx_lock(&sw_dev_mtx);
 	cp = sp->sw_id;
 	if (cp == NULL) {
 		mtx_unlock(&sw_dev_mtx);
 		bp->b_error = ENXIO;
 		bp->b_ioflags |= BIO_ERROR;
 		bufdone(bp);
 		return;
 	}
 	/* Reference held for the I/O; dropped again in swapgeom_done(). */
 	swapgeom_acquire(cp);
 	mtx_unlock(&sw_dev_mtx);
 	/* Writes and reads use different bio allocation routines. */
 	if (bp->b_iocmd == BIO_WRITE)
 		bio = g_new_bio();
 	else
 		bio = g_alloc_bio();
 	if (bio == NULL) {
 		/* Undo the reference taken above before failing the buf. */
 		mtx_lock(&sw_dev_mtx);
 		swapgeom_release(cp, sp);
 		mtx_unlock(&sw_dev_mtx);
 		bp->b_error = ENOMEM;
 		bp->b_ioflags |= BIO_ERROR;
 		printf("swap_pager: cannot allocate bio\n");
 		bufdone(bp);
 		return;
 	}
 
 	bp->b_caller1 = bio;
 	bio->bio_caller1 = sp;
 	bio->bio_caller2 = bp;
 	bio->bio_cmd = bp->b_iocmd;
 	/* b_blkno counts pages from sw_first; convert to a byte offset. */
 	bio->bio_offset = (bp->b_blkno - sp->sw_first) * PAGE_SIZE;
 	bio->bio_length = bp->b_bcount;
 	bio->bio_done = swapgeom_done;
 	if (!buf_mapped(bp)) {
 		/* Unmapped buffer: hand over the page array instead of data. */
 		bio->bio_ma = bp->b_pages;
 		bio->bio_data = unmapped_buf;
 		bio->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
 		bio->bio_ma_n = bp->b_npages;
 		bio->bio_flags |= BIO_UNMAPPED;
 	} else {
 		bio->bio_data = bp->b_data;
 		bio->bio_ma = NULL;
 	}
 	g_io_request(bio, cp);
 	return;
 }
 
 /*
  * Orphan method of the SWAP GEOM class.  Marks the swap device that
  * uses this consumer (if any) as SW_CLOSING and drops the reference
  * the consumer was created with; the consumer is closed right away
  * when that was the last reference.
  */
 static void
 swapgeom_orphan(struct g_consumer *cp)
 {
 	struct swdevt *sp;
 	int destroy;
 
 	mtx_lock(&sw_dev_mtx);
 	/* sp is left NULL when no device on the list uses cp. */
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		if (sp->sw_id == cp) {
 			sp->sw_flags |= SW_CLOSING;
 			break;
 		}
 	}
 	/*
 	 * Drop reference we were created with. Do directly since we're in a
 	 * special context where we don't have to queue the call to
 	 * swapgeom_close_ev().
 	 */
 	cp->index--;
 	destroy = ((sp != NULL) && (cp->index == 0));
 	if (destroy)
 		sp->sw_id = NULL;
 	mtx_unlock(&sw_dev_mtx);
 	/* The close itself runs after sw_dev_mtx has been dropped. */
 	if (destroy)
 		swapgeom_close_ev(cp, 0);
 }
 
 /*
  * Close routine of the GEOM backend: detach the consumer from the
  * swap device and have it torn down by swapgeom_close_ev().  Does
  * nothing further when sw_id was already NULL.
  */
 static void
 swapgeom_close(struct thread *td, struct swdevt *sw)
 {
 	struct g_consumer *cp;
 
 	/* Take the consumer pointer and clear sw_id under sw_dev_mtx. */
 	mtx_lock(&sw_dev_mtx);
 	cp = sw->sw_id;
 	sw->sw_id = NULL;
 	mtx_unlock(&sw_dev_mtx);
 
 	/*
 	 * swapgeom_close() may be called from the biodone context,
 	 * where we cannot perform topology changes.  Delegate the
 	 * work to the events thread.
 	 */
 	if (cp != NULL)
 		g_waitfor_event(swapgeom_close_ev, cp, M_WAITOK, NULL);
 }
 
 /*
  * Attach a new SWAP consumer to the provider behind dev and register
  * it as a swap device.  swapongeom() calls this with the GEOM topology
  * lock held.
  *
  * Returns 0 on success, ENODEV when dev has no provider, EBUSY when
  * the provider is already used for swap, or the g_access() error.
  */
 static int
 swapongeom_locked(struct cdev *dev, struct vnode *vp)
 {
 	struct g_provider *pp;
 	struct g_consumer *cp;
 	static struct g_geom *gp;	/* created on first use, then reused */
 	struct swdevt *sp;
 	u_long nblks;
 	int error;
 
 	pp = g_dev_getprovider(dev);
 	if (pp == NULL)
 		return (ENODEV);
 	/* Refuse a provider that an existing swap device is attached to. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(sp, &swtailq, sw_list) {
 		cp = sp->sw_id;
 		if (cp != NULL && cp->provider == pp) {
 			mtx_unlock(&sw_dev_mtx);
 			return (EBUSY);
 		}
 	}
 	mtx_unlock(&sw_dev_mtx);
 	if (gp == NULL)
 		gp = g_new_geomf(&g_swap_class, "swap");
 	cp = g_new_consumer(gp);
 	cp->index = 1;	/* Number of active I/Os, plus one for being active. */
 	cp->flags |=  G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
 	g_attach(cp, pp);
 	/*
 	 * XXX: Every time you think you can improve the margin for
 	 * footshooting, somebody depends on the ability to do so:
 	 * savecore(8) wants to write to our swapdev so we cannot
 	 * set an exclusive count :-(
 	 */
 	error = g_access(cp, 1, 1, 0);
 	if (error != 0) {
 		/* Undo the attach and consumer creation done above. */
 		g_detach(cp);
 		g_destroy_consumer(cp);
 		return (error);
 	}
 	/* swaponsomething() expects the size in DEV_BSIZE units. */
 	nblks = pp->mediasize / DEV_BSIZE;
 	swaponsomething(vp, cp, nblks, swapgeom_strategy,
 	    swapgeom_close, dev2udev(dev),
 	    (pp->flags & G_PF_ACCEPT_UNMAPPED) != 0 ? SW_UNMAPPED : 0);
 	return (0);
 }
 
 /*
  * Enable swapping on the disk device behind vp.  The vnode is locked
  * exclusively for the duration of the call.  Returns ENOENT when vp is
  * not a character device or has been doomed; otherwise the result of
  * swapongeom_locked(), which runs under the GEOM topology lock.
  */
 static int
 swapongeom(struct vnode *vp)
 {
 	int error;
 
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_type == VCHR && (vp->v_iflag & VI_DOOMED) == 0) {
 		g_topology_lock();
 		error = swapongeom_locked(vp->v_rdev, vp);
 		g_topology_unlock();
 	} else {
 		error = ENOENT;
 	}
 	VOP_UNLOCK(vp, 0);
 	return (error);
 }
 
 /*
  * VNODE backend
  *
  * This is used mainly for network filesystem (read: probably only tested
  * with NFS) swapfiles.
  *
  */
 
 /*
  * Strategy routine of the vnode backend: retarget the buf at the
  * vnode stored in sp->sw_id and pass it to bstrategy().
  */
 static void
 swapdev_strategy(struct buf *bp, struct swdevt *sp)
 {
 	struct vnode *vp2;
 
 	/* Make the block number relative to this device, in ctodb() units. */
 	bp->b_blkno = ctodb(bp->b_blkno - sp->sw_first);
 
 	vp2 = sp->sw_id;
 	vhold(vp2);
 	/* For writes, move the write reference over to vp2's bufobj. */
 	if (bp->b_iocmd == BIO_WRITE) {
 		if (bp->b_bufobj)
 			bufobj_wdrop(bp->b_bufobj);
 		bufobj_wref(&vp2->v_bufobj);
 	}
 	if (bp->b_bufobj != &vp2->v_bufobj)
 		bp->b_bufobj = &vp2->v_bufobj;
 	bp->b_vp = vp2;
 	bp->b_iooffset = dbtob(bp->b_blkno);
 	bstrategy(bp);
 	return;
 }
 
 /*
  * Close routine of the vnode backend: close the swap vnode, which
  * swaponvp() opened for reading and writing, and release it.
  */
 static void
 swapdev_close(struct thread *td, struct swdevt *sp)
 {
 	struct vnode *swapvp;
 
 	swapvp = sp->sw_vp;
 	VOP_CLOSE(swapvp, FREAD | FWRITE, td->td_ucred, td);
 	vrele(swapvp);
 }
 
 
 /*
  * Enable swapping on the vnode vp, whose size is nblks DEV_BSIZE
  * blocks.  Returns ENXIO for a zero size, EBUSY when vp is already in
  * use as a swap device, or the error of the MAC check / VOP_OPEN().
  */
 static int
 swaponvp(struct thread *td, struct vnode *vp, u_long nblks)
 {
 	struct swdevt *dev;
 	int error;
 
 	if (nblks == 0)
 		return (ENXIO);
 
 	/* Look for an existing swap device backed by the same vnode. */
 	mtx_lock(&sw_dev_mtx);
 	TAILQ_FOREACH(dev, &swtailq, sw_list) {
 		if (dev->sw_id == vp)
 			break;
 	}
 	mtx_unlock(&sw_dev_mtx);
 	if (dev != NULL)
 		return (EBUSY);
 
 	(void) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	error = 0;
 #ifdef MAC
 	error = mac_system_check_swapon(td->td_ucred, vp);
 #endif
 	/* The vnode is opened only if the MAC check (if any) passed. */
 	if (error == 0)
 		error = VOP_OPEN(vp, FREAD | FWRITE, td->td_ucred, td, NULL);
 	(void) VOP_UNLOCK(vp, 0);
 	if (error)
 		return (error);
 
 	swaponsomething(vp, vp, nblks, swapdev_strategy, swapdev_close,
 	    NODEV, 0);
 	return (0);
 }
 
 /*
  * Sysctl handler that reads or changes nsw_wcount_async_max.  A new
  * value must lie in the range [1, nswbuf / 2]; otherwise EINVAL is
  * returned.  The current count nsw_wcount_async is adjusted by the
  * same amount as the maximum, sleeping when it would become negative.
  */
 static int
 sysctl_swap_async_max(SYSCTL_HANDLER_ARGS)
 {
 	int error, new, n;
 
 	new = nsw_wcount_async_max;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	/* Nothing more to do on error or when no new value was supplied. */
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 
 	if (new > nswbuf / 2 || new < 1)
 		return (EINVAL);
 
 	mtx_lock(&swbuf_mtx);
 	while (nsw_wcount_async_max != new) {
 		/*
 		 * Adjust difference.  If the current async count is too low,
 		 * we will need to squeeze our update slowly in.  Sleep with a
 		 * higher priority than getpbuf() to finish faster.
 		 */
 		n = new - nsw_wcount_async_max;
 		if (nsw_wcount_async + n >= 0) {
 			/* The whole difference can be applied at once. */
 			nsw_wcount_async += n;
 			nsw_wcount_async_max += n;
 			wakeup(&nsw_wcount_async);
 		} else {
 			/*
 			 * Take away what is currently available and sleep
 			 * on nsw_wcount_async before trying again.
 			 */
 			nsw_wcount_async_max -= nsw_wcount_async;
 			nsw_wcount_async = 0;
 			msleep(&nsw_wcount_async, &swbuf_mtx, PSWP,
 			    "swpsysctl", 0);
 		}
 	}
 	mtx_unlock(&swbuf_mtx);
 
 	return (0);
 }
Index: projects/fuse2/usr.bin/nfsstat/nfsstat.c
===================================================================
--- projects/fuse2/usr.bin/nfsstat/nfsstat.c	(revision 350434)
+++ projects/fuse2/usr.bin/nfsstat/nfsstat.c	(revision 350435)
@@ -1,1206 +1,1206 @@
 /*
  * Copyright (c) 1983, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * Rick Macklem at The University of Guelph.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 /*-
  * Copyright (c) 2004, 2008, 2009 Silicon Graphics International Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    substantially similar to the "NO WARRANTY" disclaimer below
  *    ("Disclaimer") and any redistribution must be conditioned upon
  *    including a substantially similar Disclaimer requirement for further
  *    binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  */
 
 
 #ifndef lint
 static const char copyright[] =
 "@(#) Copyright (c) 1983, 1989, 1993\n\
 	The Regents of the University of California.  All rights reserved.\n";
 #endif /* not lint */
 
 #ifndef lint
 #if 0
 static char sccsid[] = "@(#)nfsstat.c	8.2 (Berkeley) 3/31/95";
 #endif
 static const char rcsid[] =
   "$FreeBSD$";
 #endif /* not lint */
 
 #include <sys/param.h>
 #include <sys/module.h>
 #include <sys/mount.h>
 #include <sys/time.h>
 #include <sys/sysctl.h>
 #include <nfs/nfsproto.h>
 #include <nfsclient/nfs.h>
 #include <nfsserver/nfs.h>
 #include <nfs/nfssvc.h>
 
 #include <fs/nfs/nfsport.h>
 
 #include <signal.h>
 #include <fcntl.h>
 #include <ctype.h>
 #include <errno.h>
 #include <limits.h>
 #include <nlist.h>
 #include <unistd.h>
 #include <stdio.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 #include <paths.h>
 #include <devstat.h>
 #include <err.h>
 
 #include <libxo/xo.h>
 
 /* Option state filled in by main() while parsing the command line. */
 static int widemode = 0;	/* set to 1 by -W */
 static int zflag = 0;		/* set to 1 by -z */
 static int printtitle = 1;	/* cleared by -q */
 static struct nfsstatsv1 ext_nfsstats;	/* buffer passed to nfssvc() */
 static int extra_output = 0;	/* 1 after -e, 2 after -E, else 0 */
 
 static void intpr(int, int);
 static void printhdr(int, int, int);
 static void usage(void);
 static char *sperc1(int, int);
 static char *sperc2(int, int);
 static void exp_intpr(int, int, int);
 static void exp_sidewaysintpr(u_int, int, int, int);
 static void compute_new_stats(struct nfsstatsv1 *cur_stats,
     struct nfsstatsv1 *prev_stats, int curop, long double etime,
     long double *mbsec, long double *kb_per_transfer,
     long double *transfers_per_second, long double *ms_per_transfer,
     uint64_t *queue_len, long double *busy_pct);
 
 /* Difference of one field between the "nfsstats" and "lastst" objects
  * that must be in scope where the macro is used. */
 #define DELTA(field)	(nfsstats.field - lastst.field)
 
 /* Indices into statstruct[] below. */
 #define	STAT_TYPE_READ		0
 #define	STAT_TYPE_WRITE		1
 #define	STAT_TYPE_COMMIT	2
 #define	NUM_STAT_TYPES		3
 
 /* Pairs a STAT_TYPE_* value with the corresponding NFSV4OP_* number. */
 struct stattypes {
 	int stat_type;
 	int nfs_type;
 };
 static struct stattypes statstruct[] = {
 	{STAT_TYPE_READ, NFSV4OP_READ},
 	{STAT_TYPE_WRITE, NFSV4OP_WRITE},
 	{STAT_TYPE_COMMIT, NFSV4OP_COMMIT}
 };
 
 /* Look up the NFSV4OP_* number; stat_type is used as the array index. */
 #define	STAT_TYPE_TO_NFS(stat_type)	statstruct[stat_type].nfs_type
 
 /* Version string passed to xo_set_version() in main(). */
 #define	NFSSTAT_XO_VERSION	"1"
 
 /*
  * Parse the command line, then print the NFS statistics either once
  * or, when an interval was given, through exp_sidewaysintpr().
  * The -m option prints the mount options of the NFS mounts and exits.
  */
 int
 main(int argc, char **argv)
 {
 	u_int interval;
 	int clientOnly = -1;
 	int serverOnly = -1;
 	int newStats = 0;
 	int ch;
 	char *memf, *nlistf;
 	int mntlen, i;
 	char buf[1024];
 	struct statfs *mntbuf;
 	struct nfscl_dumpmntopts dumpmntopts;
 
 	interval = 0;
 	memf = nlistf = NULL;
 
 	/* Let libxo consume its own options first. */
 	argc = xo_parse_args(argc, argv);
 	if (argc < 0)
 		exit(1);
 
 	xo_set_version(NFSSTAT_XO_VERSION);
 
 	while ((ch = getopt(argc, argv, "cdEesWM:mN:w:zq")) != -1)
 		switch(ch) {
 		case 'M':
 			memf = optarg;
 			break;
 		case 'm':
 			/* Display mount options for NFS mount points. */
 			mntlen = getmntinfo(&mntbuf, MNT_NOWAIT);
 			for (i = 0; i < mntlen; i++) {
 				if (strcmp(mntbuf->f_fstypename, "nfs") == 0) {
 					dumpmntopts.ndmnt_fname =
 					    mntbuf->f_mntonname;
 					dumpmntopts.ndmnt_buf = buf;
 					dumpmntopts.ndmnt_blen = sizeof(buf);
 					if (nfssvc(NFSSVC_DUMPMNTOPTS,
 					    &dumpmntopts) >= 0)
 						printf("%s on %s\n%s\n",
 						    mntbuf->f_mntfromname,
 						    mntbuf->f_mntonname, buf);
 					else if (errno == EPERM)
 						errx(1, "Only priviledged users"
 						    " can use the -m option");
 				}
 				mntbuf++;
 			}
 			exit(0);
 		case 'N':
 			nlistf = optarg;
 			break;
 		case 'W':
 			widemode = 1;
 			break;
 		case 'w':
 			interval = atoi(optarg);
 			break;
 		case 'c':
 			clientOnly = 1;
 			if (serverOnly < 0)
 				serverOnly = 0;
 			break;
 		case 'd':
 			newStats = 1;
 			if (interval == 0)
 				interval = 1;
 			break;
 		case 's':
 			serverOnly = 1;
 			if (clientOnly < 0)
 				clientOnly = 0;
 			break;
 		case 'z':
 			zflag = 1;
 			break;
 		case 'E':
 			/*
 			 * This is a usage error, not a failed system call:
 			 * use xo_errx() so that no strerror(errno) text is
 			 * appended to the message.
 			 */
 			if (extra_output != 0)
 				xo_errx(1, "-e and -E are mutually exclusive");
 			extra_output = 2;
 			break;
 		case 'e':
 			/* Same as above: no errno text for a usage error. */
 			if (extra_output != 0)
 				xo_errx(1, "-e and -E are mutually exclusive");
 			extra_output = 1;
 			break;
 		case 'q':
 			printtitle = 0;
 			break;
 		case '?':
 		default:
 			usage();
 		}
 	argc -= optind;
 	argv += optind;
 
 	/*
 	 * Old-style positional arguments: interval, then the name list
 	 * file, then the memory file.
 	 */
 #define	BACKWARD_COMPATIBILITY
 #ifdef	BACKWARD_COMPATIBILITY
 	if (*argv) {
 		interval = atoi(*argv);
 		if (*++argv) {
 			nlistf = *argv;
 			if (*++argv)
 				memf = *argv;
 		}
 	}
 #endif
 	if (modfind("nfscommon") < 0)
 		xo_err(1, "NFS client/server not loaded");
 
 	if (interval) {
 		exp_sidewaysintpr(interval, clientOnly, serverOnly,
 		    newStats);
 	} else {
 		xo_open_container("nfsstat");
 		/* extra_output is 1 for -e and 2 for -E; pass 0 or 1 on. */
 		if (extra_output != 0)
 			exp_intpr(clientOnly, serverOnly, extra_output - 1);
 		else
 			intpr(clientOnly, serverOnly);
 		xo_close_container("nfsstat");
 	}
 
 	xo_finish();
 	exit(0);
 }
 
 /*
  * Print a description of the nfs stats.
  */
 static void
 intpr(int clientOnly, int serverOnly)
 {
 	int nfssvc_flag;
 
 	nfssvc_flag = NFSSVC_GETSTATS | NFSSVC_NEWSTRUCT;
 	if (zflag != 0) {
 		if (clientOnly != 0)
 			nfssvc_flag |= NFSSVC_ZEROCLTSTATS;
 		if (serverOnly != 0)
 			nfssvc_flag |= NFSSVC_ZEROSRVSTATS;
 	}
 	ext_nfsstats.vers = NFSSTATS_V1;
 	if (nfssvc(nfssvc_flag, &ext_nfsstats) < 0)
 		xo_err(1, "Can't get stats");
 	if (clientOnly) {
 		xo_open_container("clientstats");
 
 		if (printtitle)
 			xo_emit("{T:Client Info:\n");
 
 		xo_open_container("operations");
 		xo_emit("{T:Rpc Counts:}\n");
 
 		xo_emit("{T:Getattr/%13.13s}{T:Setattr/%13.13s}"
 		    "{T:Lookup/%13.13s}{T:Readlink/%13.13s}"
 		    "{T:Read/%13.13s}{T:Write/%13.13s}"
 		  "{T:Create/%13.13s}{T:Remove/%13.13s}\n");
 		xo_emit("{:getattr/%13ju}{:setattr/%13ju}"
 		    "{:lookup/%13ju}{:readlink/%13ju}"
 		    "{:read/%13ju}{:write/%13ju}"
 		    "{:create/%13ju}{:remove/%13ju}\n",
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_GETATTR],
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SETATTR],
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LOOKUP],
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READLINK],
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READ],
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_WRITE],
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_CREATE],
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_REMOVE]);
 
 		xo_emit("{T:Rename/%13.13s}{T:Link/%13.13s}"
 		    "{T:Symlink/%13.13s}{T:Mkdir/%13.13s}"
 		    "{T:Rmdir/%13.13s}{T:Readdir/%13.13s}"
 		  "{T:RdirPlus/%13.13s}{T:Access/%13.13s}\n");
 		xo_emit("{:rename/%13ju}{:link/%13ju}"
 		    "{:symlink/%13ju}{:mkdir/%13ju}"
 		    "{:rmdir/%13ju}{:readdir/%13ju}"
 		    "{:rdirplus/%13ju}{:access/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_RENAME],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LINK],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SYMLINK],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_MKDIR],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_RMDIR],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READDIR],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READDIRPLUS],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_ACCESS]);
 
 		xo_emit("{T:Mknod/%13.13s}{T:Fsstat/%13.13s}"
 		    "{T:Fsinfo/%13.13s}{T:PathConf/%13.13s}"
 		    "{T:Commit/%13.13s}\n");
 		xo_emit("{:mknod/%13ju}{:fsstat/%13ju}"
 		    "{:fsinfo/%13ju}{:pathconf/%13ju}"
 		    "{:commit/%13ju}\n",
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_MKNOD],
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_FSSTAT],
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_FSINFO],
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_PATHCONF],
 			(uintmax_t)ext_nfsstats.rpccnt[NFSPROC_COMMIT]);
 
 		xo_close_container("operations");
 
 		xo_open_container("rpcs");
 		xo_emit("{T:Rpc Info:}\n");
 
 		xo_emit("{T:TimedOut/%13.13s}{T:Invalid/%13.13s}"
 		    "{T:X Replies/%13.13s}{T:Retries/%13.13s}"
 		    "{T:Requests/%13.13s}\n");
 		xo_emit("{:timedout/%13ju}{:invalid/%13ju}"
 		    "{:xreplies/%13ju}{:retries/%13ju}"
 		    "{:requests/%13ju}\n",
 			(uintmax_t)ext_nfsstats.rpctimeouts,
 			(uintmax_t)ext_nfsstats.rpcinvalid,
 			(uintmax_t)ext_nfsstats.rpcunexpected,
 			(uintmax_t)ext_nfsstats.rpcretries,
 			(uintmax_t)ext_nfsstats.rpcrequests);
 		xo_close_container("rpcs");
 
 		xo_open_container("cache");
 		xo_emit("{T:Cache Info:}\n");
 
 		xo_emit("{T:Attr Hits/%13.13s}{T:Attr Misses/%13.13s}"
 		    "{T:Lkup Hits/%13.13s}{T:Lkup Misses/%13.13s}"
 		    "{T:BioR Hits/%13.13s}{T:BioR Misses/%13.13s}"
 		    "{T:BioW Hits/%13.13s}{T:BioW Misses/%13.13s}\n");
 		xo_emit("{:attrhits/%13ju}{:attrmisses/%13ju}"
 		    "{:lkuphits/%13ju}{:lkupmisses/%13ju}"
 		    "{:biorhits/%13ju}{:biormisses/%13ju}"
 		    "{:biowhits/%13ju}{:biowmisses/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.attrcache_hits,
 		    (uintmax_t)ext_nfsstats.attrcache_misses,
 		    (uintmax_t)ext_nfsstats.lookupcache_hits,
 		    (uintmax_t)ext_nfsstats.lookupcache_misses,
 		    (uintmax_t)(ext_nfsstats.biocache_reads -
 		    ext_nfsstats.read_bios),
 		    (uintmax_t)ext_nfsstats.read_bios,
 		    (uintmax_t)(ext_nfsstats.biocache_writes -
 		    ext_nfsstats.write_bios),
 		    (uintmax_t)ext_nfsstats.write_bios);
 
 		xo_emit("{T:BioRL Hits/%13.13s}{T:BioRL Misses/%13.13s}"
 		    "{T:BioD Hits/%13.13s}{T:BioD Misses/%13.13s}"
 		    "{T:DirE Hits/%13.13s}{T:DirE Misses/%13.13s}"
 		    "{T:Accs Hits/%13.13s}{T:Accs Misses/%13.13s}\n");
 		xo_emit("{:biosrlhits/%13ju}{:biorlmisses/%13ju}"
 		    "{:biodhits/%13ju}{:biodmisses/%13ju}"
 		    "{:direhits/%13ju}{:diremisses/%13ju}"
 		    "{:accshits/%13ju}{:accsmisses/%13ju}\n",
 		    (uintmax_t)(ext_nfsstats.biocache_readlinks -
 		    ext_nfsstats.readlink_bios),
 		    (uintmax_t)ext_nfsstats.readlink_bios,
 		    (uintmax_t)(ext_nfsstats.biocache_readdirs -
 		    ext_nfsstats.readdir_bios),
 		    (uintmax_t)ext_nfsstats.readdir_bios,
 		    (uintmax_t)ext_nfsstats.direofcache_hits,
 		    (uintmax_t)ext_nfsstats.direofcache_misses,
 		    (uintmax_t)ext_nfsstats.accesscache_hits,
 		    (uintmax_t)ext_nfsstats.accesscache_misses);
 
 		xo_close_container("cache");
 
 		xo_close_container("clientstats");
 	}
 	if (serverOnly) {
 		xo_open_container("serverstats");
 
 		xo_emit("{T:Server Info:}\n");
 		xo_open_container("operations");
 
 		xo_emit("{T:Getattr/%13.13s}{T:Setattr/%13.13s}"
 		    "{T:Lookup/%13.13s}{T:Readlink/%13.13s}"
 		    "{T:Read/%13.13s}{T:Write/%13.13s}"
 		    "{T:Create/%13.13s}{T:Remove/%13.13s}\n");
 		xo_emit("{:getattr/%13ju}{:setattr/%13ju}"
 		    "{:lookup/%13ju}{:readlink/%13ju}"
 		    "{:read/%13ju}{:write/%13ju}"
 		    "{:create/%13ju}{:remove/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_GETATTR],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SETATTR],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LOOKUP],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READLINK],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READ],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_WRITE],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_CREATE],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_REMOVE]);
 
 		xo_emit("{T:Rename/%13.13s}{T:Link/%13.13s}"
 		    "{T:Symlink/%13.13s}{T:Mkdir/%13.13s}"
 		    "{T:Rmdir/%13.13s}{T:Readdir/%13.13s}"
 		    "{T:RdirPlus/%13.13s}{T:Access/%13.13s}\n");
 		xo_emit("{:rename/%13ju}{:link/%13ju}"
 		    "{:symlink/%13ju}{:mkdir/%13ju}"
 		    "{:rmdir/%13ju}{:readdir/%13ju}"
 		    "{:rdirplus/%13ju}{:access/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RENAME],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LINK],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SYMLINK],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_MKDIR],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RMDIR],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READDIR],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READDIRPLUS],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_ACCESS]);
 
 		xo_emit("{T:Mknod/%13.13s}{T:Fsstat/%13.13s}"
 		    "{T:Fsinfo/%13.13s}{T:PathConf/%13.13s}"
 		    "{T:Commit/%13.13s}\n");
 		xo_emit("{:mknod/%13ju}{:fsstat/%13ju}"
 		    "{:fsinfo/%13ju}{:pathconf/%13ju}"
 		    "{:commit/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_MKNOD],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_FSSTAT],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_FSINFO],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_PATHCONF],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_COMMIT]);
 
 		xo_close_container("operations");
 
 		xo_open_container("server");
-		xo_emit("{T:Server Re-Failed:}\n");
-		xo_emit("{T:retfailed/%17ju}\n", (uintmax_t)ext_nfsstats.srvrpc_errs);
+		xo_emit("{T:Server Re-Failed}\n");
+		xo_emit("{:retfailed/%16ju}\n", (uintmax_t)ext_nfsstats.srvrpc_errs);
 
-		xo_emit("{T:Server Faults:}\n");
-		xo_emit("{T:faults/%13ju}\n", (uintmax_t)ext_nfsstats.srv_errs);
+		xo_emit("{T:Server Faults}\n");
+		xo_emit("{:faults/%13ju}\n", (uintmax_t)ext_nfsstats.srv_errs);
 
 		xo_emit("{T:Server Write Gathering:/%13.13s}\n");
 
 		xo_emit("{T:WriteOps/%13.13s}{T:WriteRPC/%13.13s}"
 		    "{T:Opsaved/%13.13s}\n");
 		xo_emit("{:writeops/%13ju}{:writerpc/%13ju}"
 		    "{:opsaved/%13ju}\n",
 		/*
 		 * The new client doesn't do write gathering. It was
 		 * only useful for NFSv2.
 		 */
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_WRITE],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_WRITE], 0);
 
 		xo_close_container("server");
 
 		xo_open_container("cache");
 		xo_emit("{T:Server Cache Stats:/%13.13s}\n");
 		xo_emit("{T:Inprog/%13.13s}{T:Idem/%13.13s}"
 		    "{T:Non-Idem/%13.13s}{T:Misses/%13.13s}\n");
 		xo_emit("{:inprog/%13ju}{:idem/%13ju}"
 		    "{:nonidem/%13ju}{:misses/%13ju}\n",
 			(uintmax_t)ext_nfsstats.srvcache_inproghits,
 			(uintmax_t)ext_nfsstats.srvcache_idemdonehits,
 			(uintmax_t)ext_nfsstats.srvcache_nonidemdonehits,
 			(uintmax_t)ext_nfsstats.srvcache_misses);
 		xo_close_container("cache");
 
 		xo_close_container("serverstats");
 	}
 }
 
 /*
  * Print the column header line for the periodic display and flush stdout.
  *
  * newStats selects the Read/Write/Total layout; otherwise the per-operation
  * layout is printed.  Both layouts grow extra columns when the file-level
  * widemode flag is set.
  */
 static void
 printhdr(int clientOnly, int serverOnly, int newStats)
 {
 	const char *rule, *rule_short, *commit_title, *ms_col, *commit_cols;
 	const char *lead;
 
 	if (!newStats) {
 		/*
 		 * With both sides displayed each data row starts with a
 		 * "Client:" or "Server:" label, so indent the header to match.
 		 */
 		lead = (serverOnly && clientOnly) ? "        " : " ";
 		printf("%s%6.6s %6.6s %6.6s %6.6s %6.6s %6.6s %6.6s %6.6s",
 		    lead,
 		    "GtAttr", "Lookup", "Rdlink", "Read", "Write", "Rename",
 		    "Access", "Rddir");
 		if (widemode && clientOnly)
 			printf(" Attr Lkup BioR BioW Accs BioD");
 	} else {
 		if (widemode) {
 			rule = "========";
 			rule_short = "=======";
 			commit_title = "[Commit ]  ";
 			ms_col = "    ms";
 			commit_cols = "tps    ms  ";
 		} else {
 			rule = "=====";
 			rule_short = "====";
 			commit_title = "";
 			ms_col = "";
 			commit_cols = "";
 		}
 		printf(" [%s Read %s]  [%s Write %s]  "
 		    "%s[=========== Total ============]\n"
 		    " KB/t   tps    MB/s%s  KB/t   tps    MB/s%s  "
 		    "%sKB/t   tps    MB/s    ms  ql  %%b",
 		    rule, rule, rule, rule_short,
 		    commit_title,
 		    ms_col, ms_col,
 		    commit_cols);
 	}
 	printf("\n");
 	fflush(stdout);
 }
 
 /*
  * Write the command synopsis to stderr and terminate with exit status 1.
  */
 static void
 usage(void)
 {
 	static const char synopsis[] =
 	    "usage: nfsstat [-cdemszW] [-M core] [-N system] [-w wait]\n";
 
 	(void)fputs(synopsis, stderr);
 	exit(1);
 }
 
 /*
  * Ring of small scratch buffers handed out by sperc1() and sperc2(), so that
  * several results can be passed to one printf() call.  SPIndex is the next
  * slot to hand out; a slot is overwritten again 64 calls later.
  */
 static char SPBuf[64][8];
 static int SPIndex;
 
 /*
  * Format hits as a percentage of (hits + misses) using "%3d%%", or "   -"
  * when the sum is zero.
  *
  * Returns a pointer to a slot in SPBuf; the caller must not free it.
  */
 static char * 
 sperc1(int hits, int misses)
 {
 	char *p = SPBuf[SPIndex];
 	/* Sum in 64 bits: hits + misses may not fit in an int. */
 	long long total = (long long)hits + misses;
 
 	if (total != 0) {
 		/*
 		 * The narrowing cast to char is inherited; it keeps the
 		 * printed number short.  snprintf() bounds the write to the
 		 * slot regardless.
 		 */
 		snprintf(p, sizeof(SPBuf[0]), "%3d%%",
 		    (int)(char)((long long)hits * 100 / total));
 	} else {
 		snprintf(p, sizeof(SPBuf[0]), "   -");
 	}
 	SPIndex = (SPIndex + 1) & 63;
 	return (p);
 }
 
 /*
  * Format (ttl - misses) as a percentage of ttl using "%3d%%", or "   -"
  * when ttl is zero.
  *
  * Returns a pointer to a slot in SPBuf; the caller must not free it.
  */
 static char * 
 sperc2(int ttl, int misses)
 {
 	char *p = SPBuf[SPIndex];
 
 	if (ttl != 0) {
 		/* Subtract in 64 bits: ttl - misses may not fit in an int. */
 		snprintf(p, sizeof(SPBuf[0]), "%3d%%",
 		    (int)(char)(((long long)ttl - misses) * 100 / ttl));
 	} else {
 		snprintf(p, sizeof(SPBuf[0]), "   -");
 	}
 	SPIndex = (SPIndex + 1) & 63;
 	return (p);
 }
 
 /*
  * Call devstat_compute_etime() on the named member of cur_stats and, when
  * prev_stats is non-NULL, the same member of prev_stats (NULL otherwise).
  * Expects variables named cur_stats and prev_stats at the point of use.
  */
 #define	DELTA_T(field)							\
 	devstat_compute_etime(&cur_stats->field,			\
 	    (prev_stats != NULL ? &prev_stats->field : NULL))
 
 /*
  * XXX KDM mostly copied from ctlstat.  We should commonize the code (and
  * the devstat code) somehow.
  *
  * Derive rate figures for server operation slot curop from cur_stats and,
  * when prev_stats is non-NULL, the difference against prev_stats.
  *
  *   etime                  value the totals are divided by for the rates;
  *                          when not positive, the rates are reported as 0
  *   mbsec                  out: bytes / (1024 * 1024) / etime
  *   kb_per_transfer        out: bytes / 1024 / operations
  *   transfers_per_second   out: operations / etime
  *   ms_per_transfer        out: DELTA_T(srvduration[curop]) / operations
  *                          * 1000
  *   queue_len              out: srvstartcnt - srvdonecnt of cur_stats
  *   busy_pct               out: DELTA_T(busytime) / etime * 100, never
  *                          negative
  */
 static void
 compute_new_stats(struct nfsstatsv1 *cur_stats,
 		  struct nfsstatsv1 *prev_stats, int curop,
 		  long double etime, long double *mbsec,
 		  long double *kb_per_transfer,
 		  long double *transfers_per_second,
 		  long double *ms_per_transfer, uint64_t *queue_len,
 		  long double *busy_pct)
 {
 	uint64_t total_bytes, total_operations;
 
 	total_bytes = cur_stats->srvbytes[curop];
 	total_operations = cur_stats->srvops[curop];
 	if (prev_stats != NULL) {
 		total_bytes -= prev_stats->srvbytes[curop];
 		total_operations -= prev_stats->srvops[curop];
 	}
 
 	*mbsec = total_bytes;
 	*mbsec /= 1024 * 1024;
 	if (etime > 0.0) {
 		*busy_pct = DELTA_T(busytime);
 		/*
 		 * Clamp before scaling; dividing by a positive etime and
 		 * multiplying by 100 cannot make the value negative again.
 		 */
 		if (*busy_pct < 0)
 			*busy_pct = 0;
 		*busy_pct /= etime;
 		*busy_pct *= 100;
 		*mbsec /= etime;
 	} else {
 		*busy_pct = 0;
 		*mbsec = 0;
 	}
 	*kb_per_transfer = total_bytes;
 	*kb_per_transfer /= 1024;
 	if (total_operations > 0)
 		*kb_per_transfer /= total_operations;
 	else
 		*kb_per_transfer = 0;
 	if (etime > 0.0) {
 		*transfers_per_second = total_operations;
 		*transfers_per_second /= etime;
 	} else {
 		*transfers_per_second = 0.0;
 	}
 
 	if (total_operations > 0) {
 		*ms_per_transfer = DELTA_T(srvduration[curop]);
 		*ms_per_transfer /= total_operations;
 		*ms_per_transfer *= 1000;
 	} else
 		*ms_per_transfer = 0.0;
 
 	/* Uses the current sample only, not a difference. */
 	*queue_len = cur_stats->srvstartcnt - cur_stats->srvdonecnt;
 }
 
 /*
  * Print a description of the nfs stats for the client/server,
  * including NFSv4.1.
  *
  * The counters are fetched into the file-level ext_nfsstats with nfssvc();
  * when zflag is set the request also asks for the selected side's counters
  * to be zeroed.  Output goes through libxo inside an "nfsv4" container.
  *
  *   clientOnly   non-zero: emit the "clientstats" container
  *   serverOnly   non-zero: emit the "serverstats" container
  *   nfs41        non-zero: also emit the "nfsv41" operation rows
  *
  * Exits via xo_err() if the statistics cannot be fetched.
  */
 static void
 exp_intpr(int clientOnly, int serverOnly, int nfs41)
 {
 	int nfssvc_flag;
 
 	xo_open_container("nfsv4");
 
 	nfssvc_flag = NFSSVC_GETSTATS | NFSSVC_NEWSTRUCT;
 	if (zflag != 0) {
 		if (clientOnly != 0)
 			nfssvc_flag |= NFSSVC_ZEROCLTSTATS;
 		if (serverOnly != 0)
 			nfssvc_flag |= NFSSVC_ZEROSRVSTATS;
 	}
 	ext_nfsstats.vers = NFSSTATS_V1;
 	if (nfssvc(nfssvc_flag, &ext_nfsstats) < 0)
 		xo_err(1, "Can't get stats");
 	if (clientOnly != 0) {
 		xo_open_container("clientstats");
 
 		xo_open_container("operations");
 		if (printtitle) {
 			xo_emit("{T:Client Info:}\n");
 			xo_emit("{T:RPC Counts:}\n");
 		}
 		xo_emit("{T:Getattr/%13.13s}{T:Setattr/%13.13s}"
 		    "{T:Lookup/%13.13s}{T:Readlink/%13.13s}"
 		    "{T:Read/%13.13s}{T:Write/%13.13s}\n");
 		xo_emit("{:getattr/%13ju}{:setattr/%13ju}{:lookup/%13ju}"
 		    "{:readlink/%13ju}{:read/%13ju}{:write/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_GETATTR],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SETATTR],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LOOKUP],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READLINK],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READ],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_WRITE]);
 		xo_emit("{T:Create/%13.13s}{T:Remove/%13.13s}"
 		    "{T:Rename/%13.13s}{T:Link/%13.13s}"
 		    "{T:Symlink/%13.13s}{T:Mkdir/%13.13s}\n");
 		xo_emit("{:create/%13ju}{:remove/%13ju}{:rename/%13ju}"
 		    "{:link/%13ju}{:symlink/%13ju}{:mkdir/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_CREATE],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_REMOVE],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_RENAME],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LINK],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SYMLINK],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_MKDIR]);
 		xo_emit("{T:Rmdir/%13.13s}{T:Readdir/%13.13s}"
 		    "{T:RdirPlus/%13.13s}{T:Access/%13.13s}"
 		    "{T:Mknod/%13.13s}{T:Fsstat/%13.13s}\n");
 		xo_emit("{:rmdir/%13ju}{:readdir/%13ju}{:rdirplus/%13ju}"
 		    "{:access/%13ju}{:mknod/%13ju}{:fsstat/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_RMDIR],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READDIR],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READDIRPLUS],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_ACCESS],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_MKNOD],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_FSSTAT]);
 		xo_emit("{T:FSinfo/%13.13s}{T:pathConf/%13.13s}"
 		    "{T:Commit/%13.13s}{T:SetClId/%13.13s}"
 		    "{T:SetClIdCf/%13.13s}{T:Lock/%13.13s}\n");
 		xo_emit("{:fsinfo/%13ju}{:pathconf/%13ju}{:commit/%13ju}"
 		    "{:setclientid/%13ju}{:setclientidcf/%13ju}{:lock/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_FSINFO],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_PATHCONF],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_COMMIT],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SETCLIENTID],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SETCLIENTIDCFRM],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LOCK]);
 		xo_emit("{T:LockT/%13.13s}{T:LockU/%13.13s}"
 		    "{T:Open/%13.13s}{T:OpenCfr/%13.13s}\n");
 		xo_emit("{:lockt/%13ju}{:locku/%13ju}"
 		    "{:open/%13ju}{:opencfr/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LOCKT],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LOCKU],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_OPEN],
 		    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_OPENCONFIRM]);
 
 		if (nfs41) {
 			xo_open_container("nfsv41");
 
 			xo_emit("{T:OpenDownGr/%13.13s}{T:Close/%13.13s}\n");
 			xo_emit("{:opendowngr/%13ju}{:close/%13ju}\n",
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_OPENDOWNGRADE],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_CLOSE]);
 
 			/*
 			 * The third field is the PutRootFH counter; it must
 			 * not reuse the "getacl" key of the fifth field.
 			 */
 			xo_emit("{T:RelLckOwn/%13.13s}{T:FreeStateID/%13.13s}"
 			    "{T:PutRootFH/%13.13s}{T:DelegRet/%13.13s}"
 			    "{T:GetAcl/%13.13s}{T:SetAcl/%13.13s}\n");
 			xo_emit("{:rellckown/%13ju}{:freestateid/%13ju}"
 			    "{:putrootfh/%13ju}{:delegret/%13ju}"
 			    "{:getacl/%13ju}{:setacl/%13ju}\n",
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_RELEASELCKOWN],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_FREESTATEID],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_PUTROOTFH],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_DELEGRETURN],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_GETACL],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_SETACL]);
 
 			xo_emit("{T:ExchangeId/%13.13s}{T:CreateSess/%13.13s}"
 			    "{T:DestroySess/%13.13s}{T:DestroyClId/%13.13s}"
 			    "{T:LayoutGet/%13.13s}{T:GetDevInfo/%13.13s}\n");
 			xo_emit("{:exchangeid/%13ju}{:createsess/%13ju}"
 			    "{:destroysess/%13ju}{:destroyclid/%13ju}"
 			    "{:layoutget/%13ju}{:getdevinfo/%13ju}\n",
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_EXCHANGEID],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_CREATESESSION],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_DESTROYSESSION],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_DESTROYCLIENT],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LAYOUTGET],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_GETDEVICEINFO]);
 
 			/*
 			 * NOTE(review): the first key is spelled "layoutcomit"
 			 * while the server section uses "layoutcommit".  Left
 			 * unchanged here because it is part of the emitted
 			 * output; confirm before renaming.
 			 */
 			xo_emit("{T:LayoutCommit/%13.13s}{T:LayoutReturn/%13.13s}"
 			    "{T:ReclaimCompl/%13.13s}{T:ReadDataS/%13.13s}"
 			    "{T:WriteDataS/%13.13s}{T:CommitDataS/%13.13s}\n");
 			xo_emit("{:layoutcomit/%13ju}{:layoutreturn/%13ju}"
 			    "{:reclaimcompl/%13ju}{:readdatas/%13ju}"
 			    "{:writedatas/%13ju}{:commitdatas/%13ju}\n",
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LAYOUTCOMMIT],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_LAYOUTRETURN],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_RECLAIMCOMPL],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_READDS],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_WRITEDS],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_COMMITDS]);
 
 			xo_emit("{T:OpenLayout/%13.13s}{T:CreateLayout/%13.13s}\n");
 			xo_emit("{:openlayout/%13ju}{:createlayout/%13ju}\n",
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_OPENLAYGET],
 			    (uintmax_t)ext_nfsstats.rpccnt[NFSPROC_CREATELAYGET]);
 
 			xo_close_container("nfsv41");
 		}
 		xo_close_container("operations");
 
 		xo_open_container("client");
 		xo_emit("{T:OpenOwner/%13.13s}{T:Opens/%13.13s}"
 		    "{T:LockOwner/%13.13s}{T:Locks/%13.13s}"
 		    "{T:Delegs/%13.13s}{T:LocalOwn/%13.13s}\n");
 		xo_emit("{:openowner/%13ju}{:opens/%13ju}"
 		    "{:lockowner/%13ju}{:locks/%13ju}"
 		    "{:delegs/%13ju}{:localown/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.clopenowners,
 		    (uintmax_t)ext_nfsstats.clopens,
 		    (uintmax_t)ext_nfsstats.cllockowners,
 		    (uintmax_t)ext_nfsstats.cllocks,
 		    (uintmax_t)ext_nfsstats.cldelegates,
 		    (uintmax_t)ext_nfsstats.cllocalopenowners);
 
 		xo_emit("{T:LocalOpen/%13.13s}{T:LocalLown/%13.13s}"
 		    "{T:LocalLock/%13.13s}\n");
 		xo_emit("{:localopen/%13ju}{:locallown/%13ju}"
 		    "{:locallock/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.cllocalopens,
 		    (uintmax_t)ext_nfsstats.cllocallockowners,
 		    (uintmax_t)ext_nfsstats.cllocallocks);
 		xo_close_container("client");
 
 		xo_open_container("rpc");
 		if (printtitle)
 			xo_emit("{T:Rpc Info:}\n");
 		xo_emit("{T:TimedOut/%13.13s}{T:Invalid/%13.13s}"
 		    "{T:X Replies/%13.13s}{T:Retries/%13.13s}"
 		    "{T:Requests/%13.13s}\n");
 		xo_emit("{:timedout/%13ju}{:invalid/%13ju}"
 		    "{:xreplies/%13ju}{:retries/%13ju}"
 		    "{:requests/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.rpctimeouts,
 		    (uintmax_t)ext_nfsstats.rpcinvalid,
 		    (uintmax_t)ext_nfsstats.rpcunexpected,
 		    (uintmax_t)ext_nfsstats.rpcretries,
 		    (uintmax_t)ext_nfsstats.rpcrequests);
 		xo_close_container("rpc");
 
 		xo_open_container("cache");
 		if (printtitle)
 			xo_emit("{T:Cache Info:}\n");
 		xo_emit("{T:Attr Hits/%13.13s}{T:Attr Misses/%13.13s}"
 		    "{T:Lkup Hits/%13.13s}{T:Lkup Misses/%13.13s}\n");
 		xo_emit("{:attrhits/%13ju}{:attrmisses/%13ju}"
 		    "{:lkuphits/%13ju}{:lkupmisses/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.attrcache_hits,
 		    (uintmax_t)ext_nfsstats.attrcache_misses,
 		    (uintmax_t)ext_nfsstats.lookupcache_hits,
 		    (uintmax_t)ext_nfsstats.lookupcache_misses);
 
 		/* The *_bios counters are reported as the misses. */
 		xo_emit("{T:BioR Hits/%13.13s}{T:BioR Misses/%13.13s}"
 		    "{T:BioW Hits/%13.13s}{T:BioW Misses/%13.13s}\n");
 		xo_emit("{:biorhits/%13ju}{:biormisses/%13ju}"
 		    "{:biowhits/%13ju}{:biowmisses/%13ju}\n",
 		    (uintmax_t)(ext_nfsstats.biocache_reads -
 		    ext_nfsstats.read_bios),
 		    (uintmax_t)ext_nfsstats.read_bios,
 		    (uintmax_t)(ext_nfsstats.biocache_writes -
 		    ext_nfsstats.write_bios),
 		    (uintmax_t)ext_nfsstats.write_bios);
 
 		xo_emit("{T:BioRL Hits/%13.13s}{T:BioRL Misses/%13.13s}"
 		    "{T:BioD Hits/%13.13s}{T:BioD Misses/%13.13s}\n");
 		xo_emit("{:biorlhits/%13ju}{:biorlmisses/%13ju}"
 		    "{:biodhits/%13ju}{:biodmisses/%13ju}\n",
 		    (uintmax_t)(ext_nfsstats.biocache_readlinks -
 		    ext_nfsstats.readlink_bios),
 		    (uintmax_t)ext_nfsstats.readlink_bios,
 		    (uintmax_t)(ext_nfsstats.biocache_readdirs -
 		    ext_nfsstats.readdir_bios),
 		    (uintmax_t)ext_nfsstats.readdir_bios);
 
 		xo_emit("{T:DirE Hits/%13.13s}{T:DirE Misses/%13.13s}\n");
 		xo_emit("{:direhits/%13ju}{:diremisses/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.direofcache_hits,
 		    (uintmax_t)ext_nfsstats.direofcache_misses);
 		/* Close (not re-open) the container opened above. */
 		xo_close_container("cache");
 
 		xo_close_container("clientstats");
 	}
 	if (serverOnly != 0) {
 		xo_open_container("serverstats");
 
 		xo_open_container("operations");
 		if (printtitle)
 			xo_emit("{T:Server Info:}\n");
 		xo_emit("{T:Getattr/%13.13s}{T:Setattr/%13.13s}"
 		    "{T:Lookup/%13.13s}{T:Readlink/%13.13s}"
 		    "{T:Read/%13.13s}{T:Write/%13.13s}\n");
 		xo_emit("{:getattr/%13ju}{:setattr/%13ju}{:lookup/%13ju}"
 		    "{:readlink/%13ju}{:read/%13ju}{:write/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_GETATTR],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SETATTR],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LOOKUP],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READLINK],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READ],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_WRITE]);
 		xo_emit("{T:Create/%13.13s}{T:Remove/%13.13s}"
 		    "{T:Rename/%13.13s}{T:Link/%13.13s}"
 		    "{T:Symlink/%13.13s}{T:Mkdir/%13.13s}\n");
 		xo_emit("{:create/%13ju}{:remove/%13ju}{:rename/%13ju}"
 		    "{:link/%13ju}{:symlink/%13ju}{:mkdir/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_V3CREATE],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_REMOVE],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RENAME],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LINK],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SYMLINK],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_MKDIR]);
 		xo_emit("{T:Rmdir/%13.13s}{T:Readdir/%13.13s}"
 		    "{T:RdirPlus/%13.13s}{T:Access/%13.13s}"
 		    "{T:Mknod/%13.13s}{T:Fsstat/%13.13s}\n");
 		xo_emit("{:rmdir/%13ju}{:readdir/%13ju}{:rdirplus/%13ju}"
 		    "{:access/%13ju}{:mknod/%13ju}{:fsstat/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RMDIR],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READDIR],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_READDIRPLUS],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_ACCESS],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_MKNOD],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_FSSTAT]);
 		xo_emit("{T:FSinfo/%13.13s}{T:pathConf/%13.13s}"
 		    "{T:Commit/%13.13s}{T:LookupP/%13.13s}"
 		    "{T:SetClId/%13.13s}{T:SetClIdCf/%13.13s}\n");
 		xo_emit("{:fsinfo/%13ju}{:pathconf/%13ju}{:commit/%13ju}"
 		    "{:lookupp/%13ju}{:setclientid/%13ju}{:setclientidcfrm/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_FSINFO],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_PATHCONF],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_COMMIT],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LOOKUPP],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SETCLIENTID],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SETCLIENTIDCFRM]);
 		xo_emit("{T:Open/%13.13s}{T:OpenAttr/%13.13s}"
 		    "{T:OpenDwnGr/%13.13s}{T:OpenCfrm/%13.13s}"
 		    "{T:DelePurge/%13.13s}{T:DelRet/%13.13s}\n");
 		xo_emit("{:open/%13ju}{:openattr/%13ju}{:opendwgr/%13ju}"
 		    "{:opencfrm/%13ju}{:delepurge/%13ju}{:delreg/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_OPEN],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_OPENATTR],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_OPENDOWNGRADE],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_OPENCONFIRM],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_DELEGPURGE],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_DELEGRETURN]);
 		xo_emit("{T:GetFH/%13.13s}{T:Lock/%13.13s}"
 		    "{T:LockT/%13.13s}{T:LockU/%13.13s}"
 		    "{T:Close/%13.13s}{T:Verify/%13.13s}\n");
 		xo_emit("{:getfh/%13ju}{:lock/%13ju}{:lockt/%13ju}"
 		    "{:locku/%13ju}{:close/%13ju}{:verify/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_GETFH],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LOCK],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LOCKT],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LOCKU],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_CLOSE],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_VERIFY]);
 		xo_emit("{T:NVerify/%13.13s}{T:PutFH/%13.13s}"
 		    "{T:PutPubFH/%13.13s}{T:PutRootFH/%13.13s}"
 		    "{T:Renew/%13.13s}{T:RestoreFH/%13.13s}\n");
 		xo_emit("{:nverify/%13ju}{:putfh/%13ju}{:putpubfh/%13ju}"
 		    "{:putrootfh/%13ju}{:renew/%13ju}{:restore/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_NVERIFY],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_PUTFH],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_PUTPUBFH],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_PUTROOTFH],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RENEW],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RESTOREFH]);
 		xo_emit("{T:SaveFH/%13.13s}{T:Secinfo/%13.13s}"
 		    "{T:RelLockOwn/%13.13s}{T:V4Create/%13.13s}\n");
 		xo_emit("{:savefh/%13ju}{:secinfo/%13ju}{:rellockown/%13ju}"
 		    "{:v4create/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SAVEFH],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SECINFO],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RELEASELCKOWN],
 		    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_CREATE]);
 		if (nfs41) {
 			xo_open_container("nfsv41");
 			xo_emit("{T:BackChannelCtrl/%13.13s}{T:BindConnToSess/%13.13s}"
 			    "{T:ExchangeID/%13.13s}{T:CreateSess/%13.13s}"
 			    "{T:DestroySess/%13.13s}{T:FreeStateID/%13.13s}\n");
 			xo_emit("{:backchannelctrl/%13ju}{:bindconntosess/%13ju}"
 			    "{:exchangeid/%13ju}{:createsess/%13ju}"
 			    "{:destroysess/%13ju}{:freestateid/%13ju}\n",
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_BACKCHANNELCTL],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_BINDCONNTOSESS],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_EXCHANGEID],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_CREATESESSION],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_DESTROYSESSION],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_FREESTATEID]);
 
 			xo_emit("{T:GetDirDeleg/%13.13s}{T:GetDevInfo/%13.13s}"
 			    "{T:GetDevList/%13.13s}{T:layoutCommit/%13.13s}"
 			    "{T:LayoutGet/%13.13s}{T:LayoutReturn/%13.13s}\n");
 			xo_emit("{:getdirdeleg/%13ju}{:getdevinfo/%13ju}"
 			    "{:getdevlist/%13ju}{:layoutcommit/%13ju}"
 			    "{:layoutget/%13ju}{:layoutreturn/%13ju}\n",
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_GETDIRDELEG],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_GETDEVINFO],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_GETDEVLIST],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LAYOUTCOMMIT],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LAYOUTGET],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_LAYOUTRETURN]);
 
 			xo_emit("{T:SecInfNoName/%13.13s}{T:Sequence/%13.13s}"
 			    "{T:SetSSV/%13.13s}{T:TestStateID/%13.13s}"
 			    "{T:WantDeleg/%13.13s}{T:DestroyClId/%13.13s}\n");
 			xo_emit("{:secinfnoname/%13ju}{:sequence/%13ju}"
 			    "{:setssv/%13ju}{:teststateid/%13ju}{:wantdeleg/%13ju}"
 			    "{:destroyclid/%13ju}\n",
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SECINFONONAME],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SEQUENCE],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_SETSSV],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_TESTSTATEID],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_WANTDELEG],
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_DESTROYCLIENTID]);
 
 			xo_emit("{T:ReclaimCompl/%13.13s}\n");
 			xo_emit("{:reclaimcompl/%13ju}\n",
 			    (uintmax_t)ext_nfsstats.srvrpccnt[NFSV4OP_RECLAIMCOMPL]);
 
 			xo_close_container("nfsv41");
 		}
 
 		xo_close_container("operations");
 
 		if (printtitle)
 			xo_emit("{T:Server:}\n");
 		xo_open_container("server");
 		/*
 		 * "retfailed" reports srvrpc_errs and "faults" reports
 		 * srv_errs, the same pairing the non-extended display in this
 		 * file uses for these two keys.
 		 */
 		xo_emit("{T:Retfailed/%13.13s}{T:Faults/%13.13s}"
 		    "{T:Clients/%13.13s}\n");
 		xo_emit("{:retfailed/%13ju}{:faults/%13ju}{:clients/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvrpc_errs,
 		    (uintmax_t)ext_nfsstats.srv_errs,
 		    (uintmax_t)ext_nfsstats.srvclients);
 		xo_emit("{T:OpenOwner/%13.13s}{T:Opens/%13.13s}"
 		    "{T:LockOwner/%13.13s}{T:Locks/%13.13s}"
 		    "{T:Delegs/%13.13s}\n");
 		xo_emit("{:openowner/%13ju}{:opens/%13ju}{:lockowner/%13ju}"
 		    "{:locks/%13ju}{:delegs/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvopenowners,
 		    (uintmax_t)ext_nfsstats.srvopens,
 		    (uintmax_t)ext_nfsstats.srvlockowners,
 		    (uintmax_t)ext_nfsstats.srvlocks,
 		    (uintmax_t)ext_nfsstats.srvdelegates);
 		xo_close_container("server");
 
 		if (printtitle)
 			xo_emit("{T:Server Cache Stats:}\n");
 		xo_open_container("cache");
 		xo_emit("{T:Inprog/%13.13s}{T:Idem/%13.13s}"
 		    "{T:Non-idem/%13.13s}{T:Misses/%13.13s}"
 		    "{T:CacheSize/%13.13s}{T:TCPPeak/%13.13s}\n");
 		xo_emit("{:inprog/%13ju}{:idem/%13ju}{:nonidem/%13ju}"
 		    "{:misses/%13ju}{:cachesize/%13ju}{:tcppeak/%13ju}\n",
 		    (uintmax_t)ext_nfsstats.srvcache_inproghits,
 		    (uintmax_t)ext_nfsstats.srvcache_idemdonehits,
 		    (uintmax_t)ext_nfsstats.srvcache_nonidemdonehits,
 		    (uintmax_t)ext_nfsstats.srvcache_misses,
 		    (uintmax_t)ext_nfsstats.srvcache_size,
 		    (uintmax_t)ext_nfsstats.srvcache_tcppeak);
 		xo_close_container("cache");
 
 		xo_close_container("serverstats");
 	}
 
 	xo_close_container("nfsv4");
 }
 
 /*
  * Fill *total_stats with a summary of *cur_stats.
  *
  * The structure is zeroed first.  The per-operation srvbytes, srvops and
  * srvduration entries of cur_stats are then accumulated into slot 0 of
  * total_stats, srvrpccnt is copied entry by entry, and srvstartcnt,
  * srvdonecnt and busytime are copied as they are.
  */
 static void
 compute_totals(struct nfsstatsv1 *total_stats, struct nfsstatsv1 *cur_stats)
 {
 	int op;
 
 	bzero(total_stats, sizeof(*total_stats));
 
 	/* Fields that are carried over unchanged. */
 	total_stats->busytime = cur_stats->busytime;
 	total_stats->srvdonecnt = cur_stats->srvdonecnt;
 	total_stats->srvstartcnt = cur_stats->srvstartcnt;
 
 	for (op = 0; op < (NFSV42_NOPS + NFSV4OP_FAKENOPS); op++) {
 		total_stats->srvrpccnt[op] = cur_stats->srvrpccnt[op];
 
 		/* Everything else is summed into slot 0. */
 		total_stats->srvops[0] += cur_stats->srvops[op];
 		total_stats->srvbytes[0] += cur_stats->srvbytes[op];
 		bintime_add(&total_stats->srvduration[0],
 		    &cur_stats->srvduration[op]);
 	}
 }
 
 /*
  * Print a running summary of nfs statistics for the experimental client and/or
  * server.
  * Repeat display every interval seconds, showing statistics
  * collected over that interval.  Assumes that interval is non-zero.
  * A baseline sample is taken before the loop and the first line is printed
  * only after one interval has elapsed, so every line is computed against
  * the previous sample.
  *
  *   interval     seconds to sleep between samples
  *   clientOnly   non-zero: print the client row (classic layout only)
  *   serverOnly   non-zero: print the server row
  *   newStats     non-zero: print the extended server layout instead of
  *                the classic per-operation counts
  *
  * Never returns; exits via err() if the statistics cannot be fetched.
  */
 static void
 exp_sidewaysintpr(u_int interval, int clientOnly, int serverOnly,
     int newStats)
 {
 	struct nfsstatsv1 nfsstats, lastst, *ext_nfsstatsp;
 	struct nfsstatsv1 curtotal, lasttotal;
 	struct timespec ts, lastts;
 	int hdrcnt = 1;
 
 	ext_nfsstatsp = &lastst;
 	ext_nfsstatsp->vers = NFSSTATS_V1;
 	if (nfssvc(NFSSVC_GETSTATS | NFSSVC_NEWSTRUCT, ext_nfsstatsp) < 0)
 		err(1, "Can't get stats");
 	clock_gettime(CLOCK_MONOTONIC, &lastts);
 	compute_totals(&lasttotal, ext_nfsstatsp);
 	sleep(interval);
 
 	for (;;) {
 		ext_nfsstatsp = &nfsstats;
 		ext_nfsstatsp->vers = NFSSTATS_V1;
 		if (nfssvc(NFSSVC_GETSTATS | NFSSVC_NEWSTRUCT, ext_nfsstatsp)
 		    < 0)
 			err(1, "Can't get stats");
 		clock_gettime(CLOCK_MONOTONIC, &ts);
 
 		/*
 		 * Repeat the header every 20 iterations, or every 10 when
 		 * the classic layout prints both a client and a server row.
 		 */
 		if (--hdrcnt == 0) {
 			printhdr(clientOnly, serverOnly, newStats);
 			if (newStats == 0 && clientOnly && serverOnly)
 				hdrcnt = 10;
 			else
 				hdrcnt = 20;
 		}
 		if (clientOnly && newStats == 0) {
 			printf("%s %6ju %6ju %6ju %6ju %6ju %6ju %6ju %6ju",
 			    ((clientOnly && serverOnly) ? "Client:" : ""),
 			    (uintmax_t)DELTA(rpccnt[NFSPROC_GETATTR]),
 			    (uintmax_t)DELTA(rpccnt[NFSPROC_LOOKUP]),
 			    (uintmax_t)DELTA(rpccnt[NFSPROC_READLINK]),
 			    (uintmax_t)DELTA(rpccnt[NFSPROC_READ]),
 			    (uintmax_t)DELTA(rpccnt[NFSPROC_WRITE]),
 			    (uintmax_t)DELTA(rpccnt[NFSPROC_RENAME]),
 			    (uintmax_t)DELTA(rpccnt[NFSPROC_ACCESS]),
 			    (uintmax_t)(DELTA(rpccnt[NFSPROC_READDIR]) +
 			    DELTA(rpccnt[NFSPROC_READDIRPLUS])));
 			if (widemode) {
 				printf(" %s %s %s %s %s %s",
 				    sperc1(DELTA(attrcache_hits),
 					DELTA(attrcache_misses)),
 				    sperc1(DELTA(lookupcache_hits),
 					DELTA(lookupcache_misses)),
 				    sperc2(DELTA(biocache_reads),
 					DELTA(read_bios)),
 				    sperc2(DELTA(biocache_writes),
 					DELTA(write_bios)),
 				    sperc1(DELTA(accesscache_hits),
 					DELTA(accesscache_misses)),
 				    sperc2(DELTA(biocache_readdirs),
 					DELTA(readdir_bios)));
 			}
 			printf("\n");
 		}
 
 		if (serverOnly && newStats) {
 			long double cur_secs, last_secs, etime;
 			long double mbsec;
 			long double kb_per_transfer;
 			long double transfers_per_second;
 			long double ms_per_transfer;
 			uint64_t queue_len;
 			long double busy_pct;
 			int i;
 
 			cur_secs = ts.tv_sec +
 			    ((long double)ts.tv_nsec / 1000000000);
 			last_secs = lastts.tv_sec +
 			    ((long double)lastts.tv_nsec / 1000000000);
 			etime = cur_secs - last_secs;
 
 			compute_totals(&curtotal, &nfsstats);
 
 			for (i = 0; i < NUM_STAT_TYPES; i++) {
 				compute_new_stats(&nfsstats, &lastst,
 				    STAT_TYPE_TO_NFS(i), etime, &mbsec,
 				    &kb_per_transfer,
 				    &transfers_per_second,
 				    &ms_per_transfer, &queue_len,
 				    &busy_pct);
 
 				if (i == STAT_TYPE_COMMIT) {
 					/* Commit columns exist in wide mode only. */
 					if (widemode == 0)
 						continue;
 
 					printf("%2.0Lf %7.2Lf ",
 					    transfers_per_second,
 					    ms_per_transfer);
 				} else {
 					printf("%5.2Lf %5.0Lf %7.2Lf ",
 					    kb_per_transfer,
 					    transfers_per_second, mbsec);
 					if (widemode)
 						printf("%5.2Lf ",
 						    ms_per_transfer);
 				}
 			}
 
 			compute_new_stats(&curtotal, &lasttotal, 0, etime,
 			    &mbsec, &kb_per_transfer, &transfers_per_second,
 			    &ms_per_transfer, &queue_len, &busy_pct);
 
 			/* %ju takes a uintmax_t, so convert queue_len. */
 			printf("%5.2Lf %5.0Lf %7.2Lf %5.2Lf %3ju %3.0Lf\n",
 			    kb_per_transfer, transfers_per_second, mbsec,
 			    ms_per_transfer, (uintmax_t)queue_len, busy_pct);
 
 			/*
 			 * curtotal is only filled in on this path, so this is
 			 * the only place where it may be saved.
 			 */
 			bcopy(&curtotal, &lasttotal, sizeof(lasttotal));
 		} else if (serverOnly) {
 			printf("%s %6ju %6ju %6ju %6ju %6ju %6ju %6ju %6ju",
 			    ((clientOnly && serverOnly) ? "Server:" : ""),
 			    (uintmax_t)DELTA(srvrpccnt[NFSV4OP_GETATTR]),
 			    (uintmax_t)DELTA(srvrpccnt[NFSV4OP_LOOKUP]),
 			    (uintmax_t)DELTA(srvrpccnt[NFSV4OP_READLINK]),
 			    (uintmax_t)DELTA(srvrpccnt[NFSV4OP_READ]),
 			    (uintmax_t)DELTA(srvrpccnt[NFSV4OP_WRITE]),
 			    (uintmax_t)DELTA(srvrpccnt[NFSV4OP_RENAME]),
 			    (uintmax_t)DELTA(srvrpccnt[NFSV4OP_ACCESS]),
 			    (uintmax_t)(DELTA(srvrpccnt[NFSV4OP_READDIR]) +
 			    DELTA(srvrpccnt[NFSV4OP_READDIRPLUS])));
 			printf("\n");
 		}
 		bcopy(&nfsstats, &lastst, sizeof(lastst));
 		lastts = ts;
 		fflush(stdout);
 		sleep(interval);
 	}
 	/*NOTREACHED*/
 }
Index: projects/fuse2/usr.bin/printf/printf.1
===================================================================
--- projects/fuse2/usr.bin/printf/printf.1	(revision 350434)
+++ projects/fuse2/usr.bin/printf/printf.1	(revision 350435)
@@ -1,382 +1,385 @@
 .\" Copyright (c) 1989, 1990, 1993
 .\"	The Regents of the University of California.  All rights reserved.
 .\"
 .\" This code is derived from software contributed to Berkeley by
 .\" the Institute of Electrical and Electronics Engineers, Inc.
 .\"
 .\" Redistribution and use in source and binary forms, with or without
 .\" modification, are permitted provided that the following conditions
 .\" are met:
 .\" 1. Redistributions of source code must retain the above copyright
 .\"    notice, this list of conditions and the following disclaimer.
 .\" 2. Redistributions in binary form must reproduce the above copyright
 .\"    notice, this list of conditions and the following disclaimer in the
 .\"    documentation and/or other materials provided with the distribution.
 .\" 3. Neither the name of the University nor the names of its contributors
 .\"    may be used to endorse or promote products derived from this software
 .\"    without specific prior written permission.
 .\"
 .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 .\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
 .\"	@(#)printf.1	8.1 (Berkeley) 6/6/93
 .\" $FreeBSD$
 .\"
-.Dd April 21, 2014
+.Dd July 29, 2019
 .Dt PRINTF 1
 .Os
 .Sh NAME
 .Nm printf
 .Nd formatted output
 .Sh SYNOPSIS
 .Nm
 .Ar format Op Ar arguments ...
 .Sh DESCRIPTION
 The
 .Nm
 utility formats and prints its arguments, after the first, under control
 of the
 .Ar format .
 The
 .Ar format
 is a character string which contains three types of objects: plain characters,
 which are simply copied to standard output, character escape sequences which
 are converted and copied to the standard output, and format specifications,
 each of which causes printing of the next successive
 .Ar argument .
 .Pp
 The
 .Ar arguments
 after the first are treated as strings if the corresponding format is
 either
 .Cm c , b
 or
 .Cm s ;
 otherwise they are evaluated as C constants, with the following extensions:
 .Pp
 .Bl -bullet -offset indent -compact
 .It
 A leading plus or minus sign is allowed.
 .It
 If the leading character is a single or double quote, the value is the
 character code of the next character.
 .El
 .Pp
 The format string is reused as often as necessary to satisfy the
 .Ar arguments .
 Any extra format specifications are evaluated with zero or the null
 string.
 .Pp
 Character escape sequences are in backslash notation as defined in the
 .St -ansiC ,
 with extensions.
 The characters and their meanings
 are as follows:
 .Pp
 .Bl -tag -width Ds -offset indent -compact
 .It Cm \ea
 Write a <bell> character.
 .It Cm \eb
 Write a <backspace> character.
-.It Cm \ec
-Ignore remaining characters in this string.
 .It Cm \ef
 Write a <form-feed> character.
 .It Cm \en
 Write a <new-line> character.
 .It Cm \er
 Write a <carriage return> character.
 .It Cm \et
 Write a <tab> character.
 .It Cm \ev
 Write a <vertical tab> character.
 .It Cm \e\'
 Write a <single quote> character.
 .It Cm \e\e
 Write a backslash character.
 .It Cm \e Ns Ar num
 Write a byte whose
 value is the 1-, 2-, or 3-digit
 octal number
 .Ar num .
 Multibyte characters can be constructed using multiple
 .Cm \e Ns Ar num
 sequences.
 .El
 .Pp
 Each format specification is introduced by the percent character
 (``%'').
 The remainder of the format specification includes,
 in the following order:
 .Bl -tag -width Ds
 .It "Zero or more of the following flags:"
 .Bl -tag -width Ds
 .It Cm #
 A `#' character
 specifying that the value should be printed in an ``alternate form''.
 For
 .Cm b , c , d , s
 and
 .Cm u
 formats, this option has no effect.
 For the
 .Cm o
 format, the precision of the number is increased to force the first
 character of the output string to a zero.
 For the
 .Cm x
 .Pq Cm X
 format, a non-zero result has the string
 .Li 0x
 .Pq Li 0X
 prepended to it.
 For
 .Cm a , A , e , E , f , F , g
 and
 .Cm G
 formats, the result will always contain a decimal point, even if no
 digits follow the point (normally, a decimal point only appears in the
 results of those formats if a digit follows the decimal point).
 For
 .Cm g
 and
 .Cm G
 formats, trailing zeros are not removed from the result as they
 would otherwise be;
 .It Cm \&\-
 A minus sign `\-' which specifies
 .Em left adjustment
 of the output in the indicated field;
 .It Cm \&+
 A `+' character specifying that there should always be
 a sign placed before the number when using signed formats.
 .It Sq \&\ \&
 A space specifying that a blank should be left before a positive number
 for a signed format.
 A `+' overrides a space if both are used;
 .It Cm \&0
 A zero `0' character indicating that zero-padding should be used
 rather than blank-padding.
 A `\-' overrides a `0' if both are used;
 .El
 .It "Field Width:"
 An optional digit string specifying a
 .Em field width ;
 if the output string has fewer bytes than the field width it will
 be blank-padded on the left (or right, if the left-adjustment indicator
 has been given) to make up the field width (note that a leading zero
 is a flag, but an embedded zero is part of a field width);
 .It Precision:
 An optional period,
 .Sq Cm \&.\& ,
 followed by an optional digit string giving a
 .Em precision
 which specifies the number of digits to appear after the decimal point,
 for
 .Cm e
 and
 .Cm f
 formats, or the maximum number of bytes to be printed
 from a string; if the digit string is missing, the precision is treated
 as zero;
 .It Format:
 A character which indicates the type of format to use (one of
 .Cm diouxXfFeEgGaAcsb ) .
 The uppercase formats differ from their lowercase counterparts only in
 that the output of the former is entirely in uppercase.
 The floating-point format specifiers
 .Pq Cm fFeEgGaA
 may be prefixed by an
 .Cm L
 to request that additional precision be used, if available.
 .El
 .Pp
 A field width or precision may be
 .Sq Cm \&*
 instead of a digit string.
 In this case an
 .Ar argument
 supplies the field width or precision.
 .Pp
 The format characters and their meanings are:
 .Bl -tag -width Fl
 .It Cm diouXx
 The
 .Ar argument
 is printed as a signed decimal (d or i), unsigned octal, unsigned decimal,
 or unsigned hexadecimal (X or x), respectively.
 .It Cm fF
 The
 .Ar argument
 is printed in the style `[\-]ddd.ddd' where the number of d's
 after the decimal point is equal to the precision specification for
 the argument.
 If the precision is missing, 6 digits are given; if the precision
 is explicitly 0, no digits and no decimal point are printed.
 The values \*[If] and \*[Na] are printed as
 .Ql inf
 and
 .Ql nan ,
 respectively.
 .It Cm eE
 The
 .Ar argument
 is printed in the style
 .Cm e
 .Sm off
 .Sq Op - Ar d.ddd No \(+- Ar dd
 .Sm on
 where there
 is one digit before the decimal point and the number after is equal to
 the precision specification for the argument; when the precision is
 missing, 6 digits are produced.
 The values \*[If] and \*[Na] are printed as
 .Ql inf
 and
 .Ql nan ,
 respectively.
 .It Cm gG
 The
 .Ar argument
 is printed in style
 .Cm f
 .Pq Cm F
 or in style
 .Cm e
 .Pq Cm E
 whichever gives full precision in minimum space.
 .It Cm aA
 The
 .Ar argument
 is printed in style
 .Sm off
 .Sq Op - Ar h.hhh No \(+- Li p Ar d
 .Sm on
 where there is one digit before the hexadecimal point and the number
 after is equal to the precision specification for the argument;
 when the precision is missing, enough digits are produced to convey
 the argument's exact double-precision floating-point representation.
 The values \*[If] and \*[Na] are printed as
 .Ql inf
 and
 .Ql nan ,
 respectively.
 .It Cm c
 The first byte of
 .Ar argument
 is printed.
 .It Cm s
 Bytes from the string
 .Ar argument
 are printed until the end is reached or until the number of bytes
 indicated by the precision specification is reached; however, if the
 precision is 0 or missing, the string is printed entirely.
 .It Cm b
 As for
 .Cm s ,
 but interpret character escapes in backslash notation in the string
 .Ar argument .
 The permitted escape sequences are slightly different in that
 octal escapes are
 .Cm \e0 Ns Ar num
 instead of
-.Cm \e Ns Ar num .
+.Cm \e Ns Ar num
+and that an additional escape sequence
+.Cm \ec
+stops further output from this
+.Nm
+invocation.
 .It Cm n$
 Allows reordering of the output according to
 .Ar argument .
 .It Cm \&%
 Print a `%'; no argument is used.
 .El
 .Pp
 The decimal point
 character is defined in the program's locale (category
 .Dv LC_NUMERIC ) .
 .Pp
 In no case does a non-existent or small field width cause truncation of
 a field; padding takes place only if the specified field width exceeds
 the actual width.
 .Pp
 Some shells may provide a builtin
 .Nm
 command which is similar or identical to this utility.
 Consult the
 .Xr builtin 1
 manual page.
 .Sh EXIT STATUS
 .Ex -std
 .Sh COMPATIBILITY
 The traditional
 .Bx
 behavior of converting arguments of numeric formats not beginning
 with a digit to the
 .Tn ASCII
 code of the first character is not supported.
 .Sh SEE ALSO
 .Xr builtin 1 ,
 .Xr echo 1 ,
 .Xr sh 1 ,
 .Xr printf 3
 .Sh STANDARDS
 The
 .Nm
 command is expected to be compatible with the
 .St -p1003.2
 specification.
 .Sh HISTORY
 The
 .Nm
 command appeared in
 .Bx 4.3 Reno .
 It is modeled
 after the standard library function,
 .Xr printf 3 .
 .Sh CAVEATS
 .Tn ANSI
 hexadecimal character constants were deliberately not provided.
 .Pp
 Trying to print a dash ("-") as the first character causes
 .Nm
 to interpret the dash as a program argument.
 .Nm --
 must be used before
 .Ar format .
 .Pp
 If the locale contains multibyte characters
 (such as UTF-8),
 the
 .Cm c
 format and
 .Cm b
 and
 .Cm s
 formats with a precision
 may not operate as expected.
 .Sh BUGS
 Since floating-point numbers are translated from
 .Tn ASCII
 to floating-point and
 then back again, floating-point precision may be lost.
 (By default, the number is translated to an IEEE-754 double-precision
 value before being printed.
 The
 .Cm L
 modifier may produce additional precision, depending on the hardware platform.)
 .Pp
 The escape sequence \e000 is the string terminator.
 When present in the argument for the
 .Cm b
 format, the argument will be truncated at the \e000 character.
 .Pp
 Multibyte characters are not recognized in format strings (this is only
 a problem if
 .Ql %
 can appear inside a multibyte character).
Index: projects/fuse2
===================================================================
--- projects/fuse2	(revision 350434)
+++ projects/fuse2	(revision 350435)

Property changes on: projects/fuse2
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r350391-350426