Index: head/etc/mtree/BSD.tests.dist =================================================================== --- head/etc/mtree/BSD.tests.dist (revision 299089) +++ head/etc/mtree/BSD.tests.dist (revision 299090) @@ -1,664 +1,666 @@ # $FreeBSD$ # # Please see the file src/etc/mtree/README before making changes to this file. # /set type=dir uname=root gname=wheel mode=0755 . bin cat .. chown .. date .. dd .. expr .. ls .. mv .. pax .. pkill .. sh builtins .. errors .. execution .. expansion .. parameters .. parser .. set-e .. .. sleep .. test .. .. cddl lib .. sbin .. usr.bin .. usr.sbin dtrace common aggs .. arithmetic .. arrays .. assocs .. begin .. bitfields .. buffering .. builtinvar .. cg .. clauses .. cpc .. decls .. drops .. dtraceUtil .. end .. enum .. error .. exit .. fbtprovider .. funcs .. grammar .. include .. inline .. io .. ip .. java_api .. json .. lexer .. llquantize .. mdb .. mib .. misc .. multiaggs .. offsetof .. operators .. pid .. plockstat .. pointers .. pragma .. predicates .. preprocessor .. print .. printa .. printf .. privs .. probes .. proc .. profile-n .. providers .. raise .. rates .. safety .. scalars .. sched .. scripting .. sdt .. sizeof .. speculation .. stability .. stack .. stackdepth .. stop .. strlen .. strtoll .. struct .. syscall .. sysevent .. tick-n .. trace .. tracemem .. translators .. typedef .. types .. uctf .. union .. usdt .. ustack .. vars .. version .. .. .. .. .. etc rc.d .. .. games .. gnu lib .. usr.bin diff .. .. .. lib atf libatf-c detail .. .. libatf-c++ detail .. .. test-programs .. .. libarchive .. libc c063 .. db .. gen execve .. posix_spawn .. .. hash data .. .. inet .. locale .. net getaddrinfo data .. .. .. nss .. regex data .. .. resolv .. rpc .. ssp .. setjmp .. stdio .. stdlib .. string .. sys .. time .. tls dso .. .. termios .. ttyio .. .. libcrypt .. libmp .. libnv .. libpam .. libproc .. librt .. libthr dlopen .. .. libutil .. libxo .. msun .. .. libexec atf atf-check .. atf-sh .. .. rtld-elf .. .. sbin dhclient .. devd .. growfs .. ifconfig .. mdconfig .. .. secure lib .. libexec .. usr.bin .. usr.sbin .. .. share examples tests atf .. plain .. .. .. .. sys acl .. aio .. fifo .. file .. geom class concat .. eli .. gate .. gpt .. mirror .. nop .. raid3 .. shsec .. stripe .. uzip etalon .. .. .. .. kern acct .. execve .. pipe .. .. kqueue .. mac bsdextended .. portacl .. .. mqueue .. netinet .. opencrypto .. pjdfstest chflags .. chmod .. chown .. ftruncate .. granular .. link .. mkdir .. mkfifo .. mknod .. open .. rename .. rmdir .. symlink .. truncate .. unlink .. .. posixshm .. + sys + .. vfs .. vm .. .. usr.bin apply .. basename .. bmake archives fmt_44bsd .. fmt_44bsd_mod .. fmt_oldbsd .. .. basic t0 .. t1 .. t2 .. t3 .. .. execution ellipsis .. empty .. joberr .. plus .. .. shell builtin .. meta .. path .. path_select .. replace .. select .. .. suffixes basic .. src_wild1 .. src_wild2 .. .. syntax directive-t0 .. enl .. funny-targets .. semi .. .. sysmk t0 2 1 .. .. mk .. .. t1 2 1 .. .. mk .. .. t2 2 1 .. .. mk .. .. .. variables modifier_M .. modifier_t .. opt_V .. t0 .. .. .. calendar .. cmp .. cpio .. col .. comm .. cut .. dirname .. file2c .. grep .. gzip .. ident .. join .. jot .. lastcomm .. limits .. m4 .. mkimg .. ncal .. opensm .. printf .. sdiff .. sed regress.multitest.out .. .. soelim .. tar .. timeout .. tr .. truncate .. units .. uudecode .. uuencode .. xargs .. xo .. yacc yacc .. .. .. usr.sbin etcupdate .. extattr .. fstyp .. makefs .. newsyslog .. nmtree .. pw .. rpcbind .. sa .. .. .. 
# vim: set expandtab ts=4 sw=4: Index: head/include/bitstring.h =================================================================== --- head/include/bitstring.h (revision 299089) +++ head/include/bitstring.h (revision 299090) @@ -1,35 +1,37 @@ /*- * Copyright (c) 2003 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _BITSTRING_H_ #define _BITSTRING_H_ +#include +#include #include #endif /* _BITSTRING_H_ */ Index: head/lib/libbluetooth/bluetooth.h =================================================================== --- head/lib/libbluetooth/bluetooth.h (revision 299089) +++ head/lib/libbluetooth/bluetooth.h (revision 299090) @@ -1,216 +1,218 @@ /* * bluetooth.h */ /*- * Copyright (c) 2001-2009 Maksim Yevmenkin * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $Id: bluetooth.h,v 1.5 2003/09/14 23:28:42 max Exp $ * $FreeBSD$ */ #ifndef _BLUETOOTH_H_ #define _BLUETOOTH_H_ #include -#include #include #include #include #include #include + #include #include +#include + #include #include #include #include #include __BEGIN_DECLS /* * Linux BlueZ compatibility */ #define bacmp(ba1, ba2) memcmp((ba1), (ba2), sizeof(bdaddr_t)) #define bacpy(dst, src) memcpy((dst), (src), sizeof(bdaddr_t)) #define ba2str(ba, str) bt_ntoa((ba), (str)) #define str2ba(str, ba) (bt_aton((str), (ba)) == 1? 0 : -1) #define htobs(d) htole16(d) #define htobl(d) htole32(d) #define btohs(d) le16toh(d) #define btohl(d) le32toh(d) /* * Interface to the outside world */ struct hostent * bt_gethostbyname (char const *name); struct hostent * bt_gethostbyaddr (char const *addr, int len, int type); struct hostent * bt_gethostent (void); void bt_sethostent (int stayopen); void bt_endhostent (void); struct protoent * bt_getprotobyname (char const *name); struct protoent * bt_getprotobynumber (int proto); struct protoent * bt_getprotoent (void); void bt_setprotoent (int stayopen); void bt_endprotoent (void); char const * bt_ntoa (bdaddr_t const *ba, char *str); int bt_aton (char const *str, bdaddr_t *ba); /* bt_devXXXX() functions (inspired by NetBSD) */ int bt_devaddr (char const *devname, bdaddr_t *addr); int bt_devname (char *devname, bdaddr_t const *addr); /* * Bluetooth HCI functions */ #define HCI_DEVMAX 32 /* arbitrary */ #define HCI_DEVNAME_SIZE NG_NODESIZ #define HCI_DEVFEATURES_SIZE NG_HCI_FEATURES_SIZE struct bt_devinfo { char devname[HCI_DEVNAME_SIZE]; uint32_t state; /* device/implementation specific */ bdaddr_t bdaddr; uint16_t _reserved0; uint8_t features[HCI_DEVFEATURES_SIZE]; /* buffer info */ uint16_t _reserved1; uint16_t cmd_free; uint16_t sco_size; uint16_t sco_pkts; uint16_t sco_free; uint16_t acl_size; uint16_t acl_pkts; uint16_t acl_free; /* stats */ uint32_t cmd_sent; uint32_t evnt_recv; uint32_t acl_recv; uint32_t acl_sent; uint32_t sco_recv; uint32_t sco_sent; uint32_t bytes_recv; uint32_t bytes_sent; /* misc/specific */ uint16_t link_policy_info; uint16_t packet_type_info; uint16_t role_switch_info; uint16_t debug; uint8_t _padding[20]; /* leave space for future additions */ }; struct bt_devreq { uint16_t opcode; uint8_t event; void *cparam; size_t clen; void *rparam; size_t rlen; }; struct bt_devfilter { bitstr_t bit_decl(packet_mask, 8); bitstr_t bit_decl(event_mask, 256); }; struct bt_devinquiry { bdaddr_t bdaddr; uint8_t pscan_rep_mode; uint8_t pscan_period_mode; uint8_t dev_class[3]; uint16_t clock_offset; int8_t rssi; uint8_t data[240]; }; typedef int (bt_devenum_cb_t)(int, struct bt_devinfo const *, void *); int bt_devopen (char const *devname); int bt_devclose(int s); int bt_devsend (int s, uint16_t opcode, void *param, size_t plen); ssize_t bt_devrecv (int s, void *buf, size_t size, time_t to); int bt_devreq (int s, struct bt_devreq *r, time_t to); int bt_devfilter(int s, struct bt_devfilter const *newp, struct bt_devfilter *oldp); void bt_devfilter_pkt_set(struct bt_devfilter *filter, uint8_t type); void bt_devfilter_pkt_clr(struct bt_devfilter *filter, uint8_t type); int bt_devfilter_pkt_tst(struct bt_devfilter const *filter, uint8_t type); void bt_devfilter_evt_set(struct bt_devfilter *filter, uint8_t event); void bt_devfilter_evt_clr(struct bt_devfilter *filter, uint8_t event); int bt_devfilter_evt_tst(struct bt_devfilter const *filter, uint8_t event); int bt_devinquiry(char const *devname, time_t length, int num_rsp, struct bt_devinquiry **ii); int 
bt_devinfo (struct bt_devinfo *di); int bt_devenum (bt_devenum_cb_t cb, void *arg); /* * bdaddr utility functions (from NetBSD) */ static __inline int bdaddr_same(const bdaddr_t *a, const bdaddr_t *b) { return (a->b[0] == b->b[0] && a->b[1] == b->b[1] && a->b[2] == b->b[2] && a->b[3] == b->b[3] && a->b[4] == b->b[4] && a->b[5] == b->b[5]); } static __inline int bdaddr_any(const bdaddr_t *a) { return (a->b[0] == 0 && a->b[1] == 0 && a->b[2] == 0 && a->b[3] == 0 && a->b[4] == 0 && a->b[5] == 0); } static __inline void bdaddr_copy(bdaddr_t *d, const bdaddr_t *s) { d->b[0] = s->b[0]; d->b[1] = s->b[1]; d->b[2] = s->b[2]; d->b[3] = s->b[3]; d->b[4] = s->b[4]; d->b[5] = s->b[5]; } __END_DECLS #endif /* ndef _BLUETOOTH_H_ */ Index: head/sbin/hastd/activemap.c =================================================================== --- head/sbin/hastd/activemap.c (revision 299089) +++ head/sbin/hastd/activemap.c (revision 299090) @@ -1,701 +1,701 @@ /*- * Copyright (c) 2009-2010 The FreeBSD Foundation * All rights reserved. * * This software was developed by Pawel Jakub Dawidek under sponsorship from * the FreeBSD Foundation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include /* powerof2() */ #include #include #include #include #include #include #include #include #include "activemap.h" #ifndef PJDLOG_ASSERT #include #define PJDLOG_ASSERT(...) assert(__VA_ARGS__) #endif #define ACTIVEMAP_MAGIC 0xac71e4 struct activemap { int am_magic; /* Magic value. */ off_t am_mediasize; /* Media size in bytes. */ uint32_t am_extentsize; /* Extent size in bytes, must be power of 2. */ uint8_t am_extentshift;/* 2 ^ extentbits == extentsize */ int am_nextents; /* Number of extents. */ size_t am_mapsize; /* Bitmap size in bytes. */ uint16_t *am_memtab; /* An array that holds number of pending writes per extent. */ bitstr_t *am_diskmap; /* On-disk bitmap of dirty extents. */ bitstr_t *am_memmap; /* In-memory bitmap of dirty extents. */ size_t am_diskmapsize; /* Map size rounded up to sector size. */ uint64_t am_ndirty; /* Number of dirty regions. */ bitstr_t *am_syncmap; /* Bitmap of extents to sync. */ off_t am_syncoff; /* Next synchronization offset. */ TAILQ_HEAD(skeepdirty, keepdirty) am_keepdirty; /* List of extents that we keep dirty to reduce bitmap updates. 
*/ int am_nkeepdirty; /* Number of am_keepdirty elements. */ int am_nkeepdirty_limit; /* Maximum number of am_keepdirty elements. */ }; struct keepdirty { int kd_extent; TAILQ_ENTRY(keepdirty) kd_next; }; /* * Helper function taken from sys/systm.h to calculate extentshift. */ static uint32_t bitcount32(uint32_t x) { x = (x & 0x55555555) + ((x & 0xaaaaaaaa) >> 1); x = (x & 0x33333333) + ((x & 0xcccccccc) >> 2); x = (x + (x >> 4)) & 0x0f0f0f0f; x = (x + (x >> 8)); x = (x + (x >> 16)) & 0x000000ff; return (x); } static __inline int off2ext(const struct activemap *amp, off_t offset) { int extent; PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize); extent = (offset >> amp->am_extentshift); PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents); return (extent); } static __inline off_t ext2off(const struct activemap *amp, int extent) { off_t offset; PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents); offset = ((off_t)extent << amp->am_extentshift); PJDLOG_ASSERT(offset >= 0 && offset < amp->am_mediasize); return (offset); } /* * Function calculates number of requests needed to synchronize the given * extent. */ static __inline int ext2reqs(const struct activemap *amp, int ext) { off_t left; if (ext < amp->am_nextents - 1) return (((amp->am_extentsize - 1) / MAXPHYS) + 1); PJDLOG_ASSERT(ext == amp->am_nextents - 1); left = amp->am_mediasize % amp->am_extentsize; if (left == 0) left = amp->am_extentsize; return (((left - 1) / MAXPHYS) + 1); } /* * Initialize activemap structure and allocate memory for internal needs. * Function returns 0 on success and -1 if any of the allocations failed. */ int activemap_init(struct activemap **ampp, uint64_t mediasize, uint32_t extentsize, uint32_t sectorsize, uint32_t keepdirty) { struct activemap *amp; PJDLOG_ASSERT(ampp != NULL); PJDLOG_ASSERT(mediasize > 0); PJDLOG_ASSERT(extentsize > 0); PJDLOG_ASSERT(powerof2(extentsize)); PJDLOG_ASSERT(sectorsize > 0); PJDLOG_ASSERT(powerof2(sectorsize)); PJDLOG_ASSERT(keepdirty > 0); amp = malloc(sizeof(*amp)); if (amp == NULL) return (-1); amp->am_mediasize = mediasize; amp->am_nkeepdirty_limit = keepdirty; amp->am_extentsize = extentsize; amp->am_extentshift = bitcount32(extentsize - 1); amp->am_nextents = ((mediasize - 1) / extentsize) + 1; - amp->am_mapsize = sizeof(bitstr_t) * bitstr_size(amp->am_nextents); + amp->am_mapsize = bitstr_size(amp->am_nextents); amp->am_diskmapsize = roundup2(amp->am_mapsize, sectorsize); amp->am_ndirty = 0; amp->am_syncoff = -2; TAILQ_INIT(&->am_keepdirty); amp->am_nkeepdirty = 0; amp->am_memtab = calloc(amp->am_nextents, sizeof(amp->am_memtab[0])); amp->am_diskmap = calloc(1, amp->am_diskmapsize); amp->am_memmap = bit_alloc(amp->am_nextents); amp->am_syncmap = bit_alloc(amp->am_nextents); /* * Check to see if any of the allocations above failed. 
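
The bitstr_size() hunk above is the substance of this file's change: with the reworked bitstring(3) API, bitstr_size() returns a size in bytes rather than a count of bitstr_t elements, so the sizeof(bitstr_t) factor is dropped. A minimal standalone sketch of sizing and copying a bitmap under the new semantics (copy_bitmap() is an illustrative name, not part of hastd):

        #include <bitstring.h>
        #include <stdlib.h>
        #include <string.h>

        /* Illustrative only: duplicate a bitmap of nbits bits. */
        static bitstr_t *
        copy_bitmap(const bitstr_t *src, int nbits)
        {
                size_t mapsize = bitstr_size(nbits);    /* now a byte count */
                bitstr_t *dst = malloc(mapsize);

                if (dst != NULL)
                        memcpy(dst, src, mapsize);
                return (dst);
        }
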
*/ if (amp->am_memtab == NULL || amp->am_diskmap == NULL || amp->am_memmap == NULL || amp->am_syncmap == NULL) { if (amp->am_memtab != NULL) free(amp->am_memtab); if (amp->am_diskmap != NULL) free(amp->am_diskmap); if (amp->am_memmap != NULL) free(amp->am_memmap); if (amp->am_syncmap != NULL) free(amp->am_syncmap); amp->am_magic = 0; free(amp); errno = ENOMEM; return (-1); } amp->am_magic = ACTIVEMAP_MAGIC; *ampp = amp; return (0); } static struct keepdirty * keepdirty_find(struct activemap *amp, int extent) { struct keepdirty *kd; TAILQ_FOREACH(kd, &->am_keepdirty, kd_next) { if (kd->kd_extent == extent) break; } return (kd); } static bool keepdirty_add(struct activemap *amp, int extent) { struct keepdirty *kd; kd = keepdirty_find(amp, extent); if (kd != NULL) { /* * Only move element at the beginning. */ TAILQ_REMOVE(&->am_keepdirty, kd, kd_next); TAILQ_INSERT_HEAD(&->am_keepdirty, kd, kd_next); return (false); } /* * Add new element, but first remove the most unused one if * we have too many. */ if (amp->am_nkeepdirty >= amp->am_nkeepdirty_limit) { kd = TAILQ_LAST(&->am_keepdirty, skeepdirty); PJDLOG_ASSERT(kd != NULL); TAILQ_REMOVE(&->am_keepdirty, kd, kd_next); amp->am_nkeepdirty--; PJDLOG_ASSERT(amp->am_nkeepdirty > 0); } if (kd == NULL) kd = malloc(sizeof(*kd)); /* We can ignore allocation failure. */ if (kd != NULL) { kd->kd_extent = extent; amp->am_nkeepdirty++; TAILQ_INSERT_HEAD(&->am_keepdirty, kd, kd_next); } return (true); } static void keepdirty_fill(struct activemap *amp) { struct keepdirty *kd; TAILQ_FOREACH(kd, &->am_keepdirty, kd_next) bit_set(amp->am_diskmap, kd->kd_extent); } static void keepdirty_free(struct activemap *amp) { struct keepdirty *kd; while ((kd = TAILQ_FIRST(&->am_keepdirty)) != NULL) { TAILQ_REMOVE(&->am_keepdirty, kd, kd_next); amp->am_nkeepdirty--; free(kd); } PJDLOG_ASSERT(amp->am_nkeepdirty == 0); } /* * Function frees resources allocated by activemap_init() function. */ void activemap_free(struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); amp->am_magic = 0; keepdirty_free(amp); free(amp->am_memtab); free(amp->am_diskmap); free(amp->am_memmap); free(amp->am_syncmap); } /* * Function should be called before we handle write requests. It updates * internal structures and returns true if on-disk metadata should be updated. */ bool activemap_write_start(struct activemap *amp, off_t offset, off_t length) { bool modified; off_t end; int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(length > 0); modified = false; end = offset + length - 1; for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) { /* * If the number of pending writes is increased from 0, * we have to mark the extent as dirty also in on-disk bitmap. * By returning true we inform the caller that on-disk bitmap * was modified and has to be flushed to disk. */ if (amp->am_memtab[ext]++ == 0) { PJDLOG_ASSERT(!bit_test(amp->am_memmap, ext)); bit_set(amp->am_memmap, ext); amp->am_ndirty++; } if (keepdirty_add(amp, ext)) modified = true; } return (modified); } /* * Function should be called after receiving write confirmation. It updates * internal structures and returns true if on-disk metadata should be updated. 
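
For context, activemap_write_start() and activemap_write_complete() are meant to bracket the actual I/O, and the caller flushes the on-disk map whenever either returns true. A hedged sketch of that pattern; flush_metadata() and backend_write() are hypothetical helpers, not part of this file:

        #include "activemap.h"

        /* Hypothetical helpers, assumed to exist elsewhere in the daemon. */
        static void flush_metadata(struct activemap *amp, int fd);
        static void backend_write(int fd, off_t offset, off_t length);

        /* Hypothetical write path built on the two hooks. */
        static void
        handle_write(struct activemap *amp, int fd, off_t offset, off_t length)
        {
                if (activemap_write_start(amp, offset, length))
                        flush_metadata(amp, fd);        /* extents became dirty */

                backend_write(fd, offset, length);      /* the data I/O itself */

                if (activemap_write_complete(amp, offset, length))
                        flush_metadata(amp, fd);        /* extents became clean */
        }
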
*/ bool activemap_write_complete(struct activemap *amp, off_t offset, off_t length) { bool modified; off_t end; int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(length > 0); modified = false; end = offset + length - 1; for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) { /* * If the number of pending writes goes down to 0, we have to * mark the extent as clean also in on-disk bitmap. * By returning true we inform the caller that on-disk bitmap * was modified and has to be flushed to disk. */ PJDLOG_ASSERT(amp->am_memtab[ext] > 0); PJDLOG_ASSERT(bit_test(amp->am_memmap, ext)); if (--amp->am_memtab[ext] == 0) { bit_clear(amp->am_memmap, ext); amp->am_ndirty--; if (keepdirty_find(amp, ext) == NULL) modified = true; } } return (modified); } /* * Function should be called after finishing synchronization of one extent. * It returns true if on-disk metadata should be updated. */ bool activemap_extent_complete(struct activemap *amp, int extent) { bool modified; int reqs; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(extent >= 0 && extent < amp->am_nextents); modified = false; reqs = ext2reqs(amp, extent); PJDLOG_ASSERT(amp->am_memtab[extent] >= reqs); amp->am_memtab[extent] -= reqs; PJDLOG_ASSERT(bit_test(amp->am_memmap, extent)); if (amp->am_memtab[extent] == 0) { bit_clear(amp->am_memmap, extent); amp->am_ndirty--; modified = true; } return (modified); } /* * Function returns number of dirty regions. */ uint64_t activemap_ndirty(const struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); return (amp->am_ndirty); } /* * Function compare on-disk bitmap and in-memory bitmap and returns true if * they differ and should be flushed to the disk. */ bool activemap_differ(const struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); return (memcmp(amp->am_diskmap, amp->am_memmap, amp->am_mapsize) != 0); } /* * Function returns number of bytes used by bitmap. */ size_t activemap_size(const struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); return (amp->am_mapsize); } /* * Function returns number of bytes needed for storing on-disk bitmap. * This is the same as activemap_size(), but rounded up to sector size. */ size_t activemap_ondisk_size(const struct activemap *amp) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); return (amp->am_diskmapsize); } /* * Function copies the given buffer read from disk to the internal bitmap. */ void activemap_copyin(struct activemap *amp, const unsigned char *buf, size_t size) { int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(size >= amp->am_mapsize); memcpy(amp->am_diskmap, buf, amp->am_mapsize); memcpy(amp->am_memmap, buf, amp->am_mapsize); memcpy(amp->am_syncmap, buf, amp->am_mapsize); bit_ffs(amp->am_memmap, amp->am_nextents, &ext); if (ext == -1) { /* There are no dirty extents, so we can leave now. */ return; } /* * Set synchronization offset to the first dirty extent. */ activemap_sync_rewind(amp); /* * We have dirty extents and we want them to stay that way until * we synchronize, so we set number of pending writes to number * of requests needed to synchronize one extent. */ amp->am_ndirty = 0; for (; ext < amp->am_nextents; ext++) { if (bit_test(amp->am_memmap, ext)) { amp->am_memtab[ext] = ext2reqs(amp, ext); amp->am_ndirty++; } } } /* * Function merges the given bitmap with existing one. 
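
activemap_differ() together with activemap_bitmap() (the latter appears a little further below) are the pieces such a flush helper would be built from. A minimal sketch, assuming the caller owns a metadata file descriptor and that the bitmap lives at offset 0 of that file:

        #include <unistd.h>
        #include "activemap.h"

        /* Illustrative flush: write the bitmap only when it actually changed. */
        static void
        flush_metadata(struct activemap *amp, int fd)
        {
                const unsigned char *buf;
                size_t size;

                if (!activemap_differ(amp))
                        return;
                buf = activemap_bitmap(amp, &size);     /* sector-rounded size */
                (void)pwrite(fd, buf, size, 0);         /* offset 0 is assumed */
        }
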
*/ void activemap_merge(struct activemap *amp, const unsigned char *buf, size_t size) { bitstr_t *remmap = __DECONST(bitstr_t *, buf); int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(size >= amp->am_mapsize); bit_ffs(remmap, amp->am_nextents, &ext); if (ext == -1) { /* There are no dirty extents, so we can leave now. */ return; } /* * We have dirty extents and we want them to stay that way until * we synchronize, so we set number of pending writes to number * of requests needed to synchronize one extent. */ for (; ext < amp->am_nextents; ext++) { /* Local extent already dirty. */ if (bit_test(amp->am_syncmap, ext)) continue; /* Remote extent isn't dirty. */ if (!bit_test(remmap, ext)) continue; bit_set(amp->am_syncmap, ext); bit_set(amp->am_memmap, ext); bit_set(amp->am_diskmap, ext); if (amp->am_memtab[ext] == 0) amp->am_ndirty++; amp->am_memtab[ext] = ext2reqs(amp, ext); } /* * Set synchronization offset to the first dirty extent. */ activemap_sync_rewind(amp); } /* * Function returns pointer to internal bitmap that should be written to disk. */ const unsigned char * activemap_bitmap(struct activemap *amp, size_t *sizep) { PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); if (sizep != NULL) *sizep = amp->am_diskmapsize; memcpy(amp->am_diskmap, amp->am_memmap, amp->am_mapsize); keepdirty_fill(amp); return ((const unsigned char *)amp->am_diskmap); } /* * Function calculates size needed to store bitmap on disk. */ size_t activemap_calc_ondisk_size(uint64_t mediasize, uint32_t extentsize, uint32_t sectorsize) { uint64_t nextents, mapsize; PJDLOG_ASSERT(mediasize > 0); PJDLOG_ASSERT(extentsize > 0); PJDLOG_ASSERT(powerof2(extentsize)); PJDLOG_ASSERT(sectorsize > 0); PJDLOG_ASSERT(powerof2(sectorsize)); nextents = ((mediasize - 1) / extentsize) + 1; - mapsize = sizeof(bitstr_t) * bitstr_size(nextents); + mapsize = bitstr_size(nextents); return (roundup2(mapsize, sectorsize)); } /* * Set synchronization offset to the first dirty extent. */ void activemap_sync_rewind(struct activemap *amp) { int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); bit_ffs(amp->am_syncmap, amp->am_nextents, &ext); if (ext == -1) { /* There are no extents to synchronize. */ amp->am_syncoff = -2; return; } /* * Mark that we want to start synchronization from the beginning. */ amp->am_syncoff = -1; } /* * Return next offset of where we should synchronize. */ off_t activemap_sync_offset(struct activemap *amp, off_t *lengthp, int *syncextp) { off_t syncoff, left; int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); PJDLOG_ASSERT(lengthp != NULL); PJDLOG_ASSERT(syncextp != NULL); *syncextp = -1; if (amp->am_syncoff == -2) return (-1); if (amp->am_syncoff >= 0 && (amp->am_syncoff + MAXPHYS >= amp->am_mediasize || off2ext(amp, amp->am_syncoff) != off2ext(amp, amp->am_syncoff + MAXPHYS))) { /* * We are about to change extent, so mark previous one as clean. */ ext = off2ext(amp, amp->am_syncoff); bit_clear(amp->am_syncmap, ext); *syncextp = ext; amp->am_syncoff = -1; } if (amp->am_syncoff == -1) { /* * Let's find first extent to synchronize. */ bit_ffs(amp->am_syncmap, amp->am_nextents, &ext); if (ext == -1) { amp->am_syncoff = -2; return (-1); } amp->am_syncoff = ext2off(amp, ext); } else { /* * We don't change extent, so just increase offset. 
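
Taken together, activemap_sync_rewind(), activemap_sync_offset(), and activemap_extent_complete() drive resynchronization from the caller's side. A hedged caller-side sketch, with copy_range() and flush_metadata() as hypothetical stand-ins (the latter in the spirit of the sketch above):

        #include "activemap.h"

        /* Hypothetical stand-ins for the daemon's own I/O and metadata code. */
        static void copy_range(int fd, off_t offset, off_t length);
        static void flush_metadata(struct activemap *amp, int fd);

        static void
        synchronize(struct activemap *amp, int fd)
        {
                off_t offset, length;
                int syncext;

                activemap_sync_rewind(amp);
                for (;;) {
                        offset = activemap_sync_offset(amp, &length, &syncext);
                        if (syncext != -1 &&
                            activemap_extent_complete(amp, syncext))
                                flush_metadata(amp, fd); /* extent is now clean */
                        if (offset == -1)
                                break;
                        copy_range(fd, offset, length);
                }
        }
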
*/ amp->am_syncoff += MAXPHYS; if (amp->am_syncoff >= amp->am_mediasize) { amp->am_syncoff = -2; return (-1); } } syncoff = amp->am_syncoff; left = ext2off(amp, off2ext(amp, syncoff)) + amp->am_extentsize - syncoff; if (syncoff + left > amp->am_mediasize) left = amp->am_mediasize - syncoff; if (left > MAXPHYS) left = MAXPHYS; PJDLOG_ASSERT(left >= 0 && left <= MAXPHYS); PJDLOG_ASSERT(syncoff >= 0 && syncoff < amp->am_mediasize); PJDLOG_ASSERT(syncoff + left >= 0 && syncoff + left <= amp->am_mediasize); *lengthp = left; return (syncoff); } /* * Mark extent(s) containing the given region for synchronization. * Most likely one of the components is unavailable. */ bool activemap_need_sync(struct activemap *amp, off_t offset, off_t length) { bool modified; off_t end; int ext; PJDLOG_ASSERT(amp->am_magic == ACTIVEMAP_MAGIC); modified = false; end = offset + length - 1; for (ext = off2ext(amp, offset); ext <= off2ext(amp, end); ext++) { if (bit_test(amp->am_syncmap, ext)) { /* Already marked for synchronization. */ PJDLOG_ASSERT(bit_test(amp->am_memmap, ext)); continue; } bit_set(amp->am_syncmap, ext); if (!bit_test(amp->am_memmap, ext)) { bit_set(amp->am_memmap, ext); amp->am_ndirty++; } amp->am_memtab[ext] += ext2reqs(amp, ext); modified = true; } return (modified); } void activemap_dump(const struct activemap *amp) { int bit; printf("M: "); for (bit = 0; bit < amp->am_nextents; bit++) printf("%d", bit_test(amp->am_memmap, bit) ? 1 : 0); printf("\n"); printf("D: "); for (bit = 0; bit < amp->am_nextents; bit++) printf("%d", bit_test(amp->am_diskmap, bit) ? 1 : 0); printf("\n"); printf("S: "); for (bit = 0; bit < amp->am_nextents; bit++) printf("%d", bit_test(amp->am_syncmap, bit) ? 1 : 0); printf("\n"); } Index: head/share/man/man3/bitstring.3 =================================================================== --- head/share/man/man3/bitstring.3 (revision 299089) +++ head/share/man/man3/bitstring.3 (revision 299090) @@ -1,187 +1,263 @@ .\" Copyright (c) 1989, 1991, 1993 .\" The Regents of the University of California. All rights reserved. .\" .\" This code is derived from software contributed to Berkeley by .\" Paul Vixie. .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions and the following disclaimer. .\" 2. Redistributions in binary form must reproduce the above copyright .\" notice, this list of conditions and the following disclaimer in the .\" documentation and/or other materials provided with the distribution. .\" 3. Neither the name of the University nor the names of its contributors .\" may be used to endorse or promote products derived from this software .\" without specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE .\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE .\" ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE .\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT .\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" +.\" Copyright (c) 2014 Spectra Logic Corporation +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions, and the following disclaimer, +.\" without modification. +.\" 2. Redistributions in binary form must reproduce at minimum a disclaimer +.\" substantially similar to the "NO WARRANTY" disclaimer below +.\" ("Disclaimer") and any redistribution must be conditioned upon +.\" including a substantially similar Disclaimer requirement for further +.\" binary redistribution. +.\" +.\" NO WARRANTY +.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +.\" "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +.\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR +.\" A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +.\" HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +.\" STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +.\" IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGES. +.\" .\" @(#)bitstring.3 8.1 (Berkeley) 7/19/93 .\" $FreeBSD$ .\" -.Dd October 17, 2015 +.Dd May 4, 2016 .Dt BITSTRING 3 .Os .Sh NAME .Nm bit_alloc , .Nm bit_clear , .Nm bit_decl , +.Nm bit_ffc , .Nm bit_ffs , +.Nm bit_ffc_at , +.Nm bit_ffs_at , .Nm bit_nclear , .Nm bit_nset , .Nm bit_set , -.Nm bitstr_size , -.Nm bit_test -.Nd bit-string manipulation macros +.Nm bit_test , +.Nm bitstr_size +.Nd bit-string manipulation functions and macros .Sh SYNOPSIS .In bitstring.h .Ft bitstr_t * .Fn bit_alloc "int nbits" .Ft void .Fn bit_decl "bitstr_t *name" "int nbits" .Ft void .Fn bit_clear "bitstr_t *name" "int bit" .Ft void .Fn bit_ffc "bitstr_t *name" "int nbits" "int *value" .Ft void .Fn bit_ffs "bitstr_t *name" "int nbits" "int *value" .Ft void +.Fn bit_ffc_at "bitstr_t *name" "int start" "int nbits" "int *value" +.Ft void +.Fn bit_ffs_at "bitstr_t *name" "int start" "int nbits" "int *value" +.Ft void .Fn bit_nclear "bitstr_t *name" "int start" "int stop" .Ft void .Fn bit_nset "bitstr_t *name" "int start" "int stop" .Ft void .Fn bit_set "bitstr_t *name" "int bit" .Ft int .Fn bitstr_size "int nbits" .Ft int .Fn bit_test "bitstr_t *name" "int bit" .Sh DESCRIPTION These macros operate on strings of bits. .Pp -The macro +The function .Fn bit_alloc returns a pointer of type .Dq Fa "bitstr_t *" to sufficient space to store .Fa nbits bits, or .Dv NULL if no space is available. +If successful, the returned bit string is initialized with all bits cleared. 
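
As a quick illustration of the new bit_ffs_at() and bit_ffc_at() variants named in the SYNOPSIS above, here is a standalone sketch (separate from the manual page's own EXAMPLES section); the argument order follows the prototypes shown above:

        #include <bitstring.h>
        #include <stdio.h>
        #include <stdlib.h>

        int
        main(void)
        {
                bitstr_t *bits;
                int bit;

                bits = bit_alloc(64);           /* returned with all bits clear */
                if (bits == NULL)
                        return (1);

                bit_set(bits, 10);
                bit_set(bits, 11);

                bit_ffs_at(bits, 11, 64, &bit); /* first set bit >= 11: 11 */
                printf("%d\n", bit);

                bit_ffc_at(bits, 10, 64, &bit); /* first clear bit >= 10: 12 */
                printf("%d\n", bit);

                free(bits);
                return (0);
        }
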
.Pp The macro .Fn bit_decl -allocates sufficient space to store +declares a bit string with sufficient space to store .Fa nbits -bits on the stack. +bits. +.Fn bit_decl +may be used to include statically sized bit strings in structure +definitions or to create bit strings on the stack. +Users of this macro are responsible for initialization of the bit string, +typically via a global initialization of the containing struct or use of the +.Fn bit_nset +or +.Fn bin_nclear +functions. .Pp The macro .Fn bitstr_size -returns the number of elements of type -.Fa bitstr_t -necessary to store +returns the number of bytes necessary to store .Fa nbits bits. This is useful for copying bit strings. .Pp -The macros +The functions .Fn bit_clear and .Fn bit_set clear or set the zero-based numbered bit .Fa bit , in the bit string .Ar name . .Pp The .Fn bit_nset and .Fn bit_nclear -macros +functions set or clear the zero-based numbered bits from .Fa start through .Fa stop in the bit string .Ar name . .Pp The .Fn bit_test -macro +function evaluates to non-zero if the zero-based numbered bit .Fa bit of bit string .Fa name is set, and zero otherwise. .Pp +The function +.Fn bit_ffc +stores in the location referenced by +.Fa value +the zero-based number of the first bit not set in the array of +.Fa nbits +bits referenced by +.Fa name . +If all bits are set, the location referenced by +.Fa value +is set to \-1. +.Pp The .Fn bit_ffs -macro +function stores in the location referenced by .Fa value the zero-based number of the first bit set in the array of .Fa nbits bits referenced by .Fa name . If no bits are set, the location referenced by .Fa value is set to \-1. .Pp -The macro -.Fn bit_ffc +The function +.Fn bit_ffc_at stores in the location referenced by .Fa value the zero-based number of the first bit not set in the array of .Fa nbits bits referenced by -.Fa name . -If all bits are set, the location referenced by +.Fa name , +at or after the zero-based bit index +.Fa start . +If all bits at or after +.Fa start +are set, the location referenced by .Fa value is set to \-1. .Pp -The arguments to these macros are evaluated only once and may safely +The +.Fn bit_ffs_at +function +stores in the location referenced by +.Fa value +the zero-based number of the first bit set in the array of +.Fa nbits +bits referenced by +.Fa name , +at or after the zero-based bit index +.Fa start . +If no bits are set after +.Fa start , +the location referenced by +.Fa value +is set to \-1. +.Pp +The arguments in bit string macros are evaluated only once and may safely have side effects. .Sh EXAMPLES .Bd -literal -offset indent #include #include \&... #define LPR_BUSY_BIT 0 #define LPR_FORMAT_BIT 1 #define LPR_DOWNLOAD_BIT 2 \&... #define LPR_AVAILABLE_BIT 9 #define LPR_MAX_BITS 10 make_lpr_available() { bitstr_t bit_decl(bitlist, LPR_MAX_BITS); ... bit_nclear(bitlist, 0, LPR_MAX_BITS - 1); ... if (!bit_test(bitlist, LPR_BUSY_BIT)) { bit_clear(bitlist, LPR_FORMAT_BIT); bit_clear(bitlist, LPR_DOWNLOAD_BIT); bit_set(bitlist, LPR_AVAILABLE_BIT); } } .Ed .Sh SEE ALSO .Xr malloc 3 , .Xr bitset 9 .Sh HISTORY The .Nm bitstring functions first appeared in .Bx 4.4 . Index: head/sys/dev/xen/blkback/blkback.c =================================================================== --- head/sys/dev/xen/blkback/blkback.c (revision 299089) +++ head/sys/dev/xen/blkback/blkback.c (revision 299090) @@ -1,3891 +1,3887 @@ /*- * Copyright (c) 2009-2012 Spectra Logic Corporation * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Justin T. Gibbs (Spectra Logic Corporation) * Ken Merry (Spectra Logic Corporation) */ #include __FBSDID("$FreeBSD$"); /** * \file blkback.c * * \brief Device driver supporting the vending of block storage from * a FreeBSD domain to other domains. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*--------------------------- Compile-time Tunables --------------------------*/ /** * The maximum number of shared memory ring pages we will allow in a * negotiated block-front/back communication channel. Allow enough * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. */ #define XBB_MAX_RING_PAGES 32 /** * The maximum number of outstanding request blocks (request headers plus * additional segment blocks) we will allow in a negotiated block-front/back * communication channel. */ #define XBB_MAX_REQUESTS \ __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES) /** * \brief Define to force all I/O to be performed on memory owned by the * backend device, with a copy-in/out to the remote domain's memory. * * \note This option is currently required when this driver's domain is * operating in HVM mode on a system using an IOMMU. * * This driver uses Xen's grant table API to gain access to the memory of * the remote domains it serves. When our domain is operating in PV mode, * the grant table mechanism directly updates our domain's page table entries * to point to the physical pages of the remote domain. This scheme guarantees * that blkback and the backing devices it uses can safely perform DMA * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to * insure that our domain cannot DMA to pages owned by another domain. As * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant * table API. 
For this reason, in HVM mode, we must bounce all requests into * memory that is mapped into our domain at domain startup and thus has * valid IOMMU mappings. */ #define XBB_USE_BOUNCE_BUFFERS /** * \brief Define to enable rudimentary request logging to the console. */ #undef XBB_DEBUG /*---------------------------------- Macros ----------------------------------*/ /** * Custom malloc type for all driver allocations. */ static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); #ifdef XBB_DEBUG #define DPRINTF(fmt, args...) \ printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) #else #define DPRINTF(fmt, args...) do {} while(0) #endif /** * The maximum mapped region size per request we will allow in a negotiated * block-front/back communication channel. */ #define XBB_MAX_REQUEST_SIZE \ MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) /** * The maximum number of segments (within a request header and accompanying * segment blocks) per request we will allow in a negotiated block-front/back * communication channel. */ #define XBB_MAX_SEGMENTS_PER_REQUEST \ (MIN(UIO_MAXIOV, \ MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) /** * The maximum number of ring pages that we can allow per request list. * We limit this to the maximum number of segments per request, because * that is already a reasonable number of segments to aggregate. This * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, * because that would leave situations where we can't dispatch even one * large request. */ #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST /*--------------------------- Forward Declarations ---------------------------*/ struct xbb_softc; struct xbb_xen_req; static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) __attribute__((format(printf, 3, 4))); static int xbb_shutdown(struct xbb_softc *xbb); static int xbb_detach(device_t dev); /*------------------------------ Data Structures -----------------------------*/ STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); typedef enum { XBB_REQLIST_NONE = 0x00, XBB_REQLIST_MAPPED = 0x01 } xbb_reqlist_flags; struct xbb_xen_reqlist { /** * Back reference to the parent block back instance for this * request. Used during bio_done handling. */ struct xbb_softc *xbb; /** * BLKIF_OP code for this request. */ int operation; /** * Set to BLKIF_RSP_* to indicate request status. * * This field allows an error status to be recorded even if the * delivery of this status must be deferred. Deferred reporting * is necessary, for example, when an error is detected during * completion processing of one bio when other bios for this * request are still outstanding. */ int status; /** * Number of 512 byte sectors not transferred. */ int residual_512b_sectors; /** * Starting sector number of the first request in the list. */ off_t starting_sector_number; /** * If we're going to coalesce, the next contiguous sector would be * this one. */ off_t next_contig_sector; /** * Number of child requests in the list. */ int num_children; /** * Number of I/O requests still pending on the backend. */ int pendcnt; /** * Total number of segments for requests in the list. */ int nr_segments; /** * Flags for this particular request list. */ xbb_reqlist_flags flags; /** * Kernel virtual address space reserved for this request * list structure and used to map the remote domain's pages for * this I/O, into our domain's address space. 
*/ uint8_t *kva; /** * Base, psuedo-physical address, corresponding to the start * of this request's kva region. */ uint64_t gnt_base; #ifdef XBB_USE_BOUNCE_BUFFERS /** * Pre-allocated domain local memory used to proxy remote * domain memory during I/O operations. */ uint8_t *bounce; #endif /** * Array of grant handles (one per page) used to map this request. */ grant_handle_t *gnt_handles; /** * Device statistics request ordering type (ordered or simple). */ devstat_tag_type ds_tag_type; /** * Device statistics request type (read, write, no_data). */ devstat_trans_flags ds_trans_type; /** * The start time for this request. */ struct bintime ds_t0; /** * Linked list of contiguous requests with the same operation type. */ struct xbb_xen_req_list contig_req_list; /** * Linked list links used to aggregate idle requests in the * request list free pool (xbb->reqlist_free_stailq) and pending * requests waiting for execution (xbb->reqlist_pending_stailq). */ STAILQ_ENTRY(xbb_xen_reqlist) links; }; STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); /** * \brief Object tracking an in-flight I/O from a Xen VBD consumer. */ struct xbb_xen_req { /** * Linked list links used to aggregate requests into a reqlist * and to store them in the request free pool. */ STAILQ_ENTRY(xbb_xen_req) links; /** * The remote domain's identifier for this I/O request. */ uint64_t id; /** * The number of pages currently mapped for this request. */ int nr_pages; /** * The number of 512 byte sectors comprising this requests. */ int nr_512b_sectors; /** * BLKIF_OP code for this request. */ int operation; /** * Storage used for non-native ring requests. */ blkif_request_t ring_req_storage; /** * Pointer to the Xen request in the ring. */ blkif_request_t *ring_req; /** * Consumer index for this request. */ RING_IDX req_ring_idx; /** * The start time for this request. */ struct bintime ds_t0; /** * Pointer back to our parent request list. */ struct xbb_xen_reqlist *reqlist; }; SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); /** * \brief Configuration data for the shared memory request ring * used to communicate with the front-end client of this * this driver. */ struct xbb_ring_config { /** KVA address where ring memory is mapped. */ vm_offset_t va; /** The pseudo-physical address where ring memory is mapped.*/ uint64_t gnt_addr; /** * Grant table handles, one per-ring page, returned by the * hyperpervisor upon mapping of the ring and required to * unmap it when a connection is torn down. */ grant_handle_t handle[XBB_MAX_RING_PAGES]; /** * The device bus address returned by the hypervisor when * mapping the ring and required to unmap it when a connection * is torn down. */ uint64_t bus_addr[XBB_MAX_RING_PAGES]; /** The number of ring pages mapped for the current connection. */ u_int ring_pages; /** * The grant references, one per-ring page, supplied by the * front-end, allowing us to reference the ring pages in the * front-end's domain and to map these pages into our own domain. */ grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; /** The interrupt driven even channel used to signal ring events. */ evtchn_port_t evtchn; }; /** * Per-instance connection state flags. */ typedef enum { /** * The front-end requested a read-only mount of the * back-end device/file. */ XBBF_READ_ONLY = 0x01, /** Communication with the front-end has been established. */ XBBF_RING_CONNECTED = 0x02, /** * Front-end requests exist in the ring and are waiting for * xbb_xen_req objects to free up. */ XBBF_RESOURCE_SHORTAGE = 0x04, /** Connection teardown in progress. 
*/ XBBF_SHUTDOWN = 0x08, /** A thread is already performing shutdown processing. */ XBBF_IN_SHUTDOWN = 0x10 } xbb_flag_t; /** Backend device type. */ typedef enum { /** Backend type unknown. */ XBB_TYPE_NONE = 0x00, /** * Backend type disk (access via cdev switch * strategy routine). */ XBB_TYPE_DISK = 0x01, /** Backend type file (access vnode operations.). */ XBB_TYPE_FILE = 0x02 } xbb_type; /** * \brief Structure used to memoize information about a per-request * scatter-gather list. * * The chief benefit of using this data structure is it avoids having * to reparse the possibly discontiguous S/G list in the original * request. Due to the way that the mapping of the memory backing an * I/O transaction is handled by Xen, a second pass is unavoidable. * At least this way the second walk is a simple array traversal. * * \note A single Scatter/Gather element in the block interface covers * at most 1 machine page. In this context a sector (blkif * nomenclature, not what I'd choose) is a 512b aligned unit * of mapping within the machine page referenced by an S/G * element. */ struct xbb_sg { /** The number of 512b data chunks mapped in this S/G element. */ int16_t nsect; /** * The index (0 based) of the first 512b data chunk mapped * in this S/G element. */ uint8_t first_sect; /** * The index (0 based) of the last 512b data chunk mapped * in this S/G element. */ uint8_t last_sect; }; /** * Character device backend specific configuration data. */ struct xbb_dev_data { /** Cdev used for device backend access. */ struct cdev *cdev; /** Cdev switch used for device backend access. */ struct cdevsw *csw; /** Used to hold a reference on opened cdev backend devices. */ int dev_ref; }; /** * File backend specific configuration data. */ struct xbb_file_data { /** Credentials to use for vnode backed (file based) I/O. */ struct ucred *cred; /** * \brief Array of io vectors used to process file based I/O. * * Only a single file based request is outstanding per-xbb instance, * so we only need one of these. */ struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; #ifdef XBB_USE_BOUNCE_BUFFERS /** * \brief Array of io vectors used to handle bouncing of file reads. * * Vnode operations are free to modify uio data during their * exectuion. In the case of a read with bounce buffering active, * we need some of the data from the original uio in order to * bounce-out the read data. This array serves as the temporary * storage for this saved data. */ struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; /** * \brief Array of memoized bounce buffer kva offsets used * in the file based backend. * * Due to the way that the mapping of the memory backing an * I/O transaction is handled by Xen, a second pass through * the request sg elements is unavoidable. We memoize the computed * bounce address here to reduce the cost of the second walk. */ void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST]; #endif /* XBB_USE_BOUNCE_BUFFERS */ }; /** * Collection of backend type specific data. */ union xbb_backend_data { struct xbb_dev_data dev; struct xbb_file_data file; }; /** * Function signature of backend specific I/O handlers. */ typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int operation, int flags); /** * Per-instance configuration data. */ struct xbb_softc { /** * Task-queue used to process I/O requests. */ struct taskqueue *io_taskqueue; /** * Single "run the request queue" task enqueued * on io_taskqueue. */ struct task io_task; /** Device type for this instance. 
*/ xbb_type device_type; /** NewBus device corresponding to this instance. */ device_t dev; /** Backend specific dispatch routine for this instance. */ xbb_dispatch_t dispatch_io; /** The number of requests outstanding on the backend device/file. */ int active_request_count; /** Free pool of request tracking structures. */ struct xbb_xen_req_list request_free_stailq; /** Array, sized at connection time, of request tracking structures. */ struct xbb_xen_req *requests; /** Free pool of request list structures. */ struct xbb_xen_reqlist_list reqlist_free_stailq; /** List of pending request lists awaiting execution. */ struct xbb_xen_reqlist_list reqlist_pending_stailq; /** Array, sized at connection time, of request list structures. */ struct xbb_xen_reqlist *request_lists; /** * Global pool of kva used for mapping remote domain ring * and I/O transaction data. */ vm_offset_t kva; /** Psuedo-physical address corresponding to kva. */ uint64_t gnt_base_addr; /** The size of the global kva pool. */ int kva_size; /** The size of the KVA area used for request lists. */ int reqlist_kva_size; /** The number of pages of KVA used for request lists */ int reqlist_kva_pages; /** Bitmap of free KVA pages */ bitstr_t *kva_free; /** * \brief Cached value of the front-end's domain id. * * This value is used at once for each mapped page in * a transaction. We cache it to avoid incuring the * cost of an ivar access every time this is needed. */ domid_t otherend_id; /** * \brief The blkif protocol abi in effect. * * There are situations where the back and front ends can * have a different, native abi (e.g. intel x86_64 and * 32bit x86 domains on the same machine). The back-end * always accommodates the front-end's native abi. That * value is pulled from the XenStore and recorded here. */ int abi; /** * \brief The maximum number of requests and request lists allowed * to be in flight at a time. * * This value is negotiated via the XenStore. */ u_int max_requests; /** * \brief The maximum number of segments (1 page per segment) * that can be mapped by a request. * * This value is negotiated via the XenStore. */ u_int max_request_segments; /** * \brief Maximum number of segments per request list. * * This value is derived from and will generally be larger than * max_request_segments. */ u_int max_reqlist_segments; /** * The maximum size of any request to this back-end * device. * * This value is negotiated via the XenStore. */ u_int max_request_size; /** * The maximum size of any request list. This is derived directly * from max_reqlist_segments. */ u_int max_reqlist_size; /** Various configuration and state bit flags. */ xbb_flag_t flags; /** Ring mapping and interrupt configuration data. */ struct xbb_ring_config ring_config; /** Runtime, cross-abi safe, structures for ring access. */ blkif_back_rings_t rings; /** IRQ mapping for the communication ring event channel. */ xen_intr_handle_t xen_intr_handle; /** * \brief Backend access mode flags (e.g. write, or read-only). * * This value is passed to us by the front-end via the XenStore. */ char *dev_mode; /** * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). * * This value is passed to us by the front-end via the XenStore. * Currently unused. */ char *dev_type; /** * \brief Backend device/file identifier. * * This value is passed to us by the front-end via the XenStore. * We expect this to be a POSIX path indicating the file or * device to open. */ char *dev_name; /** * Vnode corresponding to the backend device node or file * we are acessing. 
*/ struct vnode *vn; union xbb_backend_data backend; /** The native sector size of the backend. */ u_int sector_size; /** log2 of sector_size. */ u_int sector_size_shift; /** Size in bytes of the backend device or file. */ off_t media_size; /** * \brief media_size expressed in terms of the backend native * sector size. * * (e.g. xbb->media_size >> xbb->sector_size_shift). */ uint64_t media_num_sectors; /** * \brief Array of memoized scatter gather data computed during the * conversion of blkif ring requests to internal xbb_xen_req * structures. * * Ring processing is serialized so we only need one of these. */ struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; /** * Temporary grant table map used in xbb_dispatch_io(). When * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the * stack could cause a stack overflow. */ struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; /** Mutex protecting per-instance data. */ struct mtx lock; /** * Resource representing allocated physical address space * associated with our per-instance kva region. */ struct resource *pseudo_phys_res; /** Resource id for allocated physical address space. */ int pseudo_phys_res_id; /** * I/O statistics from BlockBack dispatch down. These are * coalesced requests, and we start them right before execution. */ struct devstat *xbb_stats; /** * I/O statistics coming into BlockBack. These are the requests as * we get them from BlockFront. They are started as soon as we * receive a request, and completed when the I/O is complete. */ struct devstat *xbb_stats_in; /** Disable sending flush to the backend */ int disable_flush; /** Send a real flush for every N flush requests */ int flush_interval; /** Count of flush requests in the interval */ int flush_count; /** Don't coalesce requests if this is set */ int no_coalesce_reqs; /** Number of requests we have received */ uint64_t reqs_received; /** Number of requests we have completed*/ uint64_t reqs_completed; /** Number of requests we queued but not pushed*/ uint64_t reqs_queued_for_completion; /** Number of requests we completed with an error status*/ uint64_t reqs_completed_with_error; /** How many forced dispatches (i.e. without coalescing) have happened */ uint64_t forced_dispatch; /** How many normal dispatches have happened */ uint64_t normal_dispatch; /** How many total dispatches have happened */ uint64_t total_dispatch; /** How many times we have run out of KVA */ uint64_t kva_shortages; /** How many times we have run out of request structures */ uint64_t request_shortages; }; /*---------------------------- Request Processing ----------------------------*/ /** * Allocate an internal transaction tracking structure from the free pool. * * \param xbb Per-instance xbb configuration structure. * * \return On success, a pointer to the allocated xbb_xen_req structure. * Otherwise NULL. */ static inline struct xbb_xen_req * xbb_get_req(struct xbb_softc *xbb) { struct xbb_xen_req *req; req = NULL; mtx_assert(&xbb->lock, MA_OWNED); if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); xbb->active_request_count++; } return (req); } /** * Return an allocated transaction tracking structure to the free pool. * * \param xbb Per-instance xbb configuration structure. * \param req The request structure to free. 
*/ static inline void xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) { mtx_assert(&xbb->lock, MA_OWNED); STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); xbb->active_request_count--; KASSERT(xbb->active_request_count >= 0, ("xbb_release_req: negative active count")); } /** * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. * * \param xbb Per-instance xbb configuration structure. * \param req_list The list of requests to free. * \param nreqs The number of items in the list. */ static inline void xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, int nreqs) { mtx_assert(&xbb->lock, MA_OWNED); STAILQ_CONCAT(&xbb->request_free_stailq, req_list); xbb->active_request_count -= nreqs; KASSERT(xbb->active_request_count >= 0, ("xbb_release_reqs: negative active count")); } /** * Given a page index and 512b sector offset within that page, * calculate an offset into a request's kva region. * * \param reqlist The request structure whose kva region will be accessed. * \param pagenr The page index used to compute the kva offset. * \param sector The 512b sector index used to compute the page relative * kva offset. * * \return The computed global KVA offset. */ static inline uint8_t * xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); } #ifdef XBB_USE_BOUNCE_BUFFERS /** * Given a page index and 512b sector offset within that page, * calculate an offset into a request's local bounce memory region. * * \param reqlist The request structure whose bounce region will be accessed. * \param pagenr The page index used to compute the bounce offset. * \param sector The 512b sector index used to compute the page relative * bounce offset. * * \return The computed global bounce buffer address. */ static inline uint8_t * xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); } #endif /** * Given a page number and 512b sector offset within that page, * calculate an offset into the request's memory region that the * underlying backend device/file should use for I/O. * * \param reqlist The request structure whose I/O region will be accessed. * \param pagenr The page index used to compute the I/O offset. * \param sector The 512b sector index used to compute the page relative * I/O offset. * * \return The computed global I/O address. * * Depending on configuration, this will either be a local bounce buffer * or a pointer to the memory mapped in from the front-end domain for * this request. */ static inline uint8_t * xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { #ifdef XBB_USE_BOUNCE_BUFFERS return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector)); #else return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); #endif } /** * Given a page index and 512b sector offset within that page, calculate * an offset into the local pseudo-physical address space used to map a * front-end's request data into a request. * * \param reqlist The request list structure whose pseudo-physical region * will be accessed. * \param pagenr The page index used to compute the pseudo-physical offset. * \param sector The 512b sector index used to compute the page relative * pseudo-physical offset. * * \return The computed global pseudo-physical address.
* * Depending on configuration, this will either be a local bounce buffer * or a pointer to the memory mapped in from the front-end domain for * this request. */ static inline uintptr_t xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { struct xbb_softc *xbb; xbb = reqlist->xbb; return ((uintptr_t)(xbb->gnt_base_addr + (uintptr_t)(reqlist->kva - xbb->kva) + (PAGE_SIZE * pagenr) + (sector << 9))); } /** * Get Kernel Virtual Address space for mapping requests. * * \param xbb Per-instance xbb configuration structure. * \param nr_pages Number of pages needed. * \param check_only If set, check for free KVA but don't allocate it. * \param have_lock If set, xbb lock is already held. * * \return On success, a pointer to the allocated KVA region. Otherwise NULL. * * Note: This should be unnecessary once we have either chaining or * scatter/gather support for struct bio. At that point we'll be able to * put multiple addresses and lengths in one bio/bio chain and won't need * to map everything into one virtual segment. */ static uint8_t * xbb_get_kva(struct xbb_softc *xbb, int nr_pages) { - intptr_t first_clear; - intptr_t num_clear; + int first_clear; + int num_clear; uint8_t *free_kva; int i; KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); first_clear = 0; free_kva = NULL; mtx_lock(&xbb->lock); /* * Look for the first available page. If there are none, we're done. */ bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); if (first_clear == -1) goto bailout; /* * Starting at the first available page, look for consecutive free * pages that will satisfy the user's request. */ for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { /* * If this is true, the page is used, so we have to reset * the number of clear pages and the first clear page * (since it pointed to a region with an insufficient number * of clear pages). */ if (bit_test(xbb->kva_free, i)) { num_clear = 0; first_clear = -1; continue; } if (first_clear == -1) first_clear = i; /* * If this is true, we've found a large enough free region * to satisfy the request. */ if (++num_clear == nr_pages) { bit_nset(xbb->kva_free, first_clear, first_clear + nr_pages - 1); free_kva = xbb->kva + - (uint8_t *)(first_clear * PAGE_SIZE); + (uint8_t *)((intptr_t)first_clear * PAGE_SIZE); KASSERT(free_kva >= (uint8_t *)xbb->kva && free_kva + (nr_pages * PAGE_SIZE) <= (uint8_t *)xbb->ring_config.va, ("Free KVA %p len %d out of range, " "kva = %#jx, ring VA = %#jx\n", free_kva, nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, (uintmax_t)xbb->ring_config.va)); break; } } bailout: if (free_kva == NULL) { xbb->flags |= XBBF_RESOURCE_SHORTAGE; xbb->kva_shortages++; } mtx_unlock(&xbb->lock); return (free_kva); } /** * Free allocated KVA. * * \param xbb Per-instance xbb configuration structure. * \param kva_ptr Pointer to allocated KVA region. * \param nr_pages Number of pages in the KVA region. */ static void xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) { intptr_t start_page; mtx_assert(&xbb->lock, MA_OWNED); start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); } /** * Unmap the front-end pages associated with this I/O request. * * \param req The request structure to unmap. 
*/ static void xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) { struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; u_int i; u_int invcount; int error; invcount = 0; for (i = 0; i < reqlist->nr_segments; i++) { if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) continue; unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); unmap[invcount].dev_bus_addr = 0; unmap[invcount].handle = reqlist->gnt_handles[i]; reqlist->gnt_handles[i] = GRANT_REF_INVALID; invcount++; } error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, invcount); KASSERT(error == 0, ("Grant table operation failed")); } /** * Allocate an internal transaction tracking structure from the free pool. * * \param xbb Per-instance xbb configuration structure. * * \return On success, a pointer to the allocated xbb_xen_reqlist structure. * Otherwise NULL. */ static inline struct xbb_xen_reqlist * xbb_get_reqlist(struct xbb_softc *xbb) { struct xbb_xen_reqlist *reqlist; reqlist = NULL; mtx_assert(&xbb->lock, MA_OWNED); if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); reqlist->flags = XBB_REQLIST_NONE; reqlist->kva = NULL; reqlist->status = BLKIF_RSP_OKAY; reqlist->residual_512b_sectors = 0; reqlist->num_children = 0; reqlist->nr_segments = 0; STAILQ_INIT(&reqlist->contig_req_list); } return (reqlist); } /** * Return an allocated transaction tracking structure to the free pool. * * \param xbb Per-instance xbb configuration structure. * \param req The request list structure to free. * \param wakeup If set, wakeup the work thread if freeing this reqlist * during a resource shortage condition. */ static inline void xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int wakeup) { mtx_assert(&xbb->lock, MA_OWNED); if (wakeup) { wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; } if (reqlist->kva != NULL) xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); if ((xbb->flags & XBBF_SHUTDOWN) != 0) { /* * Shutdown is in progress. See if we can * progress further now that one more request * has completed and been returned to the * free pool. */ xbb_shutdown(xbb); } if (wakeup != 0) taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); } /** * Request resources and do basic request setup. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Pointer to reqlist pointer. * \param ring_req Pointer to a block ring request. * \param ring_index The ring index of this request. * * \return 0 for success, non-zero for failure. */ static int xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, blkif_request_t *ring_req, RING_IDX ring_idx) { struct xbb_xen_reqlist *nreqlist; struct xbb_xen_req *nreq; nreqlist = NULL; nreq = NULL; mtx_lock(&xbb->lock); /* * We don't allow new resources to be allocated if we're in the * process of shutting down. */ if ((xbb->flags & XBBF_SHUTDOWN) != 0) { mtx_unlock(&xbb->lock); return (1); } /* * Allocate a reqlist if the caller doesn't have one already. */ if (*reqlist == NULL) { nreqlist = xbb_get_reqlist(xbb); if (nreqlist == NULL) goto bailout_error; } /* We always allocate a request. 
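* Even when this ring entry is coalesced into an existing request list, it gets its own xbb_xen_req below so that its id, ring index, and devstat start time are tracked individually.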
*/ nreq = xbb_get_req(xbb); if (nreq == NULL) goto bailout_error; mtx_unlock(&xbb->lock); if (*reqlist == NULL) { *reqlist = nreqlist; nreqlist->operation = ring_req->operation; nreqlist->starting_sector_number = ring_req->sector_number; STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, links); } nreq->reqlist = *reqlist; nreq->req_ring_idx = ring_idx; nreq->id = ring_req->id; nreq->operation = ring_req->operation; if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); nreq->ring_req = &nreq->ring_req_storage; } else { nreq->ring_req = ring_req; } binuptime(&nreq->ds_t0); devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); (*reqlist)->num_children++; (*reqlist)->nr_segments += ring_req->nr_segments; return (0); bailout_error: /* * We're out of resources, so set the shortage flag. The next time * a request is released, we'll try waking up the work thread to * see if we can allocate more resources. */ xbb->flags |= XBBF_RESOURCE_SHORTAGE; xbb->request_shortages++; if (nreq != NULL) xbb_release_req(xbb, nreq); if (nreqlist != NULL) xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); mtx_unlock(&xbb->lock); return (1); } /** * Create and queue a response to a blkif request. * * \param xbb Per-instance xbb configuration structure. * \param req The request structure to which to respond. * \param status The status code to report. See BLKIF_RSP_* * in sys/xen/interface/io/blkif.h. */ static void xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) { blkif_response_t *resp; /* * The mutex is required here, and should be held across this call * until after the subsequent call to xbb_push_responses(). This * is to guarantee that another context won't queue responses and * push them while we're active. * * That could lead to the other end being notified of responses * before the resources have been freed on this end. The other end * would then be able to queue additional I/O, and we may run out * of resources because we haven't freed them all yet. */ mtx_assert(&xbb->lock, MA_OWNED); /* * Place on the response ring for the relevant domain. * For now, only the spacing between entries is different * in the different ABIs, not the response entry layout. */ switch (xbb->abi) { case BLKIF_PROTOCOL_NATIVE: resp = RING_GET_RESPONSE(&xbb->rings.native, xbb->rings.native.rsp_prod_pvt); break; case BLKIF_PROTOCOL_X86_32: resp = (blkif_response_t *) RING_GET_RESPONSE(&xbb->rings.x86_32, xbb->rings.x86_32.rsp_prod_pvt); break; case BLKIF_PROTOCOL_X86_64: resp = (blkif_response_t *) RING_GET_RESPONSE(&xbb->rings.x86_64, xbb->rings.x86_64.rsp_prod_pvt); break; default: panic("Unexpected blkif protocol ABI."); } resp->id = req->id; resp->operation = req->operation; resp->status = status; if (status != BLKIF_RSP_OKAY) xbb->reqs_completed_with_error++; xbb->rings.common.rsp_prod_pvt++; xbb->reqs_queued_for_completion++; } /** * Send queued responses to blkif requests. * * \param xbb Per-instance xbb configuration structure. * \param run_taskqueue Flag that is set to 1 if the taskqueue * should be run, 0 if it does not need to be run. * \param notify Flag that is set to 1 if the other end should be * notified via irq, 0 if the other end should not be * notified. */ static void xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify) { int more_to_do; /* * The mutex is required here. 
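* It serializes updates to the shared ring's response producer index and to the reqs_completed/reqs_queued_for_completion counters adjusted below.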
*/ mtx_assert(&xbb->lock, MA_OWNED); more_to_do = 0; RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify); if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { /* * Tail check for pending requests. Allows frontend to avoid * notifications if requests are already in flight (lower * overheads and promotes batching). */ RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { more_to_do = 1; } xbb->reqs_completed += xbb->reqs_queued_for_completion; xbb->reqs_queued_for_completion = 0; *run_taskqueue = more_to_do; } /** * Complete a request list. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list structure. */ static void xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) { struct xbb_xen_req *nreq; off_t sectors_sent; int notify, run_taskqueue; sectors_sent = 0; if (reqlist->flags & XBB_REQLIST_MAPPED) xbb_unmap_reqlist(reqlist); mtx_lock(&xbb->lock); /* * All I/O is done, send the response. A lock is not necessary * to protect the request list, because all requests have * completed. Therefore this is the only context accessing this * reqlist right now. However, in order to make sure that no one * else queues responses onto the queue or pushes them to the other * side while we're active, we need to hold the lock across the * calls to xbb_queue_response() and xbb_push_responses(). */ STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { off_t cur_sectors_sent; /* Put this response on the ring, but don't push yet */ xbb_queue_response(xbb, nreq, reqlist->status); /* We don't report bytes sent if there is an error. */ if (reqlist->status == BLKIF_RSP_OKAY) cur_sectors_sent = nreq->nr_512b_sectors; else cur_sectors_sent = 0; sectors_sent += cur_sectors_sent; devstat_end_transaction(xbb->xbb_stats_in, /*bytes*/cur_sectors_sent << 9, reqlist->ds_tag_type, reqlist->ds_trans_type, /*now*/NULL, /*then*/&nreq->ds_t0); } /* * Take out any sectors not sent. If we wind up negative (which * might happen if an error is reported as well as a residual), just * report 0 sectors sent. */ sectors_sent -= reqlist->residual_512b_sectors; if (sectors_sent < 0) sectors_sent = 0; devstat_end_transaction(xbb->xbb_stats, /*bytes*/ sectors_sent << 9, reqlist->ds_tag_type, reqlist->ds_trans_type, /*now*/NULL, /*then*/&reqlist->ds_t0); xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); xbb_push_responses(xbb, &run_taskqueue, ¬ify); mtx_unlock(&xbb->lock); if (run_taskqueue) taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); if (notify) xen_intr_signal(xbb->xen_intr_handle); } /** * Completion handler for buffer I/O requests issued by the device * backend driver. * * \param bio The buffer I/O request on which to perform completion * processing. */ static void xbb_bio_done(struct bio *bio) { struct xbb_softc *xbb; struct xbb_xen_reqlist *reqlist; reqlist = bio->bio_caller1; xbb = reqlist->xbb; reqlist->residual_512b_sectors += bio->bio_resid >> 9; /* * This is a bit imprecise. With aggregated I/O a single * request list can contain multiple front-end requests and * a multiple bios may point to a single request. By carefully * walking the request list, we could map residuals and errors * back to the original front-end request, but the interface * isn't sufficiently rich for us to properly report the error. * So, we just treat the entire request list as having failed if an * error occurs on any part. 
And, if an error occurs, we treat * the amount of data transferred as 0. * * For residuals, we report it on the overall aggregated device, * but not on the individual requests, since we don't currently * do the work to determine which front-end request to which the * residual applies. */ if (bio->bio_error) { DPRINTF("BIO returned error %d for operation on device %s\n", bio->bio_error, xbb->dev_name); reqlist->status = BLKIF_RSP_ERROR; if (bio->bio_error == ENXIO && xenbus_get_state(xbb->dev) == XenbusStateConnected) { /* * Backend device has disappeared. Signal the * front-end that we (the device proxy) want to * go away. */ xenbus_set_state(xbb->dev, XenbusStateClosing); } } #ifdef XBB_USE_BOUNCE_BUFFERS if (bio->bio_cmd == BIO_READ) { vm_offset_t kva_offset; kva_offset = (vm_offset_t)bio->bio_data - (vm_offset_t)reqlist->bounce; memcpy((uint8_t *)reqlist->kva + kva_offset, bio->bio_data, bio->bio_bcount); } #endif /* XBB_USE_BOUNCE_BUFFERS */ /* * Decrement the pending count for the request list. When we're * done with the requests, send status back for all of them. */ if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) xbb_complete_reqlist(xbb, reqlist); g_destroy_bio(bio); } /** * Parse a blkif request into an internal request structure and send * it to the backend for processing. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list structure. * * \return On success, 0. For resource shortages, non-zero. * * This routine performs the backend common aspects of request parsing * including compiling an internal request structure, parsing the S/G * list and any secondary ring requests in which they may reside, and * the mapping of front-end I/O pages into our domain. */ static int xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) { struct xbb_sg *xbb_sg; struct gnttab_map_grant_ref *map; struct blkif_request_segment *sg; struct blkif_request_segment *last_block_sg; struct xbb_xen_req *nreq; u_int nseg; u_int seg_idx; u_int block_segs; int nr_sects; int total_sects; int operation; uint8_t bio_flags; int error; reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; bio_flags = 0; total_sects = 0; nr_sects = 0; /* * First determine whether we have enough free KVA to satisfy this * request list. If not, tell xbb_run_queue() so it can go to * sleep until we have more KVA. */ reqlist->kva = NULL; if (reqlist->nr_segments != 0) { reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); if (reqlist->kva == NULL) { /* * If we're out of KVA, return ENOMEM. */ return (ENOMEM); } } binuptime(&reqlist->ds_t0); devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); switch (reqlist->operation) { case BLKIF_OP_WRITE_BARRIER: bio_flags |= BIO_ORDERED; reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; /* FALLTHROUGH */ case BLKIF_OP_WRITE: operation = BIO_WRITE; reqlist->ds_trans_type = DEVSTAT_WRITE; if ((xbb->flags & XBBF_READ_ONLY) != 0) { DPRINTF("Attempt to write to read only device %s\n", xbb->dev_name); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } break; case BLKIF_OP_READ: operation = BIO_READ; reqlist->ds_trans_type = DEVSTAT_READ; break; case BLKIF_OP_FLUSH_DISKCACHE: /* * If this is true, the user has requested that we disable * flush support. So we just complete the requests * successfully. */ if (xbb->disable_flush != 0) { goto send_response; } /* * The user has requested that we only send a real flush * for every N flush requests. So keep count, and either * complete the request immediately or queue it for the * backend. 
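* For example, with a flush_interval of 4, every fourth BLKIF_OP_FLUSH_DISKCACHE is dispatched to the backend as a BIO_FLUSH and the other three are completed immediately.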
*/ if (xbb->flush_interval != 0) { if (++(xbb->flush_count) < xbb->flush_interval) { goto send_response; } else xbb->flush_count = 0; } operation = BIO_FLUSH; reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; reqlist->ds_trans_type = DEVSTAT_NO_DATA; goto do_dispatch; /*NOTREACHED*/ default: DPRINTF("error: unknown block io operation [%d]\n", reqlist->operation); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } reqlist->xbb = xbb; xbb_sg = xbb->xbb_sgs; map = xbb->maps; seg_idx = 0; STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { blkif_request_t *ring_req; RING_IDX req_ring_idx; u_int req_seg_idx; ring_req = nreq->ring_req; req_ring_idx = nreq->req_ring_idx; nr_sects = 0; nseg = ring_req->nr_segments; nreq->nr_pages = nseg; nreq->nr_512b_sectors = 0; req_seg_idx = 0; sg = NULL; /* Check that number of segments is sane. */ if (__predict_false(nseg == 0) || __predict_false(nseg > xbb->max_request_segments)) { DPRINTF("Bad number of segments in request (%d)\n", nseg); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } block_segs = nseg; sg = ring_req->seg; last_block_sg = sg + block_segs; while (sg < last_block_sg) { KASSERT(seg_idx < XBB_MAX_SEGMENTS_PER_REQLIST, ("seg_idx %d is too large, max " "segs %d\n", seg_idx, XBB_MAX_SEGMENTS_PER_REQLIST)); xbb_sg->first_sect = sg->first_sect; xbb_sg->last_sect = sg->last_sect; xbb_sg->nsect = (int8_t)(sg->last_sect - sg->first_sect + 1); if ((sg->last_sect >= (PAGE_SIZE >> 9)) || (xbb_sg->nsect <= 0)) { reqlist->status = BLKIF_RSP_ERROR; goto send_response; } nr_sects += xbb_sg->nsect; map->host_addr = xbb_get_gntaddr(reqlist, seg_idx, /*sector*/0); KASSERT(map->host_addr + PAGE_SIZE <= xbb->ring_config.gnt_addr, ("Host address %#jx len %d overlaps " "ring address %#jx\n", (uintmax_t)map->host_addr, PAGE_SIZE, (uintmax_t)xbb->ring_config.gnt_addr)); map->flags = GNTMAP_host_map; map->ref = sg->gref; map->dom = xbb->otherend_id; if (operation == BIO_WRITE) map->flags |= GNTMAP_readonly; sg++; map++; xbb_sg++; seg_idx++; req_seg_idx++; } /* Convert to the disk's sector size */ nreq->nr_512b_sectors = nr_sects; nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; total_sects += nr_sects; if ((nreq->nr_512b_sectors & ((xbb->sector_size >> 9) - 1)) != 0) { device_printf(xbb->dev, "%s: I/O size (%d) is not " "a multiple of the backing store sector " "size (%d)\n", __func__, nreq->nr_512b_sectors << 9, xbb->sector_size); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } } error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, xbb->maps, reqlist->nr_segments); if (error != 0) panic("Grant table operation failed (%d)", error); reqlist->flags |= XBB_REQLIST_MAPPED; for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; seg_idx++, map++){ if (__predict_false(map->status != 0)) { DPRINTF("invalid buffer -- could not remap " "it (%d)\n", map->status); DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags " "0x%x ref 0x%x, dom %d\n", seg_idx, map->host_addr, map->flags, map->ref, map->dom); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } reqlist->gnt_handles[seg_idx] = map->handle; } if (reqlist->starting_sector_number + total_sects > xbb->media_num_sectors) { DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " "extends past end of device %s\n", operation == BIO_READ ? 
"read" : "write", reqlist->starting_sector_number, reqlist->starting_sector_number + total_sects, xbb->dev_name); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } do_dispatch: error = xbb->dispatch_io(xbb, reqlist, operation, bio_flags); if (error != 0) { reqlist->status = BLKIF_RSP_ERROR; goto send_response; } return (0); send_response: xbb_complete_reqlist(xbb, reqlist); return (0); } static __inline int xbb_count_sects(blkif_request_t *ring_req) { int i; int cur_size = 0; for (i = 0; i < ring_req->nr_segments; i++) { int nsect; nsect = (int8_t)(ring_req->seg[i].last_sect - ring_req->seg[i].first_sect + 1); if (nsect <= 0) break; cur_size += nsect; } return (cur_size); } /** * Process incoming requests from the shared communication ring in response * to a signal on the ring's event channel. * * \param context Callback argument registerd during task initialization - * the xbb_softc for this instance. * \param pending The number of taskqueue_enqueue events that have * occurred since this handler was last run. */ static void xbb_run_queue(void *context, int pending) { struct xbb_softc *xbb; blkif_back_rings_t *rings; RING_IDX rp; uint64_t cur_sector; int cur_operation; struct xbb_xen_reqlist *reqlist; xbb = (struct xbb_softc *)context; rings = &xbb->rings; /* * Work gather and dispatch loop. Note that we have a bias here * towards gathering I/O sent by blockfront. We first gather up * everything in the ring, as long as we have resources. Then we * dispatch one request, and then attempt to gather up any * additional requests that have come in while we were dispatching * the request. * * This allows us to get a clearer picture (via devstat) of how * many requests blockfront is queueing to us at any given time. */ for (;;) { int retval; /* * Initialize reqlist to the last element in the pending * queue, if there is one. This allows us to add more * requests to that request list, if we have room. */ reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, xbb_xen_reqlist, links); if (reqlist != NULL) { cur_sector = reqlist->next_contig_sector; cur_operation = reqlist->operation; } else { cur_operation = 0; cur_sector = 0; } /* * Cache req_prod to avoid accessing a cache line shared * with the frontend. */ rp = rings->common.sring->req_prod; /* Ensure we see queued requests up to 'rp'. */ rmb(); /** * Run so long as there is work to consume and the generation * of a response will not overflow the ring. * * @note There's a 1 to 1 relationship between requests and * responses, so an overflow should never occur. This * test is to protect our domain from digesting bogus * data. Shouldn't we log this? 
*/ while (rings->common.req_cons != rp && RING_REQUEST_CONS_OVERFLOW(&rings->common, rings->common.req_cons) == 0){ blkif_request_t ring_req_storage; blkif_request_t *ring_req; int cur_size; switch (xbb->abi) { case BLKIF_PROTOCOL_NATIVE: ring_req = RING_GET_REQUEST(&xbb->rings.native, rings->common.req_cons); break; case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_request *ring_req32; ring_req32 = RING_GET_REQUEST( &xbb->rings.x86_32, rings->common.req_cons); blkif_get_x86_32_req(&ring_req_storage, ring_req32); ring_req = &ring_req_storage; break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_request *ring_req64; ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, rings->common.req_cons); blkif_get_x86_64_req(&ring_req_storage, ring_req64); ring_req = &ring_req_storage; break; } default: panic("Unexpected blkif protocol ABI."); /* NOTREACHED */ } /* * Check for situations that would require closing * off this I/O for further coalescing: * - Coalescing is turned off. * - Current I/O is out of sequence with the previous * I/O. * - Coalesced I/O would be too large. */ if ((reqlist != NULL) && ((xbb->no_coalesce_reqs != 0) || ((xbb->no_coalesce_reqs == 0) && ((ring_req->sector_number != cur_sector) || (ring_req->operation != cur_operation) || ((ring_req->nr_segments + reqlist->nr_segments) > xbb->max_reqlist_segments))))) { reqlist = NULL; } /* * Grab and check for all resources in one shot. * If we can't get all of the resources we need, * the shortage is noted and the thread will get * woken up when more resources are available. */ retval = xbb_get_resources(xbb, &reqlist, ring_req, xbb->rings.common.req_cons); if (retval != 0) { /* * Resource shortage has been recorded. * We'll be scheduled to run once a request * object frees up due to a completion. */ break; } /* * Signify that we can overwrite this request with * a response by incrementing our consumer index. * The response won't be generated until after * we've already consumed all necessary data out * of the version of the request in the ring buffer * (for native mode). We must update the consumer * index before issuing back-end I/O so there is * no possibility that it will complete and a * response be generated before we make room in * the queue for that response. */ xbb->rings.common.req_cons++; xbb->reqs_received++; cur_size = xbb_count_sects(ring_req); cur_sector = ring_req->sector_number + cur_size; reqlist->next_contig_sector = cur_sector; cur_operation = ring_req->operation; } /* Check for I/O to dispatch */ reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); if (reqlist == NULL) { /* * We're out of work to do, put the task queue to * sleep. */ break; } /* * Grab the first request off the queue and attempt * to dispatch it. */ STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); retval = xbb_dispatch_io(xbb, reqlist); if (retval != 0) { /* * xbb_dispatch_io() returns non-zero only when * there is a resource shortage. If that's the * case, re-queue this request on the head of the * queue, and go to sleep until we have more * resources. */ STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, reqlist, links); break; } else { /* * If we still have anything on the queue after * removing the head entry, that is because we * met one of the criteria to create a new * request list (outlined above), and we'll call * that a forced dispatch for statistical purposes. * * Otherwise, if there is only one element on the * queue, we coalesced everything available on * the ring and we'll call that a normal dispatch. 
*/ reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); if (reqlist != NULL) xbb->forced_dispatch++; else xbb->normal_dispatch++; xbb->total_dispatch++; } } } /** * Interrupt handler bound to the shared ring's event channel. * * \param arg Callback argument registerd during event channel * binding - the xbb_softc for this instance. */ static int xbb_filter(void *arg) { struct xbb_softc *xbb; /* Defer to taskqueue thread. */ xbb = (struct xbb_softc *)arg; taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); return (FILTER_HANDLED); } SDT_PROVIDER_DEFINE(xbb); SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", "uint64_t", "uint64_t"); /*----------------------------- Backend Handlers -----------------------------*/ /** * Backend handler for character device access. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list structure. * \param operation BIO_* I/O operation code. * \param bio_flags Additional bio_flag data to pass to any generated * bios (e.g. BIO_ORDERED).. * * \return 0 for success, errno codes for failure. */ static int xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int operation, int bio_flags) { struct xbb_dev_data *dev_data; struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; off_t bio_offset; struct bio *bio; struct xbb_sg *xbb_sg; u_int nbio; u_int bio_idx; u_int nseg; u_int seg_idx; int error; dev_data = &xbb->backend.dev; bio_offset = (off_t)reqlist->starting_sector_number << xbb->sector_size_shift; error = 0; nbio = 0; bio_idx = 0; if (operation == BIO_FLUSH) { bio = g_new_bio(); if (__predict_false(bio == NULL)) { DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); error = ENOMEM; return (error); } bio->bio_cmd = BIO_FLUSH; bio->bio_flags |= BIO_ORDERED; bio->bio_dev = dev_data->cdev; bio->bio_offset = 0; bio->bio_data = 0; bio->bio_done = xbb_bio_done; bio->bio_caller1 = reqlist; bio->bio_pblkno = 0; reqlist->pendcnt = 1; SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, device_get_unit(xbb->dev)); (*dev_data->csw->d_strategy)(bio); return (0); } xbb_sg = xbb->xbb_sgs; bio = NULL; nseg = reqlist->nr_segments; for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { /* * KVA will not be contiguous, so any additional * I/O will need to be represented in a new bio. */ if ((bio != NULL) && (xbb_sg->first_sect != 0)) { if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { printf("%s: Discontiguous I/O request " "from domain %d ends on " "non-sector boundary\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } bio = NULL; } if (bio == NULL) { /* * Make sure that the start of this bio is * aligned to a device sector. 
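* (e.g. with a 4096 byte sector_size, bio_offset must be a multiple of 4096; the mask check below assumes sector_size is a power of 2.)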
*/ if ((bio_offset & (xbb->sector_size - 1)) != 0){ printf("%s: Misaligned I/O request " "from domain %d\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } bio = bios[nbio++] = g_new_bio(); if (__predict_false(bio == NULL)) { error = ENOMEM; goto fail_free_bios; } bio->bio_cmd = operation; bio->bio_flags |= bio_flags; bio->bio_dev = dev_data->cdev; bio->bio_offset = bio_offset; bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, xbb_sg->first_sect); bio->bio_done = xbb_bio_done; bio->bio_caller1 = reqlist; bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; } bio->bio_length += xbb_sg->nsect << 9; bio->bio_bcount = bio->bio_length; bio_offset += xbb_sg->nsect << 9; if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { printf("%s: Discontiguous I/O request " "from domain %d ends on " "non-sector boundary\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } /* * KVA will not be contiguous, so any additional * I/O will need to be represented in a new bio. */ bio = NULL; } } reqlist->pendcnt = nbio; for (bio_idx = 0; bio_idx < nbio; bio_idx++) { #ifdef XBB_USE_BOUNCE_BUFFERS vm_offset_t kva_offset; kva_offset = (vm_offset_t)bios[bio_idx]->bio_data - (vm_offset_t)reqlist->bounce; if (operation == BIO_WRITE) { memcpy(bios[bio_idx]->bio_data, (uint8_t *)reqlist->kva + kva_offset, bios[bio_idx]->bio_bcount); } #endif if (operation == BIO_READ) { SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, device_get_unit(xbb->dev), bios[bio_idx]->bio_offset, bios[bio_idx]->bio_length); } else if (operation == BIO_WRITE) { SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, device_get_unit(xbb->dev), bios[bio_idx]->bio_offset, bios[bio_idx]->bio_length); } (*dev_data->csw->d_strategy)(bios[bio_idx]); } return (error); fail_free_bios: for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) g_destroy_bio(bios[bio_idx]); return (error); } SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", "uint64_t", "uint64_t"); /** * Backend handler for file access. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list. * \param operation BIO_* I/O operation code. * \param flags Additional bio_flag data to pass to any generated bios * (e.g. BIO_ORDERED).. * * \return 0 for success, errno codes for failure. 
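* \note Unlike xbb_dispatch_dev(), this handler performs the I/O synchronously via VOP_READ()/VOP_WRITE() and completes the request list before returning.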
*/ static int xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int operation, int flags) { struct xbb_file_data *file_data; u_int seg_idx; u_int nseg; struct uio xuio; struct xbb_sg *xbb_sg; struct iovec *xiovec; #ifdef XBB_USE_BOUNCE_BUFFERS void **p_vaddr; int saved_uio_iovcnt; #endif /* XBB_USE_BOUNCE_BUFFERS */ int error; file_data = &xbb->backend.file; error = 0; bzero(&xuio, sizeof(xuio)); switch (operation) { case BIO_READ: xuio.uio_rw = UIO_READ; break; case BIO_WRITE: xuio.uio_rw = UIO_WRITE; break; case BIO_FLUSH: { struct mount *mountpoint; SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, device_get_unit(xbb->dev)); (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); VOP_UNLOCK(xbb->vn, 0); vn_finished_write(mountpoint); goto bailout_send_response; /* NOTREACHED */ } default: panic("invalid operation %d", operation); /* NOTREACHED */ } xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number << xbb->sector_size_shift; xuio.uio_segflg = UIO_SYSSPACE; xuio.uio_iov = file_data->xiovecs; xuio.uio_iovcnt = 0; xbb_sg = xbb->xbb_sgs; nseg = reqlist->nr_segments; for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { /* * If the first sector is not 0, the KVA will * not be contiguous and we'll need to go on * to another segment. */ if (xbb_sg->first_sect != 0) xiovec = NULL; if (xiovec == NULL) { xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, seg_idx, xbb_sg->first_sect); #ifdef XBB_USE_BOUNCE_BUFFERS /* * Store the address of the incoming * buffer at this particular offset * as well, so we can do the copy * later without having to do more * work to recalculate this address. */ p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, xbb_sg->first_sect); #endif /* XBB_USE_BOUNCE_BUFFERS */ xiovec->iov_len = 0; xuio.uio_iovcnt++; } xiovec->iov_len += xbb_sg->nsect << 9; xuio.uio_resid += xbb_sg->nsect << 9; /* * If the last sector is not the full page * size count, the next segment will not be * contiguous in KVA and we need a new iovec. */ if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) xiovec = NULL; } xuio.uio_td = curthread; #ifdef XBB_USE_BOUNCE_BUFFERS saved_uio_iovcnt = xuio.uio_iovcnt; if (operation == BIO_WRITE) { /* Copy the write data to the local buffer. */ for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; seg_idx++, xiovec++, p_vaddr++) { memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); } } else { /* * We only need to save off the iovecs in the case of a * read, because the copy for the read happens after the * VOP_READ(). (The uio will get modified in that call * sequence.) */ memcpy(file_data->saved_xiovecs, xuio.uio_iov, xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); } #endif /* XBB_USE_BOUNCE_BUFFERS */ switch (operation) { case BIO_READ: SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, device_get_unit(xbb->dev), xuio.uio_offset, xuio.uio_resid); vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); /* * UFS pays attention to IO_DIRECT for reads. If the * DIRECTIO option is configured into the kernel, it calls * ffs_rawread(). But that only works for single-segment * uios with user space addresses. In our case, with a * kernel uio, it still reads into the buffer cache, but it * will just try to release the buffer from the cache later * on in ffs_read(). 
* * ZFS does not pay attention to IO_DIRECT for reads. * * UFS does not pay attention to IO_SYNC for reads. * * ZFS pays attention to IO_SYNC (which translates into the * Solaris define FRSYNC for zfs_read()) for reads. It * attempts to sync the file before reading. * * So, to attempt to provide some barrier semantics in the * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. */ error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? (IO_DIRECT|IO_SYNC) : 0, file_data->cred); VOP_UNLOCK(xbb->vn, 0); break; case BIO_WRITE: { struct mount *mountpoint; SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, device_get_unit(xbb->dev), xuio.uio_offset, xuio.uio_resid); (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); /* * UFS pays attention to IO_DIRECT for writes. The write * is done asynchronously. (Normally the write would just * get put into cache. * * UFS pays attention to IO_SYNC for writes. It will * attempt to write the buffer out synchronously if that * flag is set. * * ZFS does not pay attention to IO_DIRECT for writes. * * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) * for writes. It will flush the transaction from the * cache before returning. * * So if we've got the BIO_ORDERED flag set, we want * IO_SYNC in either the UFS or ZFS case. */ error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? IO_SYNC : 0, file_data->cred); VOP_UNLOCK(xbb->vn, 0); vn_finished_write(mountpoint); break; } default: panic("invalid operation %d", operation); /* NOTREACHED */ } #ifdef XBB_USE_BOUNCE_BUFFERS /* We only need to copy here for read operations */ if (operation == BIO_READ) { for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, xiovec = file_data->saved_xiovecs; seg_idx < saved_uio_iovcnt; seg_idx++, xiovec++, p_vaddr++) { /* * Note that we have to use the copy of the * io vector we made above. uiomove() modifies * the uio and its referenced vector as uiomove * performs the copy, so we can't rely on any * state from the original uio. */ memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); } } #endif /* XBB_USE_BOUNCE_BUFFERS */ bailout_send_response: if (error != 0) reqlist->status = BLKIF_RSP_ERROR; xbb_complete_reqlist(xbb, reqlist); return (0); } /*--------------------------- Backend Configuration --------------------------*/ /** * Close and cleanup any backend device/file specific state for this * block back instance. * * \param xbb Per-instance xbb configuration structure. */ static void xbb_close_backend(struct xbb_softc *xbb) { DROP_GIANT(); DPRINTF("closing dev=%s\n", xbb->dev_name); if (xbb->vn) { int flags = FREAD; if ((xbb->flags & XBBF_READ_ONLY) == 0) flags |= FWRITE; switch (xbb->device_type) { case XBB_TYPE_DISK: if (xbb->backend.dev.csw) { dev_relthread(xbb->backend.dev.cdev, xbb->backend.dev.dev_ref); xbb->backend.dev.csw = NULL; xbb->backend.dev.cdev = NULL; } break; case XBB_TYPE_FILE: break; case XBB_TYPE_NONE: default: panic("Unexpected backend type."); break; } (void)vn_close(xbb->vn, flags, NOCRED, curthread); xbb->vn = NULL; switch (xbb->device_type) { case XBB_TYPE_DISK: break; case XBB_TYPE_FILE: if (xbb->backend.file.cred != NULL) { crfree(xbb->backend.file.cred); xbb->backend.file.cred = NULL; } break; case XBB_TYPE_NONE: default: panic("Unexpected backend type."); break; } } PICKUP_GIANT(); } /** * Open a character device to be used for backend I/O. * * \param xbb Per-instance xbb configuration structure. * * \return 0 for success, errno codes for failure. 
*/ static int xbb_open_dev(struct xbb_softc *xbb) { struct vattr vattr; struct cdev *dev; struct cdevsw *devsw; int error; xbb->device_type = XBB_TYPE_DISK; xbb->dispatch_io = xbb_dispatch_dev; xbb->backend.dev.cdev = xbb->vn->v_rdev; xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, &xbb->backend.dev.dev_ref); if (xbb->backend.dev.csw == NULL) panic("Unable to retrieve device switch"); error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); if (error) { xenbus_dev_fatal(xbb->dev, error, "error getting " "vnode attributes for device %s", xbb->dev_name); return (error); } dev = xbb->vn->v_rdev; devsw = dev->si_devsw; if (!devsw->d_ioctl) { xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " "device %s!", xbb->dev_name); return (ENODEV); } error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&xbb->sector_size, FREAD, curthread); if (error) { xenbus_dev_fatal(xbb->dev, error, "error calling ioctl DIOCGSECTORSIZE " "for device %s", xbb->dev_name); return (error); } error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&xbb->media_size, FREAD, curthread); if (error) { xenbus_dev_fatal(xbb->dev, error, "error calling ioctl DIOCGMEDIASIZE " "for device %s", xbb->dev_name); return (error); } return (0); } /** * Open a file to be used for backend I/O. * * \param xbb Per-instance xbb configuration structure. * * \return 0 for success, errno codes for failure. */ static int xbb_open_file(struct xbb_softc *xbb) { struct xbb_file_data *file_data; struct vattr vattr; int error; file_data = &xbb->backend.file; xbb->device_type = XBB_TYPE_FILE; xbb->dispatch_io = xbb_dispatch_file; error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "error calling VOP_GETATTR()" "for file %s", xbb->dev_name); return (error); } /* * Verify that we have the ability to upgrade to exclusive * access on this file so we can trap errors at open instead * of reporting them during first access. */ if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); if (xbb->vn->v_iflag & VI_DOOMED) { error = EBADF; xenbus_dev_fatal(xbb->dev, error, "error locking file %s", xbb->dev_name); return (error); } } file_data->cred = crhold(curthread->td_ucred); xbb->media_size = vattr.va_size; /* * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. * With ZFS, it is 131072 bytes. Block sizes that large don't work * with disklabel and UFS on FreeBSD at least. Large block sizes * may not work with other OSes as well. So just export a sector * size of 512 bytes, which should work with any OS or * application. Since our backing is a file, any block size will * work fine for the backing store. */ #if 0 xbb->sector_size = vattr.va_blocksize; #endif xbb->sector_size = 512; /* * Sanity check. The media size has to be at least one * sector long. */ if (xbb->media_size < xbb->sector_size) { error = EINVAL; xenbus_dev_fatal(xbb->dev, error, "file %s size %ju < block size %u", xbb->dev_name, (uintmax_t)xbb->media_size, xbb->sector_size); } return (error); } /** * Open the backend provider for this connection. * * \param xbb Per-instance xbb configuration structure. * * \return 0 for success, errno codes for failure. 
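* \note If dev_name is not an absolute path and the initial open fails, "/dev/" is prepended to the name and the open is retried.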
*/ static int xbb_open_backend(struct xbb_softc *xbb) { struct nameidata nd; int flags; int error; flags = FREAD; error = 0; DPRINTF("opening dev=%s\n", xbb->dev_name); if (rootvnode == NULL) { xenbus_dev_fatal(xbb->dev, ENOENT, "Root file system not mounted"); return (ENOENT); } if ((xbb->flags & XBBF_READ_ONLY) == 0) flags |= FWRITE; pwd_ensure_dirs(); again: NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread); error = vn_open(&nd, &flags, 0, NULL); if (error) { /* * This is the only reasonable guess we can make as far as * path if the user doesn't give us a fully qualified path. * If they want to specify a file, they need to specify the * full path. */ if (xbb->dev_name[0] != '/') { char *dev_path = "/dev/"; char *dev_name; /* Try adding device path at beginning of name */ dev_name = malloc(strlen(xbb->dev_name) + strlen(dev_path) + 1, M_XENBLOCKBACK, M_NOWAIT); if (dev_name) { sprintf(dev_name, "%s%s", dev_path, xbb->dev_name); free(xbb->dev_name, M_XENBLOCKBACK); xbb->dev_name = dev_name; goto again; } } xenbus_dev_fatal(xbb->dev, error, "error opening device %s", xbb->dev_name); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); xbb->vn = nd.ni_vp; /* We only support disks and files. */ if (vn_isdisk(xbb->vn, &error)) { error = xbb_open_dev(xbb); } else if (xbb->vn->v_type == VREG) { error = xbb_open_file(xbb); } else { error = EINVAL; xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " "or file", xbb->dev_name); } VOP_UNLOCK(xbb->vn, 0); if (error != 0) { xbb_close_backend(xbb); return (error); } xbb->sector_size_shift = fls(xbb->sector_size) - 1; xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", xbb->dev_name, xbb->sector_size, xbb->media_size); return (0); } /*------------------------ Inter-Domain Communication ------------------------*/ /** * Free dynamically allocated KVA or pseudo-physical address allocations. * * \param xbb Per-instance xbb configuration structure. */ static void xbb_free_communication_mem(struct xbb_softc *xbb) { if (xbb->kva != 0) { if (xbb->pseudo_phys_res != NULL) { xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, xbb->pseudo_phys_res); xbb->pseudo_phys_res = NULL; } } xbb->kva = 0; xbb->gnt_base_addr = 0; if (xbb->kva_free != NULL) { free(xbb->kva_free, M_XENBLOCKBACK); xbb->kva_free = NULL; } } /** * Cleanup all inter-domain communication mechanisms. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_disconnect(struct xbb_softc *xbb) { struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; struct gnttab_unmap_grant_ref *op; u_int ring_idx; int error; DPRINTF("\n"); if ((xbb->flags & XBBF_RING_CONNECTED) == 0) return (0); xen_intr_unbind(&xbb->xen_intr_handle); mtx_unlock(&xbb->lock); taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); mtx_lock(&xbb->lock); /* * No new interrupts can generate work, but we must wait * for all currently active requests to drain. 
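* Returning EAGAIN below lets xbb_shutdown() retry the disconnect once the last outstanding request has been released.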
*/ if (xbb->active_request_count != 0) return (EAGAIN); for (ring_idx = 0, op = ops; ring_idx < xbb->ring_config.ring_pages; ring_idx++, op++) { op->host_addr = xbb->ring_config.gnt_addr + (ring_idx * PAGE_SIZE); op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; op->handle = xbb->ring_config.handle[ring_idx]; } error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, xbb->ring_config.ring_pages); if (error != 0) panic("Grant table op failed (%d)", error); xbb_free_communication_mem(xbb); if (xbb->requests != NULL) { free(xbb->requests, M_XENBLOCKBACK); xbb->requests = NULL; } if (xbb->request_lists != NULL) { struct xbb_xen_reqlist *reqlist; int i; /* There is one request list for ever allocated request. */ for (i = 0, reqlist = xbb->request_lists; i < xbb->max_requests; i++, reqlist++){ #ifdef XBB_USE_BOUNCE_BUFFERS if (reqlist->bounce != NULL) { free(reqlist->bounce, M_XENBLOCKBACK); reqlist->bounce = NULL; } #endif if (reqlist->gnt_handles != NULL) { free(reqlist->gnt_handles, M_XENBLOCKBACK); reqlist->gnt_handles = NULL; } } free(xbb->request_lists, M_XENBLOCKBACK); xbb->request_lists = NULL; } xbb->flags &= ~XBBF_RING_CONNECTED; return (0); } /** * Map shared memory ring into domain local address space, initialize * ring control structures, and bind an interrupt to the event channel * used to notify us of ring changes. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_connect_ring(struct xbb_softc *xbb) { struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; struct gnttab_map_grant_ref *gnt; u_int ring_idx; int error; if ((xbb->flags & XBBF_RING_CONNECTED) != 0) return (0); /* * Kva for our ring is at the tail of the region of kva allocated * by xbb_alloc_communication_mem(). */ xbb->ring_config.va = xbb->kva + (xbb->kva_size - (xbb->ring_config.ring_pages * PAGE_SIZE)); xbb->ring_config.gnt_addr = xbb->gnt_base_addr + (xbb->kva_size - (xbb->ring_config.ring_pages * PAGE_SIZE)); for (ring_idx = 0, gnt = gnts; ring_idx < xbb->ring_config.ring_pages; ring_idx++, gnt++) { gnt->host_addr = xbb->ring_config.gnt_addr + (ring_idx * PAGE_SIZE); gnt->flags = GNTMAP_host_map; gnt->ref = xbb->ring_config.ring_ref[ring_idx]; gnt->dom = xbb->otherend_id; } error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, xbb->ring_config.ring_pages); if (error) panic("blkback: Ring page grant table op failed (%d)", error); for (ring_idx = 0, gnt = gnts; ring_idx < xbb->ring_config.ring_pages; ring_idx++, gnt++) { if (gnt->status != 0) { xbb->ring_config.va = 0; xenbus_dev_fatal(xbb->dev, EACCES, "Ring shared page mapping failed. " "Status %d.", gnt->status); return (EACCES); } xbb->ring_config.handle[ring_idx] = gnt->handle; xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; } /* Initialize the ring based on ABI. 
*/ switch (xbb->abi) { case BLKIF_PROTOCOL_NATIVE: { blkif_sring_t *sring; sring = (blkif_sring_t *)xbb->ring_config.va; BACK_RING_INIT(&xbb->rings.native, sring, xbb->ring_config.ring_pages * PAGE_SIZE); break; } case BLKIF_PROTOCOL_X86_32: { blkif_x86_32_sring_t *sring_x86_32; sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, xbb->ring_config.ring_pages * PAGE_SIZE); break; } case BLKIF_PROTOCOL_X86_64: { blkif_x86_64_sring_t *sring_x86_64; sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, xbb->ring_config.ring_pages * PAGE_SIZE); break; } default: panic("Unexpected blkif protocol ABI."); } xbb->flags |= XBBF_RING_CONNECTED; error = xen_intr_bind_remote_port(xbb->dev, xbb->otherend_id, xbb->ring_config.evtchn, xbb_filter, /*ithread_handler*/NULL, /*arg*/xbb, INTR_TYPE_BIO | INTR_MPSAFE, &xbb->xen_intr_handle); if (error) { (void)xbb_disconnect(xbb); xenbus_dev_fatal(xbb->dev, error, "binding event channel"); return (error); } DPRINTF("rings connected!\n"); return 0; } -/* Needed to make bit_alloc() macro work */ -#define calloc(count, size) malloc((count)*(size), M_XENBLOCKBACK, \ - M_NOWAIT|M_ZERO); - /** * Size KVA and pseudo-physical address allocations based on negotiated * values for the size and number of I/O requests, and the size of our * communication ring. * * \param xbb Per-instance xbb configuration structure. * * These address spaces are used to dynamically map pages in the * front-end's domain into our own. */ static int xbb_alloc_communication_mem(struct xbb_softc *xbb) { xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; xbb->kva_size = xbb->reqlist_kva_size + (xbb->ring_config.ring_pages * PAGE_SIZE); - xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages); + xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); if (xbb->kva_free == NULL) return (ENOMEM); DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", device_get_nameunit(xbb->dev), xbb->kva_size, xbb->reqlist_kva_size); /* * Reserve a range of pseudo physical memory that we can map * into kva. These pages will only be backed by machine * pages ("real memory") during the lifetime of front-end requests * via grant table operations. */ xbb->pseudo_phys_res_id = 0; xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, xbb->kva_size); if (xbb->pseudo_phys_res == NULL) { xbb->kva = 0; return (ENOMEM); } xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, (uintmax_t)xbb->gnt_base_addr); return (0); } /** * Collect front-end information from the XenStore. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_collect_frontend_info(struct xbb_softc *xbb) { char protocol_abi[64]; const char *otherend_path; int error; u_int ring_idx; u_int ring_page_order; size_t ring_size; otherend_path = xenbus_get_otherend_path(xbb->dev); /* * Protocol defaults valid even if all negotiation fails. */ xbb->ring_config.ring_pages = 1; xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; /* * Mandatory data (used in all versions of the protocol) first. 
*/ error = xs_scanf(XST_NIL, otherend_path, "event-channel", NULL, "%" PRIu32, &xbb->ring_config.evtchn); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Unable to retrieve event-channel information " "from frontend %s. Unable to connect.", xenbus_get_otherend_path(xbb->dev)); return (error); } /* * These fields are initialized to legacy protocol defaults * so we only need to fail if reading the updated value succeeds * and the new value is outside of its allowed range. * * \note xs_gather() returns on the first encountered error, so * we must use independent calls in order to guarantee * we don't miss information in a sparsely populated front-end * tree. * * \note xs_scanf() does not update variables for unmatched * fields. */ ring_page_order = 0; xbb->max_requests = 32; (void)xs_scanf(XST_NIL, otherend_path, "ring-page-order", NULL, "%u", &ring_page_order); xbb->ring_config.ring_pages = 1 << ring_page_order; ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { xenbus_dev_fatal(xbb->dev, EINVAL, "Front-end specified ring-pages of %u " "exceeds backend limit of %u. " "Unable to connect.", xbb->ring_config.ring_pages, XBB_MAX_RING_PAGES); return (EINVAL); } if (xbb->ring_config.ring_pages == 1) { error = xs_gather(XST_NIL, otherend_path, "ring-ref", "%" PRIu32, &xbb->ring_config.ring_ref[0], NULL); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Unable to retrieve ring information " "from frontend %s. Unable to " "connect.", xenbus_get_otherend_path(xbb->dev)); return (error); } } else { /* Multi-page ring format. */ for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; ring_idx++) { char ring_ref_name[] = "ring_refXX"; snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", ring_idx); error = xs_scanf(XST_NIL, otherend_path, ring_ref_name, NULL, "%" PRIu32, &xbb->ring_config.ring_ref[ring_idx]); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Failed to retrieve grant " "reference for page %u of " "shared ring. Unable " "to connect.", ring_idx); return (error); } } } error = xs_gather(XST_NIL, otherend_path, "protocol", "%63s", protocol_abi, NULL); if (error != 0 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { /* * Assume native if the frontend has not * published ABI data or it has published and * matches our own ABI. */ xbb->abi = BLKIF_PROTOCOL_NATIVE; } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { xbb->abi = BLKIF_PROTOCOL_X86_32; } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { xbb->abi = BLKIF_PROTOCOL_X86_64; } else { xenbus_dev_fatal(xbb->dev, EINVAL, "Unknown protocol ABI (%s) published by " "frontend. Unable to connect.", protocol_abi); return (EINVAL); } return (0); } /** * Allocate per-request data structures given request size and number * information negotiated with the front-end. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_alloc_requests(struct xbb_softc *xbb) { struct xbb_xen_req *req; struct xbb_xen_req *last_req; /* * Allocate request bookkeeping data structures.
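* One xbb_xen_req is preallocated for each of the max_requests ring slots negotiated with the front-end, so no request allocations are needed in the I/O path.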
*/ xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); if (xbb->requests == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request structures"); return (ENOMEM); } req = xbb->requests; last_req = &xbb->requests[xbb->max_requests - 1]; STAILQ_INIT(&xbb->request_free_stailq); while (req <= last_req) { STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); req++; } return (0); } static int xbb_alloc_request_lists(struct xbb_softc *xbb) { struct xbb_xen_reqlist *reqlist; int i; /* * If no requests can be merged, we need 1 request list per * in flight request. */ xbb->request_lists = malloc(xbb->max_requests * sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); if (xbb->request_lists == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request list structures"); return (ENOMEM); } STAILQ_INIT(&xbb->reqlist_free_stailq); STAILQ_INIT(&xbb->reqlist_pending_stailq); for (i = 0; i < xbb->max_requests; i++) { int seg; reqlist = &xbb->request_lists[i]; reqlist->xbb = xbb; #ifdef XBB_USE_BOUNCE_BUFFERS reqlist->bounce = malloc(xbb->max_reqlist_size, M_XENBLOCKBACK, M_NOWAIT); if (reqlist->bounce == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request " "bounce buffers"); return (ENOMEM); } #endif /* XBB_USE_BOUNCE_BUFFERS */ reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * sizeof(*reqlist->gnt_handles), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); if (reqlist->gnt_handles == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request " "grant references"); return (ENOMEM); } for (seg = 0; seg < xbb->max_reqlist_segments; seg++) reqlist->gnt_handles[seg] = GRANT_REF_INVALID; STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); } return (0); } /** * Supply information about the physical device to the frontend * via XenBus. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_publish_backend_info(struct xbb_softc *xbb) { struct xs_transaction xst; const char *our_path; const char *leaf; int error; our_path = xenbus_get_node(xbb->dev); while (1) { error = xs_transaction_start(&xst); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Error publishing backend info " "(start transaction)"); return (error); } leaf = "sectors"; error = xs_printf(xst, our_path, leaf, "%"PRIu64, xbb->media_num_sectors); if (error != 0) break; /* XXX Support all VBD attributes here. */ leaf = "info"; error = xs_printf(xst, our_path, leaf, "%u", xbb->flags & XBBF_READ_ONLY ? VDISK_READONLY : 0); if (error != 0) break; leaf = "sector-size"; error = xs_printf(xst, our_path, leaf, "%u", xbb->sector_size); if (error != 0) break; error = xs_transaction_end(xst, 0); if (error == 0) { return (0); } else if (error != EAGAIN) { xenbus_dev_fatal(xbb->dev, error, "ending transaction"); return (error); } } xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", our_path, leaf); xs_transaction_end(xst, 1); return (error); } /** * Connect to our blkfront peer now that it has completed publishing * its configuration into the XenStore. * * \param xbb Per-instance xbb configuration structure. */ static void xbb_connect(struct xbb_softc *xbb) { int error; if (xenbus_get_state(xbb->dev) == XenbusStateConnected) return; if (xbb_collect_frontend_info(xbb) != 0) return; xbb->flags &= ~XBBF_SHUTDOWN; /* * We limit the maximum number of reqlist segments to the maximum * number of segments in the ring, or our absolute maximum, * whichever is smaller. 
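Editor's note: xbb_alloc_requests() and xbb_alloc_request_lists() above both seed STAILQ free lists that the I/O path later pops from. A self-contained sketch of that sys/queue.h pattern, with an illustrative element type:

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	int			id;
	STAILQ_ENTRY(item)	links;
};
STAILQ_HEAD(item_list, item);

int
main(void)
{
	struct item_list free_list;
	struct item *items, *it;
	int i, n = 4;

	items = calloc(n, sizeof(*items));
	if (items == NULL)
		return (1);

	/* Seed the free list, mirroring xbb_alloc_requests(). */
	STAILQ_INIT(&free_list);
	for (i = 0; i < n; i++) {
		items[i].id = i;
		STAILQ_INSERT_TAIL(&free_list, &items[i], links);
	}

	/* Pop one element, as the I/O path would. */
	it = STAILQ_FIRST(&free_list);
	STAILQ_REMOVE_HEAD(&free_list, links);
	printf("allocated item %d\n", it->id);

	free(items);
	return (0);
}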
*/ xbb->max_reqlist_segments = MIN(xbb->max_request_segments * xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); /* * The maximum size is simply a function of the number of segments * we can handle. */ xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; /* Allocate resources whose size depends on front-end configuration. */ error = xbb_alloc_communication_mem(xbb); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Unable to allocate communication memory"); return; } error = xbb_alloc_requests(xbb); if (error != 0) { /* Specific errors are reported by xbb_alloc_requests(). */ return; } error = xbb_alloc_request_lists(xbb); if (error != 0) { /* Specific errors are reported by xbb_alloc_request_lists(). */ return; } /* * Connect communication channel. */ error = xbb_connect_ring(xbb); if (error != 0) { /* Specific errors are reported by xbb_connect_ring(). */ return; } if (xbb_publish_backend_info(xbb) != 0) { /* * If we can't publish our data, we cannot participate * in this connection, and waiting for a front-end state * change will not help the situation. */ (void)xbb_disconnect(xbb); return; } /* Ready for I/O. */ xenbus_set_state(xbb->dev, XenbusStateConnected); } /*-------------------------- Device Teardown Support -------------------------*/ /** * Perform device shutdown functions. * * \param xbb Per-instance xbb configuration structure. * * Mark this instance as shutting down, wait for any active I/O on the * backend device/file to drain, disconnect from the front-end, and notify * any waiters (e.g. a thread invoking our detach method) that detach can * now proceed. */ static int xbb_shutdown(struct xbb_softc *xbb) { XenbusState frontState; int error; DPRINTF("\n"); /* * Due to the need to drop our mutex during some * xenbus operations, it is possible for two threads * to attempt to close out shutdown processing at * the same time. Tell the caller that hits this * race to try back later. */ if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) return (EAGAIN); xbb->flags |= XBBF_IN_SHUTDOWN; mtx_unlock(&xbb->lock); if (xenbus_get_state(xbb->dev) < XenbusStateClosing) xenbus_set_state(xbb->dev, XenbusStateClosing); frontState = xenbus_get_otherend_state(xbb->dev); mtx_lock(&xbb->lock); xbb->flags &= ~XBBF_IN_SHUTDOWN; /* The front can submit I/O until entering the closed state. */ if (frontState < XenbusStateClosed) return (EAGAIN); DPRINTF("\n"); /* Indicate shutdown is in progress. */ xbb->flags |= XBBF_SHUTDOWN; /* Disconnect from the front-end. */ error = xbb_disconnect(xbb); if (error != 0) { /* * Requests still outstanding. We'll be called again * once they complete. */ KASSERT(error == EAGAIN, ("%s: Unexpected xbb_disconnect() failure %d", __func__, error)); return (error); } DPRINTF("\n"); /* Indicate to xbb_detach() that is it safe to proceed. */ wakeup(xbb); return (0); } /** * Report an attach time error to the console and Xen, and cleanup * this instance by forcing immediate detach processing. * * \param xbb Per-instance xbb configuration structure. * \param err Errno describing the error. * \param fmt Printf style format and arguments */ static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) 
{ va_list ap; va_list ap_hotplug; va_start(ap, fmt); va_copy(ap_hotplug, ap); xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), "hotplug-error", fmt, ap_hotplug); va_end(ap_hotplug); xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "hotplug-status", "error"); xenbus_dev_vfatal(xbb->dev, err, fmt, ap); va_end(ap); xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "online", "0"); xbb_detach(xbb->dev); } /*---------------------------- NewBus Entrypoints ----------------------------*/ /** * Inspect a XenBus device and claim it if is of the appropriate type. * * \param dev NewBus device object representing a candidate XenBus device. * * \return 0 for success, errno codes for failure. */ static int xbb_probe(device_t dev) { if (!strcmp(xenbus_get_type(dev), "vbd")) { device_set_desc(dev, "Backend Virtual Block Device"); device_quiet(dev); return (0); } return (ENXIO); } /** * Setup sysctl variables to control various Block Back parameters. * * \param xbb Xen Block Back softc. * */ static void xbb_setup_sysctl(struct xbb_softc *xbb) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; sysctl_ctx = device_get_sysctl_ctx(xbb->dev); if (sysctl_ctx == NULL) return; sysctl_tree = device_get_sysctl_tree(xbb->dev); if (sysctl_tree == NULL) return; SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, "fake the flush command"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, "send a real flush for N flush requests"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, "Don't coalesce contiguous requests"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_received", CTLFLAG_RW, &xbb->reqs_received, "how many I/O requests we have received"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, "how many I/O requests have been completed"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_queued_for_completion", CTLFLAG_RW, &xbb->reqs_queued_for_completion, "how many I/O requests queued but not yet pushed"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_completed_with_error", CTLFLAG_RW, &xbb->reqs_completed_with_error, "how many I/O requests completed with error status"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, "how many I/O dispatches were forced"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, "how many I/O dispatches were normal"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, "total number of I/O dispatches"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, "how many times we have run out of KVA"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "request_shortages", CTLFLAG_RW, &xbb->request_shortages, "how many times we have run out of requests"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, "maximum outstanding requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_request_segments", 
CTLFLAG_RD, &xbb->max_request_segments, 0, "maximum number of pages per requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_request_size", CTLFLAG_RD, &xbb->max_request_size, 0, "maximum size in bytes of a request (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "ring_pages", CTLFLAG_RD, &xbb->ring_config.ring_pages, 0, "communication channel pages (negotiated)"); } /** * Attach to a XenBus device that has been claimed by our probe routine. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. */ static int xbb_attach(device_t dev) { struct xbb_softc *xbb; int error; u_int max_ring_page_order; DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); /* * Basic initialization. * After this block it is safe to call xbb_detach() * to clean up any allocated data for this instance. */ xbb = device_get_softc(dev); xbb->dev = dev; xbb->otherend_id = xenbus_get_otherend_id(dev); TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); /* * Publish protocol capabilities for consumption by the * front-end. */ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "feature-barrier", "1"); if (error) { xbb_attach_failed(xbb, error, "writing %s/feature-barrier", xenbus_get_node(xbb->dev)); return (error); } error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "feature-flush-cache", "1"); if (error) { xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", xenbus_get_node(xbb->dev)); return (error); } max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "max-ring-page-order", "%u", max_ring_page_order); if (error) { xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", xenbus_get_node(xbb->dev)); return (error); } /* Collect physical device information. */ error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), "device-type", NULL, &xbb->dev_type, NULL); if (error != 0) xbb->dev_type = NULL; error = xs_gather(XST_NIL, xenbus_get_node(dev), "mode", NULL, &xbb->dev_mode, "params", NULL, &xbb->dev_name, NULL); if (error != 0) { xbb_attach_failed(xbb, error, "reading backend fields at %s", xenbus_get_node(dev)); return (ENXIO); } /* Parse fopen style mode flags. */ if (strchr(xbb->dev_mode, 'w') == NULL) xbb->flags |= XBBF_READ_ONLY; /* * Verify the physical device is present and can support * the desired I/O mode. */ DROP_GIANT(); error = xbb_open_backend(xbb); PICKUP_GIANT(); if (error != 0) { xbb_attach_failed(xbb, error, "Unable to open %s", xbb->dev_name); return (ENXIO); } /* Use devstat(9) for recording statistics. */ xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), xbb->sector_size, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_OTHER); xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), xbb->sector_size, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_OTHER); /* * Setup sysctl variables. */ xbb_setup_sysctl(xbb); /* * Create a taskqueue for doing work that must occur from a * thread context. 
*/ xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), M_NOWAIT, taskqueue_thread_enqueue, /*contxt*/&xbb->io_taskqueue); if (xbb->io_taskqueue == NULL) { xbb_attach_failed(xbb, error, "Unable to create taskqueue"); return (ENOMEM); } taskqueue_start_threads(&xbb->io_taskqueue, /*num threads*/1, /*priority*/PWAIT, /*thread name*/ "%s taskq", device_get_nameunit(dev)); /* Update hot-plug status to satisfy xend. */ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "hotplug-status", "connected"); if (error) { xbb_attach_failed(xbb, error, "writing %s/hotplug-status", xenbus_get_node(xbb->dev)); return (error); } /* Tell the front end that we are ready to connect. */ xenbus_set_state(dev, XenbusStateInitWait); return (0); } /** * Detach from a block back device instance. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. * * \note A block back device may be detached at any time in its life-cycle, * including part way through the attach process. For this reason, * initialization order and the initialization state checks in this * routine must be carefully coupled so that attach time failures * are gracefully handled. */ static int xbb_detach(device_t dev) { struct xbb_softc *xbb; DPRINTF("\n"); xbb = device_get_softc(dev); mtx_lock(&xbb->lock); while (xbb_shutdown(xbb) == EAGAIN) { msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, "xbb_shutdown", 0); } mtx_unlock(&xbb->lock); DPRINTF("\n"); if (xbb->io_taskqueue != NULL) taskqueue_free(xbb->io_taskqueue); if (xbb->xbb_stats != NULL) devstat_remove_entry(xbb->xbb_stats); if (xbb->xbb_stats_in != NULL) devstat_remove_entry(xbb->xbb_stats_in); xbb_close_backend(xbb); if (xbb->dev_mode != NULL) { free(xbb->dev_mode, M_XENSTORE); xbb->dev_mode = NULL; } if (xbb->dev_type != NULL) { free(xbb->dev_type, M_XENSTORE); xbb->dev_type = NULL; } if (xbb->dev_name != NULL) { free(xbb->dev_name, M_XENSTORE); xbb->dev_name = NULL; } mtx_destroy(&xbb->lock); return (0); } /** * Prepare this block back device for suspension of this VM. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. */ static int xbb_suspend(device_t dev) { #ifdef NOT_YET struct xbb_softc *sc = device_get_softc(dev); /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xb_io_lock); sc->connected = BLKIF_STATE_SUSPENDED; mtx_unlock(&sc->xb_io_lock); #endif return (0); } /** * Perform any processing required to recover from a suspended state. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. */ static int xbb_resume(device_t dev) { return (0); } /** * Handle state changes expressed via the XenStore by our front-end peer. * * \param dev NewBus device object representing this Xen * Block Back instance. * \param frontend_state The new state of the front-end. * * \return 0 for success, errno codes for failure. 
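Editor's note: xbb_detach() above loops in msleep(9) for as long as xbb_shutdown() reports EAGAIN, and xbb_shutdown() ends with wakeup() once the last reference drains. A hedged sketch of that sleep/wakeup rendezvous; the softc, field and wait-message names are illustrative, and in a real driver the completion path would re-run the shutdown check once the outstanding work count reaches zero.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/lock.h>
#include <sys/mutex.h>

struct example_softc {
	struct mtx	lock;
	int		busy_io;	/* outstanding work */
};

/* Called with sc->lock held; returns EAGAIN while work is still draining. */
static int
example_shutdown(struct example_softc *sc)
{
	if (sc->busy_io != 0)
		return (EAGAIN);
	wakeup(sc);			/* release anyone parked in detach */
	return (0);
}

static void
example_detach(struct example_softc *sc)
{
	mtx_lock(&sc->lock);
	/* Sleep until a later example_shutdown() call issues the wakeup. */
	while (example_shutdown(sc) == EAGAIN)
		msleep(sc, &sc->lock, /*priority*/0, "exshut", /*timo*/0);
	mtx_unlock(&sc->lock);
}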
*/ static void xbb_frontend_changed(device_t dev, XenbusState frontend_state) { struct xbb_softc *xbb = device_get_softc(dev); DPRINTF("frontend_state=%s, xbb_state=%s\n", xenbus_strstate(frontend_state), xenbus_strstate(xenbus_get_state(xbb->dev))); switch (frontend_state) { case XenbusStateInitialising: break; case XenbusStateInitialised: case XenbusStateConnected: xbb_connect(xbb); break; case XenbusStateClosing: case XenbusStateClosed: mtx_lock(&xbb->lock); xbb_shutdown(xbb); mtx_unlock(&xbb->lock); if (frontend_state == XenbusStateClosed) xenbus_set_state(xbb->dev, XenbusStateClosed); break; default: xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", frontend_state); break; } } /*---------------------------- NewBus Registration ---------------------------*/ static device_method_t xbb_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xbb_probe), DEVMETHOD(device_attach, xbb_attach), DEVMETHOD(device_detach, xbb_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xbb_suspend), DEVMETHOD(device_resume, xbb_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), { 0, 0 } }; static driver_t xbb_driver = { "xbbd", xbb_methods, sizeof(struct xbb_softc), }; devclass_t xbb_devclass; DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); Index: head/sys/kern/subr_unit.c =================================================================== --- head/sys/kern/subr_unit.c (revision 299089) +++ head/sys/kern/subr_unit.c (revision 299090) @@ -1,1060 +1,1061 @@ /*- * Copyright (c) 2004 Poul-Henning Kamp * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ * * * Unit number allocation functions. * * These functions implement a mixed run-length/bitmap management of unit * number spaces in a very memory efficient manner. * * Allocation policy is always lowest free number first. * * A return value of -1 signals that no more unit numbers are available. * * There is no cost associated with the range of unitnumbers, so unless * the resource really is finite, specify INT_MAX to new_unrhdr() and * forget about checking the return value. * * If a mutex is not provided when the unit number space is created, a * default global mutex is used. 
The advantage to passing a mutex in, is * that the alloc_unrl() function can be called with the mutex already * held (it will not be released by alloc_unrl()). * * The allocation function alloc_unr{l}() never sleeps (but it may block on * the mutex of course). * * Freeing a unit number may require allocating memory, and can therefore * sleep so the free_unr() function does not come in a pre-locked variant. * * A userland test program is included. * * Memory usage is a very complex function of the exact allocation * pattern, but always very compact: * * For the very typical case where a single unbroken run of unit * numbers are allocated 44 bytes are used on i386. * * For a unit number space of 1000 units and the random pattern * in the usermode test program included, the worst case usage * was 252 bytes on i386 for 500 allocated and 500 free units. * * For a unit number space of 10000 units and the random pattern * in the usermode test program included, the worst case usage * was 798 bytes on i386 for 5000 allocated and 5000 free units. * * The worst case is where every other unit number is allocated and * the rest are free. In that case 44 + N/4 bytes are used where * N is the number of the highest unit allocated. */ +#include #include #include #ifdef _KERNEL #include -#include #include #include #include #include #include #include /* * In theory it would be smarter to allocate the individual blocks * with the zone allocator, but at this time the expectation is that * there will typically not even be enough allocations to fill a single * page, so we stick with malloc for now. */ static MALLOC_DEFINE(M_UNIT, "Unitno", "Unit number allocation"); #define Malloc(foo) malloc(foo, M_UNIT, M_WAITOK | M_ZERO) #define Free(foo) free(foo, M_UNIT) static struct mtx unitmtx; MTX_SYSINIT(unit, &unitmtx, "unit# allocation", MTX_DEF); #else /* ...USERLAND */ #include #include #include #include #include #include #include #include #define KASSERT(cond, arg) \ do { \ if (!(cond)) { \ printf arg; \ abort(); \ } \ } while (0) static int no_alloc; #define Malloc(foo) _Malloc(foo, __LINE__) static void * _Malloc(size_t foo, int line) { KASSERT(no_alloc == 0, ("malloc in wrong place() line %d", line)); return (calloc(foo, 1)); } #define Free(foo) free(foo) struct unrhdr; struct mtx { int state; } unitmtx; static void mtx_lock(struct mtx *mp) { KASSERT(mp->state == 0, ("mutex already locked")); mp->state = 1; } static void mtx_unlock(struct mtx *mp) { KASSERT(mp->state == 1, ("mutex not locked")); mp->state = 0; } #define MA_OWNED 9 static void mtx_assert(struct mtx *mp, int flag) { if (flag == MA_OWNED) { KASSERT(mp->state == 1, ("mtx_assert(MA_OWNED) not true")); } } #define CTASSERT(foo) #define WITNESS_WARN(flags, lock, fmt, ...) (void)0 #endif /* USERLAND */ /* * This is our basic building block. * * It can be used in three different ways depending on the value of the ptr * element: * If ptr is NULL, it represents a run of free items. * If ptr points to the unrhdr it represents a run of allocated items. - * Otherwise it points to an bitstring of allocated items. + * Otherwise it points to a bitstring of allocated items. * * For runs the len field is the length of the run. * For bitmaps the len field represents the number of allocated items. * * The bitmap is the same size as struct unr to optimize memory management. 
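Editor's note: the block comment above spells out the unr(9) contract: allocation is lowest-free-first, -1 means the space is exhausted, alloc_unr() never sleeps, and free_unr() may. A minimal usage sketch of that KPI as implemented in this file; the variable and function names are illustrative.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/limits.h>

static struct unrhdr *example_unit_space;

static void
example_init(void)
{
	/* Range is effectively unbounded; allocation is lowest-free-first. */
	example_unit_space = new_unrhdr(0, INT_MAX, NULL);
}

static int
example_new_instance(void)
{
	int unit;

	unit = alloc_unr(example_unit_space);	/* -1 when exhausted */
	return (unit);
}

static void
example_destroy_instance(int unit)
{
	free_unr(example_unit_space, unit);	/* may sleep */
}

Passing NULL as the mutex selects the shared global unitmtx, as described in the comment above.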
*/ struct unr { TAILQ_ENTRY(unr) list; u_int len; void *ptr; }; struct unrb { - u_char busy; - bitstr_t map[sizeof(struct unr) - 1]; + bitstr_t map[sizeof(struct unr) / sizeof(bitstr_t)]; }; -CTASSERT(sizeof(struct unr) == sizeof(struct unrb)); +CTASSERT((sizeof(struct unr) % sizeof(bitstr_t)) == 0); -/* Number of bits in the bitmap */ -#define NBITS ((int)sizeof(((struct unrb *)NULL)->map) * 8) +/* Number of bits we can store in the bitmap */ +#define NBITS (8 * sizeof(((struct unrb*)NULL)->map)) +/* Is the unrb empty in at least the first len bits? */ +static inline bool +ub_empty(struct unrb *ub, int len) { + int first_set; + + bit_ffs(ub->map, len, &first_set); + return (first_set == -1); +} + +/* Is the unrb full? That is, is the number of set elements equal to len? */ +static inline bool +ub_full(struct unrb *ub, int len) +{ + int first_clear; + + bit_ffc(ub->map, len, &first_clear); + return (first_clear == -1); +} + + #if defined(DIAGNOSTIC) || !defined(_KERNEL) /* * Consistency check function. * * Checks the internal consistency as well as we can. * * Called at all boundaries of this API. */ static void check_unrhdr(struct unrhdr *uh, int line) { struct unr *up; struct unrb *ub; u_int x, y, z, w; y = uh->first; z = 0; TAILQ_FOREACH(up, &uh->head, list) { z++; if (up->ptr != uh && up->ptr != NULL) { ub = up->ptr; KASSERT (up->len <= NBITS, - ("UNR inconsistency: len %u max %d (line %d)\n", + ("UNR inconsistency: len %u max %zd (line %d)\n", up->len, NBITS, line)); z++; w = 0; for (x = 0; x < up->len; x++) if (bit_test(ub->map, x)) w++; - KASSERT (w == ub->busy, - ("UNR inconsistency: busy %u found %u (line %d)\n", - ub->busy, w, line)); y += w; } else if (up->ptr != NULL) y += up->len; } KASSERT (y == uh->busy, ("UNR inconsistency: items %u found %u (line %d)\n", uh->busy, y, line)); KASSERT (z == uh->alloc, ("UNR inconsistency: chunks %u found %u (line %d)\n", uh->alloc, z, line)); } #else static __inline void -check_unrhdr(struct unrhdr *uh, int line) +check_unrhdr(struct unrhdr *uh __unused, int line __unused) { } #endif /* * Userland memory management. Just use calloc and keep track of how * many elements we have allocated for check_unrhdr(). */ static __inline void * new_unr(struct unrhdr *uh, void **p1, void **p2) { void *p; uh->alloc++; KASSERT(*p1 != NULL || *p2 != NULL, ("Out of cached memory")); if (*p1 != NULL) { p = *p1; *p1 = NULL; return (p); } else { p = *p2; *p2 = NULL; return (p); } } static __inline void delete_unr(struct unrhdr *uh, void *ptr) { struct unr *up; uh->alloc--; up = ptr; TAILQ_INSERT_TAIL(&uh->ppfree, up, list); } void clean_unrhdrl(struct unrhdr *uh) { struct unr *up; mtx_assert(uh->mtx, MA_OWNED); while ((up = TAILQ_FIRST(&uh->ppfree)) != NULL) { TAILQ_REMOVE(&uh->ppfree, up, list); mtx_unlock(uh->mtx); Free(up); mtx_lock(uh->mtx); } } void clean_unrhdr(struct unrhdr *uh) { mtx_lock(uh->mtx); clean_unrhdrl(uh); mtx_unlock(uh->mtx); } void init_unrhdr(struct unrhdr *uh, int low, int high, struct mtx *mutex) { KASSERT(low >= 0 && low <= high, ("UNR: use error: new_unrhdr(%d, %d)", low, high)); if (mutex != NULL) uh->mtx = mutex; else uh->mtx = &unitmtx; TAILQ_INIT(&uh->head); TAILQ_INIT(&uh->ppfree); uh->low = low; uh->high = high; uh->first = 0; uh->last = 1 + (high - low); check_unrhdr(uh, __LINE__); } /* * Allocate a new unrheader set. * * Highest and lowest valid values given as parameters. 
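Editor's note: the new ub_empty()/ub_full() helpers above replace the removed per-bitmap busy counter with a scan of the map via bit_ffs(3)/bit_ffc(3). The same idea, demonstrated standalone on a stack-declared bitstring in userland; the helper names below are local to the sketch.

#include <sys/types.h>
#include <bitstring.h>
#include <stdbool.h>
#include <stdio.h>

/* Empty iff no bit is set in the first len bits. */
static bool
map_empty(bitstr_t *map, int len)
{
	int first_set;

	bit_ffs(map, len, &first_set);
	return (first_set == -1);
}

/* Full iff no bit is clear in the first len bits. */
static bool
map_full(bitstr_t *map, int len)
{
	int first_clear;

	bit_ffc(map, len, &first_clear);
	return (first_clear == -1);
}

int
main(void)
{
	bitstr_t bit_decl(map, 8);

	bit_nclear(map, 0, 7);
	printf("empty=%d full=%d\n", map_empty(map, 8), map_full(map, 8));
	bit_nset(map, 0, 7);
	printf("empty=%d full=%d\n", map_empty(map, 8), map_full(map, 8));
	return (0);
}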
*/ struct unrhdr * new_unrhdr(int low, int high, struct mtx *mutex) { struct unrhdr *uh; uh = Malloc(sizeof *uh); init_unrhdr(uh, low, high, mutex); return (uh); } void delete_unrhdr(struct unrhdr *uh) { check_unrhdr(uh, __LINE__); KASSERT(uh->busy == 0, ("unrhdr has %u allocations", uh->busy)); KASSERT(uh->alloc == 0, ("UNR memory leak in delete_unrhdr")); KASSERT(TAILQ_FIRST(&uh->ppfree) == NULL, ("unrhdr has postponed item for free")); Free(uh); } static __inline int is_bitmap(struct unrhdr *uh, struct unr *up) { return (up->ptr != uh && up->ptr != NULL); } /* * Look for sequence of items which can be combined into a bitmap, if * multiple are present, take the one which saves most memory. * * Return (1) if a sequence was found to indicate that another call * might be able to do more. Return (0) if we found no suitable sequence. * * NB: called from alloc_unr(), no new memory allocation allowed. */ static int optimize_unr(struct unrhdr *uh) { struct unr *up, *uf, *us; struct unrb *ub, *ubf; u_int a, l, ba; /* * Look for the run of items (if any) which when collapsed into * a bitmap would save most memory. */ us = NULL; ba = 0; TAILQ_FOREACH(uf, &uh->head, list) { if (uf->len >= NBITS) continue; a = 1; if (is_bitmap(uh, uf)) a++; l = uf->len; up = uf; while (1) { up = TAILQ_NEXT(up, list); if (up == NULL) break; if ((up->len + l) > NBITS) break; a++; if (is_bitmap(uh, up)) a++; l += up->len; } if (a > ba) { ba = a; us = uf; } } if (ba < 3) return (0); /* * If the first element is not a bitmap, make it one. * Trying to do so without allocating more memory complicates things * a bit */ if (!is_bitmap(uh, us)) { uf = TAILQ_NEXT(us, list); TAILQ_REMOVE(&uh->head, us, list); a = us->len; l = us->ptr == uh ? 1 : 0; ub = (void *)us; - ub->busy = 0; - if (l) { + bit_nclear(ub->map, 0, NBITS - 1); + if (l) bit_nset(ub->map, 0, a); - ub->busy += a; - } else { - bit_nclear(ub->map, 0, a); - } if (!is_bitmap(uh, uf)) { - if (uf->ptr == NULL) { + if (uf->ptr == NULL) bit_nclear(ub->map, a, a + uf->len - 1); - } else { + else bit_nset(ub->map, a, a + uf->len - 1); - ub->busy += uf->len; - } uf->ptr = ub; uf->len += a; us = uf; } else { ubf = uf->ptr; for (l = 0; l < uf->len; l++, a++) { - if (bit_test(ubf->map, l)) { + if (bit_test(ubf->map, l)) bit_set(ub->map, a); - ub->busy++; - } else { + else bit_clear(ub->map, a); - } } uf->len = a; delete_unr(uh, uf->ptr); uf->ptr = ub; us = uf; } } ub = us->ptr; while (1) { uf = TAILQ_NEXT(us, list); if (uf == NULL) return (1); if (uf->len + us->len > NBITS) return (1); if (uf->ptr == NULL) { bit_nclear(ub->map, us->len, us->len + uf->len - 1); us->len += uf->len; TAILQ_REMOVE(&uh->head, uf, list); delete_unr(uh, uf); } else if (uf->ptr == uh) { bit_nset(ub->map, us->len, us->len + uf->len - 1); - ub->busy += uf->len; us->len += uf->len; TAILQ_REMOVE(&uh->head, uf, list); delete_unr(uh, uf); } else { ubf = uf->ptr; for (l = 0; l < uf->len; l++, us->len++) { - if (bit_test(ubf->map, l)) { + if (bit_test(ubf->map, l)) bit_set(ub->map, us->len); - ub->busy++; - } else { + else bit_clear(ub->map, us->len); - } } TAILQ_REMOVE(&uh->head, uf, list); delete_unr(uh, ubf); delete_unr(uh, uf); } } } /* * See if a given unr should be collapsed with a neighbor. * * NB: called from alloc_unr(), no new memory allocation allowed. 
*/ static void collapse_unr(struct unrhdr *uh, struct unr *up) { struct unr *upp; struct unrb *ub; /* If bitmap is all set or clear, change it to runlength */ if (is_bitmap(uh, up)) { ub = up->ptr; - if (ub->busy == up->len) { + if (ub_full(ub, up->len)) { delete_unr(uh, up->ptr); up->ptr = uh; - } else if (ub->busy == 0) { + } else if (ub_empty(ub, up->len)) { delete_unr(uh, up->ptr); up->ptr = NULL; } } /* If nothing left in runlength, delete it */ if (up->len == 0) { upp = TAILQ_PREV(up, unrhd, list); if (upp == NULL) upp = TAILQ_NEXT(up, list); TAILQ_REMOVE(&uh->head, up, list); delete_unr(uh, up); up = upp; } /* If we have "hot-spot" still, merge with neighbor if possible */ if (up != NULL) { upp = TAILQ_PREV(up, unrhd, list); if (upp != NULL && up->ptr == upp->ptr) { up->len += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); } upp = TAILQ_NEXT(up, list); if (upp != NULL && up->ptr == upp->ptr) { up->len += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); } } /* Merge into ->first if possible */ upp = TAILQ_FIRST(&uh->head); if (upp != NULL && upp->ptr == uh) { uh->first += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); if (up == upp) up = NULL; } /* Merge into ->last if possible */ upp = TAILQ_LAST(&uh->head, unrhd); if (upp != NULL && upp->ptr == NULL) { uh->last += upp->len; TAILQ_REMOVE(&uh->head, upp, list); delete_unr(uh, upp); if (up == upp) up = NULL; } /* Try to make bitmaps */ while (optimize_unr(uh)) continue; } /* * Allocate a free unr. */ int alloc_unrl(struct unrhdr *uh) { struct unr *up; struct unrb *ub; u_int x; int y; mtx_assert(uh->mtx, MA_OWNED); check_unrhdr(uh, __LINE__); x = uh->low + uh->first; up = TAILQ_FIRST(&uh->head); /* * If we have an ideal split, just adjust the first+last */ if (up == NULL && uh->last > 0) { uh->first++; uh->last--; uh->busy++; return (x); } /* * We can always allocate from the first list element, so if we have * nothing on the list, we must have run out of unit numbers. */ if (up == NULL) return (-1); KASSERT(up->ptr != uh, ("UNR first element is allocated")); if (up->ptr == NULL) { /* free run */ uh->first++; up->len--; } else { /* bitmap */ ub = up->ptr; - KASSERT(ub->busy < up->len, ("UNR bitmap confusion")); bit_ffc(ub->map, up->len, &y); KASSERT(y != -1, ("UNR corruption: No clear bit in bitmap.")); bit_set(ub->map, y); - ub->busy++; x += y; } uh->busy++; collapse_unr(uh, up); return (x); } int alloc_unr(struct unrhdr *uh) { int i; mtx_lock(uh->mtx); i = alloc_unrl(uh); clean_unrhdrl(uh); mtx_unlock(uh->mtx); return (i); } static int alloc_unr_specificl(struct unrhdr *uh, u_int item, void **p1, void **p2) { struct unr *up, *upn; struct unrb *ub; u_int i, last, tl; mtx_assert(uh->mtx, MA_OWNED); if (item < uh->low + uh->first || item > uh->high) return (-1); up = TAILQ_FIRST(&uh->head); /* Ideal split. */ if (up == NULL && item - uh->low == uh->first) { uh->first++; uh->last--; uh->busy++; check_unrhdr(uh, __LINE__); return (item); } i = item - uh->low - uh->first; if (up == NULL) { up = new_unr(uh, p1, p2); up->ptr = NULL; up->len = i; TAILQ_INSERT_TAIL(&uh->head, up, list); up = new_unr(uh, p1, p2); up->ptr = uh; up->len = 1; TAILQ_INSERT_TAIL(&uh->head, up, list); uh->last = uh->high - uh->low - i; uh->busy++; check_unrhdr(uh, __LINE__); return (item); } else { /* Find the item which contains the unit we want to allocate. 
*/ TAILQ_FOREACH(up, &uh->head, list) { if (up->len > i) break; i -= up->len; } } if (up == NULL) { if (i > 0) { up = new_unr(uh, p1, p2); up->ptr = NULL; up->len = i; TAILQ_INSERT_TAIL(&uh->head, up, list); } up = new_unr(uh, p1, p2); up->ptr = uh; up->len = 1; TAILQ_INSERT_TAIL(&uh->head, up, list); goto done; } if (is_bitmap(uh, up)) { ub = up->ptr; if (bit_test(ub->map, i) == 0) { bit_set(ub->map, i); - ub->busy++; goto done; } else return (-1); } else if (up->ptr == uh) return (-1); KASSERT(up->ptr == NULL, ("alloc_unr_specificl: up->ptr != NULL (up=%p)", up)); /* Split off the tail end, if any. */ tl = up->len - (1 + i); if (tl > 0) { upn = new_unr(uh, p1, p2); upn->ptr = NULL; upn->len = tl; TAILQ_INSERT_AFTER(&uh->head, up, upn, list); } /* Split off head end, if any */ if (i > 0) { upn = new_unr(uh, p1, p2); upn->len = i; upn->ptr = NULL; TAILQ_INSERT_BEFORE(up, upn, list); } up->len = 1; up->ptr = uh; done: last = uh->high - uh->low - (item - uh->low); if (uh->last > last) uh->last = last; uh->busy++; collapse_unr(uh, up); check_unrhdr(uh, __LINE__); return (item); } int alloc_unr_specific(struct unrhdr *uh, u_int item) { void *p1, *p2; int i; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "alloc_unr_specific"); p1 = Malloc(sizeof(struct unr)); p2 = Malloc(sizeof(struct unr)); mtx_lock(uh->mtx); i = alloc_unr_specificl(uh, item, &p1, &p2); mtx_unlock(uh->mtx); if (p1 != NULL) Free(p1); if (p2 != NULL) Free(p2); return (i); } /* * Free a unr. * * If we can save unrs by using a bitmap, do so. */ static void free_unrl(struct unrhdr *uh, u_int item, void **p1, void **p2) { struct unr *up, *upp, *upn; struct unrb *ub; u_int pl; KASSERT(item >= uh->low && item <= uh->high, ("UNR: free_unr(%u) out of range [%u...%u]", item, uh->low, uh->high)); check_unrhdr(uh, __LINE__); item -= uh->low; upp = TAILQ_FIRST(&uh->head); /* * Freeing in the ideal split case */ if (item + 1 == uh->first && upp == NULL) { uh->last++; uh->first--; uh->busy--; check_unrhdr(uh, __LINE__); return; } /* * Freeing in the ->first section. Create a run starting at the * freed item. The code below will subdivide it. */ if (item < uh->first) { up = new_unr(uh, p1, p2); up->ptr = uh; up->len = uh->first - item; TAILQ_INSERT_HEAD(&uh->head, up, list); uh->first -= up->len; } item -= uh->first; /* Find the item which contains the unit we want to free */ TAILQ_FOREACH(up, &uh->head, list) { if (up->len > item) break; item -= up->len; } /* Handle bitmap items */ if (is_bitmap(uh, up)) { ub = up->ptr; KASSERT(bit_test(ub->map, item) != 0, ("UNR: Freeing free item %d (bitmap)\n", item)); bit_clear(ub->map, item); uh->busy--; - ub->busy--; collapse_unr(uh, up); return; } KASSERT(up->ptr == uh, ("UNR Freeing free item %d (run))\n", item)); /* Just this one left, reap it */ if (up->len == 1) { up->ptr = NULL; uh->busy--; collapse_unr(uh, up); return; } /* Check if we can shift the item into the previous 'free' run */ upp = TAILQ_PREV(up, unrhd, list); if (item == 0 && upp != NULL && upp->ptr == NULL) { upp->len++; up->len--; uh->busy--; collapse_unr(uh, up); return; } /* Check if we can shift the item to the next 'free' run */ upn = TAILQ_NEXT(up, list); if (item == up->len - 1 && upn != NULL && upn->ptr == NULL) { upn->len++; up->len--; uh->busy--; collapse_unr(uh, up); return; } /* Split off the tail end, if any. 
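Editor's note: alloc_unr_specific() above (and free_unr() just below) pre-allocates the two struct unr chunks it might need with a sleepable Malloc() before taking the mutex, hands them to the locked worker, and frees whatever was not consumed. A hedged sketch of that pre-allocation pattern with illustrative names:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>

static MALLOC_DEFINE(M_EXSPARE, "exspare", "pre-allocation pattern sketch");

struct example_node { int payload; };

/* Locked worker: may consume *spare (setting it to NULL), never sleeps. */
static void
example_locked_op(struct mtx *lock, struct example_node **spare)
{
	mtx_assert(lock, MA_OWNED);
	/* ... use or ignore *spare here ... */
}

static void
example_op(struct mtx *lock)
{
	struct example_node *spare;

	/* The sleepable allocation happens before the mutex is taken. */
	spare = malloc(sizeof(*spare), M_EXSPARE, M_WAITOK | M_ZERO);

	mtx_lock(lock);
	example_locked_op(lock, &spare);
	mtx_unlock(lock);

	/* Return whatever the locked path did not consume. */
	if (spare != NULL)
		free(spare, M_EXSPARE);
}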
*/ pl = up->len - (1 + item); if (pl > 0) { upp = new_unr(uh, p1, p2); upp->ptr = uh; upp->len = pl; TAILQ_INSERT_AFTER(&uh->head, up, upp, list); } /* Split off head end, if any */ if (item > 0) { upp = new_unr(uh, p1, p2); upp->len = item; upp->ptr = uh; TAILQ_INSERT_BEFORE(up, upp, list); } up->len = 1; up->ptr = NULL; uh->busy--; collapse_unr(uh, up); } void free_unr(struct unrhdr *uh, u_int item) { void *p1, *p2; WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "free_unr"); p1 = Malloc(sizeof(struct unr)); p2 = Malloc(sizeof(struct unr)); mtx_lock(uh->mtx); free_unrl(uh, item, &p1, &p2); clean_unrhdrl(uh); mtx_unlock(uh->mtx); if (p1 != NULL) Free(p1); if (p2 != NULL) Free(p2); } #ifndef _KERNEL /* USERLAND test driver */ /* * Simple stochastic test driver for the above functions. The code resides * here so that it can access static functions and structures. */ static bool verbose; #define VPRINTF(...) {if (verbose) printf(__VA_ARGS__);} static void print_unr(struct unrhdr *uh, struct unr *up) { u_int x; struct unrb *ub; printf(" %p len = %5u ", up, up->len); if (up->ptr == NULL) printf("free\n"); else if (up->ptr == uh) printf("alloc\n"); else { ub = up->ptr; - printf("bitmap(%d) [", ub->busy); + printf("bitmap ["); for (x = 0; x < up->len; x++) { if (bit_test(ub->map, x)) printf("#"); else printf(" "); } printf("]\n"); } } static void print_unrhdr(struct unrhdr *uh) { struct unr *up; u_int x; printf( "%p low = %u high = %u first = %u last = %u busy %u chunks = %u\n", uh, uh->low, uh->high, uh->first, uh->last, uh->busy, uh->alloc); x = uh->low + uh->first; TAILQ_FOREACH(up, &uh->head, list) { printf(" from = %5u", x); print_unr(uh, up); if (up->ptr == NULL || up->ptr == uh) x += up->len; else x += NBITS; } } static void test_alloc_unr(struct unrhdr *uh, u_int i, char a[]) { int j; if (a[i]) { VPRINTF("F %u\n", i); free_unr(uh, i); a[i] = 0; } else { no_alloc = 1; j = alloc_unr(uh); if (j != -1) { a[j] = 1; VPRINTF("A %d\n", j); } no_alloc = 0; } } static void test_alloc_unr_specific(struct unrhdr *uh, u_int i, char a[]) { int j; j = alloc_unr_specific(uh, i); if (j == -1) { VPRINTF("F %u\n", i); a[i] = 0; free_unr(uh, i); } else { a[i] = 1; VPRINTF("A %d\n", j); } } static void usage(char** argv) { printf("%s [-h] [-r REPETITIONS] [-v]\n", argv[0]); } int main(int argc, char **argv) { struct unrhdr *uh; char *a; long count = 10000; /* Number of unrs to test */ long reps = 1; int ch; u_int i, x, m, j; verbose = false; while ((ch = getopt(argc, argv, "hr:v")) != -1) { switch (ch) { case 'r': errno = 0; reps = strtol(optarg, NULL, 0); if (errno == ERANGE || errno == EINVAL) { usage(argv); exit(2); } break; case 'v': verbose = true; break; case 'h': default: usage(argv); exit(2); } } setbuf(stdout, NULL); uh = new_unrhdr(0, count - 1, NULL); print_unrhdr(uh); a = calloc(count, sizeof(char)); if (a == NULL) err(1, "calloc failed"); srandomdev(); printf("sizeof(struct unr) %zu\n", sizeof(struct unr)); printf("sizeof(struct unrb) %zu\n", sizeof(struct unrb)); printf("sizeof(struct unrhdr) %zu\n", sizeof(struct unrhdr)); - printf("NBITS %d\n", NBITS); + printf("NBITS %lu\n", NBITS); x = 1; for (m = 0; m < count * reps; m++) { j = random(); i = (j >> 1) % count; #if 0 if (a[i] && (j & 1)) continue; #endif if ((random() & 1) != 0) test_alloc_unr(uh, i, a); else test_alloc_unr_specific(uh, i, a); if (verbose) print_unrhdr(uh); check_unrhdr(uh, __LINE__); } for (i = 0; i < count; i++) { if (a[i]) { if (verbose) { printf("C %u\n", i); print_unrhdr(uh); } free_unr(uh, i); } } print_unrhdr(uh); 
delete_unrhdr(uh); free(a); return (0); } #endif Index: head/sys/net/flowtable.c =================================================================== --- head/sys/net/flowtable.c (revision 299089) +++ head/sys/net/flowtable.c (revision 299090) @@ -1,1194 +1,1189 @@ /*- * Copyright (c) 2014 Gleb Smirnoff * Copyright (c) 2008-2010, BitGravity Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Neither the name of the BitGravity Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "opt_route.h" #include "opt_mpath.h" #include "opt_ddb.h" #include "opt_inet.h" #include "opt_inet6.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #endif #ifdef FLOWTABLE_HASH_ALL #include #include #include #endif #include #ifdef FLOWTABLE_HASH_ALL #define KEY_PORTS (sizeof(uint16_t) * 2) #define KEY_ADDRS 2 #else #define KEY_PORTS 0 #define KEY_ADDRS 1 #endif #ifdef INET6 #define KEY_ADDR_LEN sizeof(struct in6_addr) #else #define KEY_ADDR_LEN sizeof(struct in_addr) #endif #define KEYLEN ((KEY_ADDR_LEN * KEY_ADDRS + KEY_PORTS) / sizeof(uint32_t)) struct flentry { uint32_t f_hash; /* hash flowing forward */ uint32_t f_key[KEYLEN]; /* address(es and ports) */ uint32_t f_uptime; /* uptime at last access */ uint16_t f_fibnum; /* fib index */ #ifdef FLOWTABLE_HASH_ALL uint8_t f_proto; /* protocol */ uint8_t f_flags; /* stale? */ #define FL_STALE 1 #endif SLIST_ENTRY(flentry) f_next; /* pointer to collision entry */ struct rtentry *f_rt; /* rtentry for flow */ struct llentry *f_lle; /* llentry for flow */ }; #undef KEYLEN SLIST_HEAD(flist, flentry); /* Make sure we can use pcpu_zone_ptr for struct flist. */ CTASSERT(sizeof(struct flist) == sizeof(void *)); struct flowtable { counter_u64_t *ft_stat; int ft_size; /* * ft_table is a malloc(9)ed array of pointers. Pointers point to * memory from UMA_ZONE_PCPU zone. * ft_masks is per-cpu pointer itself. Each instance points * to a malloc(9)ed bitset, that is private to corresponding CPU. 
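Editor's note: the KEY_* arithmetic above sizes f_key[] in 32-bit words; it is worth confirming that it matches the key buffers the lookup routines below actually build (key[3] for IPv4 with FLOWTABLE_HASH_ALL, key[4] and key[9] for IPv6). A small standalone check of those combinations:

#include <sys/types.h>
#include <netinet/in.h>		/* struct in_addr, struct in6_addr */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Words of key per configuration: (addr_len * naddrs + port_bytes) / 4. */
static size_t
keylen_words(size_t addr_len, unsigned naddrs, size_t port_bytes)
{
	return ((addr_len * naddrs + port_bytes) / sizeof(uint32_t));
}

int
main(void)
{
	/* INET, destination-only hashing: 1 word (lookup passes &ip_dst). */
	assert(keylen_words(sizeof(struct in_addr), 1, 0) == 1);
	/* INET, FLOWTABLE_HASH_ALL: dst + src + ports = 3 words (key[3]). */
	assert(keylen_words(sizeof(struct in_addr), 2, 4) == 3);
	/* INET6, destination-only hashing: 4 words (key[4]). */
	assert(keylen_words(sizeof(struct in6_addr), 1, 0) == 4);
	/* INET6, FLOWTABLE_HASH_ALL: dst + src + ports = 9 words (key[9]). */
	assert(keylen_words(sizeof(struct in6_addr), 2, 4) == 9);
	printf("key sizes match the lookup buffers\n");
	return (0);
}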
*/ struct flist **ft_table; bitstr_t **ft_masks; bitstr_t *ft_tmpmask; }; #define FLOWSTAT_ADD(ft, name, v) \ counter_u64_add((ft)->ft_stat[offsetof(struct flowtable_stat, name) / sizeof(uint64_t)], (v)) #define FLOWSTAT_INC(ft, name) FLOWSTAT_ADD(ft, name, 1) static struct proc *flowcleanerproc; static uint32_t flow_hashjitter; static struct cv flowclean_f_cv; static struct cv flowclean_c_cv; static struct mtx flowclean_lock; static uint32_t flowclean_cycles; /* * TODO: * - add sysctls to resize && flush flow tables * - Add per flowtable sysctls for statistics and configuring timeouts * - add saturation counter to rtentry to support per-packet load-balancing * add flag to indicate round-robin flow, add list lookup from head for flows * - add sysctl / device node / syscall to support exporting and importing * of flows with flag to indicate that a flow was imported so should * not be considered for auto-cleaning * - support explicit connection state (currently only ad-hoc for DSR) * - idetach() cleanup for options VIMAGE builds. */ #ifdef INET static VNET_DEFINE(struct flowtable, ip4_ft); #define V_ip4_ft VNET(ip4_ft) #endif #ifdef INET6 static VNET_DEFINE(struct flowtable, ip6_ft); #define V_ip6_ft VNET(ip6_ft) #endif static uma_zone_t flow_zone; static VNET_DEFINE(int, flowtable_enable) = 1; #define V_flowtable_enable VNET(flowtable_enable) static SYSCTL_NODE(_net, OID_AUTO, flowtable, CTLFLAG_RD, NULL, "flowtable"); SYSCTL_INT(_net_flowtable, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(flowtable_enable), 0, "enable flowtable caching."); SYSCTL_UMA_MAX(_net_flowtable, OID_AUTO, maxflows, CTLFLAG_RW, &flow_zone, "Maximum number of flows allowed"); static MALLOC_DEFINE(M_FTABLE, "flowtable", "flowtable hashes and bitstrings"); static struct flentry * flowtable_lookup_common(struct flowtable *, uint32_t *, int, uint32_t); #ifdef INET static struct flentry * flowtable_lookup_ipv4(struct mbuf *m, struct route *ro) { struct flentry *fle; struct sockaddr_in *sin; struct ip *ip; uint32_t fibnum; #ifdef FLOWTABLE_HASH_ALL uint32_t key[3]; int iphlen; uint16_t sport, dport; uint8_t proto; #endif ip = mtod(m, struct ip *); if (ip->ip_src.s_addr == ip->ip_dst.s_addr || (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET || (ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) return (NULL); fibnum = M_GETFIB(m); #ifdef FLOWTABLE_HASH_ALL iphlen = ip->ip_hl << 2; proto = ip->ip_p; switch (proto) { case IPPROTO_TCP: { struct tcphdr *th; th = (struct tcphdr *)((char *)ip + iphlen); sport = th->th_sport; dport = th->th_dport; if (th->th_flags & (TH_RST|TH_FIN)) fibnum |= (FL_STALE << 24); break; } case IPPROTO_UDP: { struct udphdr *uh; uh = (struct udphdr *)((char *)ip + iphlen); sport = uh->uh_sport; dport = uh->uh_dport; break; } case IPPROTO_SCTP: { struct sctphdr *sh; sh = (struct sctphdr *)((char *)ip + iphlen); sport = sh->src_port; dport = sh->dest_port; /* XXXGL: handle stale? 
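Editor's note: FLOWSTAT_ADD() above converts a field name of struct flowtable_stat into an index into the counter array by dividing its offsetof() by sizeof(uint64_t). A tiny standalone illustration of that trick, with plain uint64_t counters standing in for counter(9):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for struct flowtable_stat: consecutive uint64_t-sized fields. */
struct example_stat {
	uint64_t	lookups;
	uint64_t	hits;
	uint64_t	misses;
};

/* Name-to-slot translation, as FLOWSTAT_ADD() does with counter_u64_add(). */
#define	EXAMPLE_STAT_ADD(array, name, v) \
	((array)[offsetof(struct example_stat, name) / sizeof(uint64_t)] += (v))

int
main(void)
{
	uint64_t stats[sizeof(struct example_stat) / sizeof(uint64_t)] = { 0 };

	EXAMPLE_STAT_ADD(stats, lookups, 1);
	EXAMPLE_STAT_ADD(stats, misses, 1);
	printf("lookups=%ju misses=%ju\n",
	    (uintmax_t)stats[0], (uintmax_t)stats[2]);
	return (0);
}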
*/ break; } default: sport = dport = 0; break; } key[0] = ip->ip_dst.s_addr; key[1] = ip->ip_src.s_addr; key[2] = (dport << 16) | sport; fibnum |= proto << 16; fle = flowtable_lookup_common(&V_ip4_ft, key, 3 * sizeof(uint32_t), fibnum); #else /* !FLOWTABLE_HASH_ALL */ fle = flowtable_lookup_common(&V_ip4_ft, (uint32_t *)&ip->ip_dst, sizeof(struct in_addr), fibnum); #endif /* FLOWTABLE_HASH_ALL */ if (fle == NULL) return (NULL); sin = (struct sockaddr_in *)&ro->ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = ip->ip_dst; return (fle); } #endif /* INET */ #ifdef INET6 /* * PULLUP_TO(len, p, T) makes sure that len + sizeof(T) is contiguous, * then it sets p to point at the offset "len" in the mbuf. WARNING: the * pointer might become stale after other pullups (but we never use it * this way). */ #define PULLUP_TO(_len, p, T) \ do { \ int x = (_len) + sizeof(T); \ if ((m)->m_len < x) \ return (NULL); \ p = (mtod(m, char *) + (_len)); \ } while (0) #define TCP(p) ((struct tcphdr *)(p)) #define SCTP(p) ((struct sctphdr *)(p)) #define UDP(p) ((struct udphdr *)(p)) static struct flentry * flowtable_lookup_ipv6(struct mbuf *m, struct route *ro) { struct flentry *fle; struct sockaddr_in6 *sin6; struct ip6_hdr *ip6; uint32_t fibnum; #ifdef FLOWTABLE_HASH_ALL uint32_t key[9]; void *ulp; int hlen; uint16_t sport, dport; u_short offset; uint8_t proto; #else uint32_t key[4]; #endif ip6 = mtod(m, struct ip6_hdr *); if (in6_localaddr(&ip6->ip6_dst)) return (NULL); fibnum = M_GETFIB(m); #ifdef FLOWTABLE_HASH_ALL hlen = sizeof(struct ip6_hdr); proto = ip6->ip6_nxt; offset = sport = dport = 0; ulp = NULL; while (ulp == NULL) { switch (proto) { case IPPROTO_ICMPV6: case IPPROTO_OSPFIGP: case IPPROTO_PIM: case IPPROTO_CARP: case IPPROTO_ESP: case IPPROTO_NONE: ulp = ip6; break; case IPPROTO_TCP: PULLUP_TO(hlen, ulp, struct tcphdr); dport = TCP(ulp)->th_dport; sport = TCP(ulp)->th_sport; if (TCP(ulp)->th_flags & (TH_RST|TH_FIN)) fibnum |= (FL_STALE << 24); break; case IPPROTO_SCTP: PULLUP_TO(hlen, ulp, struct sctphdr); dport = SCTP(ulp)->src_port; sport = SCTP(ulp)->dest_port; /* XXXGL: handle stale? 
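Editor's note: the IPv4 path above packs both (network-order) ports into one key word as (dport << 16) | sport and folds the protocol number into bits 16-23 of the fibnum argument, which flow_matches() later unpacks with (fibnum >> 16) & 0xff. A minimal round-trip check of that packing:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint16_t sport = 0x1234, dport = 0xabcd;	/* already in net order */
	uint8_t proto = 6;				/* IPPROTO_TCP */
	uint16_t fib = 3;
	uint32_t key_word, fibnum;

	/* Pack, as flowtable_lookup_ipv4() does. */
	key_word = ((uint32_t)dport << 16) | sport;
	fibnum = fib | ((uint32_t)proto << 16);

	/* Unpack, as flow_matches() does. */
	assert(((fibnum >> 16) & 0xff) == proto);
	assert((fibnum & 0xffff) == fib);
	assert((key_word >> 16) == dport);
	assert((key_word & 0xffff) == sport);
	printf("round trip ok\n");
	return (0);
}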
*/ break; case IPPROTO_UDP: PULLUP_TO(hlen, ulp, struct udphdr); dport = UDP(ulp)->uh_dport; sport = UDP(ulp)->uh_sport; break; case IPPROTO_HOPOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_ROUTING: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_rthdr); hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3; proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt; ulp = NULL; break; case IPPROTO_FRAGMENT: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_frag); hlen += sizeof (struct ip6_frag); proto = ((struct ip6_frag *)ulp)->ip6f_nxt; offset = ((struct ip6_frag *)ulp)->ip6f_offlg & IP6F_OFF_MASK; ulp = NULL; break; case IPPROTO_DSTOPTS: /* RFC 2460 */ PULLUP_TO(hlen, ulp, struct ip6_hbh); hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3; proto = ((struct ip6_hbh *)ulp)->ip6h_nxt; ulp = NULL; break; case IPPROTO_AH: /* RFC 2402 */ PULLUP_TO(hlen, ulp, struct ip6_ext); hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2; proto = ((struct ip6_ext *)ulp)->ip6e_nxt; ulp = NULL; break; default: PULLUP_TO(hlen, ulp, struct ip6_ext); break; } } bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); bcopy(&ip6->ip6_src, &key[4], sizeof(struct in6_addr)); key[8] = (dport << 16) | sport; fibnum |= proto << 16; fle = flowtable_lookup_common(&V_ip6_ft, key, 9 * sizeof(uint32_t), fibnum); #else /* !FLOWTABLE_HASH_ALL */ bcopy(&ip6->ip6_dst, &key[0], sizeof(struct in6_addr)); fle = flowtable_lookup_common(&V_ip6_ft, key, sizeof(struct in6_addr), fibnum); #endif /* FLOWTABLE_HASH_ALL */ if (fle == NULL) return (NULL); sin6 = (struct sockaddr_in6 *)&ro->ro_dst; sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); bcopy(&ip6->ip6_dst, &sin6->sin6_addr, sizeof(struct in6_addr)); return (fle); } #endif /* INET6 */ static bitstr_t * flowtable_mask(struct flowtable *ft) { /* * flowtable_free_stale() calls w/o critical section, but * with sched_bind(). Since pointer is stable throughout * ft lifetime, it is safe, otherwise... * * CRITICAL_ASSERT(curthread); */ return (*(bitstr_t **)zpcpu_get(ft->ft_masks)); } static struct flist * flowtable_list(struct flowtable *ft, uint32_t hash) { CRITICAL_ASSERT(curthread); return (zpcpu_get(ft->ft_table[hash % ft->ft_size])); } static int flow_stale(struct flowtable *ft, struct flentry *fle, int maxidle) { if (((fle->f_rt->rt_flags & RTF_UP) == 0) || (fle->f_rt->rt_ifp == NULL) || !RT_LINK_IS_UP(fle->f_rt->rt_ifp) || (fle->f_lle->la_flags & LLE_VALID) == 0) return (1); if (time_uptime - fle->f_uptime > maxidle) return (1); #ifdef FLOWTABLE_HASH_ALL if (fle->f_flags & FL_STALE) return (1); #endif return (0); } static int flow_full(void) { int count, max; count = uma_zone_get_cur(flow_zone); max = uma_zone_get_max(flow_zone); return (count > (max - (max >> 3))); } static int flow_matches(struct flentry *fle, uint32_t *key, int keylen, uint32_t fibnum) { #ifdef FLOWTABLE_HASH_ALL uint8_t proto; proto = (fibnum >> 16) & 0xff; fibnum &= 0xffff; #endif CRITICAL_ASSERT(curthread); /* Microoptimization for IPv4: don't use bcmp(). 
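Editor's note: flow_full() above treats the zone as full once the live flow count exceeds max - (max >> 3), i.e. 7/8 of the configured limit; the cleaner thread further below uses the same predicate to drop its idle threshold from 30 to 5 seconds and its wake interval from 20*hz to 4*hz. A one-line worked check of that threshold:

#include <assert.h>
#include <stdio.h>

int
main(void)
{
	int max = 1000;
	int threshold = max - (max >> 3);	/* 1000 - 125 = 875 */

	assert(threshold == 875);
	assert(!(875 > threshold));	/* 875 live flows: not yet "full" */
	assert(876 > threshold);	/* 876 live flows: considered full */
	printf("flowtable considered full above %d of %d flows\n",
	    threshold, max);
	return (0);
}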
*/ if (((keylen == sizeof(uint32_t) && (fle->f_key[0] == key[0])) || (bcmp(fle->f_key, key, keylen) == 0)) && fibnum == fle->f_fibnum && #ifdef FLOWTABLE_HASH_ALL proto == fle->f_proto && #endif (fle->f_rt->rt_flags & RTF_UP) && fle->f_rt->rt_ifp != NULL && (fle->f_lle->la_flags & LLE_VALID)) return (1); return (0); } static struct flentry * flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key, int keylen, uint32_t fibnum0) { #ifdef INET6 struct route_in6 sro6; #endif #ifdef INET struct route sro; #endif struct route *ro = NULL; struct rtentry *rt; struct lltable *lt = NULL; struct llentry *lle; struct sockaddr_storage *l3addr; struct ifnet *ifp; struct flist *flist; struct flentry *fle, *iter; bitstr_t *mask; uint16_t fibnum = fibnum0; #ifdef FLOWTABLE_HASH_ALL uint8_t proto; proto = (fibnum0 >> 16) & 0xff; fibnum = fibnum0 & 0xffff; #endif /* * This bit of code ends up locking the * same route 3 times (just like ip_output + ether_output) * - at lookup * - in rt_check when called by arpresolve * - dropping the refcount for the rtentry * * This could be consolidated to one if we wrote a variant * of arpresolve with an rt_check variant that expected to * receive the route locked */ #ifdef INET if (ft == &V_ip4_ft) { struct sockaddr_in *sin; ro = &sro; bzero(&sro.ro_dst, sizeof(sro.ro_dst)); sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr.s_addr = key[0]; } #endif #ifdef INET6 if (ft == &V_ip6_ft) { struct sockaddr_in6 *sin6; ro = (struct route *)&sro6; sin6 = &sro6.ro_dst; bzero(sin6, sizeof(*sin6)); sin6->sin6_family = AF_INET6; sin6->sin6_len = sizeof(*sin6); bcopy(key, &sin6->sin6_addr, sizeof(struct in6_addr)); } #endif ro->ro_rt = NULL; #ifdef RADIX_MPATH rtalloc_mpath_fib(ro, hash, fibnum); #else rtalloc_ign_fib(ro, 0, fibnum); #endif if (ro->ro_rt == NULL) return (NULL); rt = ro->ro_rt; ifp = rt->rt_ifp; if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) { RTFREE(rt); return (NULL); } #ifdef INET if (ft == &V_ip4_ft) lt = LLTABLE(ifp); #endif #ifdef INET6 if (ft == &V_ip6_ft) lt = LLTABLE6(ifp); #endif if (rt->rt_flags & RTF_GATEWAY) l3addr = (struct sockaddr_storage *)rt->rt_gateway; else l3addr = (struct sockaddr_storage *)&ro->ro_dst; lle = llentry_alloc(ifp, lt, l3addr); if (lle == NULL) { RTFREE(rt); return (NULL); } /* Don't insert the entry if the ARP hasn't yet finished resolving. */ if ((lle->la_flags & LLE_VALID) == 0) { RTFREE(rt); LLE_FREE(lle); FLOWSTAT_INC(ft, ft_fail_lle_invalid); return (NULL); } fle = uma_zalloc(flow_zone, M_NOWAIT | M_ZERO); if (fle == NULL) { RTFREE(rt); LLE_FREE(lle); return (NULL); } fle->f_hash = hash; bcopy(key, &fle->f_key, keylen); fle->f_rt = rt; fle->f_lle = lle; fle->f_fibnum = fibnum; fle->f_uptime = time_uptime; #ifdef FLOWTABLE_HASH_ALL fle->f_proto = proto; fle->f_flags = fibnum0 >> 24; #endif critical_enter(); mask = flowtable_mask(ft); flist = flowtable_list(ft, hash); if (SLIST_EMPTY(flist)) { bit_set(mask, (hash % ft->ft_size)); SLIST_INSERT_HEAD(flist, fle, f_next); goto skip; } /* * find end of list and make sure that we were not * preempted by another thread handling this flow */ SLIST_FOREACH(iter, flist, f_next) { KASSERT(iter->f_hash % ft->ft_size == hash % ft->ft_size, ("%s: wrong hash", __func__)); if (flow_matches(iter, key, keylen, fibnum)) { /* * We probably migrated to an other CPU after * lookup in flowtable_lookup_common() failed. * It appeared that this CPU already has flow * entry. 
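Editor's note: flowtable_insert() above sets a bit in the per-CPU mask when a bucket receives its first entry, and flowtable_free_stale() below walks a scratch copy of that mask with bit_ffs(3), clearing the real bit only when a bucket drains. A standalone sketch of that summary-bitmap-over-a-hash-table idea using the same bitstring(3) primitives; the bucket counts are illustrative.

#include <sys/types.h>
#include <bitstring.h>
#include <stdio.h>
#include <string.h>

#define	NBUCKETS	16

int
main(void)
{
	bitstr_t bit_decl(mask, NBUCKETS);	/* which buckets are non-empty */
	bitstr_t bit_decl(tmpmask, NBUCKETS);	/* scratch copy for one sweep */
	int bucket_len[NBUCKETS] = { 0 };
	int b;

	bit_nclear(mask, 0, NBUCKETS - 1);

	/* Insert: the first entry in a bucket sets that bucket's bit. */
	if (bucket_len[3]++ == 0)
		bit_set(mask, 3);
	if (bucket_len[9]++ == 0)
		bit_set(mask, 9);
	bucket_len[9]++;			/* second entry; bit already set */

	/* Sweep only the non-empty buckets, as flowtable_free_stale() does. */
	memcpy(tmpmask, mask, bitstr_size(NBUCKETS));
	bit_ffs(tmpmask, NBUCKETS, &b);
	while (b != -1) {
		printf("visiting bucket %d with %d entries\n", b, bucket_len[b]);
		bucket_len[b]--;		/* pretend one entry was stale */
		if (bucket_len[b] == 0)
			bit_clear(mask, b);	/* bucket drained for real */
		bit_clear(tmpmask, b);		/* never revisit this sweep */
		bit_ffs(tmpmask, NBUCKETS, &b);
	}

	/* Bucket 9 still holds an entry, so its bit survives the sweep. */
	bit_ffs(mask, NBUCKETS, &b);
	printf("first still-populated bucket after sweep: %d\n", b);
	return (0);
}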
*/ iter->f_uptime = time_uptime; #ifdef FLOWTABLE_HASH_ALL iter->f_flags |= fibnum >> 24; #endif critical_exit(); FLOWSTAT_INC(ft, ft_collisions); uma_zfree(flow_zone, fle); return (iter); } } SLIST_INSERT_HEAD(flist, fle, f_next); skip: critical_exit(); FLOWSTAT_INC(ft, ft_inserts); return (fle); } int flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro) { struct flentry *fle; struct llentry *lle; if (V_flowtable_enable == 0) return (ENXIO); switch (sa) { #ifdef INET case AF_INET: fle = flowtable_lookup_ipv4(m, ro); break; #endif #ifdef INET6 case AF_INET6: fle = flowtable_lookup_ipv6(m, ro); break; #endif default: panic("%s: sa %d", __func__, sa); } if (fle == NULL) return (EHOSTUNREACH); if (M_HASHTYPE_GET(m) == M_HASHTYPE_NONE) { M_HASHTYPE_SET(m, M_HASHTYPE_OPAQUE); m->m_pkthdr.flowid = fle->f_hash; } ro->ro_rt = fle->f_rt; ro->ro_flags |= RT_NORTREF; lle = fle->f_lle; if (lle != NULL && (lle->la_flags & LLE_VALID)) { ro->ro_prepend = lle->r_linkdata; ro->ro_plen = lle->r_hdrlen; ro->ro_flags |= RT_MAY_LOOP; if (lle->la_flags & LLE_IFADDR) ro->ro_flags |= RT_L2_ME; } return (0); } static struct flentry * flowtable_lookup_common(struct flowtable *ft, uint32_t *key, int keylen, uint32_t fibnum) { struct flist *flist; struct flentry *fle; uint32_t hash; FLOWSTAT_INC(ft, ft_lookups); hash = jenkins_hash32(key, keylen / sizeof(uint32_t), flow_hashjitter); critical_enter(); flist = flowtable_list(ft, hash); SLIST_FOREACH(fle, flist, f_next) { KASSERT(fle->f_hash % ft->ft_size == hash % ft->ft_size, ("%s: wrong hash", __func__)); if (flow_matches(fle, key, keylen, fibnum)) { fle->f_uptime = time_uptime; #ifdef FLOWTABLE_HASH_ALL fle->f_flags |= fibnum >> 24; #endif critical_exit(); FLOWSTAT_INC(ft, ft_hits); return (fle); } } critical_exit(); FLOWSTAT_INC(ft, ft_misses); return (flowtable_insert(ft, hash, key, keylen, fibnum)); } -/* - * used by the bit_alloc macro - */ -#define calloc(count, size) malloc((count)*(size), M_FTABLE, M_WAITOK | M_ZERO) static void flowtable_alloc(struct flowtable *ft) { ft->ft_table = malloc(ft->ft_size * sizeof(struct flist), M_FTABLE, M_WAITOK); for (int i = 0; i < ft->ft_size; i++) ft->ft_table[i] = uma_zalloc(pcpu_zone_ptr, M_WAITOK | M_ZERO); ft->ft_masks = uma_zalloc(pcpu_zone_ptr, M_WAITOK); for (int i = 0; i < mp_ncpus; i++) { bitstr_t **b; b = zpcpu_get_cpu(ft->ft_masks, i); - *b = bit_alloc(ft->ft_size); + *b = bit_alloc(ft->ft_size, M_FTABLE, M_WAITOK); } - ft->ft_tmpmask = bit_alloc(ft->ft_size); + ft->ft_tmpmask = bit_alloc(ft->ft_size, M_FTABLE, M_WAITOK); } -#undef calloc static void flowtable_free_stale(struct flowtable *ft, struct rtentry *rt, int maxidle) { struct flist *flist, freelist; struct flentry *fle, *fle1, *fleprev; bitstr_t *mask, *tmpmask; int curbit, tmpsize; SLIST_INIT(&freelist); mask = flowtable_mask(ft); tmpmask = ft->ft_tmpmask; tmpsize = ft->ft_size; memcpy(tmpmask, mask, ft->ft_size/8); curbit = 0; fleprev = NULL; /* pacify gcc */ /* * XXX Note to self, bit_ffs operates at the byte level * and thus adds gratuitous overhead */ bit_ffs(tmpmask, ft->ft_size, &curbit); while (curbit != -1) { if (curbit >= ft->ft_size || curbit < -1) { log(LOG_ALERT, "warning: bad curbit value %d \n", curbit); break; } FLOWSTAT_INC(ft, ft_free_checks); critical_enter(); flist = flowtable_list(ft, curbit); #ifdef DIAGNOSTIC if (SLIST_EMPTY(flist) && curbit > 0) { log(LOG_ALERT, "warning bit=%d set, but no fle found\n", curbit); } #endif SLIST_FOREACH_SAFE(fle, flist, f_next, fle1) { if (rt != NULL && fle->f_rt != rt) { fleprev = fle; 
continue; } if (!flow_stale(ft, fle, maxidle)) { fleprev = fle; continue; } if (fle == SLIST_FIRST(flist)) SLIST_REMOVE_HEAD(flist, f_next); else SLIST_REMOVE_AFTER(fleprev, f_next); SLIST_INSERT_HEAD(&freelist, fle, f_next); } if (SLIST_EMPTY(flist)) bit_clear(mask, curbit); critical_exit(); bit_clear(tmpmask, curbit); bit_ffs(tmpmask, tmpsize, &curbit); } SLIST_FOREACH_SAFE(fle, &freelist, f_next, fle1) { FLOWSTAT_INC(ft, ft_frees); if (fle->f_rt != NULL) RTFREE(fle->f_rt); if (fle->f_lle != NULL) LLE_FREE(fle->f_lle); uma_zfree(flow_zone, fle); } } static void flowtable_clean_vnet(struct flowtable *ft, struct rtentry *rt, int maxidle) { int i; CPU_FOREACH(i) { if (smp_started == 1) { thread_lock(curthread); sched_bind(curthread, i); thread_unlock(curthread); } flowtable_free_stale(ft, rt, maxidle); if (smp_started == 1) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } } } void flowtable_route_flush(sa_family_t sa, struct rtentry *rt) { struct flowtable *ft; switch (sa) { #ifdef INET case AF_INET: ft = &V_ip4_ft; break; #endif #ifdef INET6 case AF_INET6: ft = &V_ip6_ft; break; #endif default: panic("%s: sa %d", __func__, sa); } flowtable_clean_vnet(ft, rt, 0); } static void flowtable_cleaner(void) { VNET_ITERATOR_DECL(vnet_iter); struct thread *td; if (bootverbose) log(LOG_INFO, "flowtable cleaner started\n"); td = curthread; while (1) { uint32_t flowclean_freq, maxidle; /* * The maximum idle time, as well as frequency are arbitrary. */ if (flow_full()) maxidle = 5; else maxidle = 30; VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); #ifdef INET flowtable_clean_vnet(&V_ip4_ft, NULL, maxidle); #endif #ifdef INET6 flowtable_clean_vnet(&V_ip6_ft, NULL, maxidle); #endif CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); if (flow_full()) flowclean_freq = 4*hz; else flowclean_freq = 20*hz; mtx_lock(&flowclean_lock); thread_lock(td); sched_prio(td, PPAUSE); thread_unlock(td); flowclean_cycles++; cv_broadcast(&flowclean_f_cv); cv_timedwait(&flowclean_c_cv, &flowclean_lock, flowclean_freq); mtx_unlock(&flowclean_lock); } } static void flowtable_flush(void *unused __unused) { uint64_t start; mtx_lock(&flowclean_lock); start = flowclean_cycles; while (start == flowclean_cycles) { cv_broadcast(&flowclean_c_cv); cv_wait(&flowclean_f_cv, &flowclean_lock); } mtx_unlock(&flowclean_lock); } static struct kproc_desc flow_kp = { "flowcleaner", flowtable_cleaner, &flowcleanerproc }; SYSINIT(flowcleaner, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, kproc_start, &flow_kp); static int flowtable_get_size(char *name) { int size; if (TUNABLE_INT_FETCH(name, &size)) { if (size < 256) size = 256; if (!powerof2(size)) { printf("%s must be power of 2\n", name); size = 2048; } } else { /* * round up to the next power of 2 */ size = 1 << fls((1024 + maxusers * 64) - 1); } return (size); } static void flowtable_init(const void *unused __unused) { flow_hashjitter = arc4random(); flow_zone = uma_zcreate("flows", sizeof(struct flentry), NULL, NULL, NULL, NULL, (64-1), UMA_ZONE_MAXBUCKET); uma_zone_set_max(flow_zone, 1024 + maxusers * 64 * mp_ncpus); cv_init(&flowclean_c_cv, "c_flowcleanwait"); cv_init(&flowclean_f_cv, "f_flowcleanwait"); mtx_init(&flowclean_lock, "flowclean lock", NULL, MTX_DEF); EVENTHANDLER_REGISTER(ifnet_departure_event, flowtable_flush, NULL, EVENTHANDLER_PRI_ANY); } SYSINIT(flowtable_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, flowtable_init, NULL); #ifdef INET static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip4, CTLFLAG_RD, NULL, "Flowtable for IPv4"); static 
VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip4_ftstat); VNET_PCPUSTAT_SYSINIT(ip4_ftstat); VNET_PCPUSTAT_SYSUNINIT(ip4_ftstat); SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip4, OID_AUTO, stat, struct flowtable_stat, ip4_ftstat, "Flowtable statistics for IPv4 " "(struct flowtable_stat, net/flowtable.h)"); static void flowtable_init_vnet_v4(const void *unused __unused) { V_ip4_ft.ft_size = flowtable_get_size("net.flowtable.ip4.size"); V_ip4_ft.ft_stat = VNET(ip4_ftstat); flowtable_alloc(&V_ip4_ft); } VNET_SYSINIT(ft_vnet_v4, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, flowtable_init_vnet_v4, NULL); #endif /* INET */ #ifdef INET6 static SYSCTL_NODE(_net_flowtable, OID_AUTO, ip6, CTLFLAG_RD, NULL, "Flowtable for IPv6"); static VNET_PCPUSTAT_DEFINE(struct flowtable_stat, ip6_ftstat); VNET_PCPUSTAT_SYSINIT(ip6_ftstat); VNET_PCPUSTAT_SYSUNINIT(ip6_ftstat); SYSCTL_VNET_PCPUSTAT(_net_flowtable_ip6, OID_AUTO, stat, struct flowtable_stat, ip6_ftstat, "Flowtable statistics for IPv6 " "(struct flowtable_stat, net/flowtable.h)"); static void flowtable_init_vnet_v6(const void *unused __unused) { V_ip6_ft.ft_size = flowtable_get_size("net.flowtable.ip6.size"); V_ip6_ft.ft_stat = VNET(ip6_ftstat); flowtable_alloc(&V_ip6_ft); } VNET_SYSINIT(flowtable_init_vnet_v6, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, flowtable_init_vnet_v6, NULL); #endif /* INET6 */ #ifdef DDB static bitstr_t * flowtable_mask_pcpu(struct flowtable *ft, int cpuid) { return (zpcpu_get_cpu(*ft->ft_masks, cpuid)); } static struct flist * flowtable_list_pcpu(struct flowtable *ft, uint32_t hash, int cpuid) { return (zpcpu_get_cpu(&ft->ft_table[hash % ft->ft_size], cpuid)); } static void flow_show(struct flowtable *ft, struct flentry *fle) { int idle_time; int rt_valid, ifp_valid; volatile struct rtentry *rt; struct ifnet *ifp = NULL; uint32_t *hashkey = fle->f_key; idle_time = (int)(time_uptime - fle->f_uptime); rt = fle->f_rt; rt_valid = rt != NULL; if (rt_valid) ifp = rt->rt_ifp; ifp_valid = ifp != NULL; #ifdef INET if (ft == &V_ip4_ft) { char daddr[4*sizeof "123"]; #ifdef FLOWTABLE_HASH_ALL char saddr[4*sizeof "123"]; uint16_t sport, dport; #endif inet_ntoa_r(*(struct in_addr *) &hashkey[0], daddr); #ifdef FLOWTABLE_HASH_ALL inet_ntoa_r(*(struct in_addr *) &hashkey[1], saddr); dport = ntohs((uint16_t)(hashkey[2] >> 16)); sport = ntohs((uint16_t)(hashkey[2] & 0xffff)); db_printf("%s:%d->%s:%d", saddr, sport, daddr, dport); #else db_printf("%s ", daddr); #endif } #endif /* INET */ #ifdef INET6 if (ft == &V_ip6_ft) { #ifdef FLOWTABLE_HASH_ALL db_printf("\n\tkey=%08x:%08x:%08x%08x:%08x:%08x%08x:%08x:%08x", hashkey[0], hashkey[1], hashkey[2], hashkey[3], hashkey[4], hashkey[5], hashkey[6], hashkey[7], hashkey[8]); #else db_printf("\n\tkey=%08x:%08x:%08x ", hashkey[0], hashkey[1], hashkey[2]); #endif } #endif /* INET6 */ db_printf("hash=%08x idle_time=%03d" "\n\tfibnum=%02d rt=%p", fle->f_hash, idle_time, fle->f_fibnum, fle->f_rt); #ifdef FLOWTABLE_HASH_ALL if (fle->f_flags & FL_STALE) db_printf(" FL_STALE "); #endif if (rt_valid) { if (rt->rt_flags & RTF_UP) db_printf(" RTF_UP "); } if (ifp_valid) { if (ifp->if_flags & IFF_LOOPBACK) db_printf(" IFF_LOOPBACK "); if (ifp->if_flags & IFF_UP) db_printf(" IFF_UP "); if (ifp->if_flags & IFF_POINTOPOINT) db_printf(" IFF_POINTOPOINT "); } db_printf("\n"); } static void flowtable_show(struct flowtable *ft, int cpuid) { int curbit = 0; bitstr_t *mask, *tmpmask; if (cpuid != -1) db_printf("cpu: %d\n", cpuid); mask = flowtable_mask_pcpu(ft, cpuid); tmpmask = ft->ft_tmpmask; memcpy(tmpmask, mask, ft->ft_size/8); 
/* * XXX Note to self, bit_ffs operates at the byte level * and thus adds gratuitous overhead */ bit_ffs(tmpmask, ft->ft_size, &curbit); while (curbit != -1) { struct flist *flist; struct flentry *fle; if (curbit >= ft->ft_size || curbit < -1) { db_printf("warning: bad curbit value %d \n", curbit); break; } flist = flowtable_list_pcpu(ft, curbit, cpuid); SLIST_FOREACH(fle, flist, f_next) flow_show(ft, fle); bit_clear(tmpmask, curbit); bit_ffs(tmpmask, ft->ft_size, &curbit); } } static void flowtable_show_vnet(struct flowtable *ft) { int i; CPU_FOREACH(i) flowtable_show(ft, i); } DB_SHOW_COMMAND(flowtables, db_show_flowtables) { VNET_ITERATOR_DECL(vnet_iter); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); #ifdef VIMAGE db_printf("vnet %p\n", vnet_iter); #endif #ifdef INET printf("IPv4:\n"); flowtable_show_vnet(&V_ip4_ft); #endif #ifdef INET6 printf("IPv6:\n"); flowtable_show_vnet(&V_ip6_ft); #endif CURVNET_RESTORE(); } } #endif Index: head/sys/sys/bitstring.h =================================================================== --- head/sys/sys/bitstring.h (revision 299089) +++ head/sys/sys/bitstring.h (revision 299090) @@ -1,146 +1,259 @@ /*- * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * Paul Vixie. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * + * Copyright (c) 2014 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * * $FreeBSD$ */ - #ifndef _SYS_BITSTRING_H_ #define _SYS_BITSTRING_H_ -typedef unsigned char bitstr_t; +#ifdef _KERNEL +#include +#include +#endif -/* internal macros */ - /* byte of the bitstring bit is in */ -#define _bit_byte(bit) \ - ((bit) >> 3) +typedef unsigned long bitstr_t; - /* mask for the bit within its byte */ -#define _bit_mask(bit) \ - (1 << ((bit)&0x7)) +/*---------------------- Private Implementation Details ----------------------*/ +#define _BITSTR_MASK (~0UL) +#define _BITSTR_BITS (sizeof(bitstr_t) * 8) -/* external macros */ - /* bytes in a bitstring of nbits bits */ -#define bitstr_size(nbits) \ - (((nbits) + 7) >> 3) +/* bitstr_t in bit string containing the bit. */ +static inline int +_bit_idx(int _bit) +{ + return (_bit / _BITSTR_BITS); +} - /* allocate a bitstring */ -#define bit_alloc(nbits) \ - (bitstr_t *)calloc((size_t)bitstr_size(nbits), sizeof(bitstr_t)) +/* bit number within bitstr_t at _bit_idx(_bit). */ +static inline int +_bit_offset(int _bit) +{ + return (_bit % _BITSTR_BITS); +} - /* allocate a bitstring on the stack */ +/* Mask for the bit within its long. */ +static inline bitstr_t +_bit_mask(int _bit) +{ + return (1UL << _bit_offset(_bit)); +} + +static inline bitstr_t +_bit_make_mask(int _start, int _stop) +{ + return ((_BITSTR_MASK << _bit_offset(_start)) & + (_BITSTR_MASK >> (_BITSTR_BITS - _bit_offset(_stop) - 1))); +} + +/*----------------------------- Public Interface -----------------------------*/ +/* Number of bytes consumed by a bit string of nbits bits */ +#define bitstr_size(_nbits) \ + (((_nbits) + _BITSTR_BITS - 1) / 8) + +/* Allocate a bit string initialized with no bits set. */ +#ifdef _KERNEL +static inline bitstr_t * +bit_alloc(int _nbits, struct malloc_type *type, int flags) +{ + return ((bitstr_t *)malloc(bitstr_size(_nbits), type, flags | M_ZERO)); +} +#else +static inline bitstr_t * +bit_alloc(int _nbits) +{ + return ((bitstr_t *)calloc(bitstr_size(_nbits), 1)); +} +#endif + +/* Allocate a bit string on the stack with no bits set. */ #define bit_decl(name, nbits) \ - ((name)[bitstr_size(nbits)]) + ((name)[bitstr_size(nbits) / sizeof(bitstr_t)]) - /* is bit N of bitstring name set? */ -#define bit_test(name, bit) \ - ((name)[_bit_byte(bit)] & _bit_mask(bit)) +/* Is bit N of bit string set? */ +static inline int +bit_test(const bitstr_t *_bitstr, int _bit) +{ + return ((_bitstr[_bit_idx(_bit)] & _bit_mask(_bit)) != 0); +} - /* set bit N of bitstring name */ -#define bit_set(name, bit) \ - ((name)[_bit_byte(bit)] |= _bit_mask(bit)) +/* Set bit N of bit string. 
*/ +static inline void +bit_set(bitstr_t *_bitstr, int _bit) +{ + _bitstr[_bit_idx(_bit)] |= _bit_mask(_bit); +} - /* clear bit N of bitstring name */ -#define bit_clear(name, bit) \ - ((name)[_bit_byte(bit)] &= ~_bit_mask(bit)) +/* clear bit N of bit string name */ +static inline void +bit_clear(bitstr_t *_bitstr, int _bit) +{ + _bitstr[_bit_idx(_bit)] &= ~_bit_mask(_bit); +} - /* clear bits start ... stop in bitstring */ -#define bit_nclear(name, start, stop) do { \ - register bitstr_t *_name = (name); \ - register int _start = (start), _stop = (stop); \ - register int _startbyte = _bit_byte(_start); \ - register int _stopbyte = _bit_byte(_stop); \ - if (_startbyte == _stopbyte) { \ - _name[_startbyte] &= ((0xff >> (8 - (_start&0x7))) | \ - (0xff << ((_stop&0x7) + 1))); \ - } else { \ - _name[_startbyte] &= 0xff >> (8 - (_start&0x7)); \ - while (++_startbyte < _stopbyte) \ - _name[_startbyte] = 0; \ - _name[_stopbyte] &= 0xff << ((_stop&0x7) + 1); \ - } \ -} while (0) +/* Set bits start ... stop inclusive in bit string. */ +static inline void +bit_nset(bitstr_t *_bitstr, int _start, int _stop) +{ + bitstr_t *_stopbitstr; - /* set bits start ... stop in bitstring */ -#define bit_nset(name, start, stop) do { \ - register bitstr_t *_name = (name); \ - register int _start = (start), _stop = (stop); \ - register int _startbyte = _bit_byte(_start); \ - register int _stopbyte = _bit_byte(_stop); \ - if (_startbyte == _stopbyte) { \ - _name[_startbyte] |= ((0xff << (_start&0x7)) & \ - (0xff >> (7 - (_stop&0x7)))); \ - } else { \ - _name[_startbyte] |= 0xff << ((_start)&0x7); \ - while (++_startbyte < _stopbyte) \ - _name[_startbyte] = 0xff; \ - _name[_stopbyte] |= 0xff >> (7 - (_stop&0x7)); \ - } \ -} while (0) + _stopbitstr = _bitstr + _bit_idx(_stop); + _bitstr += _bit_idx(_start); - /* find first bit clear in name */ -#define bit_ffc(name, nbits, value) do { \ - register bitstr_t *_name = (name); \ - register int _byte, _nbits = (nbits); \ - register int _stopbyte = _bit_byte(_nbits - 1), _value = -1; \ - if (_nbits > 0) \ - for (_byte = 0; _byte <= _stopbyte; ++_byte) \ - if (_name[_byte] != 0xff) { \ - bitstr_t _lb; \ - _value = _byte << 3; \ - for (_lb = _name[_byte]; (_lb&0x1); \ - ++_value, _lb >>= 1); \ - break; \ - } \ - if (_value >= nbits) \ - _value = -1; \ - *(value) = _value; \ -} while (0) + if (_bitstr == _stopbitstr) { + *_bitstr |= _bit_make_mask(_start, _stop); + } else { + *_bitstr |= _bit_make_mask(_start, _BITSTR_BITS - 1); + while (++_bitstr < _stopbitstr) + *_bitstr = _BITSTR_MASK; + *_stopbitstr |= _bit_make_mask(0, _stop); + } +} - /* find first bit set in name */ -#define bit_ffs(name, nbits, value) do { \ - register bitstr_t *_name = (name); \ - register int _byte, _nbits = (nbits); \ - register int _stopbyte = _bit_byte(_nbits - 1), _value = -1; \ - if (_nbits > 0) \ - for (_byte = 0; _byte <= _stopbyte; ++_byte) \ - if (_name[_byte]) { \ - bitstr_t _lb; \ - _value = _byte << 3; \ - for (_lb = _name[_byte]; !(_lb&0x1); \ - ++_value, _lb >>= 1); \ - break; \ - } \ - if (_value >= nbits) \ - _value = -1; \ - *(value) = _value; \ -} while (0) +/* Clear bits start ... stop inclusive in bit string. 
*/ +static inline void +bit_nclear(bitstr_t *_bitstr, int _start, int _stop) +{ + bitstr_t *_stopbitstr; -#endif /* !_SYS_BITSTRING_H_ */ + _stopbitstr = _bitstr + _bit_idx(_stop); + _bitstr += _bit_idx(_start); + + if (_bitstr == _stopbitstr) { + *_bitstr &= ~_bit_make_mask(_start, _stop); + } else { + *_bitstr &= ~_bit_make_mask(_start, _BITSTR_BITS - 1); + while (++_bitstr < _stopbitstr) + *_bitstr = 0; + *_stopbitstr &= ~_bit_make_mask(0, _stop); + } +} + +/* Find the first bit set in bit string at or after bit start. */ +static inline void +bit_ffs_at(bitstr_t *_bitstr, int _start, int _nbits, int *_result) +{ + bitstr_t *_curbitstr; + bitstr_t *_stopbitstr; + bitstr_t _test; + int _value, _offset; + + if (_nbits > 0) { + _curbitstr = _bitstr + _bit_idx(_start); + _stopbitstr = _bitstr + _bit_idx(_nbits - 1); + + _test = *_curbitstr; + if (_bit_offset(_start) != 0) + _test &= _bit_make_mask(_start, _BITSTR_BITS - 1); + while (_test == 0 && _curbitstr < _stopbitstr) + _test = *(++_curbitstr); + + _offset = ffsl(_test); + _value = ((_curbitstr - _bitstr) * _BITSTR_BITS) + _offset - 1; + if (_offset == 0 || _value >= _nbits) + _value = -1; + } else { + _value = -1; + } + *_result = _value; +} + +/* Find the first bit clear in bit string at or after bit start. */ +static inline void +bit_ffc_at(bitstr_t *_bitstr, int _start, int _nbits, int *_result) +{ + bitstr_t *_curbitstr; + bitstr_t *_stopbitstr; + bitstr_t _test; + int _value, _offset; + + if (_nbits > 0) { + _curbitstr = _bitstr + _bit_idx(_start); + _stopbitstr = _bitstr + _bit_idx(_nbits - 1); + + _test = *_curbitstr; + if (_bit_offset(_start) != 0) + _test |= _bit_make_mask(0, _start - 1); + while (_test == _BITSTR_MASK && _curbitstr < _stopbitstr) + _test = *(++_curbitstr); + + _offset = ffsl(~_test); + _value = ((_curbitstr - _bitstr) * _BITSTR_BITS) + _offset - 1; + if (_offset == 0 || _value >= _nbits) + _value = -1; + } else { + _value = -1; + } + *_result = _value; +} + +/* Find the first bit set in bit string. */ +static inline void +bit_ffs(bitstr_t *_bitstr, int _nbits, int *_result) +{ + bit_ffs_at(_bitstr, /*start*/0, _nbits, _result); +} + +/* Find the first bit clear in bit string. */ +static inline void +bit_ffc(bitstr_t *_bitstr, int _nbits, int *_result) +{ + bit_ffc_at(_bitstr, /*start*/0, _nbits, _result); +} + +#endif /* _SYS_BITSTRING_H_ */ Index: head/sys/sys/param.h =================================================================== --- head/sys/sys/param.h (revision 299089) +++ head/sys/sys/param.h (revision 299090) @@ -1,363 +1,363 @@ /*- * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. * All or some portions of this file are derived from material licensed * to the University of California by American Telephone and Telegraph * Co. or Unix System Laboratories, Inc. and are reproduced herein with * the permission of UNIX System Laboratories, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. 
Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)param.h 8.3 (Berkeley) 4/4/95 * $FreeBSD$ */ #ifndef _SYS_PARAM_H_ #define _SYS_PARAM_H_ #include #define BSD 199506 /* System version (year & month). */ #define BSD4_3 1 #define BSD4_4 1 /* * __FreeBSD_version numbers are documented in the Porter's Handbook. * If you bump the version for any reason, you should update the documentation * there. * Currently this lives here in the doc/ repository: * * head/en_US.ISO8859-1/books/porters-handbook/versions/chapter.xml * * scheme is: Rxx * 'R' is in the range 0 to 4 if this is a release branch or * x.0-CURRENT before RELENG_*_0 is created, otherwise 'R' is * in the range 5 to 9. */ #undef __FreeBSD_version -#define __FreeBSD_version 1100106 /* Master, propagated to newvers */ +#define __FreeBSD_version 1100107 /* Master, propagated to newvers */ /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD, * which by definition is always true on FreeBSD. This macro is also defined * on other systems that use the kernel of FreeBSD, such as GNU/kFreeBSD. * * It is tempting to use this macro in userland code when we want to enable * kernel-specific routines, and in fact it's fine to do this in code that * is part of FreeBSD itself. However, be aware that as presence of this * macro is still not widespread (e.g. older FreeBSD versions, 3rd party * compilers, etc), it is STRONGLY DISCOURAGED to check for this macro in * external applications without also checking for __FreeBSD__ as an * alternative. */ #undef __FreeBSD_kernel__ #define __FreeBSD_kernel__ #ifdef _KERNEL #define P_OSREL_SIGWAIT 700000 #define P_OSREL_SIGSEGV 700004 #define P_OSREL_MAP_ANON 800104 #define P_OSREL_MAP_FSTRICT 1100036 #define P_OSREL_SHUTDOWN_ENOTCONN 1100077 #define P_OSREL_MAJOR(x) ((x) / 100000) #endif #ifndef LOCORE #include #endif /* * Machine-independent constants (some used in following include files). * Redefined constants are from POSIX 1003.1 limits file. * * MAXCOMLEN should be >= sizeof(ac_comm) (see ) */ #include #define MAXCOMLEN 19 /* max command name remembered */ #define MAXINTERP PATH_MAX /* max interpreter file name length */ #define MAXLOGNAME 33 /* max login name length (incl. 
NUL) */ #define MAXUPRC CHILD_MAX /* max simultaneous processes */ #define NCARGS ARG_MAX /* max bytes for an exec function */ #define NGROUPS (NGROUPS_MAX+1) /* max number groups */ #define NOFILE OPEN_MAX /* max open files per process */ #define NOGROUP 65535 /* marker for empty group set member */ #define MAXHOSTNAMELEN 256 /* max hostname size */ #define SPECNAMELEN 63 /* max length of devicename */ /* More types and definitions used throughout the kernel. */ #ifdef _KERNEL #include #include #ifndef LOCORE #include #include #endif #ifndef FALSE #define FALSE 0 #endif #ifndef TRUE #define TRUE 1 #endif #endif #ifndef _KERNEL /* Signals. */ #include #endif /* Machine type dependent parameters. */ #include #ifndef _KERNEL #include #endif #ifndef DEV_BSHIFT #define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ #endif #define DEV_BSIZE (1<>PAGE_SHIFT) #endif /* * btodb() is messy and perhaps slow because `bytes' may be an off_t. We * want to shift an unsigned type to avoid sign extension and we don't * want to widen `bytes' unnecessarily. Assume that the result fits in * a daddr_t. */ #ifndef btodb #define btodb(bytes) /* calculates (bytes / DEV_BSIZE) */ \ (sizeof (bytes) > sizeof(long) \ ? (daddr_t)((unsigned long long)(bytes) >> DEV_BSHIFT) \ : (daddr_t)((unsigned long)(bytes) >> DEV_BSHIFT)) #endif #ifndef dbtob #define dbtob(db) /* calculates (db * DEV_BSIZE) */ \ ((off_t)(db) << DEV_BSHIFT) #endif #define PRIMASK 0x0ff #define PCATCH 0x100 /* OR'd with pri for tsleep to check signals */ #define PDROP 0x200 /* OR'd with pri to stop re-entry of interlock mutex */ #define NZERO 0 /* default "nice" */ #define NBBY 8 /* number of bits in a byte */ #define NBPW sizeof(int) /* number of bytes per word (integer) */ #define CMASK 022 /* default file mask: S_IWGRP|S_IWOTH */ #define NODEV (dev_t)(-1) /* non-existent device */ /* * File system parameters and macros. * * MAXBSIZE - Filesystems are made out of blocks of at most MAXBSIZE bytes * per block. MAXBSIZE may be made larger without effecting * any existing filesystems as long as it does not exceed MAXPHYS, * and may be made smaller at the risk of not being able to use * filesystems which require a block size exceeding MAXBSIZE. * * MAXBCACHEBUF - Maximum size of a buffer in the buffer cache. This must * be >= MAXBSIZE and can be set differently for different * architectures by defining it in . * Making this larger allows NFS to do larger reads/writes. * * BKVASIZE - Nominal buffer space per buffer, in bytes. BKVASIZE is the * minimum KVM memory reservation the kernel is willing to make. * Filesystems can of course request smaller chunks. Actual * backing memory uses a chunk size of a page (PAGE_SIZE). * The default value here can be overridden on a per-architecture * basis by defining it in . This should * probably be done to increase its value, when MAXBCACHEBUF is * defined as a larger value in . * * If you make BKVASIZE too small you risk seriously fragmenting * the buffer KVM map which may slow things down a bit. If you * make it too big the kernel will not be able to optimally use * the KVM memory reserved for the buffer cache and will wind * up with too-few buffers. * * The default is 16384, roughly 2x the block size used by a * normal UFS filesystem. 
*/ #define MAXBSIZE 65536 /* must be power of 2 */ #ifndef MAXBCACHEBUF #define MAXBCACHEBUF MAXBSIZE /* must be a power of 2 >= MAXBSIZE */ #endif #ifndef BKVASIZE #define BKVASIZE 16384 /* must be power of 2 */ #endif #define BKVAMASK (BKVASIZE-1) /* * MAXPATHLEN defines the longest permissible path length after expanding * symbolic links. It is used to allocate a temporary buffer from the buffer * pool in which to do the name expansion, hence should be a power of two, * and must be less than or equal to MAXBSIZE. MAXSYMLINKS defines the * maximum number of symbolic links that may be expanded in a path name. * It should be set high enough to allow all legitimate uses, but halt * infinite loops reasonably quickly. */ #define MAXPATHLEN PATH_MAX #define MAXSYMLINKS 32 /* Bit map related macros. */ #define setbit(a,i) (((unsigned char *)(a))[(i)/NBBY] |= 1<<((i)%NBBY)) #define clrbit(a,i) (((unsigned char *)(a))[(i)/NBBY] &= ~(1<<((i)%NBBY))) #define isset(a,i) \ (((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) #define isclr(a,i) \ ((((const unsigned char *)(a))[(i)/NBBY] & (1<<((i)%NBBY))) == 0) /* Macros for counting and rounding. */ #ifndef howmany #define howmany(x, y) (((x)+((y)-1))/(y)) #endif #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x, y) (((x)/(y))*(y)) #define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x, y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ #define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) #ifdef _KERNEL /* * Basic byte order function prototypes for non-inline functions. */ #ifndef LOCORE #ifndef _BYTEORDER_PROTOTYPED #define _BYTEORDER_PROTOTYPED __BEGIN_DECLS __uint32_t htonl(__uint32_t); __uint16_t htons(__uint16_t); __uint32_t ntohl(__uint32_t); __uint16_t ntohs(__uint16_t); __END_DECLS #endif #endif #ifndef lint #ifndef _BYTEORDER_FUNC_DEFINED #define _BYTEORDER_FUNC_DEFINED #define htonl(x) __htonl(x) #define htons(x) __htons(x) #define ntohl(x) __ntohl(x) #define ntohs(x) __ntohs(x) #endif /* !_BYTEORDER_FUNC_DEFINED */ #endif /* lint */ #endif /* _KERNEL */ /* * Scale factor for scaled integers used to count %cpu time and load avgs. * * The number of CPU `tick's that map to a unique `%age' can be expressed * by the formula (1 / (2 ^ (FSHIFT - 11))). The maximum load average that * can be calculated (assuming 32 bits) can be closely approximated using * the formula (2 ^ (2 * (16 - FSHIFT))) for (FSHIFT < 15). * * For the scheduler to maintain a 1:1 mapping of CPU `tick' to `%age', * FSHIFT must be at least 11; this gives us a maximum load avg of ~1024. */ #define FSHIFT 11 /* bits to right of fixed binary point */ #define FSCALE (1<> (PAGE_SHIFT - DEV_BSHIFT)) #define ctodb(db) /* calculates pages to devblks */ \ ((db) << (PAGE_SHIFT - DEV_BSHIFT)) /* * Old spelling of __containerof(). */ #define member2struct(s, m, x) \ ((struct s *)(void *)((char *)(x) - offsetof(struct s, m))) /* * Access a variable length array that has been declared as a fixed * length array. 
*/ #define __PAST_END(array, offset) (((__typeof__(*(array)) *)(array))[offset]) #endif /* _SYS_PARAM_H_ */ Index: head/tests/sys/Makefile =================================================================== --- head/tests/sys/Makefile (revision 299089) +++ head/tests/sys/Makefile (revision 299090) @@ -1,28 +1,29 @@ # $FreeBSD$ .include PACKAGE= tests FILESGROUPS= TESTS TESTSPACKAGE= ${PACKAGE} TESTSDIR= ${TESTSBASE}/sys TESTS_SUBDIRS+= acl TESTS_SUBDIRS+= aio TESTS_SUBDIRS+= fifo TESTS_SUBDIRS+= file TESTS_SUBDIRS+= geom TESTS_SUBDIRS+= kern TESTS_SUBDIRS+= kqueue TESTS_SUBDIRS+= mac TESTS_SUBDIRS+= mqueue TESTS_SUBDIRS+= netinet TESTS_SUBDIRS+= opencrypto TESTS_SUBDIRS+= posixshm +TESTS_SUBDIRS+= sys TESTS_SUBDIRS+= vfs TESTS_SUBDIRS+= vm # Items not integrated into kyua runs by default SUBDIR+= pjdfstest .include Index: head/tests/sys/sys/Makefile =================================================================== --- head/tests/sys/sys/Makefile (nonexistent) +++ head/tests/sys/sys/Makefile (revision 299090) @@ -0,0 +1,13 @@ +# $FreeBSD$ + +PACKAGE= tests +FILESGROUPS= TESTS +TESTSPACKAGE= ${PACKAGE} + +TESTSDIR= ${TESTSBASE}/sys/sys + +ATF_TESTS_C= bitstring_test + +WARNS?= 5 + +.include Property changes on: head/tests/sys/sys/Makefile ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/tests/sys/sys/bitstring_test.c =================================================================== --- head/tests/sys/sys/bitstring_test.c (nonexistent) +++ head/tests/sys/sys/bitstring_test.c (revision 299090) @@ -0,0 +1,359 @@ +/*- + * Copyright (c) 2014 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. 
+ * + * $FreeBSD$ + */ +#include + +#include +#include + +#include + +typedef void (testfunc_t)(bitstr_t *bstr, int nbits, const char *memloc); + +static void +bitstring_run_stack_test(testfunc_t *test, int nbits) +{ + bitstr_t bit_decl(bitstr, nbits); + + test(bitstr, nbits, "stack"); +} + +static void +bitstring_run_heap_test(testfunc_t *test, int nbits) +{ + bitstr_t *bitstr = bit_alloc(nbits); + + test(bitstr, nbits, "heap"); +} + +static void +bitstring_test_runner(testfunc_t *test) +{ + const int bitstr_sizes[] = { + 0, + 1, + _BITSTR_BITS - 1, + _BITSTR_BITS, + _BITSTR_BITS + 1, + 2 * _BITSTR_BITS - 1, + 2 * _BITSTR_BITS, + 1023, + 1024 + }; + + for (unsigned long i = 0; i < nitems(bitstr_sizes); i++) { + bitstring_run_stack_test(test, bitstr_sizes[i]); + bitstring_run_heap_test(test, bitstr_sizes[i]); + } +} + +#define BITSTRING_TC_DEFINE(name) \ +ATF_TC_WITHOUT_HEAD(name); \ +static testfunc_t name ## _test; \ + \ +ATF_TC_BODY(name, tc) \ +{ \ + bitstring_test_runner(name ## _test); \ +} \ + \ +static void \ +name ## _test(bitstr_t *bitstr, int nbits, const char *memloc) + +#define BITSTRING_TC_ADD(tp, name) \ +do { \ + ATF_TP_ADD_TC(tp, name); \ +} while (0) + +ATF_TC_WITHOUT_HEAD(bitstr_in_struct); +ATF_TC_BODY(bitstr_in_struct, tc) +{ + struct bitstr_containing_struct { + bitstr_t bit_decl(bitstr, 8); + } test_struct; + + bit_nclear(test_struct.bitstr, 0, 8); +} + +BITSTRING_TC_DEFINE(bit_set) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + memset(bitstr, 0, bitstr_size(nbits)); + + for (int i = 0; i < nbits; i++) { + bit_set(bitstr, i); + + for (int j = 0; j < nbits; j++) { + ATF_REQUIRE_MSG(bit_test(bitstr, j) == (j == i) ? 1 : 0, + "bit_set_%d_%s: Failed on bit %d", + nbits, memloc, i); + } + + bit_clear(bitstr, i); + } +} + +BITSTRING_TC_DEFINE(bit_clear) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i, j; + + memset(bitstr, 0xFF, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_clear(bitstr, i); + + for (j = 0; j < nbits; j++) { + ATF_REQUIRE_MSG(bit_test(bitstr, j) == (j == i) ? 
0 : 1, + "bit_clear_%d_%s: Failed on bit %d", + nbits, memloc, i); + } + + bit_set(bitstr, i); + } +} + +BITSTRING_TC_DEFINE(bit_ffs) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i; + int found_set_bit; + + memset(bitstr, 0, bitstr_size(nbits)); + bit_ffs(bitstr, nbits, &found_set_bit); + ATF_REQUIRE_MSG(found_set_bit == -1, + "bit_ffs_%d_%s: Failed all clear bits.", nbits, memloc); + + for (i = 0; i < nbits; i++) { + memset(bitstr, 0xFF, bitstr_size(nbits)); + if (i > 0) + bit_nclear(bitstr, 0, i - 1); + + bit_ffs(bitstr, nbits, &found_set_bit); + ATF_REQUIRE_MSG(found_set_bit == i, + "bit_ffs_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } +} + +BITSTRING_TC_DEFINE(bit_ffc) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i; + int found_clear_bit; + + memset(bitstr, 0xFF, bitstr_size(nbits)); + bit_ffc(bitstr, nbits, &found_clear_bit); + ATF_REQUIRE_MSG(found_clear_bit == -1, + "bit_ffc_%d_%s: Failed all set bits.", nbits, memloc); + + for (i = 0; i < nbits; i++) { + memset(bitstr, 0, bitstr_size(nbits)); + if (i > 0) + bit_nset(bitstr, 0, i - 1); + + bit_ffc(bitstr, nbits, &found_clear_bit); + ATF_REQUIRE_MSG(found_clear_bit == i, + "bit_ffc_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } +} + +BITSTRING_TC_DEFINE(bit_ffs_at) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i; + int found_set_bit; + + memset(bitstr, 0xFF, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffs_at(bitstr, i, nbits, &found_set_bit); + ATF_REQUIRE_MSG(found_set_bit == i, + "bit_ffs_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } + + memset(bitstr, 0, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffs_at(bitstr, i, nbits, &found_set_bit); + ATF_REQUIRE_MSG(found_set_bit == -1, + "bit_ffs_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } + + memset(bitstr, 0x55, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffs_at(bitstr, i, nbits, &found_set_bit); + if (i == nbits - 1 && (nbits & 1) == 0) { + ATF_REQUIRE_MSG(found_set_bit == -1, + "bit_ffs_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } else { + ATF_REQUIRE_MSG(found_set_bit == i + (i & 1), + "bit_ffs_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } + } + + memset(bitstr, 0xAA, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffs_at(bitstr, i, nbits, &found_set_bit); + if (i == nbits - 1 && (nbits & 1) != 0) { + ATF_REQUIRE_MSG(found_set_bit == -1, + "bit_ffs_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } else { + ATF_REQUIRE_MSG( + found_set_bit == i + ((i & 1) ? 
0 : 1), + "bit_ffs_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_set_bit); + } + } +} + +BITSTRING_TC_DEFINE(bit_ffc_at) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i, found_clear_bit; + + memset(bitstr, 0, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffc_at(bitstr, i, nbits, &found_clear_bit); + ATF_REQUIRE_MSG(found_clear_bit == i, + "bit_ffc_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } + + memset(bitstr, 0xFF, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffc_at(bitstr, i, nbits, &found_clear_bit); + ATF_REQUIRE_MSG(found_clear_bit == -1, + "bit_ffc_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } + + memset(bitstr, 0x55, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffc_at(bitstr, i, nbits, &found_clear_bit); + if (i == nbits - 1 && (nbits & 1) != 0) { + ATF_REQUIRE_MSG(found_clear_bit == -1, + "bit_ffc_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } else { + ATF_REQUIRE_MSG( + found_clear_bit == i + ((i & 1) ? 0 : 1), + "bit_ffc_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } + } + + memset(bitstr, 0xAA, bitstr_size(nbits)); + for (i = 0; i < nbits; i++) { + bit_ffc_at(bitstr, i, nbits, &found_clear_bit); + if (i == nbits - 1 && (nbits & 1) == 0) { + ATF_REQUIRE_MSG(found_clear_bit == -1, + "bit_ffc_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } else { + ATF_REQUIRE_MSG(found_clear_bit == i + (i & 1), + "bit_ffc_at_%d_%s: Failed on bit %d, Result %d", + nbits, memloc, i, found_clear_bit); + } + } +} + +BITSTRING_TC_DEFINE(bit_nclear) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i, j; + int found_set_bit; + int found_clear_bit; + + for (i = 0; i < nbits; i++) { + for (j = i; j < nbits; j++) { + memset(bitstr, 0xFF, bitstr_size(nbits)); + bit_nclear(bitstr, i, j); + + bit_ffc(bitstr, nbits, &found_clear_bit); + ATF_REQUIRE_MSG( + found_clear_bit == i, + "bit_nclear_%d_%d_%d%s: Failed with result %d", + nbits, i, j, memloc, found_clear_bit); + + bit_ffs_at(bitstr, i, nbits, &found_set_bit); + ATF_REQUIRE_MSG( + (j + 1 < nbits) ? found_set_bit == j + 1 : -1, + "bit_nset_%d_%d_%d%s: Failed with result %d", + nbits, i, j, memloc, found_set_bit); + } + } +} + +BITSTRING_TC_DEFINE(bit_nset) +/* bitstr_t *bitstr, int nbits, const char *memloc */ +{ + int i, j; + int found_set_bit; + int found_clear_bit; + + for (i = 0; i < nbits; i++) { + for (j = i; j < nbits; j++) { + memset(bitstr, 0, bitstr_size(nbits)); + bit_nset(bitstr, i, j); + + bit_ffs(bitstr, nbits, &found_set_bit); + ATF_REQUIRE_MSG( + found_set_bit == i, + "bit_nset_%d_%d_%d%s: Failed with result %d", + nbits, i, j, memloc, found_set_bit); + + bit_ffc_at(bitstr, i, nbits, &found_clear_bit); + ATF_REQUIRE_MSG( + (j + 1 < nbits) ? 
found_clear_bit == j + 1 : -1, + "bit_nset_%d_%d_%d%s: Failed with result %d", + nbits, i, j, memloc, found_clear_bit); + } + } +} + +ATF_TP_ADD_TCS(tp) +{ + + ATF_TP_ADD_TC(tp, bitstr_in_struct); + BITSTRING_TC_ADD(tp, bit_set); + BITSTRING_TC_ADD(tp, bit_clear); + BITSTRING_TC_ADD(tp, bit_ffs); + BITSTRING_TC_ADD(tp, bit_ffc); + BITSTRING_TC_ADD(tp, bit_ffs_at); + BITSTRING_TC_ADD(tp, bit_ffc_at); + BITSTRING_TC_ADD(tp, bit_nclear); + BITSTRING_TC_ADD(tp, bit_nset); + + return (atf_no_error()); +} Property changes on: head/tests/sys/sys/bitstring_test.c ___________________________________________________________________ Added: svn:eol-style ## -0,0 +1 ## +native \ No newline at end of property Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property Added: svn:mime-type ## -0,0 +1 ## +text/plain \ No newline at end of property Index: head/usr.sbin/bluetooth/hccontrol/le.c =================================================================== --- head/usr.sbin/bluetooth/hccontrol/le.c (revision 299089) +++ head/usr.sbin/bluetooth/hccontrol/le.c (revision 299090) @@ -1,356 +1,356 @@ /* * le.c * * Copyright (c) 2015 Takanori Watanabe * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * $Id: hccontrol.c,v 1.5 2003/09/05 00:38:24 max Exp $ * $FreeBSD$ */ #include #include #include -#include #include #include +#include #include #include #include #include #include #include #include #include #include #define L2CAP_SOCKET_CHECKED #include #include "hccontrol.h" static int le_set_scan_param(int s, int argc, char *argv[]); static int le_set_scan_enable(int s, int argc, char *argv[]); static int parse_param(int argc, char *argv[], char *buf, int *len); static int le_set_scan_response(int s, int argc, char *argv[]); static int le_read_supported_status(int s, int argc, char *argv[]); static int le_read_local_supported_features(int s, int argc ,char *argv[]); static int set_le_event_mask(int s, uint64_t mask); static int set_event_mask(int s, uint64_t mask); static int le_enable(int s, int argc, char *argv[]); static int le_set_scan_param(int s, int argc, char *argv[]) { int type; int interval; int window; int adrtype; int policy; int e, n; ng_hci_le_set_scan_parameters_cp cp; ng_hci_le_set_scan_parameters_rp rp; if (argc != 5) return USAGE; if (strcmp(argv[0], "active") == 0) type = 1; else if (strcmp(argv[0], "passive") == 0) type = 0; else return USAGE; interval = (int)(atof(argv[1])/0.625); interval = (interval < 4)? 4: interval; window = (int)(atof(argv[2])/0.625); window = (window < 4) ? 4 : interval; if (strcmp(argv[3], "public") == 0) adrtype = 0; else if (strcmp(argv[3], "random") == 0) adrtype = 1; else return USAGE; if (strcmp(argv[4], "all") == 0) policy = 0; else if (strcmp(argv[4], "whitelist") == 0) policy = 1; else return USAGE; cp.le_scan_type = type; cp.le_scan_interval = interval; cp.own_address_type = adrtype; cp.le_scan_window = window; cp.scanning_filter_policy = policy; n = sizeof(rp); e = hci_request(s, NG_HCI_OPCODE(NG_HCI_OGF_LE, NG_HCI_OCF_LE_SET_SCAN_PARAMETERS), (void *)&cp, sizeof(cp), (void *)&rp, &n); return 0; } static int le_set_scan_enable(int s, int argc, char *argv[]) { ng_hci_le_set_scan_enable_cp cp; ng_hci_le_set_scan_enable_rp rp; int e, n, enable = 0; if (argc != 1) return USAGE; if (strcmp(argv[0], "enable") == 0) enable = 1; else if (strcmp(argv[0], "disable") != 0) return USAGE; n = sizeof(rp); cp.le_scan_enable = enable; cp.filter_duplicates = 0; e = hci_request(s, NG_HCI_OPCODE(NG_HCI_OGF_LE, NG_HCI_OCF_LE_SET_SCAN_ENABLE), (void *)&cp, sizeof(cp), (void *)&rp, &n); if (e != 0 || rp.status != 0) return ERROR; return OK; } static int parse_param(int argc, char *argv[], char *buf, int *len) { char *buflast = buf + (*len); char *curbuf = buf; char *token,*lenpos; int ch; int datalen; uint16_t value; optreset = 1; optind = 0; while ((ch = getopt(argc, argv , "n:f:u:")) != -1) { switch(ch){ case 'n': datalen = strlen(optarg); if ((curbuf + datalen + 2) >= buflast) goto done; curbuf[0] = datalen + 1; curbuf[1] = 8; curbuf += 2; memcpy(curbuf, optarg, datalen); curbuf += datalen; break; case 'f': if (curbuf+3 > buflast) goto done; curbuf[0] = 2; curbuf[1] = 1; curbuf[2] = atoi(optarg); curbuf += 3; break; case 'u': lenpos = buf; if ((buf+2) >= buflast) goto done; curbuf[1] = 2; *lenpos = 1; curbuf += 2; while ((token = strsep(&optarg, ",")) != NULL) { value = strtol(token, NULL, 16); if ((curbuf+2) >= buflast) break; curbuf[0] = value &0xff; curbuf[1] = (value>>8)&0xff; curbuf += 2; } } } done: *len = curbuf - buf; return OK; } static int le_set_scan_response(int s, int argc, char *argv[]) { ng_hci_le_set_scan_response_data_cp cp; ng_hci_le_set_scan_response_data_rp rp; int n; int e; int len; char buf[NG_HCI_ADVERTISING_DATA_SIZE]; len = 
sizeof(buf); parse_param(argc, argv, buf, &len); memset(cp.scan_response_data, 0, sizeof(cp.scan_response_data)); cp.scan_response_data_length = len; memcpy(cp.scan_response_data, buf, len); n = sizeof(rp); e = hci_request(s, NG_HCI_OPCODE(NG_HCI_OGF_LE, NG_HCI_OCF_LE_SET_SCAN_RESPONSE_DATA), (void *)&cp, sizeof(cp), (void *)&rp, &n); printf("SET SCAN RESPONSE %d %d %d\n", e, rp.status, n); return OK; } static int le_read_local_supported_features(int s, int argc ,char *argv[]) { ng_hci_le_read_local_supported_features_rp rp; int e; int n = sizeof(rp); e = hci_simple_request(s, NG_HCI_OPCODE(NG_HCI_OGF_LE, NG_HCI_OCF_LE_READ_LOCAL_SUPPORTED_FEATURES), (void *)&rp, &n); printf("LOCAL SUPPORTED: %d %d %jx\n", e, rp.status, (uintmax_t) rp.le_features); return 0; } static int le_read_supported_status(int s, int argc, char *argv[]) { ng_hci_le_read_supported_status_rp rp; int e; int n = sizeof(rp); e = hci_simple_request(s, NG_HCI_OPCODE( NG_HCI_OGF_LE, NG_HCI_OCF_LE_READ_SUPPORTED_STATUS), (void *)&rp, &n); printf("LE_STATUS: %d %d %jx\n", e, rp.status, (uintmax_t)rp.le_status); return 0; } static int set_le_event_mask(int s, uint64_t mask) { ng_hci_le_set_event_mask_cp semc; ng_hci_le_set_event_mask_rp rp; int i, n ,e; n = sizeof(rp); for (i=0; i < NG_HCI_LE_EVENT_MASK_SIZE; i++) { semc.event_mask[i] = mask&0xff; mask >>= 8; } e = hci_request(s, NG_HCI_OPCODE(NG_HCI_OGF_LE, NG_HCI_OCF_LE_SET_EVENT_MASK), (void *)&semc, sizeof(semc), (void *)&rp, &n); return 0; } static int set_event_mask(int s, uint64_t mask) { ng_hci_set_event_mask_cp semc; ng_hci_set_event_mask_rp rp; int i, n, e; n = sizeof(rp); for (i=0; i < NG_HCI_EVENT_MASK_SIZE; i++) { semc.event_mask[i] = mask&0xff; mask >>= 8; } e = hci_request(s, NG_HCI_OPCODE(NG_HCI_OGF_HC_BASEBAND, NG_HCI_OCF_SET_EVENT_MASK), (void *)&semc, sizeof(semc), (void *)&rp, &n); return 0; } static int le_enable(int s, int argc, char *argv[]) { if (argc != 1) return USAGE; if (strcasecmp(argv[0], "enable") == 0) { set_event_mask(s, NG_HCI_EVENT_MASK_DEFAULT | NG_HCI_EVENT_MASK_LE); set_le_event_mask(s, NG_HCI_LE_EVENT_MASK_ALL); } else if (strcasecmp(argv[0], "disble") == 0) set_event_mask(s, NG_HCI_EVENT_MASK_DEFAULT); else return USAGE; return OK; } struct hci_command le_commands[] = { { "le_enable", "le_enable [enable|disable] \n" "Enable LE event ", &le_enable, }, { "le_read_local_supported_features", "le_read_local_supported_features\n" "read local supported features mask", &le_read_local_supported_features, }, { "le_read_supported_status", "le_read_supported_status\n" "read supported status" , &le_read_supported_status, }, { "le_set_scan_response", "le_set_scan_response -n $name -f $flag -u $uuid16,$uuid16 \n" "set LE scan response data" , &le_set_scan_response, }, { "le_set_scan_enable", "le_set_scan_enable [enable|disable] \n" "enable or disable LE device scan", &le_set_scan_enable }, { "le_set_scan_param", "le_set_scan_param [active|passive] interval(ms) window(ms) [public|random] [all|whitelist] \n" "set LE device scan parameter", &le_set_scan_param }, };
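The rewritten sys/sys/bitstring.h above replaces the old byte-wide macro interface with inline functions that operate on longs; userland keeps a one-argument, calloc()-backed bit_alloc(), while the kernel variant now takes a malloc type and flags, as the flowtable.c hunk that calls bit_alloc(ft->ft_size, M_FTABLE, M_WAITOK) shows. The short program below is a minimal usage sketch of the userland interface only, assuming headers containing this revision are installed; the program, its file name, the bit indices and the 130-bit size are illustrative and are not part of the commit.

/*
 * bitstring_demo.c (hypothetical): exercises bit_alloc(), bit_set(),
 * bit_ffs_at(), bit_nclear(), bit_ffs() and bit_ffc() as declared in the
 * revised header.  Indices and sizes are arbitrary illustration values.
 */
#include <bitstring.h>
#include <stdio.h>
#include <stdlib.h>	/* calloc()/free(); may already be pulled in */
#include <strings.h>	/* ffsl(); may already be pulled in */

int
main(void)
{
	const int nbits = 130;		/* more than 128, so several underlying words */
	int first_set, first_clear;
	bitstr_t *bs;

	bs = bit_alloc(nbits);		/* userland form: one argument, zero-filled */
	if (bs == NULL)
		return (1);

	bit_set(bs, 0);
	bit_set(bs, 64);		/* past the first 64-bit word on LP64 */
	bit_set(bs, 129);

	bit_ffs_at(bs, 1, nbits, &first_set);
	printf("first set bit at or after 1: %d\n", first_set);	/* 64 */

	bit_nclear(bs, 0, 64);		/* stop index is inclusive, so bit 64 is cleared too */
	bit_ffs(bs, nbits, &first_set);
	printf("first set bit: %d\n", first_set);		/* 129 */

	bit_ffc(bs, nbits, &first_clear);
	printf("first clear bit: %d\n", first_clear);		/* 0 */

	free(bs);
	return (0);
}

Built against headers that contain this revision, the sketch prints 64, 129 and 0.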
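The stack-declared bit_decl() form can be sketched the same way: the string is cleared explicitly with memset() and bitstr_size() before use, following the pattern of the new bitstring_test.c cases, and bit_nset() with bit_ffc_at() then shows that the range operations treat the stop index as inclusive. As before, this is an illustrative sketch under the assumption that the revised headers are installed; the file name, sizes and indices are made up.

/*
 * bit_decl_demo.c (hypothetical): stack-allocated bit string, cleared
 * with memset()/bitstr_size(), then exercised with bit_nset(),
 * bit_ffc_at() and bit_test().
 */
#include <bitstring.h>
#include <stdio.h>
#include <string.h>	/* memset() */
#include <strings.h>	/* ffsl(); may already be pulled in */

int
main(void)
{
	const int nbits = 70;			/* spans more than one underlying long */
	bitstr_t bit_decl(bs, 70);		/* stack storage; cleared below before use */
	int first_clear;

	memset(bs, 0, bitstr_size(nbits));	/* start with every bit clear */

	bit_nset(bs, 3, 66);			/* set bits 3..66 inclusive */

	bit_ffc_at(bs, 3, nbits, &first_clear);
	printf("first clear bit at or after 3: %d\n", first_clear);	/* 67 */

	if (bit_test(bs, 66) && !bit_test(bs, 67))
		printf("stop index 66 was set, 67 left untouched\n");

	return (0);
}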