Index: head/sys/dev/xen/balloon/balloon.c
===================================================================
--- head/sys/dev/xen/balloon/balloon.c	(revision 358315)
+++ head/sys/dev/xen/balloon/balloon.c	(revision 358316)
@@ -1,418 +1,420 @@
/******************************************************************************
 * balloon.c
 *
 * Xen balloon driver - enables returning/claiming memory to/from Xen.
 *
 * Copyright (c) 2003, B Dragovic
 * Copyright (c) 2003-2004, M Williamson, K Fraser
 * Copyright (c) 2005 Dan M. Smith, IBM Corporation
 *
 * This file may be distributed separately from the Linux kernel, or
 * incorporated into other software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static MALLOC_DEFINE(M_BALLOON, "Balloon", "Xen Balloon Driver");

/* Convert from KB (as fetched from xenstore) to number of PAGES */
#define KB_TO_PAGE_SHIFT	(PAGE_SHIFT - 10)

struct mtx balloon_mutex;

/* We increase/decrease in batches which fit in a page */
static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)];

struct balloon_stats {
	/* We aim for 'current allocation' == 'target allocation'. */
	unsigned long current_pages;
	unsigned long target_pages;
	/* We may hit the hard limit in Xen. If we do then we remember it. */
	unsigned long hard_limit;
	/*
	 * Drivers may alter the memory reservation independently, but they
	 * must inform the balloon driver so we avoid hitting the hard limit.
	 */
	unsigned long driver_pages;
	/* Number of pages in high- and low-memory balloons.
 */
	unsigned long balloon_low;
	unsigned long balloon_high;
};
static struct balloon_stats balloon_stats;
#define bs balloon_stats

SYSCTL_DECL(_dev_xen);
-static SYSCTL_NODE(_dev_xen, OID_AUTO, balloon, CTLFLAG_RD, NULL, "Balloon");
+static SYSCTL_NODE(_dev_xen, OID_AUTO, balloon,
+    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
+    "Balloon");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, current, CTLFLAG_RD,
    &bs.current_pages, 0, "Current allocation");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, target, CTLFLAG_RD,
    &bs.target_pages, 0, "Target allocation");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, driver_pages, CTLFLAG_RD,
    &bs.driver_pages, 0, "Driver pages");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, hard_limit, CTLFLAG_RD,
    &bs.hard_limit, 0, "Xen hard limit");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, low_mem, CTLFLAG_RD,
    &bs.balloon_low, 0, "Low-mem balloon");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, high_mem, CTLFLAG_RD,
    &bs.balloon_high, 0, "High-mem balloon");

/* List of ballooned pages, threaded through the mem_map array. */
static TAILQ_HEAD(,vm_page) ballooned_pages;

/* Main work function, always executed in process context. */
static void balloon_process(void *unused);

#define IPRINTK(fmt, args...) \
	printk(KERN_INFO "xen_mem: " fmt, ##args)
#define WPRINTK(fmt, args...) \
	printk(KERN_WARNING "xen_mem: " fmt, ##args)

static unsigned long
current_target(void)
{
	unsigned long target = min(bs.target_pages, bs.hard_limit);
	if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
		target = bs.current_pages + bs.balloon_low + bs.balloon_high;
	return (target);
}

static unsigned long
minimum_target(void)
{
	unsigned long min_pages, curr_pages = current_target();

#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
	/*
	 * Simple continuous piecewiese linear function:
	 *  max MiB -> min MiB	gradient
	 *       0	   0
	 *      16	  16
	 *      32	  24
	 *     128	  72	(1/2)
	 *     512	 168	(1/4)
	 *    2048	 360	(1/8)
	 *    8192	 552	(1/32)
	 *   32768	1320
	 *  131072	4392
	 */
	if (realmem < MB2PAGES(128))
		min_pages = MB2PAGES(8) + (realmem >> 1);
	else if (realmem < MB2PAGES(512))
		min_pages = MB2PAGES(40) + (realmem >> 2);
	else if (realmem < MB2PAGES(2048))
		min_pages = MB2PAGES(104) + (realmem >> 3);
	else
		min_pages = MB2PAGES(296) + (realmem >> 5);
#undef MB2PAGES

	/* Don't enforce growth */
	return (min(min_pages, curr_pages));
}

static int
increase_reservation(unsigned long nr_pages)
{
	unsigned long i;
	vm_page_t page;
	long rc;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid = DOMID_SELF
	};

	mtx_assert(&balloon_mutex, MA_OWNED);

	if (nr_pages > nitems(frame_list))
		nr_pages = nitems(frame_list);

	for (page = TAILQ_FIRST(&ballooned_pages), i = 0; i < nr_pages;
	    i++, page = TAILQ_NEXT(page, plinks.q)) {
		KASSERT(page != NULL, ("ballooned_pages list corrupt"));
		frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents = nr_pages;
	rc = HYPERVISOR_memory_op(
	    XENMEM_populate_physmap, &reservation);
	if (rc < nr_pages) {
		if (rc > 0) {
			int ret;

			/* We hit the Xen hard limit: reprobe.
 */
			reservation.nr_extents = rc;
			ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
			    &reservation);
			KASSERT(ret == rc, ("HYPERVISOR_memory_op failed"));
		}
		if (rc >= 0)
			bs.hard_limit = (bs.current_pages + rc -
			    bs.driver_pages);
		goto out;
	}

	for (i = 0; i < nr_pages; i++) {
		page = TAILQ_FIRST(&ballooned_pages);
		KASSERT(page != NULL, ("Unable to get ballooned page"));
		TAILQ_REMOVE(&ballooned_pages, page, plinks.q);
		bs.balloon_low--;

		KASSERT(xen_feature(XENFEAT_auto_translated_physmap),
		    ("auto translated physmap but mapping is valid"));

		vm_page_free(page);
	}

	bs.current_pages += nr_pages;

out:
	return (0);
}

static int
decrease_reservation(unsigned long nr_pages)
{
	unsigned long i;
	vm_page_t page;
	int need_sleep = 0;
	int ret;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid = DOMID_SELF
	};

	mtx_assert(&balloon_mutex, MA_OWNED);

	if (nr_pages > nitems(frame_list))
		nr_pages = nitems(frame_list);

	for (i = 0; i < nr_pages; i++) {
		if ((page = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
		    VM_ALLOC_NOOBJ | VM_ALLOC_ZERO)) == NULL) {
			nr_pages = i;
			need_sleep = 1;
			break;
		}

		if ((page->flags & PG_ZERO) == 0) {
			/*
			 * Zero the page, or else we might be leaking
			 * important data to other domains on the same
			 * host. Xen doesn't scrub ballooned out memory
			 * pages, the guest is in charge of making
			 * sure that no information is leaked.
			 */
			pmap_zero_page(page);
		}

		frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);

		TAILQ_INSERT_HEAD(&ballooned_pages, page, plinks.q);
		bs.balloon_low++;
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents = nr_pages;
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	KASSERT(ret == nr_pages, ("HYPERVISOR_memory_op failed"));

	bs.current_pages -= nr_pages;

	return (need_sleep);
}

/*
 * We avoid multiple worker processes conflicting via the balloon mutex.
 * We may of course race updates of the target counts (which are protected
 * by the balloon lock), or with changes to the Xen hard limit, but we will
 * recover from these in time.
 */
static void
balloon_process(void *unused)
{
	int need_sleep = 0;
	long credit;

	mtx_lock(&balloon_mutex);
	for (;;) {
		int sleep_time;

		do {
			credit = current_target() - bs.current_pages;
			if (credit > 0)
				need_sleep = (increase_reservation(credit) != 0);
			if (credit < 0)
				need_sleep = (decrease_reservation(-credit) != 0);

		} while ((credit != 0) && !need_sleep);

		/* Schedule more work if there is some still to be done. */
		if (current_target() != bs.current_pages)
			sleep_time = hz;
		else
			sleep_time = 0;

		msleep(balloon_process, &balloon_mutex, 0, "balloon",
		    sleep_time);
	}
	mtx_unlock(&balloon_mutex);
}

/* Resets the Xen limit, sets new target, and kicks off processing. */
static void
set_new_target(unsigned long target)
{
	/* No need for lock. Not read-modify-write updates. */
	bs.hard_limit = ~0UL;
	bs.target_pages = max(target, minimum_target());
	wakeup(balloon_process);
}

static struct xs_watch target_watch = {
	.node = "memory/target"
};

/* React to a change in the target key */
static void
watch_target(struct xs_watch *watch, const char **vec, unsigned int len)
{
	unsigned long long new_target;
	int err;

	err = xs_scanf(XST_NIL, "memory", "target", NULL, "%llu", &new_target);
	if (err) {
		/* This is ok (for domain0 at least) - so just return */
		return;
	}

	/*
	 * The given memory/target value is in KiB, so it needs converting to
	 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
 */
	set_new_target(new_target >> KB_TO_PAGE_SHIFT);
}

/*------------------ Private Device Attachment Functions --------------------*/
/**
 * \brief Identify instances of this device type in the system.
 *
 * \param driver  The driver performing this identify action.
 * \param parent  The NewBus parent device for any devices this method adds.
 */
static void
xenballoon_identify(driver_t *driver __unused, device_t parent)
{
	/*
	 * A single device instance for our driver is always present
	 * in a system operating under Xen.
	 */
	BUS_ADD_CHILD(parent, 0, driver->name, 0);
}

/**
 * \brief Probe for the existence of the Xen Balloon device
 *
 * \param dev  NewBus device_t for this Xen control instance.
 *
 * \return  Always returns 0 indicating success.
 */
static int
xenballoon_probe(device_t dev)
{
	device_set_desc(dev, "Xen Balloon Device");
	return (0);
}

/**
 * \brief Attach the Xen Balloon device.
 *
 * \param dev  NewBus device_t for this Xen control instance.
 *
 * \return  On success, 0. Otherwise an errno value indicating the
 *          type of failure.
 */
static int
xenballoon_attach(device_t dev)
{
	int err;

	mtx_init(&balloon_mutex, "balloon_mutex", NULL, MTX_DEF);

	bs.current_pages = realmem;
	bs.target_pages = bs.current_pages;
	bs.balloon_low = 0;
	bs.balloon_high = 0;
	bs.driver_pages = 0UL;
	bs.hard_limit = ~0UL;

	kproc_create(balloon_process, NULL, NULL, 0, 0, "balloon");

	target_watch.callback = watch_target;
	err = xs_register_watch(&target_watch);
	if (err)
		device_printf(dev,
		    "xenballon: failed to set balloon watcher\n");

	return (err);
}

/*-------------------- Private Device Attachment Data -----------------------*/
static device_method_t xenballoon_methods[] = {
	/* Device interface */
	DEVMETHOD(device_identify, xenballoon_identify),
	DEVMETHOD(device_probe, xenballoon_probe),
	DEVMETHOD(device_attach, xenballoon_attach),

	DEVMETHOD_END
};

DEFINE_CLASS_0(xenballoon, xenballoon_driver, xenballoon_methods, 0);
devclass_t xenballoon_devclass;

DRIVER_MODULE(xenballoon, xenstore, xenballoon_driver, xenballoon_devclass,
    NULL, NULL);

Index: head/sys/dev/xen/blkfront/blkfront.c
===================================================================
--- head/sys/dev/xen/blkfront/blkfront.c	(revision 358315)
+++ head/sys/dev/xen/blkfront/blkfront.c	(revision 358316)
@@ -1,1653 +1,1654 @@
/*
 * XenBSD block device driver
 *
 * Copyright (c) 2010-2013 Spectra Logic Corporation
 * Copyright (c) 2009 Scott Long, Yahoo!
 * Copyright (c) 2009 Frank Suchomel, Citrix
 * Copyright (c) 2009 Doug F. Rabson, Citrix
 * Copyright (c) 2005 Kip Macy
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "xenbus_if.h" /*--------------------------- Forward Declarations ---------------------------*/ static void xbd_closing(device_t); static void xbd_startio(struct xbd_softc *sc); /*---------------------------------- Macros ----------------------------------*/ #if 0 #define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args) #else #define DPRINTK(fmt, args...) #endif #define XBD_SECTOR_SHFT 9 /*---------------------------- Global Static Data ----------------------------*/ static MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data"); static int xbd_enable_indirect = 1; -SYSCTL_NODE(_hw, OID_AUTO, xbd, CTLFLAG_RD, 0, "xbd driver parameters"); +SYSCTL_NODE(_hw, OID_AUTO, xbd, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "xbd driver parameters"); SYSCTL_INT(_hw_xbd, OID_AUTO, xbd_enable_indirect, CTLFLAG_RDTUN, &xbd_enable_indirect, 0, "Enable xbd indirect segments"); /*---------------------------- Command Processing ----------------------------*/ static void xbd_freeze(struct xbd_softc *sc, xbd_flag_t xbd_flag) { if (xbd_flag != XBDF_NONE && (sc->xbd_flags & xbd_flag) != 0) return; sc->xbd_flags |= xbd_flag; sc->xbd_qfrozen_cnt++; } static void xbd_thaw(struct xbd_softc *sc, xbd_flag_t xbd_flag) { if (xbd_flag != XBDF_NONE && (sc->xbd_flags & xbd_flag) == 0) return; if (sc->xbd_qfrozen_cnt == 0) panic("%s: Thaw with flag 0x%x while not frozen.", __func__, xbd_flag); sc->xbd_flags &= ~xbd_flag; sc->xbd_qfrozen_cnt--; } static void xbd_cm_freeze(struct xbd_softc *sc, struct xbd_command *cm, xbdc_flag_t cm_flag) { if ((cm->cm_flags & XBDCF_FROZEN) != 0) return; cm->cm_flags |= XBDCF_FROZEN|cm_flag; xbd_freeze(sc, XBDF_NONE); } static void xbd_cm_thaw(struct xbd_softc *sc, struct xbd_command *cm) { if ((cm->cm_flags & XBDCF_FROZEN) == 0) return; cm->cm_flags &= ~XBDCF_FROZEN; xbd_thaw(sc, XBDF_NONE); } static inline void xbd_flush_requests(struct xbd_softc *sc) { int notify; RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->xbd_ring, notify); if (notify) xen_intr_signal(sc->xen_intr_handle); } static void xbd_free_command(struct xbd_command *cm) { KASSERT((cm->cm_flags & XBDCF_Q_MASK) == XBD_Q_NONE, ("Freeing command that is still on queue %d.", cm->cm_flags & XBDCF_Q_MASK)); cm->cm_flags = XBDCF_INITIALIZER; cm->cm_bp = NULL; cm->cm_complete = NULL; xbd_enqueue_cm(cm, XBD_Q_FREE); xbd_thaw(cm->cm_sc, XBDF_CM_SHORTAGE); } static void xbd_mksegarray(bus_dma_segment_t *segs, int nsegs, grant_ref_t * gref_head, int otherend_id, int readonly, grant_ref_t * sg_ref, struct blkif_request_segment *sg) { struct blkif_request_segment *last_block_sg = sg + nsegs; vm_paddr_t buffer_ma; uint64_t fsect, lsect; int ref; while (sg < last_block_sg) { KASSERT(segs->ds_addr % (1 << XBD_SECTOR_SHFT) == 0, ("XEN disk driver I/O must be sector aligned")); KASSERT(segs->ds_len % (1 << XBD_SECTOR_SHFT) == 0, ("XEN disk driver I/Os must be a multiple of " "the sector length")); buffer_ma = segs->ds_addr; fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT; lsect = 
fsect + (segs->ds_len >> XBD_SECTOR_SHFT) - 1; KASSERT(lsect <= 7, ("XEN disk driver data cannot " "cross a page boundary")); /* install a grant reference. */ ref = gnttab_claim_grant_reference(gref_head); /* * GNTTAB_LIST_END == 0xffffffff, but it is private * to gnttab.c. */ KASSERT(ref != ~0, ("grant_reference failed")); gnttab_grant_foreign_access_ref( ref, otherend_id, buffer_ma >> PAGE_SHIFT, readonly); *sg_ref = ref; *sg = (struct blkif_request_segment) { .gref = ref, .first_sect = fsect, .last_sect = lsect }; sg++; sg_ref++; segs++; } } static void xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { struct xbd_softc *sc; struct xbd_command *cm; int op; cm = arg; sc = cm->cm_sc; if (error) { cm->cm_bp->bio_error = EIO; biodone(cm->cm_bp); xbd_free_command(cm); return; } KASSERT(nsegs <= sc->xbd_max_request_segments, ("Too many segments in a blkfront I/O")); if (nsegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST) { blkif_request_t *ring_req; /* Fill out a blkif_request_t structure. */ ring_req = (blkif_request_t *) RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); sc->xbd_ring.req_prod_pvt++; ring_req->id = cm->cm_id; ring_req->operation = cm->cm_operation; ring_req->sector_number = cm->cm_sector_number; ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; ring_req->nr_segments = nsegs; cm->cm_nseg = nsegs; xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, xenbus_get_otherend_id(sc->xbd_dev), cm->cm_operation == BLKIF_OP_WRITE, cm->cm_sg_refs, ring_req->seg); } else { blkif_request_indirect_t *ring_req; /* Fill out a blkif_request_indirect_t structure. */ ring_req = (blkif_request_indirect_t *) RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); sc->xbd_ring.req_prod_pvt++; ring_req->id = cm->cm_id; ring_req->operation = BLKIF_OP_INDIRECT; ring_req->indirect_op = cm->cm_operation; ring_req->sector_number = cm->cm_sector_number; ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; ring_req->nr_segments = nsegs; cm->cm_nseg = nsegs; xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, xenbus_get_otherend_id(sc->xbd_dev), cm->cm_operation == BLKIF_OP_WRITE, cm->cm_sg_refs, cm->cm_indirectionpages); memcpy(ring_req->indirect_grefs, &cm->cm_indirectionrefs, sizeof(grant_ref_t) * sc->xbd_max_request_indirectpages); } if (cm->cm_operation == BLKIF_OP_READ) op = BUS_DMASYNC_PREREAD; else if (cm->cm_operation == BLKIF_OP_WRITE) op = BUS_DMASYNC_PREWRITE; else op = 0; bus_dmamap_sync(sc->xbd_io_dmat, cm->cm_map, op); gnttab_free_grant_references(cm->cm_gref_head); xbd_enqueue_cm(cm, XBD_Q_BUSY); /* * If bus dma had to asynchronously call us back to dispatch * this command, we are no longer executing in the context of * xbd_startio(). Thus we cannot rely on xbd_startio()'s call to * xbd_flush_requests() to publish this command to the backend * along with any other commands that it could batch. */ if ((cm->cm_flags & XBDCF_ASYNC_MAPPING) != 0) xbd_flush_requests(sc); return; } static int xbd_queue_request(struct xbd_softc *sc, struct xbd_command *cm) { int error; if (cm->cm_bp != NULL) error = bus_dmamap_load_bio(sc->xbd_io_dmat, cm->cm_map, cm->cm_bp, xbd_queue_cb, cm, 0); else error = bus_dmamap_load(sc->xbd_io_dmat, cm->cm_map, cm->cm_data, cm->cm_datalen, xbd_queue_cb, cm, 0); if (error == EINPROGRESS) { /* * Maintain queuing order by freezing the queue. The next * command may not require as many resources as the command * we just attempted to map, so we can't rely on bus dma * blocking for it too. 
*/ xbd_cm_freeze(sc, cm, XBDCF_ASYNC_MAPPING); return (0); } return (error); } static void xbd_restart_queue_callback(void *arg) { struct xbd_softc *sc = arg; mtx_lock(&sc->xbd_io_lock); xbd_thaw(sc, XBDF_GNT_SHORTAGE); xbd_startio(sc); mtx_unlock(&sc->xbd_io_lock); } static struct xbd_command * xbd_bio_command(struct xbd_softc *sc) { struct xbd_command *cm; struct bio *bp; if (__predict_false(sc->xbd_state != XBD_STATE_CONNECTED)) return (NULL); bp = xbd_dequeue_bio(sc); if (bp == NULL) return (NULL); if ((cm = xbd_dequeue_cm(sc, XBD_Q_FREE)) == NULL) { xbd_freeze(sc, XBDF_CM_SHORTAGE); xbd_requeue_bio(sc, bp); return (NULL); } if (gnttab_alloc_grant_references(sc->xbd_max_request_segments, &cm->cm_gref_head) != 0) { gnttab_request_free_callback(&sc->xbd_callback, xbd_restart_queue_callback, sc, sc->xbd_max_request_segments); xbd_freeze(sc, XBDF_GNT_SHORTAGE); xbd_requeue_bio(sc, bp); xbd_enqueue_cm(cm, XBD_Q_FREE); return (NULL); } cm->cm_bp = bp; cm->cm_sector_number = (blkif_sector_t)bp->bio_pblkno; switch (bp->bio_cmd) { case BIO_READ: cm->cm_operation = BLKIF_OP_READ; break; case BIO_WRITE: cm->cm_operation = BLKIF_OP_WRITE; if ((bp->bio_flags & BIO_ORDERED) != 0) { if ((sc->xbd_flags & XBDF_BARRIER) != 0) { cm->cm_operation = BLKIF_OP_WRITE_BARRIER; } else { /* * Single step this command. */ cm->cm_flags |= XBDCF_Q_FREEZE; if (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { /* * Wait for in-flight requests to * finish. */ xbd_freeze(sc, XBDF_WAIT_IDLE); xbd_requeue_cm(cm, XBD_Q_READY); return (NULL); } } } break; case BIO_FLUSH: if ((sc->xbd_flags & XBDF_FLUSH) != 0) cm->cm_operation = BLKIF_OP_FLUSH_DISKCACHE; else if ((sc->xbd_flags & XBDF_BARRIER) != 0) cm->cm_operation = BLKIF_OP_WRITE_BARRIER; else panic("flush request, but no flush support available"); break; default: biofinish(bp, NULL, EOPNOTSUPP); xbd_enqueue_cm(cm, XBD_Q_FREE); return (NULL); } return (cm); } /* * Dequeue buffers and place them in the shared communication ring. * Return when no more requests can be accepted or all buffers have * been queued. * * Signal XEN once the ring has been filled out. */ static void xbd_startio(struct xbd_softc *sc) { struct xbd_command *cm; int error, queued = 0; mtx_assert(&sc->xbd_io_lock, MA_OWNED); if (sc->xbd_state != XBD_STATE_CONNECTED) return; while (!RING_FULL(&sc->xbd_ring)) { if (sc->xbd_qfrozen_cnt != 0) break; cm = xbd_dequeue_cm(sc, XBD_Q_READY); if (cm == NULL) cm = xbd_bio_command(sc); if (cm == NULL) break; if ((cm->cm_flags & XBDCF_Q_FREEZE) != 0) { /* * Single step command. Future work is * held off until this command completes. */ xbd_cm_freeze(sc, cm, XBDCF_Q_FREEZE); } if ((error = xbd_queue_request(sc, cm)) != 0) { printf("xbd_queue_request returned %d\n", error); break; } queued++; } if (queued != 0) xbd_flush_requests(sc); } static void xbd_bio_complete(struct xbd_softc *sc, struct xbd_command *cm) { struct bio *bp; bp = cm->cm_bp; if (__predict_false(cm->cm_status != BLKIF_RSP_OKAY)) { disk_err(bp, "disk error" , -1, 0); printf(" status: %x\n", cm->cm_status); bp->bio_flags |= BIO_ERROR; } if (bp->bio_flags & BIO_ERROR) bp->bio_error = EIO; else bp->bio_resid = 0; xbd_free_command(cm); biodone(bp); } static void xbd_int(void *xsc) { struct xbd_softc *sc = xsc; struct xbd_command *cm; blkif_response_t *bret; RING_IDX i, rp; int op; mtx_lock(&sc->xbd_io_lock); if (__predict_false(sc->xbd_state == XBD_STATE_DISCONNECTED)) { mtx_unlock(&sc->xbd_io_lock); return; } again: rp = sc->xbd_ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. 
*/ for (i = sc->xbd_ring.rsp_cons; i != rp;) { bret = RING_GET_RESPONSE(&sc->xbd_ring, i); cm = &sc->xbd_shadow[bret->id]; xbd_remove_cm(cm, XBD_Q_BUSY); gnttab_end_foreign_access_references(cm->cm_nseg, cm->cm_sg_refs); i++; if (cm->cm_operation == BLKIF_OP_READ) op = BUS_DMASYNC_POSTREAD; else if (cm->cm_operation == BLKIF_OP_WRITE || cm->cm_operation == BLKIF_OP_WRITE_BARRIER) op = BUS_DMASYNC_POSTWRITE; else op = 0; bus_dmamap_sync(sc->xbd_io_dmat, cm->cm_map, op); bus_dmamap_unload(sc->xbd_io_dmat, cm->cm_map); /* * Release any hold this command has on future command * dispatch. */ xbd_cm_thaw(sc, cm); /* * Directly call the i/o complete routine to save an * an indirection in the common case. */ cm->cm_status = bret->status; if (cm->cm_bp) xbd_bio_complete(sc, cm); else if (cm->cm_complete != NULL) cm->cm_complete(cm); else xbd_free_command(cm); } sc->xbd_ring.rsp_cons = i; if (i != sc->xbd_ring.req_prod_pvt) { int more_to_do; RING_FINAL_CHECK_FOR_RESPONSES(&sc->xbd_ring, more_to_do); if (more_to_do) goto again; } else { sc->xbd_ring.sring->rsp_event = i + 1; } if (xbd_queue_length(sc, XBD_Q_BUSY) == 0) xbd_thaw(sc, XBDF_WAIT_IDLE); xbd_startio(sc); if (__predict_false(sc->xbd_state == XBD_STATE_SUSPENDED)) wakeup(&sc->xbd_cm_q[XBD_Q_BUSY]); mtx_unlock(&sc->xbd_io_lock); } /*------------------------------- Dump Support -------------------------------*/ /** * Quiesce the disk writes for a dump file before allowing the next buffer. */ static void xbd_quiesce(struct xbd_softc *sc) { int mtd; // While there are outstanding requests while (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { RING_FINAL_CHECK_FOR_RESPONSES(&sc->xbd_ring, mtd); if (mtd) { /* Received request completions, update queue. */ xbd_int(sc); } if (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { /* * Still pending requests, wait for the disk i/o * to complete. */ HYPERVISOR_yield(); } } } /* Kernel dump function for a paravirtualized disk device */ static void xbd_dump_complete(struct xbd_command *cm) { xbd_enqueue_cm(cm, XBD_Q_COMPLETE); } static int xbd_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct disk *dp = arg; struct xbd_softc *sc = dp->d_drv1; struct xbd_command *cm; size_t chunk; int sbp; int rc = 0; if (length == 0) return (0); xbd_quiesce(sc); /* All quiet on the western front. */ /* * If this lock is held, then this module is failing, and a * successful kernel dump is highly unlikely anyway. */ mtx_lock(&sc->xbd_io_lock); /* Split the 64KB block as needed */ for (sbp=0; length > 0; sbp++) { cm = xbd_dequeue_cm(sc, XBD_Q_FREE); if (cm == NULL) { mtx_unlock(&sc->xbd_io_lock); device_printf(sc->xbd_dev, "dump: no more commands?\n"); return (EBUSY); } if (gnttab_alloc_grant_references(sc->xbd_max_request_segments, &cm->cm_gref_head) != 0) { xbd_free_command(cm); mtx_unlock(&sc->xbd_io_lock); device_printf(sc->xbd_dev, "no more grant allocs?\n"); return (EBUSY); } chunk = length > sc->xbd_max_request_size ? sc->xbd_max_request_size : length; cm->cm_data = virtual; cm->cm_datalen = chunk; cm->cm_operation = BLKIF_OP_WRITE; cm->cm_sector_number = offset / dp->d_sectorsize; cm->cm_complete = xbd_dump_complete; xbd_enqueue_cm(cm, XBD_Q_READY); length -= chunk; offset += chunk; virtual = (char *) virtual + chunk; } /* Tell DOM0 to do the I/O */ xbd_startio(sc); mtx_unlock(&sc->xbd_io_lock); /* Poll for the completion. */ xbd_quiesce(sc); /* All quite on the eastern front */ /* If there were any errors, bail out... 
*/ while ((cm = xbd_dequeue_cm(sc, XBD_Q_COMPLETE)) != NULL) { if (cm->cm_status != BLKIF_RSP_OKAY) { device_printf(sc->xbd_dev, "Dump I/O failed at sector %jd\n", cm->cm_sector_number); rc = EIO; } xbd_free_command(cm); } return (rc); } /*----------------------------- Disk Entrypoints -----------------------------*/ static int xbd_open(struct disk *dp) { struct xbd_softc *sc = dp->d_drv1; if (sc == NULL) { printf("xbd%d: not found", dp->d_unit); return (ENXIO); } sc->xbd_flags |= XBDF_OPEN; sc->xbd_users++; return (0); } static int xbd_close(struct disk *dp) { struct xbd_softc *sc = dp->d_drv1; if (sc == NULL) return (ENXIO); sc->xbd_flags &= ~XBDF_OPEN; if (--(sc->xbd_users) == 0) { /* * Check whether we have been instructed to close. We will * have ignored this request initially, as the device was * still mounted. */ if (xenbus_get_otherend_state(sc->xbd_dev) == XenbusStateClosing) xbd_closing(sc->xbd_dev); } return (0); } static int xbd_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) { struct xbd_softc *sc = dp->d_drv1; if (sc == NULL) return (ENXIO); return (ENOTTY); } /* * Read/write routine for a buffer. Finds the proper unit, place it on * the sortq and kick the controller. */ static void xbd_strategy(struct bio *bp) { struct xbd_softc *sc = bp->bio_disk->d_drv1; /* bogus disk? */ if (sc == NULL) { bp->bio_error = EINVAL; bp->bio_flags |= BIO_ERROR; bp->bio_resid = bp->bio_bcount; biodone(bp); return; } /* * Place it in the queue of disk activities for this disk */ mtx_lock(&sc->xbd_io_lock); xbd_enqueue_bio(sc, bp); xbd_startio(sc); mtx_unlock(&sc->xbd_io_lock); return; } /*------------------------------ Ring Management -----------------------------*/ static int xbd_alloc_ring(struct xbd_softc *sc) { blkif_sring_t *sring; uintptr_t sring_page_addr; int error; int i; sring = malloc(sc->xbd_ring_pages * PAGE_SIZE, M_XENBLOCKFRONT, M_NOWAIT|M_ZERO); if (sring == NULL) { xenbus_dev_fatal(sc->xbd_dev, ENOMEM, "allocating shared ring"); return (ENOMEM); } SHARED_RING_INIT(sring); FRONT_RING_INIT(&sc->xbd_ring, sring, sc->xbd_ring_pages * PAGE_SIZE); for (i = 0, sring_page_addr = (uintptr_t)sring; i < sc->xbd_ring_pages; i++, sring_page_addr += PAGE_SIZE) { error = xenbus_grant_ring(sc->xbd_dev, (vtophys(sring_page_addr) >> PAGE_SHIFT), &sc->xbd_ring_ref[i]); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "granting ring_ref(%d)", i); return (error); } } if (sc->xbd_ring_pages == 1) { error = xs_printf(XST_NIL, xenbus_get_node(sc->xbd_dev), "ring-ref", "%u", sc->xbd_ring_ref[0]); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/ring-ref", xenbus_get_node(sc->xbd_dev)); return (error); } } else { for (i = 0; i < sc->xbd_ring_pages; i++) { char ring_ref_name[]= "ring_refXX"; snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i); error = xs_printf(XST_NIL, xenbus_get_node(sc->xbd_dev), ring_ref_name, "%u", sc->xbd_ring_ref[i]); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/%s", xenbus_get_node(sc->xbd_dev), ring_ref_name); return (error); } } } error = xen_intr_alloc_and_bind_local_port(sc->xbd_dev, xenbus_get_otherend_id(sc->xbd_dev), NULL, xbd_int, sc, INTR_TYPE_BIO | INTR_MPSAFE, &sc->xen_intr_handle); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "xen_intr_alloc_and_bind_local_port failed"); return (error); } return (0); } static void xbd_free_ring(struct xbd_softc *sc) { int i; if (sc->xbd_ring.sring == NULL) return; for (i = 0; i < sc->xbd_ring_pages; i++) { if (sc->xbd_ring_ref[i] != GRANT_REF_INVALID) { 
gnttab_end_foreign_access_ref(sc->xbd_ring_ref[i]); sc->xbd_ring_ref[i] = GRANT_REF_INVALID; } } free(sc->xbd_ring.sring, M_XENBLOCKFRONT); sc->xbd_ring.sring = NULL; } /*-------------------------- Initialization/Teardown -------------------------*/ static int xbd_feature_string(struct xbd_softc *sc, char *features, size_t len) { struct sbuf sb; int feature_cnt; sbuf_new(&sb, features, len, SBUF_FIXEDLEN); feature_cnt = 0; if ((sc->xbd_flags & XBDF_FLUSH) != 0) { sbuf_printf(&sb, "flush"); feature_cnt++; } if ((sc->xbd_flags & XBDF_BARRIER) != 0) { if (feature_cnt != 0) sbuf_printf(&sb, ", "); sbuf_printf(&sb, "write_barrier"); feature_cnt++; } if ((sc->xbd_flags & XBDF_DISCARD) != 0) { if (feature_cnt != 0) sbuf_printf(&sb, ", "); sbuf_printf(&sb, "discard"); feature_cnt++; } if ((sc->xbd_flags & XBDF_PERSISTENT) != 0) { if (feature_cnt != 0) sbuf_printf(&sb, ", "); sbuf_printf(&sb, "persistent_grants"); feature_cnt++; } (void) sbuf_finish(&sb); return (sbuf_len(&sb)); } static int xbd_sysctl_features(SYSCTL_HANDLER_ARGS) { char features[80]; struct xbd_softc *sc = arg1; int error; int len; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); len = xbd_feature_string(sc, features, sizeof(features)); /* len is -1 on error, which will make the SYSCTL_OUT a no-op. */ return (SYSCTL_OUT(req, features, len + 1/*NUL*/)); } static void xbd_setup_sysctl(struct xbd_softc *xbd) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; struct sysctl_oid_list *children; sysctl_ctx = device_get_sysctl_ctx(xbd->xbd_dev); if (sysctl_ctx == NULL) return; sysctl_tree = device_get_sysctl_tree(xbd->xbd_dev); if (sysctl_tree == NULL) return; children = SYSCTL_CHILDREN(sysctl_tree); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "max_requests", CTLFLAG_RD, &xbd->xbd_max_requests, -1, "maximum outstanding requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "max_request_segments", CTLFLAG_RD, &xbd->xbd_max_request_segments, 0, "maximum number of pages per requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "max_request_size", CTLFLAG_RD, &xbd->xbd_max_request_size, 0, "maximum size in bytes of a request (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "ring_pages", CTLFLAG_RD, &xbd->xbd_ring_pages, 0, "communication channel pages (negotiated)"); SYSCTL_ADD_PROC(sysctl_ctx, children, OID_AUTO, - "features", CTLTYPE_STRING|CTLFLAG_RD, xbd, 0, - xbd_sysctl_features, "A", "protocol features (negotiated)"); + "features", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, xbd, + 0, xbd_sysctl_features, "A", "protocol features (negotiated)"); } /* * Translate Linux major/minor to an appropriate name and unit * number. For HVM guests, this allows us to use the same drive names * with blkfront as the emulated drives, easing transition slightly. 
*/ static void xbd_vdevice_to_unit(uint32_t vdevice, int *unit, const char **name) { static struct vdev_info { int major; int shift; int base; const char *name; } info[] = { {3, 6, 0, "ada"}, /* ide0 */ {22, 6, 2, "ada"}, /* ide1 */ {33, 6, 4, "ada"}, /* ide2 */ {34, 6, 6, "ada"}, /* ide3 */ {56, 6, 8, "ada"}, /* ide4 */ {57, 6, 10, "ada"}, /* ide5 */ {88, 6, 12, "ada"}, /* ide6 */ {89, 6, 14, "ada"}, /* ide7 */ {90, 6, 16, "ada"}, /* ide8 */ {91, 6, 18, "ada"}, /* ide9 */ {8, 4, 0, "da"}, /* scsi disk0 */ {65, 4, 16, "da"}, /* scsi disk1 */ {66, 4, 32, "da"}, /* scsi disk2 */ {67, 4, 48, "da"}, /* scsi disk3 */ {68, 4, 64, "da"}, /* scsi disk4 */ {69, 4, 80, "da"}, /* scsi disk5 */ {70, 4, 96, "da"}, /* scsi disk6 */ {71, 4, 112, "da"}, /* scsi disk7 */ {128, 4, 128, "da"}, /* scsi disk8 */ {129, 4, 144, "da"}, /* scsi disk9 */ {130, 4, 160, "da"}, /* scsi disk10 */ {131, 4, 176, "da"}, /* scsi disk11 */ {132, 4, 192, "da"}, /* scsi disk12 */ {133, 4, 208, "da"}, /* scsi disk13 */ {134, 4, 224, "da"}, /* scsi disk14 */ {135, 4, 240, "da"}, /* scsi disk15 */ {202, 4, 0, "xbd"}, /* xbd */ {0, 0, 0, NULL}, }; int major = vdevice >> 8; int minor = vdevice & 0xff; int i; if (vdevice & (1 << 28)) { *unit = (vdevice & ((1 << 28) - 1)) >> 8; *name = "xbd"; return; } for (i = 0; info[i].major; i++) { if (info[i].major == major) { *unit = info[i].base + (minor >> info[i].shift); *name = info[i].name; return; } } *unit = minor >> 4; *name = "xbd"; } int xbd_instance_create(struct xbd_softc *sc, blkif_sector_t sectors, int vdevice, uint16_t vdisk_info, unsigned long sector_size, unsigned long phys_sector_size) { char features[80]; int unit, error = 0; const char *name; xbd_vdevice_to_unit(vdevice, &unit, &name); sc->xbd_unit = unit; if (strcmp(name, "xbd") != 0) device_printf(sc->xbd_dev, "attaching as %s%d\n", name, unit); if (xbd_feature_string(sc, features, sizeof(features)) > 0) { device_printf(sc->xbd_dev, "features: %s\n", features); } sc->xbd_disk = disk_alloc(); sc->xbd_disk->d_unit = sc->xbd_unit; sc->xbd_disk->d_open = xbd_open; sc->xbd_disk->d_close = xbd_close; sc->xbd_disk->d_ioctl = xbd_ioctl; sc->xbd_disk->d_strategy = xbd_strategy; sc->xbd_disk->d_dump = xbd_dump; sc->xbd_disk->d_name = name; sc->xbd_disk->d_drv1 = sc; sc->xbd_disk->d_sectorsize = sector_size; sc->xbd_disk->d_stripesize = phys_sector_size; sc->xbd_disk->d_stripeoffset = 0; sc->xbd_disk->d_mediasize = sectors * sector_size; sc->xbd_disk->d_maxsize = sc->xbd_max_request_size; sc->xbd_disk->d_flags = DISKFLAG_UNMAPPED_BIO; if ((sc->xbd_flags & (XBDF_FLUSH|XBDF_BARRIER)) != 0) { sc->xbd_disk->d_flags |= DISKFLAG_CANFLUSHCACHE; device_printf(sc->xbd_dev, "synchronize cache commands enabled.\n"); } disk_create(sc->xbd_disk, DISK_VERSION); return error; } static void xbd_free(struct xbd_softc *sc) { int i; /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xbd_io_lock); sc->xbd_state = XBD_STATE_DISCONNECTED; mtx_unlock(&sc->xbd_io_lock); /* Free resources associated with old device channel. 
*/ xbd_free_ring(sc); if (sc->xbd_shadow) { for (i = 0; i < sc->xbd_max_requests; i++) { struct xbd_command *cm; cm = &sc->xbd_shadow[i]; if (cm->cm_sg_refs != NULL) { free(cm->cm_sg_refs, M_XENBLOCKFRONT); cm->cm_sg_refs = NULL; } if (cm->cm_indirectionpages != NULL) { gnttab_end_foreign_access_references( sc->xbd_max_request_indirectpages, &cm->cm_indirectionrefs[0]); contigfree(cm->cm_indirectionpages, PAGE_SIZE * sc->xbd_max_request_indirectpages, M_XENBLOCKFRONT); cm->cm_indirectionpages = NULL; } bus_dmamap_destroy(sc->xbd_io_dmat, cm->cm_map); } free(sc->xbd_shadow, M_XENBLOCKFRONT); sc->xbd_shadow = NULL; bus_dma_tag_destroy(sc->xbd_io_dmat); xbd_initq_cm(sc, XBD_Q_FREE); xbd_initq_cm(sc, XBD_Q_READY); xbd_initq_cm(sc, XBD_Q_COMPLETE); } xen_intr_unbind(&sc->xen_intr_handle); } /*--------------------------- State Change Handlers --------------------------*/ static void xbd_initialize(struct xbd_softc *sc) { const char *otherend_path; const char *node_path; uint32_t max_ring_page_order; int error; if (xenbus_get_state(sc->xbd_dev) != XenbusStateInitialising) { /* Initialization has already been performed. */ return; } /* * Protocol defaults valid even if negotiation for a * setting fails. */ max_ring_page_order = 0; sc->xbd_ring_pages = 1; /* * Protocol negotiation. * * \note xs_gather() returns on the first encountered error, so * we must use independent calls in order to guarantee * we don't miss information in a sparsly populated back-end * tree. * * \note xs_scanf() does not update variables for unmatched * fields. */ otherend_path = xenbus_get_otherend_path(sc->xbd_dev); node_path = xenbus_get_node(sc->xbd_dev); /* Support both backend schemes for relaying ring page limits. */ (void)xs_scanf(XST_NIL, otherend_path, "max-ring-page-order", NULL, "%" PRIu32, &max_ring_page_order); sc->xbd_ring_pages = 1 << max_ring_page_order; (void)xs_scanf(XST_NIL, otherend_path, "max-ring-pages", NULL, "%" PRIu32, &sc->xbd_ring_pages); if (sc->xbd_ring_pages < 1) sc->xbd_ring_pages = 1; if (sc->xbd_ring_pages > XBD_MAX_RING_PAGES) { device_printf(sc->xbd_dev, "Back-end specified ring-pages of %u " "limited to front-end limit of %u.\n", sc->xbd_ring_pages, XBD_MAX_RING_PAGES); sc->xbd_ring_pages = XBD_MAX_RING_PAGES; } if (powerof2(sc->xbd_ring_pages) == 0) { uint32_t new_page_limit; new_page_limit = 0x01 << (fls(sc->xbd_ring_pages) - 1); device_printf(sc->xbd_dev, "Back-end specified ring-pages of %u " "is not a power of 2. Limited to %u.\n", sc->xbd_ring_pages, new_page_limit); sc->xbd_ring_pages = new_page_limit; } sc->xbd_max_requests = BLKIF_MAX_RING_REQUESTS(sc->xbd_ring_pages * PAGE_SIZE); if (sc->xbd_max_requests > XBD_MAX_REQUESTS) { device_printf(sc->xbd_dev, "Back-end specified max_requests of %u " "limited to front-end limit of %zu.\n", sc->xbd_max_requests, XBD_MAX_REQUESTS); sc->xbd_max_requests = XBD_MAX_REQUESTS; } if (xbd_alloc_ring(sc) != 0) return; /* Support both backend schemes for relaying ring page limits. 
*/ if (sc->xbd_ring_pages > 1) { error = xs_printf(XST_NIL, node_path, "num-ring-pages","%u", sc->xbd_ring_pages); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/num-ring-pages", node_path); return; } error = xs_printf(XST_NIL, node_path, "ring-page-order", "%u", fls(sc->xbd_ring_pages) - 1); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/ring-page-order", node_path); return; } } error = xs_printf(XST_NIL, node_path, "event-channel", "%u", xen_intr_port(sc->xen_intr_handle)); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/event-channel", node_path); return; } error = xs_printf(XST_NIL, node_path, "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/protocol", node_path); return; } xenbus_set_state(sc->xbd_dev, XenbusStateInitialised); } /* * Invoked when the backend is finally 'ready' (and has published * the details about the physical device - #sectors, size, etc). */ static void xbd_connect(struct xbd_softc *sc) { device_t dev = sc->xbd_dev; unsigned long sectors, sector_size, phys_sector_size; unsigned int binfo; int err, feature_barrier, feature_flush; int i, j; DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev)); if (sc->xbd_state == XBD_STATE_SUSPENDED) { return; } if (sc->xbd_state == XBD_STATE_CONNECTED) { struct disk *disk; disk = sc->xbd_disk; if (disk == NULL) { return; } err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "sectors", "%lu", §ors, NULL); if (err != 0) { xenbus_dev_error(dev, err, "reading sectors at %s", xenbus_get_otherend_path(dev)); return; } disk->d_mediasize = disk->d_sectorsize * sectors; err = disk_resize(disk, M_NOWAIT); if (err) { xenbus_dev_error(dev, err, "unable to resize disk %s%u", disk->d_name, disk->d_unit); return; } device_printf(sc->xbd_dev, "changed capacity to %jd\n", (intmax_t)disk->d_mediasize); return; } err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "sectors", "%lu", §ors, "info", "%u", &binfo, "sector-size", "%lu", §or_size, NULL); if (err) { xenbus_dev_fatal(dev, err, "reading backend fields at %s", xenbus_get_otherend_path(dev)); return; } if ((sectors == 0) || (sector_size == 0)) { xenbus_dev_fatal(dev, 0, "invalid parameters from %s:" " sectors = %lu, sector_size = %lu", xenbus_get_otherend_path(dev), sectors, sector_size); return; } err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "physical-sector-size", "%lu", &phys_sector_size, NULL); if (err || phys_sector_size <= sector_size) phys_sector_size = 0; err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "feature-barrier", "%d", &feature_barrier, NULL); if (err == 0 && feature_barrier != 0) sc->xbd_flags |= XBDF_BARRIER; err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "feature-flush-cache", "%d", &feature_flush, NULL); if (err == 0 && feature_flush != 0) sc->xbd_flags |= XBDF_FLUSH; err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "feature-max-indirect-segments", "%" PRIu32, &sc->xbd_max_request_segments, NULL); if ((err != 0) || (xbd_enable_indirect == 0)) sc->xbd_max_request_segments = 0; if (sc->xbd_max_request_segments > XBD_MAX_INDIRECT_SEGMENTS) sc->xbd_max_request_segments = XBD_MAX_INDIRECT_SEGMENTS; if (sc->xbd_max_request_segments > XBD_SIZE_TO_SEGS(MAXPHYS)) sc->xbd_max_request_segments = XBD_SIZE_TO_SEGS(MAXPHYS); sc->xbd_max_request_indirectpages = XBD_INDIRECT_SEGS_TO_PAGES(sc->xbd_max_request_segments); if (sc->xbd_max_request_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST) sc->xbd_max_request_segments = 
BLKIF_MAX_SEGMENTS_PER_REQUEST; sc->xbd_max_request_size = XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments); /* Allocate datastructures based on negotiated values. */ err = bus_dma_tag_create( bus_get_dma_tag(sc->xbd_dev), /* parent */ 512, PAGE_SIZE, /* algnmnt, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ sc->xbd_max_request_size, sc->xbd_max_request_segments, PAGE_SIZE, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ busdma_lock_mutex, /* lockfunc */ &sc->xbd_io_lock, /* lockarg */ &sc->xbd_io_dmat); if (err != 0) { xenbus_dev_fatal(sc->xbd_dev, err, "Cannot allocate parent DMA tag\n"); return; } /* Per-transaction data allocation. */ sc->xbd_shadow = malloc(sizeof(*sc->xbd_shadow) * sc->xbd_max_requests, M_XENBLOCKFRONT, M_NOWAIT|M_ZERO); if (sc->xbd_shadow == NULL) { bus_dma_tag_destroy(sc->xbd_io_dmat); xenbus_dev_fatal(sc->xbd_dev, ENOMEM, "Cannot allocate request structures\n"); return; } for (i = 0; i < sc->xbd_max_requests; i++) { struct xbd_command *cm; void * indirectpages; cm = &sc->xbd_shadow[i]; cm->cm_sg_refs = malloc( sizeof(grant_ref_t) * sc->xbd_max_request_segments, M_XENBLOCKFRONT, M_NOWAIT); if (cm->cm_sg_refs == NULL) break; cm->cm_id = i; cm->cm_flags = XBDCF_INITIALIZER; cm->cm_sc = sc; if (bus_dmamap_create(sc->xbd_io_dmat, 0, &cm->cm_map) != 0) break; if (sc->xbd_max_request_indirectpages > 0) { indirectpages = contigmalloc( PAGE_SIZE * sc->xbd_max_request_indirectpages, M_XENBLOCKFRONT, M_ZERO | M_NOWAIT, 0, ~0, PAGE_SIZE, 0); if (indirectpages == NULL) sc->xbd_max_request_indirectpages = 0; } else { indirectpages = NULL; } for (j = 0; j < sc->xbd_max_request_indirectpages; j++) { if (gnttab_grant_foreign_access( xenbus_get_otherend_id(sc->xbd_dev), (vtophys(indirectpages) >> PAGE_SHIFT) + j, 1 /* grant read-only access */, &cm->cm_indirectionrefs[j])) break; } if (j < sc->xbd_max_request_indirectpages) { contigfree(indirectpages, PAGE_SIZE * sc->xbd_max_request_indirectpages, M_XENBLOCKFRONT); break; } cm->cm_indirectionpages = indirectpages; xbd_free_command(cm); } if (sc->xbd_disk == NULL) { device_printf(dev, "%juMB <%s> at %s", (uintmax_t) sectors / (1048576 / sector_size), device_get_desc(dev), xenbus_get_node(dev)); bus_print_child_footer(device_get_parent(dev), dev); xbd_instance_create(sc, sectors, sc->xbd_vdevice, binfo, sector_size, phys_sector_size); } (void)xenbus_set_state(dev, XenbusStateConnected); /* Kick pending requests. */ mtx_lock(&sc->xbd_io_lock); sc->xbd_state = XBD_STATE_CONNECTED; xbd_startio(sc); sc->xbd_flags |= XBDF_READY; mtx_unlock(&sc->xbd_io_lock); } /** * Handle the change of state of the backend to Closing. We must delete our * device-layer structures now, to ensure that writes are flushed through to * the backend. Once this is done, we can switch to Closed in * acknowledgement. 
*/ static void xbd_closing(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); xenbus_set_state(dev, XenbusStateClosing); DPRINTK("xbd_closing: %s removed\n", xenbus_get_node(dev)); if (sc->xbd_disk != NULL) { disk_destroy(sc->xbd_disk); sc->xbd_disk = NULL; } xenbus_set_state(dev, XenbusStateClosed); } /*---------------------------- NewBus Entrypoints ----------------------------*/ static int xbd_probe(device_t dev) { if (strcmp(xenbus_get_type(dev), "vbd") != 0) return (ENXIO); if (xen_hvm_domain() && xen_disable_pv_disks != 0) return (ENXIO); if (xen_hvm_domain()) { int error; char *type; /* * When running in an HVM domain, IDE disk emulation is * disabled early in boot so that native drivers will * not see emulated hardware. However, CDROM device * emulation cannot be disabled. * * Through use of FreeBSD's vm_guest and xen_hvm_domain() * APIs, we could modify the native CDROM driver to fail its * probe when running under Xen. Unfortunatlely, the PV * CDROM support in XenServer (up through at least version * 6.2) isn't functional, so we instead rely on the emulated * CDROM instance, and fail to attach the PV one here in * the blkfront driver. */ error = xs_read(XST_NIL, xenbus_get_node(dev), "device-type", NULL, (void **) &type); if (error) return (ENXIO); if (strncmp(type, "cdrom", 5) == 0) { free(type, M_XENSTORE); return (ENXIO); } free(type, M_XENSTORE); } device_set_desc(dev, "Virtual Block Device"); device_quiet(dev); return (0); } /* * Setup supplies the backend dir, virtual device. We place an event * channel and shared frame entries. We watch backend to wait if it's * ok. */ static int xbd_attach(device_t dev) { struct xbd_softc *sc; const char *name; uint32_t vdevice; int error; int i; int unit; /* FIXME: Use dynamic device id if this is not set. */ error = xs_scanf(XST_NIL, xenbus_get_node(dev), "virtual-device", NULL, "%" PRIu32, &vdevice); if (error) error = xs_scanf(XST_NIL, xenbus_get_node(dev), "virtual-device-ext", NULL, "%" PRIu32, &vdevice); if (error) { xenbus_dev_fatal(dev, error, "reading virtual-device"); device_printf(dev, "Couldn't determine virtual device.\n"); return (error); } xbd_vdevice_to_unit(vdevice, &unit, &name); if (!strcmp(name, "xbd")) device_set_unit(dev, unit); sc = device_get_softc(dev); mtx_init(&sc->xbd_io_lock, "blkfront i/o lock", NULL, MTX_DEF); xbd_initqs(sc); for (i = 0; i < XBD_MAX_RING_PAGES; i++) sc->xbd_ring_ref[i] = GRANT_REF_INVALID; sc->xbd_dev = dev; sc->xbd_vdevice = vdevice; sc->xbd_state = XBD_STATE_DISCONNECTED; xbd_setup_sysctl(sc); /* Wait for backend device to publish its protocol capabilities. */ xenbus_set_state(dev, XenbusStateInitialising); return (0); } static int xbd_detach(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); DPRINTK("%s: %s removed\n", __func__, xenbus_get_node(dev)); xbd_free(sc); mtx_destroy(&sc->xbd_io_lock); return 0; } static int xbd_suspend(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); int retval; int saved_state; /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xbd_io_lock); saved_state = sc->xbd_state; sc->xbd_state = XBD_STATE_SUSPENDED; /* Wait for outstanding I/O to drain. 
*/ retval = 0; while (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { if (msleep(&sc->xbd_cm_q[XBD_Q_BUSY], &sc->xbd_io_lock, PRIBIO, "blkf_susp", 30 * hz) == EWOULDBLOCK) { retval = EBUSY; break; } } mtx_unlock(&sc->xbd_io_lock); if (retval != 0) sc->xbd_state = saved_state; return (retval); } static int xbd_resume(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); if (xen_suspend_cancelled) { sc->xbd_state = XBD_STATE_CONNECTED; return (0); } DPRINTK("xbd_resume: %s\n", xenbus_get_node(dev)); xbd_free(sc); xbd_initialize(sc); return (0); } /** * Callback received when the backend's state changes. */ static void xbd_backend_changed(device_t dev, XenbusState backend_state) { struct xbd_softc *sc = device_get_softc(dev); DPRINTK("backend_state=%d\n", backend_state); switch (backend_state) { case XenbusStateUnknown: case XenbusStateInitialising: case XenbusStateReconfigured: case XenbusStateReconfiguring: case XenbusStateClosed: break; case XenbusStateInitWait: case XenbusStateInitialised: xbd_initialize(sc); break; case XenbusStateConnected: xbd_initialize(sc); xbd_connect(sc); break; case XenbusStateClosing: if (sc->xbd_users > 0) { device_printf(dev, "detaching with pending users\n"); KASSERT(sc->xbd_disk != NULL, ("NULL disk with pending users\n")); disk_gone(sc->xbd_disk); } else { xbd_closing(dev); } break; } } /*---------------------------- NewBus Registration ---------------------------*/ static device_method_t xbd_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xbd_probe), DEVMETHOD(device_attach, xbd_attach), DEVMETHOD(device_detach, xbd_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xbd_suspend), DEVMETHOD(device_resume, xbd_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, xbd_backend_changed), { 0, 0 } }; static driver_t xbd_driver = { "xbd", xbd_methods, sizeof(struct xbd_softc), }; devclass_t xbd_devclass; DRIVER_MODULE(xbd, xenbusb_front, xbd_driver, xbd_devclass, 0, 0); Index: head/sys/dev/xen/netback/netback.c =================================================================== --- head/sys/dev/xen/netback/netback.c (revision 358315) +++ head/sys/dev/xen/netback/netback.c (revision 358316) @@ -1,2515 +1,2515 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009-2011 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Justin T. Gibbs (Spectra Logic Corporation) * Alan Somers (Spectra Logic Corporation) * John Suykerbuyk (Spectra Logic Corporation) */ #include __FBSDID("$FreeBSD$"); /** * \file netback.c * * \brief Device driver supporting the vending of network access * from this FreeBSD domain to other domains. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if __FreeBSD_version >= 700000 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include /*--------------------------- Compile-time Tunables --------------------------*/ /*---------------------------------- Macros ----------------------------------*/ /** * Custom malloc type for all driver allocations. */ static MALLOC_DEFINE(M_XENNETBACK, "xnb", "Xen Net Back Driver Data"); #define XNB_SG 1 /* netback driver supports feature-sg */ #define XNB_GSO_TCPV4 0 /* netback driver supports feature-gso-tcpv4 */ #define XNB_RX_COPY 1 /* netback driver supports feature-rx-copy */ #define XNB_RX_FLIP 0 /* netback driver does not support feature-rx-flip */ #undef XNB_DEBUG #define XNB_DEBUG /* hardcode on during development */ #ifdef XNB_DEBUG #define DPRINTF(fmt, args...) \ printf("xnb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) #else #define DPRINTF(fmt, args...) do {} while (0) #endif /* Default length for stack-allocated grant tables */ #define GNTTAB_LEN (64) /* Features supported by all backends. TSO and LRO can be negotiated */ #define XNB_CSUM_FEATURES (CSUM_TCP | CSUM_UDP) #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) /** * Two argument version of the standard macro. Second argument is a tentative * value of req_cons */ #define RING_HAS_UNCONSUMED_REQUESTS_2(_r, cons) ({ \ unsigned int req = (_r)->sring->req_prod - cons; \ unsigned int rsp = RING_SIZE(_r) - \ (cons - (_r)->rsp_prod_pvt); \ req < rsp ? req : rsp; \ }) #define virt_to_mfn(x) (vtophys(x) >> PAGE_SHIFT) #define virt_to_offset(x) ((x) & (PAGE_SIZE - 1)) /** * Predefined array type of grant table copy descriptors. Used to pass around * statically allocated memory structures. */ typedef struct gnttab_copy gnttab_copy_table[GNTTAB_LEN]; /*--------------------------- Forward Declarations ---------------------------*/ struct xnb_softc; struct xnb_pkt; static void xnb_attach_failed(struct xnb_softc *xnb, int err, const char *fmt, ...) 
__printflike(3,4); static int xnb_shutdown(struct xnb_softc *xnb); static int create_netdev(device_t dev); static int xnb_detach(device_t dev); static int xnb_ifmedia_upd(struct ifnet *ifp); static void xnb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); static void xnb_intr(void *arg); static int xnb_send(netif_rx_back_ring_t *rxb, domid_t otherend, const struct mbuf *mbufc, gnttab_copy_table gnttab); static int xnb_recv(netif_tx_back_ring_t *txb, domid_t otherend, struct mbuf **mbufc, struct ifnet *ifnet, gnttab_copy_table gnttab); static int xnb_ring2pkt(struct xnb_pkt *pkt, const netif_tx_back_ring_t *tx_ring, RING_IDX start); static void xnb_txpkt2rsp(const struct xnb_pkt *pkt, netif_tx_back_ring_t *ring, int error); static struct mbuf *xnb_pkt2mbufc(const struct xnb_pkt *pkt, struct ifnet *ifp); static int xnb_txpkt2gnttab(const struct xnb_pkt *pkt, struct mbuf *mbufc, gnttab_copy_table gnttab, const netif_tx_back_ring_t *txb, domid_t otherend_id); static void xnb_update_mbufc(struct mbuf *mbufc, const gnttab_copy_table gnttab, int n_entries); static int xnb_mbufc2pkt(const struct mbuf *mbufc, struct xnb_pkt *pkt, RING_IDX start, int space); static int xnb_rxpkt2gnttab(const struct xnb_pkt *pkt, const struct mbuf *mbufc, gnttab_copy_table gnttab, const netif_rx_back_ring_t *rxb, domid_t otherend_id); static int xnb_rxpkt2rsp(const struct xnb_pkt *pkt, const gnttab_copy_table gnttab, int n_entries, netif_rx_back_ring_t *ring); static void xnb_stop(struct xnb_softc*); static int xnb_ioctl(struct ifnet*, u_long, caddr_t); static void xnb_start_locked(struct ifnet*); static void xnb_start(struct ifnet*); static void xnb_ifinit_locked(struct xnb_softc*); static void xnb_ifinit(void*); #ifdef XNB_DEBUG static int xnb_unit_test_main(SYSCTL_HANDLER_ARGS); static int xnb_dump_rings(SYSCTL_HANDLER_ARGS); #endif #if defined(INET) || defined(INET6) static void xnb_add_mbuf_cksum(struct mbuf *mbufc); #endif /*------------------------------ Data Structures -----------------------------*/ /** * Representation of a xennet packet. Simplified version of a packet as * stored in the Xen tx ring. Applicable to both RX and TX packets */ struct xnb_pkt{ /** * Array index of the first data-bearing (eg, not extra info) entry * for this packet */ RING_IDX car; /** * Array index of the second data-bearing entry for this packet. * Invalid if the packet has only one data-bearing entry. If the * packet has more than two data-bearing entries, then the second * through the last will be sequential modulo the ring size */ RING_IDX cdr; /** * Optional extra info. Only valid if flags contains * NETTXF_extra_info. Note that extra.type will always be * XEN_NETIF_EXTRA_TYPE_GSO. Currently, no known netfront or netback * driver will ever set XEN_NETIF_EXTRA_TYPE_MCAST_* */ netif_extra_info_t extra; /** Size of entire packet in bytes. */ uint16_t size; /** The size of the first entry's data in bytes */ uint16_t car_size; /** * Either NETTXF_ or NETRXF_ flags. Note that the flag values are * not the same for TX and RX packets */ uint16_t flags; /** * The number of valid data-bearing entries (either netif_tx_request's * or netif_rx_response's) in the packet. If this is 0, it means the * entire packet is invalid. 
*/ uint16_t list_len; /** There was an error processing the packet */ uint8_t error; }; /** xnb_pkt method: initialize it */ static inline void xnb_pkt_initialize(struct xnb_pkt *pxnb) { bzero(pxnb, sizeof(*pxnb)); } /** xnb_pkt method: mark the packet as valid */ static inline void xnb_pkt_validate(struct xnb_pkt *pxnb) { pxnb->error = 0; }; /** xnb_pkt method: mark the packet as invalid */ static inline void xnb_pkt_invalidate(struct xnb_pkt *pxnb) { pxnb->error = 1; }; /** xnb_pkt method: Check whether the packet is valid */ static inline int xnb_pkt_is_valid(const struct xnb_pkt *pxnb) { return (! pxnb->error); } #ifdef XNB_DEBUG /** xnb_pkt method: print the packet's contents in human-readable format */ static void __unused xnb_dump_pkt(const struct xnb_pkt *pkt) { if (pkt == NULL) { DPRINTF("Was passed a null pointer.\n"); return; } DPRINTF("pkt address= %p\n", pkt); DPRINTF("pkt->size=%d\n", pkt->size); DPRINTF("pkt->car_size=%d\n", pkt->car_size); DPRINTF("pkt->flags=0x%04x\n", pkt->flags); DPRINTF("pkt->list_len=%d\n", pkt->list_len); /* DPRINTF("pkt->extra"); TODO */ DPRINTF("pkt->car=%d\n", pkt->car); DPRINTF("pkt->cdr=%d\n", pkt->cdr); DPRINTF("pkt->error=%d\n", pkt->error); } #endif /* XNB_DEBUG */ static void xnb_dump_txreq(RING_IDX idx, const struct netif_tx_request *txreq) { if (txreq != NULL) { DPRINTF("netif_tx_request index =%u\n", idx); DPRINTF("netif_tx_request.gref =%u\n", txreq->gref); DPRINTF("netif_tx_request.offset=%hu\n", txreq->offset); DPRINTF("netif_tx_request.flags =%hu\n", txreq->flags); DPRINTF("netif_tx_request.id =%hu\n", txreq->id); DPRINTF("netif_tx_request.size =%hu\n", txreq->size); } } /** * \brief Configuration data for a shared memory request ring * used to communicate with the front-end client of this * driver. */ struct xnb_ring_config { /** * Runtime structures for ring access. Unfortunately, TX and RX rings * use different data structures, and that cannot be changed since it * is part of the interdomain protocol. */ union{ netif_rx_back_ring_t rx_ring; netif_tx_back_ring_t tx_ring; } back_ring; /** * The device bus address returned by the hypervisor when * mapping the ring and required to unmap it when a connection * is torn down. */ uint64_t bus_addr; /** The pseudo-physical address where ring memory is mapped.*/ uint64_t gnt_addr; /** KVA address where ring memory is mapped. */ vm_offset_t va; /** * Grant table handles, one per-ring page, returned by the * hypervisor upon mapping of the ring and required to * unmap it when a connection is torn down. */ grant_handle_t handle; /** The number of ring pages mapped for the current connection. */ unsigned ring_pages; /** * The grant references, one per-ring page, supplied by the * front-end, allowing us to reference the ring pages in the * front-end's domain and to map these pages into our own domain. */ grant_ref_t ring_ref; }; /** * Per-instance connection state flags. */ typedef enum { /** Communication with the front-end has been established. */ XNBF_RING_CONNECTED = 0x01, /** * Front-end requests exist in the ring and are waiting for * xnb_xen_req objects to free up. */ XNBF_RESOURCE_SHORTAGE = 0x02, /** Connection teardown has started. */ XNBF_SHUTDOWN = 0x04, /** A thread is already performing shutdown processing. */ XNBF_IN_SHUTDOWN = 0x08 } xnb_flag_t; /** * Types of rings.
Used for array indices and to identify a ring's control * data structure type */ typedef enum{ XNB_RING_TYPE_TX = 0, /* ID of TX rings, used for array indices */ XNB_RING_TYPE_RX = 1, /* ID of RX rings, used for array indices */ XNB_NUM_RING_TYPES } xnb_ring_type_t; /** * Per-instance configuration data. */ struct xnb_softc { /** NewBus device corresponding to this instance. */ device_t dev; /* Media related fields */ /** Generic network media state */ struct ifmedia sc_media; /** Media carrier info */ struct ifnet *xnb_ifp; /** Our own private carrier state */ unsigned carrier; /** Device MAC Address */ uint8_t mac[ETHER_ADDR_LEN]; /* Xen related fields */ /** * \brief The netif protocol abi in effect. * * There are situations where the back and front ends can * have a different, native abi (e.g. intel x86_64 and * 32bit x86 domains on the same machine). The back-end * always accommodates the front-end's native abi. That * value is pulled from the XenStore and recorded here. */ int abi; /** * Name of the bridge to which this VIF is connected, if any. * This field is dynamically allocated by xenbus and must be free()ed * when no longer needed */ char *bridge; /** The interrupt driven event channel used to signal ring events. */ evtchn_port_t evtchn; /** Xen device handle.*/ long handle; /** Handle to the communication ring event channel. */ xen_intr_handle_t xen_intr_handle; /** * \brief Cached value of the front-end's domain id. * * This value is used at once for each mapped page in * a transaction. We cache it to avoid incurring the * cost of an ivar access every time this is needed. */ domid_t otherend_id; /** * Undocumented frontend feature. Has something to do with * scatter/gather IO */ uint8_t can_sg; /** Undocumented frontend feature */ uint8_t gso; /** Undocumented frontend feature */ uint8_t gso_prefix; /** Can checksum TCP/UDP over IPv4 */ uint8_t ip_csum; /* Implementation related fields */ /** * Preallocated grant table copy descriptor for RX operations. * Access must be protected by rx_lock */ gnttab_copy_table rx_gnttab; /** * Preallocated grant table copy descriptor for TX operations. * Access must be protected by tx_lock */ gnttab_copy_table tx_gnttab; /** * Resource representing allocated physical address space * associated with our per-instance kva region. */ struct resource *pseudo_phys_res; /** Resource id for allocated physical address space. */ int pseudo_phys_res_id; /** Ring mapping and interrupt configuration data. */ struct xnb_ring_config ring_configs[XNB_NUM_RING_TYPES]; /** * Global pool of kva used for mapping remote domain ring * and I/O transaction data. */ vm_offset_t kva; /** Pseudo-physical address corresponding to kva. */ uint64_t gnt_base_addr; /** Various configuration and state bit flags. */ xnb_flag_t flags; /** Mutex protecting per-instance data in the receive path. */ struct mtx rx_lock; /** Mutex protecting per-instance data in the softc structure. */ struct mtx sc_lock; /** Mutex protecting per-instance data in the transmit path. */ struct mtx tx_lock; /** The size of the global kva pool.
*/ int kva_size; /** Name of the interface */ char if_name[IFNAMSIZ]; }; /*---------------------------- Debugging functions ---------------------------*/ #ifdef XNB_DEBUG static void __unused xnb_dump_gnttab_copy(const struct gnttab_copy *entry) { if (entry == NULL) { printf("NULL grant table pointer\n"); return; } if (entry->flags & GNTCOPY_dest_gref) printf("gnttab dest ref=\t%u\n", entry->dest.u.ref); else printf("gnttab dest gmfn=\t%"PRI_xen_pfn"\n", entry->dest.u.gmfn); printf("gnttab dest offset=\t%hu\n", entry->dest.offset); printf("gnttab dest domid=\t%hu\n", entry->dest.domid); if (entry->flags & GNTCOPY_source_gref) printf("gnttab source ref=\t%u\n", entry->source.u.ref); else printf("gnttab source gmfn=\t%"PRI_xen_pfn"\n", entry->source.u.gmfn); printf("gnttab source offset=\t%hu\n", entry->source.offset); printf("gnttab source domid=\t%hu\n", entry->source.domid); printf("gnttab len=\t%hu\n", entry->len); printf("gnttab flags=\t%hu\n", entry->flags); printf("gnttab status=\t%hd\n", entry->status); } static int xnb_dump_rings(SYSCTL_HANDLER_ARGS) { static char results[720]; struct xnb_softc const* xnb = (struct xnb_softc*)arg1; netif_rx_back_ring_t const* rxb = &xnb->ring_configs[XNB_RING_TYPE_RX].back_ring.rx_ring; netif_tx_back_ring_t const* txb = &xnb->ring_configs[XNB_RING_TYPE_TX].back_ring.tx_ring; /* empty the result strings */ results[0] = 0; if ( !txb || !txb->sring || !rxb || !rxb->sring ) return (SYSCTL_OUT(req, results, strnlen(results, 720))); snprintf(results, 720, "\n\t%35s %18s\n" /* TX, RX */ "\t%16s %18d %18d\n" /* req_cons */ "\t%16s %18d %18d\n" /* nr_ents */ "\t%16s %18d %18d\n" /* rsp_prod_pvt */ "\t%16s %18p %18p\n" /* sring */ "\t%16s %18d %18d\n" /* req_prod */ "\t%16s %18d %18d\n" /* req_event */ "\t%16s %18d %18d\n" /* rsp_prod */ "\t%16s %18d %18d\n", /* rsp_event */ "TX", "RX", "req_cons", txb->req_cons, rxb->req_cons, "nr_ents", txb->nr_ents, rxb->nr_ents, "rsp_prod_pvt", txb->rsp_prod_pvt, rxb->rsp_prod_pvt, "sring", txb->sring, rxb->sring, "sring->req_prod", txb->sring->req_prod, rxb->sring->req_prod, "sring->req_event", txb->sring->req_event, rxb->sring->req_event, "sring->rsp_prod", txb->sring->rsp_prod, rxb->sring->rsp_prod, "sring->rsp_event", txb->sring->rsp_event, rxb->sring->rsp_event); return (SYSCTL_OUT(req, results, strnlen(results, 720))); } static void __unused xnb_dump_mbuf(const struct mbuf *m) { int len; uint8_t *d; if (m == NULL) return; printf("xnb_dump_mbuf:\n"); if (m->m_flags & M_PKTHDR) { printf(" flowid=%10d, csum_flags=%#8x, csum_data=%#8x, " "tso_segsz=%5hd\n", m->m_pkthdr.flowid, (int)m->m_pkthdr.csum_flags, m->m_pkthdr.csum_data, m->m_pkthdr.tso_segsz); printf(" rcvif=%16p, len=%19d\n", m->m_pkthdr.rcvif, m->m_pkthdr.len); } printf(" m_next=%16p, m_nextpk=%16p, m_data=%16p\n", m->m_next, m->m_nextpkt, m->m_data); printf(" m_len=%17d, m_flags=%#15x, m_type=%18u\n", m->m_len, m->m_flags, m->m_type); len = m->m_len; d = mtod(m, uint8_t*); while (len > 0) { int i; printf(" "); for (i = 0; (i < 16) && (len > 0); i++, len--) { printf("%02hhx ", *(d++)); } printf("\n"); } } #endif /* XNB_DEBUG */ /*------------------------ Inter-Domain Communication ------------------------*/ /** * Free dynamically allocated KVA or pseudo-physical address allocations. * * \param xnb Per-instance xnb configuration structure. 
*/ static void xnb_free_communication_mem(struct xnb_softc *xnb) { if (xnb->kva != 0) { if (xnb->pseudo_phys_res != NULL) { xenmem_free(xnb->dev, xnb->pseudo_phys_res_id, xnb->pseudo_phys_res); xnb->pseudo_phys_res = NULL; } } xnb->kva = 0; xnb->gnt_base_addr = 0; } /** * Cleanup all inter-domain communication mechanisms. * * \param xnb Per-instance xnb configuration structure. */ static int xnb_disconnect(struct xnb_softc *xnb) { struct gnttab_unmap_grant_ref gnts[XNB_NUM_RING_TYPES]; int error; int i; if (xnb->xen_intr_handle != NULL) xen_intr_unbind(&xnb->xen_intr_handle); /* * We may still have another thread currently processing requests. We * must acquire the rx and tx locks to make sure those threads are done, * but we can release those locks as soon as we acquire them, because no * more interrupts will be arriving. */ mtx_lock(&xnb->tx_lock); mtx_unlock(&xnb->tx_lock); mtx_lock(&xnb->rx_lock); mtx_unlock(&xnb->rx_lock); mtx_lock(&xnb->sc_lock); /* Free malloc'd softc member variables */ if (xnb->bridge != NULL) { free(xnb->bridge, M_XENSTORE); xnb->bridge = NULL; } /* All request processing has stopped, so unmap the rings */ for (i=0; i < XNB_NUM_RING_TYPES; i++) { gnts[i].host_addr = xnb->ring_configs[i].gnt_addr; gnts[i].dev_bus_addr = xnb->ring_configs[i].bus_addr; gnts[i].handle = xnb->ring_configs[i].handle; } error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, gnts, XNB_NUM_RING_TYPES); KASSERT(error == 0, ("Grant table unmap op failed (%d)", error)); xnb_free_communication_mem(xnb); /* * Zero the ring config structs because the pointers, handles, and * grant refs contained therein are no longer valid. */ bzero(&xnb->ring_configs[XNB_RING_TYPE_TX], sizeof(struct xnb_ring_config)); bzero(&xnb->ring_configs[XNB_RING_TYPE_RX], sizeof(struct xnb_ring_config)); xnb->flags &= ~XNBF_RING_CONNECTED; mtx_unlock(&xnb->sc_lock); return (0); } /** * Map a single shared memory ring into domain local address space and * initialize its control structure * * \param xnb Per-instance xnb configuration structure * \param ring_type Array index of this ring in the xnb's array of rings * \return An errno */ static int xnb_connect_ring(struct xnb_softc *xnb, xnb_ring_type_t ring_type) { struct gnttab_map_grant_ref gnt; struct xnb_ring_config *ring = &xnb->ring_configs[ring_type]; int error; /* TX ring type = 0, RX =1 */ ring->va = xnb->kva + ring_type * PAGE_SIZE; ring->gnt_addr = xnb->gnt_base_addr + ring_type * PAGE_SIZE; gnt.host_addr = ring->gnt_addr; gnt.flags = GNTMAP_host_map; gnt.ref = ring->ring_ref; gnt.dom = xnb->otherend_id; error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &gnt, 1); if (error != 0) panic("netback: Ring page grant table op failed (%d)", error); if (gnt.status != 0) { ring->va = 0; error = EACCES; xenbus_dev_fatal(xnb->dev, error, "Ring shared page mapping failed. " "Status %d.", gnt.status); } else { ring->handle = gnt.handle; ring->bus_addr = gnt.dev_bus_addr; if (ring_type == XNB_RING_TYPE_TX) { BACK_RING_INIT(&ring->back_ring.tx_ring, (netif_tx_sring_t*)ring->va, ring->ring_pages * PAGE_SIZE); } else if (ring_type == XNB_RING_TYPE_RX) { BACK_RING_INIT(&ring->back_ring.rx_ring, (netif_rx_sring_t*)ring->va, ring->ring_pages * PAGE_SIZE); } else { xenbus_dev_fatal(xnb->dev, error, "Unknown ring type %d", ring_type); } } return error; } /** * Setup the shared memory rings and bind an interrupt to the event channel * used to notify us of ring changes. * * \param xnb Per-instance xnb configuration structure. 
*/ static int xnb_connect_comms(struct xnb_softc *xnb) { int error; xnb_ring_type_t i; if ((xnb->flags & XNBF_RING_CONNECTED) != 0) return (0); /* * Kva for our rings are at the tail of the region of kva allocated * by xnb_alloc_communication_mem(). */ for (i=0; i < XNB_NUM_RING_TYPES; i++) { error = xnb_connect_ring(xnb, i); if (error != 0) return error; } xnb->flags |= XNBF_RING_CONNECTED; error = xen_intr_bind_remote_port(xnb->dev, xnb->otherend_id, xnb->evtchn, /*filter*/NULL, xnb_intr, /*arg*/xnb, INTR_TYPE_NET | INTR_MPSAFE, &xnb->xen_intr_handle); if (error != 0) { (void)xnb_disconnect(xnb); xenbus_dev_fatal(xnb->dev, error, "binding event channel"); return (error); } DPRINTF("rings connected!\n"); return (0); } /** * Size KVA and pseudo-physical address allocations based on negotiated * values for the size and number of I/O requests, and the size of our * communication ring. * * \param xnb Per-instance xnb configuration structure. * * These address spaces are used to dynamically map pages in the * front-end's domain into our own. */ static int xnb_alloc_communication_mem(struct xnb_softc *xnb) { xnb_ring_type_t i; xnb->kva_size = 0; for (i=0; i < XNB_NUM_RING_TYPES; i++) { xnb->kva_size += xnb->ring_configs[i].ring_pages * PAGE_SIZE; } /* * Reserve a range of pseudo physical memory that we can map * into kva. These pages will only be backed by machine * pages ("real memory") during the lifetime of front-end requests * via grant table operations. We will map the netif tx and rx rings * into this space. */ xnb->pseudo_phys_res_id = 0; xnb->pseudo_phys_res = xenmem_alloc(xnb->dev, &xnb->pseudo_phys_res_id, xnb->kva_size); if (xnb->pseudo_phys_res == NULL) { xnb->kva = 0; return (ENOMEM); } xnb->kva = (vm_offset_t)rman_get_virtual(xnb->pseudo_phys_res); xnb->gnt_base_addr = rman_get_start(xnb->pseudo_phys_res); return (0); } /** * Collect information from the XenStore related to our device and its frontend * * \param xnb Per-instance xnb configuration structure. */ static int xnb_collect_xenstore_info(struct xnb_softc *xnb) { /** * \todo Linux collects the following info. We should collect most * of this, too: * "feature-rx-notify" */ const char *otherend_path; const char *our_path; int err; unsigned int rx_copy, bridge_len; uint8_t no_csum_offload; otherend_path = xenbus_get_otherend_path(xnb->dev); our_path = xenbus_get_node(xnb->dev); /* Collect the critical communication parameters */ err = xs_gather(XST_NIL, otherend_path, "tx-ring-ref", "%l" PRIu32, &xnb->ring_configs[XNB_RING_TYPE_TX].ring_ref, "rx-ring-ref", "%l" PRIu32, &xnb->ring_configs[XNB_RING_TYPE_RX].ring_ref, "event-channel", "%" PRIu32, &xnb->evtchn, NULL); if (err != 0) { xenbus_dev_fatal(xnb->dev, err, "Unable to retrieve ring information from " "frontend %s. Unable to connect.", otherend_path); return (err); } /* Collect the handle from xenstore */ err = xs_scanf(XST_NIL, our_path, "handle", NULL, "%li", &xnb->handle); if (err != 0) { xenbus_dev_fatal(xnb->dev, err, "Error reading handle from frontend %s. " "Unable to connect.", otherend_path); } /* * Collect the bridgename, if any. We do not need bridge_len; we just * throw it away */ err = xs_read(XST_NIL, our_path, "bridge", &bridge_len, (void**)&xnb->bridge); if (err != 0) xnb->bridge = NULL; /* * Does the frontend request that we use rx copy? If not, return an * error because this driver only supports rx copy. 
*/ err = xs_scanf(XST_NIL, otherend_path, "request-rx-copy", NULL, "%" PRIu32, &rx_copy); if (err == ENOENT) { err = 0; rx_copy = 0; } if (err < 0) { xenbus_dev_fatal(xnb->dev, err, "reading %s/request-rx-copy", otherend_path); return err; } /** * \todo: figure out the exact meaning of this feature, and when * the frontend will set it to true. It should be set to true * at some point */ /* if (!rx_copy)*/ /* return EOPNOTSUPP;*/ /** \todo Collect the rx notify feature */ /* Collect the feature-sg. */ if (xs_scanf(XST_NIL, otherend_path, "feature-sg", NULL, "%hhu", &xnb->can_sg) < 0) xnb->can_sg = 0; /* Collect remaining frontend features */ if (xs_scanf(XST_NIL, otherend_path, "feature-gso-tcpv4", NULL, "%hhu", &xnb->gso) < 0) xnb->gso = 0; if (xs_scanf(XST_NIL, otherend_path, "feature-gso-tcpv4-prefix", NULL, "%hhu", &xnb->gso_prefix) < 0) xnb->gso_prefix = 0; if (xs_scanf(XST_NIL, otherend_path, "feature-no-csum-offload", NULL, "%hhu", &no_csum_offload) < 0) no_csum_offload = 0; xnb->ip_csum = (no_csum_offload == 0); return (0); } /** * Supply information about the physical device to the frontend * via XenBus. * * \param xnb Per-instance xnb configuration structure. */ static int xnb_publish_backend_info(struct xnb_softc *xnb) { struct xs_transaction xst; const char *our_path; int error; our_path = xenbus_get_node(xnb->dev); do { error = xs_transaction_start(&xst); if (error != 0) { xenbus_dev_fatal(xnb->dev, error, "Error publishing backend info " "(start transaction)"); break; } error = xs_printf(xst, our_path, "feature-sg", "%d", XNB_SG); if (error != 0) break; error = xs_printf(xst, our_path, "feature-gso-tcpv4", "%d", XNB_GSO_TCPV4); if (error != 0) break; error = xs_printf(xst, our_path, "feature-rx-copy", "%d", XNB_RX_COPY); if (error != 0) break; error = xs_printf(xst, our_path, "feature-rx-flip", "%d", XNB_RX_FLIP); if (error != 0) break; error = xs_transaction_end(xst, 0); if (error != 0 && error != EAGAIN) { xenbus_dev_fatal(xnb->dev, error, "ending transaction"); break; } } while (error == EAGAIN); return (error); } /** * Connect to our netfront peer now that it has completed publishing * its configuration into the XenStore. * * \param xnb Per-instance xnb configuration structure. */ static void xnb_connect(struct xnb_softc *xnb) { int error; if (xenbus_get_state(xnb->dev) == XenbusStateConnected) return; if (xnb_collect_xenstore_info(xnb) != 0) return; xnb->flags &= ~XNBF_SHUTDOWN; /* Read front end configuration. */ /* Allocate resources whose size depends on front-end configuration. */ error = xnb_alloc_communication_mem(xnb); if (error != 0) { xenbus_dev_fatal(xnb->dev, error, "Unable to allocate communication memory"); return; } /* * Connect communication channel. */ error = xnb_connect_comms(xnb); if (error != 0) { /* Specific errors are reported by xnb_connect_comms(). */ return; } xnb->carrier = 1; /* Ready for I/O. */ xenbus_set_state(xnb->dev, XenbusStateConnected); } /*-------------------------- Device Teardown Support -------------------------*/ /** * Perform device shutdown functions. * * \param xnb Per-instance xnb configuration structure. * * Mark this instance as shutting down, wait for any active requests * to drain, disconnect from the front-end, and notify any waiters (e.g. * a thread invoking our detach method) that detach can now proceed. */ static int xnb_shutdown(struct xnb_softc *xnb) { /* * Due to the need to drop our mutex during some * xenbus operations, it is possible for two threads * to attempt to close out shutdown processing at * the same time. 
Tell the caller that hits this * race to try back later. */ if ((xnb->flags & XNBF_IN_SHUTDOWN) != 0) return (EAGAIN); xnb->flags |= XNBF_SHUTDOWN; xnb->flags |= XNBF_IN_SHUTDOWN; mtx_unlock(&xnb->sc_lock); /* Free the network interface */ xnb->carrier = 0; if (xnb->xnb_ifp != NULL) { ether_ifdetach(xnb->xnb_ifp); if_free(xnb->xnb_ifp); xnb->xnb_ifp = NULL; } xnb_disconnect(xnb); if (xenbus_get_state(xnb->dev) < XenbusStateClosing) xenbus_set_state(xnb->dev, XenbusStateClosing); mtx_lock(&xnb->sc_lock); xnb->flags &= ~XNBF_IN_SHUTDOWN; /* Indicate to xnb_detach() that it is safe to proceed. */ wakeup(xnb); return (0); } /** * Report an attach time error to the console and Xen, and cleanup * this instance by forcing immediate detach processing. * * \param xnb Per-instance xnb configuration structure. * \param err Errno describing the error. * \param fmt Printf style format and arguments */ static void xnb_attach_failed(struct xnb_softc *xnb, int err, const char *fmt, ...) { va_list ap; va_list ap_hotplug; va_start(ap, fmt); va_copy(ap_hotplug, ap); xs_vprintf(XST_NIL, xenbus_get_node(xnb->dev), "hotplug-error", fmt, ap_hotplug); va_end(ap_hotplug); (void)xs_printf(XST_NIL, xenbus_get_node(xnb->dev), "hotplug-status", "error"); xenbus_dev_vfatal(xnb->dev, err, fmt, ap); va_end(ap); (void)xs_printf(XST_NIL, xenbus_get_node(xnb->dev), "online", "0"); xnb_detach(xnb->dev); } /*---------------------------- NewBus Entrypoints ----------------------------*/ /** * Inspect a XenBus device and claim it if it is of the appropriate type. * * \param dev NewBus device object representing a candidate XenBus device. * * \return 0 for success, errno codes for failure. */ static int xnb_probe(device_t dev) { if (!strcmp(xenbus_get_type(dev), "vif")) { DPRINTF("Claiming device %d, %s\n", device_get_unit(dev), devclass_get_name(device_get_devclass(dev))); device_set_desc(dev, "Backend Virtual Network Device"); device_quiet(dev); return (0); } return (ENXIO); } /** * Setup sysctl variables to control various Network Back parameters. * * \param xnb Xen Net Back softc. * */ static void xnb_setup_sysctl(struct xnb_softc *xnb) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; sysctl_ctx = device_get_sysctl_ctx(xnb->dev); if (sysctl_ctx == NULL) return; sysctl_tree = device_get_sysctl_tree(xnb->dev); if (sysctl_tree == NULL) return; #ifdef XNB_DEBUG SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "unit_test_results", - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, xnb, 0, xnb_unit_test_main, "A", "Results of builtin unit tests"); SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "dump_rings", - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, xnb, 0, xnb_dump_rings, "A", "Xennet Back Rings"); #endif /* XNB_DEBUG */ } /** * Create a network device.
* @param handle device handle */ int create_netdev(device_t dev) { struct ifnet *ifp; struct xnb_softc *xnb; int err = 0; uint32_t handle; xnb = device_get_softc(dev); mtx_init(&xnb->sc_lock, "xnb_softc", "xen netback softc lock", MTX_DEF); mtx_init(&xnb->tx_lock, "xnb_tx", "xen netback tx lock", MTX_DEF); mtx_init(&xnb->rx_lock, "xnb_rx", "xen netback rx lock", MTX_DEF); xnb->dev = dev; ifmedia_init(&xnb->sc_media, 0, xnb_ifmedia_upd, xnb_ifmedia_sts); ifmedia_add(&xnb->sc_media, IFM_ETHER|IFM_MANUAL, 0, NULL); ifmedia_set(&xnb->sc_media, IFM_ETHER|IFM_MANUAL); /* * Set the MAC address to a dummy value (00:00:00:00:00), * if the MAC address of the host-facing interface is set * to the same as the guest-facing one (the value found in * xenstore), the bridge would stop delivering packets to * us because it would see that the destination address of * the packet is the same as the interface, and so the bridge * would expect the packet has already been delivered locally * (and just drop it). */ bzero(&xnb->mac[0], sizeof(xnb->mac)); /* The interface will be named using the following nomenclature: * * xnb. * * Where handle is the oder of the interface referred to the guest. */ err = xs_scanf(XST_NIL, xenbus_get_node(xnb->dev), "handle", NULL, "%" PRIu32, &handle); if (err != 0) return (err); snprintf(xnb->if_name, IFNAMSIZ, "xnb%" PRIu16 ".%" PRIu32, xenbus_get_otherend_id(dev), handle); if (err == 0) { /* Set up ifnet structure */ ifp = xnb->xnb_ifp = if_alloc(IFT_ETHER); ifp->if_softc = xnb; if_initname(ifp, xnb->if_name, IF_DUNIT_NONE); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = xnb_ioctl; ifp->if_start = xnb_start; ifp->if_init = xnb_ifinit; ifp->if_mtu = ETHERMTU; ifp->if_snd.ifq_maxlen = NET_RX_RING_SIZE - 1; ifp->if_hwassist = XNB_CSUM_FEATURES; ifp->if_capabilities = IFCAP_HWCSUM; ifp->if_capenable = IFCAP_HWCSUM; ether_ifattach(ifp, xnb->mac); xnb->carrier = 0; } return err; } /** * Attach to a XenBus device that has been claimed by our probe routine. * * \param dev NewBus device object representing this Xen Net Back instance. * * \return 0 for success, errno codes for failure. */ static int xnb_attach(device_t dev) { struct xnb_softc *xnb; int error; xnb_ring_type_t i; error = create_netdev(dev); if (error != 0) { xenbus_dev_fatal(dev, error, "creating netdev"); return (error); } DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); /* * Basic initialization. * After this block it is safe to call xnb_detach() * to clean up any allocated data for this instance. */ xnb = device_get_softc(dev); xnb->otherend_id = xenbus_get_otherend_id(dev); for (i=0; i < XNB_NUM_RING_TYPES; i++) { xnb->ring_configs[i].ring_pages = 1; } /* * Setup sysctl variables. */ xnb_setup_sysctl(xnb); /* Update hot-plug status to satisfy xend. */ error = xs_printf(XST_NIL, xenbus_get_node(xnb->dev), "hotplug-status", "connected"); if (error != 0) { xnb_attach_failed(xnb, error, "writing %s/hotplug-status", xenbus_get_node(xnb->dev)); return (error); } if ((error = xnb_publish_backend_info(xnb)) != 0) { /* * If we can't publish our data, we cannot participate * in this connection, and waiting for a front-end state * change will not help the situation. */ xnb_attach_failed(xnb, error, "Publishing backend status for %s", xenbus_get_node(xnb->dev)); return error; } /* Tell the front end that we are ready to connect. */ xenbus_set_state(dev, XenbusStateInitWait); return (0); } /** * Detach from a net back device instance. 
* * \param dev NewBus device object representing this Xen Net Back instance. * * \return 0 for success, errno codes for failure. * * \note A net back device may be detached at any time in its life-cycle, * including part way through the attach process. For this reason, * initialization order and the initialization state checks in this * routine must be carefully coupled so that attach time failures * are gracefully handled. */ static int xnb_detach(device_t dev) { struct xnb_softc *xnb; DPRINTF("\n"); xnb = device_get_softc(dev); mtx_lock(&xnb->sc_lock); while (xnb_shutdown(xnb) == EAGAIN) { msleep(xnb, &xnb->sc_lock, /*wakeup prio unchanged*/0, "xnb_shutdown", 0); } mtx_unlock(&xnb->sc_lock); DPRINTF("\n"); mtx_destroy(&xnb->tx_lock); mtx_destroy(&xnb->rx_lock); mtx_destroy(&xnb->sc_lock); return (0); } /** * Prepare this net back device for suspension of this VM. * * \param dev NewBus device object representing this Xen net Back instance. * * \return 0 for success, errno codes for failure. */ static int xnb_suspend(device_t dev) { return (0); } /** * Perform any processing required to recover from a suspended state. * * \param dev NewBus device object representing this Xen Net Back instance. * * \return 0 for success, errno codes for failure. */ static int xnb_resume(device_t dev) { return (0); } /** * Handle state changes expressed via the XenStore by our front-end peer. * * \param dev NewBus device object representing this Xen * Net Back instance. * \param frontend_state The new state of the front-end. * * \return 0 for success, errno codes for failure. */ static void xnb_frontend_changed(device_t dev, XenbusState frontend_state) { struct xnb_softc *xnb; xnb = device_get_softc(dev); DPRINTF("frontend_state=%s, xnb_state=%s\n", xenbus_strstate(frontend_state), xenbus_strstate(xenbus_get_state(xnb->dev))); switch (frontend_state) { case XenbusStateInitialising: break; case XenbusStateInitialised: case XenbusStateConnected: xnb_connect(xnb); break; case XenbusStateClosing: case XenbusStateClosed: mtx_lock(&xnb->sc_lock); xnb_shutdown(xnb); mtx_unlock(&xnb->sc_lock); if (frontend_state == XenbusStateClosed) xenbus_set_state(xnb->dev, XenbusStateClosed); break; default: xenbus_dev_fatal(xnb->dev, EINVAL, "saw state %d at frontend", frontend_state); break; } } /*---------------------------- Request Processing ----------------------------*/ /** * Interrupt handler bound to the shared ring's event channel. * Entry point for the xennet transmit path in netback * Transfers packets from the Xen ring to the host's generic networking stack * * \param arg Callback argument registerd during event channel * binding - the xnb_softc for this instance. 
*/ static void xnb_intr(void *arg) { struct xnb_softc *xnb; struct ifnet *ifp; netif_tx_back_ring_t *txb; RING_IDX req_prod_local; xnb = (struct xnb_softc *)arg; ifp = xnb->xnb_ifp; txb = &xnb->ring_configs[XNB_RING_TYPE_TX].back_ring.tx_ring; mtx_lock(&xnb->tx_lock); do { int notify; req_prod_local = txb->sring->req_prod; xen_rmb(); for (;;) { struct mbuf *mbufc; int err; err = xnb_recv(txb, xnb->otherend_id, &mbufc, ifp, xnb->tx_gnttab); if (err || (mbufc == NULL)) break; /* Send the packet to the generic network stack */ (*xnb->xnb_ifp->if_input)(xnb->xnb_ifp, mbufc); } RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(txb, notify); if (notify != 0) xen_intr_signal(xnb->xen_intr_handle); txb->sring->req_event = txb->req_cons + 1; xen_mb(); } while (txb->sring->req_prod != req_prod_local) ; mtx_unlock(&xnb->tx_lock); xnb_start(ifp); } /** * Build a struct xnb_pkt based on netif_tx_request's from a netif tx ring. * Will read exactly 0 or 1 packets from the ring; never a partial packet. * \param[out] pkt The returned packet. If there is an error building * the packet, pkt.list_len will be set to 0. * \param[in] tx_ring Pointer to the Ring that is the input to this function * \param[in] start The ring index of the first potential request * \return The number of requests consumed to build this packet */ static int xnb_ring2pkt(struct xnb_pkt *pkt, const netif_tx_back_ring_t *tx_ring, RING_IDX start) { /* * Outline: * 1) Initialize pkt * 2) Read the first request of the packet * 3) Read the extras * 4) Set cdr * 5) Loop on the remainder of the packet * 6) Finalize pkt (stuff like car_size and list_len) */ int idx = start; int discard = 0; /* whether to discard the packet */ int more_data = 0; /* there are more request past the last one */ uint16_t cdr_size = 0; /* accumulated size of requests 2 through n */ xnb_pkt_initialize(pkt); /* Read the first request */ if (RING_HAS_UNCONSUMED_REQUESTS_2(tx_ring, idx)) { netif_tx_request_t *tx = RING_GET_REQUEST(tx_ring, idx); pkt->size = tx->size; pkt->flags = tx->flags & ~NETTXF_more_data; more_data = tx->flags & NETTXF_more_data; pkt->list_len++; pkt->car = idx; idx++; } /* Read the extra info */ if ((pkt->flags & NETTXF_extra_info) && RING_HAS_UNCONSUMED_REQUESTS_2(tx_ring, idx)) { netif_extra_info_t *ext = (netif_extra_info_t*) RING_GET_REQUEST(tx_ring, idx); pkt->extra.type = ext->type; switch (pkt->extra.type) { case XEN_NETIF_EXTRA_TYPE_GSO: pkt->extra.u.gso = ext->u.gso; break; default: /* * The reference Linux netfront driver will * never set any other extra.type. So we don't * know what to do with it. Let's print an * error, then consume and discard the packet */ printf("xnb(%s:%d): Unknown extra info type %d." " Discarding packet\n", __func__, __LINE__, pkt->extra.type); xnb_dump_txreq(start, RING_GET_REQUEST(tx_ring, start)); xnb_dump_txreq(idx, RING_GET_REQUEST(tx_ring, idx)); discard = 1; break; } pkt->extra.flags = ext->flags; if (ext->flags & XEN_NETIF_EXTRA_FLAG_MORE) { /* * The reference linux netfront driver never sets this * flag (nor does any other known netfront). So we * will discard the packet. */ printf("xnb(%s:%d): Request sets " "XEN_NETIF_EXTRA_FLAG_MORE, but we can't handle " "that\n", __func__, __LINE__); xnb_dump_txreq(start, RING_GET_REQUEST(tx_ring, start)); xnb_dump_txreq(idx, RING_GET_REQUEST(tx_ring, idx)); discard = 1; } idx++; } /* Set cdr. 
If there is not more data, cdr is invalid */ pkt->cdr = idx; /* Loop on remainder of packet */ while (more_data && RING_HAS_UNCONSUMED_REQUESTS_2(tx_ring, idx)) { netif_tx_request_t *tx = RING_GET_REQUEST(tx_ring, idx); pkt->list_len++; cdr_size += tx->size; if (tx->flags & ~NETTXF_more_data) { /* There should be no other flags set at this point */ printf("xnb(%s:%d): Request sets unknown flags %d " "after the 1st request in the packet.\n", __func__, __LINE__, tx->flags); xnb_dump_txreq(start, RING_GET_REQUEST(tx_ring, start)); xnb_dump_txreq(idx, RING_GET_REQUEST(tx_ring, idx)); } more_data = tx->flags & NETTXF_more_data; idx++; } /* Finalize packet */ if (more_data != 0) { /* The ring ran out of requests before finishing the packet */ xnb_pkt_invalidate(pkt); idx = start; /* tell caller that we consumed no requests */ } else { /* Calculate car_size */ pkt->car_size = pkt->size - cdr_size; } if (discard != 0) { xnb_pkt_invalidate(pkt); } return idx - start; } /** * Respond to all the requests that constituted pkt. Builds the responses and * writes them to the ring, but doesn't push them to the shared ring. * \param[in] pkt the packet that needs a response * \param[in] error true if there was an error handling the packet, such * as in the hypervisor copy op or mbuf allocation * \param[out] ring Responses go here */ static void xnb_txpkt2rsp(const struct xnb_pkt *pkt, netif_tx_back_ring_t *ring, int error) { /* * Outline: * 1) Respond to the first request * 2) Respond to the extra info reques * Loop through every remaining request in the packet, generating * responses that copy those requests' ids and sets the status * appropriately. */ netif_tx_request_t *tx; netif_tx_response_t *rsp; int i; uint16_t status; status = (xnb_pkt_is_valid(pkt) == 0) || error ? NETIF_RSP_ERROR : NETIF_RSP_OKAY; KASSERT((pkt->list_len == 0) || (ring->rsp_prod_pvt == pkt->car), ("Cannot respond to ring requests out of order")); if (pkt->list_len >= 1) { uint16_t id; tx = RING_GET_REQUEST(ring, ring->rsp_prod_pvt); id = tx->id; rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt); rsp->id = id; rsp->status = status; ring->rsp_prod_pvt++; if (pkt->flags & NETRXF_extra_info) { rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt); rsp->status = NETIF_RSP_NULL; ring->rsp_prod_pvt++; } } for (i=0; i < pkt->list_len - 1; i++) { uint16_t id; tx = RING_GET_REQUEST(ring, ring->rsp_prod_pvt); id = tx->id; rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt); rsp->id = id; rsp->status = status; ring->rsp_prod_pvt++; } } /** * Create an mbuf chain to represent a packet. Initializes all of the headers * in the mbuf chain, but does not copy the data. The returned chain must be * free()'d when no longer needed * \param[in] pkt A packet to model the mbuf chain after * \return A newly allocated mbuf chain, possibly with clusters attached. * NULL on failure */ static struct mbuf* xnb_pkt2mbufc(const struct xnb_pkt *pkt, struct ifnet *ifp) { /** * \todo consider using a memory pool for mbufs instead of * reallocating them for every packet */ /** \todo handle extra data */ struct mbuf *m; m = m_getm(NULL, pkt->size, M_NOWAIT, MT_DATA); if (m != NULL) { m->m_pkthdr.rcvif = ifp; if (pkt->flags & NETTXF_data_validated) { /* * We lie to the host OS and always tell it that the * checksums are ok, because the packet is unlikely to * get corrupted going across domains. 
*/ m->m_pkthdr.csum_flags = ( CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR ); m->m_pkthdr.csum_data = 0xffff; } } return m; } /** * Build a gnttab_copy table that can be used to copy data from a pkt * to an mbufc. Does not actually perform the copy. Always uses gref's on * the packet side. * \param[in] pkt pkt's associated requests form the src for * the copy operation * \param[in] mbufc mbufc's storage forms the dest for the copy operation * \param[out] gnttab Storage for the returned grant table * \param[in] txb Pointer to the backend ring structure * \param[in] otherend_id The domain ID of the other end of the copy * \return The number of gnttab entries filled */ static int xnb_txpkt2gnttab(const struct xnb_pkt *pkt, struct mbuf *mbufc, gnttab_copy_table gnttab, const netif_tx_back_ring_t *txb, domid_t otherend_id) { struct mbuf *mbuf = mbufc;/* current mbuf within the chain */ int gnt_idx = 0; /* index into grant table */ RING_IDX r_idx = pkt->car; /* index into tx ring buffer */ int r_ofs = 0; /* offset of next data within tx request's data area */ int m_ofs = 0; /* offset of next data within mbuf's data area */ /* size in bytes that still needs to be represented in the table */ uint16_t size_remaining = pkt->size; while (size_remaining > 0) { const netif_tx_request_t *txq = RING_GET_REQUEST(txb, r_idx); const size_t mbuf_space = M_TRAILINGSPACE(mbuf) - m_ofs; const size_t req_size = r_idx == pkt->car ? pkt->car_size : txq->size; const size_t pkt_space = req_size - r_ofs; /* * space is the largest amount of data that can be copied in the * grant table's next entry */ const size_t space = MIN(pkt_space, mbuf_space); /* TODO: handle this error condition without panicking */ KASSERT(gnt_idx < GNTTAB_LEN, ("Grant table is too short")); gnttab[gnt_idx].source.u.ref = txq->gref; gnttab[gnt_idx].source.domid = otherend_id; gnttab[gnt_idx].source.offset = txq->offset + r_ofs; gnttab[gnt_idx].dest.u.gmfn = virt_to_mfn( mtod(mbuf, vm_offset_t) + m_ofs); gnttab[gnt_idx].dest.offset = virt_to_offset( mtod(mbuf, vm_offset_t) + m_ofs); gnttab[gnt_idx].dest.domid = DOMID_SELF; gnttab[gnt_idx].len = space; gnttab[gnt_idx].flags = GNTCOPY_source_gref; gnt_idx++; r_ofs += space; m_ofs += space; size_remaining -= space; if (req_size - r_ofs <= 0) { /* Must move to the next tx request */ r_ofs = 0; r_idx = (r_idx == pkt->car) ? pkt->cdr : r_idx + 1; } if (M_TRAILINGSPACE(mbuf) - m_ofs <= 0) { /* Must move to the next mbuf */ m_ofs = 0; mbuf = mbuf->m_next; } } return gnt_idx; } /** * Check the status of the grant copy operations, and update mbufs various * non-data fields to reflect the data present. * \param[in,out] mbufc mbuf chain to update. The chain must be valid and of * the correct length, and data should already be present * \param[in] gnttab A grant table for a just completed copy op * \param[in] n_entries The number of valid entries in the grant table */ static void xnb_update_mbufc(struct mbuf *mbufc, const gnttab_copy_table gnttab, int n_entries) { struct mbuf *mbuf = mbufc; int i; size_t total_size = 0; for (i = 0; i < n_entries; i++) { KASSERT(gnttab[i].status == GNTST_okay, ("Some gnttab_copy entry had error status %hd\n", gnttab[i].status)); mbuf->m_len += gnttab[i].len; total_size += gnttab[i].len; if (M_TRAILINGSPACE(mbuf) <= 0) { mbuf = mbuf->m_next; } } mbufc->m_pkthdr.len = total_size; #if defined(INET) || defined(INET6) xnb_add_mbuf_cksum(mbufc); #endif } /** * Dequeue at most one packet from the shared ring * \param[in,out] txb Netif tx ring. 
A packet will be removed from it, and * its private indices will be updated. But the indices * will not be pushed to the shared ring. * \param[in] ifnet Interface to which the packet will be sent * \param[in] otherend Domain ID of the other end of the ring * \param[out] mbufc The assembled mbuf chain, ready to send to the generic * networking stack * \param[in,out] gnttab Pointer to enough memory for a grant table. We make * this a function parameter so that we will take less * stack space. * \return An error code */ static int xnb_recv(netif_tx_back_ring_t *txb, domid_t otherend, struct mbuf **mbufc, struct ifnet *ifnet, gnttab_copy_table gnttab) { struct xnb_pkt pkt; /* number of tx requests consumed to build the last packet */ int num_consumed; int nr_ents; *mbufc = NULL; num_consumed = xnb_ring2pkt(&pkt, txb, txb->req_cons); if (num_consumed == 0) return 0; /* Nothing to receive */ /* update statistics independent of errors */ if_inc_counter(ifnet, IFCOUNTER_IPACKETS, 1); /* * if we got here, then 1 or more requests was consumed, but the packet * is not necessarily valid. */ if (xnb_pkt_is_valid(&pkt) == 0) { /* got a garbage packet, respond and drop it */ xnb_txpkt2rsp(&pkt, txb, 1); txb->req_cons += num_consumed; DPRINTF("xnb_intr: garbage packet, num_consumed=%d\n", num_consumed); if_inc_counter(ifnet, IFCOUNTER_IERRORS, 1); return EINVAL; } *mbufc = xnb_pkt2mbufc(&pkt, ifnet); if (*mbufc == NULL) { /* * Couldn't allocate mbufs. Respond and drop the packet. Do * not consume the requests */ xnb_txpkt2rsp(&pkt, txb, 1); DPRINTF("xnb_intr: Couldn't allocate mbufs, num_consumed=%d\n", num_consumed); if_inc_counter(ifnet, IFCOUNTER_IQDROPS, 1); return ENOMEM; } nr_ents = xnb_txpkt2gnttab(&pkt, *mbufc, gnttab, txb, otherend); if (nr_ents > 0) { int __unused hv_ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, gnttab, nr_ents); KASSERT(hv_ret == 0, ("HYPERVISOR_grant_table_op returned %d\n", hv_ret)); xnb_update_mbufc(*mbufc, gnttab, nr_ents); } xnb_txpkt2rsp(&pkt, txb, 0); txb->req_cons += num_consumed; return 0; } /** * Create an xnb_pkt based on the contents of an mbuf chain. * \param[in] mbufc mbuf chain to transform into a packet * \param[out] pkt Storage for the newly generated xnb_pkt * \param[in] start The ring index of the first available slot in the rx * ring * \param[in] space The number of free slots in the rx ring * \retval 0 Success * \retval EINVAL mbufc was corrupt or not convertible into a pkt * \retval EAGAIN There was not enough space in the ring to queue the * packet */ static int xnb_mbufc2pkt(const struct mbuf *mbufc, struct xnb_pkt *pkt, RING_IDX start, int space) { int retval = 0; if ((mbufc == NULL) || ( (mbufc->m_flags & M_PKTHDR) == 0) || (mbufc->m_pkthdr.len == 0)) { xnb_pkt_invalidate(pkt); retval = EINVAL; } else { int slots_required; xnb_pkt_validate(pkt); pkt->flags = 0; pkt->size = mbufc->m_pkthdr.len; pkt->car = start; pkt->car_size = mbufc->m_len; if (mbufc->m_pkthdr.csum_flags & CSUM_TSO) { pkt->flags |= NETRXF_extra_info; pkt->extra.u.gso.size = mbufc->m_pkthdr.tso_segsz; pkt->extra.u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; pkt->extra.u.gso.pad = 0; pkt->extra.u.gso.features = 0; pkt->extra.type = XEN_NETIF_EXTRA_TYPE_GSO; pkt->extra.flags = 0; pkt->cdr = start + 2; } else { pkt->cdr = start + 1; } if (mbufc->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_DELAY_DATA)) { pkt->flags |= (NETRXF_csum_blank | NETRXF_data_validated); } /* * Each ring response can have up to PAGE_SIZE of data. 
* Assume that we can defragment the mbuf chain efficiently * into responses so that each response but the last uses all * PAGE_SIZE bytes. */ pkt->list_len = howmany(pkt->size, PAGE_SIZE); if (pkt->list_len > 1) { pkt->flags |= NETRXF_more_data; } slots_required = pkt->list_len + (pkt->flags & NETRXF_extra_info ? 1 : 0); if (slots_required > space) { xnb_pkt_invalidate(pkt); retval = EAGAIN; } } return retval; } /** * Build a gnttab_copy table that can be used to copy data from an mbuf chain * to the frontend's shared buffers. Does not actually perform the copy. * Always uses gref's on the other end's side. * \param[in] pkt pkt's associated responses form the dest for the copy * operatoin * \param[in] mbufc The source for the copy operation * \param[out] gnttab Storage for the returned grant table * \param[in] rxb Pointer to the backend ring structure * \param[in] otherend_id The domain ID of the other end of the copy * \return The number of gnttab entries filled */ static int xnb_rxpkt2gnttab(const struct xnb_pkt *pkt, const struct mbuf *mbufc, gnttab_copy_table gnttab, const netif_rx_back_ring_t *rxb, domid_t otherend_id) { const struct mbuf *mbuf = mbufc;/* current mbuf within the chain */ int gnt_idx = 0; /* index into grant table */ RING_IDX r_idx = pkt->car; /* index into rx ring buffer */ int r_ofs = 0; /* offset of next data within rx request's data area */ int m_ofs = 0; /* offset of next data within mbuf's data area */ /* size in bytes that still needs to be represented in the table */ uint16_t size_remaining; size_remaining = (xnb_pkt_is_valid(pkt) != 0) ? pkt->size : 0; while (size_remaining > 0) { const netif_rx_request_t *rxq = RING_GET_REQUEST(rxb, r_idx); const size_t mbuf_space = mbuf->m_len - m_ofs; /* Xen shared pages have an implied size of PAGE_SIZE */ const size_t req_size = PAGE_SIZE; const size_t pkt_space = req_size - r_ofs; /* * space is the largest amount of data that can be copied in the * grant table's next entry */ const size_t space = MIN(pkt_space, mbuf_space); /* TODO: handle this error condition without panicing */ KASSERT(gnt_idx < GNTTAB_LEN, ("Grant table is too short")); gnttab[gnt_idx].dest.u.ref = rxq->gref; gnttab[gnt_idx].dest.domid = otherend_id; gnttab[gnt_idx].dest.offset = r_ofs; gnttab[gnt_idx].source.u.gmfn = virt_to_mfn( mtod(mbuf, vm_offset_t) + m_ofs); gnttab[gnt_idx].source.offset = virt_to_offset( mtod(mbuf, vm_offset_t) + m_ofs); gnttab[gnt_idx].source.domid = DOMID_SELF; gnttab[gnt_idx].len = space; gnttab[gnt_idx].flags = GNTCOPY_dest_gref; gnt_idx++; r_ofs += space; m_ofs += space; size_remaining -= space; if (req_size - r_ofs <= 0) { /* Must move to the next rx request */ r_ofs = 0; r_idx = (r_idx == pkt->car) ? pkt->cdr : r_idx + 1; } if (mbuf->m_len - m_ofs <= 0) { /* Must move to the next mbuf */ m_ofs = 0; mbuf = mbuf->m_next; } } return gnt_idx; } /** * Generates responses for all the requests that constituted pkt. Builds * responses and writes them to the ring, but doesn't push the shared ring * indices. * \param[in] pkt the packet that needs a response * \param[in] gnttab The grant copy table corresponding to this packet. * Used to determine how many rsp->netif_rx_response_t's to * generate. 
* \param[in] n_entries Number of relevant entries in the grant table * \param[out] ring Responses go here * \return The number of RX requests that were consumed to generate * the responses */ static int xnb_rxpkt2rsp(const struct xnb_pkt *pkt, const gnttab_copy_table gnttab, int n_entries, netif_rx_back_ring_t *ring) { /* * This code makes the following assumptions: * * All entries in gnttab set GNTCOPY_dest_gref * * The entries in gnttab are grouped by their grefs: any two * entries with the same gref must be adjacent */ int error = 0; int gnt_idx, i; int n_responses = 0; grant_ref_t last_gref = GRANT_REF_INVALID; RING_IDX r_idx; KASSERT(gnttab != NULL, ("Received a null granttable copy")); /* * In the event of an error, we only need to send one response to the * netfront. In that case, we musn't write any data to the responses * after the one we send. So we must loop all the way through gnttab * looking for errors before we generate any responses * * Since we're looping through the grant table anyway, we'll count the * number of different gref's in it, which will tell us how many * responses to generate */ for (gnt_idx = 0; gnt_idx < n_entries; gnt_idx++) { int16_t status = gnttab[gnt_idx].status; if (status != GNTST_okay) { DPRINTF( "Got error %d for hypervisor gnttab_copy status\n", status); error = 1; break; } if (gnttab[gnt_idx].dest.u.ref != last_gref) { n_responses++; last_gref = gnttab[gnt_idx].dest.u.ref; } } if (error != 0) { uint16_t id; netif_rx_response_t *rsp; id = RING_GET_REQUEST(ring, ring->rsp_prod_pvt)->id; rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt); rsp->id = id; rsp->status = NETIF_RSP_ERROR; n_responses = 1; } else { gnt_idx = 0; const int has_extra = pkt->flags & NETRXF_extra_info; if (has_extra != 0) n_responses++; for (i = 0; i < n_responses; i++) { netif_rx_request_t rxq; netif_rx_response_t *rsp; r_idx = ring->rsp_prod_pvt + i; /* * We copy the structure of rxq instead of making a * pointer because it shares the same memory as rsp. */ rxq = *(RING_GET_REQUEST(ring, r_idx)); rsp = RING_GET_RESPONSE(ring, r_idx); if (has_extra && (i == 1)) { netif_extra_info_t *ext = (netif_extra_info_t*)rsp; ext->type = XEN_NETIF_EXTRA_TYPE_GSO; ext->flags = 0; ext->u.gso.size = pkt->extra.u.gso.size; ext->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; ext->u.gso.pad = 0; ext->u.gso.features = 0; } else { rsp->id = rxq.id; rsp->status = GNTST_okay; rsp->offset = 0; rsp->flags = 0; if (i < pkt->list_len - 1) rsp->flags |= NETRXF_more_data; if ((i == 0) && has_extra) rsp->flags |= NETRXF_extra_info; if ((i == 0) && (pkt->flags & NETRXF_data_validated)) { rsp->flags |= NETRXF_data_validated; rsp->flags |= NETRXF_csum_blank; } rsp->status = 0; for (; gnttab[gnt_idx].dest.u.ref == rxq.gref; gnt_idx++) { rsp->status += gnttab[gnt_idx].len; } } } } ring->req_cons += n_responses; ring->rsp_prod_pvt += n_responses; return n_responses; } #if defined(INET) || defined(INET6) /** * Add IP, TCP, and/or UDP checksums to every mbuf in a chain. The first mbuf * in the chain must start with a struct ether_header. * * XXX This function will perform incorrectly on UDP packets that are split up * into multiple ethernet frames. 
*/ static void xnb_add_mbuf_cksum(struct mbuf *mbufc) { struct ether_header *eh; struct ip *iph; uint16_t ether_type; eh = mtod(mbufc, struct ether_header*); ether_type = ntohs(eh->ether_type); if (ether_type != ETHERTYPE_IP) { /* Nothing to calculate */ return; } iph = (struct ip*)(eh + 1); if (mbufc->m_pkthdr.csum_flags & CSUM_IP_VALID) { iph->ip_sum = 0; iph->ip_sum = in_cksum_hdr(iph); } switch (iph->ip_p) { case IPPROTO_TCP: if (mbufc->m_pkthdr.csum_flags & CSUM_IP_VALID) { size_t tcplen = ntohs(iph->ip_len) - sizeof(struct ip); struct tcphdr *th = (struct tcphdr*)(iph + 1); th->th_sum = in_pseudo(iph->ip_src.s_addr, iph->ip_dst.s_addr, htons(IPPROTO_TCP + tcplen)); th->th_sum = in_cksum_skip(mbufc, sizeof(struct ether_header) + ntohs(iph->ip_len), sizeof(struct ether_header) + (iph->ip_hl << 2)); } break; case IPPROTO_UDP: if (mbufc->m_pkthdr.csum_flags & CSUM_IP_VALID) { size_t udplen = ntohs(iph->ip_len) - sizeof(struct ip); struct udphdr *uh = (struct udphdr*)(iph + 1); uh->uh_sum = in_pseudo(iph->ip_src.s_addr, iph->ip_dst.s_addr, htons(IPPROTO_UDP + udplen)); uh->uh_sum = in_cksum_skip(mbufc, sizeof(struct ether_header) + ntohs(iph->ip_len), sizeof(struct ether_header) + (iph->ip_hl << 2)); } break; default: break; } } #endif /* INET || INET6 */ static void xnb_stop(struct xnb_softc *xnb) { struct ifnet *ifp; mtx_assert(&xnb->sc_lock, MA_OWNED); ifp = xnb->xnb_ifp; ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if_link_state_change(ifp, LINK_STATE_DOWN); } static int xnb_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct xnb_softc *xnb = ifp->if_softc; struct ifreq *ifr = (struct ifreq*) data; #ifdef INET struct ifaddr *ifa = (struct ifaddr*)data; #endif int error = 0; switch (cmd) { case SIOCSIFFLAGS: mtx_lock(&xnb->sc_lock); if (ifp->if_flags & IFF_UP) { xnb_ifinit_locked(xnb); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { xnb_stop(xnb); } } /* * Note: netfront sets a variable named xn_if_flags * here, but that variable is never read */ mtx_unlock(&xnb->sc_lock); break; case SIOCSIFADDR: #ifdef INET mtx_lock(&xnb->sc_lock); if (ifa->ifa_addr->sa_family == AF_INET) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if_link_state_change(ifp, LINK_STATE_DOWN); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_UP); } arp_ifinit(ifp, ifa); mtx_unlock(&xnb->sc_lock); } else { mtx_unlock(&xnb->sc_lock); #endif error = ether_ioctl(ifp, cmd, data); #ifdef INET } #endif break; case SIOCSIFCAP: mtx_lock(&xnb->sc_lock); if (ifr->ifr_reqcap & IFCAP_TXCSUM) { ifp->if_capenable |= IFCAP_TXCSUM; ifp->if_hwassist |= XNB_CSUM_FEATURES; } else { ifp->if_capenable &= ~(IFCAP_TXCSUM); ifp->if_hwassist &= ~(XNB_CSUM_FEATURES); } if ((ifr->ifr_reqcap & IFCAP_RXCSUM)) { ifp->if_capenable |= IFCAP_RXCSUM; } else { ifp->if_capenable &= ~(IFCAP_RXCSUM); } /* * TODO enable TSO4 and LRO once we no longer need * to calculate checksums in software */ #if 0 if (ifr->if_reqcap |= IFCAP_TSO4) { if (IFCAP_TXCSUM & ifp->if_capenable) { printf("xnb: Xen netif requires that " "TXCSUM be enabled in order " "to use TSO4\n"); error = EINVAL; } else { ifp->if_capenable |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_TSO; } } else { ifp->if_capenable &= ~(IFCAP_TSO4); ifp->if_hwassist &= ~(CSUM_TSO); } if (ifr->ifreqcap |= IFCAP_LRO) { ifp->if_capenable |= IFCAP_LRO; } else { ifp->if_capenable &= ~(IFCAP_LRO); } #endif mtx_unlock(&xnb->sc_lock); break; case 
SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; xnb_ifinit(xnb); break; case SIOCADDMULTI: case SIOCDELMULTI: case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &xnb->sc_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void xnb_start_locked(struct ifnet *ifp) { netif_rx_back_ring_t *rxb; struct xnb_softc *xnb; struct mbuf *mbufc; RING_IDX req_prod_local; xnb = ifp->if_softc; rxb = &xnb->ring_configs[XNB_RING_TYPE_RX].back_ring.rx_ring; if (!xnb->carrier) return; do { int out_of_space = 0; int notify; req_prod_local = rxb->sring->req_prod; xen_rmb(); for (;;) { int error; IF_DEQUEUE(&ifp->if_snd, mbufc); if (mbufc == NULL) break; error = xnb_send(rxb, xnb->otherend_id, mbufc, xnb->rx_gnttab); switch (error) { case EAGAIN: /* * Insufficient space in the ring. * Requeue pkt and send when space is * available. */ IF_PREPEND(&ifp->if_snd, mbufc); /* * Perhaps the frontend missed an IRQ * and went to sleep. Notify it to wake * it up. */ out_of_space = 1; break; case EINVAL: /* OS gave a corrupt packet. Drop it.*/ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); /* FALLTHROUGH */ default: /* Send succeeded, or packet had error. * Free the packet */ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if (mbufc) m_freem(mbufc); break; } if (out_of_space != 0) break; } RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(rxb, notify); if ((notify != 0) || (out_of_space != 0)) xen_intr_signal(xnb->xen_intr_handle); rxb->sring->req_event = req_prod_local + 1; xen_mb(); } while (rxb->sring->req_prod != req_prod_local) ; } /** * Sends one packet to the ring. Blocks until the packet is on the ring * \param[in] mbufc Contains one packet to send. Caller must free * \param[in,out] rxb The packet will be pushed onto this ring, but the * otherend will not be notified. * \param[in] otherend The domain ID of the other end of the connection * \retval EAGAIN The ring did not have enough space for the packet. * The ring has not been modified * \param[in,out] gnttab Pointer to enough memory for a grant table. We make * this a function parameter so that we will take less * stack space. 
* \retval EINVAL mbufc was corrupt or not convertible into a pkt */ static int xnb_send(netif_rx_back_ring_t *ring, domid_t otherend, const struct mbuf *mbufc, gnttab_copy_table gnttab) { struct xnb_pkt pkt; int error, n_entries, n_reqs; RING_IDX space; space = ring->sring->req_prod - ring->req_cons; error = xnb_mbufc2pkt(mbufc, &pkt, ring->rsp_prod_pvt, space); if (error != 0) return error; n_entries = xnb_rxpkt2gnttab(&pkt, mbufc, gnttab, ring, otherend); if (n_entries != 0) { int __unused hv_ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, gnttab, n_entries); KASSERT(hv_ret == 0, ("HYPERVISOR_grant_table_op returned %d\n", hv_ret)); } n_reqs = xnb_rxpkt2rsp(&pkt, gnttab, n_entries, ring); return 0; } static void xnb_start(struct ifnet *ifp) { struct xnb_softc *xnb; xnb = ifp->if_softc; mtx_lock(&xnb->rx_lock); xnb_start_locked(ifp); mtx_unlock(&xnb->rx_lock); } /* equivalent of network_open() in Linux */ static void xnb_ifinit_locked(struct xnb_softc *xnb) { struct ifnet *ifp; ifp = xnb->xnb_ifp; mtx_assert(&xnb->sc_lock, MA_OWNED); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; xnb_stop(xnb); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_UP); } static void xnb_ifinit(void *xsc) { struct xnb_softc *xnb = xsc; mtx_lock(&xnb->sc_lock); xnb_ifinit_locked(xnb); mtx_unlock(&xnb->sc_lock); } /** * Callback used by the generic networking code to tell us when our carrier * state has changed. Since we don't have a physical carrier, we don't care */ static int xnb_ifmedia_upd(struct ifnet *ifp) { return (0); } /** * Callback used by the generic networking code to ask us what our carrier * state is. Since we don't have a physical carrier, this is very simple */ static void xnb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { ifmr->ifm_status = IFM_AVALID|IFM_ACTIVE; ifmr->ifm_active = IFM_ETHER|IFM_MANUAL; } /*---------------------------- NewBus Registration ---------------------------*/ static device_method_t xnb_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xnb_probe), DEVMETHOD(device_attach, xnb_attach), DEVMETHOD(device_detach, xnb_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xnb_suspend), DEVMETHOD(device_resume, xnb_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, xnb_frontend_changed), { 0, 0 } }; static driver_t xnb_driver = { "xnb", xnb_methods, sizeof(struct xnb_softc), }; devclass_t xnb_devclass; DRIVER_MODULE(xnb, xenbusb_back, xnb_driver, xnb_devclass, 0, 0); /*-------------------------- Unit Tests -------------------------------------*/ #ifdef XNB_DEBUG #include "netback_unit_tests.c" #endif Index: head/sys/dev/xen/xenstore/xenstore.c =================================================================== --- head/sys/dev/xen/xenstore/xenstore.c (revision 358315) +++ head/sys/dev/xen/xenstore/xenstore.c (revision 358316) @@ -1,1658 +1,1659 @@ /****************************************************************************** * xenstore.c * * Low-level kernel interface to the XenStore. 
* * Copyright (C) 2005 Rusty Russell, IBM Corporation * Copyright (C) 2009,2010 Spectra Logic Corporation * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /** * \file xenstore.c * \brief XenStore interface * * The XenStore interface is a simple storage system that is a means of * communicating state and configuration data between the Xen Domain 0 * and the various guest domains. All configuration data other than * a small amount of essential information required during the early * boot process of launching a Xen aware guest, is managed using the * XenStore. * * The XenStore is ASCII string based, and has a structure and semantics * similar to a filesystem. There are files and directories, the directories * able to contain files or other directories. The depth of the hierarchy * is only limited by the XenStore's maximum path length. * * The communication channel between the XenStore service and other * domains is via two, guest specific, ring buffers in a shared memory * area. One ring buffer is used for communicating in each direction. * The grant table references for this shared memory are given to the * guest either via the xen_start_info structure for a fully para- * virtualized guest, or via HVM hypercalls for a hardware virtualized * guest. * * The XenStore communication relies on an event channel and thus * interrupts. For this reason, the attachment of the XenStore * relies on an interrupt driven configuration hook to hold off * boot processing until communication with the XenStore service * can be established. * * Several Xen services depend on the XenStore, most notably the * XenBus used to discover and manage Xen devices. These services * are implemented as NewBus child attachments to a bus exported * by this XenStore driver. */ static struct xs_watch *find_watch(const char *token); MALLOC_DEFINE(M_XENSTORE, "xenstore", "XenStore data and results"); /** * Pointer to shared memory communication structures allowing us * to communicate with the XenStore service. * * When operating in full PV mode, this pointer is set early in kernel * startup from within xen_machdep.c. 
In HVM mode, we use hypercalls * to get the guest frame number for the shared page and then map it * into kva. See xs_init() for details. */ static struct xenstore_domain_interface *xen_store; /*-------------------------- Private Data Structures ------------------------*/ /** * Structure capturing messages received from the XenStore service. */ struct xs_stored_msg { TAILQ_ENTRY(xs_stored_msg) list; struct xsd_sockmsg hdr; union { /* Queued replies. */ struct { char *body; } reply; /* Queued watch events. */ struct { struct xs_watch *handle; const char **vec; u_int vec_size; } watch; } u; }; TAILQ_HEAD(xs_stored_msg_list, xs_stored_msg); /** * Container for all XenStore related state. */ struct xs_softc { /** Newbus device for the XenStore. */ device_t xs_dev; /** * Lock serializing access to ring producer/consumer * indexes. Use of this lock guarantees that wakeups * of blocking readers/writers are not missed due to * races with the XenStore service. */ struct mtx ring_lock; /* * Mutex used to insure exclusive access to the outgoing * communication ring. We use a lock type that can be * held while sleeping so that xs_write() can block waiting * for space in the ring to free up, without allowing another * writer to come in and corrupt a partial message write. */ struct sx request_mutex; /** * A list of replies to our requests. * * The reply list is filled by xs_rcv_thread(). It * is consumed by the context that issued the request * to which a reply is made. The requester blocks in * xs_read_reply(). * * /note Only one requesting context can be active at a time. * This is guaranteed by the request_mutex and insures * that the requester sees replies matching the order * of its requests. */ struct xs_stored_msg_list reply_list; /** Lock protecting the reply list. */ struct mtx reply_lock; /** * List of registered watches. */ struct xs_watch_list registered_watches; /** Lock protecting the registered watches list. */ struct mtx registered_watches_lock; /** * List of pending watch callback events. */ struct xs_stored_msg_list watch_events; /** Lock protecting the watch calback list. */ struct mtx watch_events_lock; /** * The processid of the xenwatch thread. */ pid_t xenwatch_pid; /** * Sleepable mutex used to gate the execution of XenStore * watch event callbacks. * * xenwatch_thread holds an exclusive lock on this mutex * while delivering event callbacks, and xenstore_unregister_watch() * uses an exclusive lock of this mutex to guarantee that no * callbacks of the just unregistered watch are pending * before returning to its caller. */ struct sx xenwatch_mutex; /** * The HVM guest pseudo-physical frame number. This is Xen's mapping * of the true machine frame number into our "physical address space". */ unsigned long gpfn; /** * The event channel for communicating with the * XenStore service. */ int evtchn; /** Handle for XenStore interrupts. */ xen_intr_handle_t xen_intr_handle; /** * Interrupt driven config hook allowing us to defer * attaching children until interrupts (and thus communication * with the XenStore service) are available. */ struct intr_config_hook xs_attachcb; /** * Xenstore is a user-space process that usually runs in Dom0, * so if this domain is booting as Dom0, xenstore wont we accessible, * and we have to defer the initialization of xenstore related * devices to later (when xenstore is started). */ bool initialized; /** * Task to run when xenstore is initialized (Dom0 only), will * take care of attaching xenstore related devices. 
*/ struct task xs_late_init; }; /*-------------------------------- Global Data ------------------------------*/ static struct xs_softc xs; /*------------------------- Private Utility Functions -----------------------*/ /** * Count and optionally record pointers to a number of NUL terminated * strings in a buffer. * * \param strings A pointer to a contiguous buffer of NUL terminated strings. * \param dest An array to store pointers to each string found in strings. * \param len The length of the buffer pointed to by strings. * * \return A count of the number of strings found. */ static u_int extract_strings(const char *strings, const char **dest, u_int len) { u_int num; const char *p; for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1) { if (dest != NULL) *dest++ = p; num++; } return (num); } /** * Convert a contiguous buffer containing a series of NUL terminated * strings into an array of pointers to strings. * * The returned pointer references the array of string pointers which * is followed by the storage for the string data. It is the client's * responsibility to free this storage. * * The storage addressed by strings is free'd prior to split returning. * * \param strings A pointer to a contiguous buffer of NUL terminated strings. * \param len The length of the buffer pointed to by strings. * \param num The number of strings found and returned in the strings * array. * * \return An array of pointers to the strings found in the input buffer. */ static const char ** split(char *strings, u_int len, u_int *num) { const char **ret; /* Protect against unterminated buffers. */ if (len > 0) strings[len - 1] = '\0'; /* Count the strings. */ *num = extract_strings(strings, /*dest*/NULL, len); /* Transfer to one big alloc for easy freeing by the caller. */ ret = malloc(*num * sizeof(char *) + len, M_XENSTORE, M_WAITOK); memcpy(&ret[*num], strings, len); free(strings, M_XENSTORE); /* Extract pointers to newly allocated array. */ strings = (char *)&ret[*num]; (void)extract_strings(strings, /*dest*/ret, len); return (ret); } /*------------------------- Public Utility Functions -------------------------*/ /*------- API comments for these methods can be found in xenstorevar.h -------*/ struct sbuf * xs_join(const char *dir, const char *name) { struct sbuf *sb; sb = sbuf_new_auto(); sbuf_cat(sb, dir); if (name[0] != '\0') { sbuf_putc(sb, '/'); sbuf_cat(sb, name); } sbuf_finish(sb); return (sb); } /*-------------------- Low Level Communication Management --------------------*/ /** * Interrupt handler for the XenStore event channel. * * XenStore reads and writes block on "xen_store" for buffer * space. Wakeup any blocking operations when the XenStore * service has modified the queues. */ static void xs_intr(void * arg __unused /*__attribute__((unused))*/) { /* If xenstore has not been initialized, initialize it now */ if (!xs.initialized) { xs.initialized = true; /* * Since this task is probing and attaching devices we * have to hold the Giant lock. */ taskqueue_enqueue(taskqueue_swi_giant, &xs.xs_late_init); } /* * Hold ring lock across wakeup so that clients * cannot miss a wakeup. */ mtx_lock(&xs.ring_lock); wakeup(xen_store); mtx_unlock(&xs.ring_lock); } /** * Verify that the indexes for a ring are valid. * * The difference between the producer and consumer cannot * exceed the size of the ring. * * \param cons The consumer index for the ring to test. * \param prod The producer index for the ring to test. * * \retval 1 If indexes are in range. * \retval 0 If the indexes are out of range. 
*/ static int xs_check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod) { return ((prod - cons) <= XENSTORE_RING_SIZE); } /** * Return a pointer to, and the length of, the contiguous * free region available for output in a ring buffer. * * \param cons The consumer index for the ring. * \param prod The producer index for the ring. * \param buf The base address of the ring's storage. * \param len The amount of contiguous storage available. * * \return A pointer to the start location of the free region. */ static void * xs_get_output_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod, char *buf, uint32_t *len) { *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod); if ((XENSTORE_RING_SIZE - (prod - cons)) < *len) *len = XENSTORE_RING_SIZE - (prod - cons); return (buf + MASK_XENSTORE_IDX(prod)); } /** * Return a pointer to, and the length of, the contiguous * data available to read from a ring buffer. * * \param cons The consumer index for the ring. * \param prod The producer index for the ring. * \param buf The base address of the ring's storage. * \param len The amount of contiguous data available to read. * * \return A pointer to the start location of the available data. */ static const void * xs_get_input_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod, const char *buf, uint32_t *len) { *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons); if ((prod - cons) < *len) *len = prod - cons; return (buf + MASK_XENSTORE_IDX(cons)); } /** * Transmit data to the XenStore service. * * \param tdata A pointer to the contiguous data to send. * \param len The amount of data to send. * * \return On success 0, otherwise an errno value indicating the * cause of failure. * * \invariant Called from thread context. * \invariant The buffer pointed to by tdata is at least len bytes * in length. * \invariant xs.request_mutex exclusively locked. */ static int xs_write_store(const void *tdata, unsigned len) { XENSTORE_RING_IDX cons, prod; const char *data = (const char *)tdata; int error; sx_assert(&xs.request_mutex, SX_XLOCKED); while (len != 0) { void *dst; u_int avail; /* Hold lock so we can't miss wakeups should we block. */ mtx_lock(&xs.ring_lock); cons = xen_store->req_cons; prod = xen_store->req_prod; if ((prod - cons) == XENSTORE_RING_SIZE) { /* * Output ring is full. Wait for a ring event. * * Note that the events from both queues * are combined, so being woken does not * guarantee that data exist in the read * ring. * * To simplify error recovery and the retry, * we specify PDROP so our lock is *not* held * when msleep returns. */ error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP, "xbwrite", /*timeout*/0); if (error && error != EWOULDBLOCK) return (error); /* Try again. */ continue; } mtx_unlock(&xs.ring_lock); /* Verify queue sanity. */ if (!xs_check_indexes(cons, prod)) { xen_store->req_cons = xen_store->req_prod = 0; return (EIO); } dst = xs_get_output_chunk(cons, prod, xen_store->req, &avail); if (avail > len) avail = len; memcpy(dst, data, avail); data += avail; len -= avail; /* * The store to the producer index, which indicates * to the other side that new data has arrived, must * be visible only after our copy of the data into the * ring has completed. */ wmb(); xen_store->req_prod += avail; /* * xen_intr_signal() implies mb(). The other side will see * the change to req_prod at the time of the interrupt. */ xen_intr_signal(xs.xen_intr_handle); } return (0); } /** * Receive data from the XenStore service. * * \param tdata A pointer to the contiguous buffer to receive the data. 
* \param len The amount of data to receive. * * \return On success 0, otherwise an errno value indicating the * cause of failure. * * \invariant Called from thread context. * \invariant The buffer pointed to by tdata is at least len bytes * in length. * * \note xs_read does not perform any internal locking to guarantee * serial access to the incoming ring buffer. However, there * is only one context processing reads: xs_rcv_thread(). */ static int xs_read_store(void *tdata, unsigned len) { XENSTORE_RING_IDX cons, prod; char *data = (char *)tdata; int error; while (len != 0) { u_int avail; const char *src; /* Hold lock so we can't miss wakeups should we block. */ mtx_lock(&xs.ring_lock); cons = xen_store->rsp_cons; prod = xen_store->rsp_prod; if (cons == prod) { /* * Nothing to read. Wait for a ring event. * * Note that the events from both queues * are combined, so being woken does not * guarantee that data exist in the read * ring. * * To simplify error recovery and the retry, * we specify PDROP so our lock is *not* held * when msleep returns. */ error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP, "xbread", /*timeout*/0); if (error && error != EWOULDBLOCK) return (error); continue; } mtx_unlock(&xs.ring_lock); /* Verify queue sanity. */ if (!xs_check_indexes(cons, prod)) { xen_store->rsp_cons = xen_store->rsp_prod = 0; return (EIO); } src = xs_get_input_chunk(cons, prod, xen_store->rsp, &avail); if (avail > len) avail = len; /* * Insure the data we read is related to the indexes * we read above. */ rmb(); memcpy(data, src, avail); data += avail; len -= avail; /* * Insure that the producer of this ring does not see * the ring space as free until after we have copied it * out. */ mb(); xen_store->rsp_cons += avail; /* * xen_intr_signal() implies mb(). The producer will see * the updated consumer index when the event is delivered. */ xen_intr_signal(xs.xen_intr_handle); } return (0); } /*----------------------- Received Message Processing ------------------------*/ /** * Block reading the next message from the XenStore service and * process the result. * * \param type The returned type of the XenStore message received. * * \return 0 on success. Otherwise an errno value indicating the * type of failure encountered. */ static int xs_process_msg(enum xsd_sockmsg_type *type) { struct xs_stored_msg *msg; char *body; int error; msg = malloc(sizeof(*msg), M_XENSTORE, M_WAITOK); error = xs_read_store(&msg->hdr, sizeof(msg->hdr)); if (error) { free(msg, M_XENSTORE); return (error); } body = malloc(msg->hdr.len + 1, M_XENSTORE, M_WAITOK); error = xs_read_store(body, msg->hdr.len); if (error) { free(body, M_XENSTORE); free(msg, M_XENSTORE); return (error); } body[msg->hdr.len] = '\0'; *type = msg->hdr.type; if (msg->hdr.type == XS_WATCH_EVENT) { msg->u.watch.vec = split(body, msg->hdr.len, &msg->u.watch.vec_size); mtx_lock(&xs.registered_watches_lock); msg->u.watch.handle = find_watch( msg->u.watch.vec[XS_WATCH_TOKEN]); if (msg->u.watch.handle != NULL) { mtx_lock(&xs.watch_events_lock); TAILQ_INSERT_TAIL(&xs.watch_events, msg, list); wakeup(&xs.watch_events); mtx_unlock(&xs.watch_events_lock); } else { free(msg->u.watch.vec, M_XENSTORE); free(msg, M_XENSTORE); } mtx_unlock(&xs.registered_watches_lock); } else { msg->u.reply.body = body; mtx_lock(&xs.reply_lock); TAILQ_INSERT_TAIL(&xs.reply_list, msg, list); wakeup(&xs.reply_list); mtx_unlock(&xs.reply_lock); } return (0); } /** * Thread body of the XenStore receive thread. 
* * This thread blocks waiting for data from the XenStore service * and processes and received messages. */ static void xs_rcv_thread(void *arg __unused) { int error; enum xsd_sockmsg_type type; for (;;) { error = xs_process_msg(&type); if (error) printf("XENSTORE error %d while reading message\n", error); } } /*---------------- XenStore Message Request/Reply Processing -----------------*/ #define xsd_error_count (sizeof(xsd_errors) / sizeof(xsd_errors[0])) /** * Convert a XenStore error string into an errno number. * * \param errorstring The error string to convert. * * \return The errno best matching the input string. * * \note Unknown error strings are converted to EINVAL. */ static int xs_get_error(const char *errorstring) { u_int i; for (i = 0; i < xsd_error_count; i++) { if (!strcmp(errorstring, xsd_errors[i].errstring)) return (xsd_errors[i].errnum); } log(LOG_WARNING, "XENSTORE xen store gave: unknown error %s", errorstring); return (EINVAL); } /** * Block waiting for a reply to a message request. * * \param type The returned type of the reply. * \param len The returned body length of the reply. * \param result The returned body of the reply. * * \return 0 on success. Otherwise an errno indicating the * cause of failure. */ static int xs_read_reply(enum xsd_sockmsg_type *type, u_int *len, void **result) { struct xs_stored_msg *msg; char *body; int error; mtx_lock(&xs.reply_lock); while (TAILQ_EMPTY(&xs.reply_list)) { error = mtx_sleep(&xs.reply_list, &xs.reply_lock, 0, "xswait", hz/10); if (error && error != EWOULDBLOCK) { mtx_unlock(&xs.reply_lock); return (error); } } msg = TAILQ_FIRST(&xs.reply_list); TAILQ_REMOVE(&xs.reply_list, msg, list); mtx_unlock(&xs.reply_lock); *type = msg->hdr.type; if (len) *len = msg->hdr.len; body = msg->u.reply.body; free(msg, M_XENSTORE); *result = body; return (0); } /** * Pass-thru interface for XenStore access by userland processes * via the XenStore device. * * Reply type and length data are returned by overwriting these * fields in the passed in request message. * * \param msg A properly formatted message to transmit to * the XenStore service. * \param result The returned body of the reply. * * \return 0 on success. Otherwise an errno indicating the cause * of failure. * * \note The returned result is provided in malloced storage and thus * must be free'd by the caller with 'free(result, M_XENSTORE); */ int xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result) { uint32_t request_type; int error; request_type = msg->type; sx_xlock(&xs.request_mutex); if ((error = xs_write_store(msg, sizeof(*msg) + msg->len)) == 0) error = xs_read_reply(&msg->type, &msg->len, result); sx_xunlock(&xs.request_mutex); return (error); } /** * Send a message with an optionally muti-part body to the XenStore service. * * \param t The transaction to use for this request. * \param request_type The type of message to send. * \param iovec Pointers to the body sections of the request. * \param num_vecs The number of body sections in the request. * \param len The returned length of the reply. * \param result The returned body of the reply. * * \return 0 on success. Otherwise an errno indicating * the cause of failure. 
* * \note The returned result is provided in malloced storage and thus * must be free'd by the caller with 'free(*result, M_XENSTORE); */ static int xs_talkv(struct xs_transaction t, enum xsd_sockmsg_type request_type, const struct iovec *iovec, u_int num_vecs, u_int *len, void **result) { struct xsd_sockmsg msg; void *ret = NULL; u_int i; int error; msg.tx_id = t.id; msg.req_id = 0; msg.type = request_type; msg.len = 0; for (i = 0; i < num_vecs; i++) msg.len += iovec[i].iov_len; sx_xlock(&xs.request_mutex); error = xs_write_store(&msg, sizeof(msg)); if (error) { printf("xs_talkv failed %d\n", error); goto error_lock_held; } for (i = 0; i < num_vecs; i++) { error = xs_write_store(iovec[i].iov_base, iovec[i].iov_len); if (error) { printf("xs_talkv failed %d\n", error); goto error_lock_held; } } error = xs_read_reply(&msg.type, len, &ret); error_lock_held: sx_xunlock(&xs.request_mutex); if (error) return (error); if (msg.type == XS_ERROR) { error = xs_get_error(ret); free(ret, M_XENSTORE); return (error); } /* Reply is either error or an echo of our request message type. */ KASSERT(msg.type == request_type, ("bad xenstore message type")); if (result) *result = ret; else free(ret, M_XENSTORE); return (0); } /** * Wrapper for xs_talkv allowing easy transmission of a message with * a single, contiguous, message body. * * \param t The transaction to use for this request. * \param request_type The type of message to send. * \param body The body of the request. * \param len The returned length of the reply. * \param result The returned body of the reply. * * \return 0 on success. Otherwise an errno indicating * the cause of failure. * * \note The returned result is provided in malloced storage and thus * must be free'd by the caller with 'free(*result, M_XENSTORE); */ static int xs_single(struct xs_transaction t, enum xsd_sockmsg_type request_type, const char *body, u_int *len, void **result) { struct iovec iovec; iovec.iov_base = (void *)(uintptr_t)body; iovec.iov_len = strlen(body) + 1; return (xs_talkv(t, request_type, &iovec, 1, len, result)); } /*------------------------- XenStore Watch Support ---------------------------*/ /** * Transmit a watch request to the XenStore service. * * \param path The path in the XenStore to watch. * \param tocken A unique identifier for this watch. * * \return 0 on success. Otherwise an errno indicating the * cause of failure. */ static int xs_watch(const char *path, const char *token) { struct iovec iov[2]; iov[0].iov_base = (void *)(uintptr_t) path; iov[0].iov_len = strlen(path) + 1; iov[1].iov_base = (void *)(uintptr_t) token; iov[1].iov_len = strlen(token) + 1; return (xs_talkv(XST_NIL, XS_WATCH, iov, 2, NULL, NULL)); } /** * Transmit an uwatch request to the XenStore service. * * \param path The path in the XenStore to watch. * \param tocken A unique identifier for this watch. * * \return 0 on success. Otherwise an errno indicating the * cause of failure. */ static int xs_unwatch(const char *path, const char *token) { struct iovec iov[2]; iov[0].iov_base = (void *)(uintptr_t) path; iov[0].iov_len = strlen(path) + 1; iov[1].iov_base = (void *)(uintptr_t) token; iov[1].iov_len = strlen(token) + 1; return (xs_talkv(XST_NIL, XS_UNWATCH, iov, 2, NULL, NULL)); } /** * Convert from watch token (unique identifier) to the associated * internal tracking structure for this watch. * * \param tocken The unique identifier for the watch to find. * * \return A pointer to the found watch structure or NULL. 
*/ static struct xs_watch * find_watch(const char *token) { struct xs_watch *i, *cmp; cmp = (void *)strtoul(token, NULL, 16); LIST_FOREACH(i, &xs.registered_watches, list) if (i == cmp) return (i); return (NULL); } /** * Thread body of the XenStore watch event dispatch thread. */ static void xenwatch_thread(void *unused) { struct xs_stored_msg *msg; for (;;) { mtx_lock(&xs.watch_events_lock); while (TAILQ_EMPTY(&xs.watch_events)) mtx_sleep(&xs.watch_events, &xs.watch_events_lock, PWAIT | PCATCH, "waitev", hz/10); mtx_unlock(&xs.watch_events_lock); sx_xlock(&xs.xenwatch_mutex); mtx_lock(&xs.watch_events_lock); msg = TAILQ_FIRST(&xs.watch_events); if (msg) TAILQ_REMOVE(&xs.watch_events, msg, list); mtx_unlock(&xs.watch_events_lock); if (msg != NULL) { /* * XXX There are messages coming in with a NULL * XXX callback. This deserves further investigation; * XXX the workaround here simply prevents the kernel * XXX from panic'ing on startup. */ if (msg->u.watch.handle->callback != NULL) msg->u.watch.handle->callback( msg->u.watch.handle, (const char **)msg->u.watch.vec, msg->u.watch.vec_size); free(msg->u.watch.vec, M_XENSTORE); free(msg, M_XENSTORE); } sx_xunlock(&xs.xenwatch_mutex); } } /*----------- XenStore Configuration, Initialization, and Control ------------*/ /** * Setup communication channels with the XenStore service. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ static int xs_init_comms(void) { int error; if (xen_store->rsp_prod != xen_store->rsp_cons) { log(LOG_WARNING, "XENSTORE response ring is not quiescent " "(%08x:%08x): fixing up\n", xen_store->rsp_cons, xen_store->rsp_prod); xen_store->rsp_cons = xen_store->rsp_prod; } xen_intr_unbind(&xs.xen_intr_handle); error = xen_intr_bind_local_port(xs.xs_dev, xs.evtchn, /*filter*/NULL, xs_intr, /*arg*/NULL, INTR_TYPE_NET|INTR_MPSAFE, &xs.xen_intr_handle); if (error) { log(LOG_WARNING, "XENSTORE request irq failed %i\n", error); return (error); } return (0); } /*------------------ Private Device Attachment Functions --------------------*/ static void xs_identify(driver_t *driver, device_t parent) { BUS_ADD_CHILD(parent, 0, "xenstore", 0); } /** * Probe for the existence of the XenStore. * * \param dev */ static int xs_probe(device_t dev) { /* * We are either operating within a PV kernel or being probed * as the child of the successfully attached xenpci device. * Thus we are in a Xen environment and there will be a XenStore. * Unconditionally return success. */ device_set_desc(dev, "XenStore"); return (BUS_PROBE_NOWILDCARD); } static void xs_attach_deferred(void *arg) { bus_generic_probe(xs.xs_dev); bus_generic_attach(xs.xs_dev); config_intrhook_disestablish(&xs.xs_attachcb); } static void xs_attach_late(void *arg, int pending) { KASSERT((pending == 1), ("xs late attach queued several times")); bus_generic_probe(xs.xs_dev); bus_generic_attach(xs.xs_dev); } /** * Attach to the XenStore. * * This routine also prepares for the probe/attach of drivers that rely * on the XenStore. */ static int xs_attach(device_t dev) { int error; /* Allow us to get device_t from softc and vice-versa. */ xs.xs_dev = dev; device_set_softc(dev, &xs); /* Initialize the interface to xenstore. 
*/ struct proc *p; xs.initialized = false; xs.evtchn = xen_get_xenstore_evtchn(); if (xs.evtchn == 0) { struct evtchn_alloc_unbound alloc_unbound; /* Allocate a local event channel for xenstore */ alloc_unbound.dom = DOMID_SELF; alloc_unbound.remote_dom = DOMID_SELF; error = HYPERVISOR_event_channel_op( EVTCHNOP_alloc_unbound, &alloc_unbound); if (error != 0) panic( "unable to alloc event channel for Dom0: %d", error); xs.evtchn = alloc_unbound.port; /* Allocate memory for the xs shared ring */ xen_store = malloc(PAGE_SIZE, M_XENSTORE, M_WAITOK | M_ZERO); xs.gpfn = atop(pmap_kextract((vm_offset_t)xen_store)); } else { xs.gpfn = xen_get_xenstore_mfn(); xen_store = pmap_mapdev_attr(ptoa(xs.gpfn), PAGE_SIZE, PAT_WRITE_BACK); xs.initialized = true; } TAILQ_INIT(&xs.reply_list); TAILQ_INIT(&xs.watch_events); mtx_init(&xs.ring_lock, "ring lock", NULL, MTX_DEF); mtx_init(&xs.reply_lock, "reply lock", NULL, MTX_DEF); sx_init(&xs.xenwatch_mutex, "xenwatch"); sx_init(&xs.request_mutex, "xenstore request"); mtx_init(&xs.registered_watches_lock, "watches", NULL, MTX_DEF); mtx_init(&xs.watch_events_lock, "watch events", NULL, MTX_DEF); /* Initialize the shared memory rings to talk to xenstored */ error = xs_init_comms(); if (error) return (error); error = kproc_create(xenwatch_thread, NULL, &p, RFHIGHPID, 0, "xenwatch"); if (error) return (error); xs.xenwatch_pid = p->p_pid; error = kproc_create(xs_rcv_thread, NULL, NULL, RFHIGHPID, 0, "xenstore_rcv"); xs.xs_attachcb.ich_func = xs_attach_deferred; xs.xs_attachcb.ich_arg = NULL; if (xs.initialized) { config_intrhook_establish(&xs.xs_attachcb); } else { TASK_INIT(&xs.xs_late_init, 0, xs_attach_late, NULL); } return (error); } /** * Prepare for suspension of this VM by halting XenStore access after * all transactions and individual requests have completed. */ static int xs_suspend(device_t dev) { int error; /* Suspend child Xen devices. */ error = bus_generic_suspend(dev); if (error != 0) return (error); sx_xlock(&xs.request_mutex); return (0); } /** * Resume XenStore operations after this VM is resumed. */ static int xs_resume(device_t dev __unused) { struct xs_watch *watch; char token[sizeof(watch) * 2 + 1]; xs_init_comms(); sx_xunlock(&xs.request_mutex); /* * NB: since xenstore childs have not been resumed yet, there's * no need to hold any watch mutex. Having clients try to add or * remove watches at this point (before xenstore is resumed) is * clearly a violantion of the resume order. */ LIST_FOREACH(watch, &xs.registered_watches, list) { sprintf(token, "%lX", (long)watch); xs_watch(watch->node, token); } /* Resume child Xen devices. 
*/ bus_generic_resume(dev); return (0); } /*-------------------- Private Device Attachment Data -----------------------*/ static device_method_t xenstore_methods[] = { /* Device interface */ DEVMETHOD(device_identify, xs_identify), DEVMETHOD(device_probe, xs_probe), DEVMETHOD(device_attach, xs_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xs_suspend), DEVMETHOD(device_resume, xs_resume), /* Bus interface */ DEVMETHOD(bus_add_child, bus_generic_add_child), DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource), DEVMETHOD(bus_release_resource, bus_generic_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD_END }; DEFINE_CLASS_0(xenstore, xenstore_driver, xenstore_methods, 0); static devclass_t xenstore_devclass; DRIVER_MODULE(xenstore, xenpv, xenstore_driver, xenstore_devclass, 0, 0); /*------------------------------- Sysctl Data --------------------------------*/ /* XXX Shouldn't the node be somewhere else? */ -SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD, NULL, "Xen"); +SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, + "Xen"); SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xs.evtchn, 0, ""); SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, ""); /*-------------------------------- Public API --------------------------------*/ /*------- API comments for these methods can be found in xenstorevar.h -------*/ bool xs_initialized(void) { return (xs.initialized); } evtchn_port_t xs_evtchn(void) { return (xs.evtchn); } vm_paddr_t xs_address(void) { return (ptoa(xs.gpfn)); } int xs_directory(struct xs_transaction t, const char *dir, const char *node, u_int *num, const char ***result) { struct sbuf *path; char *strings; u_int len = 0; int error; path = xs_join(dir, node); error = xs_single(t, XS_DIRECTORY, sbuf_data(path), &len, (void **)&strings); sbuf_delete(path); if (error) return (error); *result = split(strings, len, num); return (0); } int xs_exists(struct xs_transaction t, const char *dir, const char *node) { const char **d; int error, dir_n; error = xs_directory(t, dir, node, &dir_n, &d); if (error) return (0); free(d, M_XENSTORE); return (1); } int xs_read(struct xs_transaction t, const char *dir, const char *node, u_int *len, void **result) { struct sbuf *path; void *ret; int error; path = xs_join(dir, node); error = xs_single(t, XS_READ, sbuf_data(path), len, &ret); sbuf_delete(path); if (error) return (error); *result = ret; return (0); } int xs_write(struct xs_transaction t, const char *dir, const char *node, const char *string) { struct sbuf *path; struct iovec iovec[2]; int error; path = xs_join(dir, node); iovec[0].iov_base = (void *)(uintptr_t) sbuf_data(path); iovec[0].iov_len = sbuf_len(path) + 1; iovec[1].iov_base = (void *)(uintptr_t) string; iovec[1].iov_len = strlen(string); error = xs_talkv(t, XS_WRITE, iovec, 2, NULL, NULL); sbuf_delete(path); return (error); } int xs_mkdir(struct xs_transaction t, const char *dir, const char *node) { struct sbuf *path; int ret; path = xs_join(dir, node); ret = xs_single(t, XS_MKDIR, sbuf_data(path), NULL, NULL); sbuf_delete(path); return (ret); } int xs_rm(struct xs_transaction t, const char *dir, const char *node) { struct sbuf *path; int ret; path = xs_join(dir, node); ret = xs_single(t, XS_RM, sbuf_data(path), NULL, NULL); sbuf_delete(path); return (ret); } int xs_rm_tree(struct 
xs_transaction xbt, const char *base, const char *node) { struct xs_transaction local_xbt; struct sbuf *root_path_sbuf; struct sbuf *cur_path_sbuf; char *root_path; char *cur_path; const char **dir; int error; retry: root_path_sbuf = xs_join(base, node); cur_path_sbuf = xs_join(base, node); root_path = sbuf_data(root_path_sbuf); cur_path = sbuf_data(cur_path_sbuf); dir = NULL; local_xbt.id = 0; if (xbt.id == 0) { error = xs_transaction_start(&local_xbt); if (error != 0) goto out; xbt = local_xbt; } while (1) { u_int count; u_int i; error = xs_directory(xbt, cur_path, "", &count, &dir); if (error) goto out; for (i = 0; i < count; i++) { error = xs_rm(xbt, cur_path, dir[i]); if (error == ENOTEMPTY) { struct sbuf *push_dir; /* * Descend to clear out this sub directory. * We'll return to cur_dir once push_dir * is empty. */ push_dir = xs_join(cur_path, dir[i]); sbuf_delete(cur_path_sbuf); cur_path_sbuf = push_dir; cur_path = sbuf_data(cur_path_sbuf); break; } else if (error != 0) { goto out; } } free(dir, M_XENSTORE); dir = NULL; if (i == count) { char *last_slash; /* Directory is empty. It is now safe to remove. */ error = xs_rm(xbt, cur_path, ""); if (error != 0) goto out; if (!strcmp(cur_path, root_path)) break; /* Return to processing the parent directory. */ last_slash = strrchr(cur_path, '/'); KASSERT(last_slash != NULL, ("xs_rm_tree: mangled path %s", cur_path)); *last_slash = '\0'; } } out: sbuf_delete(cur_path_sbuf); sbuf_delete(root_path_sbuf); if (dir != NULL) free(dir, M_XENSTORE); if (local_xbt.id != 0) { int terror; terror = xs_transaction_end(local_xbt, /*abort*/error != 0); xbt.id = 0; if (terror == EAGAIN && error == 0) goto retry; } return (error); } int xs_transaction_start(struct xs_transaction *t) { char *id_str; int error; error = xs_single(XST_NIL, XS_TRANSACTION_START, "", NULL, (void **)&id_str); if (error == 0) { t->id = strtoul(id_str, NULL, 0); free(id_str, M_XENSTORE); } return (error); } int xs_transaction_end(struct xs_transaction t, int abort) { char abortstr[2]; if (abort) strcpy(abortstr, "F"); else strcpy(abortstr, "T"); return (xs_single(t, XS_TRANSACTION_END, abortstr, NULL, NULL)); } int xs_scanf(struct xs_transaction t, const char *dir, const char *node, int *scancountp, const char *fmt, ...) { va_list ap; int error, ns; char *val; error = xs_read(t, dir, node, NULL, (void **) &val); if (error) return (error); va_start(ap, fmt); ns = vsscanf(val, fmt, ap); va_end(ap); free(val, M_XENSTORE); /* Distinctive errno. */ if (ns == 0) return (ERANGE); if (scancountp) *scancountp = ns; return (0); } int xs_vprintf(struct xs_transaction t, const char *dir, const char *node, const char *fmt, va_list ap) { struct sbuf *sb; int error; sb = sbuf_new_auto(); sbuf_vprintf(sb, fmt, ap); sbuf_finish(sb); error = xs_write(t, dir, node, sbuf_data(sb)); sbuf_delete(sb); return (error); } int xs_printf(struct xs_transaction t, const char *dir, const char *node, const char *fmt, ...) { va_list ap; int error; va_start(ap, fmt); error = xs_vprintf(t, dir, node, fmt, ap); va_end(ap); return (error); } int xs_gather(struct xs_transaction t, const char *dir, ...) 
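/*
 * Illustrative usage only (the node names and variables below are
 * hypothetical, not taken from this change): arguments come in
 * (name, scanf-format, result) triples terminated by a NULL name, and
 * a NULL format returns the raw string, which the caller must free
 * with free(..., M_XENSTORE):
 *
 *   unsigned int ring_ref, evtchn;
 *   char *proto;
 *   error = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
 *       "ring-ref", "%u", &ring_ref,
 *       "event-channel", "%u", &evtchn,
 *       "protocol", NULL, &proto,
 *       NULL);
 */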
{ va_list ap; const char *name; int error; va_start(ap, dir); error = 0; while (error == 0 && (name = va_arg(ap, char *)) != NULL) { const char *fmt = va_arg(ap, char *); void *result = va_arg(ap, void *); char *p; error = xs_read(t, dir, name, NULL, (void **) &p); if (error) break; if (fmt) { if (sscanf(p, fmt, result) == 0) error = EINVAL; free(p, M_XENSTORE); } else *(char **)result = p; } va_end(ap); return (error); } int xs_register_watch(struct xs_watch *watch) { /* Pointer in ascii is the token. */ char token[sizeof(watch) * 2 + 1]; int error; sprintf(token, "%lX", (long)watch); mtx_lock(&xs.registered_watches_lock); KASSERT(find_watch(token) == NULL, ("watch already registered")); LIST_INSERT_HEAD(&xs.registered_watches, watch, list); mtx_unlock(&xs.registered_watches_lock); error = xs_watch(watch->node, token); /* Ignore errors due to multiple registration. */ if (error == EEXIST) error = 0; if (error != 0) { mtx_lock(&xs.registered_watches_lock); LIST_REMOVE(watch, list); mtx_unlock(&xs.registered_watches_lock); } return (error); } void xs_unregister_watch(struct xs_watch *watch) { struct xs_stored_msg *msg, *tmp; char token[sizeof(watch) * 2 + 1]; int error; sprintf(token, "%lX", (long)watch); mtx_lock(&xs.registered_watches_lock); if (find_watch(token) == NULL) { mtx_unlock(&xs.registered_watches_lock); return; } LIST_REMOVE(watch, list); mtx_unlock(&xs.registered_watches_lock); error = xs_unwatch(watch->node, token); if (error) log(LOG_WARNING, "XENSTORE Failed to release watch %s: %i\n", watch->node, error); /* Cancel pending watch events. */ mtx_lock(&xs.watch_events_lock); TAILQ_FOREACH_SAFE(msg, &xs.watch_events, list, tmp) { if (msg->u.watch.handle != watch) continue; TAILQ_REMOVE(&xs.watch_events, msg, list); free(msg->u.watch.vec, M_XENSTORE); free(msg, M_XENSTORE); } mtx_unlock(&xs.watch_events_lock); /* Flush any currently-executing callback, unless we are it. :-) */ if (curproc->p_pid != xs.xenwatch_pid) { sx_xlock(&xs.xenwatch_mutex); sx_xunlock(&xs.xenwatch_mutex); } } void xs_lock(void) { sx_xlock(&xs.request_mutex); return; } void xs_unlock(void) { sx_xunlock(&xs.request_mutex); return; } Index: head/sys/xen/xenbus/xenbusb.c =================================================================== --- head/sys/xen/xenbus/xenbusb.c (revision 358315) +++ head/sys/xen/xenbus/xenbusb.c (revision 358316) @@ -1,975 +1,975 @@ /****************************************************************************** * Copyright (C) 2010 Spectra Logic Corporation * Copyright (C) 2008 Doug Rabson * Copyright (C) 2005 Rusty Russell, IBM Corporation * Copyright (C) 2005 Mike Wray, Hewlett-Packard * Copyright (C) 2005 XenSource Ltd * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ /** * \file xenbusb.c * * \brief Shared support functions for managing the NewBus buses that contain * Xen front and back end device instances. * * The NewBus implementation of XenBus attaches a xenbusb_front and xenbusb_back * child bus to the xenstore device. This strategy allows the small differences * in the handling of XenBus operations for front and back devices to be handled * as overrides in xenbusb_front/back.c. Front and back specific device * classes are also provided so device drivers can register for the devices they * can handle without the need to filter within their probe routines. The * net result is a device hierarchy that might look like this: * * xenstore0/ * xenbusb_front0/ * xn0 * xbd0 * xbd1 * xenbusb_back0/ * xbbd0 * xnb0 * xnb1 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*------------------------- Private Functions --------------------------------*/ /** * \brief Deallocate XenBus device instance variables. * * \param ivars The instance variable block to free. */ static void xenbusb_free_child_ivars(struct xenbus_device_ivars *ivars) { if (ivars->xd_otherend_watch.node != NULL) { xs_unregister_watch(&ivars->xd_otherend_watch); free(ivars->xd_otherend_watch.node, M_XENBUS); ivars->xd_otherend_watch.node = NULL; } if (ivars->xd_local_watch.node != NULL) { xs_unregister_watch(&ivars->xd_local_watch); ivars->xd_local_watch.node = NULL; } if (ivars->xd_node != NULL) { free(ivars->xd_node, M_XENBUS); ivars->xd_node = NULL; } ivars->xd_node_len = 0; if (ivars->xd_type != NULL) { free(ivars->xd_type, M_XENBUS); ivars->xd_type = NULL; } if (ivars->xd_otherend_path != NULL) { free(ivars->xd_otherend_path, M_XENBUS); ivars->xd_otherend_path = NULL; } ivars->xd_otherend_path_len = 0; free(ivars, M_XENBUS); } /** * XenBus watch callback registered against the "state" XenStore * node of the other-end of a split device connection. * * This callback is invoked whenever the state of a device instance's * peer changes. * * \param watch The xs_watch object used to register this callback * function. * \param vec An array of pointers to NUL terminated strings containing * watch event data. The vector should be indexed via the * xs_watch_type enum in xs_wire.h. * \param vec_size The number of elements in vec. 
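 *
 * As an illustration (the path below is hypothetical), when a backend
 * moves to Connected the event's vec[XS_WATCH_PATH] might read
 * "backend/vif/1/0/state"; the callback checks that the path falls
 * under xd_otherend_path, re-reads the peer's state node and hands the
 * new xenbus_state to the bus via XENBUSB_OTHEREND_CHANGED().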
*/ static void xenbusb_otherend_watch_cb(struct xs_watch *watch, const char **vec, unsigned int vec_size __unused) { struct xenbus_device_ivars *ivars; device_t child; device_t bus; const char *path; enum xenbus_state newstate; ivars = (struct xenbus_device_ivars *)watch->callback_data; child = ivars->xd_dev; bus = device_get_parent(child); path = vec[XS_WATCH_PATH]; if (ivars->xd_otherend_path == NULL || strncmp(ivars->xd_otherend_path, path, ivars->xd_otherend_path_len)) return; newstate = xenbus_read_driver_state(ivars->xd_otherend_path); XENBUSB_OTHEREND_CHANGED(bus, child, newstate); } /** * XenBus watch callback registered against the XenStore sub-tree * represnting the local half of a split device connection. * * This callback is invoked whenever any XenStore data in the subtree * is modified, either by us or another privledged domain. * * \param watch The xs_watch object used to register this callback * function. * \param vec An array of pointers to NUL terminated strings containing * watch event data. The vector should be indexed via the * xs_watch_type enum in xs_wire.h. * \param vec_size The number of elements in vec. * */ static void xenbusb_local_watch_cb(struct xs_watch *watch, const char **vec, unsigned int vec_size __unused) { struct xenbus_device_ivars *ivars; device_t child; device_t bus; const char *path; ivars = (struct xenbus_device_ivars *)watch->callback_data; child = ivars->xd_dev; bus = device_get_parent(child); path = vec[XS_WATCH_PATH]; if (ivars->xd_node == NULL || strncmp(ivars->xd_node, path, ivars->xd_node_len)) return; XENBUSB_LOCALEND_CHANGED(bus, child, &path[ivars->xd_node_len]); } /** * Search our internal record of configured devices (not the XenStore) * to determine if the XenBus device indicated by \a node is known to * the system. * * \param dev The XenBus bus instance to search for device children. * \param node The XenStore node path for the device to find. * * \return The device_t of the found device if any, or NULL. * * \note device_t is a pointer type, so it can be compared against * NULL for validity. */ static device_t xenbusb_device_exists(device_t dev, const char *node) { device_t *kids; device_t result; struct xenbus_device_ivars *ivars; int i, count; if (device_get_children(dev, &kids, &count)) return (FALSE); result = NULL; for (i = 0; i < count; i++) { ivars = device_get_ivars(kids[i]); if (!strcmp(ivars->xd_node, node)) { result = kids[i]; break; } } free(kids, M_TEMP); return (result); } static void xenbusb_delete_child(device_t dev, device_t child) { struct xenbus_device_ivars *ivars; ivars = device_get_ivars(child); /* * We no longer care about the otherend of the * connection. Cancel the watches now so that we * don't try to handle an event for a partially * detached child. */ if (ivars->xd_otherend_watch.node != NULL) xs_unregister_watch(&ivars->xd_otherend_watch); if (ivars->xd_local_watch.node != NULL) xs_unregister_watch(&ivars->xd_local_watch); device_delete_child(dev, child); xenbusb_free_child_ivars(ivars); } /** * \param dev The NewBus device representing this XenBus bus. * \param child The NewBus device representing a child of dev%'s XenBus bus. */ static void xenbusb_verify_device(device_t dev, device_t child) { if (xs_exists(XST_NIL, xenbus_get_node(child), "") == 0) { /* * Device tree has been removed from Xenbus. * Tear down the device. */ xenbusb_delete_child(dev, child); } } /** * \brief Enumerate the devices on a XenBus bus and register them with * the NewBus device tree. 
* * xenbusb_enumerate_bus() will create entries (in state DS_NOTPRESENT) * for nodes that appear in the XenStore, but will not invoke probe/attach * operations on drivers. Probe/Attach processing must be separately * performed via an invocation of xenbusb_probe_children(). This is usually * done via the xbs_probe_children task. * * \param xbs XenBus Bus device softc of the owner of the bus to enumerate. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ static int xenbusb_enumerate_bus(struct xenbusb_softc *xbs) { const char **types; u_int type_idx; u_int type_count; int error; error = xs_directory(XST_NIL, xbs->xbs_node, "", &type_count, &types); if (error) return (error); for (type_idx = 0; type_idx < type_count; type_idx++) XENBUSB_ENUMERATE_TYPE(xbs->xbs_dev, types[type_idx]); free(types, M_XENSTORE); return (0); } /** * Handler for all generic XenBus device systcl nodes. */ static int xenbusb_device_sysctl_handler(SYSCTL_HANDLER_ARGS) { device_t dev; const char *value; dev = (device_t)arg1; switch (arg2) { case XENBUS_IVAR_NODE: value = xenbus_get_node(dev); break; case XENBUS_IVAR_TYPE: value = xenbus_get_type(dev); break; case XENBUS_IVAR_STATE: value = xenbus_strstate(xenbus_get_state(dev)); break; case XENBUS_IVAR_OTHEREND_ID: return (sysctl_handle_int(oidp, NULL, xenbus_get_otherend_id(dev), req)); /* NOTREACHED */ case XENBUS_IVAR_OTHEREND_PATH: value = xenbus_get_otherend_path(dev); break; default: return (EINVAL); } return (SYSCTL_OUT_STR(req, value)); } /** * Create read-only systcl nodes for xenbusb device ivar data. * * \param dev The XenBus device instance to register with sysctl. */ static void xenbusb_device_sysctl_init(device_t dev) { struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "xenstore_path", - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, XENBUS_IVAR_NODE, xenbusb_device_sysctl_handler, "A", "XenStore path to device"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "xenbus_dev_type", - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, XENBUS_IVAR_TYPE, xenbusb_device_sysctl_handler, "A", "XenBus device type"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "xenbus_connection_state", - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, XENBUS_IVAR_STATE, xenbusb_device_sysctl_handler, "A", "XenBus state of peer connection"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "xenbus_peer_domid", - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, XENBUS_IVAR_OTHEREND_ID, xenbusb_device_sysctl_handler, "I", "Xen domain ID of peer"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "xenstore_peer_path", - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, XENBUS_IVAR_OTHEREND_PATH, xenbusb_device_sysctl_handler, "A", "XenStore path to peer device"); } /** * \brief Decrement the number of XenBus child devices in the * connecting state by one and release the xbs_attch_ch * interrupt configuration hook if the connecting count * drops to zero. * * \param xbs XenBus Bus device softc of the owner of the bus to enumerate. 
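 *
 * For reference, the count works like this (existing behaviour, not a
 * change): xenbusb_attach() seeds xbs_connecting_children at 1 as its
 * own hold, each device added by xenbusb_add_device() in the
 * Initialising state adds one, and each hold is released here once the
 * child reaches Connected or Closed (or is dropped for lack of a
 * driver); when the count hits zero the boot-time config hook is
 * disestablished and boot processing can proceed.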
*/ static void xenbusb_release_confighook(struct xenbusb_softc *xbs) { mtx_lock(&xbs->xbs_lock); KASSERT(xbs->xbs_connecting_children > 0, ("Connecting device count error\n")); xbs->xbs_connecting_children--; if (xbs->xbs_connecting_children == 0 && (xbs->xbs_flags & XBS_ATTACH_CH_ACTIVE) != 0) { xbs->xbs_flags &= ~XBS_ATTACH_CH_ACTIVE; mtx_unlock(&xbs->xbs_lock); config_intrhook_disestablish(&xbs->xbs_attach_ch); } else { mtx_unlock(&xbs->xbs_lock); } } /** * \brief Verify the existance of attached device instances and perform * probe/attach processing for newly arrived devices. * * \param dev The NewBus device representing this XenBus bus. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ static int xenbusb_probe_children(device_t dev) { device_t *kids; struct xenbus_device_ivars *ivars; int i, count, error; if (device_get_children(dev, &kids, &count) == 0) { for (i = 0; i < count; i++) { if (device_get_state(kids[i]) != DS_NOTPRESENT) { /* * We already know about this one. * Make sure it's still here. */ xenbusb_verify_device(dev, kids[i]); continue; } error = device_probe_and_attach(kids[i]); if (error == ENXIO) { struct xenbusb_softc *xbs; /* * We don't have a PV driver for this device. * However, an emulated device we do support * may share this backend. Hide the node from * XenBus until the next rescan, but leave it's * state unchanged so we don't inadvertently * prevent attachment of any emulated device. */ xenbusb_delete_child(dev, kids[i]); /* * Since the XenStore state of this device * still indicates a pending attach, manually * release it's hold on the boot process. */ xbs = device_get_softc(dev); xenbusb_release_confighook(xbs); continue; } else if (error) { /* * Transition device to the closed state * so the world knows that attachment will * not occur. */ xenbus_set_state(kids[i], XenbusStateClosed); /* * Remove our record of this device. * So long as it remains in the closed * state in the XenStore, we will not find * it again. The state will only change * if the control domain actively reconfigures * this device. */ xenbusb_delete_child(dev, kids[i]); continue; } /* * Augment default newbus provided dynamic sysctl * variables with the standard ivar contents of * XenBus devices. */ xenbusb_device_sysctl_init(kids[i]); /* * Now that we have a driver managing this device * that can receive otherend state change events, * hook up a watch for them. */ ivars = device_get_ivars(kids[i]); xs_register_watch(&ivars->xd_otherend_watch); xs_register_watch(&ivars->xd_local_watch); } free(kids, M_TEMP); } return (0); } /** * \brief Task callback function to perform XenBus probe operations * from a known safe context. * * \param arg The NewBus device_t representing the bus instance to * on which to perform probe processing. * \param pending The number of times this task was queued before it could * be run. */ static void xenbusb_probe_children_cb(void *arg, int pending __unused) { device_t dev = (device_t)arg; /* * Hold Giant until the Giant free newbus changes are committed. */ mtx_lock(&Giant); xenbusb_probe_children(dev); mtx_unlock(&Giant); } /** * \brief XenStore watch callback for the root node of the XenStore * subtree representing a XenBus. * * This callback performs, or delegates to the xbs_probe_children task, * all processing necessary to handle dynmaic device arrival and departure * events from a XenBus. * * \param watch The XenStore watch object associated with this callback. * \param vec The XenStore watch event data. 
* \param len The number of fields in the event data stream. */ static void xenbusb_devices_changed(struct xs_watch *watch, const char **vec, unsigned int len) { struct xenbusb_softc *xbs; device_t dev; char *node; char *type; char *id; char *p; u_int component; xbs = (struct xenbusb_softc *)watch->callback_data; dev = xbs->xbs_dev; if (len <= XS_WATCH_PATH) { device_printf(dev, "xenbusb_devices_changed: " "Short Event Data.\n"); return; } node = strdup(vec[XS_WATCH_PATH], M_XENBUS); p = strchr(node, '/'); if (p == NULL) goto out; *p = 0; type = p + 1; p = strchr(type, '/'); if (p == NULL) goto out; *p++ = 0; /* * Extract the device ID. A device ID has one or more path * components separated by the '/' character. * * e.g. "/" for backend devices. */ id = p; for (component = 0; component < xbs->xbs_id_components; component++) { p = strchr(p, '/'); if (p == NULL) break; p++; } if (p != NULL) *p = 0; if (*id != 0 && component >= xbs->xbs_id_components - 1) { xenbusb_add_device(xbs->xbs_dev, type, id); taskqueue_enqueue(taskqueue_thread, &xbs->xbs_probe_children); } out: free(node, M_XENBUS); } /** * \brief Interrupt configuration hook callback associated with xbs_attch_ch. * * Since interrupts are always functional at the time of XenBus configuration, * there is nothing to be done when the callback occurs. This hook is only * registered to hold up boot processing while XenBus devices come online. * * \param arg Unused configuration hook callback argument. */ static void xenbusb_nop_confighook_cb(void *arg __unused) { } /*--------------------------- Public Functions -------------------------------*/ /*--------- API comments for these methods can be found in xenbusb.h ---------*/ void xenbusb_identify(driver_t *driver __unused, device_t parent) { /* * A single instance of each bus type for which we have a driver * is always present in a system operating under Xen. */ BUS_ADD_CHILD(parent, 0, driver->name, 0); } int xenbusb_add_device(device_t dev, const char *type, const char *id) { struct xenbusb_softc *xbs; struct sbuf *devpath_sbuf; char *devpath; struct xenbus_device_ivars *ivars; int error; xbs = device_get_softc(dev); devpath_sbuf = sbuf_new_auto(); sbuf_printf(devpath_sbuf, "%s/%s/%s", xbs->xbs_node, type, id); sbuf_finish(devpath_sbuf); devpath = sbuf_data(devpath_sbuf); ivars = malloc(sizeof(*ivars), M_XENBUS, M_ZERO|M_WAITOK); error = ENXIO; if (xs_exists(XST_NIL, devpath, "") != 0) { device_t child; enum xenbus_state state; char *statepath; child = xenbusb_device_exists(dev, devpath); if (child != NULL) { /* * We are already tracking this node */ error = 0; goto out; } state = xenbus_read_driver_state(devpath); if (state != XenbusStateInitialising) { /* * Device is not new, so ignore it. This can * happen if a device is going away after * switching to Closed. */ printf("xenbusb_add_device: Device %s ignored. 
" "State %d\n", devpath, state); error = 0; goto out; } sx_init(&ivars->xd_lock, "xdlock"); ivars->xd_flags = XDF_CONNECTING; ivars->xd_node = strdup(devpath, M_XENBUS); ivars->xd_node_len = strlen(devpath); ivars->xd_type = strdup(type, M_XENBUS); ivars->xd_state = XenbusStateInitialising; error = XENBUSB_GET_OTHEREND_NODE(dev, ivars); if (error) { printf("xenbus_update_device: %s no otherend id\n", devpath); goto out; } statepath = malloc(ivars->xd_otherend_path_len + strlen("/state") + 1, M_XENBUS, M_WAITOK); sprintf(statepath, "%s/state", ivars->xd_otherend_path); ivars->xd_otherend_watch.node = statepath; ivars->xd_otherend_watch.callback = xenbusb_otherend_watch_cb; ivars->xd_otherend_watch.callback_data = (uintptr_t)ivars; ivars->xd_local_watch.node = ivars->xd_node; ivars->xd_local_watch.callback = xenbusb_local_watch_cb; ivars->xd_local_watch.callback_data = (uintptr_t)ivars; mtx_lock(&xbs->xbs_lock); xbs->xbs_connecting_children++; mtx_unlock(&xbs->xbs_lock); child = device_add_child(dev, NULL, -1); ivars->xd_dev = child; device_set_ivars(child, ivars); } out: sbuf_delete(devpath_sbuf); if (error != 0) xenbusb_free_child_ivars(ivars); return (error); } int xenbusb_attach(device_t dev, char *bus_node, u_int id_components) { struct xenbusb_softc *xbs; xbs = device_get_softc(dev); mtx_init(&xbs->xbs_lock, "xenbusb softc lock", NULL, MTX_DEF); xbs->xbs_node = bus_node; xbs->xbs_id_components = id_components; xbs->xbs_dev = dev; /* * Since XenBus buses are attached to the XenStore, and * the XenStore does not probe children until after interrupt * services are available, this config hook is used solely * to ensure that the remainder of the boot process (e.g. * mount root) is deferred until child devices are adequately * probed. We unblock the boot process as soon as the * connecting child count in our softc goes to 0. */ xbs->xbs_attach_ch.ich_func = xenbusb_nop_confighook_cb; xbs->xbs_attach_ch.ich_arg = dev; config_intrhook_establish(&xbs->xbs_attach_ch); xbs->xbs_flags |= XBS_ATTACH_CH_ACTIVE; xbs->xbs_connecting_children = 1; /* * The subtree for this bus type may not yet exist * causing initial enumeration to fail. We still * want to return success from our attach though * so that we are ready to handle devices for this * bus when they are dynamically attached to us * by a Xen management action. */ (void)xenbusb_enumerate_bus(xbs); xenbusb_probe_children(dev); xbs->xbs_device_watch.node = bus_node; xbs->xbs_device_watch.callback = xenbusb_devices_changed; xbs->xbs_device_watch.callback_data = (uintptr_t)xbs; TASK_INIT(&xbs->xbs_probe_children, 0, xenbusb_probe_children_cb, dev); xs_register_watch(&xbs->xbs_device_watch); xenbusb_release_confighook(xbs); return (0); } int xenbusb_resume(device_t dev) { device_t *kids; struct xenbus_device_ivars *ivars; int i, count, error; char *statepath; /* * We must re-examine each device and find the new path for * its backend. */ if (device_get_children(dev, &kids, &count) == 0) { for (i = 0; i < count; i++) { if (device_get_state(kids[i]) == DS_NOTPRESENT) continue; if (xen_suspend_cancelled) { DEVICE_RESUME(kids[i]); continue; } ivars = device_get_ivars(kids[i]); xs_unregister_watch(&ivars->xd_otherend_watch); xenbus_set_state(kids[i], XenbusStateInitialising); /* * Find the new backend details and * re-register our watch. 
*/ error = XENBUSB_GET_OTHEREND_NODE(dev, ivars); if (error) return (error); statepath = malloc(ivars->xd_otherend_path_len + strlen("/state") + 1, M_XENBUS, M_WAITOK); sprintf(statepath, "%s/state", ivars->xd_otherend_path); free(ivars->xd_otherend_watch.node, M_XENBUS); ivars->xd_otherend_watch.node = statepath; DEVICE_RESUME(kids[i]); xs_register_watch(&ivars->xd_otherend_watch); #if 0 /* * Can't do this yet since we are running in * the xenwatch thread and if we sleep here, * we will stop delivering watch notifications * and the device will never come back online. */ sx_xlock(&ivars->xd_lock); while (ivars->xd_state != XenbusStateClosed && ivars->xd_state != XenbusStateConnected) sx_sleep(&ivars->xd_state, &ivars->xd_lock, 0, "xdresume", 0); sx_xunlock(&ivars->xd_lock); #endif } free(kids, M_TEMP); } return (0); } int xenbusb_print_child(device_t dev, device_t child) { struct xenbus_device_ivars *ivars = device_get_ivars(child); int retval = 0; retval += bus_print_child_header(dev, child); retval += printf(" at %s", ivars->xd_node); retval += bus_print_child_footer(dev, child); return (retval); } int xenbusb_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) { struct xenbus_device_ivars *ivars = device_get_ivars(child); switch (index) { case XENBUS_IVAR_NODE: *result = (uintptr_t) ivars->xd_node; return (0); case XENBUS_IVAR_TYPE: *result = (uintptr_t) ivars->xd_type; return (0); case XENBUS_IVAR_STATE: *result = (uintptr_t) ivars->xd_state; return (0); case XENBUS_IVAR_OTHEREND_ID: *result = (uintptr_t) ivars->xd_otherend_id; return (0); case XENBUS_IVAR_OTHEREND_PATH: *result = (uintptr_t) ivars->xd_otherend_path; return (0); } return (ENOENT); } int xenbusb_write_ivar(device_t dev, device_t child, int index, uintptr_t value) { struct xenbus_device_ivars *ivars = device_get_ivars(child); enum xenbus_state newstate; int currstate; switch (index) { case XENBUS_IVAR_STATE: { int error; newstate = (enum xenbus_state)value; sx_xlock(&ivars->xd_lock); if (ivars->xd_state == newstate) { error = 0; goto out; } error = xs_scanf(XST_NIL, ivars->xd_node, "state", NULL, "%d", &currstate); if (error) goto out; do { error = xs_printf(XST_NIL, ivars->xd_node, "state", "%d", newstate); } while (error == EAGAIN); if (error) { /* * Avoid looping through xenbus_dev_fatal() * which calls xenbus_write_ivar to set the * state to closing. */ if (newstate != XenbusStateClosing) xenbus_dev_fatal(dev, error, "writing new state"); goto out; } ivars->xd_state = newstate; if ((ivars->xd_flags & XDF_CONNECTING) != 0 && (newstate == XenbusStateClosed || newstate == XenbusStateConnected)) { struct xenbusb_softc *xbs; ivars->xd_flags &= ~XDF_CONNECTING; xbs = device_get_softc(dev); xenbusb_release_confighook(xbs); } wakeup(&ivars->xd_state); out: sx_xunlock(&ivars->xd_lock); return (error); } case XENBUS_IVAR_NODE: case XENBUS_IVAR_TYPE: case XENBUS_IVAR_OTHEREND_ID: case XENBUS_IVAR_OTHEREND_PATH: /* * These variables are read-only. */ return (EINVAL); } return (ENOENT); } void xenbusb_otherend_changed(device_t bus, device_t child, enum xenbus_state state) { XENBUS_OTHEREND_CHANGED(child, state); } void xenbusb_localend_changed(device_t bus, device_t child, const char *path) { if (strcmp(path, "/state") != 0) { struct xenbus_device_ivars *ivars; ivars = device_get_ivars(child); sx_xlock(&ivars->xd_lock); ivars->xd_state = xenbus_read_driver_state(ivars->xd_node); sx_xunlock(&ivars->xd_lock); } XENBUS_LOCALEND_CHANGED(child, path); }
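/*
 * Usage note (illustrative, not part of this change): a driver normally
 * moves its half of a connection with xenbus_set_state(dev,
 * XenbusStateConnected), which resolves to the XENBUS_IVAR_STATE case
 * of xenbusb_write_ivar() above; the write of the XenStore "state"
 * node is retried while xs_printf() returns EAGAIN, and reaching
 * Connected or Closed clears XDF_CONNECTING and releases the bus'
 * boot-time config hook via xenbusb_release_confighook().
 */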