Index: head/sys/dev/xen/balloon/balloon.c
===================================================================
--- head/sys/dev/xen/balloon/balloon.c	(revision 358315)
+++ head/sys/dev/xen/balloon/balloon.c	(revision 358316)
@@ -1,418 +1,420 @@
/******************************************************************************
 * balloon.c
 *
 * Xen balloon driver - enables returning/claiming memory to/from Xen.
 *
 * Copyright (c) 2003, B Dragovic
 * Copyright (c) 2003-2004, M Williamson, K Fraser
 * Copyright (c) 2005 Dan M. Smith, IBM Corporation
 *
 * This file may be distributed separately from the Linux kernel, or
 * incorporated into other software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include
__FBSDID("$FreeBSD$");

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static MALLOC_DEFINE(M_BALLOON, "Balloon", "Xen Balloon Driver");

/* Convert from KB (as fetched from xenstore) to number of PAGES */
#define KB_TO_PAGE_SHIFT	(PAGE_SHIFT - 10)

struct mtx balloon_mutex;

/* We increase/decrease in batches which fit in a page */
static xen_pfn_t frame_list[PAGE_SIZE / sizeof(xen_pfn_t)];

struct balloon_stats {
	/* We aim for 'current allocation' == 'target allocation'. */
	unsigned long current_pages;
	unsigned long target_pages;
	/* We may hit the hard limit in Xen. If we do then we remember it. */
	unsigned long hard_limit;
	/*
	 * Drivers may alter the memory reservation independently, but they
	 * must inform the balloon driver so we avoid hitting the hard limit.
	 */
	unsigned long driver_pages;
	/* Number of pages in high- and low-memory balloons.
 */
	unsigned long balloon_low;
	unsigned long balloon_high;
};
static struct balloon_stats balloon_stats;
#define bs balloon_stats

SYSCTL_DECL(_dev_xen);
-static SYSCTL_NODE(_dev_xen, OID_AUTO, balloon, CTLFLAG_RD, NULL, "Balloon");
+static SYSCTL_NODE(_dev_xen, OID_AUTO, balloon,
+    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
+    "Balloon");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, current, CTLFLAG_RD,
    &bs.current_pages, 0, "Current allocation");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, target, CTLFLAG_RD,
    &bs.target_pages, 0, "Target allocation");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, driver_pages, CTLFLAG_RD,
    &bs.driver_pages, 0, "Driver pages");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, hard_limit, CTLFLAG_RD,
    &bs.hard_limit, 0, "Xen hard limit");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, low_mem, CTLFLAG_RD,
    &bs.balloon_low, 0, "Low-mem balloon");
SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, high_mem, CTLFLAG_RD,
    &bs.balloon_high, 0, "High-mem balloon");

/* List of ballooned pages, threaded through the mem_map array. */
static TAILQ_HEAD(,vm_page) ballooned_pages;

/* Main work function, always executed in process context. */
static void balloon_process(void *unused);

#define IPRINTK(fmt, args...) \
	printk(KERN_INFO "xen_mem: " fmt, ##args)
#define WPRINTK(fmt, args...) \
	printk(KERN_WARNING "xen_mem: " fmt, ##args)

static unsigned long
current_target(void)
{
	unsigned long target = min(bs.target_pages, bs.hard_limit);
	if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high))
		target = bs.current_pages + bs.balloon_low + bs.balloon_high;
	return (target);
}

static unsigned long
minimum_target(void)
{
	unsigned long min_pages, curr_pages = current_target();

#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
	/*
	 * Simple continuous piecewiese linear function:
	 *  max MiB -> min MiB	gradient
	 *       0	   0
	 *      16	  16
	 *      32	  24
	 *     128	  72	(1/2)
	 *     512	 168	(1/4)
	 *    2048	 360	(1/8)
	 *    8192	 552	(1/32)
	 *   32768	1320
	 *  131072	4392
	 */
	if (realmem < MB2PAGES(128))
		min_pages = MB2PAGES(8) + (realmem >> 1);
	else if (realmem < MB2PAGES(512))
		min_pages = MB2PAGES(40) + (realmem >> 2);
	else if (realmem < MB2PAGES(2048))
		min_pages = MB2PAGES(104) + (realmem >> 3);
	else
		min_pages = MB2PAGES(296) + (realmem >> 5);
#undef MB2PAGES

	/* Don't enforce growth */
	return (min(min_pages, curr_pages));
}

static int
increase_reservation(unsigned long nr_pages)
{
	unsigned long i;
	vm_page_t page;
	long rc;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid = DOMID_SELF
	};

	mtx_assert(&balloon_mutex, MA_OWNED);

	if (nr_pages > nitems(frame_list))
		nr_pages = nitems(frame_list);

	for (page = TAILQ_FIRST(&ballooned_pages), i = 0; i < nr_pages;
	    i++, page = TAILQ_NEXT(page, plinks.q)) {
		KASSERT(page != NULL, ("ballooned_pages list corrupt"));
		frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents = nr_pages;
	rc = HYPERVISOR_memory_op(
	    XENMEM_populate_physmap, &reservation);
	if (rc < nr_pages) {
		if (rc > 0) {
			int ret;

			/* We hit the Xen hard limit: reprobe.
 */
			reservation.nr_extents = rc;
			ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
			    &reservation);
			KASSERT(ret == rc, ("HYPERVISOR_memory_op failed"));
		}
		if (rc >= 0)
			bs.hard_limit = (bs.current_pages + rc -
			    bs.driver_pages);
		goto out;
	}

	for (i = 0; i < nr_pages; i++) {
		page = TAILQ_FIRST(&ballooned_pages);
		KASSERT(page != NULL, ("Unable to get ballooned page"));
		TAILQ_REMOVE(&ballooned_pages, page, plinks.q);
		bs.balloon_low--;

		KASSERT(xen_feature(XENFEAT_auto_translated_physmap),
		    ("auto translated physmap but mapping is valid"));

		vm_page_free(page);
	}

	bs.current_pages += nr_pages;

out:
	return (0);
}

static int
decrease_reservation(unsigned long nr_pages)
{
	unsigned long i;
	vm_page_t page;
	int need_sleep = 0;
	int ret;
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid = DOMID_SELF
	};

	mtx_assert(&balloon_mutex, MA_OWNED);

	if (nr_pages > nitems(frame_list))
		nr_pages = nitems(frame_list);

	for (i = 0; i < nr_pages; i++) {
		if ((page = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
		    VM_ALLOC_NOOBJ | VM_ALLOC_ZERO)) == NULL) {
			nr_pages = i;
			need_sleep = 1;
			break;
		}

		if ((page->flags & PG_ZERO) == 0) {
			/*
			 * Zero the page, or else we might be leaking
			 * important data to other domains on the same
			 * host. Xen doesn't scrub ballooned out memory
			 * pages, the guest is in charge of making
			 * sure that no information is leaked.
			 */
			pmap_zero_page(page);
		}

		frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT);

		TAILQ_INSERT_HEAD(&ballooned_pages, page, plinks.q);
		bs.balloon_low++;
	}

	set_xen_guest_handle(reservation.extent_start, frame_list);
	reservation.nr_extents = nr_pages;
	ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
	KASSERT(ret == nr_pages, ("HYPERVISOR_memory_op failed"));

	bs.current_pages -= nr_pages;

	return (need_sleep);
}

/*
 * We avoid multiple worker processes conflicting via the balloon mutex.
 * We may of course race updates of the target counts (which are protected
 * by the balloon lock), or with changes to the Xen hard limit, but we will
 * recover from these in time.
 */
static void
balloon_process(void *unused)
{
	int need_sleep = 0;
	long credit;

	mtx_lock(&balloon_mutex);
	for (;;) {
		int sleep_time;

		do {
			credit = current_target() - bs.current_pages;
			if (credit > 0)
				need_sleep = (increase_reservation(credit) != 0);
			if (credit < 0)
				need_sleep = (decrease_reservation(-credit) != 0);

		} while ((credit != 0) && !need_sleep);

		/* Schedule more work if there is some still to be done. */
		if (current_target() != bs.current_pages)
			sleep_time = hz;
		else
			sleep_time = 0;

		msleep(balloon_process, &balloon_mutex, 0, "balloon",
		    sleep_time);
	}
	mtx_unlock(&balloon_mutex);
}

/* Resets the Xen limit, sets new target, and kicks off processing. */
static void
set_new_target(unsigned long target)
{
	/* No need for lock. Not read-modify-write updates. */
	bs.hard_limit = ~0UL;
	bs.target_pages = max(target, minimum_target());
	wakeup(balloon_process);
}

static struct xs_watch target_watch = {
	.node = "memory/target"
};

/* React to a change in the target key */
static void
watch_target(struct xs_watch *watch, const char **vec, unsigned int len)
{
	unsigned long long new_target;
	int err;

	err = xs_scanf(XST_NIL, "memory", "target", NULL, "%llu", &new_target);
	if (err) {
		/* This is ok (for domain0 at least) - so just return */
		return;
	}

	/*
	 * The given memory/target value is in KiB, so it needs converting to
	 * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10.
 */
	set_new_target(new_target >> KB_TO_PAGE_SHIFT);
}

/*------------------ Private Device Attachment Functions --------------------*/
/**
 * \brief Identify instances of this device type in the system.
 *
 * \param driver  The driver performing this identify action.
 * \param parent  The NewBus parent device for any devices this method adds.
 */
static void
xenballoon_identify(driver_t *driver __unused, device_t parent)
{
	/*
	 * A single device instance for our driver is always present
	 * in a system operating under Xen.
	 */
	BUS_ADD_CHILD(parent, 0, driver->name, 0);
}

/**
 * \brief Probe for the existence of the Xen Balloon device
 *
 * \param dev  NewBus device_t for this Xen control instance.
 *
 * \return  Always returns 0 indicating success.
 */
static int
xenballoon_probe(device_t dev)
{
	device_set_desc(dev, "Xen Balloon Device");
	return (0);
}

/**
 * \brief Attach the Xen Balloon device.
 *
 * \param dev  NewBus device_t for this Xen control instance.
 *
 * \return  On success, 0. Otherwise an errno value indicating the
 *          type of failure.
 */
static int
xenballoon_attach(device_t dev)
{
	int err;

	mtx_init(&balloon_mutex, "balloon_mutex", NULL, MTX_DEF);

	bs.current_pages = realmem;
	bs.target_pages = bs.current_pages;
	bs.balloon_low = 0;
	bs.balloon_high = 0;
	bs.driver_pages = 0UL;
	bs.hard_limit = ~0UL;

	kproc_create(balloon_process, NULL, NULL, 0, 0, "balloon");

	target_watch.callback = watch_target;
	err = xs_register_watch(&target_watch);
	if (err)
		device_printf(dev,
		    "xenballon: failed to set balloon watcher\n");

	return (err);
}

/*-------------------- Private Device Attachment Data -----------------------*/
static device_method_t xenballoon_methods[] = {
	/* Device interface */
	DEVMETHOD(device_identify, xenballoon_identify),
	DEVMETHOD(device_probe, xenballoon_probe),
	DEVMETHOD(device_attach, xenballoon_attach),

	DEVMETHOD_END
};

DEFINE_CLASS_0(xenballoon, xenballoon_driver, xenballoon_methods, 0);
devclass_t xenballoon_devclass;

DRIVER_MODULE(xenballoon, xenstore, xenballoon_driver, xenballoon_devclass,
    NULL, NULL);

Index: head/sys/dev/xen/blkfront/blkfront.c
===================================================================
--- head/sys/dev/xen/blkfront/blkfront.c	(revision 358315)
+++ head/sys/dev/xen/blkfront/blkfront.c	(revision 358316)
@@ -1,1653 +1,1654 @@
/*
 * XenBSD block device driver
 *
 * Copyright (c) 2010-2013 Spectra Logic Corporation
 * Copyright (c) 2009 Scott Long, Yahoo!
 * Copyright (c) 2009 Frank Suchomel, Citrix
 * Copyright (c) 2009 Doug F. Rabson, Citrix
 * Copyright (c) 2005 Kip Macy
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "xenbus_if.h" /*--------------------------- Forward Declarations ---------------------------*/ static void xbd_closing(device_t); static void xbd_startio(struct xbd_softc *sc); /*---------------------------------- Macros ----------------------------------*/ #if 0 #define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args) #else #define DPRINTK(fmt, args...) #endif #define XBD_SECTOR_SHFT 9 /*---------------------------- Global Static Data ----------------------------*/ static MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data"); static int xbd_enable_indirect = 1; -SYSCTL_NODE(_hw, OID_AUTO, xbd, CTLFLAG_RD, 0, "xbd driver parameters"); +SYSCTL_NODE(_hw, OID_AUTO, xbd, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "xbd driver parameters"); SYSCTL_INT(_hw_xbd, OID_AUTO, xbd_enable_indirect, CTLFLAG_RDTUN, &xbd_enable_indirect, 0, "Enable xbd indirect segments"); /*---------------------------- Command Processing ----------------------------*/ static void xbd_freeze(struct xbd_softc *sc, xbd_flag_t xbd_flag) { if (xbd_flag != XBDF_NONE && (sc->xbd_flags & xbd_flag) != 0) return; sc->xbd_flags |= xbd_flag; sc->xbd_qfrozen_cnt++; } static void xbd_thaw(struct xbd_softc *sc, xbd_flag_t xbd_flag) { if (xbd_flag != XBDF_NONE && (sc->xbd_flags & xbd_flag) == 0) return; if (sc->xbd_qfrozen_cnt == 0) panic("%s: Thaw with flag 0x%x while not frozen.", __func__, xbd_flag); sc->xbd_flags &= ~xbd_flag; sc->xbd_qfrozen_cnt--; } static void xbd_cm_freeze(struct xbd_softc *sc, struct xbd_command *cm, xbdc_flag_t cm_flag) { if ((cm->cm_flags & XBDCF_FROZEN) != 0) return; cm->cm_flags |= XBDCF_FROZEN|cm_flag; xbd_freeze(sc, XBDF_NONE); } static void xbd_cm_thaw(struct xbd_softc *sc, struct xbd_command *cm) { if ((cm->cm_flags & XBDCF_FROZEN) == 0) return; cm->cm_flags &= ~XBDCF_FROZEN; xbd_thaw(sc, XBDF_NONE); } static inline void xbd_flush_requests(struct xbd_softc *sc) { int notify; RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->xbd_ring, notify); if (notify) xen_intr_signal(sc->xen_intr_handle); } static void xbd_free_command(struct xbd_command *cm) { KASSERT((cm->cm_flags & XBDCF_Q_MASK) == XBD_Q_NONE, ("Freeing command that is still on queue %d.", cm->cm_flags & XBDCF_Q_MASK)); cm->cm_flags = XBDCF_INITIALIZER; cm->cm_bp = NULL; cm->cm_complete = NULL; xbd_enqueue_cm(cm, XBD_Q_FREE); xbd_thaw(cm->cm_sc, XBDF_CM_SHORTAGE); } static void xbd_mksegarray(bus_dma_segment_t *segs, int nsegs, grant_ref_t * gref_head, int otherend_id, int readonly, grant_ref_t * sg_ref, struct blkif_request_segment *sg) { struct blkif_request_segment *last_block_sg = sg + nsegs; vm_paddr_t buffer_ma; uint64_t fsect, lsect; int ref; while (sg < last_block_sg) { KASSERT(segs->ds_addr % (1 << XBD_SECTOR_SHFT) == 0, ("XEN disk driver I/O must be sector aligned")); KASSERT(segs->ds_len % (1 << XBD_SECTOR_SHFT) == 0, ("XEN disk driver I/Os must be a multiple of " "the sector length")); buffer_ma = segs->ds_addr; fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT; lsect = 
fsect + (segs->ds_len >> XBD_SECTOR_SHFT) - 1; KASSERT(lsect <= 7, ("XEN disk driver data cannot " "cross a page boundary")); /* install a grant reference. */ ref = gnttab_claim_grant_reference(gref_head); /* * GNTTAB_LIST_END == 0xffffffff, but it is private * to gnttab.c. */ KASSERT(ref != ~0, ("grant_reference failed")); gnttab_grant_foreign_access_ref( ref, otherend_id, buffer_ma >> PAGE_SHIFT, readonly); *sg_ref = ref; *sg = (struct blkif_request_segment) { .gref = ref, .first_sect = fsect, .last_sect = lsect }; sg++; sg_ref++; segs++; } } static void xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { struct xbd_softc *sc; struct xbd_command *cm; int op; cm = arg; sc = cm->cm_sc; if (error) { cm->cm_bp->bio_error = EIO; biodone(cm->cm_bp); xbd_free_command(cm); return; } KASSERT(nsegs <= sc->xbd_max_request_segments, ("Too many segments in a blkfront I/O")); if (nsegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST) { blkif_request_t *ring_req; /* Fill out a blkif_request_t structure. */ ring_req = (blkif_request_t *) RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); sc->xbd_ring.req_prod_pvt++; ring_req->id = cm->cm_id; ring_req->operation = cm->cm_operation; ring_req->sector_number = cm->cm_sector_number; ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; ring_req->nr_segments = nsegs; cm->cm_nseg = nsegs; xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, xenbus_get_otherend_id(sc->xbd_dev), cm->cm_operation == BLKIF_OP_WRITE, cm->cm_sg_refs, ring_req->seg); } else { blkif_request_indirect_t *ring_req; /* Fill out a blkif_request_indirect_t structure. */ ring_req = (blkif_request_indirect_t *) RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); sc->xbd_ring.req_prod_pvt++; ring_req->id = cm->cm_id; ring_req->operation = BLKIF_OP_INDIRECT; ring_req->indirect_op = cm->cm_operation; ring_req->sector_number = cm->cm_sector_number; ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; ring_req->nr_segments = nsegs; cm->cm_nseg = nsegs; xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, xenbus_get_otherend_id(sc->xbd_dev), cm->cm_operation == BLKIF_OP_WRITE, cm->cm_sg_refs, cm->cm_indirectionpages); memcpy(ring_req->indirect_grefs, &cm->cm_indirectionrefs, sizeof(grant_ref_t) * sc->xbd_max_request_indirectpages); } if (cm->cm_operation == BLKIF_OP_READ) op = BUS_DMASYNC_PREREAD; else if (cm->cm_operation == BLKIF_OP_WRITE) op = BUS_DMASYNC_PREWRITE; else op = 0; bus_dmamap_sync(sc->xbd_io_dmat, cm->cm_map, op); gnttab_free_grant_references(cm->cm_gref_head); xbd_enqueue_cm(cm, XBD_Q_BUSY); /* * If bus dma had to asynchronously call us back to dispatch * this command, we are no longer executing in the context of * xbd_startio(). Thus we cannot rely on xbd_startio()'s call to * xbd_flush_requests() to publish this command to the backend * along with any other commands that it could batch. */ if ((cm->cm_flags & XBDCF_ASYNC_MAPPING) != 0) xbd_flush_requests(sc); return; } static int xbd_queue_request(struct xbd_softc *sc, struct xbd_command *cm) { int error; if (cm->cm_bp != NULL) error = bus_dmamap_load_bio(sc->xbd_io_dmat, cm->cm_map, cm->cm_bp, xbd_queue_cb, cm, 0); else error = bus_dmamap_load(sc->xbd_io_dmat, cm->cm_map, cm->cm_data, cm->cm_datalen, xbd_queue_cb, cm, 0); if (error == EINPROGRESS) { /* * Maintain queuing order by freezing the queue. The next * command may not require as many resources as the command * we just attempted to map, so we can't rely on bus dma * blocking for it too. 
*/ xbd_cm_freeze(sc, cm, XBDCF_ASYNC_MAPPING); return (0); } return (error); } static void xbd_restart_queue_callback(void *arg) { struct xbd_softc *sc = arg; mtx_lock(&sc->xbd_io_lock); xbd_thaw(sc, XBDF_GNT_SHORTAGE); xbd_startio(sc); mtx_unlock(&sc->xbd_io_lock); } static struct xbd_command * xbd_bio_command(struct xbd_softc *sc) { struct xbd_command *cm; struct bio *bp; if (__predict_false(sc->xbd_state != XBD_STATE_CONNECTED)) return (NULL); bp = xbd_dequeue_bio(sc); if (bp == NULL) return (NULL); if ((cm = xbd_dequeue_cm(sc, XBD_Q_FREE)) == NULL) { xbd_freeze(sc, XBDF_CM_SHORTAGE); xbd_requeue_bio(sc, bp); return (NULL); } if (gnttab_alloc_grant_references(sc->xbd_max_request_segments, &cm->cm_gref_head) != 0) { gnttab_request_free_callback(&sc->xbd_callback, xbd_restart_queue_callback, sc, sc->xbd_max_request_segments); xbd_freeze(sc, XBDF_GNT_SHORTAGE); xbd_requeue_bio(sc, bp); xbd_enqueue_cm(cm, XBD_Q_FREE); return (NULL); } cm->cm_bp = bp; cm->cm_sector_number = (blkif_sector_t)bp->bio_pblkno; switch (bp->bio_cmd) { case BIO_READ: cm->cm_operation = BLKIF_OP_READ; break; case BIO_WRITE: cm->cm_operation = BLKIF_OP_WRITE; if ((bp->bio_flags & BIO_ORDERED) != 0) { if ((sc->xbd_flags & XBDF_BARRIER) != 0) { cm->cm_operation = BLKIF_OP_WRITE_BARRIER; } else { /* * Single step this command. */ cm->cm_flags |= XBDCF_Q_FREEZE; if (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { /* * Wait for in-flight requests to * finish. */ xbd_freeze(sc, XBDF_WAIT_IDLE); xbd_requeue_cm(cm, XBD_Q_READY); return (NULL); } } } break; case BIO_FLUSH: if ((sc->xbd_flags & XBDF_FLUSH) != 0) cm->cm_operation = BLKIF_OP_FLUSH_DISKCACHE; else if ((sc->xbd_flags & XBDF_BARRIER) != 0) cm->cm_operation = BLKIF_OP_WRITE_BARRIER; else panic("flush request, but no flush support available"); break; default: biofinish(bp, NULL, EOPNOTSUPP); xbd_enqueue_cm(cm, XBD_Q_FREE); return (NULL); } return (cm); } /* * Dequeue buffers and place them in the shared communication ring. * Return when no more requests can be accepted or all buffers have * been queued. * * Signal XEN once the ring has been filled out. */ static void xbd_startio(struct xbd_softc *sc) { struct xbd_command *cm; int error, queued = 0; mtx_assert(&sc->xbd_io_lock, MA_OWNED); if (sc->xbd_state != XBD_STATE_CONNECTED) return; while (!RING_FULL(&sc->xbd_ring)) { if (sc->xbd_qfrozen_cnt != 0) break; cm = xbd_dequeue_cm(sc, XBD_Q_READY); if (cm == NULL) cm = xbd_bio_command(sc); if (cm == NULL) break; if ((cm->cm_flags & XBDCF_Q_FREEZE) != 0) { /* * Single step command. Future work is * held off until this command completes. */ xbd_cm_freeze(sc, cm, XBDCF_Q_FREEZE); } if ((error = xbd_queue_request(sc, cm)) != 0) { printf("xbd_queue_request returned %d\n", error); break; } queued++; } if (queued != 0) xbd_flush_requests(sc); } static void xbd_bio_complete(struct xbd_softc *sc, struct xbd_command *cm) { struct bio *bp; bp = cm->cm_bp; if (__predict_false(cm->cm_status != BLKIF_RSP_OKAY)) { disk_err(bp, "disk error" , -1, 0); printf(" status: %x\n", cm->cm_status); bp->bio_flags |= BIO_ERROR; } if (bp->bio_flags & BIO_ERROR) bp->bio_error = EIO; else bp->bio_resid = 0; xbd_free_command(cm); biodone(bp); } static void xbd_int(void *xsc) { struct xbd_softc *sc = xsc; struct xbd_command *cm; blkif_response_t *bret; RING_IDX i, rp; int op; mtx_lock(&sc->xbd_io_lock); if (__predict_false(sc->xbd_state == XBD_STATE_DISCONNECTED)) { mtx_unlock(&sc->xbd_io_lock); return; } again: rp = sc->xbd_ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. 
*/ for (i = sc->xbd_ring.rsp_cons; i != rp;) { bret = RING_GET_RESPONSE(&sc->xbd_ring, i); cm = &sc->xbd_shadow[bret->id]; xbd_remove_cm(cm, XBD_Q_BUSY); gnttab_end_foreign_access_references(cm->cm_nseg, cm->cm_sg_refs); i++; if (cm->cm_operation == BLKIF_OP_READ) op = BUS_DMASYNC_POSTREAD; else if (cm->cm_operation == BLKIF_OP_WRITE || cm->cm_operation == BLKIF_OP_WRITE_BARRIER) op = BUS_DMASYNC_POSTWRITE; else op = 0; bus_dmamap_sync(sc->xbd_io_dmat, cm->cm_map, op); bus_dmamap_unload(sc->xbd_io_dmat, cm->cm_map); /* * Release any hold this command has on future command * dispatch. */ xbd_cm_thaw(sc, cm); /* * Directly call the i/o complete routine to save an * an indirection in the common case. */ cm->cm_status = bret->status; if (cm->cm_bp) xbd_bio_complete(sc, cm); else if (cm->cm_complete != NULL) cm->cm_complete(cm); else xbd_free_command(cm); } sc->xbd_ring.rsp_cons = i; if (i != sc->xbd_ring.req_prod_pvt) { int more_to_do; RING_FINAL_CHECK_FOR_RESPONSES(&sc->xbd_ring, more_to_do); if (more_to_do) goto again; } else { sc->xbd_ring.sring->rsp_event = i + 1; } if (xbd_queue_length(sc, XBD_Q_BUSY) == 0) xbd_thaw(sc, XBDF_WAIT_IDLE); xbd_startio(sc); if (__predict_false(sc->xbd_state == XBD_STATE_SUSPENDED)) wakeup(&sc->xbd_cm_q[XBD_Q_BUSY]); mtx_unlock(&sc->xbd_io_lock); } /*------------------------------- Dump Support -------------------------------*/ /** * Quiesce the disk writes for a dump file before allowing the next buffer. */ static void xbd_quiesce(struct xbd_softc *sc) { int mtd; // While there are outstanding requests while (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { RING_FINAL_CHECK_FOR_RESPONSES(&sc->xbd_ring, mtd); if (mtd) { /* Received request completions, update queue. */ xbd_int(sc); } if (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { /* * Still pending requests, wait for the disk i/o * to complete. */ HYPERVISOR_yield(); } } } /* Kernel dump function for a paravirtualized disk device */ static void xbd_dump_complete(struct xbd_command *cm) { xbd_enqueue_cm(cm, XBD_Q_COMPLETE); } static int xbd_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct disk *dp = arg; struct xbd_softc *sc = dp->d_drv1; struct xbd_command *cm; size_t chunk; int sbp; int rc = 0; if (length == 0) return (0); xbd_quiesce(sc); /* All quiet on the western front. */ /* * If this lock is held, then this module is failing, and a * successful kernel dump is highly unlikely anyway. */ mtx_lock(&sc->xbd_io_lock); /* Split the 64KB block as needed */ for (sbp=0; length > 0; sbp++) { cm = xbd_dequeue_cm(sc, XBD_Q_FREE); if (cm == NULL) { mtx_unlock(&sc->xbd_io_lock); device_printf(sc->xbd_dev, "dump: no more commands?\n"); return (EBUSY); } if (gnttab_alloc_grant_references(sc->xbd_max_request_segments, &cm->cm_gref_head) != 0) { xbd_free_command(cm); mtx_unlock(&sc->xbd_io_lock); device_printf(sc->xbd_dev, "no more grant allocs?\n"); return (EBUSY); } chunk = length > sc->xbd_max_request_size ? sc->xbd_max_request_size : length; cm->cm_data = virtual; cm->cm_datalen = chunk; cm->cm_operation = BLKIF_OP_WRITE; cm->cm_sector_number = offset / dp->d_sectorsize; cm->cm_complete = xbd_dump_complete; xbd_enqueue_cm(cm, XBD_Q_READY); length -= chunk; offset += chunk; virtual = (char *) virtual + chunk; } /* Tell DOM0 to do the I/O */ xbd_startio(sc); mtx_unlock(&sc->xbd_io_lock); /* Poll for the completion. */ xbd_quiesce(sc); /* All quite on the eastern front */ /* If there were any errors, bail out... 
*/ while ((cm = xbd_dequeue_cm(sc, XBD_Q_COMPLETE)) != NULL) { if (cm->cm_status != BLKIF_RSP_OKAY) { device_printf(sc->xbd_dev, "Dump I/O failed at sector %jd\n", cm->cm_sector_number); rc = EIO; } xbd_free_command(cm); } return (rc); } /*----------------------------- Disk Entrypoints -----------------------------*/ static int xbd_open(struct disk *dp) { struct xbd_softc *sc = dp->d_drv1; if (sc == NULL) { printf("xbd%d: not found", dp->d_unit); return (ENXIO); } sc->xbd_flags |= XBDF_OPEN; sc->xbd_users++; return (0); } static int xbd_close(struct disk *dp) { struct xbd_softc *sc = dp->d_drv1; if (sc == NULL) return (ENXIO); sc->xbd_flags &= ~XBDF_OPEN; if (--(sc->xbd_users) == 0) { /* * Check whether we have been instructed to close. We will * have ignored this request initially, as the device was * still mounted. */ if (xenbus_get_otherend_state(sc->xbd_dev) == XenbusStateClosing) xbd_closing(sc->xbd_dev); } return (0); } static int xbd_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) { struct xbd_softc *sc = dp->d_drv1; if (sc == NULL) return (ENXIO); return (ENOTTY); } /* * Read/write routine for a buffer. Finds the proper unit, place it on * the sortq and kick the controller. */ static void xbd_strategy(struct bio *bp) { struct xbd_softc *sc = bp->bio_disk->d_drv1; /* bogus disk? */ if (sc == NULL) { bp->bio_error = EINVAL; bp->bio_flags |= BIO_ERROR; bp->bio_resid = bp->bio_bcount; biodone(bp); return; } /* * Place it in the queue of disk activities for this disk */ mtx_lock(&sc->xbd_io_lock); xbd_enqueue_bio(sc, bp); xbd_startio(sc); mtx_unlock(&sc->xbd_io_lock); return; } /*------------------------------ Ring Management -----------------------------*/ static int xbd_alloc_ring(struct xbd_softc *sc) { blkif_sring_t *sring; uintptr_t sring_page_addr; int error; int i; sring = malloc(sc->xbd_ring_pages * PAGE_SIZE, M_XENBLOCKFRONT, M_NOWAIT|M_ZERO); if (sring == NULL) { xenbus_dev_fatal(sc->xbd_dev, ENOMEM, "allocating shared ring"); return (ENOMEM); } SHARED_RING_INIT(sring); FRONT_RING_INIT(&sc->xbd_ring, sring, sc->xbd_ring_pages * PAGE_SIZE); for (i = 0, sring_page_addr = (uintptr_t)sring; i < sc->xbd_ring_pages; i++, sring_page_addr += PAGE_SIZE) { error = xenbus_grant_ring(sc->xbd_dev, (vtophys(sring_page_addr) >> PAGE_SHIFT), &sc->xbd_ring_ref[i]); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "granting ring_ref(%d)", i); return (error); } } if (sc->xbd_ring_pages == 1) { error = xs_printf(XST_NIL, xenbus_get_node(sc->xbd_dev), "ring-ref", "%u", sc->xbd_ring_ref[0]); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/ring-ref", xenbus_get_node(sc->xbd_dev)); return (error); } } else { for (i = 0; i < sc->xbd_ring_pages; i++) { char ring_ref_name[]= "ring_refXX"; snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i); error = xs_printf(XST_NIL, xenbus_get_node(sc->xbd_dev), ring_ref_name, "%u", sc->xbd_ring_ref[i]); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/%s", xenbus_get_node(sc->xbd_dev), ring_ref_name); return (error); } } } error = xen_intr_alloc_and_bind_local_port(sc->xbd_dev, xenbus_get_otherend_id(sc->xbd_dev), NULL, xbd_int, sc, INTR_TYPE_BIO | INTR_MPSAFE, &sc->xen_intr_handle); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "xen_intr_alloc_and_bind_local_port failed"); return (error); } return (0); } static void xbd_free_ring(struct xbd_softc *sc) { int i; if (sc->xbd_ring.sring == NULL) return; for (i = 0; i < sc->xbd_ring_pages; i++) { if (sc->xbd_ring_ref[i] != GRANT_REF_INVALID) { 
gnttab_end_foreign_access_ref(sc->xbd_ring_ref[i]); sc->xbd_ring_ref[i] = GRANT_REF_INVALID; } } free(sc->xbd_ring.sring, M_XENBLOCKFRONT); sc->xbd_ring.sring = NULL; } /*-------------------------- Initialization/Teardown -------------------------*/ static int xbd_feature_string(struct xbd_softc *sc, char *features, size_t len) { struct sbuf sb; int feature_cnt; sbuf_new(&sb, features, len, SBUF_FIXEDLEN); feature_cnt = 0; if ((sc->xbd_flags & XBDF_FLUSH) != 0) { sbuf_printf(&sb, "flush"); feature_cnt++; } if ((sc->xbd_flags & XBDF_BARRIER) != 0) { if (feature_cnt != 0) sbuf_printf(&sb, ", "); sbuf_printf(&sb, "write_barrier"); feature_cnt++; } if ((sc->xbd_flags & XBDF_DISCARD) != 0) { if (feature_cnt != 0) sbuf_printf(&sb, ", "); sbuf_printf(&sb, "discard"); feature_cnt++; } if ((sc->xbd_flags & XBDF_PERSISTENT) != 0) { if (feature_cnt != 0) sbuf_printf(&sb, ", "); sbuf_printf(&sb, "persistent_grants"); feature_cnt++; } (void) sbuf_finish(&sb); return (sbuf_len(&sb)); } static int xbd_sysctl_features(SYSCTL_HANDLER_ARGS) { char features[80]; struct xbd_softc *sc = arg1; int error; int len; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); len = xbd_feature_string(sc, features, sizeof(features)); /* len is -1 on error, which will make the SYSCTL_OUT a no-op. */ return (SYSCTL_OUT(req, features, len + 1/*NUL*/)); } static void xbd_setup_sysctl(struct xbd_softc *xbd) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; struct sysctl_oid_list *children; sysctl_ctx = device_get_sysctl_ctx(xbd->xbd_dev); if (sysctl_ctx == NULL) return; sysctl_tree = device_get_sysctl_tree(xbd->xbd_dev); if (sysctl_tree == NULL) return; children = SYSCTL_CHILDREN(sysctl_tree); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "max_requests", CTLFLAG_RD, &xbd->xbd_max_requests, -1, "maximum outstanding requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "max_request_segments", CTLFLAG_RD, &xbd->xbd_max_request_segments, 0, "maximum number of pages per requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "max_request_size", CTLFLAG_RD, &xbd->xbd_max_request_size, 0, "maximum size in bytes of a request (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "ring_pages", CTLFLAG_RD, &xbd->xbd_ring_pages, 0, "communication channel pages (negotiated)"); SYSCTL_ADD_PROC(sysctl_ctx, children, OID_AUTO, - "features", CTLTYPE_STRING|CTLFLAG_RD, xbd, 0, - xbd_sysctl_features, "A", "protocol features (negotiated)"); + "features", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, xbd, + 0, xbd_sysctl_features, "A", "protocol features (negotiated)"); } /* * Translate Linux major/minor to an appropriate name and unit * number. For HVM guests, this allows us to use the same drive names * with blkfront as the emulated drives, easing transition slightly. 
*/ static void xbd_vdevice_to_unit(uint32_t vdevice, int *unit, const char **name) { static struct vdev_info { int major; int shift; int base; const char *name; } info[] = { {3, 6, 0, "ada"}, /* ide0 */ {22, 6, 2, "ada"}, /* ide1 */ {33, 6, 4, "ada"}, /* ide2 */ {34, 6, 6, "ada"}, /* ide3 */ {56, 6, 8, "ada"}, /* ide4 */ {57, 6, 10, "ada"}, /* ide5 */ {88, 6, 12, "ada"}, /* ide6 */ {89, 6, 14, "ada"}, /* ide7 */ {90, 6, 16, "ada"}, /* ide8 */ {91, 6, 18, "ada"}, /* ide9 */ {8, 4, 0, "da"}, /* scsi disk0 */ {65, 4, 16, "da"}, /* scsi disk1 */ {66, 4, 32, "da"}, /* scsi disk2 */ {67, 4, 48, "da"}, /* scsi disk3 */ {68, 4, 64, "da"}, /* scsi disk4 */ {69, 4, 80, "da"}, /* scsi disk5 */ {70, 4, 96, "da"}, /* scsi disk6 */ {71, 4, 112, "da"}, /* scsi disk7 */ {128, 4, 128, "da"}, /* scsi disk8 */ {129, 4, 144, "da"}, /* scsi disk9 */ {130, 4, 160, "da"}, /* scsi disk10 */ {131, 4, 176, "da"}, /* scsi disk11 */ {132, 4, 192, "da"}, /* scsi disk12 */ {133, 4, 208, "da"}, /* scsi disk13 */ {134, 4, 224, "da"}, /* scsi disk14 */ {135, 4, 240, "da"}, /* scsi disk15 */ {202, 4, 0, "xbd"}, /* xbd */ {0, 0, 0, NULL}, }; int major = vdevice >> 8; int minor = vdevice & 0xff; int i; if (vdevice & (1 << 28)) { *unit = (vdevice & ((1 << 28) - 1)) >> 8; *name = "xbd"; return; } for (i = 0; info[i].major; i++) { if (info[i].major == major) { *unit = info[i].base + (minor >> info[i].shift); *name = info[i].name; return; } } *unit = minor >> 4; *name = "xbd"; } int xbd_instance_create(struct xbd_softc *sc, blkif_sector_t sectors, int vdevice, uint16_t vdisk_info, unsigned long sector_size, unsigned long phys_sector_size) { char features[80]; int unit, error = 0; const char *name; xbd_vdevice_to_unit(vdevice, &unit, &name); sc->xbd_unit = unit; if (strcmp(name, "xbd") != 0) device_printf(sc->xbd_dev, "attaching as %s%d\n", name, unit); if (xbd_feature_string(sc, features, sizeof(features)) > 0) { device_printf(sc->xbd_dev, "features: %s\n", features); } sc->xbd_disk = disk_alloc(); sc->xbd_disk->d_unit = sc->xbd_unit; sc->xbd_disk->d_open = xbd_open; sc->xbd_disk->d_close = xbd_close; sc->xbd_disk->d_ioctl = xbd_ioctl; sc->xbd_disk->d_strategy = xbd_strategy; sc->xbd_disk->d_dump = xbd_dump; sc->xbd_disk->d_name = name; sc->xbd_disk->d_drv1 = sc; sc->xbd_disk->d_sectorsize = sector_size; sc->xbd_disk->d_stripesize = phys_sector_size; sc->xbd_disk->d_stripeoffset = 0; sc->xbd_disk->d_mediasize = sectors * sector_size; sc->xbd_disk->d_maxsize = sc->xbd_max_request_size; sc->xbd_disk->d_flags = DISKFLAG_UNMAPPED_BIO; if ((sc->xbd_flags & (XBDF_FLUSH|XBDF_BARRIER)) != 0) { sc->xbd_disk->d_flags |= DISKFLAG_CANFLUSHCACHE; device_printf(sc->xbd_dev, "synchronize cache commands enabled.\n"); } disk_create(sc->xbd_disk, DISK_VERSION); return error; } static void xbd_free(struct xbd_softc *sc) { int i; /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xbd_io_lock); sc->xbd_state = XBD_STATE_DISCONNECTED; mtx_unlock(&sc->xbd_io_lock); /* Free resources associated with old device channel. 
*/ xbd_free_ring(sc); if (sc->xbd_shadow) { for (i = 0; i < sc->xbd_max_requests; i++) { struct xbd_command *cm; cm = &sc->xbd_shadow[i]; if (cm->cm_sg_refs != NULL) { free(cm->cm_sg_refs, M_XENBLOCKFRONT); cm->cm_sg_refs = NULL; } if (cm->cm_indirectionpages != NULL) { gnttab_end_foreign_access_references( sc->xbd_max_request_indirectpages, &cm->cm_indirectionrefs[0]); contigfree(cm->cm_indirectionpages, PAGE_SIZE * sc->xbd_max_request_indirectpages, M_XENBLOCKFRONT); cm->cm_indirectionpages = NULL; } bus_dmamap_destroy(sc->xbd_io_dmat, cm->cm_map); } free(sc->xbd_shadow, M_XENBLOCKFRONT); sc->xbd_shadow = NULL; bus_dma_tag_destroy(sc->xbd_io_dmat); xbd_initq_cm(sc, XBD_Q_FREE); xbd_initq_cm(sc, XBD_Q_READY); xbd_initq_cm(sc, XBD_Q_COMPLETE); } xen_intr_unbind(&sc->xen_intr_handle); } /*--------------------------- State Change Handlers --------------------------*/ static void xbd_initialize(struct xbd_softc *sc) { const char *otherend_path; const char *node_path; uint32_t max_ring_page_order; int error; if (xenbus_get_state(sc->xbd_dev) != XenbusStateInitialising) { /* Initialization has already been performed. */ return; } /* * Protocol defaults valid even if negotiation for a * setting fails. */ max_ring_page_order = 0; sc->xbd_ring_pages = 1; /* * Protocol negotiation. * * \note xs_gather() returns on the first encountered error, so * we must use independent calls in order to guarantee * we don't miss information in a sparsly populated back-end * tree. * * \note xs_scanf() does not update variables for unmatched * fields. */ otherend_path = xenbus_get_otherend_path(sc->xbd_dev); node_path = xenbus_get_node(sc->xbd_dev); /* Support both backend schemes for relaying ring page limits. */ (void)xs_scanf(XST_NIL, otherend_path, "max-ring-page-order", NULL, "%" PRIu32, &max_ring_page_order); sc->xbd_ring_pages = 1 << max_ring_page_order; (void)xs_scanf(XST_NIL, otherend_path, "max-ring-pages", NULL, "%" PRIu32, &sc->xbd_ring_pages); if (sc->xbd_ring_pages < 1) sc->xbd_ring_pages = 1; if (sc->xbd_ring_pages > XBD_MAX_RING_PAGES) { device_printf(sc->xbd_dev, "Back-end specified ring-pages of %u " "limited to front-end limit of %u.\n", sc->xbd_ring_pages, XBD_MAX_RING_PAGES); sc->xbd_ring_pages = XBD_MAX_RING_PAGES; } if (powerof2(sc->xbd_ring_pages) == 0) { uint32_t new_page_limit; new_page_limit = 0x01 << (fls(sc->xbd_ring_pages) - 1); device_printf(sc->xbd_dev, "Back-end specified ring-pages of %u " "is not a power of 2. Limited to %u.\n", sc->xbd_ring_pages, new_page_limit); sc->xbd_ring_pages = new_page_limit; } sc->xbd_max_requests = BLKIF_MAX_RING_REQUESTS(sc->xbd_ring_pages * PAGE_SIZE); if (sc->xbd_max_requests > XBD_MAX_REQUESTS) { device_printf(sc->xbd_dev, "Back-end specified max_requests of %u " "limited to front-end limit of %zu.\n", sc->xbd_max_requests, XBD_MAX_REQUESTS); sc->xbd_max_requests = XBD_MAX_REQUESTS; } if (xbd_alloc_ring(sc) != 0) return; /* Support both backend schemes for relaying ring page limits. 
*/ if (sc->xbd_ring_pages > 1) { error = xs_printf(XST_NIL, node_path, "num-ring-pages","%u", sc->xbd_ring_pages); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/num-ring-pages", node_path); return; } error = xs_printf(XST_NIL, node_path, "ring-page-order", "%u", fls(sc->xbd_ring_pages) - 1); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/ring-page-order", node_path); return; } } error = xs_printf(XST_NIL, node_path, "event-channel", "%u", xen_intr_port(sc->xen_intr_handle)); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/event-channel", node_path); return; } error = xs_printf(XST_NIL, node_path, "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/protocol", node_path); return; } xenbus_set_state(sc->xbd_dev, XenbusStateInitialised); } /* * Invoked when the backend is finally 'ready' (and has published * the details about the physical device - #sectors, size, etc). */ static void xbd_connect(struct xbd_softc *sc) { device_t dev = sc->xbd_dev; unsigned long sectors, sector_size, phys_sector_size; unsigned int binfo; int err, feature_barrier, feature_flush; int i, j; DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev)); if (sc->xbd_state == XBD_STATE_SUSPENDED) { return; } if (sc->xbd_state == XBD_STATE_CONNECTED) { struct disk *disk; disk = sc->xbd_disk; if (disk == NULL) { return; } err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "sectors", "%lu", §ors, NULL); if (err != 0) { xenbus_dev_error(dev, err, "reading sectors at %s", xenbus_get_otherend_path(dev)); return; } disk->d_mediasize = disk->d_sectorsize * sectors; err = disk_resize(disk, M_NOWAIT); if (err) { xenbus_dev_error(dev, err, "unable to resize disk %s%u", disk->d_name, disk->d_unit); return; } device_printf(sc->xbd_dev, "changed capacity to %jd\n", (intmax_t)disk->d_mediasize); return; } err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "sectors", "%lu", §ors, "info", "%u", &binfo, "sector-size", "%lu", §or_size, NULL); if (err) { xenbus_dev_fatal(dev, err, "reading backend fields at %s", xenbus_get_otherend_path(dev)); return; } if ((sectors == 0) || (sector_size == 0)) { xenbus_dev_fatal(dev, 0, "invalid parameters from %s:" " sectors = %lu, sector_size = %lu", xenbus_get_otherend_path(dev), sectors, sector_size); return; } err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "physical-sector-size", "%lu", &phys_sector_size, NULL); if (err || phys_sector_size <= sector_size) phys_sector_size = 0; err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "feature-barrier", "%d", &feature_barrier, NULL); if (err == 0 && feature_barrier != 0) sc->xbd_flags |= XBDF_BARRIER; err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "feature-flush-cache", "%d", &feature_flush, NULL); if (err == 0 && feature_flush != 0) sc->xbd_flags |= XBDF_FLUSH; err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "feature-max-indirect-segments", "%" PRIu32, &sc->xbd_max_request_segments, NULL); if ((err != 0) || (xbd_enable_indirect == 0)) sc->xbd_max_request_segments = 0; if (sc->xbd_max_request_segments > XBD_MAX_INDIRECT_SEGMENTS) sc->xbd_max_request_segments = XBD_MAX_INDIRECT_SEGMENTS; if (sc->xbd_max_request_segments > XBD_SIZE_TO_SEGS(MAXPHYS)) sc->xbd_max_request_segments = XBD_SIZE_TO_SEGS(MAXPHYS); sc->xbd_max_request_indirectpages = XBD_INDIRECT_SEGS_TO_PAGES(sc->xbd_max_request_segments); if (sc->xbd_max_request_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST) sc->xbd_max_request_segments = 
BLKIF_MAX_SEGMENTS_PER_REQUEST; sc->xbd_max_request_size = XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments); /* Allocate datastructures based on negotiated values. */ err = bus_dma_tag_create( bus_get_dma_tag(sc->xbd_dev), /* parent */ 512, PAGE_SIZE, /* algnmnt, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ sc->xbd_max_request_size, sc->xbd_max_request_segments, PAGE_SIZE, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ busdma_lock_mutex, /* lockfunc */ &sc->xbd_io_lock, /* lockarg */ &sc->xbd_io_dmat); if (err != 0) { xenbus_dev_fatal(sc->xbd_dev, err, "Cannot allocate parent DMA tag\n"); return; } /* Per-transaction data allocation. */ sc->xbd_shadow = malloc(sizeof(*sc->xbd_shadow) * sc->xbd_max_requests, M_XENBLOCKFRONT, M_NOWAIT|M_ZERO); if (sc->xbd_shadow == NULL) { bus_dma_tag_destroy(sc->xbd_io_dmat); xenbus_dev_fatal(sc->xbd_dev, ENOMEM, "Cannot allocate request structures\n"); return; } for (i = 0; i < sc->xbd_max_requests; i++) { struct xbd_command *cm; void * indirectpages; cm = &sc->xbd_shadow[i]; cm->cm_sg_refs = malloc( sizeof(grant_ref_t) * sc->xbd_max_request_segments, M_XENBLOCKFRONT, M_NOWAIT); if (cm->cm_sg_refs == NULL) break; cm->cm_id = i; cm->cm_flags = XBDCF_INITIALIZER; cm->cm_sc = sc; if (bus_dmamap_create(sc->xbd_io_dmat, 0, &cm->cm_map) != 0) break; if (sc->xbd_max_request_indirectpages > 0) { indirectpages = contigmalloc( PAGE_SIZE * sc->xbd_max_request_indirectpages, M_XENBLOCKFRONT, M_ZERO | M_NOWAIT, 0, ~0, PAGE_SIZE, 0); if (indirectpages == NULL) sc->xbd_max_request_indirectpages = 0; } else { indirectpages = NULL; } for (j = 0; j < sc->xbd_max_request_indirectpages; j++) { if (gnttab_grant_foreign_access( xenbus_get_otherend_id(sc->xbd_dev), (vtophys(indirectpages) >> PAGE_SHIFT) + j, 1 /* grant read-only access */, &cm->cm_indirectionrefs[j])) break; } if (j < sc->xbd_max_request_indirectpages) { contigfree(indirectpages, PAGE_SIZE * sc->xbd_max_request_indirectpages, M_XENBLOCKFRONT); break; } cm->cm_indirectionpages = indirectpages; xbd_free_command(cm); } if (sc->xbd_disk == NULL) { device_printf(dev, "%juMB <%s> at %s", (uintmax_t) sectors / (1048576 / sector_size), device_get_desc(dev), xenbus_get_node(dev)); bus_print_child_footer(device_get_parent(dev), dev); xbd_instance_create(sc, sectors, sc->xbd_vdevice, binfo, sector_size, phys_sector_size); } (void)xenbus_set_state(dev, XenbusStateConnected); /* Kick pending requests. */ mtx_lock(&sc->xbd_io_lock); sc->xbd_state = XBD_STATE_CONNECTED; xbd_startio(sc); sc->xbd_flags |= XBDF_READY; mtx_unlock(&sc->xbd_io_lock); } /** * Handle the change of state of the backend to Closing. We must delete our * device-layer structures now, to ensure that writes are flushed through to * the backend. Once this is done, we can switch to Closed in * acknowledgement. 
*/ static void xbd_closing(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); xenbus_set_state(dev, XenbusStateClosing); DPRINTK("xbd_closing: %s removed\n", xenbus_get_node(dev)); if (sc->xbd_disk != NULL) { disk_destroy(sc->xbd_disk); sc->xbd_disk = NULL; } xenbus_set_state(dev, XenbusStateClosed); } /*---------------------------- NewBus Entrypoints ----------------------------*/ static int xbd_probe(device_t dev) { if (strcmp(xenbus_get_type(dev), "vbd") != 0) return (ENXIO); if (xen_hvm_domain() && xen_disable_pv_disks != 0) return (ENXIO); if (xen_hvm_domain()) { int error; char *type; /* * When running in an HVM domain, IDE disk emulation is * disabled early in boot so that native drivers will * not see emulated hardware. However, CDROM device * emulation cannot be disabled. * * Through use of FreeBSD's vm_guest and xen_hvm_domain() * APIs, we could modify the native CDROM driver to fail its * probe when running under Xen. Unfortunatlely, the PV * CDROM support in XenServer (up through at least version * 6.2) isn't functional, so we instead rely on the emulated * CDROM instance, and fail to attach the PV one here in * the blkfront driver. */ error = xs_read(XST_NIL, xenbus_get_node(dev), "device-type", NULL, (void **) &type); if (error) return (ENXIO); if (strncmp(type, "cdrom", 5) == 0) { free(type, M_XENSTORE); return (ENXIO); } free(type, M_XENSTORE); } device_set_desc(dev, "Virtual Block Device"); device_quiet(dev); return (0); } /* * Setup supplies the backend dir, virtual device. We place an event * channel and shared frame entries. We watch backend to wait if it's * ok. */ static int xbd_attach(device_t dev) { struct xbd_softc *sc; const char *name; uint32_t vdevice; int error; int i; int unit; /* FIXME: Use dynamic device id if this is not set. */ error = xs_scanf(XST_NIL, xenbus_get_node(dev), "virtual-device", NULL, "%" PRIu32, &vdevice); if (error) error = xs_scanf(XST_NIL, xenbus_get_node(dev), "virtual-device-ext", NULL, "%" PRIu32, &vdevice); if (error) { xenbus_dev_fatal(dev, error, "reading virtual-device"); device_printf(dev, "Couldn't determine virtual device.\n"); return (error); } xbd_vdevice_to_unit(vdevice, &unit, &name); if (!strcmp(name, "xbd")) device_set_unit(dev, unit); sc = device_get_softc(dev); mtx_init(&sc->xbd_io_lock, "blkfront i/o lock", NULL, MTX_DEF); xbd_initqs(sc); for (i = 0; i < XBD_MAX_RING_PAGES; i++) sc->xbd_ring_ref[i] = GRANT_REF_INVALID; sc->xbd_dev = dev; sc->xbd_vdevice = vdevice; sc->xbd_state = XBD_STATE_DISCONNECTED; xbd_setup_sysctl(sc); /* Wait for backend device to publish its protocol capabilities. */ xenbus_set_state(dev, XenbusStateInitialising); return (0); } static int xbd_detach(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); DPRINTK("%s: %s removed\n", __func__, xenbus_get_node(dev)); xbd_free(sc); mtx_destroy(&sc->xbd_io_lock); return 0; } static int xbd_suspend(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); int retval; int saved_state; /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xbd_io_lock); saved_state = sc->xbd_state; sc->xbd_state = XBD_STATE_SUSPENDED; /* Wait for outstanding I/O to drain. 
*/ retval = 0; while (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { if (msleep(&sc->xbd_cm_q[XBD_Q_BUSY], &sc->xbd_io_lock, PRIBIO, "blkf_susp", 30 * hz) == EWOULDBLOCK) { retval = EBUSY; break; } } mtx_unlock(&sc->xbd_io_lock); if (retval != 0) sc->xbd_state = saved_state; return (retval); } static int xbd_resume(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); if (xen_suspend_cancelled) { sc->xbd_state = XBD_STATE_CONNECTED; return (0); } DPRINTK("xbd_resume: %s\n", xenbus_get_node(dev)); xbd_free(sc); xbd_initialize(sc); return (0); } /** * Callback received when the backend's state changes. */ static void xbd_backend_changed(device_t dev, XenbusState backend_state) { struct xbd_softc *sc = device_get_softc(dev); DPRINTK("backend_state=%d\n", backend_state); switch (backend_state) { case XenbusStateUnknown: case XenbusStateInitialising: case XenbusStateReconfigured: case XenbusStateReconfiguring: case XenbusStateClosed: break; case XenbusStateInitWait: case XenbusStateInitialised: xbd_initialize(sc); break; case XenbusStateConnected: xbd_initialize(sc); xbd_connect(sc); break; case XenbusStateClosing: if (sc->xbd_users > 0) { device_printf(dev, "detaching with pending users\n"); KASSERT(sc->xbd_disk != NULL, ("NULL disk with pending users\n")); disk_gone(sc->xbd_disk); } else { xbd_closing(dev); } break; } } /*---------------------------- NewBus Registration ---------------------------*/ static device_method_t xbd_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xbd_probe), DEVMETHOD(device_attach, xbd_attach), DEVMETHOD(device_detach, xbd_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xbd_suspend), DEVMETHOD(device_resume, xbd_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, xbd_backend_changed), { 0, 0 } }; static driver_t xbd_driver = { "xbd", xbd_methods, sizeof(struct xbd_softc), }; devclass_t xbd_devclass; DRIVER_MODULE(xbd, xenbusb_front, xbd_driver, xbd_devclass, 0, 0); Index: head/sys/dev/xen/netback/netback.c =================================================================== --- head/sys/dev/xen/netback/netback.c (revision 358315) +++ head/sys/dev/xen/netback/netback.c (revision 358316) @@ -1,2515 +1,2515 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009-2011 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Justin T. Gibbs (Spectra Logic Corporation) * Alan Somers (Spectra Logic Corporation) * John Suykerbuyk (Spectra Logic Corporation) */ #include __FBSDID("$FreeBSD$"); /** * \file netback.c * * \brief Device driver supporting the vending of network access * from this FreeBSD domain to other domains. */ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if __FreeBSD_version >= 700000 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include /*--------------------------- Compile-time Tunables --------------------------*/ /*---------------------------------- Macros ----------------------------------*/ /** * Custom malloc type for all driver allocations. */ static MALLOC_DEFINE(M_XENNETBACK, "xnb", "Xen Net Back Driver Data"); #define XNB_SG 1 /* netback driver supports feature-sg */ #define XNB_GSO_TCPV4 0 /* netback driver supports feature-gso-tcpv4 */ #define XNB_RX_COPY 1 /* netback driver supports feature-rx-copy */ #define XNB_RX_FLIP 0 /* netback driver does not support feature-rx-flip */ #undef XNB_DEBUG #define XNB_DEBUG /* hardcode on during development */ #ifdef XNB_DEBUG #define DPRINTF(fmt, args...) \ printf("xnb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) #else #define DPRINTF(fmt, args...) do {} while (0) #endif /* Default length for stack-allocated grant tables */ #define GNTTAB_LEN (64) /* Features supported by all backends. TSO and LRO can be negotiated */ #define XNB_CSUM_FEATURES (CSUM_TCP | CSUM_UDP) #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) /** * Two argument version of the standard macro. Second argument is a tentative * value of req_cons */ #define RING_HAS_UNCONSUMED_REQUESTS_2(_r, cons) ({ \ unsigned int req = (_r)->sring->req_prod - cons; \ unsigned int rsp = RING_SIZE(_r) - \ (cons - (_r)->rsp_prod_pvt); \ req < rsp ? req : rsp; \ }) #define virt_to_mfn(x) (vtophys(x) >> PAGE_SHIFT) #define virt_to_offset(x) ((x) & (PAGE_SIZE - 1)) /** * Predefined array type of grant table copy descriptors. Used to pass around * statically allocated memory structures. */ typedef struct gnttab_copy gnttab_copy_table[GNTTAB_LEN]; /*--------------------------- Forward Declarations ---------------------------*/ struct xnb_softc; struct xnb_pkt; static void xnb_attach_failed(struct xnb_softc *xnb, int err, const char *fmt, ...) 
__printflike(3,4); static int xnb_shutdown(struct xnb_softc *xnb); static int create_netdev(device_t dev); static int xnb_detach(device_t dev); static int xnb_ifmedia_upd(struct ifnet *ifp); static void xnb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); static void xnb_intr(void *arg); static int xnb_send(netif_rx_back_ring_t *rxb, domid_t otherend, const struct mbuf *mbufc, gnttab_copy_table gnttab); static int xnb_recv(netif_tx_back_ring_t *txb, domid_t otherend, struct mbuf **mbufc, struct ifnet *ifnet, gnttab_copy_table gnttab); static int xnb_ring2pkt(struct xnb_pkt *pkt, const netif_tx_back_ring_t *tx_ring, RING_IDX start); static void xnb_txpkt2rsp(const struct xnb_pkt *pkt, netif_tx_back_ring_t *ring, int error); static struct mbuf *xnb_pkt2mbufc(const struct xnb_pkt *pkt, struct ifnet *ifp); static int xnb_txpkt2gnttab(const struct xnb_pkt *pkt, struct mbuf *mbufc, gnttab_copy_table gnttab, const netif_tx_back_ring_t *txb, domid_t otherend_id); static void xnb_update_mbufc(struct mbuf *mbufc, const gnttab_copy_table gnttab, int n_entries); static int xnb_mbufc2pkt(const struct mbuf *mbufc, struct xnb_pkt *pkt, RING_IDX start, int space); static int xnb_rxpkt2gnttab(const struct xnb_pkt *pkt, const struct mbuf *mbufc, gnttab_copy_table gnttab, const netif_rx_back_ring_t *rxb, domid_t otherend_id); static int xnb_rxpkt2rsp(const struct xnb_pkt *pkt, const gnttab_copy_table gnttab, int n_entries, netif_rx_back_ring_t *ring); static void xnb_stop(struct xnb_softc*); static int xnb_ioctl(struct ifnet*, u_long, caddr_t); static void xnb_start_locked(struct ifnet*); static void xnb_start(struct ifnet*); static void xnb_ifinit_locked(struct xnb_softc*); static void xnb_ifinit(void*); #ifdef XNB_DEBUG static int xnb_unit_test_main(SYSCTL_HANDLER_ARGS); static int xnb_dump_rings(SYSCTL_HANDLER_ARGS); #endif #if defined(INET) || defined(INET6) static void xnb_add_mbuf_cksum(struct mbuf *mbufc); #endif /*------------------------------ Data Structures -----------------------------*/ /** * Representation of a xennet packet. Simplified version of a packet as * stored in the Xen tx ring. Applicable to both RX and TX packets */ struct xnb_pkt{ /** * Array index of the first data-bearing (eg, not extra info) entry * for this packet */ RING_IDX car; /** * Array index of the second data-bearing entry for this packet. * Invalid if the packet has only one data-bearing entry. If the * packet has more than two data-bearing entries, then the second * through the last will be sequential modulo the ring size */ RING_IDX cdr; /** * Optional extra info. Only valid if flags contains * NETTXF_extra_info. Note that extra.type will always be * XEN_NETIF_EXTRA_TYPE_GSO. Currently, no known netfront or netback * driver will ever set XEN_NETIF_EXTRA_TYPE_MCAST_* */ netif_extra_info_t extra; /** Size of entire packet in bytes. */ uint16_t size; /** The size of the first entry's data in bytes */ uint16_t car_size; /** * Either NETTXF_ or NETRXF_ flags. Note that the flag values are * not the same for TX and RX packets */ uint16_t flags; /** * The number of valid data-bearing entries (either netif_tx_request's * or netif_rx_response's) in the packet. If this is 0, it means the * entire packet is invalid. 
*/ uint16_t list_len; /** There was an error processing the packet */ uint8_t error; }; /** xnb_pkt method: initialize it */ static inline void xnb_pkt_initialize(struct xnb_pkt *pxnb) { bzero(pxnb, sizeof(*pxnb)); } /** xnb_pkt method: mark the packet as valid */ static inline void xnb_pkt_validate(struct xnb_pkt *pxnb) { pxnb->error = 0; }; /** xnb_pkt method: mark the packet as invalid */ static inline void xnb_pkt_invalidate(struct xnb_pkt *pxnb) { pxnb->error = 1; }; /** xnb_pkt method: Check whether the packet is valid */ static inline int xnb_pkt_is_valid(const struct xnb_pkt *pxnb) { return (! pxnb->error); } #ifdef XNB_DEBUG /** xnb_pkt method: print the packet's contents in human-readable format */ static void __unused xnb_dump_pkt(const struct xnb_pkt *pkt) { if (pkt == NULL) { DPRINTF("Was passed a null pointer.\n"); return; } DPRINTF("pkt address= %p\n", pkt); DPRINTF("pkt->size=%d\n", pkt->size); DPRINTF("pkt->car_size=%d\n", pkt->car_size); DPRINTF("pkt->flags=0x%04x\n", pkt->flags); DPRINTF("pkt->list_len=%d\n", pkt->list_len); /* DPRINTF("pkt->extra"); TODO */ DPRINTF("pkt->car=%d\n", pkt->car); DPRINTF("pkt->cdr=%d\n", pkt->cdr); DPRINTF("pkt->error=%d\n", pkt->error); } #endif /* XNB_DEBUG */ static void xnb_dump_txreq(RING_IDX idx, const struct netif_tx_request *txreq) { if (txreq != NULL) { DPRINTF("netif_tx_request index =%u\n", idx); DPRINTF("netif_tx_request.gref =%u\n", txreq->gref); DPRINTF("netif_tx_request.offset=%hu\n", txreq->offset); DPRINTF("netif_tx_request.flags =%hu\n", txreq->flags); DPRINTF("netif_tx_request.id =%hu\n", txreq->id); DPRINTF("netif_tx_request.size =%hu\n", txreq->size); } } /** * \brief Configuration data for a shared memory request ring * used to communicate with the front-end client of this * driver. */ struct xnb_ring_config { /** * Runtime structures for ring access. Unfortunately, TX and RX rings * use different data structures, and that cannot be changed since it * is part of the interdomain protocol. */ union{ netif_rx_back_ring_t rx_ring; netif_tx_back_ring_t tx_ring; } back_ring; /** * The device bus address returned by the hypervisor when * mapping the ring and required to unmap it when a connection * is torn down. */ uint64_t bus_addr; /** The pseudo-physical address where ring memory is mapped.*/ uint64_t gnt_addr; /** KVA address where ring memory is mapped. */ vm_offset_t va; /** * Grant table handles, one per-ring page, returned by the * hypervisor upon mapping of the ring and required to * unmap it when a connection is torn down. */ grant_handle_t handle; /** The number of ring pages mapped for the current connection. */ unsigned ring_pages; /** * The grant references, one per-ring page, supplied by the * front-end, allowing us to reference the ring pages in the * front-end's domain and to map these pages into our own domain. */ grant_ref_t ring_ref; }; /** * Per-instance connection state flags. */ typedef enum { /** Communication with the front-end has been established. */ XNBF_RING_CONNECTED = 0x01, /** * Front-end requests exist in the ring and are waiting for * xnb_xen_req objects to free up. */ XNBF_RESOURCE_SHORTAGE = 0x02, /** Connection teardown has started. */ XNBF_SHUTDOWN = 0x04, /** A thread is already performing shutdown processing. */ XNBF_IN_SHUTDOWN = 0x08 } xnb_flag_t; /** * Types of rings.
Used for array indices and to identify a ring's control * data structure type */ typedef enum{ XNB_RING_TYPE_TX = 0, /* ID of TX rings, used for array indices */ XNB_RING_TYPE_RX = 1, /* ID of RX rings, used for array indices */ XNB_NUM_RING_TYPES } xnb_ring_type_t; /** * Per-instance configuration data. */ struct xnb_softc { /** NewBus device corresponding to this instance. */ device_t dev; /* Media related fields */ /** Generic network media state */ struct ifmedia sc_media; /** Media carrier info */ struct ifnet *xnb_ifp; /** Our own private carrier state */ unsigned carrier; /** Device MAC Address */ uint8_t mac[ETHER_ADDR_LEN]; /* Xen related fields */ /** * \brief The netif protocol abi in effect. * * There are situations where the back and front ends can * have a different, native abi (e.g. intel x86_64 and * 32bit x86 domains on the same machine). The back-end * always accommodates the front-end's native abi. That * value is pulled from the XenStore and recorded here. */ int abi; /** * Name of the bridge to which this VIF is connected, if any. * This field is dynamically allocated by xenbus and must be free()ed * when no longer needed */ char *bridge; /** The interrupt driven event channel used to signal ring events. */ evtchn_port_t evtchn; /** Xen device handle.*/ long handle; /** Handle to the communication ring event channel. */ xen_intr_handle_t xen_intr_handle; /** * \brief Cached value of the front-end's domain id. * * This value is used at once for each mapped page in * a transaction. We cache it to avoid incurring the * cost of an ivar access every time this is needed. */ domid_t otherend_id; /** * Undocumented frontend feature. Has something to do with * scatter/gather IO */ uint8_t can_sg; /** Undocumented frontend feature */ uint8_t gso; /** Undocumented frontend feature */ uint8_t gso_prefix; /** Can checksum TCP/UDP over IPv4 */ uint8_t ip_csum; /* Implementation related fields */ /** * Preallocated grant table copy descriptor for RX operations. * Access must be protected by rx_lock */ gnttab_copy_table rx_gnttab; /** * Preallocated grant table copy descriptor for TX operations. * Access must be protected by tx_lock */ gnttab_copy_table tx_gnttab; /** * Resource representing allocated physical address space * associated with our per-instance kva region. */ struct resource *pseudo_phys_res; /** Resource id for allocated physical address space. */ int pseudo_phys_res_id; /** Ring mapping and interrupt configuration data. */ struct xnb_ring_config ring_configs[XNB_NUM_RING_TYPES]; /** * Global pool of kva used for mapping remote domain ring * and I/O transaction data. */ vm_offset_t kva; /** Pseudo-physical address corresponding to kva. */ uint64_t gnt_base_addr; /** Various configuration and state bit flags. */ xnb_flag_t flags; /** Mutex protecting per-instance data in the receive path. */ struct mtx rx_lock; /** Mutex protecting per-instance data in the softc structure. */ struct mtx sc_lock; /** Mutex protecting per-instance data in the transmit path. */ struct mtx tx_lock; /** The size of the global kva pool.
*/ int kva_size; /** Name of the interface */ char if_name[IFNAMSIZ]; }; /*---------------------------- Debugging functions ---------------------------*/ #ifdef XNB_DEBUG static void __unused xnb_dump_gnttab_copy(const struct gnttab_copy *entry) { if (entry == NULL) { printf("NULL grant table pointer\n"); return; } if (entry->flags & GNTCOPY_dest_gref) printf("gnttab dest ref=\t%u\n", entry->dest.u.ref); else printf("gnttab dest gmfn=\t%"PRI_xen_pfn"\n", entry->dest.u.gmfn); printf("gnttab dest offset=\t%hu\n", entry->dest.offset); printf("gnttab dest domid=\t%hu\n", entry->dest.domid); if (entry->flags & GNTCOPY_source_gref) printf("gnttab source ref=\t%u\n", entry->source.u.ref); else printf("gnttab source gmfn=\t%"PRI_xen_pfn"\n", entry->source.u.gmfn); printf("gnttab source offset=\t%hu\n", entry->source.offset); printf("gnttab source domid=\t%hu\n", entry->source.domid); printf("gnttab len=\t%hu\n", entry->len); printf("gnttab flags=\t%hu\n", entry->flags); printf("gnttab status=\t%hd\n", entry->status); } static int xnb_dump_rings(SYSCTL_HANDLER_ARGS) { static char results[720]; struct xnb_softc const* xnb = (struct xnb_softc*)arg1; netif_rx_back_ring_t const* rxb = &xnb->ring_configs[XNB_RING_TYPE_RX].back_ring.rx_ring; netif_tx_back_ring_t const* txb = &xnb->ring_configs[XNB_RING_TYPE_TX].back_ring.tx_ring; /* empty the result strings */ results[0] = 0; if ( !txb || !txb->sring || !rxb || !rxb->sring ) return (SYSCTL_OUT(req, results, strnlen(results, 720))); snprintf(results, 720, "\n\t%35s %18s\n" /* TX, RX */ "\t%16s %18d %18d\n" /* req_cons */ "\t%16s %18d %18d\n" /* nr_ents */ "\t%16s %18d %18d\n" /* rsp_prod_pvt */ "\t%16s %18p %18p\n" /* sring */ "\t%16s %18d %18d\n" /* req_prod */ "\t%16s %18d %18d\n" /* req_event */ "\t%16s %18d %18d\n" /* rsp_prod */ "\t%16s %18d %18d\n", /* rsp_event */ "TX", "RX", "req_cons", txb->req_cons, rxb->req_cons, "nr_ents", txb->nr_ents, rxb->nr_ents, "rsp_prod_pvt", txb->rsp_prod_pvt, rxb->rsp_prod_pvt, "sring", txb->sring, rxb->sring, "sring->req_prod", txb->sring->req_prod, rxb->sring->req_prod, "sring->req_event", txb->sring->req_event, rxb->sring->req_event, "sring->rsp_prod", txb->sring->rsp_prod, rxb->sring->rsp_prod, "sring->rsp_event", txb->sring->rsp_event, rxb->sring->rsp_event); return (SYSCTL_OUT(req, results, strnlen(results, 720))); } static void __unused xnb_dump_mbuf(const struct mbuf *m) { int len; uint8_t *d; if (m == NULL) return; printf("xnb_dump_mbuf:\n"); if (m->m_flags & M_PKTHDR) { printf(" flowid=%10d, csum_flags=%#8x, csum_data=%#8x, " "tso_segsz=%5hd\n", m->m_pkthdr.flowid, (int)m->m_pkthdr.csum_flags, m->m_pkthdr.csum_data, m->m_pkthdr.tso_segsz); printf(" rcvif=%16p, len=%19d\n", m->m_pkthdr.rcvif, m->m_pkthdr.len); } printf(" m_next=%16p, m_nextpk=%16p, m_data=%16p\n", m->m_next, m->m_nextpkt, m->m_data); printf(" m_len=%17d, m_flags=%#15x, m_type=%18u\n", m->m_len, m->m_flags, m->m_type); len = m->m_len; d = mtod(m, uint8_t*); while (len > 0) { int i; printf(" "); for (i = 0; (i < 16) && (len > 0); i++, len--) { printf("%02hhx ", *(d++)); } printf("\n"); } } #endif /* XNB_DEBUG */ /*------------------------ Inter-Domain Communication ------------------------*/ /** * Free dynamically allocated KVA or pseudo-physical address allocations. * * \param xnb Per-instance xnb configuration structure. 
*/ static void xnb_free_communication_mem(struct xnb_softc *xnb) { if (xnb->kva != 0) { if (xnb->pseudo_phys_res != NULL) { xenmem_free(xnb->dev, xnb->pseudo_phys_res_id, xnb->pseudo_phys_res); xnb->pseudo_phys_res = NULL; } } xnb->kva = 0; xnb->gnt_base_addr = 0; } /** * Cleanup all inter-domain communication mechanisms. * * \param xnb Per-instance xnb configuration structure. */ static int xnb_disconnect(struct xnb_softc *xnb) { struct gnttab_unmap_grant_ref gnts[XNB_NUM_RING_TYPES]; int error; int i; if (xnb->xen_intr_handle != NULL) xen_intr_unbind(&xnb->xen_intr_handle); /* * We may still have another thread currently processing requests. We * must acquire the rx and tx locks to make sure those threads are done, * but we can release those locks as soon as we acquire them, because no * more interrupts will be arriving. */ mtx_lock(&xnb->tx_lock); mtx_unlock(&xnb->tx_lock); mtx_lock(&xnb->rx_lock); mtx_unlock(&xnb->rx_lock); mtx_lock(&xnb->sc_lock); /* Free malloc'd softc member variables */ if (xnb->bridge != NULL) { free(xnb->bridge, M_XENSTORE); xnb->bridge = NULL; } /* All request processing has stopped, so unmap the rings */ for (i=0; i < XNB_NUM_RING_TYPES; i++) { gnts[i].host_addr = xnb->ring_configs[i].gnt_addr; gnts[i].dev_bus_addr = xnb->ring_configs[i].bus_addr; gnts[i].handle = xnb->ring_configs[i].handle; } error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, gnts, XNB_NUM_RING_TYPES); KASSERT(error == 0, ("Grant table unmap op failed (%d)", error)); xnb_free_communication_mem(xnb); /* * Zero the ring config structs because the pointers, handles, and * grant refs contained therein are no longer valid. */ bzero(&xnb->ring_configs[XNB_RING_TYPE_TX], sizeof(struct xnb_ring_config)); bzero(&xnb->ring_configs[XNB_RING_TYPE_RX], sizeof(struct xnb_ring_config)); xnb->flags &= ~XNBF_RING_CONNECTED; mtx_unlock(&xnb->sc_lock); return (0); } /** * Map a single shared memory ring into domain local address space and * initialize its control structure * * \param xnb Per-instance xnb configuration structure * \param ring_type Array index of this ring in the xnb's array of rings * \return An errno */ static int xnb_connect_ring(struct xnb_softc *xnb, xnb_ring_type_t ring_type) { struct gnttab_map_grant_ref gnt; struct xnb_ring_config *ring = &xnb->ring_configs[ring_type]; int error; /* TX ring type = 0, RX =1 */ ring->va = xnb->kva + ring_type * PAGE_SIZE; ring->gnt_addr = xnb->gnt_base_addr + ring_type * PAGE_SIZE; gnt.host_addr = ring->gnt_addr; gnt.flags = GNTMAP_host_map; gnt.ref = ring->ring_ref; gnt.dom = xnb->otherend_id; error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &gnt, 1); if (error != 0) panic("netback: Ring page grant table op failed (%d)", error); if (gnt.status != 0) { ring->va = 0; error = EACCES; xenbus_dev_fatal(xnb->dev, error, "Ring shared page mapping failed. " "Status %d.", gnt.status); } else { ring->handle = gnt.handle; ring->bus_addr = gnt.dev_bus_addr; if (ring_type == XNB_RING_TYPE_TX) { BACK_RING_INIT(&ring->back_ring.tx_ring, (netif_tx_sring_t*)ring->va, ring->ring_pages * PAGE_SIZE); } else if (ring_type == XNB_RING_TYPE_RX) { BACK_RING_INIT(&ring->back_ring.rx_ring, (netif_rx_sring_t*)ring->va, ring->ring_pages * PAGE_SIZE); } else { xenbus_dev_fatal(xnb->dev, error, "Unknown ring type %d", ring_type); } } return error; } /** * Setup the shared memory rings and bind an interrupt to the event channel * used to notify us of ring changes. * * \param xnb Per-instance xnb configuration structure. 
*/ static int xnb_connect_comms(struct xnb_softc *xnb) { int error; xnb_ring_type_t i; if ((xnb->flags & XNBF_RING_CONNECTED) != 0) return (0); /* * Kva for our rings are at the tail of the region of kva allocated * by xnb_alloc_communication_mem(). */ for (i=0; i < XNB_NUM_RING_TYPES; i++) { error = xnb_connect_ring(xnb, i); if (error != 0) return error; } xnb->flags |= XNBF_RING_CONNECTED; error = xen_intr_bind_remote_port(xnb->dev, xnb->otherend_id, xnb->evtchn, /*filter*/NULL, xnb_intr, /*arg*/xnb, INTR_TYPE_NET | INTR_MPSAFE, &xnb->xen_intr_handle); if (error != 0) { (void)xnb_disconnect(xnb); xenbus_dev_fatal(xnb->dev, error, "binding event channel"); return (error); } DPRINTF("rings connected!\n"); return (0); } /** * Size KVA and pseudo-physical address allocations based on negotiated * values for the size and number of I/O requests, and the size of our * communication ring. * * \param xnb Per-instance xnb configuration structure. * * These address spaces are used to dynamically map pages in the * front-end's domain into our own. */ static int xnb_alloc_communication_mem(struct xnb_softc *xnb) { xnb_ring_type_t i; xnb->kva_size = 0; for (i=0; i < XNB_NUM_RING_TYPES; i++) { xnb->kva_size += xnb->ring_configs[i].ring_pages * PAGE_SIZE; } /* * Reserve a range of pseudo physical memory that we can map * into kva. These pages will only be backed by machine * pages ("real memory") during the lifetime of front-end requests * via grant table operations. We will map the netif tx and rx rings * into this space. */ xnb->pseudo_phys_res_id = 0; xnb->pseudo_phys_res = xenmem_alloc(xnb->dev, &xnb->pseudo_phys_res_id, xnb->kva_size); if (xnb->pseudo_phys_res == NULL) { xnb->kva = 0; return (ENOMEM); } xnb->kva = (vm_offset_t)rman_get_virtual(xnb->pseudo_phys_res); xnb->gnt_base_addr = rman_get_start(xnb->pseudo_phys_res); return (0); } /** * Collect information from the XenStore related to our device and its frontend * * \param xnb Per-instance xnb configuration structure. */ static int xnb_collect_xenstore_info(struct xnb_softc *xnb) { /** * \todo Linux collects the following info. We should collect most * of this, too: * "feature-rx-notify" */ const char *otherend_path; const char *our_path; int err; unsigned int rx_copy, bridge_len; uint8_t no_csum_offload; otherend_path = xenbus_get_otherend_path(xnb->dev); our_path = xenbus_get_node(xnb->dev); /* Collect the critical communication parameters */ err = xs_gather(XST_NIL, otherend_path, "tx-ring-ref", "%l" PRIu32, &xnb->ring_configs[XNB_RING_TYPE_TX].ring_ref, "rx-ring-ref", "%l" PRIu32, &xnb->ring_configs[XNB_RING_TYPE_RX].ring_ref, "event-channel", "%" PRIu32, &xnb->evtchn, NULL); if (err != 0) { xenbus_dev_fatal(xnb->dev, err, "Unable to retrieve ring information from " "frontend %s. Unable to connect.", otherend_path); return (err); } /* Collect the handle from xenstore */ err = xs_scanf(XST_NIL, our_path, "handle", NULL, "%li", &xnb->handle); if (err != 0) { xenbus_dev_fatal(xnb->dev, err, "Error reading handle from frontend %s. " "Unable to connect.", otherend_path); } /* * Collect the bridgename, if any. We do not need bridge_len; we just * throw it away */ err = xs_read(XST_NIL, our_path, "bridge", &bridge_len, (void**)&xnb->bridge); if (err != 0) xnb->bridge = NULL; /* * Does the frontend request that we use rx copy? If not, return an * error because this driver only supports rx copy. 
*/ err = xs_scanf(XST_NIL, otherend_path, "request-rx-copy", NULL, "%" PRIu32, &rx_copy); if (err == ENOENT) { err = 0; rx_copy = 0; } if (err < 0) { xenbus_dev_fatal(xnb->dev, err, "reading %s/request-rx-copy", otherend_path); return err; } /** * \todo: figure out the exact meaning of this feature, and when * the frontend will set it to true. It should be set to true * at some point */ /* if (!rx_copy)*/ /* return EOPNOTSUPP;*/ /** \todo Collect the rx notify feature */ /* Collect the feature-sg. */ if (xs_scanf(XST_NIL, otherend_path, "feature-sg", NULL, "%hhu", &xnb->can_sg) < 0) xnb->can_sg = 0; /* Collect remaining frontend features */ if (xs_scanf(XST_NIL, otherend_path, "feature-gso-tcpv4", NULL, "%hhu", &xnb->gso) < 0) xnb->gso = 0; if (xs_scanf(XST_NIL, otherend_path, "feature-gso-tcpv4-prefix", NULL, "%hhu", &xnb->gso_prefix) < 0) xnb->gso_prefix = 0; if (xs_scanf(XST_NIL, otherend_path, "feature-no-csum-offload", NULL, "%hhu", &no_csum_offload) < 0) no_csum_offload = 0; xnb->ip_csum = (no_csum_offload == 0); return (0); } /** * Supply information about the physical device to the frontend * via XenBus. * * \param xnb Per-instance xnb configuration structure. */ static int xnb_publish_backend_info(struct xnb_softc *xnb) { struct xs_transaction xst; const char *our_path; int error; our_path = xenbus_get_node(xnb->dev); do { error = xs_transaction_start(&xst); if (error != 0) { xenbus_dev_fatal(xnb->dev, error, "Error publishing backend info " "(start transaction)"); break; } error = xs_printf(xst, our_path, "feature-sg", "%d", XNB_SG); if (error != 0) break; error = xs_printf(xst, our_path, "feature-gso-tcpv4", "%d", XNB_GSO_TCPV4); if (error != 0) break; error = xs_printf(xst, our_path, "feature-rx-copy", "%d", XNB_RX_COPY); if (error != 0) break; error = xs_printf(xst, our_path, "feature-rx-flip", "%d", XNB_RX_FLIP); if (error != 0) break; error = xs_transaction_end(xst, 0); if (error != 0 && error != EAGAIN) { xenbus_dev_fatal(xnb->dev, error, "ending transaction"); break; } } while (error == EAGAIN); return (error); } /** * Connect to our netfront peer now that it has completed publishing * its configuration into the XenStore. * * \param xnb Per-instance xnb configuration structure. */ static void xnb_connect(struct xnb_softc *xnb) { int error; if (xenbus_get_state(xnb->dev) == XenbusStateConnected) return; if (xnb_collect_xenstore_info(xnb) != 0) return; xnb->flags &= ~XNBF_SHUTDOWN; /* Read front end configuration. */ /* Allocate resources whose size depends on front-end configuration. */ error = xnb_alloc_communication_mem(xnb); if (error != 0) { xenbus_dev_fatal(xnb->dev, error, "Unable to allocate communication memory"); return; } /* * Connect communication channel. */ error = xnb_connect_comms(xnb); if (error != 0) { /* Specific errors are reported by xnb_connect_comms(). */ return; } xnb->carrier = 1; /* Ready for I/O. */ xenbus_set_state(xnb->dev, XenbusStateConnected); } /*-------------------------- Device Teardown Support -------------------------*/ /** * Perform device shutdown functions. * * \param xnb Per-instance xnb configuration structure. * * Mark this instance as shutting down, wait for any active requests * to drain, disconnect from the front-end, and notify any waiters (e.g. * a thread invoking our detach method) that detach can now proceed. */ static int xnb_shutdown(struct xnb_softc *xnb) { /* * Due to the need to drop our mutex during some * xenbus operations, it is possible for two threads * to attempt to close out shutdown processing at * the same time. 
Tell the caller that hits this * race to try back later. */ if ((xnb->flags & XNBF_IN_SHUTDOWN) != 0) return (EAGAIN); xnb->flags |= XNBF_SHUTDOWN; xnb->flags |= XNBF_IN_SHUTDOWN; mtx_unlock(&xnb->sc_lock); /* Free the network interface */ xnb->carrier = 0; if (xnb->xnb_ifp != NULL) { ether_ifdetach(xnb->xnb_ifp); if_free(xnb->xnb_ifp); xnb->xnb_ifp = NULL; } xnb_disconnect(xnb); if (xenbus_get_state(xnb->dev) < XenbusStateClosing) xenbus_set_state(xnb->dev, XenbusStateClosing); mtx_lock(&xnb->sc_lock); xnb->flags &= ~XNBF_IN_SHUTDOWN; /* Indicate to xnb_detach() that it is safe to proceed. */ wakeup(xnb); return (0); } /** * Report an attach time error to the console and Xen, and cleanup * this instance by forcing immediate detach processing. * * \param xnb Per-instance xnb configuration structure. * \param err Errno describing the error. * \param fmt Printf style format and arguments */ static void xnb_attach_failed(struct xnb_softc *xnb, int err, const char *fmt, ...) { va_list ap; va_list ap_hotplug; va_start(ap, fmt); va_copy(ap_hotplug, ap); xs_vprintf(XST_NIL, xenbus_get_node(xnb->dev), "hotplug-error", fmt, ap_hotplug); va_end(ap_hotplug); (void)xs_printf(XST_NIL, xenbus_get_node(xnb->dev), "hotplug-status", "error"); xenbus_dev_vfatal(xnb->dev, err, fmt, ap); va_end(ap); (void)xs_printf(XST_NIL, xenbus_get_node(xnb->dev), "online", "0"); xnb_detach(xnb->dev); } /*---------------------------- NewBus Entrypoints ----------------------------*/ /** * Inspect a XenBus device and claim it if it is of the appropriate type. * * \param dev NewBus device object representing a candidate XenBus device. * * \return 0 for success, errno codes for failure. */ static int xnb_probe(device_t dev) { if (!strcmp(xenbus_get_type(dev), "vif")) { DPRINTF("Claiming device %d, %s\n", device_get_unit(dev), devclass_get_name(device_get_devclass(dev))); device_set_desc(dev, "Backend Virtual Network Device"); device_quiet(dev); return (0); } return (ENXIO); } /** * Setup sysctl variables to control various Network Back parameters. * * \param xnb Xen Net Back softc. * */ static void xnb_setup_sysctl(struct xnb_softc *xnb) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; sysctl_ctx = device_get_sysctl_ctx(xnb->dev); if (sysctl_ctx == NULL) return; sysctl_tree = device_get_sysctl_tree(xnb->dev); if (sysctl_tree == NULL) return; #ifdef XNB_DEBUG SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "unit_test_results", - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, xnb, 0, xnb_unit_test_main, "A", "Results of builtin unit tests"); SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "dump_rings", - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, xnb, 0, xnb_dump_rings, "A", "Xennet Back Rings"); #endif /* XNB_DEBUG */ } /** * Create a network device.
* @param handle device handle */ int create_netdev(device_t dev) { struct ifnet *ifp; struct xnb_softc *xnb; int err = 0; uint32_t handle; xnb = device_get_softc(dev); mtx_init(&xnb->sc_lock, "xnb_softc", "xen netback softc lock", MTX_DEF); mtx_init(&xnb->tx_lock, "xnb_tx", "xen netback tx lock", MTX_DEF); mtx_init(&xnb->rx_lock, "xnb_rx", "xen netback rx lock", MTX_DEF); xnb->dev = dev; ifmedia_init(&xnb->sc_media, 0, xnb_ifmedia_upd, xnb_ifmedia_sts); ifmedia_add(&xnb->sc_media, IFM_ETHER|IFM_MANUAL, 0, NULL); ifmedia_set(&xnb->sc_media, IFM_ETHER|IFM_MANUAL); /* * Set the MAC address to a dummy value (00:00:00:00:00), * if the MAC address of the host-facing interface is set * to the same as the guest-facing one (the value found in * xenstore), the bridge would stop delivering packets to * us because it would see that the destination address of * the packet is the same as the interface, and so the bridge * would expect the packet has already been delivered locally * (and just drop it). */ bzero(&xnb->mac[0], sizeof(xnb->mac)); /* The interface will be named using the following nomenclature: * * xnb. * * Where handle is the oder of the interface referred to the guest. */ err = xs_scanf(XST_NIL, xenbus_get_node(xnb->dev), "handle", NULL, "%" PRIu32, &handle); if (err != 0) return (err); snprintf(xnb->if_name, IFNAMSIZ, "xnb%" PRIu16 ".%" PRIu32, xenbus_get_otherend_id(dev), handle); if (err == 0) { /* Set up ifnet structure */ ifp = xnb->xnb_ifp = if_alloc(IFT_ETHER); ifp->if_softc = xnb; if_initname(ifp, xnb->if_name, IF_DUNIT_NONE); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = xnb_ioctl; ifp->if_start = xnb_start; ifp->if_init = xnb_ifinit; ifp->if_mtu = ETHERMTU; ifp->if_snd.ifq_maxlen = NET_RX_RING_SIZE - 1; ifp->if_hwassist = XNB_CSUM_FEATURES; ifp->if_capabilities = IFCAP_HWCSUM; ifp->if_capenable = IFCAP_HWCSUM; ether_ifattach(ifp, xnb->mac); xnb->carrier = 0; } return err; } /** * Attach to a XenBus device that has been claimed by our probe routine. * * \param dev NewBus device object representing this Xen Net Back instance. * * \return 0 for success, errno codes for failure. */ static int xnb_attach(device_t dev) { struct xnb_softc *xnb; int error; xnb_ring_type_t i; error = create_netdev(dev); if (error != 0) { xenbus_dev_fatal(dev, error, "creating netdev"); return (error); } DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); /* * Basic initialization. * After this block it is safe to call xnb_detach() * to clean up any allocated data for this instance. */ xnb = device_get_softc(dev); xnb->otherend_id = xenbus_get_otherend_id(dev); for (i=0; i < XNB_NUM_RING_TYPES; i++) { xnb->ring_configs[i].ring_pages = 1; } /* * Setup sysctl variables. */ xnb_setup_sysctl(xnb); /* Update hot-plug status to satisfy xend. */ error = xs_printf(XST_NIL, xenbus_get_node(xnb->dev), "hotplug-status", "connected"); if (error != 0) { xnb_attach_failed(xnb, error, "writing %s/hotplug-status", xenbus_get_node(xnb->dev)); return (error); } if ((error = xnb_publish_backend_info(xnb)) != 0) { /* * If we can't publish our data, we cannot participate * in this connection, and waiting for a front-end state * change will not help the situation. */ xnb_attach_failed(xnb, error, "Publishing backend status for %s", xenbus_get_node(xnb->dev)); return error; } /* Tell the front end that we are ready to connect. */ xenbus_set_state(dev, XenbusStateInitWait); return (0); } /** * Detach from a net back device instance. 
* * \param dev NewBus device object representing this Xen Net Back instance. * * \return 0 for success, errno codes for failure. * * \note A net back device may be detached at any time in its life-cycle, * including part way through the attach process. For this reason, * initialization order and the initialization state checks in this * routine must be carefully coupled so that attach time failures * are gracefully handled. */ static int xnb_detach(device_t dev) { struct xnb_softc *xnb; DPRINTF("\n"); xnb = device_get_softc(dev); mtx_lock(&xnb->sc_lock); while (xnb_shutdown(xnb) == EAGAIN) { msleep(xnb, &xnb->sc_lock, /*wakeup prio unchanged*/0, "xnb_shutdown", 0); } mtx_unlock(&xnb->sc_lock); DPRINTF("\n"); mtx_destroy(&xnb->tx_lock); mtx_destroy(&xnb->rx_lock); mtx_destroy(&xnb->sc_lock); return (0); } /** * Prepare this net back device for suspension of this VM. * * \param dev NewBus device object representing this Xen net Back instance. * * \return 0 for success, errno codes for failure. */ static int xnb_suspend(device_t dev) { return (0); } /** * Perform any processing required to recover from a suspended state. * * \param dev NewBus device object representing this Xen Net Back instance. * * \return 0 for success, errno codes for failure. */ static int xnb_resume(device_t dev) { return (0); } /** * Handle state changes expressed via the XenStore by our front-end peer. * * \param dev NewBus device object representing this Xen * Net Back instance. * \param frontend_state The new state of the front-end. * * \return 0 for success, errno codes for failure. */ static void xnb_frontend_changed(device_t dev, XenbusState frontend_state) { struct xnb_softc *xnb; xnb = device_get_softc(dev); DPRINTF("frontend_state=%s, xnb_state=%s\n", xenbus_strstate(frontend_state), xenbus_strstate(xenbus_get_state(xnb->dev))); switch (frontend_state) { case XenbusStateInitialising: break; case XenbusStateInitialised: case XenbusStateConnected: xnb_connect(xnb); break; case XenbusStateClosing: case XenbusStateClosed: mtx_lock(&xnb->sc_lock); xnb_shutdown(xnb); mtx_unlock(&xnb->sc_lock); if (frontend_state == XenbusStateClosed) xenbus_set_state(xnb->dev, XenbusStateClosed); break; default: xenbus_dev_fatal(xnb->dev, EINVAL, "saw state %d at frontend", frontend_state); break; } } /*---------------------------- Request Processing ----------------------------*/ /** * Interrupt handler bound to the shared ring's event channel. * Entry point for the xennet transmit path in netback * Transfers packets from the Xen ring to the host's generic networking stack * * \param arg Callback argument registerd during event channel * binding - the xnb_softc for this instance. 
*/ static void xnb_intr(void *arg) { struct xnb_softc *xnb; struct ifnet *ifp; netif_tx_back_ring_t *txb; RING_IDX req_prod_local; xnb = (struct xnb_softc *)arg; ifp = xnb->xnb_ifp; txb = &xnb->ring_configs[XNB_RING_TYPE_TX].back_ring.tx_ring; mtx_lock(&xnb->tx_lock); do { int notify; req_prod_local = txb->sring->req_prod; xen_rmb(); for (;;) { struct mbuf *mbufc; int err; err = xnb_recv(txb, xnb->otherend_id, &mbufc, ifp, xnb->tx_gnttab); if (err || (mbufc == NULL)) break; /* Send the packet to the generic network stack */ (*xnb->xnb_ifp->if_input)(xnb->xnb_ifp, mbufc); } RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(txb, notify); if (notify != 0) xen_intr_signal(xnb->xen_intr_handle); txb->sring->req_event = txb->req_cons + 1; xen_mb(); } while (txb->sring->req_prod != req_prod_local) ; mtx_unlock(&xnb->tx_lock); xnb_start(ifp); } /** * Build a struct xnb_pkt based on netif_tx_request's from a netif tx ring. * Will read exactly 0 or 1 packets from the ring; never a partial packet. * \param[out] pkt The returned packet. If there is an error building * the packet, pkt.list_len will be set to 0. * \param[in] tx_ring Pointer to the Ring that is the input to this function * \param[in] start The ring index of the first potential request * \return The number of requests consumed to build this packet */ static int xnb_ring2pkt(struct xnb_pkt *pkt, const netif_tx_back_ring_t *tx_ring, RING_IDX start) { /* * Outline: * 1) Initialize pkt * 2) Read the first request of the packet * 3) Read the extras * 4) Set cdr * 5) Loop on the remainder of the packet * 6) Finalize pkt (stuff like car_size and list_len) */ int idx = start; int discard = 0; /* whether to discard the packet */ int more_data = 0; /* there are more request past the last one */ uint16_t cdr_size = 0; /* accumulated size of requests 2 through n */ xnb_pkt_initialize(pkt); /* Read the first request */ if (RING_HAS_UNCONSUMED_REQUESTS_2(tx_ring, idx)) { netif_tx_request_t *tx = RING_GET_REQUEST(tx_ring, idx); pkt->size = tx->size; pkt->flags = tx->flags & ~NETTXF_more_data; more_data = tx->flags & NETTXF_more_data; pkt->list_len++; pkt->car = idx; idx++; } /* Read the extra info */ if ((pkt->flags & NETTXF_extra_info) && RING_HAS_UNCONSUMED_REQUESTS_2(tx_ring, idx)) { netif_extra_info_t *ext = (netif_extra_info_t*) RING_GET_REQUEST(tx_ring, idx); pkt->extra.type = ext->type; switch (pkt->extra.type) { case XEN_NETIF_EXTRA_TYPE_GSO: pkt->extra.u.gso = ext->u.gso; break; default: /* * The reference Linux netfront driver will * never set any other extra.type. So we don't * know what to do with it. Let's print an * error, then consume and discard the packet */ printf("xnb(%s:%d): Unknown extra info type %d." " Discarding packet\n", __func__, __LINE__, pkt->extra.type); xnb_dump_txreq(start, RING_GET_REQUEST(tx_ring, start)); xnb_dump_txreq(idx, RING_GET_REQUEST(tx_ring, idx)); discard = 1; break; } pkt->extra.flags = ext->flags; if (ext->flags & XEN_NETIF_EXTRA_FLAG_MORE) { /* * The reference linux netfront driver never sets this * flag (nor does any other known netfront). So we * will discard the packet. */ printf("xnb(%s:%d): Request sets " "XEN_NETIF_EXTRA_FLAG_MORE, but we can't handle " "that\n", __func__, __LINE__); xnb_dump_txreq(start, RING_GET_REQUEST(tx_ring, start)); xnb_dump_txreq(idx, RING_GET_REQUEST(tx_ring, idx)); discard = 1; } idx++; } /* Set cdr. 
If there is not more data, cdr is invalid */ pkt->cdr = idx; /* Loop on remainder of packet */ while (more_data && RING_HAS_UNCONSUMED_REQUESTS_2(tx_ring, idx)) { netif_tx_request_t *tx = RING_GET_REQUEST(tx_ring, idx); pkt->list_len++; cdr_size += tx->size; if (tx->flags & ~NETTXF_more_data) { /* There should be no other flags set at this point */ printf("xnb(%s:%d): Request sets unknown flags %d " "after the 1st request in the packet.\n", __func__, __LINE__, tx->flags); xnb_dump_txreq(start, RING_GET_REQUEST(tx_ring, start)); xnb_dump_txreq(idx, RING_GET_REQUEST(tx_ring, idx)); } more_data = tx->flags & NETTXF_more_data; idx++; } /* Finalize packet */ if (more_data != 0) { /* The ring ran out of requests before finishing the packet */ xnb_pkt_invalidate(pkt); idx = start; /* tell caller that we consumed no requests */ } else { /* Calculate car_size */ pkt->car_size = pkt->size - cdr_size; } if (discard != 0) { xnb_pkt_invalidate(pkt); } return idx - start; } /** * Respond to all the requests that constituted pkt. Builds the responses and * writes them to the ring, but doesn't push them to the shared ring. * \param[in] pkt the packet that needs a response * \param[in] error true if there was an error handling the packet, such * as in the hypervisor copy op or mbuf allocation * \param[out] ring Responses go here */ static void xnb_txpkt2rsp(const struct xnb_pkt *pkt, netif_tx_back_ring_t *ring, int error) { /* * Outline: * 1) Respond to the first request * 2) Respond to the extra info reques * Loop through every remaining request in the packet, generating * responses that copy those requests' ids and sets the status * appropriately. */ netif_tx_request_t *tx; netif_tx_response_t *rsp; int i; uint16_t status; status = (xnb_pkt_is_valid(pkt) == 0) || error ? NETIF_RSP_ERROR : NETIF_RSP_OKAY; KASSERT((pkt->list_len == 0) || (ring->rsp_prod_pvt == pkt->car), ("Cannot respond to ring requests out of order")); if (pkt->list_len >= 1) { uint16_t id; tx = RING_GET_REQUEST(ring, ring->rsp_prod_pvt); id = tx->id; rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt); rsp->id = id; rsp->status = status; ring->rsp_prod_pvt++; if (pkt->flags & NETRXF_extra_info) { rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt); rsp->status = NETIF_RSP_NULL; ring->rsp_prod_pvt++; } } for (i=0; i < pkt->list_len - 1; i++) { uint16_t id; tx = RING_GET_REQUEST(ring, ring->rsp_prod_pvt); id = tx->id; rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt); rsp->id = id; rsp->status = status; ring->rsp_prod_pvt++; } } /** * Create an mbuf chain to represent a packet. Initializes all of the headers * in the mbuf chain, but does not copy the data. The returned chain must be * free()'d when no longer needed * \param[in] pkt A packet to model the mbuf chain after * \return A newly allocated mbuf chain, possibly with clusters attached. * NULL on failure */ static struct mbuf* xnb_pkt2mbufc(const struct xnb_pkt *pkt, struct ifnet *ifp) { /** * \todo consider using a memory pool for mbufs instead of * reallocating them for every packet */ /** \todo handle extra data */ struct mbuf *m; m = m_getm(NULL, pkt->size, M_NOWAIT, MT_DATA); if (m != NULL) { m->m_pkthdr.rcvif = ifp; if (pkt->flags & NETTXF_data_validated) { /* * We lie to the host OS and always tell it that the * checksums are ok, because the packet is unlikely to * get corrupted going across domains. 
*/ m->m_pkthdr.csum_flags = ( CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR ); m->m_pkthdr.csum_data = 0xffff; } } return m; } /** * Build a gnttab_copy table that can be used to copy data from a pkt * to an mbufc. Does not actually perform the copy. Always uses gref's on * the packet side. * \param[in] pkt pkt's associated requests form the src for * the copy operation * \param[in] mbufc mbufc's storage forms the dest for the copy operation * \param[out] gnttab Storage for the returned grant table * \param[in] txb Pointer to the backend ring structure * \param[in] otherend_id The domain ID of the other end of the copy * \return The number of gnttab entries filled */ static int xnb_txpkt2gnttab(const struct xnb_pkt *pkt, struct mbuf *mbufc, gnttab_copy_table gnttab, const netif_tx_back_ring_t *txb, domid_t otherend_id) { struct mbuf *mbuf = mbufc;/* current mbuf within the chain */ int gnt_idx = 0; /* index into grant table */ RING_IDX r_idx = pkt->car; /* index into tx ring buffer */ int r_ofs = 0; /* offset of next data within tx request's data area */ int m_ofs = 0; /* offset of next data within mbuf's data area */ /* size in bytes that still needs to be represented in the table */ uint16_t size_remaining = pkt->size; while (size_remaining > 0) { const netif_tx_request_t *txq = RING_GET_REQUEST(txb, r_idx); const size_t mbuf_space = M_TRAILINGSPACE(mbuf) - m_ofs; const size_t req_size = r_idx == pkt->car ? pkt->car_size : txq->size; const size_t pkt_space = req_size - r_ofs; /* * space is the largest amount of data that can be copied in the * grant table's next entry */ const size_t space = MIN(pkt_space, mbuf_space); /* TODO: handle this error condition without panicking */ KASSERT(gnt_idx < GNTTAB_LEN, ("Grant table is too short")); gnttab[gnt_idx].source.u.ref = txq->gref; gnttab[gnt_idx].source.domid = otherend_id; gnttab[gnt_idx].source.offset = txq->offset + r_ofs; gnttab[gnt_idx].dest.u.gmfn = virt_to_mfn( mtod(mbuf, vm_offset_t) + m_ofs); gnttab[gnt_idx].dest.offset = virt_to_offset( mtod(mbuf, vm_offset_t) + m_ofs); gnttab[gnt_idx].dest.domid = DOMID_SELF; gnttab[gnt_idx].len = space; gnttab[gnt_idx].flags = GNTCOPY_source_gref; gnt_idx++; r_ofs += space; m_ofs += space; size_remaining -= space; if (req_size - r_ofs <= 0) { /* Must move to the next tx request */ r_ofs = 0; r_idx = (r_idx == pkt->car) ? pkt->cdr : r_idx + 1; } if (M_TRAILINGSPACE(mbuf) - m_ofs <= 0) { /* Must move to the next mbuf */ m_ofs = 0; mbuf = mbuf->m_next; } } return gnt_idx; } /** * Check the status of the grant copy operations, and update mbufs various * non-data fields to reflect the data present. * \param[in,out] mbufc mbuf chain to update. The chain must be valid and of * the correct length, and data should already be present * \param[in] gnttab A grant table for a just completed copy op * \param[in] n_entries The number of valid entries in the grant table */ static void xnb_update_mbufc(struct mbuf *mbufc, const gnttab_copy_table gnttab, int n_entries) { struct mbuf *mbuf = mbufc; int i; size_t total_size = 0; for (i = 0; i < n_entries; i++) { KASSERT(gnttab[i].status == GNTST_okay, ("Some gnttab_copy entry had error status %hd\n", gnttab[i].status)); mbuf->m_len += gnttab[i].len; total_size += gnttab[i].len; if (M_TRAILINGSPACE(mbuf) <= 0) { mbuf = mbuf->m_next; } } mbufc->m_pkthdr.len = total_size; #if defined(INET) || defined(INET6) xnb_add_mbuf_cksum(mbufc); #endif } /** * Dequeue at most one packet from the shared ring * \param[in,out] txb Netif tx ring. 
A packet will be removed from it, and * its private indices will be updated. But the indices * will not be pushed to the shared ring. * \param[in] ifnet Interface to which the packet will be sent * \param[in] otherend Domain ID of the other end of the ring * \param[out] mbufc The assembled mbuf chain, ready to send to the generic * networking stack * \param[in,out] gnttab Pointer to enough memory for a grant table. We make * this a function parameter so that we will take less * stack space. * \return An error code */ static int xnb_recv(netif_tx_back_ring_t *txb, domid_t otherend, struct mbuf **mbufc, struct ifnet *ifnet, gnttab_copy_table gnttab) { struct xnb_pkt pkt; /* number of tx requests consumed to build the last packet */ int num_consumed; int nr_ents; *mbufc = NULL; num_consumed = xnb_ring2pkt(&pkt, txb, txb->req_cons); if (num_consumed == 0) return 0; /* Nothing to receive */ /* update statistics independent of errors */ if_inc_counter(ifnet, IFCOUNTER_IPACKETS, 1); /* * if we got here, then 1 or more requests was consumed, but the packet * is not necessarily valid. */ if (xnb_pkt_is_valid(&pkt) == 0) { /* got a garbage packet, respond and drop it */ xnb_txpkt2rsp(&pkt, txb, 1); txb->req_cons += num_consumed; DPRINTF("xnb_intr: garbage packet, num_consumed=%d\n", num_consumed); if_inc_counter(ifnet, IFCOUNTER_IERRORS, 1); return EINVAL; } *mbufc = xnb_pkt2mbufc(&pkt, ifnet); if (*mbufc == NULL) { /* * Couldn't allocate mbufs. Respond and drop the packet. Do * not consume the requests */ xnb_txpkt2rsp(&pkt, txb, 1); DPRINTF("xnb_intr: Couldn't allocate mbufs, num_consumed=%d\n", num_consumed); if_inc_counter(ifnet, IFCOUNTER_IQDROPS, 1); return ENOMEM; } nr_ents = xnb_txpkt2gnttab(&pkt, *mbufc, gnttab, txb, otherend); if (nr_ents > 0) { int __unused hv_ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, gnttab, nr_ents); KASSERT(hv_ret == 0, ("HYPERVISOR_grant_table_op returned %d\n", hv_ret)); xnb_update_mbufc(*mbufc, gnttab, nr_ents); } xnb_txpkt2rsp(&pkt, txb, 0); txb->req_cons += num_consumed; return 0; } /** * Create an xnb_pkt based on the contents of an mbuf chain. * \param[in] mbufc mbuf chain to transform into a packet * \param[out] pkt Storage for the newly generated xnb_pkt * \param[in] start The ring index of the first available slot in the rx * ring * \param[in] space The number of free slots in the rx ring * \retval 0 Success * \retval EINVAL mbufc was corrupt or not convertible into a pkt * \retval EAGAIN There was not enough space in the ring to queue the * packet */ static int xnb_mbufc2pkt(const struct mbuf *mbufc, struct xnb_pkt *pkt, RING_IDX start, int space) { int retval = 0; if ((mbufc == NULL) || ( (mbufc->m_flags & M_PKTHDR) == 0) || (mbufc->m_pkthdr.len == 0)) { xnb_pkt_invalidate(pkt); retval = EINVAL; } else { int slots_required; xnb_pkt_validate(pkt); pkt->flags = 0; pkt->size = mbufc->m_pkthdr.len; pkt->car = start; pkt->car_size = mbufc->m_len; if (mbufc->m_pkthdr.csum_flags & CSUM_TSO) { pkt->flags |= NETRXF_extra_info; pkt->extra.u.gso.size = mbufc->m_pkthdr.tso_segsz; pkt->extra.u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; pkt->extra.u.gso.pad = 0; pkt->extra.u.gso.features = 0; pkt->extra.type = XEN_NETIF_EXTRA_TYPE_GSO; pkt->extra.flags = 0; pkt->cdr = start + 2; } else { pkt->cdr = start + 1; } if (mbufc->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_DELAY_DATA)) { pkt->flags |= (NETRXF_csum_blank | NETRXF_data_validated); } /* * Each ring response can have up to PAGE_SIZE of data. 
* Assume that we can defragment the mbuf chain efficiently * into responses so that each response but the last uses all * PAGE_SIZE bytes. */ pkt->list_len = howmany(pkt->size, PAGE_SIZE); if (pkt->list_len > 1) { pkt->flags |= NETRXF_more_data; } slots_required = pkt->list_len + (pkt->flags & NETRXF_extra_info ? 1 : 0); if (slots_required > space) { xnb_pkt_invalidate(pkt); retval = EAGAIN; } } return retval; } /** * Build a gnttab_copy table that can be used to copy data from an mbuf chain * to the frontend's shared buffers. Does not actually perform the copy. * Always uses gref's on the other end's side. * \param[in] pkt pkt's associated responses form the dest for the copy * operatoin * \param[in] mbufc The source for the copy operation * \param[out] gnttab Storage for the returned grant table * \param[in] rxb Pointer to the backend ring structure * \param[in] otherend_id The domain ID of the other end of the copy * \return The number of gnttab entries filled */ static int xnb_rxpkt2gnttab(const struct xnb_pkt *pkt, const struct mbuf *mbufc, gnttab_copy_table gnttab, const netif_rx_back_ring_t *rxb, domid_t otherend_id) { const struct mbuf *mbuf = mbufc;/* current mbuf within the chain */ int gnt_idx = 0; /* index into grant table */ RING_IDX r_idx = pkt->car; /* index into rx ring buffer */ int r_ofs = 0; /* offset of next data within rx request's data area */ int m_ofs = 0; /* offset of next data within mbuf's data area */ /* size in bytes that still needs to be represented in the table */ uint16_t size_remaining; size_remaining = (xnb_pkt_is_valid(pkt) != 0) ? pkt->size : 0; while (size_remaining > 0) { const netif_rx_request_t *rxq = RING_GET_REQUEST(rxb, r_idx); const size_t mbuf_space = mbuf->m_len - m_ofs; /* Xen shared pages have an implied size of PAGE_SIZE */ const size_t req_size = PAGE_SIZE; const size_t pkt_space = req_size - r_ofs; /* * space is the largest amount of data that can be copied in the * grant table's next entry */ const size_t space = MIN(pkt_space, mbuf_space); /* TODO: handle this error condition without panicing */ KASSERT(gnt_idx < GNTTAB_LEN, ("Grant table is too short")); gnttab[gnt_idx].dest.u.ref = rxq->gref; gnttab[gnt_idx].dest.domid = otherend_id; gnttab[gnt_idx].dest.offset = r_ofs; gnttab[gnt_idx].source.u.gmfn = virt_to_mfn( mtod(mbuf, vm_offset_t) + m_ofs); gnttab[gnt_idx].source.offset = virt_to_offset( mtod(mbuf, vm_offset_t) + m_ofs); gnttab[gnt_idx].source.domid = DOMID_SELF; gnttab[gnt_idx].len = space; gnttab[gnt_idx].flags = GNTCOPY_dest_gref; gnt_idx++; r_ofs += space; m_ofs += space; size_remaining -= space; if (req_size - r_ofs <= 0) { /* Must move to the next rx request */ r_ofs = 0; r_idx = (r_idx == pkt->car) ? pkt->cdr : r_idx + 1; } if (mbuf->m_len - m_ofs <= 0) { /* Must move to the next mbuf */ m_ofs = 0; mbuf = mbuf->m_next; } } return gnt_idx; } /** * Generates responses for all the requests that constituted pkt. Builds * responses and writes them to the ring, but doesn't push the shared ring * indices. * \param[in] pkt the packet that needs a response * \param[in] gnttab The grant copy table corresponding to this packet. * Used to determine how many rsp->netif_rx_response_t's to * generate. 
* \param[in] n_entries Number of relevant entries in the grant table * \param[out] ring Responses go here * \return The number of RX requests that were consumed to generate * the responses */ static int xnb_rxpkt2rsp(const struct xnb_pkt *pkt, const gnttab_copy_table gnttab, int n_entries, netif_rx_back_ring_t *ring) { /* * This code makes the following assumptions: * * All entries in gnttab set GNTCOPY_dest_gref * * The entries in gnttab are grouped by their grefs: any two * entries with the same gref must be adjacent */ int error = 0; int gnt_idx, i; int n_responses = 0; grant_ref_t last_gref = GRANT_REF_INVALID; RING_IDX r_idx; KASSERT(gnttab != NULL, ("Received a null granttable copy")); /* * In the event of an error, we only need to send one response to the * netfront. In that case, we musn't write any data to the responses * after the one we send. So we must loop all the way through gnttab * looking for errors before we generate any responses * * Since we're looping through the grant table anyway, we'll count the * number of different gref's in it, which will tell us how many * responses to generate */ for (gnt_idx = 0; gnt_idx < n_entries; gnt_idx++) { int16_t status = gnttab[gnt_idx].status; if (status != GNTST_okay) { DPRINTF( "Got error %d for hypervisor gnttab_copy status\n", status); error = 1; break; } if (gnttab[gnt_idx].dest.u.ref != last_gref) { n_responses++; last_gref = gnttab[gnt_idx].dest.u.ref; } } if (error != 0) { uint16_t id; netif_rx_response_t *rsp; id = RING_GET_REQUEST(ring, ring->rsp_prod_pvt)->id; rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt); rsp->id = id; rsp->status = NETIF_RSP_ERROR; n_responses = 1; } else { gnt_idx = 0; const int has_extra = pkt->flags & NETRXF_extra_info; if (has_extra != 0) n_responses++; for (i = 0; i < n_responses; i++) { netif_rx_request_t rxq; netif_rx_response_t *rsp; r_idx = ring->rsp_prod_pvt + i; /* * We copy the structure of rxq instead of making a * pointer because it shares the same memory as rsp. */ rxq = *(RING_GET_REQUEST(ring, r_idx)); rsp = RING_GET_RESPONSE(ring, r_idx); if (has_extra && (i == 1)) { netif_extra_info_t *ext = (netif_extra_info_t*)rsp; ext->type = XEN_NETIF_EXTRA_TYPE_GSO; ext->flags = 0; ext->u.gso.size = pkt->extra.u.gso.size; ext->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; ext->u.gso.pad = 0; ext->u.gso.features = 0; } else { rsp->id = rxq.id; rsp->status = GNTST_okay; rsp->offset = 0; rsp->flags = 0; if (i < pkt->list_len - 1) rsp->flags |= NETRXF_more_data; if ((i == 0) && has_extra) rsp->flags |= NETRXF_extra_info; if ((i == 0) && (pkt->flags & NETRXF_data_validated)) { rsp->flags |= NETRXF_data_validated; rsp->flags |= NETRXF_csum_blank; } rsp->status = 0; for (; gnttab[gnt_idx].dest.u.ref == rxq.gref; gnt_idx++) { rsp->status += gnttab[gnt_idx].len; } } } } ring->req_cons += n_responses; ring->rsp_prod_pvt += n_responses; return n_responses; } #if defined(INET) || defined(INET6) /** * Add IP, TCP, and/or UDP checksums to every mbuf in a chain. The first mbuf * in the chain must start with a struct ether_header. * * XXX This function will perform incorrectly on UDP packets that are split up * into multiple ethernet frames. 
*/ static void xnb_add_mbuf_cksum(struct mbuf *mbufc) { struct ether_header *eh; struct ip *iph; uint16_t ether_type; eh = mtod(mbufc, struct ether_header*); ether_type = ntohs(eh->ether_type); if (ether_type != ETHERTYPE_IP) { /* Nothing to calculate */ return; } iph = (struct ip*)(eh + 1); if (mbufc->m_pkthdr.csum_flags & CSUM_IP_VALID) { iph->ip_sum = 0; iph->ip_sum = in_cksum_hdr(iph); } switch (iph->ip_p) { case IPPROTO_TCP: if (mbufc->m_pkthdr.csum_flags & CSUM_IP_VALID) { size_t tcplen = ntohs(iph->ip_len) - sizeof(struct ip); struct tcphdr *th = (struct tcphdr*)(iph + 1); th->th_sum = in_pseudo(iph->ip_src.s_addr, iph->ip_dst.s_addr, htons(IPPROTO_TCP + tcplen)); th->th_sum = in_cksum_skip(mbufc, sizeof(struct ether_header) + ntohs(iph->ip_len), sizeof(struct ether_header) + (iph->ip_hl << 2)); } break; case IPPROTO_UDP: if (mbufc->m_pkthdr.csum_flags & CSUM_IP_VALID) { size_t udplen = ntohs(iph->ip_len) - sizeof(struct ip); struct udphdr *uh = (struct udphdr*)(iph + 1); uh->uh_sum = in_pseudo(iph->ip_src.s_addr, iph->ip_dst.s_addr, htons(IPPROTO_UDP + udplen)); uh->uh_sum = in_cksum_skip(mbufc, sizeof(struct ether_header) + ntohs(iph->ip_len), sizeof(struct ether_header) + (iph->ip_hl << 2)); } break; default: break; } } #endif /* INET || INET6 */ static void xnb_stop(struct xnb_softc *xnb) { struct ifnet *ifp; mtx_assert(&xnb->sc_lock, MA_OWNED); ifp = xnb->xnb_ifp; ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if_link_state_change(ifp, LINK_STATE_DOWN); } static int xnb_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct xnb_softc *xnb = ifp->if_softc; struct ifreq *ifr = (struct ifreq*) data; #ifdef INET struct ifaddr *ifa = (struct ifaddr*)data; #endif int error = 0; switch (cmd) { case SIOCSIFFLAGS: mtx_lock(&xnb->sc_lock); if (ifp->if_flags & IFF_UP) { xnb_ifinit_locked(xnb); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { xnb_stop(xnb); } } /* * Note: netfront sets a variable named xn_if_flags * here, but that variable is never read */ mtx_unlock(&xnb->sc_lock); break; case SIOCSIFADDR: #ifdef INET mtx_lock(&xnb->sc_lock); if (ifa->ifa_addr->sa_family == AF_INET) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if_link_state_change(ifp, LINK_STATE_DOWN); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_UP); } arp_ifinit(ifp, ifa); mtx_unlock(&xnb->sc_lock); } else { mtx_unlock(&xnb->sc_lock); #endif error = ether_ioctl(ifp, cmd, data); #ifdef INET } #endif break; case SIOCSIFCAP: mtx_lock(&xnb->sc_lock); if (ifr->ifr_reqcap & IFCAP_TXCSUM) { ifp->if_capenable |= IFCAP_TXCSUM; ifp->if_hwassist |= XNB_CSUM_FEATURES; } else { ifp->if_capenable &= ~(IFCAP_TXCSUM); ifp->if_hwassist &= ~(XNB_CSUM_FEATURES); } if ((ifr->ifr_reqcap & IFCAP_RXCSUM)) { ifp->if_capenable |= IFCAP_RXCSUM; } else { ifp->if_capenable &= ~(IFCAP_RXCSUM); } /* * TODO enable TSO4 and LRO once we no longer need * to calculate checksums in software */ #if 0 if (ifr->if_reqcap |= IFCAP_TSO4) { if (IFCAP_TXCSUM & ifp->if_capenable) { printf("xnb: Xen netif requires that " "TXCSUM be enabled in order " "to use TSO4\n"); error = EINVAL; } else { ifp->if_capenable |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_TSO; } } else { ifp->if_capenable &= ~(IFCAP_TSO4); ifp->if_hwassist &= ~(CSUM_TSO); } if (ifr->ifreqcap |= IFCAP_LRO) { ifp->if_capenable |= IFCAP_LRO; } else { ifp->if_capenable &= ~(IFCAP_LRO); } #endif mtx_unlock(&xnb->sc_lock); break; case 
SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; xnb_ifinit(xnb); break; case SIOCADDMULTI: case SIOCDELMULTI: case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &xnb->sc_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void xnb_start_locked(struct ifnet *ifp) { netif_rx_back_ring_t *rxb; struct xnb_softc *xnb; struct mbuf *mbufc; RING_IDX req_prod_local; xnb = ifp->if_softc; rxb = &xnb->ring_configs[XNB_RING_TYPE_RX].back_ring.rx_ring; if (!xnb->carrier) return; do { int out_of_space = 0; int notify; req_prod_local = rxb->sring->req_prod; xen_rmb(); for (;;) { int error; IF_DEQUEUE(&ifp->if_snd, mbufc); if (mbufc == NULL) break; error = xnb_send(rxb, xnb->otherend_id, mbufc, xnb->rx_gnttab); switch (error) { case EAGAIN: /* * Insufficient space in the ring. * Requeue pkt and send when space is * available. */ IF_PREPEND(&ifp->if_snd, mbufc); /* * Perhaps the frontend missed an IRQ * and went to sleep. Notify it to wake * it up. */ out_of_space = 1; break; case EINVAL: /* OS gave a corrupt packet. Drop it.*/ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); /* FALLTHROUGH */ default: /* Send succeeded, or packet had error. * Free the packet */ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if (mbufc) m_freem(mbufc); break; } if (out_of_space != 0) break; } RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(rxb, notify); if ((notify != 0) || (out_of_space != 0)) xen_intr_signal(xnb->xen_intr_handle); rxb->sring->req_event = req_prod_local + 1; xen_mb(); } while (rxb->sring->req_prod != req_prod_local) ; } /** * Sends one packet to the ring. Blocks until the packet is on the ring * \param[in] mbufc Contains one packet to send. Caller must free * \param[in,out] rxb The packet will be pushed onto this ring, but the * otherend will not be notified. * \param[in] otherend The domain ID of the other end of the connection * \retval EAGAIN The ring did not have enough space for the packet. * The ring has not been modified * \param[in,out] gnttab Pointer to enough memory for a grant table. We make * this a function parameter so that we will take less * stack space. 
* \retval EINVAL mbufc was corrupt or not convertible into a pkt */ static int xnb_send(netif_rx_back_ring_t *ring, domid_t otherend, const struct mbuf *mbufc, gnttab_copy_table gnttab) { struct xnb_pkt pkt; int error, n_entries, n_reqs; RING_IDX space; space = ring->sring->req_prod - ring->req_cons; error = xnb_mbufc2pkt(mbufc, &pkt, ring->rsp_prod_pvt, space); if (error != 0) return error; n_entries = xnb_rxpkt2gnttab(&pkt, mbufc, gnttab, ring, otherend); if (n_entries != 0) { int __unused hv_ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, gnttab, n_entries); KASSERT(hv_ret == 0, ("HYPERVISOR_grant_table_op returned %d\n", hv_ret)); } n_reqs = xnb_rxpkt2rsp(&pkt, gnttab, n_entries, ring); return 0; } static void xnb_start(struct ifnet *ifp) { struct xnb_softc *xnb; xnb = ifp->if_softc; mtx_lock(&xnb->rx_lock); xnb_start_locked(ifp); mtx_unlock(&xnb->rx_lock); } /* equivalent of network_open() in Linux */ static void xnb_ifinit_locked(struct xnb_softc *xnb) { struct ifnet *ifp; ifp = xnb->xnb_ifp; mtx_assert(&xnb->sc_lock, MA_OWNED); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; xnb_stop(xnb); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_UP); } static void xnb_ifinit(void *xsc) { struct xnb_softc *xnb = xsc; mtx_lock(&xnb->sc_lock); xnb_ifinit_locked(xnb); mtx_unlock(&xnb->sc_lock); } /** * Callback used by the generic networking code to tell us when our carrier * state has changed. Since we don't have a physical carrier, we don't care */ static int xnb_ifmedia_upd(struct ifnet *ifp) { return (0); } /** * Callback used by the generic networking code to ask us what our carrier * state is. Since we don't have a physical carrier, this is very simple */ static void xnb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { ifmr->ifm_status = IFM_AVALID|IFM_ACTIVE; ifmr->ifm_active = IFM_ETHER|IFM_MANUAL; } /*---------------------------- NewBus Registration ---------------------------*/ static device_method_t xnb_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xnb_probe), DEVMETHOD(device_attach, xnb_attach), DEVMETHOD(device_detach, xnb_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xnb_suspend), DEVMETHOD(device_resume, xnb_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, xnb_frontend_changed), { 0, 0 } }; static driver_t xnb_driver = { "xnb", xnb_methods, sizeof(struct xnb_softc), }; devclass_t xnb_devclass; DRIVER_MODULE(xnb, xenbusb_back, xnb_driver, xnb_devclass, 0, 0); /*-------------------------- Unit Tests -------------------------------------*/ #ifdef XNB_DEBUG #include "netback_unit_tests.c" #endif Index: head/sys/dev/xen/xenstore/xenstore.c =================================================================== --- head/sys/dev/xen/xenstore/xenstore.c (revision 358315) +++ head/sys/dev/xen/xenstore/xenstore.c (revision 358316) @@ -1,1658 +1,1659 @@ /****************************************************************************** * xenstore.c * * Low-level kernel interface to the XenStore. 
* * Copyright (C) 2005 Rusty Russell, IBM Corporation * Copyright (C) 2009,2010 Spectra Logic Corporation * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /** * \file xenstore.c * \brief XenStore interface * * The XenStore interface is a simple storage system that is a means of * communicating state and configuration data between the Xen Domain 0 * and the various guest domains. All configuration data other than * a small amount of essential information required during the early * boot process of launching a Xen aware guest, is managed using the * XenStore. * * The XenStore is ASCII string based, and has a structure and semantics * similar to a filesystem. There are files and directories, the directories * able to contain files or other directories. The depth of the hierarchy * is only limited by the XenStore's maximum path length. * * The communication channel between the XenStore service and other * domains is via two, guest specific, ring buffers in a shared memory * area. One ring buffer is used for communicating in each direction. * The grant table references for this shared memory are given to the * guest either via the xen_start_info structure for a fully para- * virtualized guest, or via HVM hypercalls for a hardware virtualized * guest. * * The XenStore communication relies on an event channel and thus * interrupts. For this reason, the attachment of the XenStore * relies on an interrupt driven configuration hook to hold off * boot processing until communication with the XenStore service * can be established. * * Several Xen services depend on the XenStore, most notably the * XenBus used to discover and manage Xen devices. These services * are implemented as NewBus child attachments to a bus exported * by this XenStore driver. */ static struct xs_watch *find_watch(const char *token); MALLOC_DEFINE(M_XENSTORE, "xenstore", "XenStore data and results"); /** * Pointer to shared memory communication structures allowing us * to communicate with the XenStore service. * * When operating in full PV mode, this pointer is set early in kernel * startup from within xen_machdep.c. 
In HVM mode, we use hypercalls * to get the guest frame number for the shared page and then map it * into kva. See xs_init() for details. */ static struct xenstore_domain_interface *xen_store; /*-------------------------- Private Data Structures ------------------------*/ /** * Structure capturing messages received from the XenStore service. */ struct xs_stored_msg { TAILQ_ENTRY(xs_stored_msg) list; struct xsd_sockmsg hdr; union { /* Queued replies. */ struct { char *body; } reply; /* Queued watch events. */ struct { struct xs_watch *handle; const char **vec; u_int vec_size; } watch; } u; }; TAILQ_HEAD(xs_stored_msg_list, xs_stored_msg); /** * Container for all XenStore related state. */ struct xs_softc { /** Newbus device for the XenStore. */ device_t xs_dev; /** * Lock serializing access to ring producer/consumer * indexes. Use of this lock guarantees that wakeups * of blocking readers/writers are not missed due to * races with the XenStore service. */ struct mtx ring_lock; /* * Mutex used to insure exclusive access to the outgoing * communication ring. We use a lock type that can be * held while sleeping so that xs_write() can block waiting * for space in the ring to free up, without allowing another * writer to come in and corrupt a partial message write. */ struct sx request_mutex; /** * A list of replies to our requests. * * The reply list is filled by xs_rcv_thread(). It * is consumed by the context that issued the request * to which a reply is made. The requester blocks in * xs_read_reply(). * * /note Only one requesting context can be active at a time. * This is guaranteed by the request_mutex and insures * that the requester sees replies matching the order * of its requests. */ struct xs_stored_msg_list reply_list; /** Lock protecting the reply list. */ struct mtx reply_lock; /** * List of registered watches. */ struct xs_watch_list registered_watches; /** Lock protecting the registered watches list. */ struct mtx registered_watches_lock; /** * List of pending watch callback events. */ struct xs_stored_msg_list watch_events; /** Lock protecting the watch calback list. */ struct mtx watch_events_lock; /** * The processid of the xenwatch thread. */ pid_t xenwatch_pid; /** * Sleepable mutex used to gate the execution of XenStore * watch event callbacks. * * xenwatch_thread holds an exclusive lock on this mutex * while delivering event callbacks, and xenstore_unregister_watch() * uses an exclusive lock of this mutex to guarantee that no * callbacks of the just unregistered watch are pending * before returning to its caller. */ struct sx xenwatch_mutex; /** * The HVM guest pseudo-physical frame number. This is Xen's mapping * of the true machine frame number into our "physical address space". */ unsigned long gpfn; /** * The event channel for communicating with the * XenStore service. */ int evtchn; /** Handle for XenStore interrupts. */ xen_intr_handle_t xen_intr_handle; /** * Interrupt driven config hook allowing us to defer * attaching children until interrupts (and thus communication * with the XenStore service) are available. */ struct intr_config_hook xs_attachcb; /** * Xenstore is a user-space process that usually runs in Dom0, * so if this domain is booting as Dom0, xenstore wont we accessible, * and we have to defer the initialization of xenstore related * devices to later (when xenstore is started). */ bool initialized; /** * Task to run when xenstore is initialized (Dom0 only), will * take care of attaching xenstore related devices. 
*/ struct task xs_late_init; }; /*-------------------------------- Global Data ------------------------------*/ static struct xs_softc xs; /*------------------------- Private Utility Functions -----------------------*/ /** * Count and optionally record pointers to a number of NUL terminated * strings in a buffer. * * \param strings A pointer to a contiguous buffer of NUL terminated strings. * \param dest An array to store pointers to each string found in strings. * \param len The length of the buffer pointed to by strings. * * \return A count of the number of strings found. */ static u_int extract_strings(const char *strings, const char **dest, u_int len) { u_int num; const char *p; for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1) { if (dest != NULL) *dest++ = p; num++; } return (num); } /** * Convert a contiguous buffer containing a series of NUL terminated * strings into an array of pointers to strings. * * The returned pointer references the array of string pointers which * is followed by the storage for the string data. It is the client's * responsibility to free this storage. * * The storage addressed by strings is free'd prior to split returning. * * \param strings A pointer to a contiguous buffer of NUL terminated strings. * \param len The length of the buffer pointed to by strings. * \param num The number of strings found and returned in the strings * array. * * \return An array of pointers to the strings found in the input buffer. */ static const char ** split(char *strings, u_int len, u_int *num) { const char **ret; /* Protect against unterminated buffers. */ if (len > 0) strings[len - 1] = '\0'; /* Count the strings. */ *num = extract_strings(strings, /*dest*/NULL, len); /* Transfer to one big alloc for easy freeing by the caller. */ ret = malloc(*num * sizeof(char *) + len, M_XENSTORE, M_WAITOK); memcpy(&ret[*num], strings, len); free(strings, M_XENSTORE); /* Extract pointers to newly allocated array. */ strings = (char *)&ret[*num]; (void)extract_strings(strings, /*dest*/ret, len); return (ret); } /*------------------------- Public Utility Functions -------------------------*/ /*------- API comments for these methods can be found in xenstorevar.h -------*/ struct sbuf * xs_join(const char *dir, const char *name) { struct sbuf *sb; sb = sbuf_new_auto(); sbuf_cat(sb, dir); if (name[0] != '\0') { sbuf_putc(sb, '/'); sbuf_cat(sb, name); } sbuf_finish(sb); return (sb); } /*-------------------- Low Level Communication Management --------------------*/ /** * Interrupt handler for the XenStore event channel. * * XenStore reads and writes block on "xen_store" for buffer * space. Wakeup any blocking operations when the XenStore * service has modified the queues. */ static void xs_intr(void * arg __unused /*__attribute__((unused))*/) { /* If xenstore has not been initialized, initialize it now */ if (!xs.initialized) { xs.initialized = true; /* * Since this task is probing and attaching devices we * have to hold the Giant lock. */ taskqueue_enqueue(taskqueue_swi_giant, &xs.xs_late_init); } /* * Hold ring lock across wakeup so that clients * cannot miss a wakeup. */ mtx_lock(&xs.ring_lock); wakeup(xen_store); mtx_unlock(&xs.ring_lock); } /** * Verify that the indexes for a ring are valid. * * The difference between the producer and consumer cannot * exceed the size of the ring. * * \param cons The consumer index for the ring to test. * \param prod The producer index for the ring to test. * * \retval 1 If indexes are in range. * \retval 0 If the indexes are out of range. 
*/ static int xs_check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod) { return ((prod - cons) <= XENSTORE_RING_SIZE); } /** * Return a pointer to, and the length of, the contiguous * free region available for output in a ring buffer. * * \param cons The consumer index for the ring. * \param prod The producer index for the ring. * \param buf The base address of the ring's storage. * \param len The amount of contiguous storage available. * * \return A pointer to the start location of the free region. */ static void * xs_get_output_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod, char *buf, uint32_t *len) { *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod); if ((XENSTORE_RING_SIZE - (prod - cons)) < *len) *len = XENSTORE_RING_SIZE - (prod - cons); return (buf + MASK_XENSTORE_IDX(prod)); } /** * Return a pointer to, and the length of, the contiguous * data available to read from a ring buffer. * * \param cons The consumer index for the ring. * \param prod The producer index for the ring. * \param buf The base address of the ring's storage. * \param len The amount of contiguous data available to read. * * \return A pointer to the start location of the available data. */ static const void * xs_get_input_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod, const char *buf, uint32_t *len) { *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons); if ((prod - cons) < *len) *len = prod - cons; return (buf + MASK_XENSTORE_IDX(cons)); } /** * Transmit data to the XenStore service. * * \param tdata A pointer to the contiguous data to send. * \param len The amount of data to send. * * \return On success 0, otherwise an errno value indicating the * cause of failure. * * \invariant Called from thread context. * \invariant The buffer pointed to by tdata is at least len bytes * in length. * \invariant xs.request_mutex exclusively locked. */ static int xs_write_store(const void *tdata, unsigned len) { XENSTORE_RING_IDX cons, prod; const char *data = (const char *)tdata; int error; sx_assert(&xs.request_mutex, SX_XLOCKED); while (len != 0) { void *dst; u_int avail; /* Hold lock so we can't miss wakeups should we block. */ mtx_lock(&xs.ring_lock); cons = xen_store->req_cons; prod = xen_store->req_prod; if ((prod - cons) == XENSTORE_RING_SIZE) { /* * Output ring is full. Wait for a ring event. * * Note that the events from both queues * are combined, so being woken does not * guarantee that data exist in the read * ring. * * To simplify error recovery and the retry, * we specify PDROP so our lock is *not* held * when msleep returns. */ error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP, "xbwrite", /*timeout*/0); if (error && error != EWOULDBLOCK) return (error); /* Try again. */ continue; } mtx_unlock(&xs.ring_lock); /* Verify queue sanity. */ if (!xs_check_indexes(cons, prod)) { xen_store->req_cons = xen_store->req_prod = 0; return (EIO); } dst = xs_get_output_chunk(cons, prod, xen_store->req, &avail); if (avail > len) avail = len; memcpy(dst, data, avail); data += avail; len -= avail; /* * The store to the producer index, which indicates * to the other side that new data has arrived, must * be visible only after our copy of the data into the * ring has completed. */ wmb(); xen_store->req_prod += avail; /* * xen_intr_signal() implies mb(). The other side will see * the change to req_prod at the time of the interrupt. */ xen_intr_signal(xs.xen_intr_handle); } return (0); } /** * Receive data from the XenStore service. * * \param tdata A pointer to the contiguous buffer to receive the data. 
* \param len The amount of data to receive. * * \return On success 0, otherwise an errno value indicating the * cause of failure. * * \invariant Called from thread context. * \invariant The buffer pointed to by tdata is at least len bytes * in length. * * \note xs_read does not perform any internal locking to guarantee * serial access to the incoming ring buffer. However, there * is only one context processing reads: xs_rcv_thread(). */ static int xs_read_store(void *tdata, unsigned len) { XENSTORE_RING_IDX cons, prod; char *data = (char *)tdata; int error; while (len != 0) { u_int avail; const char *src; /* Hold lock so we can't miss wakeups should we block. */ mtx_lock(&xs.ring_lock); cons = xen_store->rsp_cons; prod = xen_store->rsp_prod; if (cons == prod) { /* * Nothing to read. Wait for a ring event. * * Note that the events from both queues * are combined, so being woken does not * guarantee that data exist in the read * ring. * * To simplify error recovery and the retry, * we specify PDROP so our lock is *not* held * when msleep returns. */ error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP, "xbread", /*timeout*/0); if (error && error != EWOULDBLOCK) return (error); continue; } mtx_unlock(&xs.ring_lock); /* Verify queue sanity. */ if (!xs_check_indexes(cons, prod)) { xen_store->rsp_cons = xen_store->rsp_prod = 0; return (EIO); } src = xs_get_input_chunk(cons, prod, xen_store->rsp, &avail); if (avail > len) avail = len; /* * Insure the data we read is related to the indexes * we read above. */ rmb(); memcpy(data, src, avail); data += avail; len -= avail; /* * Insure that the producer of this ring does not see * the ring space as free until after we have copied it * out. */ mb(); xen_store->rsp_cons += avail; /* * xen_intr_signal() implies mb(). The producer will see * the updated consumer index when the event is delivered. */ xen_intr_signal(xs.xen_intr_handle); } return (0); } /*----------------------- Received Message Processing ------------------------*/ /** * Block reading the next message from the XenStore service and * process the result. * * \param type The returned type of the XenStore message received. * * \return 0 on success. Otherwise an errno value indicating the * type of failure encountered. */ static int xs_process_msg(enum xsd_sockmsg_type *type) { struct xs_stored_msg *msg; char *body; int error; msg = malloc(sizeof(*msg), M_XENSTORE, M_WAITOK); error = xs_read_store(&msg->hdr, sizeof(msg->hdr)); if (error) { free(msg, M_XENSTORE); return (error); } body = malloc(msg->hdr.len + 1, M_XENSTORE, M_WAITOK); error = xs_read_store(body, msg->hdr.len); if (error) { free(body, M_XENSTORE); free(msg, M_XENSTORE); return (error); } body[msg->hdr.len] = '\0'; *type = msg->hdr.type; if (msg->hdr.type == XS_WATCH_EVENT) { msg->u.watch.vec = split(body, msg->hdr.len, &msg->u.watch.vec_size); mtx_lock(&xs.registered_watches_lock); msg->u.watch.handle = find_watch( msg->u.watch.vec[XS_WATCH_TOKEN]); if (msg->u.watch.handle != NULL) { mtx_lock(&xs.watch_events_lock); TAILQ_INSERT_TAIL(&xs.watch_events, msg, list); wakeup(&xs.watch_events); mtx_unlock(&xs.watch_events_lock); } else { free(msg->u.watch.vec, M_XENSTORE); free(msg, M_XENSTORE); } mtx_unlock(&xs.registered_watches_lock); } else { msg->u.reply.body = body; mtx_lock(&xs.reply_lock); TAILQ_INSERT_TAIL(&xs.reply_list, msg, list); wakeup(&xs.reply_list); mtx_unlock(&xs.reply_lock); } return (0); } /** * Thread body of the XenStore receive thread. 
* * This thread blocks waiting for data from the XenStore service * and processes and received messages. */ static void xs_rcv_thread(void *arg __unused) { int error; enum xsd_sockmsg_type type; for (;;) { error = xs_process_msg(&type); if (error) printf("XENSTORE error %d while reading message\n", error); } } /*---------------- XenStore Message Request/Reply Processing -----------------*/ #define xsd_error_count (sizeof(xsd_errors) / sizeof(xsd_errors[0])) /** * Convert a XenStore error string into an errno number. * * \param errorstring The error string to convert. * * \return The errno best matching the input string. * * \note Unknown error strings are converted to EINVAL. */ static int xs_get_error(const char *errorstring) { u_int i; for (i = 0; i < xsd_error_count; i++) { if (!strcmp(errorstring, xsd_errors[i].errstring)) return (xsd_errors[i].errnum); } log(LOG_WARNING, "XENSTORE xen store gave: unknown error %s", errorstring); return (EINVAL); } /** * Block waiting for a reply to a message request. * * \param type The returned type of the reply. * \param len The returned body length of the reply. * \param result The returned body of the reply. * * \return 0 on success. Otherwise an errno indicating the * cause of failure. */ static int xs_read_reply(enum xsd_sockmsg_type *type, u_int *len, void **result) { struct xs_stored_msg *msg; char *body; int error; mtx_lock(&xs.reply_lock); while (TAILQ_EMPTY(&xs.reply_list)) { error = mtx_sleep(&xs.reply_list, &xs.reply_lock, 0, "xswait", hz/10); if (error && error != EWOULDBLOCK) { mtx_unlock(&xs.reply_lock); return (error); } } msg = TAILQ_FIRST(&xs.reply_list); TAILQ_REMOVE(&xs.reply_list, msg, list); mtx_unlock(&xs.reply_lock); *type = msg->hdr.type; if (len) *len = msg->hdr.len; body = msg->u.reply.body; free(msg, M_XENSTORE); *result = body; return (0); } /** * Pass-thru interface for XenStore access by userland processes * via the XenStore device. * * Reply type and length data are returned by overwriting these * fields in the passed in request message. * * \param msg A properly formatted message to transmit to * the XenStore service. * \param result The returned body of the reply. * * \return 0 on success. Otherwise an errno indicating the cause * of failure. * * \note The returned result is provided in malloced storage and thus * must be free'd by the caller with 'free(result, M_XENSTORE); */ int xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result) { uint32_t request_type; int error; request_type = msg->type; sx_xlock(&xs.request_mutex); if ((error = xs_write_store(msg, sizeof(*msg) + msg->len)) == 0) error = xs_read_reply(&msg->type, &msg->len, result); sx_xunlock(&xs.request_mutex); return (error); } /** * Send a message with an optionally muti-part body to the XenStore service. * * \param t The transaction to use for this request. * \param request_type The type of message to send. * \param iovec Pointers to the body sections of the request. * \param num_vecs The number of body sections in the request. * \param len The returned length of the reply. * \param result The returned body of the reply. * * \return 0 on success. Otherwise an errno indicating * the cause of failure. 
* * \note The returned result is provided in malloced storage and thus * must be free'd by the caller with 'free(*result, M_XENSTORE); */ static int xs_talkv(struct xs_transaction t, enum xsd_sockmsg_type request_type, const struct iovec *iovec, u_int num_vecs, u_int *len, void **result) { struct xsd_sockmsg msg; void *ret = NULL; u_int i; int error; msg.tx_id = t.id; msg.req_id = 0; msg.type = request_type; msg.len = 0; for (i = 0; i < num_vecs; i++) msg.len += iovec[i].iov_len; sx_xlock(&xs.request_mutex); error = xs_write_store(&msg, sizeof(msg)); if (error) { printf("xs_talkv failed %d\n", error); goto error_lock_held; } for (i = 0; i < num_vecs; i++) { error = xs_write_store(iovec[i].iov_base, iovec[i].iov_len); if (error) { printf("xs_talkv failed %d\n", error); goto error_lock_held; } } error = xs_read_reply(&msg.type, len, &ret); error_lock_held: sx_xunlock(&xs.request_mutex); if (error) return (error); if (msg.type == XS_ERROR) { error = xs_get_error(ret); free(ret, M_XENSTORE); return (error); } /* Reply is either error or an echo of our request message type. */ KASSERT(msg.type == request_type, ("bad xenstore message type")); if (result) *result = ret; else free(ret, M_XENSTORE); return (0); } /** * Wrapper for xs_talkv allowing easy transmission of a message with * a single, contiguous, message body. * * \param t The transaction to use for this request. * \param request_type The type of message to send. * \param body The body of the request. * \param len The returned length of the reply. * \param result The returned body of the reply. * * \return 0 on success. Otherwise an errno indicating * the cause of failure. * * \note The returned result is provided in malloced storage and thus * must be free'd by the caller with 'free(*result, M_XENSTORE); */ static int xs_single(struct xs_transaction t, enum xsd_sockmsg_type request_type, const char *body, u_int *len, void **result) { struct iovec iovec; iovec.iov_base = (void *)(uintptr_t)body; iovec.iov_len = strlen(body) + 1; return (xs_talkv(t, request_type, &iovec, 1, len, result)); } /*------------------------- XenStore Watch Support ---------------------------*/ /** * Transmit a watch request to the XenStore service. * * \param path The path in the XenStore to watch. * \param tocken A unique identifier for this watch. * * \return 0 on success. Otherwise an errno indicating the * cause of failure. */ static int xs_watch(const char *path, const char *token) { struct iovec iov[2]; iov[0].iov_base = (void *)(uintptr_t) path; iov[0].iov_len = strlen(path) + 1; iov[1].iov_base = (void *)(uintptr_t) token; iov[1].iov_len = strlen(token) + 1; return (xs_talkv(XST_NIL, XS_WATCH, iov, 2, NULL, NULL)); } /** * Transmit an uwatch request to the XenStore service. * * \param path The path in the XenStore to watch. * \param tocken A unique identifier for this watch. * * \return 0 on success. Otherwise an errno indicating the * cause of failure. */ static int xs_unwatch(const char *path, const char *token) { struct iovec iov[2]; iov[0].iov_base = (void *)(uintptr_t) path; iov[0].iov_len = strlen(path) + 1; iov[1].iov_base = (void *)(uintptr_t) token; iov[1].iov_len = strlen(token) + 1; return (xs_talkv(XST_NIL, XS_UNWATCH, iov, 2, NULL, NULL)); } /** * Convert from watch token (unique identifier) to the associated * internal tracking structure for this watch. * * \param tocken The unique identifier for the watch to find. * * \return A pointer to the found watch structure or NULL. 
*/ static struct xs_watch * find_watch(const char *token) { struct xs_watch *i, *cmp; cmp = (void *)strtoul(token, NULL, 16); LIST_FOREACH(i, &xs.registered_watches, list) if (i == cmp) return (i); return (NULL); } /** * Thread body of the XenStore watch event dispatch thread. */ static void xenwatch_thread(void *unused) { struct xs_stored_msg *msg; for (;;) { mtx_lock(&xs.watch_events_lock); while (TAILQ_EMPTY(&xs.watch_events)) mtx_sleep(&xs.watch_events, &xs.watch_events_lock, PWAIT | PCATCH, "waitev", hz/10); mtx_unlock(&xs.watch_events_lock); sx_xlock(&xs.xenwatch_mutex); mtx_lock(&xs.watch_events_lock); msg = TAILQ_FIRST(&xs.watch_events); if (msg) TAILQ_REMOVE(&xs.watch_events, msg, list); mtx_unlock(&xs.watch_events_lock); if (msg != NULL) { /* * XXX There are messages coming in with a NULL * XXX callback. This deserves further investigation; * XXX the workaround here simply prevents the kernel * XXX from panic'ing on startup. */ if (msg->u.watch.handle->callback != NULL) msg->u.watch.handle->callback( msg->u.watch.handle, (const char **)msg->u.watch.vec, msg->u.watch.vec_size); free(msg->u.watch.vec, M_XENSTORE); free(msg, M_XENSTORE); } sx_xunlock(&xs.xenwatch_mutex); } } /*----------- XenStore Configuration, Initialization, and Control ------------*/ /** * Setup communication channels with the XenStore service. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ static int xs_init_comms(void) { int error; if (xen_store->rsp_prod != xen_store->rsp_cons) { log(LOG_WARNING, "XENSTORE response ring is not quiescent " "(%08x:%08x): fixing up\n", xen_store->rsp_cons, xen_store->rsp_prod); xen_store->rsp_cons = xen_store->rsp_prod; } xen_intr_unbind(&xs.xen_intr_handle); error = xen_intr_bind_local_port(xs.xs_dev, xs.evtchn, /*filter*/NULL, xs_intr, /*arg*/NULL, INTR_TYPE_NET|INTR_MPSAFE, &xs.xen_intr_handle); if (error) { log(LOG_WARNING, "XENSTORE request irq failed %i\n", error); return (error); } return (0); } /*------------------ Private Device Attachment Functions --------------------*/ static void xs_identify(driver_t *driver, device_t parent) { BUS_ADD_CHILD(parent, 0, "xenstore", 0); } /** * Probe for the existence of the XenStore. * * \param dev */ static int xs_probe(device_t dev) { /* * We are either operating within a PV kernel or being probed * as the child of the successfully attached xenpci device. * Thus we are in a Xen environment and there will be a XenStore. * Unconditionally return success. */ device_set_desc(dev, "XenStore"); return (BUS_PROBE_NOWILDCARD); } static void xs_attach_deferred(void *arg) { bus_generic_probe(xs.xs_dev); bus_generic_attach(xs.xs_dev); config_intrhook_disestablish(&xs.xs_attachcb); } static void xs_attach_late(void *arg, int pending) { KASSERT((pending == 1), ("xs late attach queued several times")); bus_generic_probe(xs.xs_dev); bus_generic_attach(xs.xs_dev); } /** * Attach to the XenStore. * * This routine also prepares for the probe/attach of drivers that rely * on the XenStore. */ static int xs_attach(device_t dev) { int error; /* Allow us to get device_t from softc and vice-versa. */ xs.xs_dev = dev; device_set_softc(dev, &xs); /* Initialize the interface to xenstore. 
*/ struct proc *p; xs.initialized = false; xs.evtchn = xen_get_xenstore_evtchn(); if (xs.evtchn == 0) { struct evtchn_alloc_unbound alloc_unbound; /* Allocate a local event channel for xenstore */ alloc_unbound.dom = DOMID_SELF; alloc_unbound.remote_dom = DOMID_SELF; error = HYPERVISOR_event_channel_op( EVTCHNOP_alloc_unbound, &alloc_unbound); if (error != 0) panic( "unable to alloc event channel for Dom0: %d", error); xs.evtchn = alloc_unbound.port; /* Allocate memory for the xs shared ring */ xen_store = malloc(PAGE_SIZE, M_XENSTORE, M_WAITOK | M_ZERO); xs.gpfn = atop(pmap_kextract((vm_offset_t)xen_store)); } else { xs.gpfn = xen_get_xenstore_mfn(); xen_store = pmap_mapdev_attr(ptoa(xs.gpfn), PAGE_SIZE, PAT_WRITE_BACK); xs.initialized = true; } TAILQ_INIT(&xs.reply_list); TAILQ_INIT(&xs.watch_events); mtx_init(&xs.ring_lock, "ring lock", NULL, MTX_DEF); mtx_init(&xs.reply_lock, "reply lock", NULL, MTX_DEF); sx_init(&xs.xenwatch_mutex, "xenwatch"); sx_init(&xs.request_mutex, "xenstore request"); mtx_init(&xs.registered_watches_lock, "watches", NULL, MTX_DEF); mtx_init(&xs.watch_events_lock, "watch events", NULL, MTX_DEF); /* Initialize the shared memory rings to talk to xenstored */ error = xs_init_comms(); if (error) return (error); error = kproc_create(xenwatch_thread, NULL, &p, RFHIGHPID, 0, "xenwatch"); if (error) return (error); xs.xenwatch_pid = p->p_pid; error = kproc_create(xs_rcv_thread, NULL, NULL, RFHIGHPID, 0, "xenstore_rcv"); xs.xs_attachcb.ich_func = xs_attach_deferred; xs.xs_attachcb.ich_arg = NULL; if (xs.initialized) { config_intrhook_establish(&xs.xs_attachcb); } else { TASK_INIT(&xs.xs_late_init, 0, xs_attach_late, NULL); } return (error); } /** * Prepare for suspension of this VM by halting XenStore access after * all transactions and individual requests have completed. */ static int xs_suspend(device_t dev) { int error; /* Suspend child Xen devices. */ error = bus_generic_suspend(dev); if (error != 0) return (error); sx_xlock(&xs.request_mutex); return (0); } /** * Resume XenStore operations after this VM is resumed. */ static int xs_resume(device_t dev __unused) { struct xs_watch *watch; char token[sizeof(watch) * 2 + 1]; xs_init_comms(); sx_xunlock(&xs.request_mutex); /* * NB: since xenstore childs have not been resumed yet, there's * no need to hold any watch mutex. Having clients try to add or * remove watches at this point (before xenstore is resumed) is * clearly a violantion of the resume order. */ LIST_FOREACH(watch, &xs.registered_watches, list) { sprintf(token, "%lX", (long)watch); xs_watch(watch->node, token); } /* Resume child Xen devices. 
*/ bus_generic_resume(dev); return (0); } /*-------------------- Private Device Attachment Data -----------------------*/ static device_method_t xenstore_methods[] = { /* Device interface */ DEVMETHOD(device_identify, xs_identify), DEVMETHOD(device_probe, xs_probe), DEVMETHOD(device_attach, xs_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xs_suspend), DEVMETHOD(device_resume, xs_resume), /* Bus interface */ DEVMETHOD(bus_add_child, bus_generic_add_child), DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource), DEVMETHOD(bus_release_resource, bus_generic_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD_END }; DEFINE_CLASS_0(xenstore, xenstore_driver, xenstore_methods, 0); static devclass_t xenstore_devclass; DRIVER_MODULE(xenstore, xenpv, xenstore_driver, xenstore_devclass, 0, 0); /*------------------------------- Sysctl Data --------------------------------*/ /* XXX Shouldn't the node be somewhere else? */ -SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD, NULL, "Xen"); +SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, + "Xen"); SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xs.evtchn, 0, ""); SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, ""); /*-------------------------------- Public API --------------------------------*/ /*------- API comments for these methods can be found in xenstorevar.h -------*/ bool xs_initialized(void) { return (xs.initialized); } evtchn_port_t xs_evtchn(void) { return (xs.evtchn); } vm_paddr_t xs_address(void) { return (ptoa(xs.gpfn)); } int xs_directory(struct xs_transaction t, const char *dir, const char *node, u_int *num, const char ***result) { struct sbuf *path; char *strings; u_int len = 0; int error; path = xs_join(dir, node); error = xs_single(t, XS_DIRECTORY, sbuf_data(path), &len, (void **)&strings); sbuf_delete(path); if (error) return (error); *result = split(strings, len, num); return (0); } int xs_exists(struct xs_transaction t, const char *dir, const char *node) { const char **d; int error, dir_n; error = xs_directory(t, dir, node, &dir_n, &d); if (error) return (0); free(d, M_XENSTORE); return (1); } int xs_read(struct xs_transaction t, const char *dir, const char *node, u_int *len, void **result) { struct sbuf *path; void *ret; int error; path = xs_join(dir, node); error = xs_single(t, XS_READ, sbuf_data(path), len, &ret); sbuf_delete(path); if (error) return (error); *result = ret; return (0); } int xs_write(struct xs_transaction t, const char *dir, const char *node, const char *string) { struct sbuf *path; struct iovec iovec[2]; int error; path = xs_join(dir, node); iovec[0].iov_base = (void *)(uintptr_t) sbuf_data(path); iovec[0].iov_len = sbuf_len(path) + 1; iovec[1].iov_base = (void *)(uintptr_t) string; iovec[1].iov_len = strlen(string); error = xs_talkv(t, XS_WRITE, iovec, 2, NULL, NULL); sbuf_delete(path); return (error); } int xs_mkdir(struct xs_transaction t, const char *dir, const char *node) { struct sbuf *path; int ret; path = xs_join(dir, node); ret = xs_single(t, XS_MKDIR, sbuf_data(path), NULL, NULL); sbuf_delete(path); return (ret); } int xs_rm(struct xs_transaction t, const char *dir, const char *node) { struct sbuf *path; int ret; path = xs_join(dir, node); ret = xs_single(t, XS_RM, sbuf_data(path), NULL, NULL); sbuf_delete(path); return (ret); } int xs_rm_tree(struct 
xs_transaction xbt, const char *base, const char *node) { struct xs_transaction local_xbt; struct sbuf *root_path_sbuf; struct sbuf *cur_path_sbuf; char *root_path; char *cur_path; const char **dir; int error; retry: root_path_sbuf = xs_join(base, node); cur_path_sbuf = xs_join(base, node); root_path = sbuf_data(root_path_sbuf); cur_path = sbuf_data(cur_path_sbuf); dir = NULL; local_xbt.id = 0; if (xbt.id == 0) { error = xs_transaction_start(&local_xbt); if (error != 0) goto out; xbt = local_xbt; } while (1) { u_int count; u_int i; error = xs_directory(xbt, cur_path, "", &count, &dir); if (error) goto out; for (i = 0; i < count; i++) { error = xs_rm(xbt, cur_path, dir[i]); if (error == ENOTEMPTY) { struct sbuf *push_dir; /* * Descend to clear out this sub directory. * We'll return to cur_dir once push_dir * is empty. */ push_dir = xs_join(cur_path, dir[i]); sbuf_delete(cur_path_sbuf); cur_path_sbuf = push_dir; cur_path = sbuf_data(cur_path_sbuf); break; } else if (error != 0) { goto out; } } free(dir, M_XENSTORE); dir = NULL; if (i == count) { char *last_slash; /* Directory is empty. It is now safe to remove. */ error = xs_rm(xbt, cur_path, ""); if (error != 0) goto out; if (!strcmp(cur_path, root_path)) break; /* Return to processing the parent directory. */ last_slash = strrchr(cur_path, '/'); KASSERT(last_slash != NULL, ("xs_rm_tree: mangled path %s", cur_path)); *last_slash = '\0'; } } out: sbuf_delete(cur_path_sbuf); sbuf_delete(root_path_sbuf); if (dir != NULL) free(dir, M_XENSTORE); if (local_xbt.id != 0) { int terror; terror = xs_transaction_end(local_xbt, /*abort*/error != 0); xbt.id = 0; if (terror == EAGAIN && error == 0) goto retry; } return (error); } int xs_transaction_start(struct xs_transaction *t) { char *id_str; int error; error = xs_single(XST_NIL, XS_TRANSACTION_START, "", NULL, (void **)&id_str); if (error == 0) { t->id = strtoul(id_str, NULL, 0); free(id_str, M_XENSTORE); } return (error); } int xs_transaction_end(struct xs_transaction t, int abort) { char abortstr[2]; if (abort) strcpy(abortstr, "F"); else strcpy(abortstr, "T"); return (xs_single(t, XS_TRANSACTION_END, abortstr, NULL, NULL)); } int xs_scanf(struct xs_transaction t, const char *dir, const char *node, int *scancountp, const char *fmt, ...) { va_list ap; int error, ns; char *val; error = xs_read(t, dir, node, NULL, (void **) &val); if (error) return (error); va_start(ap, fmt); ns = vsscanf(val, fmt, ap); va_end(ap); free(val, M_XENSTORE); /* Distinctive errno. */ if (ns == 0) return (ERANGE); if (scancountp) *scancountp = ns; return (0); } int xs_vprintf(struct xs_transaction t, const char *dir, const char *node, const char *fmt, va_list ap) { struct sbuf *sb; int error; sb = sbuf_new_auto(); sbuf_vprintf(sb, fmt, ap); sbuf_finish(sb); error = xs_write(t, dir, node, sbuf_data(sb)); sbuf_delete(sb); return (error); } int xs_printf(struct xs_transaction t, const char *dir, const char *node, const char *fmt, ...) { va_list ap; int error; va_start(ap, fmt); error = xs_vprintf(t, dir, node, fmt, ap); va_end(ap); return (error); } int xs_gather(struct xs_transaction t, const char *dir, ...) 
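/*
 * Illustrative usage only (the node names and variables below are
 * hypothetical, not taken from this change): arguments come in
 * (name, scanf-format, result) triples terminated by a NULL name, and
 * a NULL format returns the raw string, which the caller must free
 * with free(..., M_XENSTORE):
 *
 *   unsigned int ring_ref, evtchn;
 *   char *proto;
 *   error = xs_gather(XST_NIL, xenbus_get_otherend_path(dev),
 *       "ring-ref", "%u", &ring_ref,
 *       "event-channel", "%u", &evtchn,
 *       "protocol", NULL, &proto,
 *       NULL);
 */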
{ va_list ap; const char *name; int error; va_start(ap, dir); error = 0; while (error == 0 && (name = va_arg(ap, char *)) != NULL) { const char *fmt = va_arg(ap, char *); void *result = va_arg(ap, void *); char *p; error = xs_read(t, dir, name, NULL, (void **) &p); if (error) break; if (fmt) { if (sscanf(p, fmt, result) == 0) error = EINVAL; free(p, M_XENSTORE); } else *(char **)result = p; } va_end(ap); return (error); } int xs_register_watch(struct xs_watch *watch) { /* Pointer in ascii is the token. */ char token[sizeof(watch) * 2 + 1]; int error; sprintf(token, "%lX", (long)watch); mtx_lock(&xs.registered_watches_lock); KASSERT(find_watch(token) == NULL, ("watch already registered")); LIST_INSERT_HEAD(&xs.registered_watches, watch, list); mtx_unlock(&xs.registered_watches_lock); error = xs_watch(watch->node, token); /* Ignore errors due to multiple registration. */ if (error == EEXIST) error = 0; if (error != 0) { mtx_lock(&xs.registered_watches_lock); LIST_REMOVE(watch, list); mtx_unlock(&xs.registered_watches_lock); } return (error); } void xs_unregister_watch(struct xs_watch *watch) { struct xs_stored_msg *msg, *tmp; char token[sizeof(watch) * 2 + 1]; int error; sprintf(token, "%lX", (long)watch); mtx_lock(&xs.registered_watches_lock); if (find_watch(token) == NULL) { mtx_unlock(&xs.registered_watches_lock); return; } LIST_REMOVE(watch, list); mtx_unlock(&xs.registered_watches_lock); error = xs_unwatch(watch->node, token); if (error) log(LOG_WARNING, "XENSTORE Failed to release watch %s: %i\n", watch->node, error); /* Cancel pending watch events. */ mtx_lock(&xs.watch_events_lock); TAILQ_FOREACH_SAFE(msg, &xs.watch_events, list, tmp) { if (msg->u.watch.handle != watch) continue; TAILQ_REMOVE(&xs.watch_events, msg, list); free(msg->u.watch.vec, M_XENSTORE); free(msg, M_XENSTORE); } mtx_unlock(&xs.watch_events_lock); /* Flush any currently-executing callback, unless we are it. :-) */ if (curproc->p_pid != xs.xenwatch_pid) { sx_xlock(&xs.xenwatch_mutex); sx_xunlock(&xs.xenwatch_mutex); } } void xs_lock(void) { sx_xlock(&xs.request_mutex); return; } void xs_unlock(void) { sx_xunlock(&xs.request_mutex); return; } Index: head/sys/xen/xenbus/xenbusb.c =================================================================== --- head/sys/xen/xenbus/xenbusb.c (revision 358315) +++ head/sys/xen/xenbus/xenbusb.c (revision 358316) @@ -1,975 +1,975 @@ /****************************************************************************** * Copyright (C) 2010 Spectra Logic Corporation * Copyright (C) 2008 Doug Rabson * Copyright (C) 2005 Rusty Russell, IBM Corporation * Copyright (C) 2005 Mike Wray, Hewlett-Packard * Copyright (C) 2005 XenSource Ltd * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ /** * \file xenbusb.c * * \brief Shared support functions for managing the NewBus buses that contain * Xen front and back end device instances. * * The NewBus implementation of XenBus attaches a xenbusb_front and xenbusb_back * child bus to the xenstore device. This strategy allows the small differences * in the handling of XenBus operations for front and back devices to be handled * as overrides in xenbusb_front/back.c. Front and back specific device * classes are also provided so device drivers can register for the devices they * can handle without the need to filter within their probe routines. The * net result is a device hierarchy that might look like this: * * xenstore0/ * xenbusb_front0/ * xn0 * xbd0 * xbd1 * xenbusb_back0/ * xbbd0 * xnb0 * xnb1 */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /*------------------------- Private Functions --------------------------------*/ /** * \brief Deallocate XenBus device instance variables. * * \param ivars The instance variable block to free. */ static void xenbusb_free_child_ivars(struct xenbus_device_ivars *ivars) { if (ivars->xd_otherend_watch.node != NULL) { xs_unregister_watch(&ivars->xd_otherend_watch); free(ivars->xd_otherend_watch.node, M_XENBUS); ivars->xd_otherend_watch.node = NULL; } if (ivars->xd_local_watch.node != NULL) { xs_unregister_watch(&ivars->xd_local_watch); ivars->xd_local_watch.node = NULL; } if (ivars->xd_node != NULL) { free(ivars->xd_node, M_XENBUS); ivars->xd_node = NULL; } ivars->xd_node_len = 0; if (ivars->xd_type != NULL) { free(ivars->xd_type, M_XENBUS); ivars->xd_type = NULL; } if (ivars->xd_otherend_path != NULL) { free(ivars->xd_otherend_path, M_XENBUS); ivars->xd_otherend_path = NULL; } ivars->xd_otherend_path_len = 0; free(ivars, M_XENBUS); } /** * XenBus watch callback registered against the "state" XenStore * node of the other-end of a split device connection. * * This callback is invoked whenever the state of a device instance's * peer changes. * * \param watch The xs_watch object used to register this callback * function. * \param vec An array of pointers to NUL terminated strings containing * watch event data. The vector should be indexed via the * xs_watch_type enum in xs_wire.h. * \param vec_size The number of elements in vec. 
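 *
 * As an illustration (the path below is hypothetical), when a backend
 * moves to Connected the event's vec[XS_WATCH_PATH] might read
 * "backend/vif/1/0/state"; the callback checks that the path falls
 * under xd_otherend_path, re-reads the peer's state node and hands the
 * new xenbus_state to the bus via XENBUSB_OTHEREND_CHANGED().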
*/ static void xenbusb_otherend_watch_cb(struct xs_watch *watch, const char **vec, unsigned int vec_size __unused) { struct xenbus_device_ivars *ivars; device_t child; device_t bus; const char *path; enum xenbus_state newstate; ivars = (struct xenbus_device_ivars *)watch->callback_data; child = ivars->xd_dev; bus = device_get_parent(child); path = vec[XS_WATCH_PATH]; if (ivars->xd_otherend_path == NULL || strncmp(ivars->xd_otherend_path, path, ivars->xd_otherend_path_len)) return; newstate = xenbus_read_driver_state(ivars->xd_otherend_path); XENBUSB_OTHEREND_CHANGED(bus, child, newstate); } /** * XenBus watch callback registered against the XenStore sub-tree * represnting the local half of a split device connection. * * This callback is invoked whenever any XenStore data in the subtree * is modified, either by us or another privledged domain. * * \param watch The xs_watch object used to register this callback * function. * \param vec An array of pointers to NUL terminated strings containing * watch event data. The vector should be indexed via the * xs_watch_type enum in xs_wire.h. * \param vec_size The number of elements in vec. * */ static void xenbusb_local_watch_cb(struct xs_watch *watch, const char **vec, unsigned int vec_size __unused) { struct xenbus_device_ivars *ivars; device_t child; device_t bus; const char *path; ivars = (struct xenbus_device_ivars *)watch->callback_data; child = ivars->xd_dev; bus = device_get_parent(child); path = vec[XS_WATCH_PATH]; if (ivars->xd_node == NULL || strncmp(ivars->xd_node, path, ivars->xd_node_len)) return; XENBUSB_LOCALEND_CHANGED(bus, child, &path[ivars->xd_node_len]); } /** * Search our internal record of configured devices (not the XenStore) * to determine if the XenBus device indicated by \a node is known to * the system. * * \param dev The XenBus bus instance to search for device children. * \param node The XenStore node path for the device to find. * * \return The device_t of the found device if any, or NULL. * * \note device_t is a pointer type, so it can be compared against * NULL for validity. */ static device_t xenbusb_device_exists(device_t dev, const char *node) { device_t *kids; device_t result; struct xenbus_device_ivars *ivars; int i, count; if (device_get_children(dev, &kids, &count)) return (FALSE); result = NULL; for (i = 0; i < count; i++) { ivars = device_get_ivars(kids[i]); if (!strcmp(ivars->xd_node, node)) { result = kids[i]; break; } } free(kids, M_TEMP); return (result); } static void xenbusb_delete_child(device_t dev, device_t child) { struct xenbus_device_ivars *ivars; ivars = device_get_ivars(child); /* * We no longer care about the otherend of the * connection. Cancel the watches now so that we * don't try to handle an event for a partially * detached child. */ if (ivars->xd_otherend_watch.node != NULL) xs_unregister_watch(&ivars->xd_otherend_watch); if (ivars->xd_local_watch.node != NULL) xs_unregister_watch(&ivars->xd_local_watch); device_delete_child(dev, child); xenbusb_free_child_ivars(ivars); } /** * \param dev The NewBus device representing this XenBus bus. * \param child The NewBus device representing a child of dev%'s XenBus bus. */ static void xenbusb_verify_device(device_t dev, device_t child) { if (xs_exists(XST_NIL, xenbus_get_node(child), "") == 0) { /* * Device tree has been removed from Xenbus. * Tear down the device. */ xenbusb_delete_child(dev, child); } } /** * \brief Enumerate the devices on a XenBus bus and register them with * the NewBus device tree. 
* * xenbusb_enumerate_bus() will create entries (in state DS_NOTPRESENT) * for nodes that appear in the XenStore, but will not invoke probe/attach * operations on drivers. Probe/Attach processing must be separately * performed via an invocation of xenbusb_probe_children(). This is usually * done via the xbs_probe_children task. * * \param xbs XenBus Bus device softc of the owner of the bus to enumerate. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ static int xenbusb_enumerate_bus(struct xenbusb_softc *xbs) { const char **types; u_int type_idx; u_int type_count; int error; error = xs_directory(XST_NIL, xbs->xbs_node, "", &type_count, &types); if (error) return (error); for (type_idx = 0; type_idx < type_count; type_idx++) XENBUSB_ENUMERATE_TYPE(xbs->xbs_dev, types[type_idx]); free(types, M_XENSTORE); return (0); } /** * Handler for all generic XenBus device systcl nodes. */ static int xenbusb_device_sysctl_handler(SYSCTL_HANDLER_ARGS) { device_t dev; const char *value; dev = (device_t)arg1; switch (arg2) { case XENBUS_IVAR_NODE: value = xenbus_get_node(dev); break; case XENBUS_IVAR_TYPE: value = xenbus_get_type(dev); break; case XENBUS_IVAR_STATE: value = xenbus_strstate(xenbus_get_state(dev)); break; case XENBUS_IVAR_OTHEREND_ID: return (sysctl_handle_int(oidp, NULL, xenbus_get_otherend_id(dev), req)); /* NOTREACHED */ case XENBUS_IVAR_OTHEREND_PATH: value = xenbus_get_otherend_path(dev); break; default: return (EINVAL); } return (SYSCTL_OUT_STR(req, value)); } /** * Create read-only systcl nodes for xenbusb device ivar data. * * \param dev The XenBus device instance to register with sysctl. */ static void xenbusb_device_sysctl_init(device_t dev) { struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "xenstore_path", - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, XENBUS_IVAR_NODE, xenbusb_device_sysctl_handler, "A", "XenStore path to device"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "xenbus_dev_type", - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, XENBUS_IVAR_TYPE, xenbusb_device_sysctl_handler, "A", "XenBus device type"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "xenbus_connection_state", - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, XENBUS_IVAR_STATE, xenbusb_device_sysctl_handler, "A", "XenBus state of peer connection"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "xenbus_peer_domid", - CTLTYPE_INT | CTLFLAG_RD, + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, XENBUS_IVAR_OTHEREND_ID, xenbusb_device_sysctl_handler, "I", "Xen domain ID of peer"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(tree), OID_AUTO, "xenstore_peer_path", - CTLTYPE_STRING | CTLFLAG_RD, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, dev, XENBUS_IVAR_OTHEREND_PATH, xenbusb_device_sysctl_handler, "A", "XenStore path to peer device"); } /** * \brief Decrement the number of XenBus child devices in the * connecting state by one and release the xbs_attch_ch * interrupt configuration hook if the connecting count * drops to zero. * * \param xbs XenBus Bus device softc of the owner of the bus to enumerate. 
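 *
 * For reference, the count works like this (existing behaviour, not a
 * change): xenbusb_attach() seeds xbs_connecting_children at 1 as its
 * own hold, each device added by xenbusb_add_device() in the
 * Initialising state adds one, and each hold is released here once the
 * child reaches Connected or Closed (or is dropped for lack of a
 * driver); when the count hits zero the boot-time config hook is
 * disestablished and boot processing can proceed.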
*/ static void xenbusb_release_confighook(struct xenbusb_softc *xbs) { mtx_lock(&xbs->xbs_lock); KASSERT(xbs->xbs_connecting_children > 0, ("Connecting device count error\n")); xbs->xbs_connecting_children--; if (xbs->xbs_connecting_children == 0 && (xbs->xbs_flags & XBS_ATTACH_CH_ACTIVE) != 0) { xbs->xbs_flags &= ~XBS_ATTACH_CH_ACTIVE; mtx_unlock(&xbs->xbs_lock); config_intrhook_disestablish(&xbs->xbs_attach_ch); } else { mtx_unlock(&xbs->xbs_lock); } } /** * \brief Verify the existance of attached device instances and perform * probe/attach processing for newly arrived devices. * * \param dev The NewBus device representing this XenBus bus. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ static int xenbusb_probe_children(device_t dev) { device_t *kids; struct xenbus_device_ivars *ivars; int i, count, error; if (device_get_children(dev, &kids, &count) == 0) { for (i = 0; i < count; i++) { if (device_get_state(kids[i]) != DS_NOTPRESENT) { /* * We already know about this one. * Make sure it's still here. */ xenbusb_verify_device(dev, kids[i]); continue; } error = device_probe_and_attach(kids[i]); if (error == ENXIO) { struct xenbusb_softc *xbs; /* * We don't have a PV driver for this device. * However, an emulated device we do support * may share this backend. Hide the node from * XenBus until the next rescan, but leave it's * state unchanged so we don't inadvertently * prevent attachment of any emulated device. */ xenbusb_delete_child(dev, kids[i]); /* * Since the XenStore state of this device * still indicates a pending attach, manually * release it's hold on the boot process. */ xbs = device_get_softc(dev); xenbusb_release_confighook(xbs); continue; } else if (error) { /* * Transition device to the closed state * so the world knows that attachment will * not occur. */ xenbus_set_state(kids[i], XenbusStateClosed); /* * Remove our record of this device. * So long as it remains in the closed * state in the XenStore, we will not find * it again. The state will only change * if the control domain actively reconfigures * this device. */ xenbusb_delete_child(dev, kids[i]); continue; } /* * Augment default newbus provided dynamic sysctl * variables with the standard ivar contents of * XenBus devices. */ xenbusb_device_sysctl_init(kids[i]); /* * Now that we have a driver managing this device * that can receive otherend state change events, * hook up a watch for them. */ ivars = device_get_ivars(kids[i]); xs_register_watch(&ivars->xd_otherend_watch); xs_register_watch(&ivars->xd_local_watch); } free(kids, M_TEMP); } return (0); } /** * \brief Task callback function to perform XenBus probe operations * from a known safe context. * * \param arg The NewBus device_t representing the bus instance to * on which to perform probe processing. * \param pending The number of times this task was queued before it could * be run. */ static void xenbusb_probe_children_cb(void *arg, int pending __unused) { device_t dev = (device_t)arg; /* * Hold Giant until the Giant free newbus changes are committed. */ mtx_lock(&Giant); xenbusb_probe_children(dev); mtx_unlock(&Giant); } /** * \brief XenStore watch callback for the root node of the XenStore * subtree representing a XenBus. * * This callback performs, or delegates to the xbs_probe_children task, * all processing necessary to handle dynmaic device arrival and departure * events from a XenBus. * * \param watch The XenStore watch object associated with this callback. * \param vec The XenStore watch event data. 
* \param len The number of fields in the event data stream. */ static void xenbusb_devices_changed(struct xs_watch *watch, const char **vec, unsigned int len) { struct xenbusb_softc *xbs; device_t dev; char *node; char *type; char *id; char *p; u_int component; xbs = (struct xenbusb_softc *)watch->callback_data; dev = xbs->xbs_dev; if (len <= XS_WATCH_PATH) { device_printf(dev, "xenbusb_devices_changed: " "Short Event Data.\n"); return; } node = strdup(vec[XS_WATCH_PATH], M_XENBUS); p = strchr(node, '/'); if (p == NULL) goto out; *p = 0; type = p + 1; p = strchr(type, '/'); if (p == NULL) goto out; *p++ = 0; /* * Extract the device ID. A device ID has one or more path * components separated by the '/' character. * * e.g. "/" for backend devices. */ id = p; for (component = 0; component < xbs->xbs_id_components; component++) { p = strchr(p, '/'); if (p == NULL) break; p++; } if (p != NULL) *p = 0; if (*id != 0 && component >= xbs->xbs_id_components - 1) { xenbusb_add_device(xbs->xbs_dev, type, id); taskqueue_enqueue(taskqueue_thread, &xbs->xbs_probe_children); } out: free(node, M_XENBUS); } /** * \brief Interrupt configuration hook callback associated with xbs_attch_ch. * * Since interrupts are always functional at the time of XenBus configuration, * there is nothing to be done when the callback occurs. This hook is only * registered to hold up boot processing while XenBus devices come online. * * \param arg Unused configuration hook callback argument. */ static void xenbusb_nop_confighook_cb(void *arg __unused) { } /*--------------------------- Public Functions -------------------------------*/ /*--------- API comments for these methods can be found in xenbusb.h ---------*/ void xenbusb_identify(driver_t *driver __unused, device_t parent) { /* * A single instance of each bus type for which we have a driver * is always present in a system operating under Xen. */ BUS_ADD_CHILD(parent, 0, driver->name, 0); } int xenbusb_add_device(device_t dev, const char *type, const char *id) { struct xenbusb_softc *xbs; struct sbuf *devpath_sbuf; char *devpath; struct xenbus_device_ivars *ivars; int error; xbs = device_get_softc(dev); devpath_sbuf = sbuf_new_auto(); sbuf_printf(devpath_sbuf, "%s/%s/%s", xbs->xbs_node, type, id); sbuf_finish(devpath_sbuf); devpath = sbuf_data(devpath_sbuf); ivars = malloc(sizeof(*ivars), M_XENBUS, M_ZERO|M_WAITOK); error = ENXIO; if (xs_exists(XST_NIL, devpath, "") != 0) { device_t child; enum xenbus_state state; char *statepath; child = xenbusb_device_exists(dev, devpath); if (child != NULL) { /* * We are already tracking this node */ error = 0; goto out; } state = xenbus_read_driver_state(devpath); if (state != XenbusStateInitialising) { /* * Device is not new, so ignore it. This can * happen if a device is going away after * switching to Closed. */ printf("xenbusb_add_device: Device %s ignored. 
" "State %d\n", devpath, state); error = 0; goto out; } sx_init(&ivars->xd_lock, "xdlock"); ivars->xd_flags = XDF_CONNECTING; ivars->xd_node = strdup(devpath, M_XENBUS); ivars->xd_node_len = strlen(devpath); ivars->xd_type = strdup(type, M_XENBUS); ivars->xd_state = XenbusStateInitialising; error = XENBUSB_GET_OTHEREND_NODE(dev, ivars); if (error) { printf("xenbus_update_device: %s no otherend id\n", devpath); goto out; } statepath = malloc(ivars->xd_otherend_path_len + strlen("/state") + 1, M_XENBUS, M_WAITOK); sprintf(statepath, "%s/state", ivars->xd_otherend_path); ivars->xd_otherend_watch.node = statepath; ivars->xd_otherend_watch.callback = xenbusb_otherend_watch_cb; ivars->xd_otherend_watch.callback_data = (uintptr_t)ivars; ivars->xd_local_watch.node = ivars->xd_node; ivars->xd_local_watch.callback = xenbusb_local_watch_cb; ivars->xd_local_watch.callback_data = (uintptr_t)ivars; mtx_lock(&xbs->xbs_lock); xbs->xbs_connecting_children++; mtx_unlock(&xbs->xbs_lock); child = device_add_child(dev, NULL, -1); ivars->xd_dev = child; device_set_ivars(child, ivars); } out: sbuf_delete(devpath_sbuf); if (error != 0) xenbusb_free_child_ivars(ivars); return (error); } int xenbusb_attach(device_t dev, char *bus_node, u_int id_components) { struct xenbusb_softc *xbs; xbs = device_get_softc(dev); mtx_init(&xbs->xbs_lock, "xenbusb softc lock", NULL, MTX_DEF); xbs->xbs_node = bus_node; xbs->xbs_id_components = id_components; xbs->xbs_dev = dev; /* * Since XenBus buses are attached to the XenStore, and * the XenStore does not probe children until after interrupt * services are available, this config hook is used solely * to ensure that the remainder of the boot process (e.g. * mount root) is deferred until child devices are adequately * probed. We unblock the boot process as soon as the * connecting child count in our softc goes to 0. */ xbs->xbs_attach_ch.ich_func = xenbusb_nop_confighook_cb; xbs->xbs_attach_ch.ich_arg = dev; config_intrhook_establish(&xbs->xbs_attach_ch); xbs->xbs_flags |= XBS_ATTACH_CH_ACTIVE; xbs->xbs_connecting_children = 1; /* * The subtree for this bus type may not yet exist * causing initial enumeration to fail. We still * want to return success from our attach though * so that we are ready to handle devices for this * bus when they are dynamically attached to us * by a Xen management action. */ (void)xenbusb_enumerate_bus(xbs); xenbusb_probe_children(dev); xbs->xbs_device_watch.node = bus_node; xbs->xbs_device_watch.callback = xenbusb_devices_changed; xbs->xbs_device_watch.callback_data = (uintptr_t)xbs; TASK_INIT(&xbs->xbs_probe_children, 0, xenbusb_probe_children_cb, dev); xs_register_watch(&xbs->xbs_device_watch); xenbusb_release_confighook(xbs); return (0); } int xenbusb_resume(device_t dev) { device_t *kids; struct xenbus_device_ivars *ivars; int i, count, error; char *statepath; /* * We must re-examine each device and find the new path for * its backend. */ if (device_get_children(dev, &kids, &count) == 0) { for (i = 0; i < count; i++) { if (device_get_state(kids[i]) == DS_NOTPRESENT) continue; if (xen_suspend_cancelled) { DEVICE_RESUME(kids[i]); continue; } ivars = device_get_ivars(kids[i]); xs_unregister_watch(&ivars->xd_otherend_watch); xenbus_set_state(kids[i], XenbusStateInitialising); /* * Find the new backend details and * re-register our watch. 
*/ error = XENBUSB_GET_OTHEREND_NODE(dev, ivars); if (error) return (error); statepath = malloc(ivars->xd_otherend_path_len + strlen("/state") + 1, M_XENBUS, M_WAITOK); sprintf(statepath, "%s/state", ivars->xd_otherend_path); free(ivars->xd_otherend_watch.node, M_XENBUS); ivars->xd_otherend_watch.node = statepath; DEVICE_RESUME(kids[i]); xs_register_watch(&ivars->xd_otherend_watch); #if 0 /* * Can't do this yet since we are running in * the xenwatch thread and if we sleep here, * we will stop delivering watch notifications * and the device will never come back online. */ sx_xlock(&ivars->xd_lock); while (ivars->xd_state != XenbusStateClosed && ivars->xd_state != XenbusStateConnected) sx_sleep(&ivars->xd_state, &ivars->xd_lock, 0, "xdresume", 0); sx_xunlock(&ivars->xd_lock); #endif } free(kids, M_TEMP); } return (0); } int xenbusb_print_child(device_t dev, device_t child) { struct xenbus_device_ivars *ivars = device_get_ivars(child); int retval = 0; retval += bus_print_child_header(dev, child); retval += printf(" at %s", ivars->xd_node); retval += bus_print_child_footer(dev, child); return (retval); } int xenbusb_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) { struct xenbus_device_ivars *ivars = device_get_ivars(child); switch (index) { case XENBUS_IVAR_NODE: *result = (uintptr_t) ivars->xd_node; return (0); case XENBUS_IVAR_TYPE: *result = (uintptr_t) ivars->xd_type; return (0); case XENBUS_IVAR_STATE: *result = (uintptr_t) ivars->xd_state; return (0); case XENBUS_IVAR_OTHEREND_ID: *result = (uintptr_t) ivars->xd_otherend_id; return (0); case XENBUS_IVAR_OTHEREND_PATH: *result = (uintptr_t) ivars->xd_otherend_path; return (0); } return (ENOENT); } int xenbusb_write_ivar(device_t dev, device_t child, int index, uintptr_t value) { struct xenbus_device_ivars *ivars = device_get_ivars(child); enum xenbus_state newstate; int currstate; switch (index) { case XENBUS_IVAR_STATE: { int error; newstate = (enum xenbus_state)value; sx_xlock(&ivars->xd_lock); if (ivars->xd_state == newstate) { error = 0; goto out; } error = xs_scanf(XST_NIL, ivars->xd_node, "state", NULL, "%d", &currstate); if (error) goto out; do { error = xs_printf(XST_NIL, ivars->xd_node, "state", "%d", newstate); } while (error == EAGAIN); if (error) { /* * Avoid looping through xenbus_dev_fatal() * which calls xenbus_write_ivar to set the * state to closing. */ if (newstate != XenbusStateClosing) xenbus_dev_fatal(dev, error, "writing new state"); goto out; } ivars->xd_state = newstate; if ((ivars->xd_flags & XDF_CONNECTING) != 0 && (newstate == XenbusStateClosed || newstate == XenbusStateConnected)) { struct xenbusb_softc *xbs; ivars->xd_flags &= ~XDF_CONNECTING; xbs = device_get_softc(dev); xenbusb_release_confighook(xbs); } wakeup(&ivars->xd_state); out: sx_xunlock(&ivars->xd_lock); return (error); } case XENBUS_IVAR_NODE: case XENBUS_IVAR_TYPE: case XENBUS_IVAR_OTHEREND_ID: case XENBUS_IVAR_OTHEREND_PATH: /* * These variables are read-only. */ return (EINVAL); } return (ENOENT); } void xenbusb_otherend_changed(device_t bus, device_t child, enum xenbus_state state) { XENBUS_OTHEREND_CHANGED(child, state); } void xenbusb_localend_changed(device_t bus, device_t child, const char *path) { if (strcmp(path, "/state") != 0) { struct xenbus_device_ivars *ivars; ivars = device_get_ivars(child); sx_xlock(&ivars->xd_lock); ivars->xd_state = xenbus_read_driver_state(ivars->xd_node); sx_xunlock(&ivars->xd_lock); } XENBUS_LOCALEND_CHANGED(child, path); }
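/*
 * Usage note (illustrative, not part of this change): a driver normally
 * moves its half of a connection with xenbus_set_state(dev,
 * XenbusStateConnected), which resolves to the XENBUS_IVAR_STATE case
 * of xenbusb_write_ivar() above; the write of the XenStore "state"
 * node is retried while xs_printf() returns EAGAIN, and reaching
 * Connected or Closed clears XDF_CONNECTING and releases the bus'
 * boot-time config hook via xenbusb_release_confighook().
 */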