diff --git a/sys/amd64/amd64/xen-locore.S b/sys/amd64/amd64/xen-locore.S index c112fa909adf..7dc5d1d93842 100644 --- a/sys/amd64/amd64/xen-locore.S +++ b/sys/amd64/amd64/xen-locore.S @@ -1,210 +1,210 @@ /*- * Copyright (c) 2003 Peter Wemm * Copyright (c) 2011-2012 Spectra Logic Corporation * Copyright (c) 2013 Roger Pau Monne * All rights reserved. * * This software was developed by Cherry G. Mathew * under sponsorship from Spectra Logic Corporation. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include #include #include #include #include #define __ASSEMBLY__ -#include +#include #include "assym.inc" #define VTOP(x) ((x) - KERNBASE) #define ENTRY_SIZE 8 /* sizeof(uint64_t) */ #define GDT_CODE 0x08 #define GDT_DATA 0x10 .section __xen_guest ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "FreeBSD") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, __XSTRING(__FreeBSD_version)) ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .quad, KERNBASE) ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, .quad, 0) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .quad, hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .quad, HYPERVISOR_VIRT_START) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "writable_descriptor_tables|auto_translated_physmap|supervisor_mode_kernel|hvm_callback_vector") ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .long, PG_V, PG_V) ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long, 0) ELFNOTE(Xen, XEN_ELFNOTE_BSD_SYMTAB, .asciz, "yes") /* For PVHv2 support. */ ELFNOTE(Xen, XEN_ELFNOTE_PHYS32_ENTRY, .long, VTOP(xen_start32)) .text .p2align PAGE_SHIFT, 0x90 /* Hypercall_page needs to be PAGE aligned */ ENTRY(hypercall_page) .skip 0x1000, 0x90 /* Fill with "nop"s */ /* PVH entry point. */ .code32 ENTRY(xen_start32) /* Load flat GDT */ movl $VTOP(gdtdesc32), %eax lgdt (%eax) jmp $GDT_CODE, $VTOP(reload_cs) reload_cs: movw $GDT_DATA, %ax movw %ax, %ds movw %ax, %es movw %ax, %ss movl $VTOP(bootstack), %esp /* Don't trust what the loader gives for eflags. */ pushl $PSL_KERNEL popfl /* * Create the page tables. * The first 1GB is mapped using 2MB entries. 
*/ movl $0, %eax pgbuild: cmp $(PAGE_SIZE/ENTRY_SIZE), %eax jae pgbuild_done /* PT4[i] = VTOP(&PT3[0]) | PG_V | PG_RW | PG_U */ movl $VTOP(PT4), %ecx movl $VTOP(PT3), %edx orl $(PG_V | PG_RW | PG_U), %edx movl %edx, (%ecx,%eax,ENTRY_SIZE) /* PT3[i] = VTOP(&PT2[0]) | PG_V | PG_RW | PG_U */ movl $VTOP(PT3), %ecx movl $VTOP(PT2), %edx orl $(PG_V | PG_RW | PG_U), %edx movl %edx, (%ecx,%eax,ENTRY_SIZE) /* PT2[i] = i * 2MiB | PG_V | PG_RW | PG_PS | PG_U */ movl $VTOP(PT2), %ecx movl %eax, %edx shll $PDRSHIFT, %edx orl $(PG_V | PG_RW | PG_PS | PG_U), %edx movl %edx, (%ecx,%eax,ENTRY_SIZE) inc %eax jmp pgbuild pgbuild_done: /* Turn on EFER.LME */ movl $MSR_EFER, %ecx rdmsr orl $EFER_LME, %eax wrmsr /* Turn on PAE */ movl %cr4, %eax orl $CR4_PAE, %eax movl %eax, %cr4 /* Set %cr3 for PT4 */ movl $VTOP(PT4), %eax movl %eax, %cr3 /* Turn on paging (implicitly sets EFER.LMA) */ movl %cr0, %eax orl $CR0_PG, %eax movl %eax, %cr0 /* Now we're in compatibility mode. Set %cs for long mode */ movl $VTOP(gdtdesc), %eax lgdt (%eax) ljmp $GDT_CODE, $VTOP(longmode) .code64 longmode: /* We're still running V=P, jump to entry point */ movq $bootstack, %rsp movq $start_kernel, %rax pushq %rax ret start_kernel: /* * Pass %ebx as the argument to hammer_time_xen, it contains * the startup info. */ movq %rbx, %rdi call hammer_time_xen movq %rax, %rsp call mi_startup /* NOTREACHED */ 0: hlt jmp 0b /* Space for initial page tables */ .data .p2align 12,0x40 PT4: .space 0x1000 PT3: .space 0x1000 PT2: .space 0x1000 /* 64bit GDT */ gdtdesc: .word gdtend - gdt - 1 .long VTOP(gdt) # low .long 0 # high gdt: .long 0 # null descriptor .long 0 .long 0x00000000 # %cs .long 0x00209800 .long 0x00000000 # %ds .long 0x00008000 gdtend: /* 32bit GDT */ gdtdesc32: .word gdt32end - gdt32 - 1 .long VTOP(gdt32) .long 0 gdt32: .long 0 # null descriptor .long 0 .long 0x0000ffff # %cs .long 0x00cf9a00 .long 0x0000ffff # %ds, %es, %ss .long 0x00cf9200 gdt32end: diff --git a/sys/dev/xen/blkback/blkback.c b/sys/dev/xen/blkback/blkback.c index 4470cc59a29a..792933402c93 100644 --- a/sys/dev/xen/blkback/blkback.c +++ b/sys/dev/xen/blkback/blkback.c @@ -1,3959 +1,3959 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009-2012 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Justin T. Gibbs (Spectra Logic Corporation) * Ken Merry (Spectra Logic Corporation) */ #include __FBSDID("$FreeBSD$"); /** * \file blkback.c * * \brief Device driver supporting the vending of block storage from * a FreeBSD domain to other domains. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include -#include +#include +#include #include /*--------------------------- Compile-time Tunables --------------------------*/ /** * The maximum number of shared memory ring pages we will allow in a * negotiated block-front/back communication channel. Allow enough * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. */ #define XBB_MAX_RING_PAGES 32 /** * The maximum number of outstanding request blocks (request headers plus * additional segment blocks) we will allow in a negotiated block-front/back * communication channel. */ #define XBB_MAX_REQUESTS \ __CONST_RING_SIZE(blkif, PAGE_SIZE * XBB_MAX_RING_PAGES) /** * \brief Define to force all I/O to be performed on memory owned by the * backend device, with a copy-in/out to the remote domain's memory. * * \note This option is currently required when this driver's domain is * operating in HVM mode on a system using an IOMMU. * * This driver uses Xen's grant table API to gain access to the memory of * the remote domains it serves. When our domain is operating in PV mode, * the grant table mechanism directly updates our domain's page table entries * to point to the physical pages of the remote domain. This scheme guarantees * that blkback and the backing devices it uses can safely perform DMA * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to * insure that our domain cannot DMA to pages owned by another domain. As * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant * table API. For this reason, in HVM mode, we must bounce all requests into * memory that is mapped into our domain at domain startup and thus has * valid IOMMU mappings. */ #define XBB_USE_BOUNCE_BUFFERS /** * \brief Define to enable rudimentary request logging to the console. */ #undef XBB_DEBUG /*---------------------------------- Macros ----------------------------------*/ /** * Custom malloc type for all driver allocations. */ static MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); #ifdef XBB_DEBUG #define DPRINTF(fmt, args...) \ printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) #else #define DPRINTF(fmt, args...) do {} while(0) #endif /** * The maximum mapped region size per request we will allow in a negotiated * block-front/back communication channel. * Use old default of MAXPHYS == 128K. 
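/*
 * Editorial sketch (not part of the driver): the XBB_USE_BOUNCE_BUFFERS
 * rationale above says that in HVM-with-IOMMU setups the backing device is
 * never allowed to DMA into grant-mapped front-end pages, so writes are
 * copied into domain-local memory before dispatch and reads are copied back
 * out at completion.  A minimal userspace model of that copy direction,
 * with hypothetical names (bounce, grant_mapped_kva).
 */
#include <stdint.h>
#include <string.h>

/* Before dispatching a write: the device will only read local memory. */
static void
bounce_in(uint8_t *bounce, const uint8_t *grant_mapped_kva, size_t len)
{
	memcpy(bounce, grant_mapped_kva, len);
}

/* After a read completes: publish the device data back to the front end. */
static void
bounce_out(uint8_t *grant_mapped_kva, const uint8_t *bounce, size_t len)
{
	memcpy(grant_mapped_kva, bounce, len);
}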
*/ #define XBB_MAX_REQUEST_SIZE \ MIN(128 * 1024, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) /** * The maximum number of segments (within a request header and accompanying * segment blocks) per request we will allow in a negotiated block-front/back * communication channel. */ #define XBB_MAX_SEGMENTS_PER_REQUEST \ (MIN(UIO_MAXIOV, \ MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) /** * The maximum number of ring pages that we can allow per request list. * We limit this to the maximum number of segments per request, because * that is already a reasonable number of segments to aggregate. This * number should never be smaller than XBB_MAX_SEGMENTS_PER_REQUEST, * because that would leave situations where we can't dispatch even one * large request. */ #define XBB_MAX_SEGMENTS_PER_REQLIST XBB_MAX_SEGMENTS_PER_REQUEST /*--------------------------- Forward Declarations ---------------------------*/ struct xbb_softc; struct xbb_xen_req; static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) __attribute__((format(printf, 3, 4))); static int xbb_shutdown(struct xbb_softc *xbb); /*------------------------------ Data Structures -----------------------------*/ STAILQ_HEAD(xbb_xen_req_list, xbb_xen_req); typedef enum { XBB_REQLIST_NONE = 0x00, XBB_REQLIST_MAPPED = 0x01 } xbb_reqlist_flags; struct xbb_xen_reqlist { /** * Back reference to the parent block back instance for this * request. Used during bio_done handling. */ struct xbb_softc *xbb; /** * BLKIF_OP code for this request. */ int operation; /** * Set to BLKIF_RSP_* to indicate request status. * * This field allows an error status to be recorded even if the * delivery of this status must be deferred. Deferred reporting * is necessary, for example, when an error is detected during * completion processing of one bio when other bios for this * request are still outstanding. */ int status; /** * Number of 512 byte sectors not transferred. */ int residual_512b_sectors; /** * Starting sector number of the first request in the list. */ off_t starting_sector_number; /** * If we're going to coalesce, the next contiguous sector would be * this one. */ off_t next_contig_sector; /** * Number of child requests in the list. */ int num_children; /** * Number of I/O requests still pending on the backend. */ int pendcnt; /** * Total number of segments for requests in the list. */ int nr_segments; /** * Flags for this particular request list. */ xbb_reqlist_flags flags; /** * Kernel virtual address space reserved for this request * list structure and used to map the remote domain's pages for * this I/O, into our domain's address space. */ uint8_t *kva; /** * Base, pseudo-physical address, corresponding to the start * of this request's kva region. */ uint64_t gnt_base; #ifdef XBB_USE_BOUNCE_BUFFERS /** * Pre-allocated domain local memory used to proxy remote * domain memory during I/O operations. */ uint8_t *bounce; #endif /** * Array of grant handles (one per page) used to map this request. */ grant_handle_t *gnt_handles; /** * Device statistics request ordering type (ordered or simple). */ devstat_tag_type ds_tag_type; /** * Device statistics request type (read, write, no_data). */ devstat_trans_flags ds_trans_type; /** * The start time for this request. */ struct bintime ds_t0; /** * Linked list of contiguous requests with the same operation type. 
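/*
 * Editorial sketch of how the request-size macros above compose.  It assumes
 * the classic blkif limit of 11 segments per request header, 4 KiB pages and
 * UIO_MAXIOV == 1024; those constants are assumptions of this example, not
 * values taken from the diff.
 */
#include <stdio.h>

#define SKETCH_PAGE_SIZE	4096
#define SKETCH_BLKIF_SEGS	11	/* assumed BLKIF_MAX_SEGMENTS_PER_REQUEST */
#define SKETCH_UIO_MAXIOV	1024	/* assumed UIO_MAXIOV */
#define SKETCH_MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	int max_request_size = SKETCH_MIN(128 * 1024,
	    SKETCH_BLKIF_SEGS * SKETCH_PAGE_SIZE);
	int max_segs = SKETCH_MIN(SKETCH_UIO_MAXIOV,
	    SKETCH_MIN(SKETCH_BLKIF_SEGS,
	    (max_request_size / SKETCH_PAGE_SIZE) + 1));

	printf("XBB_MAX_REQUEST_SIZE ~ %d bytes\n", max_request_size);
	printf("XBB_MAX_SEGMENTS_PER_REQUEST ~ %d\n", max_segs);
	return (0);
}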
*/ struct xbb_xen_req_list contig_req_list; /** * Linked list links used to aggregate idle requests in the * request list free pool (xbb->reqlist_free_stailq) and pending * requests waiting for execution (xbb->reqlist_pending_stailq). */ STAILQ_ENTRY(xbb_xen_reqlist) links; }; STAILQ_HEAD(xbb_xen_reqlist_list, xbb_xen_reqlist); /** * \brief Object tracking an in-flight I/O from a Xen VBD consumer. */ struct xbb_xen_req { /** * Linked list links used to aggregate requests into a reqlist * and to store them in the request free pool. */ STAILQ_ENTRY(xbb_xen_req) links; /** * The remote domain's identifier for this I/O request. */ uint64_t id; /** * The number of pages currently mapped for this request. */ int nr_pages; /** * The number of 512 byte sectors comprising this requests. */ int nr_512b_sectors; /** * BLKIF_OP code for this request. */ int operation; /** * Storage used for non-native ring requests. */ blkif_request_t ring_req_storage; /** * Pointer to the Xen request in the ring. */ blkif_request_t *ring_req; /** * Consumer index for this request. */ RING_IDX req_ring_idx; /** * The start time for this request. */ struct bintime ds_t0; /** * Pointer back to our parent request list. */ struct xbb_xen_reqlist *reqlist; }; SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); /** * \brief Configuration data for the shared memory request ring * used to communicate with the front-end client of this * this driver. */ struct xbb_ring_config { /** KVA address where ring memory is mapped. */ vm_offset_t va; /** The pseudo-physical address where ring memory is mapped.*/ uint64_t gnt_addr; /** * Grant table handles, one per-ring page, returned by the * hyperpervisor upon mapping of the ring and required to * unmap it when a connection is torn down. */ grant_handle_t handle[XBB_MAX_RING_PAGES]; /** * The device bus address returned by the hypervisor when * mapping the ring and required to unmap it when a connection * is torn down. */ uint64_t bus_addr[XBB_MAX_RING_PAGES]; /** The number of ring pages mapped for the current connection. */ u_int ring_pages; /** * The grant references, one per-ring page, supplied by the * front-end, allowing us to reference the ring pages in the * front-end's domain and to map these pages into our own domain. */ grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; /** The interrupt driven even channel used to signal ring events. */ evtchn_port_t evtchn; }; /** * Per-instance connection state flags. */ typedef enum { /** * The front-end requested a read-only mount of the * back-end device/file. */ XBBF_READ_ONLY = 0x01, /** Communication with the front-end has been established. */ XBBF_RING_CONNECTED = 0x02, /** * Front-end requests exist in the ring and are waiting for * xbb_xen_req objects to free up. */ XBBF_RESOURCE_SHORTAGE = 0x04, /** Connection teardown in progress. */ XBBF_SHUTDOWN = 0x08, /** A thread is already performing shutdown processing. */ XBBF_IN_SHUTDOWN = 0x10 } xbb_flag_t; /** Backend device type. */ typedef enum { /** Backend type unknown. */ XBB_TYPE_NONE = 0x00, /** * Backend type disk (access via cdev switch * strategy routine). */ XBB_TYPE_DISK = 0x01, /** Backend type file (access vnode operations.). */ XBB_TYPE_FILE = 0x02 } xbb_type; /** * \brief Structure used to memoize information about a per-request * scatter-gather list. * * The chief benefit of using this data structure is it avoids having * to reparse the possibly discontiguous S/G list in the original * request. 
Due to the way that the mapping of the memory backing an * I/O transaction is handled by Xen, a second pass is unavoidable. * At least this way the second walk is a simple array traversal. * * \note A single Scatter/Gather element in the block interface covers * at most 1 machine page. In this context a sector (blkif * nomenclature, not what I'd choose) is a 512b aligned unit * of mapping within the machine page referenced by an S/G * element. */ struct xbb_sg { /** The number of 512b data chunks mapped in this S/G element. */ int16_t nsect; /** * The index (0 based) of the first 512b data chunk mapped * in this S/G element. */ uint8_t first_sect; /** * The index (0 based) of the last 512b data chunk mapped * in this S/G element. */ uint8_t last_sect; }; /** * Character device backend specific configuration data. */ struct xbb_dev_data { /** Cdev used for device backend access. */ struct cdev *cdev; /** Cdev switch used for device backend access. */ struct cdevsw *csw; /** Used to hold a reference on opened cdev backend devices. */ int dev_ref; }; /** * File backend specific configuration data. */ struct xbb_file_data { /** Credentials to use for vnode backed (file based) I/O. */ struct ucred *cred; /** * \brief Array of io vectors used to process file based I/O. * * Only a single file based request is outstanding per-xbb instance, * so we only need one of these. */ struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; #ifdef XBB_USE_BOUNCE_BUFFERS /** * \brief Array of io vectors used to handle bouncing of file reads. * * Vnode operations are free to modify uio data during their * exectuion. In the case of a read with bounce buffering active, * we need some of the data from the original uio in order to * bounce-out the read data. This array serves as the temporary * storage for this saved data. */ struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQLIST]; /** * \brief Array of memoized bounce buffer kva offsets used * in the file based backend. * * Due to the way that the mapping of the memory backing an * I/O transaction is handled by Xen, a second pass through * the request sg elements is unavoidable. We memoize the computed * bounce address here to reduce the cost of the second walk. */ void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQLIST]; #endif /* XBB_USE_BOUNCE_BUFFERS */ }; /** * Collection of backend type specific data. */ union xbb_backend_data { struct xbb_dev_data dev; struct xbb_file_data file; }; /** * Function signature of backend specific I/O handlers. */ typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int operation, int flags); /** * Per-instance configuration data. */ struct xbb_softc { /** * Task-queue used to process I/O requests. */ struct taskqueue *io_taskqueue; /** * Single "run the request queue" task enqueued * on io_taskqueue. */ struct task io_task; /** Device type for this instance. */ xbb_type device_type; /** NewBus device corresponding to this instance. */ device_t dev; /** Backend specific dispatch routine for this instance. */ xbb_dispatch_t dispatch_io; /** The number of requests outstanding on the backend device/file. */ int active_request_count; /** Free pool of request tracking structures. */ struct xbb_xen_req_list request_free_stailq; /** Array, sized at connection time, of request tracking structures. */ struct xbb_xen_req *requests; /** Free pool of request list structures. */ struct xbb_xen_reqlist_list reqlist_free_stailq; /** List of pending request lists awaiting execution. 
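/*
 * Editorial sketch of the 512-byte "sector within a page" arithmetic that
 * struct xbb_sg above memoizes: first_sect and last_sect index 512 B chunks
 * inside a single machine page, exactly as in a blkif S/G element.
 * Standalone model, 4 KiB pages assumed.
 */
#include <stddef.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE	4096

/* Returns the byte length covered, or 0 if the element is malformed. */
static size_t
sg_extent(uint8_t first_sect, uint8_t last_sect, size_t *offset_in_page)
{
	int nsect = (int)last_sect - (int)first_sect + 1;

	if (last_sect >= (SKETCH_PAGE_SIZE >> 9) || nsect <= 0)
		return (0);	/* same validity test the driver applies */
	*offset_in_page = (size_t)first_sect << 9;
	return ((size_t)nsect << 9);
}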
*/ struct xbb_xen_reqlist_list reqlist_pending_stailq; /** Array, sized at connection time, of request list structures. */ struct xbb_xen_reqlist *request_lists; /** * Global pool of kva used for mapping remote domain ring * and I/O transaction data. */ vm_offset_t kva; /** Pseudo-physical address corresponding to kva. */ uint64_t gnt_base_addr; /** The size of the global kva pool. */ int kva_size; /** The size of the KVA area used for request lists. */ int reqlist_kva_size; /** The number of pages of KVA used for request lists */ int reqlist_kva_pages; /** Bitmap of free KVA pages */ bitstr_t *kva_free; /** * \brief Cached value of the front-end's domain id. * * This value is used at once for each mapped page in * a transaction. We cache it to avoid incuring the * cost of an ivar access every time this is needed. */ domid_t otherend_id; /** * \brief The blkif protocol abi in effect. * * There are situations where the back and front ends can * have a different, native abi (e.g. intel x86_64 and * 32bit x86 domains on the same machine). The back-end * always accommodates the front-end's native abi. That * value is pulled from the XenStore and recorded here. */ int abi; /** * \brief The maximum number of requests and request lists allowed * to be in flight at a time. * * This value is negotiated via the XenStore. */ u_int max_requests; /** * \brief The maximum number of segments (1 page per segment) * that can be mapped by a request. * * This value is negotiated via the XenStore. */ u_int max_request_segments; /** * \brief Maximum number of segments per request list. * * This value is derived from and will generally be larger than * max_request_segments. */ u_int max_reqlist_segments; /** * The maximum size of any request to this back-end * device. * * This value is negotiated via the XenStore. */ u_int max_request_size; /** * The maximum size of any request list. This is derived directly * from max_reqlist_segments. */ u_int max_reqlist_size; /** Various configuration and state bit flags. */ xbb_flag_t flags; /** Ring mapping and interrupt configuration data. */ struct xbb_ring_config ring_config; /** Runtime, cross-abi safe, structures for ring access. */ blkif_back_rings_t rings; /** IRQ mapping for the communication ring event channel. */ xen_intr_handle_t xen_intr_handle; /** * \brief Backend access mode flags (e.g. write, or read-only). * * This value is passed to us by the front-end via the XenStore. */ char *dev_mode; /** * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). * * This value is passed to us by the front-end via the XenStore. * Currently unused. */ char *dev_type; /** * \brief Backend device/file identifier. * * This value is passed to us by the front-end via the XenStore. * We expect this to be a POSIX path indicating the file or * device to open. */ char *dev_name; /** * Vnode corresponding to the backend device node or file * we are acessing. */ struct vnode *vn; union xbb_backend_data backend; /** The native sector size of the backend. */ u_int sector_size; /** log2 of sector_size. */ u_int sector_size_shift; /** Size in bytes of the backend device or file. */ off_t media_size; /** * \brief media_size expressed in terms of the backend native * sector size. * * (e.g. xbb->media_size >> xbb->sector_size_shift). */ uint64_t media_num_sectors; /** * \brief Array of memoized scatter gather data computed during the * conversion of blkif ring requests to internal xbb_xen_req * structures. * * Ring processing is serialized so we only need one of these. 
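/*
 * Editorial sketch of the media sizing fields above: sector_size_shift is
 * log2 of the backend's native sector size and media_num_sectors is
 * media_size expressed in those sectors.  A plain userspace model, not the
 * driver's attach-time code.
 */
#include <stdint.h>

static unsigned int
sector_size_to_shift(unsigned int sector_size)
{
	unsigned int shift = 0;

	while ((1u << shift) < sector_size)
		shift++;
	return (shift);		/* e.g. 512 -> 9, 4096 -> 12 */
}

static uint64_t
media_sectors(uint64_t media_size, unsigned int sector_size_shift)
{
	return (media_size >> sector_size_shift);
}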
*/ struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQLIST]; /** * Temporary grant table map used in xbb_dispatch_io(). When * XBB_MAX_SEGMENTS_PER_REQLIST gets large, keeping this on the * stack could cause a stack overflow. */ struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQLIST]; /** Mutex protecting per-instance data. */ struct mtx lock; /** * Resource representing allocated physical address space * associated with our per-instance kva region. */ struct resource *pseudo_phys_res; /** Resource id for allocated physical address space. */ int pseudo_phys_res_id; /** * I/O statistics from BlockBack dispatch down. These are * coalesced requests, and we start them right before execution. */ struct devstat *xbb_stats; /** * I/O statistics coming into BlockBack. These are the requests as * we get them from BlockFront. They are started as soon as we * receive a request, and completed when the I/O is complete. */ struct devstat *xbb_stats_in; /** Disable sending flush to the backend */ int disable_flush; /** Send a real flush for every N flush requests */ int flush_interval; /** Count of flush requests in the interval */ int flush_count; /** Don't coalesce requests if this is set */ int no_coalesce_reqs; /** Number of requests we have received */ uint64_t reqs_received; /** Number of requests we have completed*/ uint64_t reqs_completed; /** Number of requests we queued but not pushed*/ uint64_t reqs_queued_for_completion; /** Number of requests we completed with an error status*/ uint64_t reqs_completed_with_error; /** How many forced dispatches (i.e. without coalescing) have happened */ uint64_t forced_dispatch; /** How many normal dispatches have happened */ uint64_t normal_dispatch; /** How many total dispatches have happened */ uint64_t total_dispatch; /** How many times we have run out of KVA */ uint64_t kva_shortages; /** How many times we have run out of request structures */ uint64_t request_shortages; /** Watch to wait for hotplug script execution */ struct xs_watch hotplug_watch; /** Got the needed data from hotplug scripts? */ bool hotplug_done; }; /*---------------------------- Request Processing ----------------------------*/ /** * Allocate an internal transaction tracking structure from the free pool. * * \param xbb Per-instance xbb configuration structure. * * \return On success, a pointer to the allocated xbb_xen_req structure. * Otherwise NULL. */ static inline struct xbb_xen_req * xbb_get_req(struct xbb_softc *xbb) { struct xbb_xen_req *req; req = NULL; mtx_assert(&xbb->lock, MA_OWNED); if ((req = STAILQ_FIRST(&xbb->request_free_stailq)) != NULL) { STAILQ_REMOVE_HEAD(&xbb->request_free_stailq, links); xbb->active_request_count++; } return (req); } /** * Return an allocated transaction tracking structure to the free pool. * * \param xbb Per-instance xbb configuration structure. * \param req The request structure to free. */ static inline void xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) { mtx_assert(&xbb->lock, MA_OWNED); STAILQ_INSERT_HEAD(&xbb->request_free_stailq, req, links); xbb->active_request_count--; KASSERT(xbb->active_request_count >= 0, ("xbb_release_req: negative active count")); } /** * Return an xbb_xen_req_list of allocated xbb_xen_reqs to the free pool. * * \param xbb Per-instance xbb configuration structure. * \param req_list The list of requests to free. * \param nreqs The number of items in the list. 
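/*
 * Editorial sketch: the request free pool used by xbb_get_req() and
 * xbb_release_req() above is a plain STAILQ guarded by the instance mutex.
 * The same pattern, reduced to a self-contained userspace example with a
 * hypothetical "item" type and no locking.
 */
#include <sys/queue.h>
#include <stddef.h>

struct item {
	STAILQ_ENTRY(item) links;
};
STAILQ_HEAD(item_list, item);

static struct item *
pool_get(struct item_list *free_pool)
{
	struct item *it = STAILQ_FIRST(free_pool);

	if (it != NULL)
		STAILQ_REMOVE_HEAD(free_pool, links);
	return (it);		/* NULL on shortage, as in xbb_get_req() */
}

static void
pool_put(struct item_list *free_pool, struct item *it)
{
	STAILQ_INSERT_HEAD(free_pool, it, links);
}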
*/ static inline void xbb_release_reqs(struct xbb_softc *xbb, struct xbb_xen_req_list *req_list, int nreqs) { mtx_assert(&xbb->lock, MA_OWNED); STAILQ_CONCAT(&xbb->request_free_stailq, req_list); xbb->active_request_count -= nreqs; KASSERT(xbb->active_request_count >= 0, ("xbb_release_reqs: negative active count")); } /** * Given a page index and 512b sector offset within that page, * calculate an offset into a request's kva region. * * \param reqlist The request structure whose kva region will be accessed. * \param pagenr The page index used to compute the kva offset. * \param sector The 512b sector index used to compute the page relative * kva offset. * * \return The computed global KVA offset. */ static inline uint8_t * xbb_reqlist_vaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { return (reqlist->kva + (PAGE_SIZE * pagenr) + (sector << 9)); } #ifdef XBB_USE_BOUNCE_BUFFERS /** * Given a page index and 512b sector offset within that page, * calculate an offset into a request's local bounce memory region. * * \param reqlist The request structure whose bounce region will be accessed. * \param pagenr The page index used to compute the bounce offset. * \param sector The 512b sector index used to compute the page relative * bounce offset. * * \return The computed global bounce buffer address. */ static inline uint8_t * xbb_reqlist_bounce_addr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { return (reqlist->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); } #endif /** * Given a page number and 512b sector offset within that page, * calculate an offset into the request's memory region that the * underlying backend device/file should use for I/O. * * \param reqlist The request structure whose I/O region will be accessed. * \param pagenr The page index used to compute the I/O offset. * \param sector The 512b sector index used to compute the page relative * I/O offset. * * \return The computed global I/O address. * * Depending on configuration, this will either be a local bounce buffer * or a pointer to the memory mapped in from the front-end domain for * this request. */ static inline uint8_t * xbb_reqlist_ioaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { #ifdef XBB_USE_BOUNCE_BUFFERS return (xbb_reqlist_bounce_addr(reqlist, pagenr, sector)); #else return (xbb_reqlist_vaddr(reqlist, pagenr, sector)); #endif } /** * Given a page index and 512b sector offset within that page, calculate * an offset into the local pseudo-physical address space used to map a * front-end's request data into a request. * * \param reqlist The request list structure whose pseudo-physical region * will be accessed. * \param pagenr The page index used to compute the pseudo-physical offset. * \param sector The 512b sector index used to compute the page relative * pseudo-physical offset. * * \return The computed global pseudo-phsyical address. * * Depending on configuration, this will either be a local bounce buffer * or a pointer to the memory mapped in from the front-end domain for * this request. */ static inline uintptr_t xbb_get_gntaddr(struct xbb_xen_reqlist *reqlist, int pagenr, int sector) { struct xbb_softc *xbb; xbb = reqlist->xbb; return ((uintptr_t)(xbb->gnt_base_addr + (uintptr_t)(reqlist->kva - xbb->kva) + (PAGE_SIZE * pagenr) + (sector << 9))); } /** * Get Kernel Virtual Address space for mapping requests. * * \param xbb Per-instance xbb configuration structure. * \param nr_pages Number of pages needed. 
* \param check_only If set, check for free KVA but don't allocate it. * \param have_lock If set, xbb lock is already held. * * \return On success, a pointer to the allocated KVA region. Otherwise NULL. * * Note: This should be unnecessary once we have either chaining or * scatter/gather support for struct bio. At that point we'll be able to * put multiple addresses and lengths in one bio/bio chain and won't need * to map everything into one virtual segment. */ static uint8_t * xbb_get_kva(struct xbb_softc *xbb, int nr_pages) { int first_clear; int num_clear; uint8_t *free_kva; int i; KASSERT(nr_pages != 0, ("xbb_get_kva of zero length")); first_clear = 0; free_kva = NULL; mtx_lock(&xbb->lock); /* * Look for the first available page. If there are none, we're done. */ bit_ffc(xbb->kva_free, xbb->reqlist_kva_pages, &first_clear); if (first_clear == -1) goto bailout; /* * Starting at the first available page, look for consecutive free * pages that will satisfy the user's request. */ for (i = first_clear, num_clear = 0; i < xbb->reqlist_kva_pages; i++) { /* * If this is true, the page is used, so we have to reset * the number of clear pages and the first clear page * (since it pointed to a region with an insufficient number * of clear pages). */ if (bit_test(xbb->kva_free, i)) { num_clear = 0; first_clear = -1; continue; } if (first_clear == -1) first_clear = i; /* * If this is true, we've found a large enough free region * to satisfy the request. */ if (++num_clear == nr_pages) { bit_nset(xbb->kva_free, first_clear, first_clear + nr_pages - 1); free_kva = xbb->kva + (uint8_t *)((intptr_t)first_clear * PAGE_SIZE); KASSERT(free_kva >= (uint8_t *)xbb->kva && free_kva + (nr_pages * PAGE_SIZE) <= (uint8_t *)xbb->ring_config.va, ("Free KVA %p len %d out of range, " "kva = %#jx, ring VA = %#jx\n", free_kva, nr_pages * PAGE_SIZE, (uintmax_t)xbb->kva, (uintmax_t)xbb->ring_config.va)); break; } } bailout: if (free_kva == NULL) { xbb->flags |= XBBF_RESOURCE_SHORTAGE; xbb->kva_shortages++; } mtx_unlock(&xbb->lock); return (free_kva); } /** * Free allocated KVA. * * \param xbb Per-instance xbb configuration structure. * \param kva_ptr Pointer to allocated KVA region. * \param nr_pages Number of pages in the KVA region. */ static void xbb_free_kva(struct xbb_softc *xbb, uint8_t *kva_ptr, int nr_pages) { intptr_t start_page; mtx_assert(&xbb->lock, MA_OWNED); start_page = (intptr_t)(kva_ptr - xbb->kva) >> PAGE_SHIFT; bit_nclear(xbb->kva_free, start_page, start_page + nr_pages - 1); } /** * Unmap the front-end pages associated with this I/O request. * * \param req The request structure to unmap. */ static void xbb_unmap_reqlist(struct xbb_xen_reqlist *reqlist) { struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQLIST]; u_int i; u_int invcount; int error __diagused; invcount = 0; for (i = 0; i < reqlist->nr_segments; i++) { if (reqlist->gnt_handles[i] == GRANT_REF_INVALID) continue; unmap[invcount].host_addr = xbb_get_gntaddr(reqlist, i, 0); unmap[invcount].dev_bus_addr = 0; unmap[invcount].handle = reqlist->gnt_handles[i]; reqlist->gnt_handles[i] = GRANT_REF_INVALID; invcount++; } error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, invcount); KASSERT(error == 0, ("Grant table operation failed")); } /** * Allocate an internal transaction tracking structure from the free pool. * * \param xbb Per-instance xbb configuration structure. * * \return On success, a pointer to the allocated xbb_xen_reqlist structure. * Otherwise NULL. 
*/ static inline struct xbb_xen_reqlist * xbb_get_reqlist(struct xbb_softc *xbb) { struct xbb_xen_reqlist *reqlist; reqlist = NULL; mtx_assert(&xbb->lock, MA_OWNED); if ((reqlist = STAILQ_FIRST(&xbb->reqlist_free_stailq)) != NULL) { STAILQ_REMOVE_HEAD(&xbb->reqlist_free_stailq, links); reqlist->flags = XBB_REQLIST_NONE; reqlist->kva = NULL; reqlist->status = BLKIF_RSP_OKAY; reqlist->residual_512b_sectors = 0; reqlist->num_children = 0; reqlist->nr_segments = 0; STAILQ_INIT(&reqlist->contig_req_list); } return (reqlist); } /** * Return an allocated transaction tracking structure to the free pool. * * \param xbb Per-instance xbb configuration structure. * \param req The request list structure to free. * \param wakeup If set, wakeup the work thread if freeing this reqlist * during a resource shortage condition. */ static inline void xbb_release_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int wakeup) { mtx_assert(&xbb->lock, MA_OWNED); if (wakeup) { wakeup = xbb->flags & XBBF_RESOURCE_SHORTAGE; xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; } if (reqlist->kva != NULL) xbb_free_kva(xbb, reqlist->kva, reqlist->nr_segments); xbb_release_reqs(xbb, &reqlist->contig_req_list, reqlist->num_children); STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); if ((xbb->flags & XBBF_SHUTDOWN) != 0) { /* * Shutdown is in progress. See if we can * progress further now that one more request * has completed and been returned to the * free pool. */ xbb_shutdown(xbb); } if (wakeup != 0) taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); } /** * Request resources and do basic request setup. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Pointer to reqlist pointer. * \param ring_req Pointer to a block ring request. * \param ring_index The ring index of this request. * * \return 0 for success, non-zero for failure. */ static int xbb_get_resources(struct xbb_softc *xbb, struct xbb_xen_reqlist **reqlist, blkif_request_t *ring_req, RING_IDX ring_idx) { struct xbb_xen_reqlist *nreqlist; struct xbb_xen_req *nreq; nreqlist = NULL; nreq = NULL; mtx_lock(&xbb->lock); /* * We don't allow new resources to be allocated if we're in the * process of shutting down. */ if ((xbb->flags & XBBF_SHUTDOWN) != 0) { mtx_unlock(&xbb->lock); return (1); } /* * Allocate a reqlist if the caller doesn't have one already. */ if (*reqlist == NULL) { nreqlist = xbb_get_reqlist(xbb); if (nreqlist == NULL) goto bailout_error; } /* We always allocate a request. */ nreq = xbb_get_req(xbb); if (nreq == NULL) goto bailout_error; mtx_unlock(&xbb->lock); if (*reqlist == NULL) { *reqlist = nreqlist; nreqlist->operation = ring_req->operation; nreqlist->starting_sector_number = ring_req->sector_number; STAILQ_INSERT_TAIL(&xbb->reqlist_pending_stailq, nreqlist, links); } nreq->reqlist = *reqlist; nreq->req_ring_idx = ring_idx; nreq->id = ring_req->id; nreq->operation = ring_req->operation; if (xbb->abi != BLKIF_PROTOCOL_NATIVE) { bcopy(ring_req, &nreq->ring_req_storage, sizeof(*ring_req)); nreq->ring_req = &nreq->ring_req_storage; } else { nreq->ring_req = ring_req; } binuptime(&nreq->ds_t0); devstat_start_transaction(xbb->xbb_stats_in, &nreq->ds_t0); STAILQ_INSERT_TAIL(&(*reqlist)->contig_req_list, nreq, links); (*reqlist)->num_children++; (*reqlist)->nr_segments += ring_req->nr_segments; return (0); bailout_error: /* * We're out of resources, so set the shortage flag. The next time * a request is released, we'll try waking up the work thread to * see if we can allocate more resources. 
*/ xbb->flags |= XBBF_RESOURCE_SHORTAGE; xbb->request_shortages++; if (nreq != NULL) xbb_release_req(xbb, nreq); if (nreqlist != NULL) xbb_release_reqlist(xbb, nreqlist, /*wakeup*/ 0); mtx_unlock(&xbb->lock); return (1); } /** * Create and queue a response to a blkif request. * * \param xbb Per-instance xbb configuration structure. * \param req The request structure to which to respond. * \param status The status code to report. See BLKIF_RSP_* - * in sys/xen/interface/io/blkif.h. + * in sys/contrib/xen/io/blkif.h. */ static void xbb_queue_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) { blkif_response_t *resp; /* * The mutex is required here, and should be held across this call * until after the subsequent call to xbb_push_responses(). This * is to guarantee that another context won't queue responses and * push them while we're active. * * That could lead to the other end being notified of responses * before the resources have been freed on this end. The other end * would then be able to queue additional I/O, and we may run out * of resources because we haven't freed them all yet. */ mtx_assert(&xbb->lock, MA_OWNED); /* * Place on the response ring for the relevant domain. * For now, only the spacing between entries is different * in the different ABIs, not the response entry layout. */ switch (xbb->abi) { case BLKIF_PROTOCOL_NATIVE: resp = RING_GET_RESPONSE(&xbb->rings.native, xbb->rings.native.rsp_prod_pvt); break; case BLKIF_PROTOCOL_X86_32: resp = (blkif_response_t *) RING_GET_RESPONSE(&xbb->rings.x86_32, xbb->rings.x86_32.rsp_prod_pvt); break; case BLKIF_PROTOCOL_X86_64: resp = (blkif_response_t *) RING_GET_RESPONSE(&xbb->rings.x86_64, xbb->rings.x86_64.rsp_prod_pvt); break; default: panic("Unexpected blkif protocol ABI."); } resp->id = req->id; resp->operation = req->operation; resp->status = status; if (status != BLKIF_RSP_OKAY) xbb->reqs_completed_with_error++; xbb->rings.common.rsp_prod_pvt++; xbb->reqs_queued_for_completion++; } /** * Send queued responses to blkif requests. * * \param xbb Per-instance xbb configuration structure. * \param run_taskqueue Flag that is set to 1 if the taskqueue * should be run, 0 if it does not need to be run. * \param notify Flag that is set to 1 if the other end should be * notified via irq, 0 if the other end should not be * notified. */ static void xbb_push_responses(struct xbb_softc *xbb, int *run_taskqueue, int *notify) { int more_to_do; /* * The mutex is required here. */ mtx_assert(&xbb->lock, MA_OWNED); more_to_do = 0; RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, *notify); if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { /* * Tail check for pending requests. Allows frontend to avoid * notifications if requests are already in flight (lower * overheads and promotes batching). */ RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { more_to_do = 1; } xbb->reqs_completed += xbb->reqs_queued_for_completion; xbb->reqs_queued_for_completion = 0; *run_taskqueue = more_to_do; } /** * Complete a request list. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list structure. 
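/*
 * Editorial sketch of the notification-avoidance arithmetic behind the
 * RING_PUSH_RESPONSES_AND_CHECK_NOTIFY call in xbb_push_responses() above:
 * the front end only needs an event if its rsp_event index falls inside the
 * batch of responses just published, i.e. in (old_prod, new_prod] using
 * wraparound-safe unsigned math.  This is a standalone model, not the
 * shared-ring macro itself.
 */
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t ring_idx_t;

static bool
need_notify(ring_idx_t old_prod, ring_idx_t new_prod, ring_idx_t rsp_event)
{
	return ((ring_idx_t)(new_prod - rsp_event) <
	    (ring_idx_t)(new_prod - old_prod));
}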
*/ static void xbb_complete_reqlist(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) { struct xbb_xen_req *nreq; off_t sectors_sent; int notify, run_taskqueue; sectors_sent = 0; if (reqlist->flags & XBB_REQLIST_MAPPED) xbb_unmap_reqlist(reqlist); mtx_lock(&xbb->lock); /* * All I/O is done, send the response. A lock is not necessary * to protect the request list, because all requests have * completed. Therefore this is the only context accessing this * reqlist right now. However, in order to make sure that no one * else queues responses onto the queue or pushes them to the other * side while we're active, we need to hold the lock across the * calls to xbb_queue_response() and xbb_push_responses(). */ STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { off_t cur_sectors_sent; /* Put this response on the ring, but don't push yet */ xbb_queue_response(xbb, nreq, reqlist->status); /* We don't report bytes sent if there is an error. */ if (reqlist->status == BLKIF_RSP_OKAY) cur_sectors_sent = nreq->nr_512b_sectors; else cur_sectors_sent = 0; sectors_sent += cur_sectors_sent; devstat_end_transaction(xbb->xbb_stats_in, /*bytes*/cur_sectors_sent << 9, reqlist->ds_tag_type, reqlist->ds_trans_type, /*now*/NULL, /*then*/&nreq->ds_t0); } /* * Take out any sectors not sent. If we wind up negative (which * might happen if an error is reported as well as a residual), just * report 0 sectors sent. */ sectors_sent -= reqlist->residual_512b_sectors; if (sectors_sent < 0) sectors_sent = 0; devstat_end_transaction(xbb->xbb_stats, /*bytes*/ sectors_sent << 9, reqlist->ds_tag_type, reqlist->ds_trans_type, /*now*/NULL, /*then*/&reqlist->ds_t0); xbb_release_reqlist(xbb, reqlist, /*wakeup*/ 1); xbb_push_responses(xbb, &run_taskqueue, ¬ify); mtx_unlock(&xbb->lock); if (run_taskqueue) taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); if (notify) xen_intr_signal(xbb->xen_intr_handle); } /** * Completion handler for buffer I/O requests issued by the device * backend driver. * * \param bio The buffer I/O request on which to perform completion * processing. */ static void xbb_bio_done(struct bio *bio) { struct xbb_softc *xbb; struct xbb_xen_reqlist *reqlist; reqlist = bio->bio_caller1; xbb = reqlist->xbb; reqlist->residual_512b_sectors += bio->bio_resid >> 9; /* * This is a bit imprecise. With aggregated I/O a single * request list can contain multiple front-end requests and * a multiple bios may point to a single request. By carefully * walking the request list, we could map residuals and errors * back to the original front-end request, but the interface * isn't sufficiently rich for us to properly report the error. * So, we just treat the entire request list as having failed if an * error occurs on any part. And, if an error occurs, we treat * the amount of data transferred as 0. * * For residuals, we report it on the overall aggregated device, * but not on the individual requests, since we don't currently * do the work to determine which front-end request to which the * residual applies. */ if (bio->bio_error) { DPRINTF("BIO returned error %d for operation on device %s\n", bio->bio_error, xbb->dev_name); reqlist->status = BLKIF_RSP_ERROR; if (bio->bio_error == ENXIO && xenbus_get_state(xbb->dev) == XenbusStateConnected) { /* * Backend device has disappeared. Signal the * front-end that we (the device proxy) want to * go away. 
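/*
 * Editorial sketch of the completion accounting policy described above:
 * any failed bio marks the whole request list as failed, no bytes are
 * reported for a failed list, and a residual larger than the byte count is
 * clamped to zero rather than reported as negative.  Simplified userspace
 * model with hypothetical names; the driver tracks this per request list.
 */
#include <stdint.h>

struct completion_model {
	int	error;			/* sticky: set once, never cleared */
	int64_t	sectors_sent;		/* 512 B sectors */
	int64_t	residual_sectors;
};

static void
model_bio_done(struct completion_model *c, int bio_error, int64_t bio_sectors)
{
	if (bio_error != 0)
		c->error = bio_error;
	else
		c->sectors_sent += bio_sectors;
}

static int64_t
model_final_sectors(const struct completion_model *c)
{
	int64_t sent;

	sent = (c->error != 0) ? 0 : c->sectors_sent - c->residual_sectors;
	return (sent < 0 ? 0 : sent);
}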
*/ xenbus_set_state(xbb->dev, XenbusStateClosing); } } #ifdef XBB_USE_BOUNCE_BUFFERS if (bio->bio_cmd == BIO_READ) { vm_offset_t kva_offset; kva_offset = (vm_offset_t)bio->bio_data - (vm_offset_t)reqlist->bounce; memcpy((uint8_t *)reqlist->kva + kva_offset, bio->bio_data, bio->bio_bcount); } #endif /* XBB_USE_BOUNCE_BUFFERS */ /* * Decrement the pending count for the request list. When we're * done with the requests, send status back for all of them. */ if (atomic_fetchadd_int(&reqlist->pendcnt, -1) == 1) xbb_complete_reqlist(xbb, reqlist); g_destroy_bio(bio); } /** * Parse a blkif request into an internal request structure and send * it to the backend for processing. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list structure. * * \return On success, 0. For resource shortages, non-zero. * * This routine performs the backend common aspects of request parsing * including compiling an internal request structure, parsing the S/G * list and any secondary ring requests in which they may reside, and * the mapping of front-end I/O pages into our domain. */ static int xbb_dispatch_io(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist) { struct xbb_sg *xbb_sg; struct gnttab_map_grant_ref *map; struct blkif_request_segment *sg; struct blkif_request_segment *last_block_sg; struct xbb_xen_req *nreq; u_int nseg; u_int seg_idx; u_int block_segs; int nr_sects; int total_sects; int operation; uint8_t bio_flags; int error; reqlist->ds_tag_type = DEVSTAT_TAG_SIMPLE; bio_flags = 0; total_sects = 0; nr_sects = 0; /* * First determine whether we have enough free KVA to satisfy this * request list. If not, tell xbb_run_queue() so it can go to * sleep until we have more KVA. */ reqlist->kva = NULL; if (reqlist->nr_segments != 0) { reqlist->kva = xbb_get_kva(xbb, reqlist->nr_segments); if (reqlist->kva == NULL) { /* * If we're out of KVA, return ENOMEM. */ return (ENOMEM); } } binuptime(&reqlist->ds_t0); devstat_start_transaction(xbb->xbb_stats, &reqlist->ds_t0); switch (reqlist->operation) { case BLKIF_OP_WRITE_BARRIER: bio_flags |= BIO_ORDERED; reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; /* FALLTHROUGH */ case BLKIF_OP_WRITE: operation = BIO_WRITE; reqlist->ds_trans_type = DEVSTAT_WRITE; if ((xbb->flags & XBBF_READ_ONLY) != 0) { DPRINTF("Attempt to write to read only device %s\n", xbb->dev_name); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } break; case BLKIF_OP_READ: operation = BIO_READ; reqlist->ds_trans_type = DEVSTAT_READ; break; case BLKIF_OP_FLUSH_DISKCACHE: /* * If this is true, the user has requested that we disable * flush support. So we just complete the requests * successfully. */ if (xbb->disable_flush != 0) { goto send_response; } /* * The user has requested that we only send a real flush * for every N flush requests. So keep count, and either * complete the request immediately or queue it for the * backend. 
*/ if (xbb->flush_interval != 0) { if (++(xbb->flush_count) < xbb->flush_interval) { goto send_response; } else xbb->flush_count = 0; } operation = BIO_FLUSH; reqlist->ds_tag_type = DEVSTAT_TAG_ORDERED; reqlist->ds_trans_type = DEVSTAT_NO_DATA; goto do_dispatch; /*NOTREACHED*/ default: DPRINTF("error: unknown block io operation [%d]\n", reqlist->operation); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } reqlist->xbb = xbb; xbb_sg = xbb->xbb_sgs; map = xbb->maps; seg_idx = 0; STAILQ_FOREACH(nreq, &reqlist->contig_req_list, links) { blkif_request_t *ring_req; u_int req_seg_idx; ring_req = nreq->ring_req; nr_sects = 0; nseg = ring_req->nr_segments; nreq->nr_pages = nseg; nreq->nr_512b_sectors = 0; req_seg_idx = 0; sg = NULL; /* Check that number of segments is sane. */ if (__predict_false(nseg == 0) || __predict_false(nseg > xbb->max_request_segments)) { DPRINTF("Bad number of segments in request (%d)\n", nseg); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } block_segs = nseg; sg = ring_req->seg; last_block_sg = sg + block_segs; while (sg < last_block_sg) { KASSERT(seg_idx < XBB_MAX_SEGMENTS_PER_REQLIST, ("seg_idx %d is too large, max " "segs %d\n", seg_idx, XBB_MAX_SEGMENTS_PER_REQLIST)); xbb_sg->first_sect = sg->first_sect; xbb_sg->last_sect = sg->last_sect; xbb_sg->nsect = (int8_t)(sg->last_sect - sg->first_sect + 1); if ((sg->last_sect >= (PAGE_SIZE >> 9)) || (xbb_sg->nsect <= 0)) { reqlist->status = BLKIF_RSP_ERROR; goto send_response; } nr_sects += xbb_sg->nsect; map->host_addr = xbb_get_gntaddr(reqlist, seg_idx, /*sector*/0); KASSERT(map->host_addr + PAGE_SIZE <= xbb->ring_config.gnt_addr, ("Host address %#jx len %d overlaps " "ring address %#jx\n", (uintmax_t)map->host_addr, PAGE_SIZE, (uintmax_t)xbb->ring_config.gnt_addr)); map->flags = GNTMAP_host_map; map->ref = sg->gref; map->dom = xbb->otherend_id; if (operation == BIO_WRITE) map->flags |= GNTMAP_readonly; sg++; map++; xbb_sg++; seg_idx++; req_seg_idx++; } /* Convert to the disk's sector size */ nreq->nr_512b_sectors = nr_sects; nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; total_sects += nr_sects; if ((nreq->nr_512b_sectors & ((xbb->sector_size >> 9) - 1)) != 0) { device_printf(xbb->dev, "%s: I/O size (%d) is not " "a multiple of the backing store sector " "size (%d)\n", __func__, nreq->nr_512b_sectors << 9, xbb->sector_size); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } } error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, xbb->maps, reqlist->nr_segments); if (error != 0) panic("Grant table operation failed (%d)", error); reqlist->flags |= XBB_REQLIST_MAPPED; for (seg_idx = 0, map = xbb->maps; seg_idx < reqlist->nr_segments; seg_idx++, map++){ if (__predict_false(map->status != 0)) { DPRINTF("invalid buffer -- could not remap " "it (%d)\n", map->status); DPRINTF("Mapping(%d): Host Addr 0x%"PRIx64", flags " "0x%x ref 0x%x, dom %d\n", seg_idx, map->host_addr, map->flags, map->ref, map->dom); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } reqlist->gnt_handles[seg_idx] = map->handle; } if (reqlist->starting_sector_number + total_sects > xbb->media_num_sectors) { DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " "extends past end of device %s\n", operation == BIO_READ ? 
"read" : "write", reqlist->starting_sector_number, reqlist->starting_sector_number + total_sects, xbb->dev_name); reqlist->status = BLKIF_RSP_ERROR; goto send_response; } do_dispatch: error = xbb->dispatch_io(xbb, reqlist, operation, bio_flags); if (error != 0) { reqlist->status = BLKIF_RSP_ERROR; goto send_response; } return (0); send_response: xbb_complete_reqlist(xbb, reqlist); return (0); } static __inline int xbb_count_sects(blkif_request_t *ring_req) { int i; int cur_size = 0; for (i = 0; i < ring_req->nr_segments; i++) { int nsect; nsect = (int8_t)(ring_req->seg[i].last_sect - ring_req->seg[i].first_sect + 1); if (nsect <= 0) break; cur_size += nsect; } return (cur_size); } /** * Process incoming requests from the shared communication ring in response * to a signal on the ring's event channel. * * \param context Callback argument registerd during task initialization - * the xbb_softc for this instance. * \param pending The number of taskqueue_enqueue events that have * occurred since this handler was last run. */ static void xbb_run_queue(void *context, int pending) { struct xbb_softc *xbb; blkif_back_rings_t *rings; RING_IDX rp; uint64_t cur_sector; int cur_operation; struct xbb_xen_reqlist *reqlist; xbb = (struct xbb_softc *)context; rings = &xbb->rings; /* * Work gather and dispatch loop. Note that we have a bias here * towards gathering I/O sent by blockfront. We first gather up * everything in the ring, as long as we have resources. Then we * dispatch one request, and then attempt to gather up any * additional requests that have come in while we were dispatching * the request. * * This allows us to get a clearer picture (via devstat) of how * many requests blockfront is queueing to us at any given time. */ for (;;) { int retval; /* * Initialize reqlist to the last element in the pending * queue, if there is one. This allows us to add more * requests to that request list, if we have room. */ reqlist = STAILQ_LAST(&xbb->reqlist_pending_stailq, xbb_xen_reqlist, links); if (reqlist != NULL) { cur_sector = reqlist->next_contig_sector; cur_operation = reqlist->operation; } else { cur_operation = 0; cur_sector = 0; } /* * Cache req_prod to avoid accessing a cache line shared * with the frontend. */ rp = rings->common.sring->req_prod; /* Ensure we see queued requests up to 'rp'. */ rmb(); /** * Run so long as there is work to consume and the generation * of a response will not overflow the ring. * * @note There's a 1 to 1 relationship between requests and * responses, so an overflow should never occur. This * test is to protect our domain from digesting bogus * data. Shouldn't we log this? 
*/ while (rings->common.req_cons != rp && RING_REQUEST_CONS_OVERFLOW(&rings->common, rings->common.req_cons) == 0){ blkif_request_t ring_req_storage; blkif_request_t *ring_req; int cur_size; switch (xbb->abi) { case BLKIF_PROTOCOL_NATIVE: ring_req = RING_GET_REQUEST(&xbb->rings.native, rings->common.req_cons); break; case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_request *ring_req32; ring_req32 = RING_GET_REQUEST( &xbb->rings.x86_32, rings->common.req_cons); blkif_get_x86_32_req(&ring_req_storage, ring_req32); ring_req = &ring_req_storage; break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_request *ring_req64; ring_req64 =RING_GET_REQUEST(&xbb->rings.x86_64, rings->common.req_cons); blkif_get_x86_64_req(&ring_req_storage, ring_req64); ring_req = &ring_req_storage; break; } default: panic("Unexpected blkif protocol ABI."); /* NOTREACHED */ } /* * Check for situations that would require closing * off this I/O for further coalescing: * - Coalescing is turned off. * - Current I/O is out of sequence with the previous * I/O. * - Coalesced I/O would be too large. */ if ((reqlist != NULL) && ((xbb->no_coalesce_reqs != 0) || ((xbb->no_coalesce_reqs == 0) && ((ring_req->sector_number != cur_sector) || (ring_req->operation != cur_operation) || ((ring_req->nr_segments + reqlist->nr_segments) > xbb->max_reqlist_segments))))) { reqlist = NULL; } /* * Grab and check for all resources in one shot. * If we can't get all of the resources we need, * the shortage is noted and the thread will get * woken up when more resources are available. */ retval = xbb_get_resources(xbb, &reqlist, ring_req, xbb->rings.common.req_cons); if (retval != 0) { /* * Resource shortage has been recorded. * We'll be scheduled to run once a request * object frees up due to a completion. */ break; } /* * Signify that we can overwrite this request with * a response by incrementing our consumer index. * The response won't be generated until after * we've already consumed all necessary data out * of the version of the request in the ring buffer * (for native mode). We must update the consumer * index before issuing back-end I/O so there is * no possibility that it will complete and a * response be generated before we make room in * the queue for that response. */ xbb->rings.common.req_cons++; xbb->reqs_received++; cur_size = xbb_count_sects(ring_req); cur_sector = ring_req->sector_number + cur_size; reqlist->next_contig_sector = cur_sector; cur_operation = ring_req->operation; } /* Check for I/O to dispatch */ reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); if (reqlist == NULL) { /* * We're out of work to do, put the task queue to * sleep. */ break; } /* * Grab the first request off the queue and attempt * to dispatch it. */ STAILQ_REMOVE_HEAD(&xbb->reqlist_pending_stailq, links); retval = xbb_dispatch_io(xbb, reqlist); if (retval != 0) { /* * xbb_dispatch_io() returns non-zero only when * there is a resource shortage. If that's the * case, re-queue this request on the head of the * queue, and go to sleep until we have more * resources. */ STAILQ_INSERT_HEAD(&xbb->reqlist_pending_stailq, reqlist, links); break; } else { /* * If we still have anything on the queue after * removing the head entry, that is because we * met one of the criteria to create a new * request list (outlined above), and we'll call * that a forced dispatch for statistical purposes. * * Otherwise, if there is only one element on the * queue, we coalesced everything available on * the ring and we'll call that a normal dispatch. 
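/*
 * Editorial sketch: the coalescing criteria listed above, folded into a
 * single predicate.  Hypothetical parameter names; the driver evaluates the
 * same three conditions inline in xbb_run_queue().
 */
#include <stdbool.h>
#include <stdint.h>

static bool
can_coalesce(bool coalescing_enabled, uint64_t next_contig_sector,
    int cur_operation, int cur_nr_segments, int max_reqlist_segments,
    uint64_t req_sector, int req_operation, int req_nr_segments)
{
	if (!coalescing_enabled)
		return (false);
	if (req_sector != next_contig_sector || req_operation != cur_operation)
		return (false);
	return (cur_nr_segments + req_nr_segments <= max_reqlist_segments);
}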
*/ reqlist = STAILQ_FIRST(&xbb->reqlist_pending_stailq); if (reqlist != NULL) xbb->forced_dispatch++; else xbb->normal_dispatch++; xbb->total_dispatch++; } } } /** * Interrupt handler bound to the shared ring's event channel. * * \param arg Callback argument registerd during event channel * binding - the xbb_softc for this instance. */ static int xbb_filter(void *arg) { struct xbb_softc *xbb; /* Defer to taskqueue thread. */ xbb = (struct xbb_softc *)arg; taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); return (FILTER_HANDLED); } SDT_PROVIDER_DEFINE(xbb); SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_dev, flush, "int"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, read, "int", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_dev, write, "int", "uint64_t", "uint64_t"); /*----------------------------- Backend Handlers -----------------------------*/ /** * Backend handler for character device access. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list structure. * \param operation BIO_* I/O operation code. * \param bio_flags Additional bio_flag data to pass to any generated * bios (e.g. BIO_ORDERED).. * * \return 0 for success, errno codes for failure. */ static int xbb_dispatch_dev(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int operation, int bio_flags) { struct xbb_dev_data *dev_data; struct bio *bios[XBB_MAX_SEGMENTS_PER_REQLIST]; off_t bio_offset; struct bio *bio; struct xbb_sg *xbb_sg; u_int nbio; u_int bio_idx; u_int nseg; u_int seg_idx; int error; dev_data = &xbb->backend.dev; bio_offset = (off_t)reqlist->starting_sector_number << xbb->sector_size_shift; error = 0; nbio = 0; bio_idx = 0; if (operation == BIO_FLUSH) { bio = g_new_bio(); if (__predict_false(bio == NULL)) { DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); error = ENOMEM; return (error); } bio->bio_cmd = BIO_FLUSH; bio->bio_flags |= BIO_ORDERED; bio->bio_dev = dev_data->cdev; bio->bio_offset = 0; bio->bio_data = 0; bio->bio_done = xbb_bio_done; bio->bio_caller1 = reqlist; bio->bio_pblkno = 0; reqlist->pendcnt = 1; SDT_PROBE1(xbb, kernel, xbb_dispatch_dev, flush, device_get_unit(xbb->dev)); (*dev_data->csw->d_strategy)(bio); return (0); } xbb_sg = xbb->xbb_sgs; bio = NULL; nseg = reqlist->nr_segments; for (seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { /* * KVA will not be contiguous, so any additional * I/O will need to be represented in a new bio. */ if ((bio != NULL) && (xbb_sg->first_sect != 0)) { if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { printf("%s: Discontiguous I/O request " "from domain %d ends on " "non-sector boundary\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } bio = NULL; } if (bio == NULL) { /* * Make sure that the start of this bio is * aligned to a device sector. 
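/*
 * Editorial sketch of the "new bio per discontiguous KVA run" rule used in
 * the segment loop here: a segment that does not start at chunk 0 of its
 * page, or that ends before the last 512 B chunk of its page, breaks KVA
 * contiguity, so the current bio must be closed and a new one started.
 * Hypothetical helpers, 4 KiB pages assumed.
 */
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE	4096

static bool
segment_starts_new_bio(uint8_t first_sect)
{
	return (first_sect != 0);
}

static bool
segment_ends_bio(uint8_t last_sect)
{
	return (last_sect != ((SKETCH_PAGE_SIZE - 512) >> 9));
}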
*/ if ((bio_offset & (xbb->sector_size - 1)) != 0){ printf("%s: Misaligned I/O request " "from domain %d\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } bio = bios[nbio++] = g_new_bio(); if (__predict_false(bio == NULL)) { error = ENOMEM; goto fail_free_bios; } bio->bio_cmd = operation; bio->bio_flags |= bio_flags; bio->bio_dev = dev_data->cdev; bio->bio_offset = bio_offset; bio->bio_data = xbb_reqlist_ioaddr(reqlist, seg_idx, xbb_sg->first_sect); bio->bio_done = xbb_bio_done; bio->bio_caller1 = reqlist; bio->bio_pblkno = bio_offset >> xbb->sector_size_shift; } bio->bio_length += xbb_sg->nsect << 9; bio->bio_bcount = bio->bio_length; bio_offset += xbb_sg->nsect << 9; if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { printf("%s: Discontiguous I/O request " "from domain %d ends on " "non-sector boundary\n", __func__, xbb->otherend_id); error = EINVAL; goto fail_free_bios; } /* * KVA will not be contiguous, so any additional * I/O will need to be represented in a new bio. */ bio = NULL; } } reqlist->pendcnt = nbio; for (bio_idx = 0; bio_idx < nbio; bio_idx++) { #ifdef XBB_USE_BOUNCE_BUFFERS vm_offset_t kva_offset; kva_offset = (vm_offset_t)bios[bio_idx]->bio_data - (vm_offset_t)reqlist->bounce; if (operation == BIO_WRITE) { memcpy(bios[bio_idx]->bio_data, (uint8_t *)reqlist->kva + kva_offset, bios[bio_idx]->bio_bcount); } #endif if (operation == BIO_READ) { SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, read, device_get_unit(xbb->dev), bios[bio_idx]->bio_offset, bios[bio_idx]->bio_length); } else if (operation == BIO_WRITE) { SDT_PROBE3(xbb, kernel, xbb_dispatch_dev, write, device_get_unit(xbb->dev), bios[bio_idx]->bio_offset, bios[bio_idx]->bio_length); } (*dev_data->csw->d_strategy)(bios[bio_idx]); } return (error); fail_free_bios: for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++) g_destroy_bio(bios[bio_idx]); return (error); } SDT_PROBE_DEFINE1(xbb, kernel, xbb_dispatch_file, flush, "int"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, read, "int", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE3(xbb, kernel, xbb_dispatch_file, write, "int", "uint64_t", "uint64_t"); /** * Backend handler for file access. * * \param xbb Per-instance xbb configuration structure. * \param reqlist Allocated internal request list. * \param operation BIO_* I/O operation code. * \param flags Additional bio_flag data to pass to any generated bios * (e.g. BIO_ORDERED).. * * \return 0 for success, errno codes for failure. 
*/ static int xbb_dispatch_file(struct xbb_softc *xbb, struct xbb_xen_reqlist *reqlist, int operation, int flags) { struct xbb_file_data *file_data; u_int seg_idx; u_int nseg; struct uio xuio; struct xbb_sg *xbb_sg; struct iovec *xiovec; #ifdef XBB_USE_BOUNCE_BUFFERS void **p_vaddr; int saved_uio_iovcnt; #endif /* XBB_USE_BOUNCE_BUFFERS */ int error; file_data = &xbb->backend.file; error = 0; bzero(&xuio, sizeof(xuio)); switch (operation) { case BIO_READ: xuio.uio_rw = UIO_READ; break; case BIO_WRITE: xuio.uio_rw = UIO_WRITE; break; case BIO_FLUSH: { struct mount *mountpoint; SDT_PROBE1(xbb, kernel, xbb_dispatch_file, flush, device_get_unit(xbb->dev)); (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT); vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread); VOP_UNLOCK(xbb->vn); vn_finished_write(mountpoint); goto bailout_send_response; /* NOTREACHED */ } default: panic("invalid operation %d", operation); /* NOTREACHED */ } xuio.uio_offset = (vm_offset_t)reqlist->starting_sector_number << xbb->sector_size_shift; xuio.uio_segflg = UIO_SYSSPACE; xuio.uio_iov = file_data->xiovecs; xuio.uio_iovcnt = 0; xbb_sg = xbb->xbb_sgs; nseg = reqlist->nr_segments; for (xiovec = NULL, seg_idx = 0; seg_idx < nseg; seg_idx++, xbb_sg++) { /* * If the first sector is not 0, the KVA will * not be contiguous and we'll need to go on * to another segment. */ if (xbb_sg->first_sect != 0) xiovec = NULL; if (xiovec == NULL) { xiovec = &file_data->xiovecs[xuio.uio_iovcnt]; xiovec->iov_base = xbb_reqlist_ioaddr(reqlist, seg_idx, xbb_sg->first_sect); #ifdef XBB_USE_BOUNCE_BUFFERS /* * Store the address of the incoming * buffer at this particular offset * as well, so we can do the copy * later without having to do more * work to recalculate this address. */ p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt]; *p_vaddr = xbb_reqlist_vaddr(reqlist, seg_idx, xbb_sg->first_sect); #endif /* XBB_USE_BOUNCE_BUFFERS */ xiovec->iov_len = 0; xuio.uio_iovcnt++; } xiovec->iov_len += xbb_sg->nsect << 9; xuio.uio_resid += xbb_sg->nsect << 9; /* * If the last sector is not the full page * size count, the next segment will not be * contiguous in KVA and we need a new iovec. */ if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) xiovec = NULL; } xuio.uio_td = curthread; #ifdef XBB_USE_BOUNCE_BUFFERS saved_uio_iovcnt = xuio.uio_iovcnt; if (operation == BIO_WRITE) { /* Copy the write data to the local buffer. */ for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt; seg_idx++, xiovec++, p_vaddr++) { memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len); } } else { /* * We only need to save off the iovecs in the case of a * read, because the copy for the read happens after the * VOP_READ(). (The uio will get modified in that call * sequence.) */ memcpy(file_data->saved_xiovecs, xuio.uio_iov, xuio.uio_iovcnt * sizeof(xuio.uio_iov[0])); } #endif /* XBB_USE_BOUNCE_BUFFERS */ switch (operation) { case BIO_READ: SDT_PROBE3(xbb, kernel, xbb_dispatch_file, read, device_get_unit(xbb->dev), xuio.uio_offset, xuio.uio_resid); vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); /* * UFS pays attention to IO_DIRECT for reads. If the * DIRECTIO option is configured into the kernel, it calls * ffs_rawread(). But that only works for single-segment * uios with user space addresses. In our case, with a * kernel uio, it still reads into the buffer cache, but it * will just try to release the buffer from the cache later * on in ffs_read(). 
* * ZFS does not pay attention to IO_DIRECT for reads. * * UFS does not pay attention to IO_SYNC for reads. * * ZFS pays attention to IO_SYNC (which translates into the * Solaris define FRSYNC for zfs_read()) for reads. It * attempts to sync the file before reading. * * So, to attempt to provide some barrier semantics in the * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC. */ error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ? (IO_DIRECT|IO_SYNC) : 0, file_data->cred); VOP_UNLOCK(xbb->vn); break; case BIO_WRITE: { struct mount *mountpoint; SDT_PROBE3(xbb, kernel, xbb_dispatch_file, write, device_get_unit(xbb->dev), xuio.uio_offset, xuio.uio_resid); (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT); vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY); /* * UFS pays attention to IO_DIRECT for writes. The write * is done asynchronously. (Normally the write would just * get put into cache. * * UFS pays attention to IO_SYNC for writes. It will * attempt to write the buffer out synchronously if that * flag is set. * * ZFS does not pay attention to IO_DIRECT for writes. * * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC) * for writes. It will flush the transaction from the * cache before returning. * * So if we've got the BIO_ORDERED flag set, we want * IO_SYNC in either the UFS or ZFS case. */ error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? IO_SYNC : 0, file_data->cred); VOP_UNLOCK(xbb->vn); vn_finished_write(mountpoint); break; } default: panic("invalid operation %d", operation); /* NOTREACHED */ } #ifdef XBB_USE_BOUNCE_BUFFERS /* We only need to copy here for read operations */ if (operation == BIO_READ) { for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, xiovec = file_data->saved_xiovecs; seg_idx < saved_uio_iovcnt; seg_idx++, xiovec++, p_vaddr++) { /* * Note that we have to use the copy of the * io vector we made above. uiomove() modifies * the uio and its referenced vector as uiomove * performs the copy, so we can't rely on any * state from the original uio. */ memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); } } #endif /* XBB_USE_BOUNCE_BUFFERS */ bailout_send_response: if (error != 0) reqlist->status = BLKIF_RSP_ERROR; xbb_complete_reqlist(xbb, reqlist); return (0); } /*--------------------------- Backend Configuration --------------------------*/ /** * Close and cleanup any backend device/file specific state for this * block back instance. * * \param xbb Per-instance xbb configuration structure. */ static void xbb_close_backend(struct xbb_softc *xbb) { DROP_GIANT(); DPRINTF("closing dev=%s\n", xbb->dev_name); if (xbb->vn) { int flags = FREAD; if ((xbb->flags & XBBF_READ_ONLY) == 0) flags |= FWRITE; switch (xbb->device_type) { case XBB_TYPE_DISK: if (xbb->backend.dev.csw) { dev_relthread(xbb->backend.dev.cdev, xbb->backend.dev.dev_ref); xbb->backend.dev.csw = NULL; xbb->backend.dev.cdev = NULL; } break; case XBB_TYPE_FILE: break; case XBB_TYPE_NONE: default: panic("Unexpected backend type."); break; } (void)vn_close(xbb->vn, flags, NOCRED, curthread); xbb->vn = NULL; switch (xbb->device_type) { case XBB_TYPE_DISK: break; case XBB_TYPE_FILE: if (xbb->backend.file.cred != NULL) { crfree(xbb->backend.file.cred); xbb->backend.file.cred = NULL; } break; case XBB_TYPE_NONE: default: panic("Unexpected backend type."); break; } } PICKUP_GIANT(); } /** * Open a character device to be used for backend I/O. * * \param xbb Per-instance xbb configuration structure. * * \return 0 for success, errno codes for failure. 
*/ static int xbb_open_dev(struct xbb_softc *xbb) { struct vattr vattr; struct cdev *dev; struct cdevsw *devsw; int error; xbb->device_type = XBB_TYPE_DISK; xbb->dispatch_io = xbb_dispatch_dev; xbb->backend.dev.cdev = xbb->vn->v_rdev; xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, &xbb->backend.dev.dev_ref); if (xbb->backend.dev.csw == NULL) panic("Unable to retrieve device switch"); error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); if (error) { xenbus_dev_fatal(xbb->dev, error, "error getting " "vnode attributes for device %s", xbb->dev_name); return (error); } dev = xbb->vn->v_rdev; devsw = dev->si_devsw; if (!devsw->d_ioctl) { xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " "device %s!", xbb->dev_name); return (ENODEV); } error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, (caddr_t)&xbb->sector_size, FREAD, curthread); if (error) { xenbus_dev_fatal(xbb->dev, error, "error calling ioctl DIOCGSECTORSIZE " "for device %s", xbb->dev_name); return (error); } error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, (caddr_t)&xbb->media_size, FREAD, curthread); if (error) { xenbus_dev_fatal(xbb->dev, error, "error calling ioctl DIOCGMEDIASIZE " "for device %s", xbb->dev_name); return (error); } return (0); } /** * Open a file to be used for backend I/O. * * \param xbb Per-instance xbb configuration structure. * * \return 0 for success, errno codes for failure. */ static int xbb_open_file(struct xbb_softc *xbb) { struct xbb_file_data *file_data; struct vattr vattr; int error; file_data = &xbb->backend.file; xbb->device_type = XBB_TYPE_FILE; xbb->dispatch_io = xbb_dispatch_file; error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "error calling VOP_GETATTR()" "for file %s", xbb->dev_name); return (error); } /* * Verify that we have the ability to upgrade to exclusive * access on this file so we can trap errors at open instead * of reporting them during first access. */ if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) { vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY); if (VN_IS_DOOMED(xbb->vn)) { error = EBADF; xenbus_dev_fatal(xbb->dev, error, "error locking file %s", xbb->dev_name); return (error); } } file_data->cred = crhold(curthread->td_ucred); xbb->media_size = vattr.va_size; /* * XXX KDM vattr.va_blocksize may be larger than 512 bytes here. * With ZFS, it is 131072 bytes. Block sizes that large don't work * with disklabel and UFS on FreeBSD at least. Large block sizes * may not work with other OSes as well. So just export a sector * size of 512 bytes, which should work with any OS or * application. Since our backing is a file, any block size will * work fine for the backing store. */ #if 0 xbb->sector_size = vattr.va_blocksize; #endif xbb->sector_size = 512; /* * Sanity check. The media size has to be at least one * sector long. */ if (xbb->media_size < xbb->sector_size) { error = EINVAL; xenbus_dev_fatal(xbb->dev, error, "file %s size %ju < block size %u", xbb->dev_name, (uintmax_t)xbb->media_size, xbb->sector_size); } return (error); } /** * Open the backend provider for this connection. * * \param xbb Per-instance xbb configuration structure. * * \return 0 for success, errno codes for failure. 
*/ static int xbb_open_backend(struct xbb_softc *xbb) { struct nameidata nd; int flags; int error; flags = FREAD; error = 0; DPRINTF("opening dev=%s\n", xbb->dev_name); if (rootvnode == NULL) { xenbus_dev_fatal(xbb->dev, ENOENT, "Root file system not mounted"); return (ENOENT); } if ((xbb->flags & XBBF_READ_ONLY) == 0) flags |= FWRITE; pwd_ensure_dirs(); again: NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name); error = vn_open(&nd, &flags, 0, NULL); if (error) { /* * This is the only reasonable guess we can make as far as * path if the user doesn't give us a fully qualified path. * If they want to specify a file, they need to specify the * full path. */ if (xbb->dev_name[0] != '/') { char *dev_path = "/dev/"; char *dev_name; /* Try adding device path at beginning of name */ dev_name = malloc(strlen(xbb->dev_name) + strlen(dev_path) + 1, M_XENBLOCKBACK, M_NOWAIT); if (dev_name) { sprintf(dev_name, "%s%s", dev_path, xbb->dev_name); free(xbb->dev_name, M_XENBLOCKBACK); xbb->dev_name = dev_name; goto again; } } xenbus_dev_fatal(xbb->dev, error, "error opening device %s", xbb->dev_name); return (error); } NDFREE(&nd, NDF_ONLY_PNBUF); xbb->vn = nd.ni_vp; /* We only support disks and files. */ if (vn_isdisk_error(xbb->vn, &error)) { error = xbb_open_dev(xbb); } else if (xbb->vn->v_type == VREG) { error = xbb_open_file(xbb); } else { error = EINVAL; xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " "or file", xbb->dev_name); } VOP_UNLOCK(xbb->vn); if (error != 0) { xbb_close_backend(xbb); return (error); } xbb->sector_size_shift = fls(xbb->sector_size) - 1; xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", xbb->dev_name, xbb->sector_size, xbb->media_size); return (0); } /*------------------------ Inter-Domain Communication ------------------------*/ /** * Free dynamically allocated KVA or pseudo-physical address allocations. * * \param xbb Per-instance xbb configuration structure. */ static void xbb_free_communication_mem(struct xbb_softc *xbb) { if (xbb->kva != 0) { if (xbb->pseudo_phys_res != NULL) { xenmem_free(xbb->dev, xbb->pseudo_phys_res_id, xbb->pseudo_phys_res); xbb->pseudo_phys_res = NULL; } } xbb->kva = 0; xbb->gnt_base_addr = 0; if (xbb->kva_free != NULL) { free(xbb->kva_free, M_XENBLOCKBACK); xbb->kva_free = NULL; } } /** * Cleanup all inter-domain communication mechanisms. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_disconnect(struct xbb_softc *xbb) { struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; struct gnttab_unmap_grant_ref *op; u_int ring_idx; int error; DPRINTF("\n"); if ((xbb->flags & XBBF_RING_CONNECTED) == 0) return (0); mtx_unlock(&xbb->lock); xen_intr_unbind(&xbb->xen_intr_handle); taskqueue_drain(xbb->io_taskqueue, &xbb->io_task); mtx_lock(&xbb->lock); /* * No new interrupts can generate work, but we must wait * for all currently active requests to drain. 
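	 *
	 * Returning EAGAIN below leaves the ring connected for now; the
	 * shutdown path records this and xbb_shutdown() is invoked again
	 * once the outstanding requests have completed.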
*/ if (xbb->active_request_count != 0) return (EAGAIN); for (ring_idx = 0, op = ops; ring_idx < xbb->ring_config.ring_pages; ring_idx++, op++) { op->host_addr = xbb->ring_config.gnt_addr + (ring_idx * PAGE_SIZE); op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; op->handle = xbb->ring_config.handle[ring_idx]; } error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, xbb->ring_config.ring_pages); if (error != 0) panic("Grant table op failed (%d)", error); xbb_free_communication_mem(xbb); if (xbb->requests != NULL) { free(xbb->requests, M_XENBLOCKBACK); xbb->requests = NULL; } if (xbb->request_lists != NULL) { struct xbb_xen_reqlist *reqlist; int i; /* There is one request list for ever allocated request. */ for (i = 0, reqlist = xbb->request_lists; i < xbb->max_requests; i++, reqlist++){ #ifdef XBB_USE_BOUNCE_BUFFERS if (reqlist->bounce != NULL) { free(reqlist->bounce, M_XENBLOCKBACK); reqlist->bounce = NULL; } #endif if (reqlist->gnt_handles != NULL) { free(reqlist->gnt_handles, M_XENBLOCKBACK); reqlist->gnt_handles = NULL; } } free(xbb->request_lists, M_XENBLOCKBACK); xbb->request_lists = NULL; } xbb->flags &= ~XBBF_RING_CONNECTED; return (0); } /** * Map shared memory ring into domain local address space, initialize * ring control structures, and bind an interrupt to the event channel * used to notify us of ring changes. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_connect_ring(struct xbb_softc *xbb) { struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; struct gnttab_map_grant_ref *gnt; u_int ring_idx; int error; if ((xbb->flags & XBBF_RING_CONNECTED) != 0) return (0); /* * Kva for our ring is at the tail of the region of kva allocated * by xbb_alloc_communication_mem(). */ xbb->ring_config.va = xbb->kva + (xbb->kva_size - (xbb->ring_config.ring_pages * PAGE_SIZE)); xbb->ring_config.gnt_addr = xbb->gnt_base_addr + (xbb->kva_size - (xbb->ring_config.ring_pages * PAGE_SIZE)); for (ring_idx = 0, gnt = gnts; ring_idx < xbb->ring_config.ring_pages; ring_idx++, gnt++) { gnt->host_addr = xbb->ring_config.gnt_addr + (ring_idx * PAGE_SIZE); gnt->flags = GNTMAP_host_map; gnt->ref = xbb->ring_config.ring_ref[ring_idx]; gnt->dom = xbb->otherend_id; } error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, xbb->ring_config.ring_pages); if (error) panic("blkback: Ring page grant table op failed (%d)", error); for (ring_idx = 0, gnt = gnts; ring_idx < xbb->ring_config.ring_pages; ring_idx++, gnt++) { if (gnt->status != 0) { struct gnttab_unmap_grant_ref unmap[XBB_MAX_RING_PAGES]; unsigned int i, j; xbb->ring_config.va = 0; xenbus_dev_fatal(xbb->dev, EACCES, "Ring shared page mapping failed. " "Status %d.", gnt->status); /* Unmap everything to avoid leaking grant table maps */ for (i = 0, j = 0; i < xbb->ring_config.ring_pages; i++) { if (gnts[i].status != GNTST_okay) continue; unmap[j].host_addr = gnts[i].host_addr; unmap[j].dev_bus_addr = gnts[i].dev_bus_addr; unmap[j++].handle = gnts[i].handle; } if (j != 0) { error = HYPERVISOR_grant_table_op( GNTTABOP_unmap_grant_ref, unmap, j); if (error != 0) panic("Unable to unmap grants (%d)", error); } return (EACCES); } xbb->ring_config.handle[ring_idx] = gnt->handle; xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; } /* Initialize the ring based on ABI. 
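	 *
	 * The ABI was negotiated in xbb_collect_frontend_info() from the
	 * front-end's "protocol" XenStore node; the native layout is
	 * assumed when the front-end does not publish one.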
*/ switch (xbb->abi) { case BLKIF_PROTOCOL_NATIVE: { blkif_sring_t *sring; sring = (blkif_sring_t *)xbb->ring_config.va; BACK_RING_INIT(&xbb->rings.native, sring, xbb->ring_config.ring_pages * PAGE_SIZE); break; } case BLKIF_PROTOCOL_X86_32: { blkif_x86_32_sring_t *sring_x86_32; sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, xbb->ring_config.ring_pages * PAGE_SIZE); break; } case BLKIF_PROTOCOL_X86_64: { blkif_x86_64_sring_t *sring_x86_64; sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, xbb->ring_config.ring_pages * PAGE_SIZE); break; } default: panic("Unexpected blkif protocol ABI."); } xbb->flags |= XBBF_RING_CONNECTED; error = xen_intr_bind_remote_port(xbb->dev, xbb->otherend_id, xbb->ring_config.evtchn, xbb_filter, /*ithread_handler*/NULL, /*arg*/xbb, INTR_TYPE_BIO | INTR_MPSAFE, &xbb->xen_intr_handle); if (error) { (void)xbb_disconnect(xbb); xenbus_dev_fatal(xbb->dev, error, "binding event channel"); return (error); } DPRINTF("rings connected!\n"); return 0; } /** * Size KVA and pseudo-physical address allocations based on negotiated * values for the size and number of I/O requests, and the size of our * communication ring. * * \param xbb Per-instance xbb configuration structure. * * These address spaces are used to dynamically map pages in the * front-end's domain into our own. */ static int xbb_alloc_communication_mem(struct xbb_softc *xbb) { xbb->reqlist_kva_pages = xbb->max_requests * xbb->max_request_segments; xbb->reqlist_kva_size = xbb->reqlist_kva_pages * PAGE_SIZE; xbb->kva_size = xbb->reqlist_kva_size + (xbb->ring_config.ring_pages * PAGE_SIZE); xbb->kva_free = bit_alloc(xbb->reqlist_kva_pages, M_XENBLOCKBACK, M_NOWAIT); if (xbb->kva_free == NULL) return (ENOMEM); DPRINTF("%s: kva_size = %d, reqlist_kva_size = %d\n", device_get_nameunit(xbb->dev), xbb->kva_size, xbb->reqlist_kva_size); /* * Reserve a range of pseudo physical memory that we can map * into kva. These pages will only be backed by machine * pages ("real memory") during the lifetime of front-end requests * via grant table operations. */ xbb->pseudo_phys_res_id = 0; xbb->pseudo_phys_res = xenmem_alloc(xbb->dev, &xbb->pseudo_phys_res_id, xbb->kva_size); if (xbb->pseudo_phys_res == NULL) { xbb->kva = 0; return (ENOMEM); } xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); DPRINTF("%s: kva: %#jx, gnt_base_addr: %#jx\n", device_get_nameunit(xbb->dev), (uintmax_t)xbb->kva, (uintmax_t)xbb->gnt_base_addr); return (0); } /** * Collect front-end information from the XenStore. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_collect_frontend_info(struct xbb_softc *xbb) { char protocol_abi[64]; const char *otherend_path; int error; u_int ring_idx; u_int ring_page_order; size_t ring_size; otherend_path = xenbus_get_otherend_path(xbb->dev); /* * Protocol defaults valid even if all negotiation fails. */ xbb->ring_config.ring_pages = 1; xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; /* * Mandatory data (used in all versions of the protocol) first. */ error = xs_scanf(XST_NIL, otherend_path, "event-channel", NULL, "%" PRIu32, &xbb->ring_config.evtchn); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Unable to retrieve event-channel information " "from frontend %s. 
Unable to connect.", xenbus_get_otherend_path(xbb->dev)); return (error); } /* * These fields are initialized to legacy protocol defaults * so we only need to fail if reading the updated value succeeds * and the new value is outside of its allowed range. * * \note xs_gather() returns on the first encountered error, so * we must use independent calls in order to guarantee * we don't miss information in a sparsly populated front-end * tree. * * \note xs_scanf() does not update variables for unmatched * fields. */ ring_page_order = 0; xbb->max_requests = 32; (void)xs_scanf(XST_NIL, otherend_path, "ring-page-order", NULL, "%u", &ring_page_order); xbb->ring_config.ring_pages = 1 << ring_page_order; ring_size = PAGE_SIZE * xbb->ring_config.ring_pages; xbb->max_requests = BLKIF_MAX_RING_REQUESTS(ring_size); if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) { xenbus_dev_fatal(xbb->dev, EINVAL, "Front-end specified ring-pages of %u " "exceeds backend limit of %u. " "Unable to connect.", xbb->ring_config.ring_pages, XBB_MAX_RING_PAGES); return (EINVAL); } if (xbb->ring_config.ring_pages == 1) { error = xs_gather(XST_NIL, otherend_path, "ring-ref", "%" PRIu32, &xbb->ring_config.ring_ref[0], NULL); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Unable to retrieve ring information " "from frontend %s. Unable to " "connect.", xenbus_get_otherend_path(xbb->dev)); return (error); } } else { /* Multi-page ring format. */ for (ring_idx = 0; ring_idx < xbb->ring_config.ring_pages; ring_idx++) { char ring_ref_name[]= "ring_refXX"; snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", ring_idx); error = xs_scanf(XST_NIL, otherend_path, ring_ref_name, NULL, "%" PRIu32, &xbb->ring_config.ring_ref[ring_idx]); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Failed to retriev grant " "reference for page %u of " "shared ring. Unable " "to connect.", ring_idx); return (error); } } } error = xs_gather(XST_NIL, otherend_path, "protocol", "%63s", protocol_abi, NULL); if (error != 0 || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { /* * Assume native if the frontend has not * published ABI data or it has published and * matches our own ABI. */ xbb->abi = BLKIF_PROTOCOL_NATIVE; } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { xbb->abi = BLKIF_PROTOCOL_X86_32; } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { xbb->abi = BLKIF_PROTOCOL_X86_64; } else { xenbus_dev_fatal(xbb->dev, EINVAL, "Unknown protocol ABI (%s) published by " "frontend. Unable to connect.", protocol_abi); return (EINVAL); } return (0); } /** * Allocate per-request data structures given request size and number * information negotiated with the front-end. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_alloc_requests(struct xbb_softc *xbb) { struct xbb_xen_req *req; struct xbb_xen_req *last_req; /* * Allocate request book keeping datastructures. */ xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); if (xbb->requests == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request structures"); return (ENOMEM); } req = xbb->requests; last_req = &xbb->requests[xbb->max_requests - 1]; STAILQ_INIT(&xbb->request_free_stailq); while (req <= last_req) { STAILQ_INSERT_TAIL(&xbb->request_free_stailq, req, links); req++; } return (0); } static int xbb_alloc_request_lists(struct xbb_softc *xbb) { struct xbb_xen_reqlist *reqlist; int i; /* * If no requests can be merged, we need 1 request list per * in flight request. 
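	 *
	 * The pool below is therefore sized at max_requests entries, the
	 * worst case in which nothing can be coalesced.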
*/ xbb->request_lists = malloc(xbb->max_requests * sizeof(*xbb->request_lists), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); if (xbb->request_lists == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request list structures"); return (ENOMEM); } STAILQ_INIT(&xbb->reqlist_free_stailq); STAILQ_INIT(&xbb->reqlist_pending_stailq); for (i = 0; i < xbb->max_requests; i++) { int seg; reqlist = &xbb->request_lists[i]; reqlist->xbb = xbb; #ifdef XBB_USE_BOUNCE_BUFFERS reqlist->bounce = malloc(xbb->max_reqlist_size, M_XENBLOCKBACK, M_NOWAIT); if (reqlist->bounce == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request " "bounce buffers"); return (ENOMEM); } #endif /* XBB_USE_BOUNCE_BUFFERS */ reqlist->gnt_handles = malloc(xbb->max_reqlist_segments * sizeof(*reqlist->gnt_handles), M_XENBLOCKBACK, M_NOWAIT|M_ZERO); if (reqlist->gnt_handles == NULL) { xenbus_dev_fatal(xbb->dev, ENOMEM, "Unable to allocate request " "grant references"); return (ENOMEM); } for (seg = 0; seg < xbb->max_reqlist_segments; seg++) reqlist->gnt_handles[seg] = GRANT_REF_INVALID; STAILQ_INSERT_TAIL(&xbb->reqlist_free_stailq, reqlist, links); } return (0); } /** * Supply information about the physical device to the frontend * via XenBus. * * \param xbb Per-instance xbb configuration structure. */ static int xbb_publish_backend_info(struct xbb_softc *xbb) { struct xs_transaction xst; const char *our_path; const char *leaf; int error; our_path = xenbus_get_node(xbb->dev); while (1) { error = xs_transaction_start(&xst); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Error publishing backend info " "(start transaction)"); return (error); } leaf = "sectors"; error = xs_printf(xst, our_path, leaf, "%"PRIu64, xbb->media_num_sectors); if (error != 0) break; /* XXX Support all VBD attributes here. */ leaf = "info"; error = xs_printf(xst, our_path, leaf, "%u", xbb->flags & XBBF_READ_ONLY ? VDISK_READONLY : 0); if (error != 0) break; leaf = "sector-size"; error = xs_printf(xst, our_path, leaf, "%u", xbb->sector_size); if (error != 0) break; error = xs_transaction_end(xst, 0); if (error == 0) { return (0); } else if (error != EAGAIN) { xenbus_dev_fatal(xbb->dev, error, "ending transaction"); return (error); } } xenbus_dev_fatal(xbb->dev, error, "writing %s/%s", our_path, leaf); xs_transaction_end(xst, 1); return (error); } /** * Connect to our blkfront peer now that it has completed publishing * its configuration into the XenStore. * * \param xbb Per-instance xbb configuration structure. */ static void xbb_connect(struct xbb_softc *xbb) { int error; if (!xbb->hotplug_done || (xenbus_get_state(xbb->dev) != XenbusStateInitWait) || (xbb_collect_frontend_info(xbb) != 0)) return; xbb->flags &= ~XBBF_SHUTDOWN; /* * We limit the maximum number of reqlist segments to the maximum * number of segments in the ring, or our absolute maximum, * whichever is smaller. */ xbb->max_reqlist_segments = MIN(xbb->max_request_segments * xbb->max_requests, XBB_MAX_SEGMENTS_PER_REQLIST); /* * The maximum size is simply a function of the number of segments * we can handle. */ xbb->max_reqlist_size = xbb->max_reqlist_segments * PAGE_SIZE; /* Allocate resources whose size depends on front-end configuration. */ error = xbb_alloc_communication_mem(xbb); if (error != 0) { xenbus_dev_fatal(xbb->dev, error, "Unable to allocate communication memory"); return; } error = xbb_alloc_requests(xbb); if (error != 0) { /* Specific errors are reported by xbb_alloc_requests(). 
*/ return; } error = xbb_alloc_request_lists(xbb); if (error != 0) { /* Specific errors are reported by xbb_alloc_request_lists(). */ return; } /* * Connect communication channel. */ error = xbb_connect_ring(xbb); if (error != 0) { /* Specific errors are reported by xbb_connect_ring(). */ return; } if (xbb_publish_backend_info(xbb) != 0) { /* * If we can't publish our data, we cannot participate * in this connection, and waiting for a front-end state * change will not help the situation. */ (void)xbb_disconnect(xbb); return; } /* Ready for I/O. */ xenbus_set_state(xbb->dev, XenbusStateConnected); } /*-------------------------- Device Teardown Support -------------------------*/ /** * Perform device shutdown functions. * * \param xbb Per-instance xbb configuration structure. * * Mark this instance as shutting down, wait for any active I/O on the * backend device/file to drain, disconnect from the front-end, and notify * any waiters (e.g. a thread invoking our detach method) that detach can * now proceed. */ static int xbb_shutdown(struct xbb_softc *xbb) { XenbusState frontState; int error; DPRINTF("\n"); /* * Due to the need to drop our mutex during some * xenbus operations, it is possible for two threads * to attempt to close out shutdown processing at * the same time. Tell the caller that hits this * race to try back later. */ if ((xbb->flags & XBBF_IN_SHUTDOWN) != 0) return (EAGAIN); xbb->flags |= XBBF_IN_SHUTDOWN; mtx_unlock(&xbb->lock); if (xbb->hotplug_watch.node != NULL) { xs_unregister_watch(&xbb->hotplug_watch); free(xbb->hotplug_watch.node, M_XENBLOCKBACK); xbb->hotplug_watch.node = NULL; } if (xenbus_get_state(xbb->dev) < XenbusStateClosing) xenbus_set_state(xbb->dev, XenbusStateClosing); frontState = xenbus_get_otherend_state(xbb->dev); mtx_lock(&xbb->lock); xbb->flags &= ~XBBF_IN_SHUTDOWN; /* Wait for the frontend to disconnect (if it's connected). */ if (frontState == XenbusStateConnected) return (EAGAIN); DPRINTF("\n"); /* Indicate shutdown is in progress. */ xbb->flags |= XBBF_SHUTDOWN; /* Disconnect from the front-end. */ error = xbb_disconnect(xbb); if (error != 0) { /* * Requests still outstanding. We'll be called again * once they complete. */ KASSERT(error == EAGAIN, ("%s: Unexpected xbb_disconnect() failure %d", __func__, error)); return (error); } DPRINTF("\n"); /* Indicate to xbb_detach() that is it safe to proceed. */ wakeup(xbb); return (0); } /** * Report an attach time error to the console and Xen, and cleanup * this instance by forcing immediate detach processing. * * \param xbb Per-instance xbb configuration structure. * \param err Errno describing the error. * \param fmt Printf style format and arguments */ static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) { va_list ap; va_list ap_hotplug; va_start(ap, fmt); va_copy(ap_hotplug, ap); xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), "hotplug-error", fmt, ap_hotplug); va_end(ap_hotplug); xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "hotplug-status", "error"); xenbus_dev_vfatal(xbb->dev, err, fmt, ap); va_end(ap); xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "online", "0"); mtx_lock(&xbb->lock); xbb_shutdown(xbb); mtx_unlock(&xbb->lock); } /*---------------------------- NewBus Entrypoints ----------------------------*/ /** * Inspect a XenBus device and claim it if is of the appropriate type. * * \param dev NewBus device object representing a candidate XenBus device. * * \return 0 for success, errno codes for failure. 
*/ static int xbb_probe(device_t dev) { if (!strcmp(xenbus_get_type(dev), "vbd")) { device_set_desc(dev, "Backend Virtual Block Device"); device_quiet(dev); return (0); } return (ENXIO); } /** * Setup sysctl variables to control various Block Back parameters. * * \param xbb Xen Block Back softc. * */ static void xbb_setup_sysctl(struct xbb_softc *xbb) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; sysctl_ctx = device_get_sysctl_ctx(xbb->dev); if (sysctl_ctx == NULL) return; sysctl_tree = device_get_sysctl_tree(xbb->dev); if (sysctl_tree == NULL) return; SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "disable_flush", CTLFLAG_RW, &xbb->disable_flush, 0, "fake the flush command"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "flush_interval", CTLFLAG_RW, &xbb->flush_interval, 0, "send a real flush for N flush requests"); SYSCTL_ADD_INT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "no_coalesce_reqs", CTLFLAG_RW, &xbb->no_coalesce_reqs,0, "Don't coalesce contiguous requests"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_received", CTLFLAG_RW, &xbb->reqs_received, "how many I/O requests we have received"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_completed", CTLFLAG_RW, &xbb->reqs_completed, "how many I/O requests have been completed"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_queued_for_completion", CTLFLAG_RW, &xbb->reqs_queued_for_completion, "how many I/O requests queued but not yet pushed"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "reqs_completed_with_error", CTLFLAG_RW, &xbb->reqs_completed_with_error, "how many I/O requests completed with error status"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "forced_dispatch", CTLFLAG_RW, &xbb->forced_dispatch, "how many I/O dispatches were forced"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "normal_dispatch", CTLFLAG_RW, &xbb->normal_dispatch, "how many I/O dispatches were normal"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "total_dispatch", CTLFLAG_RW, &xbb->total_dispatch, "total number of I/O dispatches"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "kva_shortages", CTLFLAG_RW, &xbb->kva_shortages, "how many times we have run out of KVA"); SYSCTL_ADD_UQUAD(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "request_shortages", CTLFLAG_RW, &xbb->request_shortages, "how many times we have run out of requests"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_requests", CTLFLAG_RD, &xbb->max_requests, 0, "maximum outstanding requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_request_segments", CTLFLAG_RD, &xbb->max_request_segments, 0, "maximum number of pages per requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "max_request_size", CTLFLAG_RD, &xbb->max_request_size, 0, "maximum size in bytes of a request (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "ring_pages", CTLFLAG_RD, &xbb->ring_config.ring_pages, 0, "communication channel pages (negotiated)"); } static void xbb_attach_disk(device_t dev) { struct xbb_softc *xbb; int error; xbb = device_get_softc(dev); KASSERT(xbb->hotplug_done, ("Missing hotplug execution")); /* Parse fopen style mode flags. 
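	 *
	 * A mode string without a 'w' (e.g. "r") marks this instance
	 * read-only via XBBF_READ_ONLY, while "w" or "rw" leaves it
	 * writable.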
*/ if (strchr(xbb->dev_mode, 'w') == NULL) xbb->flags |= XBBF_READ_ONLY; /* * Verify the physical device is present and can support * the desired I/O mode. */ error = xbb_open_backend(xbb); if (error != 0) { xbb_attach_failed(xbb, error, "Unable to open %s", xbb->dev_name); return; } /* Use devstat(9) for recording statistics. */ xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), xbb->sector_size, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_OTHER); xbb->xbb_stats_in = devstat_new_entry("xbbi", device_get_unit(xbb->dev), xbb->sector_size, DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER, DEVSTAT_PRIORITY_OTHER); /* * Setup sysctl variables. */ xbb_setup_sysctl(xbb); /* * Create a taskqueue for doing work that must occur from a * thread context. */ xbb->io_taskqueue = taskqueue_create_fast(device_get_nameunit(dev), M_NOWAIT, taskqueue_thread_enqueue, /*contxt*/&xbb->io_taskqueue); if (xbb->io_taskqueue == NULL) { xbb_attach_failed(xbb, error, "Unable to create taskqueue"); return; } taskqueue_start_threads(&xbb->io_taskqueue, /*num threads*/1, /*priority*/PWAIT, /*thread name*/ "%s taskq", device_get_nameunit(dev)); /* Update hot-plug status to satisfy xend. */ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "hotplug-status", "connected"); if (error) { xbb_attach_failed(xbb, error, "writing %s/hotplug-status", xenbus_get_node(xbb->dev)); return; } /* The front end might be waiting for the backend, attach if so. */ if (xenbus_get_otherend_state(xbb->dev) == XenbusStateInitialised) xbb_connect(xbb); } static void xbb_attach_cb(struct xs_watch *watch, const char **vec, unsigned int len) { device_t dev; struct xbb_softc *xbb; int error; dev = (device_t)watch->callback_data; xbb = device_get_softc(dev); error = xs_gather(XST_NIL, xenbus_get_node(dev), "physical-device-path", NULL, &xbb->dev_name, NULL); if (error != 0) return; xs_unregister_watch(watch); free(watch->node, M_XENBLOCKBACK); watch->node = NULL; xbb->hotplug_done = true; /* Collect physical device information. */ error = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "device-type", NULL, &xbb->dev_type, NULL); if (error != 0) xbb->dev_type = NULL; error = xs_gather(XST_NIL, xenbus_get_node(dev), "mode", NULL, &xbb->dev_mode, NULL); if (error != 0) { xbb_attach_failed(xbb, error, "reading backend fields at %s", xenbus_get_node(dev)); return; } xbb_attach_disk(dev); } /** * Attach to a XenBus device that has been claimed by our probe routine. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. */ static int xbb_attach(device_t dev) { struct xbb_softc *xbb; int error; u_int max_ring_page_order; struct sbuf *watch_path; DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); /* * Basic initialization. * After this block it is safe to call xbb_detach() * to clean up any allocated data for this instance. */ xbb = device_get_softc(dev); xbb->dev = dev; xbb->otherend_id = xenbus_get_otherend_id(dev); TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); /* * Publish protocol capabilities for consumption by the * front-end. 
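	 *
	 * At present this covers feature-barrier, feature-flush-cache and
	 * the maximum ring page order derived from XBB_MAX_RING_PAGES,
	 * all written before the device advances to XenbusStateInitWait.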
*/ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "feature-barrier", "1"); if (error) { xbb_attach_failed(xbb, error, "writing %s/feature-barrier", xenbus_get_node(xbb->dev)); return (error); } error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "feature-flush-cache", "1"); if (error) { xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", xenbus_get_node(xbb->dev)); return (error); } max_ring_page_order = flsl(XBB_MAX_RING_PAGES) - 1; error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), "max-ring-page-order", "%u", max_ring_page_order); if (error) { xbb_attach_failed(xbb, error, "writing %s/max-ring-page-order", xenbus_get_node(xbb->dev)); return (error); } /* Tell the toolstack blkback has attached. */ xenbus_set_state(dev, XenbusStateInitWait); if (xbb->hotplug_done) { xbb_attach_disk(dev); return (0); } /* * We need to wait for hotplug script execution before * moving forward. */ watch_path = xs_join(xenbus_get_node(xbb->dev), "physical-device-path"); xbb->hotplug_watch.callback_data = (uintptr_t)dev; xbb->hotplug_watch.callback = xbb_attach_cb; KASSERT(xbb->hotplug_watch.node == NULL, ("watch node already setup")); xbb->hotplug_watch.node = strdup(sbuf_data(watch_path), M_XENBLOCKBACK); /* * We don't care about the path updated, just about the value changes * on that single node, hence there's no need to queue more that one * event. */ xbb->hotplug_watch.max_pending = 1; sbuf_delete(watch_path); error = xs_register_watch(&xbb->hotplug_watch); if (error != 0) { xbb_attach_failed(xbb, error, "failed to create watch on %s", xbb->hotplug_watch.node); free(xbb->hotplug_watch.node, M_XENBLOCKBACK); return (error); } return (0); } /** * Detach from a block back device instance. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. * * \note A block back device may be detached at any time in its life-cycle, * including part way through the attach process. For this reason, * initialization order and the initialization state checks in this * routine must be carefully coupled so that attach time failures * are gracefully handled. */ static int xbb_detach(device_t dev) { struct xbb_softc *xbb; DPRINTF("\n"); xbb = device_get_softc(dev); mtx_lock(&xbb->lock); while (xbb_shutdown(xbb) == EAGAIN) { msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0, "xbb_shutdown", 0); } mtx_unlock(&xbb->lock); DPRINTF("\n"); if (xbb->io_taskqueue != NULL) taskqueue_free(xbb->io_taskqueue); if (xbb->xbb_stats != NULL) devstat_remove_entry(xbb->xbb_stats); if (xbb->xbb_stats_in != NULL) devstat_remove_entry(xbb->xbb_stats_in); xbb_close_backend(xbb); if (xbb->dev_mode != NULL) { free(xbb->dev_mode, M_XENSTORE); xbb->dev_mode = NULL; } if (xbb->dev_type != NULL) { free(xbb->dev_type, M_XENSTORE); xbb->dev_type = NULL; } if (xbb->dev_name != NULL) { free(xbb->dev_name, M_XENSTORE); xbb->dev_name = NULL; } mtx_destroy(&xbb->lock); return (0); } /** * Prepare this block back device for suspension of this VM. * * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. */ static int xbb_suspend(device_t dev) { #ifdef NOT_YET struct xbb_softc *sc = device_get_softc(dev); /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xb_io_lock); sc->connected = BLKIF_STATE_SUSPENDED; mtx_unlock(&sc->xb_io_lock); #endif return (0); } /** * Perform any processing required to recover from a suspended state. 
* * \param dev NewBus device object representing this Xen Block Back instance. * * \return 0 for success, errno codes for failure. */ static int xbb_resume(device_t dev) { return (0); } /** * Handle state changes expressed via the XenStore by our front-end peer. * * \param dev NewBus device object representing this Xen * Block Back instance. * \param frontend_state The new state of the front-end. * * \return 0 for success, errno codes for failure. */ static void xbb_frontend_changed(device_t dev, XenbusState frontend_state) { struct xbb_softc *xbb = device_get_softc(dev); DPRINTF("frontend_state=%s, xbb_state=%s\n", xenbus_strstate(frontend_state), xenbus_strstate(xenbus_get_state(xbb->dev))); switch (frontend_state) { case XenbusStateInitialising: break; case XenbusStateInitialised: case XenbusStateConnected: xbb_connect(xbb); break; case XenbusStateClosing: case XenbusStateClosed: mtx_lock(&xbb->lock); xbb_shutdown(xbb); mtx_unlock(&xbb->lock); if (frontend_state == XenbusStateClosed) xenbus_set_state(xbb->dev, XenbusStateClosed); break; default: xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend", frontend_state); break; } } /*---------------------------- NewBus Registration ---------------------------*/ static device_method_t xbb_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xbb_probe), DEVMETHOD(device_attach, xbb_attach), DEVMETHOD(device_detach, xbb_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xbb_suspend), DEVMETHOD(device_resume, xbb_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed), { 0, 0 } }; static driver_t xbb_driver = { "xbbd", xbb_methods, sizeof(struct xbb_softc), }; devclass_t xbb_devclass; DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0); diff --git a/sys/dev/xen/blkfront/blkfront.c b/sys/dev/xen/blkfront/blkfront.c index 72498620764b..3219c26222aa 100644 --- a/sys/dev/xen/blkfront/blkfront.c +++ b/sys/dev/xen/blkfront/blkfront.c @@ -1,1651 +1,1651 @@ /* * XenBSD block device driver * * Copyright (c) 2010-2013 Spectra Logic Corporation * Copyright (c) 2009 Scott Long, Yahoo! * Copyright (c) 2009 Frank Suchomel, Citrix * Copyright (c) 2009 Doug F. Rabson, Citrix * Copyright (c) 2005 Kip Macy * Copyright (c) 2003-2004, Keir Fraser & Steve Hand * Modifications by Mark A. Williamson are (c) Intel Research Cambridge * * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include -#include +#include +#include #include #include #include #include #include "xenbus_if.h" /*--------------------------- Forward Declarations ---------------------------*/ static void xbd_closing(device_t); static void xbd_startio(struct xbd_softc *sc); /*---------------------------------- Macros ----------------------------------*/ #if 0 #define DPRINTK(fmt, args...) printf("[XEN] %s:%d: " fmt ".\n", __func__, __LINE__, ##args) #else #define DPRINTK(fmt, args...) #endif #define XBD_SECTOR_SHFT 9 /*---------------------------- Global Static Data ----------------------------*/ static MALLOC_DEFINE(M_XENBLOCKFRONT, "xbd", "Xen Block Front driver data"); static int xbd_enable_indirect = 1; SYSCTL_NODE(_hw, OID_AUTO, xbd, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "xbd driver parameters"); SYSCTL_INT(_hw_xbd, OID_AUTO, xbd_enable_indirect, CTLFLAG_RDTUN, &xbd_enable_indirect, 0, "Enable xbd indirect segments"); /*---------------------------- Command Processing ----------------------------*/ static void xbd_freeze(struct xbd_softc *sc, xbd_flag_t xbd_flag) { if (xbd_flag != XBDF_NONE && (sc->xbd_flags & xbd_flag) != 0) return; sc->xbd_flags |= xbd_flag; sc->xbd_qfrozen_cnt++; } static void xbd_thaw(struct xbd_softc *sc, xbd_flag_t xbd_flag) { if (xbd_flag != XBDF_NONE && (sc->xbd_flags & xbd_flag) == 0) return; if (sc->xbd_qfrozen_cnt == 0) panic("%s: Thaw with flag 0x%x while not frozen.", __func__, xbd_flag); sc->xbd_flags &= ~xbd_flag; sc->xbd_qfrozen_cnt--; } static void xbd_cm_freeze(struct xbd_softc *sc, struct xbd_command *cm, xbdc_flag_t cm_flag) { if ((cm->cm_flags & XBDCF_FROZEN) != 0) return; cm->cm_flags |= XBDCF_FROZEN|cm_flag; xbd_freeze(sc, XBDF_NONE); } static void xbd_cm_thaw(struct xbd_softc *sc, struct xbd_command *cm) { if ((cm->cm_flags & XBDCF_FROZEN) == 0) return; cm->cm_flags &= ~XBDCF_FROZEN; xbd_thaw(sc, XBDF_NONE); } static inline void xbd_flush_requests(struct xbd_softc *sc) { int notify; RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->xbd_ring, notify); if (notify) xen_intr_signal(sc->xen_intr_handle); } static void xbd_free_command(struct xbd_command *cm) { KASSERT((cm->cm_flags & XBDCF_Q_MASK) == XBD_Q_NONE, ("Freeing command that is still on queue %d.", cm->cm_flags & XBDCF_Q_MASK)); cm->cm_flags = XBDCF_INITIALIZER; cm->cm_bp = NULL; cm->cm_complete = NULL; xbd_enqueue_cm(cm, XBD_Q_FREE); xbd_thaw(cm->cm_sc, XBDF_CM_SHORTAGE); } static void xbd_mksegarray(bus_dma_segment_t *segs, int nsegs, grant_ref_t * gref_head, int otherend_id, int readonly, grant_ref_t * sg_ref, struct blkif_request_segment *sg) { struct blkif_request_segment *last_block_sg = sg + nsegs; vm_paddr_t buffer_ma; uint64_t fsect, lsect; int ref; while (sg < last_block_sg) { KASSERT(segs->ds_addr % (1 << XBD_SECTOR_SHFT) == 0, ("XEN disk driver I/O must be sector aligned")); KASSERT(segs->ds_len % (1 << XBD_SECTOR_SHFT) == 0, ("XEN disk driver I/Os must be a multiple of " "the sector length")); buffer_ma = segs->ds_addr; fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT; lsect = fsect + (segs->ds_len >> XBD_SECTOR_SHFT) - 1; KASSERT(lsect <= 7, ("XEN disk driver data cannot " "cross a page boundary")); /* install a grant reference. */ ref = gnttab_claim_grant_reference(gref_head); /* * GNTTAB_LIST_END == 0xffffffff, but it is private * to gnttab.c. 
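		 *
		 * Hence the comparison against ~0 below instead of the
		 * symbolic constant.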
*/ KASSERT(ref != ~0, ("grant_reference failed")); gnttab_grant_foreign_access_ref( ref, otherend_id, buffer_ma >> PAGE_SHIFT, readonly); *sg_ref = ref; *sg = (struct blkif_request_segment) { .gref = ref, .first_sect = fsect, .last_sect = lsect }; sg++; sg_ref++; segs++; } } static void xbd_queue_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { struct xbd_softc *sc; struct xbd_command *cm; int op; cm = arg; sc = cm->cm_sc; if (error) { cm->cm_bp->bio_error = EIO; biodone(cm->cm_bp); xbd_free_command(cm); return; } KASSERT(nsegs <= sc->xbd_max_request_segments, ("Too many segments in a blkfront I/O")); if (nsegs <= BLKIF_MAX_SEGMENTS_PER_REQUEST) { blkif_request_t *ring_req; /* Fill out a blkif_request_t structure. */ ring_req = (blkif_request_t *) RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); sc->xbd_ring.req_prod_pvt++; ring_req->id = cm->cm_id; ring_req->operation = cm->cm_operation; ring_req->sector_number = cm->cm_sector_number; ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; ring_req->nr_segments = nsegs; cm->cm_nseg = nsegs; xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, xenbus_get_otherend_id(sc->xbd_dev), cm->cm_operation == BLKIF_OP_WRITE, cm->cm_sg_refs, ring_req->seg); } else { blkif_request_indirect_t *ring_req; /* Fill out a blkif_request_indirect_t structure. */ ring_req = (blkif_request_indirect_t *) RING_GET_REQUEST(&sc->xbd_ring, sc->xbd_ring.req_prod_pvt); sc->xbd_ring.req_prod_pvt++; ring_req->id = cm->cm_id; ring_req->operation = BLKIF_OP_INDIRECT; ring_req->indirect_op = cm->cm_operation; ring_req->sector_number = cm->cm_sector_number; ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xbd_disk; ring_req->nr_segments = nsegs; cm->cm_nseg = nsegs; xbd_mksegarray(segs, nsegs, &cm->cm_gref_head, xenbus_get_otherend_id(sc->xbd_dev), cm->cm_operation == BLKIF_OP_WRITE, cm->cm_sg_refs, cm->cm_indirectionpages); memcpy(ring_req->indirect_grefs, &cm->cm_indirectionrefs, sizeof(grant_ref_t) * sc->xbd_max_request_indirectpages); } if (cm->cm_operation == BLKIF_OP_READ) op = BUS_DMASYNC_PREREAD; else if (cm->cm_operation == BLKIF_OP_WRITE) op = BUS_DMASYNC_PREWRITE; else op = 0; bus_dmamap_sync(sc->xbd_io_dmat, cm->cm_map, op); gnttab_free_grant_references(cm->cm_gref_head); xbd_enqueue_cm(cm, XBD_Q_BUSY); /* * If bus dma had to asynchronously call us back to dispatch * this command, we are no longer executing in the context of * xbd_startio(). Thus we cannot rely on xbd_startio()'s call to * xbd_flush_requests() to publish this command to the backend * along with any other commands that it could batch. */ if ((cm->cm_flags & XBDCF_ASYNC_MAPPING) != 0) xbd_flush_requests(sc); return; } static int xbd_queue_request(struct xbd_softc *sc, struct xbd_command *cm) { int error; if (cm->cm_bp != NULL) error = bus_dmamap_load_bio(sc->xbd_io_dmat, cm->cm_map, cm->cm_bp, xbd_queue_cb, cm, 0); else error = bus_dmamap_load(sc->xbd_io_dmat, cm->cm_map, cm->cm_data, cm->cm_datalen, xbd_queue_cb, cm, 0); if (error == EINPROGRESS) { /* * Maintain queuing order by freezing the queue. The next * command may not require as many resources as the command * we just attempted to map, so we can't rely on bus dma * blocking for it too. 
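		 *
		 * The freeze taken below is released from the completion
		 * path, where xbd_int() calls xbd_cm_thaw() once this
		 * command finishes.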
*/ xbd_cm_freeze(sc, cm, XBDCF_ASYNC_MAPPING); return (0); } return (error); } static void xbd_restart_queue_callback(void *arg) { struct xbd_softc *sc = arg; mtx_lock(&sc->xbd_io_lock); xbd_thaw(sc, XBDF_GNT_SHORTAGE); xbd_startio(sc); mtx_unlock(&sc->xbd_io_lock); } static struct xbd_command * xbd_bio_command(struct xbd_softc *sc) { struct xbd_command *cm; struct bio *bp; if (__predict_false(sc->xbd_state != XBD_STATE_CONNECTED)) return (NULL); bp = xbd_dequeue_bio(sc); if (bp == NULL) return (NULL); if ((cm = xbd_dequeue_cm(sc, XBD_Q_FREE)) == NULL) { xbd_freeze(sc, XBDF_CM_SHORTAGE); xbd_requeue_bio(sc, bp); return (NULL); } if (gnttab_alloc_grant_references(sc->xbd_max_request_segments, &cm->cm_gref_head) != 0) { gnttab_request_free_callback(&sc->xbd_callback, xbd_restart_queue_callback, sc, sc->xbd_max_request_segments); xbd_freeze(sc, XBDF_GNT_SHORTAGE); xbd_requeue_bio(sc, bp); xbd_enqueue_cm(cm, XBD_Q_FREE); return (NULL); } cm->cm_bp = bp; cm->cm_sector_number = (blkif_sector_t)bp->bio_pblkno; switch (bp->bio_cmd) { case BIO_READ: cm->cm_operation = BLKIF_OP_READ; break; case BIO_WRITE: cm->cm_operation = BLKIF_OP_WRITE; if ((bp->bio_flags & BIO_ORDERED) != 0) { if ((sc->xbd_flags & XBDF_BARRIER) != 0) { cm->cm_operation = BLKIF_OP_WRITE_BARRIER; } else { /* * Single step this command. */ cm->cm_flags |= XBDCF_Q_FREEZE; if (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { /* * Wait for in-flight requests to * finish. */ xbd_freeze(sc, XBDF_WAIT_IDLE); xbd_requeue_cm(cm, XBD_Q_READY); return (NULL); } } } break; case BIO_FLUSH: if ((sc->xbd_flags & XBDF_FLUSH) != 0) cm->cm_operation = BLKIF_OP_FLUSH_DISKCACHE; else if ((sc->xbd_flags & XBDF_BARRIER) != 0) cm->cm_operation = BLKIF_OP_WRITE_BARRIER; else panic("flush request, but no flush support available"); break; default: biofinish(bp, NULL, EOPNOTSUPP); xbd_enqueue_cm(cm, XBD_Q_FREE); return (NULL); } return (cm); } /* * Dequeue buffers and place them in the shared communication ring. * Return when no more requests can be accepted or all buffers have * been queued. * * Signal XEN once the ring has been filled out. */ static void xbd_startio(struct xbd_softc *sc) { struct xbd_command *cm; int error, queued = 0; mtx_assert(&sc->xbd_io_lock, MA_OWNED); if (sc->xbd_state != XBD_STATE_CONNECTED) return; while (!RING_FULL(&sc->xbd_ring)) { if (sc->xbd_qfrozen_cnt != 0) break; cm = xbd_dequeue_cm(sc, XBD_Q_READY); if (cm == NULL) cm = xbd_bio_command(sc); if (cm == NULL) break; if ((cm->cm_flags & XBDCF_Q_FREEZE) != 0) { /* * Single step command. Future work is * held off until this command completes. */ xbd_cm_freeze(sc, cm, XBDCF_Q_FREEZE); } if ((error = xbd_queue_request(sc, cm)) != 0) { printf("xbd_queue_request returned %d\n", error); break; } queued++; } if (queued != 0) xbd_flush_requests(sc); } static void xbd_bio_complete(struct xbd_softc *sc, struct xbd_command *cm) { struct bio *bp; bp = cm->cm_bp; if (__predict_false(cm->cm_status != BLKIF_RSP_OKAY)) { disk_err(bp, "disk error" , -1, 0); printf(" status: %x\n", cm->cm_status); bp->bio_flags |= BIO_ERROR; } if (bp->bio_flags & BIO_ERROR) bp->bio_error = EIO; else bp->bio_resid = 0; xbd_free_command(cm); biodone(bp); } static void xbd_int(void *xsc) { struct xbd_softc *sc = xsc; struct xbd_command *cm; blkif_response_t *bret; RING_IDX i, rp; int op; mtx_lock(&sc->xbd_io_lock); if (__predict_false(sc->xbd_state == XBD_STATE_DISCONNECTED)) { mtx_unlock(&sc->xbd_io_lock); return; } again: rp = sc->xbd_ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. 
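				 *
				 * The read barrier pairs with the write barrier
				 * the backend issues through the shared ring
				 * macros before advancing rsp_prod, so the
				 * response slots read below are at least as
				 * current as the index sampled above.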
*/ for (i = sc->xbd_ring.rsp_cons; i != rp;) { bret = RING_GET_RESPONSE(&sc->xbd_ring, i); cm = &sc->xbd_shadow[bret->id]; xbd_remove_cm(cm, XBD_Q_BUSY); gnttab_end_foreign_access_references(cm->cm_nseg, cm->cm_sg_refs); i++; if (cm->cm_operation == BLKIF_OP_READ) op = BUS_DMASYNC_POSTREAD; else if (cm->cm_operation == BLKIF_OP_WRITE || cm->cm_operation == BLKIF_OP_WRITE_BARRIER) op = BUS_DMASYNC_POSTWRITE; else op = 0; bus_dmamap_sync(sc->xbd_io_dmat, cm->cm_map, op); bus_dmamap_unload(sc->xbd_io_dmat, cm->cm_map); /* * Release any hold this command has on future command * dispatch. */ xbd_cm_thaw(sc, cm); /* * Directly call the i/o complete routine to save an * an indirection in the common case. */ cm->cm_status = bret->status; if (cm->cm_bp) xbd_bio_complete(sc, cm); else if (cm->cm_complete != NULL) cm->cm_complete(cm); else xbd_free_command(cm); } sc->xbd_ring.rsp_cons = i; if (i != sc->xbd_ring.req_prod_pvt) { int more_to_do; RING_FINAL_CHECK_FOR_RESPONSES(&sc->xbd_ring, more_to_do); if (more_to_do) goto again; } else { sc->xbd_ring.sring->rsp_event = i + 1; } if (xbd_queue_length(sc, XBD_Q_BUSY) == 0) xbd_thaw(sc, XBDF_WAIT_IDLE); xbd_startio(sc); if (__predict_false(sc->xbd_state == XBD_STATE_SUSPENDED)) wakeup(&sc->xbd_cm_q[XBD_Q_BUSY]); mtx_unlock(&sc->xbd_io_lock); } /*------------------------------- Dump Support -------------------------------*/ /** * Quiesce the disk writes for a dump file before allowing the next buffer. */ static void xbd_quiesce(struct xbd_softc *sc) { int mtd; // While there are outstanding requests while (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { RING_FINAL_CHECK_FOR_RESPONSES(&sc->xbd_ring, mtd); if (mtd) { /* Received request completions, update queue. */ xbd_int(sc); } if (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { /* * Still pending requests, wait for the disk i/o * to complete. */ HYPERVISOR_yield(); } } } /* Kernel dump function for a paravirtualized disk device */ static void xbd_dump_complete(struct xbd_command *cm) { xbd_enqueue_cm(cm, XBD_Q_COMPLETE); } static int xbd_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset, size_t length) { struct disk *dp = arg; struct xbd_softc *sc = dp->d_drv1; struct xbd_command *cm; size_t chunk; int sbp; int rc = 0; if (length == 0) return (0); xbd_quiesce(sc); /* All quiet on the western front. */ /* * If this lock is held, then this module is failing, and a * successful kernel dump is highly unlikely anyway. */ mtx_lock(&sc->xbd_io_lock); /* Split the 64KB block as needed */ for (sbp=0; length > 0; sbp++) { cm = xbd_dequeue_cm(sc, XBD_Q_FREE); if (cm == NULL) { mtx_unlock(&sc->xbd_io_lock); device_printf(sc->xbd_dev, "dump: no more commands?\n"); return (EBUSY); } if (gnttab_alloc_grant_references(sc->xbd_max_request_segments, &cm->cm_gref_head) != 0) { xbd_free_command(cm); mtx_unlock(&sc->xbd_io_lock); device_printf(sc->xbd_dev, "no more grant allocs?\n"); return (EBUSY); } chunk = length > sc->xbd_max_request_size ? sc->xbd_max_request_size : length; cm->cm_data = virtual; cm->cm_datalen = chunk; cm->cm_operation = BLKIF_OP_WRITE; cm->cm_sector_number = offset / dp->d_sectorsize; cm->cm_complete = xbd_dump_complete; xbd_enqueue_cm(cm, XBD_Q_READY); length -= chunk; offset += chunk; virtual = (char *) virtual + chunk; } /* Tell DOM0 to do the I/O */ xbd_startio(sc); mtx_unlock(&sc->xbd_io_lock); /* Poll for the completion. */ xbd_quiesce(sc); /* All quite on the eastern front */ /* If there were any errors, bail out... 
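/*
 * The response loop above follows the standard Xen front-end consumer
 * pattern, shown here in skeleton form (handle() and the local names are
 * placeholders): snapshot rsp_prod, issue a read barrier, consume up to
 * the snapshot, then let RING_FINAL_CHECK_FOR_RESPONSES() re-arm the
 * response event index and close the race with responses queued after
 * the snapshot was taken.
 *
 *	again:
 *		rp = ring->sring->rsp_prod;
 *		rmb();
 *		for (i = ring->rsp_cons; i != rp; i++)
 *			handle(RING_GET_RESPONSE(ring, i));
 *		ring->rsp_cons = i;
 *		RING_FINAL_CHECK_FOR_RESPONSES(ring, more_to_do);
 *		if (more_to_do)
 *			goto again;
 */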
*/ while ((cm = xbd_dequeue_cm(sc, XBD_Q_COMPLETE)) != NULL) { if (cm->cm_status != BLKIF_RSP_OKAY) { device_printf(sc->xbd_dev, "Dump I/O failed at sector %jd\n", cm->cm_sector_number); rc = EIO; } xbd_free_command(cm); } return (rc); } /*----------------------------- Disk Entrypoints -----------------------------*/ static int xbd_open(struct disk *dp) { struct xbd_softc *sc = dp->d_drv1; if (sc == NULL) { printf("xbd%d: not found", dp->d_unit); return (ENXIO); } sc->xbd_flags |= XBDF_OPEN; sc->xbd_users++; return (0); } static int xbd_close(struct disk *dp) { struct xbd_softc *sc = dp->d_drv1; if (sc == NULL) return (ENXIO); sc->xbd_flags &= ~XBDF_OPEN; if (--(sc->xbd_users) == 0) { /* * Check whether we have been instructed to close. We will * have ignored this request initially, as the device was * still mounted. */ if (xenbus_get_otherend_state(sc->xbd_dev) == XenbusStateClosing) xbd_closing(sc->xbd_dev); } return (0); } static int xbd_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) { struct xbd_softc *sc = dp->d_drv1; if (sc == NULL) return (ENXIO); return (ENOTTY); } /* * Read/write routine for a buffer. Finds the proper unit, place it on * the sortq and kick the controller. */ static void xbd_strategy(struct bio *bp) { struct xbd_softc *sc = bp->bio_disk->d_drv1; /* bogus disk? */ if (sc == NULL) { bp->bio_error = EINVAL; bp->bio_flags |= BIO_ERROR; bp->bio_resid = bp->bio_bcount; biodone(bp); return; } /* * Place it in the queue of disk activities for this disk */ mtx_lock(&sc->xbd_io_lock); xbd_enqueue_bio(sc, bp); xbd_startio(sc); mtx_unlock(&sc->xbd_io_lock); return; } /*------------------------------ Ring Management -----------------------------*/ static int xbd_alloc_ring(struct xbd_softc *sc) { blkif_sring_t *sring; uintptr_t sring_page_addr; int error; int i; sring = malloc(sc->xbd_ring_pages * PAGE_SIZE, M_XENBLOCKFRONT, M_NOWAIT|M_ZERO); if (sring == NULL) { xenbus_dev_fatal(sc->xbd_dev, ENOMEM, "allocating shared ring"); return (ENOMEM); } SHARED_RING_INIT(sring); FRONT_RING_INIT(&sc->xbd_ring, sring, sc->xbd_ring_pages * PAGE_SIZE); for (i = 0, sring_page_addr = (uintptr_t)sring; i < sc->xbd_ring_pages; i++, sring_page_addr += PAGE_SIZE) { error = xenbus_grant_ring(sc->xbd_dev, (vtophys(sring_page_addr) >> PAGE_SHIFT), &sc->xbd_ring_ref[i]); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "granting ring_ref(%d)", i); return (error); } } if (sc->xbd_ring_pages == 1) { error = xs_printf(XST_NIL, xenbus_get_node(sc->xbd_dev), "ring-ref", "%u", sc->xbd_ring_ref[0]); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/ring-ref", xenbus_get_node(sc->xbd_dev)); return (error); } } else { for (i = 0; i < sc->xbd_ring_pages; i++) { char ring_ref_name[]= "ring_refXX"; snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i); error = xs_printf(XST_NIL, xenbus_get_node(sc->xbd_dev), ring_ref_name, "%u", sc->xbd_ring_ref[i]); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/%s", xenbus_get_node(sc->xbd_dev), ring_ref_name); return (error); } } } error = xen_intr_alloc_and_bind_local_port(sc->xbd_dev, xenbus_get_otherend_id(sc->xbd_dev), NULL, xbd_int, sc, INTR_TYPE_BIO | INTR_MPSAFE, &sc->xen_intr_handle); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "xen_intr_alloc_and_bind_local_port failed"); return (error); } return (0); } static void xbd_free_ring(struct xbd_softc *sc) { int i; if (sc->xbd_ring.sring == NULL) return; for (i = 0; i < sc->xbd_ring_pages; i++) { if (sc->xbd_ring_ref[i] != GRANT_REF_INVALID) { 
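/*
 * Worked example of the xenstore keys published by xbd_alloc_ring()
 * above: a single-page ring writes one "ring-ref" entry, whereas a
 * negotiated 4-page ring writes one grant reference per page under
 * "ring-ref0" .. "ring-ref3" (the names produced by the snprintf()
 * loop).  The matching "event-channel" and "protocol" keys are written
 * later, in xbd_initialize().
 */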
gnttab_end_foreign_access_ref(sc->xbd_ring_ref[i]); sc->xbd_ring_ref[i] = GRANT_REF_INVALID; } } free(sc->xbd_ring.sring, M_XENBLOCKFRONT); sc->xbd_ring.sring = NULL; } /*-------------------------- Initialization/Teardown -------------------------*/ static int xbd_feature_string(struct xbd_softc *sc, char *features, size_t len) { struct sbuf sb; int feature_cnt; sbuf_new(&sb, features, len, SBUF_FIXEDLEN); feature_cnt = 0; if ((sc->xbd_flags & XBDF_FLUSH) != 0) { sbuf_printf(&sb, "flush"); feature_cnt++; } if ((sc->xbd_flags & XBDF_BARRIER) != 0) { if (feature_cnt != 0) sbuf_printf(&sb, ", "); sbuf_printf(&sb, "write_barrier"); feature_cnt++; } if ((sc->xbd_flags & XBDF_DISCARD) != 0) { if (feature_cnt != 0) sbuf_printf(&sb, ", "); sbuf_printf(&sb, "discard"); feature_cnt++; } if ((sc->xbd_flags & XBDF_PERSISTENT) != 0) { if (feature_cnt != 0) sbuf_printf(&sb, ", "); sbuf_printf(&sb, "persistent_grants"); feature_cnt++; } (void) sbuf_finish(&sb); return (sbuf_len(&sb)); } static int xbd_sysctl_features(SYSCTL_HANDLER_ARGS) { char features[80]; struct xbd_softc *sc = arg1; int error; int len; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); len = xbd_feature_string(sc, features, sizeof(features)); /* len is -1 on error, which will make the SYSCTL_OUT a no-op. */ return (SYSCTL_OUT(req, features, len + 1/*NUL*/)); } static void xbd_setup_sysctl(struct xbd_softc *xbd) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; struct sysctl_oid_list *children; sysctl_ctx = device_get_sysctl_ctx(xbd->xbd_dev); if (sysctl_ctx == NULL) return; sysctl_tree = device_get_sysctl_tree(xbd->xbd_dev); if (sysctl_tree == NULL) return; children = SYSCTL_CHILDREN(sysctl_tree); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "max_requests", CTLFLAG_RD, &xbd->xbd_max_requests, -1, "maximum outstanding requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "max_request_segments", CTLFLAG_RD, &xbd->xbd_max_request_segments, 0, "maximum number of pages per requests (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "max_request_size", CTLFLAG_RD, &xbd->xbd_max_request_size, 0, "maximum size in bytes of a request (negotiated)"); SYSCTL_ADD_UINT(sysctl_ctx, children, OID_AUTO, "ring_pages", CTLFLAG_RD, &xbd->xbd_ring_pages, 0, "communication channel pages (negotiated)"); SYSCTL_ADD_PROC(sysctl_ctx, children, OID_AUTO, "features", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, xbd, 0, xbd_sysctl_features, "A", "protocol features (negotiated)"); } /* * Translate Linux major/minor to an appropriate name and unit * number. For HVM guests, this allows us to use the same drive names * with blkfront as the emulated drives, easing transition slightly. 
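/*
 * The string assembled by xbd_feature_string() is exported read-only
 * through the "features" node registered in xbd_setup_sysctl().  A
 * minimal userland reader, assuming the conventional dev.<driver>.<unit>
 * sysctl naming (i.e. dev.xbd.0.features for the first instance):
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	char buf[80];
	size_t len = sizeof(buf);

	if (sysctlbyname("dev.xbd.0.features", buf, &len, NULL, 0) == 0)
		printf("xbd0 features: %s\n", buf); /* e.g. "flush, write_barrier" */
	return (0);
}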
*/ static void xbd_vdevice_to_unit(uint32_t vdevice, int *unit, const char **name) { static struct vdev_info { int major; int shift; int base; const char *name; } info[] = { {3, 6, 0, "ada"}, /* ide0 */ {22, 6, 2, "ada"}, /* ide1 */ {33, 6, 4, "ada"}, /* ide2 */ {34, 6, 6, "ada"}, /* ide3 */ {56, 6, 8, "ada"}, /* ide4 */ {57, 6, 10, "ada"}, /* ide5 */ {88, 6, 12, "ada"}, /* ide6 */ {89, 6, 14, "ada"}, /* ide7 */ {90, 6, 16, "ada"}, /* ide8 */ {91, 6, 18, "ada"}, /* ide9 */ {8, 4, 0, "da"}, /* scsi disk0 */ {65, 4, 16, "da"}, /* scsi disk1 */ {66, 4, 32, "da"}, /* scsi disk2 */ {67, 4, 48, "da"}, /* scsi disk3 */ {68, 4, 64, "da"}, /* scsi disk4 */ {69, 4, 80, "da"}, /* scsi disk5 */ {70, 4, 96, "da"}, /* scsi disk6 */ {71, 4, 112, "da"}, /* scsi disk7 */ {128, 4, 128, "da"}, /* scsi disk8 */ {129, 4, 144, "da"}, /* scsi disk9 */ {130, 4, 160, "da"}, /* scsi disk10 */ {131, 4, 176, "da"}, /* scsi disk11 */ {132, 4, 192, "da"}, /* scsi disk12 */ {133, 4, 208, "da"}, /* scsi disk13 */ {134, 4, 224, "da"}, /* scsi disk14 */ {135, 4, 240, "da"}, /* scsi disk15 */ {202, 4, 0, "xbd"}, /* xbd */ {0, 0, 0, NULL}, }; int major = vdevice >> 8; int minor = vdevice & 0xff; int i; if (vdevice & (1 << 28)) { *unit = (vdevice & ((1 << 28) - 1)) >> 8; *name = "xbd"; return; } for (i = 0; info[i].major; i++) { if (info[i].major == major) { *unit = info[i].base + (minor >> info[i].shift); *name = info[i].name; return; } } *unit = minor >> 4; *name = "xbd"; } int xbd_instance_create(struct xbd_softc *sc, blkif_sector_t sectors, int vdevice, uint16_t vdisk_info, unsigned long sector_size, unsigned long phys_sector_size) { char features[80]; int unit, error = 0; const char *name; xbd_vdevice_to_unit(vdevice, &unit, &name); sc->xbd_unit = unit; if (strcmp(name, "xbd") != 0) device_printf(sc->xbd_dev, "attaching as %s%d\n", name, unit); if (xbd_feature_string(sc, features, sizeof(features)) > 0) { device_printf(sc->xbd_dev, "features: %s\n", features); } sc->xbd_disk = disk_alloc(); sc->xbd_disk->d_unit = sc->xbd_unit; sc->xbd_disk->d_open = xbd_open; sc->xbd_disk->d_close = xbd_close; sc->xbd_disk->d_ioctl = xbd_ioctl; sc->xbd_disk->d_strategy = xbd_strategy; sc->xbd_disk->d_dump = xbd_dump; sc->xbd_disk->d_name = name; sc->xbd_disk->d_drv1 = sc; sc->xbd_disk->d_sectorsize = sector_size; sc->xbd_disk->d_stripesize = phys_sector_size; sc->xbd_disk->d_stripeoffset = 0; sc->xbd_disk->d_mediasize = sectors * sector_size; sc->xbd_disk->d_maxsize = sc->xbd_max_request_size; sc->xbd_disk->d_flags = DISKFLAG_UNMAPPED_BIO; if ((sc->xbd_flags & (XBDF_FLUSH|XBDF_BARRIER)) != 0) { sc->xbd_disk->d_flags |= DISKFLAG_CANFLUSHCACHE; device_printf(sc->xbd_dev, "synchronize cache commands enabled.\n"); } disk_create(sc->xbd_disk, DISK_VERSION); return error; } static void xbd_free(struct xbd_softc *sc) { int i; /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xbd_io_lock); sc->xbd_state = XBD_STATE_DISCONNECTED; mtx_unlock(&sc->xbd_io_lock); /* Free resources associated with old device channel. 
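/*
 * Worked examples for the table above: virtual-device 768 (0x300) is
 * Linux major 3, minor 0, so it maps to ada0; 51712 (0xca00) is major
 * 202, minor 0, giving xbd0; 51728 (0xca10, minor 16 shifted right by 4)
 * gives xbd1.  A value with bit 28 set uses the extended scheme instead,
 * taking the unit straight from bits 8-27, e.g. (1 << 28) | (5 << 8)
 * attaches as xbd5.
 */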
*/ xbd_free_ring(sc); if (sc->xbd_shadow) { for (i = 0; i < sc->xbd_max_requests; i++) { struct xbd_command *cm; cm = &sc->xbd_shadow[i]; if (cm->cm_sg_refs != NULL) { free(cm->cm_sg_refs, M_XENBLOCKFRONT); cm->cm_sg_refs = NULL; } if (cm->cm_indirectionpages != NULL) { gnttab_end_foreign_access_references( sc->xbd_max_request_indirectpages, &cm->cm_indirectionrefs[0]); contigfree(cm->cm_indirectionpages, PAGE_SIZE * sc->xbd_max_request_indirectpages, M_XENBLOCKFRONT); cm->cm_indirectionpages = NULL; } bus_dmamap_destroy(sc->xbd_io_dmat, cm->cm_map); } free(sc->xbd_shadow, M_XENBLOCKFRONT); sc->xbd_shadow = NULL; bus_dma_tag_destroy(sc->xbd_io_dmat); xbd_initq_cm(sc, XBD_Q_FREE); xbd_initq_cm(sc, XBD_Q_READY); xbd_initq_cm(sc, XBD_Q_COMPLETE); } xen_intr_unbind(&sc->xen_intr_handle); } /*--------------------------- State Change Handlers --------------------------*/ static void xbd_initialize(struct xbd_softc *sc) { const char *otherend_path; const char *node_path; uint32_t max_ring_page_order; int error; if (xenbus_get_state(sc->xbd_dev) != XenbusStateInitialising) { /* Initialization has already been performed. */ return; } /* * Protocol defaults valid even if negotiation for a * setting fails. */ max_ring_page_order = 0; sc->xbd_ring_pages = 1; /* * Protocol negotiation. * * \note xs_gather() returns on the first encountered error, so * we must use independent calls in order to guarantee * we don't miss information in a sparsly populated back-end * tree. * * \note xs_scanf() does not update variables for unmatched * fields. */ otherend_path = xenbus_get_otherend_path(sc->xbd_dev); node_path = xenbus_get_node(sc->xbd_dev); /* Support both backend schemes for relaying ring page limits. */ (void)xs_scanf(XST_NIL, otherend_path, "max-ring-page-order", NULL, "%" PRIu32, &max_ring_page_order); sc->xbd_ring_pages = 1 << max_ring_page_order; (void)xs_scanf(XST_NIL, otherend_path, "max-ring-pages", NULL, "%" PRIu32, &sc->xbd_ring_pages); if (sc->xbd_ring_pages < 1) sc->xbd_ring_pages = 1; if (sc->xbd_ring_pages > XBD_MAX_RING_PAGES) { device_printf(sc->xbd_dev, "Back-end specified ring-pages of %u " "limited to front-end limit of %u.\n", sc->xbd_ring_pages, XBD_MAX_RING_PAGES); sc->xbd_ring_pages = XBD_MAX_RING_PAGES; } if (powerof2(sc->xbd_ring_pages) == 0) { uint32_t new_page_limit; new_page_limit = 0x01 << (fls(sc->xbd_ring_pages) - 1); device_printf(sc->xbd_dev, "Back-end specified ring-pages of %u " "is not a power of 2. Limited to %u.\n", sc->xbd_ring_pages, new_page_limit); sc->xbd_ring_pages = new_page_limit; } sc->xbd_max_requests = BLKIF_MAX_RING_REQUESTS(sc->xbd_ring_pages * PAGE_SIZE); if (sc->xbd_max_requests > XBD_MAX_REQUESTS) { device_printf(sc->xbd_dev, "Back-end specified max_requests of %u " "limited to front-end limit of %zu.\n", sc->xbd_max_requests, XBD_MAX_REQUESTS); sc->xbd_max_requests = XBD_MAX_REQUESTS; } if (xbd_alloc_ring(sc) != 0) return; /* Support both backend schemes for relaying ring page limits. 
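/*
 * Worked example of the negotiation above: a backend advertising
 * max-ring-page-order = 2 yields 1 << 2 = 4 ring pages (subject to the
 * XBD_MAX_RING_PAGES clamp), while a legacy backend advertising
 * max-ring-pages = 3 is rounded down to the previous power of two,
 * 1 << (fls(3) - 1) = 2.  The request count then follows from the ring
 * size; with the standard 4 KiB blkif ring layout that is 32 requests
 * for a single page, so a 4-page ring permits up to 128 outstanding
 * requests before the XBD_MAX_REQUESTS clamp is applied.
 */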
*/ if (sc->xbd_ring_pages > 1) { error = xs_printf(XST_NIL, node_path, "num-ring-pages","%u", sc->xbd_ring_pages); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/num-ring-pages", node_path); return; } error = xs_printf(XST_NIL, node_path, "ring-page-order", "%u", fls(sc->xbd_ring_pages) - 1); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/ring-page-order", node_path); return; } } error = xs_printf(XST_NIL, node_path, "event-channel", "%u", xen_intr_port(sc->xen_intr_handle)); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/event-channel", node_path); return; } error = xs_printf(XST_NIL, node_path, "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE); if (error) { xenbus_dev_fatal(sc->xbd_dev, error, "writing %s/protocol", node_path); return; } xenbus_set_state(sc->xbd_dev, XenbusStateInitialised); } /* * Invoked when the backend is finally 'ready' (and has published * the details about the physical device - #sectors, size, etc). */ static void xbd_connect(struct xbd_softc *sc) { device_t dev = sc->xbd_dev; blkif_sector_t sectors; unsigned long sector_size, phys_sector_size; unsigned int binfo; int err, feature_barrier, feature_flush; int i, j; DPRINTK("blkfront.c:connect:%s.\n", xenbus_get_otherend_path(dev)); if (sc->xbd_state == XBD_STATE_SUSPENDED) { return; } if (sc->xbd_state == XBD_STATE_CONNECTED) { struct disk *disk; disk = sc->xbd_disk; if (disk == NULL) { return; } err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "sectors", "%"PRIu64, §ors, NULL); if (err != 0) { xenbus_dev_error(dev, err, "reading sectors at %s", xenbus_get_otherend_path(dev)); return; } disk->d_mediasize = disk->d_sectorsize * sectors; err = disk_resize(disk, M_NOWAIT); if (err) { xenbus_dev_error(dev, err, "unable to resize disk %s%u", disk->d_name, disk->d_unit); return; } device_printf(sc->xbd_dev, "changed capacity to %jd\n", (intmax_t)disk->d_mediasize); return; } err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "sectors", "%"PRIu64, §ors, "info", "%u", &binfo, "sector-size", "%lu", §or_size, NULL); if (err) { xenbus_dev_fatal(dev, err, "reading backend fields at %s", xenbus_get_otherend_path(dev)); return; } if ((sectors == 0) || (sector_size == 0)) { xenbus_dev_fatal(dev, 0, "invalid parameters from %s:" " sectors = %"PRIu64", sector_size = %lu", xenbus_get_otherend_path(dev), sectors, sector_size); return; } err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "physical-sector-size", "%lu", &phys_sector_size, NULL); if (err || phys_sector_size <= sector_size) phys_sector_size = 0; err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "feature-barrier", "%d", &feature_barrier, NULL); if (err == 0 && feature_barrier != 0) sc->xbd_flags |= XBDF_BARRIER; err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "feature-flush-cache", "%d", &feature_flush, NULL); if (err == 0 && feature_flush != 0) sc->xbd_flags |= XBDF_FLUSH; err = xs_gather(XST_NIL, xenbus_get_otherend_path(dev), "feature-max-indirect-segments", "%" PRIu32, &sc->xbd_max_request_segments, NULL); if ((err != 0) || (xbd_enable_indirect == 0)) sc->xbd_max_request_segments = 0; if (sc->xbd_max_request_segments > XBD_MAX_INDIRECT_SEGMENTS) sc->xbd_max_request_segments = XBD_MAX_INDIRECT_SEGMENTS; if (sc->xbd_max_request_segments > XBD_SIZE_TO_SEGS(maxphys)) sc->xbd_max_request_segments = XBD_SIZE_TO_SEGS(maxphys); sc->xbd_max_request_indirectpages = XBD_INDIRECT_SEGS_TO_PAGES(sc->xbd_max_request_segments); if (sc->xbd_max_request_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST) 
sc->xbd_max_request_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; sc->xbd_max_request_size = XBD_SEGS_TO_SIZE(sc->xbd_max_request_segments); /* Allocate datastructures based on negotiated values. */ err = bus_dma_tag_create( bus_get_dma_tag(sc->xbd_dev), /* parent */ 512, PAGE_SIZE, /* algnmnt, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ sc->xbd_max_request_size, sc->xbd_max_request_segments, PAGE_SIZE, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ busdma_lock_mutex, /* lockfunc */ &sc->xbd_io_lock, /* lockarg */ &sc->xbd_io_dmat); if (err != 0) { xenbus_dev_fatal(sc->xbd_dev, err, "Cannot allocate parent DMA tag\n"); return; } /* Per-transaction data allocation. */ sc->xbd_shadow = malloc(sizeof(*sc->xbd_shadow) * sc->xbd_max_requests, M_XENBLOCKFRONT, M_NOWAIT|M_ZERO); if (sc->xbd_shadow == NULL) { bus_dma_tag_destroy(sc->xbd_io_dmat); xenbus_dev_fatal(sc->xbd_dev, ENOMEM, "Cannot allocate request structures\n"); return; } for (i = 0; i < sc->xbd_max_requests; i++) { struct xbd_command *cm; void * indirectpages; cm = &sc->xbd_shadow[i]; cm->cm_sg_refs = malloc( sizeof(grant_ref_t) * sc->xbd_max_request_segments, M_XENBLOCKFRONT, M_NOWAIT); if (cm->cm_sg_refs == NULL) break; cm->cm_id = i; cm->cm_flags = XBDCF_INITIALIZER; cm->cm_sc = sc; if (bus_dmamap_create(sc->xbd_io_dmat, 0, &cm->cm_map) != 0) break; if (sc->xbd_max_request_indirectpages > 0) { indirectpages = contigmalloc( PAGE_SIZE * sc->xbd_max_request_indirectpages, M_XENBLOCKFRONT, M_ZERO | M_NOWAIT, 0, ~0, PAGE_SIZE, 0); if (indirectpages == NULL) sc->xbd_max_request_indirectpages = 0; } else { indirectpages = NULL; } for (j = 0; j < sc->xbd_max_request_indirectpages; j++) { if (gnttab_grant_foreign_access( xenbus_get_otherend_id(sc->xbd_dev), (vtophys(indirectpages) >> PAGE_SHIFT) + j, 1 /* grant read-only access */, &cm->cm_indirectionrefs[j])) break; } if (j < sc->xbd_max_request_indirectpages) { contigfree(indirectpages, PAGE_SIZE * sc->xbd_max_request_indirectpages, M_XENBLOCKFRONT); break; } cm->cm_indirectionpages = indirectpages; xbd_free_command(cm); } if (sc->xbd_disk == NULL) { device_printf(dev, "%juMB <%s> at %s", (uintmax_t) sectors / (1048576 / sector_size), device_get_desc(dev), xenbus_get_node(dev)); bus_print_child_footer(device_get_parent(dev), dev); xbd_instance_create(sc, sectors, sc->xbd_vdevice, binfo, sector_size, phys_sector_size); } (void)xenbus_set_state(dev, XenbusStateConnected); /* Kick pending requests. */ mtx_lock(&sc->xbd_io_lock); sc->xbd_state = XBD_STATE_CONNECTED; xbd_startio(sc); sc->xbd_flags |= XBDF_READY; mtx_unlock(&sc->xbd_io_lock); } /** * Handle the change of state of the backend to Closing. We must delete our * device-layer structures now, to ensure that writes are flushed through to * the backend. Once this is done, we can switch to Closed in * acknowledgement. 
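/*
 * Rough worked example for the indirect-segment setup above, assuming
 * 4 KiB pages and 8-byte blkif segment descriptors (so one indirection
 * page describes up to 512 data segments): a backend advertising
 * feature-max-indirect-segments = 256 ends up with a single indirection
 * page per command, granted read-only since the backend only ever reads
 * the descriptors from it, and a maximum request size on the order of
 * 256 pages (1 MiB), subject to the XBD_MAX_INDIRECT_SEGMENTS and
 * XBD_SIZE_TO_SEGS(maxphys) clamps applied during negotiation.
 */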
*/ static void xbd_closing(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); xenbus_set_state(dev, XenbusStateClosing); DPRINTK("xbd_closing: %s removed\n", xenbus_get_node(dev)); if (sc->xbd_disk != NULL) { disk_destroy(sc->xbd_disk); sc->xbd_disk = NULL; } xenbus_set_state(dev, XenbusStateClosed); } /*---------------------------- NewBus Entrypoints ----------------------------*/ static int xbd_probe(device_t dev) { if (strcmp(xenbus_get_type(dev), "vbd") != 0) return (ENXIO); if (xen_pv_disks_disabled()) return (ENXIO); if (xen_hvm_domain()) { int error; char *type; /* * When running in an HVM domain, IDE disk emulation is * disabled early in boot so that native drivers will * not see emulated hardware. However, CDROM device * emulation cannot be disabled. * * Through use of FreeBSD's vm_guest and xen_hvm_domain() * APIs, we could modify the native CDROM driver to fail its * probe when running under Xen. Unfortunatlely, the PV * CDROM support in XenServer (up through at least version * 6.2) isn't functional, so we instead rely on the emulated * CDROM instance, and fail to attach the PV one here in * the blkfront driver. */ error = xs_read(XST_NIL, xenbus_get_node(dev), "device-type", NULL, (void **) &type); if (error) return (ENXIO); if (strncmp(type, "cdrom", 5) == 0) { free(type, M_XENSTORE); return (ENXIO); } free(type, M_XENSTORE); } device_set_desc(dev, "Virtual Block Device"); device_quiet(dev); return (0); } /* * Setup supplies the backend dir, virtual device. We place an event * channel and shared frame entries. We watch backend to wait if it's * ok. */ static int xbd_attach(device_t dev) { struct xbd_softc *sc; const char *name; uint32_t vdevice; int error; int i; int unit; /* FIXME: Use dynamic device id if this is not set. */ error = xs_scanf(XST_NIL, xenbus_get_node(dev), "virtual-device", NULL, "%" PRIu32, &vdevice); if (error) error = xs_scanf(XST_NIL, xenbus_get_node(dev), "virtual-device-ext", NULL, "%" PRIu32, &vdevice); if (error) { xenbus_dev_fatal(dev, error, "reading virtual-device"); device_printf(dev, "Couldn't determine virtual device.\n"); return (error); } xbd_vdevice_to_unit(vdevice, &unit, &name); if (!strcmp(name, "xbd")) device_set_unit(dev, unit); sc = device_get_softc(dev); mtx_init(&sc->xbd_io_lock, "blkfront i/o lock", NULL, MTX_DEF); xbd_initqs(sc); for (i = 0; i < XBD_MAX_RING_PAGES; i++) sc->xbd_ring_ref[i] = GRANT_REF_INVALID; sc->xbd_dev = dev; sc->xbd_vdevice = vdevice; sc->xbd_state = XBD_STATE_DISCONNECTED; xbd_setup_sysctl(sc); /* Wait for backend device to publish its protocol capabilities. */ xenbus_set_state(dev, XenbusStateInitialising); return (0); } static int xbd_detach(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); DPRINTK("%s: %s removed\n", __func__, xenbus_get_node(dev)); xbd_free(sc); mtx_destroy(&sc->xbd_io_lock); return 0; } static int xbd_suspend(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); int retval; int saved_state; /* Prevent new requests being issued until we fix things up. */ mtx_lock(&sc->xbd_io_lock); saved_state = sc->xbd_state; sc->xbd_state = XBD_STATE_SUSPENDED; /* Wait for outstanding I/O to drain. 
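/*
 * Illustrative xenstore layout consumed by xbd_probe()/xbd_attach()
 * (the node path follows the conventional xenbus front-end naming and
 * the values are examples only):
 *
 *	device/vbd/51712/device-type    = "disk"   ("cdrom" makes the
 *	                                            probe return ENXIO)
 *	device/vbd/51712/virtual-device = "51712"  (or virtual-device-ext,
 *	                                            mapping to xbd0 here)
 */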
*/ retval = 0; while (xbd_queue_length(sc, XBD_Q_BUSY) != 0) { if (msleep(&sc->xbd_cm_q[XBD_Q_BUSY], &sc->xbd_io_lock, PRIBIO, "blkf_susp", 30 * hz) == EWOULDBLOCK) { retval = EBUSY; break; } } mtx_unlock(&sc->xbd_io_lock); if (retval != 0) sc->xbd_state = saved_state; return (retval); } static int xbd_resume(device_t dev) { struct xbd_softc *sc = device_get_softc(dev); if (xen_suspend_cancelled) { sc->xbd_state = XBD_STATE_CONNECTED; return (0); } DPRINTK("xbd_resume: %s\n", xenbus_get_node(dev)); xbd_free(sc); xbd_initialize(sc); return (0); } /** * Callback received when the backend's state changes. */ static void xbd_backend_changed(device_t dev, XenbusState backend_state) { struct xbd_softc *sc = device_get_softc(dev); DPRINTK("backend_state=%d\n", backend_state); switch (backend_state) { case XenbusStateUnknown: case XenbusStateInitialising: case XenbusStateReconfigured: case XenbusStateReconfiguring: case XenbusStateClosed: break; case XenbusStateInitWait: case XenbusStateInitialised: xbd_initialize(sc); break; case XenbusStateConnected: xbd_initialize(sc); xbd_connect(sc); break; case XenbusStateClosing: if (sc->xbd_users > 0) { device_printf(dev, "detaching with pending users\n"); KASSERT(sc->xbd_disk != NULL, ("NULL disk with pending users\n")); disk_gone(sc->xbd_disk); } else { xbd_closing(dev); } break; } } /*---------------------------- NewBus Registration ---------------------------*/ static device_method_t xbd_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xbd_probe), DEVMETHOD(device_attach, xbd_attach), DEVMETHOD(device_detach, xbd_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xbd_suspend), DEVMETHOD(device_resume, xbd_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, xbd_backend_changed), { 0, 0 } }; static driver_t xbd_driver = { "xbd", xbd_methods, sizeof(struct xbd_softc), }; devclass_t xbd_devclass; DRIVER_MODULE(xbd, xenbusb_front, xbd_driver, xbd_devclass, 0, 0); diff --git a/sys/dev/xen/console/xen_console.c b/sys/dev/xen/console/xen_console.c index c26b741c37fe..27da02723dc3 100644 --- a/sys/dev/xen/console/xen_console.c +++ b/sys/dev/xen/console/xen_console.c @@ -1,791 +1,791 @@ /* * Copyright (c) 2015 Julien Grall * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include #include "opt_ddb.h" #include "opt_printf.h" #ifdef DDB #include #endif static char driver_name[] = "xc"; struct xencons_priv; typedef void xencons_early_init_t(struct xencons_priv *cons); typedef int xencons_init_t(device_t dev, struct tty *tp, driver_intr_t intr_handler); typedef int xencons_read_t(struct xencons_priv *cons, char *buffer, unsigned int size); typedef int xencons_write_t(struct xencons_priv *cons, const char *buffer, unsigned int size); struct xencons_ops { /* * Called by the low-level driver during early boot. * Only the minimal set up to get a console should be done here. */ xencons_early_init_t *early_init; /* Prepare the console to be fully use */ xencons_init_t *init; /* Read/write helpers */ xencons_read_t *read; xencons_write_t *write; }; struct xencons_priv { /* Mutex to protect the shared ring and the internal buffers */ struct mtx mtx; /* Interrupt handler used for notify the backend */ xen_intr_handle_t intr_handle; /* KDB internal state */ #ifdef KDB int altbrk; #endif /* Status of the tty */ bool opened; /* Callout used when the write buffer is full */ struct callout callout; /* Internal buffers must be used with mtx locked */ #define WBUF_SIZE 4096 #define WBUF_MASK(_i) ((_i)&(WBUF_SIZE-1)) char wbuf[WBUF_SIZE]; unsigned int wc, wp; /* Consumer/producer wbuf */ #define RBUF_SIZE 1024 #define RBUF_MASK(_i) ((_i)&(RBUF_SIZE-1)) char rbuf[RBUF_SIZE]; unsigned int rc, rp; /* Consumer/producer rbuf */ /* Pointer to the console operations */ const struct xencons_ops *ops; /* * Ring specific fields * XXX: make an union? */ /* Event channel number for early notification (PV only) */ uint32_t evtchn; /* Console shared page */ struct xencons_interface *intf; }; /* * Data for the main console * Necessary to support low-level console driver */ static struct xencons_priv main_cons; #define XC_POLLTIME (hz/10) /*----------------------------- Debug function ------------------------------*/ struct putchar_arg { char *buf; size_t size; size_t n_next; }; static void putchar(int c, void *arg) { struct putchar_arg *pca; pca = (struct putchar_arg *)arg; if (pca->buf == NULL) { /* * We have no buffer, output directly to the * console char by char. */ HYPERVISOR_console_write((char *)&c, 1); } else { pca->buf[pca->n_next++] = c; if ((pca->size == pca->n_next) || (c = '\0')) { /* Flush the buffer */ HYPERVISOR_console_write(pca->buf, pca->n_next); pca->n_next = 0; } } } void xc_printf(const char *fmt, ...) { va_list ap; struct putchar_arg pca; #ifdef PRINTF_BUFR_SIZE char buf[PRINTF_BUFR_SIZE]; pca.buf = buf; pca.size = sizeof(buf); pca.n_next = 0; #else pca.buf = NULL; pca.size = 0; #endif KASSERT((xen_domain()), ("call to xc_printf from non Xen guest")); va_start(ap, fmt); kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); #ifdef PRINTF_BUFR_SIZE if (pca.n_next != 0) HYPERVISOR_console_write(buf, pca.n_next); #endif } /*---------------------- Helpers for the console lock -----------------------*/ /* * The lock is not used when the kernel is panicing as it will never recover * and we want to output no matter what it costs. 
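/*
 * A minimal sketch of the index scheme used by wbuf/rbuf above: the
 * producer and consumer counters run freely and are only masked when the
 * array is touched, so "wp - wc" is always the number of buffered
 * characters, even across unsigned wrap-around, as long as the buffer
 * size is a power of two.
 */
	unsigned int wc = 0, wp = 0;		/* free-running counters */
	char wbuf[WBUF_SIZE];

	wbuf[WBUF_MASK(wp++)] = 'x';		/* produce one character */
	while (wc != wp) {
		char c = wbuf[WBUF_MASK(wc++)];	/* consume in FIFO order */

		(void)c;
	}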
*/ static inline void xencons_lock(struct xencons_priv *cons) { if (!KERNEL_PANICKED()) mtx_lock_spin(&cons->mtx); } static inline void xencons_unlock(struct xencons_priv *cons) { if (!KERNEL_PANICKED()) mtx_unlock_spin(&cons->mtx); } #define xencons_lock_assert(cons) mtx_assert(&(cons)->mtx, MA_OWNED) /*------------------ Helpers for the hypervisor console ---------------------*/ static void xencons_early_init_hypervisor(struct xencons_priv *cons) { /* * Nothing to setup for the low-level console when using * the hypervisor console. */ } static int xencons_init_hypervisor(device_t dev, struct tty *tp, driver_intr_t intr_handler) { struct xencons_priv *cons; int err; cons = tty_softc(tp); err = xen_intr_bind_virq(dev, VIRQ_CONSOLE, 0, NULL, intr_handler, tp, INTR_TYPE_TTY | INTR_MPSAFE, &cons->intr_handle); if (err != 0) device_printf(dev, "Can't register console interrupt\n"); return (err); } static int xencons_write_hypervisor(struct xencons_priv *cons, const char *buffer, unsigned int size) { HYPERVISOR_console_io(CONSOLEIO_write, size, buffer); return (size); } static int xencons_read_hypervisor(struct xencons_priv *cons, char *buffer, unsigned int size) { xencons_lock_assert(cons); return (HYPERVISOR_console_io(CONSOLEIO_read, size, buffer)); } static const struct xencons_ops xencons_hypervisor_ops = { .early_init = xencons_early_init_hypervisor, .init = xencons_init_hypervisor, .read = xencons_read_hypervisor, .write = xencons_write_hypervisor, }; /*------------------ Helpers for the ring console ---------------------------*/ static void xencons_early_init_ring(struct xencons_priv *cons) { cons->intf = pmap_mapdev_attr(ptoa(xen_get_console_mfn()), PAGE_SIZE, VM_MEMATTR_XEN); cons->evtchn = xen_get_console_evtchn(); } static int xencons_init_ring(device_t dev, struct tty *tp, driver_intr_t intr_handler) { struct xencons_priv *cons; int err; cons = tty_softc(tp); if (cons->evtchn == 0) return (ENODEV); err = xen_intr_bind_local_port(dev, cons->evtchn, NULL, intr_handler, tp, INTR_TYPE_TTY | INTR_MPSAFE, &cons->intr_handle); if (err != 0) return (err); return (0); } static void xencons_notify_ring(struct xencons_priv *cons) { /* * The console may be used before the ring interrupt is properly * initialized. * If so, fallback to directly use the event channel hypercall. 
*/ if (__predict_true(cons->intr_handle != NULL)) xen_intr_signal(cons->intr_handle); else { struct evtchn_send send = { .port = cons->evtchn }; HYPERVISOR_event_channel_op(EVTCHNOP_send, &send); } } static int xencons_write_ring(struct xencons_priv *cons, const char *buffer, unsigned int size) { struct xencons_interface *intf; XENCONS_RING_IDX wcons, wprod; int sent; intf = cons->intf; xencons_lock_assert(cons); wcons = intf->out_cons; wprod = intf->out_prod; mb(); KASSERT((wprod - wcons) <= sizeof(intf->out), ("console send ring inconsistent")); for (sent = 0; sent < size; sent++, wprod++) { if ((wprod - wcons) >= sizeof(intf->out)) break; intf->out[MASK_XENCONS_IDX(wprod, intf->out)] = buffer[sent]; } wmb(); intf->out_prod = wprod; xencons_notify_ring(cons); return (sent); } static int xencons_read_ring(struct xencons_priv *cons, char *buffer, unsigned int size) { struct xencons_interface *intf; XENCONS_RING_IDX rcons, rprod; unsigned int rsz; intf = cons->intf; xencons_lock_assert(cons); rcons = intf->in_cons; rprod = intf->in_prod; rmb(); for (rsz = 0; rsz < size; rsz++, rcons++) { if (rprod == rcons) break; buffer[rsz] = intf->in[MASK_XENCONS_IDX(rcons, intf->in)]; } wmb(); intf->in_cons = rcons; /* No need to notify the backend if nothing has been read */ if (rsz != 0) xencons_notify_ring(cons); return (rsz); } static const struct xencons_ops xencons_ring_ops = { .early_init = xencons_early_init_ring, .init = xencons_init_ring, .read = xencons_read_ring, .write = xencons_write_ring, }; /*------------------ Common implementation of the console -------------------*/ /* * Called by the low-level driver during early boot to initialize the * main console driver. * Only the minimal set up to get a console should be done here. */ static void xencons_early_init(void) { mtx_init(&main_cons.mtx, "XCONS LOCK", NULL, MTX_SPIN); if (xen_get_console_evtchn() == 0) main_cons.ops = &xencons_hypervisor_ops; else main_cons.ops = &xencons_ring_ops; main_cons.ops->early_init(&main_cons); } /* * Receive character from the console and put them in the internal buffer * XXX: Handle overflow of the internal buffer */ static void xencons_rx(struct xencons_priv *cons) { char buf[16]; int sz; xencons_lock(cons); while ((sz = cons->ops->read(cons, buf, sizeof(buf))) > 0) { int i; for (i = 0; i < sz; i++) cons->rbuf[RBUF_MASK(cons->rp++)] = buf[i]; } xencons_unlock(cons); } /* Return true if the write buffer is full */ static bool xencons_tx_full(struct xencons_priv *cons) { unsigned int used; xencons_lock(cons); used = cons->wp - cons->wc; xencons_unlock(cons); return (used >= WBUF_SIZE); } static void xencons_tx_flush(struct xencons_priv *cons, int force) { int sz; xencons_lock(cons); while (cons->wc != cons->wp) { int sent; sz = cons->wp - cons->wc; if (sz > (WBUF_SIZE - WBUF_MASK(cons->wc))) sz = WBUF_SIZE - WBUF_MASK(cons->wc); sent = cons->ops->write(cons, &cons->wbuf[WBUF_MASK(cons->wc)], sz); /* * The other end may not have been initialized. Ignore * the force. */ if (__predict_false(sent < 0)) break; /* * If force is set, spin until the console data is * flushed through the domain controller. 
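/*
 * The shared xencons_interface ring written above uses the same
 * free-running index convention: out_prod and out_cons only ever grow,
 * MASK_XENCONS_IDX() maps them into the fixed "out" array, and
 * out_prod - out_cons is the number of bytes the backend has not yet
 * consumed (the KASSERT checks exactly that invariant).  The barriers
 * order data against index updates: mb() after the indices are read and
 * before the ring is filled, wmb() before the new out_prod is published,
 * and only then is the backend notified through the event channel.
 */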
*/ if (sent == 0 && __predict_true(!force)) break; cons->wc += sent; } xencons_unlock(cons); } static bool xencons_putc(struct xencons_priv *cons, int c, bool force_flush) { xencons_lock(cons); if ((cons->wp - cons->wc) < WBUF_SIZE) cons->wbuf[WBUF_MASK(cons->wp++)] = c; xencons_unlock(cons); xencons_tx_flush(cons, force_flush); return (xencons_tx_full(cons)); } static int xencons_getc(struct xencons_priv *cons) { int ret; xencons_lock(cons); if (cons->rp != cons->rc) { /* We need to return only one char */ ret = (int)cons->rbuf[RBUF_MASK(cons->rc)]; cons->rc++; } else { ret = -1; } xencons_unlock(cons); return (ret); } static bool xencons_tx(struct tty *tp) { bool cons_full; char c; struct xencons_priv *cons; cons = tty_softc(tp); tty_assert_locked(tp); /* * Don't transmit any character if the buffer is full. Otherwise, * characters may be lost */ if (xencons_tx_full(cons)) return (false); cons_full = false; while (!cons_full && ttydisc_getc(tp, &c, 1) == 1) cons_full = xencons_putc(cons, c, false); return (!cons_full); } static void xencons_intr(void *arg) { struct tty *tp; struct xencons_priv *cons; int ret; tp = arg; cons = tty_softc(tp); /* * The input will be used by the low-level console when KDB is active */ if (kdb_active) return; /* * It's not necessary to retrieve input when the tty is not opened */ if (!cons->opened) return; xencons_rx(cons); tty_lock(tp); while ((ret = xencons_getc(cons)) != -1) { #ifdef KDB kdb_alt_break(ret, &cons->altbrk); #endif ttydisc_rint(tp, ret, 0); } ttydisc_rint_done(tp); tty_unlock(tp); /* Try to flush remaining characters if necessary */ xencons_tx_flush(cons, 0); } /* * Helpers to call while shutting down: * - Force flush all output */ static void xencons_shutdown(void *arg, int howto) { struct tty *tp; tp = arg; xencons_tx_flush(tty_softc(tp), 1); } /*---------------------- Low-level console driver ---------------------------*/ static void xencons_cnprobe(struct consdev *cp) { if (!xen_domain()) return; cp->cn_pri = (boothowto & RB_SERIAL) ? CN_REMOTE : CN_NORMAL; sprintf(cp->cn_name, "%s0", driver_name); } static void xencons_cninit(struct consdev *cp) { xencons_early_init(); } static void xencons_cnterm(struct consdev *cp) { } static void xencons_cngrab(struct consdev *cp) { } static void xencons_cnungrab(struct consdev *cp) { } static int xencons_cngetc(struct consdev *dev) { xencons_rx(&main_cons); return (xencons_getc(&main_cons)); } static void xencons_cnputc(struct consdev *dev, int c) { /* * The low-level console is used by KDB and panic. We have to ensure * that any character sent will be seen by the backend. 
*/ xencons_putc(&main_cons, c, true); } CONSOLE_DRIVER(xencons); /*----------------------------- TTY driver ---------------------------------*/ static int xencons_tty_open(struct tty *tp) { struct xencons_priv *cons; cons = tty_softc(tp); cons->opened = true; return (0); } static void xencons_tty_close(struct tty *tp) { struct xencons_priv *cons; cons = tty_softc(tp); cons->opened = false; } static void xencons_timeout(void *v) { struct tty *tp; struct xencons_priv *cons; tp = v; cons = tty_softc(tp); if (!xencons_tx(tp)) callout_reset(&cons->callout, XC_POLLTIME, xencons_timeout, tp); } static void xencons_tty_outwakeup(struct tty *tp) { struct xencons_priv *cons; cons = tty_softc(tp); callout_stop(&cons->callout); if (!xencons_tx(tp)) callout_reset(&cons->callout, XC_POLLTIME, xencons_timeout, tp); } static struct ttydevsw xencons_ttydevsw = { .tsw_flags = TF_NOPREFIX, .tsw_open = xencons_tty_open, .tsw_close = xencons_tty_close, .tsw_outwakeup = xencons_tty_outwakeup, }; /*------------------------ Main console driver ------------------------------*/ static void xencons_identify(driver_t *driver, device_t parent) { device_t child __unused; if (main_cons.ops == NULL) return; child = BUS_ADD_CHILD(parent, 0, driver_name, 0); } static int xencons_probe(device_t dev) { device_set_desc(dev, "Xen Console"); return (BUS_PROBE_NOWILDCARD); } static int xencons_attach(device_t dev) { struct tty *tp; /* * The main console is already allocated statically in order to * support low-level console */ struct xencons_priv *cons; int err; cons = &main_cons; tp = tty_alloc(&xencons_ttydevsw, cons); tty_makedev(tp, NULL, "%s%r", driver_name, 0); device_set_softc(dev, tp); callout_init_mtx(&cons->callout, tty_getlock(tp), 0); err = cons->ops->init(dev, tp, xencons_intr); if (err != 0) { device_printf(dev, "Unable to initialize the console (%d)\n", err); return (err); } /* register handler to flush console on shutdown */ if ((EVENTHANDLER_REGISTER(shutdown_post_sync, xencons_shutdown, tp, SHUTDOWN_PRI_DEFAULT)) == NULL) device_printf(dev, "shutdown event registration failed!\n"); return (0); } static int xencons_resume(device_t dev) { struct xencons_priv *cons; struct tty *tp; int err; tp = device_get_softc(dev); cons = tty_softc(tp); xen_intr_unbind(&cons->intr_handle); err = cons->ops->init(dev, tp, xencons_intr); if (err != 0) { device_printf(dev, "Unable to resume the console (%d)\n", err); return (err); } return (0); } static devclass_t xencons_devclass; static device_method_t xencons_methods[] = { DEVMETHOD(device_identify, xencons_identify), DEVMETHOD(device_probe, xencons_probe), DEVMETHOD(device_attach, xencons_attach), DEVMETHOD(device_resume, xencons_resume), DEVMETHOD_END }; static driver_t xencons_driver = { driver_name, xencons_methods, 0, }; DRIVER_MODULE(xc, xenpv, xencons_driver, xencons_devclass, 0, 0); diff --git a/sys/dev/xen/control/control.c b/sys/dev/xen/control/control.c index b42b52a62411..903864d84598 100644 --- a/sys/dev/xen/control/control.c +++ b/sys/dev/xen/control/control.c @@ -1,492 +1,492 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD AND BSD-4-Clause * * Copyright (c) 2010 Justin T. Gibbs, Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. 
Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. */ /*- * PV suspend/resume support: * * Copyright (c) 2004 Christian Limpach. * Copyright (c) 2004-2006,2008 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by Christian Limpach. * 4. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * HVM suspend/resume support: * * Copyright (c) 2008 Citrix Systems, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); /** * \file control.c * * \brief Device driver to repond to control domain events that impact * this VM. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__amd64__) || defined(__i386__) #include #include #endif #include #include #include #include #include #include #include #include #include -#include -#include +#include +#include #include bool xen_suspend_cancelled; /*--------------------------- Forward Declarations --------------------------*/ /** Function signature for shutdown event handlers. */ typedef void (xctrl_shutdown_handler_t)(void); static xctrl_shutdown_handler_t xctrl_poweroff; static xctrl_shutdown_handler_t xctrl_reboot; static xctrl_shutdown_handler_t xctrl_suspend; static xctrl_shutdown_handler_t xctrl_crash; /*-------------------------- Private Data Structures -------------------------*/ /** Element type for lookup table of event name to handler. */ struct xctrl_shutdown_reason { const char *name; xctrl_shutdown_handler_t *handler; }; /** Lookup table for shutdown event name to handler. */ static const struct xctrl_shutdown_reason xctrl_shutdown_reasons[] = { { "poweroff", xctrl_poweroff }, { "reboot", xctrl_reboot }, { "suspend", xctrl_suspend }, { "crash", xctrl_crash }, { "halt", xctrl_poweroff }, }; struct xctrl_softc { struct xs_watch xctrl_watch; }; /*------------------------------ Event Handlers ------------------------------*/ static void xctrl_poweroff() { shutdown_nice(RB_POWEROFF|RB_HALT); } static void xctrl_reboot() { shutdown_nice(0); } #if !defined(__amd64__) && !defined(__i386__) static void xctrl_suspend() { printf("WARNING: xen/control: Suspend not supported!\n"); } #else /* __amd64__ || __i386__ */ static void xctrl_suspend() { #ifdef SMP cpuset_t cpu_suspend_map; #endif EVENTHANDLER_INVOKE(power_suspend_early); xs_lock(); stop_all_proc(); xs_unlock(); suspend_all_fs(); EVENTHANDLER_INVOKE(power_suspend); #ifdef EARLY_AP_STARTUP MPASS(mp_ncpus == 1 || smp_started); thread_lock(curthread); sched_bind(curthread, 0); thread_unlock(curthread); #else if (smp_started) { thread_lock(curthread); sched_bind(curthread, 0); thread_unlock(curthread); } #endif KASSERT((PCPU_GET(cpuid) == 0), ("Not running on CPU#0")); /* * Be sure to hold Giant across DEVICE_SUSPEND/RESUME. */ bus_topo_lock(); if (DEVICE_SUSPEND(root_bus) != 0) { bus_topo_unlock(); printf("%s: device_suspend failed\n", __func__); return; } #ifdef SMP #ifdef EARLY_AP_STARTUP /* * Suspend other CPUs. This prevents IPIs while we * are resuming, and will allow us to reset per-cpu * vcpu_info on resume. 
*/ cpu_suspend_map = all_cpus; CPU_CLR(PCPU_GET(cpuid), &cpu_suspend_map); if (!CPU_EMPTY(&cpu_suspend_map)) suspend_cpus(cpu_suspend_map); #else CPU_ZERO(&cpu_suspend_map); /* silence gcc */ if (smp_started) { /* * Suspend other CPUs. This prevents IPIs while we * are resuming, and will allow us to reset per-cpu * vcpu_info on resume. */ cpu_suspend_map = all_cpus; CPU_CLR(PCPU_GET(cpuid), &cpu_suspend_map); if (!CPU_EMPTY(&cpu_suspend_map)) suspend_cpus(cpu_suspend_map); } #endif #endif /* * Prevent any races with evtchn_interrupt() handler. */ disable_intr(); intr_suspend(); xen_hvm_suspend(); xen_suspend_cancelled = !!HYPERVISOR_suspend(0); if (!xen_suspend_cancelled) { xen_hvm_resume(false); } intr_resume(xen_suspend_cancelled != 0); enable_intr(); /* * Reset grant table info. */ if (!xen_suspend_cancelled) { gnttab_resume(NULL); } #ifdef SMP if (!CPU_EMPTY(&cpu_suspend_map)) { /* * Now that event channels have been initialized, * resume CPUs. */ resume_cpus(cpu_suspend_map); #if defined(__amd64__) || defined(__i386__) /* Send an IPI_BITMAP in case there are pending bitmap IPIs. */ lapic_ipi_vectored(IPI_BITMAP_VECTOR, APIC_IPI_DEST_ALL); #endif } #endif /* * FreeBSD really needs to add DEVICE_SUSPEND_CANCEL or * similar. */ DEVICE_RESUME(root_bus); bus_topo_unlock(); /* * Warm up timecounter again and reset system clock. */ timecounter->tc_get_timecount(timecounter); inittodr(time_second); #ifdef EARLY_AP_STARTUP thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); #else if (smp_started) { thread_lock(curthread); sched_unbind(curthread); thread_unlock(curthread); } #endif resume_all_fs(); resume_all_proc(); EVENTHANDLER_INVOKE(power_resume); if (bootverbose) printf("System resumed after suspension\n"); } #endif /* __amd64__ || __i386__ */ static void xctrl_crash() { panic("Xen directed crash"); } static void shutdown_final(void *arg, int howto) { /* Inform the hypervisor that shutdown is complete. */ if (howto & RB_POWEROFF) HYPERVISOR_shutdown(SHUTDOWN_poweroff); else if (howto & RB_POWERCYCLE) HYPERVISOR_shutdown(SHUTDOWN_reboot); } /*------------------------------ Event Reception -----------------------------*/ static void xctrl_on_watch_event(struct xs_watch *watch, const char **vec, unsigned int len) { const struct xctrl_shutdown_reason *reason; const struct xctrl_shutdown_reason *last_reason; char *result; int error; int result_len; error = xs_read(XST_NIL, "control", "shutdown", &result_len, (void **)&result); if (error != 0 || result_len == 0) return; /* Acknowledge the request by writing back an empty string. */ error = xs_write(XST_NIL, "control", "shutdown", ""); if (error != 0) printf("unable to ack shutdown request, proceeding anyway\n"); reason = xctrl_shutdown_reasons; last_reason = reason + nitems(xctrl_shutdown_reasons); while (reason < last_reason) { if (!strcmp(result, reason->name)) { reason->handler(); break; } reason++; } free(result, M_XENSTORE); } /*------------------ Private Device Attachment Functions --------------------*/ /** * \brief Identify instances of this device type in the system. * * \param driver The driver performing this identify action. * \param parent The NewBus parent device for any devices this method adds. */ static void xctrl_identify(driver_t *driver __unused, device_t parent) { /* * A single device instance for our driver is always present * in a system operating under Xen. 
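/*
 * The watch registered in xctrl_attach() below fires whenever the
 * toolstack stores a value in this domain's control/shutdown xenstore
 * node; xctrl_on_watch_event() reads it, acknowledges it by writing back
 * an empty string, and dispatches through xctrl_shutdown_reasons[].  As
 * an illustration (the absolute path shown is the conventional one), a
 * graceful "xl shutdown" from dom0 amounts to:
 *
 *	xenstore-write /local/domain/<domid>/control/shutdown "poweroff"
 *
 * after which this driver calls shutdown_nice(RB_POWEROFF | RB_HALT).
 */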
*/ BUS_ADD_CHILD(parent, 0, driver->name, 0); } /** * \brief Probe for the existence of the Xen Control device * * \param dev NewBus device_t for this Xen control instance. * * \return Always returns 0 indicating success. */ static int xctrl_probe(device_t dev) { device_set_desc(dev, "Xen Control Device"); return (BUS_PROBE_NOWILDCARD); } /** * \brief Attach the Xen control device. * * \param dev NewBus device_t for this Xen control instance. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ static int xctrl_attach(device_t dev) { struct xctrl_softc *xctrl; xctrl = device_get_softc(dev); /* Activate watch */ xctrl->xctrl_watch.node = "control/shutdown"; xctrl->xctrl_watch.callback = xctrl_on_watch_event; xctrl->xctrl_watch.callback_data = (uintptr_t)xctrl; /* * We don't care about the path updated, just about the value changes * on that single node, hence there's no need to queue more that one * event. */ xctrl->xctrl_watch.max_pending = 1; xs_register_watch(&xctrl->xctrl_watch); EVENTHANDLER_REGISTER(shutdown_final, shutdown_final, NULL, SHUTDOWN_PRI_LAST); return (0); } /** * \brief Detach the Xen control device. * * \param dev NewBus device_t for this Xen control device instance. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ static int xctrl_detach(device_t dev) { struct xctrl_softc *xctrl; xctrl = device_get_softc(dev); /* Release watch */ xs_unregister_watch(&xctrl->xctrl_watch); return (0); } /*-------------------- Private Device Attachment Data -----------------------*/ static device_method_t xctrl_methods[] = { /* Device interface */ DEVMETHOD(device_identify, xctrl_identify), DEVMETHOD(device_probe, xctrl_probe), DEVMETHOD(device_attach, xctrl_attach), DEVMETHOD(device_detach, xctrl_detach), DEVMETHOD_END }; DEFINE_CLASS_0(xctrl, xctrl_driver, xctrl_methods, sizeof(struct xctrl_softc)); devclass_t xctrl_devclass; DRIVER_MODULE(xctrl, xenstore, xctrl_driver, xctrl_devclass, NULL, NULL); diff --git a/sys/dev/xen/efi/pvefi.c b/sys/dev/xen/efi/pvefi.c index 65778b7bcee1..77557692cbf1 100644 --- a/sys/dev/xen/efi/pvefi.c +++ b/sys/dev/xen/efi/pvefi.c @@ -1,255 +1,255 @@ /*- * Copyright (c) 2021 Citrix Systems R&D * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include extern char bootmethod[16]; static int rt_ok(void) { return (0); } static int get_time(struct efi_tm *tm) { struct xen_platform_op op = { .cmd = XENPF_efi_runtime_call, .u.efi_runtime_call.function = XEN_EFI_get_time, }; struct xenpf_efi_runtime_call *call = &op.u.efi_runtime_call; int error; error = HYPERVISOR_platform_op(&op); if (error != 0) return (xen_translate_error(error)); tm->tm_year = call->u.get_time.time.year; tm->tm_mon = call->u.get_time.time.month; tm->tm_mday = call->u.get_time.time.day; tm->tm_hour = call->u.get_time.time.hour; tm->tm_min = call->u.get_time.time.min; tm->tm_sec = call->u.get_time.time.sec; tm->tm_nsec = call->u.get_time.time.ns; tm->tm_tz = call->u.get_time.time.tz; tm->tm_dst = call->u.get_time.time.daylight; return (efi_status_to_errno(call->status)); } static int get_time_capabilities(struct efi_tmcap *tmcap) { struct xen_platform_op op = { .cmd = XENPF_efi_runtime_call, .u.efi_runtime_call.function = XEN_EFI_get_time, }; struct xenpf_efi_runtime_call *call = &op.u.efi_runtime_call; int error; error = HYPERVISOR_platform_op(&op); if (error != 0) return (xen_translate_error(error)); tmcap->tc_res = call->u.get_time.resolution; tmcap->tc_prec = call->u.get_time.accuracy; tmcap->tc_stz = call->misc & XEN_EFI_GET_TIME_SET_CLEARS_NS; return (efi_status_to_errno(call->status)); } static int set_time(struct efi_tm *tm) { struct xen_platform_op op = { .cmd = XENPF_efi_runtime_call, .u.efi_runtime_call.function = XEN_EFI_get_time, .u.efi_runtime_call.u.set_time.year = tm->tm_year, .u.efi_runtime_call.u.set_time.month = tm->tm_mon, .u.efi_runtime_call.u.set_time.day = tm->tm_mday, .u.efi_runtime_call.u.set_time.hour = tm->tm_hour, .u.efi_runtime_call.u.set_time.min = tm->tm_min, .u.efi_runtime_call.u.set_time.sec = tm->tm_sec, .u.efi_runtime_call.u.set_time.ns = tm->tm_nsec, .u.efi_runtime_call.u.set_time.tz = tm->tm_tz, .u.efi_runtime_call.u.set_time.daylight = tm->tm_dst, }; int error; error = HYPERVISOR_platform_op(&op); return ((error != 0) ? 
xen_translate_error(error) : efi_status_to_errno(op.u.efi_runtime_call.status)); } static int var_get(efi_char *name, struct uuid *vendor, uint32_t *attrib, size_t *datasize, void *data) { struct xen_platform_op op = { .cmd = XENPF_efi_runtime_call, .u.efi_runtime_call.function = XEN_EFI_get_variable, .u.efi_runtime_call.u.get_variable.size = *datasize, }; struct xenpf_efi_runtime_call *call = &op.u.efi_runtime_call; int error; CTASSERT(sizeof(*vendor) == sizeof(call->u.get_variable.vendor_guid)); memcpy(&call->u.get_variable.vendor_guid, vendor, sizeof(*vendor)); set_xen_guest_handle(call->u.get_variable.name, name); set_xen_guest_handle(call->u.get_variable.data, data); error = HYPERVISOR_platform_op(&op); if (error != 0) return (xen_translate_error(error)); *attrib = call->misc; *datasize = call->u.get_variable.size; return (efi_status_to_errno(call->status)); } static int var_nextname(size_t *namesize, efi_char *name, struct uuid *vendor) { struct xen_platform_op op = { .cmd = XENPF_efi_runtime_call, .u.efi_runtime_call.function = XEN_EFI_get_next_variable_name, .u.efi_runtime_call.u.get_next_variable_name.size = *namesize, }; struct xenpf_efi_runtime_call *call = &op.u.efi_runtime_call; int error; memcpy(&call->u.get_next_variable_name.vendor_guid, vendor, sizeof(*vendor)); set_xen_guest_handle(call->u.get_next_variable_name.name, name); error = HYPERVISOR_platform_op(&op); if (error != 0) return (xen_translate_error(error)); *namesize = call->u.get_next_variable_name.size; memcpy(vendor, &call->u.get_next_variable_name.vendor_guid, sizeof(*vendor)); return (efi_status_to_errno(call->status)); } static int var_set(efi_char *name, struct uuid *vendor, uint32_t attrib, size_t datasize, void *data) { struct xen_platform_op op = { .cmd = XENPF_efi_runtime_call, .u.efi_runtime_call.function = XEN_EFI_set_variable, .u.efi_runtime_call.misc = attrib, .u.efi_runtime_call.u.set_variable.size = datasize, }; struct xenpf_efi_runtime_call *call = &op.u.efi_runtime_call; int error; memcpy(&call->u.set_variable.vendor_guid, vendor, sizeof(*vendor)); set_xen_guest_handle(call->u.set_variable.name, name); set_xen_guest_handle(call->u.set_variable.data, data); error = HYPERVISOR_platform_op(&op); return ((error != 0) ? 
xen_translate_error(error) : efi_status_to_errno(call->status)); } const static struct efi_ops pvefi_ops = { .rt_ok = rt_ok, .get_time = get_time, .get_time_capabilities = get_time_capabilities, .set_time = set_time, .var_get = var_get, .var_nextname = var_nextname, .var_set = var_set, }; static int modevents(module_t m, int event, void *arg __unused) { const static struct efi_ops *prev; int rt_disabled; switch (event) { case MOD_LOAD: rt_disabled = 0; TUNABLE_INT_FETCH("efi.rt.disabled", &rt_disabled); if (!xen_initial_domain() || strcmp("UEFI", bootmethod) != 0 || rt_disabled == 1) return (0); prev = active_efi_ops; active_efi_ops = &pvefi_ops; return (0); case MOD_UNLOAD: if (prev != NULL) active_efi_ops = prev; return (0); case MOD_SHUTDOWN: return (0); default: return (EOPNOTSUPP); } } static moduledata_t moddata = { .name = "pvefirt", .evhand = modevents, .priv = NULL, }; /* After fpuinitstate, before efidev */ DECLARE_MODULE(pvefirt, moddata, SI_SUB_DRIVERS, SI_ORDER_SECOND); MODULE_VERSION(pvefirt, 1); diff --git a/sys/dev/xen/netback/netback.c b/sys/dev/xen/netback/netback.c index 8de4e11c61d9..bf54f3a2f28e 100644 --- a/sys/dev/xen/netback/netback.c +++ b/sys/dev/xen/netback/netback.c @@ -1,2505 +1,2505 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009-2011 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. Redistributions in binary form must reproduce at minimum a disclaimer * substantially similar to the "NO WARRANTY" disclaimer below * ("Disclaimer") and any redistribution must be conditioned upon * including a substantially similar Disclaimer requirement for further * binary redistribution. * * NO WARRANTY * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGES. * * Authors: Justin T. Gibbs (Spectra Logic Corporation) * Alan Somers (Spectra Logic Corporation) * John Suykerbuyk (Spectra Logic Corporation) */ #include __FBSDID("$FreeBSD$"); /** * \file netback.c * * \brief Device driver supporting the vending of network access * from this FreeBSD domain to other domains. 
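 *
 * Guest transmissions arrive on the shared TX ring and are turned into
 * mbuf chains by xnb_intr()/xnb_recv() before being handed to the host
 * network stack; traffic destined for the guest is copied onto the RX
 * ring via the xnb_start()/xnb_send() path.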
*/ #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include #include /*--------------------------- Compile-time Tunables --------------------------*/ /*---------------------------------- Macros ----------------------------------*/ /** * Custom malloc type for all driver allocations. */ static MALLOC_DEFINE(M_XENNETBACK, "xnb", "Xen Net Back Driver Data"); #define XNB_SG 1 /* netback driver supports feature-sg */ #define XNB_GSO_TCPV4 0 /* netback driver supports feature-gso-tcpv4 */ #define XNB_RX_COPY 1 /* netback driver supports feature-rx-copy */ #define XNB_RX_FLIP 0 /* netback driver does not support feature-rx-flip */ #undef XNB_DEBUG #define XNB_DEBUG /* hardcode on during development */ #ifdef XNB_DEBUG #define DPRINTF(fmt, args...) \ printf("xnb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) #else #define DPRINTF(fmt, args...) do {} while (0) #endif /* Default length for stack-allocated grant tables */ #define GNTTAB_LEN (64) /* Features supported by all backends. TSO and LRO can be negotiated */ #define XNB_CSUM_FEATURES (CSUM_TCP | CSUM_UDP) #define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) #define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) /** * Two argument version of the standard macro. Second argument is a tentative * value of req_cons */ #define RING_HAS_UNCONSUMED_REQUESTS_2(_r, cons) ({ \ unsigned int req = (_r)->sring->req_prod - cons; \ unsigned int rsp = RING_SIZE(_r) - \ (cons - (_r)->rsp_prod_pvt); \ req < rsp ? req : rsp; \ }) #define virt_to_mfn(x) (vtophys(x) >> PAGE_SHIFT) #define virt_to_offset(x) ((x) & (PAGE_SIZE - 1)) /** * Predefined array type of grant table copy descriptors. Used to pass around * statically allocated memory structures. */ typedef struct gnttab_copy gnttab_copy_table[GNTTAB_LEN]; /*--------------------------- Forward Declarations ---------------------------*/ struct xnb_softc; struct xnb_pkt; static void xnb_attach_failed(struct xnb_softc *xnb, int err, const char *fmt, ...) 
__printflike(3,4); static int xnb_shutdown(struct xnb_softc *xnb); static int create_netdev(device_t dev); static int xnb_detach(device_t dev); static int xnb_ifmedia_upd(struct ifnet *ifp); static void xnb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); static void xnb_intr(void *arg); static int xnb_send(netif_rx_back_ring_t *rxb, domid_t otherend, const struct mbuf *mbufc, gnttab_copy_table gnttab); static int xnb_recv(netif_tx_back_ring_t *txb, domid_t otherend, struct mbuf **mbufc, struct ifnet *ifnet, gnttab_copy_table gnttab); static int xnb_ring2pkt(struct xnb_pkt *pkt, const netif_tx_back_ring_t *tx_ring, RING_IDX start); static void xnb_txpkt2rsp(const struct xnb_pkt *pkt, netif_tx_back_ring_t *ring, int error); static struct mbuf *xnb_pkt2mbufc(const struct xnb_pkt *pkt, struct ifnet *ifp); static int xnb_txpkt2gnttab(const struct xnb_pkt *pkt, struct mbuf *mbufc, gnttab_copy_table gnttab, const netif_tx_back_ring_t *txb, domid_t otherend_id); static void xnb_update_mbufc(struct mbuf *mbufc, const gnttab_copy_table gnttab, int n_entries); static int xnb_mbufc2pkt(const struct mbuf *mbufc, struct xnb_pkt *pkt, RING_IDX start, int space); static int xnb_rxpkt2gnttab(const struct xnb_pkt *pkt, const struct mbuf *mbufc, gnttab_copy_table gnttab, const netif_rx_back_ring_t *rxb, domid_t otherend_id); static int xnb_rxpkt2rsp(const struct xnb_pkt *pkt, const gnttab_copy_table gnttab, int n_entries, netif_rx_back_ring_t *ring); static void xnb_stop(struct xnb_softc*); static int xnb_ioctl(struct ifnet*, u_long, caddr_t); static void xnb_start_locked(struct ifnet*); static void xnb_start(struct ifnet*); static void xnb_ifinit_locked(struct xnb_softc*); static void xnb_ifinit(void*); #ifdef XNB_DEBUG static int xnb_unit_test_main(SYSCTL_HANDLER_ARGS); static int xnb_dump_rings(SYSCTL_HANDLER_ARGS); #endif #if defined(INET) || defined(INET6) static void xnb_add_mbuf_cksum(struct mbuf *mbufc); #endif /*------------------------------ Data Structures -----------------------------*/ /** * Representation of a xennet packet. Simplified version of a packet as * stored in the Xen tx ring. Applicable to both RX and TX packets */ struct xnb_pkt{ /** * Array index of the first data-bearing (eg, not extra info) entry * for this packet */ RING_IDX car; /** * Array index of the second data-bearing entry for this packet. * Invalid if the packet has only one data-bearing entry. If the * packet has more than two data-bearing entries, then the second * through the last will be sequential modulo the ring size */ RING_IDX cdr; /** * Optional extra info. Only valid if flags contains * NETTXF_extra_info. Note that extra.type will always be * XEN_NETIF_EXTRA_TYPE_GSO. Currently, no known netfront or netback * driver will ever set XEN_NETIF_EXTRA_TYPE_MCAST_* */ netif_extra_info_t extra; /** Size of entire packet in bytes. */ uint16_t size; /** The size of the first entry's data in bytes */ uint16_t car_size; /** * Either NETTXF_ or NETRXF_ flags. Note that the flag values are * not the same for TX and RX packets */ uint16_t flags; /** * The number of valid data-bearing entries (either netif_tx_request's * or netif_rx_response's) in the packet. If this is 0, it means the * entire packet is invalid. 
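	 *
	 * For example, a TX packet built from two data requests plus a GSO
	 * extra-info slot, starting at ring index 10, has car == 10,
	 * cdr == 12 (slot 11 holds the extra info) and list_len == 2,
	 * because the extra slot is not data-bearing.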
*/ uint16_t list_len; /** There was an error processing the packet */ uint8_t error; }; /** xnb_pkt method: initialize it */ static inline void xnb_pkt_initialize(struct xnb_pkt *pxnb) { bzero(pxnb, sizeof(*pxnb)); } /** xnb_pkt method: mark the packet as valid */ static inline void xnb_pkt_validate(struct xnb_pkt *pxnb) { pxnb->error = 0; }; /** xnb_pkt method: mark the packet as invalid */ static inline void xnb_pkt_invalidate(struct xnb_pkt *pxnb) { pxnb->error = 1; }; /** xnb_pkt method: Check whether the packet is valid */ static inline int xnb_pkt_is_valid(const struct xnb_pkt *pxnb) { return (! pxnb->error); } #ifdef XNB_DEBUG /** xnb_pkt method: print the packet's contents in human-readable format*/ static void __unused xnb_dump_pkt(const struct xnb_pkt *pkt) { if (pkt == NULL) { DPRINTF("Was passed a null pointer.\n"); return; } DPRINTF("pkt address= %p\n", pkt); DPRINTF("pkt->size=%d\n", pkt->size); DPRINTF("pkt->car_size=%d\n", pkt->car_size); DPRINTF("pkt->flags=0x%04x\n", pkt->flags); DPRINTF("pkt->list_len=%d\n", pkt->list_len); /* DPRINTF("pkt->extra"); TODO */ DPRINTF("pkt->car=%d\n", pkt->car); DPRINTF("pkt->cdr=%d\n", pkt->cdr); DPRINTF("pkt->error=%d\n", pkt->error); } #endif /* XNB_DEBUG */ static void xnb_dump_txreq(RING_IDX idx, const struct netif_tx_request *txreq) { if (txreq != NULL) { DPRINTF("netif_tx_request index =%u\n", idx); DPRINTF("netif_tx_request.gref =%u\n", txreq->gref); DPRINTF("netif_tx_request.offset=%hu\n", txreq->offset); DPRINTF("netif_tx_request.flags =%hu\n", txreq->flags); DPRINTF("netif_tx_request.id =%hu\n", txreq->id); DPRINTF("netif_tx_request.size =%hu\n", txreq->size); } } /** * \brief Configuration data for a shared memory request ring * used to communicate with the front-end client of this * this driver. */ struct xnb_ring_config { /** * Runtime structures for ring access. Unfortunately, TX and RX rings * use different data structures, and that cannot be changed since it * is part of the interdomain protocol. */ union{ netif_rx_back_ring_t rx_ring; netif_tx_back_ring_t tx_ring; } back_ring; /** * The device bus address returned by the hypervisor when * mapping the ring and required to unmap it when a connection * is torn down. */ uint64_t bus_addr; /** The pseudo-physical address where ring memory is mapped.*/ uint64_t gnt_addr; /** KVA address where ring memory is mapped. */ vm_offset_t va; /** * Grant table handles, one per-ring page, returned by the * hyperpervisor upon mapping of the ring and required to * unmap it when a connection is torn down. */ grant_handle_t handle; /** The number of ring pages mapped for the current connection. */ unsigned ring_pages; /** * The grant references, one per-ring page, supplied by the * front-end, allowing us to reference the ring pages in the * front-end's domain and to map these pages into our own domain. */ grant_ref_t ring_ref; }; /** * Per-instance connection state flags. */ typedef enum { /** Communication with the front-end has been established. */ XNBF_RING_CONNECTED = 0x01, /** * Front-end requests exist in the ring and are waiting for * xnb_xen_req objects to free up. */ XNBF_RESOURCE_SHORTAGE = 0x02, /** Connection teardown has started. */ XNBF_SHUTDOWN = 0x04, /** A thread is already performing shutdown processing. */ XNBF_IN_SHUTDOWN = 0x08 } xnb_flag_t; /** * Types of rings. 
Used for array indices and to identify a ring's control * data structure type */ typedef enum{ XNB_RING_TYPE_TX = 0, /* ID of TX rings, used for array indices */ XNB_RING_TYPE_RX = 1, /* ID of RX rings, used for array indices */ XNB_NUM_RING_TYPES } xnb_ring_type_t; /** * Per-instance configuration data. */ struct xnb_softc { /** NewBus device corresponding to this instance. */ device_t dev; /* Media related fields */ /** Generic network media state */ struct ifmedia sc_media; /** Media carrier info */ struct ifnet *xnb_ifp; /** Our own private carrier state */ unsigned carrier; /** Device MAC Address */ uint8_t mac[ETHER_ADDR_LEN]; /* Xen related fields */ /** * \brief The netif protocol abi in effect. * * There are situations where the back and front ends can * have a different, native abi (e.g. intel x86_64 and * 32bit x86 domains on the same machine). The back-end * always accommodates the front-end's native abi. That * value is pulled from the XenStore and recorded here. */ int abi; /** * Name of the bridge to which this VIF is connected, if any * This field is dynamically allocated by xenbus and must be free()ed * when no longer needed */ char *bridge; /** The interrupt driven even channel used to signal ring events. */ evtchn_port_t evtchn; /** Xen device handle.*/ long handle; /** Handle to the communication ring event channel. */ xen_intr_handle_t xen_intr_handle; /** * \brief Cached value of the front-end's domain id. * * This value is used at once for each mapped page in * a transaction. We cache it to avoid incuring the * cost of an ivar access every time this is needed. */ domid_t otherend_id; /** * Undocumented frontend feature. Has something to do with * scatter/gather IO */ uint8_t can_sg; /** Undocumented frontend feature */ uint8_t gso; /** Undocumented frontend feature */ uint8_t gso_prefix; /** Can checksum TCP/UDP over IPv4 */ uint8_t ip_csum; /* Implementation related fields */ /** * Preallocated grant table copy descriptor for RX operations. * Access must be protected by rx_lock */ gnttab_copy_table rx_gnttab; /** * Preallocated grant table copy descriptor for TX operations. * Access must be protected by tx_lock */ gnttab_copy_table tx_gnttab; /** * Resource representing allocated physical address space * associated with our per-instance kva region. */ struct resource *pseudo_phys_res; /** Resource id for allocated physical address space. */ int pseudo_phys_res_id; /** Ring mapping and interrupt configuration data. */ struct xnb_ring_config ring_configs[XNB_NUM_RING_TYPES]; /** * Global pool of kva used for mapping remote domain ring * and I/O transaction data. */ vm_offset_t kva; /** Pseudo-physical address corresponding to kva. */ uint64_t gnt_base_addr; /** Various configuration and state bit flags. */ xnb_flag_t flags; /** Mutex protecting per-instance data in the receive path. */ struct mtx rx_lock; /** Mutex protecting per-instance data in the softc structure. */ struct mtx sc_lock; /** Mutex protecting per-instance data in the transmit path. */ struct mtx tx_lock; /** The size of the global kva pool. 
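	 * With the current single-page rings this amounts to one TX and one
	 * RX ring page per connection; see xnb_alloc_communication_mem().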
*/ int kva_size; /** Name of the interface */ char if_name[IFNAMSIZ]; }; /*---------------------------- Debugging functions ---------------------------*/ #ifdef XNB_DEBUG static void __unused xnb_dump_gnttab_copy(const struct gnttab_copy *entry) { if (entry == NULL) { printf("NULL grant table pointer\n"); return; } if (entry->flags & GNTCOPY_dest_gref) printf("gnttab dest ref=\t%u\n", entry->dest.u.ref); else printf("gnttab dest gmfn=\t%"PRI_xen_pfn"\n", entry->dest.u.gmfn); printf("gnttab dest offset=\t%hu\n", entry->dest.offset); printf("gnttab dest domid=\t%hu\n", entry->dest.domid); if (entry->flags & GNTCOPY_source_gref) printf("gnttab source ref=\t%u\n", entry->source.u.ref); else printf("gnttab source gmfn=\t%"PRI_xen_pfn"\n", entry->source.u.gmfn); printf("gnttab source offset=\t%hu\n", entry->source.offset); printf("gnttab source domid=\t%hu\n", entry->source.domid); printf("gnttab len=\t%hu\n", entry->len); printf("gnttab flags=\t%hu\n", entry->flags); printf("gnttab status=\t%hd\n", entry->status); } static int xnb_dump_rings(SYSCTL_HANDLER_ARGS) { static char results[720]; struct xnb_softc const* xnb = (struct xnb_softc*)arg1; netif_rx_back_ring_t const* rxb = &xnb->ring_configs[XNB_RING_TYPE_RX].back_ring.rx_ring; netif_tx_back_ring_t const* txb = &xnb->ring_configs[XNB_RING_TYPE_TX].back_ring.tx_ring; /* empty the result strings */ results[0] = 0; if ( !txb || !txb->sring || !rxb || !rxb->sring ) return (SYSCTL_OUT(req, results, strnlen(results, 720))); snprintf(results, 720, "\n\t%35s %18s\n" /* TX, RX */ "\t%16s %18d %18d\n" /* req_cons */ "\t%16s %18d %18d\n" /* nr_ents */ "\t%16s %18d %18d\n" /* rsp_prod_pvt */ "\t%16s %18p %18p\n" /* sring */ "\t%16s %18d %18d\n" /* req_prod */ "\t%16s %18d %18d\n" /* req_event */ "\t%16s %18d %18d\n" /* rsp_prod */ "\t%16s %18d %18d\n", /* rsp_event */ "TX", "RX", "req_cons", txb->req_cons, rxb->req_cons, "nr_ents", txb->nr_ents, rxb->nr_ents, "rsp_prod_pvt", txb->rsp_prod_pvt, rxb->rsp_prod_pvt, "sring", txb->sring, rxb->sring, "sring->req_prod", txb->sring->req_prod, rxb->sring->req_prod, "sring->req_event", txb->sring->req_event, rxb->sring->req_event, "sring->rsp_prod", txb->sring->rsp_prod, rxb->sring->rsp_prod, "sring->rsp_event", txb->sring->rsp_event, rxb->sring->rsp_event); return (SYSCTL_OUT(req, results, strnlen(results, 720))); } static void __unused xnb_dump_mbuf(const struct mbuf *m) { int len; uint8_t *d; if (m == NULL) return; printf("xnb_dump_mbuf:\n"); if (m->m_flags & M_PKTHDR) { printf(" flowid=%10d, csum_flags=%#8x, csum_data=%#8x, " "tso_segsz=%5hd\n", m->m_pkthdr.flowid, (int)m->m_pkthdr.csum_flags, m->m_pkthdr.csum_data, m->m_pkthdr.tso_segsz); printf(" rcvif=%16p, len=%19d\n", m->m_pkthdr.rcvif, m->m_pkthdr.len); } printf(" m_next=%16p, m_nextpk=%16p, m_data=%16p\n", m->m_next, m->m_nextpkt, m->m_data); printf(" m_len=%17d, m_flags=%#15x, m_type=%18u\n", m->m_len, m->m_flags, m->m_type); len = m->m_len; d = mtod(m, uint8_t*); while (len > 0) { int i; printf(" "); for (i = 0; (i < 16) && (len > 0); i++, len--) { printf("%02hhx ", *(d++)); } printf("\n"); } } #endif /* XNB_DEBUG */ /*------------------------ Inter-Domain Communication ------------------------*/ /** * Free dynamically allocated KVA or pseudo-physical address allocations. * * \param xnb Per-instance xnb configuration structure. 
*/ static void xnb_free_communication_mem(struct xnb_softc *xnb) { if (xnb->kva != 0) { if (xnb->pseudo_phys_res != NULL) { xenmem_free(xnb->dev, xnb->pseudo_phys_res_id, xnb->pseudo_phys_res); xnb->pseudo_phys_res = NULL; } } xnb->kva = 0; xnb->gnt_base_addr = 0; } /** * Cleanup all inter-domain communication mechanisms. * * \param xnb Per-instance xnb configuration structure. */ static int xnb_disconnect(struct xnb_softc *xnb) { struct gnttab_unmap_grant_ref gnts[XNB_NUM_RING_TYPES]; int error __diagused; int i; if (xnb->xen_intr_handle != NULL) xen_intr_unbind(&xnb->xen_intr_handle); /* * We may still have another thread currently processing requests. We * must acquire the rx and tx locks to make sure those threads are done, * but we can release those locks as soon as we acquire them, because no * more interrupts will be arriving. */ mtx_lock(&xnb->tx_lock); mtx_unlock(&xnb->tx_lock); mtx_lock(&xnb->rx_lock); mtx_unlock(&xnb->rx_lock); mtx_lock(&xnb->sc_lock); /* Free malloc'd softc member variables */ if (xnb->bridge != NULL) { free(xnb->bridge, M_XENSTORE); xnb->bridge = NULL; } /* All request processing has stopped, so unmap the rings */ for (i=0; i < XNB_NUM_RING_TYPES; i++) { gnts[i].host_addr = xnb->ring_configs[i].gnt_addr; gnts[i].dev_bus_addr = xnb->ring_configs[i].bus_addr; gnts[i].handle = xnb->ring_configs[i].handle; } error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, gnts, XNB_NUM_RING_TYPES); KASSERT(error == 0, ("Grant table unmap op failed (%d)", error)); xnb_free_communication_mem(xnb); /* * Zero the ring config structs because the pointers, handles, and * grant refs contained therein are no longer valid. */ bzero(&xnb->ring_configs[XNB_RING_TYPE_TX], sizeof(struct xnb_ring_config)); bzero(&xnb->ring_configs[XNB_RING_TYPE_RX], sizeof(struct xnb_ring_config)); xnb->flags &= ~XNBF_RING_CONNECTED; mtx_unlock(&xnb->sc_lock); return (0); } /** * Map a single shared memory ring into domain local address space and * initialize its control structure * * \param xnb Per-instance xnb configuration structure * \param ring_type Array index of this ring in the xnb's array of rings * \return An errno */ static int xnb_connect_ring(struct xnb_softc *xnb, xnb_ring_type_t ring_type) { struct gnttab_map_grant_ref gnt; struct xnb_ring_config *ring = &xnb->ring_configs[ring_type]; int error; /* TX ring type = 0, RX =1 */ ring->va = xnb->kva + ring_type * PAGE_SIZE; ring->gnt_addr = xnb->gnt_base_addr + ring_type * PAGE_SIZE; gnt.host_addr = ring->gnt_addr; gnt.flags = GNTMAP_host_map; gnt.ref = ring->ring_ref; gnt.dom = xnb->otherend_id; error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &gnt, 1); if (error != 0) panic("netback: Ring page grant table op failed (%d)", error); if (gnt.status != 0) { ring->va = 0; error = EACCES; xenbus_dev_fatal(xnb->dev, error, "Ring shared page mapping failed. " "Status %d.", gnt.status); } else { ring->handle = gnt.handle; ring->bus_addr = gnt.dev_bus_addr; if (ring_type == XNB_RING_TYPE_TX) { BACK_RING_INIT(&ring->back_ring.tx_ring, (netif_tx_sring_t*)ring->va, ring->ring_pages * PAGE_SIZE); } else if (ring_type == XNB_RING_TYPE_RX) { BACK_RING_INIT(&ring->back_ring.rx_ring, (netif_rx_sring_t*)ring->va, ring->ring_pages * PAGE_SIZE); } else { xenbus_dev_fatal(xnb->dev, error, "Unknown ring type %d", ring_type); } } return error; } /** * Setup the shared memory rings and bind an interrupt to the event channel * used to notify us of ring changes. * * \param xnb Per-instance xnb configuration structure. 
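 *
 * \return On success, 0. Otherwise an errno value indicating the
 *         type of failure.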
*/ static int xnb_connect_comms(struct xnb_softc *xnb) { int error; xnb_ring_type_t i; if ((xnb->flags & XNBF_RING_CONNECTED) != 0) return (0); /* * Kva for our rings are at the tail of the region of kva allocated * by xnb_alloc_communication_mem(). */ for (i=0; i < XNB_NUM_RING_TYPES; i++) { error = xnb_connect_ring(xnb, i); if (error != 0) return error; } xnb->flags |= XNBF_RING_CONNECTED; error = xen_intr_bind_remote_port(xnb->dev, xnb->otherend_id, xnb->evtchn, /*filter*/NULL, xnb_intr, /*arg*/xnb, INTR_TYPE_NET | INTR_MPSAFE, &xnb->xen_intr_handle); if (error != 0) { (void)xnb_disconnect(xnb); xenbus_dev_fatal(xnb->dev, error, "binding event channel"); return (error); } DPRINTF("rings connected!\n"); return (0); } /** * Size KVA and pseudo-physical address allocations based on negotiated * values for the size and number of I/O requests, and the size of our * communication ring. * * \param xnb Per-instance xnb configuration structure. * * These address spaces are used to dynamically map pages in the * front-end's domain into our own. */ static int xnb_alloc_communication_mem(struct xnb_softc *xnb) { xnb_ring_type_t i; xnb->kva_size = 0; for (i=0; i < XNB_NUM_RING_TYPES; i++) { xnb->kva_size += xnb->ring_configs[i].ring_pages * PAGE_SIZE; } /* * Reserve a range of pseudo physical memory that we can map * into kva. These pages will only be backed by machine * pages ("real memory") during the lifetime of front-end requests * via grant table operations. We will map the netif tx and rx rings * into this space. */ xnb->pseudo_phys_res_id = 0; xnb->pseudo_phys_res = xenmem_alloc(xnb->dev, &xnb->pseudo_phys_res_id, xnb->kva_size); if (xnb->pseudo_phys_res == NULL) { xnb->kva = 0; return (ENOMEM); } xnb->kva = (vm_offset_t)rman_get_virtual(xnb->pseudo_phys_res); xnb->gnt_base_addr = rman_get_start(xnb->pseudo_phys_res); return (0); } /** * Collect information from the XenStore related to our device and its frontend * * \param xnb Per-instance xnb configuration structure. */ static int xnb_collect_xenstore_info(struct xnb_softc *xnb) { /** * \todo Linux collects the following info. We should collect most * of this, too: * "feature-rx-notify" */ const char *otherend_path; const char *our_path; int err; unsigned int rx_copy, bridge_len; uint8_t no_csum_offload; otherend_path = xenbus_get_otherend_path(xnb->dev); our_path = xenbus_get_node(xnb->dev); /* Collect the critical communication parameters */ err = xs_gather(XST_NIL, otherend_path, "tx-ring-ref", "%l" PRIu32, &xnb->ring_configs[XNB_RING_TYPE_TX].ring_ref, "rx-ring-ref", "%l" PRIu32, &xnb->ring_configs[XNB_RING_TYPE_RX].ring_ref, "event-channel", "%" PRIu32, &xnb->evtchn, NULL); if (err != 0) { xenbus_dev_fatal(xnb->dev, err, "Unable to retrieve ring information from " "frontend %s. Unable to connect.", otherend_path); return (err); } /* Collect the handle from xenstore */ err = xs_scanf(XST_NIL, our_path, "handle", NULL, "%li", &xnb->handle); if (err != 0) { xenbus_dev_fatal(xnb->dev, err, "Error reading handle from frontend %s. " "Unable to connect.", otherend_path); } /* * Collect the bridgename, if any. We do not need bridge_len; we just * throw it away */ err = xs_read(XST_NIL, our_path, "bridge", &bridge_len, (void**)&xnb->bridge); if (err != 0) xnb->bridge = NULL; /* * Does the frontend request that we use rx copy? If not, return an * error because this driver only supports rx copy. 
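	 * Note that the check is currently not enforced; see the commented-out
	 * EOPNOTSUPP return and the accompanying \todo below.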
*/ err = xs_scanf(XST_NIL, otherend_path, "request-rx-copy", NULL, "%" PRIu32, &rx_copy); if (err == ENOENT) { err = 0; rx_copy = 0; } if (err < 0) { xenbus_dev_fatal(xnb->dev, err, "reading %s/request-rx-copy", otherend_path); return err; } /** * \todo: figure out the exact meaning of this feature, and when * the frontend will set it to true. It should be set to true * at some point */ /* if (!rx_copy)*/ /* return EOPNOTSUPP;*/ /** \todo Collect the rx notify feature */ /* Collect the feature-sg. */ if (xs_scanf(XST_NIL, otherend_path, "feature-sg", NULL, "%hhu", &xnb->can_sg) < 0) xnb->can_sg = 0; /* Collect remaining frontend features */ if (xs_scanf(XST_NIL, otherend_path, "feature-gso-tcpv4", NULL, "%hhu", &xnb->gso) < 0) xnb->gso = 0; if (xs_scanf(XST_NIL, otherend_path, "feature-gso-tcpv4-prefix", NULL, "%hhu", &xnb->gso_prefix) < 0) xnb->gso_prefix = 0; if (xs_scanf(XST_NIL, otherend_path, "feature-no-csum-offload", NULL, "%hhu", &no_csum_offload) < 0) no_csum_offload = 0; xnb->ip_csum = (no_csum_offload == 0); return (0); } /** * Supply information about the physical device to the frontend * via XenBus. * * \param xnb Per-instance xnb configuration structure. */ static int xnb_publish_backend_info(struct xnb_softc *xnb) { struct xs_transaction xst; const char *our_path; int error; our_path = xenbus_get_node(xnb->dev); do { error = xs_transaction_start(&xst); if (error != 0) { xenbus_dev_fatal(xnb->dev, error, "Error publishing backend info " "(start transaction)"); break; } error = xs_printf(xst, our_path, "feature-sg", "%d", XNB_SG); if (error != 0) break; error = xs_printf(xst, our_path, "feature-gso-tcpv4", "%d", XNB_GSO_TCPV4); if (error != 0) break; error = xs_printf(xst, our_path, "feature-rx-copy", "%d", XNB_RX_COPY); if (error != 0) break; error = xs_printf(xst, our_path, "feature-rx-flip", "%d", XNB_RX_FLIP); if (error != 0) break; error = xs_transaction_end(xst, 0); if (error != 0 && error != EAGAIN) { xenbus_dev_fatal(xnb->dev, error, "ending transaction"); break; } } while (error == EAGAIN); return (error); } /** * Connect to our netfront peer now that it has completed publishing * its configuration into the XenStore. * * \param xnb Per-instance xnb configuration structure. */ static void xnb_connect(struct xnb_softc *xnb) { int error; if (xenbus_get_state(xnb->dev) == XenbusStateConnected) return; if (xnb_collect_xenstore_info(xnb) != 0) return; xnb->flags &= ~XNBF_SHUTDOWN; /* Read front end configuration. */ /* Allocate resources whose size depends on front-end configuration. */ error = xnb_alloc_communication_mem(xnb); if (error != 0) { xenbus_dev_fatal(xnb->dev, error, "Unable to allocate communication memory"); return; } /* * Connect communication channel. */ error = xnb_connect_comms(xnb); if (error != 0) { /* Specific errors are reported by xnb_connect_comms(). */ return; } xnb->carrier = 1; /* Ready for I/O. */ xenbus_set_state(xnb->dev, XenbusStateConnected); } /*-------------------------- Device Teardown Support -------------------------*/ /** * Perform device shutdown functions. * * \param xnb Per-instance xnb configuration structure. * * Mark this instance as shutting down, wait for any active requests * to drain, disconnect from the front-end, and notify any waiters (e.g. * a thread invoking our detach method) that detach can now proceed. */ static int xnb_shutdown(struct xnb_softc *xnb) { /* * Due to the need to drop our mutex during some * xenbus operations, it is possible for two threads * to attempt to close out shutdown processing at * the same time. 
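	 * (For example, xnb_detach() can race against a frontend transition
	 * to XenbusStateClosing; both paths end up in xnb_shutdown().)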
Tell the caller that hits this * race to try back later. */ if ((xnb->flags & XNBF_IN_SHUTDOWN) != 0) return (EAGAIN); xnb->flags |= XNBF_SHUTDOWN; xnb->flags |= XNBF_IN_SHUTDOWN; mtx_unlock(&xnb->sc_lock); /* Free the network interface */ xnb->carrier = 0; if (xnb->xnb_ifp != NULL) { ether_ifdetach(xnb->xnb_ifp); if_free(xnb->xnb_ifp); xnb->xnb_ifp = NULL; } xnb_disconnect(xnb); if (xenbus_get_state(xnb->dev) < XenbusStateClosing) xenbus_set_state(xnb->dev, XenbusStateClosing); mtx_lock(&xnb->sc_lock); xnb->flags &= ~XNBF_IN_SHUTDOWN; /* Indicate to xnb_detach() that is it safe to proceed. */ wakeup(xnb); return (0); } /** * Report an attach time error to the console and Xen, and cleanup * this instance by forcing immediate detach processing. * * \param xnb Per-instance xnb configuration structure. * \param err Errno describing the error. * \param fmt Printf style format and arguments */ static void xnb_attach_failed(struct xnb_softc *xnb, int err, const char *fmt, ...) { va_list ap; va_list ap_hotplug; va_start(ap, fmt); va_copy(ap_hotplug, ap); xs_vprintf(XST_NIL, xenbus_get_node(xnb->dev), "hotplug-error", fmt, ap_hotplug); va_end(ap_hotplug); (void)xs_printf(XST_NIL, xenbus_get_node(xnb->dev), "hotplug-status", "error"); xenbus_dev_vfatal(xnb->dev, err, fmt, ap); va_end(ap); (void)xs_printf(XST_NIL, xenbus_get_node(xnb->dev), "online", "0"); xnb_detach(xnb->dev); } /*---------------------------- NewBus Entrypoints ----------------------------*/ /** * Inspect a XenBus device and claim it if is of the appropriate type. * * \param dev NewBus device object representing a candidate XenBus device. * * \return 0 for success, errno codes for failure. */ static int xnb_probe(device_t dev) { if (!strcmp(xenbus_get_type(dev), "vif")) { DPRINTF("Claiming device %d, %s\n", device_get_unit(dev), devclass_get_name(device_get_devclass(dev))); device_set_desc(dev, "Backend Virtual Network Device"); device_quiet(dev); return (0); } return (ENXIO); } /** * Setup sysctl variables to control various Network Back parameters. * * \param xnb Xen Net Back softc. * */ static void xnb_setup_sysctl(struct xnb_softc *xnb) { struct sysctl_ctx_list *sysctl_ctx = NULL; struct sysctl_oid *sysctl_tree = NULL; sysctl_ctx = device_get_sysctl_ctx(xnb->dev); if (sysctl_ctx == NULL) return; sysctl_tree = device_get_sysctl_tree(xnb->dev); if (sysctl_tree == NULL) return; #ifdef XNB_DEBUG SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "unit_test_results", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, xnb, 0, xnb_unit_test_main, "A", "Results of builtin unit tests"); SYSCTL_ADD_PROC(sysctl_ctx, SYSCTL_CHILDREN(sysctl_tree), OID_AUTO, "dump_rings", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, xnb, 0, xnb_dump_rings, "A", "Xennet Back Rings"); #endif /* XNB_DEBUG */ } /** * Create a network device. 
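 * Initializes the softc locks, ifmedia and ifnet for one backend
 * instance; the interface name encodes the frontend domain id and the
 * XenStore handle.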
* @param handle device handle */ int create_netdev(device_t dev) { struct ifnet *ifp; struct xnb_softc *xnb; int err = 0; uint32_t handle; xnb = device_get_softc(dev); mtx_init(&xnb->sc_lock, "xnb_softc", "xen netback softc lock", MTX_DEF); mtx_init(&xnb->tx_lock, "xnb_tx", "xen netback tx lock", MTX_DEF); mtx_init(&xnb->rx_lock, "xnb_rx", "xen netback rx lock", MTX_DEF); xnb->dev = dev; ifmedia_init(&xnb->sc_media, 0, xnb_ifmedia_upd, xnb_ifmedia_sts); ifmedia_add(&xnb->sc_media, IFM_ETHER|IFM_MANUAL, 0, NULL); ifmedia_set(&xnb->sc_media, IFM_ETHER|IFM_MANUAL); /* * Set the MAC address to a dummy value (00:00:00:00:00), * if the MAC address of the host-facing interface is set * to the same as the guest-facing one (the value found in * xenstore), the bridge would stop delivering packets to * us because it would see that the destination address of * the packet is the same as the interface, and so the bridge * would expect the packet has already been delivered locally * (and just drop it). */ bzero(&xnb->mac[0], sizeof(xnb->mac)); /* The interface will be named using the following nomenclature: * * xnb. * * Where handle is the oder of the interface referred to the guest. */ err = xs_scanf(XST_NIL, xenbus_get_node(xnb->dev), "handle", NULL, "%" PRIu32, &handle); if (err != 0) return (err); snprintf(xnb->if_name, IFNAMSIZ, "xnb%" PRIu16 ".%" PRIu32, xenbus_get_otherend_id(dev), handle); if (err == 0) { /* Set up ifnet structure */ ifp = xnb->xnb_ifp = if_alloc(IFT_ETHER); ifp->if_softc = xnb; if_initname(ifp, xnb->if_name, IF_DUNIT_NONE); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = xnb_ioctl; ifp->if_start = xnb_start; ifp->if_init = xnb_ifinit; ifp->if_mtu = ETHERMTU; ifp->if_snd.ifq_maxlen = NET_RX_RING_SIZE - 1; ifp->if_hwassist = XNB_CSUM_FEATURES; ifp->if_capabilities = IFCAP_HWCSUM; ifp->if_capenable = IFCAP_HWCSUM; ether_ifattach(ifp, xnb->mac); xnb->carrier = 0; } return err; } /** * Attach to a XenBus device that has been claimed by our probe routine. * * \param dev NewBus device object representing this Xen Net Back instance. * * \return 0 for success, errno codes for failure. */ static int xnb_attach(device_t dev) { struct xnb_softc *xnb; int error; xnb_ring_type_t i; error = create_netdev(dev); if (error != 0) { xenbus_dev_fatal(dev, error, "creating netdev"); return (error); } DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); /* * Basic initialization. * After this block it is safe to call xnb_detach() * to clean up any allocated data for this instance. */ xnb = device_get_softc(dev); xnb->otherend_id = xenbus_get_otherend_id(dev); for (i=0; i < XNB_NUM_RING_TYPES; i++) { xnb->ring_configs[i].ring_pages = 1; } /* * Setup sysctl variables. */ xnb_setup_sysctl(xnb); /* Update hot-plug status to satisfy xend. */ error = xs_printf(XST_NIL, xenbus_get_node(xnb->dev), "hotplug-status", "connected"); if (error != 0) { xnb_attach_failed(xnb, error, "writing %s/hotplug-status", xenbus_get_node(xnb->dev)); return (error); } if ((error = xnb_publish_backend_info(xnb)) != 0) { /* * If we can't publish our data, we cannot participate * in this connection, and waiting for a front-end state * change will not help the situation. */ xnb_attach_failed(xnb, error, "Publishing backend status for %s", xenbus_get_node(xnb->dev)); return error; } /* Tell the front end that we are ready to connect. */ xenbus_set_state(dev, XenbusStateInitWait); return (0); } /** * Detach from a net back device instance. 
* * \param dev NewBus device object representing this Xen Net Back instance. * * \return 0 for success, errno codes for failure. * * \note A net back device may be detached at any time in its life-cycle, * including part way through the attach process. For this reason, * initialization order and the initialization state checks in this * routine must be carefully coupled so that attach time failures * are gracefully handled. */ static int xnb_detach(device_t dev) { struct xnb_softc *xnb; DPRINTF("\n"); xnb = device_get_softc(dev); mtx_lock(&xnb->sc_lock); while (xnb_shutdown(xnb) == EAGAIN) { msleep(xnb, &xnb->sc_lock, /*wakeup prio unchanged*/0, "xnb_shutdown", 0); } mtx_unlock(&xnb->sc_lock); DPRINTF("\n"); mtx_destroy(&xnb->tx_lock); mtx_destroy(&xnb->rx_lock); mtx_destroy(&xnb->sc_lock); return (0); } /** * Prepare this net back device for suspension of this VM. * * \param dev NewBus device object representing this Xen net Back instance. * * \return 0 for success, errno codes for failure. */ static int xnb_suspend(device_t dev) { return (0); } /** * Perform any processing required to recover from a suspended state. * * \param dev NewBus device object representing this Xen Net Back instance. * * \return 0 for success, errno codes for failure. */ static int xnb_resume(device_t dev) { return (0); } /** * Handle state changes expressed via the XenStore by our front-end peer. * * \param dev NewBus device object representing this Xen * Net Back instance. * \param frontend_state The new state of the front-end. * * \return 0 for success, errno codes for failure. */ static void xnb_frontend_changed(device_t dev, XenbusState frontend_state) { struct xnb_softc *xnb; xnb = device_get_softc(dev); DPRINTF("frontend_state=%s, xnb_state=%s\n", xenbus_strstate(frontend_state), xenbus_strstate(xenbus_get_state(xnb->dev))); switch (frontend_state) { case XenbusStateInitialising: case XenbusStateInitialised: break; case XenbusStateConnected: xnb_connect(xnb); break; case XenbusStateClosing: case XenbusStateClosed: mtx_lock(&xnb->sc_lock); xnb_shutdown(xnb); mtx_unlock(&xnb->sc_lock); if (frontend_state == XenbusStateClosed) xenbus_set_state(xnb->dev, XenbusStateClosed); break; default: xenbus_dev_fatal(xnb->dev, EINVAL, "saw state %d at frontend", frontend_state); break; } } /*---------------------------- Request Processing ----------------------------*/ /** * Interrupt handler bound to the shared ring's event channel. * Entry point for the xennet transmit path in netback * Transfers packets from the Xen ring to the host's generic networking stack * * \param arg Callback argument registerd during event channel * binding - the xnb_softc for this instance. 
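 *
 * The handler keeps draining the TX ring until no new requests appear,
 * re-arming sring->req_event before the final re-check so that a
 * notification racing with the exit is not lost.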
*/ static void xnb_intr(void *arg) { struct xnb_softc *xnb; struct ifnet *ifp; netif_tx_back_ring_t *txb; RING_IDX req_prod_local; xnb = (struct xnb_softc *)arg; ifp = xnb->xnb_ifp; txb = &xnb->ring_configs[XNB_RING_TYPE_TX].back_ring.tx_ring; mtx_lock(&xnb->tx_lock); do { int notify; req_prod_local = txb->sring->req_prod; xen_rmb(); for (;;) { struct mbuf *mbufc; int err; err = xnb_recv(txb, xnb->otherend_id, &mbufc, ifp, xnb->tx_gnttab); if (err || (mbufc == NULL)) break; /* Send the packet to the generic network stack */ (*xnb->xnb_ifp->if_input)(xnb->xnb_ifp, mbufc); } RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(txb, notify); if (notify != 0) xen_intr_signal(xnb->xen_intr_handle); txb->sring->req_event = txb->req_cons + 1; xen_mb(); } while (txb->sring->req_prod != req_prod_local) ; mtx_unlock(&xnb->tx_lock); xnb_start(ifp); } /** * Build a struct xnb_pkt based on netif_tx_request's from a netif tx ring. * Will read exactly 0 or 1 packets from the ring; never a partial packet. * \param[out] pkt The returned packet. If there is an error building * the packet, pkt.list_len will be set to 0. * \param[in] tx_ring Pointer to the Ring that is the input to this function * \param[in] start The ring index of the first potential request * \return The number of requests consumed to build this packet */ static int xnb_ring2pkt(struct xnb_pkt *pkt, const netif_tx_back_ring_t *tx_ring, RING_IDX start) { /* * Outline: * 1) Initialize pkt * 2) Read the first request of the packet * 3) Read the extras * 4) Set cdr * 5) Loop on the remainder of the packet * 6) Finalize pkt (stuff like car_size and list_len) */ int idx = start; int discard = 0; /* whether to discard the packet */ int more_data = 0; /* there are more request past the last one */ uint16_t cdr_size = 0; /* accumulated size of requests 2 through n */ xnb_pkt_initialize(pkt); /* Read the first request */ if (RING_HAS_UNCONSUMED_REQUESTS_2(tx_ring, idx)) { netif_tx_request_t *tx = RING_GET_REQUEST(tx_ring, idx); pkt->size = tx->size; pkt->flags = tx->flags & ~NETTXF_more_data; more_data = tx->flags & NETTXF_more_data; pkt->list_len++; pkt->car = idx; idx++; } /* Read the extra info */ if ((pkt->flags & NETTXF_extra_info) && RING_HAS_UNCONSUMED_REQUESTS_2(tx_ring, idx)) { netif_extra_info_t *ext = (netif_extra_info_t*) RING_GET_REQUEST(tx_ring, idx); pkt->extra.type = ext->type; switch (pkt->extra.type) { case XEN_NETIF_EXTRA_TYPE_GSO: pkt->extra.u.gso = ext->u.gso; break; default: /* * The reference Linux netfront driver will * never set any other extra.type. So we don't * know what to do with it. Let's print an * error, then consume and discard the packet */ printf("xnb(%s:%d): Unknown extra info type %d." " Discarding packet\n", __func__, __LINE__, pkt->extra.type); xnb_dump_txreq(start, RING_GET_REQUEST(tx_ring, start)); xnb_dump_txreq(idx, RING_GET_REQUEST(tx_ring, idx)); discard = 1; break; } pkt->extra.flags = ext->flags; if (ext->flags & XEN_NETIF_EXTRA_FLAG_MORE) { /* * The reference linux netfront driver never sets this * flag (nor does any other known netfront). So we * will discard the packet. */ printf("xnb(%s:%d): Request sets " "XEN_NETIF_EXTRA_FLAG_MORE, but we can't handle " "that\n", __func__, __LINE__); xnb_dump_txreq(start, RING_GET_REQUEST(tx_ring, start)); xnb_dump_txreq(idx, RING_GET_REQUEST(tx_ring, idx)); discard = 1; } idx++; } /* Set cdr. 
If there is not more data, cdr is invalid */ pkt->cdr = idx; /* Loop on remainder of packet */ while (more_data && RING_HAS_UNCONSUMED_REQUESTS_2(tx_ring, idx)) { netif_tx_request_t *tx = RING_GET_REQUEST(tx_ring, idx); pkt->list_len++; cdr_size += tx->size; if (tx->flags & ~NETTXF_more_data) { /* There should be no other flags set at this point */ printf("xnb(%s:%d): Request sets unknown flags %d " "after the 1st request in the packet.\n", __func__, __LINE__, tx->flags); xnb_dump_txreq(start, RING_GET_REQUEST(tx_ring, start)); xnb_dump_txreq(idx, RING_GET_REQUEST(tx_ring, idx)); } more_data = tx->flags & NETTXF_more_data; idx++; } /* Finalize packet */ if (more_data != 0) { /* The ring ran out of requests before finishing the packet */ xnb_pkt_invalidate(pkt); idx = start; /* tell caller that we consumed no requests */ } else { /* Calculate car_size */ pkt->car_size = pkt->size - cdr_size; } if (discard != 0) { xnb_pkt_invalidate(pkt); } return idx - start; } /** * Respond to all the requests that constituted pkt. Builds the responses and * writes them to the ring, but doesn't push them to the shared ring. * \param[in] pkt the packet that needs a response * \param[in] error true if there was an error handling the packet, such * as in the hypervisor copy op or mbuf allocation * \param[out] ring Responses go here */ static void xnb_txpkt2rsp(const struct xnb_pkt *pkt, netif_tx_back_ring_t *ring, int error) { /* * Outline: * 1) Respond to the first request * 2) Respond to the extra info reques * Loop through every remaining request in the packet, generating * responses that copy those requests' ids and sets the status * appropriately. */ netif_tx_request_t *tx; netif_tx_response_t *rsp; int i; uint16_t status; status = (xnb_pkt_is_valid(pkt) == 0) || error ? NETIF_RSP_ERROR : NETIF_RSP_OKAY; KASSERT((pkt->list_len == 0) || (ring->rsp_prod_pvt == pkt->car), ("Cannot respond to ring requests out of order")); if (pkt->list_len >= 1) { uint16_t id; tx = RING_GET_REQUEST(ring, ring->rsp_prod_pvt); id = tx->id; rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt); rsp->id = id; rsp->status = status; ring->rsp_prod_pvt++; if (pkt->flags & NETRXF_extra_info) { rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt); rsp->status = NETIF_RSP_NULL; ring->rsp_prod_pvt++; } } for (i=0; i < pkt->list_len - 1; i++) { uint16_t id; tx = RING_GET_REQUEST(ring, ring->rsp_prod_pvt); id = tx->id; rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt); rsp->id = id; rsp->status = status; ring->rsp_prod_pvt++; } } /** * Create an mbuf chain to represent a packet. Initializes all of the headers * in the mbuf chain, but does not copy the data. The returned chain must be * free()'d when no longer needed * \param[in] pkt A packet to model the mbuf chain after * \return A newly allocated mbuf chain, possibly with clusters attached. * NULL on failure */ static struct mbuf* xnb_pkt2mbufc(const struct xnb_pkt *pkt, struct ifnet *ifp) { /** * \todo consider using a memory pool for mbufs instead of * reallocating them for every packet */ /** \todo handle extra data */ struct mbuf *m; m = m_getm(NULL, pkt->size, M_NOWAIT, MT_DATA); if (m != NULL) { m->m_pkthdr.rcvif = ifp; if (pkt->flags & NETTXF_data_validated) { /* * We lie to the host OS and always tell it that the * checksums are ok, because the packet is unlikely to * get corrupted going across domains. 
*/ m->m_pkthdr.csum_flags = ( CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR ); m->m_pkthdr.csum_data = 0xffff; } } return m; } /** * Build a gnttab_copy table that can be used to copy data from a pkt * to an mbufc. Does not actually perform the copy. Always uses gref's on * the packet side. * \param[in] pkt pkt's associated requests form the src for * the copy operation * \param[in] mbufc mbufc's storage forms the dest for the copy operation * \param[out] gnttab Storage for the returned grant table * \param[in] txb Pointer to the backend ring structure * \param[in] otherend_id The domain ID of the other end of the copy * \return The number of gnttab entries filled */ static int xnb_txpkt2gnttab(const struct xnb_pkt *pkt, struct mbuf *mbufc, gnttab_copy_table gnttab, const netif_tx_back_ring_t *txb, domid_t otherend_id) { struct mbuf *mbuf = mbufc;/* current mbuf within the chain */ int gnt_idx = 0; /* index into grant table */ RING_IDX r_idx = pkt->car; /* index into tx ring buffer */ int r_ofs = 0; /* offset of next data within tx request's data area */ int m_ofs = 0; /* offset of next data within mbuf's data area */ /* size in bytes that still needs to be represented in the table */ uint16_t size_remaining = pkt->size; while (size_remaining > 0) { const netif_tx_request_t *txq = RING_GET_REQUEST(txb, r_idx); const size_t mbuf_space = M_TRAILINGSPACE(mbuf) - m_ofs; const size_t req_size = r_idx == pkt->car ? pkt->car_size : txq->size; const size_t pkt_space = req_size - r_ofs; /* * space is the largest amount of data that can be copied in the * grant table's next entry */ const size_t space = MIN(pkt_space, mbuf_space); /* TODO: handle this error condition without panicking */ KASSERT(gnt_idx < GNTTAB_LEN, ("Grant table is too short")); gnttab[gnt_idx].source.u.ref = txq->gref; gnttab[gnt_idx].source.domid = otherend_id; gnttab[gnt_idx].source.offset = txq->offset + r_ofs; gnttab[gnt_idx].dest.u.gmfn = virt_to_mfn( mtod(mbuf, vm_offset_t) + m_ofs); gnttab[gnt_idx].dest.offset = virt_to_offset( mtod(mbuf, vm_offset_t) + m_ofs); gnttab[gnt_idx].dest.domid = DOMID_SELF; gnttab[gnt_idx].len = space; gnttab[gnt_idx].flags = GNTCOPY_source_gref; gnt_idx++; r_ofs += space; m_ofs += space; size_remaining -= space; if (req_size - r_ofs <= 0) { /* Must move to the next tx request */ r_ofs = 0; r_idx = (r_idx == pkt->car) ? pkt->cdr : r_idx + 1; } if (M_TRAILINGSPACE(mbuf) - m_ofs <= 0) { /* Must move to the next mbuf */ m_ofs = 0; mbuf = mbuf->m_next; } } return gnt_idx; } /** * Check the status of the grant copy operations, and update mbufs various * non-data fields to reflect the data present. * \param[in,out] mbufc mbuf chain to update. The chain must be valid and of * the correct length, and data should already be present * \param[in] gnttab A grant table for a just completed copy op * \param[in] n_entries The number of valid entries in the grant table */ static void xnb_update_mbufc(struct mbuf *mbufc, const gnttab_copy_table gnttab, int n_entries) { struct mbuf *mbuf = mbufc; int i; size_t total_size = 0; for (i = 0; i < n_entries; i++) { KASSERT(gnttab[i].status == GNTST_okay, ("Some gnttab_copy entry had error status %hd\n", gnttab[i].status)); mbuf->m_len += gnttab[i].len; total_size += gnttab[i].len; if (M_TRAILINGSPACE(mbuf) <= 0) { mbuf = mbuf->m_next; } } mbufc->m_pkthdr.len = total_size; #if defined(INET) || defined(INET6) xnb_add_mbuf_cksum(mbufc); #endif } /** * Dequeue at most one packet from the shared ring * \param[in,out] txb Netif tx ring. 
A packet will be removed from it, and * its private indices will be updated. But the indices * will not be pushed to the shared ring. * \param[in] ifnet Interface to which the packet will be sent * \param[in] otherend Domain ID of the other end of the ring * \param[out] mbufc The assembled mbuf chain, ready to send to the generic * networking stack * \param[in,out] gnttab Pointer to enough memory for a grant table. We make * this a function parameter so that we will take less * stack space. * \return An error code */ static int xnb_recv(netif_tx_back_ring_t *txb, domid_t otherend, struct mbuf **mbufc, struct ifnet *ifnet, gnttab_copy_table gnttab) { struct xnb_pkt pkt; /* number of tx requests consumed to build the last packet */ int num_consumed; int nr_ents; *mbufc = NULL; num_consumed = xnb_ring2pkt(&pkt, txb, txb->req_cons); if (num_consumed == 0) return 0; /* Nothing to receive */ /* update statistics independent of errors */ if_inc_counter(ifnet, IFCOUNTER_IPACKETS, 1); /* * if we got here, then 1 or more requests was consumed, but the packet * is not necessarily valid. */ if (xnb_pkt_is_valid(&pkt) == 0) { /* got a garbage packet, respond and drop it */ xnb_txpkt2rsp(&pkt, txb, 1); txb->req_cons += num_consumed; DPRINTF("xnb_intr: garbage packet, num_consumed=%d\n", num_consumed); if_inc_counter(ifnet, IFCOUNTER_IERRORS, 1); return EINVAL; } *mbufc = xnb_pkt2mbufc(&pkt, ifnet); if (*mbufc == NULL) { /* * Couldn't allocate mbufs. Respond and drop the packet. Do * not consume the requests */ xnb_txpkt2rsp(&pkt, txb, 1); DPRINTF("xnb_intr: Couldn't allocate mbufs, num_consumed=%d\n", num_consumed); if_inc_counter(ifnet, IFCOUNTER_IQDROPS, 1); return ENOMEM; } nr_ents = xnb_txpkt2gnttab(&pkt, *mbufc, gnttab, txb, otherend); if (nr_ents > 0) { int __unused hv_ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, gnttab, nr_ents); KASSERT(hv_ret == 0, ("HYPERVISOR_grant_table_op returned %d\n", hv_ret)); xnb_update_mbufc(*mbufc, gnttab, nr_ents); } xnb_txpkt2rsp(&pkt, txb, 0); txb->req_cons += num_consumed; return 0; } /** * Create an xnb_pkt based on the contents of an mbuf chain. * \param[in] mbufc mbuf chain to transform into a packet * \param[out] pkt Storage for the newly generated xnb_pkt * \param[in] start The ring index of the first available slot in the rx * ring * \param[in] space The number of free slots in the rx ring * \retval 0 Success * \retval EINVAL mbufc was corrupt or not convertible into a pkt * \retval EAGAIN There was not enough space in the ring to queue the * packet */ static int xnb_mbufc2pkt(const struct mbuf *mbufc, struct xnb_pkt *pkt, RING_IDX start, int space) { int retval = 0; if ((mbufc == NULL) || ( (mbufc->m_flags & M_PKTHDR) == 0) || (mbufc->m_pkthdr.len == 0)) { xnb_pkt_invalidate(pkt); retval = EINVAL; } else { int slots_required; xnb_pkt_validate(pkt); pkt->flags = 0; pkt->size = mbufc->m_pkthdr.len; pkt->car = start; pkt->car_size = mbufc->m_len; if (mbufc->m_pkthdr.csum_flags & CSUM_TSO) { pkt->flags |= NETRXF_extra_info; pkt->extra.u.gso.size = mbufc->m_pkthdr.tso_segsz; pkt->extra.u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; pkt->extra.u.gso.pad = 0; pkt->extra.u.gso.features = 0; pkt->extra.type = XEN_NETIF_EXTRA_TYPE_GSO; pkt->extra.flags = 0; pkt->cdr = start + 2; } else { pkt->cdr = start + 1; } if (mbufc->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_DELAY_DATA)) { pkt->flags |= (NETRXF_csum_blank | NETRXF_data_validated); } /* * Each ring response can have up to PAGE_SIZE of data. 
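		 * (For example, a 6000-byte packet with 4KiB pages becomes
		 * howmany(6000, 4096) == 2 responses.)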
* Assume that we can defragment the mbuf chain efficiently * into responses so that each response but the last uses all * PAGE_SIZE bytes. */ pkt->list_len = howmany(pkt->size, PAGE_SIZE); if (pkt->list_len > 1) { pkt->flags |= NETRXF_more_data; } slots_required = pkt->list_len + (pkt->flags & NETRXF_extra_info ? 1 : 0); if (slots_required > space) { xnb_pkt_invalidate(pkt); retval = EAGAIN; } } return retval; } /** * Build a gnttab_copy table that can be used to copy data from an mbuf chain * to the frontend's shared buffers. Does not actually perform the copy. * Always uses gref's on the other end's side. * \param[in] pkt pkt's associated responses form the dest for the copy * operatoin * \param[in] mbufc The source for the copy operation * \param[out] gnttab Storage for the returned grant table * \param[in] rxb Pointer to the backend ring structure * \param[in] otherend_id The domain ID of the other end of the copy * \return The number of gnttab entries filled */ static int xnb_rxpkt2gnttab(const struct xnb_pkt *pkt, const struct mbuf *mbufc, gnttab_copy_table gnttab, const netif_rx_back_ring_t *rxb, domid_t otherend_id) { const struct mbuf *mbuf = mbufc;/* current mbuf within the chain */ int gnt_idx = 0; /* index into grant table */ RING_IDX r_idx = pkt->car; /* index into rx ring buffer */ int r_ofs = 0; /* offset of next data within rx request's data area */ int m_ofs = 0; /* offset of next data within mbuf's data area */ /* size in bytes that still needs to be represented in the table */ uint16_t size_remaining; size_remaining = (xnb_pkt_is_valid(pkt) != 0) ? pkt->size : 0; while (size_remaining > 0) { const netif_rx_request_t *rxq = RING_GET_REQUEST(rxb, r_idx); const size_t mbuf_space = mbuf->m_len - m_ofs; /* Xen shared pages have an implied size of PAGE_SIZE */ const size_t req_size = PAGE_SIZE; const size_t pkt_space = req_size - r_ofs; /* * space is the largest amount of data that can be copied in the * grant table's next entry */ const size_t space = MIN(pkt_space, mbuf_space); /* TODO: handle this error condition without panicing */ KASSERT(gnt_idx < GNTTAB_LEN, ("Grant table is too short")); gnttab[gnt_idx].dest.u.ref = rxq->gref; gnttab[gnt_idx].dest.domid = otherend_id; gnttab[gnt_idx].dest.offset = r_ofs; gnttab[gnt_idx].source.u.gmfn = virt_to_mfn( mtod(mbuf, vm_offset_t) + m_ofs); gnttab[gnt_idx].source.offset = virt_to_offset( mtod(mbuf, vm_offset_t) + m_ofs); gnttab[gnt_idx].source.domid = DOMID_SELF; gnttab[gnt_idx].len = space; gnttab[gnt_idx].flags = GNTCOPY_dest_gref; gnt_idx++; r_ofs += space; m_ofs += space; size_remaining -= space; if (req_size - r_ofs <= 0) { /* Must move to the next rx request */ r_ofs = 0; r_idx = (r_idx == pkt->car) ? pkt->cdr : r_idx + 1; } if (mbuf->m_len - m_ofs <= 0) { /* Must move to the next mbuf */ m_ofs = 0; mbuf = mbuf->m_next; } } return gnt_idx; } /** * Generates responses for all the requests that constituted pkt. Builds * responses and writes them to the ring, but doesn't push the shared ring * indices. * \param[in] pkt the packet that needs a response * \param[in] gnttab The grant copy table corresponding to this packet. * Used to determine how many rsp->netif_rx_response_t's to * generate. 
* \param[in] n_entries Number of relevant entries in the grant table * \param[out] ring Responses go here * \return The number of RX requests that were consumed to generate * the responses */ static int xnb_rxpkt2rsp(const struct xnb_pkt *pkt, const gnttab_copy_table gnttab, int n_entries, netif_rx_back_ring_t *ring) { /* * This code makes the following assumptions: * * All entries in gnttab set GNTCOPY_dest_gref * * The entries in gnttab are grouped by their grefs: any two * entries with the same gref must be adjacent */ int error = 0; int gnt_idx, i; int n_responses = 0; grant_ref_t last_gref = GRANT_REF_INVALID; RING_IDX r_idx; KASSERT(gnttab != NULL, ("Received a null granttable copy")); /* * In the event of an error, we only need to send one response to the * netfront. In that case, we musn't write any data to the responses * after the one we send. So we must loop all the way through gnttab * looking for errors before we generate any responses * * Since we're looping through the grant table anyway, we'll count the * number of different gref's in it, which will tell us how many * responses to generate */ for (gnt_idx = 0; gnt_idx < n_entries; gnt_idx++) { int16_t status = gnttab[gnt_idx].status; if (status != GNTST_okay) { DPRINTF( "Got error %d for hypervisor gnttab_copy status\n", status); error = 1; break; } if (gnttab[gnt_idx].dest.u.ref != last_gref) { n_responses++; last_gref = gnttab[gnt_idx].dest.u.ref; } } if (error != 0) { uint16_t id; netif_rx_response_t *rsp; id = RING_GET_REQUEST(ring, ring->rsp_prod_pvt)->id; rsp = RING_GET_RESPONSE(ring, ring->rsp_prod_pvt); rsp->id = id; rsp->status = NETIF_RSP_ERROR; n_responses = 1; } else { gnt_idx = 0; const int has_extra = pkt->flags & NETRXF_extra_info; if (has_extra != 0) n_responses++; for (i = 0; i < n_responses; i++) { netif_rx_request_t rxq; netif_rx_response_t *rsp; r_idx = ring->rsp_prod_pvt + i; /* * We copy the structure of rxq instead of making a * pointer because it shares the same memory as rsp. */ rxq = *(RING_GET_REQUEST(ring, r_idx)); rsp = RING_GET_RESPONSE(ring, r_idx); if (has_extra && (i == 1)) { netif_extra_info_t *ext = (netif_extra_info_t*)rsp; ext->type = XEN_NETIF_EXTRA_TYPE_GSO; ext->flags = 0; ext->u.gso.size = pkt->extra.u.gso.size; ext->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; ext->u.gso.pad = 0; ext->u.gso.features = 0; } else { rsp->id = rxq.id; rsp->status = GNTST_okay; rsp->offset = 0; rsp->flags = 0; if (i < pkt->list_len - 1) rsp->flags |= NETRXF_more_data; if ((i == 0) && has_extra) rsp->flags |= NETRXF_extra_info; if ((i == 0) && (pkt->flags & NETRXF_data_validated)) { rsp->flags |= NETRXF_data_validated; rsp->flags |= NETRXF_csum_blank; } rsp->status = 0; for (; gnttab[gnt_idx].dest.u.ref == rxq.gref; gnt_idx++) { rsp->status += gnttab[gnt_idx].len; } } } } ring->req_cons += n_responses; ring->rsp_prod_pvt += n_responses; return n_responses; } #if defined(INET) || defined(INET6) /** * Add IP, TCP, and/or UDP checksums to every mbuf in a chain. The first mbuf * in the chain must start with a struct ether_header. * * XXX This function will perform incorrectly on UDP packets that are split up * into multiple ethernet frames. 
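 * In outline, the software computation below mirrors checksum offload:
 * the IPv4 header checksum is recomputed with in_cksum_hdr(), while the
 * TCP or UDP checksum is seeded with the pseudo-header sum from
 * in_pseudo() and then completed by in_cksum_skip() over the payload.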
*/ static void xnb_add_mbuf_cksum(struct mbuf *mbufc) { struct ether_header *eh; struct ip *iph; uint16_t ether_type; eh = mtod(mbufc, struct ether_header*); ether_type = ntohs(eh->ether_type); if (ether_type != ETHERTYPE_IP) { /* Nothing to calculate */ return; } iph = (struct ip*)(eh + 1); if (mbufc->m_pkthdr.csum_flags & CSUM_IP_VALID) { iph->ip_sum = 0; iph->ip_sum = in_cksum_hdr(iph); } switch (iph->ip_p) { case IPPROTO_TCP: if (mbufc->m_pkthdr.csum_flags & CSUM_IP_VALID) { size_t tcplen = ntohs(iph->ip_len) - sizeof(struct ip); struct tcphdr *th = (struct tcphdr*)(iph + 1); th->th_sum = in_pseudo(iph->ip_src.s_addr, iph->ip_dst.s_addr, htons(IPPROTO_TCP + tcplen)); th->th_sum = in_cksum_skip(mbufc, sizeof(struct ether_header) + ntohs(iph->ip_len), sizeof(struct ether_header) + (iph->ip_hl << 2)); } break; case IPPROTO_UDP: if (mbufc->m_pkthdr.csum_flags & CSUM_IP_VALID) { size_t udplen = ntohs(iph->ip_len) - sizeof(struct ip); struct udphdr *uh = (struct udphdr*)(iph + 1); uh->uh_sum = in_pseudo(iph->ip_src.s_addr, iph->ip_dst.s_addr, htons(IPPROTO_UDP + udplen)); uh->uh_sum = in_cksum_skip(mbufc, sizeof(struct ether_header) + ntohs(iph->ip_len), sizeof(struct ether_header) + (iph->ip_hl << 2)); } break; default: break; } } #endif /* INET || INET6 */ static void xnb_stop(struct xnb_softc *xnb) { struct ifnet *ifp; mtx_assert(&xnb->sc_lock, MA_OWNED); ifp = xnb->xnb_ifp; ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if_link_state_change(ifp, LINK_STATE_DOWN); } static int xnb_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct xnb_softc *xnb = ifp->if_softc; struct ifreq *ifr = (struct ifreq*) data; #ifdef INET struct ifaddr *ifa = (struct ifaddr*)data; #endif int error = 0; switch (cmd) { case SIOCSIFFLAGS: mtx_lock(&xnb->sc_lock); if (ifp->if_flags & IFF_UP) { xnb_ifinit_locked(xnb); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { xnb_stop(xnb); } } /* * Note: netfront sets a variable named xn_if_flags * here, but that variable is never read */ mtx_unlock(&xnb->sc_lock); break; case SIOCSIFADDR: #ifdef INET mtx_lock(&xnb->sc_lock); if (ifa->ifa_addr->sa_family == AF_INET) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if_link_state_change(ifp, LINK_STATE_DOWN); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_UP); } arp_ifinit(ifp, ifa); mtx_unlock(&xnb->sc_lock); } else { mtx_unlock(&xnb->sc_lock); #endif error = ether_ioctl(ifp, cmd, data); #ifdef INET } #endif break; case SIOCSIFCAP: mtx_lock(&xnb->sc_lock); if (ifr->ifr_reqcap & IFCAP_TXCSUM) { ifp->if_capenable |= IFCAP_TXCSUM; ifp->if_hwassist |= XNB_CSUM_FEATURES; } else { ifp->if_capenable &= ~(IFCAP_TXCSUM); ifp->if_hwassist &= ~(XNB_CSUM_FEATURES); } if ((ifr->ifr_reqcap & IFCAP_RXCSUM)) { ifp->if_capenable |= IFCAP_RXCSUM; } else { ifp->if_capenable &= ~(IFCAP_RXCSUM); } /* * TODO enable TSO4 and LRO once we no longer need * to calculate checksums in software */ #if 0 if (ifr->if_reqcap |= IFCAP_TSO4) { if (IFCAP_TXCSUM & ifp->if_capenable) { printf("xnb: Xen netif requires that " "TXCSUM be enabled in order " "to use TSO4\n"); error = EINVAL; } else { ifp->if_capenable |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_TSO; } } else { ifp->if_capenable &= ~(IFCAP_TSO4); ifp->if_hwassist &= ~(CSUM_TSO); } if (ifr->ifreqcap |= IFCAP_LRO) { ifp->if_capenable |= IFCAP_LRO; } else { ifp->if_capenable &= ~(IFCAP_LRO); } #endif mtx_unlock(&xnb->sc_lock); break; case 
SIOCSIFMTU: ifp->if_mtu = ifr->ifr_mtu; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; xnb_ifinit(xnb); break; case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &xnb->sc_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void xnb_start_locked(struct ifnet *ifp) { netif_rx_back_ring_t *rxb; struct xnb_softc *xnb; struct mbuf *mbufc; RING_IDX req_prod_local; xnb = ifp->if_softc; rxb = &xnb->ring_configs[XNB_RING_TYPE_RX].back_ring.rx_ring; if (!xnb->carrier) return; do { int out_of_space = 0; int notify; req_prod_local = rxb->sring->req_prod; xen_rmb(); for (;;) { int error; IF_DEQUEUE(&ifp->if_snd, mbufc); if (mbufc == NULL) break; error = xnb_send(rxb, xnb->otherend_id, mbufc, xnb->rx_gnttab); switch (error) { case EAGAIN: /* * Insufficient space in the ring. * Requeue pkt and send when space is * available. */ IF_PREPEND(&ifp->if_snd, mbufc); /* * Perhaps the frontend missed an IRQ * and went to sleep. Notify it to wake * it up. */ out_of_space = 1; break; case EINVAL: /* OS gave a corrupt packet. Drop it.*/ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); /* FALLTHROUGH */ default: /* Send succeeded, or packet had error. * Free the packet */ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if (mbufc) m_freem(mbufc); break; } if (out_of_space != 0) break; } RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(rxb, notify); if ((notify != 0) || (out_of_space != 0)) xen_intr_signal(xnb->xen_intr_handle); rxb->sring->req_event = req_prod_local + 1; xen_mb(); } while (rxb->sring->req_prod != req_prod_local) ; } /** * Sends one packet to the ring. Blocks until the packet is on the ring * \param[in] mbufc Contains one packet to send. Caller must free * \param[in,out] rxb The packet will be pushed onto this ring, but the * otherend will not be notified. * \param[in] otherend The domain ID of the other end of the connection * \retval EAGAIN The ring did not have enough space for the packet. * The ring has not been modified * \param[in,out] gnttab Pointer to enough memory for a grant table. We make * this a function parameter so that we will take less * stack space. 
* \retval EINVAL mbufc was corrupt or not convertible into a pkt */ static int xnb_send(netif_rx_back_ring_t *ring, domid_t otherend, const struct mbuf *mbufc, gnttab_copy_table gnttab) { struct xnb_pkt pkt; int error, n_entries, n_reqs; RING_IDX space; space = ring->sring->req_prod - ring->req_cons; error = xnb_mbufc2pkt(mbufc, &pkt, ring->rsp_prod_pvt, space); if (error != 0) return error; n_entries = xnb_rxpkt2gnttab(&pkt, mbufc, gnttab, ring, otherend); if (n_entries != 0) { int __unused hv_ret = HYPERVISOR_grant_table_op(GNTTABOP_copy, gnttab, n_entries); KASSERT(hv_ret == 0, ("HYPERVISOR_grant_table_op returned %d\n", hv_ret)); } n_reqs = xnb_rxpkt2rsp(&pkt, gnttab, n_entries, ring); return 0; } static void xnb_start(struct ifnet *ifp) { struct xnb_softc *xnb; xnb = ifp->if_softc; mtx_lock(&xnb->rx_lock); xnb_start_locked(ifp); mtx_unlock(&xnb->rx_lock); } /* equivalent of network_open() in Linux */ static void xnb_ifinit_locked(struct xnb_softc *xnb) { struct ifnet *ifp; ifp = xnb->xnb_ifp; mtx_assert(&xnb->sc_lock, MA_OWNED); if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; xnb_stop(xnb); ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_UP); } static void xnb_ifinit(void *xsc) { struct xnb_softc *xnb = xsc; mtx_lock(&xnb->sc_lock); xnb_ifinit_locked(xnb); mtx_unlock(&xnb->sc_lock); } /** * Callback used by the generic networking code to tell us when our carrier * state has changed. Since we don't have a physical carrier, we don't care */ static int xnb_ifmedia_upd(struct ifnet *ifp) { return (0); } /** * Callback used by the generic networking code to ask us what our carrier * state is. Since we don't have a physical carrier, this is very simple */ static void xnb_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { ifmr->ifm_status = IFM_AVALID|IFM_ACTIVE; ifmr->ifm_active = IFM_ETHER|IFM_MANUAL; } /*---------------------------- NewBus Registration ---------------------------*/ static device_method_t xnb_methods[] = { /* Device interface */ DEVMETHOD(device_probe, xnb_probe), DEVMETHOD(device_attach, xnb_attach), DEVMETHOD(device_detach, xnb_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xnb_suspend), DEVMETHOD(device_resume, xnb_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, xnb_frontend_changed), { 0, 0 } }; static driver_t xnb_driver = { "xnb", xnb_methods, sizeof(struct xnb_softc), }; devclass_t xnb_devclass; DRIVER_MODULE(xnb, xenbusb_back, xnb_driver, xnb_devclass, 0, 0); /*-------------------------- Unit Tests -------------------------------------*/ #ifdef XNB_DEBUG #include "netback_unit_tests.c" #endif diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c index 8dba5a8dc6d5..facab6550ca3 100644 --- a/sys/dev/xen/netfront/netfront.c +++ b/sys/dev/xen/netfront/netfront.c @@ -1,2342 +1,2342 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2004-2006 Kip Macy * Copyright (c) 2015 Wei Liu * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
* * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include -#include +#include +#include #include #include "xenbus_if.h" /* Features supported by all backends. TSO and LRO can be negotiated */ #define XN_CSUM_FEATURES (CSUM_TCP | CSUM_UDP) #define NET_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, PAGE_SIZE) #define NET_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, PAGE_SIZE) #define NET_RX_SLOTS_MIN (XEN_NETIF_NR_SLOTS_MIN + 1) /* * Should the driver do LRO on the RX end * this can be toggled on the fly, but the * interface must be reset (down/up) for it * to take effect. */ static int xn_enable_lro = 1; TUNABLE_INT("hw.xn.enable_lro", &xn_enable_lro); /* * Number of pairs of queues. */ static unsigned long xn_num_queues = 4; TUNABLE_ULONG("hw.xn.num_queues", &xn_num_queues); /** * \brief The maximum allowed data fragments in a single transmit * request. * * This limit is imposed by the backend driver. We assume here that * we are dealing with a Linux driver domain and have set our limit * to mirror the Linux MAX_SKB_FRAGS constant. 
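 * With 4 KiB pages the definition below works out to
 * 65536 / 4096 + 2 == 18 transmit slots per packet.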
*/ #define MAX_TX_REQ_FRAGS (65536 / PAGE_SIZE + 2) #define RX_COPY_THRESHOLD 256 #define net_ratelimit() 0 struct netfront_rxq; struct netfront_txq; struct netfront_info; struct netfront_rx_info; static void xn_txeof(struct netfront_txq *); static void xn_rxeof(struct netfront_rxq *); static void xn_alloc_rx_buffers(struct netfront_rxq *); static void xn_alloc_rx_buffers_callout(void *arg); static void xn_release_rx_bufs(struct netfront_rxq *); static void xn_release_tx_bufs(struct netfront_txq *); static void xn_rxq_intr(struct netfront_rxq *); static void xn_txq_intr(struct netfront_txq *); static void xn_intr(void *); static inline int xn_count_frags(struct mbuf *m); static int xn_assemble_tx_request(struct netfront_txq *, struct mbuf *); static int xn_ioctl(struct ifnet *, u_long, caddr_t); static void xn_ifinit_locked(struct netfront_info *); static void xn_ifinit(void *); static void xn_stop(struct netfront_info *); static void xn_query_features(struct netfront_info *np); static int xn_configure_features(struct netfront_info *np); static void netif_free(struct netfront_info *info); static int netfront_detach(device_t dev); static int xn_txq_mq_start_locked(struct netfront_txq *, struct mbuf *); static int xn_txq_mq_start(struct ifnet *, struct mbuf *); static int talk_to_backend(device_t dev, struct netfront_info *info); static int create_netdev(device_t dev); static void netif_disconnect_backend(struct netfront_info *info); static int setup_device(device_t dev, struct netfront_info *info, unsigned long); static int xn_ifmedia_upd(struct ifnet *ifp); static void xn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); static int xn_connect(struct netfront_info *); static void xn_kick_rings(struct netfront_info *); static int xn_get_responses(struct netfront_rxq *, struct netfront_rx_info *, RING_IDX, RING_IDX *, struct mbuf **); #define virt_to_mfn(x) (vtophys(x) >> PAGE_SHIFT) #define INVALID_P2M_ENTRY (~0UL) #define XN_QUEUE_NAME_LEN 8 /* xn{t,r}x_%u, allow for two digits */ struct netfront_rxq { struct netfront_info *info; u_int id; char name[XN_QUEUE_NAME_LEN]; struct mtx lock; int ring_ref; netif_rx_front_ring_t ring; xen_intr_handle_t xen_intr_handle; grant_ref_t gref_head; grant_ref_t grant_ref[NET_RX_RING_SIZE + 1]; struct mbuf *mbufs[NET_RX_RING_SIZE + 1]; struct lro_ctrl lro; struct callout rx_refill; }; struct netfront_txq { struct netfront_info *info; u_int id; char name[XN_QUEUE_NAME_LEN]; struct mtx lock; int ring_ref; netif_tx_front_ring_t ring; xen_intr_handle_t xen_intr_handle; grant_ref_t gref_head; grant_ref_t grant_ref[NET_TX_RING_SIZE + 1]; struct mbuf *mbufs[NET_TX_RING_SIZE + 1]; int mbufs_cnt; struct buf_ring *br; struct taskqueue *tq; struct task defrtask; bool full; }; struct netfront_info { struct ifnet *xn_ifp; struct mtx sc_lock; u_int num_queues; struct netfront_rxq *rxq; struct netfront_txq *txq; u_int carrier; u_int maxfrags; device_t xbdev; uint8_t mac[ETHER_ADDR_LEN]; int xn_if_flags; struct ifmedia sc_media; bool xn_reset; }; struct netfront_rx_info { struct netif_rx_response rx; struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; }; #define XN_RX_LOCK(_q) mtx_lock(&(_q)->lock) #define XN_RX_UNLOCK(_q) mtx_unlock(&(_q)->lock) #define XN_TX_LOCK(_q) mtx_lock(&(_q)->lock) #define XN_TX_TRYLOCK(_q) mtx_trylock(&(_q)->lock) #define XN_TX_UNLOCK(_q) mtx_unlock(&(_q)->lock) #define XN_LOCK(_sc) mtx_lock(&(_sc)->sc_lock); #define XN_UNLOCK(_sc) mtx_unlock(&(_sc)->sc_lock); #define XN_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->sc_lock, MA_OWNED); 
#define XN_RX_LOCK_ASSERT(_q) mtx_assert(&(_q)->lock, MA_OWNED); #define XN_TX_LOCK_ASSERT(_q) mtx_assert(&(_q)->lock, MA_OWNED); #define netfront_carrier_on(netif) ((netif)->carrier = 1) #define netfront_carrier_off(netif) ((netif)->carrier = 0) #define netfront_carrier_ok(netif) ((netif)->carrier) /* Access macros for acquiring freeing slots in xn_free_{tx,rx}_idxs[]. */ static inline void add_id_to_freelist(struct mbuf **list, uintptr_t id) { KASSERT(id != 0, ("%s: the head item (0) must always be free.", __func__)); list[id] = list[0]; list[0] = (struct mbuf *)id; } static inline unsigned short get_id_from_freelist(struct mbuf **list) { uintptr_t id; id = (uintptr_t)list[0]; KASSERT(id != 0, ("%s: the head item (0) must always remain free.", __func__)); list[0] = list[id]; return (id); } static inline int xn_rxidx(RING_IDX idx) { return idx & (NET_RX_RING_SIZE - 1); } static inline struct mbuf * xn_get_rx_mbuf(struct netfront_rxq *rxq, RING_IDX ri) { int i; struct mbuf *m; i = xn_rxidx(ri); m = rxq->mbufs[i]; rxq->mbufs[i] = NULL; return (m); } static inline grant_ref_t xn_get_rx_ref(struct netfront_rxq *rxq, RING_IDX ri) { int i = xn_rxidx(ri); grant_ref_t ref = rxq->grant_ref[i]; KASSERT(ref != GRANT_REF_INVALID, ("Invalid grant reference!\n")); rxq->grant_ref[i] = GRANT_REF_INVALID; return (ref); } #define IPRINTK(fmt, args...) \ printf("[XEN] " fmt, ##args) #ifdef INVARIANTS #define WPRINTK(fmt, args...) \ printf("[XEN] " fmt, ##args) #else #define WPRINTK(fmt, args...) #endif #ifdef DEBUG #define DPRINTK(fmt, args...) \ printf("[XEN] %s: " fmt, __func__, ##args) #else #define DPRINTK(fmt, args...) #endif /** * Read the 'mac' node at the given device's node in the store, and parse that * as colon-separated octets, placing result the given mac array. mac must be * a preallocated array of length ETH_ALEN (as declared in linux/if_ether.h). * Return 0 on success, or errno on error. */ static int xen_net_read_mac(device_t dev, uint8_t mac[]) { int error, i; char *s, *e, *macstr; const char *path; path = xenbus_get_node(dev); error = xs_read(XST_NIL, path, "mac", NULL, (void **) &macstr); if (error == ENOENT) { /* * Deal with missing mac XenStore nodes on devices with * HVM emulation (the 'ioemu' configuration attribute) * enabled. * * The HVM emulator may execute in a stub device model * domain which lacks the permission, only given to Dom0, * to update the guest's XenStore tree. For this reason, * the HVM emulator doesn't even attempt to write the * front-side mac node, even when operating in Dom0. * However, there should always be a mac listed in the * backend tree. Fallback to this version if our query * of the front side XenStore location doesn't find * anything. */ path = xenbus_get_otherend_path(dev); error = xs_read(XST_NIL, path, "mac", NULL, (void **) &macstr); } if (error != 0) { xenbus_dev_fatal(dev, error, "parsing %s/mac", path); return (error); } s = macstr; for (i = 0; i < ETHER_ADDR_LEN; i++) { mac[i] = strtoul(s, &e, 16); if (s == e || (e[0] != ':' && e[0] != 0)) { free(macstr, M_XENBUS); return (ENOENT); } s = &e[1]; } free(macstr, M_XENBUS); return (0); } /** * Entry point to this code when a new device is created. Allocate the basic * structures and the ring buffers for communication with the backend, and * inform the backend of the appropriate details for those. Switch to * Connected state. 
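 * The number of queue pairs used by the device comes from the
 * hw.xn.num_queues loader tunable (e.g. hw.xn.num_queues="2" in
 * /boot/loader.conf) and is clamped to the backend's advertised
 * multi-queue-max-queues value.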
*/ static int netfront_probe(device_t dev) { if (xen_pv_nics_disabled()) return (ENXIO); if (!strcmp(xenbus_get_type(dev), "vif")) { device_set_desc(dev, "Virtual Network Interface"); return (0); } return (ENXIO); } static int netfront_attach(device_t dev) { int err; err = create_netdev(dev); if (err != 0) { xenbus_dev_fatal(dev, err, "creating netdev"); return (err); } SYSCTL_ADD_INT(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "enable_lro", CTLFLAG_RW, &xn_enable_lro, 0, "Large Receive Offload"); SYSCTL_ADD_ULONG(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO, "num_queues", CTLFLAG_RD, &xn_num_queues, "Number of pairs of queues"); return (0); } static int netfront_suspend(device_t dev) { struct netfront_info *np = device_get_softc(dev); u_int i; for (i = 0; i < np->num_queues; i++) { XN_RX_LOCK(&np->rxq[i]); XN_TX_LOCK(&np->txq[i]); } netfront_carrier_off(np); for (i = 0; i < np->num_queues; i++) { XN_RX_UNLOCK(&np->rxq[i]); XN_TX_UNLOCK(&np->txq[i]); } return (0); } /** * We are reconnecting to the backend, due to a suspend/resume, or a backend * driver restart. We tear down our netif structure and recreate it, but * leave the device-layer structures intact so that this is transparent to the * rest of the kernel. */ static int netfront_resume(device_t dev) { struct netfront_info *info = device_get_softc(dev); u_int i; if (xen_suspend_cancelled) { for (i = 0; i < info->num_queues; i++) { XN_RX_LOCK(&info->rxq[i]); XN_TX_LOCK(&info->txq[i]); } netfront_carrier_on(info); for (i = 0; i < info->num_queues; i++) { XN_RX_UNLOCK(&info->rxq[i]); XN_TX_UNLOCK(&info->txq[i]); } return (0); } netif_disconnect_backend(info); return (0); } static int write_queue_xenstore_keys(device_t dev, struct netfront_rxq *rxq, struct netfront_txq *txq, struct xs_transaction *xst, bool hierarchy) { int err; const char *message; const char *node = xenbus_get_node(dev); char *path; size_t path_size; KASSERT(rxq->id == txq->id, ("Mismatch between RX and TX queue ids")); /* Split event channel support is not yet there. */ KASSERT(rxq->xen_intr_handle == txq->xen_intr_handle, ("Split event channels are not supported")); if (hierarchy) { path_size = strlen(node) + 10; path = malloc(path_size, M_DEVBUF, M_WAITOK|M_ZERO); snprintf(path, path_size, "%s/queue-%u", node, rxq->id); } else { path_size = strlen(node) + 1; path = malloc(path_size, M_DEVBUF, M_WAITOK|M_ZERO); snprintf(path, path_size, "%s", node); } err = xs_printf(*xst, path, "tx-ring-ref","%u", txq->ring_ref); if (err != 0) { message = "writing tx ring-ref"; goto error; } err = xs_printf(*xst, path, "rx-ring-ref","%u", rxq->ring_ref); if (err != 0) { message = "writing rx ring-ref"; goto error; } err = xs_printf(*xst, path, "event-channel", "%u", xen_intr_port(rxq->xen_intr_handle)); if (err != 0) { message = "writing event-channel"; goto error; } free(path, M_DEVBUF); return (0); error: free(path, M_DEVBUF); xenbus_dev_fatal(dev, err, "%s", message); return (err); } /* Common code used when first setting up, and when resuming. 
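 * Roughly, for a single-queue device the transaction below publishes
 * keys such as
 *
 *	tx-ring-ref = <grant reference>
 *	rx-ring-ref = <grant reference>
 *	event-channel = <port>
 *	request-rx-copy = 1
 *	feature-rx-notify = 1
 *	feature-sg = 1
 *
 * directly under the frontend's xenstore node; with multiple queues the
 * ring and event-channel keys move into per-queue "queue-%u"
 * subdirectories and multi-queue-num-queues is written as well.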
*/ static int talk_to_backend(device_t dev, struct netfront_info *info) { const char *message; struct xs_transaction xst; const char *node = xenbus_get_node(dev); int err; unsigned long num_queues, max_queues = 0; unsigned int i; err = xen_net_read_mac(dev, info->mac); if (err != 0) { xenbus_dev_fatal(dev, err, "parsing %s/mac", node); goto out; } err = xs_scanf(XST_NIL, xenbus_get_otherend_path(info->xbdev), "multi-queue-max-queues", NULL, "%lu", &max_queues); if (err != 0) max_queues = 1; num_queues = xn_num_queues; if (num_queues > max_queues) num_queues = max_queues; err = setup_device(dev, info, num_queues); if (err != 0) goto out; again: err = xs_transaction_start(&xst); if (err != 0) { xenbus_dev_fatal(dev, err, "starting transaction"); goto free; } if (info->num_queues == 1) { err = write_queue_xenstore_keys(dev, &info->rxq[0], &info->txq[0], &xst, false); if (err != 0) goto abort_transaction_no_def_error; } else { err = xs_printf(xst, node, "multi-queue-num-queues", "%u", info->num_queues); if (err != 0) { message = "writing multi-queue-num-queues"; goto abort_transaction; } for (i = 0; i < info->num_queues; i++) { err = write_queue_xenstore_keys(dev, &info->rxq[i], &info->txq[i], &xst, true); if (err != 0) goto abort_transaction_no_def_error; } } err = xs_printf(xst, node, "request-rx-copy", "%u", 1); if (err != 0) { message = "writing request-rx-copy"; goto abort_transaction; } err = xs_printf(xst, node, "feature-rx-notify", "%d", 1); if (err != 0) { message = "writing feature-rx-notify"; goto abort_transaction; } err = xs_printf(xst, node, "feature-sg", "%d", 1); if (err != 0) { message = "writing feature-sg"; goto abort_transaction; } if ((info->xn_ifp->if_capenable & IFCAP_LRO) != 0) { err = xs_printf(xst, node, "feature-gso-tcpv4", "%d", 1); if (err != 0) { message = "writing feature-gso-tcpv4"; goto abort_transaction; } } if ((info->xn_ifp->if_capenable & IFCAP_RXCSUM) == 0) { err = xs_printf(xst, node, "feature-no-csum-offload", "%d", 1); if (err != 0) { message = "writing feature-no-csum-offload"; goto abort_transaction; } } err = xs_transaction_end(xst, 0); if (err != 0) { if (err == EAGAIN) goto again; xenbus_dev_fatal(dev, err, "completing transaction"); goto free; } return 0; abort_transaction: xenbus_dev_fatal(dev, err, "%s", message); abort_transaction_no_def_error: xs_transaction_end(xst, 1); free: netif_free(info); out: return (err); } static void xn_rxq_intr(struct netfront_rxq *rxq) { XN_RX_LOCK(rxq); xn_rxeof(rxq); XN_RX_UNLOCK(rxq); } static void xn_txq_start(struct netfront_txq *txq) { struct netfront_info *np = txq->info; struct ifnet *ifp = np->xn_ifp; XN_TX_LOCK_ASSERT(txq); if (!drbr_empty(ifp, txq->br)) xn_txq_mq_start_locked(txq, NULL); } static void xn_txq_intr(struct netfront_txq *txq) { XN_TX_LOCK(txq); if (RING_HAS_UNCONSUMED_RESPONSES(&txq->ring)) xn_txeof(txq); xn_txq_start(txq); XN_TX_UNLOCK(txq); } static void xn_txq_tq_deferred(void *xtxq, int pending) { struct netfront_txq *txq = xtxq; XN_TX_LOCK(txq); xn_txq_start(txq); XN_TX_UNLOCK(txq); } static void disconnect_rxq(struct netfront_rxq *rxq) { xn_release_rx_bufs(rxq); gnttab_free_grant_references(rxq->gref_head); gnttab_end_foreign_access(rxq->ring_ref, NULL); /* * No split event channel support at the moment, handle will * be unbound in tx. So no need to call xen_intr_unbind here, * but we do want to reset the handler to 0. 
*/ rxq->xen_intr_handle = 0; } static void destroy_rxq(struct netfront_rxq *rxq) { callout_drain(&rxq->rx_refill); free(rxq->ring.sring, M_DEVBUF); } static void destroy_rxqs(struct netfront_info *np) { int i; for (i = 0; i < np->num_queues; i++) destroy_rxq(&np->rxq[i]); free(np->rxq, M_DEVBUF); np->rxq = NULL; } static int setup_rxqs(device_t dev, struct netfront_info *info, unsigned long num_queues) { int q, i; int error; netif_rx_sring_t *rxs; struct netfront_rxq *rxq; info->rxq = malloc(sizeof(struct netfront_rxq) * num_queues, M_DEVBUF, M_WAITOK|M_ZERO); for (q = 0; q < num_queues; q++) { rxq = &info->rxq[q]; rxq->id = q; rxq->info = info; rxq->ring_ref = GRANT_REF_INVALID; rxq->ring.sring = NULL; snprintf(rxq->name, XN_QUEUE_NAME_LEN, "xnrx_%u", q); mtx_init(&rxq->lock, rxq->name, "netfront receive lock", MTX_DEF); for (i = 0; i <= NET_RX_RING_SIZE; i++) { rxq->mbufs[i] = NULL; rxq->grant_ref[i] = GRANT_REF_INVALID; } /* Start resources allocation */ if (gnttab_alloc_grant_references(NET_RX_RING_SIZE, &rxq->gref_head) != 0) { device_printf(dev, "allocating rx gref"); error = ENOMEM; goto fail; } rxs = (netif_rx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK|M_ZERO); SHARED_RING_INIT(rxs); FRONT_RING_INIT(&rxq->ring, rxs, PAGE_SIZE); error = xenbus_grant_ring(dev, virt_to_mfn(rxs), &rxq->ring_ref); if (error != 0) { device_printf(dev, "granting rx ring page"); goto fail_grant_ring; } callout_init(&rxq->rx_refill, 1); } return (0); fail_grant_ring: gnttab_free_grant_references(rxq->gref_head); free(rxq->ring.sring, M_DEVBUF); fail: for (; q >= 0; q--) { disconnect_rxq(&info->rxq[q]); destroy_rxq(&info->rxq[q]); } free(info->rxq, M_DEVBUF); return (error); } static void disconnect_txq(struct netfront_txq *txq) { xn_release_tx_bufs(txq); gnttab_free_grant_references(txq->gref_head); gnttab_end_foreign_access(txq->ring_ref, NULL); xen_intr_unbind(&txq->xen_intr_handle); } static void destroy_txq(struct netfront_txq *txq) { free(txq->ring.sring, M_DEVBUF); buf_ring_free(txq->br, M_DEVBUF); taskqueue_drain_all(txq->tq); taskqueue_free(txq->tq); } static void destroy_txqs(struct netfront_info *np) { int i; for (i = 0; i < np->num_queues; i++) destroy_txq(&np->txq[i]); free(np->txq, M_DEVBUF); np->txq = NULL; } static int setup_txqs(device_t dev, struct netfront_info *info, unsigned long num_queues) { int q, i; int error; netif_tx_sring_t *txs; struct netfront_txq *txq; info->txq = malloc(sizeof(struct netfront_txq) * num_queues, M_DEVBUF, M_WAITOK|M_ZERO); for (q = 0; q < num_queues; q++) { txq = &info->txq[q]; txq->id = q; txq->info = info; txq->ring_ref = GRANT_REF_INVALID; txq->ring.sring = NULL; snprintf(txq->name, XN_QUEUE_NAME_LEN, "xntx_%u", q); mtx_init(&txq->lock, txq->name, "netfront transmit lock", MTX_DEF); for (i = 0; i <= NET_TX_RING_SIZE; i++) { txq->mbufs[i] = (void *) ((u_long) i+1); txq->grant_ref[i] = GRANT_REF_INVALID; } txq->mbufs[NET_TX_RING_SIZE] = (void *)0; /* Start resources allocation. 
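	 * Per queue this covers the transmit grant references, the
	 * shared ring page (granted to the backend), the buf_ring
	 * consumed by if_transmit, the deferred-start taskqueue and
	 * the event channel.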
*/ if (gnttab_alloc_grant_references(NET_TX_RING_SIZE, &txq->gref_head) != 0) { device_printf(dev, "failed to allocate tx grant refs\n"); error = ENOMEM; goto fail; } txs = (netif_tx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK|M_ZERO); SHARED_RING_INIT(txs); FRONT_RING_INIT(&txq->ring, txs, PAGE_SIZE); error = xenbus_grant_ring(dev, virt_to_mfn(txs), &txq->ring_ref); if (error != 0) { device_printf(dev, "failed to grant tx ring\n"); goto fail_grant_ring; } txq->br = buf_ring_alloc(NET_TX_RING_SIZE, M_DEVBUF, M_WAITOK, &txq->lock); TASK_INIT(&txq->defrtask, 0, xn_txq_tq_deferred, txq); txq->tq = taskqueue_create(txq->name, M_WAITOK, taskqueue_thread_enqueue, &txq->tq); error = taskqueue_start_threads(&txq->tq, 1, PI_NET, "%s txq %d", device_get_nameunit(dev), txq->id); if (error != 0) { device_printf(dev, "failed to start tx taskq %d\n", txq->id); goto fail_start_thread; } error = xen_intr_alloc_and_bind_local_port(dev, xenbus_get_otherend_id(dev), /* filter */ NULL, xn_intr, &info->txq[q], INTR_TYPE_NET | INTR_MPSAFE | INTR_ENTROPY, &txq->xen_intr_handle); if (error != 0) { device_printf(dev, "xen_intr_alloc_and_bind_local_port failed\n"); goto fail_bind_port; } } return (0); fail_bind_port: taskqueue_drain_all(txq->tq); fail_start_thread: buf_ring_free(txq->br, M_DEVBUF); taskqueue_free(txq->tq); gnttab_end_foreign_access(txq->ring_ref, NULL); fail_grant_ring: gnttab_free_grant_references(txq->gref_head); free(txq->ring.sring, M_DEVBUF); fail: for (; q >= 0; q--) { disconnect_txq(&info->txq[q]); destroy_txq(&info->txq[q]); } free(info->txq, M_DEVBUF); return (error); } static int setup_device(device_t dev, struct netfront_info *info, unsigned long num_queues) { int error; int q; if (info->txq) destroy_txqs(info); if (info->rxq) destroy_rxqs(info); info->num_queues = 0; error = setup_rxqs(dev, info, num_queues); if (error != 0) goto out; error = setup_txqs(dev, info, num_queues); if (error != 0) goto out; info->num_queues = num_queues; /* No split event channel at the moment. */ for (q = 0; q < num_queues; q++) info->rxq[q].xen_intr_handle = info->txq[q].xen_intr_handle; return (0); out: KASSERT(error != 0, ("Error path taken without providing an error code")); return (error); } #ifdef INET /** * If this interface has an ipv4 address, send an arp for it. This * helps to get the network going again after migrating hosts. */ static void netfront_send_fake_arp(device_t dev, struct netfront_info *info) { struct ifnet *ifp; struct ifaddr *ifa; ifp = info->xn_ifp; CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { if (ifa->ifa_addr->sa_family == AF_INET) { arp_ifinit(ifp, ifa); } } } #endif /** * Callback received when the backend's state changes. */ static void netfront_backend_changed(device_t dev, XenbusState newstate) { struct netfront_info *sc = device_get_softc(dev); DPRINTK("newstate=%d\n", newstate); CURVNET_SET(sc->xn_ifp->if_vnet); switch (newstate) { case XenbusStateInitialising: case XenbusStateInitialised: case XenbusStateUnknown: case XenbusStateReconfigured: case XenbusStateReconfiguring: break; case XenbusStateInitWait: if (xenbus_get_state(dev) != XenbusStateInitialising) break; if (xn_connect(sc) != 0) break; /* Switch to connected state before kicking the rings. 
*/ xenbus_set_state(sc->xbdev, XenbusStateConnected); xn_kick_rings(sc); break; case XenbusStateClosing: xenbus_set_state(dev, XenbusStateClosed); break; case XenbusStateClosed: if (sc->xn_reset) { netif_disconnect_backend(sc); xenbus_set_state(dev, XenbusStateInitialising); sc->xn_reset = false; } break; case XenbusStateConnected: #ifdef INET netfront_send_fake_arp(dev, sc); #endif break; } CURVNET_RESTORE(); } /** * \brief Verify that there is sufficient space in the Tx ring * buffer for a maximally sized request to be enqueued. * * A transmit request requires a transmit descriptor for each packet * fragment, plus up to 2 entries for "options" (e.g. TSO). */ static inline int xn_tx_slot_available(struct netfront_txq *txq) { return (RING_FREE_REQUESTS(&txq->ring) > (MAX_TX_REQ_FRAGS + 2)); } static void xn_release_tx_bufs(struct netfront_txq *txq) { int i; for (i = 1; i <= NET_TX_RING_SIZE; i++) { struct mbuf *m; m = txq->mbufs[i]; /* * We assume that no kernel addresses are * less than NET_TX_RING_SIZE. Any entry * in the table that is below this number * must be an index from free-list tracking. */ if (((uintptr_t)m) <= NET_TX_RING_SIZE) continue; gnttab_end_foreign_access_ref(txq->grant_ref[i]); gnttab_release_grant_reference(&txq->gref_head, txq->grant_ref[i]); txq->grant_ref[i] = GRANT_REF_INVALID; add_id_to_freelist(txq->mbufs, i); txq->mbufs_cnt--; if (txq->mbufs_cnt < 0) { panic("%s: tx_chain_cnt must be >= 0", __func__); } m_free(m); } } static struct mbuf * xn_alloc_one_rx_buffer(struct netfront_rxq *rxq) { struct mbuf *m; m = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUMPAGESIZE); if (m == NULL) return NULL; m->m_len = m->m_pkthdr.len = MJUMPAGESIZE; return (m); } static void xn_alloc_rx_buffers(struct netfront_rxq *rxq) { RING_IDX req_prod; int notify; XN_RX_LOCK_ASSERT(rxq); if (__predict_false(rxq->info->carrier == 0)) return; for (req_prod = rxq->ring.req_prod_pvt; req_prod - rxq->ring.rsp_cons < NET_RX_RING_SIZE; req_prod++) { struct mbuf *m; unsigned short id; grant_ref_t ref; struct netif_rx_request *req; unsigned long pfn; m = xn_alloc_one_rx_buffer(rxq); if (m == NULL) break; id = xn_rxidx(req_prod); KASSERT(rxq->mbufs[id] == NULL, ("non-NULL xn_rx_chain")); rxq->mbufs[id] = m; ref = gnttab_claim_grant_reference(&rxq->gref_head); KASSERT(ref != GNTTAB_LIST_END, ("reserved grant references exhuasted")); rxq->grant_ref[id] = ref; pfn = atop(vtophys(mtod(m, vm_offset_t))); req = RING_GET_REQUEST(&rxq->ring, req_prod); gnttab_grant_foreign_access_ref(ref, xenbus_get_otherend_id(rxq->info->xbdev), pfn, 0); req->id = id; req->gref = ref; } rxq->ring.req_prod_pvt = req_prod; /* Not enough requests? Try again later. 
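	 * The callout below retries the refill after hz/10 ticks,
	 * i.e. about every 100 ms, until at least NET_RX_SLOTS_MIN
	 * requests are posted on the ring.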
*/ if (req_prod - rxq->ring.rsp_cons < NET_RX_SLOTS_MIN) { callout_reset_curcpu(&rxq->rx_refill, hz/10, xn_alloc_rx_buffers_callout, rxq); return; } wmb(); /* barrier so backend seens requests */ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rxq->ring, notify); if (notify) xen_intr_signal(rxq->xen_intr_handle); } static void xn_alloc_rx_buffers_callout(void *arg) { struct netfront_rxq *rxq; rxq = (struct netfront_rxq *)arg; XN_RX_LOCK(rxq); xn_alloc_rx_buffers(rxq); XN_RX_UNLOCK(rxq); } static void xn_release_rx_bufs(struct netfront_rxq *rxq) { int i, ref; struct mbuf *m; for (i = 0; i < NET_RX_RING_SIZE; i++) { m = rxq->mbufs[i]; if (m == NULL) continue; ref = rxq->grant_ref[i]; if (ref == GRANT_REF_INVALID) continue; gnttab_end_foreign_access_ref(ref); gnttab_release_grant_reference(&rxq->gref_head, ref); rxq->mbufs[i] = NULL; rxq->grant_ref[i] = GRANT_REF_INVALID; m_freem(m); } } static void xn_rxeof(struct netfront_rxq *rxq) { struct ifnet *ifp; struct netfront_info *np = rxq->info; #if (defined(INET) || defined(INET6)) struct lro_ctrl *lro = &rxq->lro; #endif struct netfront_rx_info rinfo; struct netif_rx_response *rx = &rinfo.rx; struct netif_extra_info *extras = rinfo.extras; RING_IDX i, rp; struct mbuf *m; struct mbufq mbufq_rxq, mbufq_errq; int err, work_to_do; XN_RX_LOCK_ASSERT(rxq); if (!netfront_carrier_ok(np)) return; /* XXX: there should be some sane limit. */ mbufq_init(&mbufq_errq, INT_MAX); mbufq_init(&mbufq_rxq, INT_MAX); ifp = np->xn_ifp; do { rp = rxq->ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ i = rxq->ring.rsp_cons; while ((i != rp)) { memcpy(rx, RING_GET_RESPONSE(&rxq->ring, i), sizeof(*rx)); memset(extras, 0, sizeof(rinfo.extras)); m = NULL; err = xn_get_responses(rxq, &rinfo, rp, &i, &m); if (__predict_false(err)) { if (m) (void )mbufq_enqueue(&mbufq_errq, m); if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); continue; } m->m_pkthdr.rcvif = ifp; if (rx->flags & NETRXF_data_validated) { /* * According to mbuf(9) the correct way to tell * the stack that the checksum of an inbound * packet is correct, without it actually being * present (because the underlying interface * doesn't provide it), is to set the * CSUM_DATA_VALID and CSUM_PSEUDO_HDR flags, * and the csum_data field to 0xffff. */ m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m->m_pkthdr.csum_data = 0xffff; } if ((rx->flags & NETRXF_extra_info) != 0 && (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type == XEN_NETIF_EXTRA_TYPE_GSO)) { m->m_pkthdr.tso_segsz = extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].u.gso.size; m->m_pkthdr.csum_flags |= CSUM_TSO; } (void )mbufq_enqueue(&mbufq_rxq, m); } rxq->ring.rsp_cons = i; xn_alloc_rx_buffers(rxq); RING_FINAL_CHECK_FOR_RESPONSES(&rxq->ring, work_to_do); } while (work_to_do); mbufq_drain(&mbufq_errq); /* * Process all the mbufs after the remapping is complete. * Break the mbuf chain first though. */ while ((m = mbufq_dequeue(&mbufq_rxq)) != NULL) { if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); #if (defined(INET) || defined(INET6)) /* Use LRO if possible */ if ((ifp->if_capenable & IFCAP_LRO) == 0 || lro->lro_cnt == 0 || tcp_lro_rx(lro, m, 0)) { /* * If LRO fails, pass up to the stack * directly. 
*/ (*ifp->if_input)(ifp, m); } #else (*ifp->if_input)(ifp, m); #endif } #if (defined(INET) || defined(INET6)) /* * Flush any outstanding LRO work */ tcp_lro_flush_all(lro); #endif } static void xn_txeof(struct netfront_txq *txq) { RING_IDX i, prod; unsigned short id; struct ifnet *ifp; netif_tx_response_t *txr; struct mbuf *m; struct netfront_info *np = txq->info; XN_TX_LOCK_ASSERT(txq); if (!netfront_carrier_ok(np)) return; ifp = np->xn_ifp; do { prod = txq->ring.sring->rsp_prod; rmb(); /* Ensure we see responses up to 'rp'. */ for (i = txq->ring.rsp_cons; i != prod; i++) { txr = RING_GET_RESPONSE(&txq->ring, i); if (txr->status == NETIF_RSP_NULL) continue; if (txr->status != NETIF_RSP_OKAY) { printf("%s: WARNING: response is %d!\n", __func__, txr->status); } id = txr->id; m = txq->mbufs[id]; KASSERT(m != NULL, ("mbuf not found in chain")); KASSERT((uintptr_t)m > NET_TX_RING_SIZE, ("mbuf already on the free list, but we're " "trying to free it again!")); M_ASSERTVALID(m); if (__predict_false(gnttab_query_foreign_access( txq->grant_ref[id]) != 0)) { panic("%s: grant id %u still in use by the " "backend", __func__, id); } gnttab_end_foreign_access_ref(txq->grant_ref[id]); gnttab_release_grant_reference( &txq->gref_head, txq->grant_ref[id]); txq->grant_ref[id] = GRANT_REF_INVALID; txq->mbufs[id] = NULL; add_id_to_freelist(txq->mbufs, id); txq->mbufs_cnt--; m_free(m); /* Only mark the txq active if we've freed up at least one slot to try */ ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; } txq->ring.rsp_cons = prod; /* * Set a new event, then check for race with update of * tx_cons. Note that it is essential to schedule a * callback, no matter how few buffers are pending. Even if * there is space in the transmit ring, higher layers may * be blocked because too much data is outstanding: in such * cases notification from Xen is likely to be the only kick * that we'll get. 
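		 * The assignment below asks for the next event roughly
		 * half way through the outstanding requests: e.g. with
		 * prod == 100 and req_prod == 120, rsp_event becomes
		 * 111, so the backend notifies us again once it has
		 * produced responses up to that index.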
*/ txq->ring.sring->rsp_event = prod + ((txq->ring.sring->req_prod - prod) >> 1) + 1; mb(); } while (prod != txq->ring.sring->rsp_prod); if (txq->full && ((txq->ring.sring->req_prod - prod) < NET_TX_RING_SIZE)) { txq->full = false; xn_txq_start(txq); } } static void xn_intr(void *xsc) { struct netfront_txq *txq = xsc; struct netfront_info *np = txq->info; struct netfront_rxq *rxq = &np->rxq[txq->id]; /* kick both tx and rx */ xn_rxq_intr(rxq); xn_txq_intr(txq); } static void xn_move_rx_slot(struct netfront_rxq *rxq, struct mbuf *m, grant_ref_t ref) { int new = xn_rxidx(rxq->ring.req_prod_pvt); KASSERT(rxq->mbufs[new] == NULL, ("mbufs != NULL")); rxq->mbufs[new] = m; rxq->grant_ref[new] = ref; RING_GET_REQUEST(&rxq->ring, rxq->ring.req_prod_pvt)->id = new; RING_GET_REQUEST(&rxq->ring, rxq->ring.req_prod_pvt)->gref = ref; rxq->ring.req_prod_pvt++; } static int xn_get_extras(struct netfront_rxq *rxq, struct netif_extra_info *extras, RING_IDX rp, RING_IDX *cons) { struct netif_extra_info *extra; int err = 0; do { struct mbuf *m; grant_ref_t ref; if (__predict_false(*cons + 1 == rp)) { err = EINVAL; break; } extra = (struct netif_extra_info *) RING_GET_RESPONSE(&rxq->ring, ++(*cons)); if (__predict_false(!extra->type || extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) { err = EINVAL; } else { memcpy(&extras[extra->type - 1], extra, sizeof(*extra)); } m = xn_get_rx_mbuf(rxq, *cons); ref = xn_get_rx_ref(rxq, *cons); xn_move_rx_slot(rxq, m, ref); } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); return err; } static int xn_get_responses(struct netfront_rxq *rxq, struct netfront_rx_info *rinfo, RING_IDX rp, RING_IDX *cons, struct mbuf **list) { struct netif_rx_response *rx = &rinfo->rx; struct netif_extra_info *extras = rinfo->extras; struct mbuf *m, *m0, *m_prev; grant_ref_t ref = xn_get_rx_ref(rxq, *cons); int frags = 1; int err = 0; u_long ret __diagused; m0 = m = m_prev = xn_get_rx_mbuf(rxq, *cons); if (rx->flags & NETRXF_extra_info) { err = xn_get_extras(rxq, extras, rp, cons); } if (m0 != NULL) { m0->m_pkthdr.len = 0; m0->m_next = NULL; } for (;;) { #if 0 DPRINTK("rx->status=%hd rx->offset=%hu frags=%u\n", rx->status, rx->offset, frags); #endif if (__predict_false(rx->status < 0 || rx->offset + rx->status > PAGE_SIZE)) { xn_move_rx_slot(rxq, m, ref); if (m0 == m) m0 = NULL; m = NULL; err = EINVAL; goto next_skip_queue; } /* * This definitely indicates a bug, either in this driver or in * the backend driver. In future this should flag the bad * situation to the system controller to reboot the backed. */ if (ref == GRANT_REF_INVALID) { printf("%s: Bad rx response id %d.\n", __func__, rx->id); err = EINVAL; goto next; } ret = gnttab_end_foreign_access_ref(ref); KASSERT(ret, ("Unable to end access to grant references")); gnttab_release_grant_reference(&rxq->gref_head, ref); next: if (m == NULL) break; m->m_len = rx->status; m->m_data += rx->offset; m0->m_pkthdr.len += rx->status; next_skip_queue: if (!(rx->flags & NETRXF_more_data)) break; if (*cons + frags == rp) { if (net_ratelimit()) WPRINTK("Need more frags\n"); err = ENOENT; printf("%s: cons %u frags %u rp %u, not enough frags\n", __func__, *cons, frags, rp); break; } /* * Note that m can be NULL, if rx->status < 0 or if * rx->offset + rx->status > PAGE_SIZE above. */ m_prev = m; rx = RING_GET_RESPONSE(&rxq->ring, *cons + frags); m = xn_get_rx_mbuf(rxq, *cons + frags); /* * m_prev == NULL can happen if rx->status < 0 or if * rx->offset + * rx->status > PAGE_SIZE above. 
*/ if (m_prev != NULL) m_prev->m_next = m; /* * m0 can be NULL if rx->status < 0 or if * rx->offset + * rx->status > PAGE_SIZE above. */ if (m0 == NULL) m0 = m; m->m_next = NULL; ref = xn_get_rx_ref(rxq, *cons + frags); frags++; } *list = m0; *cons += frags; return (err); } /** * \brief Count the number of fragments in an mbuf chain. * * Surprisingly, there isn't an M* macro for this. */ static inline int xn_count_frags(struct mbuf *m) { int nfrags; for (nfrags = 0; m != NULL; m = m->m_next) nfrags++; return (nfrags); } /** * Given an mbuf chain, make sure we have enough room and then push * it onto the transmit ring. */ static int xn_assemble_tx_request(struct netfront_txq *txq, struct mbuf *m_head) { struct mbuf *m; struct netfront_info *np = txq->info; struct ifnet *ifp = np->xn_ifp; u_int nfrags; int otherend_id; /** * Defragment the mbuf if necessary. */ nfrags = xn_count_frags(m_head); /* * Check to see whether this request is longer than netback * can handle, and try to defrag it. */ /** * It is a bit lame, but the netback driver in Linux can't * deal with nfrags > MAX_TX_REQ_FRAGS, which is a quirk of * the Linux network stack. */ if (nfrags > np->maxfrags) { m = m_defrag(m_head, M_NOWAIT); if (!m) { /* * Defrag failed, so free the mbuf and * therefore drop the packet. */ m_freem(m_head); return (EMSGSIZE); } m_head = m; } /* Determine how many fragments now exist */ nfrags = xn_count_frags(m_head); /* * Check to see whether the defragmented packet has too many * segments for the Linux netback driver. */ /** * The FreeBSD TCP stack, with TSO enabled, can produce a chain * of mbufs longer than Linux can handle. Make sure we don't * pass a too-long chain over to the other side by dropping the * packet. It doesn't look like there is currently a way to * tell the TCP stack to generate a shorter chain of packets. */ if (nfrags > MAX_TX_REQ_FRAGS) { #ifdef DEBUG printf("%s: nfrags %d > MAX_TX_REQ_FRAGS %d, netback " "won't be able to handle it, dropping\n", __func__, nfrags, MAX_TX_REQ_FRAGS); #endif m_freem(m_head); return (EMSGSIZE); } /* * This check should be redundant. We've already verified that we * have enough slots in the ring to handle a packet of maximum * size, and that our packet is less than the maximum size. Keep * it in here as an assert for now just to make certain that * chain_cnt is accurate. */ KASSERT((txq->mbufs_cnt + nfrags) <= NET_TX_RING_SIZE, ("%s: chain_cnt (%d) + nfrags (%d) > NET_TX_RING_SIZE " "(%d)!", __func__, (int) txq->mbufs_cnt, (int) nfrags, (int) NET_TX_RING_SIZE)); /* * Start packing the mbufs in this chain into * the fragment pointers. Stop when we run out * of fragments or hit the end of the mbuf chain. */ m = m_head; otherend_id = xenbus_get_otherend_id(np->xbdev); for (m = m_head; m; m = m->m_next) { netif_tx_request_t *tx; uintptr_t id; grant_ref_t ref; u_long mfn; /* XXX Wrong type? 
*/ tx = RING_GET_REQUEST(&txq->ring, txq->ring.req_prod_pvt); id = get_id_from_freelist(txq->mbufs); if (id == 0) panic("%s: was allocated the freelist head!\n", __func__); txq->mbufs_cnt++; if (txq->mbufs_cnt > NET_TX_RING_SIZE) panic("%s: tx_chain_cnt must be <= NET_TX_RING_SIZE\n", __func__); txq->mbufs[id] = m; tx->id = id; ref = gnttab_claim_grant_reference(&txq->gref_head); KASSERT((short)ref >= 0, ("Negative ref")); mfn = virt_to_mfn(mtod(m, vm_offset_t)); gnttab_grant_foreign_access_ref(ref, otherend_id, mfn, GNTMAP_readonly); tx->gref = txq->grant_ref[id] = ref; tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1); tx->flags = 0; if (m == m_head) { /* * The first fragment has the entire packet * size, subsequent fragments have just the * fragment size. The backend works out the * true size of the first fragment by * subtracting the sizes of the other * fragments. */ tx->size = m->m_pkthdr.len; /* * The first fragment contains the checksum flags * and is optionally followed by extra data for * TSO etc. */ /** * CSUM_TSO requires checksum offloading. * Some versions of FreeBSD fail to * set CSUM_TCP in the CSUM_TSO case, * so we have to test for CSUM_TSO * explicitly. */ if (m->m_pkthdr.csum_flags & (CSUM_DELAY_DATA | CSUM_TSO)) { tx->flags |= (NETTXF_csum_blank | NETTXF_data_validated); } if (m->m_pkthdr.csum_flags & CSUM_TSO) { struct netif_extra_info *gso = (struct netif_extra_info *) RING_GET_REQUEST(&txq->ring, ++txq->ring.req_prod_pvt); tx->flags |= NETTXF_extra_info; gso->u.gso.size = m->m_pkthdr.tso_segsz; gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; gso->u.gso.pad = 0; gso->u.gso.features = 0; gso->type = XEN_NETIF_EXTRA_TYPE_GSO; gso->flags = 0; } } else { tx->size = m->m_len; } if (m->m_next) tx->flags |= NETTXF_more_data; txq->ring.req_prod_pvt++; } BPF_MTAP(ifp, m_head); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, m_head->m_pkthdr.len); if (m_head->m_flags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); xn_txeof(txq); return (0); } /* equivalent of network_open() in Linux */ static void xn_ifinit_locked(struct netfront_info *np) { struct ifnet *ifp; int i; struct netfront_rxq *rxq; XN_LOCK_ASSERT(np); ifp = np->xn_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING || !netfront_carrier_ok(np)) return; xn_stop(np); for (i = 0; i < np->num_queues; i++) { rxq = &np->rxq[i]; XN_RX_LOCK(rxq); xn_alloc_rx_buffers(rxq); rxq->ring.sring->rsp_event = rxq->ring.rsp_cons + 1; if (RING_HAS_UNCONSUMED_RESPONSES(&rxq->ring)) xn_rxeof(rxq); XN_RX_UNLOCK(rxq); } ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; if_link_state_change(ifp, LINK_STATE_UP); } static void xn_ifinit(void *xsc) { struct netfront_info *sc = xsc; XN_LOCK(sc); xn_ifinit_locked(sc); XN_UNLOCK(sc); } static int xn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct netfront_info *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *) data; device_t dev; #ifdef INET struct ifaddr *ifa = (struct ifaddr *)data; #endif int mask, error = 0, reinit; dev = sc->xbdev; switch(cmd) { case SIOCSIFADDR: #ifdef INET XN_LOCK(sc); if (ifa->ifa_addr->sa_family == AF_INET) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) xn_ifinit_locked(sc); arp_ifinit(ifp, ifa); XN_UNLOCK(sc); } else { XN_UNLOCK(sc); #endif error = ether_ioctl(ifp, cmd, data); #ifdef INET } #endif break; case SIOCSIFMTU: if (ifp->if_mtu == ifr->ifr_mtu) break; ifp->if_mtu = ifr->ifr_mtu; ifp->if_drv_flags &= ~IFF_DRV_RUNNING; xn_ifinit(sc); break; case SIOCSIFFLAGS: 
XN_LOCK(sc); if (ifp->if_flags & IFF_UP) { /* * If only the state of the PROMISC flag changed, * then just use the 'set promisc mode' command * instead of reinitializing the entire NIC. Doing * a full re-init means reloading the firmware and * waiting for it to start up, which may take a * second or two. */ xn_ifinit_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { xn_stop(sc); } } sc->xn_if_flags = ifp->if_flags; XN_UNLOCK(sc); break; case SIOCSIFCAP: mask = ifr->ifr_reqcap ^ ifp->if_capenable; reinit = 0; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; ifp->if_hwassist ^= XN_CSUM_FEATURES; } if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; ifp->if_hwassist ^= CSUM_TSO; } if (mask & (IFCAP_RXCSUM | IFCAP_LRO)) { /* These Rx features require us to renegotiate. */ reinit = 1; if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; } if (reinit == 0) break; /* * We must reset the interface so the backend picks up the * new features. */ device_printf(sc->xbdev, "performing interface reset due to feature change\n"); XN_LOCK(sc); netfront_carrier_off(sc); sc->xn_reset = true; /* * NB: the pending packet queue is not flushed, since * the interface should still support the old options. */ XN_UNLOCK(sc); /* * Delete the xenstore nodes that export features. * * NB: There's a xenbus state called * "XenbusStateReconfiguring", which is what we should set * here. Sadly none of the backends know how to handle it, * and simply disconnect from the frontend, so we will just * switch back to XenbusStateInitialising in order to force * a reconnection. */ xs_rm(XST_NIL, xenbus_get_node(dev), "feature-gso-tcpv4"); xs_rm(XST_NIL, xenbus_get_node(dev), "feature-no-csum-offload"); xenbus_set_state(dev, XenbusStateClosing); /* * Wait for the frontend to reconnect before returning * from the ioctl. 30s should be more than enough for any * sane backend to reconnect. */ error = tsleep(sc, 0, "xn_rst", 30*hz); break; case SIOCADDMULTI: case SIOCDELMULTI: break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); } return (error); } static void xn_stop(struct netfront_info *sc) { struct ifnet *ifp; XN_LOCK_ASSERT(sc); ifp = sc->xn_ifp; ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); if_link_state_change(ifp, LINK_STATE_DOWN); } static void xn_rebuild_rx_bufs(struct netfront_rxq *rxq) { int requeue_idx, i; grant_ref_t ref; netif_rx_request_t *req; for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) { struct mbuf *m; u_long pfn; if (rxq->mbufs[i] == NULL) continue; m = rxq->mbufs[requeue_idx] = xn_get_rx_mbuf(rxq, i); ref = rxq->grant_ref[requeue_idx] = xn_get_rx_ref(rxq, i); req = RING_GET_REQUEST(&rxq->ring, requeue_idx); pfn = vtophys(mtod(m, vm_offset_t)) >> PAGE_SHIFT; gnttab_grant_foreign_access_ref(ref, xenbus_get_otherend_id(rxq->info->xbdev), pfn, 0); req->gref = ref; req->id = requeue_idx; requeue_idx++; } rxq->ring.req_prod_pvt = requeue_idx; } /* START of Xenolinux helper functions adapted to FreeBSD */ static int xn_connect(struct netfront_info *np) { int i, error; u_int feature_rx_copy; struct netfront_rxq *rxq; struct netfront_txq *txq; error = xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-rx-copy", NULL, "%u", &feature_rx_copy); if (error != 0) feature_rx_copy = 0; /* We only support rx copy. 
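	 * "rx copy" means the backend grant-copies received data into
	 * buffers granted by this frontend; the older page-flipping
	 * transfer mode is not implemented, so a backend without
	 * feature-rx-copy is rejected below with EPROTONOSUPPORT.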
*/ if (!feature_rx_copy) return (EPROTONOSUPPORT); /* Recovery procedure: */ error = talk_to_backend(np->xbdev, np); if (error != 0) return (error); /* Step 1: Reinitialise variables. */ xn_query_features(np); xn_configure_features(np); /* Step 2: Release TX buffer */ for (i = 0; i < np->num_queues; i++) { txq = &np->txq[i]; xn_release_tx_bufs(txq); } /* Step 3: Rebuild the RX buffer freelist and the RX ring itself. */ for (i = 0; i < np->num_queues; i++) { rxq = &np->rxq[i]; xn_rebuild_rx_bufs(rxq); } /* Step 4: All public and private state should now be sane. Get * ready to start sending and receiving packets and give the driver * domain a kick because we've probably just requeued some * packets. */ netfront_carrier_on(np); wakeup(np); return (0); } static void xn_kick_rings(struct netfront_info *np) { struct netfront_rxq *rxq; struct netfront_txq *txq; int i; for (i = 0; i < np->num_queues; i++) { txq = &np->txq[i]; rxq = &np->rxq[i]; xen_intr_signal(txq->xen_intr_handle); XN_TX_LOCK(txq); xn_txeof(txq); XN_TX_UNLOCK(txq); XN_RX_LOCK(rxq); xn_alloc_rx_buffers(rxq); XN_RX_UNLOCK(rxq); } } static void xn_query_features(struct netfront_info *np) { int val; device_printf(np->xbdev, "backend features:"); if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-sg", NULL, "%d", &val) != 0) val = 0; np->maxfrags = 1; if (val) { np->maxfrags = MAX_TX_REQ_FRAGS; printf(" feature-sg"); } if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-gso-tcpv4", NULL, "%d", &val) != 0) val = 0; np->xn_ifp->if_capabilities &= ~(IFCAP_TSO4|IFCAP_LRO); if (val) { np->xn_ifp->if_capabilities |= IFCAP_TSO4|IFCAP_LRO; printf(" feature-gso-tcp4"); } /* * HW CSUM offload is assumed to be available unless * feature-no-csum-offload is set in xenstore. */ if (xs_scanf(XST_NIL, xenbus_get_otherend_path(np->xbdev), "feature-no-csum-offload", NULL, "%d", &val) != 0) val = 0; np->xn_ifp->if_capabilities |= IFCAP_HWCSUM; if (val) { np->xn_ifp->if_capabilities &= ~(IFCAP_HWCSUM); printf(" feature-no-csum-offload"); } printf("\n"); } static int xn_configure_features(struct netfront_info *np) { int err, cap_enabled; #if (defined(INET) || defined(INET6)) int i; #endif struct ifnet *ifp; ifp = np->xn_ifp; err = 0; if ((ifp->if_capenable & ifp->if_capabilities) == ifp->if_capenable) { /* Current options are available, no need to do anything. */ return (0); } /* Try to preserve as many options as possible. 
*/ cap_enabled = ifp->if_capenable; ifp->if_capenable = ifp->if_hwassist = 0; #if (defined(INET) || defined(INET6)) if ((cap_enabled & IFCAP_LRO) != 0) for (i = 0; i < np->num_queues; i++) tcp_lro_free(&np->rxq[i].lro); if (xn_enable_lro && (ifp->if_capabilities & cap_enabled & IFCAP_LRO) != 0) { ifp->if_capenable |= IFCAP_LRO; for (i = 0; i < np->num_queues; i++) { err = tcp_lro_init(&np->rxq[i].lro); if (err != 0) { device_printf(np->xbdev, "LRO initialization failed\n"); ifp->if_capenable &= ~IFCAP_LRO; break; } np->rxq[i].lro.ifp = ifp; } } if ((ifp->if_capabilities & cap_enabled & IFCAP_TSO4) != 0) { ifp->if_capenable |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_TSO; } #endif if ((ifp->if_capabilities & cap_enabled & IFCAP_TXCSUM) != 0) { ifp->if_capenable |= IFCAP_TXCSUM; ifp->if_hwassist |= XN_CSUM_FEATURES; } if ((ifp->if_capabilities & cap_enabled & IFCAP_RXCSUM) != 0) ifp->if_capenable |= IFCAP_RXCSUM; return (err); } static int xn_txq_mq_start_locked(struct netfront_txq *txq, struct mbuf *m) { struct netfront_info *np; struct ifnet *ifp; struct buf_ring *br; int error, notify; np = txq->info; br = txq->br; ifp = np->xn_ifp; error = 0; XN_TX_LOCK_ASSERT(txq); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !netfront_carrier_ok(np)) { if (m != NULL) error = drbr_enqueue(ifp, br, m); return (error); } if (m != NULL) { error = drbr_enqueue(ifp, br, m); if (error != 0) return (error); } while ((m = drbr_peek(ifp, br)) != NULL) { if (!xn_tx_slot_available(txq)) { drbr_putback(ifp, br, m); break; } error = xn_assemble_tx_request(txq, m); /* xn_assemble_tx_request always consumes the mbuf*/ if (error != 0) { drbr_advance(ifp, br); break; } RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&txq->ring, notify); if (notify) xen_intr_signal(txq->xen_intr_handle); drbr_advance(ifp, br); } if (RING_FULL(&txq->ring)) txq->full = true; return (0); } static int xn_txq_mq_start(struct ifnet *ifp, struct mbuf *m) { struct netfront_info *np; struct netfront_txq *txq; int i, npairs, error; np = ifp->if_softc; npairs = np->num_queues; if (!netfront_carrier_ok(np)) return (ENOBUFS); KASSERT(npairs != 0, ("called with 0 available queues")); /* check if flowid is set */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) i = m->m_pkthdr.flowid % npairs; else i = curcpu % npairs; txq = &np->txq[i]; if (XN_TX_TRYLOCK(txq) != 0) { error = xn_txq_mq_start_locked(txq, m); XN_TX_UNLOCK(txq); } else { error = drbr_enqueue(ifp, txq->br, m); taskqueue_enqueue(txq->tq, &txq->defrtask); } return (error); } static void xn_qflush(struct ifnet *ifp) { struct netfront_info *np; struct netfront_txq *txq; struct mbuf *m; int i; np = ifp->if_softc; for (i = 0; i < np->num_queues; i++) { txq = &np->txq[i]; XN_TX_LOCK(txq); while ((m = buf_ring_dequeue_sc(txq->br)) != NULL) m_freem(m); XN_TX_UNLOCK(txq); } if_qflush(ifp); } /** * Create a network device. * @param dev Newbus device representing this virtual NIC. 
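/*
 * Standalone illustration of the queue selection in xn_txq_mq_start()
 * above: a packet carrying a flow hash always lands on the same TX queue,
 * anything else falls back to the current CPU.  The sample values are
 * made up.
 */
#include <stdio.h>
#include <stdint.h>

static unsigned int
pick_txq(int has_flowid, uint32_t flowid, unsigned int curcpu,
    unsigned int nqueues)
{
	return ((has_flowid ? flowid : curcpu) % nqueues);
}

int
main(void)
{
	printf("flow 0x2a, 4 queues -> txq %u\n", pick_txq(1, 0x2a, 3, 4));
	printf("no flow, cpu 3, 4 queues -> txq %u\n", pick_txq(0, 0, 3, 4));
	return (0);
}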
*/ int create_netdev(device_t dev) { struct netfront_info *np; int err; struct ifnet *ifp; np = device_get_softc(dev); np->xbdev = dev; mtx_init(&np->sc_lock, "xnsc", "netfront softc lock", MTX_DEF); ifmedia_init(&np->sc_media, 0, xn_ifmedia_upd, xn_ifmedia_sts); ifmedia_add(&np->sc_media, IFM_ETHER|IFM_MANUAL, 0, NULL); ifmedia_set(&np->sc_media, IFM_ETHER|IFM_MANUAL); err = xen_net_read_mac(dev, np->mac); if (err != 0) goto error; /* Set up ifnet structure */ ifp = np->xn_ifp = if_alloc(IFT_ETHER); ifp->if_softc = np; if_initname(ifp, "xn", device_get_unit(dev)); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = xn_ioctl; ifp->if_transmit = xn_txq_mq_start; ifp->if_qflush = xn_qflush; ifp->if_init = xn_ifinit; ifp->if_hwassist = XN_CSUM_FEATURES; /* Enable all supported features at device creation. */ ifp->if_capenable = ifp->if_capabilities = IFCAP_HWCSUM|IFCAP_TSO4|IFCAP_LRO; ifp->if_hw_tsomax = 65536 - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); ifp->if_hw_tsomaxsegcount = MAX_TX_REQ_FRAGS; ifp->if_hw_tsomaxsegsize = PAGE_SIZE; ether_ifattach(ifp, np->mac); netfront_carrier_off(np); return (0); error: KASSERT(err != 0, ("Error path with no error code specified")); return (err); } static int netfront_detach(device_t dev) { struct netfront_info *info = device_get_softc(dev); DPRINTK("%s\n", xenbus_get_node(dev)); netif_free(info); return 0; } static void netif_free(struct netfront_info *np) { XN_LOCK(np); xn_stop(np); XN_UNLOCK(np); netif_disconnect_backend(np); ether_ifdetach(np->xn_ifp); free(np->rxq, M_DEVBUF); free(np->txq, M_DEVBUF); if_free(np->xn_ifp); np->xn_ifp = NULL; ifmedia_removeall(&np->sc_media); } static void netif_disconnect_backend(struct netfront_info *np) { u_int i; for (i = 0; i < np->num_queues; i++) { XN_RX_LOCK(&np->rxq[i]); XN_TX_LOCK(&np->txq[i]); } netfront_carrier_off(np); for (i = 0; i < np->num_queues; i++) { XN_RX_UNLOCK(&np->rxq[i]); XN_TX_UNLOCK(&np->txq[i]); } for (i = 0; i < np->num_queues; i++) { disconnect_rxq(&np->rxq[i]); disconnect_txq(&np->txq[i]); } } static int xn_ifmedia_upd(struct ifnet *ifp) { return (0); } static void xn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { ifmr->ifm_status = IFM_AVALID|IFM_ACTIVE; ifmr->ifm_active = IFM_ETHER|IFM_MANUAL; } /* ** Driver registration ** */ static device_method_t netfront_methods[] = { /* Device interface */ DEVMETHOD(device_probe, netfront_probe), DEVMETHOD(device_attach, netfront_attach), DEVMETHOD(device_detach, netfront_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, netfront_suspend), DEVMETHOD(device_resume, netfront_resume), /* Xenbus interface */ DEVMETHOD(xenbus_otherend_changed, netfront_backend_changed), DEVMETHOD_END }; static driver_t netfront_driver = { "xn", netfront_methods, sizeof(struct netfront_info), }; devclass_t netfront_devclass; DRIVER_MODULE(xe, xenbusb_front, netfront_driver, netfront_devclass, NULL, NULL); diff --git a/sys/dev/xen/timer/timer.c b/sys/dev/xen/timer/timer.c index ecd4e0990652..fb646d8f0536 100644 --- a/sys/dev/xen/timer/timer.c +++ b/sys/dev/xen/timer/timer.c @@ -1,559 +1,559 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2009 Adrian Chadd * Copyright (c) 2012 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /** * \file dev/xen/timer/timer.c * \brief A timer driver for the Xen hypervisor's PV clock. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include -#include +#include +#include #include #include #include #include #include #include #include #include #include "clock_if.h" static devclass_t xentimer_devclass; #define NSEC_IN_SEC 1000000000ULL #define NSEC_IN_USEC 1000ULL /* 18446744073 = int(2^64 / NSEC_IN_SC) = 1 ns in 64-bit fractions */ #define FRAC_IN_NSEC 18446744073LL /* Xen timers may fire up to 100us off */ #define XENTIMER_MIN_PERIOD_IN_NSEC 100*NSEC_IN_USEC /* * The real resolution of the PV clock is 1ns, but the highest * resolution that FreeBSD supports is 1us, so just use that. */ #define XENCLOCK_RESOLUTION 1 #define XENTIMER_QUALITY 950 struct xentimer_pcpu_data { uint64_t timer; uint64_t last_processed; void *irq_handle; }; DPCPU_DEFINE(struct xentimer_pcpu_data, xentimer_pcpu); DPCPU_DECLARE(struct vcpu_info *, vcpu_info); struct xentimer_softc { device_t dev; struct timecounter tc; struct eventtimer et; }; static void xentimer_identify(driver_t *driver, device_t parent) { if (!xen_domain()) return; /* Handle all Xen PV timers in one device instance. */ if (devclass_get_device(xentimer_devclass, 0)) return; BUS_ADD_CHILD(parent, 0, "xen_et", 0); } static int xentimer_probe(device_t dev) { KASSERT((xen_domain()), ("Trying to use Xen timer on bare metal")); /* * In order to attach, this driver requires the following: * - Vector callback support by the hypervisor, in order to deliver * timer interrupts to the correct CPU for CPUs other than 0. * - Access to the hypervisor shared info page, in order to look up * each VCPU's timer information and the Xen wallclock time. * - The hypervisor must say its PV clock is "safe" to use. * - The hypervisor must support VCPUOP hypercalls. * - The maximum number of CPUs supported by FreeBSD must not exceed * the number of VCPUs supported by the hypervisor. */ #define XTREQUIRES(condition, reason...) 
\ if (!(condition)) { \ device_printf(dev, ## reason); \ device_detach(dev); \ return (ENXIO); \ } if (xen_hvm_domain()) { XTREQUIRES(xen_vector_callback_enabled, "vector callbacks unavailable\n"); XTREQUIRES(xen_feature(XENFEAT_hvm_safe_pvclock), "HVM safe pvclock unavailable\n"); } XTREQUIRES(HYPERVISOR_shared_info != NULL, "shared info page unavailable\n"); XTREQUIRES(HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, 0, NULL) == 0, "VCPUOPs interface unavailable\n"); #undef XTREQUIRES device_set_desc(dev, "Xen PV Clock"); return (BUS_PROBE_NOWILDCARD); } /** * \brief Get the current time, in nanoseconds, since the hypervisor booted. * * \param vcpu vcpu_info structure to fetch the time from. * */ static uint64_t xen_fetch_vcpu_time(struct vcpu_info *vcpu) { struct pvclock_vcpu_time_info *time; time = (struct pvclock_vcpu_time_info *) &vcpu->time; return (pvclock_get_timecount(time)); } static uint32_t xentimer_get_timecount(struct timecounter *tc) { uint64_t vcpu_time; /* * We don't disable preemption here because the worst that can * happen is reading the vcpu_info area of a different CPU than * the one we are currently running on, but that would also * return a valid tc (and we avoid the overhead of * critical_{enter/exit} calls). */ vcpu_time = xen_fetch_vcpu_time(DPCPU_GET(vcpu_info)); return (vcpu_time & UINT32_MAX); } /** * \brief Fetch the hypervisor boot time, known as the "Xen wallclock". * * \param ts Timespec to store the current stable value. * \param version Pointer to store the corresponding wallclock version. * * \note This value is updated when Domain-0 shifts its clock to follow * clock drift, e.g. as detected by NTP. */ static void xen_fetch_wallclock(struct timespec *ts) { shared_info_t *src = HYPERVISOR_shared_info; struct pvclock_wall_clock *wc; wc = (struct pvclock_wall_clock *) &src->wc_version; pvclock_get_wallclock(wc, ts); } static void xen_fetch_uptime(struct timespec *ts) { uint64_t uptime; uptime = xen_fetch_vcpu_time(DPCPU_GET(vcpu_info)); ts->tv_sec = uptime / NSEC_IN_SEC; ts->tv_nsec = uptime % NSEC_IN_SEC; } static int xentimer_settime(device_t dev __unused, struct timespec *ts) { struct xen_platform_op settime; int ret; /* * Don't return EINVAL here; just silently fail if the domain isn't * privileged enough to set the TOD. */ if (!xen_initial_domain()) return (0); settime.cmd = XENPF_settime64; settime.u.settime64.mbz = 0; settime.u.settime64.secs = ts->tv_sec; settime.u.settime64.nsecs = ts->tv_nsec; settime.u.settime64.system_time = xen_fetch_vcpu_time(DPCPU_GET(vcpu_info)); ret = HYPERVISOR_platform_op(&settime); ret = ret != 0 ? xen_translate_error(ret) : 0; if (ret != 0 && bootverbose) device_printf(dev, "failed to set Xen PV clock: %d\n", ret); return (ret); } /** * \brief Return current time according to the Xen Hypervisor wallclock. * * \param dev Xentimer device. * \param ts Pointer to store the wallclock time. * * \note The Xen time structures document the hypervisor start time and the * uptime-since-hypervisor-start (in nsec.) They need to be combined * in order to calculate a TOD clock. */ static int xentimer_gettime(device_t dev, struct timespec *ts) { struct timespec u_ts; timespecclear(ts); xen_fetch_wallclock(ts); xen_fetch_uptime(&u_ts); timespecadd(ts, &u_ts, ts); return (0); } /** * \brief Handle a timer interrupt for the Xen PV timer driver. * * \param arg Xen timer driver softc that is expecting the interrupt. 
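/*
 * Standalone sketch of the composition performed by xentimer_gettime()
 * above: the Xen wallclock (hypervisor boot time) plus the per-VCPU system
 * time (nanoseconds since boot) yields the time of day.  The sample values
 * are made up.
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_IN_SEC 1000000000ULL

struct simple_ts { int64_t sec; long nsec; };

static struct simple_ts
ts_add(struct simple_ts a, struct simple_ts b)
{
	struct simple_ts r = { a.sec + b.sec, a.nsec + b.nsec };

	if (r.nsec >= (long)NSEC_IN_SEC) {	/* carry the fraction */
		r.sec++;
		r.nsec -= NSEC_IN_SEC;
	}
	return (r);
}

int
main(void)
{
	uint64_t uptime_ns = 123456789012ULL;			/* made-up uptime */
	struct simple_ts wall = { 1600000000, 500000000 };	/* made-up wallclock */
	struct simple_ts up = { (int64_t)(uptime_ns / NSEC_IN_SEC),
	    (long)(uptime_ns % NSEC_IN_SEC) };
	struct simple_ts tod = ts_add(wall, up);

	printf("TOD = %lld.%09ld\n", (long long)tod.sec, tod.nsec);
	return (0);
}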
*/ static int xentimer_intr(void *arg) { struct xentimer_softc *sc = (struct xentimer_softc *)arg; struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu); pcpu->last_processed = xen_fetch_vcpu_time(DPCPU_GET(vcpu_info)); if (pcpu->timer != 0 && sc->et.et_active) sc->et.et_event_cb(&sc->et, sc->et.et_arg); return (FILTER_HANDLED); } static int xentimer_vcpu_start_timer(int vcpu, uint64_t next_time) { struct vcpu_set_singleshot_timer single; single.timeout_abs_ns = next_time; /* Get an event anyway, even if the timeout is already expired */ single.flags = 0; return (HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, vcpu, &single)); } static int xentimer_vcpu_stop_timer(int vcpu) { return (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, vcpu, NULL)); } /** * \brief Set the next oneshot time for the current CPU. * * \param et Xen timer driver event timer to schedule on. * \param first Delta to the next time to schedule the interrupt for. * \param period Not used. * * \note See eventtimers(9) for more information. * \note * * \returns 0 */ static int xentimer_et_start(struct eventtimer *et, sbintime_t first, sbintime_t period) { int error; struct xentimer_softc *sc = et->et_priv; int cpu = PCPU_GET(vcpu_id); struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu); struct vcpu_info *vcpu = DPCPU_GET(vcpu_info); uint64_t first_in_ns, next_time; #ifdef INVARIANTS struct thread *td = curthread; #endif KASSERT(td->td_critnest != 0, ("xentimer_et_start called without preemption disabled")); /* See sbttots() for this formula. */ first_in_ns = (((first >> 32) * NSEC_IN_SEC) + (((uint64_t)NSEC_IN_SEC * (uint32_t)first) >> 32)); next_time = xen_fetch_vcpu_time(vcpu) + first_in_ns; error = xentimer_vcpu_start_timer(cpu, next_time); if (error) panic("%s: Error %d setting singleshot timer to %"PRIu64"\n", device_get_nameunit(sc->dev), error, next_time); pcpu->timer = next_time; return (error); } /** * \brief Cancel the event timer's currently running timer, if any. */ static int xentimer_et_stop(struct eventtimer *et) { int cpu = PCPU_GET(vcpu_id); struct xentimer_pcpu_data *pcpu = DPCPU_PTR(xentimer_pcpu); pcpu->timer = 0; return (xentimer_vcpu_stop_timer(cpu)); } /** * \brief Attach a Xen PV timer driver instance. * * \param dev Bus device object to attach. * * \note * \returns EINVAL */ static int xentimer_attach(device_t dev) { struct xentimer_softc *sc = device_get_softc(dev); int error, i; sc->dev = dev; /* Bind an event channel to a VIRQ on each VCPU. */ CPU_FOREACH(i) { struct xentimer_pcpu_data *pcpu; pcpu = DPCPU_ID_PTR(i, xentimer_pcpu); error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, i, NULL); if (error) { device_printf(dev, "Error disabling Xen periodic timer " "on CPU %d\n", i); return (error); } error = xen_intr_bind_virq(dev, VIRQ_TIMER, i, xentimer_intr, NULL, sc, INTR_TYPE_CLK, &pcpu->irq_handle); if (error) { device_printf(dev, "Error %d binding VIRQ_TIMER " "to VCPU %d\n", error, i); return (error); } xen_intr_describe(pcpu->irq_handle, "c%d", i); } /* Register the event timer. */ sc->et.et_name = "XENTIMER"; sc->et.et_quality = XENTIMER_QUALITY; sc->et.et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU; sc->et.et_frequency = NSEC_IN_SEC; /* See tstosbt() for this formula */ sc->et.et_min_period = (XENTIMER_MIN_PERIOD_IN_NSEC * (((uint64_t)1 << 63) / 500000000) >> 32); sc->et.et_max_period = ((sbintime_t)4 << 32); sc->et.et_start = xentimer_et_start; sc->et.et_stop = xentimer_et_stop; sc->et.et_priv = sc; et_register(&sc->et); /* Register the timecounter. 
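/*
 * Standalone sketch of the 32.32 fixed-point (sbintime_t) arithmetic used
 * by xentimer_et_start() and for et_min_period above: the upper 32 bits
 * are whole seconds, the lower 32 bits are a binary fraction of a second,
 * and ((1 << 63) / 500000000) is the same 2^64/10^9 factor as the
 * FRAC_IN_NSEC constant (18446744073).
 */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define NSEC_IN_SEC 1000000000ULL

/* sbintime -> nanoseconds, same shape as the formula in xentimer_et_start(). */
static uint64_t
sbt_to_ns(int64_t sbt)
{
	return (((uint64_t)(sbt >> 32) * NSEC_IN_SEC) +
	    ((NSEC_IN_SEC * (uint32_t)sbt) >> 32));
}

/* nanoseconds -> sbintime, same shape as the et_min_period computation. */
static int64_t
ns_to_sbt(uint64_t ns)
{
	return ((int64_t)((ns * (((uint64_t)1 << 63) / 500000000)) >> 32));
}

int
main(void)
{
	int64_t quarter_second = (int64_t)1 << 30;	/* 0.25 s in 32.32 */

	printf("0.25 s -> %" PRIu64 " ns\n", sbt_to_ns(quarter_second));
	printf("100 us -> %" PRId64 " sbt\n", ns_to_sbt(100000));
	return (0);
}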
*/ sc->tc.tc_name = "XENTIMER"; sc->tc.tc_quality = XENTIMER_QUALITY; /* * FIXME: due to the lack of ordering during resume, FreeBSD cannot * guarantee that the Xen PV timer is resumed before any other device * attempts to make use of it, so mark it as not safe for suspension * (ie: remove the TC_FLAGS_SUSPEND_SAFE flag). * * NB: This was not a problem in previous FreeBSD versions because the * timer was directly attached to the nexus, but it is an issue now * that the timer is attached to the xenpv bus, and thus resumed * later. * * sc->tc.tc_flags = TC_FLAGS_SUSPEND_SAFE; */ /* * The underlying resolution is in nanoseconds, since the timer info * scales TSC frequencies using a fraction that represents time in * terms of nanoseconds. */ sc->tc.tc_frequency = NSEC_IN_SEC; sc->tc.tc_counter_mask = ~0u; sc->tc.tc_get_timecount = xentimer_get_timecount; sc->tc.tc_priv = sc; tc_init(&sc->tc); /* Register the Hypervisor wall clock */ clock_register(dev, XENCLOCK_RESOLUTION); return (0); } static int xentimer_detach(device_t dev) { /* Implement Xen PV clock teardown - XXX see hpet_detach ? */ /* If possible: * 1. need to deregister timecounter * 2. need to deregister event timer * 3. need to deregister virtual IRQ event channels */ return (EBUSY); } static void xentimer_percpu_resume(void *arg) { device_t dev = (device_t) arg; struct xentimer_softc *sc = device_get_softc(dev); xentimer_et_start(&sc->et, sc->et.et_min_period, 0); } static int xentimer_resume(device_t dev) { int error; int i; /* Disable the periodic timer */ CPU_FOREACH(i) { error = HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, i, NULL); if (error != 0) { device_printf(dev, "Error disabling Xen periodic timer on CPU %d\n", i); return (error); } } /* Reset the last uptime value */ pvclock_resume(); /* Reset the RTC clock */ inittodr(time_second); /* Kick the timers on all CPUs */ smp_rendezvous(NULL, xentimer_percpu_resume, NULL, dev); if (bootverbose) device_printf(dev, "resumed operation after suspension\n"); return (0); } static int xentimer_suspend(device_t dev) { return (0); } /* * Xen early clock init */ void xen_clock_init(void) { } /* * Xen PV DELAY function * * When running on PVH mode we don't have an emulated i8524, so * make use of the Xen time info in order to code a simple DELAY * function that can be used during early boot. 
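/*
 * Standalone sketch: the timecounter registered above is 32 bits wide
 * (tc_counter_mask = ~0u) and runs at 1 GHz, so it wraps roughly every
 * 4.29 seconds; timecounter consumers only use masked differences, which
 * stay correct across a wrap as long as sampling is more frequent than the
 * wrap period.  The sample counter values are made up.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t mask = ~0u;
	uint32_t before = 0xfffffff0u;	/* just before the counter wraps */
	uint32_t after = 0x00000010u;	/* just after the wrap */
	uint32_t delta = (after - before) & mask;

	printf("wrap period ~%.2f s\n", ((double)mask + 1.0) / 1e9);
	printf("delta across the wrap = %u ns\n", delta);	/* 32 ns */
	return (0);
}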
*/ void xen_delay(int n) { struct vcpu_info *vcpu = &HYPERVISOR_shared_info->vcpu_info[0]; uint64_t end_ns; uint64_t current; end_ns = xen_fetch_vcpu_time(vcpu); end_ns += n * NSEC_IN_USEC; for (;;) { current = xen_fetch_vcpu_time(vcpu); if (current >= end_ns) break; } } static device_method_t xentimer_methods[] = { DEVMETHOD(device_identify, xentimer_identify), DEVMETHOD(device_probe, xentimer_probe), DEVMETHOD(device_attach, xentimer_attach), DEVMETHOD(device_detach, xentimer_detach), DEVMETHOD(device_suspend, xentimer_suspend), DEVMETHOD(device_resume, xentimer_resume), /* clock interface */ DEVMETHOD(clock_gettime, xentimer_gettime), DEVMETHOD(clock_settime, xentimer_settime), DEVMETHOD_END }; static driver_t xentimer_driver = { "xen_et", xentimer_methods, sizeof(struct xentimer_softc), }; DRIVER_MODULE(xentimer, xenpv, xentimer_driver, xentimer_devclass, 0, 0); MODULE_DEPEND(xentimer, xenpv, 1, 1, 1); diff --git a/sys/dev/xen/xenstore/xenstore.c b/sys/dev/xen/xenstore/xenstore.c index d09fd22f4eb1..534cb65ca152 100644 --- a/sys/dev/xen/xenstore/xenstore.c +++ b/sys/dev/xen/xenstore/xenstore.c @@ -1,1661 +1,1661 @@ /****************************************************************************** * xenstore.c * * Low-level kernel interface to the XenStore. * * Copyright (C) 2005 Rusty Russell, IBM Corporation * Copyright (C) 2009,2010 Spectra Logic Corporation * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include #include #include #include #include #include /** * \file xenstore.c * \brief XenStore interface * * The XenStore interface is a simple storage system that is a means of * communicating state and configuration data between the Xen Domain 0 * and the various guest domains. All configuration data other than * a small amount of essential information required during the early * boot process of launching a Xen aware guest, is managed using the * XenStore. * * The XenStore is ASCII string based, and has a structure and semantics * similar to a filesystem. There are files and directories, the directories * able to contain files or other directories. The depth of the hierarchy * is only limited by the XenStore's maximum path length. 
* * The communication channel between the XenStore service and other * domains is via two, guest specific, ring buffers in a shared memory * area. One ring buffer is used for communicating in each direction. * The grant table references for this shared memory are given to the * guest either via the xen_start_info structure for a fully para- * virtualized guest, or via HVM hypercalls for a hardware virtualized * guest. * * The XenStore communication relies on an event channel and thus * interrupts. For this reason, the attachment of the XenStore * relies on an interrupt driven configuration hook to hold off * boot processing until communication with the XenStore service * can be established. * * Several Xen services depend on the XenStore, most notably the * XenBus used to discover and manage Xen devices. These services * are implemented as NewBus child attachments to a bus exported * by this XenStore driver. */ static struct xs_watch *find_watch(const char *token); MALLOC_DEFINE(M_XENSTORE, "xenstore", "XenStore data and results"); /** * Pointer to shared memory communication structures allowing us * to communicate with the XenStore service. * * When operating in full PV mode, this pointer is set early in kernel * startup from within xen_machdep.c. In HVM mode, we use hypercalls * to get the guest frame number for the shared page and then map it * into kva. See xs_init() for details. */ static struct xenstore_domain_interface *xen_store; /*-------------------------- Private Data Structures ------------------------*/ /** * Structure capturing messages received from the XenStore service. */ struct xs_stored_msg { TAILQ_ENTRY(xs_stored_msg) list; struct xsd_sockmsg hdr; union { /* Queued replies. */ struct { char *body; } reply; /* Queued watch events. */ struct { struct xs_watch *handle; const char **vec; u_int vec_size; } watch; } u; }; TAILQ_HEAD(xs_stored_msg_list, xs_stored_msg); /** * Container for all XenStore related state. */ struct xs_softc { /** Newbus device for the XenStore. */ device_t xs_dev; /** * Lock serializing access to ring producer/consumer * indexes. Use of this lock guarantees that wakeups * of blocking readers/writers are not missed due to * races with the XenStore service. */ struct mtx ring_lock; /* * Mutex used to insure exclusive access to the outgoing * communication ring. We use a lock type that can be * held while sleeping so that xs_write() can block waiting * for space in the ring to free up, without allowing another * writer to come in and corrupt a partial message write. */ struct sx request_mutex; /** * A list of replies to our requests. * * The reply list is filled by xs_rcv_thread(). It * is consumed by the context that issued the request * to which a reply is made. The requester blocks in * xs_read_reply(). * * /note Only one requesting context can be active at a time. * This is guaranteed by the request_mutex and insures * that the requester sees replies matching the order * of its requests. */ struct xs_stored_msg_list reply_list; /** Lock protecting the reply list. */ struct mtx reply_lock; /** * List of registered watches. */ struct xs_watch_list registered_watches; /** Lock protecting the registered watches list. */ struct mtx registered_watches_lock; /** * List of pending watch callback events. */ struct xs_stored_msg_list watch_events; /** Lock protecting the watch calback list. */ struct mtx watch_events_lock; /** * The processid of the xenwatch thread. 
*/ pid_t xenwatch_pid; /** * Sleepable mutex used to gate the execution of XenStore * watch event callbacks. * * xenwatch_thread holds an exclusive lock on this mutex * while delivering event callbacks, and xenstore_unregister_watch() * uses an exclusive lock of this mutex to guarantee that no * callbacks of the just unregistered watch are pending * before returning to its caller. */ struct sx xenwatch_mutex; /** * The HVM guest pseudo-physical frame number. This is Xen's mapping * of the true machine frame number into our "physical address space". */ unsigned long gpfn; /** * The event channel for communicating with the * XenStore service. */ int evtchn; /** Handle for XenStore interrupts. */ xen_intr_handle_t xen_intr_handle; /** * Interrupt driven config hook allowing us to defer * attaching children until interrupts (and thus communication * with the XenStore service) are available. */ struct intr_config_hook xs_attachcb; /** * Xenstore is a user-space process that usually runs in Dom0, * so if this domain is booting as Dom0, xenstore wont we accessible, * and we have to defer the initialization of xenstore related * devices to later (when xenstore is started). */ bool initialized; /** * Task to run when xenstore is initialized (Dom0 only), will * take care of attaching xenstore related devices. */ struct task xs_late_init; }; /*-------------------------------- Global Data ------------------------------*/ static struct xs_softc xs; /*------------------------- Private Utility Functions -----------------------*/ /** * Count and optionally record pointers to a number of NUL terminated * strings in a buffer. * * \param strings A pointer to a contiguous buffer of NUL terminated strings. * \param dest An array to store pointers to each string found in strings. * \param len The length of the buffer pointed to by strings. * * \return A count of the number of strings found. */ static u_int extract_strings(const char *strings, const char **dest, u_int len) { u_int num; const char *p; for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1) { if (dest != NULL) *dest++ = p; num++; } return (num); } /** * Convert a contiguous buffer containing a series of NUL terminated * strings into an array of pointers to strings. * * The returned pointer references the array of string pointers which * is followed by the storage for the string data. It is the client's * responsibility to free this storage. * * The storage addressed by strings is free'd prior to split returning. * * \param strings A pointer to a contiguous buffer of NUL terminated strings. * \param len The length of the buffer pointed to by strings. * \param num The number of strings found and returned in the strings * array. * * \return An array of pointers to the strings found in the input buffer. */ static const char ** split(char *strings, u_int len, u_int *num) { const char **ret; /* Protect against unterminated buffers. */ if (len > 0) strings[len - 1] = '\0'; /* Count the strings. */ *num = extract_strings(strings, /*dest*/NULL, len); /* Transfer to one big alloc for easy freeing by the caller. */ ret = malloc(*num * sizeof(char *) + len, M_XENSTORE, M_WAITOK); memcpy(&ret[*num], strings, len); free(strings, M_XENSTORE); /* Extract pointers to newly allocated array. 
*/ strings = (char *)&ret[*num]; (void)extract_strings(strings, /*dest*/ret, len); return (ret); } /*------------------------- Public Utility Functions -------------------------*/ /*------- API comments for these methods can be found in xenstorevar.h -------*/ struct sbuf * xs_join(const char *dir, const char *name) { struct sbuf *sb; sb = sbuf_new_auto(); sbuf_cat(sb, dir); if (name[0] != '\0') { sbuf_putc(sb, '/'); sbuf_cat(sb, name); } sbuf_finish(sb); return (sb); } /*-------------------- Low Level Communication Management --------------------*/ /** * Interrupt handler for the XenStore event channel. * * XenStore reads and writes block on "xen_store" for buffer * space. Wakeup any blocking operations when the XenStore * service has modified the queues. */ static void xs_intr(void * arg __unused /*__attribute__((unused))*/) { /* If xenstore has not been initialized, initialize it now */ if (!xs.initialized) { xs.initialized = true; /* * Since this task is probing and attaching devices we * have to hold the Giant lock. */ taskqueue_enqueue(taskqueue_swi_giant, &xs.xs_late_init); } /* * Hold ring lock across wakeup so that clients * cannot miss a wakeup. */ mtx_lock(&xs.ring_lock); wakeup(xen_store); mtx_unlock(&xs.ring_lock); } /** * Verify that the indexes for a ring are valid. * * The difference between the producer and consumer cannot * exceed the size of the ring. * * \param cons The consumer index for the ring to test. * \param prod The producer index for the ring to test. * * \retval 1 If indexes are in range. * \retval 0 If the indexes are out of range. */ static int xs_check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod) { return ((prod - cons) <= XENSTORE_RING_SIZE); } /** * Return a pointer to, and the length of, the contiguous * free region available for output in a ring buffer. * * \param cons The consumer index for the ring. * \param prod The producer index for the ring. * \param buf The base address of the ring's storage. * \param len The amount of contiguous storage available. * * \return A pointer to the start location of the free region. */ static void * xs_get_output_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod, char *buf, uint32_t *len) { *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod); if ((XENSTORE_RING_SIZE - (prod - cons)) < *len) *len = XENSTORE_RING_SIZE - (prod - cons); return (buf + MASK_XENSTORE_IDX(prod)); } /** * Return a pointer to, and the length of, the contiguous * data available to read from a ring buffer. * * \param cons The consumer index for the ring. * \param prod The producer index for the ring. * \param buf The base address of the ring's storage. * \param len The amount of contiguous data available to read. * * \return A pointer to the start location of the available data. */ static const void * xs_get_input_chunk(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod, const char *buf, uint32_t *len) { *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons); if ((prod - cons) < *len) *len = prod - cons; return (buf + MASK_XENSTORE_IDX(cons)); } /** * Transmit data to the XenStore service. * * \param tdata A pointer to the contiguous data to send. * \param len The amount of data to send. * * \return On success 0, otherwise an errno value indicating the * cause of failure. * * \invariant Called from thread context. * \invariant The buffer pointed to by tdata is at least len bytes * in length. * \invariant xs.request_mutex exclusively locked. 
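/*
 * Standalone illustration of the single-allocation layout produced by
 * split() above: an array of pointers immediately followed by the string
 * bytes, so the caller releases everything with one free().  The sample
 * directory listing is made up.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	const char strings[] = "backend\0device\0control";	/* 3 entries */
	unsigned int len = sizeof(strings);
	unsigned int num = 0, i;
	const char *p;
	const char **ret;
	char *dst;

	for (p = strings; p < strings + len; p += strlen(p) + 1)
		num++;
	ret = malloc(num * sizeof(char *) + len);
	dst = (char *)&ret[num];		/* string data follows the pointers */
	memcpy(dst, strings, len);
	for (p = dst, i = 0; i < num; p += strlen(p) + 1, i++)
		ret[i] = p;
	for (i = 0; i < num; i++)
		printf("entry %u: %s\n", i, ret[i]);
	free(ret);				/* one allocation holds both parts */
	return (0);
}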
*/ static int xs_write_store(const void *tdata, unsigned len) { XENSTORE_RING_IDX cons, prod; const char *data = (const char *)tdata; int error; sx_assert(&xs.request_mutex, SX_XLOCKED); while (len != 0) { void *dst; u_int avail; /* Hold lock so we can't miss wakeups should we block. */ mtx_lock(&xs.ring_lock); cons = xen_store->req_cons; prod = xen_store->req_prod; if ((prod - cons) == XENSTORE_RING_SIZE) { /* * Output ring is full. Wait for a ring event. * * Note that the events from both queues * are combined, so being woken does not * guarantee that data exist in the read * ring. * * To simplify error recovery and the retry, * we specify PDROP so our lock is *not* held * when msleep returns. */ error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP, "xbwrite", /*timeout*/0); if (error && error != EWOULDBLOCK) return (error); /* Try again. */ continue; } mtx_unlock(&xs.ring_lock); /* Verify queue sanity. */ if (!xs_check_indexes(cons, prod)) { xen_store->req_cons = xen_store->req_prod = 0; return (EIO); } dst = xs_get_output_chunk(cons, prod, xen_store->req, &avail); if (avail > len) avail = len; memcpy(dst, data, avail); data += avail; len -= avail; /* * The store to the producer index, which indicates * to the other side that new data has arrived, must * be visible only after our copy of the data into the * ring has completed. */ wmb(); xen_store->req_prod += avail; /* * xen_intr_signal() implies mb(). The other side will see * the change to req_prod at the time of the interrupt. */ xen_intr_signal(xs.xen_intr_handle); } return (0); } /** * Receive data from the XenStore service. * * \param tdata A pointer to the contiguous buffer to receive the data. * \param len The amount of data to receive. * * \return On success 0, otherwise an errno value indicating the * cause of failure. * * \invariant Called from thread context. * \invariant The buffer pointed to by tdata is at least len bytes * in length. * * \note xs_read does not perform any internal locking to guarantee * serial access to the incoming ring buffer. However, there * is only one context processing reads: xs_rcv_thread(). */ static int xs_read_store(void *tdata, unsigned len) { XENSTORE_RING_IDX cons, prod; char *data = (char *)tdata; int error; while (len != 0) { u_int avail; const char *src; /* Hold lock so we can't miss wakeups should we block. */ mtx_lock(&xs.ring_lock); cons = xen_store->rsp_cons; prod = xen_store->rsp_prod; if (cons == prod) { /* * Nothing to read. Wait for a ring event. * * Note that the events from both queues * are combined, so being woken does not * guarantee that data exist in the read * ring. * * To simplify error recovery and the retry, * we specify PDROP so our lock is *not* held * when msleep returns. */ error = msleep(xen_store, &xs.ring_lock, PCATCH|PDROP, "xbread", /*timeout*/0); if (error && error != EWOULDBLOCK) return (error); continue; } mtx_unlock(&xs.ring_lock); /* Verify queue sanity. */ if (!xs_check_indexes(cons, prod)) { xen_store->rsp_cons = xen_store->rsp_prod = 0; return (EIO); } src = xs_get_input_chunk(cons, prod, xen_store->rsp, &avail); if (avail > len) avail = len; /* * Insure the data we read is related to the indexes * we read above. */ rmb(); memcpy(data, src, avail); data += avail; len -= avail; /* * Insure that the producer of this ring does not see * the ring space as free until after we have copied it * out. */ mb(); xen_store->rsp_cons += avail; /* * xen_intr_signal() implies mb(). The producer will see * the updated consumer index when the event is delivered. 
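/*
 * Standalone sketch of the free-running index arithmetic used by
 * xs_write_store()/xs_read_store() above: producer and consumer indices
 * only ever grow, MASK_XENSTORE_IDX() folds them into the ring, and a
 * chunk never extends past the end of the buffer.  The ring size matches
 * XENSTORE_RING_SIZE (1024); the sample indices are made up.
 */
#include <stdio.h>
#include <stdint.h>

#define RING_SIZE 1024u
#define MASK_IDX(i) ((i) & (RING_SIZE - 1))

static uint32_t
output_chunk(uint32_t cons, uint32_t prod, uint32_t *offset)
{
	uint32_t len = RING_SIZE - MASK_IDX(prod);	/* up to buffer end */

	if (RING_SIZE - (prod - cons) < len)		/* limited by free space */
		len = RING_SIZE - (prod - cons);
	*offset = MASK_IDX(prod);
	return (len);
}

int
main(void)
{
	uint32_t off, len;

	len = output_chunk(2000, 3000, &off);	/* indices past one wrap */
	printf("write %u bytes at offset %u\n", len, off);
	printf("indexes sane: %d\n", (3000u - 2000u) <= RING_SIZE);
	return (0);
}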
*/ xen_intr_signal(xs.xen_intr_handle); } return (0); } /*----------------------- Received Message Processing ------------------------*/ /** * Block reading the next message from the XenStore service and * process the result. * * \param type The returned type of the XenStore message received. * * \return 0 on success. Otherwise an errno value indicating the * type of failure encountered. */ static int xs_process_msg(enum xsd_sockmsg_type *type) { struct xs_stored_msg *msg; char *body; int error; msg = malloc(sizeof(*msg), M_XENSTORE, M_WAITOK); error = xs_read_store(&msg->hdr, sizeof(msg->hdr)); if (error) { free(msg, M_XENSTORE); return (error); } body = malloc(msg->hdr.len + 1, M_XENSTORE, M_WAITOK); error = xs_read_store(body, msg->hdr.len); if (error) { free(body, M_XENSTORE); free(msg, M_XENSTORE); return (error); } body[msg->hdr.len] = '\0'; *type = msg->hdr.type; if (msg->hdr.type == XS_WATCH_EVENT) { msg->u.watch.vec = split(body, msg->hdr.len, &msg->u.watch.vec_size); mtx_lock(&xs.registered_watches_lock); msg->u.watch.handle = find_watch( msg->u.watch.vec[XS_WATCH_TOKEN]); mtx_lock(&xs.watch_events_lock); if (msg->u.watch.handle != NULL && (!msg->u.watch.handle->max_pending || msg->u.watch.handle->pending < msg->u.watch.handle->max_pending)) { msg->u.watch.handle->pending++; TAILQ_INSERT_TAIL(&xs.watch_events, msg, list); wakeup(&xs.watch_events); mtx_unlock(&xs.watch_events_lock); } else { mtx_unlock(&xs.watch_events_lock); free(msg->u.watch.vec, M_XENSTORE); free(msg, M_XENSTORE); } mtx_unlock(&xs.registered_watches_lock); } else { msg->u.reply.body = body; mtx_lock(&xs.reply_lock); TAILQ_INSERT_TAIL(&xs.reply_list, msg, list); wakeup(&xs.reply_list); mtx_unlock(&xs.reply_lock); } return (0); } /** * Thread body of the XenStore receive thread. * * This thread blocks waiting for data from the XenStore service * and processes and received messages. */ static void xs_rcv_thread(void *arg __unused) { int error; enum xsd_sockmsg_type type; for (;;) { error = xs_process_msg(&type); if (error) printf("XENSTORE error %d while reading message\n", error); } } /*---------------- XenStore Message Request/Reply Processing -----------------*/ #define xsd_error_count (sizeof(xsd_errors) / sizeof(xsd_errors[0])) /** * Convert a XenStore error string into an errno number. * * \param errorstring The error string to convert. * * \return The errno best matching the input string. * * \note Unknown error strings are converted to EINVAL. */ static int xs_get_error(const char *errorstring) { u_int i; for (i = 0; i < xsd_error_count; i++) { if (!strcmp(errorstring, xsd_errors[i].errstring)) return (xsd_errors[i].errnum); } log(LOG_WARNING, "XENSTORE xen store gave: unknown error %s", errorstring); return (EINVAL); } /** * Block waiting for a reply to a message request. * * \param type The returned type of the reply. * \param len The returned body length of the reply. * \param result The returned body of the reply. * * \return 0 on success. Otherwise an errno indicating the * cause of failure. 
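/*
 * Standalone sketch of the translation done by xs_get_error() above:
 * xenstored reports failures as the errno name in ASCII, and the driver
 * maps the string back to a numeric errno, defaulting to EINVAL for
 * anything it does not recognize.  The table here is a small illustrative
 * subset, not the full xsd_errors[] from xs_wire.h.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>

static const struct { int errnum; const char *errstring; } errmap[] = {
	{ EINVAL, "EINVAL" }, { EACCES, "EACCES" }, { ENOENT, "ENOENT" },
	{ EAGAIN, "EAGAIN" }, { ENOSPC, "ENOSPC" },
};

static int
wire_error_to_errno(const char *errorstring)
{
	unsigned int i;

	for (i = 0; i < sizeof(errmap) / sizeof(errmap[0]); i++)
		if (strcmp(errorstring, errmap[i].errstring) == 0)
			return (errmap[i].errnum);
	return (EINVAL);	/* unknown strings collapse to EINVAL */
}

int
main(void)
{
	printf("\"ENOENT\" -> %d\n", wire_error_to_errno("ENOENT"));
	printf("\"EBOGUS\" -> %d\n", wire_error_to_errno("EBOGUS"));
	return (0);
}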
*/ static int xs_read_reply(enum xsd_sockmsg_type *type, u_int *len, void **result) { struct xs_stored_msg *msg; char *body; int error; mtx_lock(&xs.reply_lock); while (TAILQ_EMPTY(&xs.reply_list)) { error = mtx_sleep(&xs.reply_list, &xs.reply_lock, 0, "xswait", hz/10); if (error && error != EWOULDBLOCK) { mtx_unlock(&xs.reply_lock); return (error); } } msg = TAILQ_FIRST(&xs.reply_list); TAILQ_REMOVE(&xs.reply_list, msg, list); mtx_unlock(&xs.reply_lock); *type = msg->hdr.type; if (len) *len = msg->hdr.len; body = msg->u.reply.body; free(msg, M_XENSTORE); *result = body; return (0); } /** * Pass-thru interface for XenStore access by userland processes * via the XenStore device. * * Reply type and length data are returned by overwriting these * fields in the passed in request message. * * \param msg A properly formatted message to transmit to * the XenStore service. * \param result The returned body of the reply. * * \return 0 on success. Otherwise an errno indicating the cause * of failure. * * \note The returned result is provided in malloced storage and thus * must be free'd by the caller with 'free(result, M_XENSTORE); */ int xs_dev_request_and_reply(struct xsd_sockmsg *msg, void **result) { int error; sx_xlock(&xs.request_mutex); if ((error = xs_write_store(msg, sizeof(*msg) + msg->len)) == 0) error = xs_read_reply(&msg->type, &msg->len, result); sx_xunlock(&xs.request_mutex); return (error); } /** * Send a message with an optionally muti-part body to the XenStore service. * * \param t The transaction to use for this request. * \param request_type The type of message to send. * \param iovec Pointers to the body sections of the request. * \param num_vecs The number of body sections in the request. * \param len The returned length of the reply. * \param result The returned body of the reply. * * \return 0 on success. Otherwise an errno indicating * the cause of failure. * * \note The returned result is provided in malloced storage and thus * must be free'd by the caller with 'free(*result, M_XENSTORE); */ static int xs_talkv(struct xs_transaction t, enum xsd_sockmsg_type request_type, const struct iovec *iovec, u_int num_vecs, u_int *len, void **result) { struct xsd_sockmsg msg; void *ret = NULL; u_int i; int error; msg.tx_id = t.id; msg.req_id = 0; msg.type = request_type; msg.len = 0; for (i = 0; i < num_vecs; i++) msg.len += iovec[i].iov_len; sx_xlock(&xs.request_mutex); error = xs_write_store(&msg, sizeof(msg)); if (error) { printf("xs_talkv failed %d\n", error); goto error_lock_held; } for (i = 0; i < num_vecs; i++) { error = xs_write_store(iovec[i].iov_base, iovec[i].iov_len); if (error) { printf("xs_talkv failed %d\n", error); goto error_lock_held; } } error = xs_read_reply(&msg.type, len, &ret); error_lock_held: sx_xunlock(&xs.request_mutex); if (error) return (error); if (msg.type == XS_ERROR) { error = xs_get_error(ret); free(ret, M_XENSTORE); return (error); } /* Reply is either error or an echo of our request message type. */ KASSERT(msg.type == request_type, ("bad xenstore message type")); if (result) *result = ret; else free(ret, M_XENSTORE); return (0); } /** * Wrapper for xs_talkv allowing easy transmission of a message with * a single, contiguous, message body. * * \param t The transaction to use for this request. * \param request_type The type of message to send. * \param body The body of the request. * \param len The returned length of the reply. * \param result The returned body of the reply. * * \return 0 on success. 
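/*
 * Standalone sketch of how xs_talkv() above frames a request: a fixed
 * header (type, req_id, tx_id, len) followed by the body, with len being
 * the sum of all body iovecs.  The numeric message type, the node path and
 * the value are placeholders for illustration; the real layout and
 * constants live in xen/io/xs_wire.h.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define EXAMPLE_MSG_TYPE 0u	/* placeholder for a real xsd_sockmsg_type */

struct wire_hdr {		/* same field order as struct xsd_sockmsg */
	uint32_t type;
	uint32_t req_id;
	uint32_t tx_id;
	uint32_t len;
};

int
main(void)
{
	const char *path = "device/vif/0/state";	/* sent with its NUL */
	const char *value = "4";			/* sent without a NUL */
	struct wire_hdr hdr = { EXAMPLE_MSG_TYPE, 0, 0, 0 };

	hdr.len = (uint32_t)(strlen(path) + 1 + strlen(value));
	printf("header %zu bytes, body %u bytes\n", sizeof(hdr), hdr.len);
	return (0);
}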
Otherwise an errno indicating * the cause of failure. * * \note The returned result is provided in malloced storage and thus * must be free'd by the caller with 'free(*result, M_XENSTORE); */ static int xs_single(struct xs_transaction t, enum xsd_sockmsg_type request_type, const char *body, u_int *len, void **result) { struct iovec iovec; iovec.iov_base = (void *)(uintptr_t)body; iovec.iov_len = strlen(body) + 1; return (xs_talkv(t, request_type, &iovec, 1, len, result)); } /*------------------------- XenStore Watch Support ---------------------------*/ /** * Transmit a watch request to the XenStore service. * * \param path The path in the XenStore to watch. * \param tocken A unique identifier for this watch. * * \return 0 on success. Otherwise an errno indicating the * cause of failure. */ static int xs_watch(const char *path, const char *token) { struct iovec iov[2]; iov[0].iov_base = (void *)(uintptr_t) path; iov[0].iov_len = strlen(path) + 1; iov[1].iov_base = (void *)(uintptr_t) token; iov[1].iov_len = strlen(token) + 1; return (xs_talkv(XST_NIL, XS_WATCH, iov, 2, NULL, NULL)); } /** * Transmit an uwatch request to the XenStore service. * * \param path The path in the XenStore to watch. * \param tocken A unique identifier for this watch. * * \return 0 on success. Otherwise an errno indicating the * cause of failure. */ static int xs_unwatch(const char *path, const char *token) { struct iovec iov[2]; iov[0].iov_base = (void *)(uintptr_t) path; iov[0].iov_len = strlen(path) + 1; iov[1].iov_base = (void *)(uintptr_t) token; iov[1].iov_len = strlen(token) + 1; return (xs_talkv(XST_NIL, XS_UNWATCH, iov, 2, NULL, NULL)); } /** * Convert from watch token (unique identifier) to the associated * internal tracking structure for this watch. * * \param tocken The unique identifier for the watch to find. * * \return A pointer to the found watch structure or NULL. */ static struct xs_watch * find_watch(const char *token) { struct xs_watch *i, *cmp; cmp = (void *)strtoul(token, NULL, 16); LIST_FOREACH(i, &xs.registered_watches, list) if (i == cmp) return (i); return (NULL); } /** * Thread body of the XenStore watch event dispatch thread. */ static void xenwatch_thread(void *unused) { struct xs_stored_msg *msg; for (;;) { mtx_lock(&xs.watch_events_lock); while (TAILQ_EMPTY(&xs.watch_events)) mtx_sleep(&xs.watch_events, &xs.watch_events_lock, PWAIT | PCATCH, "waitev", hz/10); mtx_unlock(&xs.watch_events_lock); sx_xlock(&xs.xenwatch_mutex); mtx_lock(&xs.watch_events_lock); msg = TAILQ_FIRST(&xs.watch_events); if (msg) { TAILQ_REMOVE(&xs.watch_events, msg, list); msg->u.watch.handle->pending--; } mtx_unlock(&xs.watch_events_lock); if (msg != NULL) { /* * XXX There are messages coming in with a NULL * XXX callback. This deserves further investigation; * XXX the workaround here simply prevents the kernel * XXX from panic'ing on startup. */ if (msg->u.watch.handle->callback != NULL) msg->u.watch.handle->callback( msg->u.watch.handle, (const char **)msg->u.watch.vec, msg->u.watch.vec_size); free(msg->u.watch.vec, M_XENSTORE); free(msg, M_XENSTORE); } sx_xunlock(&xs.xenwatch_mutex); } } /*----------- XenStore Configuration, Initialization, and Control ------------*/ /** * Setup communication channels with the XenStore service. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. 
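/*
 * Standalone sketch of the watch-token scheme used by xs_register_watch()
 * and find_watch() above: the address of the xs_watch structure, printed
 * as hex, is the token handed to xenstored, and strtoul() recovers the
 * pointer when the watch event comes back.
 */
#include <stdio.h>
#include <stdlib.h>

struct fake_watch { int dummy; };

int
main(void)
{
	struct fake_watch w;
	char token[sizeof(void *) * 2 + 1];
	void *recovered;

	snprintf(token, sizeof(token), "%lX", (unsigned long)&w);
	recovered = (void *)strtoul(token, NULL, 16);
	printf("token \"%s\" round-trips: %s\n", token,
	    recovered == (void *)&w ? "yes" : "no");
	return (0);
}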
*/ static int xs_init_comms(void) { int error; if (xen_store->rsp_prod != xen_store->rsp_cons) { log(LOG_WARNING, "XENSTORE response ring is not quiescent " "(%08x:%08x): fixing up\n", xen_store->rsp_cons, xen_store->rsp_prod); xen_store->rsp_cons = xen_store->rsp_prod; } xen_intr_unbind(&xs.xen_intr_handle); error = xen_intr_bind_local_port(xs.xs_dev, xs.evtchn, /*filter*/NULL, xs_intr, /*arg*/NULL, INTR_TYPE_NET|INTR_MPSAFE, &xs.xen_intr_handle); if (error) { log(LOG_WARNING, "XENSTORE request irq failed %i\n", error); return (error); } return (0); } /*------------------ Private Device Attachment Functions --------------------*/ static void xs_identify(driver_t *driver, device_t parent) { BUS_ADD_CHILD(parent, 0, "xenstore", 0); } /** * Probe for the existence of the XenStore. * * \param dev */ static int xs_probe(device_t dev) { /* * We are either operating within a PV kernel or being probed * as the child of the successfully attached xenpci device. * Thus we are in a Xen environment and there will be a XenStore. * Unconditionally return success. */ device_set_desc(dev, "XenStore"); return (BUS_PROBE_NOWILDCARD); } static void xs_attach_deferred(void *arg) { bus_generic_probe(xs.xs_dev); bus_generic_attach(xs.xs_dev); config_intrhook_disestablish(&xs.xs_attachcb); } static void xs_attach_late(void *arg, int pending) { KASSERT((pending == 1), ("xs late attach queued several times")); bus_generic_probe(xs.xs_dev); bus_generic_attach(xs.xs_dev); } /** * Attach to the XenStore. * * This routine also prepares for the probe/attach of drivers that rely * on the XenStore. */ static int xs_attach(device_t dev) { int error; /* Allow us to get device_t from softc and vice-versa. */ xs.xs_dev = dev; device_set_softc(dev, &xs); /* Initialize the interface to xenstore. 
*/ struct proc *p; xs.initialized = false; xs.evtchn = xen_get_xenstore_evtchn(); if (xs.evtchn == 0) { struct evtchn_alloc_unbound alloc_unbound; /* Allocate a local event channel for xenstore */ alloc_unbound.dom = DOMID_SELF; alloc_unbound.remote_dom = DOMID_SELF; error = HYPERVISOR_event_channel_op( EVTCHNOP_alloc_unbound, &alloc_unbound); if (error != 0) panic( "unable to alloc event channel for Dom0: %d", error); xs.evtchn = alloc_unbound.port; /* Allocate memory for the xs shared ring */ xen_store = malloc(PAGE_SIZE, M_XENSTORE, M_WAITOK | M_ZERO); xs.gpfn = atop(pmap_kextract((vm_offset_t)xen_store)); } else { xs.gpfn = xen_get_xenstore_mfn(); xen_store = pmap_mapdev_attr(ptoa(xs.gpfn), PAGE_SIZE, VM_MEMATTR_XEN); xs.initialized = true; } TAILQ_INIT(&xs.reply_list); TAILQ_INIT(&xs.watch_events); mtx_init(&xs.ring_lock, "ring lock", NULL, MTX_DEF); mtx_init(&xs.reply_lock, "reply lock", NULL, MTX_DEF); sx_init(&xs.xenwatch_mutex, "xenwatch"); sx_init(&xs.request_mutex, "xenstore request"); mtx_init(&xs.registered_watches_lock, "watches", NULL, MTX_DEF); mtx_init(&xs.watch_events_lock, "watch events", NULL, MTX_DEF); /* Initialize the shared memory rings to talk to xenstored */ error = xs_init_comms(); if (error) return (error); error = kproc_create(xenwatch_thread, NULL, &p, RFHIGHPID, 0, "xenwatch"); if (error) return (error); xs.xenwatch_pid = p->p_pid; error = kproc_create(xs_rcv_thread, NULL, NULL, RFHIGHPID, 0, "xenstore_rcv"); xs.xs_attachcb.ich_func = xs_attach_deferred; xs.xs_attachcb.ich_arg = NULL; if (xs.initialized) { config_intrhook_establish(&xs.xs_attachcb); } else { TASK_INIT(&xs.xs_late_init, 0, xs_attach_late, NULL); } return (error); } /** * Prepare for suspension of this VM by halting XenStore access after * all transactions and individual requests have completed. */ static int xs_suspend(device_t dev) { int error; /* Suspend child Xen devices. */ error = bus_generic_suspend(dev); if (error != 0) return (error); sx_xlock(&xs.request_mutex); return (0); } /** * Resume XenStore operations after this VM is resumed. */ static int xs_resume(device_t dev __unused) { struct xs_watch *watch; char token[sizeof(watch) * 2 + 1]; xs_init_comms(); sx_xunlock(&xs.request_mutex); /* * NB: since xenstore childs have not been resumed yet, there's * no need to hold any watch mutex. Having clients try to add or * remove watches at this point (before xenstore is resumed) is * clearly a violantion of the resume order. */ LIST_FOREACH(watch, &xs.registered_watches, list) { sprintf(token, "%lX", (long)watch); xs_watch(watch->node, token); } /* Resume child Xen devices. 
*/ bus_generic_resume(dev); return (0); } /*-------------------- Private Device Attachment Data -----------------------*/ static device_method_t xenstore_methods[] = { /* Device interface */ DEVMETHOD(device_identify, xs_identify), DEVMETHOD(device_probe, xs_probe), DEVMETHOD(device_attach, xs_attach), DEVMETHOD(device_detach, bus_generic_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, xs_suspend), DEVMETHOD(device_resume, xs_resume), /* Bus interface */ DEVMETHOD(bus_add_child, bus_generic_add_child), DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource), DEVMETHOD(bus_release_resource, bus_generic_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD_END }; DEFINE_CLASS_0(xenstore, xenstore_driver, xenstore_methods, 0); static devclass_t xenstore_devclass; DRIVER_MODULE(xenstore, xenpv, xenstore_driver, xenstore_devclass, 0, 0); /*------------------------------- Sysctl Data --------------------------------*/ /* XXX Shouldn't the node be somewhere else? */ SYSCTL_NODE(_dev, OID_AUTO, xen, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Xen"); SYSCTL_INT(_dev_xen, OID_AUTO, xsd_port, CTLFLAG_RD, &xs.evtchn, 0, ""); SYSCTL_ULONG(_dev_xen, OID_AUTO, xsd_kva, CTLFLAG_RD, (u_long *) &xen_store, 0, ""); /*-------------------------------- Public API --------------------------------*/ /*------- API comments for these methods can be found in xenstorevar.h -------*/ bool xs_initialized(void) { return (xs.initialized); } evtchn_port_t xs_evtchn(void) { return (xs.evtchn); } vm_paddr_t xs_address(void) { return (ptoa(xs.gpfn)); } int xs_directory(struct xs_transaction t, const char *dir, const char *node, u_int *num, const char ***result) { struct sbuf *path; char *strings; u_int len = 0; int error; path = xs_join(dir, node); error = xs_single(t, XS_DIRECTORY, sbuf_data(path), &len, (void **)&strings); sbuf_delete(path); if (error) return (error); *result = split(strings, len, num); return (0); } int xs_exists(struct xs_transaction t, const char *dir, const char *node) { const char **d; int error, dir_n; error = xs_directory(t, dir, node, &dir_n, &d); if (error) return (0); free(d, M_XENSTORE); return (1); } int xs_read(struct xs_transaction t, const char *dir, const char *node, u_int *len, void **result) { struct sbuf *path; void *ret; int error; path = xs_join(dir, node); error = xs_single(t, XS_READ, sbuf_data(path), len, &ret); sbuf_delete(path); if (error) return (error); *result = ret; return (0); } int xs_write(struct xs_transaction t, const char *dir, const char *node, const char *string) { struct sbuf *path; struct iovec iovec[2]; int error; path = xs_join(dir, node); iovec[0].iov_base = (void *)(uintptr_t) sbuf_data(path); iovec[0].iov_len = sbuf_len(path) + 1; iovec[1].iov_base = (void *)(uintptr_t) string; iovec[1].iov_len = strlen(string); error = xs_talkv(t, XS_WRITE, iovec, 2, NULL, NULL); sbuf_delete(path); return (error); } int xs_mkdir(struct xs_transaction t, const char *dir, const char *node) { struct sbuf *path; int ret; path = xs_join(dir, node); ret = xs_single(t, XS_MKDIR, sbuf_data(path), NULL, NULL); sbuf_delete(path); return (ret); } int xs_rm(struct xs_transaction t, const char *dir, const char *node) { struct sbuf *path; int ret; path = xs_join(dir, node); ret = xs_single(t, XS_RM, sbuf_data(path), NULL, NULL); sbuf_delete(path); return (ret); } int xs_rm_tree(struct xs_transaction xbt, const char *base, const char *node) { struct 
xs_transaction local_xbt; struct sbuf *root_path_sbuf; struct sbuf *cur_path_sbuf; char *root_path; char *cur_path; const char **dir; int error; retry: root_path_sbuf = xs_join(base, node); cur_path_sbuf = xs_join(base, node); root_path = sbuf_data(root_path_sbuf); cur_path = sbuf_data(cur_path_sbuf); dir = NULL; local_xbt.id = 0; if (xbt.id == 0) { error = xs_transaction_start(&local_xbt); if (error != 0) goto out; xbt = local_xbt; } while (1) { u_int count; u_int i; error = xs_directory(xbt, cur_path, "", &count, &dir); if (error) goto out; for (i = 0; i < count; i++) { error = xs_rm(xbt, cur_path, dir[i]); if (error == ENOTEMPTY) { struct sbuf *push_dir; /* * Descend to clear out this sub directory. * We'll return to cur_dir once push_dir * is empty. */ push_dir = xs_join(cur_path, dir[i]); sbuf_delete(cur_path_sbuf); cur_path_sbuf = push_dir; cur_path = sbuf_data(cur_path_sbuf); break; } else if (error != 0) { goto out; } } free(dir, M_XENSTORE); dir = NULL; if (i == count) { char *last_slash; /* Directory is empty. It is now safe to remove. */ error = xs_rm(xbt, cur_path, ""); if (error != 0) goto out; if (!strcmp(cur_path, root_path)) break; /* Return to processing the parent directory. */ last_slash = strrchr(cur_path, '/'); KASSERT(last_slash != NULL, ("xs_rm_tree: mangled path %s", cur_path)); *last_slash = '\0'; } } out: sbuf_delete(cur_path_sbuf); sbuf_delete(root_path_sbuf); if (dir != NULL) free(dir, M_XENSTORE); if (local_xbt.id != 0) { int terror; terror = xs_transaction_end(local_xbt, /*abort*/error != 0); xbt.id = 0; if (terror == EAGAIN && error == 0) goto retry; } return (error); } int xs_transaction_start(struct xs_transaction *t) { char *id_str; int error; error = xs_single(XST_NIL, XS_TRANSACTION_START, "", NULL, (void **)&id_str); if (error == 0) { t->id = strtoul(id_str, NULL, 0); free(id_str, M_XENSTORE); } return (error); } int xs_transaction_end(struct xs_transaction t, int abort) { char abortstr[2]; if (abort) strcpy(abortstr, "F"); else strcpy(abortstr, "T"); return (xs_single(t, XS_TRANSACTION_END, abortstr, NULL, NULL)); } int xs_scanf(struct xs_transaction t, const char *dir, const char *node, int *scancountp, const char *fmt, ...) { va_list ap; int error, ns; char *val; error = xs_read(t, dir, node, NULL, (void **) &val); if (error) return (error); va_start(ap, fmt); ns = vsscanf(val, fmt, ap); va_end(ap); free(val, M_XENSTORE); /* Distinctive errno. */ if (ns == 0) return (ERANGE); if (scancountp) *scancountp = ns; return (0); } int xs_vprintf(struct xs_transaction t, const char *dir, const char *node, const char *fmt, va_list ap) { struct sbuf *sb; int error; sb = sbuf_new_auto(); sbuf_vprintf(sb, fmt, ap); sbuf_finish(sb); error = xs_write(t, dir, node, sbuf_data(sb)); sbuf_delete(sb); return (error); } int xs_printf(struct xs_transaction t, const char *dir, const char *node, const char *fmt, ...) { va_list ap; int error; va_start(ap, fmt); error = xs_vprintf(t, dir, node, fmt, ap); va_end(ap); return (error); } int xs_gather(struct xs_transaction t, const char *dir, ...) 
{ va_list ap; const char *name; int error; va_start(ap, dir); error = 0; while (error == 0 && (name = va_arg(ap, char *)) != NULL) { const char *fmt = va_arg(ap, char *); void *result = va_arg(ap, void *); char *p; error = xs_read(t, dir, name, NULL, (void **) &p); if (error) break; if (fmt) { if (sscanf(p, fmt, result) == 0) error = EINVAL; free(p, M_XENSTORE); } else *(char **)result = p; } va_end(ap); return (error); } int xs_register_watch(struct xs_watch *watch) { /* Pointer in ascii is the token. */ char token[sizeof(watch) * 2 + 1]; int error; watch->pending = 0; sprintf(token, "%lX", (long)watch); mtx_lock(&xs.registered_watches_lock); KASSERT(find_watch(token) == NULL, ("watch already registered")); LIST_INSERT_HEAD(&xs.registered_watches, watch, list); mtx_unlock(&xs.registered_watches_lock); error = xs_watch(watch->node, token); /* Ignore errors due to multiple registration. */ if (error == EEXIST) error = 0; if (error != 0) { mtx_lock(&xs.registered_watches_lock); LIST_REMOVE(watch, list); mtx_unlock(&xs.registered_watches_lock); } return (error); } void xs_unregister_watch(struct xs_watch *watch) { struct xs_stored_msg *msg, *tmp; char token[sizeof(watch) * 2 + 1]; int error; sprintf(token, "%lX", (long)watch); mtx_lock(&xs.registered_watches_lock); if (find_watch(token) == NULL) { mtx_unlock(&xs.registered_watches_lock); return; } LIST_REMOVE(watch, list); mtx_unlock(&xs.registered_watches_lock); error = xs_unwatch(watch->node, token); if (error) log(LOG_WARNING, "XENSTORE Failed to release watch %s: %i\n", watch->node, error); /* Cancel pending watch events. */ mtx_lock(&xs.watch_events_lock); TAILQ_FOREACH_SAFE(msg, &xs.watch_events, list, tmp) { if (msg->u.watch.handle != watch) continue; TAILQ_REMOVE(&xs.watch_events, msg, list); free(msg->u.watch.vec, M_XENSTORE); free(msg, M_XENSTORE); } mtx_unlock(&xs.watch_events_lock); /* Flush any currently-executing callback, unless we are it. :-) */ if (curproc->p_pid != xs.xenwatch_pid) { sx_xlock(&xs.xenwatch_mutex); sx_xunlock(&xs.xenwatch_mutex); } } void xs_lock(void) { sx_xlock(&xs.request_mutex); return; } void xs_unlock(void) { sx_xunlock(&xs.request_mutex); return; } diff --git a/sys/i386/include/xen/hypercall.h b/sys/i386/include/xen/hypercall.h index 78052fac9355..4002aac58d84 100644 --- a/sys/i386/include/xen/hypercall.h +++ b/sys/i386/include/xen/hypercall.h @@ -1,424 +1,424 @@ /****************************************************************************** * hypercall.h * * Linux-specific hypervisor handling. * * Copyright (c) 2002-2004, K A Fraser * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #ifndef __HYPERCALL_H__ #define __HYPERCALL_H__ #include -#include -#include +#include +#include extern char *hypercall_page; #define __STR(x) #x #define STR(x) __STR(x) #define ENOXENSYS 38 #define CONFIG_XEN_COMPAT 0x030002 #define HYPERCALL_STR(name) \ "call hypercall_page + ("STR(__HYPERVISOR_##name)" * 32)" #define _hypercall0(type, name) \ ({ \ long __res; \ __asm__ volatile ( \ HYPERCALL_STR(name) \ : "=a" (__res) \ : \ : "memory" ); \ (type)__res; \ }) #define _hypercall1(type, name, a1) \ ({ \ long __res, __ign1; \ __asm__ volatile ( \ HYPERCALL_STR(name) \ : "=a" (__res), "=b" (__ign1) \ : "1" ((long)(a1)) \ : "memory" ); \ (type)__res; \ }) #define _hypercall2(type, name, a1, a2) \ ({ \ long __res, __ign1, __ign2; \ __asm__ volatile ( \ HYPERCALL_STR(name) \ : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \ : "1" ((long)(a1)), "2" ((long)(a2)) \ : "memory" ); \ (type)__res; \ }) #define _hypercall3(type, name, a1, a2, a3) \ ({ \ long __res, __ign1, __ign2, __ign3; \ __asm__ volatile ( \ HYPERCALL_STR(name) \ : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ "=d" (__ign3) \ : "1" ((long)(a1)), "2" ((long)(a2)), \ "3" ((long)(a3)) \ : "memory" ); \ (type)__res; \ }) #define _hypercall4(type, name, a1, a2, a3, a4) \ ({ \ long __res, __ign1, __ign2, __ign3, __ign4; \ __asm__ volatile ( \ HYPERCALL_STR(name) \ : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ "=d" (__ign3), "=S" (__ign4) \ : "1" ((long)(a1)), "2" ((long)(a2)), \ "3" ((long)(a3)), "4" ((long)(a4)) \ : "memory" ); \ (type)__res; \ }) #define _hypercall5(type, name, a1, a2, a3, a4, a5) \ ({ \ long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \ __asm__ volatile ( \ HYPERCALL_STR(name) \ : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \ "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \ : "1" ((long)(a1)), "2" ((long)(a2)), \ "3" ((long)(a3)), "4" ((long)(a4)), \ "5" ((long)(a5)) \ : "memory" ); \ (type)__res; \ }) static inline long privcmd_hypercall(long op, long a1, long a2, long a3, long a4, long a5) { long __res, __ign1, __ign2, __ign3, __ign4, __ign5, __call; __call = (long)&hypercall_page + (op * 32); __asm__ volatile ( "call *%[call]" : "=a" (__res), "=b" (__ign1), "=c" (__ign2), "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) : "1" ((long)(a1)), "2" ((long)(a2)), "3" ((long)(a3)), "4" ((long)(a4)), "5" ((long)(a5)), [call] "a" (__call) : "memory" ); return __res; } static inline int HYPERVISOR_set_trap_table( trap_info_t *table) { return _hypercall1(int, set_trap_table, table); } static inline int HYPERVISOR_mmu_update( mmu_update_t *req, int count, int *success_count, domid_t domid) { return _hypercall4(int, mmu_update, req, count, success_count, domid); } static inline int HYPERVISOR_mmuext_op( mmuext_op_t *op, int count, int *success_count, domid_t domid) { return _hypercall4(int, mmuext_op, op, count, success_count, domid); } static inline int HYPERVISOR_set_gdt( unsigned long *frame_list, int entries) { return _hypercall2(int, set_gdt, frame_list, entries); } static inline int HYPERVISOR_stack_switch( unsigned long ss, unsigned long esp) { return _hypercall2(int, stack_switch, ss, esp); } static inline int HYPERVISOR_set_callbacks( unsigned long event_selector, unsigned long event_address, unsigned long failsafe_selector, unsigned long failsafe_address) { 
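As a reading aid, a hedged sketch of how the _hypercallN macros above are consumed. The wrapper name is illustrative, and XENVER_version is assumed to come from Xen's public version interface rather than from this diff.

/*
 * Editorial sketch: each hypercall owns a 32-byte stub inside
 * hypercall_page, and HYPERCALL_STR() stringifies the hypercall's numeric
 * index (taken from the public Xen headers) to form the "call" target.
 * The result comes back in %eax, arguments travel in %ebx, %ecx, %edx,
 * %esi and %edi, and the "memory" clobber keeps the compiler from caching
 * guest-visible memory across the trap into the hypervisor.
 */
static inline int
example_query_xen_version(void)
{

        /* Expands to: call hypercall_page + (<index of xen_version> * 32) */
        return (_hypercall2(int, xen_version, XENVER_version, NULL));
}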
return _hypercall4(int, set_callbacks, event_selector, event_address, failsafe_selector, failsafe_address); } static inline int HYPERVISOR_fpu_taskswitch( int set) { return _hypercall1(int, fpu_taskswitch, set); } static inline int HYPERVISOR_sched_op_compat( int cmd, unsigned long arg) { return _hypercall2(int, sched_op_compat, cmd, arg); } static inline int HYPERVISOR_sched_op( int cmd, void *arg) { return _hypercall2(int, sched_op, cmd, arg); } static inline long HYPERVISOR_set_timer_op( uint64_t timeout) { unsigned long timeout_hi = (unsigned long)(timeout>>32); unsigned long timeout_lo = (unsigned long)timeout; return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi); } static inline int HYPERVISOR_platform_op( struct xen_platform_op *platform_op) { platform_op->interface_version = XENPF_INTERFACE_VERSION; return _hypercall1(int, platform_op, platform_op); } static inline int HYPERVISOR_set_debugreg( int reg, unsigned long value) { return _hypercall2(int, set_debugreg, reg, value); } static inline unsigned long HYPERVISOR_get_debugreg( int reg) { return _hypercall1(unsigned long, get_debugreg, reg); } static inline int HYPERVISOR_update_descriptor( uint64_t ma, uint64_t desc) { return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32); } static inline int HYPERVISOR_memory_op( unsigned int cmd, void *arg) { return _hypercall2(int, memory_op, cmd, arg); } static inline int HYPERVISOR_multicall( void *call_list, int nr_calls) { return _hypercall2(int, multicall, call_list, nr_calls); } static inline int HYPERVISOR_update_va_mapping( unsigned long va, uint64_t new_val, unsigned long flags) { uint32_t hi, lo; lo = (uint32_t)(new_val & 0xffffffff); hi = (uint32_t)(new_val >> 32); return _hypercall4(int, update_va_mapping, va, lo, hi, flags); } static inline int HYPERVISOR_event_channel_op( int cmd, void *arg) { int rc = _hypercall2(int, event_channel_op, cmd, arg); #if CONFIG_XEN_COMPAT <= 0x030002 if (__predict_false(rc == -ENOXENSYS)) { struct evtchn_op op; op.cmd = cmd; memcpy(&op.u, arg, sizeof(op.u)); rc = _hypercall1(int, event_channel_op_compat, &op); memcpy(arg, &op.u, sizeof(op.u)); } #endif return (rc); } static inline int HYPERVISOR_xen_version( int cmd, void *arg) { return _hypercall2(int, xen_version, cmd, arg); } static inline int HYPERVISOR_console_io( int cmd, int count, const char *str) { return _hypercall3(int, console_io, cmd, count, str); } static inline int HYPERVISOR_physdev_op( int cmd, void *arg) { int rc = _hypercall2(int, physdev_op, cmd, arg); #if CONFIG_XEN_COMPAT <= 0x030002 if (__predict_false(rc == -ENOXENSYS)) { struct physdev_op op; op.cmd = cmd; memcpy(&op.u, arg, sizeof(op.u)); rc = _hypercall1(int, physdev_op_compat, &op); memcpy(arg, &op.u, sizeof(op.u)); } #endif return (rc); } static inline int HYPERVISOR_grant_table_op( unsigned int cmd, void *uop, unsigned int count) { return _hypercall3(int, grant_table_op, cmd, uop, count); } static inline int HYPERVISOR_update_va_mapping_otherdomain( unsigned long va, uint64_t new_val, unsigned long flags, domid_t domid) { uint32_t hi, lo; lo = (uint32_t)(new_val & 0xffffffff); hi = (uint32_t)(new_val >> 32); return _hypercall5(int, update_va_mapping_otherdomain, va, lo, hi, flags, domid); } static inline int HYPERVISOR_vm_assist( unsigned int cmd, unsigned int type) { return _hypercall2(int, vm_assist, cmd, type); } static inline int HYPERVISOR_vcpu_op( int cmd, int vcpuid, void *extra_args) { return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args); } static inline int HYPERVISOR_suspend( 
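A small sketch of the convention used by HYPERVISOR_set_timer_op(), HYPERVISOR_update_descriptor() and HYPERVISOR_update_va_mapping() above, where a 64-bit value is passed as two longs; the helper names are illustrative.

/*
 * Editorial sketch: on i386 a 64-bit hypercall argument does not fit in a
 * single register, so it is split into a (low, high) pair and the
 * hypervisor reassembles it on the other side.
 */
static inline void
example_split64(uint64_t value, unsigned long *lo, unsigned long *hi)
{

        *lo = (unsigned long)(value & 0xffffffffUL);    /* bits  0..31 */
        *hi = (unsigned long)(value >> 32);             /* bits 32..63 */
}

static inline long
example_set_timer(uint64_t deadline)
{
        unsigned long lo, hi;

        /* Same split that HYPERVISOR_set_timer_op() performs above. */
        example_split64(deadline, &lo, &hi);
        return (_hypercall2(long, set_timer_op, lo, hi));
}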
unsigned long srec) { struct sched_shutdown sched_shutdown = { .reason = SHUTDOWN_suspend }; int rc = _hypercall3(int, sched_op, SCHEDOP_shutdown, &sched_shutdown, srec); #if CONFIG_XEN_COMPAT <= 0x030002 if (rc == -ENOXENSYS) rc = _hypercall3(int, sched_op_compat, SCHEDOP_shutdown, SHUTDOWN_suspend, srec); #endif return (rc); } #if CONFIG_XEN_COMPAT <= 0x030002 static inline int HYPERVISOR_nmi_op( unsigned long op, void *arg) { return _hypercall2(int, nmi_op, op, arg); } #endif static inline int HYPERVISOR_callback_op( int cmd, void *arg) { return _hypercall2(int, callback_op, cmd, arg); } #ifndef CONFIG_XEN static inline unsigned long HYPERVISOR_hvm_op( int op, void *arg) { return _hypercall2(unsigned long, hvm_op, op, arg); } #endif static inline int HYPERVISOR_xenoprof_op( int op, void *arg) { return _hypercall2(int, xenoprof_op, op, arg); } static inline int HYPERVISOR_kexec_op( unsigned long op, void *args) { return _hypercall2(int, kexec_op, op, args); } static inline int HYPERVISOR_dm_op( domid_t domid, unsigned int nr_bufs, const void *bufs) { return _hypercall3(int, dm_op, domid, nr_bufs, bufs); } #endif /* __HYPERCALL_H__ */ /* * Local variables: * c-file-style: "linux" * indent-tabs-mode: t * c-indent-level: 8 * c-basic-offset: 8 * tab-width: 8 * End: */ diff --git a/sys/x86/xen/hvm.c b/sys/x86/xen/hvm.c index 6eb16c3098fd..544b6e6439e8 100644 --- a/sys/x86/xen/hvm.c +++ b/sys/x86/xen/hvm.c @@ -1,491 +1,491 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2008, 2013 Citrix Systems, Inc. * Copyright (c) 2012 Spectra Logic Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
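For illustration, a hedged sketch tying HYPERVISOR_hvm_op() to the error convention: hypercalls return 0 or a negative Xen errno, which FreeBSD code converts with xen_translate_error() (added to sys/xen/error.h later in this diff). The wrapper name is illustrative; the xen_hvm_param usage mirrors hvm.c later in this diff.

/*
 * Editorial sketch: set one HVM parameter and hand back a BSD errno.
 */
static inline int
example_hvm_set_param(uint32_t index, uint64_t value)
{
        struct xen_hvm_param xhp;
        int rc;

        xhp.domid = DOMID_SELF;
        xhp.index = index;
        xhp.value = value;
        rc = HYPERVISOR_hvm_op(HVMOP_set_param, &xhp);

        /* Negative Xen errno -> positive BSD errno, 0 stays 0. */
        return (rc != 0 ? xen_translate_error(rc) : 0);
}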
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include -#include -#include +#include +#include +#include /*--------------------------- Forward Declarations ---------------------------*/ static void xen_hvm_cpu_init(void); /*-------------------------------- Global Data -------------------------------*/ enum xen_domain_type xen_domain_type = XEN_NATIVE; #ifdef SMP struct cpu_ops xen_hvm_cpu_ops = { .cpu_init = xen_hvm_cpu_init, .cpu_resume = xen_hvm_cpu_init }; #endif static MALLOC_DEFINE(M_XENHVM, "xen_hvm", "Xen HVM PV Support"); /** * If non-zero, the hypervisor has been configured to use a direct * IDT event callback for interrupt injection. */ int xen_vector_callback_enabled; /** * Start info flags. ATM this only used to store the initial domain flag for * PVHv2, and it's always empty for HVM guests. */ uint32_t hvm_start_flags; /** * Signal whether the vector injected for the event channel upcall requires to * be EOI'ed on the local APIC. */ bool xen_evtchn_needs_ack; /*------------------------------- Per-CPU Data -------------------------------*/ DPCPU_DEFINE(struct vcpu_info, vcpu_local_info); DPCPU_DEFINE(struct vcpu_info *, vcpu_info); /*------------------ Hypervisor Access Shared Memory Regions -----------------*/ shared_info_t *HYPERVISOR_shared_info; /*------------------------------ Sysctl tunables -----------------------------*/ int xen_disable_pv_disks = 0; int xen_disable_pv_nics = 0; TUNABLE_INT("hw.xen.disable_pv_disks", &xen_disable_pv_disks); TUNABLE_INT("hw.xen.disable_pv_nics", &xen_disable_pv_nics); /*---------------------- XEN Hypervisor Probe and Setup ----------------------*/ uint32_t xen_cpuid_base; static uint32_t xen_hvm_cpuid_base(void) { uint32_t base, regs[4]; for (base = 0x40000000; base < 0x40010000; base += 0x100) { do_cpuid(base, regs); if (!memcmp("XenVMMXenVMM", ®s[1], 12) && (regs[0] - base) >= 2) return (base); } return (0); } static void hypervisor_quirks(unsigned int major, unsigned int minor) { #ifdef SMP if (((major < 4) || (major == 4 && minor <= 5)) && msix_disable_migration == -1) { /* * Xen hypervisors prior to 4.6.0 do not properly * handle updates to enabled MSI-X table entries, * so disable MSI-X interrupt migration in that * case. */ if (bootverbose) printf( "Disabling MSI-X interrupt migration due to Xen hypervisor bug.\n" "Set machdep.msix_disable_migration=0 to forcefully enable it.\n"); msix_disable_migration = 1; } #endif } static void hypervisor_version(void) { uint32_t regs[4]; int major, minor; do_cpuid(xen_cpuid_base + 1, regs); major = regs[0] >> 16; minor = regs[0] & 0xffff; printf("XEN: Hypervisor version %d.%d detected.\n", major, minor); hypervisor_quirks(major, minor); } /* * Allocate and fill in the hypcall page. */ int xen_hvm_init_hypercall_stubs(enum xen_hvm_init_type init_type) { uint32_t regs[4]; /* Legacy PVH will get here without the cpuid leaf being set. */ if (xen_cpuid_base == 0) xen_cpuid_base = xen_hvm_cpuid_base(); if (xen_cpuid_base == 0) return (ENXIO); if (xen_domain() && init_type == XEN_HVM_INIT_LATE) { /* * If the domain type is already set we can assume that the * hypercall page has been populated too, so just print the * version (and apply any quirks) and exit. */ hypervisor_version(); return 0; } if (init_type == XEN_HVM_INIT_LATE) hypervisor_version(); /* * Find the hypercall pages. 
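A compact sketch of the CPUID leaf layout probed by xen_hvm_cpuid_base() and hypervisor_version() above; do_cpuid() is the existing x86 helper already used in this file, and the function name is illustrative.

/*
 * Editorial sketch: the Xen CPUID leaves, relative to the signature base
 * found by scanning 0x40000000-0x4000ff00 in steps of 0x100:
 *   base + 0: EBX/ECX/EDX = "XenVMMXenVMM", EAX = highest valid leaf
 *   base + 1: EAX = hypervisor version (major << 16 | minor)
 *   base + 2: EAX = number of hypercall pages, EBX = MSR to program
 */
static bool
example_xen_signature(uint32_t base)
{
        uint32_t regs[4];

        do_cpuid(base, regs);
        return (memcmp("XenVMMXenVMM", &regs[1], 12) == 0 &&
            regs[0] - base >= 2);
}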
*/ do_cpuid(xen_cpuid_base + 2, regs); if (regs[0] != 1) return (EINVAL); wrmsr(regs[1], (init_type == XEN_HVM_INIT_EARLY) ? ((vm_paddr_t)&hypercall_page - KERNBASE) : vtophys(&hypercall_page)); return (0); } static void xen_hvm_init_shared_info_page(void) { struct xen_add_to_physmap xatp; if (xen_pv_domain()) { /* * Already setup in the PV case, shared_info is passed inside * of the start_info struct at start of day. */ return; } if (HYPERVISOR_shared_info == NULL) { HYPERVISOR_shared_info = malloc(PAGE_SIZE, M_XENHVM, M_NOWAIT); if (HYPERVISOR_shared_info == NULL) panic("Unable to allocate Xen shared info page"); } xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; xatp.gpfn = vtophys(HYPERVISOR_shared_info) >> PAGE_SHIFT; if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) panic("HYPERVISOR_memory_op failed"); } static int set_percpu_callback(unsigned int vcpu) { struct xen_hvm_evtchn_upcall_vector vec; int error; vec.vcpu = vcpu; vec.vector = IDT_EVTCHN; error = HYPERVISOR_hvm_op(HVMOP_set_evtchn_upcall_vector, &vec); return (error != 0 ? xen_translate_error(error) : 0); } /* * Tell the hypervisor how to contact us for event channel callbacks. */ void xen_hvm_set_callback(device_t dev) { struct xen_hvm_param xhp; int irq; if (xen_vector_callback_enabled) return; xhp.domid = DOMID_SELF; xhp.index = HVM_PARAM_CALLBACK_IRQ; if (xen_feature(XENFEAT_hvm_callback_vector) != 0) { int error; error = set_percpu_callback(0); if (error == 0) { xen_evtchn_needs_ack = true; /* Trick toolstack to think we are enlightened */ xhp.value = 1; } else xhp.value = HVM_CALLBACK_VECTOR(IDT_EVTCHN); error = HYPERVISOR_hvm_op(HVMOP_set_param, &xhp); if (error == 0) { xen_vector_callback_enabled = 1; return; } else if (xen_evtchn_needs_ack) panic("Unable to setup fake HVM param: %d", error); printf("Xen HVM callback vector registration failed (%d). " "Falling back to emulated device interrupt\n", error); } xen_vector_callback_enabled = 0; if (dev == NULL) { /* * Called from early boot or resume. * xenpci will invoke us again later. */ return; } irq = pci_get_irq(dev); if (irq < 16) { xhp.value = HVM_CALLBACK_GSI(irq); } else { u_int slot; u_int pin; slot = pci_get_slot(dev); pin = pci_get_intpin(dev) - 1; xhp.value = HVM_CALLBACK_PCI_INTX(slot, pin); } if (HYPERVISOR_hvm_op(HVMOP_set_param, &xhp) != 0) panic("Can't set evtchn callback"); } #define XEN_MAGIC_IOPORT 0x10 enum { XMI_MAGIC = 0x49d2, XMI_UNPLUG_IDE_DISKS = 0x01, XMI_UNPLUG_NICS = 0x02, XMI_UNPLUG_IDE_EXCEPT_PRI_MASTER = 0x04 }; static void xen_hvm_disable_emulated_devices(void) { u_short disable_devs = 0; if (xen_pv_domain()) { /* * No emulated devices in the PV case, so no need to unplug * anything. 
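A minimal sketch of the three HVM_PARAM_CALLBACK_IRQ encodings chosen by xen_hvm_set_callback() above; the macros themselves are added to sys/xen/hvm.h later in this diff, and the helper name is illustrative.

/*
 * Editorial sketch: bits 63..56 of the callback value select the delivery
 * type, and the low bits carry the IDT vector, the GSI, or the PCI slot
 * and INTx pin of the emulated device.
 */
static uint64_t
example_callback_value(device_t dev, u_int vector)
{

        if (dev == NULL)
                /* Direct IDT injection, no emulated interrupt involved. */
                return (HVM_CALLBACK_VECTOR(vector));

        if (pci_get_irq(dev) < 16)
                /* Legacy GSI routed through the emulated interrupt controller. */
                return (HVM_CALLBACK_GSI(pci_get_irq(dev)));

        /* Identify the emulated PCI device by slot and INTx pin. */
        return (HVM_CALLBACK_PCI_INTX(pci_get_slot(dev),
            pci_get_intpin(dev) - 1));
}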
*/ if (xen_disable_pv_disks != 0 || xen_disable_pv_nics != 0) printf("PV devices cannot be disabled in PV guests\n"); return; } if (inw(XEN_MAGIC_IOPORT) != XMI_MAGIC) return; if (xen_disable_pv_disks == 0) { if (bootverbose) printf("XEN: disabling emulated disks\n"); disable_devs |= XMI_UNPLUG_IDE_DISKS; } if (xen_disable_pv_nics == 0) { if (bootverbose) printf("XEN: disabling emulated nics\n"); disable_devs |= XMI_UNPLUG_NICS; } if (disable_devs != 0) outw(XEN_MAGIC_IOPORT, disable_devs); } static void xen_hvm_init(enum xen_hvm_init_type init_type) { int error; int i; if (init_type == XEN_HVM_INIT_CANCELLED_SUSPEND) return; error = xen_hvm_init_hypercall_stubs(init_type); switch (init_type) { case XEN_HVM_INIT_LATE: if (error != 0) return; /* * If xen_domain_type is not set at this point * it means we are inside a (PV)HVM guest, because * for PVH the guest type is set much earlier * (see hammer_time_xen). */ if (!xen_domain()) { xen_domain_type = XEN_HVM_DOMAIN; vm_guest = VM_GUEST_XEN; } setup_xen_features(); #ifdef SMP cpu_ops = xen_hvm_cpu_ops; #endif break; case XEN_HVM_INIT_RESUME: if (error != 0) panic("Unable to init Xen hypercall stubs on resume"); /* Clear stale vcpu_info. */ CPU_FOREACH(i) DPCPU_ID_SET(i, vcpu_info, NULL); break; default: panic("Unsupported HVM initialization type"); } xen_vector_callback_enabled = 0; xen_evtchn_needs_ack = false; xen_hvm_set_callback(NULL); /* * On (PV)HVM domains we need to request the hypervisor to * fill the shared info page, for PVH guest the shared_info page * is passed inside the start_info struct and is already set, so this * functions are no-ops. */ xen_hvm_init_shared_info_page(); xen_hvm_disable_emulated_devices(); } void xen_hvm_suspend(void) { } void xen_hvm_resume(bool suspend_cancelled) { xen_hvm_init(suspend_cancelled ? XEN_HVM_INIT_CANCELLED_SUSPEND : XEN_HVM_INIT_RESUME); /* Register vcpu_info area for CPU#0. */ xen_hvm_cpu_init(); } static void xen_hvm_sysinit(void *arg __unused) { xen_hvm_init(XEN_HVM_INIT_LATE); } SYSINIT(xen_hvm_init, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, xen_hvm_sysinit, NULL); static void xen_hvm_cpu_init(void) { struct vcpu_register_vcpu_info info; struct vcpu_info *vcpu_info; uint32_t regs[4]; int cpu, rc; if (!xen_domain()) return; if (DPCPU_GET(vcpu_info) != NULL) { /* * vcpu_info is already set. We're resuming * from a failed migration and our pre-suspend * configuration is still valid. */ return; } /* * Set vCPU ID. If available fetch the ID from CPUID, if not just use * the ACPI ID. */ KASSERT(xen_cpuid_base != 0, ("Invalid base Xen CPUID leaf")); cpuid_count(xen_cpuid_base + 4, 0, regs); KASSERT((regs[0] & XEN_HVM_CPUID_VCPU_ID_PRESENT) || !xen_pv_domain(), ("Xen PV domain without vcpu_id in cpuid")); PCPU_SET(vcpu_id, (regs[0] & XEN_HVM_CPUID_VCPU_ID_PRESENT) ? regs[1] : PCPU_GET(acpi_id)); if (xen_evtchn_needs_ack && !IS_BSP()) { /* * Setup the per-vpcu event channel upcall vector. This is only * required when using the new HVMOP_set_evtchn_upcall_vector * hypercall, which allows using a different vector for each * vCPU. Note that FreeBSD uses the same vector for all vCPUs * because it's not dynamically allocated. */ rc = set_percpu_callback(PCPU_GET(vcpu_id)); if (rc != 0) panic("Event channel upcall vector setup failed: %d", rc); } /* * Set the vCPU info. * * NB: the vCPU info for vCPUs < 32 can be fetched from the shared info * page, but in order to make sure the mapping code is correct always * attempt to map the vCPU info at a custom place. 
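For clarity, a hedged sketch of the unplug handshake performed by xen_hvm_disable_emulated_devices() above; the helper name is illustrative.

/*
 * Editorial sketch: the device model exposes a magic I/O port.  Reading
 * XMI_MAGIC back confirms the protocol is available; writing a bitmask of
 * XMI_UNPLUG_* flags then removes the matching emulated devices so they
 * cannot shadow their paravirtualized counterparts.
 */
static void
example_unplug_emulated_nics(void)
{

        if (inw(XEN_MAGIC_IOPORT) != XMI_MAGIC)
                return;         /* Unplug protocol not supported. */
        outw(XEN_MAGIC_IOPORT, XMI_UNPLUG_NICS);
}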
*/ vcpu_info = DPCPU_PTR(vcpu_local_info); cpu = PCPU_GET(vcpu_id); info.mfn = vtophys(vcpu_info) >> PAGE_SHIFT; info.offset = vtophys(vcpu_info) - trunc_page(vtophys(vcpu_info)); rc = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_info, cpu, &info); if (rc != 0) DPCPU_SET(vcpu_info, &HYPERVISOR_shared_info->vcpu_info[cpu]); else DPCPU_SET(vcpu_info, vcpu_info); } SYSINIT(xen_hvm_cpu_init, SI_SUB_INTR, SI_ORDER_FIRST, xen_hvm_cpu_init, NULL); diff --git a/sys/x86/xen/pv.c b/sys/x86/xen/pv.c index e0c88992390e..c5d7629d0bc5 100644 --- a/sys/x86/xen/pv.c +++ b/sys/x86/xen/pv.c @@ -1,410 +1,410 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-NetBSD * * Copyright (c) 2004 Christian Limpach. * Copyright (c) 2004-2006,2008 Kip Macy * Copyright (c) 2008 The NetBSD Foundation, Inc. * Copyright (c) 2013 Roger Pau MonnĂ© * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_ddb.h" #include "opt_kstack_pages.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include -#include +#include +#include #include #ifdef DDB #include #endif /* Native initial function */ extern u_int64_t hammer_time(u_int64_t, u_int64_t); /* Xen initial function */ uint64_t hammer_time_xen(vm_paddr_t); #define MAX_E820_ENTRIES 128 /*--------------------------- Forward Declarations ---------------------------*/ static caddr_t xen_pvh_parse_preload_data(uint64_t); static void xen_pvh_parse_memmap(caddr_t, vm_paddr_t *, int *); /*---------------------------- Extern Declarations ---------------------------*/ /* * Placed by the linker at the end of the bss section, which is the last * section loaded by Xen before loading the symtab and strtab. 
*/ extern uint32_t end; /*-------------------------------- Global Data -------------------------------*/ struct init_ops xen_pvh_init_ops = { .parse_preload_data = xen_pvh_parse_preload_data, .early_clock_source_init = xen_clock_init, .early_delay = xen_delay, .parse_memmap = xen_pvh_parse_memmap, }; static struct bios_smap xen_smap[MAX_E820_ENTRIES]; static struct hvm_start_info *start_info; /*-------------------------------- Xen PV init -------------------------------*/ uint64_t hammer_time_xen(vm_paddr_t start_info_paddr) { struct hvm_modlist_entry *mod; struct xen_add_to_physmap xatp; uint64_t physfree; char *kenv; int rc; xen_domain_type = XEN_HVM_DOMAIN; vm_guest = VM_GUEST_XEN; rc = xen_hvm_init_hypercall_stubs(XEN_HVM_INIT_EARLY); if (rc) { xc_printf("ERROR: failed to initialize hypercall page: %d\n", rc); HYPERVISOR_shutdown(SHUTDOWN_crash); } start_info = (struct hvm_start_info *)(start_info_paddr + KERNBASE); if (start_info->magic != XEN_HVM_START_MAGIC_VALUE) { xc_printf("Unknown magic value in start_info struct: %#x\n", start_info->magic); HYPERVISOR_shutdown(SHUTDOWN_crash); } /* * The hvm_start_into structure is always appended after loading * the kernel and modules. */ physfree = roundup2(start_info_paddr + PAGE_SIZE, PAGE_SIZE); xatp.domid = DOMID_SELF; xatp.idx = 0; xatp.space = XENMAPSPACE_shared_info; xatp.gpfn = atop(physfree); if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) { xc_printf("ERROR: failed to setup shared_info page\n"); HYPERVISOR_shutdown(SHUTDOWN_crash); } HYPERVISOR_shared_info = (shared_info_t *)(physfree + KERNBASE); physfree += PAGE_SIZE; /* * Init a static kenv using a free page. The contents will be filled * from the parse_preload_data hook. */ kenv = (void *)(physfree + KERNBASE); physfree += PAGE_SIZE; bzero_early(kenv, PAGE_SIZE); init_static_kenv(kenv, PAGE_SIZE); if (start_info->modlist_paddr != 0) { if (start_info->modlist_paddr >= physfree) { xc_printf( "ERROR: unexpected module list memory address\n"); HYPERVISOR_shutdown(SHUTDOWN_crash); } if (start_info->nr_modules == 0) { xc_printf( "ERROR: modlist_paddr != 0 but nr_modules == 0\n"); HYPERVISOR_shutdown(SHUTDOWN_crash); } mod = (struct hvm_modlist_entry *) (start_info->modlist_paddr + KERNBASE); if (mod[0].paddr >= physfree) { xc_printf("ERROR: unexpected module memory address\n"); HYPERVISOR_shutdown(SHUTDOWN_crash); } } /* Set the hooks for early functions that diverge from bare metal */ init_ops = xen_pvh_init_ops; hvm_start_flags = start_info->flags; /* Now we can jump into the native init function */ return (hammer_time(0, physfree)); } /*-------------------------------- PV specific -------------------------------*/ /* * When booted as a PVH guest FreeBSD needs to avoid using the RSDP address * hint provided by the loader because it points to the native set of ACPI * tables instead of the ones crafted by Xen. The acpi.rsdp env variable is * removed from kenv if present, and a new acpi.rsdp is added to kenv that * points to the address of the Xen crafted RSDP. 
*/ static bool reject_option(const char *option) { static const char *reject[] = { "acpi.rsdp", }; unsigned int i; for (i = 0; i < nitems(reject); i++) if (strncmp(option, reject[i], strlen(reject[i])) == 0) return (true); return (false); } static void xen_pvh_set_env(char *env, bool (*filter)(const char *)) { char *option; if (env == NULL) return; option = env; while (*option != 0) { char *value; if (filter != NULL && filter(option)) { option += strlen(option) + 1; continue; } value = option; option = strsep(&value, "="); if (kern_setenv(option, value) != 0) xc_printf("unable to add kenv %s=%s\n", option, value); option = value + strlen(value) + 1; } } #ifdef DDB /* * The way Xen loads the symtab is different from the native boot loader, * because it's tailored for NetBSD. So we have to adapt and use the same * method as NetBSD. Portions of the code below have been picked from NetBSD: * sys/kern/kern_ksyms.c CVS Revision 1.71. */ static void xen_pvh_parse_symtab(void) { Elf_Ehdr *ehdr; Elf_Shdr *shdr; int i, j; ehdr = (Elf_Ehdr *)(&end + 1); if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) || ehdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || ehdr->e_version > 1) { xc_printf("Unable to load ELF symtab: invalid symbol table\n"); return; } shdr = (Elf_Shdr *)((uint8_t *)ehdr + ehdr->e_shoff); /* Find the symbol table and the corresponding string table. */ for (i = 1; i < ehdr->e_shnum; i++) { if (shdr[i].sh_type != SHT_SYMTAB) continue; if (shdr[i].sh_offset == 0) continue; ksymtab = (uintptr_t)((uint8_t *)ehdr + shdr[i].sh_offset); ksymtab_size = shdr[i].sh_size; j = shdr[i].sh_link; if (shdr[j].sh_offset == 0) continue; /* Can this happen? */ kstrtab = (uintptr_t)((uint8_t *)ehdr + shdr[j].sh_offset); break; } if (ksymtab == 0 || kstrtab == 0) xc_printf( "Unable to load ELF symtab: could not find symtab or strtab\n"); } #endif static caddr_t xen_pvh_parse_preload_data(uint64_t modulep) { caddr_t kmdp; vm_ooffset_t off; vm_paddr_t metadata; char *envp; char acpi_rsdp[19]; if (start_info->modlist_paddr != 0) { struct hvm_modlist_entry *mod; const char *cmdline; mod = (struct hvm_modlist_entry *) (start_info->modlist_paddr + KERNBASE); cmdline = mod[0].cmdline_paddr ? (const char *)(mod[0].cmdline_paddr + KERNBASE) : NULL; if (strcmp(cmdline, "header") == 0) { struct xen_header *header; header = (struct xen_header *)(mod[0].paddr + KERNBASE); if ((header->flags & XENHEADER_HAS_MODULEP_OFFSET) != XENHEADER_HAS_MODULEP_OFFSET) { xc_printf("Unable to load module metadata\n"); HYPERVISOR_shutdown(SHUTDOWN_crash); } preload_metadata = (caddr_t)(mod[0].paddr + header->modulep_offset + KERNBASE); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); if (kmdp == NULL) { xc_printf("Unable to find kernel\n"); HYPERVISOR_shutdown(SHUTDOWN_crash); } /* * Xen has relocated the metadata and the modules, so * we need to recalculate it's position. This is done * by saving the original modulep address and then * calculating the offset from the real modulep * position. 
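The environment block walked by xen_pvh_set_env() above is a packed sequence of NUL-terminated "name=value" strings ending with an empty string; a small stand-alone sketch of that layout follows. The entries shown are illustrative only.

/*
 * Editorial sketch: walk a packed "name=value" environment block of the
 * form consumed by xen_pvh_set_env(); an empty string terminates it.
 */
static void
example_walk_env(void)
{
        /* Two illustrative entries plus the implicit terminating NUL. */
        static const char env[] =
            "vfs.root.mountfrom=ufs:/dev/ada0p2\0boot_verbose=1\0";
        const char *opt;

        for (opt = env; *opt != '\0'; opt += strlen(opt) + 1)
                printf("option: %s\n", opt);
}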
*/ metadata = MD_FETCH(kmdp, MODINFOMD_MODULEP, vm_paddr_t); off = mod[0].paddr + header->modulep_offset - metadata + KERNBASE; } else { preload_metadata = (caddr_t)(mod[0].paddr + KERNBASE); kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); if (kmdp == NULL) { xc_printf("Unable to find kernel\n"); HYPERVISOR_shutdown(SHUTDOWN_crash); } metadata = MD_FETCH(kmdp, MODINFOMD_MODULEP, vm_paddr_t); off = mod[0].paddr + KERNBASE - metadata; } preload_bootstrap_relocate(off); boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int); envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *); if (envp != NULL) envp += off; xen_pvh_set_env(envp, reject_option); if (MD_FETCH(kmdp, MODINFOMD_EFI_MAP, void *) != NULL) strlcpy(bootmethod, "UEFI", sizeof(bootmethod)); else strlcpy(bootmethod, "BIOS", sizeof(bootmethod)); } else { /* Parse the extra boot information given by Xen */ if (start_info->cmdline_paddr != 0) boot_parse_cmdline_delim( (char *)(start_info->cmdline_paddr + KERNBASE), ","); kmdp = NULL; strlcpy(bootmethod, "XEN", sizeof(bootmethod)); } boothowto |= boot_env_to_howto(); snprintf(acpi_rsdp, sizeof(acpi_rsdp), "%#" PRIx64, start_info->rsdp_paddr); kern_setenv("acpi.rsdp", acpi_rsdp); #ifdef DDB xen_pvh_parse_symtab(); #endif return (kmdp); } static void xen_pvh_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx) { struct xen_memory_map memmap; u_int32_t size; int rc; /* Fetch the E820 map from Xen */ memmap.nr_entries = MAX_E820_ENTRIES; set_xen_guest_handle(memmap.buffer, xen_smap); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); if (rc) { xc_printf("ERROR: unable to fetch Xen E820 memory map: %d\n", rc); HYPERVISOR_shutdown(SHUTDOWN_crash); } size = memmap.nr_entries * sizeof(xen_smap[0]); bios_add_smap_entries(xen_smap, size, physmap, physmap_idx); } diff --git a/sys/x86/xen/xen_apic.c b/sys/x86/xen/xen_apic.c index 77268d5f9846..3b7c220126fa 100644 --- a/sys/x86/xen/xen_apic.c +++ b/sys/x86/xen/xen_apic.c @@ -1,367 +1,367 @@ /* * Copyright (c) 2014 Roger Pau MonnĂ© * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include -#include +#include +#include /*--------------------------- Forward Declarations ---------------------------*/ static driver_filter_t xen_smp_rendezvous_action; #ifdef __amd64__ static driver_filter_t xen_invlop; #else static driver_filter_t xen_invltlb; static driver_filter_t xen_invlpg; static driver_filter_t xen_invlrng; static driver_filter_t xen_invlcache; #endif static driver_filter_t xen_ipi_bitmap_handler; static driver_filter_t xen_cpustop_handler; static driver_filter_t xen_cpususpend_handler; static driver_filter_t xen_ipi_swi_handler; /*---------------------------------- Macros ----------------------------------*/ #define IPI_TO_IDX(ipi) ((ipi) - APIC_IPI_INTS) /*--------------------------------- Xen IPIs ---------------------------------*/ struct xen_ipi_handler { driver_filter_t *filter; const char *description; }; static struct xen_ipi_handler xen_ipis[] = { [IPI_TO_IDX(IPI_RENDEZVOUS)] = { xen_smp_rendezvous_action, "r" }, #ifdef __amd64__ [IPI_TO_IDX(IPI_INVLOP)] = { xen_invlop, "itlb"}, #else [IPI_TO_IDX(IPI_INVLTLB)] = { xen_invltlb, "itlb"}, [IPI_TO_IDX(IPI_INVLPG)] = { xen_invlpg, "ipg" }, [IPI_TO_IDX(IPI_INVLRNG)] = { xen_invlrng, "irg" }, [IPI_TO_IDX(IPI_INVLCACHE)] = { xen_invlcache, "ic" }, #endif [IPI_TO_IDX(IPI_BITMAP_VECTOR)] = { xen_ipi_bitmap_handler, "b" }, [IPI_TO_IDX(IPI_STOP)] = { xen_cpustop_handler, "st" }, [IPI_TO_IDX(IPI_SUSPEND)] = { xen_cpususpend_handler, "sp" }, [IPI_TO_IDX(IPI_SWI)] = { xen_ipi_swi_handler, "sw" }, }; /* * Save previous (native) handler as a fallback. Xen < 4.7 doesn't support * VCPUOP_send_nmi for HVM guests, and thus we need a fallback in that case: * * https://lists.freebsd.org/archives/freebsd-xen/2022-January/000032.html */ void (*native_ipi_vectored)(u_int, int); /*------------------------------- Per-CPU Data -------------------------------*/ DPCPU_DEFINE(xen_intr_handle_t, ipi_handle[nitems(xen_ipis)]); /*------------------------------- Xen PV APIC --------------------------------*/ #define PCPU_ID_GET(id, field) (pcpu_find(id)->pc_##field) static int send_nmi(int dest) { unsigned int cpu; int rc = 0; /* * NMIs are not routed over event channels, and instead delivered as on * native using the exception vector (#2). Triggering them can be done * using the local APIC, or an hypercall as a shortcut like it's done * below. 
*/ switch(dest) { case APIC_IPI_DEST_SELF: rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, PCPU_GET(vcpu_id), NULL); break; case APIC_IPI_DEST_ALL: CPU_FOREACH(cpu) { rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, PCPU_ID_GET(cpu, vcpu_id), NULL); if (rc != 0) break; } break; case APIC_IPI_DEST_OTHERS: CPU_FOREACH(cpu) { if (cpu != PCPU_GET(cpuid)) { rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, PCPU_ID_GET(cpu, vcpu_id), NULL); if (rc != 0) break; } } break; default: rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, PCPU_ID_GET(apic_cpuid(dest), vcpu_id), NULL); break; } return rc; } #undef PCPU_ID_GET static void xen_pv_lapic_ipi_vectored(u_int vector, int dest) { xen_intr_handle_t *ipi_handle; int ipi_idx, to_cpu, self; static bool pvnmi = true; if (vector >= IPI_NMI_FIRST) { if (pvnmi) { int rc = send_nmi(dest); if (rc != 0) { printf( "Sending NMI using hypercall failed (%d) switching to APIC\n", rc); pvnmi = false; native_ipi_vectored(vector, dest); } } else native_ipi_vectored(vector, dest); return; } ipi_idx = IPI_TO_IDX(vector); if (ipi_idx >= nitems(xen_ipis)) panic("IPI out of range"); switch(dest) { case APIC_IPI_DEST_SELF: ipi_handle = DPCPU_GET(ipi_handle); xen_intr_signal(ipi_handle[ipi_idx]); break; case APIC_IPI_DEST_ALL: CPU_FOREACH(to_cpu) { ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); xen_intr_signal(ipi_handle[ipi_idx]); } break; case APIC_IPI_DEST_OTHERS: self = PCPU_GET(cpuid); CPU_FOREACH(to_cpu) { if (to_cpu != self) { ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); xen_intr_signal(ipi_handle[ipi_idx]); } } break; default: to_cpu = apic_cpuid(dest); ipi_handle = DPCPU_ID_GET(to_cpu, ipi_handle); xen_intr_signal(ipi_handle[ipi_idx]); break; } } /*---------------------------- XEN PV IPI Handlers ---------------------------*/ /* * These are C clones of the ASM functions found in apic_vector. */ static int xen_ipi_bitmap_handler(void *arg) { struct trapframe *frame; frame = arg; ipi_bitmap_handler(*frame); return (FILTER_HANDLED); } static int xen_smp_rendezvous_action(void *arg) { #ifdef COUNT_IPIS (*ipi_rendezvous_counts[PCPU_GET(cpuid)])++; #endif /* COUNT_IPIS */ smp_rendezvous_action(); return (FILTER_HANDLED); } #ifdef __amd64__ static int xen_invlop(void *arg) { invlop_handler(); return (FILTER_HANDLED); } #else /* __i386__ */ static int xen_invltlb(void *arg) { invltlb_handler(); return (FILTER_HANDLED); } static int xen_invlpg(void *arg) { invlpg_handler(); return (FILTER_HANDLED); } static int xen_invlrng(void *arg) { invlrng_handler(); return (FILTER_HANDLED); } static int xen_invlcache(void *arg) { invlcache_handler(); return (FILTER_HANDLED); } #endif /* __amd64__ */ static int xen_cpustop_handler(void *arg) { cpustop_handler(); return (FILTER_HANDLED); } static int xen_cpususpend_handler(void *arg) { cpususpend_handler(); return (FILTER_HANDLED); } static int xen_ipi_swi_handler(void *arg) { struct trapframe *frame = arg; ipi_swi_handler(*frame); return (FILTER_HANDLED); } /*----------------------------- XEN PV IPI setup -----------------------------*/ /* * Those functions are provided outside of the Xen PV APIC implementation * so PVHVM guests can also use PV IPIs without having an actual Xen PV APIC, * because on PVHVM there's an emulated LAPIC provided by Xen. 
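To make the dispatch above easier to follow, a minimal sketch of how one IPI reaches one CPU; the helper name is illustrative.

/*
 * Editorial sketch: every CPU owns a pre-bound event channel per IPI type
 * (the DPCPU ipi_handle array, filled in by xen_cpu_ipi_init() later in
 * this file), so sending an IPI is just signalling the right channel.
 */
static void
example_send_one_ipi(u_int vector, int cpu)
{
        xen_intr_handle_t *handles;
        int idx;

        /* Translate the APIC vector into an index into xen_ipis[]. */
        idx = IPI_TO_IDX(vector);
        KASSERT(idx >= 0 && idx < nitems(xen_ipis), ("IPI out of range"));

        handles = DPCPU_ID_GET(cpu, ipi_handle);
        xen_intr_signal(handles[idx]);
}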
*/ static void xen_cpu_ipi_init(int cpu) { xen_intr_handle_t *ipi_handle; const struct xen_ipi_handler *ipi; int idx, rc; ipi_handle = DPCPU_ID_GET(cpu, ipi_handle); for (ipi = xen_ipis, idx = 0; idx < nitems(xen_ipis); ipi++, idx++) { if (ipi->filter == NULL) { ipi_handle[idx] = NULL; continue; } rc = xen_intr_alloc_and_bind_ipi(cpu, ipi->filter, INTR_TYPE_TTY, &ipi_handle[idx]); if (rc != 0) panic("Unable to allocate a XEN IPI port"); xen_intr_describe(ipi_handle[idx], "%s", ipi->description); } } static void xen_setup_cpus(void) { uint32_t regs[4]; int i; if (!xen_vector_callback_enabled) return; /* * Check whether the APIC virtualization is hardware assisted, as * that's faster than using event channels because it avoids the VM * exit. */ KASSERT(xen_cpuid_base != 0, ("Invalid base Xen CPUID leaf")); cpuid_count(xen_cpuid_base + 4, 0, regs); if ((x2apic_mode && (regs[0] & XEN_HVM_CPUID_X2APIC_VIRT)) || (!x2apic_mode && (regs[0] & XEN_HVM_CPUID_APIC_ACCESS_VIRT))) return; CPU_FOREACH(i) xen_cpu_ipi_init(i); /* Set the xen pv ipi ops to replace the native ones */ ipi_vectored = xen_pv_lapic_ipi_vectored; native_ipi_vectored = ipi_vectored; } /* Switch to using PV IPIs as soon as the vcpu_id is set. */ SYSINIT(xen_setup_cpus, SI_SUB_SMP, SI_ORDER_SECOND, xen_setup_cpus, NULL); diff --git a/sys/xen/blkif.h b/sys/xen/blkif.h index 721bb93c3c48..abc321795516 100644 --- a/sys/xen/blkif.h +++ b/sys/xen/blkif.h @@ -1,145 +1,145 @@ /* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * $FreeBSD$ */ #ifndef __XEN_BLKIF_H__ #define __XEN_BLKIF_H__ -#include -#include -#include +#include +#include +#include /* Not a real protocol. Used to generate ring structs which contain * the elements common to all protocols only. This way we get a * compiler-checkable way to use common struct elements, so we can * avoid using switch(protocol) in a number of places. */ struct blkif_common_request { char dummy; }; struct blkif_common_response { char dummy; }; /* i386 protocol version */ #pragma pack(push, 4) struct blkif_x86_32_request { uint8_t operation; /* BLKIF_OP_??? */ uint8_t nr_segments; /* number of segments */ blkif_vdev_t handle; /* only for read/write requests */ uint64_t id; /* private guest value, echoed in resp */ blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; struct blkif_x86_32_response { uint64_t id; /* copied from request */ uint8_t operation; /* copied from request */ int16_t status; /* BLKIF_RSP_??? 
*/ }; typedef struct blkif_x86_32_request blkif_x86_32_request_t; typedef struct blkif_x86_32_response blkif_x86_32_response_t; #pragma pack(pop) /* x86_64 protocol version */ struct blkif_x86_64_request { uint8_t operation; /* BLKIF_OP_??? */ uint8_t nr_segments; /* number of segments */ blkif_vdev_t handle; /* only for read/write requests */ uint64_t __attribute__((__aligned__(8))) id; blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; }; struct blkif_x86_64_response { uint64_t __attribute__((__aligned__(8))) id; uint8_t operation; /* copied from request */ int16_t status; /* BLKIF_RSP_??? */ }; typedef struct blkif_x86_64_request blkif_x86_64_request_t; typedef struct blkif_x86_64_response blkif_x86_64_response_t; DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, struct blkif_common_response); DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, struct blkif_x86_32_response); DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, struct blkif_x86_64_response); /* * Maximum number of requests that can be active for a given instance * regardless of the protocol in use, based on the ring size. This constant * facilitates resource pre-allocation in backend drivers since the size is * known well in advance of attaching to a front end. */ #define BLKIF_MAX_RING_REQUESTS(_sz) \ MAX(__RING_SIZE((blkif_x86_64_sring_t *)NULL, _sz), \ MAX(__RING_SIZE((blkif_x86_32_sring_t *)NULL, _sz), \ __RING_SIZE((blkif_sring_t *)NULL, _sz))) /* * The number of ring pages required to support a given number of requests * for a given instance regardless of the protocol in use. */ #define BLKIF_RING_PAGES(_entries) \ MAX(__RING_PAGES((blkif_x86_64_sring_t *)NULL, _entries), \ MAX(__RING_PAGES((blkif_x86_32_sring_t *)NULL, _entries), \ __RING_PAGES((blkif_sring_t *)NULL, _entries))) union blkif_back_rings { blkif_back_ring_t native; blkif_common_back_ring_t common; blkif_x86_32_back_ring_t x86_32; blkif_x86_64_back_ring_t x86_64; }; typedef union blkif_back_rings blkif_back_rings_t; enum blkif_protocol { BLKIF_PROTOCOL_NATIVE = 1, BLKIF_PROTOCOL_X86_32 = 2, BLKIF_PROTOCOL_X86_64 = 3, }; static void inline blkif_get_x86_32_req(blkif_request_t *dst, blkif_x86_32_request_t *src) { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; dst->operation = src->operation; dst->nr_segments = src->nr_segments; dst->handle = src->handle; dst->id = src->id; dst->sector_number = src->sector_number; __compiler_membar(); if (n > dst->nr_segments) n = dst->nr_segments; for (i = 0; i < n; i++) dst->seg[i] = src->seg[i]; } static void inline blkif_get_x86_64_req(blkif_request_t *dst, blkif_x86_64_request_t *src) { int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; dst->operation = src->operation; dst->nr_segments = src->nr_segments; dst->handle = src->handle; dst->id = src->id; dst->sector_number = src->sector_number; __compiler_membar(); if (n > dst->nr_segments) n = dst->nr_segments; for (i = 0; i < n; i++) dst->seg[i] = src->seg[i]; } #endif /* __XEN_BLKIF_H__ */ diff --git a/sys/xen/error.h b/sys/xen/error.h index 6f25b0ab491a..245281604aca 100644 --- a/sys/xen/error.h +++ b/sys/xen/error.h @@ -1,101 +1,101 @@ /*- * Copyright (c) 2014 Roger Pau MonnĂ© . * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. 
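A brief sketch of how a backend is expected to use the sizing macros above; the function and parameter names are illustrative.

/*
 * Editorial sketch: given the negotiated number of shared ring pages,
 * compute the worst-case number of in-flight requests across the native,
 * x86_32 and x86_64 ABIs, so per-request state can be pre-allocated
 * before the frontend picks a protocol.
 */
static u_int
example_ring_capacity(u_int negotiated_ring_pages)
{
        u_int max_requests;

        max_requests =
            BLKIF_MAX_RING_REQUESTS(negotiated_ring_pages * PAGE_SIZE);
        return (max_requests);
}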
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __XEN_ERROR_H__ #define __XEN_ERROR_H__ -#include +#include /* Translation table */ static int xen_errors[] = { [XEN_EPERM] = EPERM, [XEN_ENOENT] = ENOENT, [XEN_ESRCH] = ESRCH, [XEN_EIO] = EIO, [XEN_ENXIO] = ENXIO, [XEN_E2BIG] = E2BIG, [XEN_ENOEXEC] = ENOEXEC, [XEN_EBADF] = EBADF, [XEN_ECHILD] = ECHILD, [XEN_EAGAIN] = EAGAIN, [XEN_ENOMEM] = ENOMEM, [XEN_EACCES] = EACCES, [XEN_EFAULT] = EFAULT, [XEN_EBUSY] = EBUSY, [XEN_EEXIST] = EEXIST, [XEN_EXDEV] = EXDEV, [XEN_ENODEV] = ENODEV, [XEN_EINVAL] = EINVAL, [XEN_ENFILE] = ENFILE, [XEN_EMFILE] = EMFILE, [XEN_ENOSPC] = ENOSPC, [XEN_EMLINK] = EMLINK, [XEN_EDOM] = EDOM, [XEN_ERANGE] = ERANGE, [XEN_EDEADLK] = EDEADLK, [XEN_ENAMETOOLONG] = ENAMETOOLONG, [XEN_ENOLCK] = ENOLCK, [XEN_ENOSYS] = ENOSYS, [XEN_ENODATA] = ENOENT, [XEN_ETIME] = ETIMEDOUT, [XEN_EBADMSG] = EBADMSG, [XEN_EOVERFLOW] = EOVERFLOW, [XEN_EILSEQ] = EILSEQ, [XEN_ENOTSOCK] = ENOTSOCK, [XEN_EOPNOTSUPP] = EOPNOTSUPP, [XEN_EADDRINUSE] = EADDRINUSE, [XEN_EADDRNOTAVAIL] = EADDRNOTAVAIL, [XEN_ENOBUFS] = ENOBUFS, [XEN_EISCONN] = EISCONN, [XEN_ENOTCONN] = ENOTCONN, [XEN_ETIMEDOUT] = ETIMEDOUT, }; static inline int xen_translate_error(int error) { int bsd_error; KASSERT((error < 0), ("Value is not a valid Xen error code")); if (-error >= nitems(xen_errors)) { /* * We received an error value that cannot be translated, * return EINVAL. */ return (EINVAL); } bsd_error = xen_errors[-error]; KASSERT((bsd_error != 0), ("Unknown Xen error code")); return (bsd_error); } #endif /* !__XEN_ERROR_H__ */ diff --git a/sys/xen/evtchn/evtchnvar.h b/sys/xen/evtchn/evtchnvar.h index c7c38df34a00..1f78755115ac 100644 --- a/sys/xen/evtchn/evtchnvar.h +++ b/sys/xen/evtchn/evtchnvar.h @@ -1,104 +1,104 @@ /****************************************************************************** * evtchn.h * * Data structures and definitions private to the FreeBSD implementation * of the Xen event channel API. 
* * Copyright (c) 2004, K A Fraser * Copyright (c) 2012, Spectra Logic Corporation * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * $FreeBSD$ */ #ifndef __XEN_EVTCHN_EVTCHNVAR_H__ #define __XEN_EVTCHN_EVTCHNVAR_H__ #include -#include +#include enum evtchn_type { EVTCHN_TYPE_UNBOUND, EVTCHN_TYPE_VIRQ, EVTCHN_TYPE_IPI, EVTCHN_TYPE_PORT, EVTCHN_TYPE_COUNT }; /** Submit a port notification for delivery to a userland evtchn consumer */ void evtchn_device_upcall(evtchn_port_t port); /** * Disable signal delivery for an event channel port, returning its * previous mask state. * * \param port The event channel port to query and mask. * * \returns 1 if event delivery was previously disabled. Otherwise 0. */ static inline int evtchn_test_and_set_mask(evtchn_port_t port) { shared_info_t *s = HYPERVISOR_shared_info; return synch_test_and_set_bit(port, s->evtchn_mask); } /** * Clear any pending event for the given event channel port. * * \param port The event channel port to clear. */ static inline void evtchn_clear_port(evtchn_port_t port) { shared_info_t *s = HYPERVISOR_shared_info; synch_clear_bit(port, &s->evtchn_pending[0]); } /** * Disable signal delivery for an event channel port. * * \param port The event channel port to mask. */ static inline void evtchn_mask_port(evtchn_port_t port) { shared_info_t *s = HYPERVISOR_shared_info; synch_set_bit(port, &s->evtchn_mask[0]); } /** * Enable signal delivery for an event channel port. * * \param port The event channel port to enable. */ static inline void evtchn_unmask_port(evtchn_port_t port) { evtchn_unmask_t op = { .port = port }; HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &op); } #endif /* __XEN_EVTCHN_EVTCHNVAR_H__ */ diff --git a/sys/xen/features.h b/sys/xen/features.h index b4cce2fd4b1b..fa3c323d9b34 100644 --- a/sys/xen/features.h +++ b/sys/xen/features.h @@ -1,20 +1,20 @@ /****************************************************************************** * features.h * * Query the features reported by Xen. 
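As a usage note, a hedged sketch of a masked-handling pattern built from the inline helpers above; the helper name and callback parameter are illustrative, not an API introduced by this diff.

/*
 * Editorial sketch: mask the port, acknowledge the pending bit, run the
 * handler, then unmask only if we were the ones that masked it, so the
 * hypervisor may signal the port again.
 */
static void
example_handle_port(evtchn_port_t port, void (*handler)(evtchn_port_t))
{
        int was_masked;

        was_masked = evtchn_test_and_set_mask(port);
        evtchn_clear_port(port);

        handler(port);

        if (!was_masked)
                evtchn_unmask_port(port);
}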
* * Copyright (c) 2006, Ian Campbell */ #ifndef __ASM_XEN_FEATURES_H__ #define __ASM_XEN_FEATURES_H__ -#include +#include extern void setup_xen_features(void); extern uint8_t xen_features[XENFEAT_NR_SUBMAPS * 32]; #define xen_feature(flag) (xen_features[flag]) #endif /* __ASM_XEN_FEATURES_H__ */ diff --git a/sys/xen/gnttab.h b/sys/xen/gnttab.h index 7d1f71fbee0d..d85f882c0c4b 100644 --- a/sys/xen/gnttab.h +++ b/sys/xen/gnttab.h @@ -1,120 +1,120 @@ /****************************************************************************** * gnttab.h * * Two sets of functionality: * 1. Granting foreign access to our memory reservation. * 2. Accessing others' memory reservations via grant references. * (i.e., mechanisms for both sender and recipient of grant references) * * Copyright (c) 2004-2005, K A Fraser * Copyright (c) 2005, Christopher Clark * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation; or, when distributed * separately from the Linux kernel or incorporated into other * software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ #ifndef __ASM_GNTTAB_H__ #define __ASM_GNTTAB_H__ #include #include #include -#include +#include #define GNTTAB_LIST_END GRANT_REF_INVALID struct gnttab_free_callback { struct gnttab_free_callback *next; void (*fn)(void *); void *arg; uint16_t count; }; /* * Allocate a grant table reference and return it in *result. Returns * zero on success or errno on error. */ int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, int flags, grant_ref_t *result); /* * End access through the given grant reference, iff the grant entry is no * longer in use. Return 1 if the grant entry was freed, 0 if it is still in * use. */ int gnttab_end_foreign_access_ref(grant_ref_t ref); /* * Eventually end access through the given grant reference, and once that * access has been ended, free the given page too. Access will be ended * immediately iff the grant entry is not in use, otherwise it will happen * some time later. page may be 0, in which case no freeing will occur. */ void gnttab_end_foreign_access(grant_ref_t ref, void *page); /* * Eventually end access through the given array of grant references. 
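A minimal sketch of sharing one page through the grant-table API declared above. The helper names are illustrative, "otherend_id" would normally come from the xenbus handshake, and a flags value of 0 is assumed to request a writable mapping.

/*
 * Editorial sketch: allocate a grant reference for a page so the peer
 * domain can map it, and later revoke it.  gnttab_end_foreign_access()
 * also frees the page once the remote mapping is gone, so the page must
 * not be reused by the caller afterwards.
 */
static int
example_share_page(domid_t otherend_id, void *page, grant_ref_t *ref)
{

        /* The grant entry identifies the page by its physical frame number. */
        return (gnttab_grant_foreign_access(otherend_id,
            atop(vtophys(page)), 0, ref));
}

static void
example_unshare_page(grant_ref_t ref, void *page)
{

        gnttab_end_foreign_access(ref, page);
}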
* Access will be ended immediately iff the grant entry is not in use, * otherwise it will happen some time later */ void gnttab_end_foreign_access_references(u_int count, grant_ref_t *refs); int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn, grant_ref_t *result); unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref); unsigned long gnttab_end_foreign_transfer(grant_ref_t ref); int gnttab_query_foreign_access(grant_ref_t ref); /* * operations on reserved batches of grant references */ int gnttab_alloc_grant_references(uint16_t count, grant_ref_t *pprivate_head); void gnttab_free_grant_reference(grant_ref_t ref); void gnttab_free_grant_references(grant_ref_t head); int gnttab_empty_grant_references(const grant_ref_t *pprivate_head); int gnttab_claim_grant_reference(grant_ref_t *pprivate_head); void gnttab_release_grant_reference(grant_ref_t *private_head, grant_ref_t release); void gnttab_request_free_callback(struct gnttab_free_callback *callback, void (*fn)(void *), void *arg, uint16_t count); void gnttab_cancel_free_callback(struct gnttab_free_callback *callback); void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, unsigned long frame, int flags); void gnttab_grant_foreign_transfer_ref(grant_ref_t, domid_t domid, unsigned long pfn); int gnttab_suspend(void); int gnttab_resume(device_t); #endif /* __ASM_GNTTAB_H__ */ diff --git a/sys/xen/hvm.h b/sys/xen/hvm.h index e34a552dc714..5ed31849f959 100644 --- a/sys/xen/hvm.h +++ b/sys/xen/hvm.h @@ -1,109 +1,109 @@ /* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * $FreeBSD$ */ #ifndef __XEN_HVM_H__ #define __XEN_HVM_H__ #include #include -#include +#include /** * \brief Wrapper function to obtain a HVM parameter value. * - * \param index HVM parameter index; see . + * \param index HVM parameter index; see . * * \returns 0 on failure; the value of the parameter otherwise. */ static inline unsigned long hvm_get_parameter(int index) { struct xen_hvm_param xhv; int error; xhv.domid = DOMID_SELF; xhv.index = index; error = HYPERVISOR_hvm_op(HVMOP_get_param, &xhv); if (error) { printf("%s: error %d trying to get %d\n", __func__, error, index); return (0); } return (xhv.value); } /** The callback method types for Hypervisor event delivery to our domain. */ enum { HVM_CB_TYPE_GSI, HVM_CB_TYPE_PCI_INTX, HVM_CB_TYPE_VECTOR, HVM_CB_TYPE_MASK = 0xFF, HVM_CB_TYPE_SHIFT = 56 }; /** Format for specifying a GSI type callback. 
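 *
 * For example (illustrative values only), HVM_CALLBACK_GSI(9) places the
 * GSI callback type in bits 56-63 of the 64-bit callback value and GSI 9
 * in the low 32 bits.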
*/ enum { HVM_CB_GSI_GSI_MASK = 0xFFFFFFFF, HVM_CB_GSI_GSI_SHIFT = 0 }; #define HVM_CALLBACK_GSI(gsi) \ (((uint64_t)HVM_CB_TYPE_GSI << HVM_CB_TYPE_SHIFT) \ | ((gsi) & HVM_CB_GSI_GSI_MASK) << HVM_CB_GSI_GSI_SHIFT) /** Format for specifying a virtual PCI interrupt line GSI style callback. */ enum { HVM_CB_PCI_INTX_INTPIN_MASK = 0x3, HVM_CB_PCI_INTX_INTPIN_SHIFT = 0, HVM_CB_PCI_INTX_SLOT_MASK = 0x1F, HVM_CB_PCI_INTX_SLOT_SHIFT = 11, }; #define HVM_CALLBACK_PCI_INTX(slot, pin) \ (((uint64_t)HVM_CB_TYPE_PCI_INTX << HVM_CB_TYPE_SHIFT) \ | (((slot) & HVM_CB_PCI_INTX_SLOT_MASK) << HVM_CB_PCI_INTX_SLOT_SHIFT) \ | (((pin) & HVM_CB_PCI_INTX_INTPIN_MASK) << HVM_CB_PCI_INTX_INTPIN_SHIFT)) /** Format for specifying a direct IDT vector injection style callback. */ enum { HVM_CB_VECTOR_VECTOR_MASK = 0xFFFFFFFF, HVM_CB_VECTOR_VECTOR_SHIFT = 0 }; #define HVM_CALLBACK_VECTOR(vector) \ (((uint64_t)HVM_CB_TYPE_VECTOR << HVM_CB_TYPE_SHIFT) \ | (((vector) & HVM_CB_GSI_GSI_MASK) << HVM_CB_GSI_GSI_SHIFT)) enum xen_hvm_init_type { XEN_HVM_INIT_EARLY, XEN_HVM_INIT_LATE, XEN_HVM_INIT_CANCELLED_SUSPEND, XEN_HVM_INIT_RESUME, }; int xen_hvm_init_hypercall_stubs(enum xen_hvm_init_type); void xen_hvm_set_callback(device_t); void xen_hvm_suspend(void); void xen_hvm_resume(bool suspend_cancelled); extern uint32_t hvm_start_flags; extern bool xen_evtchn_needs_ack; #endif /* __XEN_HVM_H__ */ diff --git a/sys/xen/hypervisor.h b/sys/xen/hypervisor.h index 2be61597fc18..c42956d253c2 100644 --- a/sys/xen/hypervisor.h +++ b/sys/xen/hypervisor.h @@ -1,101 +1,101 @@ /****************************************************************************** * hypervisor.h * * Linux-specific hypervisor handling. * * Copyright (c) 2002, K A Fraser * * $FreeBSD$ */ #ifndef __XEN_HYPERVISOR_H__ #define __XEN_HYPERVISOR_H__ #include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include extern uint64_t get_system_time(int ticks); static inline int HYPERVISOR_console_write(const char *str, int count) { return HYPERVISOR_console_io(CONSOLEIO_write, count, str); } static inline int HYPERVISOR_yield(void) { int rc = HYPERVISOR_sched_op(SCHEDOP_yield, NULL); #if CONFIG_XEN_COMPAT <= 0x030002 if (rc == -ENOXENSYS) rc = HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); #endif return (rc); } static inline int HYPERVISOR_block( void) { int rc = HYPERVISOR_sched_op(SCHEDOP_block, NULL); #if CONFIG_XEN_COMPAT <= 0x030002 if (rc == -ENOXENSYS) rc = HYPERVISOR_sched_op_compat(SCHEDOP_block, 0); #endif return (rc); } static inline void HYPERVISOR_shutdown(unsigned int reason) { struct sched_shutdown sched_shutdown = { .reason = reason }; HYPERVISOR_sched_op(SCHEDOP_shutdown, &sched_shutdown); #if CONFIG_XEN_COMPAT <= 0x030002 HYPERVISOR_sched_op_compat(SCHEDOP_shutdown, reason); #endif } static inline void HYPERVISOR_crash(void) { HYPERVISOR_shutdown(SHUTDOWN_crash); /* NEVER REACHED */ for (;;) ; /* eliminate noreturn error */ } /* Transfer control to hypervisor until an event is detected on one */ /* of the specified ports or the specified number of ticks elapse */ static inline int HYPERVISOR_poll( evtchn_port_t *ports, unsigned int nr_ports, int ticks) { int rc; struct sched_poll sched_poll = { .nr_ports = nr_ports, .timeout = get_system_time(ticks) }; set_xen_guest_handle(sched_poll.ports, ports); rc = HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll); #if CONFIG_XEN_COMPAT <= 0x030002 if (rc == -ENOXENSYS) rc = 
HYPERVISOR_sched_op_compat(SCHEDOP_yield, 0); #endif return (rc); } #endif /* __XEN_HYPERVISOR_H__ */ diff --git a/sys/xen/xen-os.h b/sys/xen/xen-os.h index 988da24dc878..183724be9749 100644 --- a/sys/xen/xen-os.h +++ b/sys/xen/xen-os.h @@ -1,171 +1,171 @@ /****************************************************************************** * xen/xen-os.h * * Random collection of macros and definition * * Copyright (c) 2003, 2004 Keir Fraser (on behalf of the Xen team) * All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * * $FreeBSD$ */ #ifndef _XEN_XEN_OS_H_ #define _XEN_XEN_OS_H_ #define __XEN_INTERFACE_VERSION__ 0x00030208 #define GRANT_REF_INVALID 0xffffffff #ifdef LOCORE #define __ASSEMBLY__ #endif -#include +#include #ifndef __ASSEMBLY__ #include -#include +#include static inline vm_paddr_t xen_get_xenstore_mfn(void) { return (hvm_get_parameter(HVM_PARAM_STORE_PFN)); } static inline evtchn_port_t xen_get_xenstore_evtchn(void) { return (hvm_get_parameter(HVM_PARAM_STORE_EVTCHN)); } static inline vm_paddr_t xen_get_console_mfn(void) { return (hvm_get_parameter(HVM_PARAM_CONSOLE_PFN)); } static inline evtchn_port_t xen_get_console_evtchn(void) { return (hvm_get_parameter(HVM_PARAM_CONSOLE_EVTCHN)); } extern shared_info_t *HYPERVISOR_shared_info; extern bool xen_suspend_cancelled; enum xen_domain_type { XEN_NATIVE, /* running on bare hardware */ XEN_PV_DOMAIN, /* running in a PV domain */ XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ }; extern enum xen_domain_type xen_domain_type; static inline int xen_domain(void) { return (xen_domain_type != XEN_NATIVE); } static inline int xen_pv_domain(void) { return (xen_domain_type == XEN_PV_DOMAIN); } static inline int xen_hvm_domain(void) { return (xen_domain_type == XEN_HVM_DOMAIN); } static inline bool xen_initial_domain(void) { return (xen_domain() && (hvm_start_flags & SIF_INITDOMAIN) != 0); } #endif #include /* Everything below this point is not included by assembler (.S) files. */ #ifndef __ASSEMBLY__ /* * Based on ofed/include/linux/bitops.h * * Those helpers are prefixed by xen_ because xen-os.h is widely included * and we don't want the other drivers using them. 
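 *
 * As a purely illustrative sketch (the bitmap and bit number are
 * placeholders, not part of this interface), a consumer might do:
 *
 *	if (xen_test_bit(bit, map))
 *		xen_clear_bit(bit, map);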
* */ #define NBPL (NBBY * sizeof(long)) static inline bool xen_test_bit(int bit, volatile long *addr) { unsigned long mask = 1UL << (bit % NBPL); return !!(atomic_load_acq_long(&addr[bit / NBPL]) & mask); } static inline void xen_set_bit(int bit, volatile long *addr) { atomic_set_long(&addr[bit / NBPL], 1UL << (bit % NBPL)); } static inline void xen_clear_bit(int bit, volatile long *addr) { atomic_clear_long(&addr[bit / NBPL], 1UL << (bit % NBPL)); } #undef NBPL /* * Functions to allocate/free unused memory in order * to map memory from other domains. */ struct resource *xenmem_alloc(device_t dev, int *res_id, size_t size); int xenmem_free(device_t dev, int res_id, struct resource *res); /* Debug/emergency function, prints directly to hypervisor console */ void xc_printf(const char *, ...) __printflike(1, 2); #ifndef xen_mb #define xen_mb() mb() #endif #ifndef xen_rmb #define xen_rmb() rmb() #endif #ifndef xen_wmb #define xen_wmb() wmb() #endif #endif /* !__ASSEMBLY__ */ #endif /* _XEN_XEN_OS_H_ */ diff --git a/sys/xen/xen_intr.h b/sys/xen/xen_intr.h index 6725fa9e0399..5ebce0678222 100644 --- a/sys/xen/xen_intr.h +++ b/sys/xen/xen_intr.h @@ -1,240 +1,240 @@ /****************************************************************************** * xen_intr.h * * APIs for managing Xen event channel, virtual IRQ, and physical IRQ * notifications. * * Copyright (c) 2004, K A Fraser * Copyright (c) 2012, Spectra Logic Corporation * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * $FreeBSD$ */ #ifndef _XEN_INTR_H_ #define _XEN_INTR_H_ -#include +#include /** Registered Xen interrupt callback handle. */ typedef void * xen_intr_handle_t; void xen_intr_handle_upcall(struct trapframe *trap_frame); /** * Associate an already allocated local event channel port with an interrupt * handler. * * \param dev The device making this bind request. * \param local_port The event channel to bind. * \param filter An interrupt filter handler. Specify NULL * to always dispatch to the ithread handler. * \param handler An interrupt ithread handler. Optional (can * specify NULL) if all necessary event actions * are performed by filter. * \param arg Argument to present to both filter and handler. * \param irqflags Interrupt handler flags. See sys/bus.h. * \param handlep Pointer to an opaque handle used to manage this * registration. * * \returns 0 on success, otherwise an errno.
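 *
 * Example (hypothetical consumer: the port value, filter routine, and
 * softc pointer are placeholders, not part of this interface):
 *
 *	xen_intr_handle_t handle;
 *	int error;
 *
 *	error = xen_intr_bind_local_port(dev, otherend_port, my_filter,
 *	    NULL, sc, INTR_TYPE_BIO | INTR_MPSAFE, &handle);
 *	if (error == 0)
 *		xen_intr_signal(handle);	/* Notify the remote peer. */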
*/ int xen_intr_bind_local_port(device_t dev, evtchn_port_t local_port, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type irqflags, xen_intr_handle_t *handlep); /** * Allocate a local event channel port, accessible by the specified * remote/foreign domain and, if successful, associate the port with * the specified interrupt handler. * * \param dev The device making this bind request. * \param remote_domain Remote domain granted permission to signal the * newly allocated local port. * \param filter An interrupt filter handler. Specify NULL * to always dispatch to the ithread handler. * \param handler An interrupt ithread handler. Optional (can * specify NULL) if all necessary event actions * are performed by filter. * \param arg Argument to present to both filter and handler. * \param irqflags Interrupt handler flags. See sys/bus.h. * \param handlep Pointer to an opaque handle used to manage this * registration. * * \returns 0 on success, otherwise an errno. */ int xen_intr_alloc_and_bind_local_port(device_t dev, u_int remote_domain, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type irqflags, xen_intr_handle_t *handlep); /** * Associate the specified interrupt handler with the remote event * channel port specified by remote_domain and remote_port. * * \param dev The device making this bind request. * \param remote_domain The domain peer for this event channel connection. * \param remote_port Remote domain's local port number for this event * channel port. * \param filter An interrupt filter handler. Specify NULL * to always dispatch to the ithread handler. * \param handler An interrupt ithread handler. Optional (can * specify NULL) if all necessary event actions * are performed by filter. * \param arg Argument to present to both filter and handler. * \param irqflags Interrupt handler flags. See sys/bus.h. * \param handlep Pointer to an opaque handle used to manage this * registration. * * \returns 0 on success, otherwise an errno. */ int xen_intr_bind_remote_port(device_t dev, u_int remote_domain, evtchn_port_t remote_port, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type irqflags, xen_intr_handle_t *handlep); /** * Associate the specified interrupt handler with the specified Xen * virtual interrupt source. * * \param dev The device making this bind request. * \param virq The Xen virtual IRQ number for the Xen interrupt * source being hooked. * \param cpu The cpu on which interrupt events should be delivered. * \param filter An interrupt filter handler. Specify NULL * to always dispatch to the ithread handler. * \param handler An interrupt ithread handler. Optional (can * specify NULL) if all necessary event actions * are performed by filter. * \param arg Argument to present to both filter and handler. * \param irqflags Interrupt handler flags. See sys/bus.h. * \param handlep Pointer to an opaque handle used to manage this * registration. * * \returns 0 on success, otherwise an errno. */ int xen_intr_bind_virq(device_t dev, u_int virq, u_int cpu, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type irqflags, xen_intr_handle_t *handlep); /** * Allocate a local event channel port for servicing interprocessor * interrupts and, if successful, associate the port with the specified * interrupt handler. * * \param cpu The cpu receiving the IPI. * \param filter The interrupt filter servicing this IPI. * \param irqflags Interrupt handler flags. See sys/bus.h.
* \param handlep Pointer to an opaque handle used to manage this * registration. * * \returns 0 on success, otherwise an errno. */ int xen_intr_alloc_and_bind_ipi(u_int cpu, driver_filter_t filter, enum intr_type irqflags, xen_intr_handle_t *handlep); /** * Unbind an interrupt handler from its interrupt source. * * \param handlep A pointer to the opaque handle that was initialized * at the time the interrupt source was bound. * * \note The event channel, if any, that was allocated at bind time is * closed upon successful return of this method. * * \note It is always safe to call xen_intr_unbind() on a handle that * has been initialized to NULL. */ void xen_intr_unbind(xen_intr_handle_t *handle); /** * Add a description to an interrupt handler. * * \param handle The opaque handle that was initialized at the time * the interrupt source was bound. * * \param fmt The sprintf compatible format string for the description, * followed by optional sprintf arguments. * * \returns 0 on success, otherwise an errno. */ int xen_intr_describe(xen_intr_handle_t port_handle, const char *fmt, ...) __attribute__((format(printf, 2, 3))); /** * Signal the remote peer of an interrupt source associated with an * event channel port. * * \param handle The opaque handle that was initialized at the time * the interrupt source was bound. * * \note For xen interrupt sources other than event channel ports, * this method takes no action. */ void xen_intr_signal(xen_intr_handle_t handle); /** * Get the local event channel port number associated with this interrupt * source. * * \param handle The opaque handle that was initialized at the time * the interrupt source was bound. * * \returns 0 if the handle is invalid, otherwise positive port number. */ evtchn_port_t xen_intr_port(xen_intr_handle_t handle); /** * Bind an event channel port to a handler * * \param name Descriptive name assigned to this interrupt handler. * \param filter An interrupt filter handler. Specify NULL * to always dispatch to the ithread handler. * \param handler An interrupt ithread handler. Optional (can * specify NULL) if all necessary event actions * are performed by filter. * \param arg Argument to present to both filter and handler. * \param irqflags Interrupt handler flags. See sys/bus.h. * \param handle Opaque handle used to manage this registration. * * \returns 0 on success, otherwise an errno. */ int xen_intr_add_handler(const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, enum intr_type flags, xen_intr_handle_t handle); /** * Get a reference to an event channel port * * \param port Event channel port to which we get a reference. * \param handlep Pointer to an opaque handle used to manage this * registration. * * \returns 0 on success, otherwise an errno. */ int xen_intr_get_evtchn_from_port(evtchn_port_t port, xen_intr_handle_t *handlep); #endif /* _XEN_INTR_H_ */ diff --git a/sys/xen/xenbus/xenbusvar.h b/sys/xen/xenbus/xenbusvar.h index bea65fff8e5a..b2957880b187 100644 --- a/sys/xen/xenbus/xenbusvar.h +++ b/sys/xen/xenbus/xenbusvar.h @@ -1,225 +1,225 @@ /****************************************************************************** * Copyright (C) 2005 Rusty Russell, IBM Corporation * Copyright (C) 2005 XenSource Ltd.
* * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * $FreeBSD$ */ /** * \file xenbusvar.h * * \brief Data structures and function declarations for use by device * drivers operating on the XenBus. */ #ifndef _XEN_XENBUS_XENBUSVAR_H #define _XEN_XENBUS_XENBUSVAR_H #include #include #include #include #include #include #include -#include -#include -#include +#include +#include +#include #include /* XenBus allocations including XenStore data returned to clients. */ MALLOC_DECLARE(M_XENBUS); enum { /** * Path of this device node. */ XENBUS_IVAR_NODE, /** * The device type (e.g. vif, vbd). */ XENBUS_IVAR_TYPE, /** * The state of this device (not the otherend's state). */ XENBUS_IVAR_STATE, /** * Domain ID of the other end device. */ XENBUS_IVAR_OTHEREND_ID, /** * Path of the other end device. */ XENBUS_IVAR_OTHEREND_PATH }; /** * Simplified accessors for xenbus devices: * * xenbus_get_node * xenbus_get_type * xenbus_get_state * xenbus_get_otherend_id * xenbus_get_otherend_path */ #define XENBUS_ACCESSOR(var, ivar, type) \ __BUS_ACCESSOR(xenbus, var, XENBUS, ivar, type) XENBUS_ACCESSOR(node, NODE, const char *) XENBUS_ACCESSOR(type, TYPE, const char *) XENBUS_ACCESSOR(state, STATE, enum xenbus_state) XENBUS_ACCESSOR(otherend_id, OTHEREND_ID, int) XENBUS_ACCESSOR(otherend_path, OTHEREND_PATH, const char *) /** * Return the state of a XenBus device. * * \param path The root XenStore path for the device. * * \return The current state of the device or XenbusStateClosed if no * state can be read. */ XenbusState xenbus_read_driver_state(const char *path); /** * Return the state of the "other end" (peer) of a XenBus device. * * \param dev The XenBus device whose peer to query. * * \return The current state of the peer device or XenbusStateClosed if no * state can be read. */ static inline XenbusState xenbus_get_otherend_state(device_t dev) { return (xenbus_read_driver_state(xenbus_get_otherend_path(dev))); } /** * Grant access to the given ring_mfn to the peer of the given device. * * \param dev The device granting access to the ring page. * \param ring_mfn The guest machine page number of the page to grant * peer access rights. * \param refp[out] The grant reference for the page. * * \return On success, 0. Otherwise an errno value indicating the * type of failure.
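 *
 * Illustrative call (the softc layout and the vtophys() page-frame
 * conversion are assumptions about a typical front-end, not part of this
 * interface):
 *
 *	error = xenbus_grant_ring(dev,
 *	    vtophys(sc->ring.sring) >> PAGE_SHIFT, &sc->ring_ref);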
* * A successful call to xenbus_grant_ring should be paired with a call * to gnttab_end_foreign_access() when foreign access to this page is no * longer required. * * \note On error, \a dev will be switched to the XenbusStateClosing * state and the returned error is saved in the per-device error node * for \a dev in the XenStore. */ int xenbus_grant_ring(device_t dev, unsigned long ring_mfn, grant_ref_t *refp); /** * Record the given errno, along with the given, printf-style, formatted * message in dev's device specific error node in the XenStore. * * \param dev The device which encountered the error. * \param err The errno value corresponding to the error. * \param fmt Printf format string followed by a variable number of * printf arguments. */ void xenbus_dev_error(device_t dev, int err, const char *fmt, ...) __attribute__((format(printf, 3, 4))); /** * va_list version of xenbus_dev_error(). * * \param dev The device which encountered the error. * \param err The errno value corresponding to the error. * \param fmt Printf format string. * \param ap Va_list of printf arguments. */ void xenbus_dev_verror(device_t dev, int err, const char *fmt, va_list ap) __attribute__((format(printf, 3, 0))); /** * Equivalent to xenbus_dev_error(), followed by * xenbus_set_state(dev, XenbusStateClosing). * * \param dev The device which encountered the error. * \param err The errno value corresponding to the error. * \param fmt Printf format string followed by a variable number of * printf arguments. */ void xenbus_dev_fatal(device_t dev, int err, const char *fmt, ...) __attribute__((format(printf, 3, 4))); /** * va_list version of xenbus_dev_fatal(). * * \param dev The device which encountered the error. * \param err The errno value corresponding to the error. * \param fmt Printf format string. * \param ap Va_list of printf arguments. */ void xenbus_dev_vfatal(device_t dev, int err, const char *fmt, va_list) __attribute__((format(printf, 3, 0))); /** * Convert a member of the xenbus_state enum into an ASCII string. * * \param state The XenBus state to lookup. * * \return A string representing state or, for unrecognized states, * the string "Unknown". */ const char *xenbus_strstate(enum xenbus_state state); /** * Return the value of a XenBus device's "online" node within the XenStore. * * \param dev The XenBus device to query. * * \return The value of the "online" node for the device. If the node * does not exist, 0 (offline) is returned. */ int xenbus_dev_is_online(device_t dev); /** * Default callback invoked when the local XenStore sub-tree * for a device is modified. * * \param dev The XenBus device whose tree was modified. * \param path The tree relative sub-path to the modified node. The empty * string indicates the root of the tree was destroyed. */ void xenbus_localend_changed(device_t dev, const char *path); #include "xenbus_if.h" #endif /* _XEN_XENBUS_XENBUSVAR_H */ diff --git a/sys/xen/xenstore/xenstorevar.h b/sys/xen/xenstore/xenstorevar.h index 98d60f2646d2..d9b40dfd5f3d 100644 --- a/sys/xen/xenstore/xenstorevar.h +++ b/sys/xen/xenstore/xenstorevar.h @@ -1,379 +1,379 @@ /****************************************************************************** * xenstorevar.h * * Method declarations and structures for accessing the XenStore. * * Copyright (C) 2005 Rusty Russell, IBM Corporation * Copyright (C) 2005 XenSource Ltd.
* Copyright (C) 2009,2010 Spectra Logic Corporation * * This file may be distributed separately from the Linux kernel, or * incorporated into other software packages, subject to the following license: * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this source file (the "Software"), to deal in the Software without * restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, subject to * the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. * * $FreeBSD$ */ #ifndef _XEN_XENSTORE_XENSTOREVAR_H #define _XEN_XENSTORE_XENSTOREVAR_H #include #include #include #include #include #include #include -#include -#include -#include +#include +#include +#include #include "xenbus_if.h" /* XenStore allocations including XenStore data returned to clients. */ MALLOC_DECLARE(M_XENSTORE); struct xs_watch; typedef void (xs_watch_cb_t)(struct xs_watch *, const char **vec, unsigned int len); /* Register callback to watch subtree (node) in the XenStore. */ struct xs_watch { LIST_ENTRY(xs_watch) list; /* Path being watched. */ char *node; /* Callback (executed in a process context with no locks held). */ xs_watch_cb_t *callback; /* Callback client data untouched by the XenStore watch mechanism. */ uintptr_t callback_data; /* Maximum number of pending watch events to be delivered. */ unsigned int max_pending; /* * Private counter used by xenstore to keep track of the pending * watches. Protected by xs.watch_events_lock. */ unsigned int pending; }; LIST_HEAD(xs_watch_list, xs_watch); typedef int (*xs_event_handler_t)(void *); struct xs_transaction { uint32_t id; }; #define XST_NIL ((struct xs_transaction) { 0 }) /** * Check if Xenstore is initialized. * * \return True if initialized, false otherwise. */ bool xs_initialized(void); /** * Return xenstore event channel port. * * \return event channel port. */ evtchn_port_t xs_evtchn(void); /** * Return xenstore page physical memory address. * * \return xenstore page physical address. */ vm_paddr_t xs_address(void); /** * Fetch the contents of a directory in the XenStore. * * \param t The XenStore transaction covering this request. * \param dir The dirname of the path to read. * \param node The basename of the path to read. * \param num The returned number of directory entries. * \param result An array of directory entry strings. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. * * \note The results buffer is malloced and should be free'd by the * caller with 'free(*result, M_XENSTORE)'. */ int xs_directory(struct xs_transaction t, const char *dir, const char *node, unsigned int *num, const char ***result); /** * Determine if a path exists in the XenStore. * * \param t The XenStore transaction covering this request. * \param dir The dirname of the path to read. 
* \param node The basename of the path to read. * * \retval 1 The path exists. * \retval 0 The path does not exist or an error occurred attempting * to make that determination. */ int xs_exists(struct xs_transaction t, const char *dir, const char *node); /** * Get the contents of a single "file". Returns the contents in * *result which should be freed with free(*result, M_XENSTORE) after * use. The length of the value in bytes is returned in *len. * * \param t The XenStore transaction covering this request. * \param dir The dirname of the file to read. * \param node The basename of the file to read. * \param len The amount of data read. * \param result The returned contents from this file. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. * * \note The results buffer is malloced and should be free'd by the * caller with 'free(*result, M_XENSTORE)'. */ int xs_read(struct xs_transaction t, const char *dir, const char *node, unsigned int *len, void **result); /** * Write to a single file. * * \param t The XenStore transaction covering this request. * \param dir The dirname of the file to write. * \param node The basename of the file to write. * \param string The NUL terminated string of data to write. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ int xs_write(struct xs_transaction t, const char *dir, const char *node, const char *string); /** * Create a new directory. * * \param t The XenStore transaction covering this request. * \param dir The dirname of the directory to create. * \param node The basename of the directory to create. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ int xs_mkdir(struct xs_transaction t, const char *dir, const char *node); /** * Remove a file or directory (directories must be empty). * * \param t The XenStore transaction covering this request. * \param dir The dirname of the directory to remove. * \param node The basename of the directory to remove. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ int xs_rm(struct xs_transaction t, const char *dir, const char *node); /** * Destroy a tree of files rooted at dir/node. * * \param t The XenStore transaction covering this request. * \param dir The dirname of the directory to remove. * \param node The basename of the directory to remove. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ int xs_rm_tree(struct xs_transaction t, const char *dir, const char *node); /** * Start a transaction. * * Changes by others will not be seen during the lifetime of this * transaction, and changes will not be visible to others until it * is committed (xs_transaction_end). * * \param t The returned transaction. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ int xs_transaction_start(struct xs_transaction *t); /** * End a transaction. * * \param t The transaction to end/commit. * \param abort If non-zero, the transaction is discarded * instead of committed. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ int xs_transaction_end(struct xs_transaction t, int abort); /* * Single file read and scanf parsing of the result. * * \param t The XenStore transaction covering this request. * \param dir The dirname of the path to read. * \param node The basename of the path to read. * \param scancountp The number of input values assigned (i.e. the result * of scanf). 
* \param fmt Scanf format string followed by a variable number of * scanf input arguments. * * \return On success, 0. Otherwise an errno value indicating the * type of failure. */ int xs_scanf(struct xs_transaction t, const char *dir, const char *node, int *scancountp, const char *fmt, ...) __attribute__((format(scanf, 5, 6))); /** * Printf formatted write to a XenStore file. * * \param t The XenStore transaction covering this request. * \param dir The dirname of the path to write. * \param node The basename of the path to write. * \param fmt Printf format string followed by a variable number of * printf arguments. * * \return On success, 0. Otherwise an errno value indicating the * type of write failure. */ int xs_printf(struct xs_transaction t, const char *dir, const char *node, const char *fmt, ...) __attribute__((format(printf, 4, 5))); /** * va_list version of xs_printf(). * * \param t The XenStore transaction covering this request. * \param dir The dirname of the path to write. * \param node The basename of the path to write. * \param fmt Printf format string. * \param ap Va_list of printf arguments. * * \return On success, 0. Otherwise an errno value indicating the * type of write failure. */ int xs_vprintf(struct xs_transaction t, const char *dir, const char *node, const char *fmt, va_list ap); /** * Multi-file read within a single directory and scanf parsing of * the results. * * \param t The XenStore transaction covering this request. * \param dir The dirname of the paths to read. * \param ... A variable number of argument triples specifying * the file name, scanf-style format string, and * output variable (pointer to storage of the results). * The last triple in the call must be terminated * with a final NULL argument. A NULL format string * will cause the entire contents of the given file * to be assigned as a NUL terminated, M_XENSTORE heap * backed, string to the output parameter of that tuple. * * \return On success, 0. Otherwise an errno value indicating the * type of read failure. * * Example: * char protocol_abi[64]; * uint32_t ring_ref; * char *dev_type; * int error; * * error = xs_gather(XST_NIL, xenbus_get_node(dev), * "ring-ref", "%" PRIu32, &ring_ref, * "protocol", "%63s", protocol_abi, * "device-type", NULL, &dev_type, * NULL); * * ... * * free(dev_type, M_XENSTORE); */ int xs_gather(struct xs_transaction t, const char *dir, ...); /** * Register a XenStore watch. * * XenStore watches allow a client to be notified via a callback (embedded * within the watch object) of changes to an object in the XenStore. * * \param watch An xs_watch struct with its node and callback fields * properly initialized. * * \return On success, 0. Otherwise an errno value indicating the * type of write failure. EEXIST errors from the XenStore * are suppressed, allowing multiple, physically different, * xs_watch objects to watch the same path in the XenStore. */ int xs_register_watch(struct xs_watch *watch); /** * Unregister a XenStore watch. * * \param watch An xs_watch object previously used in a successful call * to xs_register_watch(). * * The xs_watch object's node field is not altered by this call. * It is the caller's responsibility to properly dispose of both the * watch object and the data pointed to by watch->node. */ void xs_unregister_watch(struct xs_watch *watch); /** * Allocate and return an sbuf containing the XenStore path string * dir/name. If name is the NUL string, the returned sbuf contains * the path string dir.
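 *
 * For example (illustrative paths), xs_join("device/vbd", "0") yields an
 * sbuf holding "device/vbd/0", while xs_join("device/vbd", "") yields an
 * sbuf holding "device/vbd".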
* * \param dir The NUL terminated directory prefix for the new path. * \param name The NUL terminated basename for the new path. * * \return A buffer containing the joined path. */ struct sbuf *xs_join(const char *, const char *); /** * Lock the xenstore request mutex. */ void xs_lock(void); /** * Unlock the xenstore request mutex. */ void xs_unlock(void); #endif /* _XEN_XENSTORE_XENSTOREVAR_H */
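The XenStore transaction and formatted I/O routines declared above are typically combined when a driver has to publish several related nodes so that its peer never observes a partial update. The sketch below is illustrative only: the node names, the hypothetical my_softc layout, and the EAGAIN retry on transaction conflicts are assumptions about a typical front-end, and the usual xen/xen-os.h, xen/xen_intr.h, xen/xenbus/xenbusvar.h, and xen/xenstore/xenstorevar.h includes are taken as given.

	static int
	publish_ring_info(device_t dev, struct my_softc *sc)
	{
		struct xs_transaction xst;
		int error;

		do {
			error = xs_transaction_start(&xst);
			if (error != 0)
				break;
			error = xs_printf(xst, xenbus_get_node(dev),
			    "ring-ref", "%u", sc->ring_ref);
			if (error == 0)
				error = xs_printf(xst, xenbus_get_node(dev),
				    "event-channel", "%u",
				    xen_intr_port(sc->intr_handle));
			if (error != 0) {
				xs_transaction_end(xst, 1);	/* Abort. */
				break;
			}
			error = xs_transaction_end(xst, 0);	/* Commit. */
		} while (error == EAGAIN);	/* Retry if the transaction raced. */

		if (error != 0)
			xenbus_dev_fatal(dev, error, "publishing ring info");
		return (error);
	}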