Index: head/sys/dev/hyperv/include/hyperv.h =================================================================== --- head/sys/dev/hyperv/include/hyperv.h (revision 303070) +++ head/sys/dev/hyperv/include/hyperv.h (revision 303071) @@ -1,219 +1,105 @@ /*- * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /** * HyperV definitions for messages that are sent between instances of the * Channel Management Library in separate partitions, or in some cases, * back to itself. */ #ifndef __HYPERV_H__ #define __HYPERV_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * VMBUS version is 32 bit, upper 16 bit for major_number and lower * 16 bit for minor_number. * * 0.13 -- Windows Server 2008 * 1.1 -- Windows 7 * 2.4 -- Windows 8 * 3.0 -- Windows 8.1 */ #define VMBUS_VERSION_WS2008 ((0 << 16) | (13)) #define VMBUS_VERSION_WIN7 ((1 << 16) | (1)) #define VMBUS_VERSION_WIN8 ((2 << 16) | (4)) #define VMBUS_VERSION_WIN8_1 ((3 << 16) | (0)) #define VMBUS_VERSION_MAJOR(ver) (((uint32_t)(ver)) >> 16) #define VMBUS_VERSION_MINOR(ver) (((uint32_t)(ver)) & 0xffff) struct hyperv_guid { uint8_t hv_guid[16]; } __packed; #define HYPERV_GUID_STRLEN 40 int hyperv_guid2str(const struct hyperv_guid *, char *, size_t); -typedef struct { - /* - * offset in bytes from the start of ring data below - */ - volatile uint32_t write_index; - /* - * offset in bytes from the start of ring data below - */ - volatile uint32_t read_index; - /* - * NOTE: The interrupt_mask field is used only for channels, but - * vmbus connection also uses this data structure - */ - volatile uint32_t interrupt_mask; - /* pad it to PAGE_SIZE so that data starts on a page */ - uint8_t reserved[4084]; - - /* - * WARNING: Ring data starts here - * !!! DO NOT place any fields below this !!! - */ - uint8_t buffer[0]; /* doubles as interrupt mask */ -} __packed hv_vmbus_ring_buffer; - -typedef struct { - hv_vmbus_ring_buffer* ring_buffer; - struct mtx ring_lock; - uint32_t ring_data_size; /* ring_size */ -} hv_vmbus_ring_buffer_info; - struct hv_vmbus_channel; - -typedef void (*vmbus_chan_callback_t)(struct hv_vmbus_channel *, void *); - -typedef struct hv_vmbus_channel { - device_t ch_dev; - struct vmbus_softc *ch_vmbus; - uint32_t ch_flags; /* VMBUS_CHAN_FLAG_ */ - uint32_t ch_id; /* channel id */ - - /* - * These are based on the offer_msg.monitor_id. - * Save it here for easy access. - */ - int ch_montrig_idx; /* MNF trig index */ - uint32_t ch_montrig_mask;/* MNF trig mask */ - - /* - * TX bufring; at the beginning of ch_bufring. - */ - hv_vmbus_ring_buffer_info ch_txbr; - /* - * RX bufring; immediately following ch_txbr. - */ - hv_vmbus_ring_buffer_info ch_rxbr; - - struct taskqueue *ch_tq; - struct task ch_task; - vmbus_chan_callback_t ch_cb; - void *ch_cbarg; - - struct hyperv_mon_param *ch_monprm; - struct hyperv_dma ch_monprm_dma; - - int ch_cpuid; /* owner cpu */ - /* - * Virtual cpuid for ch_cpuid; it is used to communicate cpuid - * related information w/ Hyper-V. If MSR_HV_VP_INDEX does not - * exist, ch_vcpuid will always be 0 for compatibility. - */ - uint32_t ch_vcpuid; - - /* - * If this is a primary channel, ch_subchan* fields - * contain sub-channels belonging to this primary - * channel. - */ - struct mtx ch_subchan_lock; - TAILQ_HEAD(, hv_vmbus_channel) ch_subchans; - int ch_subchan_cnt; - - /* If this is a sub-channel */ - TAILQ_ENTRY(hv_vmbus_channel) ch_sublink; /* sub-channel link */ - struct hv_vmbus_channel *ch_prichan; /* owner primary chan */ - - void *ch_bufring; /* TX+RX bufrings */ - struct hyperv_dma ch_bufring_dma; - uint32_t ch_bufring_gpadl; - - struct task ch_detach_task; - TAILQ_ENTRY(hv_vmbus_channel) ch_prilink; /* primary chan link */ - uint32_t ch_subidx; /* subchan index */ - volatile uint32_t ch_stflags; /* atomic-op */ - /* VMBUS_CHAN_ST_ */ - struct hyperv_guid ch_guid_type; - struct hyperv_guid ch_guid_inst; - - struct sysctl_ctx_list ch_sysctl_ctx; -} hv_vmbus_channel; - -#define VMBUS_CHAN_ISPRIMARY(chan) ((chan)->ch_subidx == 0) - -#define VMBUS_CHAN_FLAG_HASMNF 0x0001 -/* - * If this flag is set, this channel's interrupt will be masked in ISR, - * and the RX bufring will be drained before this channel's interrupt is - * unmasked. - * - * This flag is turned on by default. Drivers can turn it off according - * to their own requirement. - */ -#define VMBUS_CHAN_FLAG_BATCHREAD 0x0002 - -#define VMBUS_CHAN_ST_OPENED_SHIFT 0 -#define VMBUS_CHAN_ST_OPENED (1 << VMBUS_CHAN_ST_OPENED_SHIFT) /** * @brief Get physical address from virtual */ static inline unsigned long hv_get_phys_addr(void *virt) { unsigned long ret; ret = (vtophys(virt) | ((vm_offset_t) virt & PAGE_MASK)); return (ret); } static __inline struct hv_vmbus_channel * vmbus_get_channel(device_t dev) { return device_get_ivars(dev); } #endif /* __HYPERV_H__ */ Index: head/sys/dev/hyperv/include/vmbus.h =================================================================== --- head/sys/dev/hyperv/include/vmbus.h (revision 303070) +++ head/sys/dev/hyperv/include/vmbus.h (revision 303071) @@ -1,127 +1,136 @@ /*- * Copyright (c) 2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMBUS_H_ #define _VMBUS_H_ #include /* * GPA stuffs. */ struct vmbus_gpa_range { uint32_t gpa_len; uint32_t gpa_ofs; uint64_t gpa_page[0]; } __packed; /* This is actually vmbus_gpa_range.gpa_page[1] */ struct vmbus_gpa { uint32_t gpa_len; uint32_t gpa_ofs; uint64_t gpa_page; } __packed; #define VMBUS_CHANPKT_SIZE_SHIFT 3 #define VMBUS_CHANPKT_GETLEN(pktlen) \ (((int)(pktlen)) << VMBUS_CHANPKT_SIZE_SHIFT) struct vmbus_chanpkt_hdr { uint16_t cph_type; /* VMBUS_CHANPKT_TYPE_ */ uint16_t cph_hlen; /* header len, in 8 bytes */ uint16_t cph_tlen; /* total len, in 8 bytes */ uint16_t cph_flags; /* VMBUS_CHANPKT_FLAG_ */ uint64_t cph_xactid; } __packed; #define VMBUS_CHANPKT_TYPE_INBAND 0x0006 #define VMBUS_CHANPKT_TYPE_RXBUF 0x0007 #define VMBUS_CHANPKT_TYPE_GPA 0x0009 #define VMBUS_CHANPKT_TYPE_COMP 0x000b #define VMBUS_CHANPKT_FLAG_RC 0x0001 /* report completion */ #define VMBUS_CHANPKT_CONST_DATA(pkt) \ (const void *)((const uint8_t *)(pkt) + \ VMBUS_CHANPKT_GETLEN((pkt)->cph_hlen)) struct vmbus_rxbuf_desc { uint32_t rb_len; uint32_t rb_ofs; } __packed; struct vmbus_chanpkt_rxbuf { struct vmbus_chanpkt_hdr cp_hdr; uint16_t cp_rxbuf_id; uint16_t cp_rsvd; uint32_t cp_rxbuf_cnt; struct vmbus_rxbuf_desc cp_rxbuf[]; } __packed; #define VMBUS_CHAN_SGLIST_MAX 32 #define VMBUS_CHAN_PRPLIST_MAX 32 struct hv_vmbus_channel; +struct hyperv_guid; +typedef void (*vmbus_chan_callback_t)(struct hv_vmbus_channel *, void *); + int vmbus_chan_open(struct hv_vmbus_channel *chan, int txbr_size, int rxbr_size, const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg); void vmbus_chan_close(struct hv_vmbus_channel *chan); int vmbus_chan_gpadl_connect(struct hv_vmbus_channel *chan, bus_addr_t paddr, int size, uint32_t *gpadl); int vmbus_chan_gpadl_disconnect(struct hv_vmbus_channel *chan, uint32_t gpadl); void vmbus_chan_cpu_set(struct hv_vmbus_channel *chan, int cpu); void vmbus_chan_cpu_rr(struct hv_vmbus_channel *chan); struct hv_vmbus_channel * vmbus_chan_cpu2chan(struct hv_vmbus_channel *chan, int cpu); void vmbus_chan_set_readbatch(struct hv_vmbus_channel *chan, bool on); struct hv_vmbus_channel ** vmbus_subchan_get(struct hv_vmbus_channel *pri_chan, int subchan_cnt); void vmbus_subchan_rel(struct hv_vmbus_channel **subchan, int subchan_cnt); void vmbus_subchan_drain(struct hv_vmbus_channel *pri_chan); int vmbus_chan_recv(struct hv_vmbus_channel *chan, void *data, int *dlen, uint64_t *xactid); int vmbus_chan_recv_pkt(struct hv_vmbus_channel *chan, struct vmbus_chanpkt_hdr *pkt, int *pktlen); int vmbus_chan_send(struct hv_vmbus_channel *chan, uint16_t type, uint16_t flags, void *data, int dlen, uint64_t xactid); int vmbus_chan_send_sglist(struct hv_vmbus_channel *chan, struct vmbus_gpa sg[], int sglen, void *data, int dlen, uint64_t xactid); int vmbus_chan_send_prplist(struct hv_vmbus_channel *chan, struct vmbus_gpa_range *prp, int prp_cnt, void *data, int dlen, uint64_t xactid); + +uint32_t vmbus_chan_id(const struct hv_vmbus_channel *chan); +uint32_t vmbus_chan_subidx(const struct hv_vmbus_channel *chan); +bool vmbus_chan_is_primary(const struct hv_vmbus_channel *chan); +const struct hyperv_guid * + vmbus_chan_guid_inst(const struct hv_vmbus_channel *chan); #endif /* !_VMBUS_H_ */ Index: head/sys/dev/hyperv/netvsc/hv_net_vsc.c =================================================================== --- head/sys/dev/hyperv/netvsc/hv_net_vsc.c (revision 303070) +++ head/sys/dev/hyperv/netvsc/hv_net_vsc.c (revision 303071) @@ -1,1039 +1,1039 @@ /*- * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ /** * HyperV vmbus network VSC (virtual services client) module * */ #include #include #include #include #include #include #include #include #include #include #include "hv_net_vsc.h" #include "hv_rndis.h" #include "hv_rndis_filter.h" MALLOC_DEFINE(M_NETVSC, "netvsc", "Hyper-V netvsc driver"); /* * Forward declarations */ static void hv_nv_on_channel_callback(struct hv_vmbus_channel *chan, void *xrxr); static int hv_nv_init_send_buffer_with_net_vsp(struct hn_softc *sc); static int hv_nv_init_rx_buffer_with_net_vsp(struct hn_softc *); static int hv_nv_destroy_send_buffer(netvsc_dev *net_dev); static int hv_nv_destroy_rx_buffer(netvsc_dev *net_dev); static int hv_nv_connect_to_vsp(struct hn_softc *sc); static void hv_nv_on_send_completion(netvsc_dev *net_dev, struct hv_vmbus_channel *, const struct vmbus_chanpkt_hdr *pkt); static void hv_nv_on_receive_completion(struct hv_vmbus_channel *chan, uint64_t tid, uint32_t status); static void hv_nv_on_receive(netvsc_dev *net_dev, struct hn_rx_ring *rxr, struct hv_vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkt); /* * */ static inline netvsc_dev * hv_nv_alloc_net_device(struct hn_softc *sc) { netvsc_dev *net_dev; net_dev = malloc(sizeof(netvsc_dev), M_NETVSC, M_WAITOK | M_ZERO); net_dev->sc = sc; net_dev->destroy = FALSE; sc->net_dev = net_dev; return (net_dev); } /* * XXX unnecessary; nuke it. */ static inline netvsc_dev * hv_nv_get_outbound_net_device(struct hn_softc *sc) { return sc->net_dev; } /* * XXX unnecessary; nuke it. */ static inline netvsc_dev * hv_nv_get_inbound_net_device(struct hn_softc *sc) { return sc->net_dev; } int hv_nv_get_next_send_section(netvsc_dev *net_dev) { unsigned long bitsmap_words = net_dev->bitsmap_words; unsigned long *bitsmap = net_dev->send_section_bitsmap; unsigned long idx; int ret = NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; int i; for (i = 0; i < bitsmap_words; i++) { idx = ffsl(~bitsmap[i]); if (0 == idx) continue; idx--; KASSERT(i * BITS_PER_LONG + idx < net_dev->send_section_count, ("invalid i %d and idx %lu", i, idx)); if (atomic_testandset_long(&bitsmap[i], idx)) continue; ret = i * BITS_PER_LONG + idx; break; } return (ret); } /* * Net VSC initialize receive buffer with net VSP * * Net VSP: Network virtual services client, also known as the * Hyper-V extensible switch and the synthetic data path. */ static int hv_nv_init_rx_buffer_with_net_vsp(struct hn_softc *sc) { netvsc_dev *net_dev; nvsp_msg *init_pkt; int ret = 0; net_dev = hv_nv_get_outbound_net_device(sc); if (!net_dev) { return (ENODEV); } net_dev->rx_buf = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), PAGE_SIZE, 0, net_dev->rx_buf_size, &net_dev->rxbuf_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (net_dev->rx_buf == NULL) { device_printf(sc->hn_dev, "allocate rxbuf failed\n"); return ENOMEM; } /* * Connect the RXBUF GPADL to the primary channel. * * NOTE: * Only primary channel has RXBUF connected to it. Sub-channels * just share this RXBUF. */ ret = vmbus_chan_gpadl_connect(sc->hn_prichan, net_dev->rxbuf_dma.hv_paddr, net_dev->rx_buf_size, &net_dev->rx_buf_gpadl_handle); if (ret != 0) { device_printf(sc->hn_dev, "rxbuf gpadl connect failed: %d\n", ret); goto cleanup; } /* sema_wait(&ext->channel_init_sema); KYS CHECK */ /* Notify the NetVsp of the gpadl handle */ init_pkt = &net_dev->channel_init_packet; memset(init_pkt, 0, sizeof(nvsp_msg)); init_pkt->hdr.msg_type = nvsp_msg_1_type_send_rx_buf; init_pkt->msgs.vers_1_msgs.send_rx_buf.gpadl_handle = net_dev->rx_buf_gpadl_handle; init_pkt->msgs.vers_1_msgs.send_rx_buf.id = NETVSC_RECEIVE_BUFFER_ID; /* Send the gpadl notification request */ ret = vmbus_chan_send(sc->hn_prichan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt); if (ret != 0) { goto cleanup; } sema_wait(&net_dev->channel_init_sema); /* Check the response */ if (init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.status != nvsp_status_success) { ret = EINVAL; goto cleanup; } net_dev->rx_section_count = init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.num_sections; net_dev->rx_sections = malloc(net_dev->rx_section_count * sizeof(nvsp_1_rx_buf_section), M_NETVSC, M_WAITOK); memcpy(net_dev->rx_sections, init_pkt->msgs.vers_1_msgs.send_rx_buf_complete.sections, net_dev->rx_section_count * sizeof(nvsp_1_rx_buf_section)); /* * For first release, there should only be 1 section that represents * the entire receive buffer */ if (net_dev->rx_section_count != 1 || net_dev->rx_sections->offset != 0) { ret = EINVAL; goto cleanup; } goto exit; cleanup: hv_nv_destroy_rx_buffer(net_dev); exit: return (ret); } /* * Net VSC initialize send buffer with net VSP */ static int hv_nv_init_send_buffer_with_net_vsp(struct hn_softc *sc) { netvsc_dev *net_dev; nvsp_msg *init_pkt; int ret = 0; net_dev = hv_nv_get_outbound_net_device(sc); if (!net_dev) { return (ENODEV); } net_dev->send_buf = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), PAGE_SIZE, 0, net_dev->send_buf_size, &net_dev->txbuf_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (net_dev->send_buf == NULL) { device_printf(sc->hn_dev, "allocate chimney txbuf failed\n"); return ENOMEM; } /* * Connect chimney sending buffer GPADL to the primary channel. * * NOTE: * Only primary channel has chimney sending buffer connected to it. * Sub-channels just share this chimney sending buffer. */ ret = vmbus_chan_gpadl_connect(sc->hn_prichan, net_dev->txbuf_dma.hv_paddr, net_dev->send_buf_size, &net_dev->send_buf_gpadl_handle); if (ret != 0) { device_printf(sc->hn_dev, "chimney sending buffer gpadl " "connect failed: %d\n", ret); goto cleanup; } /* Notify the NetVsp of the gpadl handle */ init_pkt = &net_dev->channel_init_packet; memset(init_pkt, 0, sizeof(nvsp_msg)); init_pkt->hdr.msg_type = nvsp_msg_1_type_send_send_buf; init_pkt->msgs.vers_1_msgs.send_rx_buf.gpadl_handle = net_dev->send_buf_gpadl_handle; init_pkt->msgs.vers_1_msgs.send_rx_buf.id = NETVSC_SEND_BUFFER_ID; /* Send the gpadl notification request */ ret = vmbus_chan_send(sc->hn_prichan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, init_pkt, sizeof(nvsp_msg), (uint64_t)init_pkt); if (ret != 0) { goto cleanup; } sema_wait(&net_dev->channel_init_sema); /* Check the response */ if (init_pkt->msgs.vers_1_msgs.send_send_buf_complete.status != nvsp_status_success) { ret = EINVAL; goto cleanup; } net_dev->send_section_size = init_pkt->msgs.vers_1_msgs.send_send_buf_complete.section_size; net_dev->send_section_count = net_dev->send_buf_size / net_dev->send_section_size; net_dev->bitsmap_words = howmany(net_dev->send_section_count, BITS_PER_LONG); net_dev->send_section_bitsmap = malloc(net_dev->bitsmap_words * sizeof(long), M_NETVSC, M_WAITOK | M_ZERO); goto exit; cleanup: hv_nv_destroy_send_buffer(net_dev); exit: return (ret); } /* * Net VSC destroy receive buffer */ static int hv_nv_destroy_rx_buffer(netvsc_dev *net_dev) { nvsp_msg *revoke_pkt; int ret = 0; /* * If we got a section count, it means we received a * send_rx_buf_complete msg * (ie sent nvsp_msg_1_type_send_rx_buf msg) therefore, * we need to send a revoke msg here */ if (net_dev->rx_section_count) { /* Send the revoke receive buffer */ revoke_pkt = &net_dev->revoke_packet; memset(revoke_pkt, 0, sizeof(nvsp_msg)); revoke_pkt->hdr.msg_type = nvsp_msg_1_type_revoke_rx_buf; revoke_pkt->msgs.vers_1_msgs.revoke_rx_buf.id = NETVSC_RECEIVE_BUFFER_ID; ret = vmbus_chan_send(net_dev->sc->hn_prichan, VMBUS_CHANPKT_TYPE_INBAND, 0, revoke_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)revoke_pkt); /* * If we failed here, we might as well return and have a leak * rather than continue and a bugchk */ if (ret != 0) { return (ret); } } /* Tear down the gpadl on the vsp end */ if (net_dev->rx_buf_gpadl_handle) { ret = vmbus_chan_gpadl_disconnect(net_dev->sc->hn_prichan, net_dev->rx_buf_gpadl_handle); /* * If we failed here, we might as well return and have a leak * rather than continue and a bugchk */ if (ret != 0) { return (ret); } net_dev->rx_buf_gpadl_handle = 0; } if (net_dev->rx_buf) { /* Free up the receive buffer */ hyperv_dmamem_free(&net_dev->rxbuf_dma, net_dev->rx_buf); net_dev->rx_buf = NULL; } if (net_dev->rx_sections) { free(net_dev->rx_sections, M_NETVSC); net_dev->rx_sections = NULL; net_dev->rx_section_count = 0; } return (ret); } /* * Net VSC destroy send buffer */ static int hv_nv_destroy_send_buffer(netvsc_dev *net_dev) { nvsp_msg *revoke_pkt; int ret = 0; /* * If we got a section count, it means we received a * send_rx_buf_complete msg * (ie sent nvsp_msg_1_type_send_rx_buf msg) therefore, * we need to send a revoke msg here */ if (net_dev->send_section_size) { /* Send the revoke send buffer */ revoke_pkt = &net_dev->revoke_packet; memset(revoke_pkt, 0, sizeof(nvsp_msg)); revoke_pkt->hdr.msg_type = nvsp_msg_1_type_revoke_send_buf; revoke_pkt->msgs.vers_1_msgs.revoke_send_buf.id = NETVSC_SEND_BUFFER_ID; ret = vmbus_chan_send(net_dev->sc->hn_prichan, VMBUS_CHANPKT_TYPE_INBAND, 0, revoke_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)revoke_pkt); /* * If we failed here, we might as well return and have a leak * rather than continue and a bugchk */ if (ret != 0) { return (ret); } } /* Tear down the gpadl on the vsp end */ if (net_dev->send_buf_gpadl_handle) { ret = vmbus_chan_gpadl_disconnect(net_dev->sc->hn_prichan, net_dev->send_buf_gpadl_handle); /* * If we failed here, we might as well return and have a leak * rather than continue and a bugchk */ if (ret != 0) { return (ret); } net_dev->send_buf_gpadl_handle = 0; } if (net_dev->send_buf) { /* Free up the receive buffer */ hyperv_dmamem_free(&net_dev->txbuf_dma, net_dev->send_buf); net_dev->send_buf = NULL; } if (net_dev->send_section_bitsmap) { free(net_dev->send_section_bitsmap, M_NETVSC); } return (ret); } /* * Attempt to negotiate the caller-specified NVSP version * * For NVSP v2, Server 2008 R2 does not set * init_pkt->msgs.init_msgs.init_compl.negotiated_prot_vers * to the negotiated version, so we cannot rely on that. */ static int hv_nv_negotiate_nvsp_protocol(struct hn_softc *sc, netvsc_dev *net_dev, uint32_t nvsp_ver) { nvsp_msg *init_pkt; int ret; init_pkt = &net_dev->channel_init_packet; memset(init_pkt, 0, sizeof(nvsp_msg)); init_pkt->hdr.msg_type = nvsp_msg_type_init; /* * Specify parameter as the only acceptable protocol version */ init_pkt->msgs.init_msgs.init.p1.protocol_version = nvsp_ver; init_pkt->msgs.init_msgs.init.protocol_version_2 = nvsp_ver; /* Send the init request */ ret = vmbus_chan_send(sc->hn_prichan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt); if (ret != 0) return (-1); sema_wait(&net_dev->channel_init_sema); if (init_pkt->msgs.init_msgs.init_compl.status != nvsp_status_success) return (EINVAL); return (0); } /* * Send NDIS version 2 config packet containing MTU. * * Not valid for NDIS version 1. */ static int hv_nv_send_ndis_config(struct hn_softc *sc, uint32_t mtu) { netvsc_dev *net_dev; nvsp_msg *init_pkt; int ret; net_dev = hv_nv_get_outbound_net_device(sc); if (!net_dev) return (-ENODEV); /* * Set up configuration packet, write MTU * Indicate we are capable of handling VLAN tags */ init_pkt = &net_dev->channel_init_packet; memset(init_pkt, 0, sizeof(nvsp_msg)); init_pkt->hdr.msg_type = nvsp_msg_2_type_send_ndis_config; init_pkt->msgs.vers_2_msgs.send_ndis_config.mtu = mtu; init_pkt-> msgs.vers_2_msgs.send_ndis_config.capabilities.u1.u2.ieee8021q = 1; /* Send the configuration packet */ ret = vmbus_chan_send(sc->hn_prichan, VMBUS_CHANPKT_TYPE_INBAND, 0, init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt); if (ret != 0) return (-EINVAL); return (0); } /* * Net VSC connect to VSP */ static int hv_nv_connect_to_vsp(struct hn_softc *sc) { netvsc_dev *net_dev; nvsp_msg *init_pkt; uint32_t ndis_version; uint32_t protocol_list[] = { NVSP_PROTOCOL_VERSION_1, NVSP_PROTOCOL_VERSION_2, NVSP_PROTOCOL_VERSION_4, NVSP_PROTOCOL_VERSION_5 }; int i; int protocol_number = nitems(protocol_list); int ret = 0; device_t dev = sc->hn_dev; struct ifnet *ifp = sc->hn_ifp; net_dev = hv_nv_get_outbound_net_device(sc); /* * Negotiate the NVSP version. Try the latest NVSP first. */ for (i = protocol_number - 1; i >= 0; i--) { if (hv_nv_negotiate_nvsp_protocol(sc, net_dev, protocol_list[i]) == 0) { net_dev->nvsp_version = protocol_list[i]; if (bootverbose) device_printf(dev, "Netvsc: got version 0x%x\n", net_dev->nvsp_version); break; } } if (i < 0) { if (bootverbose) device_printf(dev, "failed to negotiate a valid " "protocol.\n"); return (EPROTO); } /* * Set the MTU if supported by this NVSP protocol version * This needs to be right after the NVSP init message per Haiyang */ if (net_dev->nvsp_version >= NVSP_PROTOCOL_VERSION_2) ret = hv_nv_send_ndis_config(sc, ifp->if_mtu); /* * Send the NDIS version */ init_pkt = &net_dev->channel_init_packet; memset(init_pkt, 0, sizeof(nvsp_msg)); if (net_dev->nvsp_version <= NVSP_PROTOCOL_VERSION_4) { ndis_version = NDIS_VERSION_6_1; } else { ndis_version = NDIS_VERSION_6_30; } init_pkt->hdr.msg_type = nvsp_msg_1_type_send_ndis_vers; init_pkt->msgs.vers_1_msgs.send_ndis_vers.ndis_major_vers = (ndis_version & 0xFFFF0000) >> 16; init_pkt->msgs.vers_1_msgs.send_ndis_vers.ndis_minor_vers = ndis_version & 0xFFFF; /* Send the init request */ ret = vmbus_chan_send(sc->hn_prichan, VMBUS_CHANPKT_TYPE_INBAND, 0, init_pkt, sizeof(nvsp_msg), (uint64_t)(uintptr_t)init_pkt); if (ret != 0) { goto cleanup; } /* * TODO: BUGBUG - We have to wait for the above msg since the netvsp * uses KMCL which acknowledges packet (completion packet) * since our Vmbus always set the VMBUS_CHANPKT_FLAG_RC flag */ /* sema_wait(&NetVscChannel->channel_init_sema); */ /* Post the big receive buffer to NetVSP */ if (net_dev->nvsp_version <= NVSP_PROTOCOL_VERSION_2) net_dev->rx_buf_size = NETVSC_RECEIVE_BUFFER_SIZE_LEGACY; else net_dev->rx_buf_size = NETVSC_RECEIVE_BUFFER_SIZE; net_dev->send_buf_size = NETVSC_SEND_BUFFER_SIZE; ret = hv_nv_init_rx_buffer_with_net_vsp(sc); if (ret == 0) ret = hv_nv_init_send_buffer_with_net_vsp(sc); cleanup: return (ret); } /* * Net VSC disconnect from VSP */ static void hv_nv_disconnect_from_vsp(netvsc_dev *net_dev) { hv_nv_destroy_rx_buffer(net_dev); hv_nv_destroy_send_buffer(net_dev); } void hv_nv_subchan_attach(struct hv_vmbus_channel *chan, struct hn_rx_ring *rxr) { - KASSERT(rxr->hn_rx_idx == chan->ch_subidx, + KASSERT(rxr->hn_rx_idx == vmbus_chan_subidx(chan), ("chan%u subidx %u, rxr%d mismatch", - chan->ch_id, chan->ch_subidx, rxr->hn_rx_idx)); + vmbus_chan_id(chan), vmbus_chan_subidx(chan), rxr->hn_rx_idx)); vmbus_chan_open(chan, NETVSC_DEVICE_RING_BUFFER_SIZE, NETVSC_DEVICE_RING_BUFFER_SIZE, NULL, 0, hv_nv_on_channel_callback, rxr); } /* * Net VSC on device add * * Callback when the device belonging to this driver is added */ netvsc_dev * hv_nv_on_device_add(struct hn_softc *sc, void *additional_info, struct hn_rx_ring *rxr) { struct hv_vmbus_channel *chan = sc->hn_prichan; netvsc_dev *net_dev; int ret = 0; net_dev = hv_nv_alloc_net_device(sc); if (net_dev == NULL) return NULL; /* Initialize the NetVSC channel extension */ sema_init(&net_dev->channel_init_sema, 0, "netdev_sema"); /* * Open the channel */ - KASSERT(rxr->hn_rx_idx == chan->ch_subidx, + KASSERT(rxr->hn_rx_idx == vmbus_chan_subidx(chan), ("chan%u subidx %u, rxr%d mismatch", - chan->ch_id, chan->ch_subidx, rxr->hn_rx_idx)); + vmbus_chan_id(chan), vmbus_chan_subidx(chan), rxr->hn_rx_idx)); ret = vmbus_chan_open(chan, NETVSC_DEVICE_RING_BUFFER_SIZE, NETVSC_DEVICE_RING_BUFFER_SIZE, NULL, 0, hv_nv_on_channel_callback, rxr); if (ret != 0) goto cleanup; /* * Connect with the NetVsp */ ret = hv_nv_connect_to_vsp(sc); if (ret != 0) goto close; return (net_dev); close: /* Now, we can close the channel safely */ vmbus_chan_close(chan); cleanup: /* * Free the packet buffers on the netvsc device packet queue. * Release other resources. */ sema_destroy(&net_dev->channel_init_sema); free(net_dev, M_NETVSC); return (NULL); } /* * Net VSC on device remove */ int hv_nv_on_device_remove(struct hn_softc *sc, boolean_t destroy_channel) { netvsc_dev *net_dev = sc->net_dev;; /* Stop outbound traffic ie sends and receives completions */ net_dev->destroy = TRUE; hv_nv_disconnect_from_vsp(net_dev); /* At this point, no one should be accessing net_dev except in here */ /* Now, we can close the channel safely */ vmbus_chan_close(sc->hn_prichan); sema_destroy(&net_dev->channel_init_sema); free(net_dev, M_NETVSC); return (0); } /* * Net VSC on send completion */ static void hv_nv_on_send_completion(netvsc_dev *net_dev, struct hv_vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkt) { const nvsp_msg *nvsp_msg_pkt; netvsc_packet *net_vsc_pkt; nvsp_msg_pkt = VMBUS_CHANPKT_CONST_DATA(pkt); if (nvsp_msg_pkt->hdr.msg_type == nvsp_msg_type_init_complete || nvsp_msg_pkt->hdr.msg_type == nvsp_msg_1_type_send_rx_buf_complete || nvsp_msg_pkt->hdr.msg_type == nvsp_msg_1_type_send_send_buf_complete || nvsp_msg_pkt->hdr.msg_type == nvsp_msg5_type_subchannel) { /* Copy the response back */ memcpy(&net_dev->channel_init_packet, nvsp_msg_pkt, sizeof(nvsp_msg)); sema_post(&net_dev->channel_init_sema); } else if (nvsp_msg_pkt->hdr.msg_type == nvsp_msg_1_type_send_rndis_pkt_complete) { /* Get the send context */ net_vsc_pkt = (netvsc_packet *)(unsigned long)pkt->cph_xactid; if (NULL != net_vsc_pkt) { if (net_vsc_pkt->send_buf_section_idx != NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { u_long mask; int idx; idx = net_vsc_pkt->send_buf_section_idx / BITS_PER_LONG; KASSERT(idx < net_dev->bitsmap_words, ("invalid section index %u", net_vsc_pkt->send_buf_section_idx)); mask = 1UL << (net_vsc_pkt->send_buf_section_idx % BITS_PER_LONG); KASSERT(net_dev->send_section_bitsmap[idx] & mask, ("index bitmap 0x%lx, section index %u, " "bitmap idx %d, bitmask 0x%lx", net_dev->send_section_bitsmap[idx], net_vsc_pkt->send_buf_section_idx, idx, mask)); atomic_clear_long( &net_dev->send_section_bitsmap[idx], mask); } /* Notify the layer above us */ net_vsc_pkt->compl.send.on_send_completion(chan, net_vsc_pkt->compl.send.send_completion_context); } } } /* * Net VSC on send * Sends a packet on the specified Hyper-V device. * Returns 0 on success, non-zero on failure. */ int hv_nv_on_send(struct hv_vmbus_channel *chan, netvsc_packet *pkt) { nvsp_msg send_msg; int ret; send_msg.hdr.msg_type = nvsp_msg_1_type_send_rndis_pkt; if (pkt->is_data_pkt) { /* 0 is RMC_DATA */ send_msg.msgs.vers_1_msgs.send_rndis_pkt.chan_type = 0; } else { /* 1 is RMC_CONTROL */ send_msg.msgs.vers_1_msgs.send_rndis_pkt.chan_type = 1; } send_msg.msgs.vers_1_msgs.send_rndis_pkt.send_buf_section_idx = pkt->send_buf_section_idx; send_msg.msgs.vers_1_msgs.send_rndis_pkt.send_buf_section_size = pkt->send_buf_section_size; if (pkt->gpa_cnt) { ret = vmbus_chan_send_sglist(chan, pkt->gpa, pkt->gpa_cnt, &send_msg, sizeof(nvsp_msg), (uint64_t)(uintptr_t)pkt); } else { ret = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, &send_msg, sizeof(nvsp_msg), (uint64_t)(uintptr_t)pkt); } return (ret); } /* * Net VSC on receive * * In the FreeBSD Hyper-V virtual world, this function deals exclusively * with virtual addresses. */ static void hv_nv_on_receive(netvsc_dev *net_dev, struct hn_rx_ring *rxr, struct hv_vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr) { const struct vmbus_chanpkt_rxbuf *pkt; const nvsp_msg *nvsp_msg_pkt; netvsc_packet vsc_pkt; netvsc_packet *net_vsc_pkt = &vsc_pkt; int count = 0; int i = 0; int status = nvsp_status_success; nvsp_msg_pkt = VMBUS_CHANPKT_CONST_DATA(pkthdr); /* Make sure this is a valid nvsp packet */ if (nvsp_msg_pkt->hdr.msg_type != nvsp_msg_1_type_send_rndis_pkt) { if_printf(rxr->hn_ifp, "packet hdr type %u is invalid!\n", nvsp_msg_pkt->hdr.msg_type); return; } pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; if (pkt->cp_rxbuf_id != NETVSC_RECEIVE_BUFFER_ID) { if_printf(rxr->hn_ifp, "rxbuf_id %d is invalid!\n", pkt->cp_rxbuf_id); return; } count = pkt->cp_rxbuf_cnt; /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ for (i = 0; i < count; i++) { net_vsc_pkt->status = nvsp_status_success; net_vsc_pkt->data = ((uint8_t *)net_dev->rx_buf + pkt->cp_rxbuf[i].rb_ofs); net_vsc_pkt->tot_data_buf_len = pkt->cp_rxbuf[i].rb_len; hv_rf_on_receive(net_dev, rxr, net_vsc_pkt); if (net_vsc_pkt->status != nvsp_status_success) { status = nvsp_status_failure; } } /* * Moved completion call back here so that all received * messages (not just data messages) will trigger a response * message back to the host. */ hv_nv_on_receive_completion(chan, pkt->cp_hdr.cph_xactid, status); } /* * Net VSC on receive completion * * Send a receive completion packet to RNDIS device (ie NetVsp) */ static void hv_nv_on_receive_completion(struct hv_vmbus_channel *chan, uint64_t tid, uint32_t status) { nvsp_msg rx_comp_msg; int retries = 0; int ret = 0; rx_comp_msg.hdr.msg_type = nvsp_msg_1_type_send_rndis_pkt_complete; /* Pass in the status */ rx_comp_msg.msgs.vers_1_msgs.send_rndis_pkt_complete.status = status; retry_send_cmplt: /* Send the completion */ ret = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, 0, &rx_comp_msg, sizeof(nvsp_msg), tid); if (ret == 0) { /* success */ /* no-op */ } else if (ret == EAGAIN) { /* no more room... wait a bit and attempt to retry 3 times */ retries++; if (retries < 4) { DELAY(100); goto retry_send_cmplt; } } } /* * Net VSC receiving vRSS send table from VSP */ static void hv_nv_send_table(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) { netvsc_dev *net_dev; const nvsp_msg *nvsp_msg_pkt; int i; uint32_t count; const uint32_t *table; net_dev = hv_nv_get_inbound_net_device(sc); if (!net_dev) return; nvsp_msg_pkt = VMBUS_CHANPKT_CONST_DATA(pkt); if (nvsp_msg_pkt->hdr.msg_type != nvsp_msg5_type_send_indirection_table) { printf("Netvsc: !Warning! receive msg type not " "send_indirection_table. type = %d\n", nvsp_msg_pkt->hdr.msg_type); return; } count = nvsp_msg_pkt->msgs.vers_5_msgs.send_table.count; if (count != VRSS_SEND_TABLE_SIZE) { printf("Netvsc: Received wrong send table size: %u\n", count); return; } table = (const uint32_t *) ((const uint8_t *)&nvsp_msg_pkt->msgs.vers_5_msgs.send_table + nvsp_msg_pkt->msgs.vers_5_msgs.send_table.offset); for (i = 0; i < count; i++) net_dev->vrss_send_table[i] = table[i]; } /* * Net VSC on channel callback */ static void hv_nv_on_channel_callback(struct hv_vmbus_channel *chan, void *xrxr) { struct hn_rx_ring *rxr = xrxr; struct hn_softc *sc = rxr->hn_ifp->if_softc; netvsc_dev *net_dev; void *buffer; int bufferlen = NETVSC_PACKET_SIZE; net_dev = hv_nv_get_inbound_net_device(sc); if (net_dev == NULL) return; buffer = rxr->hn_rdbuf; do { struct vmbus_chanpkt_hdr *pkt = buffer; uint32_t bytes_rxed; int ret; bytes_rxed = bufferlen; ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed); if (ret == 0) { if (bytes_rxed > 0) { switch (pkt->cph_type) { case VMBUS_CHANPKT_TYPE_COMP: hv_nv_on_send_completion(net_dev, chan, pkt); break; case VMBUS_CHANPKT_TYPE_RXBUF: hv_nv_on_receive(net_dev, rxr, chan, pkt); break; case VMBUS_CHANPKT_TYPE_INBAND: hv_nv_send_table(sc, pkt); break; default: if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", pkt->cph_type); break; } } } else if (ret == ENOBUFS) { /* Handle large packet */ if (bufferlen > NETVSC_PACKET_SIZE) { free(buffer, M_NETVSC); buffer = NULL; } /* alloc new buffer */ buffer = malloc(bytes_rxed, M_NETVSC, M_NOWAIT); if (buffer == NULL) { if_printf(rxr->hn_ifp, "hv_cb malloc buffer failed, len=%u\n", bytes_rxed); bufferlen = 0; break; } bufferlen = bytes_rxed; } else { /* No more packets */ break; } } while (1); if (bufferlen > NETVSC_PACKET_SIZE) free(buffer, M_NETVSC); hv_rf_channel_rollup(rxr, rxr->hn_txr); } Index: head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c =================================================================== --- head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c (revision 303070) +++ head/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c (revision 303071) @@ -1,3043 +1,3039 @@ /*- * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2004-2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "hv_net_vsc.h" #include "hv_rndis.h" #include "hv_rndis_filter.h" #include "vmbus_if.h" /* Short for Hyper-V network interface */ #define NETVSC_DEVNAME "hn" /* * It looks like offset 0 of buf is reserved to hold the softc pointer. * The sc pointer evidently not needed, and is not presently populated. * The packet offset is where the netvsc_packet starts in the buffer. */ #define HV_NV_SC_PTR_OFFSET_IN_BUF 0 #define HV_NV_PACKET_OFFSET_IN_BUF 16 /* YYY should get it from the underlying channel */ #define HN_TX_DESC_CNT 512 #define HN_LROENT_CNT_DEF 128 #define HN_RING_CNT_DEF_MAX 8 #define HN_RNDIS_MSG_LEN \ (sizeof(rndis_msg) + \ RNDIS_HASHVAL_PPI_SIZE + \ RNDIS_VLAN_PPI_SIZE + \ RNDIS_TSO_PPI_SIZE + \ RNDIS_CSUM_PPI_SIZE) #define HN_RNDIS_MSG_BOUNDARY PAGE_SIZE #define HN_RNDIS_MSG_ALIGN CACHE_LINE_SIZE #define HN_TX_DATA_BOUNDARY PAGE_SIZE #define HN_TX_DATA_MAXSIZE IP_MAXPACKET #define HN_TX_DATA_SEGSIZE PAGE_SIZE #define HN_TX_DATA_SEGCNT_MAX \ (VMBUS_CHAN_SGLIST_MAX - HV_RF_NUM_TX_RESERVED_PAGE_BUFS) #define HN_DIRECT_TX_SIZE_DEF 128 #define HN_EARLY_TXEOF_THRESH 8 struct hn_txdesc { #ifndef HN_USE_TXDESC_BUFRING SLIST_ENTRY(hn_txdesc) link; #endif struct mbuf *m; struct hn_tx_ring *txr; int refs; uint32_t flags; /* HN_TXD_FLAG_ */ netvsc_packet netvsc_pkt; /* XXX to be removed */ bus_dmamap_t data_dmap; bus_addr_t rndis_msg_paddr; rndis_msg *rndis_msg; bus_dmamap_t rndis_msg_dmap; }; #define HN_TXD_FLAG_ONLIST 0x1 #define HN_TXD_FLAG_DMAMAP 0x2 /* * Only enable UDP checksum offloading when it is on 2012R2 or * later. UDP checksum offloading doesn't work on earlier * Windows releases. */ #define HN_CSUM_ASSIST_WIN8 (CSUM_IP | CSUM_TCP) #define HN_CSUM_ASSIST (CSUM_IP | CSUM_UDP | CSUM_TCP) #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) /* YYY 2*MTU is a bit rough, but should be good enough. */ #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) #define HN_LRO_ACKCNT_DEF 1 /* * Be aware that this sleepable mutex will exhibit WITNESS errors when * certain TCP and ARP code paths are taken. This appears to be a * well-known condition, as all other drivers checked use a sleeping * mutex to protect their transmit paths. * Also Be aware that mutexes do not play well with semaphores, and there * is a conflicting semaphore in a certain channel code path. */ #define NV_LOCK_INIT(_sc, _name) \ mtx_init(&(_sc)->hn_lock, _name, MTX_NETWORK_LOCK, MTX_DEF) #define NV_LOCK(_sc) mtx_lock(&(_sc)->hn_lock) #define NV_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->hn_lock, MA_OWNED) #define NV_UNLOCK(_sc) mtx_unlock(&(_sc)->hn_lock) #define NV_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->hn_lock) /* * Globals */ int hv_promisc_mode = 0; /* normal mode by default */ SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V network interface"); /* Trust tcp segements verification on host side. */ static int hn_trust_hosttcp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, &hn_trust_hosttcp, 0, "Trust tcp segement verification on host side, " "when csum info is missing (global setting)"); /* Trust udp datagrams verification on host side. */ static int hn_trust_hostudp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, &hn_trust_hostudp, 0, "Trust udp datagram verification on host side, " "when csum info is missing (global setting)"); /* Trust ip packets verification on host side. */ static int hn_trust_hostip = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, &hn_trust_hostip, 0, "Trust ip packet verification on host side, " "when csum info is missing (global setting)"); #if __FreeBSD_version >= 1100045 /* Limit TSO burst size */ static int hn_tso_maxlen = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, &hn_tso_maxlen, 0, "TSO burst limit"); #endif /* Limit chimney send size */ static int hn_tx_chimney_size = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, &hn_tx_chimney_size, 0, "Chimney send packet size limit"); /* Limit the size of packet for direct transmission */ static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, &hn_lro_entry_count, 0, "LRO entry count"); #endif #endif static int hn_share_tx_taskq = 0; SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN, &hn_share_tx_taskq, 0, "Enable shared TX taskqueue"); static struct taskqueue *hn_tx_taskq; #ifndef HN_USE_TXDESC_BUFRING static int hn_use_txdesc_bufring = 0; #else static int hn_use_txdesc_bufring = 1; #endif SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); static int hn_bind_tx_taskq = -1; SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN, &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu"); static int hn_use_if_start = 0; SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, &hn_use_if_start, 0, "Use if_start TX method"); static int hn_chan_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, &hn_chan_cnt, 0, "# of channels to use; each channel has one RX ring and one TX ring"); static int hn_tx_ring_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, &hn_tx_ring_cnt, 0, "# of TX rings to use"); static int hn_tx_swq_depth = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); #if __FreeBSD_version >= 1100095 static u_int hn_lro_mbufq_depth = 0; SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); #endif static u_int hn_cpu_index; /* * Forward declarations */ static void hn_stop(hn_softc_t *sc); static void hn_ifinit_locked(hn_softc_t *sc); static void hn_ifinit(void *xsc); static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int hn_start_locked(struct hn_tx_ring *txr, int len); static void hn_start(struct ifnet *ifp); static void hn_start_txeof(struct hn_tx_ring *); static int hn_ifmedia_upd(struct ifnet *ifp); static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); #if __FreeBSD_version >= 1100099 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); static int hn_check_iplen(const struct mbuf *, int); static int hn_create_tx_ring(struct hn_softc *, int); static void hn_destroy_tx_ring(struct hn_tx_ring *); static int hn_create_tx_data(struct hn_softc *, int); static void hn_destroy_tx_data(struct hn_softc *); static void hn_start_taskfunc(void *, int); static void hn_start_txeof_taskfunc(void *, int); static void hn_stop_tx_tasks(struct hn_softc *); static int hn_encap(struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); static void hn_create_rx_data(struct hn_softc *sc, int); static void hn_destroy_rx_data(struct hn_softc *sc); static void hn_set_tx_chimney_size(struct hn_softc *, int); static void hn_channel_attach(struct hn_softc *, struct hv_vmbus_channel *); static void hn_subchan_attach(struct hn_softc *, struct hv_vmbus_channel *); static void hn_subchan_setup(struct hn_softc *); static int hn_transmit(struct ifnet *, struct mbuf *); static void hn_xmit_qflush(struct ifnet *); static int hn_xmit(struct hn_tx_ring *, int); static void hn_xmit_txeof(struct hn_tx_ring *); static void hn_xmit_taskfunc(void *, int); static void hn_xmit_txeof_taskfunc(void *, int); #if __FreeBSD_version >= 1100099 static void hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) { int i; for (i = 0; i < sc->hn_rx_ring_inuse; ++i) sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; } #endif static int hn_get_txswq_depth(const struct hn_tx_ring *txr) { KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); if (hn_tx_swq_depth < txr->hn_txdesc_cnt) return txr->hn_txdesc_cnt; return hn_tx_swq_depth; } static int hn_ifmedia_upd(struct ifnet *ifp __unused) { return EOPNOTSUPP; } static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct hn_softc *sc = ifp->if_softc; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (!sc->hn_carrier) { ifmr->ifm_active |= IFM_NONE; return; } ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_10G_T | IFM_FDX; } /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */ static const struct hyperv_guid g_net_vsc_device_type = { .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46, 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E} }; /* * Standard probe entry point. * */ static int netvsc_probe(device_t dev) { if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &g_net_vsc_device_type) == 0) { device_set_desc(dev, "Hyper-V Network Interface"); return BUS_PROBE_DEFAULT; } return ENXIO; } /* * Standard attach entry point. * * Called when the driver is loaded. It allocates needed resources, * and initializes the "hardware" and software. */ static int netvsc_attach(device_t dev) { netvsc_device_info device_info; hn_softc_t *sc; int unit = device_get_unit(dev); struct ifnet *ifp = NULL; int error, ring_cnt, tx_ring_cnt; #if __FreeBSD_version >= 1100045 int tso_maxlen; #endif sc = device_get_softc(dev); sc->hn_unit = unit; sc->hn_dev = dev; sc->hn_prichan = vmbus_get_channel(dev); if (hn_tx_taskq == NULL) { sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_tx_taskq); if (hn_bind_tx_taskq >= 0) { int cpu = hn_bind_tx_taskq; cpuset_t cpu_set; if (cpu > mp_ncpus - 1) cpu = mp_ncpus - 1; CPU_SETOF(cpu, &cpu_set); taskqueue_start_threads_cpuset(&sc->hn_tx_taskq, 1, PI_NET, &cpu_set, "%s tx", device_get_nameunit(dev)); } else { taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx", device_get_nameunit(dev)); } } else { sc->hn_tx_taskq = hn_tx_taskq; } NV_LOCK_INIT(sc, "NetVSCLock"); ifp = sc->hn_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); /* * Figure out the # of RX rings (ring_cnt) and the # of TX rings * to use (tx_ring_cnt). * * NOTE: * The # of RX rings to use is same as the # of channels to use. */ ring_cnt = hn_chan_cnt; if (ring_cnt <= 0) { /* Default */ ring_cnt = mp_ncpus; if (ring_cnt > HN_RING_CNT_DEF_MAX) ring_cnt = HN_RING_CNT_DEF_MAX; } else if (ring_cnt > mp_ncpus) { ring_cnt = mp_ncpus; } tx_ring_cnt = hn_tx_ring_cnt; if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) tx_ring_cnt = ring_cnt; if (hn_use_if_start) { /* ifnet.if_start only needs one TX ring. */ tx_ring_cnt = 1; } /* * Set the leader CPU for channels. */ sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; error = hn_create_tx_data(sc, tx_ring_cnt); if (error) goto failed; hn_create_rx_data(sc, ring_cnt); /* * Associate the first TX/RX ring w/ the primary channel. */ hn_channel_attach(sc, sc->hn_prichan); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = hn_ioctl; ifp->if_init = hn_ifinit; /* needed by hv_rf_on_device_add() code */ ifp->if_mtu = ETHERMTU; if (hn_use_if_start) { int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); ifp->if_start = hn_start; IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); ifp->if_snd.ifq_drv_maxlen = qdepth - 1; IFQ_SET_READY(&ifp->if_snd); } else { ifp->if_transmit = hn_transmit; ifp->if_qflush = hn_xmit_qflush; } ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); /* XXX ifmedia_set really should do this for us */ sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; /* * Tell upper layers that we support full VLAN capability. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO | IFCAP_LRO; ifp->if_capenable |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO | IFCAP_LRO; ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist | CSUM_TSO; error = hv_rf_on_device_add(sc, &device_info, ring_cnt, &sc->hn_rx_ring[0]); if (error) goto failed; KASSERT(sc->net_dev->num_channel > 0 && sc->net_dev->num_channel <= sc->hn_rx_ring_inuse, ("invalid channel count %u, should be less than %d", sc->net_dev->num_channel, sc->hn_rx_ring_inuse)); /* * Set the # of TX/RX rings that could be used according to * the # of channels that host offered. */ if (sc->hn_tx_ring_inuse > sc->net_dev->num_channel) sc->hn_tx_ring_inuse = sc->net_dev->num_channel; sc->hn_rx_ring_inuse = sc->net_dev->num_channel; device_printf(dev, "%d TX ring, %d RX ring\n", sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); if (sc->net_dev->num_channel > 1) hn_subchan_setup(sc); #if __FreeBSD_version >= 1100099 if (sc->hn_rx_ring_inuse > 1) { /* * Reduce TCP segment aggregation limit for multiple * RX rings to increase ACK timeliness. */ hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); } #endif if (device_info.link_state == 0) { sc->hn_carrier = 1; } #if __FreeBSD_version >= 1100045 tso_maxlen = hn_tso_maxlen; if (tso_maxlen <= 0 || tso_maxlen > IP_MAXPACKET) tso_maxlen = IP_MAXPACKET; ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; ifp->if_hw_tsomaxsegsize = PAGE_SIZE; ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); #endif ether_ifattach(ifp, device_info.mac_addr); #if __FreeBSD_version >= 1100045 if_printf(ifp, "TSO: %u/%u/%u\n", ifp->if_hw_tsomax, ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); #endif sc->hn_tx_chimney_max = sc->net_dev->send_section_size; hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max); if (hn_tx_chimney_size > 0 && hn_tx_chimney_size < sc->hn_tx_chimney_max) hn_set_tx_chimney_size(sc, hn_tx_chimney_size); return (0); failed: hn_destroy_tx_data(sc); if (ifp != NULL) if_free(ifp); return (error); } /* * Standard detach entry point */ static int netvsc_detach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); if (bootverbose) printf("netvsc_detach\n"); /* * XXXKYS: Need to clean up all our * driver state; this is the driver * unloading. */ /* * XXXKYS: Need to stop outgoing traffic and unregister * the netdevice. */ hv_rf_on_device_remove(sc, HV_RF_NV_DESTROY_CHANNEL); hn_stop_tx_tasks(sc); ifmedia_removeall(&sc->hn_media); hn_destroy_rx_data(sc); hn_destroy_tx_data(sc); if (sc->hn_tx_taskq != hn_tx_taskq) taskqueue_free(sc->hn_tx_taskq); return (0); } /* * Standard shutdown entry point */ static int netvsc_shutdown(device_t dev) { return (0); } static __inline int hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) { struct mbuf *m = *m_head; int error; error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { struct mbuf *m_new; m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); if (m_new == NULL) return ENOBUFS; else *m_head = m = m_new; txr->hn_tx_collapsed++; error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); } if (!error) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_PREWRITE); txd->flags |= HN_TXD_FLAG_DMAMAP; } return error; } static __inline void hn_txdesc_dmamap_unload(struct hn_tx_ring *txr, struct hn_txdesc *txd) { if (txd->flags & HN_TXD_FLAG_DMAMAP) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->hn_tx_data_dtag, txd->data_dmap); txd->flags &= ~HN_TXD_FLAG_DMAMAP; } } static __inline int hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, ("put an onlist txd %#x", txd->flags)); KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); if (atomic_fetchadd_int(&txd->refs, -1) != 1) return 0; hn_txdesc_dmamap_unload(txr, txd); if (txd->m != NULL) { m_freem(txd->m); txd->m = NULL; } txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); KASSERT(txr->hn_txdesc_avail >= 0 && txr->hn_txdesc_avail < txr->hn_txdesc_cnt, ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail++; SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); mtx_unlock_spin(&txr->hn_txlist_spin); #else atomic_add_int(&txr->hn_txdesc_avail, 1); buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif return 1; } static __inline struct hn_txdesc * hn_txdesc_get(struct hn_tx_ring *txr) { struct hn_txdesc *txd; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); txd = SLIST_FIRST(&txr->hn_txlist); if (txd != NULL) { KASSERT(txr->hn_txdesc_avail > 0, ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail--; SLIST_REMOVE_HEAD(&txr->hn_txlist, link); } mtx_unlock_spin(&txr->hn_txlist_spin); #else txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); #endif if (txd != NULL) { #ifdef HN_USE_TXDESC_BUFRING atomic_subtract_int(&txr->hn_txdesc_avail, 1); #endif KASSERT(txd->m == NULL && txd->refs == 0 && (txd->flags & HN_TXD_FLAG_ONLIST), ("invalid txd")); txd->flags &= ~HN_TXD_FLAG_ONLIST; txd->refs = 1; } return txd; } static __inline void hn_txdesc_hold(struct hn_txdesc *txd) { /* 0->1 transition will never work */ KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs)); atomic_add_int(&txd->refs, 1); } static __inline void hn_txeof(struct hn_tx_ring *txr) { txr->hn_has_txeof = 0; txr->hn_txeof(txr); } static void hn_tx_done(struct hv_vmbus_channel *chan, void *xpkt) { netvsc_packet *packet = xpkt; struct hn_txdesc *txd; struct hn_tx_ring *txr; txd = (struct hn_txdesc *)(uintptr_t) packet->compl.send.send_completion_tid; txr = txd->txr; KASSERT(txr->hn_chan == chan, - ("channel mismatch, on channel%u, should be channel%u", - chan->ch_subidx, - txr->hn_chan->ch_subidx)); + ("channel mismatch, on chan%u, should be chan%u", + vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan))); txr->hn_has_txeof = 1; hn_txdesc_put(txr, txd); ++txr->hn_txdone_cnt; if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { txr->hn_txdone_cnt = 0; if (txr->hn_oactive) hn_txeof(txr); } } void netvsc_channel_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) { #if defined(INET) || defined(INET6) tcp_lro_flush_all(&rxr->hn_lro); #endif /* * NOTE: * 'txr' could be NULL, if multiple channels and * ifnet.if_start method are enabled. */ if (txr == NULL || !txr->hn_has_txeof) return; txr->hn_txdone_cnt = 0; hn_txeof(txr); } /* * NOTE: * If this function fails, then both txd and m_head0 will be freed. */ static int hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) { bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; int error, nsegs, i; struct mbuf *m_head = *m_head0; netvsc_packet *packet; rndis_msg *rndis_mesg; rndis_packet *rndis_pkt; rndis_per_packet_info *rppi; struct rndis_hash_value *hash_value; uint32_t rndis_msg_size; packet = &txd->netvsc_pkt; packet->is_data_pkt = TRUE; packet->tot_data_buf_len = m_head->m_pkthdr.len; /* * extension points to the area reserved for the * rndis_filter_packet, which is placed just after * the netvsc_packet (and rppi struct, if present; * length is updated later). */ rndis_mesg = txd->rndis_msg; /* XXX not necessary */ memset(rndis_mesg, 0, HN_RNDIS_MSG_LEN); rndis_mesg->ndis_msg_type = REMOTE_NDIS_PACKET_MSG; rndis_pkt = &rndis_mesg->msg.packet; rndis_pkt->data_offset = sizeof(rndis_packet); rndis_pkt->data_length = packet->tot_data_buf_len; rndis_pkt->per_pkt_info_offset = sizeof(rndis_packet); rndis_msg_size = RNDIS_MESSAGE_SIZE(rndis_packet); /* * Set the hash value for this packet, so that the host could * dispatch the TX done event for this packet back to this TX * ring's channel. */ rndis_msg_size += RNDIS_HASHVAL_PPI_SIZE; rppi = hv_set_rppi_data(rndis_mesg, RNDIS_HASHVAL_PPI_SIZE, nbl_hash_value); hash_value = (struct rndis_hash_value *)((uint8_t *)rppi + rppi->per_packet_info_offset); hash_value->hash_value = txr->hn_tx_idx; if (m_head->m_flags & M_VLANTAG) { ndis_8021q_info *rppi_vlan_info; rndis_msg_size += RNDIS_VLAN_PPI_SIZE; rppi = hv_set_rppi_data(rndis_mesg, RNDIS_VLAN_PPI_SIZE, ieee_8021q_info); rppi_vlan_info = (ndis_8021q_info *)((uint8_t *)rppi + rppi->per_packet_info_offset); rppi_vlan_info->u1.s1.vlan_id = m_head->m_pkthdr.ether_vtag & 0xfff; } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { rndis_tcp_tso_info *tso_info; struct ether_vlan_header *eh; int ether_len; /* * XXX need m_pullup and use mtodo */ eh = mtod(m_head, struct ether_vlan_header*); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) ether_len = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ether_len = ETHER_HDR_LEN; rndis_msg_size += RNDIS_TSO_PPI_SIZE; rppi = hv_set_rppi_data(rndis_mesg, RNDIS_TSO_PPI_SIZE, tcp_large_send_info); tso_info = (rndis_tcp_tso_info *)((uint8_t *)rppi + rppi->per_packet_info_offset); tso_info->lso_v2_xmit.type = RNDIS_TCP_LARGE_SEND_OFFLOAD_V2_TYPE; #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { struct ip *ip = (struct ip *)(m_head->m_data + ether_len); unsigned long iph_len = ip->ip_hl << 2; struct tcphdr *th = (struct tcphdr *)((caddr_t)ip + iph_len); tso_info->lso_v2_xmit.ip_version = RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV4; ip->ip_len = 0; ip->ip_sum = 0; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { struct ip6_hdr *ip6 = (struct ip6_hdr *) (m_head->m_data + ether_len); struct tcphdr *th = (struct tcphdr *)(ip6 + 1); tso_info->lso_v2_xmit.ip_version = RNDIS_TCP_LARGE_SEND_OFFLOAD_IPV6; ip6->ip6_plen = 0; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); } #endif tso_info->lso_v2_xmit.tcp_header_offset = 0; tso_info->lso_v2_xmit.mss = m_head->m_pkthdr.tso_segsz; } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { rndis_tcp_ip_csum_info *csum_info; rndis_msg_size += RNDIS_CSUM_PPI_SIZE; rppi = hv_set_rppi_data(rndis_mesg, RNDIS_CSUM_PPI_SIZE, tcpip_chksum_info); csum_info = (rndis_tcp_ip_csum_info *)((uint8_t *)rppi + rppi->per_packet_info_offset); csum_info->xmit.is_ipv4 = 1; if (m_head->m_pkthdr.csum_flags & CSUM_IP) csum_info->xmit.ip_header_csum = 1; if (m_head->m_pkthdr.csum_flags & CSUM_TCP) { csum_info->xmit.tcp_csum = 1; csum_info->xmit.tcp_header_offset = 0; } else if (m_head->m_pkthdr.csum_flags & CSUM_UDP) { csum_info->xmit.udp_csum = 1; } } rndis_mesg->msg_len = packet->tot_data_buf_len + rndis_msg_size; packet->tot_data_buf_len = rndis_mesg->msg_len; /* * Chimney send, if the packet could fit into one chimney buffer. */ if (packet->tot_data_buf_len < txr->hn_tx_chimney_size) { netvsc_dev *net_dev = txr->hn_sc->net_dev; uint32_t send_buf_section_idx; txr->hn_tx_chimney_tried++; send_buf_section_idx = hv_nv_get_next_send_section(net_dev); if (send_buf_section_idx != NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX) { uint8_t *dest = ((uint8_t *)net_dev->send_buf + (send_buf_section_idx * net_dev->send_section_size)); memcpy(dest, rndis_mesg, rndis_msg_size); dest += rndis_msg_size; m_copydata(m_head, 0, m_head->m_pkthdr.len, dest); packet->send_buf_section_idx = send_buf_section_idx; packet->send_buf_section_size = packet->tot_data_buf_len; packet->gpa_cnt = 0; txr->hn_tx_chimney++; goto done; } } error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); if (error) { int freed; /* * This mbuf is not linked w/ the txd yet, so free it now. */ m_freem(m_head); *m_head0 = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon txdma error")); txr->hn_txdma_failed++; if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1); return error; } *m_head0 = m_head; packet->gpa_cnt = nsegs + HV_RF_NUM_TX_RESERVED_PAGE_BUFS; /* send packet with page buffer */ packet->gpa[0].gpa_page = atop(txd->rndis_msg_paddr); packet->gpa[0].gpa_ofs = txd->rndis_msg_paddr & PAGE_MASK; packet->gpa[0].gpa_len = rndis_msg_size; /* * Fill the page buffers with mbuf info starting at index * HV_RF_NUM_TX_RESERVED_PAGE_BUFS. */ for (i = 0; i < nsegs; ++i) { struct vmbus_gpa *gpa = &packet->gpa[ i + HV_RF_NUM_TX_RESERVED_PAGE_BUFS]; gpa->gpa_page = atop(segs[i].ds_addr); gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; gpa->gpa_len = segs[i].ds_len; } packet->send_buf_section_idx = NVSP_1_CHIMNEY_SEND_INVALID_SECTION_INDEX; packet->send_buf_section_size = 0; done: txd->m = m_head; /* Set the completion routine */ packet->compl.send.on_send_completion = hn_tx_done; packet->compl.send.send_completion_context = packet; packet->compl.send.send_completion_tid = (uint64_t)(uintptr_t)txd; return 0; } /* * NOTE: * If this function fails, then txd will be freed, but the mbuf * associated w/ the txd will _not_ be freed. */ static int hn_send_pkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) { int error, send_failed = 0; again: /* * Make sure that txd is not freed before ETHER_BPF_MTAP. */ hn_txdesc_hold(txd); error = hv_nv_on_send(txr->hn_chan, &txd->netvsc_pkt); if (!error) { ETHER_BPF_MTAP(ifp, txd->m); if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if (!hn_use_if_start) { if_inc_counter(ifp, IFCOUNTER_OBYTES, txd->m->m_pkthdr.len); if (txd->m->m_flags & M_MCAST) if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); } txr->hn_pkts++; } hn_txdesc_put(txr, txd); if (__predict_false(error)) { int freed; /* * This should "really rarely" happen. * * XXX Too many RX to be acked or too many sideband * commands to run? Ask netvsc_channel_rollup() * to kick start later. */ txr->hn_has_txeof = 1; if (!send_failed) { txr->hn_send_failed++; send_failed = 1; /* * Try sending again after set hn_has_txeof; * in case that we missed the last * netvsc_channel_rollup(). */ goto again; } if_printf(ifp, "send failed\n"); /* * Caller will perform further processing on the * associated mbuf, so don't free it in hn_txdesc_put(); * only unload it from the DMA map in hn_txdesc_put(), * if it was loaded. */ txd->m = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon send error")); txr->hn_send_failed++; } return error; } /* * Start a transmit of one or more packets */ static int hn_start_locked(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; KASSERT(hn_use_if_start, ("hn_start_locked is called, when if_start is disabled")); KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); mtx_assert(&txr->hn_tx_lock, MA_OWNED); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return 0; while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { struct hn_txdesc *txd; struct mbuf *m_head; int error; IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. */ IFQ_DRV_PREPEND(&ifp->if_snd, m_head); return 1; } txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; IFQ_DRV_PREPEND(&ifp->if_snd, m_head); atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } error = hn_encap(txr, txd, &m_head); if (error) { /* Both txd and m_head are freed */ continue; } error = hn_send_pkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ IFQ_DRV_PREPEND(&ifp->if_snd, m_head); atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } } return 0; } /* * Link up/down notification */ void netvsc_linkstatus_callback(struct hn_softc *sc, uint32_t status) { if (status == 1) { sc->hn_carrier = 1; } else { sc->hn_carrier = 0; } } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. * There should be an equivalent in the kernel mbuf code, * but there does not appear to be one yet. * * Differs from m_append() in that additional mbufs are * allocated with cluster size MJUMPAGESIZE, and filled * accordingly. * * Return 1 if able to complete the job; otherwise 0. */ static int hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space; remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); if (n == NULL) break; n->m_len = min(MJUMPAGESIZE, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len; remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } #if defined(INET) || defined(INET6) static __inline int hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) { #if __FreeBSD_version >= 1100095 if (hn_lro_mbufq_depth) { tcp_lro_queue_mbuf(lc, m); return 0; } #endif return tcp_lro_rx(lc, m, 0); } #endif /* * Called when we receive a data packet from the "wire" on the * specified device * * Note: This is no longer used as a callback */ int netvsc_recv(struct hn_rx_ring *rxr, netvsc_packet *packet, const rndis_tcp_ip_csum_info *csum_info, const struct rndis_hash_info *hash_info, const struct rndis_hash_value *hash_value) { struct ifnet *ifp = rxr->hn_ifp; struct mbuf *m_new; int size, do_lro = 0, do_csum = 1; int hash_type = M_HASHTYPE_OPAQUE_HASH; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) return (0); /* * Bail out if packet contains more data than configured MTU. */ if (packet->tot_data_buf_len > (ifp->if_mtu + ETHER_HDR_LEN)) { return (0); } else if (packet->tot_data_buf_len <= MHLEN) { m_new = m_gethdr(M_NOWAIT, MT_DATA); if (m_new == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (0); } memcpy(mtod(m_new, void *), packet->data, packet->tot_data_buf_len); m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len; rxr->hn_small_pkts++; } else { /* * Get an mbuf with a cluster. For packets 2K or less, * get a standard 2K cluster. For anything larger, get a * 4K cluster. Any buffers larger than 4K can cause problems * if looped around to the Hyper-V TX channel, so avoid them. */ size = MCLBYTES; if (packet->tot_data_buf_len > MCLBYTES) { /* 4096 */ size = MJUMPAGESIZE; } m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) { if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); return (0); } hv_m_append(m_new, packet->tot_data_buf_len, packet->data); } m_new->m_pkthdr.rcvif = ifp; if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0)) do_csum = 0; /* receive side checksum offload */ if (csum_info != NULL) { /* IP csum offload */ if (csum_info->receive.ip_csum_succeeded && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); rxr->hn_csum_ip++; } /* TCP/UDP csum offload */ if ((csum_info->receive.tcp_csum_succeeded || csum_info->receive.udp_csum_succeeded) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; if (csum_info->receive.tcp_csum_succeeded) rxr->hn_csum_tcp++; else rxr->hn_csum_udp++; } if (csum_info->receive.ip_csum_succeeded && csum_info->receive.tcp_csum_succeeded) do_lro = 1; } else { const struct ether_header *eh; uint16_t etype; int hoff; hoff = sizeof(*eh); if (m_new->m_len < hoff) goto skip; eh = mtod(m_new, struct ether_header *); etype = ntohs(eh->ether_type); if (etype == ETHERTYPE_VLAN) { const struct ether_vlan_header *evl; hoff = sizeof(*evl); if (m_new->m_len < hoff) goto skip; evl = mtod(m_new, struct ether_vlan_header *); etype = ntohs(evl->evl_proto); } if (etype == ETHERTYPE_IP) { int pr; pr = hn_check_iplen(m_new, hoff); if (pr == IPPROTO_TCP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_TCP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } do_lro = 1; } else if (pr == IPPROTO_UDP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_UDP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } } else if (pr != IPPROTO_DONE && do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); } } } skip: if ((packet->vlan_tci != 0) && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) { m_new->m_pkthdr.ether_vtag = packet->vlan_tci; m_new->m_flags |= M_VLANTAG; } if (hash_info != NULL && hash_value != NULL) { rxr->hn_rss_pkts++; m_new->m_pkthdr.flowid = hash_value->hash_value; if ((hash_info->hash_info & NDIS_HASH_FUNCTION_MASK) == NDIS_HASH_FUNCTION_TOEPLITZ) { uint32_t type = (hash_info->hash_info & NDIS_HASH_TYPE_MASK); switch (type) { case NDIS_HASH_IPV4: hash_type = M_HASHTYPE_RSS_IPV4; break; case NDIS_HASH_TCP_IPV4: hash_type = M_HASHTYPE_RSS_TCP_IPV4; break; case NDIS_HASH_IPV6: hash_type = M_HASHTYPE_RSS_IPV6; break; case NDIS_HASH_IPV6_EX: hash_type = M_HASHTYPE_RSS_IPV6_EX; break; case NDIS_HASH_TCP_IPV6: hash_type = M_HASHTYPE_RSS_TCP_IPV6; break; case NDIS_HASH_TCP_IPV6_EX: hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; break; } } } else { if (hash_value != NULL) { m_new->m_pkthdr.flowid = hash_value->hash_value; } else { m_new->m_pkthdr.flowid = rxr->hn_rx_idx; hash_type = M_HASHTYPE_OPAQUE; } } M_HASHTYPE_SET(m_new, hash_type); /* * Note: Moved RX completion back to hv_nv_on_receive() so all * messages (not just data messages) will trigger a response. */ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); rxr->hn_pkts++; if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxr->hn_lro; if (lro->lro_cnt) { rxr->hn_lro_tried++; if (hn_lro_rx(lro, m_new) == 0) { /* DONE! */ return 0; } } #endif } /* We're not holding the lock here, so don't release it */ (*ifp->if_input)(ifp, m_new); return (0); } /* * Rules for using sc->temp_unusable: * 1. sc->temp_unusable can only be read or written while holding NV_LOCK() * 2. code reading sc->temp_unusable under NV_LOCK(), and finding * sc->temp_unusable set, must release NV_LOCK() and exit * 3. to retain exclusive control of the interface, * sc->temp_unusable must be set by code before releasing NV_LOCK() * 4. only code setting sc->temp_unusable can clear sc->temp_unusable * 5. code setting sc->temp_unusable must eventually clear sc->temp_unusable */ /* * Standard ioctl entry point. Called when the user wants to configure * the interface. */ static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { hn_softc_t *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data; #ifdef INET struct ifaddr *ifa = (struct ifaddr *)data; #endif netvsc_device_info device_info; int mask, error = 0; int retry_cnt = 500; switch(cmd) { case SIOCSIFADDR: #ifdef INET if (ifa->ifa_addr->sa_family == AF_INET) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) hn_ifinit(sc); arp_ifinit(ifp, ifa); } else #endif error = ether_ioctl(ifp, cmd, data); break; case SIOCSIFMTU: /* Check MTU value change */ if (ifp->if_mtu == ifr->ifr_mtu) break; if (ifr->ifr_mtu > NETVSC_MAX_CONFIGURABLE_MTU) { error = EINVAL; break; } /* Obtain and record requested MTU */ ifp->if_mtu = ifr->ifr_mtu; #if __FreeBSD_version >= 1100099 /* * Make sure that LRO aggregation length limit is still * valid, after the MTU change. */ NV_LOCK(sc); if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); NV_UNLOCK(sc); #endif do { NV_LOCK(sc); if (!sc->temp_unusable) { sc->temp_unusable = TRUE; retry_cnt = -1; } NV_UNLOCK(sc); if (retry_cnt > 0) { retry_cnt--; DELAY(5 * 1000); } } while (retry_cnt > 0); if (retry_cnt == 0) { error = EINVAL; break; } /* We must remove and add back the device to cause the new * MTU to take effect. This includes tearing down, but not * deleting the channel, then bringing it back up. */ error = hv_rf_on_device_remove(sc, HV_RF_NV_RETAIN_CHANNEL); if (error) { NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); break; } /* Wait for subchannels to be destroyed */ vmbus_subchan_drain(sc->hn_prichan); error = hv_rf_on_device_add(sc, &device_info, sc->hn_rx_ring_inuse, &sc->hn_rx_ring[0]); if (error) { NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); break; } KASSERT(sc->hn_rx_ring_cnt == sc->net_dev->num_channel, ("RX ring count %d and channel count %u mismatch", sc->hn_rx_ring_cnt, sc->net_dev->num_channel)); if (sc->net_dev->num_channel > 1) { int r; /* * Skip the rings on primary channel; they are * handled by the hv_rf_on_device_add() above. */ for (r = 1; r < sc->hn_rx_ring_cnt; ++r) { sc->hn_rx_ring[r].hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; } for (r = 1; r < sc->hn_tx_ring_cnt; ++r) { sc->hn_tx_ring[r].hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; } hn_subchan_setup(sc); } sc->hn_tx_chimney_max = sc->net_dev->send_section_size; if (sc->hn_tx_ring[0].hn_tx_chimney_size > sc->hn_tx_chimney_max) hn_set_tx_chimney_size(sc, sc->hn_tx_chimney_max); hn_ifinit_locked(sc); NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); break; case SIOCSIFFLAGS: do { NV_LOCK(sc); if (!sc->temp_unusable) { sc->temp_unusable = TRUE; retry_cnt = -1; } NV_UNLOCK(sc); if (retry_cnt > 0) { retry_cnt--; DELAY(5 * 1000); } } while (retry_cnt > 0); if (retry_cnt == 0) { error = EINVAL; break; } if (ifp->if_flags & IFF_UP) { /* * If only the state of the PROMISC flag changed, * then just use the 'set promisc mode' command * instead of reinitializing the entire NIC. Doing * a full re-init means reloading the firmware and * waiting for it to start up, which may take a * second or two. */ #ifdef notyet /* Fixme: Promiscuous mode? */ if (ifp->if_drv_flags & IFF_DRV_RUNNING && ifp->if_flags & IFF_PROMISC && !(sc->hn_if_flags & IFF_PROMISC)) { /* do something here for Hyper-V */ } else if (ifp->if_drv_flags & IFF_DRV_RUNNING && !(ifp->if_flags & IFF_PROMISC) && sc->hn_if_flags & IFF_PROMISC) { /* do something here for Hyper-V */ } else #endif hn_ifinit_locked(sc); } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { hn_stop(sc); } } NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); sc->hn_if_flags = ifp->if_flags; error = 0; break; case SIOCSIFCAP: NV_LOCK(sc); mask = ifr->ifr_reqcap ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; if (ifp->if_capenable & IFCAP_TXCSUM) { ifp->if_hwassist |= sc->hn_tx_ring[0].hn_csum_assist; } else { ifp->if_hwassist &= ~sc->hn_tx_ring[0].hn_csum_assist; } } if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_IP_TSO; else ifp->if_hwassist &= ~CSUM_IP_TSO; } if (mask & IFCAP_TSO6) { ifp->if_capenable ^= IFCAP_TSO6; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_IP6_TSO; else ifp->if_hwassist &= ~CSUM_IP6_TSO; } NV_UNLOCK(sc); error = 0; break; case SIOCADDMULTI: case SIOCDELMULTI: #ifdef notyet /* Fixme: Multicast mode? */ if (ifp->if_drv_flags & IFF_DRV_RUNNING) { NV_LOCK(sc); netvsc_setmulti(sc); NV_UNLOCK(sc); error = 0; } #endif error = EINVAL; break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } /* * */ static void hn_stop(hn_softc_t *sc) { struct ifnet *ifp; int ret, i; ifp = sc->hn_ifp; if (bootverbose) printf(" Closing Device ...\n"); atomic_clear_int(&ifp->if_drv_flags, (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; if_link_state_change(ifp, LINK_STATE_DOWN); sc->hn_initdone = 0; ret = hv_rf_on_close(sc); } /* * FreeBSD transmit entry point */ static void hn_start(struct ifnet *ifp) { struct hn_softc *sc = ifp->if_softc; struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } static void hn_start_txeof(struct hn_tx_ring *txr) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the OACTIVE earlier, with the hope, that * others could catch up. The task will clear the * flag again with the hn_tx_lock to avoid possible * races. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } /* * */ static void hn_ifinit_locked(hn_softc_t *sc) { struct ifnet *ifp; int ret, i; ifp = sc->hn_ifp; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { return; } hv_promisc_mode = 1; ret = hv_rf_on_open(sc); if (ret != 0) { return; } else { sc->hn_initdone = 1; } atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); if_link_state_change(ifp, LINK_STATE_UP); } /* * */ static void hn_ifinit(void *xsc) { hn_softc_t *sc = xsc; NV_LOCK(sc); if (sc->temp_unusable) { NV_UNLOCK(sc); return; } sc->temp_unusable = TRUE; NV_UNLOCK(sc); hn_ifinit_locked(sc); NV_LOCK(sc); sc->temp_unusable = FALSE; NV_UNLOCK(sc); } #ifdef LATER /* * */ static void hn_watchdog(struct ifnet *ifp) { hn_softc_t *sc; sc = ifp->if_softc; printf("hn%d: watchdog timeout -- resetting\n", sc->hn_unit); hn_ifinit(sc); /*???*/ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } #endif #if __FreeBSD_version >= 1100099 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int lenlim; int error; lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; error = sysctl_handle_int(oidp, &lenlim, 0, req); if (error || req->newptr == NULL) return error; if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || lenlim > TCP_LRO_LENGTH_MAX) return EINVAL; NV_LOCK(sc); hn_set_lro_lenlim(sc, lenlim); NV_UNLOCK(sc); return 0; } static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ackcnt, error, i; /* * lro_ackcnt_lim is append count limit, * +1 to turn it into aggregation limit. */ ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; error = sysctl_handle_int(oidp, &ackcnt, 0, req); if (error || req->newptr == NULL) return error; if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) return EINVAL; /* * Convert aggregation limit back to append * count limit. */ --ackcnt; NV_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_inuse; ++i) sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; NV_UNLOCK(sc); return 0; } #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int hcsum = arg2; int on, error, i; on = 0; if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) on = 1; error = sysctl_handle_int(oidp, &on, 0, req); if (error || req->newptr == NULL) return error; NV_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (on) rxr->hn_trust_hcsum |= hcsum; else rxr->hn_trust_hcsum &= ~hcsum; } NV_UNLOCK(sc); return 0; } static int hn_tx_chimney_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int chimney_size, error; chimney_size = sc->hn_tx_ring[0].hn_tx_chimney_size; error = sysctl_handle_int(oidp, &chimney_size, 0, req); if (error || req->newptr == NULL) return error; if (chimney_size > sc->hn_tx_chimney_max || chimney_size <= 0) return EINVAL; hn_set_tx_chimney_size(sc, chimney_size); return 0; } static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; u_long stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((u_long *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; *((u_long *)((uint8_t *)rxr + ofs)) = 0; } return 0; } static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((uint64_t *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { rxr = &sc->hn_rx_ring[i]; *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; } return 0; } static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_tx_ring *txr; u_long stat; stat = 0; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; stat += *((u_long *)((uint8_t *)txr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; *((u_long *)((uint8_t *)txr + ofs)) = 0; } return 0; } static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error, conf; struct hn_tx_ring *txr; txr = &sc->hn_tx_ring[0]; conf = *((int *)((uint8_t *)txr + ofs)); error = sysctl_handle_int(oidp, &conf, 0, req); if (error || req->newptr == NULL) return error; NV_LOCK(sc); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; *((int *)((uint8_t *)txr + ofs)) = conf; } NV_UNLOCK(sc); return 0; } static int hn_check_iplen(const struct mbuf *m, int hoff) { const struct ip *ip; int len, iphlen, iplen; const struct tcphdr *th; int thoff; /* TCP data offset */ len = hoff + sizeof(struct ip); /* The packet must be at least the size of an IP header. */ if (m->m_pkthdr.len < len) return IPPROTO_DONE; /* The fixed IP header must reside completely in the first mbuf. */ if (m->m_len < len) return IPPROTO_DONE; ip = mtodo(m, hoff); /* Bound check the packet's stated IP header length. */ iphlen = ip->ip_hl << 2; if (iphlen < sizeof(struct ip)) /* minimum header length */ return IPPROTO_DONE; /* The full IP header must reside completely in the one mbuf. */ if (m->m_len < hoff + iphlen) return IPPROTO_DONE; iplen = ntohs(ip->ip_len); /* * Check that the amount of data in the buffers is as * at least much as the IP header would have us expect. */ if (m->m_pkthdr.len < hoff + iplen) return IPPROTO_DONE; /* * Ignore IP fragments. */ if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) return IPPROTO_DONE; /* * The TCP/IP or UDP/IP header must be entirely contained within * the first fragment of a packet. */ switch (ip->ip_p) { case IPPROTO_TCP: if (iplen < iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); thoff = th->th_off << 2; if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + thoff) return IPPROTO_DONE; break; case IPPROTO_UDP: if (iplen < iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; break; default: if (iplen < iphlen) return IPPROTO_DONE; break; } return ip->ip_p; } static void hn_create_rx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; device_t dev = sc->hn_dev; #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 int lroent_cnt; #endif #endif int i; sc->hn_rx_ring_cnt = ring_cnt; sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, M_NETVSC, M_WAITOK | M_ZERO); #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 lroent_cnt = hn_lro_entry_count; if (lroent_cnt < TCP_LRO_ENTRIES) lroent_cnt = TCP_LRO_ENTRIES; device_printf(dev, "LRO: entry count %d\n", lroent_cnt); #endif #endif /* INET || INET6 */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); /* Create dev.hn.UNIT.rx sysctl tree */ sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (hn_trust_hosttcp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; if (hn_trust_hostudp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; if (hn_trust_hostip) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; rxr->hn_ifp = sc->hn_ifp; if (i < sc->hn_tx_ring_cnt) rxr->hn_txr = &sc->hn_tx_ring[i]; rxr->hn_rdbuf = malloc(NETVSC_PACKET_SIZE, M_NETVSC, M_WAITOK); rxr->hn_rx_idx = i; /* * Initialize LRO. */ #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, hn_lro_mbufq_depth); #else tcp_lro_init(&rxr->hn_lro); rxr->hn_lro.ifp = sc->hn_ifp; #endif #if __FreeBSD_version >= 1100099 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; #endif #endif /* INET || INET6 */ if (sc->hn_rx_sysctl_tree != NULL) { char name[16]; /* * Create per RX ring sysctl tree: * dev.hn.UNIT.rx.RINGID */ snprintf(name, sizeof(name), "%d", i); rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (rxr->hn_rx_sysctl_tree != NULL) { SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "packets", CTLFLAG_RW, &rxr->hn_pkts, "# of packets received"); SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "rss_pkts", CTLFLAG_RW, &rxr->hn_rss_pkts, "# of packets w/ RSS info received"); } } } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_queued), hn_rx_stat_u64_sysctl, "LU", "LRO queued"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), hn_rx_stat_u64_sysctl, "LU", "LRO flushed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro_tried), hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); #if __FreeBSD_version >= 1100099 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_lenlim_sysctl, "IU", "Max # of data bytes to be aggregated by LRO"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_ackcnt_sysctl, "I", "Max # of ACKs to be aggregated by LRO"); #endif SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, hn_trust_hcsum_sysctl, "I", "Trust tcp segement verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, hn_trust_hcsum_sysctl, "I", "Trust udp datagram verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, hn_trust_hcsum_sysctl, "I", "Trust ip packet verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_ip), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_tcp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_udp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_trusted), hn_rx_stat_ulong_sysctl, "LU", "# of packets that we trust host's csum verification"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_small_pkts), hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); } static void hn_destroy_rx_data(struct hn_softc *sc) { int i; if (sc->hn_rx_ring_cnt == 0) return; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; #if defined(INET) || defined(INET6) tcp_lro_free(&rxr->hn_lro); #endif free(rxr->hn_rdbuf, M_NETVSC); } free(sc->hn_rx_ring, M_NETVSC); sc->hn_rx_ring = NULL; sc->hn_rx_ring_cnt = 0; sc->hn_rx_ring_inuse = 0; } static int hn_create_tx_ring(struct hn_softc *sc, int id) { struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; device_t dev = sc->hn_dev; bus_dma_tag_t parent_dtag; int error, i; uint32_t version; txr->hn_sc = sc; txr->hn_tx_idx = id; #ifndef HN_USE_TXDESC_BUFRING mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); #endif mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); txr->hn_txdesc_cnt = HN_TX_DESC_CNT; txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, M_NETVSC, M_WAITOK | M_ZERO); #ifndef HN_USE_TXDESC_BUFRING SLIST_INIT(&txr->hn_txlist); #else txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_NETVSC, M_WAITOK, &txr->hn_tx_lock); #endif txr->hn_tx_taskq = sc->hn_tx_taskq; if (hn_use_if_start) { txr->hn_txeof = hn_start_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); } else { int br_depth; txr->hn_txeof = hn_xmit_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); br_depth = hn_get_txswq_depth(txr); txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_NETVSC, M_WAITOK, &txr->hn_tx_lock); } txr->hn_direct_tx_size = hn_direct_tx_size; version = VMBUS_GET_VERSION(device_get_parent(dev), dev); if (version >= VMBUS_VERSION_WIN8_1) { txr->hn_csum_assist = HN_CSUM_ASSIST; } else { txr->hn_csum_assist = HN_CSUM_ASSIST_WIN8; if (id == 0) { device_printf(dev, "bus version %u.%u, " "no UDP checksum offloading\n", VMBUS_VERSION_MAJOR(version), VMBUS_VERSION_MINOR(version)); } } /* * Always schedule transmission instead of trying to do direct * transmission. This one gives the best performance so far. */ txr->hn_sched_tx = 1; parent_dtag = bus_get_dma_tag(dev); /* DMA tag for RNDIS messages. */ error = bus_dma_tag_create(parent_dtag, /* parent */ HN_RNDIS_MSG_ALIGN, /* alignment */ HN_RNDIS_MSG_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_RNDIS_MSG_LEN, /* maxsize */ 1, /* nsegments */ HN_RNDIS_MSG_LEN, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_rndis_dtag); if (error) { device_printf(dev, "failed to create rndis dmatag\n"); return error; } /* DMA tag for data. */ error = bus_dma_tag_create(parent_dtag, /* parent */ 1, /* alignment */ HN_TX_DATA_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_TX_DATA_MAXSIZE, /* maxsize */ HN_TX_DATA_SEGCNT_MAX, /* nsegments */ HN_TX_DATA_SEGSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_data_dtag); if (error) { device_printf(dev, "failed to create data dmatag\n"); return error; } for (i = 0; i < txr->hn_txdesc_cnt; ++i) { struct hn_txdesc *txd = &txr->hn_txdesc[i]; txd->txr = txr; /* * Allocate and load RNDIS messages. */ error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, (void **)&txd->rndis_msg, BUS_DMA_WAITOK | BUS_DMA_COHERENT, &txd->rndis_msg_dmap); if (error) { device_printf(dev, "failed to allocate rndis_msg, %d\n", i); return error; } error = bus_dmamap_load(txr->hn_tx_rndis_dtag, txd->rndis_msg_dmap, txd->rndis_msg, HN_RNDIS_MSG_LEN, hyperv_dma_map_paddr, &txd->rndis_msg_paddr, BUS_DMA_NOWAIT); if (error) { device_printf(dev, "failed to load rndis_msg, %d\n", i); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_msg, txd->rndis_msg_dmap); return error; } /* DMA map for TX data. */ error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, &txd->data_dmap); if (error) { device_printf(dev, "failed to allocate tx data dmamap\n"); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_msg_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_msg, txd->rndis_msg_dmap); return error; } /* All set, put it to list */ txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); #else buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif } txr->hn_txdesc_avail = txr->hn_txdesc_cnt; if (sc->hn_tx_sysctl_tree != NULL) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; char name[16]; /* * Create per TX ring sysctl tree: * dev.hn.UNIT.tx.RINGID */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); snprintf(name, sizeof(name), "%d", id); txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (txr->hn_tx_sysctl_tree != NULL) { child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", CTLFLAG_RD, &txr->hn_txdesc_avail, 0, "# of available TX descs"); if (!hn_use_if_start) { SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", CTLFLAG_RD, &txr->hn_oactive, 0, "over active"); } SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", CTLFLAG_RW, &txr->hn_pkts, "# of packets transmitted"); } } return 0; } static void hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) { struct hn_tx_ring *txr = txd->txr; KASSERT(txd->m == NULL, ("still has mbuf installed")); KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_msg_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_msg, txd->rndis_msg_dmap); bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); } static void hn_destroy_tx_ring(struct hn_tx_ring *txr) { struct hn_txdesc *txd; if (txr->hn_txdesc == NULL) return; #ifndef HN_USE_TXDESC_BUFRING while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) { SLIST_REMOVE_HEAD(&txr->hn_txlist, link); hn_txdesc_dmamap_destroy(txd); } #else mtx_lock(&txr->hn_tx_lock); while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL) hn_txdesc_dmamap_destroy(txd); mtx_unlock(&txr->hn_tx_lock); #endif if (txr->hn_tx_data_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_data_dtag); if (txr->hn_tx_rndis_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); #ifdef HN_USE_TXDESC_BUFRING buf_ring_free(txr->hn_txdesc_br, M_NETVSC); #endif free(txr->hn_txdesc, M_NETVSC); txr->hn_txdesc = NULL; if (txr->hn_mbuf_br != NULL) buf_ring_free(txr->hn_mbuf_br, M_NETVSC); #ifndef HN_USE_TXDESC_BUFRING mtx_destroy(&txr->hn_txlist_spin); #endif mtx_destroy(&txr->hn_tx_lock); } static int hn_create_tx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int i; sc->hn_tx_ring_cnt = ring_cnt; sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, M_NETVSC, M_WAITOK | M_ZERO); ctx = device_get_sysctl_ctx(sc->hn_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); /* Create dev.hn.UNIT.tx sysctl tree */ sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { int error; error = hn_create_tx_ring(sc, i); if (error) return error; } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_no_txdescs), hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_send_failed), hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_txdma_failed), hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_collapsed), hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, "# of total TX descs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", CTLFLAG_RD, &sc->hn_tx_chimney_max, 0, "Chimney send packet size upper boundary"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_tx_chimney_size_sysctl, "I", "Chimney send packet size limit"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_direct_tx_size), hn_tx_conf_int_sysctl, "I", "Size of the packet for direct transmission"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_sched_tx), hn_tx_conf_int_sysctl, "I", "Always schedule transmission " "instead of doing direct transmission"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); return 0; } static void hn_set_tx_chimney_size(struct hn_softc *sc, int chimney_size) { int i; NV_LOCK(sc); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_tx_chimney_size = chimney_size; NV_UNLOCK(sc); } static void hn_destroy_tx_data(struct hn_softc *sc) { int i; if (sc->hn_tx_ring_cnt == 0) return; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) hn_destroy_tx_ring(&sc->hn_tx_ring[i]); free(sc->hn_tx_ring, M_NETVSC); sc->hn_tx_ring = NULL; sc->hn_tx_ring_cnt = 0; sc->hn_tx_ring_inuse = 0; } static void hn_start_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_start_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_stop_tx_tasks(struct hn_softc *sc) { int i; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static int hn_xmit(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; struct mbuf *m_head; mtx_assert(&txr->hn_tx_lock, MA_OWNED); KASSERT(hn_use_if_start == 0, ("hn_xmit is called, when if_start is enabled")); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) return 0; while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { struct hn_txdesc *txd; int error; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); return 1; } txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } error = hn_encap(txr, txd, &m_head); if (error) { /* Both txd and m_head are freed; discard */ drbr_advance(ifp, txr->hn_mbuf_br); continue; } error = hn_send_pkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } /* Sent */ drbr_advance(ifp, txr->hn_mbuf_br); } return 0; } static int hn_transmit(struct ifnet *ifp, struct mbuf *m) { struct hn_softc *sc = ifp->if_softc; struct hn_tx_ring *txr; int error, idx = 0; /* * Select the TX ring based on flowid */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; txr = &sc->hn_tx_ring[idx]; error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); if (error) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); return error; } if (txr->hn_oactive) return 0; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return 0; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); return 0; } static void hn_xmit_qflush(struct ifnet *ifp) { struct hn_softc *sc = ifp->if_softc; int i; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; struct mbuf *m; mtx_lock(&txr->hn_tx_lock); while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) m_freem(m); mtx_unlock(&txr->hn_tx_lock); } if_qflush(ifp); } static void hn_xmit_txeof(struct hn_tx_ring *txr) { if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; txr->hn_oactive = 0; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the oactive earlier, with the hope, that * others could catch up. The task will clear the * oactive again with the hn_tx_lock to avoid possible * races. */ txr->hn_oactive = 0; taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_xmit_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); txr->hn_oactive = 0; hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_channel_attach(struct hn_softc *sc, struct hv_vmbus_channel *chan) { struct hn_rx_ring *rxr; int idx; - idx = chan->ch_subidx; + idx = vmbus_chan_subidx(chan); KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, ("RX ring %d already attached", idx)); rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; if (bootverbose) { if_printf(sc->hn_ifp, "link RX ring %d to channel%u\n", - idx, chan->ch_id); + idx, vmbus_chan_id(chan)); } if (idx < sc->hn_tx_ring_inuse) { struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("TX ring %d already attached", idx)); txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; txr->hn_chan = chan; if (bootverbose) { if_printf(sc->hn_ifp, "link TX ring %d to channel%u\n", - idx, chan->ch_id); + idx, vmbus_chan_id(chan)); } } /* Bind channel to a proper CPU */ vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus); } static void hn_subchan_attach(struct hn_softc *sc, struct hv_vmbus_channel *chan) { - KASSERT(!VMBUS_CHAN_ISPRIMARY(chan), + KASSERT(!vmbus_chan_is_primary(chan), ("subchannel callback on primary channel")); - KASSERT(chan->ch_subidx > 0, - ("invalid channel subidx %u", - chan->ch_subidx)); hn_channel_attach(sc, chan); } static void hn_subchan_setup(struct hn_softc *sc) { struct hv_vmbus_channel **subchans; int subchan_cnt = sc->net_dev->num_channel - 1; int i; /* Wait for sub-channels setup to complete. */ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); /* Attach the sub-channels. */ for (i = 0; i < subchan_cnt; ++i) { struct hv_vmbus_channel *subchan = subchans[i]; /* NOTE: Calling order is critical. */ hn_subchan_attach(sc, subchan); hv_nv_subchan_attach(subchan, - &sc->hn_rx_ring[subchan->ch_subidx]); + &sc->hn_rx_ring[vmbus_chan_subidx(subchan)]); } /* Release the sub-channels */ vmbus_subchan_rel(subchans, subchan_cnt); if_printf(sc->hn_ifp, "%d sub-channels setup done\n", subchan_cnt); } static void hn_tx_taskq_create(void *arg __unused) { if (!hn_share_tx_taskq) return; hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &hn_tx_taskq); if (hn_bind_tx_taskq >= 0) { int cpu = hn_bind_tx_taskq; cpuset_t cpu_set; if (cpu > mp_ncpus - 1) cpu = mp_ncpus - 1; CPU_SETOF(cpu, &cpu_set); taskqueue_start_threads_cpuset(&hn_tx_taskq, 1, PI_NET, &cpu_set, "hn tx"); } else { taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx"); } } SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_FIRST, hn_tx_taskq_create, NULL); static void hn_tx_taskq_destroy(void *arg __unused) { if (hn_tx_taskq != NULL) taskqueue_free(hn_tx_taskq); } SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_FIRST, hn_tx_taskq_destroy, NULL); static device_method_t netvsc_methods[] = { /* Device interface */ DEVMETHOD(device_probe, netvsc_probe), DEVMETHOD(device_attach, netvsc_attach), DEVMETHOD(device_detach, netvsc_detach), DEVMETHOD(device_shutdown, netvsc_shutdown), { 0, 0 } }; static driver_t netvsc_driver = { NETVSC_DEVNAME, netvsc_methods, sizeof(hn_softc_t) }; static devclass_t netvsc_devclass; DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0); MODULE_VERSION(hn, 1); MODULE_DEPEND(hn, vmbus, 1, 1, 1); Index: head/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c =================================================================== --- head/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c (revision 303070) +++ head/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c (revision 303071) @@ -1,2081 +1,2081 @@ /*- * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /** * StorVSC driver for Hyper-V. This driver presents a SCSI HBA interface * to the Comman Access Method (CAM) layer. CAM control blocks (CCBs) are * converted into VSCSI protocol messages which are delivered to the parent * partition StorVSP driver over the Hyper-V VMBUS. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "hv_vstorage.h" #include "vmbus_if.h" #define STORVSC_RINGBUFFER_SIZE (20*PAGE_SIZE) #define STORVSC_MAX_LUNS_PER_TARGET (64) #define STORVSC_MAX_IO_REQUESTS (STORVSC_MAX_LUNS_PER_TARGET * 2) #define BLKVSC_MAX_IDE_DISKS_PER_TARGET (1) #define BLKVSC_MAX_IO_REQUESTS STORVSC_MAX_IO_REQUESTS #define STORVSC_MAX_TARGETS (2) #define VSTOR_PKT_SIZE (sizeof(struct vstor_packet) - vmscsi_size_delta) #define HV_ALIGN(x, a) roundup2(x, a) struct storvsc_softc; struct hv_sgl_node { LIST_ENTRY(hv_sgl_node) link; struct sglist *sgl_data; }; struct hv_sgl_page_pool{ LIST_HEAD(, hv_sgl_node) in_use_sgl_list; LIST_HEAD(, hv_sgl_node) free_sgl_list; boolean_t is_init; } g_hv_sgl_page_pool; #define STORVSC_MAX_SG_PAGE_CNT STORVSC_MAX_IO_REQUESTS * VMBUS_CHAN_PRPLIST_MAX enum storvsc_request_type { WRITE_TYPE, READ_TYPE, UNKNOWN_TYPE }; struct hvs_gpa_range { struct vmbus_gpa_range gpa_range; uint64_t gpa_page[VMBUS_CHAN_PRPLIST_MAX]; } __packed; struct hv_storvsc_request { LIST_ENTRY(hv_storvsc_request) link; struct vstor_packet vstor_packet; int prp_cnt; struct hvs_gpa_range prp_list; void *sense_data; uint8_t sense_info_len; uint8_t retries; union ccb *ccb; struct storvsc_softc *softc; struct callout callout; struct sema synch_sema; /*Synchronize the request/response if needed */ struct sglist *bounce_sgl; unsigned int bounce_sgl_count; uint64_t not_aligned_seg_bits; }; struct storvsc_softc { struct hv_vmbus_channel *hs_chan; LIST_HEAD(, hv_storvsc_request) hs_free_list; struct mtx hs_lock; struct storvsc_driver_props *hs_drv_props; int hs_unit; uint32_t hs_frozen; struct cam_sim *hs_sim; struct cam_path *hs_path; uint32_t hs_num_out_reqs; boolean_t hs_destroy; boolean_t hs_drain_notify; struct sema hs_drain_sema; struct hv_storvsc_request hs_init_req; struct hv_storvsc_request hs_reset_req; device_t hs_dev; struct hv_vmbus_channel *hs_cpu2chan[MAXCPU]; }; /** * HyperV storvsc timeout testing cases: * a. IO returned after first timeout; * b. IO returned after second timeout and queue freeze; * c. IO returned while timer handler is running * The first can be tested by "sg_senddiag -vv /dev/daX", * and the second and third can be done by * "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX". */ #define HVS_TIMEOUT_TEST 0 /* * Bus/adapter reset functionality on the Hyper-V host is * buggy and it will be disabled until * it can be further tested. */ #define HVS_HOST_RESET 0 struct storvsc_driver_props { char *drv_name; char *drv_desc; uint8_t drv_max_luns_per_target; uint8_t drv_max_ios_per_target; uint32_t drv_ringbuffer_size; }; enum hv_storage_type { DRIVER_BLKVSC, DRIVER_STORVSC, DRIVER_UNKNOWN }; #define HS_MAX_ADAPTERS 10 #define HV_STORAGE_SUPPORTS_MULTI_CHANNEL 0x1 /* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */ static const struct hyperv_guid gStorVscDeviceType={ .hv_guid = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f} }; /* {32412632-86cb-44a2-9b5c-50d1417354f5} */ static const struct hyperv_guid gBlkVscDeviceType={ .hv_guid = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44, 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5} }; static struct storvsc_driver_props g_drv_props_table[] = { {"blkvsc", "Hyper-V IDE Storage Interface", BLKVSC_MAX_IDE_DISKS_PER_TARGET, BLKVSC_MAX_IO_REQUESTS, STORVSC_RINGBUFFER_SIZE}, {"storvsc", "Hyper-V SCSI Storage Interface", STORVSC_MAX_LUNS_PER_TARGET, STORVSC_MAX_IO_REQUESTS, STORVSC_RINGBUFFER_SIZE} }; /* * Sense buffer size changed in win8; have a run-time * variable to track the size we should use. */ static int sense_buffer_size = PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE; /* * The size of the vmscsi_request has changed in win8. The * additional size is for the newly added elements in the * structure. These elements are valid only when we are talking * to a win8 host. * Track the correct size we need to apply. */ static int vmscsi_size_delta; /* * The storage protocol version is determined during the * initial exchange with the host. It will indicate which * storage functionality is available in the host. */ static int vmstor_proto_version; struct vmstor_proto { int proto_version; int sense_buffer_size; int vmscsi_size_delta; }; static const struct vmstor_proto vmstor_proto_list[] = { { VMSTOR_PROTOCOL_VERSION_WIN10, POST_WIN7_STORVSC_SENSE_BUFFER_SIZE, 0 }, { VMSTOR_PROTOCOL_VERSION_WIN8_1, POST_WIN7_STORVSC_SENSE_BUFFER_SIZE, 0 }, { VMSTOR_PROTOCOL_VERSION_WIN8, POST_WIN7_STORVSC_SENSE_BUFFER_SIZE, 0 }, { VMSTOR_PROTOCOL_VERSION_WIN7, PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE, sizeof(struct vmscsi_win8_extension), }, { VMSTOR_PROTOCOL_VERSION_WIN6, PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE, sizeof(struct vmscsi_win8_extension), } }; /* static functions */ static int storvsc_probe(device_t dev); static int storvsc_attach(device_t dev); static int storvsc_detach(device_t dev); static void storvsc_poll(struct cam_sim * sim); static void storvsc_action(struct cam_sim * sim, union ccb * ccb); static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp); static enum hv_storage_type storvsc_get_storage_type(device_t dev); static void hv_storvsc_rescan_target(struct storvsc_softc *sc); static void hv_storvsc_on_channel_callback(struct hv_vmbus_channel *chan, void *xsc); static void hv_storvsc_on_iocompletion( struct storvsc_softc *sc, struct vstor_packet *vstor_packet, struct hv_storvsc_request *request); static int hv_storvsc_connect_vsp(struct storvsc_softc *); static void storvsc_io_done(struct hv_storvsc_request *reqp); static void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, bus_dma_segment_t *orig_sgl, unsigned int orig_sgl_count, uint64_t seg_bits); void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, unsigned int dest_sgl_count, struct sglist* src_sgl, uint64_t seg_bits); static device_method_t storvsc_methods[] = { /* Device interface */ DEVMETHOD(device_probe, storvsc_probe), DEVMETHOD(device_attach, storvsc_attach), DEVMETHOD(device_detach, storvsc_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD_END }; static driver_t storvsc_driver = { "storvsc", storvsc_methods, sizeof(struct storvsc_softc), }; static devclass_t storvsc_devclass; DRIVER_MODULE(storvsc, vmbus, storvsc_driver, storvsc_devclass, 0, 0); MODULE_VERSION(storvsc, 1); MODULE_DEPEND(storvsc, vmbus, 1, 1, 1); static void storvsc_subchan_attach(struct storvsc_softc *sc, struct hv_vmbus_channel *new_channel) { struct vmstor_chan_props props; int ret = 0; memset(&props, 0, sizeof(props)); vmbus_chan_cpu_rr(new_channel); ret = vmbus_chan_open(new_channel, sc->hs_drv_props->drv_ringbuffer_size, sc->hs_drv_props->drv_ringbuffer_size, (void *)&props, sizeof(struct vmstor_chan_props), hv_storvsc_on_channel_callback, sc); } /** * @brief Send multi-channel creation request to host * * @param device a Hyper-V device pointer * @param max_chans the max channels supported by vmbus */ static void storvsc_send_multichannel_request(struct storvsc_softc *sc, int max_chans) { struct hv_vmbus_channel **subchan; struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; int request_channels_cnt = 0; int ret, i; /* get multichannels count that need to create */ request_channels_cnt = MIN(max_chans, mp_ncpus); request = &sc->hs_init_req; /* request the host to create multi-channel */ memset(request, 0, sizeof(struct hv_storvsc_request)); sema_init(&request->synch_sema, 0, ("stor_synch_sema")); vstor_packet = &request->vstor_packet; vstor_packet->operation = VSTOR_OPERATION_CREATE_MULTI_CHANNELS; vstor_packet->flags = REQUEST_COMPLETION_FLAG; vstor_packet->u.multi_channels_cnt = request_channels_cnt; ret = vmbus_chan_send(sc->hs_chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); /* wait for 5 seconds */ ret = sema_timedwait(&request->synch_sema, 5 * hz); if (ret != 0) { printf("Storvsc_error: create multi-channel timeout, %d\n", ret); return; } if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || vstor_packet->status != 0) { printf("Storvsc_error: create multi-channel invalid operation " "(%d) or statue (%u)\n", vstor_packet->operation, vstor_packet->status); return; } /* Wait for sub-channels setup to complete. */ subchan = vmbus_subchan_get(sc->hs_chan, request_channels_cnt); /* Attach the sub-channels. */ for (i = 0; i < request_channels_cnt; ++i) storvsc_subchan_attach(sc, subchan[i]); /* Release the sub-channels. */ vmbus_subchan_rel(subchan, request_channels_cnt); if (bootverbose) printf("Storvsc create multi-channel success!\n"); } /** * @brief initialize channel connection to parent partition * * @param dev a Hyper-V device pointer * @returns 0 on success, non-zero error on failure */ static int hv_storvsc_channel_init(struct storvsc_softc *sc) { int ret = 0, i; struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; uint16_t max_chans = 0; boolean_t support_multichannel = FALSE; uint32_t version; max_chans = 0; support_multichannel = FALSE; request = &sc->hs_init_req; memset(request, 0, sizeof(struct hv_storvsc_request)); vstor_packet = &request->vstor_packet; request->softc = sc; /** * Initiate the vsc/vsp initialization protocol on the open channel */ sema_init(&request->synch_sema, 0, ("stor_synch_sema")); vstor_packet->operation = VSTOR_OPERATION_BEGININITIALIZATION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; ret = vmbus_chan_send(sc->hs_chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); if (ret != 0) goto cleanup; /* wait 5 seconds */ ret = sema_timedwait(&request->synch_sema, 5 * hz); if (ret != 0) goto cleanup; if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || vstor_packet->status != 0) { goto cleanup; } for (i = 0; i < nitems(vmstor_proto_list); i++) { /* reuse the packet for version range supported */ memset(vstor_packet, 0, sizeof(struct vstor_packet)); vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; vstor_packet->u.version.major_minor = vmstor_proto_list[i].proto_version; /* revision is only significant for Windows guests */ vstor_packet->u.version.revision = 0; ret = vmbus_chan_send(sc->hs_chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); if (ret != 0) goto cleanup; /* wait 5 seconds */ ret = sema_timedwait(&request->synch_sema, 5 * hz); if (ret) goto cleanup; if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO) { ret = EINVAL; goto cleanup; } if (vstor_packet->status == 0) { vmstor_proto_version = vmstor_proto_list[i].proto_version; sense_buffer_size = vmstor_proto_list[i].sense_buffer_size; vmscsi_size_delta = vmstor_proto_list[i].vmscsi_size_delta; break; } } if (vstor_packet->status != 0) { ret = EINVAL; goto cleanup; } /** * Query channel properties */ memset(vstor_packet, 0, sizeof(struct vstor_packet)); vstor_packet->operation = VSTOR_OPERATION_QUERYPROPERTIES; vstor_packet->flags = REQUEST_COMPLETION_FLAG; ret = vmbus_chan_send(sc->hs_chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); if ( ret != 0) goto cleanup; /* wait 5 seconds */ ret = sema_timedwait(&request->synch_sema, 5 * hz); if (ret != 0) goto cleanup; /* TODO: Check returned version */ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || vstor_packet->status != 0) { goto cleanup; } /* multi-channels feature is supported by WIN8 and above version */ max_chans = vstor_packet->u.chan_props.max_channel_cnt; version = VMBUS_GET_VERSION(device_get_parent(sc->hs_dev), sc->hs_dev); if (version != VMBUS_VERSION_WIN7 && version != VMBUS_VERSION_WS2008 && (vstor_packet->u.chan_props.flags & HV_STORAGE_SUPPORTS_MULTI_CHANNEL)) { support_multichannel = TRUE; } memset(vstor_packet, 0, sizeof(struct vstor_packet)); vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; ret = vmbus_chan_send(sc->hs_chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); if (ret != 0) { goto cleanup; } /* wait 5 seconds */ ret = sema_timedwait(&request->synch_sema, 5 * hz); if (ret != 0) goto cleanup; if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || vstor_packet->status != 0) goto cleanup; /* * If multi-channel is supported, send multichannel create * request to host. */ if (support_multichannel) storvsc_send_multichannel_request(sc, max_chans); cleanup: sema_destroy(&request->synch_sema); return (ret); } /** * @brief Open channel connection to paraent partition StorVSP driver * * Open and initialize channel connection to parent partition StorVSP driver. * * @param pointer to a Hyper-V device * @returns 0 on success, non-zero error on failure */ static int hv_storvsc_connect_vsp(struct storvsc_softc *sc) { int ret = 0; struct vmstor_chan_props props; memset(&props, 0, sizeof(struct vmstor_chan_props)); /* * Open the channel */ vmbus_chan_cpu_rr(sc->hs_chan); ret = vmbus_chan_open( sc->hs_chan, sc->hs_drv_props->drv_ringbuffer_size, sc->hs_drv_props->drv_ringbuffer_size, (void *)&props, sizeof(struct vmstor_chan_props), hv_storvsc_on_channel_callback, sc); if (ret != 0) { return ret; } ret = hv_storvsc_channel_init(sc); return (ret); } #if HVS_HOST_RESET static int hv_storvsc_host_reset(struct storvsc_softc *sc) { int ret = 0; struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; request = &sc->hs_reset_req; request->softc = sc; vstor_packet = &request->vstor_packet; sema_init(&request->synch_sema, 0, "stor synch sema"); vstor_packet->operation = VSTOR_OPERATION_RESETBUS; vstor_packet->flags = REQUEST_COMPLETION_FLAG; ret = vmbus_chan_send(dev->channel, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)&sc->hs_reset_req); if (ret != 0) { goto cleanup; } ret = sema_timedwait(&request->synch_sema, 5 * hz); /* KYS 5 seconds */ if (ret) { goto cleanup; } /* * At this point, all outstanding requests in the adapter * should have been flushed out and return to us */ cleanup: sema_destroy(&request->synch_sema); return (ret); } #endif /* HVS_HOST_RESET */ /** * @brief Function to initiate an I/O request * * @param device Hyper-V device pointer * @param request pointer to a request structure * @returns 0 on success, non-zero error on failure */ static int hv_storvsc_io_request(struct storvsc_softc *sc, struct hv_storvsc_request *request) { struct vstor_packet *vstor_packet = &request->vstor_packet; struct hv_vmbus_channel* outgoing_channel = NULL; int ret = 0; vstor_packet->flags |= REQUEST_COMPLETION_FLAG; vstor_packet->u.vm_srb.length = VSTOR_PKT_SIZE; vstor_packet->u.vm_srb.sense_info_len = sense_buffer_size; vstor_packet->u.vm_srb.transfer_len = request->prp_list.gpa_range.gpa_len; vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB; outgoing_channel = sc->hs_cpu2chan[curcpu]; mtx_unlock(&request->softc->hs_lock); if (request->prp_list.gpa_range.gpa_len) { ret = vmbus_chan_send_prplist(outgoing_channel, &request->prp_list.gpa_range, request->prp_cnt, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); } else { ret = vmbus_chan_send(outgoing_channel, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); } mtx_lock(&request->softc->hs_lock); if (ret != 0) { printf("Unable to send packet %p ret %d", vstor_packet, ret); } else { atomic_add_int(&sc->hs_num_out_reqs, 1); } return (ret); } /** * Process IO_COMPLETION_OPERATION and ready * the result to be completed for upper layer * processing by the CAM layer. */ static void hv_storvsc_on_iocompletion(struct storvsc_softc *sc, struct vstor_packet *vstor_packet, struct hv_storvsc_request *request) { struct vmscsi_req *vm_srb; vm_srb = &vstor_packet->u.vm_srb; /* * Copy some fields of the host's response into the request structure, * because the fields will be used later in storvsc_io_done(). */ request->vstor_packet.u.vm_srb.scsi_status = vm_srb->scsi_status; request->vstor_packet.u.vm_srb.transfer_len = vm_srb->transfer_len; if (((vm_srb->scsi_status & 0xFF) == SCSI_STATUS_CHECK_COND) && (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID)) { /* Autosense data available */ KASSERT(vm_srb->sense_info_len <= request->sense_info_len, ("vm_srb->sense_info_len <= " "request->sense_info_len")); memcpy(request->sense_data, vm_srb->u.sense_data, vm_srb->sense_info_len); request->sense_info_len = vm_srb->sense_info_len; } /* Complete request by passing to the CAM layer */ storvsc_io_done(request); atomic_subtract_int(&sc->hs_num_out_reqs, 1); if (sc->hs_drain_notify && (sc->hs_num_out_reqs == 0)) { sema_post(&sc->hs_drain_sema); } } static void hv_storvsc_rescan_target(struct storvsc_softc *sc) { path_id_t pathid; target_id_t targetid; union ccb *ccb; pathid = cam_sim_path(sc->hs_sim); targetid = CAM_TARGET_WILDCARD; /* * Allocate a CCB and schedule a rescan. */ ccb = xpt_alloc_ccb_nowait(); if (ccb == NULL) { printf("unable to alloc CCB for rescan\n"); return; } if (xpt_create_path(&ccb->ccb_h.path, NULL, pathid, targetid, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { printf("unable to create path for rescan, pathid: %u," "targetid: %u\n", pathid, targetid); xpt_free_ccb(ccb); return; } if (targetid == CAM_TARGET_WILDCARD) ccb->ccb_h.func_code = XPT_SCAN_BUS; else ccb->ccb_h.func_code = XPT_SCAN_TGT; xpt_rescan(ccb); } static void hv_storvsc_on_channel_callback(struct hv_vmbus_channel *channel, void *xsc) { int ret = 0; struct storvsc_softc *sc = xsc; uint32_t bytes_recvd; uint64_t request_id; uint8_t packet[roundup2(sizeof(struct vstor_packet), 8)]; struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; bytes_recvd = roundup2(VSTOR_PKT_SIZE, 8); ret = vmbus_chan_recv(channel, packet, &bytes_recvd, &request_id); KASSERT(ret != ENOBUFS, ("storvsc recvbuf is not large enough")); /* XXX check bytes_recvd to make sure that it contains enough data */ while ((ret == 0) && (bytes_recvd > 0)) { request = (struct hv_storvsc_request *)(uintptr_t)request_id; if ((request == &sc->hs_init_req) || (request == &sc->hs_reset_req)) { memcpy(&request->vstor_packet, packet, sizeof(struct vstor_packet)); sema_post(&request->synch_sema); } else { vstor_packet = (struct vstor_packet *)packet; switch(vstor_packet->operation) { case VSTOR_OPERATION_COMPLETEIO: if (request == NULL) panic("VMBUS: storvsc received a " "packet with NULL request id in " "COMPLETEIO operation."); hv_storvsc_on_iocompletion(sc, vstor_packet, request); break; case VSTOR_OPERATION_REMOVEDEVICE: printf("VMBUS: storvsc operation %d not " "implemented.\n", vstor_packet->operation); /* TODO: implement */ break; case VSTOR_OPERATION_ENUMERATE_BUS: hv_storvsc_rescan_target(sc); break; default: break; } } bytes_recvd = roundup2(VSTOR_PKT_SIZE, 8), ret = vmbus_chan_recv(channel, packet, &bytes_recvd, &request_id); KASSERT(ret != ENOBUFS, ("storvsc recvbuf is not large enough")); /* * XXX check bytes_recvd to make sure that it contains * enough data */ } } /** * @brief StorVSC probe function * * Device probe function. Returns 0 if the input device is a StorVSC * device. Otherwise, a ENXIO is returned. If the input device is * for BlkVSC (paravirtual IDE) device and this support is disabled in * favor of the emulated ATA/IDE device, return ENXIO. * * @param a device * @returns 0 on success, ENXIO if not a matcing StorVSC device */ static int storvsc_probe(device_t dev) { int ata_disk_enable = 0; int ret = ENXIO; switch (storvsc_get_storage_type(dev)) { case DRIVER_BLKVSC: if(bootverbose) device_printf(dev, "DRIVER_BLKVSC-Emulated ATA/IDE probe\n"); if (!getenv_int("hw.ata.disk_enable", &ata_disk_enable)) { if(bootverbose) device_printf(dev, "Enlightened ATA/IDE detected\n"); device_set_desc(dev, g_drv_props_table[DRIVER_BLKVSC].drv_desc); ret = BUS_PROBE_DEFAULT; } else if(bootverbose) device_printf(dev, "Emulated ATA/IDE set (hw.ata.disk_enable set)\n"); break; case DRIVER_STORVSC: if(bootverbose) device_printf(dev, "Enlightened SCSI device detected\n"); device_set_desc(dev, g_drv_props_table[DRIVER_STORVSC].drv_desc); ret = BUS_PROBE_DEFAULT; break; default: ret = ENXIO; } return (ret); } static void storvsc_create_cpu2chan(struct storvsc_softc *sc) { int cpu; CPU_FOREACH(cpu) { sc->hs_cpu2chan[cpu] = vmbus_chan_cpu2chan(sc->hs_chan, cpu); if (bootverbose) { device_printf(sc->hs_dev, "cpu%d -> chan%u\n", - cpu, sc->hs_cpu2chan[cpu]->ch_id); + cpu, vmbus_chan_id(sc->hs_cpu2chan[cpu])); } } } /** * @brief StorVSC attach function * * Function responsible for allocating per-device structures, * setting up CAM interfaces and scanning for available LUNs to * be used for SCSI device peripherals. * * @param a device * @returns 0 on success or an error on failure */ static int storvsc_attach(device_t dev) { enum hv_storage_type stor_type; struct storvsc_softc *sc; struct cam_devq *devq; int ret, i, j; struct hv_storvsc_request *reqp; struct root_hold_token *root_mount_token = NULL; struct hv_sgl_node *sgl_node = NULL; void *tmp_buff = NULL; /* * We need to serialize storvsc attach calls. */ root_mount_token = root_mount_hold("storvsc"); sc = device_get_softc(dev); sc->hs_chan = vmbus_get_channel(dev); stor_type = storvsc_get_storage_type(dev); if (stor_type == DRIVER_UNKNOWN) { ret = ENODEV; goto cleanup; } /* fill in driver specific properties */ sc->hs_drv_props = &g_drv_props_table[stor_type]; /* fill in device specific properties */ sc->hs_unit = device_get_unit(dev); sc->hs_dev = dev; LIST_INIT(&sc->hs_free_list); mtx_init(&sc->hs_lock, "hvslck", NULL, MTX_DEF); for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; ++i) { reqp = malloc(sizeof(struct hv_storvsc_request), M_DEVBUF, M_WAITOK|M_ZERO); reqp->softc = sc; LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link); } /* create sg-list page pool */ if (FALSE == g_hv_sgl_page_pool.is_init) { g_hv_sgl_page_pool.is_init = TRUE; LIST_INIT(&g_hv_sgl_page_pool.in_use_sgl_list); LIST_INIT(&g_hv_sgl_page_pool.free_sgl_list); /* * Pre-create SG list, each SG list with * VMBUS_CHAN_PRPLIST_MAX segments, each * segment has one page buffer */ for (i = 0; i < STORVSC_MAX_IO_REQUESTS; i++) { sgl_node = malloc(sizeof(struct hv_sgl_node), M_DEVBUF, M_WAITOK|M_ZERO); sgl_node->sgl_data = sglist_alloc(VMBUS_CHAN_PRPLIST_MAX, M_WAITOK|M_ZERO); for (j = 0; j < VMBUS_CHAN_PRPLIST_MAX; j++) { tmp_buff = malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK|M_ZERO); sgl_node->sgl_data->sg_segs[j].ss_paddr = (vm_paddr_t)tmp_buff; } LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); } } sc->hs_destroy = FALSE; sc->hs_drain_notify = FALSE; sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema"); ret = hv_storvsc_connect_vsp(sc); if (ret != 0) { goto cleanup; } /* Construct cpu to channel mapping */ storvsc_create_cpu2chan(sc); /* * Create the device queue. * Hyper-V maps each target to one SCSI HBA */ devq = cam_simq_alloc(sc->hs_drv_props->drv_max_ios_per_target); if (devq == NULL) { device_printf(dev, "Failed to alloc device queue\n"); ret = ENOMEM; goto cleanup; } sc->hs_sim = cam_sim_alloc(storvsc_action, storvsc_poll, sc->hs_drv_props->drv_name, sc, sc->hs_unit, &sc->hs_lock, 1, sc->hs_drv_props->drv_max_ios_per_target, devq); if (sc->hs_sim == NULL) { device_printf(dev, "Failed to alloc sim\n"); cam_simq_free(devq); ret = ENOMEM; goto cleanup; } mtx_lock(&sc->hs_lock); /* bus_id is set to 0, need to get it from VMBUS channel query? */ if (xpt_bus_register(sc->hs_sim, dev, 0) != CAM_SUCCESS) { cam_sim_free(sc->hs_sim, /*free_devq*/TRUE); mtx_unlock(&sc->hs_lock); device_printf(dev, "Unable to register SCSI bus\n"); ret = ENXIO; goto cleanup; } if (xpt_create_path(&sc->hs_path, /*periph*/NULL, cam_sim_path(sc->hs_sim), CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { xpt_bus_deregister(cam_sim_path(sc->hs_sim)); cam_sim_free(sc->hs_sim, /*free_devq*/TRUE); mtx_unlock(&sc->hs_lock); device_printf(dev, "Unable to create path\n"); ret = ENXIO; goto cleanup; } mtx_unlock(&sc->hs_lock); root_mount_rel(root_mount_token); return (0); cleanup: root_mount_rel(root_mount_token); while (!LIST_EMPTY(&sc->hs_free_list)) { reqp = LIST_FIRST(&sc->hs_free_list); LIST_REMOVE(reqp, link); free(reqp, M_DEVBUF); } while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); LIST_REMOVE(sgl_node, link); for (j = 0; j < VMBUS_CHAN_PRPLIST_MAX; j++) { if (NULL != (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) { free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); } } sglist_free(sgl_node->sgl_data); free(sgl_node, M_DEVBUF); } return (ret); } /** * @brief StorVSC device detach function * * This function is responsible for safely detaching a * StorVSC device. This includes waiting for inbound responses * to complete and freeing associated per-device structures. * * @param dev a device * returns 0 on success */ static int storvsc_detach(device_t dev) { struct storvsc_softc *sc = device_get_softc(dev); struct hv_storvsc_request *reqp = NULL; struct hv_sgl_node *sgl_node = NULL; int j = 0; sc->hs_destroy = TRUE; /* * At this point, all outbound traffic should be disabled. We * only allow inbound traffic (responses) to proceed so that * outstanding requests can be completed. */ sc->hs_drain_notify = TRUE; sema_wait(&sc->hs_drain_sema); sc->hs_drain_notify = FALSE; /* * Since we have already drained, we don't need to busy wait. * The call to close the channel will reset the callback * under the protection of the incoming channel lock. */ vmbus_chan_close(sc->hs_chan); mtx_lock(&sc->hs_lock); while (!LIST_EMPTY(&sc->hs_free_list)) { reqp = LIST_FIRST(&sc->hs_free_list); LIST_REMOVE(reqp, link); free(reqp, M_DEVBUF); } mtx_unlock(&sc->hs_lock); while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); LIST_REMOVE(sgl_node, link); for (j = 0; j < VMBUS_CHAN_PRPLIST_MAX; j++){ if (NULL != (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) { free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); } } sglist_free(sgl_node->sgl_data); free(sgl_node, M_DEVBUF); } return (0); } #if HVS_TIMEOUT_TEST /** * @brief unit test for timed out operations * * This function provides unit testing capability to simulate * timed out operations. Recompilation with HV_TIMEOUT_TEST=1 * is required. * * @param reqp pointer to a request structure * @param opcode SCSI operation being performed * @param wait if 1, wait for I/O to complete */ static void storvsc_timeout_test(struct hv_storvsc_request *reqp, uint8_t opcode, int wait) { int ret; union ccb *ccb = reqp->ccb; struct storvsc_softc *sc = reqp->softc; if (reqp->vstor_packet.vm_srb.cdb[0] != opcode) { return; } if (wait) { mtx_lock(&reqp->event.mtx); } ret = hv_storvsc_io_request(sc, reqp); if (ret != 0) { if (wait) { mtx_unlock(&reqp->event.mtx); } printf("%s: io_request failed with %d.\n", __func__, ret); ccb->ccb_h.status = CAM_PROVIDE_FAIL; mtx_lock(&sc->hs_lock); storvsc_free_request(sc, reqp); xpt_done(ccb); mtx_unlock(&sc->hs_lock); return; } if (wait) { xpt_print(ccb->ccb_h.path, "%u: %s: waiting for IO return.\n", ticks, __func__); ret = cv_timedwait(&reqp->event.cv, &reqp->event.mtx, 60*hz); mtx_unlock(&reqp->event.mtx); xpt_print(ccb->ccb_h.path, "%u: %s: %s.\n", ticks, __func__, (ret == 0)? "IO return detected" : "IO return not detected"); /* * Now both the timer handler and io done are running * simultaneously. We want to confirm the io done always * finishes after the timer handler exits. So reqp used by * timer handler is not freed or stale. Do busy loop for * another 1/10 second to make sure io done does * wait for the timer handler to complete. */ DELAY(100*1000); mtx_lock(&sc->hs_lock); xpt_print(ccb->ccb_h.path, "%u: %s: finishing, queue frozen %d, " "ccb status 0x%x scsi_status 0x%x.\n", ticks, __func__, sc->hs_frozen, ccb->ccb_h.status, ccb->csio.scsi_status); mtx_unlock(&sc->hs_lock); } } #endif /* HVS_TIMEOUT_TEST */ #ifdef notyet /** * @brief timeout handler for requests * * This function is called as a result of a callout expiring. * * @param arg pointer to a request */ static void storvsc_timeout(void *arg) { struct hv_storvsc_request *reqp = arg; struct storvsc_softc *sc = reqp->softc; union ccb *ccb = reqp->ccb; if (reqp->retries == 0) { mtx_lock(&sc->hs_lock); xpt_print(ccb->ccb_h.path, "%u: IO timed out (req=0x%p), wait for another %u secs.\n", ticks, reqp, ccb->ccb_h.timeout / 1000); cam_error_print(ccb, CAM_ESF_ALL, CAM_EPF_ALL); mtx_unlock(&sc->hs_lock); reqp->retries++; callout_reset_sbt(&reqp->callout, SBT_1MS * ccb->ccb_h.timeout, 0, storvsc_timeout, reqp, 0); #if HVS_TIMEOUT_TEST storvsc_timeout_test(reqp, SEND_DIAGNOSTIC, 0); #endif return; } mtx_lock(&sc->hs_lock); xpt_print(ccb->ccb_h.path, "%u: IO (reqp = 0x%p) did not return for %u seconds, %s.\n", ticks, reqp, ccb->ccb_h.timeout * (reqp->retries+1) / 1000, (sc->hs_frozen == 0)? "freezing the queue" : "the queue is already frozen"); if (sc->hs_frozen == 0) { sc->hs_frozen = 1; xpt_freeze_simq(xpt_path_sim(ccb->ccb_h.path), 1); } mtx_unlock(&sc->hs_lock); #if HVS_TIMEOUT_TEST storvsc_timeout_test(reqp, MODE_SELECT_10, 1); #endif } #endif /** * @brief StorVSC device poll function * * This function is responsible for servicing requests when * interrupts are disabled (i.e when we are dumping core.) * * @param sim a pointer to a CAM SCSI interface module */ static void storvsc_poll(struct cam_sim *sim) { struct storvsc_softc *sc = cam_sim_softc(sim); mtx_assert(&sc->hs_lock, MA_OWNED); mtx_unlock(&sc->hs_lock); hv_storvsc_on_channel_callback(sc->hs_chan, sc); mtx_lock(&sc->hs_lock); } /** * @brief StorVSC device action function * * This function is responsible for handling SCSI operations which * are passed from the CAM layer. The requests are in the form of * CAM control blocks which indicate the action being performed. * Not all actions require converting the request to a VSCSI protocol * message - these actions can be responded to by this driver. * Requests which are destined for a backend storage device are converted * to a VSCSI protocol message and sent on the channel connection associated * with this device. * * @param sim pointer to a CAM SCSI interface module * @param ccb pointer to a CAM control block */ static void storvsc_action(struct cam_sim *sim, union ccb *ccb) { struct storvsc_softc *sc = cam_sim_softc(sim); int res; mtx_assert(&sc->hs_lock, MA_OWNED); switch (ccb->ccb_h.func_code) { case XPT_PATH_INQ: { struct ccb_pathinq *cpi = &ccb->cpi; cpi->version_num = 1; cpi->hba_inquiry = PI_TAG_ABLE|PI_SDTR_ABLE; cpi->target_sprt = 0; cpi->hba_misc = PIM_NOBUSRESET; cpi->hba_eng_cnt = 0; cpi->max_target = STORVSC_MAX_TARGETS; cpi->max_lun = sc->hs_drv_props->drv_max_luns_per_target; cpi->initiator_id = cpi->max_target; cpi->bus_id = cam_sim_bus(sim); cpi->base_transfer_speed = 300000; cpi->transport = XPORT_SAS; cpi->transport_version = 0; cpi->protocol = PROTO_SCSI; cpi->protocol_version = SCSI_REV_SPC2; strncpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); strncpy(cpi->hba_vid, sc->hs_drv_props->drv_name, HBA_IDLEN); strncpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN); cpi->unit_number = cam_sim_unit(sim); ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); return; } case XPT_GET_TRAN_SETTINGS: { struct ccb_trans_settings *cts = &ccb->cts; cts->transport = XPORT_SAS; cts->transport_version = 0; cts->protocol = PROTO_SCSI; cts->protocol_version = SCSI_REV_SPC2; /* enable tag queuing and disconnected mode */ cts->proto_specific.valid = CTS_SCSI_VALID_TQ; cts->proto_specific.scsi.valid = CTS_SCSI_VALID_TQ; cts->proto_specific.scsi.flags = CTS_SCSI_FLAGS_TAG_ENB; cts->xport_specific.valid = CTS_SPI_VALID_DISC; cts->xport_specific.spi.flags = CTS_SPI_FLAGS_DISC_ENB; ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); return; } case XPT_SET_TRAN_SETTINGS: { ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); return; } case XPT_CALC_GEOMETRY:{ cam_calc_geometry(&ccb->ccg, 1); xpt_done(ccb); return; } case XPT_RESET_BUS: case XPT_RESET_DEV:{ #if HVS_HOST_RESET if ((res = hv_storvsc_host_reset(sc)) != 0) { xpt_print(ccb->ccb_h.path, "hv_storvsc_host_reset failed with %d\n", res); ccb->ccb_h.status = CAM_PROVIDE_FAIL; xpt_done(ccb); return; } ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); return; #else xpt_print(ccb->ccb_h.path, "%s reset not supported.\n", (ccb->ccb_h.func_code == XPT_RESET_BUS)? "bus" : "dev"); ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(ccb); return; #endif /* HVS_HOST_RESET */ } case XPT_SCSI_IO: case XPT_IMMED_NOTIFY: { struct hv_storvsc_request *reqp = NULL; if (ccb->csio.cdb_len == 0) { panic("cdl_len is 0\n"); } if (LIST_EMPTY(&sc->hs_free_list)) { ccb->ccb_h.status = CAM_REQUEUE_REQ; if (sc->hs_frozen == 0) { sc->hs_frozen = 1; xpt_freeze_simq(sim, /* count*/1); } xpt_done(ccb); return; } reqp = LIST_FIRST(&sc->hs_free_list); LIST_REMOVE(reqp, link); bzero(reqp, sizeof(struct hv_storvsc_request)); reqp->softc = sc; ccb->ccb_h.status |= CAM_SIM_QUEUED; if ((res = create_storvsc_request(ccb, reqp)) != 0) { ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(ccb); return; } #ifdef notyet if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) { callout_init(&reqp->callout, 1); callout_reset_sbt(&reqp->callout, SBT_1MS * ccb->ccb_h.timeout, 0, storvsc_timeout, reqp, 0); #if HVS_TIMEOUT_TEST cv_init(&reqp->event.cv, "storvsc timeout cv"); mtx_init(&reqp->event.mtx, "storvsc timeout mutex", NULL, MTX_DEF); switch (reqp->vstor_packet.vm_srb.cdb[0]) { case MODE_SELECT_10: case SEND_DIAGNOSTIC: /* To have timer send the request. */ return; default: break; } #endif /* HVS_TIMEOUT_TEST */ } #endif if ((res = hv_storvsc_io_request(sc, reqp)) != 0) { xpt_print(ccb->ccb_h.path, "hv_storvsc_io_request failed with %d\n", res); ccb->ccb_h.status = CAM_PROVIDE_FAIL; storvsc_free_request(sc, reqp); xpt_done(ccb); return; } return; } default: ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(ccb); return; } } /** * @brief destroy bounce buffer * * This function is responsible for destroy a Scatter/Gather list * that create by storvsc_create_bounce_buffer() * * @param sgl- the Scatter/Gather need be destroy * @param sg_count- page count of the SG list. * */ static void storvsc_destroy_bounce_buffer(struct sglist *sgl) { struct hv_sgl_node *sgl_node = NULL; if (LIST_EMPTY(&g_hv_sgl_page_pool.in_use_sgl_list)) { printf("storvsc error: not enough in use sgl\n"); return; } sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list); LIST_REMOVE(sgl_node, link); sgl_node->sgl_data = sgl; LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); } /** * @brief create bounce buffer * * This function is responsible for create a Scatter/Gather list, * which hold several pages that can be aligned with page size. * * @param seg_count- SG-list segments count * @param write - if WRITE_TYPE, set SG list page used size to 0, * otherwise set used size to page size. * * return NULL if create failed */ static struct sglist * storvsc_create_bounce_buffer(uint16_t seg_count, int write) { int i = 0; struct sglist *bounce_sgl = NULL; unsigned int buf_len = ((write == WRITE_TYPE) ? 0 : PAGE_SIZE); struct hv_sgl_node *sgl_node = NULL; /* get struct sglist from free_sgl_list */ if (LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { printf("storvsc error: not enough free sgl\n"); return NULL; } sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); LIST_REMOVE(sgl_node, link); bounce_sgl = sgl_node->sgl_data; LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link); bounce_sgl->sg_maxseg = seg_count; if (write == WRITE_TYPE) bounce_sgl->sg_nseg = 0; else bounce_sgl->sg_nseg = seg_count; for (i = 0; i < seg_count; i++) bounce_sgl->sg_segs[i].ss_len = buf_len; return bounce_sgl; } /** * @brief copy data from SG list to bounce buffer * * This function is responsible for copy data from one SG list's segments * to another SG list which used as bounce buffer. * * @param bounce_sgl - the destination SG list * @param orig_sgl - the segment of the source SG list. * @param orig_sgl_count - the count of segments. * @param orig_sgl_count - indicate which segment need bounce buffer, * set 1 means need. * */ static void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, bus_dma_segment_t *orig_sgl, unsigned int orig_sgl_count, uint64_t seg_bits) { int src_sgl_idx = 0; for (src_sgl_idx = 0; src_sgl_idx < orig_sgl_count; src_sgl_idx++) { if (seg_bits & (1 << src_sgl_idx)) { memcpy((void*)bounce_sgl->sg_segs[src_sgl_idx].ss_paddr, (void*)orig_sgl[src_sgl_idx].ds_addr, orig_sgl[src_sgl_idx].ds_len); bounce_sgl->sg_segs[src_sgl_idx].ss_len = orig_sgl[src_sgl_idx].ds_len; } } } /** * @brief copy data from SG list which used as bounce to another SG list * * This function is responsible for copy data from one SG list with bounce * buffer to another SG list's segments. * * @param dest_sgl - the destination SG list's segments * @param dest_sgl_count - the count of destination SG list's segment. * @param src_sgl - the source SG list. * @param seg_bits - indicate which segment used bounce buffer of src SG-list. * */ void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, unsigned int dest_sgl_count, struct sglist* src_sgl, uint64_t seg_bits) { int sgl_idx = 0; for (sgl_idx = 0; sgl_idx < dest_sgl_count; sgl_idx++) { if (seg_bits & (1 << sgl_idx)) { memcpy((void*)(dest_sgl[sgl_idx].ds_addr), (void*)(src_sgl->sg_segs[sgl_idx].ss_paddr), src_sgl->sg_segs[sgl_idx].ss_len); } } } /** * @brief check SG list with bounce buffer or not * * This function is responsible for check if need bounce buffer for SG list. * * @param sgl - the SG list's segments * @param sg_count - the count of SG list's segment. * @param bits - segmengs number that need bounce buffer * * return -1 if SG list needless bounce buffer */ static int storvsc_check_bounce_buffer_sgl(bus_dma_segment_t *sgl, unsigned int sg_count, uint64_t *bits) { int i = 0; int offset = 0; uint64_t phys_addr = 0; uint64_t tmp_bits = 0; boolean_t found_hole = FALSE; boolean_t pre_aligned = TRUE; if (sg_count < 2){ return -1; } *bits = 0; phys_addr = vtophys(sgl[0].ds_addr); offset = phys_addr - trunc_page(phys_addr); if (offset != 0) { pre_aligned = FALSE; tmp_bits |= 1; } for (i = 1; i < sg_count; i++) { phys_addr = vtophys(sgl[i].ds_addr); offset = phys_addr - trunc_page(phys_addr); if (offset == 0) { if (FALSE == pre_aligned){ /* * This segment is aligned, if the previous * one is not aligned, find a hole */ found_hole = TRUE; } pre_aligned = TRUE; } else { tmp_bits |= 1 << i; if (!pre_aligned) { if (phys_addr != vtophys(sgl[i-1].ds_addr + sgl[i-1].ds_len)) { /* * Check whether connect to previous * segment,if not, find the hole */ found_hole = TRUE; } } else { found_hole = TRUE; } pre_aligned = FALSE; } } if (!found_hole) { return (-1); } else { *bits = tmp_bits; return 0; } } /** * @brief Fill in a request structure based on a CAM control block * * Fills in a request structure based on the contents of a CAM control * block. The request structure holds the payload information for * VSCSI protocol request. * * @param ccb pointer to a CAM contorl block * @param reqp pointer to a request structure */ static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) { struct ccb_scsiio *csio = &ccb->csio; uint64_t phys_addr; uint32_t bytes_to_copy = 0; uint32_t pfn_num = 0; uint32_t pfn; uint64_t not_aligned_seg_bits = 0; struct hvs_gpa_range *prplist; /* refer to struct vmscsi_req for meanings of these two fields */ reqp->vstor_packet.u.vm_srb.port = cam_sim_unit(xpt_path_sim(ccb->ccb_h.path)); reqp->vstor_packet.u.vm_srb.path_id = cam_sim_bus(xpt_path_sim(ccb->ccb_h.path)); reqp->vstor_packet.u.vm_srb.target_id = ccb->ccb_h.target_id; reqp->vstor_packet.u.vm_srb.lun = ccb->ccb_h.target_lun; reqp->vstor_packet.u.vm_srb.cdb_len = csio->cdb_len; if(ccb->ccb_h.flags & CAM_CDB_POINTER) { memcpy(&reqp->vstor_packet.u.vm_srb.u.cdb, csio->cdb_io.cdb_ptr, csio->cdb_len); } else { memcpy(&reqp->vstor_packet.u.vm_srb.u.cdb, csio->cdb_io.cdb_bytes, csio->cdb_len); } switch (ccb->ccb_h.flags & CAM_DIR_MASK) { case CAM_DIR_OUT: reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; break; case CAM_DIR_IN: reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; break; case CAM_DIR_NONE: reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; break; default: reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; break; } reqp->sense_data = &csio->sense_data; reqp->sense_info_len = csio->sense_len; reqp->ccb = ccb; if (0 == csio->dxfer_len) { return (0); } prplist = &reqp->prp_list; prplist->gpa_range.gpa_len = csio->dxfer_len; switch (ccb->ccb_h.flags & CAM_DATA_MASK) { case CAM_DATA_VADDR: { bytes_to_copy = csio->dxfer_len; phys_addr = vtophys(csio->data_ptr); prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK; while (bytes_to_copy != 0) { int bytes, page_offset; phys_addr = vtophys(&csio->data_ptr[prplist->gpa_range.gpa_len - bytes_to_copy]); pfn = phys_addr >> PAGE_SHIFT; prplist->gpa_page[pfn_num] = pfn; page_offset = phys_addr & PAGE_MASK; bytes = min(PAGE_SIZE - page_offset, bytes_to_copy); bytes_to_copy -= bytes; pfn_num++; } reqp->prp_cnt = pfn_num; break; } case CAM_DATA_SG: { int i = 0; int offset = 0; int ret; bus_dma_segment_t *storvsc_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr; u_int16_t storvsc_sg_count = ccb->csio.sglist_cnt; printf("Storvsc: get SG I/O operation, %d\n", reqp->vstor_packet.u.vm_srb.data_in); if (storvsc_sg_count > VMBUS_CHAN_PRPLIST_MAX){ printf("Storvsc: %d segments is too much, " "only support %d segments\n", storvsc_sg_count, VMBUS_CHAN_PRPLIST_MAX); return (EINVAL); } /* * We create our own bounce buffer function currently. Idealy * we should use BUS_DMA(9) framework. But with current BUS_DMA * code there is no callback API to check the page alignment of * middle segments before busdma can decide if a bounce buffer * is needed for particular segment. There is callback, * "bus_dma_filter_t *filter", but the parrameters are not * sufficient for storvsc driver. * TODO: * Add page alignment check in BUS_DMA(9) callback. Once * this is complete, switch the following code to use * BUS_DMA(9) for storvsc bounce buffer support. */ /* check if we need to create bounce buffer */ ret = storvsc_check_bounce_buffer_sgl(storvsc_sglist, storvsc_sg_count, ¬_aligned_seg_bits); if (ret != -1) { reqp->bounce_sgl = storvsc_create_bounce_buffer(storvsc_sg_count, reqp->vstor_packet.u.vm_srb.data_in); if (NULL == reqp->bounce_sgl) { printf("Storvsc_error: " "create bounce buffer failed.\n"); return (ENOMEM); } reqp->bounce_sgl_count = storvsc_sg_count; reqp->not_aligned_seg_bits = not_aligned_seg_bits; /* * if it is write, we need copy the original data *to bounce buffer */ if (WRITE_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { storvsc_copy_sgl_to_bounce_buf( reqp->bounce_sgl, storvsc_sglist, storvsc_sg_count, reqp->not_aligned_seg_bits); } /* transfer virtual address to physical frame number */ if (reqp->not_aligned_seg_bits & 0x1){ phys_addr = vtophys(reqp->bounce_sgl->sg_segs[0].ss_paddr); }else{ phys_addr = vtophys(storvsc_sglist[0].ds_addr); } prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK; pfn = phys_addr >> PAGE_SHIFT; prplist->gpa_page[0] = pfn; for (i = 1; i < storvsc_sg_count; i++) { if (reqp->not_aligned_seg_bits & (1 << i)) { phys_addr = vtophys(reqp->bounce_sgl->sg_segs[i].ss_paddr); } else { phys_addr = vtophys(storvsc_sglist[i].ds_addr); } pfn = phys_addr >> PAGE_SHIFT; prplist->gpa_page[i] = pfn; } reqp->prp_cnt = i; } else { phys_addr = vtophys(storvsc_sglist[0].ds_addr); prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK; for (i = 0; i < storvsc_sg_count; i++) { phys_addr = vtophys(storvsc_sglist[i].ds_addr); pfn = phys_addr >> PAGE_SHIFT; prplist->gpa_page[i] = pfn; } reqp->prp_cnt = i; /* check the last segment cross boundary or not */ offset = phys_addr & PAGE_MASK; if (offset) { /* Add one more PRP entry */ phys_addr = vtophys(storvsc_sglist[i-1].ds_addr + PAGE_SIZE - offset); pfn = phys_addr >> PAGE_SHIFT; prplist->gpa_page[i] = pfn; reqp->prp_cnt++; } reqp->bounce_sgl_count = 0; } break; } default: printf("Unknow flags: %d\n", ccb->ccb_h.flags); return(EINVAL); } return(0); } /* * SCSI Inquiry checks qualifier and type. * If qualifier is 011b, means the device server is not capable * of supporting a peripheral device on this logical unit, and * the type should be set to 1Fh. * * Return 1 if it is valid, 0 otherwise. */ static inline int is_inquiry_valid(const struct scsi_inquiry_data *inq_data) { uint8_t type; if (SID_QUAL(inq_data) != SID_QUAL_LU_CONNECTED) { return (0); } type = SID_TYPE(inq_data); if (type == T_NODEVICE) { return (0); } return (1); } /** * @brief completion function before returning to CAM * * I/O process has been completed and the result needs * to be passed to the CAM layer. * Free resources related to this request. * * @param reqp pointer to a request structure */ static void storvsc_io_done(struct hv_storvsc_request *reqp) { union ccb *ccb = reqp->ccb; struct ccb_scsiio *csio = &ccb->csio; struct storvsc_softc *sc = reqp->softc; struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb; bus_dma_segment_t *ori_sglist = NULL; int ori_sg_count = 0; /* destroy bounce buffer if it is used */ if (reqp->bounce_sgl_count) { ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr; ori_sg_count = ccb->csio.sglist_cnt; /* * If it is READ operation, we should copy back the data * to original SG list. */ if (READ_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { storvsc_copy_from_bounce_buf_to_sgl(ori_sglist, ori_sg_count, reqp->bounce_sgl, reqp->not_aligned_seg_bits); } storvsc_destroy_bounce_buffer(reqp->bounce_sgl); reqp->bounce_sgl_count = 0; } if (reqp->retries > 0) { mtx_lock(&sc->hs_lock); #if HVS_TIMEOUT_TEST xpt_print(ccb->ccb_h.path, "%u: IO returned after timeout, " "waking up timer handler if any.\n", ticks); mtx_lock(&reqp->event.mtx); cv_signal(&reqp->event.cv); mtx_unlock(&reqp->event.mtx); #endif reqp->retries = 0; xpt_print(ccb->ccb_h.path, "%u: IO returned after timeout, " "stopping timer if any.\n", ticks); mtx_unlock(&sc->hs_lock); } #ifdef notyet /* * callout_drain() will wait for the timer handler to finish * if it is running. So we don't need any lock to synchronize * between this routine and the timer handler. * Note that we need to make sure reqp is not freed when timer * handler is using or will use it. */ if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) { callout_drain(&reqp->callout); } #endif ccb->ccb_h.status &= ~CAM_SIM_QUEUED; ccb->ccb_h.status &= ~CAM_STATUS_MASK; if (vm_srb->scsi_status == SCSI_STATUS_OK) { const struct scsi_generic *cmd; /* * Check whether the data for INQUIRY cmd is valid or * not. Windows 10 and Windows 2016 send all zero * inquiry data to VM even for unpopulated slots. */ cmd = (const struct scsi_generic *) ((ccb->ccb_h.flags & CAM_CDB_POINTER) ? csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes); if (cmd->opcode == INQUIRY) { /* * The host of Windows 10 or 2016 server will response * the inquiry request with invalid data for unexisted device: [0x7f 0x0 0x5 0x2 0x1f ... ] * But on windows 2012 R2, the response is: [0x7f 0x0 0x0 0x0 0x0 ] * That is why here wants to validate the inquiry response. * The validation will skip the INQUIRY whose response is short, * which is less than SHORT_INQUIRY_LENGTH (36). * * For more information about INQUIRY, please refer to: * ftp://ftp.avc-pioneer.com/Mtfuji_7/Proposal/Jun09/INQUIRY.pdf */ const struct scsi_inquiry_data *inq_data = (const struct scsi_inquiry_data *)csio->data_ptr; uint8_t* resp_buf = (uint8_t*)csio->data_ptr; /* Get the buffer length reported by host */ int resp_xfer_len = vm_srb->transfer_len; /* Get the available buffer length */ int resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0; int data_len = (resp_buf_len < resp_xfer_len) ? resp_buf_len : resp_xfer_len; if (data_len < SHORT_INQUIRY_LENGTH) { ccb->ccb_h.status |= CAM_REQ_CMP; if (bootverbose && data_len >= 5) { mtx_lock(&sc->hs_lock); xpt_print(ccb->ccb_h.path, "storvsc skips the validation for short inquiry (%d)" " [%x %x %x %x %x]\n", data_len,resp_buf[0],resp_buf[1],resp_buf[2], resp_buf[3],resp_buf[4]); mtx_unlock(&sc->hs_lock); } } else if (is_inquiry_valid(inq_data) == 0) { ccb->ccb_h.status |= CAM_DEV_NOT_THERE; if (bootverbose && data_len >= 5) { mtx_lock(&sc->hs_lock); xpt_print(ccb->ccb_h.path, "storvsc uninstalled invalid device" " [%x %x %x %x %x]\n", resp_buf[0],resp_buf[1],resp_buf[2],resp_buf[3],resp_buf[4]); mtx_unlock(&sc->hs_lock); } } else { ccb->ccb_h.status |= CAM_REQ_CMP; if (bootverbose) { mtx_lock(&sc->hs_lock); xpt_print(ccb->ccb_h.path, "storvsc has passed inquiry response (%d) validation\n", data_len); mtx_unlock(&sc->hs_lock); } } } else { ccb->ccb_h.status |= CAM_REQ_CMP; } } else { mtx_lock(&sc->hs_lock); xpt_print(ccb->ccb_h.path, "storvsc scsi_status = %d\n", vm_srb->scsi_status); mtx_unlock(&sc->hs_lock); ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR; } ccb->csio.scsi_status = (vm_srb->scsi_status & 0xFF); ccb->csio.resid = ccb->csio.dxfer_len - vm_srb->transfer_len; if (reqp->sense_info_len != 0) { csio->sense_resid = csio->sense_len - reqp->sense_info_len; ccb->ccb_h.status |= CAM_AUTOSNS_VALID; } mtx_lock(&sc->hs_lock); if (reqp->softc->hs_frozen == 1) { xpt_print(ccb->ccb_h.path, "%u: storvsc unfreezing softc 0x%p.\n", ticks, reqp->softc); ccb->ccb_h.status |= CAM_RELEASE_SIMQ; reqp->softc->hs_frozen = 0; } storvsc_free_request(sc, reqp); mtx_unlock(&sc->hs_lock); xpt_done_direct(ccb); } /** * @brief Free a request structure * * Free a request structure by returning it to the free list * * @param sc pointer to a softc * @param reqp pointer to a request structure */ static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp) { LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link); } /** * @brief Determine type of storage device from GUID * * Using the type GUID, determine if this is a StorVSC (paravirtual * SCSI or BlkVSC (paravirtual IDE) device. * * @param dev a device * returns an enum */ static enum hv_storage_type storvsc_get_storage_type(device_t dev) { device_t parent = device_get_parent(dev); if (VMBUS_PROBE_GUID(parent, dev, &gBlkVscDeviceType) == 0) return DRIVER_BLKVSC; if (VMBUS_PROBE_GUID(parent, dev, &gStorVscDeviceType) == 0) return DRIVER_STORVSC; return DRIVER_UNKNOWN; } Index: head/sys/dev/hyperv/utilities/hv_kvp.c =================================================================== --- head/sys/dev/hyperv/utilities/hv_kvp.c (revision 303070) +++ head/sys/dev/hyperv/utilities/hv_kvp.c (revision 303071) @@ -1,954 +1,955 @@ /*- * Copyright (c) 2014,2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Author: Sainath Varanasi. * Date: 4/2012 * Email: bsdic@microsoft.com */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "hv_util.h" #include "unicode.h" #include "hv_kvp.h" #include "vmbus_if.h" /* hv_kvp defines */ #define BUFFERSIZE sizeof(struct hv_kvp_msg) #define KVP_SUCCESS 0 #define KVP_ERROR 1 #define kvp_hdr hdr.kvp_hdr /* hv_kvp debug control */ static int hv_kvp_log = 0; #define hv_kvp_log_error(...) do { \ if (hv_kvp_log > 0) \ log(LOG_ERR, "hv_kvp: " __VA_ARGS__); \ } while (0) #define hv_kvp_log_info(...) do { \ if (hv_kvp_log > 1) \ log(LOG_INFO, "hv_kvp: " __VA_ARGS__); \ } while (0) static const struct hyperv_guid service_guid = { .hv_guid = {0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d, 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6} }; /* character device prototypes */ static d_open_t hv_kvp_dev_open; static d_close_t hv_kvp_dev_close; static d_read_t hv_kvp_dev_daemon_read; static d_write_t hv_kvp_dev_daemon_write; static d_poll_t hv_kvp_dev_daemon_poll; /* hv_kvp character device structure */ static struct cdevsw hv_kvp_cdevsw = { .d_version = D_VERSION, .d_open = hv_kvp_dev_open, .d_close = hv_kvp_dev_close, .d_read = hv_kvp_dev_daemon_read, .d_write = hv_kvp_dev_daemon_write, .d_poll = hv_kvp_dev_daemon_poll, .d_name = "hv_kvp_dev", }; /* * Global state to track and synchronize multiple * KVP transaction requests from the host. */ typedef struct hv_kvp_sc { struct hv_util_sc util_sc; device_t dev; /* Unless specified the pending mutex should be * used to alter the values of the following parameters: * 1. req_in_progress * 2. req_timed_out */ struct mtx pending_mutex; struct task task; /* To track if transaction is active or not */ boolean_t req_in_progress; /* Tracks if daemon did not reply back in time */ boolean_t req_timed_out; /* Tracks if daemon is serving a request currently */ boolean_t daemon_busy; /* Length of host message */ uint32_t host_msg_len; /* Host message id */ uint64_t host_msg_id; /* Current kvp message from the host */ struct hv_kvp_msg *host_kvp_msg; /* Current kvp message for daemon */ struct hv_kvp_msg daemon_kvp_msg; /* Rcv buffer for communicating with the host*/ uint8_t *rcv_buf; /* Device semaphore to control communication */ struct sema dev_sema; /* Indicates if daemon registered with driver */ boolean_t register_done; /* Character device status */ boolean_t dev_accessed; struct cdev *hv_kvp_dev; struct proc *daemon_task; struct selinfo hv_kvp_selinfo; } hv_kvp_sc; /* hv_kvp prototypes */ static int hv_kvp_req_in_progress(hv_kvp_sc *sc); static void hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t, uint64_t, uint8_t *); static void hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc); static void hv_kvp_process_request(void *context, int pending); /* * hv_kvp low level functions */ /* * Check if kvp transaction is in progres */ static int hv_kvp_req_in_progress(hv_kvp_sc *sc) { return (sc->req_in_progress); } /* * This routine is called whenever a message is received from the host */ static void hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t rcv_len, uint64_t request_id, uint8_t *rcv_buf) { /* Store all the relevant message details in the global structure */ /* Do not need to use mutex for req_in_progress here */ sc->req_in_progress = true; sc->host_msg_len = rcv_len; sc->host_msg_id = request_id; sc->rcv_buf = rcv_buf; sc->host_kvp_msg = (struct hv_kvp_msg *)&rcv_buf[ sizeof(struct hv_vmbus_pipe_hdr) + sizeof(struct hv_vmbus_icmsg_hdr)]; } /* * hv_kvp - version neogtiation function */ static void hv_kvp_negotiate_version(struct hv_vmbus_icmsg_hdr *icmsghdrp, struct hv_vmbus_icmsg_negotiate *negop, uint8_t *buf) { int icframe_vercnt; int icmsg_vercnt; icmsghdrp->icmsgsize = 0x10; negop = (struct hv_vmbus_icmsg_negotiate *)&buf[ sizeof(struct hv_vmbus_pipe_hdr) + sizeof(struct hv_vmbus_icmsg_hdr)]; icframe_vercnt = negop->icframe_vercnt; icmsg_vercnt = negop->icmsg_vercnt; /* * Select the framework version number we will support */ if ((icframe_vercnt >= 2) && (negop->icversion_data[1].major == 3)) { icframe_vercnt = 3; if (icmsg_vercnt > 2) icmsg_vercnt = 4; else icmsg_vercnt = 3; } else { icframe_vercnt = 1; icmsg_vercnt = 1; } negop->icframe_vercnt = 1; negop->icmsg_vercnt = 1; negop->icversion_data[0].major = icframe_vercnt; negop->icversion_data[0].minor = 0; negop->icversion_data[1].major = icmsg_vercnt; negop->icversion_data[1].minor = 0; } /* * Convert ip related info in umsg from utf8 to utf16 and store in hmsg */ static int hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg, struct hv_kvp_ip_msg *host_ip_msg) { int err_ip, err_subnet, err_gway, err_dns, err_adap; int UNUSED_FLAG = 1; utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.ip_addr, MAX_IP_ADDR_SIZE, (char *)umsg->body.kvp_ip_val.ip_addr, strlen((char *)umsg->body.kvp_ip_val.ip_addr), UNUSED_FLAG, &err_ip); utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.sub_net, MAX_IP_ADDR_SIZE, (char *)umsg->body.kvp_ip_val.sub_net, strlen((char *)umsg->body.kvp_ip_val.sub_net), UNUSED_FLAG, &err_subnet); utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.gate_way, MAX_GATEWAY_SIZE, (char *)umsg->body.kvp_ip_val.gate_way, strlen((char *)umsg->body.kvp_ip_val.gate_way), UNUSED_FLAG, &err_gway); utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.dns_addr, MAX_IP_ADDR_SIZE, (char *)umsg->body.kvp_ip_val.dns_addr, strlen((char *)umsg->body.kvp_ip_val.dns_addr), UNUSED_FLAG, &err_dns); utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.adapter_id, MAX_IP_ADDR_SIZE, (char *)umsg->body.kvp_ip_val.adapter_id, strlen((char *)umsg->body.kvp_ip_val.adapter_id), UNUSED_FLAG, &err_adap); host_ip_msg->kvp_ip_val.dhcp_enabled = umsg->body.kvp_ip_val.dhcp_enabled; host_ip_msg->kvp_ip_val.addr_family = umsg->body.kvp_ip_val.addr_family; return (err_ip | err_subnet | err_gway | err_dns | err_adap); } /* * Convert ip related info in hmsg from utf16 to utf8 and store in umsg */ static int hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg, struct hv_kvp_msg *umsg) { int err_ip, err_subnet, err_gway, err_dns, err_adap; int UNUSED_FLAG = 1; device_t *devs; int devcnt; /* IP Address */ utf16_to_utf8((char *)umsg->body.kvp_ip_val.ip_addr, MAX_IP_ADDR_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.ip_addr, MAX_IP_ADDR_SIZE, UNUSED_FLAG, &err_ip); /* Adapter ID : GUID */ utf16_to_utf8((char *)umsg->body.kvp_ip_val.adapter_id, MAX_ADAPTER_ID_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.adapter_id, MAX_ADAPTER_ID_SIZE, UNUSED_FLAG, &err_adap); if (devclass_get_devices(devclass_find("hn"), &devs, &devcnt) == 0) { for (devcnt = devcnt - 1; devcnt >= 0; devcnt--) { /* XXX access other driver's softc? are you kidding? */ device_t dev = devs[devcnt]; struct hn_softc *sc = device_get_softc(dev); struct hv_vmbus_channel *chan; char buf[HYPERV_GUID_STRLEN]; /* * Trying to find GUID of Network Device * TODO: need vmbus interface. */ chan = vmbus_get_channel(dev); - hyperv_guid2str(&chan->ch_guid_inst, buf, sizeof(buf)); + hyperv_guid2str(vmbus_chan_guid_inst(chan), + buf, sizeof(buf)); if (strncmp(buf, (char *)umsg->body.kvp_ip_val.adapter_id, HYPERV_GUID_STRLEN - 1) == 0) { strlcpy((char *)umsg->body.kvp_ip_val.adapter_id, sc->hn_ifp->if_xname, MAX_ADAPTER_ID_SIZE); break; } } free(devs, M_TEMP); } /* Address Family , DHCP , SUBNET, Gateway, DNS */ umsg->kvp_hdr.operation = host_ip_msg->operation; umsg->body.kvp_ip_val.addr_family = host_ip_msg->kvp_ip_val.addr_family; umsg->body.kvp_ip_val.dhcp_enabled = host_ip_msg->kvp_ip_val.dhcp_enabled; utf16_to_utf8((char *)umsg->body.kvp_ip_val.sub_net, MAX_IP_ADDR_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.sub_net, MAX_IP_ADDR_SIZE, UNUSED_FLAG, &err_subnet); utf16_to_utf8((char *)umsg->body.kvp_ip_val.gate_way, MAX_GATEWAY_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.gate_way, MAX_GATEWAY_SIZE, UNUSED_FLAG, &err_gway); utf16_to_utf8((char *)umsg->body.kvp_ip_val.dns_addr, MAX_IP_ADDR_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.dns_addr, MAX_IP_ADDR_SIZE, UNUSED_FLAG, &err_dns); return (err_ip | err_subnet | err_gway | err_dns | err_adap); } /* * Prepare a user kvp msg based on host kvp msg (utf16 to utf8) * Ensure utf16_utf8 takes care of the additional string terminating char!! */ static void hv_kvp_convert_hostmsg_to_usermsg(struct hv_kvp_msg *hmsg, struct hv_kvp_msg *umsg) { int utf_err = 0; uint32_t value_type; struct hv_kvp_ip_msg *host_ip_msg; host_ip_msg = (struct hv_kvp_ip_msg*)hmsg; memset(umsg, 0, sizeof(struct hv_kvp_msg)); umsg->kvp_hdr.operation = hmsg->kvp_hdr.operation; umsg->kvp_hdr.pool = hmsg->kvp_hdr.pool; switch (umsg->kvp_hdr.operation) { case HV_KVP_OP_SET_IP_INFO: hv_kvp_convert_utf16_ipinfo_to_utf8(host_ip_msg, umsg); break; case HV_KVP_OP_GET_IP_INFO: utf16_to_utf8((char *)umsg->body.kvp_ip_val.adapter_id, MAX_ADAPTER_ID_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.adapter_id, MAX_ADAPTER_ID_SIZE, 1, &utf_err); umsg->body.kvp_ip_val.addr_family = host_ip_msg->kvp_ip_val.addr_family; break; case HV_KVP_OP_SET: value_type = hmsg->body.kvp_set.data.value_type; switch (value_type) { case HV_REG_SZ: umsg->body.kvp_set.data.value_size = utf16_to_utf8( (char *)umsg->body.kvp_set.data.msg_value.value, HV_KVP_EXCHANGE_MAX_VALUE_SIZE - 1, (uint16_t *)hmsg->body.kvp_set.data.msg_value.value, hmsg->body.kvp_set.data.value_size, 1, &utf_err); /* utf8 encoding */ umsg->body.kvp_set.data.value_size = umsg->body.kvp_set.data.value_size / 2; break; case HV_REG_U32: umsg->body.kvp_set.data.value_size = sprintf(umsg->body.kvp_set.data.msg_value.value, "%d", hmsg->body.kvp_set.data.msg_value.value_u32) + 1; break; case HV_REG_U64: umsg->body.kvp_set.data.value_size = sprintf(umsg->body.kvp_set.data.msg_value.value, "%llu", (unsigned long long) hmsg->body.kvp_set.data.msg_value.value_u64) + 1; break; } umsg->body.kvp_set.data.key_size = utf16_to_utf8( umsg->body.kvp_set.data.key, HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1, (uint16_t *)hmsg->body.kvp_set.data.key, hmsg->body.kvp_set.data.key_size, 1, &utf_err); /* utf8 encoding */ umsg->body.kvp_set.data.key_size = umsg->body.kvp_set.data.key_size / 2; break; case HV_KVP_OP_GET: umsg->body.kvp_get.data.key_size = utf16_to_utf8(umsg->body.kvp_get.data.key, HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1, (uint16_t *)hmsg->body.kvp_get.data.key, hmsg->body.kvp_get.data.key_size, 1, &utf_err); /* utf8 encoding */ umsg->body.kvp_get.data.key_size = umsg->body.kvp_get.data.key_size / 2; break; case HV_KVP_OP_DELETE: umsg->body.kvp_delete.key_size = utf16_to_utf8(umsg->body.kvp_delete.key, HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1, (uint16_t *)hmsg->body.kvp_delete.key, hmsg->body.kvp_delete.key_size, 1, &utf_err); /* utf8 encoding */ umsg->body.kvp_delete.key_size = umsg->body.kvp_delete.key_size / 2; break; case HV_KVP_OP_ENUMERATE: umsg->body.kvp_enum_data.index = hmsg->body.kvp_enum_data.index; break; default: hv_kvp_log_info("%s: daemon_kvp_msg: Invalid operation : %d\n", __func__, umsg->kvp_hdr.operation); } } /* * Prepare a host kvp msg based on user kvp msg (utf8 to utf16) */ static int hv_kvp_convert_usermsg_to_hostmsg(struct hv_kvp_msg *umsg, struct hv_kvp_msg *hmsg) { int hkey_len = 0, hvalue_len = 0, utf_err = 0; struct hv_kvp_exchg_msg_value *host_exchg_data; char *key_name, *value; struct hv_kvp_ip_msg *host_ip_msg = (struct hv_kvp_ip_msg *)hmsg; switch (hmsg->kvp_hdr.operation) { case HV_KVP_OP_GET_IP_INFO: return (hv_kvp_convert_utf8_ipinfo_to_utf16(umsg, host_ip_msg)); case HV_KVP_OP_SET_IP_INFO: case HV_KVP_OP_SET: case HV_KVP_OP_DELETE: return (KVP_SUCCESS); case HV_KVP_OP_ENUMERATE: host_exchg_data = &hmsg->body.kvp_enum_data.data; key_name = umsg->body.kvp_enum_data.data.key; hkey_len = utf8_to_utf16((uint16_t *)host_exchg_data->key, ((HV_KVP_EXCHANGE_MAX_KEY_SIZE / 2) - 2), key_name, strlen(key_name), 1, &utf_err); /* utf16 encoding */ host_exchg_data->key_size = 2 * (hkey_len + 1); value = umsg->body.kvp_enum_data.data.msg_value.value; hvalue_len = utf8_to_utf16( (uint16_t *)host_exchg_data->msg_value.value, ((HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2) - 2), value, strlen(value), 1, &utf_err); host_exchg_data->value_size = 2 * (hvalue_len + 1); host_exchg_data->value_type = HV_REG_SZ; if ((hkey_len < 0) || (hvalue_len < 0)) return (HV_KVP_E_FAIL); return (KVP_SUCCESS); case HV_KVP_OP_GET: host_exchg_data = &hmsg->body.kvp_get.data; value = umsg->body.kvp_get.data.msg_value.value; hvalue_len = utf8_to_utf16( (uint16_t *)host_exchg_data->msg_value.value, ((HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2) - 2), value, strlen(value), 1, &utf_err); /* Convert value size to uft16 */ host_exchg_data->value_size = 2 * (hvalue_len + 1); /* Use values by string */ host_exchg_data->value_type = HV_REG_SZ; if ((hkey_len < 0) || (hvalue_len < 0)) return (HV_KVP_E_FAIL); return (KVP_SUCCESS); default: return (HV_KVP_E_FAIL); } } /* * Send the response back to the host. */ static void hv_kvp_respond_host(hv_kvp_sc *sc, int error) { struct hv_vmbus_icmsg_hdr *hv_icmsg_hdrp; hv_icmsg_hdrp = (struct hv_vmbus_icmsg_hdr *) &sc->rcv_buf[sizeof(struct hv_vmbus_pipe_hdr)]; if (error) error = HV_KVP_E_FAIL; hv_icmsg_hdrp->status = error; hv_icmsg_hdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | HV_ICMSGHDRFLAG_RESPONSE; error = vmbus_chan_send(vmbus_get_channel(sc->dev), VMBUS_CHANPKT_TYPE_INBAND, 0, sc->rcv_buf, sc->host_msg_len, sc->host_msg_id); if (error) hv_kvp_log_info("%s: hv_kvp_respond_host: sendpacket error:%d\n", __func__, error); } /* * This is the main kvp kernel process that interacts with both user daemon * and the host */ static void hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc) { struct hv_kvp_msg *hmsg = sc->host_kvp_msg; struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg; /* Prepare kvp_msg to be sent to user */ hv_kvp_convert_hostmsg_to_usermsg(hmsg, umsg); /* Send the msg to user via function deamon_read - setting sema */ sema_post(&sc->dev_sema); /* We should wake up the daemon, in case it's doing poll() */ selwakeup(&sc->hv_kvp_selinfo); } /* * Function to read the kvp request buffer from host * and interact with daemon */ static void hv_kvp_process_request(void *context, int pending) { uint8_t *kvp_buf; - hv_vmbus_channel *channel; + struct hv_vmbus_channel *channel; uint32_t recvlen = 0; uint64_t requestid; struct hv_vmbus_icmsg_hdr *icmsghdrp; int ret = 0; hv_kvp_sc *sc; hv_kvp_log_info("%s: entering hv_kvp_process_request\n", __func__); sc = (hv_kvp_sc*)context; kvp_buf = sc->util_sc.receive_buffer; channel = vmbus_get_channel(sc->dev); recvlen = 2 * PAGE_SIZE; ret = vmbus_chan_recv(channel, kvp_buf, &recvlen, &requestid); KASSERT(ret != ENOBUFS, ("hvkvp recvbuf is not large enough")); /* XXX check recvlen to make sure that it contains enough data */ while ((ret == 0) && (recvlen > 0)) { icmsghdrp = (struct hv_vmbus_icmsg_hdr *) &kvp_buf[sizeof(struct hv_vmbus_pipe_hdr)]; hv_kvp_transaction_init(sc, recvlen, requestid, kvp_buf); if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { hv_kvp_negotiate_version(icmsghdrp, NULL, kvp_buf); hv_kvp_respond_host(sc, ret); /* * It is ok to not acquire the mutex before setting * req_in_progress here because negotiation is the * first thing that happens and hence there is no * chance of a race condition. */ sc->req_in_progress = false; hv_kvp_log_info("%s :version negotiated\n", __func__); } else { if (!sc->daemon_busy) { hv_kvp_log_info("%s: issuing qury to daemon\n", __func__); mtx_lock(&sc->pending_mutex); sc->req_timed_out = false; sc->daemon_busy = true; mtx_unlock(&sc->pending_mutex); hv_kvp_send_msg_to_daemon(sc); hv_kvp_log_info("%s: waiting for daemon\n", __func__); } /* Wait 5 seconds for daemon to respond back */ tsleep(sc, 0, "kvpworkitem", 5 * hz); hv_kvp_log_info("%s: came out of wait\n", __func__); } mtx_lock(&sc->pending_mutex); /* Notice that once req_timed_out is set to true * it will remain true until the next request is * sent to the daemon. The response from daemon * is forwarded to host only when this flag is * false. */ sc->req_timed_out = true; /* * Cancel request if so need be. */ if (hv_kvp_req_in_progress(sc)) { hv_kvp_log_info("%s: request was still active after wait so failing\n", __func__); hv_kvp_respond_host(sc, HV_KVP_E_FAIL); sc->req_in_progress = false; } mtx_unlock(&sc->pending_mutex); /* * Try reading next buffer */ recvlen = 2 * PAGE_SIZE; ret = vmbus_chan_recv(channel, kvp_buf, &recvlen, &requestid); KASSERT(ret != ENOBUFS, ("hvkvp recvbuf is not large enough")); /* XXX check recvlen to make sure that it contains enough data */ hv_kvp_log_info("%s: read: context %p, ret =%d, recvlen=%d\n", __func__, context, ret, recvlen); } } /* * Callback routine that gets called whenever there is a message from host */ static void hv_kvp_callback(struct hv_vmbus_channel *chan __unused, void *context) { hv_kvp_sc *sc = (hv_kvp_sc*)context; /* The first request from host will not be handled until daemon is registered. when callback is triggered without a registered daemon, callback just return. When a new daemon gets regsitered, this callbcak is trigged from _write op. */ if (sc->register_done) { hv_kvp_log_info("%s: Queuing work item\n", __func__); taskqueue_enqueue(taskqueue_thread, &sc->task); } } static int hv_kvp_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; hv_kvp_log_info("%s: Opened device \"hv_kvp_device\" successfully.\n", __func__); if (sc->dev_accessed) return (-EBUSY); sc->daemon_task = curproc; sc->dev_accessed = true; sc->daemon_busy = false; return (0); } static int hv_kvp_dev_close(struct cdev *dev __unused, int fflag __unused, int devtype __unused, struct thread *td __unused) { hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; hv_kvp_log_info("%s: Closing device \"hv_kvp_device\".\n", __func__); sc->dev_accessed = false; sc->register_done = false; return (0); } /* * hv_kvp_daemon read invokes this function * acts as a send to daemon */ static int hv_kvp_dev_daemon_read(struct cdev *dev, struct uio *uio, int ioflag __unused) { size_t amt; int error = 0; struct hv_kvp_msg *hv_kvp_dev_buf; hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; /* Check hv_kvp daemon registration status*/ if (!sc->register_done) return (KVP_ERROR); sema_wait(&sc->dev_sema); hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK); memcpy(hv_kvp_dev_buf, &sc->daemon_kvp_msg, sizeof(struct hv_kvp_msg)); amt = MIN(uio->uio_resid, uio->uio_offset >= BUFFERSIZE + 1 ? 0 : BUFFERSIZE + 1 - uio->uio_offset); if ((error = uiomove(hv_kvp_dev_buf, amt, uio)) != 0) hv_kvp_log_info("%s: hv_kvp uiomove read failed!\n", __func__); free(hv_kvp_dev_buf, M_TEMP); return (error); } /* * hv_kvp_daemon write invokes this function * acts as a receive from daemon */ static int hv_kvp_dev_daemon_write(struct cdev *dev, struct uio *uio, int ioflag __unused) { size_t amt; int error = 0; struct hv_kvp_msg *hv_kvp_dev_buf; hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; uio->uio_offset = 0; hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK); amt = MIN(uio->uio_resid, BUFFERSIZE); error = uiomove(hv_kvp_dev_buf, amt, uio); if (error != 0) { free(hv_kvp_dev_buf, M_TEMP); return (error); } memcpy(&sc->daemon_kvp_msg, hv_kvp_dev_buf, sizeof(struct hv_kvp_msg)); free(hv_kvp_dev_buf, M_TEMP); if (sc->register_done == false) { if (sc->daemon_kvp_msg.kvp_hdr.operation == HV_KVP_OP_REGISTER) { sc->register_done = true; hv_kvp_callback(vmbus_get_channel(sc->dev), dev->si_drv1); } else { hv_kvp_log_info("%s, KVP Registration Failed\n", __func__); return (KVP_ERROR); } } else { mtx_lock(&sc->pending_mutex); if(!sc->req_timed_out) { struct hv_kvp_msg *hmsg = sc->host_kvp_msg; struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg; hv_kvp_convert_usermsg_to_hostmsg(umsg, hmsg); hv_kvp_respond_host(sc, KVP_SUCCESS); wakeup(sc); sc->req_in_progress = false; } sc->daemon_busy = false; mtx_unlock(&sc->pending_mutex); } return (error); } /* * hv_kvp_daemon poll invokes this function to check if data is available * for daemon to read. */ static int hv_kvp_dev_daemon_poll(struct cdev *dev, int events, struct thread *td) { int revents = 0; hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; mtx_lock(&sc->pending_mutex); /* * We check global flag daemon_busy for the data availiability for * userland to read. Deamon_busy is set to true before driver has data * for daemon to read. It is set to false after daemon sends * then response back to driver. */ if (sc->daemon_busy == true) revents = POLLIN; else selrecord(td, &sc->hv_kvp_selinfo); mtx_unlock(&sc->pending_mutex); return (revents); } static int hv_kvp_probe(device_t dev) { if (resource_disabled("hvkvp", 0)) return ENXIO; if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &service_guid) == 0) { device_set_desc(dev, "Hyper-V KVP Service"); return BUS_PROBE_DEFAULT; } return ENXIO; } static int hv_kvp_attach(device_t dev) { int error; struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev); sc->util_sc.callback = hv_kvp_callback; sc->dev = dev; sema_init(&sc->dev_sema, 0, "hv_kvp device semaphore"); mtx_init(&sc->pending_mutex, "hv-kvp pending mutex", NULL, MTX_DEF); ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_kvp_log", CTLFLAG_RW, &hv_kvp_log, 0, "Hyperv KVP service log level"); TASK_INIT(&sc->task, 0, hv_kvp_process_request, sc); /* create character device */ error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &sc->hv_kvp_dev, &hv_kvp_cdevsw, 0, UID_ROOT, GID_WHEEL, 0640, "hv_kvp_dev"); if (error != 0) return (error); sc->hv_kvp_dev->si_drv1 = sc; return hv_util_attach(dev); } static int hv_kvp_detach(device_t dev) { hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev); if (sc->daemon_task != NULL) { PROC_LOCK(sc->daemon_task); kern_psignal(sc->daemon_task, SIGKILL); PROC_UNLOCK(sc->daemon_task); } destroy_dev(sc->hv_kvp_dev); return hv_util_detach(dev); } static device_method_t kvp_methods[] = { /* Device interface */ DEVMETHOD(device_probe, hv_kvp_probe), DEVMETHOD(device_attach, hv_kvp_attach), DEVMETHOD(device_detach, hv_kvp_detach), { 0, 0 } }; static driver_t kvp_driver = { "hvkvp", kvp_methods, sizeof(hv_kvp_sc)}; static devclass_t kvp_devclass; DRIVER_MODULE(hv_kvp, vmbus, kvp_driver, kvp_devclass, NULL, NULL); MODULE_VERSION(hv_kvp, 1); MODULE_DEPEND(hv_kvp, vmbus, 1, 1, 1); Index: head/sys/dev/hyperv/vmbus/hv_vmbus_priv.h =================================================================== --- head/sys/dev/hyperv/vmbus/hv_vmbus_priv.h (revision 303070) +++ head/sys/dev/hyperv/vmbus/hv_vmbus_priv.h (revision 303071) @@ -1,87 +1,87 @@ /*- * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __HYPERV_PRIV_H__ #define __HYPERV_PRIV_H__ #include #include #include #include #include -#include +#include struct vmbus_softc; /* * Private, VM Bus functions */ struct sysctl_ctx_list; struct sysctl_oid_list; void hv_ring_buffer_stat( struct sysctl_ctx_list *ctx, struct sysctl_oid_list *tree_node, hv_vmbus_ring_buffer_info *rbi, const char *desc); int hv_vmbus_ring_buffer_init( hv_vmbus_ring_buffer_info *ring_info, void *buffer, uint32_t buffer_len); void hv_ring_buffer_cleanup( hv_vmbus_ring_buffer_info *ring_info); int hv_ring_buffer_write( hv_vmbus_ring_buffer_info *ring_info, const struct iovec iov[], uint32_t iovlen, boolean_t *need_sig); int hv_ring_buffer_peek( hv_vmbus_ring_buffer_info *ring_info, void *buffer, uint32_t buffer_len); int hv_ring_buffer_read( hv_vmbus_ring_buffer_info *ring_info, void *buffer, uint32_t buffer_len, uint32_t offset); void hv_ring_buffer_read_begin( hv_vmbus_ring_buffer_info *ring_info); uint32_t hv_ring_buffer_read_end( hv_vmbus_ring_buffer_info *ring_info); #endif /* __HYPERV_PRIV_H__ */ Index: head/sys/dev/hyperv/vmbus/vmbus_chan.c =================================================================== --- head/sys/dev/hyperv/vmbus/vmbus_chan.c (revision 303070) +++ head/sys/dev/hyperv/vmbus/vmbus_chan.c (revision 303071) @@ -1,1380 +1,1407 @@ /*- * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void vmbus_chan_signal_tx(struct hv_vmbus_channel *chan); static void vmbus_chan_update_evtflagcnt(struct vmbus_softc *, const struct hv_vmbus_channel *); static void vmbus_chan_task(void *, int); static void vmbus_chan_task_nobatch(void *, int); static void vmbus_chan_detach_task(void *, int); static void vmbus_chan_msgproc_choffer(struct vmbus_softc *, const struct vmbus_message *); static void vmbus_chan_msgproc_chrescind(struct vmbus_softc *, const struct vmbus_message *); /* * Vmbus channel message processing. */ static const vmbus_chanmsg_proc_t vmbus_chan_msgprocs[VMBUS_CHANMSG_TYPE_MAX] = { VMBUS_CHANMSG_PROC(CHOFFER, vmbus_chan_msgproc_choffer), VMBUS_CHANMSG_PROC(CHRESCIND, vmbus_chan_msgproc_chrescind), VMBUS_CHANMSG_PROC_WAKEUP(CHOPEN_RESP), VMBUS_CHANMSG_PROC_WAKEUP(GPADL_CONNRESP), VMBUS_CHANMSG_PROC_WAKEUP(GPADL_DISCONNRESP) }; /** * @brief Trigger an event notification on the specified channel */ static void vmbus_chan_signal_tx(struct hv_vmbus_channel *chan) { struct vmbus_softc *sc = chan->ch_vmbus; uint32_t chanid = chan->ch_id; atomic_set_long(&sc->vmbus_tx_evtflags[chanid >> VMBUS_EVTFLAG_SHIFT], 1UL << (chanid & VMBUS_EVTFLAG_MASK)); if (chan->ch_flags & VMBUS_CHAN_FLAG_HASMNF) { atomic_set_int( &sc->vmbus_mnf2->mnf_trigs[chan->ch_montrig_idx].mt_pending, chan->ch_montrig_mask); } else { hypercall_signal_event(chan->ch_monprm_dma.hv_paddr); } } static int vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS) { struct hv_vmbus_channel *chan = arg1; int mnf = 0; if (chan->ch_flags & VMBUS_CHAN_FLAG_HASMNF) mnf = 1; return sysctl_handle_int(oidp, &mnf, 0, req); } static void vmbus_chan_sysctl_create(struct hv_vmbus_channel *chan) { struct sysctl_oid *ch_tree, *chid_tree, *br_tree; struct sysctl_ctx_list *ctx; uint32_t ch_id; char name[16]; /* * Add sysctl nodes related to this channel to this * channel's sysctl ctx, so that they can be destroyed * independently upon close of this channel, which can * happen even if the device is not detached. */ ctx = &chan->ch_sysctl_ctx; sysctl_ctx_init(ctx); /* * Create dev.NAME.UNIT.channel tree. */ ch_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(chan->ch_dev)), OID_AUTO, "channel", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (ch_tree == NULL) return; /* * Create dev.NAME.UNIT.channel.CHANID tree. */ if (VMBUS_CHAN_ISPRIMARY(chan)) ch_id = chan->ch_id; else ch_id = chan->ch_prichan->ch_id; snprintf(name, sizeof(name), "%d", ch_id); chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (chid_tree == NULL) return; if (!VMBUS_CHAN_ISPRIMARY(chan)) { /* * Create dev.NAME.UNIT.channel.CHANID.sub tree. */ ch_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "sub", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (ch_tree == NULL) return; /* * Create dev.NAME.UNIT.channel.CHANID.sub.SUBIDX tree. * * NOTE: * chid_tree is changed to this new sysctl tree. */ snprintf(name, sizeof(name), "%d", chan->ch_subidx); chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (chid_tree == NULL) return; SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "chanid", CTLFLAG_RD, &chan->ch_id, 0, "channel id"); } SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "cpu", CTLFLAG_RD, &chan->ch_cpuid, 0, "owner CPU id"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "mnf", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, chan, 0, vmbus_chan_sysctl_mnf, "I", "has monitor notification facilities"); /* * Create sysctl tree for RX bufring. */ br_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "in", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (br_tree != NULL) { hv_ring_buffer_stat(ctx, SYSCTL_CHILDREN(br_tree), &chan->ch_rxbr, "inbound ring buffer stats"); } /* * Create sysctl tree for TX bufring. */ br_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "out", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (br_tree != NULL) { hv_ring_buffer_stat(ctx, SYSCTL_CHILDREN(br_tree), &chan->ch_txbr, "outbound ring buffer stats"); } } int vmbus_chan_open(struct hv_vmbus_channel *chan, int txbr_size, int rxbr_size, const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg) { struct vmbus_softc *sc = chan->ch_vmbus; const struct vmbus_chanmsg_chopen_resp *resp; const struct vmbus_message *msg; struct vmbus_chanmsg_chopen *req; struct vmbus_msghc *mh; uint32_t status; int error; uint8_t *br; if (udlen > VMBUS_CHANMSG_CHOPEN_UDATA_SIZE) { device_printf(sc->vmbus_dev, "invalid udata len %d for chan%u\n", udlen, chan->ch_id); return EINVAL; } KASSERT((txbr_size & PAGE_MASK) == 0, ("send bufring size is not multiple page")); KASSERT((rxbr_size & PAGE_MASK) == 0, ("recv bufring size is not multiple page")); if (atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED_SHIFT)) panic("double-open chan%u", chan->ch_id); chan->ch_cb = cb; chan->ch_cbarg = cbarg; vmbus_chan_update_evtflagcnt(sc, chan); chan->ch_tq = VMBUS_PCPU_GET(chan->ch_vmbus, event_tq, chan->ch_cpuid); if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD) TASK_INIT(&chan->ch_task, 0, vmbus_chan_task, chan); else TASK_INIT(&chan->ch_task, 0, vmbus_chan_task_nobatch, chan); /* * Allocate the TX+RX bufrings. * XXX should use ch_dev dtag */ br = hyperv_dmamem_alloc(bus_get_dma_tag(sc->vmbus_dev), PAGE_SIZE, 0, txbr_size + rxbr_size, &chan->ch_bufring_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (br == NULL) { device_printf(sc->vmbus_dev, "bufring allocation failed\n"); error = ENOMEM; goto failed; } chan->ch_bufring = br; /* TX bufring comes first */ hv_vmbus_ring_buffer_init(&chan->ch_txbr, br, txbr_size); /* RX bufring immediately follows TX bufring */ hv_vmbus_ring_buffer_init(&chan->ch_rxbr, br + txbr_size, rxbr_size); /* Create sysctl tree for this channel */ vmbus_chan_sysctl_create(chan); /* * Connect the bufrings, both RX and TX, to this channel. */ error = vmbus_chan_gpadl_connect(chan, chan->ch_bufring_dma.hv_paddr, txbr_size + rxbr_size, &chan->ch_bufring_gpadl); if (error) { device_printf(sc->vmbus_dev, "failed to connect bufring GPADL to chan%u\n", chan->ch_id); goto failed; } /* * Open channel w/ the bufring GPADL on the target CPU. */ mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { device_printf(sc->vmbus_dev, "can not get msg hypercall for chopen(chan%u)\n", chan->ch_id); error = ENXIO; goto failed; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHOPEN; req->chm_chanid = chan->ch_id; req->chm_openid = chan->ch_id; req->chm_gpadl = chan->ch_bufring_gpadl; req->chm_vcpuid = chan->ch_vcpuid; req->chm_txbr_pgcnt = txbr_size >> PAGE_SHIFT; if (udlen > 0) memcpy(req->chm_udata, udata, udlen); error = vmbus_msghc_exec(sc, mh); if (error) { device_printf(sc->vmbus_dev, "chopen(chan%u) msg hypercall exec failed: %d\n", chan->ch_id, error); vmbus_msghc_put(sc, mh); goto failed; } msg = vmbus_msghc_wait_result(sc, mh); resp = (const struct vmbus_chanmsg_chopen_resp *)msg->msg_data; status = resp->chm_status; vmbus_msghc_put(sc, mh); if (status == 0) { if (bootverbose) { device_printf(sc->vmbus_dev, "chan%u opened\n", chan->ch_id); } return 0; } device_printf(sc->vmbus_dev, "failed to open chan%u\n", chan->ch_id); error = ENXIO; failed: if (chan->ch_bufring_gpadl) { vmbus_chan_gpadl_disconnect(chan, chan->ch_bufring_gpadl); chan->ch_bufring_gpadl = 0; } if (chan->ch_bufring != NULL) { hyperv_dmamem_free(&chan->ch_bufring_dma, chan->ch_bufring); chan->ch_bufring = NULL; } atomic_clear_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED); return error; } int vmbus_chan_gpadl_connect(struct hv_vmbus_channel *chan, bus_addr_t paddr, int size, uint32_t *gpadl0) { struct vmbus_softc *sc = chan->ch_vmbus; struct vmbus_msghc *mh; struct vmbus_chanmsg_gpadl_conn *req; const struct vmbus_message *msg; size_t reqsz; uint32_t gpadl, status; int page_count, range_len, i, cnt, error; uint64_t page_id; /* * Preliminary checks. */ KASSERT((size & PAGE_MASK) == 0, ("invalid GPA size %d, not multiple page size", size)); page_count = size >> PAGE_SHIFT; KASSERT((paddr & PAGE_MASK) == 0, ("GPA is not page aligned %jx", (uintmax_t)paddr)); page_id = paddr >> PAGE_SHIFT; range_len = __offsetof(struct vmbus_gpa_range, gpa_page[page_count]); /* * We don't support multiple GPA ranges. */ if (range_len > UINT16_MAX) { device_printf(sc->vmbus_dev, "GPA too large, %d pages\n", page_count); return EOPNOTSUPP; } /* * Allocate GPADL id. */ gpadl = vmbus_gpadl_alloc(sc); *gpadl0 = gpadl; /* * Connect this GPADL to the target channel. * * NOTE: * Since each message can only hold small set of page * addresses, several messages may be required to * complete the connection. */ if (page_count > VMBUS_CHANMSG_GPADL_CONN_PGMAX) cnt = VMBUS_CHANMSG_GPADL_CONN_PGMAX; else cnt = page_count; page_count -= cnt; reqsz = __offsetof(struct vmbus_chanmsg_gpadl_conn, chm_range.gpa_page[cnt]); mh = vmbus_msghc_get(sc, reqsz); if (mh == NULL) { device_printf(sc->vmbus_dev, "can not get msg hypercall for gpadl->chan%u\n", chan->ch_id); return EIO; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_CONN; req->chm_chanid = chan->ch_id; req->chm_gpadl = gpadl; req->chm_range_len = range_len; req->chm_range_cnt = 1; req->chm_range.gpa_len = size; req->chm_range.gpa_ofs = 0; for (i = 0; i < cnt; ++i) req->chm_range.gpa_page[i] = page_id++; error = vmbus_msghc_exec(sc, mh); if (error) { device_printf(sc->vmbus_dev, "gpadl->chan%u msg hypercall exec failed: %d\n", chan->ch_id, error); vmbus_msghc_put(sc, mh); return error; } while (page_count > 0) { struct vmbus_chanmsg_gpadl_subconn *subreq; if (page_count > VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX) cnt = VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX; else cnt = page_count; page_count -= cnt; reqsz = __offsetof(struct vmbus_chanmsg_gpadl_subconn, chm_gpa_page[cnt]); vmbus_msghc_reset(mh, reqsz); subreq = vmbus_msghc_dataptr(mh); subreq->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_SUBCONN; subreq->chm_gpadl = gpadl; for (i = 0; i < cnt; ++i) subreq->chm_gpa_page[i] = page_id++; vmbus_msghc_exec_noresult(mh); } KASSERT(page_count == 0, ("invalid page count %d", page_count)); msg = vmbus_msghc_wait_result(sc, mh); status = ((const struct vmbus_chanmsg_gpadl_connresp *) msg->msg_data)->chm_status; vmbus_msghc_put(sc, mh); if (status != 0) { device_printf(sc->vmbus_dev, "gpadl->chan%u failed: " "status %u\n", chan->ch_id, status); return EIO; } else { if (bootverbose) { device_printf(sc->vmbus_dev, "gpadl->chan%u " "succeeded\n", chan->ch_id); } } return 0; } /* * Disconnect the GPA from the target channel */ int vmbus_chan_gpadl_disconnect(struct hv_vmbus_channel *chan, uint32_t gpadl) { struct vmbus_softc *sc = chan->ch_vmbus; struct vmbus_msghc *mh; struct vmbus_chanmsg_gpadl_disconn *req; int error; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { device_printf(sc->vmbus_dev, "can not get msg hypercall for gpa x->chan%u\n", chan->ch_id); return EBUSY; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_DISCONN; req->chm_chanid = chan->ch_id; req->chm_gpadl = gpadl; error = vmbus_msghc_exec(sc, mh); if (error) { device_printf(sc->vmbus_dev, "gpa x->chan%u msg hypercall exec failed: %d\n", chan->ch_id, error); vmbus_msghc_put(sc, mh); return error; } vmbus_msghc_wait_result(sc, mh); /* Discard result; no useful information */ vmbus_msghc_put(sc, mh); return 0; } static void vmbus_chan_close_internal(struct hv_vmbus_channel *chan) { struct vmbus_softc *sc = chan->ch_vmbus; struct vmbus_msghc *mh; struct vmbus_chanmsg_chclose *req; struct taskqueue *tq = chan->ch_tq; int error; /* TODO: stringent check */ atomic_clear_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED); /* * Free this channel's sysctl tree attached to its device's * sysctl tree. */ sysctl_ctx_free(&chan->ch_sysctl_ctx); /* * Set ch_tq to NULL to avoid more requests be scheduled. * XXX pretty broken; need rework. */ chan->ch_tq = NULL; taskqueue_drain(tq, &chan->ch_task); chan->ch_cb = NULL; /* * Close this channel. */ mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { device_printf(sc->vmbus_dev, "can not get msg hypercall for chclose(chan%u)\n", chan->ch_id); return; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHCLOSE; req->chm_chanid = chan->ch_id; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); if (error) { device_printf(sc->vmbus_dev, "chclose(chan%u) msg hypercall exec failed: %d\n", chan->ch_id, error); return; } else if (bootverbose) { device_printf(sc->vmbus_dev, "close chan%u\n", chan->ch_id); } /* * Disconnect the TX+RX bufrings from this channel. */ if (chan->ch_bufring_gpadl) { vmbus_chan_gpadl_disconnect(chan, chan->ch_bufring_gpadl); chan->ch_bufring_gpadl = 0; } /* * Destroy the TX+RX bufrings. */ hv_ring_buffer_cleanup(&chan->ch_txbr); hv_ring_buffer_cleanup(&chan->ch_rxbr); if (chan->ch_bufring != NULL) { hyperv_dmamem_free(&chan->ch_bufring_dma, chan->ch_bufring); chan->ch_bufring = NULL; } } /* * Caller should make sure that all sub-channels have * been added to 'chan' and all to-be-closed channels * are not being opened. */ void vmbus_chan_close(struct hv_vmbus_channel *chan) { int subchan_cnt; if (!VMBUS_CHAN_ISPRIMARY(chan)) { /* * Sub-channel is closed when its primary channel * is closed; done. */ return; } /* * Close all sub-channels, if any. */ subchan_cnt = chan->ch_subchan_cnt; if (subchan_cnt > 0) { struct hv_vmbus_channel **subchan; int i; subchan = vmbus_subchan_get(chan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) vmbus_chan_close_internal(subchan[i]); vmbus_subchan_rel(subchan, subchan_cnt); } /* Then close the primary channel. */ vmbus_chan_close_internal(chan); } int vmbus_chan_send(struct hv_vmbus_channel *chan, uint16_t type, uint16_t flags, void *data, int dlen, uint64_t xactid) { struct vmbus_chanpkt pkt; int pktlen, pad_pktlen, hlen, error; uint64_t pad = 0; struct iovec iov[3]; boolean_t send_evt; hlen = sizeof(pkt); pktlen = hlen + dlen; pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen); pkt.cp_hdr.cph_type = type; pkt.cp_hdr.cph_flags = flags; VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen); VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen); pkt.cp_hdr.cph_xactid = xactid; iov[0].iov_base = &pkt; iov[0].iov_len = hlen; iov[1].iov_base = data; iov[1].iov_len = dlen; iov[2].iov_base = &pad; iov[2].iov_len = pad_pktlen - pktlen; error = hv_ring_buffer_write(&chan->ch_txbr, iov, 3, &send_evt); if (!error && send_evt) vmbus_chan_signal_tx(chan); return error; } int vmbus_chan_send_sglist(struct hv_vmbus_channel *chan, struct vmbus_gpa sg[], int sglen, void *data, int dlen, uint64_t xactid) { struct vmbus_chanpkt_sglist pkt; int pktlen, pad_pktlen, hlen, error; struct iovec iov[4]; boolean_t send_evt; uint64_t pad = 0; KASSERT(sglen < VMBUS_CHAN_SGLIST_MAX, ("invalid sglist len %d", sglen)); hlen = __offsetof(struct vmbus_chanpkt_sglist, cp_gpa[sglen]); pktlen = hlen + dlen; pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen); pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA; pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC; VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen); VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen); pkt.cp_hdr.cph_xactid = xactid; pkt.cp_rsvd = 0; pkt.cp_gpa_cnt = sglen; iov[0].iov_base = &pkt; iov[0].iov_len = sizeof(pkt); iov[1].iov_base = sg; iov[1].iov_len = sizeof(struct vmbus_gpa) * sglen; iov[2].iov_base = data; iov[2].iov_len = dlen; iov[3].iov_base = &pad; iov[3].iov_len = pad_pktlen - pktlen; error = hv_ring_buffer_write(&chan->ch_txbr, iov, 4, &send_evt); if (!error && send_evt) vmbus_chan_signal_tx(chan); return error; } int vmbus_chan_send_prplist(struct hv_vmbus_channel *chan, struct vmbus_gpa_range *prp, int prp_cnt, void *data, int dlen, uint64_t xactid) { struct vmbus_chanpkt_prplist pkt; int pktlen, pad_pktlen, hlen, error; struct iovec iov[4]; boolean_t send_evt; uint64_t pad = 0; KASSERT(prp_cnt < VMBUS_CHAN_PRPLIST_MAX, ("invalid prplist entry count %d", prp_cnt)); hlen = __offsetof(struct vmbus_chanpkt_prplist, cp_range[0].gpa_page[prp_cnt]); pktlen = hlen + dlen; pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen); pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA; pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC; VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen); VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen); pkt.cp_hdr.cph_xactid = xactid; pkt.cp_rsvd = 0; pkt.cp_range_cnt = 1; iov[0].iov_base = &pkt; iov[0].iov_len = sizeof(pkt); iov[1].iov_base = prp; iov[1].iov_len = __offsetof(struct vmbus_gpa_range, gpa_page[prp_cnt]); iov[2].iov_base = data; iov[2].iov_len = dlen; iov[3].iov_base = &pad; iov[3].iov_len = pad_pktlen - pktlen; error = hv_ring_buffer_write(&chan->ch_txbr, iov, 4, &send_evt); if (!error && send_evt) vmbus_chan_signal_tx(chan); return error; } int vmbus_chan_recv(struct hv_vmbus_channel *chan, void *data, int *dlen0, uint64_t *xactid) { struct vmbus_chanpkt_hdr pkt; int error, dlen, hlen; error = hv_ring_buffer_peek(&chan->ch_rxbr, &pkt, sizeof(pkt)); if (error) return error; hlen = VMBUS_CHANPKT_GETLEN(pkt.cph_hlen); dlen = VMBUS_CHANPKT_GETLEN(pkt.cph_tlen) - hlen; if (*dlen0 < dlen) { /* Return the size of this packet's data. */ *dlen0 = dlen; return ENOBUFS; } *xactid = pkt.cph_xactid; *dlen0 = dlen; /* Skip packet header */ error = hv_ring_buffer_read(&chan->ch_rxbr, data, dlen, hlen); KASSERT(!error, ("hv_ring_buffer_read failed")); return 0; } int vmbus_chan_recv_pkt(struct hv_vmbus_channel *chan, struct vmbus_chanpkt_hdr *pkt0, int *pktlen0) { struct vmbus_chanpkt_hdr pkt; int error, pktlen; error = hv_ring_buffer_peek(&chan->ch_rxbr, &pkt, sizeof(pkt)); if (error) return error; pktlen = VMBUS_CHANPKT_GETLEN(pkt.cph_tlen); if (*pktlen0 < pktlen) { /* Return the size of this packet. */ *pktlen0 = pktlen; return ENOBUFS; } *pktlen0 = pktlen; /* Include packet header */ error = hv_ring_buffer_read(&chan->ch_rxbr, pkt0, pktlen, 0); KASSERT(!error, ("hv_ring_buffer_read failed")); return 0; } static void vmbus_chan_task(void *xchan, int pending __unused) { struct hv_vmbus_channel *chan = xchan; vmbus_chan_callback_t cb = chan->ch_cb; void *cbarg = chan->ch_cbarg; /* * Optimize host to guest signaling by ensuring: * 1. While reading the channel, we disable interrupts from * host. * 2. Ensure that we process all posted messages from the host * before returning from this callback. * 3. Once we return, enable signaling from the host. Once this * state is set we check to see if additional packets are * available to read. In this case we repeat the process. * * NOTE: Interrupt has been disabled in the ISR. */ for (;;) { uint32_t left; cb(chan, cbarg); left = hv_ring_buffer_read_end(&chan->ch_rxbr); if (left == 0) { /* No more data in RX bufring; done */ break; } hv_ring_buffer_read_begin(&chan->ch_rxbr); } } static void vmbus_chan_task_nobatch(void *xchan, int pending __unused) { struct hv_vmbus_channel *chan = xchan; chan->ch_cb(chan, chan->ch_cbarg); } static __inline void vmbus_event_flags_proc(struct vmbus_softc *sc, volatile u_long *event_flags, int flag_cnt) { int f; for (f = 0; f < flag_cnt; ++f) { uint32_t chid_base; u_long flags; int chid_ofs; if (event_flags[f] == 0) continue; flags = atomic_swap_long(&event_flags[f], 0); chid_base = f << VMBUS_EVTFLAG_SHIFT; while ((chid_ofs = ffsl(flags)) != 0) { struct hv_vmbus_channel *chan; --chid_ofs; /* NOTE: ffsl is 1-based */ flags &= ~(1UL << chid_ofs); chan = sc->vmbus_chmap[chid_base + chid_ofs]; /* if channel is closed or closing */ if (chan == NULL || chan->ch_tq == NULL) continue; if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD) hv_ring_buffer_read_begin(&chan->ch_rxbr); taskqueue_enqueue(chan->ch_tq, &chan->ch_task); } } } void vmbus_event_proc(struct vmbus_softc *sc, int cpu) { struct vmbus_evtflags *eventf; /* * On Host with Win8 or above, the event page can be checked directly * to get the id of the channel that has the pending interrupt. */ eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE; vmbus_event_flags_proc(sc, eventf->evt_flags, VMBUS_PCPU_GET(sc, event_flags_cnt, cpu)); } void vmbus_event_proc_compat(struct vmbus_softc *sc, int cpu) { struct vmbus_evtflags *eventf; eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE; if (atomic_testandclear_long(&eventf->evt_flags[0], 0)) { vmbus_event_flags_proc(sc, sc->vmbus_rx_evtflags, VMBUS_CHAN_MAX_COMPAT >> VMBUS_EVTFLAG_SHIFT); } } static void vmbus_chan_update_evtflagcnt(struct vmbus_softc *sc, const struct hv_vmbus_channel *chan) { volatile int *flag_cnt_ptr; int flag_cnt; flag_cnt = (chan->ch_id / VMBUS_EVTFLAG_LEN) + 1; flag_cnt_ptr = VMBUS_PCPU_PTR(sc, event_flags_cnt, chan->ch_cpuid); for (;;) { int old_flag_cnt; old_flag_cnt = *flag_cnt_ptr; if (old_flag_cnt >= flag_cnt) break; if (atomic_cmpset_int(flag_cnt_ptr, old_flag_cnt, flag_cnt)) { if (bootverbose) { device_printf(sc->vmbus_dev, "channel%u update cpu%d flag_cnt to %d\n", chan->ch_id, chan->ch_cpuid, flag_cnt); } break; } } } static struct hv_vmbus_channel * vmbus_chan_alloc(struct vmbus_softc *sc) { struct hv_vmbus_channel *chan; chan = malloc(sizeof(*chan), M_DEVBUF, M_WAITOK | M_ZERO); chan->ch_monprm = hyperv_dmamem_alloc(bus_get_dma_tag(sc->vmbus_dev), HYPERCALL_PARAM_ALIGN, 0, sizeof(struct hyperv_mon_param), &chan->ch_monprm_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (chan->ch_monprm == NULL) { device_printf(sc->vmbus_dev, "monprm alloc failed\n"); free(chan, M_DEVBUF); return NULL; } chan->ch_vmbus = sc; mtx_init(&chan->ch_subchan_lock, "vmbus subchan", NULL, MTX_DEF); TAILQ_INIT(&chan->ch_subchans); TASK_INIT(&chan->ch_detach_task, 0, vmbus_chan_detach_task, chan); return chan; } static void vmbus_chan_free(struct hv_vmbus_channel *chan) { /* TODO: assert sub-channel list is empty */ /* TODO: asset no longer on the primary channel's sub-channel list */ /* TODO: asset no longer on the vmbus channel list */ hyperv_dmamem_free(&chan->ch_monprm_dma, chan->ch_monprm); mtx_destroy(&chan->ch_subchan_lock); free(chan, M_DEVBUF); } static int vmbus_chan_add(struct hv_vmbus_channel *newchan) { struct vmbus_softc *sc = newchan->ch_vmbus; struct hv_vmbus_channel *prichan; if (newchan->ch_id == 0) { /* * XXX * Chan0 will neither be processed nor should be offered; * skip it. */ device_printf(sc->vmbus_dev, "got chan0 offer, discard\n"); return EINVAL; } else if (newchan->ch_id >= VMBUS_CHAN_MAX) { device_printf(sc->vmbus_dev, "invalid chan%u offer\n", newchan->ch_id); return EINVAL; } sc->vmbus_chmap[newchan->ch_id] = newchan; if (bootverbose) { device_printf(sc->vmbus_dev, "chan%u subidx%u offer\n", newchan->ch_id, newchan->ch_subidx); } mtx_lock(&sc->vmbus_prichan_lock); TAILQ_FOREACH(prichan, &sc->vmbus_prichans, ch_prilink) { /* * Sub-channel will have the same type GUID and instance * GUID as its primary channel. */ if (memcmp(&prichan->ch_guid_type, &newchan->ch_guid_type, sizeof(struct hyperv_guid)) == 0 && memcmp(&prichan->ch_guid_inst, &newchan->ch_guid_inst, sizeof(struct hyperv_guid)) == 0) break; } if (VMBUS_CHAN_ISPRIMARY(newchan)) { if (prichan == NULL) { /* Install the new primary channel */ TAILQ_INSERT_TAIL(&sc->vmbus_prichans, newchan, ch_prilink); mtx_unlock(&sc->vmbus_prichan_lock); return 0; } else { mtx_unlock(&sc->vmbus_prichan_lock); device_printf(sc->vmbus_dev, "duplicated primary " "chan%u\n", newchan->ch_id); return EINVAL; } } else { /* Sub-channel */ if (prichan == NULL) { mtx_unlock(&sc->vmbus_prichan_lock); device_printf(sc->vmbus_dev, "no primary chan for " "chan%u\n", newchan->ch_id); return EINVAL; } /* * Found the primary channel for this sub-channel and * move on. * * XXX refcnt prichan */ } mtx_unlock(&sc->vmbus_prichan_lock); /* * This is a sub-channel; link it with the primary channel. */ KASSERT(!VMBUS_CHAN_ISPRIMARY(newchan), ("new channel is not sub-channel")); KASSERT(prichan != NULL, ("no primary channel")); newchan->ch_prichan = prichan; newchan->ch_dev = prichan->ch_dev; mtx_lock(&prichan->ch_subchan_lock); TAILQ_INSERT_TAIL(&prichan->ch_subchans, newchan, ch_sublink); /* * Bump up sub-channel count and notify anyone that is * interested in this sub-channel, after this sub-channel * is setup. */ prichan->ch_subchan_cnt++; mtx_unlock(&prichan->ch_subchan_lock); wakeup(prichan); return 0; } void vmbus_chan_cpu_set(struct hv_vmbus_channel *chan, int cpu) { KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu %d", cpu)); if (chan->ch_vmbus->vmbus_version == VMBUS_VERSION_WS2008 || chan->ch_vmbus->vmbus_version == VMBUS_VERSION_WIN7) { /* Only cpu0 is supported */ cpu = 0; } chan->ch_cpuid = cpu; chan->ch_vcpuid = VMBUS_PCPU_GET(chan->ch_vmbus, vcpuid, cpu); if (bootverbose) { printf("vmbus_chan%u: assigned to cpu%u [vcpu%u]\n", chan->ch_id, chan->ch_cpuid, chan->ch_vcpuid); } } void vmbus_chan_cpu_rr(struct hv_vmbus_channel *chan) { static uint32_t vmbus_chan_nextcpu; int cpu; cpu = atomic_fetchadd_int(&vmbus_chan_nextcpu, 1) % mp_ncpus; vmbus_chan_cpu_set(chan, cpu); } static void vmbus_chan_cpu_default(struct hv_vmbus_channel *chan) { /* * By default, pin the channel to cpu0. Devices having * special channel-cpu mapping requirement should call * vmbus_chan_cpu_{set,rr}(). */ vmbus_chan_cpu_set(chan, 0); } static void vmbus_chan_msgproc_choffer(struct vmbus_softc *sc, const struct vmbus_message *msg) { const struct vmbus_chanmsg_choffer *offer; struct hv_vmbus_channel *chan; int error; offer = (const struct vmbus_chanmsg_choffer *)msg->msg_data; chan = vmbus_chan_alloc(sc); if (chan == NULL) { device_printf(sc->vmbus_dev, "allocate chan%u failed\n", offer->chm_chanid); return; } chan->ch_id = offer->chm_chanid; chan->ch_subidx = offer->chm_subidx; chan->ch_guid_type = offer->chm_chtype; chan->ch_guid_inst = offer->chm_chinst; /* Batch reading is on by default */ chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD; chan->ch_monprm->mp_connid = VMBUS_CONNID_EVENT; if (sc->vmbus_version != VMBUS_VERSION_WS2008) chan->ch_monprm->mp_connid = offer->chm_connid; if (offer->chm_flags1 & VMBUS_CHOFFER_FLAG1_HASMNF) { /* * Setup MNF stuffs. */ chan->ch_flags |= VMBUS_CHAN_FLAG_HASMNF; chan->ch_montrig_idx = offer->chm_montrig / VMBUS_MONTRIG_LEN; if (chan->ch_montrig_idx >= VMBUS_MONTRIGS_MAX) panic("invalid monitor trigger %u", offer->chm_montrig); chan->ch_montrig_mask = 1 << (offer->chm_montrig % VMBUS_MONTRIG_LEN); } /* Select default cpu for this channel. */ vmbus_chan_cpu_default(chan); error = vmbus_chan_add(chan); if (error) { device_printf(sc->vmbus_dev, "add chan%u failed: %d\n", chan->ch_id, error); vmbus_chan_free(chan); return; } if (VMBUS_CHAN_ISPRIMARY(chan)) { /* * Add device for this primary channel. * * NOTE: * Error is ignored here; don't have much to do if error * really happens. */ vmbus_add_child(chan); } } /* * XXX pretty broken; need rework. */ static void vmbus_chan_msgproc_chrescind(struct vmbus_softc *sc, const struct vmbus_message *msg) { const struct vmbus_chanmsg_chrescind *note; struct hv_vmbus_channel *chan; note = (const struct vmbus_chanmsg_chrescind *)msg->msg_data; if (note->chm_chanid > VMBUS_CHAN_MAX) { device_printf(sc->vmbus_dev, "invalid rescinded chan%u\n", note->chm_chanid); return; } if (bootverbose) { device_printf(sc->vmbus_dev, "chan%u rescinded\n", note->chm_chanid); } chan = sc->vmbus_chmap[note->chm_chanid]; if (chan == NULL) return; sc->vmbus_chmap[note->chm_chanid] = NULL; taskqueue_enqueue(taskqueue_thread, &chan->ch_detach_task); } static void vmbus_chan_detach_task(void *xchan, int pending __unused) { struct hv_vmbus_channel *chan = xchan; if (VMBUS_CHAN_ISPRIMARY(chan)) { /* Only primary channel owns the device */ vmbus_delete_child(chan); /* NOTE: DO NOT free primary channel for now */ } else { struct vmbus_softc *sc = chan->ch_vmbus; struct hv_vmbus_channel *pri_chan = chan->ch_prichan; struct vmbus_chanmsg_chfree *req; struct vmbus_msghc *mh; int error; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { device_printf(sc->vmbus_dev, "can not get msg hypercall for chfree(chan%u)\n", chan->ch_id); goto remove; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHFREE; req->chm_chanid = chan->ch_id; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); if (error) { device_printf(sc->vmbus_dev, "chfree(chan%u) failed: %d", chan->ch_id, error); /* NOTE: Move on! */ } else { if (bootverbose) { device_printf(sc->vmbus_dev, "chan%u freed\n", chan->ch_id); } } remove: mtx_lock(&pri_chan->ch_subchan_lock); TAILQ_REMOVE(&pri_chan->ch_subchans, chan, ch_sublink); KASSERT(pri_chan->ch_subchan_cnt > 0, ("invalid subchan_cnt %d", pri_chan->ch_subchan_cnt)); pri_chan->ch_subchan_cnt--; mtx_unlock(&pri_chan->ch_subchan_lock); wakeup(pri_chan); vmbus_chan_free(chan); } } /* * Detach all devices and destroy the corresponding primary channels. */ void vmbus_chan_destroy_all(struct vmbus_softc *sc) { struct hv_vmbus_channel *chan; mtx_lock(&sc->vmbus_prichan_lock); while ((chan = TAILQ_FIRST(&sc->vmbus_prichans)) != NULL) { KASSERT(VMBUS_CHAN_ISPRIMARY(chan), ("not primary channel")); TAILQ_REMOVE(&sc->vmbus_prichans, chan, ch_prilink); mtx_unlock(&sc->vmbus_prichan_lock); vmbus_delete_child(chan); vmbus_chan_free(chan); mtx_lock(&sc->vmbus_prichan_lock); } bzero(sc->vmbus_chmap, sizeof(struct hv_vmbus_channel *) * VMBUS_CHAN_MAX); mtx_unlock(&sc->vmbus_prichan_lock); } /* * The channel whose vcpu binding is closest to the currect vcpu will * be selected. * If no multi-channel, always select primary channel. */ struct hv_vmbus_channel * vmbus_chan_cpu2chan(struct hv_vmbus_channel *prichan, int cpu) { struct hv_vmbus_channel *sel, *chan; uint32_t vcpu, sel_dist; KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpuid %d", cpu)); if (TAILQ_EMPTY(&prichan->ch_subchans)) return prichan; vcpu = VMBUS_PCPU_GET(prichan->ch_vmbus, vcpuid, cpu); #define CHAN_VCPU_DIST(ch, vcpu) \ (((ch)->ch_vcpuid > (vcpu)) ? \ ((ch)->ch_vcpuid - (vcpu)) : ((vcpu) - (ch)->ch_vcpuid)) #define CHAN_SELECT(ch) \ do { \ sel = ch; \ sel_dist = CHAN_VCPU_DIST(ch, vcpu); \ } while (0) CHAN_SELECT(prichan); mtx_lock(&prichan->ch_subchan_lock); TAILQ_FOREACH(chan, &prichan->ch_subchans, ch_sublink) { uint32_t dist; KASSERT(chan->ch_stflags & VMBUS_CHAN_ST_OPENED, ("chan%u is not opened", chan->ch_id)); if (chan->ch_vcpuid == vcpu) { /* Exact match; done */ CHAN_SELECT(chan); break; } dist = CHAN_VCPU_DIST(chan, vcpu); if (sel_dist <= dist) { /* Far or same distance; skip */ continue; } /* Select the closer channel. */ CHAN_SELECT(chan); } mtx_unlock(&prichan->ch_subchan_lock); #undef CHAN_SELECT #undef CHAN_VCPU_DIST return sel; } struct hv_vmbus_channel ** vmbus_subchan_get(struct hv_vmbus_channel *pri_chan, int subchan_cnt) { struct hv_vmbus_channel **ret, *chan; int i; ret = malloc(subchan_cnt * sizeof(struct hv_vmbus_channel *), M_TEMP, M_WAITOK); mtx_lock(&pri_chan->ch_subchan_lock); while (pri_chan->ch_subchan_cnt < subchan_cnt) mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "subch", 0); i = 0; TAILQ_FOREACH(chan, &pri_chan->ch_subchans, ch_sublink) { /* TODO: refcnt chan */ ret[i] = chan; ++i; if (i == subchan_cnt) break; } KASSERT(i == subchan_cnt, ("invalid subchan count %d, should be %d", pri_chan->ch_subchan_cnt, subchan_cnt)); mtx_unlock(&pri_chan->ch_subchan_lock); return ret; } void vmbus_subchan_rel(struct hv_vmbus_channel **subchan, int subchan_cnt __unused) { free(subchan, M_TEMP); } void vmbus_subchan_drain(struct hv_vmbus_channel *pri_chan) { mtx_lock(&pri_chan->ch_subchan_lock); while (pri_chan->ch_subchan_cnt > 0) mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "dsubch", 0); mtx_unlock(&pri_chan->ch_subchan_lock); } void vmbus_chan_msgproc(struct vmbus_softc *sc, const struct vmbus_message *msg) { vmbus_chanmsg_proc_t msg_proc; uint32_t msg_type; msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type; KASSERT(msg_type < VMBUS_CHANMSG_TYPE_MAX, ("invalid message type %u", msg_type)); msg_proc = vmbus_chan_msgprocs[msg_type]; if (msg_proc != NULL) msg_proc(sc, msg); } void vmbus_chan_set_readbatch(struct hv_vmbus_channel *chan, bool on) { if (!on) chan->ch_flags &= ~VMBUS_CHAN_FLAG_BATCHREAD; else chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD; } + +uint32_t +vmbus_chan_id(const struct hv_vmbus_channel *chan) +{ + return chan->ch_id; +} + +uint32_t +vmbus_chan_subidx(const struct hv_vmbus_channel *chan) +{ + return chan->ch_subidx; +} + +bool +vmbus_chan_is_primary(const struct hv_vmbus_channel *chan) +{ + if (VMBUS_CHAN_ISPRIMARY(chan)) + return true; + else + return false; +} + +const struct hyperv_guid * +vmbus_chan_guid_inst(const struct hv_vmbus_channel *chan) +{ + return &chan->ch_guid_inst; +} Index: head/sys/dev/hyperv/vmbus/vmbus_chanvar.h =================================================================== --- head/sys/dev/hyperv/vmbus/vmbus_chanvar.h (nonexistent) +++ head/sys/dev/hyperv/vmbus/vmbus_chanvar.h (revision 303071) @@ -0,0 +1,155 @@ +/*- + * Copyright (c) 2016 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VMBUS_CHANVAR_H_ +#define _VMBUS_CHANVAR_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +typedef struct { + /* + * offset in bytes from the start of ring data below + */ + volatile uint32_t write_index; + /* + * offset in bytes from the start of ring data below + */ + volatile uint32_t read_index; + /* + * NOTE: The interrupt_mask field is used only for channels, but + * vmbus connection also uses this data structure + */ + volatile uint32_t interrupt_mask; + /* pad it to PAGE_SIZE so that data starts on a page */ + uint8_t reserved[4084]; + + /* + * WARNING: Ring data starts here + * !!! DO NOT place any fields below this !!! + */ + uint8_t buffer[0]; /* doubles as interrupt mask */ +} __packed hv_vmbus_ring_buffer; + +typedef struct { + hv_vmbus_ring_buffer* ring_buffer; + struct mtx ring_lock; + uint32_t ring_data_size; /* ring_size */ +} hv_vmbus_ring_buffer_info; + +typedef struct hv_vmbus_channel { + device_t ch_dev; + struct vmbus_softc *ch_vmbus; + uint32_t ch_flags; /* VMBUS_CHAN_FLAG_ */ + uint32_t ch_id; /* channel id */ + + /* + * These are based on the offer_msg.monitor_id. + * Save it here for easy access. + */ + int ch_montrig_idx; /* MNF trig index */ + uint32_t ch_montrig_mask;/* MNF trig mask */ + + /* + * TX bufring; at the beginning of ch_bufring. + */ + hv_vmbus_ring_buffer_info ch_txbr; + /* + * RX bufring; immediately following ch_txbr. + */ + hv_vmbus_ring_buffer_info ch_rxbr; + + struct taskqueue *ch_tq; + struct task ch_task; + vmbus_chan_callback_t ch_cb; + void *ch_cbarg; + + struct hyperv_mon_param *ch_monprm; + struct hyperv_dma ch_monprm_dma; + + int ch_cpuid; /* owner cpu */ + /* + * Virtual cpuid for ch_cpuid; it is used to communicate cpuid + * related information w/ Hyper-V. If MSR_HV_VP_INDEX does not + * exist, ch_vcpuid will always be 0 for compatibility. + */ + uint32_t ch_vcpuid; + + /* + * If this is a primary channel, ch_subchan* fields + * contain sub-channels belonging to this primary + * channel. + */ + struct mtx ch_subchan_lock; + TAILQ_HEAD(, hv_vmbus_channel) ch_subchans; + int ch_subchan_cnt; + + /* If this is a sub-channel */ + TAILQ_ENTRY(hv_vmbus_channel) ch_sublink; /* sub-channel link */ + struct hv_vmbus_channel *ch_prichan; /* owner primary chan */ + + void *ch_bufring; /* TX+RX bufrings */ + struct hyperv_dma ch_bufring_dma; + uint32_t ch_bufring_gpadl; + + struct task ch_detach_task; + TAILQ_ENTRY(hv_vmbus_channel) ch_prilink; /* primary chan link */ + uint32_t ch_subidx; /* subchan index */ + volatile uint32_t ch_stflags; /* atomic-op */ + /* VMBUS_CHAN_ST_ */ + struct hyperv_guid ch_guid_type; + struct hyperv_guid ch_guid_inst; + + struct sysctl_ctx_list ch_sysctl_ctx; +} hv_vmbus_channel; + +#define VMBUS_CHAN_ISPRIMARY(chan) ((chan)->ch_subidx == 0) + +#define VMBUS_CHAN_FLAG_HASMNF 0x0001 +/* + * If this flag is set, this channel's interrupt will be masked in ISR, + * and the RX bufring will be drained before this channel's interrupt is + * unmasked. + * + * This flag is turned on by default. Drivers can turn it off according + * to their own requirement. + */ +#define VMBUS_CHAN_FLAG_BATCHREAD 0x0002 + +#define VMBUS_CHAN_ST_OPENED_SHIFT 0 +#define VMBUS_CHAN_ST_OPENED (1 << VMBUS_CHAN_ST_OPENED_SHIFT) + +#endif /* !_VMBUS_CHANVAR_H_ */ Property changes on: head/sys/dev/hyperv/vmbus/vmbus_chanvar.h ___________________________________________________________________ Added: svn:keywords ## -0,0 +1 ## +FreeBSD=%H \ No newline at end of property