diff --git a/sys/dev/hyperv/include/hyperv_busdma.h b/sys/dev/hyperv/include/hyperv_busdma.h index ff01b3e27a95..804e5c26a669 100644 --- a/sys/dev/hyperv/include/hyperv_busdma.h +++ b/sys/dev/hyperv/include/hyperv_busdma.h @@ -1,49 +1,45 @@ /*- * Copyright (c) 2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _HYPERV_BUSDMA_H_ #define _HYPERV_BUSDMA_H_ #include #include #include struct hyperv_dma { bus_addr_t hv_paddr; bus_dma_tag_t hv_dtag; bus_dmamap_t hv_dmap; }; void hyperv_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error); -void *hyperv_dmamem_alloc(bus_dma_tag_t parent_dtag, - bus_size_t alignment, bus_addr_t boundary, bus_size_t size, - struct hyperv_dma *dma, int flags); -void hyperv_dmamem_free(struct hyperv_dma *dma, void *ptr); #endif /* !_HYPERV_BUSDMA_H_ */ diff --git a/sys/dev/hyperv/netvsc/hn_nvs.c b/sys/dev/hyperv/netvsc/hn_nvs.c index 4dbc28996617..e1769a806c89 100644 --- a/sys/dev/hyperv/netvsc/hn_nvs.c +++ b/sys/dev/hyperv/netvsc/hn_nvs.c @@ -1,751 +1,756 @@ /*- * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Network Virtualization Service. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include +#include +#include +#include + #include #include #include #include #include #include #include -#include #include #include #include #include #include #include static int hn_nvs_conn_chim(struct hn_softc *); static int hn_nvs_conn_rxbuf(struct hn_softc *); static void hn_nvs_disconn_chim(struct hn_softc *); static void hn_nvs_disconn_rxbuf(struct hn_softc *); static int hn_nvs_conf_ndis(struct hn_softc *, int); static int hn_nvs_init_ndis(struct hn_softc *); static int hn_nvs_doinit(struct hn_softc *, uint32_t); static int hn_nvs_init(struct hn_softc *); static const void *hn_nvs_xact_execute(struct hn_softc *, struct vmbus_xact *, void *, int, size_t *, uint32_t); static void hn_nvs_sent_none(struct hn_nvs_sendctx *, struct hn_softc *, struct vmbus_channel *, const void *, int); struct hn_nvs_sendctx hn_nvs_sendctx_none = HN_NVS_SENDCTX_INITIALIZER(hn_nvs_sent_none, NULL); static const uint32_t hn_nvs_version[] = { HN_NVS_VERSION_61, HN_NVS_VERSION_6, HN_NVS_VERSION_5, HN_NVS_VERSION_4, HN_NVS_VERSION_2, HN_NVS_VERSION_1 }; static const void * hn_nvs_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, void *req, int reqlen, size_t *resplen0, uint32_t type) { struct hn_nvs_sendctx sndc; size_t resplen, min_resplen = *resplen0; const struct hn_nvs_hdr *hdr; int error; KASSERT(min_resplen >= sizeof(*hdr), ("invalid minimum response len %zu", min_resplen)); /* * Execute the xact setup by the caller. */ hn_nvs_sendctx_init(&sndc, hn_nvs_sent_xact, xact); vmbus_xact_activate(xact); error = hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_RC, req, reqlen, &sndc); if (error) { vmbus_xact_deactivate(xact); return (NULL); } hdr = vmbus_chan_xact_wait(sc->hn_prichan, xact, &resplen, HN_CAN_SLEEP(sc)); /* * Check this NVS response message. */ if (resplen < min_resplen) { if_printf(sc->hn_ifp, "invalid NVS resp len %zu\n", resplen); return (NULL); } if (hdr->nvs_type != type) { if_printf(sc->hn_ifp, "unexpected NVS resp 0x%08x, " "expect 0x%08x\n", hdr->nvs_type, type); return (NULL); } /* All pass! */ *resplen0 = resplen; return (hdr); } static __inline int hn_nvs_req_send(struct hn_softc *sc, void *req, int reqlen) { return (hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_NONE, req, reqlen, &hn_nvs_sendctx_none)); } static int hn_nvs_conn_rxbuf(struct hn_softc *sc) { struct vmbus_xact *xact = NULL; struct hn_nvs_rxbuf_conn *conn; const struct hn_nvs_rxbuf_connresp *resp; size_t resp_len; uint32_t status; int error, rxbuf_size; /* * Limit RXBUF size for old NVS. */ if (sc->hn_nvs_ver <= HN_NVS_VERSION_2) rxbuf_size = HN_RXBUF_SIZE_COMPAT; else rxbuf_size = HN_RXBUF_SIZE; /* * Connect the RXBUF GPADL to the primary channel. * * NOTE: * Only primary channel has RXBUF connected to it. Sub-channels * just share this RXBUF. */ error = vmbus_chan_gpadl_connect(sc->hn_prichan, - sc->hn_rxbuf_dma.hv_paddr, rxbuf_size, &sc->hn_rxbuf_gpadl); + pmap_kextract((vm_offset_t)sc->hn_rxbuf), rxbuf_size, + &sc->hn_rxbuf_gpadl); if (error) { if_printf(sc->hn_ifp, "rxbuf gpadl conn failed: %d\n", error); goto cleanup; } /* * Connect RXBUF to NVS. */ xact = vmbus_xact_get(sc->hn_xact, sizeof(*conn)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for nvs rxbuf conn\n"); error = ENXIO; goto cleanup; } conn = vmbus_xact_req_data(xact); conn->nvs_type = HN_NVS_TYPE_RXBUF_CONN; conn->nvs_gpadl = sc->hn_rxbuf_gpadl; conn->nvs_sig = HN_NVS_RXBUF_SIG; resp_len = sizeof(*resp); resp = hn_nvs_xact_execute(sc, xact, conn, sizeof(*conn), &resp_len, HN_NVS_TYPE_RXBUF_CONNRESP); if (resp == NULL) { if_printf(sc->hn_ifp, "exec nvs rxbuf conn failed\n"); error = EIO; goto cleanup; } status = resp->nvs_status; vmbus_xact_put(xact); xact = NULL; if (status != HN_NVS_STATUS_OK) { if_printf(sc->hn_ifp, "nvs rxbuf conn failed: %x\n", status); error = EIO; goto cleanup; } sc->hn_flags |= HN_FLAG_RXBUF_CONNECTED; return (0); cleanup: if (xact != NULL) vmbus_xact_put(xact); hn_nvs_disconn_rxbuf(sc); return (error); } static int hn_nvs_conn_chim(struct hn_softc *sc) { struct vmbus_xact *xact = NULL; struct hn_nvs_chim_conn *chim; const struct hn_nvs_chim_connresp *resp; size_t resp_len; uint32_t status, sectsz; int error; /* * Connect chimney sending buffer GPADL to the primary channel. * * NOTE: * Only primary channel has chimney sending buffer connected to it. * Sub-channels just share this chimney sending buffer. */ error = vmbus_chan_gpadl_connect(sc->hn_prichan, - sc->hn_chim_dma.hv_paddr, HN_CHIM_SIZE, &sc->hn_chim_gpadl); + pmap_kextract((vm_offset_t)sc->hn_chim), HN_CHIM_SIZE, + &sc->hn_chim_gpadl); if (error) { if_printf(sc->hn_ifp, "chim gpadl conn failed: %d\n", error); goto cleanup; } /* * Connect chimney sending buffer to NVS */ xact = vmbus_xact_get(sc->hn_xact, sizeof(*chim)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for nvs chim conn\n"); error = ENXIO; goto cleanup; } chim = vmbus_xact_req_data(xact); chim->nvs_type = HN_NVS_TYPE_CHIM_CONN; chim->nvs_gpadl = sc->hn_chim_gpadl; chim->nvs_sig = HN_NVS_CHIM_SIG; resp_len = sizeof(*resp); resp = hn_nvs_xact_execute(sc, xact, chim, sizeof(*chim), &resp_len, HN_NVS_TYPE_CHIM_CONNRESP); if (resp == NULL) { if_printf(sc->hn_ifp, "exec nvs chim conn failed\n"); error = EIO; goto cleanup; } status = resp->nvs_status; sectsz = resp->nvs_sectsz; vmbus_xact_put(xact); xact = NULL; if (status != HN_NVS_STATUS_OK) { if_printf(sc->hn_ifp, "nvs chim conn failed: %x\n", status); error = EIO; goto cleanup; } if (sectsz == 0 || sectsz % sizeof(uint32_t) != 0) { /* * Can't use chimney sending buffer; done! */ if (sectsz == 0) { if_printf(sc->hn_ifp, "zero chimney sending buffer " "section size\n"); } else { if_printf(sc->hn_ifp, "misaligned chimney sending " "buffers, section size: %u\n", sectsz); } sc->hn_chim_szmax = 0; sc->hn_chim_cnt = 0; sc->hn_flags |= HN_FLAG_CHIM_CONNECTED; return (0); } sc->hn_chim_szmax = sectsz; sc->hn_chim_cnt = HN_CHIM_SIZE / sc->hn_chim_szmax; if (HN_CHIM_SIZE % sc->hn_chim_szmax != 0) { if_printf(sc->hn_ifp, "chimney sending sections are " "not properly aligned\n"); } if (sc->hn_chim_cnt % LONG_BIT != 0) { if_printf(sc->hn_ifp, "discard %d chimney sending sections\n", sc->hn_chim_cnt % LONG_BIT); } sc->hn_chim_bmap_cnt = sc->hn_chim_cnt / LONG_BIT; sc->hn_chim_bmap = malloc(sc->hn_chim_bmap_cnt * sizeof(u_long), M_DEVBUF, M_WAITOK | M_ZERO); /* Done! */ sc->hn_flags |= HN_FLAG_CHIM_CONNECTED; if (bootverbose) { if_printf(sc->hn_ifp, "chimney sending buffer %d/%d\n", sc->hn_chim_szmax, sc->hn_chim_cnt); } return (0); cleanup: if (xact != NULL) vmbus_xact_put(xact); hn_nvs_disconn_chim(sc); return (error); } static void hn_nvs_disconn_rxbuf(struct hn_softc *sc) { int error; if (sc->hn_flags & HN_FLAG_RXBUF_CONNECTED) { struct hn_nvs_rxbuf_disconn disconn; /* * Disconnect RXBUF from NVS. */ memset(&disconn, 0, sizeof(disconn)); disconn.nvs_type = HN_NVS_TYPE_RXBUF_DISCONN; disconn.nvs_sig = HN_NVS_RXBUF_SIG; /* NOTE: No response. */ error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); if (error) { if_printf(sc->hn_ifp, "send nvs rxbuf disconn failed: %d\n", error); /* * Fine for a revoked channel, since the hypervisor * does not drain TX bufring for a revoked channel. */ if (!vmbus_chan_is_revoked(sc->hn_prichan)) sc->hn_flags |= HN_FLAG_RXBUF_REF; } sc->hn_flags &= ~HN_FLAG_RXBUF_CONNECTED; /* * Wait for the hypervisor to receive this NVS request. * * NOTE: * The TX bufring will not be drained by the hypervisor, * if the primary channel is revoked. */ while (!vmbus_chan_tx_empty(sc->hn_prichan) && !vmbus_chan_is_revoked(sc->hn_prichan)) pause("waittx", 1); /* * Linger long enough for NVS to disconnect RXBUF. */ pause("lingtx", (200 * hz) / 1000); } if (vmbus_current_version < VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { /* * Disconnect RXBUF from primary channel. */ error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, sc->hn_rxbuf_gpadl); if (error) { if_printf(sc->hn_ifp, "rxbuf gpadl disconn failed: %d\n", error); sc->hn_flags |= HN_FLAG_RXBUF_REF; } sc->hn_rxbuf_gpadl = 0; } } static void hn_nvs_disconn_chim(struct hn_softc *sc) { int error; if (sc->hn_flags & HN_FLAG_CHIM_CONNECTED) { struct hn_nvs_chim_disconn disconn; /* * Disconnect chimney sending buffer from NVS. */ memset(&disconn, 0, sizeof(disconn)); disconn.nvs_type = HN_NVS_TYPE_CHIM_DISCONN; disconn.nvs_sig = HN_NVS_CHIM_SIG; /* NOTE: No response. */ error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); if (error) { if_printf(sc->hn_ifp, "send nvs chim disconn failed: %d\n", error); /* * Fine for a revoked channel, since the hypervisor * does not drain TX bufring for a revoked channel. */ if (!vmbus_chan_is_revoked(sc->hn_prichan)) sc->hn_flags |= HN_FLAG_CHIM_REF; } sc->hn_flags &= ~HN_FLAG_CHIM_CONNECTED; /* * Wait for the hypervisor to receive this NVS request. * * NOTE: * The TX bufring will not be drained by the hypervisor, * if the primary channel is revoked. */ while (!vmbus_chan_tx_empty(sc->hn_prichan) && !vmbus_chan_is_revoked(sc->hn_prichan)) pause("waittx", 1); /* * Linger long enough for NVS to disconnect chimney * sending buffer. */ pause("lingtx", (200 * hz) / 1000); } if (vmbus_current_version < VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { /* * Disconnect chimney sending buffer from primary channel. */ error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, sc->hn_chim_gpadl); if (error) { if_printf(sc->hn_ifp, "chim gpadl disconn failed: %d\n", error); sc->hn_flags |= HN_FLAG_CHIM_REF; } sc->hn_chim_gpadl = 0; } if (sc->hn_chim_bmap != NULL) { free(sc->hn_chim_bmap, M_DEVBUF); sc->hn_chim_bmap = NULL; sc->hn_chim_bmap_cnt = 0; } } static int hn_nvs_doinit(struct hn_softc *sc, uint32_t nvs_ver) { struct vmbus_xact *xact; struct hn_nvs_init *init; const struct hn_nvs_init_resp *resp; size_t resp_len; uint32_t status; xact = vmbus_xact_get(sc->hn_xact, sizeof(*init)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for nvs init\n"); return (ENXIO); } init = vmbus_xact_req_data(xact); init->nvs_type = HN_NVS_TYPE_INIT; init->nvs_ver_min = nvs_ver; init->nvs_ver_max = nvs_ver; resp_len = sizeof(*resp); resp = hn_nvs_xact_execute(sc, xact, init, sizeof(*init), &resp_len, HN_NVS_TYPE_INIT_RESP); if (resp == NULL) { if_printf(sc->hn_ifp, "exec init failed\n"); vmbus_xact_put(xact); return (EIO); } status = resp->nvs_status; vmbus_xact_put(xact); if (status != HN_NVS_STATUS_OK) { if (bootverbose) { /* * Caller may try another NVS version, and will log * error if there are no more NVS versions to try, * so don't bark out loud here. */ if_printf(sc->hn_ifp, "nvs init failed for ver 0x%x\n", nvs_ver); } return (EINVAL); } return (0); } /* * Configure MTU and enable VLAN. */ static int hn_nvs_conf_ndis(struct hn_softc *sc, int mtu) { struct hn_nvs_ndis_conf conf; int error; memset(&conf, 0, sizeof(conf)); conf.nvs_type = HN_NVS_TYPE_NDIS_CONF; conf.nvs_mtu = mtu + ETHER_HDR_LEN; conf.nvs_caps = HN_NVS_NDIS_CONF_VLAN; if (sc->hn_nvs_ver >= HN_NVS_VERSION_5) conf.nvs_caps |= HN_NVS_NDIS_CONF_SRIOV; if (sc->hn_nvs_ver >= HN_NVS_VERSION_61) conf.nvs_caps |= HN_NVS_NDIS_CONF_RSC; /* NOTE: No response. */ error = hn_nvs_req_send(sc, &conf, sizeof(conf)); if (error) { if_printf(sc->hn_ifp, "send nvs ndis conf failed: %d\n", error); return (error); } if (bootverbose) if_printf(sc->hn_ifp, "nvs ndis conf done\n"); sc->hn_caps |= HN_CAP_MTU | HN_CAP_VLAN; return (0); } static int hn_nvs_init_ndis(struct hn_softc *sc) { struct hn_nvs_ndis_init ndis; int error; memset(&ndis, 0, sizeof(ndis)); ndis.nvs_type = HN_NVS_TYPE_NDIS_INIT; ndis.nvs_ndis_major = HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver); ndis.nvs_ndis_minor = HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver); /* NOTE: No response. */ error = hn_nvs_req_send(sc, &ndis, sizeof(ndis)); if (error) if_printf(sc->hn_ifp, "send nvs ndis init failed: %d\n", error); return (error); } static int hn_nvs_init(struct hn_softc *sc) { int i, error; if (device_is_attached(sc->hn_dev)) { /* * NVS version and NDIS version MUST NOT be changed. */ if (bootverbose) { if_printf(sc->hn_ifp, "reinit NVS version 0x%x, " "NDIS version %u.%u\n", sc->hn_nvs_ver, HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); } error = hn_nvs_doinit(sc, sc->hn_nvs_ver); if (error) { if_printf(sc->hn_ifp, "reinit NVS version 0x%x " "failed: %d\n", sc->hn_nvs_ver, error); return (error); } goto done; } /* * Find the supported NVS version and set NDIS version accordingly. */ for (i = 0; i < nitems(hn_nvs_version); ++i) { error = hn_nvs_doinit(sc, hn_nvs_version[i]); if (!error) { sc->hn_nvs_ver = hn_nvs_version[i]; /* Set NDIS version according to NVS version. */ sc->hn_ndis_ver = HN_NDIS_VERSION_6_30; if (sc->hn_nvs_ver <= HN_NVS_VERSION_4) sc->hn_ndis_ver = HN_NDIS_VERSION_6_1; if (bootverbose) { if_printf(sc->hn_ifp, "NVS version 0x%x, " "NDIS version %u.%u\n", sc->hn_nvs_ver, HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); } goto done; } } if_printf(sc->hn_ifp, "no NVS available\n"); return (ENXIO); done: if (sc->hn_nvs_ver >= HN_NVS_VERSION_5) sc->hn_caps |= HN_CAP_HASHVAL; return (0); } int hn_nvs_attach(struct hn_softc *sc, int mtu) { int error; if (hyperv_ver_major >= 10) { /* UDP 4-tuple hash is enforced. */ sc->hn_caps |= HN_CAP_UDPHASH; } /* * Initialize NVS. */ error = hn_nvs_init(sc); if (error) return (error); if (sc->hn_nvs_ver >= HN_NVS_VERSION_2) { /* * Configure NDIS before initializing it. */ error = hn_nvs_conf_ndis(sc, mtu); if (error) return (error); } /* * Initialize NDIS. */ error = hn_nvs_init_ndis(sc); if (error) return (error); /* * Connect RXBUF. */ error = hn_nvs_conn_rxbuf(sc); if (error) return (error); /* * Connect chimney sending buffer. */ error = hn_nvs_conn_chim(sc); if (error) { hn_nvs_disconn_rxbuf(sc); return (error); } return (0); } void hn_nvs_detach(struct hn_softc *sc) { /* NOTE: there are no requests to stop the NVS. */ hn_nvs_disconn_rxbuf(sc); hn_nvs_disconn_chim(sc); } void hn_nvs_sent_xact(struct hn_nvs_sendctx *sndc, struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, const void *data, int dlen) { vmbus_xact_wakeup(sndc->hn_cbarg, data, dlen); } static void hn_nvs_sent_none(struct hn_nvs_sendctx *sndc __unused, struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, const void *data __unused, int dlen __unused) { /* EMPTY */ } int hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch0) { struct vmbus_xact *xact; struct hn_nvs_subch_req *req; const struct hn_nvs_subch_resp *resp; int error, nsubch_req; uint32_t nsubch; size_t resp_len; nsubch_req = *nsubch0; KASSERT(nsubch_req > 0, ("invalid # of sub-channels %d", nsubch_req)); xact = vmbus_xact_get(sc->hn_xact, sizeof(*req)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for nvs subch alloc\n"); return (ENXIO); } req = vmbus_xact_req_data(xact); req->nvs_type = HN_NVS_TYPE_SUBCH_REQ; req->nvs_op = HN_NVS_SUBCH_OP_ALLOC; req->nvs_nsubch = nsubch_req; resp_len = sizeof(*resp); resp = hn_nvs_xact_execute(sc, xact, req, sizeof(*req), &resp_len, HN_NVS_TYPE_SUBCH_RESP); if (resp == NULL) { if_printf(sc->hn_ifp, "exec nvs subch alloc failed\n"); error = EIO; goto done; } if (resp->nvs_status != HN_NVS_STATUS_OK) { if_printf(sc->hn_ifp, "nvs subch alloc failed: %x\n", resp->nvs_status); error = EIO; goto done; } nsubch = resp->nvs_nsubch; if (nsubch > nsubch_req) { if_printf(sc->hn_ifp, "%u subchans are allocated, " "requested %d\n", nsubch, nsubch_req); nsubch = nsubch_req; } *nsubch0 = nsubch; error = 0; done: vmbus_xact_put(xact); return (error); } int hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan, struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) { return hn_nvs_send_rndis_sglist(chan, HN_NVS_RNDIS_MTYPE_CTRL, sndc, gpa, gpa_cnt); } void hn_nvs_set_datapath(struct hn_softc *sc, uint32_t path) { struct hn_nvs_datapath dp; memset(&dp, 0, sizeof(dp)); dp.nvs_type = HN_NVS_TYPE_SET_DATAPATH; dp.nvs_active_path = path; hn_nvs_req_send(sc, &dp, sizeof(dp)); } diff --git a/sys/dev/hyperv/netvsc/hn_rndis.c b/sys/dev/hyperv/netvsc/hn_rndis.c index 58167312c9f3..cafe73ff519c 100644 --- a/sys/dev/hyperv/netvsc/hn_rndis.c +++ b/sys/dev/hyperv/netvsc/hn_rndis.c @@ -1,1068 +1,1067 @@ /*- * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include #include #include #include #include #include #include #include #define HN_RNDIS_RID_COMPAT_MASK 0xffff #define HN_RNDIS_RID_COMPAT_MAX HN_RNDIS_RID_COMPAT_MASK #define HN_RNDIS_XFER_SIZE 2048 #define HN_NDIS_TXCSUM_CAP_IP4 \ (NDIS_TXCSUM_CAP_IP4 | NDIS_TXCSUM_CAP_IP4OPT) #define HN_NDIS_TXCSUM_CAP_TCP4 \ (NDIS_TXCSUM_CAP_TCP4 | NDIS_TXCSUM_CAP_TCP4OPT) #define HN_NDIS_TXCSUM_CAP_TCP6 \ (NDIS_TXCSUM_CAP_TCP6 | NDIS_TXCSUM_CAP_TCP6OPT | \ NDIS_TXCSUM_CAP_IP6EXT) #define HN_NDIS_TXCSUM_CAP_UDP6 \ (NDIS_TXCSUM_CAP_UDP6 | NDIS_TXCSUM_CAP_IP6EXT) #define HN_NDIS_LSOV2_CAP_IP6 \ (NDIS_LSOV2_CAP_IP6EXT | NDIS_LSOV2_CAP_TCP6OPT) static const void *hn_rndis_xact_exec1(struct hn_softc *, struct vmbus_xact *, size_t, struct hn_nvs_sendctx *, size_t *); static const void *hn_rndis_xact_execute(struct hn_softc *, struct vmbus_xact *, uint32_t, size_t, size_t *, uint32_t); static int hn_rndis_query(struct hn_softc *, uint32_t, const void *, size_t, void *, size_t *); static int hn_rndis_query2(struct hn_softc *, uint32_t, const void *, size_t, void *, size_t *, size_t); static int hn_rndis_set(struct hn_softc *, uint32_t, const void *, size_t); static int hn_rndis_init(struct hn_softc *); static int hn_rndis_halt(struct hn_softc *); static int hn_rndis_conf_offload(struct hn_softc *, int); static int hn_rndis_query_hwcaps(struct hn_softc *, struct ndis_offload *); static __inline uint32_t hn_rndis_rid(struct hn_softc *sc) { uint32_t rid; again: rid = atomic_fetchadd_int(&sc->hn_rndis_rid, 1); if (rid == 0) goto again; /* Use upper 16 bits for non-compat RNDIS messages. */ return ((rid & 0xffff) << 16); } void hn_rndis_rx_ctrl(struct hn_softc *sc, const void *data, int dlen) { const struct rndis_comp_hdr *comp; const struct rndis_msghdr *hdr; KASSERT(dlen >= sizeof(*hdr), ("invalid RNDIS msg\n")); hdr = data; switch (hdr->rm_type) { case REMOTE_NDIS_INITIALIZE_CMPLT: case REMOTE_NDIS_QUERY_CMPLT: case REMOTE_NDIS_SET_CMPLT: case REMOTE_NDIS_KEEPALIVE_CMPLT: /* unused */ if (dlen < sizeof(*comp)) { if_printf(sc->hn_ifp, "invalid RNDIS cmplt\n"); return; } comp = data; KASSERT(comp->rm_rid > HN_RNDIS_RID_COMPAT_MAX, ("invalid RNDIS rid 0x%08x\n", comp->rm_rid)); vmbus_xact_ctx_wakeup(sc->hn_xact, comp, dlen); break; case REMOTE_NDIS_RESET_CMPLT: /* * Reset completed, no rid. * * NOTE: * RESET is not issued by hn(4), so this message should * _not_ be observed. */ if_printf(sc->hn_ifp, "RESET cmplt received\n"); break; default: if_printf(sc->hn_ifp, "unknown RNDIS msg 0x%x\n", hdr->rm_type); break; } } int hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr) { size_t eaddr_len; int error; eaddr_len = ETHER_ADDR_LEN; error = hn_rndis_query(sc, OID_802_3_PERMANENT_ADDRESS, NULL, 0, eaddr, &eaddr_len); if (error) return (error); if (eaddr_len != ETHER_ADDR_LEN) { if_printf(sc->hn_ifp, "invalid eaddr len %zu\n", eaddr_len); return (EINVAL); } return (0); } int hn_rndis_get_linkstatus(struct hn_softc *sc, uint32_t *link_status) { size_t size; int error; size = sizeof(*link_status); error = hn_rndis_query(sc, OID_GEN_MEDIA_CONNECT_STATUS, NULL, 0, link_status, &size); if (error) return (error); if (size != sizeof(uint32_t)) { if_printf(sc->hn_ifp, "invalid link status len %zu\n", size); return (EINVAL); } return (0); } int hn_rndis_get_mtu(struct hn_softc *sc, uint32_t *mtu) { size_t size; int error; size = sizeof(*mtu); error = hn_rndis_query(sc, OID_GEN_MAXIMUM_FRAME_SIZE, NULL, 0, mtu, &size); if (error) return (error); if (size != sizeof(uint32_t)) { if_printf(sc->hn_ifp, "invalid mtu len %zu\n", size); return (EINVAL); } return (0); } static const void * hn_rndis_xact_exec1(struct hn_softc *sc, struct vmbus_xact *xact, size_t reqlen, struct hn_nvs_sendctx *sndc, size_t *comp_len) { struct vmbus_gpa gpa[HN_XACT_REQ_PGCNT]; int gpa_cnt, error; bus_addr_t paddr; KASSERT(reqlen <= HN_XACT_REQ_SIZE && reqlen > 0, ("invalid request length %zu", reqlen)); /* * Setup the SG list. */ paddr = vmbus_xact_req_paddr(xact); KASSERT((paddr & PAGE_MASK) == 0, ("vmbus xact request is not page aligned 0x%jx", (uintmax_t)paddr)); for (gpa_cnt = 0; gpa_cnt < HN_XACT_REQ_PGCNT; ++gpa_cnt) { int len = PAGE_SIZE; if (reqlen == 0) break; if (reqlen < len) len = reqlen; gpa[gpa_cnt].gpa_page = atop(paddr) + gpa_cnt; gpa[gpa_cnt].gpa_len = len; gpa[gpa_cnt].gpa_ofs = 0; reqlen -= len; } KASSERT(reqlen == 0, ("still have %zu request data left", reqlen)); /* * Send this RNDIS control message and wait for its completion * message. */ vmbus_xact_activate(xact); error = hn_nvs_send_rndis_ctrl(sc->hn_prichan, sndc, gpa, gpa_cnt); if (error) { vmbus_xact_deactivate(xact); if_printf(sc->hn_ifp, "RNDIS ctrl send failed: %d\n", error); return (NULL); } return (vmbus_chan_xact_wait(sc->hn_prichan, xact, comp_len, HN_CAN_SLEEP(sc))); } static const void * hn_rndis_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, uint32_t rid, size_t reqlen, size_t *comp_len0, uint32_t comp_type) { const struct rndis_comp_hdr *comp; size_t comp_len, min_complen = *comp_len0; KASSERT(rid > HN_RNDIS_RID_COMPAT_MAX, ("invalid rid %u\n", rid)); KASSERT(min_complen >= sizeof(*comp), ("invalid minimum complete len %zu", min_complen)); /* * Execute the xact setup by the caller. */ comp = hn_rndis_xact_exec1(sc, xact, reqlen, &hn_nvs_sendctx_none, &comp_len); if (comp == NULL) return (NULL); /* * Check this RNDIS complete message. */ if (comp_len < min_complen) { if (comp_len >= sizeof(*comp)) { /* rm_status field is valid */ if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu, " "status 0x%08x\n", comp_len, comp->rm_status); } else { if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu\n", comp_len); } return (NULL); } if (comp->rm_len < min_complen) { if_printf(sc->hn_ifp, "invalid RNDIS comp msglen %u\n", comp->rm_len); return (NULL); } if (comp->rm_type != comp_type) { if_printf(sc->hn_ifp, "unexpected RNDIS comp 0x%08x, " "expect 0x%08x\n", comp->rm_type, comp_type); return (NULL); } if (comp->rm_rid != rid) { if_printf(sc->hn_ifp, "RNDIS comp rid mismatch %u, " "expect %u\n", comp->rm_rid, rid); return (NULL); } /* All pass! */ *comp_len0 = comp_len; return (comp); } static int hn_rndis_query(struct hn_softc *sc, uint32_t oid, const void *idata, size_t idlen, void *odata, size_t *odlen0) { return (hn_rndis_query2(sc, oid, idata, idlen, odata, odlen0, *odlen0)); } static int hn_rndis_query2(struct hn_softc *sc, uint32_t oid, const void *idata, size_t idlen, void *odata, size_t *odlen0, size_t min_odlen) { struct rndis_query_req *req; const struct rndis_query_comp *comp; struct vmbus_xact *xact; size_t reqlen, odlen = *odlen0, comp_len; int error, ofs; uint32_t rid; reqlen = sizeof(*req) + idlen; xact = vmbus_xact_get(sc->hn_xact, reqlen); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for RNDIS query 0x%08x\n", oid); return (ENXIO); } rid = hn_rndis_rid(sc); req = vmbus_xact_req_data(xact); req->rm_type = REMOTE_NDIS_QUERY_MSG; req->rm_len = reqlen; req->rm_rid = rid; req->rm_oid = oid; /* * XXX * This is _not_ RNDIS Spec conforming: * "This MUST be set to 0 when there is no input data * associated with the OID." * * If this field was set to 0 according to the RNDIS Spec, * Hyper-V would set non-SUCCESS status in the query * completion. */ req->rm_infobufoffset = RNDIS_QUERY_REQ_INFOBUFOFFSET; if (idlen > 0) { req->rm_infobuflen = idlen; /* Input data immediately follows RNDIS query. */ memcpy(req + 1, idata, idlen); } comp_len = sizeof(*comp) + min_odlen; comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len, REMOTE_NDIS_QUERY_CMPLT); if (comp == NULL) { if_printf(sc->hn_ifp, "exec RNDIS query 0x%08x failed\n", oid); error = EIO; goto done; } if (comp->rm_status != RNDIS_STATUS_SUCCESS) { if_printf(sc->hn_ifp, "RNDIS query 0x%08x failed: " "status 0x%08x\n", oid, comp->rm_status); error = EIO; goto done; } if (comp->rm_infobuflen == 0 || comp->rm_infobufoffset == 0) { /* No output data! */ if_printf(sc->hn_ifp, "RNDIS query 0x%08x, no data\n", oid); *odlen0 = 0; error = 0; goto done; } /* * Check output data length and offset. */ /* ofs is the offset from the beginning of comp. */ ofs = RNDIS_QUERY_COMP_INFOBUFOFFSET_ABS(comp->rm_infobufoffset); if (ofs < sizeof(*comp) || ofs + comp->rm_infobuflen > comp_len) { if_printf(sc->hn_ifp, "RNDIS query invalid comp ib off/len, " "%u/%u\n", comp->rm_infobufoffset, comp->rm_infobuflen); error = EINVAL; goto done; } /* * Save output data. */ if (comp->rm_infobuflen < odlen) odlen = comp->rm_infobuflen; memcpy(odata, ((const uint8_t *)comp) + ofs, odlen); *odlen0 = odlen; error = 0; done: vmbus_xact_put(xact); return (error); } int hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt0) { struct ndis_rss_caps in, caps; size_t caps_len; int error, indsz, rxr_cnt, hash_fnidx; uint32_t hash_func = 0, hash_types = 0; *rxr_cnt0 = 0; if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_20) return (EOPNOTSUPP); memset(&in, 0, sizeof(in)); in.ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_CAPS; in.ndis_hdr.ndis_rev = NDIS_RSS_CAPS_REV_2; in.ndis_hdr.ndis_size = NDIS_RSS_CAPS_SIZE; caps_len = NDIS_RSS_CAPS_SIZE; error = hn_rndis_query2(sc, OID_GEN_RECEIVE_SCALE_CAPABILITIES, &in, NDIS_RSS_CAPS_SIZE, &caps, &caps_len, NDIS_RSS_CAPS_SIZE_6_0); if (error) return (error); /* * Preliminary verification. */ if (caps.ndis_hdr.ndis_type != NDIS_OBJTYPE_RSS_CAPS) { if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n", caps.ndis_hdr.ndis_type); return (EINVAL); } if (caps.ndis_hdr.ndis_rev < NDIS_RSS_CAPS_REV_1) { if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n", caps.ndis_hdr.ndis_rev); return (EINVAL); } if (caps.ndis_hdr.ndis_size > caps_len) { if_printf(sc->hn_ifp, "invalid NDIS objsize %u, " "data size %zu\n", caps.ndis_hdr.ndis_size, caps_len); return (EINVAL); } else if (caps.ndis_hdr.ndis_size < NDIS_RSS_CAPS_SIZE_6_0) { if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n", caps.ndis_hdr.ndis_size); return (EINVAL); } /* * Save information for later RSS configuration. */ if (caps.ndis_nrxr == 0) { if_printf(sc->hn_ifp, "0 RX rings!?\n"); return (EINVAL); } if (bootverbose) if_printf(sc->hn_ifp, "%u RX rings\n", caps.ndis_nrxr); rxr_cnt = caps.ndis_nrxr; if (caps.ndis_hdr.ndis_size == NDIS_RSS_CAPS_SIZE && caps.ndis_hdr.ndis_rev >= NDIS_RSS_CAPS_REV_2) { if (caps.ndis_nind > NDIS_HASH_INDCNT) { if_printf(sc->hn_ifp, "too many RSS indirect table entries %u\n", caps.ndis_nind); return (EOPNOTSUPP); } if (!powerof2(caps.ndis_nind)) { if_printf(sc->hn_ifp, "RSS indirect table size is not " "power-of-2 %u\n", caps.ndis_nind); } if (bootverbose) { if_printf(sc->hn_ifp, "RSS indirect table size %u\n", caps.ndis_nind); } indsz = caps.ndis_nind; } else { indsz = NDIS_HASH_INDCNT; } if (indsz < rxr_cnt) { if_printf(sc->hn_ifp, "# of RX rings (%d) > " "RSS indirect table size %d\n", rxr_cnt, indsz); rxr_cnt = indsz; } /* * NOTE: * Toeplitz is at the lowest bit, and it is preferred; so ffs(), * instead of fls(), is used here. */ hash_fnidx = ffs(caps.ndis_caps & NDIS_RSS_CAP_HASHFUNC_MASK); if (hash_fnidx == 0) { if_printf(sc->hn_ifp, "no hash functions, caps 0x%08x\n", caps.ndis_caps); return (EOPNOTSUPP); } hash_func = 1 << (hash_fnidx - 1); /* ffs is 1-based */ if (caps.ndis_caps & NDIS_RSS_CAP_IPV4) hash_types |= NDIS_HASH_IPV4 | NDIS_HASH_TCP_IPV4; if (caps.ndis_caps & NDIS_RSS_CAP_IPV6) hash_types |= NDIS_HASH_IPV6 | NDIS_HASH_TCP_IPV6; if (caps.ndis_caps & NDIS_RSS_CAP_IPV6_EX) hash_types |= NDIS_HASH_IPV6_EX | NDIS_HASH_TCP_IPV6_EX; if (hash_types == 0) { if_printf(sc->hn_ifp, "no hash types, caps 0x%08x\n", caps.ndis_caps); return (EOPNOTSUPP); } if (bootverbose) if_printf(sc->hn_ifp, "RSS caps %#x\n", caps.ndis_caps); /* Commit! */ sc->hn_rss_ind_size = indsz; sc->hn_rss_hcap = hash_func | hash_types; if (sc->hn_caps & HN_CAP_UDPHASH) { /* UDP 4-tuple hash is unconditionally enabled. */ sc->hn_rss_hcap |= NDIS_HASH_UDP_IPV4_X; } *rxr_cnt0 = rxr_cnt; return (0); } static int hn_rndis_set(struct hn_softc *sc, uint32_t oid, const void *data, size_t dlen) { struct rndis_set_req *req; const struct rndis_set_comp *comp; struct vmbus_xact *xact; size_t reqlen, comp_len; uint32_t rid; int error; KASSERT(dlen > 0, ("invalid dlen %zu", dlen)); reqlen = sizeof(*req) + dlen; xact = vmbus_xact_get(sc->hn_xact, reqlen); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for RNDIS set 0x%08x\n", oid); return (ENXIO); } rid = hn_rndis_rid(sc); req = vmbus_xact_req_data(xact); req->rm_type = REMOTE_NDIS_SET_MSG; req->rm_len = reqlen; req->rm_rid = rid; req->rm_oid = oid; req->rm_infobuflen = dlen; req->rm_infobufoffset = RNDIS_SET_REQ_INFOBUFOFFSET; /* Data immediately follows RNDIS set. */ memcpy(req + 1, data, dlen); comp_len = sizeof(*comp); comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len, REMOTE_NDIS_SET_CMPLT); if (comp == NULL) { if_printf(sc->hn_ifp, "exec RNDIS set 0x%08x failed\n", oid); error = EIO; goto done; } if (comp->rm_status != RNDIS_STATUS_SUCCESS) { if_printf(sc->hn_ifp, "RNDIS set 0x%08x failed: " "status 0x%08x\n", oid, comp->rm_status); error = EIO; goto done; } error = 0; done: vmbus_xact_put(xact); return (error); } int hn_rndis_reconf_offload(struct hn_softc *sc, int mtu) { return(hn_rndis_conf_offload(sc, mtu)); } static int hn_rndis_conf_offload(struct hn_softc *sc, int mtu) { struct ndis_offload hwcaps; struct ndis_offload_params params; uint32_t caps = 0; size_t paramsz; int error, tso_maxsz, tso_minsg; error = hn_rndis_query_hwcaps(sc, &hwcaps); if (error) { if_printf(sc->hn_ifp, "hwcaps query failed: %d\n", error); return (error); } /* NOTE: 0 means "no change" */ memset(¶ms, 0, sizeof(params)); params.ndis_hdr.ndis_type = NDIS_OBJTYPE_DEFAULT; if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30) { params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_2; paramsz = NDIS_OFFLOAD_PARAMS_SIZE_6_1; } else { params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_3; paramsz = NDIS_OFFLOAD_PARAMS_SIZE; } params.ndis_hdr.ndis_size = paramsz; /* * TSO4/TSO6 setup. */ tso_maxsz = IP_MAXPACKET; tso_minsg = 2; if (hwcaps.ndis_lsov2.ndis_ip4_encap & NDIS_OFFLOAD_ENCAP_8023) { caps |= HN_CAP_TSO4; params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_ON; if (hwcaps.ndis_lsov2.ndis_ip4_maxsz < tso_maxsz) tso_maxsz = hwcaps.ndis_lsov2.ndis_ip4_maxsz; if (hwcaps.ndis_lsov2.ndis_ip4_minsg > tso_minsg) tso_minsg = hwcaps.ndis_lsov2.ndis_ip4_minsg; } if ((hwcaps.ndis_lsov2.ndis_ip6_encap & NDIS_OFFLOAD_ENCAP_8023) && (hwcaps.ndis_lsov2.ndis_ip6_opts & HN_NDIS_LSOV2_CAP_IP6) == HN_NDIS_LSOV2_CAP_IP6) { caps |= HN_CAP_TSO6; params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_ON; if (hwcaps.ndis_lsov2.ndis_ip6_maxsz < tso_maxsz) tso_maxsz = hwcaps.ndis_lsov2.ndis_ip6_maxsz; if (hwcaps.ndis_lsov2.ndis_ip6_minsg > tso_minsg) tso_minsg = hwcaps.ndis_lsov2.ndis_ip6_minsg; } sc->hn_ndis_tso_szmax = 0; sc->hn_ndis_tso_sgmin = 0; if (caps & (HN_CAP_TSO4 | HN_CAP_TSO6)) { KASSERT(tso_maxsz <= IP_MAXPACKET, ("invalid NDIS TSO maxsz %d", tso_maxsz)); KASSERT(tso_minsg >= 2, ("invalid NDIS TSO minsg %d", tso_minsg)); if (tso_maxsz < tso_minsg * mtu) { if_printf(sc->hn_ifp, "invalid NDIS TSO config: " "maxsz %d, minsg %d, mtu %d; " "disable TSO4 and TSO6\n", tso_maxsz, tso_minsg, mtu); caps &= ~(HN_CAP_TSO4 | HN_CAP_TSO6); params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_OFF; params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_OFF; } else { sc->hn_ndis_tso_szmax = tso_maxsz; sc->hn_ndis_tso_sgmin = tso_minsg; if (bootverbose) { if_printf(sc->hn_ifp, "NDIS TSO " "szmax %d sgmin %d\n", sc->hn_ndis_tso_szmax, sc->hn_ndis_tso_sgmin); } } } /* IPv4 checksum */ if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_IP4) == HN_NDIS_TXCSUM_CAP_IP4) { caps |= HN_CAP_IPCS; params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_IP4) { if (params.ndis_ip4csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_RX; } /* TCP4 checksum */ if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_TCP4) == HN_NDIS_TXCSUM_CAP_TCP4) { caps |= HN_CAP_TCP4CS; params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_TCP4) { if (params.ndis_tcp4csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_RX; } /* UDP4 checksum */ if (hwcaps.ndis_csum.ndis_ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) { caps |= HN_CAP_UDP4CS; params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_UDP4) { if (params.ndis_udp4csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_RX; } /* TCP6 checksum */ if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_TCP6) == HN_NDIS_TXCSUM_CAP_TCP6) { caps |= HN_CAP_TCP6CS; params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_TCP6) { if (params.ndis_tcp6csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_RX; } /* UDP6 checksum */ if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_UDP6) == HN_NDIS_TXCSUM_CAP_UDP6) { caps |= HN_CAP_UDP6CS; params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_UDP6) { if (params.ndis_udp6csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_RX; } /* RSC offload */ if (hwcaps.ndis_hdr.ndis_rev >= NDIS_OFFLOAD_PARAMS_REV_3) { if (hwcaps.ndis_rsc.ndis_ip4 && hwcaps.ndis_rsc.ndis_ip6 && sc->hn_rsc_ctrl) { params.ndis_rsc_ip4 = NDIS_OFFLOAD_RSC_ON; params.ndis_rsc_ip6 = NDIS_OFFLOAD_RSC_ON; } else { params.ndis_rsc_ip4 = NDIS_OFFLOAD_RSC_OFF; params.ndis_rsc_ip6 = NDIS_OFFLOAD_RSC_OFF; } } if (bootverbose) { if_printf(sc->hn_ifp, "offload csum: " "ip4 %u, tcp4 %u, udp4 %u, tcp6 %u, udp6 %u\n", params.ndis_ip4csum, params.ndis_tcp4csum, params.ndis_udp4csum, params.ndis_tcp6csum, params.ndis_udp6csum); if_printf(sc->hn_ifp, "offload lsov2: ip4 %u, ip6 %u\n", params.ndis_lsov2_ip4, params.ndis_lsov2_ip6); if (hwcaps.ndis_hdr.ndis_rev >= NDIS_OFFLOAD_PARAMS_REV_3) if_printf(sc->hn_ifp, "offload rsc: ip4 %u, ip6 %u\n", params.ndis_rsc_ip4, params.ndis_rsc_ip6); } error = hn_rndis_set(sc, OID_TCP_OFFLOAD_PARAMETERS, ¶ms, paramsz); if (error) { if_printf(sc->hn_ifp, "offload config failed: %d\n", error); return (error); } if (bootverbose) if_printf(sc->hn_ifp, "offload config done\n"); sc->hn_caps |= caps; return (0); } int hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags) { struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; struct ndis_rss_params *prm = &rss->rss_params; int error, rss_size; /* * Only NDIS 6.20+ is supported: * We only support 4bytes element in indirect table, which has been * adopted since NDIS 6.20. */ KASSERT(sc->hn_ndis_ver >= HN_NDIS_VERSION_6_20, ("NDIS 6.20+ is required, NDIS version 0x%08x", sc->hn_ndis_ver)); /* XXX only one can be specified through, popcnt? */ KASSERT((sc->hn_rss_hash & NDIS_HASH_FUNCTION_MASK), ("no hash func %08x", sc->hn_rss_hash)); KASSERT((sc->hn_rss_hash & NDIS_HASH_STD), ("no standard hash types %08x", sc->hn_rss_hash)); KASSERT(sc->hn_rss_ind_size > 0, ("no indirect table size")); if (bootverbose) { if_printf(sc->hn_ifp, "RSS indirect table size %d, " "hash 0x%08x\n", sc->hn_rss_ind_size, sc->hn_rss_hash); } /* * NOTE: * DO NOT whack rss_key and rss_ind, which are setup by the caller. */ memset(prm, 0, sizeof(*prm)); rss_size = NDIS_RSSPRM_TOEPLITZ_SIZE(sc->hn_rss_ind_size); prm->ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_PARAMS; prm->ndis_hdr.ndis_rev = NDIS_RSS_PARAMS_REV_2; prm->ndis_hdr.ndis_size = rss_size; prm->ndis_flags = flags; prm->ndis_hash = sc->hn_rss_hash & (NDIS_HASH_FUNCTION_MASK | NDIS_HASH_STD); prm->ndis_indsize = sizeof(rss->rss_ind[0]) * sc->hn_rss_ind_size; prm->ndis_indoffset = __offsetof(struct ndis_rssprm_toeplitz, rss_ind[0]); prm->ndis_keysize = sizeof(rss->rss_key); prm->ndis_keyoffset = __offsetof(struct ndis_rssprm_toeplitz, rss_key[0]); error = hn_rndis_set(sc, OID_GEN_RECEIVE_SCALE_PARAMETERS, rss, rss_size); if (error) { if_printf(sc->hn_ifp, "RSS config failed: %d\n", error); } else { if (bootverbose) if_printf(sc->hn_ifp, "RSS config done\n"); } return (error); } int hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter) { int error; error = hn_rndis_set(sc, OID_GEN_CURRENT_PACKET_FILTER, &filter, sizeof(filter)); if (error) { if_printf(sc->hn_ifp, "set RX filter 0x%08x failed: %d\n", filter, error); } else { if (bootverbose) { if_printf(sc->hn_ifp, "set RX filter 0x%08x done\n", filter); } } return (error); } static int hn_rndis_init(struct hn_softc *sc) { struct rndis_init_req *req; const struct rndis_init_comp *comp; struct vmbus_xact *xact; size_t comp_len; uint32_t rid; int error; xact = vmbus_xact_get(sc->hn_xact, sizeof(*req)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for RNDIS init\n"); return (ENXIO); } rid = hn_rndis_rid(sc); req = vmbus_xact_req_data(xact); req->rm_type = REMOTE_NDIS_INITIALIZE_MSG; req->rm_len = sizeof(*req); req->rm_rid = rid; req->rm_ver_major = RNDIS_VERSION_MAJOR; req->rm_ver_minor = RNDIS_VERSION_MINOR; req->rm_max_xfersz = HN_RNDIS_XFER_SIZE; comp_len = RNDIS_INIT_COMP_SIZE_MIN; comp = hn_rndis_xact_execute(sc, xact, rid, sizeof(*req), &comp_len, REMOTE_NDIS_INITIALIZE_CMPLT); if (comp == NULL) { if_printf(sc->hn_ifp, "exec RNDIS init failed\n"); error = EIO; goto done; } if (comp->rm_status != RNDIS_STATUS_SUCCESS) { if_printf(sc->hn_ifp, "RNDIS init failed: status 0x%08x\n", comp->rm_status); error = EIO; goto done; } sc->hn_rndis_agg_size = comp->rm_pktmaxsz; sc->hn_rndis_agg_pkts = comp->rm_pktmaxcnt; sc->hn_rndis_agg_align = 1U << comp->rm_align; if (sc->hn_rndis_agg_align < sizeof(uint32_t)) { /* * The RNDIS packet messsage encap assumes that the RNDIS * packet message is at least 4 bytes aligned. Fix up the * alignment here, if the remote side sets the alignment * too low. */ if_printf(sc->hn_ifp, "fixup RNDIS aggpkt align: %u -> %zu\n", sc->hn_rndis_agg_align, sizeof(uint32_t)); sc->hn_rndis_agg_align = sizeof(uint32_t); } if (bootverbose) { if_printf(sc->hn_ifp, "RNDIS ver %u.%u, " "aggpkt size %u, aggpkt cnt %u, aggpkt align %u\n", comp->rm_ver_major, comp->rm_ver_minor, sc->hn_rndis_agg_size, sc->hn_rndis_agg_pkts, sc->hn_rndis_agg_align); } error = 0; done: vmbus_xact_put(xact); return (error); } static int hn_rndis_halt(struct hn_softc *sc) { struct vmbus_xact *xact; struct rndis_halt_req *halt; struct hn_nvs_sendctx sndc; size_t comp_len; xact = vmbus_xact_get(sc->hn_xact, sizeof(*halt)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for RNDIS halt\n"); return (ENXIO); } halt = vmbus_xact_req_data(xact); halt->rm_type = REMOTE_NDIS_HALT_MSG; halt->rm_len = sizeof(*halt); halt->rm_rid = hn_rndis_rid(sc); /* No RNDIS completion; rely on NVS message send completion */ hn_nvs_sendctx_init(&sndc, hn_nvs_sent_xact, xact); hn_rndis_xact_exec1(sc, xact, sizeof(*halt), &sndc, &comp_len); vmbus_xact_put(xact); if (bootverbose) if_printf(sc->hn_ifp, "RNDIS halt done\n"); return (0); } static int hn_rndis_query_hwcaps(struct hn_softc *sc, struct ndis_offload *caps) { struct ndis_offload in; size_t caps_len, size; int error; memset(&in, 0, sizeof(in)); in.ndis_hdr.ndis_type = NDIS_OBJTYPE_OFFLOAD; if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_30) { in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_3; size = NDIS_OFFLOAD_SIZE; } else if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_1) { in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_2; size = NDIS_OFFLOAD_SIZE_6_1; } else { in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_1; size = NDIS_OFFLOAD_SIZE_6_0; } in.ndis_hdr.ndis_size = size; caps_len = NDIS_OFFLOAD_SIZE; error = hn_rndis_query2(sc, OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES, &in, size, caps, &caps_len, NDIS_OFFLOAD_SIZE_6_0); if (error) return (error); /* * Preliminary verification. */ if (caps->ndis_hdr.ndis_type != NDIS_OBJTYPE_OFFLOAD) { if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n", caps->ndis_hdr.ndis_type); return (EINVAL); } if (caps->ndis_hdr.ndis_rev < NDIS_OFFLOAD_REV_1) { if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n", caps->ndis_hdr.ndis_rev); return (EINVAL); } if (caps->ndis_hdr.ndis_size > caps_len) { if_printf(sc->hn_ifp, "invalid NDIS objsize %u, " "data size %zu\n", caps->ndis_hdr.ndis_size, caps_len); return (EINVAL); } else if (caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE_6_0) { if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n", caps->ndis_hdr.ndis_size); return (EINVAL); } else if (caps->ndis_hdr.ndis_rev >= NDIS_OFFLOAD_REV_3 && caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE) { if_printf(sc->hn_ifp, "invalid NDIS rev3 objsize %u\n", caps->ndis_hdr.ndis_size); return (EINVAL); } if (bootverbose) { /* * NOTE: * caps->ndis_hdr.ndis_size MUST be checked before accessing * NDIS 6.1+ specific fields. */ if_printf(sc->hn_ifp, "hwcaps rev %u\n", caps->ndis_hdr.ndis_rev); if_printf(sc->hn_ifp, "hwcaps csum: " "ip4 tx 0x%x/0x%x rx 0x%x/0x%x, " "ip6 tx 0x%x/0x%x rx 0x%x/0x%x\n", caps->ndis_csum.ndis_ip4_txcsum, caps->ndis_csum.ndis_ip4_txenc, caps->ndis_csum.ndis_ip4_rxcsum, caps->ndis_csum.ndis_ip4_rxenc, caps->ndis_csum.ndis_ip6_txcsum, caps->ndis_csum.ndis_ip6_txenc, caps->ndis_csum.ndis_ip6_rxcsum, caps->ndis_csum.ndis_ip6_rxenc); if_printf(sc->hn_ifp, "hwcaps lsov2: " "ip4 maxsz %u minsg %u encap 0x%x, " "ip6 maxsz %u minsg %u encap 0x%x opts 0x%x\n", caps->ndis_lsov2.ndis_ip4_maxsz, caps->ndis_lsov2.ndis_ip4_minsg, caps->ndis_lsov2.ndis_ip4_encap, caps->ndis_lsov2.ndis_ip6_maxsz, caps->ndis_lsov2.ndis_ip6_minsg, caps->ndis_lsov2.ndis_ip6_encap, caps->ndis_lsov2.ndis_ip6_opts); if (caps->ndis_hdr.ndis_rev >= NDIS_OFFLOAD_REV_3) if_printf(sc->hn_ifp, "hwcaps rsc: " "ip4 %u ip6 %u\n", caps->ndis_rsc.ndis_ip4, caps->ndis_rsc.ndis_ip6); } return (0); } int hn_rndis_attach(struct hn_softc *sc, int mtu, int *init_done) { int error; *init_done = 0; /* * Initialize RNDIS. */ error = hn_rndis_init(sc); if (error) return (error); *init_done = 1; /* * Configure NDIS offload settings. */ hn_rndis_conf_offload(sc, mtu); return (0); } void hn_rndis_detach(struct hn_softc *sc) { /* Halt the RNDIS. */ hn_rndis_halt(sc); } diff --git a/sys/dev/hyperv/netvsc/if_hn.c b/sys/dev/hyperv/netvsc/if_hn.c index 5d7002b8f65f..aeb6ec22a703 100644 --- a/sys/dev/hyperv/netvsc/if_hn.c +++ b/sys/dev/hyperv/netvsc/if_hn.c @@ -1,7721 +1,7723 @@ /*- * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2004-2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_hn.h" #include "opt_inet6.h" #include "opt_inet.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include +#include +#include + #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmbus_if.h" #define HN_IFSTART_SUPPORT #define HN_RING_CNT_DEF_MAX 8 #define HN_VFMAP_SIZE_DEF 8 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ /* YYY should get it from the underlying channel */ #define HN_TX_DESC_CNT 512 #define HN_RNDIS_PKT_LEN \ (sizeof(struct rndis_packet_msg) + \ HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE #define HN_TX_DATA_BOUNDARY PAGE_SIZE #define HN_TX_DATA_MAXSIZE IP_MAXPACKET #define HN_TX_DATA_SEGSIZE PAGE_SIZE /* -1 for RNDIS packet message */ #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) #define HN_DIRECT_TX_SIZE_DEF 128 #define HN_EARLY_TXEOF_THRESH 8 #define HN_PKTBUF_LEN_DEF (16 * 1024) #define HN_LROENT_CNT_DEF 128 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) /* YYY 2*MTU is a bit rough, but should be good enough. */ #define HN_LRO_LENLIM_MIN(ifp) (2 * if_getmtu(ifp)) #define HN_LRO_ACKCNT_DEF 1 #define HN_LOCK_INIT(sc) \ sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) #define HN_LOCK(sc) \ do { \ while (sx_try_xlock(&(sc)->hn_lock) == 0) { \ /* Relinquish cpu to avoid deadlock */ \ sched_relinquish(curthread); \ DELAY(1000); \ } \ } while (0) #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) #define HN_CSUM_IP_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) #define HN_CSUM_IP6_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) #define HN_PKTSIZE_MIN(align) \ roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ HN_RNDIS_PKT_LEN, (align)) #define HN_PKTSIZE(m, align) \ roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) #ifdef RSS #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) #else #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) #endif struct hn_txdesc { #ifndef HN_USE_TXDESC_BUFRING SLIST_ENTRY(hn_txdesc) link; #endif STAILQ_ENTRY(hn_txdesc) agg_link; /* Aggregated txdescs, in sending order. */ STAILQ_HEAD(, hn_txdesc) agg_list; /* The oldest packet, if transmission aggregation happens. */ struct mbuf *m; struct hn_tx_ring *txr; int refs; uint32_t flags; /* HN_TXD_FLAG_ */ struct hn_nvs_sendctx send_ctx; uint32_t chim_index; int chim_size; bus_dmamap_t data_dmap; bus_addr_t rndis_pkt_paddr; struct rndis_packet_msg *rndis_pkt; bus_dmamap_t rndis_pkt_dmap; }; #define HN_TXD_FLAG_ONLIST 0x0001 #define HN_TXD_FLAG_DMAMAP 0x0002 #define HN_TXD_FLAG_ONAGG 0x0004 #define HN_NDIS_PKTINFO_SUBALLOC 0x01 #define HN_NDIS_PKTINFO_1ST_FRAG 0x02 #define HN_NDIS_PKTINFO_LAST_FRAG 0x04 struct packet_info_id { uint8_t ver; uint8_t flag; uint16_t pkt_id; }; #define NDIS_PKTINFOID_SZ sizeof(struct packet_info_id) struct hn_rxinfo { const uint32_t *vlan_info; const uint32_t *csum_info; const uint32_t *hash_info; const uint32_t *hash_value; const struct packet_info_id *pktinfo_id; }; struct hn_rxvf_setarg { struct hn_rx_ring *rxr; if_t vf_ifp; }; #define HN_RXINFO_VLAN 0x0001 #define HN_RXINFO_CSUM 0x0002 #define HN_RXINFO_HASHINF 0x0004 #define HN_RXINFO_HASHVAL 0x0008 #define HN_RXINFO_PKTINFO_ID 0x0010 #define HN_RXINFO_ALL \ (HN_RXINFO_VLAN | \ HN_RXINFO_CSUM | \ HN_RXINFO_HASHINF | \ HN_RXINFO_HASHVAL | \ HN_RXINFO_PKTINFO_ID) static int hn_probe(device_t); static int hn_attach(device_t); static int hn_detach(device_t); static int hn_shutdown(device_t); static void hn_chan_callback(struct vmbus_channel *, void *); static void hn_init(void *); static int hn_ioctl(if_t, u_long, caddr_t); #ifdef HN_IFSTART_SUPPORT static void hn_start(if_t); #endif static int hn_transmit(if_t, struct mbuf *); static void hn_xmit_qflush(if_t); static int hn_ifmedia_upd(if_t); static void hn_ifmedia_sts(if_t, struct ifmediareq *); static void hn_ifnet_event(void *, if_t, int); static void hn_ifaddr_event(void *, if_t); static void hn_ifnet_attevent(void *, if_t); static void hn_ifnet_detevent(void *, if_t); static void hn_ifnet_lnkevent(void *, if_t, int); static bool hn_ismyvf(const struct hn_softc *, const if_t); static void hn_rxvf_change(struct hn_softc *, if_t, bool); static void hn_rxvf_set(struct hn_softc *, if_t); static void hn_rxvf_set_task(void *, int); static void hn_xpnt_vf_input(if_t, struct mbuf *); static int hn_xpnt_vf_iocsetflags(struct hn_softc *); static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, struct ifreq *); static void hn_xpnt_vf_saveifflags(struct hn_softc *); static bool hn_xpnt_vf_isready(struct hn_softc *); static void hn_xpnt_vf_setready(struct hn_softc *); static void hn_xpnt_vf_init_taskfunc(void *, int); static void hn_xpnt_vf_init(struct hn_softc *); static void hn_xpnt_vf_setenable(struct hn_softc *); static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); static void hn_vf_rss_fixup(struct hn_softc *, bool); static void hn_vf_rss_restore(struct hn_softc *); static int hn_rndis_rxinfo(const void *, int, struct hn_rxinfo *); static void hn_rndis_rx_data(struct hn_rx_ring *, const void *, int); static void hn_rndis_rx_status(struct hn_softc *, const void *, int); static void hn_rndis_init_fixat(struct hn_softc *, int); static void hn_nvs_handle_notify(struct hn_softc *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_handle_comp(struct hn_softc *, struct vmbus_channel *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, struct vmbus_channel *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, struct vmbus_channel *, uint64_t); static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); #ifndef RSS static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); static void hn_stop(struct hn_softc *, bool); static void hn_init_locked(struct hn_softc *); static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *); static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *); static int hn_attach_subchans(struct hn_softc *); static void hn_detach_allchans(struct hn_softc *); static void hn_chan_rollup(struct hn_rx_ring *, struct hn_tx_ring *); static void hn_set_ring_inuse(struct hn_softc *, int); static int hn_synth_attach(struct hn_softc *, int); static void hn_synth_detach(struct hn_softc *); static int hn_synth_alloc_subchans(struct hn_softc *, int *); static bool hn_synth_attachable(const struct hn_softc *); static void hn_suspend(struct hn_softc *); static void hn_suspend_data(struct hn_softc *); static void hn_suspend_mgmt(struct hn_softc *); static void hn_resume(struct hn_softc *); static void hn_resume_data(struct hn_softc *); static void hn_resume_mgmt(struct hn_softc *); static void hn_suspend_mgmt_taskfunc(void *, int); static void hn_chan_drain(struct hn_softc *, struct vmbus_channel *); static void hn_disable_rx(struct hn_softc *); static void hn_drain_rxtx(struct hn_softc *, int); static void hn_polling(struct hn_softc *, u_int); static void hn_chan_polling(struct vmbus_channel *, u_int); static void hn_mtu_change_fixup(struct hn_softc *); static void hn_update_link_status(struct hn_softc *); static void hn_change_network(struct hn_softc *); static void hn_link_taskfunc(void *, int); static void hn_netchg_init_taskfunc(void *, int); static void hn_netchg_status_taskfunc(void *, int); static void hn_link_status(struct hn_softc *); static int hn_create_rx_data(struct hn_softc *, int); static void hn_destroy_rx_data(struct hn_softc *); static int hn_check_iplen(const struct mbuf *, int); static void hn_rxpkt_proto(const struct mbuf *, int *, int *); static int hn_set_rxfilter(struct hn_softc *, uint32_t); static int hn_rxfilter_config(struct hn_softc *); static int hn_rss_reconfig(struct hn_softc *); static void hn_rss_ind_fixup(struct hn_softc *); static void hn_rss_mbuf_hash(struct hn_softc *, uint32_t); static int hn_rxpkt(struct hn_rx_ring *); static uint32_t hn_rss_type_fromndis(uint32_t); static uint32_t hn_rss_type_tondis(uint32_t); static int hn_tx_ring_create(struct hn_softc *, int); static void hn_tx_ring_destroy(struct hn_tx_ring *); static int hn_create_tx_data(struct hn_softc *, int); static void hn_fixup_tx_data(struct hn_softc *); static void hn_fixup_rx_data(struct hn_softc *); static void hn_destroy_tx_data(struct hn_softc *); static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); static void hn_txdesc_gc(struct hn_tx_ring *, struct hn_txdesc *); static int hn_encap(if_t, struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); static int hn_txpkt(if_t, struct hn_tx_ring *, struct hn_txdesc *); static void hn_set_chim_size(struct hn_softc *, int); static void hn_set_tso_maxsize(struct hn_softc *, int, int); static bool hn_tx_ring_pending(struct hn_tx_ring *); static void hn_tx_ring_qflush(struct hn_tx_ring *); static void hn_resume_tx(struct hn_softc *, int); static void hn_set_txagg(struct hn_softc *); static void *hn_try_txagg(if_t, struct hn_tx_ring *, struct hn_txdesc *, int); static int hn_get_txswq_depth(const struct hn_tx_ring *); static void hn_txpkt_done(struct hn_nvs_sendctx *, struct hn_softc *, struct vmbus_channel *, const void *, int); static int hn_txpkt_sglist(struct hn_tx_ring *, struct hn_txdesc *); static int hn_txpkt_chim(struct hn_tx_ring *, struct hn_txdesc *); static int hn_xmit(struct hn_tx_ring *, int); static void hn_xmit_taskfunc(void *, int); static void hn_xmit_txeof(struct hn_tx_ring *); static void hn_xmit_txeof_taskfunc(void *, int); #ifdef HN_IFSTART_SUPPORT static int hn_start_locked(struct hn_tx_ring *, int); static void hn_start_taskfunc(void *, int); static void hn_start_txeof(struct hn_tx_ring *); static void hn_start_txeof_taskfunc(void *, int); #endif static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS); SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V network interface"); /* Trust tcp segment verification on host side. */ static int hn_trust_hosttcp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, &hn_trust_hosttcp, 0, "Trust tcp segment verification on host side, " "when csum info is missing (global setting)"); /* Trust udp datagrams verification on host side. */ static int hn_trust_hostudp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, &hn_trust_hostudp, 0, "Trust udp datagram verification on host side, " "when csum info is missing (global setting)"); /* Trust ip packets verification on host side. */ static int hn_trust_hostip = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, &hn_trust_hostip, 0, "Trust ip packet verification on host side, " "when csum info is missing (global setting)"); /* * Offload UDP/IPv4 checksum. */ static int hn_enable_udp4cs = 1; SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp4cs, CTLFLAG_RDTUN, &hn_enable_udp4cs, 0, "Offload UDP/IPv4 checksum"); /* * Offload UDP/IPv6 checksum. */ static int hn_enable_udp6cs = 1; SYSCTL_INT(_hw_hn, OID_AUTO, enable_udp6cs, CTLFLAG_RDTUN, &hn_enable_udp6cs, 0, "Offload UDP/IPv6 checksum"); /* Stats. */ static counter_u64_t hn_udpcs_fixup; SYSCTL_COUNTER_U64(_hw_hn, OID_AUTO, udpcs_fixup, CTLFLAG_RW, &hn_udpcs_fixup, "# of UDP checksum fixup"); /* * See hn_set_hlen(). * * This value is for Azure. For Hyper-V, set this above * 65536 to disable UDP datagram checksum fixup. */ static int hn_udpcs_fixup_mtu = 1420; SYSCTL_INT(_hw_hn, OID_AUTO, udpcs_fixup_mtu, CTLFLAG_RWTUN, &hn_udpcs_fixup_mtu, 0, "UDP checksum fixup MTU threshold"); /* Limit TSO burst size */ static int hn_tso_maxlen = IP_MAXPACKET; SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, &hn_tso_maxlen, 0, "TSO burst limit"); /* Limit chimney send size */ static int hn_tx_chimney_size = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, &hn_tx_chimney_size, 0, "Chimney send packet size limit"); /* Limit the size of packet for direct transmission */ static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); /* # of LRO entries per RX ring */ #if defined(INET) || defined(INET6) static int hn_lro_entry_count = HN_LROENT_CNT_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, &hn_lro_entry_count, 0, "LRO entry count"); #endif static int hn_tx_taskq_cnt = 1; SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); #define HN_TX_TASKQ_M_INDEP 0 #define HN_TX_TASKQ_M_GLOBAL 1 #define HN_TX_TASKQ_M_EVTTQ 2 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, &hn_tx_taskq_mode, 0, "TX taskqueue modes: " "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); #ifndef HN_USE_TXDESC_BUFRING static int hn_use_txdesc_bufring = 0; #else static int hn_use_txdesc_bufring = 1; #endif SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); #ifdef HN_IFSTART_SUPPORT /* Use ifnet.if_start instead of ifnet.if_transmit */ static int hn_use_if_start = 0; SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, &hn_use_if_start, 0, "Use if_start TX method"); #endif /* # of channels to use */ static int hn_chan_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, &hn_chan_cnt, 0, "# of channels to use; each channel has one RX ring and one TX ring"); /* # of transmit rings to use */ static int hn_tx_ring_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, &hn_tx_ring_cnt, 0, "# of TX rings to use"); /* Software TX ring deptch */ static int hn_tx_swq_depth = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ static u_int hn_lro_mbufq_depth = 0; SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); /* Packet transmission aggregation size limit */ static int hn_tx_agg_size = -1; SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); /* Packet transmission aggregation count limit */ static int hn_tx_agg_pkts = -1; SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); /* VF list */ SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, hn_vflist_sysctl, "A", "VF list"); /* VF mapping */ SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING | CTLFLAG_NEEDGIANT, 0, 0, hn_vfmap_sysctl, "A", "VF mapping"); /* Transparent VF */ static int hn_xpnt_vf = 1; SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN, &hn_xpnt_vf, 0, "Transparent VF mod"); /* Accurate BPF support for Transparent VF */ static int hn_xpnt_vf_accbpf = 0; SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN, &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF"); /* Extra wait for transparent VF attach routing; unit seconds. */ static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN, &hn_xpnt_vf_attwait, 0, "Extra wait for transparent VF attach routing; unit: seconds"); static u_int hn_cpu_index; /* next CPU for channel */ static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ static struct rmlock hn_vfmap_lock; static int hn_vfmap_size; static if_t *hn_vfmap; #ifndef RSS static const uint8_t hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa }; #endif /* !RSS */ static const struct hyperv_guid hn_guid = { .hv_guid = { 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46, 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e } }; static device_method_t hn_methods[] = { /* Device interface */ DEVMETHOD(device_probe, hn_probe), DEVMETHOD(device_attach, hn_attach), DEVMETHOD(device_detach, hn_detach), DEVMETHOD(device_shutdown, hn_shutdown), DEVMETHOD_END }; static driver_t hn_driver = { "hn", hn_methods, sizeof(struct hn_softc) }; DRIVER_MODULE(hn, vmbus, hn_driver, 0, 0); MODULE_VERSION(hn, 1); MODULE_DEPEND(hn, vmbus, 1, 1, 1); static void hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) { int i; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; } static int hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && txd->chim_size == 0, ("invalid rndis sglist txd")); return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); } static int hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) { struct hn_nvs_rndis rndis; KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && txd->chim_size > 0, ("invalid rndis chim txd")); rndis.nvs_type = HN_NVS_TYPE_RNDIS; rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; rndis.nvs_chim_idx = txd->chim_index; rndis.nvs_chim_sz = txd->chim_size; return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, &rndis, sizeof(rndis), &txd->send_ctx)); } static __inline uint32_t hn_chim_alloc(struct hn_softc *sc) { int i, bmap_cnt = sc->hn_chim_bmap_cnt; u_long *bmap = sc->hn_chim_bmap; uint32_t ret = HN_NVS_CHIM_IDX_INVALID; for (i = 0; i < bmap_cnt; ++i) { int idx; idx = ffsl(~bmap[i]); if (idx == 0) continue; --idx; /* ffsl is 1-based */ KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, ("invalid i %d and idx %d", i, idx)); if (atomic_testandset_long(&bmap[i], idx)) continue; ret = i * LONG_BIT + idx; break; } return (ret); } static __inline void hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) { u_long mask; uint32_t idx; idx = chim_idx / LONG_BIT; KASSERT(idx < sc->hn_chim_bmap_cnt, ("invalid chimney index 0x%x", chim_idx)); mask = 1UL << (chim_idx % LONG_BIT); KASSERT(sc->hn_chim_bmap[idx] & mask, ("index bitmap 0x%lx, chimney index %u, " "bitmap idx %d, bitmask 0x%lx", sc->hn_chim_bmap[idx], chim_idx, idx, mask)); atomic_clear_long(&sc->hn_chim_bmap[idx], mask); } #if defined(INET6) || defined(INET) #define PULLUP_HDR(m, len) \ do { \ if (__predict_false((m)->m_len < (len))) { \ (m) = m_pullup((m), (len)); \ if ((m) == NULL) \ return (NULL); \ } \ } while (0) /* * NOTE: If this function failed, the m_head would be freed. */ static __inline struct mbuf * hn_tso_fixup(struct mbuf *m_head) { struct ether_vlan_header *evl; struct tcphdr *th; int ehlen; KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); PULLUP_HDR(m_head, sizeof(*evl)); evl = mtod(m_head, struct ether_vlan_header *); if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ehlen = ETHER_HDR_LEN; m_head->m_pkthdr.l2hlen = ehlen; #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { struct ip *ip; int iphlen; PULLUP_HDR(m_head, ehlen + sizeof(*ip)); ip = mtodo(m_head, ehlen); iphlen = ip->ip_hl << 2; m_head->m_pkthdr.l3hlen = iphlen; PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); th = mtodo(m_head, ehlen + iphlen); ip->ip_len = 0; ip->ip_sum = 0; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { struct ip6_hdr *ip6; PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); ip6 = mtodo(m_head, ehlen); if (ip6->ip6_nxt != IPPROTO_TCP) { m_freem(m_head); return (NULL); } m_head->m_pkthdr.l3hlen = sizeof(*ip6); PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); th = mtodo(m_head, ehlen + sizeof(*ip6)); ip6->ip6_plen = 0; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); } #endif return (m_head); } /* * NOTE: If this function failed, the m_head would be freed. */ static __inline struct mbuf * hn_set_hlen(struct mbuf *m_head) { const struct ether_vlan_header *evl; int ehlen; PULLUP_HDR(m_head, sizeof(*evl)); evl = mtod(m_head, const struct ether_vlan_header *); if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ehlen = ETHER_HDR_LEN; m_head->m_pkthdr.l2hlen = ehlen; #ifdef INET if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP_UDP)) { const struct ip *ip; int iphlen; PULLUP_HDR(m_head, ehlen + sizeof(*ip)); ip = mtodo(m_head, ehlen); iphlen = ip->ip_hl << 2; m_head->m_pkthdr.l3hlen = iphlen; /* * UDP checksum offload does not work in Azure, if the * following conditions meet: * - sizeof(IP hdr + UDP hdr + payload) > 1420. * - IP_DF is not set in the IP hdr. * * Fallback to software checksum for these UDP datagrams. */ if ((m_head->m_pkthdr.csum_flags & CSUM_IP_UDP) && m_head->m_pkthdr.len > hn_udpcs_fixup_mtu + ehlen && (ntohs(ip->ip_off) & IP_DF) == 0) { uint16_t off = ehlen + iphlen; counter_u64_add(hn_udpcs_fixup, 1); PULLUP_HDR(m_head, off + sizeof(struct udphdr)); *(uint16_t *)(m_head->m_data + off + m_head->m_pkthdr.csum_data) = in_cksum_skip( m_head, m_head->m_pkthdr.len, off); m_head->m_pkthdr.csum_flags &= ~CSUM_IP_UDP; } } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { const struct ip6_hdr *ip6; PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); ip6 = mtodo(m_head, ehlen); if (ip6->ip6_nxt != IPPROTO_TCP && ip6->ip6_nxt != IPPROTO_UDP) { m_freem(m_head); return (NULL); } m_head->m_pkthdr.l3hlen = sizeof(*ip6); } #endif return (m_head); } /* * NOTE: If this function failed, the m_head would be freed. */ static __inline struct mbuf * hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn) { const struct tcphdr *th; int ehlen, iphlen; *tcpsyn = 0; ehlen = m_head->m_pkthdr.l2hlen; iphlen = m_head->m_pkthdr.l3hlen; PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); th = mtodo(m_head, ehlen + iphlen); if (th->th_flags & TH_SYN) *tcpsyn = 1; return (m_head); } #undef PULLUP_HDR #endif /* INET6 || INET */ static int hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) { int error = 0; HN_LOCK_ASSERT(sc); if (sc->hn_rx_filter != filter) { error = hn_rndis_set_rxfilter(sc, filter); if (!error) sc->hn_rx_filter = filter; } return (error); } static int hn_rxfilter_config(struct hn_softc *sc) { if_t ifp = sc->hn_ifp; uint32_t filter; HN_LOCK_ASSERT(sc); /* * If the non-transparent mode VF is activated, we don't know how * its RX filter is configured, so stick the synthetic device in * the promiscous mode. */ if ((if_getflags(ifp) & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { filter = NDIS_PACKET_TYPE_PROMISCUOUS; } else { filter = NDIS_PACKET_TYPE_DIRECTED; if (if_getflags(ifp) & IFF_BROADCAST) filter |= NDIS_PACKET_TYPE_BROADCAST; /* TODO: support multicast list */ if ((if_getflags(ifp) & IFF_ALLMULTI) || !if_maddr_empty(ifp)) filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; } return (hn_set_rxfilter(sc, filter)); } static void hn_set_txagg(struct hn_softc *sc) { uint32_t size, pkts; int i; /* * Setup aggregation size. */ if (sc->hn_agg_size < 0) size = UINT32_MAX; else size = sc->hn_agg_size; if (sc->hn_rndis_agg_size < size) size = sc->hn_rndis_agg_size; /* NOTE: We only aggregate packets using chimney sending buffers. */ if (size > (uint32_t)sc->hn_chim_szmax) size = sc->hn_chim_szmax; if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { /* Disable */ size = 0; pkts = 0; goto done; } /* NOTE: Type of the per TX ring setting is 'int'. */ if (size > INT_MAX) size = INT_MAX; /* * Setup aggregation packet count. */ if (sc->hn_agg_pkts < 0) pkts = UINT32_MAX; else pkts = sc->hn_agg_pkts; if (sc->hn_rndis_agg_pkts < pkts) pkts = sc->hn_rndis_agg_pkts; if (pkts <= 1) { /* Disable */ size = 0; pkts = 0; goto done; } /* NOTE: Type of the per TX ring setting is 'short'. */ if (pkts > SHRT_MAX) pkts = SHRT_MAX; done: /* NOTE: Type of the per TX ring setting is 'short'. */ if (sc->hn_rndis_agg_align > SHRT_MAX) { /* Disable */ size = 0; pkts = 0; } if (bootverbose) { if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", size, pkts, sc->hn_rndis_agg_align); } for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_agg_szmax = size; txr->hn_agg_pktmax = pkts; txr->hn_agg_align = sc->hn_rndis_agg_align; mtx_unlock(&txr->hn_tx_lock); } } static int hn_get_txswq_depth(const struct hn_tx_ring *txr) { KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); if (hn_tx_swq_depth < txr->hn_txdesc_cnt) return txr->hn_txdesc_cnt; return hn_tx_swq_depth; } static int hn_rss_reconfig(struct hn_softc *sc) { int error; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return (ENXIO); /* * Disable RSS first. * * NOTE: * Direct reconfiguration by setting the UNCHG flags does * _not_ work properly. */ if (bootverbose) if_printf(sc->hn_ifp, "disable RSS\n"); error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); if (error) { if_printf(sc->hn_ifp, "RSS disable failed\n"); return (error); } /* * Reenable the RSS w/ the updated RSS key or indirect * table. */ if (bootverbose) if_printf(sc->hn_ifp, "reconfig RSS\n"); error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); if (error) { if_printf(sc->hn_ifp, "RSS reconfig failed\n"); return (error); } return (0); } static void hn_rss_ind_fixup(struct hn_softc *sc) { struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int i, nchan; nchan = sc->hn_rx_ring_inuse; KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); /* * Check indirect table to make sure that all channels in it * can be used. */ for (i = 0; i < NDIS_HASH_INDCNT; ++i) { if (rss->rss_ind[i] >= nchan) { if_printf(sc->hn_ifp, "RSS indirect table %d fixup: %u -> %d\n", i, rss->rss_ind[i], nchan - 1); rss->rss_ind[i] = nchan - 1; } } } static int hn_ifmedia_upd(if_t ifp __unused) { return EOPNOTSUPP; } static void hn_ifmedia_sts(if_t ifp, struct ifmediareq *ifmr) { struct hn_softc *sc = if_getsoftc(ifp); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { ifmr->ifm_active |= IFM_NONE; return; } ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_10G_T | IFM_FDX; } static void hn_rxvf_set_task(void *xarg, int pending __unused) { struct hn_rxvf_setarg *arg = xarg; arg->rxr->hn_rxvf_ifp = arg->vf_ifp; } static void hn_rxvf_set(struct hn_softc *sc, if_t vf_ifp) { struct hn_rx_ring *rxr; struct hn_rxvf_setarg arg; struct task task; int i; HN_LOCK_ASSERT(sc); TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; if (i < sc->hn_rx_ring_inuse) { arg.rxr = rxr; arg.vf_ifp = vf_ifp; vmbus_chan_run_task(rxr->hn_chan, &task); } else { rxr->hn_rxvf_ifp = vf_ifp; } } } static bool hn_ismyvf(const struct hn_softc *sc, const if_t ifp) { if_t hn_ifp; hn_ifp = sc->hn_ifp; if (ifp == hn_ifp) return (false); if (if_getalloctype(ifp) != IFT_ETHER) return (false); /* Ignore lagg/vlan interfaces */ if (strcmp(if_getdname(ifp), "lagg") == 0 || strcmp(if_getdname(ifp), "vlan") == 0) return (false); /* * During detach events if_getifaddr(ifp) might be NULL. * Make sure the bcmp() below doesn't panic on that: */ if (if_getifaddr(ifp) == NULL || if_getifaddr(hn_ifp) == NULL) return (false); if (bcmp(if_getlladdr(ifp), if_getlladdr(hn_ifp), ETHER_ADDR_LEN) != 0) return (false); return (true); } static void hn_rxvf_change(struct hn_softc *sc, if_t ifp, bool rxvf) { if_t hn_ifp; HN_LOCK(sc); if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) goto out; if (!hn_ismyvf(sc, ifp)) goto out; hn_ifp = sc->hn_ifp; if (rxvf) { if (sc->hn_flags & HN_FLAG_RXVF) goto out; sc->hn_flags |= HN_FLAG_RXVF; hn_rxfilter_config(sc); } else { if (!(sc->hn_flags & HN_FLAG_RXVF)) goto out; sc->hn_flags &= ~HN_FLAG_RXVF; if (if_getdrvflags(hn_ifp) & IFF_DRV_RUNNING) hn_rxfilter_config(sc); else hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); } hn_nvs_set_datapath(sc, rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); hn_rxvf_set(sc, rxvf ? ifp : NULL); if (rxvf) { hn_vf_rss_fixup(sc, true); hn_suspend_mgmt(sc); sc->hn_link_flags &= ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); if_link_state_change(hn_ifp, LINK_STATE_DOWN); } else { hn_vf_rss_restore(sc); hn_resume_mgmt(sc); } devctl_notify("HYPERV_NIC_VF", if_name(hn_ifp), rxvf ? "VF_UP" : "VF_DOWN", NULL); if (bootverbose) { if_printf(hn_ifp, "datapath is switched %s %s\n", rxvf ? "to" : "from", if_name(ifp)); } out: HN_UNLOCK(sc); } static void hn_ifnet_event(void *arg, if_t ifp, int event) { if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) return; hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); } static void hn_ifaddr_event(void *arg, if_t ifp) { hn_rxvf_change(arg, ifp, if_getflags(ifp) & IFF_UP); } static int hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) { if_t ifp, vf_ifp; uint64_t tmp; int error; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; vf_ifp = sc->hn_vf_ifp; /* * Fix up requested capabilities w/ supported capabilities, * since the supported capabilities could have been changed. */ ifr->ifr_reqcap &= if_getcapabilities(ifp); /* Pass SIOCSIFCAP to VF. */ error = ifhwioctl(SIOCSIFCAP, vf_ifp, (caddr_t)ifr, curthread); /* * NOTE: * The error will be propagated to the callers, however, it * is _not_ useful here. */ /* * Merge VF's enabled capabilities. */ if_setcapenable(ifp, if_getcapenable(vf_ifp) & if_getcapabilities(ifp)); tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP_HWASSIST(sc); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, tmp, 0); else if_sethwassistbits(ifp, 0, tmp); tmp = if_gethwassist(vf_ifp) & HN_CSUM_IP6_HWASSIST(sc); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, tmp, 0); else if_sethwassistbits(ifp, 0, tmp); tmp = if_gethwassist(vf_ifp) & CSUM_IP_TSO; if (if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, tmp, 0); else if_sethwassistbits(ifp, 0, tmp); tmp = if_gethwassist(vf_ifp) & CSUM_IP6_TSO; if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, tmp, 0); else if_sethwassistbits(ifp, 0, tmp); return (error); } static int hn_xpnt_vf_iocsetflags(struct hn_softc *sc) { if_t vf_ifp; struct ifreq ifr; HN_LOCK_ASSERT(sc); vf_ifp = sc->hn_vf_ifp; memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); ifr.ifr_flags = if_getflags(vf_ifp) & 0xffff; ifr.ifr_flagshigh = if_getflags(vf_ifp) >> 16; return (ifhwioctl(SIOCSIFFLAGS, vf_ifp, (caddr_t)&ifr, curthread)); } static void hn_xpnt_vf_saveifflags(struct hn_softc *sc) { if_t ifp = sc->hn_ifp; int allmulti = 0; HN_LOCK_ASSERT(sc); /* XXX vlan(4) style mcast addr maintenance */ if (!if_maddr_empty(ifp)) allmulti = IFF_ALLMULTI; /* Always set the VF's if_flags */ if_setflags(sc->hn_vf_ifp, if_getflags(ifp) | allmulti); } static void hn_xpnt_vf_input(if_t vf_ifp, struct mbuf *m) { struct rm_priotracker pt; if_t hn_ifp = NULL; struct mbuf *mn; /* * XXX racy, if hn(4) ever detached. */ rm_rlock(&hn_vfmap_lock, &pt); if (if_getindex(vf_ifp) < hn_vfmap_size) hn_ifp = hn_vfmap[if_getindex(vf_ifp)]; rm_runlock(&hn_vfmap_lock, &pt); if (hn_ifp != NULL) { for (mn = m; mn != NULL; mn = mn->m_nextpkt) { /* * Allow tapping on the VF. */ ETHER_BPF_MTAP(vf_ifp, mn); /* * Update VF stats. */ if ((if_getcapenable(vf_ifp) & IFCAP_HWSTATS) == 0) { if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, mn->m_pkthdr.len); } /* * XXX IFCOUNTER_IMCAST * This stat updating is kinda invasive, since it * requires two checks on the mbuf: the length check * and the ethernet header check. As of this write, * all multicast packets go directly to hn(4), which * makes imcast stat updating in the VF a try in vian. */ /* * Fix up rcvif and increase hn(4)'s ipackets. */ mn->m_pkthdr.rcvif = hn_ifp; if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); } /* * Go through hn(4)'s if_input. */ if_input(hn_ifp, m); } else { /* * In the middle of the transition; free this * mbuf chain. */ while (m != NULL) { mn = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); m = mn; } } } static void hn_mtu_change_fixup(struct hn_softc *sc) { if_t ifp; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; hn_set_tso_maxsize(sc, hn_tso_maxlen, if_getmtu(ifp)); if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); } static uint32_t hn_rss_type_fromndis(uint32_t rss_hash) { uint32_t types = 0; if (rss_hash & NDIS_HASH_IPV4) types |= RSS_TYPE_IPV4; if (rss_hash & NDIS_HASH_TCP_IPV4) types |= RSS_TYPE_TCP_IPV4; if (rss_hash & NDIS_HASH_IPV6) types |= RSS_TYPE_IPV6; if (rss_hash & NDIS_HASH_IPV6_EX) types |= RSS_TYPE_IPV6_EX; if (rss_hash & NDIS_HASH_TCP_IPV6) types |= RSS_TYPE_TCP_IPV6; if (rss_hash & NDIS_HASH_TCP_IPV6_EX) types |= RSS_TYPE_TCP_IPV6_EX; if (rss_hash & NDIS_HASH_UDP_IPV4_X) types |= RSS_TYPE_UDP_IPV4; return (types); } static uint32_t hn_rss_type_tondis(uint32_t types) { uint32_t rss_hash = 0; KASSERT((types & (RSS_TYPE_UDP_IPV6 | RSS_TYPE_UDP_IPV6_EX)) == 0, ("UDP6 and UDP6EX are not supported")); if (types & RSS_TYPE_IPV4) rss_hash |= NDIS_HASH_IPV4; if (types & RSS_TYPE_TCP_IPV4) rss_hash |= NDIS_HASH_TCP_IPV4; if (types & RSS_TYPE_IPV6) rss_hash |= NDIS_HASH_IPV6; if (types & RSS_TYPE_IPV6_EX) rss_hash |= NDIS_HASH_IPV6_EX; if (types & RSS_TYPE_TCP_IPV6) rss_hash |= NDIS_HASH_TCP_IPV6; if (types & RSS_TYPE_TCP_IPV6_EX) rss_hash |= NDIS_HASH_TCP_IPV6_EX; if (types & RSS_TYPE_UDP_IPV4) rss_hash |= NDIS_HASH_UDP_IPV4_X; return (rss_hash); } static void hn_rss_mbuf_hash(struct hn_softc *sc, uint32_t mbuf_hash) { int i; HN_LOCK_ASSERT(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_mbuf_hash = mbuf_hash; } static void hn_vf_rss_fixup(struct hn_softc *sc, bool reconf) { if_t ifp, vf_ifp; struct ifrsshash ifrh; struct ifrsskey ifrk; int error; uint32_t my_types, diff_types, mbuf_types = 0; HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); if (sc->hn_rx_ring_inuse == 1) { /* No RSS on synthetic parts; done. */ return; } if ((sc->hn_rss_hcap & NDIS_HASH_FUNCTION_TOEPLITZ) == 0) { /* Synthetic parts do not support Toeplitz; done. */ return; } ifp = sc->hn_ifp; vf_ifp = sc->hn_vf_ifp; /* * Extract VF's RSS key. Only 40 bytes key for Toeplitz is * supported. */ memset(&ifrk, 0, sizeof(ifrk)); strlcpy(ifrk.ifrk_name, if_name(vf_ifp), sizeof(ifrk.ifrk_name)); error = ifhwioctl(SIOCGIFRSSKEY, vf_ifp, (caddr_t)&ifrk, curthread); if (error) { if_printf(ifp, "%s SIOCGIFRSSKEY failed: %d\n", if_name(vf_ifp), error); goto done; } if (ifrk.ifrk_func != RSS_FUNC_TOEPLITZ) { if_printf(ifp, "%s RSS function %u is not Toeplitz\n", if_name(vf_ifp), ifrk.ifrk_func); goto done; } if (ifrk.ifrk_keylen != NDIS_HASH_KEYSIZE_TOEPLITZ) { if_printf(ifp, "%s invalid RSS Toeplitz key length %d\n", if_name(vf_ifp), ifrk.ifrk_keylen); goto done; } /* * Extract VF's RSS hash. Only Toeplitz is supported. */ memset(&ifrh, 0, sizeof(ifrh)); strlcpy(ifrh.ifrh_name, if_name(vf_ifp), sizeof(ifrh.ifrh_name)); error = ifhwioctl(SIOCGIFRSSHASH, vf_ifp, (caddr_t)&ifrh, curthread); if (error) { if_printf(ifp, "%s SIOCGRSSHASH failed: %d\n", if_name(vf_ifp), error); goto done; } if (ifrh.ifrh_func != RSS_FUNC_TOEPLITZ) { if_printf(ifp, "%s RSS function %u is not Toeplitz\n", if_name(vf_ifp), ifrh.ifrh_func); goto done; } my_types = hn_rss_type_fromndis(sc->hn_rss_hcap); if ((ifrh.ifrh_types & my_types) == 0) { /* This disables RSS; ignore it then */ if_printf(ifp, "%s intersection of RSS types failed. " "VF %#x, mine %#x\n", if_name(vf_ifp), ifrh.ifrh_types, my_types); goto done; } diff_types = my_types ^ ifrh.ifrh_types; my_types &= ifrh.ifrh_types; mbuf_types = my_types; /* * Detect RSS hash value/type confliction. * * NOTE: * We don't disable the hash type, but stop delivery the hash * value/type through mbufs on RX path. * * XXX If HN_CAP_UDPHASH is set in hn_caps, then UDP 4-tuple * hash is delivered with type of TCP_IPV4. This means if * UDP_IPV4 is enabled, then TCP_IPV4 should be forced, at * least to hn_mbuf_hash. However, given that _all_ of the * NICs implement TCP_IPV4, this will _not_ impose any issues * here. */ if ((my_types & RSS_TYPE_IPV4) && (diff_types & ifrh.ifrh_types & (RSS_TYPE_TCP_IPV4 | RSS_TYPE_UDP_IPV4))) { /* Conflict; disable IPV4 hash type/value delivery. */ if_printf(ifp, "disable IPV4 mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_IPV4; } if ((my_types & RSS_TYPE_IPV6) && (diff_types & ifrh.ifrh_types & (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | RSS_TYPE_IPV6_EX))) { /* Conflict; disable IPV6 hash type/value delivery. */ if_printf(ifp, "disable IPV6 mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_IPV6; } if ((my_types & RSS_TYPE_IPV6_EX) && (diff_types & ifrh.ifrh_types & (RSS_TYPE_TCP_IPV6 | RSS_TYPE_UDP_IPV6 | RSS_TYPE_TCP_IPV6_EX | RSS_TYPE_UDP_IPV6_EX | RSS_TYPE_IPV6))) { /* Conflict; disable IPV6_EX hash type/value delivery. */ if_printf(ifp, "disable IPV6_EX mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_IPV6_EX; } if ((my_types & RSS_TYPE_TCP_IPV6) && (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6_EX)) { /* Conflict; disable TCP_IPV6 hash type/value delivery. */ if_printf(ifp, "disable TCP_IPV6 mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_TCP_IPV6; } if ((my_types & RSS_TYPE_TCP_IPV6_EX) && (diff_types & ifrh.ifrh_types & RSS_TYPE_TCP_IPV6)) { /* Conflict; disable TCP_IPV6_EX hash type/value delivery. */ if_printf(ifp, "disable TCP_IPV6_EX mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_TCP_IPV6_EX; } if ((my_types & RSS_TYPE_UDP_IPV6) && (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6_EX)) { /* Conflict; disable UDP_IPV6 hash type/value delivery. */ if_printf(ifp, "disable UDP_IPV6 mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_UDP_IPV6; } if ((my_types & RSS_TYPE_UDP_IPV6_EX) && (diff_types & ifrh.ifrh_types & RSS_TYPE_UDP_IPV6)) { /* Conflict; disable UDP_IPV6_EX hash type/value delivery. */ if_printf(ifp, "disable UDP_IPV6_EX mbuf hash delivery\n"); mbuf_types &= ~RSS_TYPE_UDP_IPV6_EX; } /* * Indirect table does not matter. */ sc->hn_rss_hash = (sc->hn_rss_hcap & NDIS_HASH_FUNCTION_MASK) | hn_rss_type_tondis(my_types); memcpy(sc->hn_rss.rss_key, ifrk.ifrk_key, sizeof(sc->hn_rss.rss_key)); sc->hn_flags |= HN_FLAG_HAS_RSSKEY; if (reconf) { error = hn_rss_reconfig(sc); if (error) { /* XXX roll-back? */ if_printf(ifp, "hn_rss_reconfig failed: %d\n", error); /* XXX keep going. */ } } done: /* Hash deliverability for mbufs. */ hn_rss_mbuf_hash(sc, hn_rss_type_tondis(mbuf_types)); } static void hn_vf_rss_restore(struct hn_softc *sc) { HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("%s: synthetic parts are not attached", if_name(sc->hn_ifp))); if (sc->hn_rx_ring_inuse == 1) goto done; /* * Restore hash types. Key does _not_ matter. */ if (sc->hn_rss_hash != sc->hn_rss_hcap) { int error; sc->hn_rss_hash = sc->hn_rss_hcap; error = hn_rss_reconfig(sc); if (error) { if_printf(sc->hn_ifp, "hn_rss_reconfig failed: %d\n", error); /* XXX keep going. */ } } done: /* Hash deliverability for mbufs. */ hn_rss_mbuf_hash(sc, NDIS_HASH_ALL); } static void hn_xpnt_vf_setready(struct hn_softc *sc) { if_t ifp, vf_ifp; struct ifreq ifr; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; vf_ifp = sc->hn_vf_ifp; /* * Mark the VF ready. */ sc->hn_vf_rdytick = 0; /* * Save information for restoration. */ sc->hn_saved_caps = if_getcapabilities(ifp); sc->hn_saved_tsomax = if_gethwtsomax(ifp); sc->hn_saved_tsosegcnt = if_gethwtsomaxsegcount(ifp); sc->hn_saved_tsosegsz = if_gethwtsomaxsegsize(ifp); /* * Intersect supported/enabled capabilities. * * NOTE: * if_hwassist is not changed here. */ if_setcapabilitiesbit(ifp, 0, if_getcapabilities(vf_ifp)); if_setcapenablebit(ifp, 0, if_getcapabilities(ifp)); /* * Fix TSO settings. */ if (if_gethwtsomax(ifp) > if_gethwtsomax(vf_ifp)) if_sethwtsomax(ifp, if_gethwtsomax(vf_ifp)); if (if_gethwtsomaxsegcount(ifp) > if_gethwtsomaxsegcount(vf_ifp)) if_sethwtsomaxsegcount(ifp, if_gethwtsomaxsegcount(vf_ifp)); if (if_gethwtsomaxsegsize(ifp) > if_gethwtsomaxsegsize(vf_ifp)) if_sethwtsomaxsegsize(ifp, if_gethwtsomaxsegsize(vf_ifp)); /* * Change VF's enabled capabilities. */ memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); ifr.ifr_reqcap = if_getcapenable(ifp); hn_xpnt_vf_iocsetcaps(sc, &ifr); if (if_getmtu(ifp) != ETHERMTU) { int error; /* * Change VF's MTU. */ memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, if_name(vf_ifp), sizeof(ifr.ifr_name)); ifr.ifr_mtu = if_getmtu(ifp); error = ifhwioctl(SIOCSIFMTU, vf_ifp, (caddr_t)&ifr, curthread); if (error) { if_printf(ifp, "%s SIOCSIFMTU %u failed\n", if_name(vf_ifp), if_getmtu(ifp)); if (if_getmtu(ifp) > ETHERMTU) { if_printf(ifp, "change MTU to %d\n", ETHERMTU); /* * XXX * No need to adjust the synthetic parts' MTU; * failure of the adjustment will cause us * infinite headache. */ if_setmtu(ifp, ETHERMTU); hn_mtu_change_fixup(sc); } } } } static bool hn_xpnt_vf_isready(struct hn_softc *sc) { HN_LOCK_ASSERT(sc); if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) return (false); if (sc->hn_vf_rdytick == 0) return (true); if (sc->hn_vf_rdytick > ticks) return (false); /* Mark VF as ready. */ hn_xpnt_vf_setready(sc); return (true); } static void hn_xpnt_vf_setenable(struct hn_softc *sc) { int i; HN_LOCK_ASSERT(sc); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; rm_wunlock(&sc->hn_vf_lock); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; } static void hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) { int i; HN_LOCK_ASSERT(sc); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; if (clear_vf) sc->hn_vf_ifp = NULL; rm_wunlock(&sc->hn_vf_lock); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; } static void hn_xpnt_vf_init(struct hn_softc *sc) { int error; HN_LOCK_ASSERT(sc); KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, ("%s: transparent VF was enabled", if_name(sc->hn_ifp))); if (bootverbose) { if_printf(sc->hn_ifp, "try bringing up %s\n", if_name(sc->hn_vf_ifp)); } /* * Bring the VF up. */ hn_xpnt_vf_saveifflags(sc); if_setflagbits(sc->hn_ifp, IFF_UP, 0); error = hn_xpnt_vf_iocsetflags(sc); if (error) { if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", if_name(sc->hn_vf_ifp), error); return; } /* * NOTE: * Datapath setting must happen _after_ bringing the VF up. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); /* * NOTE: * Fixup RSS related bits _after_ the VF is brought up, since * many VFs generate RSS key during it's initialization. */ hn_vf_rss_fixup(sc, true); /* Mark transparent mode VF as enabled. */ hn_xpnt_vf_setenable(sc); } static void hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) goto done; if (sc->hn_vf_ifp == NULL) goto done; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) goto done; if (sc->hn_vf_rdytick != 0) { /* Mark VF as ready. */ hn_xpnt_vf_setready(sc); } if (if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) { /* * Delayed VF initialization. */ if (bootverbose) { if_printf(sc->hn_ifp, "delayed initialize %s\n", if_name(sc->hn_vf_ifp)); } hn_xpnt_vf_init(sc); } done: HN_UNLOCK(sc); } static void hn_ifnet_attevent(void *xsc, if_t ifp) { struct hn_softc *sc = xsc; HN_LOCK(sc); if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) goto done; if (!hn_ismyvf(sc, ifp)) goto done; if (sc->hn_vf_ifp != NULL) { if_printf(sc->hn_ifp, "%s was attached as VF\n", if_name(sc->hn_vf_ifp)); goto done; } if (hn_xpnt_vf && if_getstartfn(ifp) != NULL) { /* * ifnet.if_start is _not_ supported by transparent * mode VF; mainly due to the IFF_DRV_OACTIVE flag. */ if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " "in transparent VF mode.\n", if_name(sc->hn_vf_ifp)); goto done; } rm_wlock(&hn_vfmap_lock); if (if_getindex(ifp) >= hn_vfmap_size) { if_t *newmap; int newsize; newsize = if_getindex(ifp) + HN_VFMAP_SIZE_DEF; newmap = malloc(sizeof(if_t) * newsize, M_DEVBUF, M_WAITOK | M_ZERO); memcpy(newmap, hn_vfmap, sizeof(if_t) * hn_vfmap_size); free(hn_vfmap, M_DEVBUF); hn_vfmap = newmap; hn_vfmap_size = newsize; } KASSERT(hn_vfmap[if_getindex(ifp)] == NULL, ("%s: ifindex %d was mapped to %s", if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)]))); hn_vfmap[if_getindex(ifp)] = sc->hn_ifp; rm_wunlock(&hn_vfmap_lock); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, ("%s: transparent VF was enabled", if_name(sc->hn_ifp))); sc->hn_vf_ifp = ifp; rm_wunlock(&sc->hn_vf_lock); if (hn_xpnt_vf) { int wait_ticks; /* * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. * Save vf_ifp's current if_input for later restoration. */ sc->hn_vf_input = if_getinputfn(ifp); if_setinputfn(ifp, hn_xpnt_vf_input); /* * Stop link status management; use the VF's. */ hn_suspend_mgmt(sc); /* * Give VF sometime to complete its attach routing. */ wait_ticks = hn_xpnt_vf_attwait * hz; sc->hn_vf_rdytick = ticks + wait_ticks; taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, wait_ticks); } done: HN_UNLOCK(sc); } static void hn_ifnet_detevent(void *xsc, if_t ifp) { struct hn_softc *sc = xsc; HN_LOCK(sc); if (sc->hn_vf_ifp == NULL) goto done; if (!hn_ismyvf(sc, ifp)) goto done; if (hn_xpnt_vf) { /* * Make sure that the delayed initialization is not running. * * NOTE: * - This lock _must_ be released, since the hn_vf_init task * will try holding this lock. * - It is safe to release this lock here, since the * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. * * XXX racy, if hn(4) ever detached. */ HN_UNLOCK(sc); taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); HN_LOCK(sc); KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", if_name(sc->hn_ifp))); if_setinputfn(ifp, sc->hn_vf_input); sc->hn_vf_input = NULL; if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) && (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); if (sc->hn_vf_rdytick == 0) { /* * The VF was ready; restore some settings. */ if_setcapabilities(ifp, sc->hn_saved_caps); /* * NOTE: * There is _no_ need to fixup if_capenable and * if_hwassist, since the if_capabilities before * restoration was an intersection of the VF's * if_capabilites and the synthetic device's * if_capabilites. */ if_sethwtsomax(ifp, sc->hn_saved_tsomax); if_sethwtsomaxsegcount(sc->hn_ifp, sc->hn_saved_tsosegcnt); if_sethwtsomaxsegsize(ifp, sc->hn_saved_tsosegsz); } if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { /* * Restore RSS settings. */ hn_vf_rss_restore(sc); /* * Resume link status management, which was suspended * by hn_ifnet_attevent(). */ hn_resume_mgmt(sc); } } /* Mark transparent mode VF as disabled. */ hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); rm_wlock(&hn_vfmap_lock); KASSERT(if_getindex(ifp) < hn_vfmap_size, ("ifindex %d, vfmapsize %d", if_getindex(ifp), hn_vfmap_size)); if (hn_vfmap[if_getindex(ifp)] != NULL) { KASSERT(hn_vfmap[if_getindex(ifp)] == sc->hn_ifp, ("%s: ifindex %d was mapped to %s", if_name(ifp), if_getindex(ifp), if_name(hn_vfmap[if_getindex(ifp)]))); hn_vfmap[if_getindex(ifp)] = NULL; } rm_wunlock(&hn_vfmap_lock); done: HN_UNLOCK(sc); } static void hn_ifnet_lnkevent(void *xsc, if_t ifp, int link_state) { struct hn_softc *sc = xsc; if (sc->hn_vf_ifp == ifp) if_link_state_change(sc->hn_ifp, link_state); } static int hn_tsomax_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int tsomax; int error; tsomax = if_gethwtsomax(sc->hn_ifp); error = sysctl_handle_int(oidp, &tsomax, 0, req); return error; } static int hn_tsomaxsegcnt_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int tsomaxsegcnt; int error; tsomaxsegcnt = if_gethwtsomaxsegcount(sc->hn_ifp); error = sysctl_handle_int(oidp, &tsomaxsegcnt, 0, req); return error; } static int hn_tsomaxsegsz_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int tsomaxsegsz; int error; tsomaxsegsz = if_gethwtsomaxsegsize(sc->hn_ifp); error = sysctl_handle_int(oidp, &tsomaxsegsz, 0, req); return error; } static int hn_probe(device_t dev) { if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { device_set_desc(dev, "Hyper-V Network Interface"); return BUS_PROBE_DEFAULT; } return ENXIO; } static int hn_attach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; uint8_t eaddr[ETHER_ADDR_LEN]; if_t ifp = NULL; int error, ring_cnt, tx_ring_cnt; uint32_t mtu; sc->hn_dev = dev; sc->hn_prichan = vmbus_get_channel(dev); HN_LOCK_INIT(sc); rm_init(&sc->hn_vf_lock, "hnvf"); if (hn_xpnt_vf && hn_xpnt_vf_accbpf) sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; /* * Initialize these tunables once. */ sc->hn_agg_size = hn_tx_agg_size; sc->hn_agg_pkts = hn_tx_agg_pkts; /* * Setup taskqueue for transmission. */ if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { int i; sc->hn_tx_taskqs = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), M_DEVBUF, M_WAITOK); for (i = 0; i < hn_tx_taskq_cnt; ++i) { sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_tx_taskqs[i]); taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, "%s tx%d", device_get_nameunit(dev), i); } } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { sc->hn_tx_taskqs = hn_tx_taskque; } /* * Setup taskqueue for mangement tasks, e.g. link status. */ sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", device_get_nameunit(dev)); TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, hn_netchg_status_taskfunc, sc); if (hn_xpnt_vf) { /* * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. */ sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_vf_taskq); taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", device_get_nameunit(dev)); TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, hn_xpnt_vf_init_taskfunc, sc); } /* * Allocate ifnet and setup its name earlier, so that if_printf * can be used by functions, which will be called after * ether_ifattach(). */ ifp = sc->hn_ifp = if_alloc(IFT_ETHER); if_setsoftc(ifp, sc); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); /* * Initialize ifmedia earlier so that it can be unconditionally * destroyed, if error happened later on. */ ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); /* * Figure out the # of RX rings (ring_cnt) and the # of TX rings * to use (tx_ring_cnt). * * NOTE: * The # of RX rings to use is same as the # of channels to use. */ ring_cnt = hn_chan_cnt; if (ring_cnt <= 0) { /* Default */ ring_cnt = mp_ncpus; if (ring_cnt > HN_RING_CNT_DEF_MAX) ring_cnt = HN_RING_CNT_DEF_MAX; } else if (ring_cnt > mp_ncpus) { ring_cnt = mp_ncpus; } #ifdef RSS if (ring_cnt > rss_getnumbuckets()) ring_cnt = rss_getnumbuckets(); #endif tx_ring_cnt = hn_tx_ring_cnt; if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) tx_ring_cnt = ring_cnt; #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { /* ifnet.if_start only needs one TX ring. */ tx_ring_cnt = 1; } #endif /* * Set the leader CPU for channels. */ sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; /* * Create enough TX/RX rings, even if only limited number of * channels can be allocated. */ error = hn_create_tx_data(sc, tx_ring_cnt); if (error) goto failed; error = hn_create_rx_data(sc, ring_cnt); if (error) goto failed; /* * Create transaction context for NVS and RNDIS transactions. */ sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); if (sc->hn_xact == NULL) { error = ENXIO; goto failed; } /* * Install orphan handler for the revocation of this device's * primary channel. * * NOTE: * The processing order is critical here: * Install the orphan handler, _before_ testing whether this * device's primary channel has been revoked or not. */ vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); if (vmbus_chan_is_revoked(sc->hn_prichan)) { error = ENXIO; goto failed; } /* * Attach the synthetic parts, i.e. NVS and RNDIS. */ error = hn_synth_attach(sc, ETHERMTU); if (error) goto failed; error = hn_rndis_get_eaddr(sc, eaddr); if (error) goto failed; error = hn_rndis_get_mtu(sc, &mtu); if (error) mtu = ETHERMTU; else if (bootverbose) device_printf(dev, "RNDIS mtu %u\n", mtu); if (sc->hn_rx_ring_inuse > 1) { /* * Reduce TCP segment aggregation limit for multiple * RX rings to increase ACK timeliness. */ hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); } /* * Fixup TX/RX stuffs after synthetic parts are attached. */ hn_fixup_tx_data(sc); hn_fixup_rx_data(sc); ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, &sc->hn_nvs_ver, 0, "NVS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_ndis_version_sysctl, "A", "NDIS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_caps_sysctl, "A", "capabilities"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_hwassist_sysctl, "A", "hwassist"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_max", CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomax_sysctl, "IU", "max TSO size"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegcnt", CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegcnt_sysctl, "IU", "max # of TSO segments"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tso_maxsegsz", CTLTYPE_UINT | CTLFLAG_RD, sc, 0, hn_tsomaxsegsz_sysctl, "IU", "max size of TSO segment"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rxfilter_sysctl, "A", "rxfilter"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_hash_sysctl, "A", "RSS hash"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hashcap", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_hcap_sysctl, "A", "RSS hash capabilities"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "mbuf_hash", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_mbuf_sysctl, "A", "RSS hash for mbufs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); #ifndef RSS /* * Don't allow RSS key/indirect table changes, if RSS is defined. */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_key_sysctl, "IU", "RSS key"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_ind_sysctl, "IU", "RSS indirect table"); #endif SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, "RNDIS offered packet transmission aggregation size limit"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, "RNDIS offered packet transmission aggregation count limit"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, "RNDIS packet transmission aggregation alignment"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_txagg_size_sysctl, "I", "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_txagg_pkts_sysctl, "I", "Packet transmission aggregation packets, " "0 -- disable, -1 -- auto"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_polling_sysctl, "I", "Polling frequency: [100,1000000], 0 disable polling"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_vf_sysctl, "A", "Virtual Function's name"); if (!hn_xpnt_vf) { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rxvf_sysctl, "A", "activated Virtual Function's name"); } else { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_xpnt_vf_enabled_sysctl, "I", "Transparent VF enabled"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_xpnt_vf_accbpf_sysctl, "I", "Accurate BPF for transparent VF"); } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rsc_switch", CTLTYPE_UINT | CTLFLAG_RW, sc, 0, hn_rsc_sysctl, "A", "switch to rsc"); /* * Setup the ifmedia, which has been initialized earlier. */ ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); /* XXX ifmedia_set really should do this for us */ sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; /* * Setup the ifnet for this interface. */ if_setbaudrate(ifp, IF_Gbps(10)); if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); if_setioctlfn(ifp, hn_ioctl); if_setinitfn(ifp, hn_init); #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); if_setstartfn(ifp, hn_start); if_setsendqlen(ifp, qdepth); if_setsendqready(ifp); } else #endif { if_settransmitfn(ifp, hn_transmit); if_setqflushfn(ifp, hn_xmit_qflush); } if_setcapabilitiesbit(ifp, IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE, 0); #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ if_setcapabilitiesbit(ifp, IFCAP_RXCSUM_IPV6, 0); #endif if (sc->hn_caps & HN_CAP_VLAN) { /* XXX not sure about VLAN_MTU. */ if_setcapabilitiesbit(ifp, IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU, 0); } if_sethwassist(ifp, sc->hn_tx_ring[0].hn_csum_assist); if (if_gethwassist(ifp) & HN_CSUM_IP_MASK) if_setcapabilitiesbit(ifp, IFCAP_TXCSUM, 0); if (if_gethwassist(ifp) & HN_CSUM_IP6_MASK) if_setcapabilitiesbit(ifp, IFCAP_TXCSUM_IPV6, 0); if (sc->hn_caps & HN_CAP_TSO4) { if_setcapabilitiesbit(ifp, IFCAP_TSO4, 0); if_sethwassistbits(ifp, CSUM_IP_TSO, 0); } if (sc->hn_caps & HN_CAP_TSO6) { if_setcapabilitiesbit(ifp, IFCAP_TSO6, 0); if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); } /* Enable all available capabilities by default. */ if_setcapenable(ifp, if_getcapabilities(ifp)); /* * Disable IPv6 TSO and TXCSUM by default, they still can * be enabled through SIOCSIFCAP. */ if_setcapenablebit(ifp, 0, (IFCAP_TXCSUM_IPV6 | IFCAP_TSO6)); if_sethwassistbits(ifp, 0, (HN_CSUM_IP6_MASK | CSUM_IP6_TSO)); if (if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) { /* * Lock hn_set_tso_maxsize() to simplify its * internal logic. */ HN_LOCK(sc); hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); HN_UNLOCK(sc); if_sethwtsomaxsegcount(ifp, HN_TX_DATA_SEGCNT_MAX); if_sethwtsomaxsegsize(ifp, PAGE_SIZE); } ether_ifattach(ifp, eaddr); if ((if_getcapabilities(ifp) & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { if_printf(ifp, "TSO segcnt %u segsz %u\n", if_gethwtsomaxsegcount(ifp), if_gethwtsomaxsegsize(ifp)); } if (mtu < ETHERMTU) { if_setmtu(ifp, mtu); } /* Inform the upper layer about the long frame support. */ if_setifheaderlen(ifp, sizeof(struct ether_vlan_header)); /* * Kick off link status check. */ sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; hn_update_link_status(sc); if (!hn_xpnt_vf) { sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); } else { sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); } /* * NOTE: * Subscribe ether_ifattach event, instead of ifnet_arrival event, * since interface's LLADDR is needed; interface LLADDR is not * available when ifnet_arrival event is triggered. */ sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); return (0); failed: if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) hn_synth_detach(sc); hn_detach(dev); return (error); } static int hn_detach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); if_t ifp = sc->hn_ifp, vf_ifp; if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { /* * In case that the vmbus missed the orphan handler * installation. */ vmbus_xact_ctx_orphan(sc->hn_xact); } if (sc->hn_ifaddr_evthand != NULL) EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); if (sc->hn_ifnet_evthand != NULL) EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); if (sc->hn_ifnet_atthand != NULL) { EVENTHANDLER_DEREGISTER(ether_ifattach_event, sc->hn_ifnet_atthand); } if (sc->hn_ifnet_dethand != NULL) { EVENTHANDLER_DEREGISTER(ifnet_departure_event, sc->hn_ifnet_dethand); } if (sc->hn_ifnet_lnkhand != NULL) EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); vf_ifp = sc->hn_vf_ifp; __compiler_membar(); if (vf_ifp != NULL) hn_ifnet_detevent(sc, vf_ifp); if (device_is_attached(dev)) { HN_LOCK(sc); if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) hn_stop(sc, true); /* * NOTE: * hn_stop() only suspends data, so managment * stuffs have to be suspended manually here. */ hn_suspend_mgmt(sc); hn_synth_detach(sc); } HN_UNLOCK(sc); ether_ifdetach(ifp); } ifmedia_removeall(&sc->hn_media); hn_destroy_rx_data(sc); hn_destroy_tx_data(sc); if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { int i; for (i = 0; i < hn_tx_taskq_cnt; ++i) taskqueue_free(sc->hn_tx_taskqs[i]); free(sc->hn_tx_taskqs, M_DEVBUF); } taskqueue_free(sc->hn_mgmt_taskq0); if (sc->hn_vf_taskq != NULL) taskqueue_free(sc->hn_vf_taskq); if (sc->hn_xact != NULL) { /* * Uninstall the orphan handler _before_ the xact is * destructed. */ vmbus_chan_unset_orphan(sc->hn_prichan); vmbus_xact_ctx_destroy(sc->hn_xact); } if_free(ifp); HN_LOCK_DESTROY(sc); rm_destroy(&sc->hn_vf_lock); return (0); } static int hn_shutdown(device_t dev) { return (0); } static void hn_link_status(struct hn_softc *sc) { uint32_t link_status; int error; error = hn_rndis_get_linkstatus(sc, &link_status); if (error) { /* XXX what to do? */ return; } if (link_status == NDIS_MEDIA_STATE_CONNECTED) sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; else sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; if_link_state_change(sc->hn_ifp, (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? LINK_STATE_UP : LINK_STATE_DOWN); } static void hn_link_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) return; hn_link_status(sc); } static void hn_netchg_init_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; /* Prevent any link status checks from running. */ sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; /* * Fake up a [link down --> link up] state change; 5 seconds * delay is used, which closely simulates miibus reaction * upon link down event. */ sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 5 * hz); } static void hn_netchg_status_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; /* Re-allow link status checks. */ sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; hn_link_status(sc); } static void hn_update_link_status(struct hn_softc *sc) { if (sc->hn_mgmt_taskq != NULL) taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); } static void hn_change_network(struct hn_softc *sc) { if (sc->hn_mgmt_taskq != NULL) taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); } static __inline int hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) { struct mbuf *m = *m_head; int error; KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { struct mbuf *m_new; m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); if (m_new == NULL) return ENOBUFS; else *m_head = m = m_new; txr->hn_tx_collapsed++; error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); } if (!error) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_PREWRITE); txd->flags |= HN_TXD_FLAG_DMAMAP; } return error; } static __inline int hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, ("put an onlist txd %#x", txd->flags)); KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("put an onagg txd %#x", txd->flags)); KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); if (atomic_fetchadd_int(&txd->refs, -1) != 1) return 0; if (!STAILQ_EMPTY(&txd->agg_list)) { struct hn_txdesc *tmp_txd; while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { int freed __diagused; KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), ("resursive aggregation on aggregated txdesc")); KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), ("not aggregated txdesc")); KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("aggregated txdesc uses dmamap")); KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("aggregated txdesc consumes " "chimney sending buffer")); KASSERT(tmp_txd->chim_size == 0, ("aggregated txdesc has non-zero " "chimney sending size")); STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; freed = hn_txdesc_put(txr, tmp_txd); KASSERT(freed, ("failed to free aggregated txdesc")); } } if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("chim txd uses dmamap")); hn_chim_free(txr->hn_sc, txd->chim_index); txd->chim_index = HN_NVS_CHIM_IDX_INVALID; txd->chim_size = 0; } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->hn_tx_data_dtag, txd->data_dmap); txd->flags &= ~HN_TXD_FLAG_DMAMAP; } if (txd->m != NULL) { m_freem(txd->m); txd->m = NULL; } txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); KASSERT(txr->hn_txdesc_avail >= 0 && txr->hn_txdesc_avail < txr->hn_txdesc_cnt, ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail++; SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); mtx_unlock_spin(&txr->hn_txlist_spin); #else /* HN_USE_TXDESC_BUFRING */ #ifdef HN_DEBUG atomic_add_int(&txr->hn_txdesc_avail, 1); #endif buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif /* !HN_USE_TXDESC_BUFRING */ return 1; } static __inline struct hn_txdesc * hn_txdesc_get(struct hn_tx_ring *txr) { struct hn_txdesc *txd; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); txd = SLIST_FIRST(&txr->hn_txlist); if (txd != NULL) { KASSERT(txr->hn_txdesc_avail > 0, ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail--; SLIST_REMOVE_HEAD(&txr->hn_txlist, link); } mtx_unlock_spin(&txr->hn_txlist_spin); #else txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); #endif if (txd != NULL) { #ifdef HN_USE_TXDESC_BUFRING #ifdef HN_DEBUG atomic_subtract_int(&txr->hn_txdesc_avail, 1); #endif #endif /* HN_USE_TXDESC_BUFRING */ KASSERT(txd->m == NULL && txd->refs == 0 && STAILQ_EMPTY(&txd->agg_list) && txd->chim_index == HN_NVS_CHIM_IDX_INVALID && txd->chim_size == 0 && (txd->flags & HN_TXD_FLAG_ONLIST) && (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); txd->flags &= ~HN_TXD_FLAG_ONLIST; txd->refs = 1; } return txd; } static __inline void hn_txdesc_hold(struct hn_txdesc *txd) { /* 0->1 transition will never work */ KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); atomic_add_int(&txd->refs, 1); } static __inline void hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) { KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("recursive aggregation on aggregating txdesc")); KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("already aggregated")); KASSERT(STAILQ_EMPTY(&txd->agg_list), ("recursive aggregation on to-be-aggregated txdesc")); txd->flags |= HN_TXD_FLAG_ONAGG; STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); } static bool hn_tx_ring_pending(struct hn_tx_ring *txr) { bool pending = false; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) pending = true; mtx_unlock_spin(&txr->hn_txlist_spin); #else if (!buf_ring_full(txr->hn_txdesc_br)) pending = true; #endif return (pending); } static __inline void hn_txeof(struct hn_tx_ring *txr) { txr->hn_has_txeof = 0; txr->hn_txeof(txr); } static void hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, struct vmbus_channel *chan, const void *data __unused, int dlen __unused) { struct hn_txdesc *txd = sndc->hn_cbarg; struct hn_tx_ring *txr; txr = txd->txr; KASSERT(txr->hn_chan == chan, ("channel mismatch, on chan%u, should be chan%u", vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); txr->hn_has_txeof = 1; hn_txdesc_put(txr, txd); ++txr->hn_txdone_cnt; if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { txr->hn_txdone_cnt = 0; if (txr->hn_oactive) hn_txeof(txr); } } static void hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) { #if defined(INET) || defined(INET6) struct epoch_tracker et; NET_EPOCH_ENTER(et); tcp_lro_flush_all(&rxr->hn_lro); NET_EPOCH_EXIT(et); #endif /* * NOTE: * 'txr' could be NULL, if multiple channels and * ifnet.if_start method are enabled. */ if (txr == NULL || !txr->hn_has_txeof) return; txr->hn_txdone_cnt = 0; hn_txeof(txr); } static __inline uint32_t hn_rndis_pktmsg_offset(uint32_t ofs) { KASSERT(ofs >= sizeof(struct rndis_packet_msg), ("invalid RNDIS packet msg offset %u", ofs)); return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); } static __inline void * hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, size_t pi_dlen, uint32_t pi_type) { const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); struct rndis_pktinfo *pi; KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); /* * Per-packet-info does not move; it only grows. * * NOTE: * rm_pktinfooffset in this phase counts from the beginning * of rndis_packet_msg. */ KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, ("%u pktinfo overflows RNDIS packet msg", pi_type)); pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + pkt->rm_pktinfolen); pkt->rm_pktinfolen += pi_size; pi->rm_size = pi_size; pi->rm_type = pi_type; pi->rm_internal = 0; pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; return (pi->rm_data); } static __inline int hn_flush_txagg(if_t ifp, struct hn_tx_ring *txr) { struct hn_txdesc *txd; struct mbuf *m; int error, pkts; txd = txr->hn_agg_txd; KASSERT(txd != NULL, ("no aggregate txdesc")); /* * Since hn_txpkt() will reset this temporary stat, save * it now, so that oerrors can be updated properly, if * hn_txpkt() ever fails. */ pkts = txr->hn_stat_pkts; /* * Since txd's mbuf will _not_ be freed upon hn_txpkt() * failure, save it for later freeing, if hn_txpkt() ever * fails. */ m = txd->m; error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m is not. */ m_freem(m); txr->hn_flush_failed++; if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); } /* Reset all aggregation states. */ txr->hn_agg_txd = NULL; txr->hn_agg_szleft = 0; txr->hn_agg_pktleft = 0; txr->hn_agg_prevpkt = NULL; return (error); } static void * hn_try_txagg(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, int pktsize) { void *chim; if (txr->hn_agg_txd != NULL) { if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { struct hn_txdesc *agg_txd = txr->hn_agg_txd; struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; int olen; /* * Update the previous RNDIS packet's total length, * it can be increased due to the mandatory alignment * padding for this RNDIS packet. And update the * aggregating txdesc's chimney sending buffer size * accordingly. * * XXX * Zero-out the padding, as required by the RNDIS spec. */ olen = pkt->rm_len; pkt->rm_len = roundup2(olen, txr->hn_agg_align); agg_txd->chim_size += pkt->rm_len - olen; /* Link this txdesc to the parent. */ hn_txdesc_agg(agg_txd, txd); chim = (uint8_t *)pkt + pkt->rm_len; /* Save the current packet for later fixup. */ txr->hn_agg_prevpkt = chim; txr->hn_agg_pktleft--; txr->hn_agg_szleft -= pktsize; if (txr->hn_agg_szleft <= HN_PKTSIZE_MIN(txr->hn_agg_align)) { /* * Probably can't aggregate more packets, * flush this aggregating txdesc proactively. */ txr->hn_agg_pktleft = 0; } /* Done! */ return (chim); } hn_flush_txagg(ifp, txr); } KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); txr->hn_tx_chimney_tried++; txd->chim_index = hn_chim_alloc(txr->hn_sc); if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) return (NULL); txr->hn_tx_chimney++; chim = txr->hn_sc->hn_chim + (txd->chim_index * txr->hn_sc->hn_chim_szmax); if (txr->hn_agg_pktmax > 1 && txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { txr->hn_agg_txd = txd; txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; txr->hn_agg_prevpkt = chim; } return (chim); } /* * NOTE: * If this function fails, then both txd and m_head0 will be freed. */ static int hn_encap(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) { bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; int error, nsegs, i; struct mbuf *m_head = *m_head0; struct rndis_packet_msg *pkt; uint32_t *pi_data; void *chim = NULL; int pkt_hlen, pkt_size; pkt = txd->rndis_pkt; pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); if (pkt_size < txr->hn_chim_size) { chim = hn_try_txagg(ifp, txr, txd, pkt_size); if (chim != NULL) pkt = chim; } else { if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); } pkt->rm_type = REMOTE_NDIS_PACKET_MSG; pkt->rm_len = m_head->m_pkthdr.len; pkt->rm_dataoffset = 0; pkt->rm_datalen = m_head->m_pkthdr.len; pkt->rm_oobdataoffset = 0; pkt->rm_oobdatalen = 0; pkt->rm_oobdataelements = 0; pkt->rm_pktinfooffset = sizeof(*pkt); pkt->rm_pktinfolen = 0; pkt->rm_vchandle = 0; pkt->rm_reserved = 0; if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { /* * Set the hash value for this packet. */ pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); if (M_HASHTYPE_ISHASH(m_head)) /* * The flowid field contains the hash value host * set in the rx queue if it is a ip forwarding pkt. * Set the same hash value so host can send on the * cpu it was received. */ *pi_data = m_head->m_pkthdr.flowid; else /* * Otherwise just put the tx queue index. */ *pi_data = txr->hn_tx_idx; } if (m_head->m_flags & M_VLANTAG) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); *pi_data = NDIS_VLAN_INFO_MAKE( EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { #if defined(INET6) || defined(INET) pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { *pi_data = NDIS_LSO2_INFO_MAKEIPV4( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, m_head->m_pkthdr.tso_segsz); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { *pi_data = NDIS_LSO2_INFO_MAKEIPV6( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen, m_head->m_pkthdr.tso_segsz); } #endif #endif /* INET6 || INET */ } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); if (m_head->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP)) { *pi_data = NDIS_TXCSUM_INFO_IPV6; } else { *pi_data = NDIS_TXCSUM_INFO_IPV4; if (m_head->m_pkthdr.csum_flags & CSUM_IP) *pi_data |= NDIS_TXCSUM_INFO_IPCS; } if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) { *pi_data |= NDIS_TXCSUM_INFO_MKTCPCS( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); } else if (m_head->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP6_UDP)) { *pi_data |= NDIS_TXCSUM_INFO_MKUDPCS( m_head->m_pkthdr.l2hlen + m_head->m_pkthdr.l3hlen); } } pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; /* Fixup RNDIS packet message total length */ pkt->rm_len += pkt_hlen; /* Convert RNDIS packet message offsets */ pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); /* * Fast path: Chimney sending. */ if (chim != NULL) { struct hn_txdesc *tgt_txd = txd; if (txr->hn_agg_txd != NULL) { tgt_txd = txr->hn_agg_txd; #ifdef INVARIANTS *m_head0 = NULL; #endif } KASSERT(pkt == chim, ("RNDIS pkt not in chimney sending buffer")); KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, ("chimney sending buffer is not used")); tgt_txd->chim_size += pkt->rm_len; m_copydata(m_head, 0, m_head->m_pkthdr.len, ((uint8_t *)chim) + pkt_hlen); txr->hn_gpa_cnt = 0; txr->hn_sendpkt = hn_txpkt_chim; goto done; } KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("chimney buffer is used")); KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); if (__predict_false(error)) { int freed __diagused; /* * This mbuf is not linked w/ the txd yet, so free it now. */ m_freem(m_head); *m_head0 = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon txdma error")); txr->hn_txdma_failed++; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return error; } *m_head0 = m_head; /* +1 RNDIS packet message */ txr->hn_gpa_cnt = nsegs + 1; /* send packet with page buffer */ txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; txr->hn_gpa[0].gpa_len = pkt_hlen; /* * Fill the page buffers with mbuf info after the page * buffer for RNDIS packet message. */ for (i = 0; i < nsegs; ++i) { struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; gpa->gpa_page = atop(segs[i].ds_addr); gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; gpa->gpa_len = segs[i].ds_len; } txd->chim_index = HN_NVS_CHIM_IDX_INVALID; txd->chim_size = 0; txr->hn_sendpkt = hn_txpkt_sglist; done: txd->m = m_head; /* Set the completion routine */ hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); /* Update temporary stats for later use. */ txr->hn_stat_pkts++; txr->hn_stat_size += m_head->m_pkthdr.len; if (m_head->m_flags & M_MCAST) txr->hn_stat_mcasts++; return 0; } /* * NOTE: * If this function fails, then txd will be freed, but the mbuf * associated w/ the txd will _not_ be freed. */ static int hn_txpkt(if_t ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) { int error, send_failed = 0, has_bpf; again: has_bpf = bpf_peers_present(if_getbpf(ifp)); if (has_bpf) { /* * Make sure that this txd and any aggregated txds are not * freed before ETHER_BPF_MTAP. */ hn_txdesc_hold(txd); } error = txr->hn_sendpkt(txr, txd); if (!error) { if (has_bpf) { const struct hn_txdesc *tmp_txd; ETHER_BPF_MTAP(ifp, txd->m); STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) ETHER_BPF_MTAP(ifp, tmp_txd->m); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { if_inc_counter(ifp, IFCOUNTER_OBYTES, txr->hn_stat_size); if (txr->hn_stat_mcasts != 0) { if_inc_counter(ifp, IFCOUNTER_OMCASTS, txr->hn_stat_mcasts); } } txr->hn_pkts += txr->hn_stat_pkts; txr->hn_sends++; } if (has_bpf) hn_txdesc_put(txr, txd); if (__predict_false(error)) { int freed __diagused; /* * This should "really rarely" happen. * * XXX Too many RX to be acked or too many sideband * commands to run? Ask netvsc_channel_rollup() * to kick start later. */ txr->hn_has_txeof = 1; if (!send_failed) { txr->hn_send_failed++; send_failed = 1; /* * Try sending again after set hn_has_txeof; * in case that we missed the last * netvsc_channel_rollup(). */ goto again; } if_printf(ifp, "send failed\n"); /* * Caller will perform further processing on the * associated mbuf, so don't free it in hn_txdesc_put(); * only unload it from the DMA map in hn_txdesc_put(), * if it was loaded. */ txd->m = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon send error")); txr->hn_send_failed++; } /* Reset temporary stats, after this sending is done. */ txr->hn_stat_size = 0; txr->hn_stat_pkts = 0; txr->hn_stat_mcasts = 0; return (error); } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. * There should be an equivalent in the kernel mbuf code, * but there does not appear to be one yet. * * Differs from m_append() in that additional mbufs are * allocated with cluster size MJUMPAGESIZE, and filled * accordingly. * * Return the last mbuf in the chain or NULL if failed to * allocate new mbuf. */ static struct mbuf * hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space; remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); if (n == NULL) return NULL; n->m_len = min(MJUMPAGESIZE, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len; remainder -= n->m_len; m->m_next = n; m = n; } return m; } #if defined(INET) || defined(INET6) static __inline int hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) { if (hn_lro_mbufq_depth) { tcp_lro_queue_mbuf(lc, m); return 0; } return tcp_lro_rx(lc, m, 0); } #endif static int hn_rxpkt(struct hn_rx_ring *rxr) { if_t ifp, hn_ifp = rxr->hn_ifp; struct mbuf *m_new, *n; int size, do_lro = 0, do_csum = 1, is_vf = 0; int hash_type = M_HASHTYPE_NONE; int l3proto = ETHERTYPE_MAX, l4proto = IPPROTO_DONE; int i; ifp = hn_ifp; if (rxr->hn_rxvf_ifp != NULL) { /* * Non-transparent mode VF; pretend this packet is from * the VF. */ ifp = rxr->hn_rxvf_ifp; is_vf = 1; } else if (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF) { /* Transparent mode VF. */ is_vf = 1; } if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { /* * NOTE: * See the NOTE of hn_rndis_init_fixat(). This * function can be reached, immediately after the * RNDIS is initialized but before the ifnet is * setup on the hn_attach() path; drop the unexpected * packets. */ return (0); } if (__predict_false(rxr->rsc.pktlen < ETHER_HDR_LEN)) { if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); return (0); } if (rxr->rsc.cnt == 1 && rxr->rsc.pktlen <= MHLEN) { m_new = m_gethdr(M_NOWAIT, MT_DATA); if (m_new == NULL) { if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); return (0); } memcpy(mtod(m_new, void *), rxr->rsc.frag_data[0], rxr->rsc.frag_len[0]); m_new->m_pkthdr.len = m_new->m_len = rxr->rsc.frag_len[0]; } else { /* * Get an mbuf with a cluster. For packets 2K or less, * get a standard 2K cluster. For anything larger, get a * 4K cluster. Any buffers larger than 4K can cause problems * if looped around to the Hyper-V TX channel, so avoid them. */ size = MCLBYTES; if (rxr->rsc.pktlen > MCLBYTES) { /* 4096 */ size = MJUMPAGESIZE; } m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) { if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); return (0); } n = m_new; for (i = 0; i < rxr->rsc.cnt; i++) { n = hv_m_append(n, rxr->rsc.frag_len[i], rxr->rsc.frag_data[i]); if (n == NULL) { if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); return (0); } else { m_new->m_pkthdr.len += rxr->rsc.frag_len[i]; } } } if (rxr->rsc.pktlen <= MHLEN) rxr->hn_small_pkts++; m_new->m_pkthdr.rcvif = ifp; if (__predict_false((if_getcapenable(hn_ifp) & IFCAP_RXCSUM) == 0)) do_csum = 0; /* receive side checksum offload */ if (rxr->rsc.csum_info != NULL) { /* IP csum offload */ if ((*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); rxr->hn_csum_ip++; } /* TCP/UDP csum offload */ if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_UDPCS_OK | NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; if (*(rxr->rsc.csum_info) & NDIS_RXCSUM_INFO_TCPCS_OK) rxr->hn_csum_tcp++; else rxr->hn_csum_udp++; } /* * XXX * As of this write (Oct 28th, 2016), host side will turn * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so * the do_lro setting here is actually _not_ accurate. We * depend on the RSS hash type check to reset do_lro. */ if ((*(rxr->rsc.csum_info) & (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) do_lro = 1; } else { hn_rxpkt_proto(m_new, &l3proto, &l4proto); if (l3proto == ETHERTYPE_IP) { if (l4proto == IPPROTO_TCP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_TCP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } do_lro = 1; } else if (l4proto == IPPROTO_UDP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_UDP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } } else if (l4proto != IPPROTO_DONE && do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); } } } if (rxr->rsc.vlan_info != NULL) { m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( NDIS_VLAN_INFO_ID(*(rxr->rsc.vlan_info)), NDIS_VLAN_INFO_PRI(*(rxr->rsc.vlan_info)), NDIS_VLAN_INFO_CFI(*(rxr->rsc.vlan_info))); m_new->m_flags |= M_VLANTAG; } /* * If VF is activated (tranparent/non-transparent mode does not * matter here). * * - Disable LRO * * hn(4) will only receive broadcast packets, multicast packets, * TCP SYN and SYN|ACK (in Azure), LRO is useless for these * packet types. * * For non-transparent, we definitely _cannot_ enable LRO at * all, since the LRO flush will use hn(4) as the receiving * interface; i.e. hn_ifp->if_input(hn_ifp, m). */ if (is_vf) do_lro = 0; /* * If VF is activated (tranparent/non-transparent mode does not * matter here), do _not_ mess with unsupported hash types or * functions. */ if (rxr->rsc.hash_info != NULL) { rxr->hn_rss_pkts++; m_new->m_pkthdr.flowid = *(rxr->rsc.hash_value); if (!is_vf) hash_type = M_HASHTYPE_OPAQUE_HASH; if ((*(rxr->rsc.hash_info) & NDIS_HASH_FUNCTION_MASK) == NDIS_HASH_FUNCTION_TOEPLITZ) { uint32_t type = (*(rxr->rsc.hash_info) & NDIS_HASH_TYPE_MASK & rxr->hn_mbuf_hash); /* * NOTE: * do_lro is resetted, if the hash types are not TCP * related. See the comment in the above csum_flags * setup section. */ switch (type) { case NDIS_HASH_IPV4: hash_type = M_HASHTYPE_RSS_IPV4; do_lro = 0; break; case NDIS_HASH_TCP_IPV4: hash_type = M_HASHTYPE_RSS_TCP_IPV4; if (rxr->hn_rx_flags & HN_RX_FLAG_UDP_HASH) { int def_htype = M_HASHTYPE_OPAQUE_HASH; if (is_vf) def_htype = M_HASHTYPE_NONE; /* * UDP 4-tuple hash is delivered as * TCP 4-tuple hash. */ if (l3proto == ETHERTYPE_MAX) { hn_rxpkt_proto(m_new, &l3proto, &l4proto); } if (l3proto == ETHERTYPE_IP) { if (l4proto == IPPROTO_UDP && (rxr->hn_mbuf_hash & NDIS_HASH_UDP_IPV4_X)) { hash_type = M_HASHTYPE_RSS_UDP_IPV4; do_lro = 0; } else if (l4proto != IPPROTO_TCP) { hash_type = def_htype; do_lro = 0; } } else { hash_type = def_htype; do_lro = 0; } } break; case NDIS_HASH_IPV6: hash_type = M_HASHTYPE_RSS_IPV6; do_lro = 0; break; case NDIS_HASH_IPV6_EX: hash_type = M_HASHTYPE_RSS_IPV6_EX; do_lro = 0; break; case NDIS_HASH_TCP_IPV6: hash_type = M_HASHTYPE_RSS_TCP_IPV6; break; case NDIS_HASH_TCP_IPV6_EX: hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; break; } } } else if (!is_vf) { m_new->m_pkthdr.flowid = rxr->hn_rx_idx; hash_type = M_HASHTYPE_OPAQUE; } M_HASHTYPE_SET(m_new, hash_type); if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if (hn_ifp != ifp) { const struct ether_header *eh; /* * Non-transparent mode VF is activated. */ /* * Allow tapping on hn(4). */ ETHER_BPF_MTAP(hn_ifp, m_new); /* * Update hn(4)'s stats. */ if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); /* Checked at the beginning of this function. */ KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); eh = mtod(m_new, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); } rxr->hn_pkts++; if ((if_getcapenable(hn_ifp) & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxr->hn_lro; if (lro->lro_cnt) { rxr->hn_lro_tried++; if (hn_lro_rx(lro, m_new) == 0) { /* DONE! */ return 0; } } #endif } if_input(ifp, m_new); return (0); } static int hn_ioctl(if_t ifp, u_long cmd, caddr_t data) { struct hn_softc *sc = if_getsoftc(ifp); struct ifreq *ifr = (struct ifreq *)data, ifr_vf; if_t vf_ifp; int mask, error = 0; struct ifrsskey *ifrk; struct ifrsshash *ifrh; uint32_t mtu; switch (cmd) { case SIOCSIFMTU: if (ifr->ifr_mtu > HN_MTU_MAX) { error = EINVAL; break; } HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if ((sc->hn_caps & HN_CAP_MTU) == 0) { /* Can't change MTU */ HN_UNLOCK(sc); error = EOPNOTSUPP; break; } if (if_getmtu(ifp) == ifr->ifr_mtu) { HN_UNLOCK(sc); break; } if (hn_xpnt_vf_isready(sc)) { vf_ifp = sc->hn_vf_ifp; ifr_vf = *ifr; strlcpy(ifr_vf.ifr_name, if_name(vf_ifp), sizeof(ifr_vf.ifr_name)); error = ifhwioctl(SIOCSIFMTU,vf_ifp, (caddr_t)&ifr_vf, curthread); if (error) { HN_UNLOCK(sc); if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", if_name(vf_ifp), ifr->ifr_mtu, error); break; } } /* * Suspend this interface before the synthetic parts * are ripped. */ hn_suspend(sc); /* * Detach the synthetics parts, i.e. NVS and RNDIS. */ hn_synth_detach(sc); /* * Reattach the synthetic parts, i.e. NVS and RNDIS, * with the new MTU setting. */ error = hn_synth_attach(sc, ifr->ifr_mtu); if (error) { HN_UNLOCK(sc); break; } error = hn_rndis_get_mtu(sc, &mtu); if (error) mtu = ifr->ifr_mtu; else if (bootverbose) if_printf(ifp, "RNDIS mtu %u\n", mtu); /* * Commit the requested MTU, after the synthetic parts * have been successfully attached. */ if (mtu >= ifr->ifr_mtu) { mtu = ifr->ifr_mtu; } else { if_printf(ifp, "fixup mtu %d -> %u\n", ifr->ifr_mtu, mtu); } if_setmtu(ifp, mtu); /* * Synthetic parts' reattach may change the chimney * sending size; update it. */ if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) hn_set_chim_size(sc, sc->hn_chim_szmax); /* * Make sure that various parameters based on MTU are * still valid, after the MTU change. */ hn_mtu_change_fixup(sc); /* * All done! Resume the interface now. */ hn_resume(sc); if ((sc->hn_flags & HN_FLAG_RXVF) || (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { /* * Since we have reattached the NVS part, * change the datapath to VF again; in case * that it is lost, after the NVS was detached. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); } HN_UNLOCK(sc); break; case SIOCSIFFLAGS: HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (hn_xpnt_vf_isready(sc)) hn_xpnt_vf_saveifflags(sc); if (if_getflags(ifp) & IFF_UP) { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { /* * Caller meight hold mutex, e.g. * bpf; use busy-wait for the RNDIS * reply. */ HN_NO_SLEEPING(sc); hn_rxfilter_config(sc); HN_SLEEPING_OK(sc); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) error = hn_xpnt_vf_iocsetflags(sc); } else { hn_init_locked(sc); } } else { if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) hn_stop(sc, false); } sc->hn_if_flags = if_getflags(ifp); HN_UNLOCK(sc); break; case SIOCSIFCAP: HN_LOCK(sc); if (hn_xpnt_vf_isready(sc)) { ifr_vf = *ifr; strlcpy(ifr_vf.ifr_name, if_name(sc->hn_vf_ifp), sizeof(ifr_vf.ifr_name)); error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); HN_UNLOCK(sc); break; } /* * Fix up requested capabilities w/ supported capabilities, * since the supported capabilities could have been changed. */ mask = (ifr->ifr_reqcap & if_getcapabilities(ifp)) ^ if_getcapenable(ifp); if (mask & IFCAP_TXCSUM) { if_togglecapenable(ifp, IFCAP_TXCSUM); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, HN_CSUM_IP_HWASSIST(sc), 0); else if_sethwassistbits(ifp, 0, HN_CSUM_IP_HWASSIST(sc)); } if (mask & IFCAP_TXCSUM_IPV6) { if_togglecapenable(ifp, IFCAP_TXCSUM_IPV6); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, HN_CSUM_IP6_HWASSIST(sc), 0); else if_sethwassistbits(ifp, 0, HN_CSUM_IP6_HWASSIST(sc)); } /* TODO: flip RNDIS offload parameters for RXCSUM. */ if (mask & IFCAP_RXCSUM) if_togglecapenable(ifp, IFCAP_RXCSUM); #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ if (mask & IFCAP_RXCSUM_IPV6) if_togglecapenable(ifp, IFCAP_RXCSUM_IPV6); #endif if (mask & IFCAP_LRO) if_togglecapenable(ifp, IFCAP_LRO); if (mask & IFCAP_TSO4) { if_togglecapenable(ifp, IFCAP_TSO4); if (if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, CSUM_IP_TSO, 0); else if_sethwassistbits(ifp, 0, CSUM_IP_TSO); } if (mask & IFCAP_TSO6) { if_togglecapenable(ifp, IFCAP_TSO6); if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); else if_sethwassistbits(ifp, 0, CSUM_IP6_TSO); } HN_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) { /* * Multicast uses mutex; use busy-wait for * the RNDIS reply. */ HN_NO_SLEEPING(sc); hn_rxfilter_config(sc); HN_SLEEPING_OK(sc); } /* XXX vlan(4) style mcast addr maintenance */ if (hn_xpnt_vf_isready(sc)) { int old_if_flags; old_if_flags = if_getflags(sc->hn_vf_ifp); hn_xpnt_vf_saveifflags(sc); if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && ((old_if_flags ^ if_getflags(sc->hn_vf_ifp)) & IFF_ALLMULTI)) error = hn_xpnt_vf_iocsetflags(sc); } HN_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: HN_LOCK(sc); if (hn_xpnt_vf_isready(sc)) { /* * SIOCGIFMEDIA expects ifmediareq, so don't * create and pass ifr_vf to the VF here; just * replace the ifr_name. */ vf_ifp = sc->hn_vf_ifp; strlcpy(ifr->ifr_name, if_name(vf_ifp), sizeof(ifr->ifr_name)); error = ifhwioctl(cmd, vf_ifp, data, curthread); /* Restore the ifr_name. */ strlcpy(ifr->ifr_name, if_name(ifp), sizeof(ifr->ifr_name)); HN_UNLOCK(sc); break; } HN_UNLOCK(sc); error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); break; case SIOCGIFRSSHASH: ifrh = (struct ifrsshash *)data; HN_LOCK(sc); if (sc->hn_rx_ring_inuse == 1) { HN_UNLOCK(sc); ifrh->ifrh_func = RSS_FUNC_NONE; ifrh->ifrh_types = 0; break; } if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; else ifrh->ifrh_func = RSS_FUNC_PRIVATE; ifrh->ifrh_types = hn_rss_type_fromndis(sc->hn_rss_hash); HN_UNLOCK(sc); break; case SIOCGIFRSSKEY: ifrk = (struct ifrsskey *)data; HN_LOCK(sc); if (sc->hn_rx_ring_inuse == 1) { HN_UNLOCK(sc); ifrk->ifrk_func = RSS_FUNC_NONE; ifrk->ifrk_keylen = 0; break; } if (sc->hn_rss_hash & NDIS_HASH_FUNCTION_TOEPLITZ) ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; else ifrk->ifrk_func = RSS_FUNC_PRIVATE; ifrk->ifrk_keylen = NDIS_HASH_KEYSIZE_TOEPLITZ; memcpy(ifrk->ifrk_key, sc->hn_rss.rss_key, NDIS_HASH_KEYSIZE_TOEPLITZ); HN_UNLOCK(sc); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void hn_stop(struct hn_softc *sc, bool detaching) { if_t ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); /* Clear RUNNING bit ASAP. */ if_setdrvflagbits(ifp, 0, IFF_DRV_RUNNING); /* Disable polling. */ hn_polling(sc, 0); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { KASSERT(sc->hn_vf_ifp != NULL, ("%s: VF is not attached", if_name(ifp))); /* Mark transparent mode VF as disabled. */ hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); /* * NOTE: * Datapath setting must happen _before_ bringing * the VF down. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); /* * Bring the VF down. */ hn_xpnt_vf_saveifflags(sc); if_setflagbits(ifp, 0, IFF_UP); hn_xpnt_vf_iocsetflags(sc); } /* Suspend data transfers. */ hn_suspend_data(sc); /* Clear OACTIVE bit. */ if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; /* * If the non-transparent mode VF is active, make sure * that the RX filter still allows packet reception. */ if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) hn_rxfilter_config(sc); } static void hn_init_locked(struct hn_softc *sc) { if_t ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return; if (if_getdrvflags(ifp) & IFF_DRV_RUNNING) return; /* Configure RX filter */ hn_rxfilter_config(sc); /* Clear OACTIVE bit. */ if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; /* Clear TX 'suspended' bit. */ hn_resume_tx(sc, sc->hn_tx_ring_inuse); if (hn_xpnt_vf_isready(sc)) { /* Initialize transparent VF. */ hn_xpnt_vf_init(sc); } /* Everything is ready; unleash! */ if_setdrvflagbits(ifp, IFF_DRV_RUNNING, 0); /* Re-enable polling if requested. */ if (sc->hn_pollhz > 0) hn_polling(sc, sc->hn_pollhz); } static void hn_init(void *xsc) { struct hn_softc *sc = xsc; HN_LOCK(sc); hn_init_locked(sc); HN_UNLOCK(sc); } static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int lenlim; int error; lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; error = sysctl_handle_int(oidp, &lenlim, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || lenlim > TCP_LRO_LENGTH_MAX) { HN_UNLOCK(sc); return EINVAL; } hn_set_lro_lenlim(sc, lenlim); HN_UNLOCK(sc); return 0; } static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ackcnt, error, i; /* * lro_ackcnt_lim is append count limit, * +1 to turn it into aggregation limit. */ ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; error = sysctl_handle_int(oidp, &ackcnt, 0, req); if (error || req->newptr == NULL) return error; if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) return EINVAL; /* * Convert aggregation limit back to append * count limit. */ --ackcnt; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; HN_UNLOCK(sc); return 0; } static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int hcsum = arg2; int on, error, i; on = 0; if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) on = 1; error = sysctl_handle_int(oidp, &on, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (on) rxr->hn_trust_hcsum |= hcsum; else rxr->hn_trust_hcsum &= ~hcsum; } HN_UNLOCK(sc); return 0; } static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int chim_size, error; chim_size = sc->hn_tx_ring[0].hn_chim_size; error = sysctl_handle_int(oidp, &chim_size, 0, req); if (error || req->newptr == NULL) return error; if (chim_size > sc->hn_chim_szmax || chim_size <= 0) return EINVAL; HN_LOCK(sc); hn_set_chim_size(sc, chim_size); HN_UNLOCK(sc); return 0; } static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((uint64_t *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; } return 0; } static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; u_long stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((u_long *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((u_long *)((uint8_t *)rxr + ofs)) = 0; } return 0; } static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_tx_ring *txr; u_long stat; stat = 0; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; stat += *((u_long *)((uint8_t *)txr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; *((u_long *)((uint8_t *)txr + ofs)) = 0; } return 0; } static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error, conf; struct hn_tx_ring *txr; txr = &sc->hn_tx_ring[0]; conf = *((int *)((uint8_t *)txr + ofs)); error = sysctl_handle_int(oidp, &conf, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; *((int *)((uint8_t *)txr + ofs)) = conf; } HN_UNLOCK(sc); return 0; } static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, size; size = sc->hn_agg_size; error = sysctl_handle_int(oidp, &size, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); sc->hn_agg_size = size; hn_set_txagg(sc); HN_UNLOCK(sc); return (0); } static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, pkts; pkts = sc->hn_agg_pkts; error = sysctl_handle_int(oidp, &pkts, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); sc->hn_agg_pkts = pkts; hn_set_txagg(sc); HN_UNLOCK(sc); return (0); } static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int pkts; pkts = sc->hn_tx_ring[0].hn_agg_pktmax; return (sysctl_handle_int(oidp, &pkts, 0, req)); } static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int align; align = sc->hn_tx_ring[0].hn_agg_align; return (sysctl_handle_int(oidp, &align, 0, req)); } static void hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) { if (pollhz == 0) vmbus_chan_poll_disable(chan); else vmbus_chan_poll_enable(chan, pollhz); } static void hn_polling(struct hn_softc *sc, u_int pollhz) { int nsubch = sc->hn_rx_ring_inuse - 1; HN_LOCK_ASSERT(sc); if (nsubch > 0) { struct vmbus_channel **subch; int i; subch = vmbus_subchan_get(sc->hn_prichan, nsubch); for (i = 0; i < nsubch; ++i) hn_chan_polling(subch[i], pollhz); vmbus_subchan_rel(subch, nsubch); } hn_chan_polling(sc->hn_prichan, pollhz); } static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int pollhz, error; pollhz = sc->hn_pollhz; error = sysctl_handle_int(oidp, &pollhz, 0, req); if (error || req->newptr == NULL) return (error); if (pollhz != 0 && (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) return (EINVAL); HN_LOCK(sc); if (sc->hn_pollhz != pollhz) { sc->hn_pollhz = pollhz; if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) hn_polling(sc, sc->hn_pollhz); } HN_UNLOCK(sc); return (0); } static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char verstr[16]; snprintf(verstr, sizeof(verstr), "%u.%u", HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); } static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char caps_str[128]; uint32_t caps; HN_LOCK(sc); caps = sc->hn_caps; HN_UNLOCK(sc); snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); } static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char assist_str[128]; uint32_t hwassist; HN_LOCK(sc); hwassist = if_gethwassist(sc->hn_ifp); HN_UNLOCK(sc); snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); } static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char filter_str[128]; uint32_t filter; HN_LOCK(sc); filter = sc->hn_rx_filter; HN_UNLOCK(sc); snprintf(filter_str, sizeof(filter_str), "%b", filter, NDIS_PACKET_TYPES); return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); } static int hn_rsc_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; uint32_t mtu; int error; HN_LOCK(sc); error = hn_rndis_get_mtu(sc, &mtu); if (error) { if_printf(sc->hn_ifp, "failed to get mtu\n"); goto back; } error = SYSCTL_OUT(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); if (error || req->newptr == NULL) goto back; error = SYSCTL_IN(req, &(sc->hn_rsc_ctrl), sizeof(sc->hn_rsc_ctrl)); if (error) goto back; error = hn_rndis_reconf_offload(sc, mtu); back: HN_UNLOCK(sc); return (error); } #ifndef RSS static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error || req->newptr == NULL) goto back; if ((sc->hn_flags & HN_FLAG_RXVF) || (hn_xpnt_vf && sc->hn_vf_ifp != NULL)) { /* * RSS key is synchronized w/ VF's, don't allow users * to change it. */ error = EBUSY; goto back; } error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSKEY; if (sc->hn_rx_ring_inuse > 1) { error = hn_rss_reconfig(sc); } else { /* Not RSS capable, at least for now; just save the RSS key. */ error = 0; } back: HN_UNLOCK(sc); return (error); } static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error || req->newptr == NULL) goto back; /* * Don't allow RSS indirect table change, if this interface is not * RSS capable currently. */ if (sc->hn_rx_ring_inuse == 1) { error = EOPNOTSUPP; goto back; } error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSIND; hn_rss_ind_fixup(sc); error = hn_rss_reconfig(sc); back: HN_UNLOCK(sc); return (error); } #endif /* !RSS */ static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash = sc->hn_rss_hash; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_rss_hcap_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash = sc->hn_rss_hcap; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_rss_mbuf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash = sc->hn_rx_ring[0].hn_mbuf_hash; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char vf_name[IFNAMSIZ + 1]; if_t vf_ifp; HN_LOCK(sc); vf_name[0] = '\0'; vf_ifp = sc->hn_vf_ifp; if (vf_ifp != NULL) snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); HN_UNLOCK(sc); return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); } static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char vf_name[IFNAMSIZ + 1]; if_t vf_ifp; HN_LOCK(sc); vf_name[0] = '\0'; vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; if (vf_ifp != NULL) snprintf(vf_name, sizeof(vf_name), "%s", if_name(vf_ifp)); HN_UNLOCK(sc); return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); } static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) { struct rm_priotracker pt; struct sbuf *sb; int error, i; bool first; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); rm_rlock(&hn_vfmap_lock, &pt); first = true; for (i = 0; i < hn_vfmap_size; ++i) { struct epoch_tracker et; if_t ifp; if (hn_vfmap[i] == NULL) continue; NET_EPOCH_ENTER(et); ifp = ifnet_byindex(i); if (ifp != NULL) { if (first) sbuf_printf(sb, "%s", if_name(ifp)); else sbuf_printf(sb, " %s", if_name(ifp)); first = false; } NET_EPOCH_EXIT(et); } rm_runlock(&hn_vfmap_lock, &pt); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) { struct rm_priotracker pt; struct sbuf *sb; int error, i; bool first; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); rm_rlock(&hn_vfmap_lock, &pt); first = true; for (i = 0; i < hn_vfmap_size; ++i) { struct epoch_tracker et; if_t ifp, hn_ifp; hn_ifp = hn_vfmap[i]; if (hn_ifp == NULL) continue; NET_EPOCH_ENTER(et); ifp = ifnet_byindex(i); if (ifp != NULL) { if (first) { sbuf_printf(sb, "%s:%s", if_name(ifp), if_name(hn_ifp)); } else { sbuf_printf(sb, " %s:%s", if_name(ifp), if_name(hn_ifp)); } first = false; } NET_EPOCH_EXIT(et); } rm_runlock(&hn_vfmap_lock, &pt); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, onoff = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) onoff = 1; error = sysctl_handle_int(oidp, &onoff, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); /* NOTE: hn_vf_lock for hn_transmit() */ rm_wlock(&sc->hn_vf_lock); if (onoff) sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; else sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; rm_wunlock(&sc->hn_vf_lock); HN_UNLOCK(sc); return (0); } static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int enabled = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) enabled = 1; return (sysctl_handle_int(oidp, &enabled, 0, req)); } static int hn_check_iplen(const struct mbuf *m, int hoff) { const struct ip *ip; int len, iphlen, iplen; const struct tcphdr *th; int thoff; /* TCP data offset */ len = hoff + sizeof(struct ip); /* The packet must be at least the size of an IP header. */ if (m->m_pkthdr.len < len) return IPPROTO_DONE; /* The fixed IP header must reside completely in the first mbuf. */ if (m->m_len < len) return IPPROTO_DONE; ip = mtodo(m, hoff); /* Bound check the packet's stated IP header length. */ iphlen = ip->ip_hl << 2; if (iphlen < sizeof(struct ip)) /* minimum header length */ return IPPROTO_DONE; /* The full IP header must reside completely in the one mbuf. */ if (m->m_len < hoff + iphlen) return IPPROTO_DONE; iplen = ntohs(ip->ip_len); /* * Check that the amount of data in the buffers is as * at least much as the IP header would have us expect. */ if (m->m_pkthdr.len < hoff + iplen) return IPPROTO_DONE; /* * Ignore IP fragments. */ if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) return IPPROTO_DONE; /* * The TCP/IP or UDP/IP header must be entirely contained within * the first fragment of a packet. */ switch (ip->ip_p) { case IPPROTO_TCP: if (iplen < iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); thoff = th->th_off << 2; if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + thoff) return IPPROTO_DONE; break; case IPPROTO_UDP: if (iplen < iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; break; default: if (iplen < iphlen) return IPPROTO_DONE; break; } return ip->ip_p; } static void hn_rxpkt_proto(const struct mbuf *m_new, int *l3proto, int *l4proto) { const struct ether_header *eh; uint16_t etype; int hoff; hoff = sizeof(*eh); /* Checked at the beginning of this function. */ KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); eh = mtod(m_new, const struct ether_header *); etype = ntohs(eh->ether_type); if (etype == ETHERTYPE_VLAN) { const struct ether_vlan_header *evl; hoff = sizeof(*evl); if (m_new->m_len < hoff) return; evl = mtod(m_new, const struct ether_vlan_header *); etype = ntohs(evl->evl_proto); } *l3proto = etype; if (etype == ETHERTYPE_IP) *l4proto = hn_check_iplen(m_new, hoff); else *l4proto = IPPROTO_DONE; } static int hn_create_rx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; device_t dev = sc->hn_dev; #if defined(INET) || defined(INET6) int lroent_cnt; #endif int i; /* * Create RXBUF for reception. * * NOTE: * - It is shared by all channels. * - A large enough buffer is allocated, certain version of NVSes * may further limit the usable space. */ - sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), - PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, - BUS_DMA_WAITOK | BUS_DMA_ZERO); + sc->hn_rxbuf = contigmalloc(HN_RXBUF_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, + 0ul, ~0ul, PAGE_SIZE, 0); if (sc->hn_rxbuf == NULL) { device_printf(sc->hn_dev, "allocate rxbuf failed\n"); return (ENOMEM); } sc->hn_rx_ring_cnt = ring_cnt; sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, M_DEVBUF, M_WAITOK | M_ZERO); #if defined(INET) || defined(INET6) lroent_cnt = hn_lro_entry_count; if (lroent_cnt < TCP_LRO_ENTRIES) lroent_cnt = TCP_LRO_ENTRIES; if (bootverbose) device_printf(dev, "LRO: entry count %d\n", lroent_cnt); #endif /* INET || INET6 */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); /* Create dev.hn.UNIT.rx sysctl tree */ sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; - rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), - PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, - &rxr->hn_br_dma, BUS_DMA_WAITOK); + rxr->hn_br = contigmalloc(HN_TXBR_SIZE + HN_RXBR_SIZE, M_DEVBUF, + M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); if (rxr->hn_br == NULL) { device_printf(dev, "allocate bufring failed\n"); return (ENOMEM); } if (hn_trust_hosttcp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; if (hn_trust_hostudp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; if (hn_trust_hostip) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; rxr->hn_mbuf_hash = NDIS_HASH_ALL; rxr->hn_ifp = sc->hn_ifp; if (i < sc->hn_tx_ring_cnt) rxr->hn_txr = &sc->hn_tx_ring[i]; rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); rxr->hn_rx_idx = i; rxr->hn_rxbuf = sc->hn_rxbuf; /* * Initialize LRO. */ #if defined(INET) || defined(INET6) tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, hn_lro_mbufq_depth); rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; #endif /* INET || INET6 */ if (sc->hn_rx_sysctl_tree != NULL) { char name[16]; /* * Create per RX ring sysctl tree: * dev.hn.UNIT.rx.RINGID */ snprintf(name, sizeof(name), "%d", i); rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (rxr->hn_rx_sysctl_tree != NULL) { SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "packets", CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_pkts, "# of packets received"); SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "rss_pkts", CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_rss_pkts, "# of packets w/ RSS info received"); SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "rsc_pkts", CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_rsc_pkts, "# of RSC packets received"); SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "rsc_drop", CTLFLAG_RW | CTLFLAG_STATS, &rxr->hn_rsc_drop, "# of RSC fragments dropped"); SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "pktbuf_len", CTLFLAG_RD, &rxr->hn_pktbuf_len, 0, "Temporary channel packet buffer length"); } } } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_lro.lro_queued), hn_rx_stat_u64_sysctl, "LU", "LRO queued"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), hn_rx_stat_u64_sysctl, "LU", "LRO flushed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_lro_tried), hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_lenlim_sysctl, "IU", "Max # of data bytes to be aggregated by LRO"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_ackcnt_sysctl, "I", "Max # of ACKs to be aggregated by LRO"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, hn_trust_hcsum_sysctl, "I", "Trust tcp segment verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, hn_trust_hcsum_sysctl, "I", "Trust udp datagram verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, hn_trust_hcsum_sysctl, "I", "Trust ip packet verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_csum_ip), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_csum_tcp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_csum_udp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_trusted), hn_rx_stat_ulong_sysctl, "LU", "# of packets that we trust host's csum verification"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_small_pkts), hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS , sc, __offsetof(struct hn_rx_ring, hn_ack_failed), hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); return (0); } static void hn_destroy_rx_data(struct hn_softc *sc) { int i; if (sc->hn_rxbuf != NULL) { if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) - hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); + contigfree(sc->hn_rxbuf, HN_RXBUF_SIZE, M_DEVBUF); else device_printf(sc->hn_dev, "RXBUF is referenced\n"); sc->hn_rxbuf = NULL; } if (sc->hn_rx_ring_cnt == 0) return; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (rxr->hn_br == NULL) continue; if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { - hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); + contigfree(rxr->hn_br, HN_TXBR_SIZE + HN_RXBR_SIZE, + M_DEVBUF); } else { device_printf(sc->hn_dev, "%dth channel bufring is referenced", i); } rxr->hn_br = NULL; #if defined(INET) || defined(INET6) tcp_lro_free(&rxr->hn_lro); #endif free(rxr->hn_pktbuf, M_DEVBUF); } free(sc->hn_rx_ring, M_DEVBUF); sc->hn_rx_ring = NULL; sc->hn_rx_ring_cnt = 0; sc->hn_rx_ring_inuse = 0; } static int hn_tx_ring_create(struct hn_softc *sc, int id) { struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; device_t dev = sc->hn_dev; bus_dma_tag_t parent_dtag; int error, i; txr->hn_sc = sc; txr->hn_tx_idx = id; #ifndef HN_USE_TXDESC_BUFRING mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); #endif mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); txr->hn_txdesc_cnt = HN_TX_DESC_CNT; txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, M_DEVBUF, M_WAITOK | M_ZERO); #ifndef HN_USE_TXDESC_BUFRING SLIST_INIT(&txr->hn_txlist); #else txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, M_WAITOK, &txr->hn_tx_lock); #endif if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); } else { txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; } #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { txr->hn_txeof = hn_start_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); } else #endif { int br_depth; txr->hn_txeof = hn_xmit_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); br_depth = hn_get_txswq_depth(txr); txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, M_WAITOK, &txr->hn_tx_lock); } txr->hn_direct_tx_size = hn_direct_tx_size; /* * Always schedule transmission instead of trying to do direct * transmission. This one gives the best performance so far. */ txr->hn_sched_tx = 1; parent_dtag = bus_get_dma_tag(dev); /* DMA tag for RNDIS packet messages. */ error = bus_dma_tag_create(parent_dtag, /* parent */ HN_RNDIS_PKT_ALIGN, /* alignment */ HN_RNDIS_PKT_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_RNDIS_PKT_LEN, /* maxsize */ 1, /* nsegments */ HN_RNDIS_PKT_LEN, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_rndis_dtag); if (error) { device_printf(dev, "failed to create rndis dmatag\n"); return error; } /* DMA tag for data. */ error = bus_dma_tag_create(parent_dtag, /* parent */ 1, /* alignment */ HN_TX_DATA_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_TX_DATA_MAXSIZE, /* maxsize */ HN_TX_DATA_SEGCNT_MAX, /* nsegments */ HN_TX_DATA_SEGSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_data_dtag); if (error) { device_printf(dev, "failed to create data dmatag\n"); return error; } for (i = 0; i < txr->hn_txdesc_cnt; ++i) { struct hn_txdesc *txd = &txr->hn_txdesc[i]; txd->txr = txr; txd->chim_index = HN_NVS_CHIM_IDX_INVALID; STAILQ_INIT(&txd->agg_list); /* * Allocate and load RNDIS packet message. */ error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, (void **)&txd->rndis_pkt, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, &txd->rndis_pkt_dmap); if (error) { device_printf(dev, "failed to allocate rndis_packet_msg, %d\n", i); return error; } error = bus_dmamap_load(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap, txd->rndis_pkt, HN_RNDIS_PKT_LEN, hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, BUS_DMA_NOWAIT); if (error) { device_printf(dev, "failed to load rndis_packet_msg, %d\n", i); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } /* DMA map for TX data. */ error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, &txd->data_dmap); if (error) { device_printf(dev, "failed to allocate tx data dmamap\n"); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } /* All set, put it to list */ txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); #else buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif } txr->hn_txdesc_avail = txr->hn_txdesc_cnt; if (sc->hn_tx_sysctl_tree != NULL) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; char name[16]; /* * Create per TX ring sysctl tree: * dev.hn.UNIT.tx.RINGID */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); snprintf(name, sizeof(name), "%d", id); txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (txr->hn_tx_sysctl_tree != NULL) { child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); #ifdef HN_DEBUG SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", CTLFLAG_RD, &txr->hn_txdesc_avail, 0, "# of available TX descs"); #endif #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", CTLFLAG_RD, &txr->hn_oactive, 0, "over active"); } SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_pkts, "# of packets transmitted"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", CTLFLAG_RW | CTLFLAG_STATS, &txr->hn_sends, "# of sends"); } } return 0; } static void hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) { struct hn_tx_ring *txr = txd->txr; KASSERT(txd->m == NULL, ("still has mbuf installed")); KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); } static void hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT(txd->refs == 0 || txd->refs == 1, ("invalid txd refs %d", txd->refs)); /* Aggregated txds will be freed by their aggregating txd. */ if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { int freed __diagused; freed = hn_txdesc_put(txr, txd); KASSERT(freed, ("can't free txdesc")); } } static void hn_tx_ring_destroy(struct hn_tx_ring *txr) { int i; if (txr->hn_txdesc == NULL) return; /* * NOTE: * Because the freeing of aggregated txds will be deferred * to the aggregating txd, two passes are used here: * - The first pass GCes any pending txds. This GC is necessary, * since if the channels are revoked, hypervisor will not * deliver send-done for all pending txds. * - The second pass frees the busdma stuffs, i.e. after all txds * were freed. */ for (i = 0; i < txr->hn_txdesc_cnt; ++i) hn_txdesc_gc(txr, &txr->hn_txdesc[i]); for (i = 0; i < txr->hn_txdesc_cnt; ++i) hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); if (txr->hn_tx_data_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_data_dtag); if (txr->hn_tx_rndis_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); #ifdef HN_USE_TXDESC_BUFRING buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); #endif free(txr->hn_txdesc, M_DEVBUF); txr->hn_txdesc = NULL; if (txr->hn_mbuf_br != NULL) buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); #ifndef HN_USE_TXDESC_BUFRING mtx_destroy(&txr->hn_txlist_spin); #endif mtx_destroy(&txr->hn_tx_lock); } static int hn_create_tx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int i; /* * Create TXBUF for chimney sending. * * NOTE: It is shared by all channels. */ - sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), - PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, - BUS_DMA_WAITOK | BUS_DMA_ZERO); + sc->hn_chim = contigmalloc(HN_CHIM_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, + 0ul, ~0ul, PAGE_SIZE, 0); if (sc->hn_chim == NULL) { device_printf(sc->hn_dev, "allocate txbuf failed\n"); return (ENOMEM); } sc->hn_tx_ring_cnt = ring_cnt; sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, M_DEVBUF, M_WAITOK | M_ZERO); ctx = device_get_sysctl_ctx(sc->hn_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); /* Create dev.hn.UNIT.tx sysctl tree */ sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { int error; error = hn_tx_ring_create(sc, i); if (error) return error; } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_no_txdescs), hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_send_failed), hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_txdma_failed), hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_flush_failed), hn_tx_stat_ulong_sysctl, "LU", "# of packet transmission aggregation flush failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_tx_collapsed), hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE | CTLFLAG_STATS, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, "# of total TX descs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", CTLFLAG_RD, &sc->hn_chim_szmax, 0, "Chimney send packet size upper boundary"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_chim_size_sysctl, "I", "Chimney send packet size limit"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_direct_tx_size), hn_tx_conf_int_sysctl, "I", "Size of the packet for direct transmission"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_sched_tx), hn_tx_conf_int_sysctl, "I", "Always schedule transmission " "instead of doing direct transmission"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, "Applied packet transmission aggregation size"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_txagg_pktmax_sysctl, "I", "Applied packet transmission aggregation packets"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_txagg_align_sysctl, "I", "Applied packet transmission aggregation alignment"); return 0; } static void hn_set_chim_size(struct hn_softc *sc, int chim_size) { int i; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_chim_size = chim_size; } static void hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) { if_t ifp = sc->hn_ifp; u_int hw_tsomax; int tso_minlen; HN_LOCK_ASSERT(sc); if ((if_getcapabilities(ifp) & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) return; KASSERT(sc->hn_ndis_tso_sgmin >= 2, ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); tso_minlen = sc->hn_ndis_tso_sgmin * mtu; KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && sc->hn_ndis_tso_szmax <= IP_MAXPACKET, ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); if (tso_maxlen < tso_minlen) tso_maxlen = tso_minlen; else if (tso_maxlen > IP_MAXPACKET) tso_maxlen = IP_MAXPACKET; if (tso_maxlen > sc->hn_ndis_tso_szmax) tso_maxlen = sc->hn_ndis_tso_szmax; hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); if (hn_xpnt_vf_isready(sc)) { if (hw_tsomax > if_gethwtsomax(sc->hn_vf_ifp)) hw_tsomax = if_gethwtsomax(sc->hn_vf_ifp); } if_sethwtsomax(ifp, hw_tsomax); if (bootverbose) if_printf(ifp, "TSO size max %u\n", if_gethwtsomax(ifp)); } static void hn_fixup_tx_data(struct hn_softc *sc) { uint64_t csum_assist; int i; hn_set_chim_size(sc, sc->hn_chim_szmax); if (hn_tx_chimney_size > 0 && hn_tx_chimney_size < sc->hn_chim_szmax) hn_set_chim_size(sc, hn_tx_chimney_size); csum_assist = 0; if (sc->hn_caps & HN_CAP_IPCS) csum_assist |= CSUM_IP; if (sc->hn_caps & HN_CAP_TCP4CS) csum_assist |= CSUM_IP_TCP; if ((sc->hn_caps & HN_CAP_UDP4CS) && hn_enable_udp4cs) csum_assist |= CSUM_IP_UDP; if (sc->hn_caps & HN_CAP_TCP6CS) csum_assist |= CSUM_IP6_TCP; if ((sc->hn_caps & HN_CAP_UDP6CS) && hn_enable_udp6cs) csum_assist |= CSUM_IP6_UDP; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_csum_assist = csum_assist; if (sc->hn_caps & HN_CAP_HASHVAL) { /* * Support HASHVAL pktinfo on TX path. */ if (bootverbose) if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; } } static void hn_fixup_rx_data(struct hn_softc *sc) { if (sc->hn_caps & HN_CAP_UDPHASH) { int i; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_UDP_HASH; } } static void hn_destroy_tx_data(struct hn_softc *sc) { int i; if (sc->hn_chim != NULL) { if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { - hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); + contigfree(sc->hn_chim, HN_CHIM_SIZE, M_DEVBUF); } else { device_printf(sc->hn_dev, "chimney sending buffer is referenced"); } sc->hn_chim = NULL; } if (sc->hn_tx_ring_cnt == 0) return; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) hn_tx_ring_destroy(&sc->hn_tx_ring[i]); free(sc->hn_tx_ring, M_DEVBUF); sc->hn_tx_ring = NULL; sc->hn_tx_ring_cnt = 0; sc->hn_tx_ring_inuse = 0; } #ifdef HN_IFSTART_SUPPORT static void hn_start_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static int hn_start_locked(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; if_t ifp = sc->hn_ifp; int sched = 0; KASSERT(hn_use_if_start, ("hn_start_locked is called, when if_start is disabled")); KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); mtx_assert(&txr->hn_tx_lock, MA_OWNED); KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); if (__predict_false(txr->hn_suspended)) return (0); if ((if_getdrvflags(ifp) & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return (0); while (!if_sendq_empty(ifp)) { struct hn_txdesc *txd; struct mbuf *m_head; int error; m_head = if_dequeue(ifp); if (m_head == NULL) break; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. */ if_sendq_prepend(ifp, m_head); sched = 1; break; } #if defined(INET6) || defined(INET) if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { m_head = hn_tso_fixup(m_head); if (__predict_false(m_head == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); continue; } } else if (m_head->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { m_head = hn_set_hlen(m_head); if (__predict_false(m_head == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); continue; } } #endif txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; if_sendq_prepend(ifp, m_head); if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); break; } error = hn_encap(ifp, txr, txd, &m_head); if (error) { /* Both txd and m_head are freed */ KASSERT(txr->hn_agg_txd == NULL, ("encap failed w/ pending aggregating txdesc")); continue; } if (txr->hn_agg_pktleft == 0) { if (txr->hn_agg_txd != NULL) { KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); error = hn_flush_txagg(ifp, txr); if (__predict_false(error)) { if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); break; } } else { KASSERT(m_head != NULL, ("mbuf was freed")); error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ if_sendq_prepend(ifp, m_head); if_setdrvflagbits(ifp, IFF_DRV_OACTIVE, 0); break; } } } #ifdef INVARIANTS else { KASSERT(txr->hn_agg_txd != NULL, ("no aggregating txdesc")); KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); } #endif } /* Flush pending aggerated transmission. */ if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); return (sched); } static void hn_start(if_t ifp) { struct hn_softc *sc = if_getsoftc(ifp); struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } static void hn_start_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); if_setdrvflagbits(txr->hn_sc->hn_ifp, 0, IFF_DRV_OACTIVE); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_start_txeof(struct hn_tx_ring *txr) { struct hn_softc *sc = txr->hn_sc; if_t ifp = sc->hn_ifp; KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the OACTIVE earlier, with the hope, that * others could catch up. The task will clear the * flag again with the hn_tx_lock to avoid possible * races. */ if_setdrvflagbits(ifp, 0, IFF_DRV_OACTIVE); taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } #endif /* HN_IFSTART_SUPPORT */ static int hn_xmit(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; if_t ifp = sc->hn_ifp; struct mbuf *m_head; int sched = 0; mtx_assert(&txr->hn_tx_lock, MA_OWNED); #ifdef HN_IFSTART_SUPPORT KASSERT(hn_use_if_start == 0, ("hn_xmit is called, when if_start is enabled")); #endif KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); if (__predict_false(txr->hn_suspended)) return (0); if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) return (0); while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { struct hn_txdesc *txd; int error; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); sched = 1; break; } txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } error = hn_encap(ifp, txr, txd, &m_head); if (error) { /* Both txd and m_head are freed; discard */ KASSERT(txr->hn_agg_txd == NULL, ("encap failed w/ pending aggregating txdesc")); drbr_advance(ifp, txr->hn_mbuf_br); continue; } if (txr->hn_agg_pktleft == 0) { if (txr->hn_agg_txd != NULL) { KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); error = hn_flush_txagg(ifp, txr); if (__predict_false(error)) { txr->hn_oactive = 1; break; } } else { KASSERT(m_head != NULL, ("mbuf was freed")); error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } } } #ifdef INVARIANTS else { KASSERT(txr->hn_agg_txd != NULL, ("no aggregating txdesc")); KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); } #endif /* Sent */ drbr_advance(ifp, txr->hn_mbuf_br); } /* Flush pending aggerated transmission. */ if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); return (sched); } static int hn_transmit(if_t ifp, struct mbuf *m) { struct hn_softc *sc = if_getsoftc(ifp); struct hn_tx_ring *txr; int error, idx = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { struct rm_priotracker pt; rm_rlock(&sc->hn_vf_lock, &pt); if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { struct mbuf *m_bpf = NULL; int obytes, omcast; obytes = m->m_pkthdr.len; omcast = (m->m_flags & M_MCAST) != 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { if (bpf_peers_present(if_getbpf(ifp))) { m_bpf = m_copypacket(m, M_NOWAIT); if (m_bpf == NULL) { /* * Failed to grab a shallow * copy; tap now. */ ETHER_BPF_MTAP(ifp, m); } } } else { ETHER_BPF_MTAP(ifp, m); } error = if_transmit(sc->hn_vf_ifp, m); rm_runlock(&sc->hn_vf_lock, &pt); if (m_bpf != NULL) { if (!error) ETHER_BPF_MTAP(ifp, m_bpf); m_freem(m_bpf); } if (error == ENOBUFS) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); } else if (error) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); if (omcast) { if_inc_counter(ifp, IFCOUNTER_OMCASTS, omcast); } } return (error); } rm_runlock(&sc->hn_vf_lock, &pt); } #if defined(INET6) || defined(INET) /* * Perform TSO packet header fixup or get l2/l3 header length now, * since packet headers should be cache-hot. */ if (m->m_pkthdr.csum_flags & CSUM_TSO) { m = hn_tso_fixup(m); if (__predict_false(m == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return EIO; } } else if (m->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { m = hn_set_hlen(m); if (__predict_false(m == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return EIO; } } #endif /* * Select the TX ring based on flowid */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { #ifdef RSS uint32_t bid; if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), &bid) == 0) idx = bid % sc->hn_tx_ring_inuse; else #endif { #if defined(INET6) || defined(INET) int tcpsyn = 0; if (m->m_pkthdr.len < 128 && (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) && (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { m = hn_check_tcpsyn(m, &tcpsyn); if (__predict_false(m == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (EIO); } } #else const int tcpsyn = 0; #endif if (tcpsyn) idx = 0; else idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; } } txr = &sc->hn_tx_ring[idx]; error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); if (error) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); return error; } if (txr->hn_oactive) return 0; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return 0; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); return 0; } static void hn_tx_ring_qflush(struct hn_tx_ring *txr) { struct mbuf *m; mtx_lock(&txr->hn_tx_lock); while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) m_freem(m); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_qflush(if_t ifp) { struct hn_softc *sc = if_getsoftc(ifp); struct rm_priotracker pt; int i; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) hn_tx_ring_qflush(&sc->hn_tx_ring[i]); if_qflush(ifp); rm_rlock(&sc->hn_vf_lock, &pt); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) if_qflush(sc->hn_vf_ifp); rm_runlock(&sc->hn_vf_lock, &pt); } static void hn_xmit_txeof(struct hn_tx_ring *txr) { if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; txr->hn_oactive = 0; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the oactive earlier, with the hope, that * others could catch up. The task will clear the * oactive again with the hn_tx_lock to avoid possible * races. */ txr->hn_oactive = 0; taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_xmit_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); txr->hn_oactive = 0; hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static int hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) { struct vmbus_chan_br cbr; struct hn_rx_ring *rxr; struct hn_tx_ring *txr = NULL; int idx, error; idx = vmbus_chan_subidx(chan); /* * Link this channel to RX/TX ring. */ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, ("RX ring %d already attached", idx)); rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; rxr->hn_chan = chan; if (bootverbose) { if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } if (idx < sc->hn_tx_ring_inuse) { txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("TX ring %d already attached", idx)); txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; txr->hn_chan = chan; if (bootverbose) { if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } } /* Bind this channel to a proper CPU. */ vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); /* * Open this channel */ cbr.cbr = rxr->hn_br; - cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; + cbr.cbr_paddr = pmap_kextract((vm_offset_t)rxr->hn_br); cbr.cbr_txsz = HN_TXBR_SIZE; cbr.cbr_rxsz = HN_RXBR_SIZE; error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); if (error) { if (error == EISCONN) { if_printf(sc->hn_ifp, "bufring is connected after " "chan%u open failure\n", vmbus_chan_id(chan)); rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; } else { if_printf(sc->hn_ifp, "open chan%u failed: %d\n", vmbus_chan_id(chan), error); } } return (error); } static void hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) { struct hn_rx_ring *rxr; int idx, error; idx = vmbus_chan_subidx(chan); /* * Link this channel to RX/TX ring. */ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), ("RX ring %d is not attached", idx)); rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; if (idx < sc->hn_tx_ring_inuse) { struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), ("TX ring %d is not attached attached", idx)); txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; } /* * Close this channel. * * NOTE: * Channel closing does _not_ destroy the target channel. */ error = vmbus_chan_close_direct(chan); if (error == EISCONN) { if_printf(sc->hn_ifp, "chan%u bufring is connected " "after being closed\n", vmbus_chan_id(chan)); rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; } else if (error) { if_printf(sc->hn_ifp, "chan%u close failed: %d\n", vmbus_chan_id(chan), error); } } static int hn_attach_subchans(struct hn_softc *sc) { struct vmbus_channel **subchans; int subchan_cnt = sc->hn_rx_ring_inuse - 1; int i, error = 0; KASSERT(subchan_cnt > 0, ("no sub-channels")); /* Attach the sub-channels. */ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) { int error1; error1 = hn_chan_attach(sc, subchans[i]); if (error1) { error = error1; /* Move on; all channels will be detached later. */ } } vmbus_subchan_rel(subchans, subchan_cnt); if (error) { if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); } else { if (bootverbose) { if_printf(sc->hn_ifp, "%d sub-channels attached\n", subchan_cnt); } } return (error); } static void hn_detach_allchans(struct hn_softc *sc) { struct vmbus_channel **subchans; int subchan_cnt = sc->hn_rx_ring_inuse - 1; int i; if (subchan_cnt == 0) goto back; /* Detach the sub-channels. */ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) hn_chan_detach(sc, subchans[i]); vmbus_subchan_rel(subchans, subchan_cnt); back: /* * Detach the primary channel, _after_ all sub-channels * are detached. */ hn_chan_detach(sc, sc->hn_prichan); /* Wait for sub-channels to be destroyed, if any. */ vmbus_subchan_drain(sc->hn_prichan); #ifdef INVARIANTS for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { KASSERT((sc->hn_rx_ring[i].hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, ("%dth RX ring is still attached", i)); } for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { KASSERT((sc->hn_tx_ring[i].hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("%dth TX ring is still attached", i)); } #endif } static int hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) { struct vmbus_channel **subchans; int nchan, rxr_cnt, error; nchan = *nsubch + 1; if (nchan == 1) { /* * Multiple RX/TX rings are not requested. */ *nsubch = 0; return (0); } /* * Query RSS capabilities, e.g. # of RX rings, and # of indirect * table entries. */ error = hn_rndis_query_rsscaps(sc, &rxr_cnt); if (error) { /* No RSS; this is benign. */ *nsubch = 0; return (0); } if (bootverbose) { if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", rxr_cnt, nchan); } if (nchan > rxr_cnt) nchan = rxr_cnt; if (nchan == 1) { if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); *nsubch = 0; return (0); } /* * Allocate sub-channels from NVS. */ *nsubch = nchan - 1; error = hn_nvs_alloc_subchans(sc, nsubch); if (error || *nsubch == 0) { /* Failed to allocate sub-channels. */ *nsubch = 0; return (0); } /* * Wait for all sub-channels to become ready before moving on. */ subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); vmbus_subchan_rel(subchans, *nsubch); return (0); } static bool hn_synth_attachable(const struct hn_softc *sc) { int i; if (sc->hn_flags & HN_FLAG_ERRORS) return (false); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) return (false); } return (true); } /* * Make sure that the RX filter is zero after the successful * RNDIS initialization. * * NOTE: * Under certain conditions on certain versions of Hyper-V, * the RNDIS rxfilter is _not_ zero on the hypervisor side * after the successful RNDIS initialization, which breaks * the assumption of any following code (well, it breaks the * RNDIS API contract actually). Clear the RNDIS rxfilter * explicitly, drain packets sneaking through, and drain the * interrupt taskqueues scheduled due to the stealth packets. */ static void hn_rndis_init_fixat(struct hn_softc *sc, int nchan) { hn_disable_rx(sc); hn_drain_rxtx(sc, nchan); } static int hn_synth_attach(struct hn_softc *sc, int mtu) { #define ATTACHED_NVS 0x0002 #define ATTACHED_RNDIS 0x0004 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int error, nsubch, nchan = 1, i, rndis_inited; uint32_t old_caps, attached = 0; KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, ("synthetic parts were attached")); if (!hn_synth_attachable(sc)) return (ENXIO); /* Save capabilities for later verification. */ old_caps = sc->hn_caps; sc->hn_caps = 0; /* Clear RSS stuffs. */ sc->hn_rss_ind_size = 0; sc->hn_rss_hash = 0; sc->hn_rss_hcap = 0; /* * Attach the primary channel _before_ attaching NVS and RNDIS. */ error = hn_chan_attach(sc, sc->hn_prichan); if (error) goto failed; /* * Attach NVS. */ error = hn_nvs_attach(sc, mtu); if (error) goto failed; attached |= ATTACHED_NVS; /* * Attach RNDIS _after_ NVS is attached. */ error = hn_rndis_attach(sc, mtu, &rndis_inited); if (rndis_inited) attached |= ATTACHED_RNDIS; if (error) goto failed; /* * Make sure capabilities are not changed. */ if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", old_caps, sc->hn_caps); error = ENXIO; goto failed; } /* * Allocate sub-channels for multi-TX/RX rings. * * NOTE: * The # of RX rings that can be used is equivalent to the # of * channels to be requested. */ nsubch = sc->hn_rx_ring_cnt - 1; error = hn_synth_alloc_subchans(sc, &nsubch); if (error) goto failed; /* NOTE: _Full_ synthetic parts detach is required now. */ sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; /* * Set the # of TX/RX rings that could be used according to * the # of channels that NVS offered. */ nchan = nsubch + 1; hn_set_ring_inuse(sc, nchan); if (nchan == 1) { /* Only the primary channel can be used; done */ goto back; } /* * Attach the sub-channels. * * NOTE: hn_set_ring_inuse() _must_ have been called. */ error = hn_attach_subchans(sc); if (error) goto failed; /* * Configure RSS key and indirect table _after_ all sub-channels * are attached. */ if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { /* * RSS key is not set yet; set it to the default RSS key. */ if (bootverbose) if_printf(sc->hn_ifp, "setup default RSS key\n"); #ifdef RSS rss_getkey(rss->rss_key); #else memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); #endif sc->hn_flags |= HN_FLAG_HAS_RSSKEY; } if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { /* * RSS indirect table is not set yet; set it up in round- * robin fashion. */ if (bootverbose) { if_printf(sc->hn_ifp, "setup default RSS indirect " "table\n"); } for (i = 0; i < NDIS_HASH_INDCNT; ++i) { uint32_t subidx; #ifdef RSS subidx = rss_get_indirection_to_bucket(i); #else subidx = i; #endif rss->rss_ind[i] = subidx % nchan; } sc->hn_flags |= HN_FLAG_HAS_RSSIND; } else { /* * # of usable channels may be changed, so we have to * make sure that all entries in RSS indirect table * are valid. * * NOTE: hn_set_ring_inuse() _must_ have been called. */ hn_rss_ind_fixup(sc); } sc->hn_rss_hash = sc->hn_rss_hcap; if ((sc->hn_flags & HN_FLAG_RXVF) || (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { /* NOTE: Don't reconfigure RSS; will do immediately. */ hn_vf_rss_fixup(sc, false); } error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); if (error) goto failed; back: /* * Fixup transmission aggregation setup. */ hn_set_txagg(sc); hn_rndis_init_fixat(sc, nchan); return (0); failed: if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { hn_rndis_init_fixat(sc, nchan); hn_synth_detach(sc); } else { if (attached & ATTACHED_RNDIS) { hn_rndis_init_fixat(sc, nchan); hn_rndis_detach(sc); } if (attached & ATTACHED_NVS) hn_nvs_detach(sc); hn_chan_detach(sc, sc->hn_prichan); /* Restore old capabilities. */ sc->hn_caps = old_caps; } return (error); #undef ATTACHED_RNDIS #undef ATTACHED_NVS } /* * NOTE: * The interface must have been suspended though hn_suspend(), before * this function get called. */ static void hn_synth_detach(struct hn_softc *sc) { KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); /* Detach the RNDIS first. */ hn_rndis_detach(sc); /* Detach NVS. */ hn_nvs_detach(sc); /* Detach all of the channels. */ hn_detach_allchans(sc); if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_rxbuf_gpadl != 0) { /* * Host is post-Win2016, disconnect RXBUF from primary channel here. */ int error; error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, sc->hn_rxbuf_gpadl); if (error) { if_printf(sc->hn_ifp, "rxbuf gpadl disconn failed: %d\n", error); sc->hn_flags |= HN_FLAG_RXBUF_REF; } sc->hn_rxbuf_gpadl = 0; } if (vmbus_current_version >= VMBUS_VERSION_WIN10 && sc->hn_chim_gpadl != 0) { /* * Host is post-Win2016, disconnect chimney sending buffer from * primary channel here. */ int error; error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, sc->hn_chim_gpadl); if (error) { if_printf(sc->hn_ifp, "chim gpadl disconn failed: %d\n", error); sc->hn_flags |= HN_FLAG_CHIM_REF; } sc->hn_chim_gpadl = 0; } sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; } static void hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) { KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, ("invalid ring count %d", ring_cnt)); if (sc->hn_tx_ring_cnt > ring_cnt) sc->hn_tx_ring_inuse = ring_cnt; else sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_rx_ring_inuse = ring_cnt; #ifdef RSS if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, rss_getnumbuckets()); } #endif if (bootverbose) { if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); } } static void hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) { /* * NOTE: * The TX bufring will not be drained by the hypervisor, * if the primary channel is revoked. */ while (!vmbus_chan_rx_empty(chan) || (!vmbus_chan_is_revoked(sc->hn_prichan) && !vmbus_chan_tx_empty(chan))) pause("waitch", 1); vmbus_chan_intr_drain(chan); } static void hn_disable_rx(struct hn_softc *sc) { /* * Disable RX by clearing RX filter forcefully. */ sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ /* * Give RNDIS enough time to flush all pending data packets. */ pause("waitrx", (200 * hz) / 1000); } /* * NOTE: * RX/TX _must_ have been suspended/disabled, before this function * is called. */ static void hn_drain_rxtx(struct hn_softc *sc, int nchan) { struct vmbus_channel **subch = NULL; int nsubch; /* * Drain RX/TX bufrings and interrupts. */ nsubch = nchan - 1; if (nsubch > 0) subch = vmbus_subchan_get(sc->hn_prichan, nsubch); if (subch != NULL) { int i; for (i = 0; i < nsubch; ++i) hn_chan_drain(sc, subch[i]); } hn_chan_drain(sc, sc->hn_prichan); if (subch != NULL) vmbus_subchan_rel(subch, nsubch); } static void hn_suspend_data(struct hn_softc *sc) { struct hn_tx_ring *txr; int i; HN_LOCK_ASSERT(sc); /* * Suspend TX. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 1; mtx_unlock(&txr->hn_tx_lock); /* No one is able send more packets now. */ /* * Wait for all pending sends to finish. * * NOTE: * We will _not_ receive all pending send-done, if the * primary channel is revoked. */ while (hn_tx_ring_pending(txr) && !vmbus_chan_is_revoked(sc->hn_prichan)) pause("hnwtx", 1 /* 1 tick */); } /* * Disable RX. */ hn_disable_rx(sc); /* * Drain RX/TX. */ hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); /* * Drain any pending TX tasks. * * NOTE: * The above hn_drain_rxtx() can dispatch TX tasks, so the TX * tasks will have to be drained _after_ the above hn_drain_rxtx(). */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) { ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; } static void hn_suspend_mgmt(struct hn_softc *sc) { struct task task; HN_LOCK_ASSERT(sc); /* * Make sure that hn_mgmt_taskq0 can nolonger be accessed * through hn_mgmt_taskq. */ TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); vmbus_chan_run_task(sc->hn_prichan, &task); /* * Make sure that all pending management tasks are completed. */ taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); taskqueue_drain_all(sc->hn_mgmt_taskq0); } static void hn_suspend(struct hn_softc *sc) { /* Disable polling. */ hn_polling(sc, 0); /* * If the non-transparent mode VF is activated, the synthetic * device is receiving packets, so the data path of the * synthetic device must be suspended. */ if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) || (sc->hn_flags & HN_FLAG_RXVF)) hn_suspend_data(sc); hn_suspend_mgmt(sc); } static void hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) { int i; KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, ("invalid TX ring count %d", tx_ring_cnt)); for (i = 0; i < tx_ring_cnt; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 0; mtx_unlock(&txr->hn_tx_lock); } } static void hn_resume_data(struct hn_softc *sc) { int i; HN_LOCK_ASSERT(sc); /* * Re-enable RX. */ hn_rxfilter_config(sc); /* * Make sure to clear suspend status on "all" TX rings, * since hn_tx_ring_inuse can be changed after * hn_suspend_data(). */ hn_resume_tx(sc, sc->hn_tx_ring_cnt); #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { /* * Flush unused drbrs, since hn_tx_ring_inuse may be * reduced. */ for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) hn_tx_ring_qflush(&sc->hn_tx_ring[i]); } /* * Kick start TX. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; /* * Use txeof task, so that any pending oactive can be * cleared properly. */ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_resume_mgmt(struct hn_softc *sc) { sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; /* * Kick off network change detection, if it was pending. * If no network change was pending, start link status * checks, which is more lightweight than network change * detection. */ if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) hn_change_network(sc); else hn_update_link_status(sc); } static void hn_resume(struct hn_softc *sc) { /* * If the non-transparent mode VF is activated, the synthetic * device have to receive packets, so the data path of the * synthetic device must be resumed. */ if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) || (sc->hn_flags & HN_FLAG_RXVF)) hn_resume_data(sc); /* * Don't resume link status change if VF is attached/activated. * - In the non-transparent VF mode, the synthetic device marks * link down until the VF is deactivated; i.e. VF is down. * - In transparent VF mode, VF's media status is used until * the VF is detached. */ if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) hn_resume_mgmt(sc); /* * Re-enable polling if this interface is running and * the polling is requested. */ if ((if_getdrvflags(sc->hn_ifp) & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) hn_polling(sc, sc->hn_pollhz); } static void hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) { const struct rndis_status_msg *msg; int ofs; if (dlen < sizeof(*msg)) { if_printf(sc->hn_ifp, "invalid RNDIS status\n"); return; } msg = data; switch (msg->rm_status) { case RNDIS_STATUS_MEDIA_CONNECT: case RNDIS_STATUS_MEDIA_DISCONNECT: hn_update_link_status(sc); break; case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: case RNDIS_STATUS_LINK_SPEED_CHANGE: /* Not really useful; ignore. */ break; case RNDIS_STATUS_NETWORK_CHANGE: ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); if (dlen < ofs + msg->rm_stbuflen || msg->rm_stbuflen < sizeof(uint32_t)) { if_printf(sc->hn_ifp, "network changed\n"); } else { uint32_t change; memcpy(&change, ((const uint8_t *)msg) + ofs, sizeof(change)); if_printf(sc->hn_ifp, "network changed, change %u\n", change); } hn_change_network(sc); break; default: if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", msg->rm_status); break; } } static int hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) { const struct rndis_pktinfo *pi = info_data; uint32_t mask = 0; while (info_dlen != 0) { const void *data; uint32_t dlen; if (__predict_false(info_dlen < sizeof(*pi))) return (EINVAL); if (__predict_false(info_dlen < pi->rm_size)) return (EINVAL); info_dlen -= pi->rm_size; if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) return (EINVAL); if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) return (EINVAL); dlen = pi->rm_size - pi->rm_pktinfooffset; data = pi->rm_data; if (pi->rm_internal == 1) { switch (pi->rm_type) { case NDIS_PKTINFO_IT_PKTINFO_ID: if (__predict_false(dlen < NDIS_PKTINFOID_SZ)) return (EINVAL); info->pktinfo_id = (const struct packet_info_id *)data; mask |= HN_RXINFO_PKTINFO_ID; break; default: goto next; } } else { switch (pi->rm_type) { case NDIS_PKTINFO_TYPE_VLAN: if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) return (EINVAL); info->vlan_info = (const uint32_t *)data; mask |= HN_RXINFO_VLAN; break; case NDIS_PKTINFO_TYPE_CSUM: if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) return (EINVAL); info->csum_info = (const uint32_t *)data; mask |= HN_RXINFO_CSUM; break; case HN_NDIS_PKTINFO_TYPE_HASHVAL: if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) return (EINVAL); info->hash_value = (const uint32_t *)data; mask |= HN_RXINFO_HASHVAL; break; case HN_NDIS_PKTINFO_TYPE_HASHINF: if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) return (EINVAL); info->hash_info = (const uint32_t *)data; mask |= HN_RXINFO_HASHINF; break; default: goto next; } } if (mask == HN_RXINFO_ALL) { /* All found; done */ break; } next: pi = (const struct rndis_pktinfo *) ((const uint8_t *)pi + pi->rm_size); } /* * Final fixup. * - If there is no hash value, invalidate the hash info. */ if ((mask & HN_RXINFO_HASHVAL) == 0) info->hash_info = NULL; return (0); } static __inline bool hn_rndis_check_overlap(int off, int len, int check_off, int check_len) { if (off < check_off) { if (__predict_true(off + len <= check_off)) return (false); } else if (off > check_off) { if (__predict_true(check_off + check_len <= off)) return (false); } return (true); } static __inline void hn_rsc_add_data(struct hn_rx_ring *rxr, const void *data, uint32_t len, struct hn_rxinfo *info) { uint32_t cnt = rxr->rsc.cnt; if (cnt) { rxr->rsc.pktlen += len; } else { rxr->rsc.vlan_info = info->vlan_info; rxr->rsc.csum_info = info->csum_info; rxr->rsc.hash_info = info->hash_info; rxr->rsc.hash_value = info->hash_value; rxr->rsc.pktlen = len; } rxr->rsc.frag_data[cnt] = data; rxr->rsc.frag_len[cnt] = len; rxr->rsc.cnt++; } static void hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) { const struct rndis_packet_msg *pkt; struct hn_rxinfo info; int data_off, pktinfo_off, data_len, pktinfo_len; bool rsc_more= false; /* * Check length. */ if (__predict_false(dlen < sizeof(*pkt))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); return; } pkt = data; if (__predict_false(dlen < pkt->rm_len)) { if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " "dlen %d, msglen %u\n", dlen, pkt->rm_len); return; } if (__predict_false(pkt->rm_len < pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " "msglen %u, data %u, oob %u, pktinfo %u\n", pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, pkt->rm_pktinfolen); return; } if (__predict_false(pkt->rm_datalen == 0)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); return; } /* * Check offests. */ #define IS_OFFSET_INVALID(ofs) \ ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) /* XXX Hyper-V does not meet data offset alignment requirement */ if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "data offset %u\n", pkt->rm_dataoffset); return; } if (__predict_false(pkt->rm_oobdataoffset > 0 && IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob offset %u\n", pkt->rm_oobdataoffset); return; } if (__predict_true(pkt->rm_pktinfooffset > 0) && __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo offset %u\n", pkt->rm_pktinfooffset); return; } #undef IS_OFFSET_INVALID data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); data_len = pkt->rm_datalen; pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); pktinfo_len = pkt->rm_pktinfolen; /* * Check OOB coverage. */ if (__predict_false(pkt->rm_oobdatalen != 0)) { int oob_off, oob_len; if_printf(rxr->hn_ifp, "got oobdata\n"); oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); oob_len = pkt->rm_oobdatalen; if (__predict_false(oob_off + oob_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overflow, msglen %u, oob abs %d len %d\n", pkt->rm_len, oob_off, oob_len); return; } /* * Check against data. */ if (hn_rndis_check_overlap(oob_off, oob_len, data_off, data_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overlaps data, oob abs %d len %d, " "data abs %d len %d\n", oob_off, oob_len, data_off, data_len); return; } /* * Check against pktinfo. */ if (pktinfo_len != 0 && hn_rndis_check_overlap(oob_off, oob_len, pktinfo_off, pktinfo_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overlaps pktinfo, oob abs %d len %d, " "pktinfo abs %d len %d\n", oob_off, oob_len, pktinfo_off, pktinfo_len); return; } } /* * Check per-packet-info coverage and find useful per-packet-info. */ info.vlan_info = NULL; info.csum_info = NULL; info.hash_info = NULL; info.pktinfo_id = NULL; if (__predict_true(pktinfo_len != 0)) { bool overlap; int error; if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo overflow, msglen %u, " "pktinfo abs %d len %d\n", pkt->rm_len, pktinfo_off, pktinfo_len); return; } /* * Check packet info coverage. */ overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, data_off, data_len); if (__predict_false(overlap)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo overlap data, pktinfo abs %d len %d, " "data abs %d len %d\n", pktinfo_off, pktinfo_len, data_off, data_len); return; } /* * Find useful per-packet-info. */ error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, pktinfo_len, &info); if (__predict_false(error)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " "pktinfo\n"); return; } } if (__predict_false(data_off + data_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "data overflow, msglen %u, data abs %d len %d\n", pkt->rm_len, data_off, data_len); return; } /* Identify RSC fragments, drop invalid packets */ if ((info.pktinfo_id != NULL) && (info.pktinfo_id->flag & HN_NDIS_PKTINFO_SUBALLOC)) { if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_1ST_FRAG) { rxr->rsc.cnt = 0; rxr->hn_rsc_pkts++; } else if (rxr->rsc.cnt == 0) goto drop; rsc_more = true; if (info.pktinfo_id->flag & HN_NDIS_PKTINFO_LAST_FRAG) rsc_more = false; if (rsc_more && rxr->rsc.is_last) goto drop; } else { rxr->rsc.cnt = 0; } if (__predict_false(rxr->rsc.cnt >= HN_NVS_RSC_MAX)) goto drop; /* Store data in per rx ring structure */ hn_rsc_add_data(rxr,((const uint8_t *)pkt) + data_off, data_len, &info); if (rsc_more) return; hn_rxpkt(rxr); rxr->rsc.cnt = 0; return; drop: rxr->hn_rsc_drop++; return; } static __inline void hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) { const struct rndis_msghdr *hdr; if (__predict_false(dlen < sizeof(*hdr))) { if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); return; } hdr = data; if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { /* Hot data path. */ hn_rndis_rx_data(rxr, data, dlen); /* Done! */ return; } if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) hn_rndis_rx_status(if_getsoftc(rxr->hn_ifp), data, dlen); else hn_rndis_rx_ctrl(if_getsoftc(rxr->hn_ifp), data, dlen); } static void hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) { const struct hn_nvs_hdr *hdr; if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { if_printf(sc->hn_ifp, "invalid nvs notify\n"); return; } hdr = VMBUS_CHANPKT_CONST_DATA(pkt); if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { /* Useless; ignore */ return; } if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); } static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkt) { struct hn_nvs_sendctx *sndc; sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), VMBUS_CHANPKT_DATALEN(pkt)); /* * NOTE: * 'sndc' CAN NOT be accessed anymore, since it can be freed by * its callback. */ } static void hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr) { struct epoch_tracker et; const struct vmbus_chanpkt_rxbuf *pkt; const struct hn_nvs_hdr *nvs_hdr; int count, i, hlen; if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); return; } nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); /* Make sure that this is a RNDIS message. */ if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", nvs_hdr->nvs_type); return; } hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); if (__predict_false(hlen < sizeof(*pkt))) { if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); return; } pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", pkt->cp_rxbuf_id); return; } count = pkt->cp_rxbuf_cnt; if (__predict_false(hlen < __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); return; } NET_EPOCH_ENTER(et); /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ for (i = 0; i < count; ++i) { int ofs, len; ofs = pkt->cp_rxbuf[i].rb_ofs; len = pkt->cp_rxbuf[i].rb_len; if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " "ofs %d, len %d\n", i, ofs, len); continue; } rxr->rsc.is_last = (i == (count - 1)); hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); } NET_EPOCH_EXIT(et); /* * Ack the consumed RXBUF associated w/ this channel packet, * so that this RXBUF can be recycled by the hypervisor. */ hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); } static void hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, uint64_t tid) { struct hn_nvs_rndis_ack ack; int retries, error; ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; ack.nvs_status = HN_NVS_STATUS_OK; retries = 0; again: error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); if (__predict_false(error == EAGAIN)) { /* * NOTE: * This should _not_ happen in real world, since the * consumption of the TX bufring from the TX path is * controlled. */ if (rxr->hn_ack_failed == 0) if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); rxr->hn_ack_failed++; retries++; if (retries < 10) { DELAY(100); goto again; } /* RXBUF leaks! */ if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); } } static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr) { struct hn_rx_ring *rxr = xrxr; struct hn_softc *sc = if_getsoftc(rxr->hn_ifp); for (;;) { struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; int error, pktlen; pktlen = rxr->hn_pktbuf_len; error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); if (__predict_false(error == ENOBUFS)) { void *nbuf; int nlen; /* * Expand channel packet buffer. * * XXX * Use M_WAITOK here, since allocation failure * is fatal. */ nlen = rxr->hn_pktbuf_len * 2; while (nlen < pktlen) nlen *= 2; nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", rxr->hn_pktbuf_len, nlen); free(rxr->hn_pktbuf, M_DEVBUF); rxr->hn_pktbuf = nbuf; rxr->hn_pktbuf_len = nlen; /* Retry! */ continue; } else if (__predict_false(error == EAGAIN)) { /* No more channel packets; done! */ break; } KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); switch (pkt->cph_type) { case VMBUS_CHANPKT_TYPE_COMP: hn_nvs_handle_comp(sc, chan, pkt); break; case VMBUS_CHANPKT_TYPE_RXBUF: hn_nvs_handle_rxbuf(rxr, chan, pkt); break; case VMBUS_CHANPKT_TYPE_INBAND: hn_nvs_handle_notify(sc, pkt); break; default: if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", pkt->cph_type); break; } } hn_chan_rollup(rxr, rxr->hn_txr); } static void hn_sysinit(void *arg __unused) { int i; hn_udpcs_fixup = counter_u64_alloc(M_WAITOK); #ifdef HN_IFSTART_SUPPORT /* * Don't use ifnet.if_start if transparent VF mode is requested; * mainly due to the IFF_DRV_OACTIVE flag. */ if (hn_xpnt_vf && hn_use_if_start) { hn_use_if_start = 0; printf("hn: tranparent VF mode, if_transmit will be used, " "instead of if_start\n"); } #endif if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { printf("hn: invalid transparent VF attach routing " "wait timeout %d, reset to %d\n", hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; } /* * Initialize VF map. */ rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); hn_vfmap_size = HN_VFMAP_SIZE_DEF; hn_vfmap = malloc(sizeof(if_t) * hn_vfmap_size, M_DEVBUF, M_WAITOK | M_ZERO); /* * Fix the # of TX taskqueues. */ if (hn_tx_taskq_cnt <= 0) hn_tx_taskq_cnt = 1; else if (hn_tx_taskq_cnt > mp_ncpus) hn_tx_taskq_cnt = mp_ncpus; /* * Fix the TX taskqueue mode. */ switch (hn_tx_taskq_mode) { case HN_TX_TASKQ_M_INDEP: case HN_TX_TASKQ_M_GLOBAL: case HN_TX_TASKQ_M_EVTTQ: break; default: hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; break; } if (vm_guest != VM_GUEST_HV) return; if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) return; hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), M_DEVBUF, M_WAITOK); for (i = 0; i < hn_tx_taskq_cnt; ++i) { hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &hn_tx_taskque[i]); taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, "hn tx%d", i); } } SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); static void hn_sysuninit(void *arg __unused) { if (hn_tx_taskque != NULL) { int i; for (i = 0; i < hn_tx_taskq_cnt; ++i) taskqueue_free(hn_tx_taskque[i]); free(hn_tx_taskque, M_DEVBUF); } if (hn_vfmap != NULL) free(hn_vfmap, M_DEVBUF); rm_destroy(&hn_vfmap_lock); counter_u64_free(hn_udpcs_fixup); } SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); diff --git a/sys/dev/hyperv/netvsc/if_hnvar.h b/sys/dev/hyperv/netvsc/if_hnvar.h index b85513308267..cc1f53b91876 100644 --- a/sys/dev/hyperv/netvsc/if_hnvar.h +++ b/sys/dev/hyperv/netvsc/if_hnvar.h @@ -1,340 +1,337 @@ /*- * Copyright (c) 2016-2017 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _IF_HNVAR_H_ #define _IF_HNVAR_H_ #define HN_USE_TXDESC_BUFRING #define HN_CHIM_SIZE (15 * 1024 * 1024) #define HN_RXBUF_SIZE (31 * 1024 * 1024) #define HN_RXBUF_SIZE_COMPAT (15 * 1024 * 1024) #define HN_MTU_MAX (65535 - ETHER_ADDR_LEN) #define HN_TXBR_SIZE (128 * PAGE_SIZE) #define HN_RXBR_SIZE (128 * PAGE_SIZE) #define HN_XACT_REQ_PGCNT 2 #define HN_XACT_RESP_PGCNT 2 #define HN_XACT_REQ_SIZE (HN_XACT_REQ_PGCNT * PAGE_SIZE) #define HN_XACT_RESP_SIZE (HN_XACT_RESP_PGCNT * PAGE_SIZE) #define HN_GPACNT_MAX 32 struct hn_txdesc; #ifndef HN_USE_TXDESC_BUFRING SLIST_HEAD(hn_txdesc_list, hn_txdesc); #else struct buf_ring; #endif struct hn_tx_ring; #define HN_NVS_RSC_MAX 562 /* Max RSC frags in one vmbus packet */ struct hn_rx_rsc { const uint32_t *vlan_info; const uint32_t *csum_info; const uint32_t *hash_info; const uint32_t *hash_value; uint32_t cnt; /* fragment count */ uint32_t pktlen; /* full packet length */ uint8_t is_last; /* last fragment */ const void *frag_data[HN_NVS_RSC_MAX]; uint32_t frag_len[HN_NVS_RSC_MAX]; }; struct hn_rx_ring { if_t hn_ifp; if_t hn_rxvf_ifp; /* SR-IOV VF for RX */ struct hn_tx_ring *hn_txr; void *hn_pktbuf; int hn_pktbuf_len; int hn_rx_flags; /* HN_RX_FLAG_ */ uint32_t hn_mbuf_hash; /* NDIS_HASH_ */ uint8_t *hn_rxbuf; /* shadow sc->hn_rxbuf */ int hn_rx_idx; struct hn_rx_rsc rsc; /* Trust csum verification on host side */ int hn_trust_hcsum; /* HN_TRUST_HCSUM_ */ struct lro_ctrl hn_lro; u_long hn_csum_ip; u_long hn_csum_tcp; u_long hn_csum_udp; u_long hn_csum_trusted; u_long hn_lro_tried; u_long hn_small_pkts; u_long hn_pkts; u_long hn_rss_pkts; u_long hn_ack_failed; u_long hn_rsc_pkts; u_long hn_rsc_drop; /* Rarely used stuffs */ struct sysctl_oid *hn_rx_sysctl_tree; void *hn_br; /* TX/RX bufring */ - struct hyperv_dma hn_br_dma; struct vmbus_channel *hn_chan; } __aligned(CACHE_LINE_SIZE); #define HN_TRUST_HCSUM_IP 0x0001 #define HN_TRUST_HCSUM_TCP 0x0002 #define HN_TRUST_HCSUM_UDP 0x0004 #define HN_RX_FLAG_ATTACHED 0x0001 #define HN_RX_FLAG_BR_REF 0x0002 #define HN_RX_FLAG_XPNT_VF 0x0004 #define HN_RX_FLAG_UDP_HASH 0x0008 struct hn_tx_ring { #ifndef HN_USE_TXDESC_BUFRING struct mtx hn_txlist_spin; struct hn_txdesc_list hn_txlist; #else struct buf_ring *hn_txdesc_br; #endif int hn_txdesc_cnt; int hn_txdesc_avail; u_short hn_has_txeof; u_short hn_txdone_cnt; int hn_sched_tx; void (*hn_txeof)(struct hn_tx_ring *); struct taskqueue *hn_tx_taskq; struct task hn_tx_task; struct task hn_txeof_task; struct buf_ring *hn_mbuf_br; int hn_oactive; int hn_tx_idx; int hn_tx_flags; struct mtx hn_tx_lock; struct hn_softc *hn_sc; struct vmbus_channel *hn_chan; int hn_direct_tx_size; int hn_chim_size; bus_dma_tag_t hn_tx_data_dtag; uint64_t hn_csum_assist; /* Applied packet transmission aggregation limits. */ int hn_agg_szmax; short hn_agg_pktmax; short hn_agg_align; /* Packet transmission aggregation states. */ struct hn_txdesc *hn_agg_txd; int hn_agg_szleft; short hn_agg_pktleft; struct rndis_packet_msg *hn_agg_prevpkt; /* Temporary stats for each sends. */ int hn_stat_size; short hn_stat_pkts; short hn_stat_mcasts; int (*hn_sendpkt)(struct hn_tx_ring *, struct hn_txdesc *); int hn_suspended; int hn_gpa_cnt; struct vmbus_gpa hn_gpa[HN_GPACNT_MAX]; u_long hn_no_txdescs; u_long hn_send_failed; u_long hn_txdma_failed; u_long hn_tx_collapsed; u_long hn_tx_chimney_tried; u_long hn_tx_chimney; u_long hn_pkts; u_long hn_sends; u_long hn_flush_failed; /* Rarely used stuffs */ struct hn_txdesc *hn_txdesc; bus_dma_tag_t hn_tx_rndis_dtag; struct sysctl_oid *hn_tx_sysctl_tree; } __aligned(CACHE_LINE_SIZE); #define HN_TX_FLAG_ATTACHED 0x0001 #define HN_TX_FLAG_HASHVAL 0x0002 /* support HASHVAL pktinfo */ /* * Device-specific softc structure */ struct hn_softc { if_t hn_ifp; struct ifmedia hn_media; device_t hn_dev; int hn_if_flags; struct sx hn_lock; struct vmbus_channel *hn_prichan; int hn_rx_ring_cnt; int hn_rx_ring_inuse; struct hn_rx_ring *hn_rx_ring; struct rmlock hn_vf_lock; if_t hn_vf_ifp; /* SR-IOV VF */ uint32_t hn_xvf_flags; /* transparent VF flags */ int hn_tx_ring_cnt; int hn_tx_ring_inuse; struct hn_tx_ring *hn_tx_ring; uint8_t *hn_chim; u_long *hn_chim_bmap; int hn_chim_bmap_cnt; int hn_chim_cnt; int hn_chim_szmax; int hn_cpu; struct taskqueue **hn_tx_taskqs; struct sysctl_oid *hn_tx_sysctl_tree; struct sysctl_oid *hn_rx_sysctl_tree; struct vmbus_xact_ctx *hn_xact; uint32_t hn_nvs_ver; uint32_t hn_rx_filter; /* Packet transmission aggregation user settings. */ int hn_agg_size; int hn_agg_pkts; struct taskqueue *hn_mgmt_taskq; struct taskqueue *hn_mgmt_taskq0; struct task hn_link_task; struct task hn_netchg_init; struct timeout_task hn_netchg_status; uint32_t hn_link_flags; /* HN_LINK_FLAG_ */ uint32_t hn_caps; /* HN_CAP_ */ uint32_t hn_flags; /* HN_FLAG_ */ u_int hn_pollhz; void *hn_rxbuf; uint32_t hn_rxbuf_gpadl; - struct hyperv_dma hn_rxbuf_dma; uint32_t hn_chim_gpadl; - struct hyperv_dma hn_chim_dma; uint32_t hn_rndis_rid; uint32_t hn_ndis_ver; int hn_ndis_tso_szmax; int hn_ndis_tso_sgmin; uint32_t hn_rndis_agg_size; uint32_t hn_rndis_agg_pkts; uint32_t hn_rndis_agg_align; int hn_rss_ind_size; uint32_t hn_rss_hash; /* setting, NDIS_HASH_ */ uint32_t hn_rss_hcap; /* caps, NDIS_HASH_ */ struct ndis_rssprm_toeplitz hn_rss; eventhandler_tag hn_ifaddr_evthand; eventhandler_tag hn_ifnet_evthand; eventhandler_tag hn_ifnet_atthand; eventhandler_tag hn_ifnet_dethand; eventhandler_tag hn_ifnet_lnkhand; /* * Transparent VF delayed initialization. */ int hn_vf_rdytick; /* ticks, 0 == ready */ struct taskqueue *hn_vf_taskq; struct timeout_task hn_vf_init; /* * Saved information for VF under transparent mode. */ void (*hn_vf_input) (if_t, struct mbuf *); int hn_saved_caps; u_int hn_saved_tsomax; u_int hn_saved_tsosegcnt; u_int hn_saved_tsosegsz; /* * RSC switch, default off */ u_int hn_rsc_ctrl; }; #define HN_FLAG_RXBUF_CONNECTED 0x0001 #define HN_FLAG_CHIM_CONNECTED 0x0002 #define HN_FLAG_HAS_RSSKEY 0x0004 #define HN_FLAG_HAS_RSSIND 0x0008 #define HN_FLAG_SYNTH_ATTACHED 0x0010 #define HN_FLAG_NO_SLEEPING 0x0020 #define HN_FLAG_RXBUF_REF 0x0040 #define HN_FLAG_CHIM_REF 0x0080 #define HN_FLAG_RXVF 0x0100 #define HN_FLAG_ERRORS (HN_FLAG_RXBUF_REF | HN_FLAG_CHIM_REF) #define HN_XVFFLAG_ENABLED 0x0001 #define HN_XVFFLAG_ACCBPF 0x0002 #define HN_NO_SLEEPING(sc) \ do { \ (sc)->hn_flags |= HN_FLAG_NO_SLEEPING; \ } while (0) #define HN_SLEEPING_OK(sc) \ do { \ (sc)->hn_flags &= ~HN_FLAG_NO_SLEEPING; \ } while (0) #define HN_CAN_SLEEP(sc) \ (((sc)->hn_flags & HN_FLAG_NO_SLEEPING) == 0) #define HN_CAP_VLAN 0x0001 #define HN_CAP_MTU 0x0002 #define HN_CAP_IPCS 0x0004 #define HN_CAP_TCP4CS 0x0008 #define HN_CAP_TCP6CS 0x0010 #define HN_CAP_UDP4CS 0x0020 #define HN_CAP_UDP6CS 0x0040 #define HN_CAP_TSO4 0x0080 #define HN_CAP_TSO6 0x0100 #define HN_CAP_HASHVAL 0x0200 #define HN_CAP_UDPHASH 0x0400 /* Capability description for use with printf(9) %b identifier. */ #define HN_CAP_BITS \ "\020\1VLAN\2MTU\3IPCS\4TCP4CS\5TCP6CS" \ "\6UDP4CS\7UDP6CS\10TSO4\11TSO6\12HASHVAL\13UDPHASH" #define HN_LINK_FLAG_LINKUP 0x0001 #define HN_LINK_FLAG_NETCHG 0x0002 #endif /* !_IF_HNVAR_H_ */ diff --git a/sys/dev/hyperv/pcib/vmbus_pcib.c b/sys/dev/hyperv/pcib/vmbus_pcib.c index d68ad6821ad8..a684e79fbd14 100644 --- a/sys/dev/hyperv/pcib/vmbus_pcib.c +++ b/sys/dev/hyperv/pcib/vmbus_pcib.c @@ -1,2004 +1,2003 @@ /*- * Copyright (c) 2016-2017 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifdef NEW_PCIB #include "opt_acpi.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(__aarch64__) #include #endif #include #include #include #include #include #include #include #include #include #include #include "pcib_if.h" #if defined(__i386__) || defined(__amd64__) #include #include #endif #if defined(__aarch64__) #include #include #include #include #endif #include -#include #include #include #include #include "vmbus_if.h" struct completion { unsigned int done; struct mtx lock; }; static void init_completion(struct completion *c) { memset(c, 0, sizeof(*c)); mtx_init(&c->lock, "hvcmpl", NULL, MTX_DEF); c->done = 0; } static void reinit_completion(struct completion *c) { c->done = 0; } static void free_completion(struct completion *c) { mtx_destroy(&c->lock); } static void complete(struct completion *c) { mtx_lock(&c->lock); c->done++; mtx_unlock(&c->lock); wakeup(c); } static void wait_for_completion(struct completion *c) { mtx_lock(&c->lock); while (c->done == 0) mtx_sleep(c, &c->lock, 0, "hvwfc", 0); c->done--; mtx_unlock(&c->lock); } /* * Return: 0 if completed, a non-zero value if timed out. */ static int wait_for_completion_timeout(struct completion *c, int timeout) { int ret; mtx_lock(&c->lock); if (c->done == 0) mtx_sleep(c, &c->lock, 0, "hvwfc", timeout); if (c->done > 0) { c->done--; ret = 0; } else { ret = 1; } mtx_unlock(&c->lock); return (ret); } #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #define PCI_MAKE_VERSION(major, minor) ((uint32_t)(((major) << 16) | (minor))) enum pci_protocol_version_t { PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), PCI_PROTOCOL_VERSION_1_4 = PCI_MAKE_VERSION(1, 4), }; static enum pci_protocol_version_t pci_protocol_versions[] = { PCI_PROTOCOL_VERSION_1_4, PCI_PROTOCOL_VERSION_1_1, }; #define PCI_CONFIG_MMIO_LENGTH 0x2000 #define CFG_PAGE_OFFSET 0x1000 #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET) /* * Message Types */ enum pci_message_type { /* * Version 1.1 */ PCI_MESSAGE_BASE = 0x42490000, PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0, PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1, PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4, PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5, PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6, PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7, PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8, PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9, PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA, PCI_EJECT = PCI_MESSAGE_BASE + 0xB, PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC, PCI_REENABLE = PCI_MESSAGE_BASE + 0xD, PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE, PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF, PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10, PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11, PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12, PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13, PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14, PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15, PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16, PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17, PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */ PCI_BUS_RELATIONS2 = PCI_MESSAGE_BASE + 0x19, PCI_RESOURCES_ASSIGNED3 = PCI_MESSAGE_BASE + 0x1A, PCI_CREATE_INTERRUPT_MESSAGE3 = PCI_MESSAGE_BASE + 0x1B, PCI_MESSAGE_MAXIMUM }; #define STATUS_REVISION_MISMATCH 0xC0000059 /* * Structures defining the virtual PCI Express protocol. */ union pci_version { struct { uint16_t minor_version; uint16_t major_version; } parts; uint32_t version; } __packed; /* * This representation is the one used in Windows, which is * what is expected when sending this back and forth with * the Hyper-V parent partition. */ union win_slot_encoding { struct { uint32_t slot:5; uint32_t func:3; uint32_t reserved:24; } bits; uint32_t val; } __packed; struct pci_func_desc { uint16_t v_id; /* vendor ID */ uint16_t d_id; /* device ID */ uint8_t rev; uint8_t prog_intf; uint8_t subclass; uint8_t base_class; uint32_t subsystem_id; union win_slot_encoding wslot; uint32_t ser; /* serial number */ } __packed; struct pci_func_desc2 { uint16_t v_id; /* vendor ID */ uint16_t d_id; /* device ID */ uint8_t rev; uint8_t prog_intf; uint8_t subclass; uint8_t base_class; uint32_t subsystem_id; union win_slot_encoding wslot; uint32_t ser; /* serial number */ uint32_t flags; uint16_t virtual_numa_node; uint16_t reserved; } __packed; struct hv_msi_desc { uint8_t vector; uint8_t delivery_mode; uint16_t vector_count; uint32_t reserved; uint64_t cpu_mask; } __packed; struct hv_msi_desc3 { uint32_t vector; uint8_t delivery_mode; uint8_t reserved; uint16_t vector_count; uint16_t processor_count; uint16_t processor_array[32]; } __packed; struct tran_int_desc { uint16_t reserved; uint16_t vector_count; uint32_t data; uint64_t address; } __packed; struct pci_message { uint32_t type; } __packed; struct pci_child_message { struct pci_message message_type; union win_slot_encoding wslot; } __packed; struct pci_incoming_message { struct vmbus_chanpkt_hdr hdr; struct pci_message message_type; } __packed; struct pci_response { struct vmbus_chanpkt_hdr hdr; int32_t status; /* negative values are failures */ } __packed; struct pci_packet { void (*completion_func)(void *context, struct pci_response *resp, int resp_packet_size); void *compl_ctxt; struct pci_message message[0]; }; /* * Specific message types supporting the PCI protocol. */ struct pci_version_request { struct pci_message message_type; uint32_t protocol_version; uint32_t reservedz:31; } __packed; struct pci_bus_d0_entry { struct pci_message message_type; uint32_t reserved; uint64_t mmio_base; } __packed; struct pci_bus_relations { struct pci_incoming_message incoming; uint32_t device_count; struct pci_func_desc func[0]; } __packed; struct pci_bus_relations2 { struct pci_incoming_message incoming; uint32_t device_count; struct pci_func_desc2 func[0]; } __packed; #define MAX_NUM_BARS (PCIR_MAX_BAR_0 + 1) struct pci_q_res_req_response { struct vmbus_chanpkt_hdr hdr; int32_t status; /* negative values are failures */ uint32_t probed_bar[MAX_NUM_BARS]; } __packed; struct pci_resources_assigned { struct pci_message message_type; union win_slot_encoding wslot; uint8_t memory_range[0x14][MAX_NUM_BARS]; /* unused here */ uint32_t msi_descriptors; uint32_t reserved[4]; } __packed; struct pci_resources_assigned2 { struct pci_message message_type; union win_slot_encoding wslot; uint8_t memory_range[0x14][6]; /* not used here */ uint32_t msi_descriptor_count; uint8_t reserved[70]; } __packed; struct pci_create_interrupt { struct pci_message message_type; union win_slot_encoding wslot; struct hv_msi_desc int_desc; } __packed; struct pci_create_interrupt3 { struct pci_message message_type; union win_slot_encoding wslot; struct hv_msi_desc3 int_desc; } __packed; struct pci_create_int_response { struct pci_response response; uint32_t reserved; struct tran_int_desc int_desc; } __packed; struct pci_delete_interrupt { struct pci_message message_type; union win_slot_encoding wslot; struct tran_int_desc int_desc; } __packed; struct pci_dev_incoming { struct pci_incoming_message incoming; union win_slot_encoding wslot; } __packed; struct pci_eject_response { struct pci_message message_type; union win_slot_encoding wslot; uint32_t status; } __packed; /* * Driver specific state. */ enum hv_pcibus_state { hv_pcibus_init = 0, hv_pcibus_installed, }; struct hv_pcibus { device_t pcib; device_t pci_bus; struct vmbus_pcib_softc *sc; uint16_t pci_domain; enum hv_pcibus_state state; struct resource *cfg_res; struct completion query_completion, *query_comp; struct mtx config_lock; /* Avoid two threads writing index page */ struct mtx device_list_lock; /* Protect lists below */ uint32_t protocol_version; TAILQ_HEAD(, hv_pci_dev) children; TAILQ_HEAD(, hv_dr_state) dr_list; volatile int detaching; }; struct hv_pcidev_desc { uint16_t v_id; /* vendor ID */ uint16_t d_id; /* device ID */ uint8_t rev; uint8_t prog_intf; uint8_t subclass; uint8_t base_class; uint32_t subsystem_id; union win_slot_encoding wslot; uint32_t ser; /* serial number */ uint32_t flags; uint16_t virtual_numa_node; } __packed; struct hv_pci_dev { TAILQ_ENTRY(hv_pci_dev) link; struct hv_pcidev_desc desc; bool reported_missing; struct hv_pcibus *hbus; struct task eject_task; TAILQ_HEAD(, hv_irq_desc) irq_desc_list; /* * What would be observed if one wrote 0xFFFFFFFF to a BAR and then * read it back, for each of the BAR offsets within config space. */ uint32_t probed_bar[MAX_NUM_BARS]; }; /* * Tracks "Device Relations" messages from the host, which must be both * processed in order. */ struct hv_dr_work { struct task task; struct hv_pcibus *bus; }; struct hv_dr_state { TAILQ_ENTRY(hv_dr_state) link; uint32_t device_count; struct hv_pcidev_desc func[0]; }; struct hv_irq_desc { TAILQ_ENTRY(hv_irq_desc) link; struct tran_int_desc desc; int irq; }; #define PCI_DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07)) #define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) #define PCI_FUNC(devfn) ((devfn) & 0x07) static uint32_t devfn_to_wslot(unsigned int devfn) { union win_slot_encoding wslot; wslot.val = 0; wslot.bits.slot = PCI_SLOT(devfn); wslot.bits.func = PCI_FUNC(devfn); return (wslot.val); } static unsigned int wslot_to_devfn(uint32_t wslot) { union win_slot_encoding encoding; unsigned int slot; unsigned int func; encoding.val = wslot; slot = encoding.bits.slot; func = encoding.bits.func; return (PCI_DEVFN(slot, func)); } struct vmbus_pcib_softc { struct vmbus_channel *chan; void *rx_buf; struct taskqueue *taskq; struct hv_pcibus *hbus; }; /* {44C4F61D-4444-4400-9D52-802E27EDE19F} */ static const struct hyperv_guid g_pass_through_dev_type = { .hv_guid = {0x1D, 0xF6, 0xC4, 0x44, 0x44, 0x44, 0x00, 0x44, 0x9D, 0x52, 0x80, 0x2E, 0x27, 0xED, 0xE1, 0x9F} }; struct hv_pci_compl { struct completion host_event; int32_t completion_status; }; struct q_res_req_compl { struct completion host_event; struct hv_pci_dev *hpdev; }; struct compose_comp_ctxt { struct hv_pci_compl comp_pkt; struct tran_int_desc int_desc; }; /* * It is possible the device is revoked during initialization. * Check if this happens during wait. * Return: 0 if response arrived, ENODEV if device revoked. */ static int wait_for_response(struct hv_pcibus *hbus, struct completion *c) { do { if (vmbus_chan_is_revoked(hbus->sc->chan)) { device_printf(hbus->pcib, "The device is revoked.\n"); return (ENODEV); } } while (wait_for_completion_timeout(c, hz /10) != 0); return 0; } static void hv_pci_generic_compl(void *context, struct pci_response *resp, int resp_packet_size) { struct hv_pci_compl *comp_pkt = context; if (resp_packet_size >= sizeof(struct pci_response)) comp_pkt->completion_status = resp->status; else comp_pkt->completion_status = -1; complete(&comp_pkt->host_event); } static void q_resource_requirements(void *context, struct pci_response *resp, int resp_packet_size) { struct q_res_req_compl *completion = context; struct pci_q_res_req_response *q_res_req = (struct pci_q_res_req_response *)resp; int i; if (resp->status < 0) { printf("vmbus_pcib: failed to query resource requirements\n"); } else { for (i = 0; i < MAX_NUM_BARS; i++) completion->hpdev->probed_bar[i] = q_res_req->probed_bar[i]; } complete(&completion->host_event); } static void hv_pci_compose_compl(void *context, struct pci_response *resp, int resp_packet_size) { struct compose_comp_ctxt *comp_pkt = context; struct pci_create_int_response *int_resp = (struct pci_create_int_response *)resp; comp_pkt->comp_pkt.completion_status = resp->status; comp_pkt->int_desc = int_resp->int_desc; complete(&comp_pkt->comp_pkt.host_event); } static void hv_int_desc_free(struct hv_pci_dev *hpdev, struct hv_irq_desc *hid) { struct pci_delete_interrupt *int_pkt; struct { struct pci_packet pkt; uint8_t buffer[sizeof(struct pci_delete_interrupt)]; } ctxt; memset(&ctxt, 0, sizeof(ctxt)); int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; int_pkt->message_type.type = PCI_DELETE_INTERRUPT_MESSAGE; int_pkt->wslot.val = hpdev->desc.wslot.val; int_pkt->int_desc = hid->desc; vmbus_chan_send(hpdev->hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, int_pkt, sizeof(*int_pkt), 0); free(hid, M_DEVBUF); } static void hv_pci_delete_device(struct hv_pci_dev *hpdev) { struct hv_pcibus *hbus = hpdev->hbus; struct hv_irq_desc *hid, *tmp_hid; device_t pci_dev; int devfn; devfn = wslot_to_devfn(hpdev->desc.wslot.val); bus_topo_lock(); pci_dev = pci_find_dbsf(hbus->pci_domain, 0, PCI_SLOT(devfn), PCI_FUNC(devfn)); if (pci_dev) device_delete_child(hbus->pci_bus, pci_dev); bus_topo_unlock(); mtx_lock(&hbus->device_list_lock); TAILQ_REMOVE(&hbus->children, hpdev, link); mtx_unlock(&hbus->device_list_lock); TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid) hv_int_desc_free(hpdev, hid); free(hpdev, M_DEVBUF); } static struct hv_pci_dev * new_pcichild_device(struct hv_pcibus *hbus, struct hv_pcidev_desc *desc) { struct hv_pci_dev *hpdev; struct pci_child_message *res_req; struct q_res_req_compl comp_pkt; struct { struct pci_packet pkt; uint8_t buffer[sizeof(struct pci_child_message)]; } ctxt; int ret; hpdev = malloc(sizeof(*hpdev), M_DEVBUF, M_WAITOK | M_ZERO); hpdev->hbus = hbus; TAILQ_INIT(&hpdev->irq_desc_list); init_completion(&comp_pkt.host_event); comp_pkt.hpdev = hpdev; ctxt.pkt.compl_ctxt = &comp_pkt; ctxt.pkt.completion_func = q_resource_requirements; res_req = (struct pci_child_message *)&ctxt.pkt.message; res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS; res_req->wslot.val = desc->wslot.val; ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, res_req, sizeof(*res_req), (uint64_t)(uintptr_t)&ctxt.pkt); if (ret) goto err; if (wait_for_response(hbus, &comp_pkt.host_event)) goto err; free_completion(&comp_pkt.host_event); hpdev->desc = *desc; mtx_lock(&hbus->device_list_lock); if (TAILQ_EMPTY(&hbus->children)) hbus->pci_domain = desc->ser & 0xFFFF; TAILQ_INSERT_TAIL(&hbus->children, hpdev, link); mtx_unlock(&hbus->device_list_lock); return (hpdev); err: free_completion(&comp_pkt.host_event); free(hpdev, M_DEVBUF); return (NULL); } static int pci_rescan(device_t dev) { return (BUS_RESCAN(dev)); } static void pci_devices_present_work(void *arg, int pending __unused) { struct hv_dr_work *dr_wrk = arg; struct hv_dr_state *dr = NULL; struct hv_pcibus *hbus; uint32_t child_no; bool found; struct hv_pcidev_desc *new_desc; struct hv_pci_dev *hpdev, *tmp_hpdev; struct completion *query_comp; bool need_rescan = false; hbus = dr_wrk->bus; free(dr_wrk, M_DEVBUF); /* Pull this off the queue and process it if it was the last one. */ mtx_lock(&hbus->device_list_lock); while (!TAILQ_EMPTY(&hbus->dr_list)) { dr = TAILQ_FIRST(&hbus->dr_list); TAILQ_REMOVE(&hbus->dr_list, dr, link); /* Throw this away if the list still has stuff in it. */ if (!TAILQ_EMPTY(&hbus->dr_list)) { free(dr, M_DEVBUF); continue; } } mtx_unlock(&hbus->device_list_lock); if (!dr) return; /* First, mark all existing children as reported missing. */ mtx_lock(&hbus->device_list_lock); TAILQ_FOREACH(hpdev, &hbus->children, link) hpdev->reported_missing = true; mtx_unlock(&hbus->device_list_lock); /* Next, add back any reported devices. */ for (child_no = 0; child_no < dr->device_count; child_no++) { found = false; new_desc = &dr->func[child_no]; mtx_lock(&hbus->device_list_lock); TAILQ_FOREACH(hpdev, &hbus->children, link) { if ((hpdev->desc.wslot.val == new_desc->wslot.val) && (hpdev->desc.v_id == new_desc->v_id) && (hpdev->desc.d_id == new_desc->d_id) && (hpdev->desc.ser == new_desc->ser)) { hpdev->reported_missing = false; found = true; break; } } mtx_unlock(&hbus->device_list_lock); if (!found) { if (!need_rescan) need_rescan = true; hpdev = new_pcichild_device(hbus, new_desc); if (!hpdev) printf("vmbus_pcib: failed to add a child\n"); } } /* Remove missing device(s), if any */ TAILQ_FOREACH_SAFE(hpdev, &hbus->children, link, tmp_hpdev) { if (hpdev->reported_missing) hv_pci_delete_device(hpdev); } /* Rescan the bus to find any new device, if necessary. */ if (hbus->state == hv_pcibus_installed && need_rescan) pci_rescan(hbus->pci_bus); /* Wake up hv_pci_query_relations(), if it's waiting. */ query_comp = hbus->query_comp; if (query_comp) { hbus->query_comp = NULL; complete(query_comp); } free(dr, M_DEVBUF); } static struct hv_pci_dev * get_pcichild_wslot(struct hv_pcibus *hbus, uint32_t wslot) { struct hv_pci_dev *hpdev, *ret = NULL; mtx_lock(&hbus->device_list_lock); TAILQ_FOREACH(hpdev, &hbus->children, link) { if (hpdev->desc.wslot.val == wslot) { ret = hpdev; break; } } mtx_unlock(&hbus->device_list_lock); return (ret); } static void hv_pci_devices_present(struct hv_pcibus *hbus, struct pci_bus_relations *relations) { struct hv_dr_state *dr; struct hv_dr_work *dr_wrk; unsigned long dr_size; if (hbus->detaching && relations->device_count > 0) return; dr_size = offsetof(struct hv_dr_state, func) + (sizeof(struct pci_func_desc) * relations->device_count); dr = malloc(dr_size, M_DEVBUF, M_WAITOK | M_ZERO); dr->device_count = relations->device_count; if (dr->device_count != 0) memcpy(dr->func, relations->func, sizeof(struct hv_pcidev_desc) * dr->device_count); mtx_lock(&hbus->device_list_lock); TAILQ_INSERT_TAIL(&hbus->dr_list, dr, link); mtx_unlock(&hbus->device_list_lock); dr_wrk = malloc(sizeof(*dr_wrk), M_DEVBUF, M_WAITOK | M_ZERO); dr_wrk->bus = hbus; TASK_INIT(&dr_wrk->task, 0, pci_devices_present_work, dr_wrk); taskqueue_enqueue(hbus->sc->taskq, &dr_wrk->task); } static void hv_pci_devices_present2(struct hv_pcibus *hbus, struct pci_bus_relations2 *relations) { struct hv_dr_state *dr; struct hv_dr_work *dr_wrk; unsigned long dr_size; if (hbus->detaching && relations->device_count > 0) return; dr_size = offsetof(struct hv_dr_state, func) + (sizeof(struct pci_func_desc2) * relations->device_count); dr = malloc(dr_size, M_DEVBUF, M_WAITOK | M_ZERO); dr->device_count = relations->device_count; if (dr->device_count != 0) memcpy(dr->func, relations->func, sizeof(struct pci_func_desc2) * dr->device_count); mtx_lock(&hbus->device_list_lock); TAILQ_INSERT_TAIL(&hbus->dr_list, dr, link); mtx_unlock(&hbus->device_list_lock); dr_wrk = malloc(sizeof(*dr_wrk), M_DEVBUF, M_WAITOK | M_ZERO); dr_wrk->bus = hbus; TASK_INIT(&dr_wrk->task, 0, pci_devices_present_work, dr_wrk); taskqueue_enqueue(hbus->sc->taskq, &dr_wrk->task); } static void hv_eject_device_work(void *arg, int pending __unused) { struct hv_pci_dev *hpdev = arg; union win_slot_encoding wslot = hpdev->desc.wslot; struct hv_pcibus *hbus = hpdev->hbus; struct pci_eject_response *eject_pkt; struct { struct pci_packet pkt; uint8_t buffer[sizeof(struct pci_eject_response)]; } ctxt; hv_pci_delete_device(hpdev); memset(&ctxt, 0, sizeof(ctxt)); eject_pkt = (struct pci_eject_response *)&ctxt.pkt.message; eject_pkt->message_type.type = PCI_EJECTION_COMPLETE; eject_pkt->wslot.val = wslot.val; vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, eject_pkt, sizeof(*eject_pkt), 0); } static void hv_pci_eject_device(struct hv_pci_dev *hpdev) { struct hv_pcibus *hbus = hpdev->hbus; struct taskqueue *taskq; if (hbus->detaching) return; /* * Push this task into the same taskqueue on which * vmbus_pcib_attach() runs, so we're sure this task can't run * concurrently with vmbus_pcib_attach(). */ TASK_INIT(&hpdev->eject_task, 0, hv_eject_device_work, hpdev); taskq = vmbus_chan_mgmt_tq(hbus->sc->chan); taskqueue_enqueue(taskq, &hpdev->eject_task); } #define PCIB_PACKET_SIZE 0x100 static void vmbus_pcib_on_channel_callback(struct vmbus_channel *chan, void *arg) { struct vmbus_pcib_softc *sc = arg; struct hv_pcibus *hbus = sc->hbus; void *buffer; int bufferlen = PCIB_PACKET_SIZE; struct pci_packet *comp_packet; struct pci_response *response; struct pci_incoming_message *new_msg; struct pci_bus_relations *bus_rel; struct pci_bus_relations2 *bus_rel2; struct pci_dev_incoming *dev_msg; struct hv_pci_dev *hpdev; buffer = sc->rx_buf; do { struct vmbus_chanpkt_hdr *pkt = buffer; uint32_t bytes_rxed; int ret; bytes_rxed = bufferlen; ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed); if (ret == ENOBUFS) { /* Handle large packet */ if (bufferlen > PCIB_PACKET_SIZE) { free(buffer, M_DEVBUF); buffer = NULL; } /* alloc new buffer */ buffer = malloc(bytes_rxed, M_DEVBUF, M_WAITOK | M_ZERO); bufferlen = bytes_rxed; continue; } if (ret != 0) { /* ignore EIO or EAGAIN */ break; } if (bytes_rxed <= sizeof(struct pci_response)) continue; switch (pkt->cph_type) { case VMBUS_CHANPKT_TYPE_COMP: comp_packet = (struct pci_packet *)(uintptr_t)pkt->cph_xactid; response = (struct pci_response *)pkt; comp_packet->completion_func(comp_packet->compl_ctxt, response, bytes_rxed); break; case VMBUS_CHANPKT_TYPE_INBAND: new_msg = (struct pci_incoming_message *)buffer; switch (new_msg->message_type.type) { case PCI_BUS_RELATIONS: bus_rel = (struct pci_bus_relations *)buffer; if (bus_rel->device_count == 0) break; if (bytes_rxed < offsetof(struct pci_bus_relations, func) + (sizeof(struct pci_func_desc) * (bus_rel->device_count))) break; hv_pci_devices_present(hbus, bus_rel); break; case PCI_BUS_RELATIONS2: bus_rel2 = (struct pci_bus_relations2 *)buffer; if (bus_rel2->device_count == 0) break; if (bytes_rxed < offsetof(struct pci_bus_relations2, func) + (sizeof(struct pci_func_desc2) * (bus_rel2->device_count))) break; hv_pci_devices_present2(hbus, bus_rel2); case PCI_EJECT: dev_msg = (struct pci_dev_incoming *)buffer; hpdev = get_pcichild_wslot(hbus, dev_msg->wslot.val); if (hpdev) hv_pci_eject_device(hpdev); break; default: printf("vmbus_pcib: Unknown msg type 0x%x\n", new_msg->message_type.type); break; } break; default: printf("vmbus_pcib: Unknown VMBus msg type %hd\n", pkt->cph_type); break; } } while (1); if (bufferlen > PCIB_PACKET_SIZE) free(buffer, M_DEVBUF); } static int hv_pci_protocol_negotiation(struct hv_pcibus *hbus, enum pci_protocol_version_t version[], int num_version) { struct pci_version_request *version_req; struct hv_pci_compl comp_pkt; struct { struct pci_packet pkt; uint8_t buffer[sizeof(struct pci_version_request)]; } ctxt; int ret; int i; init_completion(&comp_pkt.host_event); ctxt.pkt.completion_func = hv_pci_generic_compl; ctxt.pkt.compl_ctxt = &comp_pkt; version_req = (struct pci_version_request *)&ctxt.pkt.message; version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION; for(i=0; i< num_version; i++) { version_req->protocol_version = version[i]; ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, version_req, sizeof(*version_req), (uint64_t)(uintptr_t)&ctxt.pkt); if (!ret) ret = wait_for_response(hbus, &comp_pkt.host_event); if (ret) { device_printf(hbus->pcib, "vmbus_pcib failed to request version: %d\n", ret); goto out; } if (comp_pkt.completion_status >= 0) { hbus->protocol_version = version[i]; device_printf(hbus->pcib, "PCI VMBus using version 0x%x\n", hbus->protocol_version); ret = 0; goto out; } if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) { device_printf(hbus->pcib, "vmbus_pcib version negotiation failed: %x\n", comp_pkt.completion_status); ret = EPROTO; goto out; } reinit_completion(&comp_pkt.host_event); } device_printf(hbus->pcib, "PCI pass-trhpugh VSP failed to find supported version\n"); out: free_completion(&comp_pkt.host_event); return (ret); } /* Ask the host to send along the list of child devices */ static int hv_pci_query_relations(struct hv_pcibus *hbus) { struct pci_message message; int ret; message.type = PCI_QUERY_BUS_RELATIONS; ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, &message, sizeof(message), 0); return (ret); } static int hv_pci_enter_d0(struct hv_pcibus *hbus) { struct pci_bus_d0_entry *d0_entry; struct hv_pci_compl comp_pkt; struct { struct pci_packet pkt; uint8_t buffer[sizeof(struct pci_bus_d0_entry)]; } ctxt; int ret; /* * Tell the host that the bus is ready to use, and moved into the * powered-on state. This includes telling the host which region * of memory-mapped I/O space has been chosen for configuration space * access. */ init_completion(&comp_pkt.host_event); ctxt.pkt.completion_func = hv_pci_generic_compl; ctxt.pkt.compl_ctxt = &comp_pkt; d0_entry = (struct pci_bus_d0_entry *)&ctxt.pkt.message; memset(d0_entry, 0, sizeof(*d0_entry)); d0_entry->message_type.type = PCI_BUS_D0ENTRY; d0_entry->mmio_base = rman_get_start(hbus->cfg_res); ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, d0_entry, sizeof(*d0_entry), (uint64_t)(uintptr_t)&ctxt.pkt); if (!ret) ret = wait_for_response(hbus, &comp_pkt.host_event); if (ret) goto out; if (comp_pkt.completion_status < 0) { device_printf(hbus->pcib, "vmbus_pcib failed to enable D0\n"); ret = EPROTO; } else { ret = 0; } out: free_completion(&comp_pkt.host_event); return (ret); } /* * It looks this is only needed by Windows VM, but let's send the message too * just to make the host happy. */ static int hv_send_resources_allocated(struct hv_pcibus *hbus) { struct pci_resources_assigned *res_assigned; struct pci_resources_assigned2 *res_assigned2; struct hv_pci_compl comp_pkt; struct hv_pci_dev *hpdev; struct pci_packet *pkt; uint32_t wslot; int ret = 0; size_t size_res; size_res = (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_4) ? sizeof(*res_assigned) : sizeof(*res_assigned2); pkt = malloc(sizeof(*pkt) + size_res, M_DEVBUF, M_WAITOK | M_ZERO); for (wslot = 0; wslot < 256; wslot++) { hpdev = get_pcichild_wslot(hbus, wslot); if (!hpdev) continue; init_completion(&comp_pkt.host_event); memset(pkt, 0, sizeof(*pkt) + size_res); pkt->completion_func = hv_pci_generic_compl; pkt->compl_ctxt = &comp_pkt; if (hbus->protocol_version < PCI_PROTOCOL_VERSION_1_4) { res_assigned = (struct pci_resources_assigned *)&pkt->message; res_assigned->message_type.type = PCI_RESOURCES_ASSIGNED; res_assigned->wslot.val = hpdev->desc.wslot.val; } else { res_assigned2 = (struct pci_resources_assigned2 *)&pkt->message; res_assigned2->message_type.type = PCI_RESOURCES_ASSIGNED2; res_assigned2->wslot.val = hpdev->desc.wslot.val; } ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, &pkt->message, size_res, (uint64_t)(uintptr_t)pkt); if (!ret) ret = wait_for_response(hbus, &comp_pkt.host_event); free_completion(&comp_pkt.host_event); if (ret) break; if (comp_pkt.completion_status < 0) { ret = EPROTO; device_printf(hbus->pcib, "failed to send PCI_RESOURCES_ASSIGNED\n"); break; } } free(pkt, M_DEVBUF); return (ret); } static int hv_send_resources_released(struct hv_pcibus *hbus) { struct pci_child_message pkt; struct hv_pci_dev *hpdev; uint32_t wslot; int ret; for (wslot = 0; wslot < 256; wslot++) { hpdev = get_pcichild_wslot(hbus, wslot); if (!hpdev) continue; pkt.message_type.type = PCI_RESOURCES_RELEASED; pkt.wslot.val = hpdev->desc.wslot.val; ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, &pkt, sizeof(pkt), 0); if (ret) return (ret); } return (0); } #define hv_cfg_read(x, s) \ static inline uint##x##_t hv_cfg_read_##s(struct hv_pcibus *bus, \ bus_size_t offset) \ { \ return (bus_read_##s(bus->cfg_res, offset)); \ } #define hv_cfg_write(x, s) \ static inline void hv_cfg_write_##s(struct hv_pcibus *bus, \ bus_size_t offset, uint##x##_t val) \ { \ return (bus_write_##s(bus->cfg_res, offset, val)); \ } hv_cfg_read(8, 1) hv_cfg_read(16, 2) hv_cfg_read(32, 4) hv_cfg_write(8, 1) hv_cfg_write(16, 2) hv_cfg_write(32, 4) static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, int size, uint32_t *val) { struct hv_pcibus *hbus = hpdev->hbus; bus_size_t addr = CFG_PAGE_OFFSET + where; /* * If the attempt is to read the IDs or the ROM BAR, simulate that. */ if (where + size <= PCIR_COMMAND) { memcpy(val, ((uint8_t *)&hpdev->desc.v_id) + where, size); } else if (where >= PCIR_REVID && where + size <= PCIR_CACHELNSZ) { memcpy(val, ((uint8_t *)&hpdev->desc.rev) + where - PCIR_REVID, size); } else if (where >= PCIR_SUBVEND_0 && where + size <= PCIR_BIOS) { memcpy(val, (uint8_t *)&hpdev->desc.subsystem_id + where - PCIR_SUBVEND_0, size); } else if (where >= PCIR_BIOS && where + size <= PCIR_CAP_PTR) { /* ROM BARs are unimplemented */ *val = 0; } else if ((where >= PCIR_INTLINE && where + size <= PCIR_INTPIN) ||(where == PCIR_INTPIN && size == 1)) { /* * Interrupt Line and Interrupt PIN are hard-wired to zero * because this front-end only supports message-signaled * interrupts. */ *val = 0; } else if (where + size <= CFG_PAGE_SIZE) { mtx_lock(&hbus->config_lock); /* Choose the function to be read. */ hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val); /* Make sure the function was chosen before we start reading.*/ mb(); /* Read from that function's config space. */ switch (size) { case 1: *((uint8_t *)val) = hv_cfg_read_1(hbus, addr); break; case 2: *((uint16_t *)val) = hv_cfg_read_2(hbus, addr); break; default: *((uint32_t *)val) = hv_cfg_read_4(hbus, addr); break; } /* * Make sure the write was done before we release the lock, * allowing consecutive reads/writes. */ mb(); mtx_unlock(&hbus->config_lock); } else { /* Invalid config read: it's unlikely to reach here. */ memset(val, 0, size); } } static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, int size, uint32_t val) { struct hv_pcibus *hbus = hpdev->hbus; bus_size_t addr = CFG_PAGE_OFFSET + where; /* SSIDs and ROM BARs are read-only */ if (where >= PCIR_SUBVEND_0 && where + size <= PCIR_CAP_PTR) return; if (where >= PCIR_COMMAND && where + size <= CFG_PAGE_SIZE) { mtx_lock(&hbus->config_lock); /* Choose the function to be written. */ hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val); /* Make sure the function was chosen before we start writing.*/ wmb(); /* Write to that function's config space. */ switch (size) { case 1: hv_cfg_write_1(hbus, addr, (uint8_t)val); break; case 2: hv_cfg_write_2(hbus, addr, (uint16_t)val); break; default: hv_cfg_write_4(hbus, addr, (uint32_t)val); break; } /* * Make sure the write was done before we release the lock, * allowing consecutive reads/writes. */ mb(); mtx_unlock(&hbus->config_lock); } else { /* Invalid config write: it's unlikely to reach here. */ return; } } /* * The vPCI in some Hyper-V releases do not initialize the last 4 * bit of BAR registers. This could result weird problems causing PCI * code fail to configure BAR correctly. * * Just write all 1's to those BARs whose probed values are not zero. * This seems to make the Hyper-V vPCI and pci_write_bar() to cooperate * correctly. */ static void vmbus_pcib_prepopulate_bars(struct hv_pcibus *hbus) { struct hv_pci_dev *hpdev; int i; mtx_lock(&hbus->device_list_lock); TAILQ_FOREACH(hpdev, &hbus->children, link) { for (i = 0; i < 6; i++) { /* Ignore empty bar */ if (hpdev->probed_bar[i] == 0) continue; uint32_t bar_val = 0; _hv_pcifront_read_config(hpdev, PCIR_BAR(i), 4, &bar_val); if (hpdev->probed_bar[i] != bar_val) { if (bootverbose) printf("vmbus_pcib: initialize bar %d " "by writing all 1s\n", i); _hv_pcifront_write_config(hpdev, PCIR_BAR(i), 4, 0xffffffff); /* Now write the original value back */ _hv_pcifront_write_config(hpdev, PCIR_BAR(i), 4, bar_val); } } } mtx_unlock(&hbus->device_list_lock); } static void vmbus_pcib_set_detaching(void *arg, int pending __unused) { struct hv_pcibus *hbus = arg; atomic_set_int(&hbus->detaching, 1); } static void vmbus_pcib_pre_detach(struct hv_pcibus *hbus) { struct task task; TASK_INIT(&task, 0, vmbus_pcib_set_detaching, hbus); /* * Make sure the channel callback won't push any possible new * PCI_BUS_RELATIONS and PCI_EJECT tasks to sc->taskq. */ vmbus_chan_run_task(hbus->sc->chan, &task); taskqueue_drain_all(hbus->sc->taskq); } /* * Standard probe entry point. * */ static int vmbus_pcib_probe(device_t dev) { if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &g_pass_through_dev_type) == 0) { device_set_desc(dev, "Hyper-V PCI Express Pass Through"); return (BUS_PROBE_DEFAULT); } return (ENXIO); } /* * Standard attach entry point. * */ static int vmbus_pcib_attach(device_t dev) { const int pci_ring_size = (4 * PAGE_SIZE); const struct hyperv_guid *inst_guid; struct vmbus_channel *channel; struct vmbus_pcib_softc *sc; struct hv_pcibus *hbus; int rid = 0; int ret; hbus = malloc(sizeof(*hbus), M_DEVBUF, M_WAITOK | M_ZERO); hbus->pcib = dev; channel = vmbus_get_channel(dev); inst_guid = vmbus_chan_guid_inst(channel); hbus->pci_domain = inst_guid->hv_guid[9] | (inst_guid->hv_guid[8] << 8); mtx_init(&hbus->config_lock, "hbcfg", NULL, MTX_DEF); mtx_init(&hbus->device_list_lock, "hbdl", NULL, MTX_DEF); TAILQ_INIT(&hbus->children); TAILQ_INIT(&hbus->dr_list); hbus->cfg_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0, RM_MAX_END, PCI_CONFIG_MMIO_LENGTH, RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE)); if (!hbus->cfg_res) { device_printf(dev, "failed to get resource for cfg window\n"); ret = ENXIO; goto free_bus; } sc = device_get_softc(dev); sc->chan = channel; sc->rx_buf = malloc(PCIB_PACKET_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); sc->hbus = hbus; /* * The taskq is used to handle PCI_BUS_RELATIONS and PCI_EJECT * messages. NB: we can't handle the messages in the channel callback * directly, because the message handlers need to send new messages * to the host and waits for the host's completion messages, which * must also be handled by the channel callback. */ sc->taskq = taskqueue_create("vmbus_pcib_tq", M_WAITOK, taskqueue_thread_enqueue, &sc->taskq); taskqueue_start_threads(&sc->taskq, 1, PI_NET, "vmbus_pcib_tq"); hbus->sc = sc; init_completion(&hbus->query_completion); hbus->query_comp = &hbus->query_completion; ret = vmbus_chan_open(sc->chan, pci_ring_size, pci_ring_size, NULL, 0, vmbus_pcib_on_channel_callback, sc); if (ret) goto free_res; ret = hv_pci_protocol_negotiation(hbus, pci_protocol_versions, ARRAY_SIZE(pci_protocol_versions)); if (ret) goto vmbus_close; ret = hv_pci_query_relations(hbus); if (!ret) ret = wait_for_response(hbus, hbus->query_comp); if (ret) goto vmbus_close; ret = hv_pci_enter_d0(hbus); if (ret) goto vmbus_close; ret = hv_send_resources_allocated(hbus); if (ret) goto vmbus_close; vmbus_pcib_prepopulate_bars(hbus); hbus->pci_bus = device_add_child(dev, "pci", -1); if (!hbus->pci_bus) { device_printf(dev, "failed to create pci bus\n"); ret = ENXIO; goto vmbus_close; } bus_generic_attach(dev); hbus->state = hv_pcibus_installed; return (0); vmbus_close: vmbus_pcib_pre_detach(hbus); vmbus_chan_close(sc->chan); free_res: taskqueue_free(sc->taskq); free_completion(&hbus->query_completion); free(sc->rx_buf, M_DEVBUF); bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res); free_bus: mtx_destroy(&hbus->device_list_lock); mtx_destroy(&hbus->config_lock); free(hbus, M_DEVBUF); return (ret); } /* * Standard detach entry point */ static int vmbus_pcib_detach(device_t dev) { struct vmbus_pcib_softc *sc = device_get_softc(dev); struct hv_pcibus *hbus = sc->hbus; struct pci_message teardown_packet; struct pci_bus_relations relations; int ret; vmbus_pcib_pre_detach(hbus); if (hbus->state == hv_pcibus_installed) bus_generic_detach(dev); /* Delete any children which might still exist. */ memset(&relations, 0, sizeof(relations)); hv_pci_devices_present(hbus, &relations); ret = hv_send_resources_released(hbus); if (ret) device_printf(dev, "failed to send PCI_RESOURCES_RELEASED\n"); teardown_packet.type = PCI_BUS_D0EXIT; ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, &teardown_packet, sizeof(struct pci_message), 0); if (ret) device_printf(dev, "failed to send PCI_BUS_D0EXIT\n"); taskqueue_drain_all(hbus->sc->taskq); vmbus_chan_close(sc->chan); taskqueue_free(sc->taskq); free_completion(&hbus->query_completion); free(sc->rx_buf, M_DEVBUF); bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res); mtx_destroy(&hbus->device_list_lock); mtx_destroy(&hbus->config_lock); free(hbus, M_DEVBUF); return (0); } static int vmbus_pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *val) { struct vmbus_pcib_softc *sc = device_get_softc(dev); switch (which) { case PCIB_IVAR_DOMAIN: *val = sc->hbus->pci_domain; return (0); case PCIB_IVAR_BUS: /* There is only bus 0. */ *val = 0; return (0); } return (ENOENT); } static int vmbus_pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t val) { return (ENOENT); } static struct resource * vmbus_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { unsigned int bar_no; struct hv_pci_dev *hpdev; struct vmbus_pcib_softc *sc = device_get_softc(dev); struct resource *res; unsigned int devfn; if (type == PCI_RES_BUS) return (pci_domain_alloc_bus(sc->hbus->pci_domain, child, rid, start, end, count, flags)); /* Devices with port I/O BAR are not supported. */ if (type == SYS_RES_IOPORT) return (NULL); if (type == SYS_RES_MEMORY) { devfn = PCI_DEVFN(pci_get_slot(child), pci_get_function(child)); hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn)); if (!hpdev) return (NULL); bar_no = PCI_RID2BAR(*rid); if (bar_no >= MAX_NUM_BARS) return (NULL); /* Make sure a 32-bit BAR gets a 32-bit address */ if (!(hpdev->probed_bar[bar_no] & PCIM_BAR_MEM_64)) end = ulmin(end, 0xFFFFFFFF); } res = bus_generic_alloc_resource(dev, child, type, rid, start, end, count, flags); /* * If this is a request for a specific range, assume it is * correct and pass it up to the parent. */ if (res == NULL && start + count - 1 == end) res = bus_generic_alloc_resource(dev, child, type, rid, start, end, count, flags); if (res) { device_printf(dev,"vmbus_pcib_alloc_resource is successful\n"); } return (res); } static int vmbus_pcib_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct vmbus_pcib_softc *sc = device_get_softc(dev); if (type == PCI_RES_BUS) return (pci_domain_release_bus(sc->hbus->pci_domain, child, rid, r)); if (type == SYS_RES_IOPORT) return (EINVAL); return (bus_generic_release_resource(dev, child, type, rid, r)); } static int vmbus_pcib_get_cpus(device_t pcib, device_t dev, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { return (bus_get_cpus(pcib, op, setsize, cpuset)); } static uint32_t vmbus_pcib_read_config(device_t dev, u_int bus, u_int slot, u_int func, u_int reg, int bytes) { struct vmbus_pcib_softc *sc = device_get_softc(dev); struct hv_pci_dev *hpdev; unsigned int devfn = PCI_DEVFN(slot, func); uint32_t data = 0; KASSERT(bus == 0, ("bus should be 0, but is %u", bus)); hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn)); if (!hpdev) return (~0); _hv_pcifront_read_config(hpdev, reg, bytes, &data); return (data); } static void vmbus_pcib_write_config(device_t dev, u_int bus, u_int slot, u_int func, u_int reg, uint32_t data, int bytes) { struct vmbus_pcib_softc *sc = device_get_softc(dev); struct hv_pci_dev *hpdev; unsigned int devfn = PCI_DEVFN(slot, func); KASSERT(bus == 0, ("bus should be 0, but is %u", bus)); hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn)); if (!hpdev) return; _hv_pcifront_write_config(hpdev, reg, bytes, data); } static int vmbus_pcib_route_intr(device_t pcib, device_t dev, int pin) { /* We only support MSI/MSI-X and don't support INTx interrupt. */ return (PCI_INVALID_IRQ); } static int vmbus_pcib_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs) { #if defined(__amd64__) || defined(__i386__) return (PCIB_ALLOC_MSI(device_get_parent(pcib), dev, count, maxcount, irqs)); #endif #if defined(__aarch64__) return (intr_alloc_msi(pcib, dev, ACPI_MSI_XREF, count, maxcount, irqs)); #endif } static int vmbus_pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs) { #if defined(__amd64__) || defined(__i386__) return (PCIB_RELEASE_MSI(device_get_parent(pcib), dev, count, irqs)); #endif #if defined(__aarch64__) return(intr_release_msi(pcib, dev, ACPI_MSI_XREF, count, irqs)); #endif } static int vmbus_pcib_alloc_msix(device_t pcib, device_t dev, int *irq) { #if defined(__aarch64__) int ret; #if defined(INTRNG) ret = intr_alloc_msix(pcib, dev, ACPI_MSI_XREF, irq); return ret; #else return (ENXIO); #endif #else return (PCIB_ALLOC_MSIX(device_get_parent(pcib), dev, irq)); #endif /* __aarch64__ */ } static int vmbus_pcib_release_msix(device_t pcib, device_t dev, int irq) { #if defined(__aarch64__) return (intr_release_msix(pcib, dev, ACPI_MSI_XREF, irq)); #else return (PCIB_RELEASE_MSIX(device_get_parent(pcib), dev, irq)); #endif /* __aarch64__ */ } #if defined(__aarch64__) #define MSI_INTEL_ADDR_DEST 0x00000000 #define MSI_INTEL_DATA_DELFIXED 0x0 #endif #if defined(__amd64__) || defined(__i386__) #define MSI_INTEL_ADDR_DEST 0x000ff000 #define MSI_INTEL_DATA_INTVEC IOART_INTVEC /* Interrupt vector. */ #define MSI_INTEL_DATA_DELFIXED IOART_DELFIXED #endif static int vmbus_pcib_map_msi(device_t pcib, device_t child, int irq, uint64_t *addr, uint32_t *data) { unsigned int devfn; struct hv_pci_dev *hpdev; uint64_t v_addr; uint32_t v_data; struct hv_irq_desc *hid, *tmp_hid; unsigned int cpu, vcpu_id; unsigned int vector; struct vmbus_pcib_softc *sc = device_get_softc(pcib); struct compose_comp_ctxt comp; struct { struct pci_packet pkt; union { struct pci_create_interrupt v1; struct pci_create_interrupt3 v3; }int_pkts; } ctxt; int ret; uint32_t size; devfn = PCI_DEVFN(pci_get_slot(child), pci_get_function(child)); hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn)); if (!hpdev) return (ENOENT); #if defined(__aarch64__) ret = intr_map_msi(pcib, child, ACPI_MSI_XREF, irq, &v_addr, &v_data); #else ret = PCIB_MAP_MSI(device_get_parent(pcib), child, irq, &v_addr, &v_data); #endif if (ret) return (ret); TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid) { if (hid->irq == irq) { TAILQ_REMOVE(&hpdev->irq_desc_list, hid, link); hv_int_desc_free(hpdev, hid); break; } } #if defined(__aarch64__) cpu = 0; vcpu_id = VMBUS_GET_VCPU_ID(device_get_parent(pcib), pcib, cpu); vector = v_data; #else cpu = (v_addr & MSI_INTEL_ADDR_DEST) >> 12; vcpu_id = VMBUS_GET_VCPU_ID(device_get_parent(pcib), pcib, cpu); vector = v_data & MSI_INTEL_DATA_INTVEC; #endif init_completion(&comp.comp_pkt.host_event); memset(&ctxt, 0, sizeof(ctxt)); ctxt.pkt.completion_func = hv_pci_compose_compl; ctxt.pkt.compl_ctxt = ∁ switch (hpdev->hbus->protocol_version) { case PCI_PROTOCOL_VERSION_1_1: ctxt.int_pkts.v1.message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; ctxt.int_pkts.v1.wslot.val = hpdev->desc.wslot.val; ctxt.int_pkts.v1.int_desc.vector = vector; ctxt.int_pkts.v1.int_desc.vector_count = 1; ctxt.int_pkts.v1.int_desc.delivery_mode = MSI_INTEL_DATA_DELFIXED; ctxt.int_pkts.v1.int_desc.cpu_mask = 1ULL << vcpu_id; size = sizeof(ctxt.int_pkts.v1); break; case PCI_PROTOCOL_VERSION_1_4: ctxt.int_pkts.v3.message_type.type = PCI_CREATE_INTERRUPT_MESSAGE3; ctxt.int_pkts.v3.wslot.val = hpdev->desc.wslot.val; ctxt.int_pkts.v3.int_desc.vector = vector; ctxt.int_pkts.v3.int_desc.vector_count = 1; ctxt.int_pkts.v3.int_desc.reserved = 0; ctxt.int_pkts.v3.int_desc.delivery_mode = MSI_INTEL_DATA_DELFIXED; ctxt.int_pkts.v3.int_desc.processor_count = 1; ctxt.int_pkts.v3.int_desc.processor_array[0] = vcpu_id; size = sizeof(ctxt.int_pkts.v3); break; } ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, &ctxt.int_pkts, size, (uint64_t)(uintptr_t)&ctxt.pkt); if (ret) { free_completion(&comp.comp_pkt.host_event); return (ret); } wait_for_completion(&comp.comp_pkt.host_event); free_completion(&comp.comp_pkt.host_event); if (comp.comp_pkt.completion_status < 0) { device_printf(pcib, "vmbus_pcib_map_msi completion_status %d\n", comp.comp_pkt.completion_status); return (EPROTO); } *addr = comp.int_desc.address; *data = comp.int_desc.data; hid = malloc(sizeof(struct hv_irq_desc), M_DEVBUF, M_WAITOK | M_ZERO); hid->irq = irq; hid->desc = comp.int_desc; TAILQ_INSERT_TAIL(&hpdev->irq_desc_list, hid, link); return (0); } static device_method_t vmbus_pcib_methods[] = { /* Device interface */ DEVMETHOD(device_probe, vmbus_pcib_probe), DEVMETHOD(device_attach, vmbus_pcib_attach), DEVMETHOD(device_detach, vmbus_pcib_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), /* Bus interface */ DEVMETHOD(bus_read_ivar, vmbus_pcib_read_ivar), DEVMETHOD(bus_write_ivar, vmbus_pcib_write_ivar), DEVMETHOD(bus_alloc_resource, vmbus_pcib_alloc_resource), DEVMETHOD(bus_release_resource, vmbus_pcib_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), DEVMETHOD(bus_get_cpus, vmbus_pcib_get_cpus), /* pcib interface */ DEVMETHOD(pcib_maxslots, pcib_maxslots), DEVMETHOD(pcib_read_config, vmbus_pcib_read_config), DEVMETHOD(pcib_write_config, vmbus_pcib_write_config), DEVMETHOD(pcib_route_interrupt, vmbus_pcib_route_intr), DEVMETHOD(pcib_alloc_msi, vmbus_pcib_alloc_msi), DEVMETHOD(pcib_release_msi, vmbus_pcib_release_msi), DEVMETHOD(pcib_alloc_msix, vmbus_pcib_alloc_msix), DEVMETHOD(pcib_release_msix, vmbus_pcib_release_msix), DEVMETHOD(pcib_map_msi, vmbus_pcib_map_msi), DEVMETHOD(pcib_request_feature, pcib_request_feature_allow), DEVMETHOD_END }; DEFINE_CLASS_0(pcib, vmbus_pcib_driver, vmbus_pcib_methods, sizeof(struct vmbus_pcib_softc)); DRIVER_MODULE(vmbus_pcib, vmbus, vmbus_pcib_driver, 0, 0); MODULE_DEPEND(vmbus_pcib, vmbus, 1, 1, 1); MODULE_DEPEND(vmbus_pcib, pci, 1, 1, 1); #endif /* NEW_PCIB */ diff --git a/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c b/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c index c158f72eb9e7..9b9c240d3843 100644 --- a/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c +++ b/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c @@ -1,237 +1,238 @@ /*- * Copyright (c) 2016-2017 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include +#include #include #include #include #include #include #include #include #include +#include +#include #include #include #include #include #include #include struct hyperv_reftsc_ctx { struct hyperv_reftsc *tsc_ref; - struct hyperv_dma tsc_ref_dma; }; static uint32_t hyperv_tsc_vdso_timehands( struct vdso_timehands *, struct timecounter *); static d_open_t hyperv_tsc_open; static d_mmap_t hyperv_tsc_mmap; static struct timecounter hyperv_tsc_timecounter = { .tc_get_timecount = NULL, /* based on CPU vendor. */ .tc_counter_mask = 0xffffffff, .tc_frequency = HYPERV_TIMER_FREQ, .tc_name = "Hyper-V-TSC", .tc_quality = 3000, .tc_fill_vdso_timehands = hyperv_tsc_vdso_timehands, }; static struct cdevsw hyperv_tsc_cdevsw = { .d_version = D_VERSION, .d_open = hyperv_tsc_open, .d_mmap = hyperv_tsc_mmap, .d_name = HYPERV_REFTSC_DEVNAME }; static struct hyperv_reftsc_ctx hyperv_ref_tsc; uint64_t hypercall_md(volatile void *hc_addr, uint64_t in_val, uint64_t in_paddr, uint64_t out_paddr) { uint64_t status; __asm__ __volatile__ ("mov %0, %%r8" : : "r" (out_paddr): "r8"); __asm__ __volatile__ ("call *%3" : "=a" (status) : "c" (in_val), "d" (in_paddr), "m" (hc_addr)); return (status); } static int hyperv_tsc_open(struct cdev *dev __unused, int oflags, int devtype __unused, struct thread *td __unused) { if (oflags & FWRITE) return (EPERM); return (0); } static int hyperv_tsc_mmap(struct cdev *dev __unused, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot __unused, vm_memattr_t *memattr __unused) { KASSERT(hyperv_ref_tsc.tsc_ref != NULL, ("reftsc has not been setup")); /* * NOTE: * 'nprot' does not contain information interested to us; * WR-open is blocked by d_open. */ if (offset != 0) return (EOPNOTSUPP); - *paddr = hyperv_ref_tsc.tsc_ref_dma.hv_paddr; + *paddr = pmap_kextract((vm_offset_t)hyperv_ref_tsc.tsc_ref); return (0); } static uint32_t hyperv_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc __unused) { vdso_th->th_algo = VDSO_TH_ALGO_X86_HVTSC; vdso_th->th_x86_shift = 0; vdso_th->th_x86_hpet_idx = 0; vdso_th->th_x86_pvc_last_systime = 0; vdso_th->th_x86_pvc_stable_mask = 0; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return (1); } #define HYPERV_TSC_TIMECOUNT(fence) \ static uint64_t \ hyperv_tc64_tsc_##fence(void) \ { \ struct hyperv_reftsc *tsc_ref = hyperv_ref_tsc.tsc_ref; \ uint32_t seq; \ \ while ((seq = atomic_load_acq_int(&tsc_ref->tsc_seq)) != 0) { \ uint64_t disc, ret, tsc; \ uint64_t scale = tsc_ref->tsc_scale; \ int64_t ofs = tsc_ref->tsc_ofs; \ \ fence(); \ tsc = rdtsc(); \ \ /* ret = ((tsc * scale) >> 64) + ofs */ \ __asm__ __volatile__ ("mulq %3" : \ "=d" (ret), "=a" (disc) : \ "a" (tsc), "r" (scale)); \ ret += ofs; \ \ atomic_thread_fence_acq(); \ if (tsc_ref->tsc_seq == seq) \ return (ret); \ \ /* Sequence changed; re-sync. */ \ } \ /* Fallback to the generic timecounter, i.e. rdmsr. */ \ return (rdmsr(MSR_HV_TIME_REF_COUNT)); \ } \ \ static u_int \ hyperv_tsc_timecount_##fence(struct timecounter *tc __unused) \ { \ \ return (hyperv_tc64_tsc_##fence()); \ } \ struct __hack HYPERV_TSC_TIMECOUNT(lfence); HYPERV_TSC_TIMECOUNT(mfence); static void hyperv_tsc_tcinit(void *dummy __unused) { hyperv_tc64_t tc64 = NULL; uint64_t val, orig; if ((hyperv_features & (CPUID_HV_MSR_TIME_REFCNT | CPUID_HV_MSR_REFERENCE_TSC)) != (CPUID_HV_MSR_TIME_REFCNT | CPUID_HV_MSR_REFERENCE_TSC) || (cpu_feature & CPUID_SSE2) == 0) /* SSE2 for mfence/lfence */ return; switch (cpu_vendor_id) { case CPU_VENDOR_AMD: case CPU_VENDOR_HYGON: hyperv_tsc_timecounter.tc_get_timecount = hyperv_tsc_timecount_mfence; tc64 = hyperv_tc64_tsc_mfence; break; case CPU_VENDOR_INTEL: hyperv_tsc_timecounter.tc_get_timecount = hyperv_tsc_timecount_lfence; tc64 = hyperv_tc64_tsc_lfence; break; default: /* Unsupported CPU vendors. */ return; } - hyperv_ref_tsc.tsc_ref = hyperv_dmamem_alloc(NULL, PAGE_SIZE, 0, - sizeof(struct hyperv_reftsc), &hyperv_ref_tsc.tsc_ref_dma, - BUS_DMA_WAITOK | BUS_DMA_ZERO); + hyperv_ref_tsc.tsc_ref = contigmalloc(PAGE_SIZE, M_DEVBUF, + M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); if (hyperv_ref_tsc.tsc_ref == NULL) { printf("hyperv: reftsc page allocation failed\n"); return; } orig = rdmsr(MSR_HV_REFERENCE_TSC); - val = MSR_HV_REFTSC_ENABLE | (orig & MSR_HV_REFTSC_RSVD_MASK) | - ((hyperv_ref_tsc.tsc_ref_dma.hv_paddr >> PAGE_SHIFT) << - MSR_HV_REFTSC_PGSHIFT); + val = (pmap_kextract((vm_offset_t)hyperv_ref_tsc.tsc_ref) >> + PAGE_SHIFT) << MSR_HV_REFTSC_PGSHIFT; + val |= MSR_HV_REFTSC_ENABLE | (orig & MSR_HV_REFTSC_RSVD_MASK); wrmsr(MSR_HV_REFERENCE_TSC, val); /* Register "enlightened" timecounter. */ tc_init(&hyperv_tsc_timecounter); /* Install 64 bits timecounter method for other modules to use. */ KASSERT(tc64 != NULL, ("tc64 is not set")); hyperv_tc64 = tc64; /* Add device for mmap(2). */ make_dev(&hyperv_tsc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0444, HYPERV_REFTSC_DEVNAME); } SYSINIT(hyperv_tsc_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, hyperv_tsc_tcinit, NULL); diff --git a/sys/dev/hyperv/vmbus/hyperv_busdma.c b/sys/dev/hyperv/vmbus/hyperv_busdma.c index 9550540014c4..81817d476d29 100644 --- a/sys/dev/hyperv/vmbus/hyperv_busdma.c +++ b/sys/dev/hyperv/vmbus/hyperv_busdma.c @@ -1,98 +1,51 @@ /*- * Copyright (c) 2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #define HYPERV_DMA_MASK (BUS_DMA_WAITOK | BUS_DMA_NOWAIT | BUS_DMA_ZERO) void hyperv_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error) { bus_addr_t *paddr = arg; if (error) return; KASSERT(nseg == 1, ("too many segments %d!", nseg)); *paddr = segs->ds_addr; } -void * -hyperv_dmamem_alloc(bus_dma_tag_t parent_dtag, bus_size_t alignment, - bus_addr_t boundary, bus_size_t size, struct hyperv_dma *dma, int flags) -{ - void *ret; - int error; - - error = bus_dma_tag_create(parent_dtag, /* parent */ - alignment, /* alignment */ - boundary, /* boundary */ - BUS_SPACE_MAXADDR, /* lowaddr */ - BUS_SPACE_MAXADDR, /* highaddr */ - NULL, NULL, /* filter, filterarg */ - size, /* maxsize */ - 1, /* nsegments */ - size, /* maxsegsize */ - 0, /* flags */ - NULL, /* lockfunc */ - NULL, /* lockfuncarg */ - &dma->hv_dtag); - if (error) - return NULL; - - error = bus_dmamem_alloc(dma->hv_dtag, &ret, - (flags & HYPERV_DMA_MASK) | BUS_DMA_COHERENT, &dma->hv_dmap); - if (error) { - bus_dma_tag_destroy(dma->hv_dtag); - return NULL; - } - - error = bus_dmamap_load(dma->hv_dtag, dma->hv_dmap, ret, size, - hyperv_dma_map_paddr, &dma->hv_paddr, BUS_DMA_NOWAIT); - if (error) { - bus_dmamem_free(dma->hv_dtag, ret, dma->hv_dmap); - bus_dma_tag_destroy(dma->hv_dtag); - return NULL; - } - return ret; -} - -void -hyperv_dmamem_free(struct hyperv_dma *dma, void *ptr) -{ - bus_dmamap_unload(dma->hv_dtag, dma->hv_dmap); - bus_dmamem_free(dma->hv_dtag, ptr, dma->hv_dmap); - bus_dma_tag_destroy(dma->hv_dtag); -} diff --git a/sys/dev/hyperv/vmbus/hyperv_common_reg.h b/sys/dev/hyperv/vmbus/hyperv_common_reg.h index 2a6437390ca6..e303d263a7fe 100644 --- a/sys/dev/hyperv/vmbus/hyperv_common_reg.h +++ b/sys/dev/hyperv/vmbus/hyperv_common_reg.h @@ -1,175 +1,177 @@ /*- SPDX-License-Identifier: BSD-2-Clause * Copyright (c) 2022 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _HYPERV_COMMON_REG_H_ #define _HYPERV_COMMON_REG_H_ #include #include #define MSR_HV_GUESTID_BUILD_MASK 0xffffULL #define MSR_HV_GUESTID_VERSION_MASK 0x0000ffffffff0000ULL #define MSR_HV_GUESTID_VERSION_SHIFT 16 #define MSR_HV_GUESTID_OSID_MASK 0x00ff000000000000ULL #define MSR_HV_GUESTID_OSID_SHIFT 48 #define MSR_HV_GUESTID_OSTYPE_MASK 0x7f00000000000000ULL #define MSR_HV_GUESTID_OSTYPE_SHIFT 56 #define MSR_HV_GUESTID_OPENSRC 0x8000000000000000ULL #define MSR_HV_GUESTID_OSTYPE_LINUX \ ((0x01ULL << MSR_HV_GUESTID_OSTYPE_SHIFT) | MSR_HV_GUESTID_OPENSRC) #define MSR_HV_GUESTID_OSTYPE_FREEBSD \ ((0x02ULL << MSR_HV_GUESTID_OSTYPE_SHIFT) | MSR_HV_GUESTID_OPENSRC) #define MSR_HV_HYPERCALL 0x40000001 #define MSR_HV_HYPERCALL_ENABLE 0x0001ULL #define MSR_HV_HYPERCALL_RSVD_MASK 0x0ffeULL #define MSR_HV_HYPERCALL_PGSHIFT 12 #define MSR_HV_REFERENCE_TSC 0x40000021 #define MSR_HV_REFTSC_ENABLE 0x0001ULL #define MSR_HV_REFTSC_RSVD_MASK 0x0ffeULL #define MSR_HV_REFTSC_PGSHIFT 12 #define MSR_HV_SCTRL_ENABLE 0x0001ULL #define MSR_HV_SCTRL_RSVD_MASK 0xfffffffffffffffeULL #define MSR_HV_SIEFP_ENABLE 0x0001ULL #define MSR_HV_SIEFP_RSVD_MASK 0x0ffeULL #define MSR_HV_SIEFP_PGSHIFT 12 +#define MSR_HV_SIEFP_PGMASK (~0ULL << MSR_HV_SIEFP_PGSHIFT) #define MSR_HV_SIMP_ENABLE 0x0001ULL #define MSR_HV_SIMP_RSVD_MASK 0x0ffeULL #define MSR_HV_SIMP_PGSHIFT 12 +#define MSR_HV_SIMP_PGMASK (~0ULL << MSR_HV_SIMP_PGSHIFT) #define MSR_HV_SINT_VECTOR_MASK 0x00ffULL #define MSR_HV_SINT_RSVD1_MASK 0xff00ULL #define MSR_HV_SINT_MASKED 0x00010000ULL #define MSR_HV_SINT_RSVD2_MASK 0xfffffffffffc0000ULL #define MSR_HV_SINT_RSVD_MASK (MSR_HV_SINT_RSVD1_MASK | \ MSR_HV_SINT_RSVD2_MASK) #define MSR_HV_STIMER0_CONFIG 0x400000b0 #define MSR_HV_STIMER_CFG_ENABLE 0x0001ULL #define MSR_HV_STIMER_CFG_PERIODIC 0x0002ULL #define MSR_HV_STIMER_CFG_LAZY 0x0004ULL #define MSR_HV_STIMER_CFG_AUTOEN 0x0008ULL #define MSR_HV_STIMER_CFG_SINT_MASK 0x000f0000ULL #define MSR_HV_STIMER_CFG_SINT_SHIFT 16 #define MSR_HV_STIMER0_COUNT 0x400000b1 /* * CPUID leaves */ #define CPUID_LEAF_HV_MAXLEAF 0x40000000 #define CPUID_LEAF_HV_INTERFACE 0x40000001 #define CPUID_HV_IFACE_HYPERV 0x31237648 /* HV#1 */ /* EAX: features include/hyperv.h CPUID_HV_MSR */ /* ECX: power management features */ #define CPUPM_HV_CSTATE_MASK 0x000f /* deepest C-state */ #define CPUPM_HV_C3_HPET 0x0010 /* C3 requires HPET */ #define CPUPM_HV_CSTATE(f) ((f) & CPUPM_HV_CSTATE_MASK) /* EDX: features3 */ #define CPUID3_HV_MWAIT 0x0001 /* MWAIT */ #define CPUID3_HV_XMM_HYPERCALL 0x0010 /* Hypercall input through * XMM regs */ #define CPUID3_HV_GUEST_IDLE 0x0020 /* guest idle */ #define CPUID3_HV_NUMA 0x0080 /* NUMA distance query */ #define CPUID3_HV_TIME_FREQ 0x0100 /* timer frequency query * (TSC, LAPIC) */ #define CPUID3_HV_MSR_CRASH 0x0400 /* MSRs for guest crash */ #define CPUID_LEAF_HV_LIMITS 0x40000005 #define CPUID_LEAF_HV_HWFEATURES 0x40000006 /* * Hyper-V Monitor Notification Facility */ struct hyperv_mon_param { uint32_t mp_connid; uint16_t mp_evtflag_ofs; uint16_t mp_rsvd; } __packed; /* * Hyper-V message types */ #define HYPERV_MSGTYPE_NONE 0 #define HYPERV_MSGTYPE_CHANNEL 1 #define HYPERV_MSGTYPE_TIMER_EXPIRED 0x80000010 /* * Hypercall status codes */ #define HYPERCALL_STATUS_SUCCESS 0x0000 /* * Hypercall input values */ #define HYPERCALL_POST_MESSAGE 0x005c #define HYPERCALL_SIGNAL_EVENT 0x005d /* * Hypercall input parameters */ #define HYPERCALL_PARAM_ALIGN 8 #if 0 /* * XXX * <> requires * input parameters size to be multiple of 8, however, many post * message input parameters do _not_ meet this requirement. */ #define HYPERCALL_PARAM_SIZE_ALIGN 8 #endif /* * HYPERCALL_POST_MESSAGE */ #define HYPERCALL_POSTMSGIN_DSIZE_MAX 240 #define HYPERCALL_POSTMSGIN_SIZE 256 struct hypercall_postmsg_in { uint32_t hc_connid; uint32_t hc_rsvd; uint32_t hc_msgtype; /* HYPERV_MSGTYPE_ */ uint32_t hc_dsize; uint8_t hc_data[HYPERCALL_POSTMSGIN_DSIZE_MAX]; } __packed; CTASSERT(sizeof(struct hypercall_postmsg_in) == HYPERCALL_POSTMSGIN_SIZE); /* * HYPERCALL_SIGNAL_EVENT * * struct hyperv_mon_param. */ #endif /* i!__HYPERV_COMMON_REG_H__ */ diff --git a/sys/dev/hyperv/vmbus/vmbus.c b/sys/dev/hyperv/vmbus/vmbus.c index c349f862d378..ae4c466d98d9 100644 --- a/sys/dev/hyperv/vmbus/vmbus.c +++ b/sys/dev/hyperv/vmbus/vmbus.c @@ -1,1606 +1,1600 @@ /*- * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * VM Bus Driver Implementation */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include +#include #include #include #include #if defined(__aarch64__) #include #include #include #else #include #include #include #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi_if.h" #include "pcib_if.h" #include "vmbus_if.h" #define VMBUS_GPADL_START 0xe1e10 struct vmbus_msghc { struct vmbus_xact *mh_xact; struct hypercall_postmsg_in mh_inprm_save; }; static void vmbus_identify(driver_t *, device_t); static int vmbus_probe(device_t); static int vmbus_attach(device_t); static int vmbus_detach(device_t); static int vmbus_read_ivar(device_t, device_t, int, uintptr_t *); static int vmbus_child_pnpinfo(device_t, device_t, struct sbuf *); static struct resource *vmbus_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags); static int vmbus_alloc_msi(device_t bus, device_t dev, int count, int maxcount, int *irqs); static int vmbus_release_msi(device_t bus, device_t dev, int count, int *irqs); static int vmbus_alloc_msix(device_t bus, device_t dev, int *irq); static int vmbus_release_msix(device_t bus, device_t dev, int irq); static int vmbus_map_msi(device_t bus, device_t dev, int irq, uint64_t *addr, uint32_t *data); static uint32_t vmbus_get_version_method(device_t, device_t); static int vmbus_probe_guid_method(device_t, device_t, const struct hyperv_guid *); static uint32_t vmbus_get_vcpu_id_method(device_t bus, device_t dev, int cpu); static struct taskqueue *vmbus_get_eventtq_method(device_t, device_t, int); #if defined(EARLY_AP_STARTUP) || defined(__aarch64__) static void vmbus_intrhook(void *); #endif static int vmbus_init(struct vmbus_softc *); static int vmbus_connect(struct vmbus_softc *, uint32_t); static int vmbus_req_channels(struct vmbus_softc *sc); static void vmbus_disconnect(struct vmbus_softc *); static int vmbus_scan(struct vmbus_softc *); static void vmbus_scan_teardown(struct vmbus_softc *); static void vmbus_scan_done(struct vmbus_softc *, const struct vmbus_message *); static void vmbus_chanmsg_handle(struct vmbus_softc *, const struct vmbus_message *); static void vmbus_msg_task(void *, int); static void vmbus_synic_setup(void *); static void vmbus_synic_teardown(void *); static int vmbus_sysctl_version(SYSCTL_HANDLER_ARGS); static int vmbus_dma_alloc(struct vmbus_softc *); static void vmbus_dma_free(struct vmbus_softc *); static int vmbus_intr_setup(struct vmbus_softc *); static void vmbus_intr_teardown(struct vmbus_softc *); static int vmbus_doattach(struct vmbus_softc *); static void vmbus_event_proc_dummy(struct vmbus_softc *, int); static struct vmbus_softc *vmbus_sc; SYSCTL_NODE(_hw, OID_AUTO, vmbus, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V vmbus"); static int vmbus_pin_evttask = 1; SYSCTL_INT(_hw_vmbus, OID_AUTO, pin_evttask, CTLFLAG_RDTUN, &vmbus_pin_evttask, 0, "Pin event tasks to their respective CPU"); uint32_t vmbus_current_version; static const uint32_t vmbus_version[] = { VMBUS_VERSION_WIN10, VMBUS_VERSION_WIN8_1, VMBUS_VERSION_WIN8, VMBUS_VERSION_WIN7, VMBUS_VERSION_WS2008 }; static const vmbus_chanmsg_proc_t vmbus_chanmsg_handlers[VMBUS_CHANMSG_TYPE_MAX] = { VMBUS_CHANMSG_PROC(CHOFFER_DONE, vmbus_scan_done), VMBUS_CHANMSG_PROC_WAKEUP(CONNECT_RESP) }; static device_method_t vmbus_methods[] = { /* Device interface */ DEVMETHOD(device_identify, vmbus_identify), DEVMETHOD(device_probe, vmbus_probe), DEVMETHOD(device_attach, vmbus_attach), DEVMETHOD(device_detach, vmbus_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), /* Bus interface */ DEVMETHOD(bus_add_child, bus_generic_add_child), DEVMETHOD(bus_print_child, bus_generic_print_child), DEVMETHOD(bus_read_ivar, vmbus_read_ivar), DEVMETHOD(bus_child_pnpinfo, vmbus_child_pnpinfo), DEVMETHOD(bus_alloc_resource, vmbus_alloc_resource), DEVMETHOD(bus_release_resource, bus_generic_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), DEVMETHOD(bus_get_cpus, bus_generic_get_cpus), /* pcib interface */ DEVMETHOD(pcib_alloc_msi, vmbus_alloc_msi), DEVMETHOD(pcib_release_msi, vmbus_release_msi), DEVMETHOD(pcib_alloc_msix, vmbus_alloc_msix), DEVMETHOD(pcib_release_msix, vmbus_release_msix), DEVMETHOD(pcib_map_msi, vmbus_map_msi), /* Vmbus interface */ DEVMETHOD(vmbus_get_version, vmbus_get_version_method), DEVMETHOD(vmbus_probe_guid, vmbus_probe_guid_method), DEVMETHOD(vmbus_get_vcpu_id, vmbus_get_vcpu_id_method), DEVMETHOD(vmbus_get_event_taskq, vmbus_get_eventtq_method), DEVMETHOD_END }; static driver_t vmbus_driver = { "vmbus", vmbus_methods, sizeof(struct vmbus_softc) }; DRIVER_MODULE(vmbus, pcib, vmbus_driver, NULL, NULL); DRIVER_MODULE(vmbus, acpi_syscontainer, vmbus_driver, NULL, NULL); MODULE_DEPEND(vmbus, acpi, 1, 1, 1); MODULE_DEPEND(vmbus, pci, 1, 1, 1); MODULE_VERSION(vmbus, 1); static __inline struct vmbus_softc * vmbus_get_softc(void) { return vmbus_sc; } void vmbus_msghc_reset(struct vmbus_msghc *mh, size_t dsize) { struct hypercall_postmsg_in *inprm; if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX) panic("invalid data size %zu", dsize); inprm = vmbus_xact_req_data(mh->mh_xact); memset(inprm, 0, HYPERCALL_POSTMSGIN_SIZE); inprm->hc_connid = VMBUS_CONNID_MESSAGE; inprm->hc_msgtype = HYPERV_MSGTYPE_CHANNEL; inprm->hc_dsize = dsize; } struct vmbus_msghc * vmbus_msghc_get(struct vmbus_softc *sc, size_t dsize) { struct vmbus_msghc *mh; struct vmbus_xact *xact; if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX) panic("invalid data size %zu", dsize); xact = vmbus_xact_get(sc->vmbus_xc, dsize + __offsetof(struct hypercall_postmsg_in, hc_data[0])); if (xact == NULL) return (NULL); mh = vmbus_xact_priv(xact, sizeof(*mh)); mh->mh_xact = xact; vmbus_msghc_reset(mh, dsize); return (mh); } void vmbus_msghc_put(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { vmbus_xact_put(mh->mh_xact); } void * vmbus_msghc_dataptr(struct vmbus_msghc *mh) { struct hypercall_postmsg_in *inprm; inprm = vmbus_xact_req_data(mh->mh_xact); return (inprm->hc_data); } int vmbus_msghc_exec_noresult(struct vmbus_msghc *mh) { sbintime_t time = SBT_1MS; struct hypercall_postmsg_in *inprm; bus_addr_t inprm_paddr; int i; inprm = vmbus_xact_req_data(mh->mh_xact); inprm_paddr = vmbus_xact_req_paddr(mh->mh_xact); /* * Save the input parameter so that we could restore the input * parameter if the Hypercall failed. * * XXX * Is this really necessary?! i.e. Will the Hypercall ever * overwrite the input parameter? */ memcpy(&mh->mh_inprm_save, inprm, HYPERCALL_POSTMSGIN_SIZE); /* * In order to cope with transient failures, e.g. insufficient * resources on host side, we retry the post message Hypercall * several times. 20 retries seem sufficient. */ #define HC_RETRY_MAX 20 for (i = 0; i < HC_RETRY_MAX; ++i) { uint64_t status; status = hypercall_post_message(inprm_paddr); if (status == HYPERCALL_STATUS_SUCCESS) return 0; pause_sbt("hcpmsg", time, 0, C_HARDCLOCK); if (time < SBT_1S * 2) time *= 2; /* Restore input parameter and try again */ memcpy(inprm, &mh->mh_inprm_save, HYPERCALL_POSTMSGIN_SIZE); } #undef HC_RETRY_MAX return EIO; } int vmbus_msghc_exec(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { int error; vmbus_xact_activate(mh->mh_xact); error = vmbus_msghc_exec_noresult(mh); if (error) vmbus_xact_deactivate(mh->mh_xact); return error; } void vmbus_msghc_exec_cancel(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { vmbus_xact_deactivate(mh->mh_xact); } const struct vmbus_message * vmbus_msghc_wait_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { size_t resp_len; return (vmbus_xact_wait(mh->mh_xact, &resp_len)); } const struct vmbus_message * vmbus_msghc_poll_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { size_t resp_len; return (vmbus_xact_poll(mh->mh_xact, &resp_len)); } void vmbus_msghc_wakeup(struct vmbus_softc *sc, const struct vmbus_message *msg) { vmbus_xact_ctx_wakeup(sc->vmbus_xc, msg, sizeof(*msg)); } uint32_t vmbus_gpadl_alloc(struct vmbus_softc *sc) { uint32_t gpadl; again: gpadl = atomic_fetchadd_int(&sc->vmbus_gpadl, 1); if (gpadl == 0) goto again; return (gpadl); } /* Used for Hyper-V socket when guest client connects to host */ int vmbus_req_tl_connect(struct hyperv_guid *guest_srv_id, struct hyperv_guid *host_srv_id) { struct vmbus_softc *sc = vmbus_get_softc(); struct vmbus_chanmsg_tl_connect *req; struct vmbus_msghc *mh; int error; if (!sc) return ENXIO; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { device_printf(sc->vmbus_dev, "can not get msg hypercall for tl connect\n"); return ENXIO; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_TL_CONN; req->guest_endpoint_id = *guest_srv_id; req->host_service_id = *host_srv_id; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); if (error) { device_printf(sc->vmbus_dev, "tl connect msg hypercall failed\n"); } return error; } static int vmbus_connect(struct vmbus_softc *sc, uint32_t version) { struct vmbus_chanmsg_connect *req; const struct vmbus_message *msg; struct vmbus_msghc *mh; int error, done = 0; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) return ENXIO; req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CONNECT; req->chm_ver = version; - req->chm_evtflags = sc->vmbus_evtflags_dma.hv_paddr; - req->chm_mnf1 = sc->vmbus_mnf1_dma.hv_paddr; - req->chm_mnf2 = sc->vmbus_mnf2_dma.hv_paddr; + req->chm_evtflags = pmap_kextract((vm_offset_t)sc->vmbus_evtflags); + req->chm_mnf1 = pmap_kextract((vm_offset_t)sc->vmbus_mnf1); + req->chm_mnf2 = pmap_kextract((vm_offset_t)sc->vmbus_mnf2); error = vmbus_msghc_exec(sc, mh); if (error) { vmbus_msghc_put(sc, mh); return error; } msg = vmbus_msghc_wait_result(sc, mh); done = ((const struct vmbus_chanmsg_connect_resp *) msg->msg_data)->chm_done; vmbus_msghc_put(sc, mh); return (done ? 0 : EOPNOTSUPP); } static int vmbus_init(struct vmbus_softc *sc) { int i; for (i = 0; i < nitems(vmbus_version); ++i) { int error; error = vmbus_connect(sc, vmbus_version[i]); if (!error) { vmbus_current_version = vmbus_version[i]; sc->vmbus_version = vmbus_version[i]; device_printf(sc->vmbus_dev, "version %u.%u\n", VMBUS_VERSION_MAJOR(sc->vmbus_version), VMBUS_VERSION_MINOR(sc->vmbus_version)); return 0; } } return ENXIO; } static void vmbus_disconnect(struct vmbus_softc *sc) { struct vmbus_chanmsg_disconnect *req; struct vmbus_msghc *mh; int error; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { device_printf(sc->vmbus_dev, "can not get msg hypercall for disconnect\n"); return; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_DISCONNECT; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); if (error) { device_printf(sc->vmbus_dev, "disconnect msg hypercall failed\n"); } } static int vmbus_req_channels(struct vmbus_softc *sc) { struct vmbus_chanmsg_chrequest *req; struct vmbus_msghc *mh; int error; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) return ENXIO; req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHREQUEST; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); return error; } static void vmbus_scan_done_task(void *xsc, int pending __unused) { struct vmbus_softc *sc = xsc; bus_topo_lock(); sc->vmbus_scandone = true; bus_topo_unlock(); wakeup(&sc->vmbus_scandone); } static void vmbus_scan_done(struct vmbus_softc *sc, const struct vmbus_message *msg __unused) { taskqueue_enqueue(sc->vmbus_devtq, &sc->vmbus_scandone_task); } static int vmbus_scan(struct vmbus_softc *sc) { int error; /* * Identify, probe and attach for non-channel devices. */ bus_generic_probe(sc->vmbus_dev); bus_generic_attach(sc->vmbus_dev); /* * This taskqueue serializes vmbus devices' attach and detach * for channel offer and rescind messages. */ sc->vmbus_devtq = taskqueue_create("vmbus dev", M_WAITOK, taskqueue_thread_enqueue, &sc->vmbus_devtq); taskqueue_start_threads(&sc->vmbus_devtq, 1, PI_NET, "vmbusdev"); TASK_INIT(&sc->vmbus_scandone_task, 0, vmbus_scan_done_task, sc); /* * This taskqueue handles sub-channel detach, so that vmbus * device's detach running in vmbus_devtq can drain its sub- * channels. */ sc->vmbus_subchtq = taskqueue_create("vmbus subch", M_WAITOK, taskqueue_thread_enqueue, &sc->vmbus_subchtq); taskqueue_start_threads(&sc->vmbus_subchtq, 1, PI_NET, "vmbussch"); /* * Start vmbus scanning. */ error = vmbus_req_channels(sc); if (error) { device_printf(sc->vmbus_dev, "channel request failed: %d\n", error); return (error); } /* * Wait for all vmbus devices from the initial channel offers to be * attached. */ bus_topo_assert(); while (!sc->vmbus_scandone) mtx_sleep(&sc->vmbus_scandone, bus_topo_mtx(), 0, "vmbusdev", 0); if (bootverbose) { device_printf(sc->vmbus_dev, "device scan, probe and attach " "done\n"); } return (0); } static void vmbus_scan_teardown(struct vmbus_softc *sc) { bus_topo_assert(); if (sc->vmbus_devtq != NULL) { bus_topo_unlock(); taskqueue_free(sc->vmbus_devtq); bus_topo_lock(); sc->vmbus_devtq = NULL; } if (sc->vmbus_subchtq != NULL) { bus_topo_unlock(); taskqueue_free(sc->vmbus_subchtq); bus_topo_lock(); sc->vmbus_subchtq = NULL; } } static void vmbus_chanmsg_handle(struct vmbus_softc *sc, const struct vmbus_message *msg) { vmbus_chanmsg_proc_t msg_proc; uint32_t msg_type; msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type; if (msg_type >= VMBUS_CHANMSG_TYPE_MAX) { device_printf(sc->vmbus_dev, "unknown message type 0x%x\n", msg_type); return; } msg_proc = vmbus_chanmsg_handlers[msg_type]; if (msg_proc != NULL) msg_proc(sc, msg); /* Channel specific processing */ vmbus_chan_msgproc(sc, msg); } static void vmbus_msg_task(void *xsc, int pending __unused) { struct vmbus_softc *sc = xsc; volatile struct vmbus_message *msg; msg = VMBUS_PCPU_GET(sc, message, curcpu) + VMBUS_SINT_MESSAGE; for (;;) { if (msg->msg_type == HYPERV_MSGTYPE_NONE) { /* No message */ break; } else if (msg->msg_type == HYPERV_MSGTYPE_CHANNEL) { /* Channel message */ vmbus_chanmsg_handle(sc, __DEVOLATILE(const struct vmbus_message *, msg)); } msg->msg_type = HYPERV_MSGTYPE_NONE; /* * Make sure the write to msg_type (i.e. set to * HYPERV_MSGTYPE_NONE) happens before we read the * msg_flags and EOMing. Otherwise, the EOMing will * not deliver any more messages since there is no * empty slot * * NOTE: * mb() is used here, since atomic_thread_fence_seq_cst() * will become compiler fence on UP kernel. */ mb(); if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) { /* * This will cause message queue rescan to possibly * deliver another msg from the hypervisor */ WRMSR(MSR_HV_EOM, 0); } } } static __inline int vmbus_handle_intr1(struct vmbus_softc *sc, struct trapframe *frame, int cpu) { volatile struct vmbus_message *msg; struct vmbus_message *msg_base; msg_base = VMBUS_PCPU_GET(sc, message, cpu); /* * Check event timer. * * TODO: move this to independent IDT vector. */ vmbus_handle_timer_intr1(msg_base, frame); /* * Check events. Hot path for network and storage I/O data; high rate. * * NOTE: * As recommended by the Windows guest fellows, we check events before * checking messages. */ sc->vmbus_event_proc(sc, cpu); /* * Check messages. Mainly management stuffs; ultra low rate. */ msg = msg_base + VMBUS_SINT_MESSAGE; if (__predict_false(msg->msg_type != HYPERV_MSGTYPE_NONE)) { taskqueue_enqueue(VMBUS_PCPU_GET(sc, message_tq, cpu), VMBUS_PCPU_PTR(sc, message_task, cpu)); } return (FILTER_HANDLED); } void vmbus_handle_intr(struct trapframe *trap_frame) { struct vmbus_softc *sc = vmbus_get_softc(); int cpu = curcpu; /* * Disable preemption. */ critical_enter(); /* * Do a little interrupt counting. This used x86 specific * intrcnt_add function */ #if !defined(__aarch64__) (*VMBUS_PCPU_GET(sc, intr_cnt, cpu))++; #endif /* not for aarch64 */ vmbus_handle_intr1(sc, trap_frame, cpu); /* * Enable preemption. */ critical_exit(); } static void vmbus_synic_setup(void *xsc) { struct vmbus_softc *sc = xsc; int cpu = curcpu; uint64_t val, orig; uint32_t sint; if (hyperv_features & CPUID_HV_MSR_VP_INDEX) { /* Save virtual processor id. */ VMBUS_PCPU_GET(sc, vcpuid, cpu) = RDMSR(MSR_HV_VP_INDEX); } else { /* Set virtual processor id to 0 for compatibility. */ VMBUS_PCPU_GET(sc, vcpuid, cpu) = 0; } /* * Setup the SynIC message. */ orig = RDMSR(MSR_HV_SIMP); - val = MSR_HV_SIMP_ENABLE | (orig & MSR_HV_SIMP_RSVD_MASK) | - ((VMBUS_PCPU_GET(sc, message_dma.hv_paddr, cpu) >> PAGE_SHIFT) - << MSR_HV_SIMP_PGSHIFT); + val = pmap_kextract((vm_offset_t)VMBUS_PCPU_GET(sc, message, cpu)) & + MSR_HV_SIMP_PGMASK; + val |= MSR_HV_SIMP_ENABLE | (orig & MSR_HV_SIMP_RSVD_MASK); WRMSR(MSR_HV_SIMP, val); /* * Setup the SynIC event flags. */ orig = RDMSR(MSR_HV_SIEFP); - val = MSR_HV_SIEFP_ENABLE | (orig & MSR_HV_SIEFP_RSVD_MASK) | - ((VMBUS_PCPU_GET(sc, event_flags_dma.hv_paddr, cpu) >> PAGE_SHIFT) - << MSR_HV_SIEFP_PGSHIFT); + val = pmap_kextract((vm_offset_t)VMBUS_PCPU_GET(sc, event_flags, cpu)) & + MSR_HV_SIMP_PGMASK; + val |= MSR_HV_SIEFP_ENABLE | (orig & MSR_HV_SIEFP_RSVD_MASK); WRMSR(MSR_HV_SIEFP, val); /* * Configure and unmask SINT for message and event flags. */ sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE; orig = RDMSR(sint); val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI | (orig & MSR_HV_SINT_RSVD_MASK); WRMSR(sint, val); /* * Configure and unmask SINT for timer. */ vmbus_synic_setup1(sc); /* * All done; enable SynIC. */ orig = RDMSR(MSR_HV_SCONTROL); val = MSR_HV_SCTRL_ENABLE | (orig & MSR_HV_SCTRL_RSVD_MASK); WRMSR(MSR_HV_SCONTROL, val); } static void vmbus_synic_teardown(void *arg) { uint64_t orig; uint32_t sint; /* * Disable SynIC. */ orig = RDMSR(MSR_HV_SCONTROL); WRMSR(MSR_HV_SCONTROL, (orig & MSR_HV_SCTRL_RSVD_MASK)); /* * Mask message and event flags SINT. */ sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE; orig = RDMSR(sint); WRMSR(sint, orig | MSR_HV_SINT_MASKED); /* * Mask timer SINT. */ vmbus_synic_teardown1(); /* * Teardown SynIC message. */ orig = RDMSR(MSR_HV_SIMP); WRMSR(MSR_HV_SIMP, (orig & MSR_HV_SIMP_RSVD_MASK)); /* * Teardown SynIC event flags. */ orig = RDMSR(MSR_HV_SIEFP); WRMSR(MSR_HV_SIEFP, (orig & MSR_HV_SIEFP_RSVD_MASK)); } static int vmbus_dma_alloc(struct vmbus_softc *sc) { - bus_dma_tag_t parent_dtag; uint8_t *evtflags; int cpu; - parent_dtag = bus_get_dma_tag(sc->vmbus_dev); CPU_FOREACH(cpu) { void *ptr; /* * Per-cpu messages and event flags. */ - ptr = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, - PAGE_SIZE, VMBUS_PCPU_PTR(sc, message_dma, cpu), - BUS_DMA_WAITOK | BUS_DMA_ZERO); + ptr = contigmalloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, + 0ul, ~0ul, PAGE_SIZE, 0); if (ptr == NULL) return ENOMEM; VMBUS_PCPU_GET(sc, message, cpu) = ptr; - ptr = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, - PAGE_SIZE, VMBUS_PCPU_PTR(sc, event_flags_dma, cpu), - BUS_DMA_WAITOK | BUS_DMA_ZERO); + ptr = contigmalloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, + 0ul, ~0ul, PAGE_SIZE, 0); if (ptr == NULL) return ENOMEM; VMBUS_PCPU_GET(sc, event_flags, cpu) = ptr; } - evtflags = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, - PAGE_SIZE, &sc->vmbus_evtflags_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); + evtflags = contigmalloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, + 0ul, ~0ul, PAGE_SIZE, 0); if (evtflags == NULL) return ENOMEM; sc->vmbus_rx_evtflags = (u_long *)evtflags; sc->vmbus_tx_evtflags = (u_long *)(evtflags + (PAGE_SIZE / 2)); sc->vmbus_evtflags = evtflags; - sc->vmbus_mnf1 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, - PAGE_SIZE, &sc->vmbus_mnf1_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); + sc->vmbus_mnf1 = contigmalloc(PAGE_SIZE, M_DEVBUF, M_WAITOK | M_ZERO, + 0ul, ~0ul, PAGE_SIZE, 0); if (sc->vmbus_mnf1 == NULL) return ENOMEM; - sc->vmbus_mnf2 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, - sizeof(struct vmbus_mnf), &sc->vmbus_mnf2_dma, - BUS_DMA_WAITOK | BUS_DMA_ZERO); + sc->vmbus_mnf2 = contigmalloc(sizeof(struct vmbus_mnf), M_DEVBUF, + M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); if (sc->vmbus_mnf2 == NULL) return ENOMEM; return 0; } static void vmbus_dma_free(struct vmbus_softc *sc) { int cpu; if (sc->vmbus_evtflags != NULL) { - hyperv_dmamem_free(&sc->vmbus_evtflags_dma, sc->vmbus_evtflags); + contigfree(sc->vmbus_evtflags, PAGE_SIZE, M_DEVBUF); sc->vmbus_evtflags = NULL; sc->vmbus_rx_evtflags = NULL; sc->vmbus_tx_evtflags = NULL; } if (sc->vmbus_mnf1 != NULL) { - hyperv_dmamem_free(&sc->vmbus_mnf1_dma, sc->vmbus_mnf1); + contigfree(sc->vmbus_mnf1, PAGE_SIZE, M_DEVBUF); sc->vmbus_mnf1 = NULL; } if (sc->vmbus_mnf2 != NULL) { - hyperv_dmamem_free(&sc->vmbus_mnf2_dma, sc->vmbus_mnf2); + contigfree(sc->vmbus_mnf2, sizeof(struct vmbus_mnf), M_DEVBUF); sc->vmbus_mnf2 = NULL; } CPU_FOREACH(cpu) { if (VMBUS_PCPU_GET(sc, message, cpu) != NULL) { - hyperv_dmamem_free( - VMBUS_PCPU_PTR(sc, message_dma, cpu), - VMBUS_PCPU_GET(sc, message, cpu)); + contigfree(VMBUS_PCPU_GET(sc, message, cpu), PAGE_SIZE, + M_DEVBUF); VMBUS_PCPU_GET(sc, message, cpu) = NULL; } if (VMBUS_PCPU_GET(sc, event_flags, cpu) != NULL) { - hyperv_dmamem_free( - VMBUS_PCPU_PTR(sc, event_flags_dma, cpu), - VMBUS_PCPU_GET(sc, event_flags, cpu)); + contigfree(VMBUS_PCPU_GET(sc, event_flags, cpu), + PAGE_SIZE, M_DEVBUF); VMBUS_PCPU_GET(sc, event_flags, cpu) = NULL; } } } static int vmbus_intr_setup(struct vmbus_softc *sc) { int cpu; CPU_FOREACH(cpu) { char buf[MAXCOMLEN + 1]; cpuset_t cpu_mask; /* Allocate an interrupt counter for Hyper-V interrupt */ snprintf(buf, sizeof(buf), "cpu%d:hyperv", cpu); #if !defined(__aarch64__) intrcnt_add(buf, VMBUS_PCPU_PTR(sc, intr_cnt, cpu)); #endif /* not for aarch64 */ /* * Setup taskqueue to handle events. Task will be per- * channel. */ VMBUS_PCPU_GET(sc, event_tq, cpu) = taskqueue_create_fast( "hyperv event", M_WAITOK, taskqueue_thread_enqueue, VMBUS_PCPU_PTR(sc, event_tq, cpu)); if (vmbus_pin_evttask) { CPU_SETOF(cpu, &cpu_mask); taskqueue_start_threads_cpuset( VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET, &cpu_mask, "hvevent%d", cpu); } else { taskqueue_start_threads( VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET, "hvevent%d", cpu); } /* * Setup tasks and taskqueues to handle messages. */ VMBUS_PCPU_GET(sc, message_tq, cpu) = taskqueue_create_fast( "hyperv msg", M_WAITOK, taskqueue_thread_enqueue, VMBUS_PCPU_PTR(sc, message_tq, cpu)); CPU_SETOF(cpu, &cpu_mask); taskqueue_start_threads_cpuset( VMBUS_PCPU_PTR(sc, message_tq, cpu), 1, PI_NET, &cpu_mask, "hvmsg%d", cpu); TASK_INIT(VMBUS_PCPU_PTR(sc, message_task, cpu), 0, vmbus_msg_task, sc); } return (vmbus_setup_intr1(sc)); } static void vmbus_intr_teardown(struct vmbus_softc *sc) { vmbus_intr_teardown1(sc); } static int vmbus_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) { return (ENOENT); } static int vmbus_child_pnpinfo(device_t dev, device_t child, struct sbuf *sb) { const struct vmbus_channel *chan; char guidbuf[HYPERV_GUID_STRLEN]; chan = vmbus_get_channel(child); if (chan == NULL) { /* Event timer device, which does not belong to a channel */ return (0); } hyperv_guid2str(&chan->ch_guid_type, guidbuf, sizeof(guidbuf)); sbuf_printf(sb, "classid=%s", guidbuf); hyperv_guid2str(&chan->ch_guid_inst, guidbuf, sizeof(guidbuf)); sbuf_printf(sb, " deviceid=%s", guidbuf); return (0); } int vmbus_add_child(struct vmbus_channel *chan) { struct vmbus_softc *sc = chan->ch_vmbus; device_t parent = sc->vmbus_dev; bus_topo_lock(); chan->ch_dev = device_add_child(parent, NULL, -1); if (chan->ch_dev == NULL) { bus_topo_unlock(); device_printf(parent, "device_add_child for chan%u failed\n", chan->ch_id); return (ENXIO); } device_set_ivars(chan->ch_dev, chan); device_probe_and_attach(chan->ch_dev); bus_topo_unlock(); return (0); } int vmbus_delete_child(struct vmbus_channel *chan) { int error = 0; bus_topo_lock(); if (chan->ch_dev != NULL) { error = device_delete_child(chan->ch_vmbus->vmbus_dev, chan->ch_dev); chan->ch_dev = NULL; } bus_topo_unlock(); return (error); } static int vmbus_sysctl_version(SYSCTL_HANDLER_ARGS) { struct vmbus_softc *sc = arg1; char verstr[16]; snprintf(verstr, sizeof(verstr), "%u.%u", VMBUS_VERSION_MAJOR(sc->vmbus_version), VMBUS_VERSION_MINOR(sc->vmbus_version)); return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); } /* * We need the function to make sure the MMIO resource is allocated from the * ranges found in _CRS. * * For the release function, we can use bus_generic_release_resource(). */ static struct resource * vmbus_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { device_t parent = device_get_parent(dev); struct resource *res; #ifdef NEW_PCIB if (type == SYS_RES_MEMORY) { struct vmbus_softc *sc = device_get_softc(dev); res = pcib_host_res_alloc(&sc->vmbus_mmio_res, child, type, rid, start, end, count, flags); } else #endif { res = BUS_ALLOC_RESOURCE(parent, child, type, rid, start, end, count, flags); } return (res); } static int vmbus_alloc_msi(device_t bus, device_t dev, int count, int maxcount, int *irqs) { return (PCIB_ALLOC_MSI(device_get_parent(bus), dev, count, maxcount, irqs)); } static int vmbus_release_msi(device_t bus, device_t dev, int count, int *irqs) { return (PCIB_RELEASE_MSI(device_get_parent(bus), dev, count, irqs)); } static int vmbus_alloc_msix(device_t bus, device_t dev, int *irq) { return (PCIB_ALLOC_MSIX(device_get_parent(bus), dev, irq)); } static int vmbus_release_msix(device_t bus, device_t dev, int irq) { return (PCIB_RELEASE_MSIX(device_get_parent(bus), dev, irq)); } static int vmbus_map_msi(device_t bus, device_t dev, int irq, uint64_t *addr, uint32_t *data) { return (PCIB_MAP_MSI(device_get_parent(bus), dev, irq, addr, data)); } static uint32_t vmbus_get_version_method(device_t bus, device_t dev) { struct vmbus_softc *sc = device_get_softc(bus); return sc->vmbus_version; } static int vmbus_probe_guid_method(device_t bus, device_t dev, const struct hyperv_guid *guid) { const struct vmbus_channel *chan = vmbus_get_channel(dev); if (memcmp(&chan->ch_guid_type, guid, sizeof(struct hyperv_guid)) == 0) return 0; return ENXIO; } static uint32_t vmbus_get_vcpu_id_method(device_t bus, device_t dev, int cpu) { const struct vmbus_softc *sc = device_get_softc(bus); return (VMBUS_PCPU_GET(sc, vcpuid, cpu)); } static struct taskqueue * vmbus_get_eventtq_method(device_t bus, device_t dev __unused, int cpu) { const struct vmbus_softc *sc = device_get_softc(bus); KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu%d", cpu)); return (VMBUS_PCPU_GET(sc, event_tq, cpu)); } #ifdef NEW_PCIB #define VTPM_BASE_ADDR 0xfed40000 #define FOUR_GB (1ULL << 32) enum parse_pass { parse_64, parse_32 }; struct parse_context { device_t vmbus_dev; enum parse_pass pass; }; static ACPI_STATUS parse_crs(ACPI_RESOURCE *res, void *ctx) { const struct parse_context *pc = ctx; device_t vmbus_dev = pc->vmbus_dev; struct vmbus_softc *sc = device_get_softc(vmbus_dev); UINT64 start, end; switch (res->Type) { case ACPI_RESOURCE_TYPE_ADDRESS32: start = res->Data.Address32.Address.Minimum; end = res->Data.Address32.Address.Maximum; break; case ACPI_RESOURCE_TYPE_ADDRESS64: start = res->Data.Address64.Address.Minimum; end = res->Data.Address64.Address.Maximum; break; default: /* Unused types. */ return (AE_OK); } /* * We don't use <1MB addresses. */ if (end < 0x100000) return (AE_OK); /* Don't conflict with vTPM. */ if (end >= VTPM_BASE_ADDR && start < VTPM_BASE_ADDR) end = VTPM_BASE_ADDR - 1; if ((pc->pass == parse_32 && start < FOUR_GB) || (pc->pass == parse_64 && start >= FOUR_GB)) pcib_host_res_decodes(&sc->vmbus_mmio_res, SYS_RES_MEMORY, start, end, 0); return (AE_OK); } static void vmbus_get_crs(device_t dev, device_t vmbus_dev, enum parse_pass pass) { struct parse_context pc; ACPI_STATUS status; if (bootverbose) device_printf(dev, "walking _CRS, pass=%d\n", pass); pc.vmbus_dev = vmbus_dev; pc.pass = pass; status = AcpiWalkResources(acpi_get_handle(dev), "_CRS", parse_crs, &pc); if (bootverbose && ACPI_FAILURE(status)) device_printf(dev, "_CRS: not found, pass=%d\n", pass); } static void vmbus_get_mmio_res_pass(device_t dev, enum parse_pass pass) { device_t acpi0, parent; parent = device_get_parent(dev); acpi0 = device_get_parent(parent); if (strcmp("acpi0", device_get_nameunit(acpi0)) == 0) { device_t *children; int count; /* * Try to locate VMBUS resources and find _CRS on them. */ if (device_get_children(acpi0, &children, &count) == 0) { int i; for (i = 0; i < count; ++i) { if (!device_is_attached(children[i])) continue; if (strcmp("vmbus_res", device_get_name(children[i])) == 0) vmbus_get_crs(children[i], dev, pass); } free(children, M_TEMP); } /* * Try to find _CRS on acpi. */ vmbus_get_crs(acpi0, dev, pass); } else { device_printf(dev, "not grandchild of acpi\n"); } /* * Try to find _CRS on parent. */ vmbus_get_crs(parent, dev, pass); } static void vmbus_get_mmio_res(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); /* * We walk the resources twice to make sure that: in the resource * list, the 32-bit resources appear behind the 64-bit resources. * NB: resource_list_add() uses INSERT_TAIL. This way, when we * iterate through the list to find a range for a 64-bit BAR in * vmbus_alloc_resource(), we can make sure we try to use >4GB * ranges first. */ pcib_host_res_init(dev, &sc->vmbus_mmio_res); vmbus_get_mmio_res_pass(dev, parse_64); vmbus_get_mmio_res_pass(dev, parse_32); } /* * On Gen2 VMs, Hyper-V provides mmio space for framebuffer. * This mmio address range is not useable for other PCI devices. * Currently only efifb and vbefb drivers are using this range without * reserving it from system. * Therefore, vmbus driver reserves it before any other PCI device * drivers start to request mmio addresses. */ static struct resource *hv_fb_res; static void vmbus_fb_mmio_res(device_t dev) { struct efi_fb *efifb; #if !defined(__aarch64__) struct vbe_fb *vbefb; #endif /* aarch64 */ rman_res_t fb_start, fb_end, fb_count; int fb_height, fb_width; caddr_t kmdp; struct vmbus_softc *sc = device_get_softc(dev); int rid = 0; kmdp = preload_search_by_type("elf kernel"); if (kmdp == NULL) kmdp = preload_search_by_type("elf64 kernel"); efifb = (struct efi_fb *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_FB); #if !defined(__aarch64__) vbefb = (struct vbe_fb *)preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_VBE_FB); #endif /* aarch64 */ if (efifb != NULL) { fb_start = efifb->fb_addr; fb_end = efifb->fb_addr + efifb->fb_size; fb_count = efifb->fb_size; fb_height = efifb->fb_height; fb_width = efifb->fb_width; } #if !defined(__aarch64__) else if (vbefb != NULL) { fb_start = vbefb->fb_addr; fb_end = vbefb->fb_addr + vbefb->fb_size; fb_count = vbefb->fb_size; fb_height = vbefb->fb_height; fb_width = vbefb->fb_width; } #endif /* aarch64 */ else { if (bootverbose) device_printf(dev, "no preloaded kernel fb information\n"); /* We are on Gen1 VM, just return. */ return; } if (bootverbose) device_printf(dev, "fb: fb_addr: %#jx, size: %#jx, " "actual size needed: 0x%x\n", fb_start, fb_count, fb_height * fb_width); hv_fb_res = pcib_host_res_alloc(&sc->vmbus_mmio_res, dev, SYS_RES_MEMORY, &rid, fb_start, fb_end, fb_count, RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE)); if (hv_fb_res && bootverbose) device_printf(dev, "successfully reserved memory for framebuffer " "starting at %#jx, size %#jx\n", fb_start, fb_count); } static void vmbus_free_mmio_res(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); pcib_host_res_free(dev, &sc->vmbus_mmio_res); if (hv_fb_res) hv_fb_res = NULL; } #endif /* NEW_PCIB */ static void vmbus_identify(driver_t *driver, device_t parent) { if (device_get_unit(parent) != 0 || vm_guest != VM_GUEST_HV || (hyperv_features & CPUID_HV_MSR_SYNIC) == 0) return; device_add_child(parent, "vmbus", -1); } static int vmbus_probe(device_t dev) { if (device_get_unit(dev) != 0 || vm_guest != VM_GUEST_HV || (hyperv_features & CPUID_HV_MSR_SYNIC) == 0) return (ENXIO); device_set_desc(dev, "Hyper-V Vmbus"); return (BUS_PROBE_DEFAULT); } /** * @brief Main vmbus driver initialization routine. * * Here, we * - initialize the vmbus driver context * - setup various driver entry points * - invoke the vmbus hv main init routine * - get the irq resource * - invoke the vmbus to add the vmbus root device * - setup the vmbus root device * - retrieve the channel offers */ static int vmbus_doattach(struct vmbus_softc *sc) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int ret; if (sc->vmbus_flags & VMBUS_FLAG_ATTACHED) return (0); #ifdef NEW_PCIB vmbus_get_mmio_res(sc->vmbus_dev); vmbus_fb_mmio_res(sc->vmbus_dev); #endif sc->vmbus_flags |= VMBUS_FLAG_ATTACHED; sc->vmbus_gpadl = VMBUS_GPADL_START; mtx_init(&sc->vmbus_prichan_lock, "vmbus prichan", NULL, MTX_DEF); TAILQ_INIT(&sc->vmbus_prichans); mtx_init(&sc->vmbus_chan_lock, "vmbus channel", NULL, MTX_DEF); TAILQ_INIT(&sc->vmbus_chans); sc->vmbus_chmap = malloc( sizeof(struct vmbus_channel *) * VMBUS_CHAN_MAX, M_DEVBUF, M_WAITOK | M_ZERO); /* * Create context for "post message" Hypercalls */ sc->vmbus_xc = vmbus_xact_ctx_create(bus_get_dma_tag(sc->vmbus_dev), HYPERCALL_POSTMSGIN_SIZE, VMBUS_MSG_SIZE, sizeof(struct vmbus_msghc)); if (sc->vmbus_xc == NULL) { ret = ENXIO; goto cleanup; } /* * Allocate DMA stuffs. */ ret = vmbus_dma_alloc(sc); if (ret != 0) goto cleanup; /* * Setup interrupt. */ ret = vmbus_intr_setup(sc); if (ret != 0) goto cleanup; /* * Setup SynIC. */ if (bootverbose) device_printf(sc->vmbus_dev, "smp_started = %d\n", smp_started); smp_rendezvous(NULL, vmbus_synic_setup, NULL, sc); sc->vmbus_flags |= VMBUS_FLAG_SYNIC; /* * Initialize vmbus, e.g. connect to Hypervisor. */ ret = vmbus_init(sc); if (ret != 0) goto cleanup; if (sc->vmbus_version == VMBUS_VERSION_WS2008 || sc->vmbus_version == VMBUS_VERSION_WIN7) sc->vmbus_event_proc = vmbus_event_proc_compat; else sc->vmbus_event_proc = vmbus_event_proc; ret = vmbus_scan(sc); if (ret != 0) goto cleanup; ctx = device_get_sysctl_ctx(sc->vmbus_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->vmbus_dev)); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, vmbus_sysctl_version, "A", "vmbus version"); return (ret); cleanup: vmbus_scan_teardown(sc); vmbus_intr_teardown(sc); vmbus_dma_free(sc); if (sc->vmbus_xc != NULL) { vmbus_xact_ctx_destroy(sc->vmbus_xc); sc->vmbus_xc = NULL; } free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF); mtx_destroy(&sc->vmbus_prichan_lock); mtx_destroy(&sc->vmbus_chan_lock); return (ret); } static void vmbus_event_proc_dummy(struct vmbus_softc *sc __unused, int cpu __unused) { } #if defined(EARLY_AP_STARTUP) || defined(__aarch64__) static void vmbus_intrhook(void *xsc) { struct vmbus_softc *sc = xsc; if (bootverbose) device_printf(sc->vmbus_dev, "intrhook\n"); vmbus_doattach(sc); config_intrhook_disestablish(&sc->vmbus_intrhook); } #endif /* EARLY_AP_STARTUP aarch64 */ static int vmbus_attach(device_t dev) { vmbus_sc = device_get_softc(dev); vmbus_sc->vmbus_dev = dev; vmbus_sc->vmbus_idtvec = -1; /* * Event processing logic will be configured: * - After the vmbus protocol version negotiation. * - Before we request channel offers. */ vmbus_sc->vmbus_event_proc = vmbus_event_proc_dummy; #if defined(EARLY_AP_STARTUP) || defined(__aarch64__) /* * Defer the real attach until the pause(9) works as expected. */ vmbus_sc->vmbus_intrhook.ich_func = vmbus_intrhook; vmbus_sc->vmbus_intrhook.ich_arg = vmbus_sc; config_intrhook_establish(&vmbus_sc->vmbus_intrhook); #else /* !EARLY_AP_STARTUP */ /* * If the system has already booted and thread * scheduling is possible indicated by the global * cold set to zero, we just call the driver * initialization directly. */ if (!cold) vmbus_doattach(vmbus_sc); #endif /* EARLY_AP_STARTUP and aarch64 */ return (0); } static int vmbus_detach(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); bus_generic_detach(dev); vmbus_chan_destroy_all(sc); vmbus_scan_teardown(sc); vmbus_disconnect(sc); if (sc->vmbus_flags & VMBUS_FLAG_SYNIC) { sc->vmbus_flags &= ~VMBUS_FLAG_SYNIC; smp_rendezvous(NULL, vmbus_synic_teardown, NULL, NULL); } vmbus_intr_teardown(sc); vmbus_dma_free(sc); if (sc->vmbus_xc != NULL) { vmbus_xact_ctx_destroy(sc->vmbus_xc); sc->vmbus_xc = NULL; } free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF); mtx_destroy(&sc->vmbus_prichan_lock); mtx_destroy(&sc->vmbus_chan_lock); #ifdef NEW_PCIB vmbus_free_mmio_res(dev); #endif #if defined(__aarch64__) bus_release_resource(device_get_parent(dev), SYS_RES_IRQ, sc->vector, sc->ires); #endif return (0); } #if !defined(EARLY_AP_STARTUP) && !defined(__aarch64__) static void vmbus_sysinit(void *arg __unused) { struct vmbus_softc *sc = vmbus_get_softc(); if (vm_guest != VM_GUEST_HV || sc == NULL) return; /* * If the system has already booted and thread * scheduling is possible, as indicated by the * global cold set to zero, we just call the driver * initialization directly. */ if (!cold) vmbus_doattach(sc); } /* * NOTE: * We have to start as the last step of SI_SUB_SMP, i.e. after SMP is * initialized. */ SYSINIT(vmbus_initialize, SI_SUB_SMP, SI_ORDER_ANY, vmbus_sysinit, NULL); #endif /* !EARLY_AP_STARTUP */ diff --git a/sys/dev/hyperv/vmbus/vmbus_chan.c b/sys/dev/hyperv/vmbus/vmbus_chan.c index 032e06c47c95..ff7bbc7c0dc3 100644 --- a/sys/dev/hyperv/vmbus/vmbus_chan.c +++ b/sys/dev/hyperv/vmbus/vmbus_chan.c @@ -1,2390 +1,2393 @@ /*- * Copyright (c) 2009-2012,2016 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include +#include +#include + #include #include #include #include #include #include struct vmbus_chan_pollarg { struct vmbus_channel *poll_chan; u_int poll_hz; }; static void vmbus_chan_update_evtflagcnt( struct vmbus_softc *, const struct vmbus_channel *); static int vmbus_chan_close_internal( struct vmbus_channel *); static int vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS); static void vmbus_chan_sysctl_create( struct vmbus_channel *); static struct vmbus_channel *vmbus_chan_alloc(struct vmbus_softc *); static void vmbus_chan_free(struct vmbus_channel *); static int vmbus_chan_add(struct vmbus_channel *); static void vmbus_chan_cpu_default(struct vmbus_channel *); static int vmbus_chan_release(struct vmbus_channel *); static void vmbus_chan_set_chmap(struct vmbus_channel *); static void vmbus_chan_clear_chmap(struct vmbus_channel *); static void vmbus_chan_detach(struct vmbus_channel *); static bool vmbus_chan_wait_revoke( const struct vmbus_channel *, bool); static void vmbus_chan_poll_timeout(void *); static bool vmbus_chan_poll_cancel_intq( struct vmbus_channel *); static void vmbus_chan_poll_cancel(struct vmbus_channel *); static void vmbus_chan_ins_prilist(struct vmbus_softc *, struct vmbus_channel *); static void vmbus_chan_rem_prilist(struct vmbus_softc *, struct vmbus_channel *); static void vmbus_chan_ins_list(struct vmbus_softc *, struct vmbus_channel *); static void vmbus_chan_rem_list(struct vmbus_softc *, struct vmbus_channel *); static void vmbus_chan_ins_sublist(struct vmbus_channel *, struct vmbus_channel *); static void vmbus_chan_rem_sublist(struct vmbus_channel *, struct vmbus_channel *); static void vmbus_chan_task(void *, int); static void vmbus_chan_task_nobatch(void *, int); static void vmbus_chan_poll_task(void *, int); static void vmbus_chan_clrchmap_task(void *, int); static void vmbus_chan_pollcfg_task(void *, int); static void vmbus_chan_polldis_task(void *, int); static void vmbus_chan_poll_cancel_task(void *, int); static void vmbus_prichan_attach_task(void *, int); static void vmbus_subchan_attach_task(void *, int); static void vmbus_prichan_detach_task(void *, int); static void vmbus_subchan_detach_task(void *, int); static void vmbus_chan_msgproc_choffer(struct vmbus_softc *, const struct vmbus_message *); static void vmbus_chan_msgproc_chrescind( struct vmbus_softc *, const struct vmbus_message *); static int vmbus_chan_printf(const struct vmbus_channel *, const char *, ...) __printflike(2, 3); /* * Vmbus channel message processing. */ static const vmbus_chanmsg_proc_t vmbus_chan_msgprocs[VMBUS_CHANMSG_TYPE_MAX] = { VMBUS_CHANMSG_PROC(CHOFFER, vmbus_chan_msgproc_choffer), VMBUS_CHANMSG_PROC(CHRESCIND, vmbus_chan_msgproc_chrescind), VMBUS_CHANMSG_PROC_WAKEUP(CHOPEN_RESP), VMBUS_CHANMSG_PROC_WAKEUP(GPADL_CONNRESP), VMBUS_CHANMSG_PROC_WAKEUP(GPADL_DISCONNRESP) }; /* * Notify host that there are data pending on our TX bufring or * we have put some data on the TX bufring. */ static __inline void vmbus_chan_signal(const struct vmbus_channel *chan) { atomic_set_long(chan->ch_evtflag, chan->ch_evtflag_mask); if (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF) atomic_set_int(chan->ch_montrig, chan->ch_montrig_mask); else - hypercall_signal_event(chan->ch_monprm_dma.hv_paddr); + hypercall_signal_event(pmap_kextract( + (vm_offset_t)chan->ch_monprm)); } static __inline void vmbus_chan_signal_tx(struct vmbus_channel *chan) { chan->ch_txbr.txbr_intrcnt ++; vmbus_chan_signal(chan); } static __inline void vmbus_chan_signal_rx(struct vmbus_channel *chan) { chan->ch_rxbr.rxbr_intrcnt ++; vmbus_chan_signal(chan); } static void vmbus_chan_ins_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan) { mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED); if (atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_ONPRIL_SHIFT)) panic("channel is already on the prilist"); TAILQ_INSERT_TAIL(&sc->vmbus_prichans, chan, ch_prilink); } static void vmbus_chan_rem_prilist(struct vmbus_softc *sc, struct vmbus_channel *chan) { mtx_assert(&sc->vmbus_prichan_lock, MA_OWNED); if (atomic_testandclear_int(&chan->ch_stflags, VMBUS_CHAN_ST_ONPRIL_SHIFT) == 0) panic("channel is not on the prilist"); TAILQ_REMOVE(&sc->vmbus_prichans, chan, ch_prilink); } static void vmbus_chan_ins_sublist(struct vmbus_channel *prichan, struct vmbus_channel *chan) { mtx_assert(&prichan->ch_subchan_lock, MA_OWNED); if (atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_ONSUBL_SHIFT)) panic("channel is already on the sublist"); TAILQ_INSERT_TAIL(&prichan->ch_subchans, chan, ch_sublink); /* Bump sub-channel count. */ prichan->ch_subchan_cnt++; } static void vmbus_chan_rem_sublist(struct vmbus_channel *prichan, struct vmbus_channel *chan) { mtx_assert(&prichan->ch_subchan_lock, MA_OWNED); KASSERT(prichan->ch_subchan_cnt > 0, ("invalid subchan_cnt %d", prichan->ch_subchan_cnt)); prichan->ch_subchan_cnt--; if (atomic_testandclear_int(&chan->ch_stflags, VMBUS_CHAN_ST_ONSUBL_SHIFT) == 0) panic("channel is not on the sublist"); TAILQ_REMOVE(&prichan->ch_subchans, chan, ch_sublink); } static void vmbus_chan_ins_list(struct vmbus_softc *sc, struct vmbus_channel *chan) { mtx_assert(&sc->vmbus_chan_lock, MA_OWNED); if (atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_ONLIST_SHIFT)) panic("channel is already on the list"); TAILQ_INSERT_TAIL(&sc->vmbus_chans, chan, ch_link); } static void vmbus_chan_rem_list(struct vmbus_softc *sc, struct vmbus_channel *chan) { mtx_assert(&sc->vmbus_chan_lock, MA_OWNED); if (atomic_testandclear_int(&chan->ch_stflags, VMBUS_CHAN_ST_ONLIST_SHIFT) == 0) panic("channel is not on the list"); TAILQ_REMOVE(&sc->vmbus_chans, chan, ch_link); } static int vmbus_chan_sysctl_mnf(SYSCTL_HANDLER_ARGS) { struct vmbus_channel *chan = arg1; int mnf = 0; if (chan->ch_txflags & VMBUS_CHAN_TXF_HASMNF) mnf = 1; return sysctl_handle_int(oidp, &mnf, 0, req); } static void vmbus_chan_sysctl_create(struct vmbus_channel *chan) { struct sysctl_oid *ch_tree, *chid_tree, *br_tree; struct sysctl_ctx_list *ctx; uint32_t ch_id; char name[16]; /* * Add sysctl nodes related to this channel to this * channel's sysctl ctx, so that they can be destroyed * independently upon close of this channel, which can * happen even if the device is not detached. */ ctx = &chan->ch_sysctl_ctx; sysctl_ctx_init(ctx); /* * Create dev.NAME.UNIT.channel tree. */ ch_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(device_get_sysctl_tree(chan->ch_dev)), OID_AUTO, "channel", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (ch_tree == NULL) return; /* * Create dev.NAME.UNIT.channel.CHANID tree. */ if (VMBUS_CHAN_ISPRIMARY(chan)) ch_id = chan->ch_id; else ch_id = chan->ch_prichan->ch_id; snprintf(name, sizeof(name), "%d", ch_id); chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (chid_tree == NULL) return; if (!VMBUS_CHAN_ISPRIMARY(chan)) { /* * Create dev.NAME.UNIT.channel.CHANID.sub tree. */ ch_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "sub", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (ch_tree == NULL) return; /* * Create dev.NAME.UNIT.channel.CHANID.sub.SUBIDX tree. * * NOTE: * chid_tree is changed to this new sysctl tree. */ snprintf(name, sizeof(name), "%d", chan->ch_subidx); chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (chid_tree == NULL) return; SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "chanid", CTLFLAG_RD, &chan->ch_id, 0, "channel id"); } SYSCTL_ADD_UINT(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "cpu", CTLFLAG_RD, &chan->ch_cpuid, 0, "owner CPU id"); SYSCTL_ADD_PROC(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "mnf", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, chan, 0, vmbus_chan_sysctl_mnf, "I", "has monitor notification facilities"); br_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "br", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (br_tree != NULL) { /* * Create sysctl tree for RX bufring. */ vmbus_br_sysctl_create(ctx, br_tree, &chan->ch_rxbr.rxbr, "rx"); /* * Create sysctl tree for TX bufring. */ vmbus_br_sysctl_create(ctx, br_tree, &chan->ch_txbr.txbr, "tx"); } } int vmbus_chan_open(struct vmbus_channel *chan, int txbr_size, int rxbr_size, const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg) { struct vmbus_chan_br cbr; int error; /* * Allocate the TX+RX bufrings. */ KASSERT(chan->ch_bufring == NULL, ("bufrings are allocated")); - chan->ch_bufring = hyperv_dmamem_alloc(bus_get_dma_tag(chan->ch_dev), - PAGE_SIZE, 0, txbr_size + rxbr_size, &chan->ch_bufring_dma, - BUS_DMA_WAITOK); + chan->ch_bufring_size = txbr_size + rxbr_size; + chan->ch_bufring = contigmalloc(chan->ch_bufring_size, M_DEVBUF, + M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); if (chan->ch_bufring == NULL) { vmbus_chan_printf(chan, "bufring allocation failed\n"); return (ENOMEM); } cbr.cbr = chan->ch_bufring; - cbr.cbr_paddr = chan->ch_bufring_dma.hv_paddr; + cbr.cbr_paddr = pmap_kextract((vm_offset_t)chan->ch_bufring); cbr.cbr_txsz = txbr_size; cbr.cbr_rxsz = rxbr_size; error = vmbus_chan_open_br(chan, &cbr, udata, udlen, cb, cbarg); if (error) { if (error == EISCONN) { /* * XXX * The bufring GPADL is still connected; abandon * this bufring, instead of having mysterious * crash or trashed data later on. */ vmbus_chan_printf(chan, "chan%u bufring GPADL " "is still connected upon channel open error; " "leak %d bytes memory\n", chan->ch_id, txbr_size + rxbr_size); } else { - hyperv_dmamem_free(&chan->ch_bufring_dma, - chan->ch_bufring); + contigfree(chan->ch_bufring, chan->ch_bufring_size, + M_DEVBUF); } chan->ch_bufring = NULL; } return (error); } int vmbus_chan_open_br(struct vmbus_channel *chan, const struct vmbus_chan_br *cbr, const void *udata, int udlen, vmbus_chan_callback_t cb, void *cbarg) { struct vmbus_softc *sc = chan->ch_vmbus; const struct vmbus_message *msg; struct vmbus_chanmsg_chopen *req; struct vmbus_msghc *mh; uint32_t status; int error, txbr_size, rxbr_size; task_fn_t *task_fn; uint8_t *br; if (udlen > VMBUS_CHANMSG_CHOPEN_UDATA_SIZE) { vmbus_chan_printf(chan, "invalid udata len %d for chan%u\n", udlen, chan->ch_id); return (EINVAL); } br = cbr->cbr; txbr_size = cbr->cbr_txsz; rxbr_size = cbr->cbr_rxsz; KASSERT((txbr_size & PAGE_MASK) == 0, ("send bufring size is not multiple page")); KASSERT((rxbr_size & PAGE_MASK) == 0, ("recv bufring size is not multiple page")); KASSERT((cbr->cbr_paddr & PAGE_MASK) == 0, ("bufring is not page aligned")); /* * Zero out the TX/RX bufrings, in case that they were used before. */ memset(br, 0, txbr_size + rxbr_size); if (atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED_SHIFT)) panic("double-open chan%u", chan->ch_id); chan->ch_cb = cb; chan->ch_cbarg = cbarg; vmbus_chan_update_evtflagcnt(sc, chan); chan->ch_tq = VMBUS_PCPU_GET(chan->ch_vmbus, event_tq, chan->ch_cpuid); if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD) task_fn = vmbus_chan_task; else task_fn = vmbus_chan_task_nobatch; TASK_INIT(&chan->ch_task, 0, task_fn, chan); /* TX bufring comes first */ vmbus_txbr_setup(&chan->ch_txbr, br, txbr_size); /* RX bufring immediately follows TX bufring */ vmbus_rxbr_setup(&chan->ch_rxbr, br + txbr_size, rxbr_size); /* Create sysctl tree for this channel */ vmbus_chan_sysctl_create(chan); /* * Connect the bufrings, both RX and TX, to this channel. */ error = vmbus_chan_gpadl_connect(chan, cbr->cbr_paddr, txbr_size + rxbr_size, &chan->ch_bufring_gpadl); if (error) { vmbus_chan_printf(chan, "failed to connect bufring GPADL to chan%u\n", chan->ch_id); goto failed; } /* * Install this channel, before it is opened, but after everything * else has been setup. */ vmbus_chan_set_chmap(chan); /* * Open channel w/ the bufring GPADL on the target CPU. */ mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { vmbus_chan_printf(chan, "can not get msg hypercall for chopen(chan%u)\n", chan->ch_id); error = ENXIO; goto failed; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHOPEN; req->chm_chanid = chan->ch_id; req->chm_openid = chan->ch_id; req->chm_gpadl = chan->ch_bufring_gpadl; req->chm_vcpuid = chan->ch_vcpuid; req->chm_txbr_pgcnt = txbr_size >> PAGE_SHIFT; if (udlen > 0) memcpy(req->chm_udata, udata, udlen); error = vmbus_msghc_exec(sc, mh); if (error) { vmbus_chan_printf(chan, "chopen(chan%u) msg hypercall exec failed: %d\n", chan->ch_id, error); vmbus_msghc_put(sc, mh); goto failed; } for (;;) { msg = vmbus_msghc_poll_result(sc, mh); if (msg != NULL) break; if (vmbus_chan_is_revoked(chan)) { int i; /* * NOTE: * Hypervisor does _not_ send response CHOPEN to * a revoked channel. */ vmbus_chan_printf(chan, "chan%u is revoked, when it is being opened\n", chan->ch_id); /* * XXX * Add extra delay before cancel the hypercall * execution; mainly to close any possible * CHRESCIND and CHOPEN_RESP races on the * hypervisor side. */ #define REVOKE_LINGER 100 for (i = 0; i < REVOKE_LINGER; ++i) { msg = vmbus_msghc_poll_result(sc, mh); if (msg != NULL) break; pause("rchopen", 1); } #undef REVOKE_LINGER if (msg == NULL) vmbus_msghc_exec_cancel(sc, mh); break; } pause("chopen", 1); } if (msg != NULL) { status = ((const struct vmbus_chanmsg_chopen_resp *) msg->msg_data)->chm_status; } else { /* XXX any non-0 value is ok here. */ status = 0xff; } vmbus_msghc_put(sc, mh); if (status == 0) { if (bootverbose) vmbus_chan_printf(chan, "chan%u opened\n", chan->ch_id); return (0); } vmbus_chan_printf(chan, "failed to open chan%u\n", chan->ch_id); error = ENXIO; failed: sysctl_ctx_free(&chan->ch_sysctl_ctx); vmbus_chan_clear_chmap(chan); if (chan->ch_bufring_gpadl != 0) { int error1; error1 = vmbus_chan_gpadl_disconnect(chan, chan->ch_bufring_gpadl); if (error1) { /* * Give caller a hint that the bufring GPADL is still * connected. */ error = EISCONN; } chan->ch_bufring_gpadl = 0; } atomic_clear_int(&chan->ch_stflags, VMBUS_CHAN_ST_OPENED); return (error); } int vmbus_chan_gpadl_connect(struct vmbus_channel *chan, bus_addr_t paddr, int size, uint32_t *gpadl0) { struct vmbus_softc *sc = chan->ch_vmbus; struct vmbus_msghc *mh; struct vmbus_chanmsg_gpadl_conn *req; const struct vmbus_message *msg; size_t reqsz; uint32_t gpadl, status; int page_count, range_len, i, cnt, error; uint64_t page_id; KASSERT(*gpadl0 == 0, ("GPADL is not zero")); /* * Preliminary checks. */ KASSERT((size & PAGE_MASK) == 0, ("invalid GPA size %d, not multiple page size", size)); page_count = size >> PAGE_SHIFT; KASSERT((paddr & PAGE_MASK) == 0, ("GPA is not page aligned %jx", (uintmax_t)paddr)); page_id = paddr >> PAGE_SHIFT; range_len = __offsetof(struct vmbus_gpa_range, gpa_page[page_count]); /* * We don't support multiple GPA ranges. */ if (range_len > UINT16_MAX) { vmbus_chan_printf(chan, "GPA too large, %d pages\n", page_count); return EOPNOTSUPP; } /* * Allocate GPADL id. */ gpadl = vmbus_gpadl_alloc(sc); /* * Connect this GPADL to the target channel. * * NOTE: * Since each message can only hold small set of page * addresses, several messages may be required to * complete the connection. */ if (page_count > VMBUS_CHANMSG_GPADL_CONN_PGMAX) cnt = VMBUS_CHANMSG_GPADL_CONN_PGMAX; else cnt = page_count; page_count -= cnt; reqsz = __offsetof(struct vmbus_chanmsg_gpadl_conn, chm_range.gpa_page[cnt]); mh = vmbus_msghc_get(sc, reqsz); if (mh == NULL) { vmbus_chan_printf(chan, "can not get msg hypercall for gpadl_conn(chan%u)\n", chan->ch_id); return EIO; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_CONN; req->chm_chanid = chan->ch_id; req->chm_gpadl = gpadl; req->chm_range_len = range_len; req->chm_range_cnt = 1; req->chm_range.gpa_len = size; req->chm_range.gpa_ofs = 0; for (i = 0; i < cnt; ++i) req->chm_range.gpa_page[i] = page_id++; error = vmbus_msghc_exec(sc, mh); if (error) { vmbus_chan_printf(chan, "gpadl_conn(chan%u) msg hypercall exec failed: %d\n", chan->ch_id, error); vmbus_msghc_put(sc, mh); return error; } while (page_count > 0) { struct vmbus_chanmsg_gpadl_subconn *subreq; if (page_count > VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX) cnt = VMBUS_CHANMSG_GPADL_SUBCONN_PGMAX; else cnt = page_count; page_count -= cnt; reqsz = __offsetof(struct vmbus_chanmsg_gpadl_subconn, chm_gpa_page[cnt]); vmbus_msghc_reset(mh, reqsz); subreq = vmbus_msghc_dataptr(mh); subreq->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_SUBCONN; subreq->chm_gpadl = gpadl; for (i = 0; i < cnt; ++i) subreq->chm_gpa_page[i] = page_id++; vmbus_msghc_exec_noresult(mh); } KASSERT(page_count == 0, ("invalid page count %d", page_count)); msg = vmbus_msghc_wait_result(sc, mh); status = ((const struct vmbus_chanmsg_gpadl_connresp *) msg->msg_data)->chm_status; vmbus_msghc_put(sc, mh); if (status != 0) { vmbus_chan_printf(chan, "gpadl_conn(chan%u) failed: %u\n", chan->ch_id, status); return EIO; } /* Done; commit the GPADL id. */ *gpadl0 = gpadl; if (bootverbose) { vmbus_chan_printf(chan, "gpadl_conn(chan%u) succeeded\n", chan->ch_id); } return 0; } static bool vmbus_chan_wait_revoke(const struct vmbus_channel *chan, bool can_sleep) { #define WAIT_COUNT 200 /* 200ms */ int i; for (i = 0; i < WAIT_COUNT; ++i) { if (vmbus_chan_is_revoked(chan)) return (true); if (can_sleep) pause("wchrev", 1); else DELAY(1000); } return (false); #undef WAIT_COUNT } /* * Disconnect the GPA from the target channel */ int vmbus_chan_gpadl_disconnect(struct vmbus_channel *chan, uint32_t gpadl) { struct vmbus_softc *sc = chan->ch_vmbus; struct vmbus_msghc *mh; struct vmbus_chanmsg_gpadl_disconn *req; int error; KASSERT(gpadl != 0, ("GPADL is zero")); mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { vmbus_chan_printf(chan, "can not get msg hypercall for gpadl_disconn(chan%u)\n", chan->ch_id); return (EBUSY); } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_GPADL_DISCONN; req->chm_chanid = chan->ch_id; req->chm_gpadl = gpadl; error = vmbus_msghc_exec(sc, mh); if (error) { vmbus_msghc_put(sc, mh); if (vmbus_chan_wait_revoke(chan, true)) { /* * Error is benign; this channel is revoked, * so this GPADL will not be touched anymore. */ vmbus_chan_printf(chan, "gpadl_disconn(revoked chan%u) msg hypercall " "exec failed: %d\n", chan->ch_id, error); return (0); } vmbus_chan_printf(chan, "gpadl_disconn(chan%u) msg hypercall exec failed: %d\n", chan->ch_id, error); return (error); } vmbus_msghc_wait_result(sc, mh); /* Discard result; no useful information */ vmbus_msghc_put(sc, mh); return (0); } static void vmbus_chan_detach(struct vmbus_channel *chan) { int refs; KASSERT(chan->ch_refs > 0, ("chan%u: invalid refcnt %d", chan->ch_id, chan->ch_refs)); refs = atomic_fetchadd_int(&chan->ch_refs, -1); #ifdef INVARIANTS if (VMBUS_CHAN_ISPRIMARY(chan)) { KASSERT(refs == 1, ("chan%u: invalid refcnt %d for prichan", chan->ch_id, refs + 1)); } #endif if (refs == 1) { /* * Detach the target channel. */ if (bootverbose) { vmbus_chan_printf(chan, "chan%u detached\n", chan->ch_id); } taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task); } } static void vmbus_chan_clrchmap_task(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; chan->ch_vmbus->vmbus_chmap[chan->ch_id] = NULL; } static void vmbus_chan_clear_chmap(struct vmbus_channel *chan) { struct task chmap_task; TASK_INIT(&chmap_task, 0, vmbus_chan_clrchmap_task, chan); vmbus_chan_run_task(chan, &chmap_task); } static void vmbus_chan_set_chmap(struct vmbus_channel *chan) { __compiler_membar(); chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan; } static void vmbus_chan_poll_cancel_task(void *xchan, int pending __unused) { vmbus_chan_poll_cancel_intq(xchan); } static void vmbus_chan_poll_cancel(struct vmbus_channel *chan) { struct task poll_cancel; TASK_INIT(&poll_cancel, 0, vmbus_chan_poll_cancel_task, chan); vmbus_chan_run_task(chan, &poll_cancel); } static int vmbus_chan_close_internal(struct vmbus_channel *chan) { struct vmbus_softc *sc = chan->ch_vmbus; struct vmbus_msghc *mh; struct vmbus_chanmsg_chclose *req; uint32_t old_stflags; int error; /* * NOTE: * Sub-channels are closed upon their primary channel closing, * so they can be closed even before they are opened. */ for (;;) { old_stflags = chan->ch_stflags; if (atomic_cmpset_int(&chan->ch_stflags, old_stflags, old_stflags & ~VMBUS_CHAN_ST_OPENED)) break; } if ((old_stflags & VMBUS_CHAN_ST_OPENED) == 0) { /* Not opened yet; done */ if (bootverbose) { vmbus_chan_printf(chan, "chan%u not opened\n", chan->ch_id); } return (0); } /* * Free this channel's sysctl tree attached to its device's * sysctl tree. */ sysctl_ctx_free(&chan->ch_sysctl_ctx); /* * Cancel polling, if it is enabled. */ vmbus_chan_poll_cancel(chan); /* * NOTE: * Order is critical. This channel _must_ be uninstalled first, * else the channel task may be enqueued by the IDT after it has * been drained. */ vmbus_chan_clear_chmap(chan); taskqueue_drain(chan->ch_tq, &chan->ch_task); chan->ch_tq = NULL; /* * Close this channel. */ mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { vmbus_chan_printf(chan, "can not get msg hypercall for chclose(chan%u)\n", chan->ch_id); error = ENXIO; goto disconnect; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHCLOSE; req->chm_chanid = chan->ch_id; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); if (error) { vmbus_chan_printf(chan, "chclose(chan%u) msg hypercall exec failed: %d\n", chan->ch_id, error); goto disconnect; } if (bootverbose) vmbus_chan_printf(chan, "chan%u closed\n", chan->ch_id); disconnect: /* * Disconnect the TX+RX bufrings from this channel. */ if (chan->ch_bufring_gpadl != 0) { int error1; error1 = vmbus_chan_gpadl_disconnect(chan, chan->ch_bufring_gpadl); if (error1) { /* * XXX * The bufring GPADL is still connected; abandon * this bufring, instead of having mysterious * crash or trashed data later on. */ vmbus_chan_printf(chan, "chan%u bufring GPADL " "is still connected after close\n", chan->ch_id); chan->ch_bufring = NULL; /* * Give caller a hint that the bufring GPADL is * still connected. */ error = EISCONN; } chan->ch_bufring_gpadl = 0; } /* * Destroy the TX+RX bufrings. */ if (chan->ch_bufring != NULL) { - hyperv_dmamem_free(&chan->ch_bufring_dma, chan->ch_bufring); + contigfree(chan->ch_bufring, chan->ch_bufring_size, M_DEVBUF); chan->ch_bufring = NULL; } return (error); } int vmbus_chan_close_direct(struct vmbus_channel *chan) { int error; #ifdef INVARIANTS if (VMBUS_CHAN_ISPRIMARY(chan)) { struct vmbus_channel *subchan; /* * All sub-channels _must_ have been closed, or are _not_ * opened at all. */ mtx_lock(&chan->ch_subchan_lock); TAILQ_FOREACH(subchan, &chan->ch_subchans, ch_sublink) { KASSERT( (subchan->ch_stflags & VMBUS_CHAN_ST_OPENED) == 0, ("chan%u: subchan%u is still opened", chan->ch_id, subchan->ch_subidx)); } mtx_unlock(&chan->ch_subchan_lock); } #endif error = vmbus_chan_close_internal(chan); if (!VMBUS_CHAN_ISPRIMARY(chan)) { /* * This sub-channel is referenced, when it is linked to * the primary channel; drop that reference now. */ vmbus_chan_detach(chan); } return (error); } /* * Caller should make sure that all sub-channels have * been added to 'chan' and all to-be-closed channels * are not being opened. */ void vmbus_chan_close(struct vmbus_channel *chan) { int subchan_cnt; if (!VMBUS_CHAN_ISPRIMARY(chan)) { /* * Sub-channel is closed when its primary channel * is closed; done. */ return; } /* * Close all sub-channels, if any. */ subchan_cnt = chan->ch_subchan_cnt; if (subchan_cnt > 0) { struct vmbus_channel **subchan; int i; subchan = vmbus_subchan_get(chan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) { vmbus_chan_close_internal(subchan[i]); /* * This sub-channel is referenced, when it is * linked to the primary channel; drop that * reference now. */ vmbus_chan_detach(subchan[i]); } vmbus_subchan_rel(subchan, subchan_cnt); } /* Then close the primary channel. */ vmbus_chan_close_internal(chan); } void vmbus_chan_intr_drain(struct vmbus_channel *chan) { taskqueue_drain(chan->ch_tq, &chan->ch_task); } uint32_t vmbus_chan_write_available(struct vmbus_channel *chan) { return (vmbus_txbr_available(&chan->ch_txbr)); } bool vmbus_chan_write_signal(struct vmbus_channel *chan, int32_t min_signal_size) { if (min_signal_size >= 0 && vmbus_chan_write_available(chan) > min_signal_size) { return false; } if (!vmbus_txbr_get_imask(&chan->ch_txbr)) { /* txbr imask is not set, signal the reader */ vmbus_chan_signal_tx(chan); return true; } return false; } void vmbus_chan_set_pending_send_size(struct vmbus_channel *chan, uint32_t size) { if (chan) vmbus_txbr_set_pending_snd_sz(&chan->ch_txbr, size); } int vmbus_chan_iov_send(struct vmbus_channel *chan, const struct iovec iov[], int iovlen, vmbus_br_copy_callback_t cb, void *cbarg) { int error; boolean_t send_evt; if (iovlen == 0) return (0); error = vmbus_txbr_write_call(&chan->ch_txbr, iov, iovlen, cb, cbarg, &send_evt); if (!error && send_evt) { vmbus_chan_signal_tx(chan); } return error; } int vmbus_chan_send(struct vmbus_channel *chan, uint16_t type, uint16_t flags, void *data, int dlen, uint64_t xactid) { struct vmbus_chanpkt pkt; int pktlen, pad_pktlen, hlen, error; uint64_t pad = 0; struct iovec iov[3]; boolean_t send_evt; hlen = sizeof(pkt); pktlen = hlen + dlen; pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen); KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr), ("invalid packet size %d", pad_pktlen)); pkt.cp_hdr.cph_type = type; pkt.cp_hdr.cph_flags = flags; VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen); VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen); pkt.cp_hdr.cph_xactid = xactid; iov[0].iov_base = &pkt; iov[0].iov_len = hlen; iov[1].iov_base = data; iov[1].iov_len = dlen; iov[2].iov_base = &pad; iov[2].iov_len = pad_pktlen - pktlen; error = vmbus_txbr_write(&chan->ch_txbr, iov, 3, &send_evt); if (!error && send_evt) vmbus_chan_signal_tx(chan); return error; } int vmbus_chan_send_sglist(struct vmbus_channel *chan, struct vmbus_gpa sg[], int sglen, void *data, int dlen, uint64_t xactid) { struct vmbus_chanpkt_sglist pkt; int pktlen, pad_pktlen, hlen, error; struct iovec iov[4]; boolean_t send_evt; uint64_t pad = 0; hlen = __offsetof(struct vmbus_chanpkt_sglist, cp_gpa[sglen]); pktlen = hlen + dlen; pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen); KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr), ("invalid packet size %d", pad_pktlen)); pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA; pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC; VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen); VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen); pkt.cp_hdr.cph_xactid = xactid; pkt.cp_rsvd = 0; pkt.cp_gpa_cnt = sglen; iov[0].iov_base = &pkt; iov[0].iov_len = sizeof(pkt); iov[1].iov_base = sg; iov[1].iov_len = sizeof(struct vmbus_gpa) * sglen; iov[2].iov_base = data; iov[2].iov_len = dlen; iov[3].iov_base = &pad; iov[3].iov_len = pad_pktlen - pktlen; error = vmbus_txbr_write(&chan->ch_txbr, iov, 4, &send_evt); if (!error && send_evt) vmbus_chan_signal_tx(chan); return error; } int vmbus_chan_send_prplist(struct vmbus_channel *chan, struct vmbus_gpa_range *prp, int prp_cnt, void *data, int dlen, uint64_t xactid) { struct vmbus_chanpkt_prplist pkt; int pktlen, pad_pktlen, hlen, error; struct iovec iov[4]; boolean_t send_evt; uint64_t pad = 0; hlen = __offsetof(struct vmbus_chanpkt_prplist, cp_range[0].gpa_page[prp_cnt]); pktlen = hlen + dlen; pad_pktlen = VMBUS_CHANPKT_TOTLEN(pktlen); KASSERT(pad_pktlen <= vmbus_txbr_maxpktsz(&chan->ch_txbr), ("invalid packet size %d", pad_pktlen)); pkt.cp_hdr.cph_type = VMBUS_CHANPKT_TYPE_GPA; pkt.cp_hdr.cph_flags = VMBUS_CHANPKT_FLAG_RC; VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_hlen, hlen); VMBUS_CHANPKT_SETLEN(pkt.cp_hdr.cph_tlen, pad_pktlen); pkt.cp_hdr.cph_xactid = xactid; pkt.cp_rsvd = 0; pkt.cp_range_cnt = 1; iov[0].iov_base = &pkt; iov[0].iov_len = sizeof(pkt); iov[1].iov_base = prp; iov[1].iov_len = __offsetof(struct vmbus_gpa_range, gpa_page[prp_cnt]); iov[2].iov_base = data; iov[2].iov_len = dlen; iov[3].iov_base = &pad; iov[3].iov_len = pad_pktlen - pktlen; error = vmbus_txbr_write(&chan->ch_txbr, iov, 4, &send_evt); if (!error && send_evt) vmbus_chan_signal_tx(chan); return error; } int vmbus_chan_recv(struct vmbus_channel *chan, void *data, int *dlen0, uint64_t *xactid) { struct vmbus_chanpkt_hdr pkt; int error, dlen, hlen; error = vmbus_rxbr_peek(&chan->ch_rxbr, &pkt, sizeof(pkt)); if (error) return (error); if (__predict_false(pkt.cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) { vmbus_chan_printf(chan, "invalid hlen %u\n", pkt.cph_hlen); /* XXX this channel is dead actually. */ return (EIO); } if (__predict_false(pkt.cph_hlen > pkt.cph_tlen)) { vmbus_chan_printf(chan, "invalid hlen %u and tlen %u\n", pkt.cph_hlen, pkt.cph_tlen); /* XXX this channel is dead actually. */ return (EIO); } hlen = VMBUS_CHANPKT_GETLEN(pkt.cph_hlen); dlen = VMBUS_CHANPKT_GETLEN(pkt.cph_tlen) - hlen; if (*dlen0 < dlen) { /* Return the size of this packet's data. */ *dlen0 = dlen; return (ENOBUFS); } *xactid = pkt.cph_xactid; *dlen0 = dlen; /* Skip packet header */ error = vmbus_rxbr_read(&chan->ch_rxbr, data, dlen, hlen); KASSERT(!error, ("vmbus_rxbr_read failed")); return (0); } int vmbus_chan_recv_pkt(struct vmbus_channel *chan, struct vmbus_chanpkt_hdr *pkt, int *pktlen0) { int error, pktlen, pkt_hlen; pkt_hlen = sizeof(*pkt); error = vmbus_rxbr_peek(&chan->ch_rxbr, pkt, pkt_hlen); if (error) return (error); if (__predict_false(pkt->cph_hlen < VMBUS_CHANPKT_HLEN_MIN)) { vmbus_chan_printf(chan, "invalid hlen %u\n", pkt->cph_hlen); /* XXX this channel is dead actually. */ return (EIO); } if (__predict_false(pkt->cph_hlen > pkt->cph_tlen)) { vmbus_chan_printf(chan, "invalid hlen %u and tlen %u\n", pkt->cph_hlen, pkt->cph_tlen); /* XXX this channel is dead actually. */ return (EIO); } pktlen = VMBUS_CHANPKT_GETLEN(pkt->cph_tlen); if (*pktlen0 < pktlen) { /* Return the size of this packet. */ *pktlen0 = pktlen; return (ENOBUFS); } *pktlen0 = pktlen; /* * Skip the fixed-size packet header, which has been filled * by the above vmbus_rxbr_peek(). */ error = vmbus_rxbr_read(&chan->ch_rxbr, pkt + 1, pktlen - pkt_hlen, pkt_hlen); KASSERT(!error, ("vmbus_rxbr_read failed")); return (0); } uint32_t vmbus_chan_read_available(struct vmbus_channel *chan) { return (vmbus_rxbr_available(&chan->ch_rxbr)); } /* * This routine does: * - Advance the channel read index for 'advance' bytes * - Copy data_len bytes in to the buffer pointed by 'data' * Return 0 if operation succeed. EAGAIN if operations if failed. * If failed, the buffer pointed by 'data' is intact, and the * channel read index is not advanced at all. */ int vmbus_chan_recv_peek(struct vmbus_channel *chan, void *data, int data_len, uint32_t advance) { int error; boolean_t sig_event; if (data == NULL || data_len <= 0) return (EINVAL); error = vmbus_rxbr_idxadv_peek(&chan->ch_rxbr, data, data_len, advance, &sig_event); if (!error && sig_event) { vmbus_chan_signal_rx(chan); } return (error); } /* * This routine does: * - Advance the channel read index for 'advance' bytes */ int vmbus_chan_recv_idxadv(struct vmbus_channel *chan, uint32_t advance) { int error; boolean_t sig_event; if (advance == 0) return (EINVAL); error = vmbus_rxbr_idxadv(&chan->ch_rxbr, advance, &sig_event); if (!error && sig_event) { vmbus_chan_signal_rx(chan); } return (error); } /* * Caller should hold its own lock to serialize the ring buffer * copy. */ int vmbus_chan_recv_peek_call(struct vmbus_channel *chan, int data_len, uint32_t skip, vmbus_br_copy_callback_t cb, void *cbarg) { if (!chan || data_len <= 0 || cb == NULL) return (EINVAL); return (vmbus_rxbr_peek_call(&chan->ch_rxbr, data_len, skip, cb, cbarg)); } static void vmbus_chan_task(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; vmbus_chan_callback_t cb = chan->ch_cb; void *cbarg = chan->ch_cbarg; KASSERT(chan->ch_poll_intvl == 0, ("chan%u: interrupted in polling mode", chan->ch_id)); /* * Optimize host to guest signaling by ensuring: * 1. While reading the channel, we disable interrupts from * host. * 2. Ensure that we process all posted messages from the host * before returning from this callback. * 3. Once we return, enable signaling from the host. Once this * state is set we check to see if additional packets are * available to read. In this case we repeat the process. * * NOTE: Interrupt has been disabled in the ISR. */ for (;;) { uint32_t left; cb(chan, cbarg); left = vmbus_rxbr_intr_unmask(&chan->ch_rxbr); if (left == 0) { /* No more data in RX bufring; done */ break; } vmbus_rxbr_intr_mask(&chan->ch_rxbr); } } static void vmbus_chan_task_nobatch(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; KASSERT(chan->ch_poll_intvl == 0, ("chan%u: interrupted in polling mode", chan->ch_id)); chan->ch_cb(chan, chan->ch_cbarg); } static void vmbus_chan_poll_timeout(void *xchan) { struct vmbus_channel *chan = xchan; KASSERT(chan->ch_poll_intvl != 0, ("chan%u: polling timeout in interrupt mode", chan->ch_id)); taskqueue_enqueue(chan->ch_tq, &chan->ch_poll_task); } static void vmbus_chan_poll_task(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; KASSERT(chan->ch_poll_intvl != 0, ("chan%u: polling in interrupt mode", chan->ch_id)); callout_reset_sbt_curcpu(&chan->ch_poll_timeo, chan->ch_poll_intvl, 0, vmbus_chan_poll_timeout, chan, chan->ch_poll_flags); chan->ch_cb(chan, chan->ch_cbarg); } static void vmbus_chan_pollcfg_task(void *xarg, int pending __unused) { const struct vmbus_chan_pollarg *arg = xarg; struct vmbus_channel *chan = arg->poll_chan; sbintime_t intvl; int poll_flags; /* * Save polling interval. */ intvl = SBT_1S / arg->poll_hz; if (intvl == 0) intvl = 1; if (intvl == chan->ch_poll_intvl) { /* Nothing changes; done */ return; } chan->ch_poll_intvl = intvl; /* Adjust callout flags. */ poll_flags = C_DIRECT_EXEC; if (arg->poll_hz <= hz) poll_flags |= C_HARDCLOCK; chan->ch_poll_flags = poll_flags; /* * Disconnect this channel from the channel map to make sure that * the RX bufring interrupt enabling bit can not be touched, and * ISR can not enqueue this channel task anymore. THEN, disable * interrupt from the RX bufring (TX bufring does not generate * interrupt to VM). * * NOTE: order is critical. */ chan->ch_vmbus->vmbus_chmap[chan->ch_id] = NULL; __compiler_membar(); vmbus_rxbr_intr_mask(&chan->ch_rxbr); /* * NOTE: * At this point, this channel task will not be enqueued by * the ISR anymore, time to cancel the pending one. */ taskqueue_cancel(chan->ch_tq, &chan->ch_task, NULL); /* Kick start! */ taskqueue_enqueue(chan->ch_tq, &chan->ch_poll_task); } static bool vmbus_chan_poll_cancel_intq(struct vmbus_channel *chan) { if (chan->ch_poll_intvl == 0) { /* Not enabled. */ return (false); } /* * Stop polling callout, so that channel polling task * will not be enqueued anymore. */ callout_drain(&chan->ch_poll_timeo); /* * Disable polling by resetting polling interval. * * NOTE: * The polling interval resetting MUST be conducted * after the callout is drained; mainly to keep the * proper assertion in place. */ chan->ch_poll_intvl = 0; /* * NOTE: * At this point, this channel polling task will not be * enqueued by the callout anymore, time to cancel the * pending one. */ taskqueue_cancel(chan->ch_tq, &chan->ch_poll_task, NULL); /* Polling was enabled. */ return (true); } static void vmbus_chan_polldis_task(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; if (!vmbus_chan_poll_cancel_intq(chan)) { /* Already disabled; done. */ return; } /* * Plug this channel back to the channel map and unmask * the RX bufring interrupt. */ chan->ch_vmbus->vmbus_chmap[chan->ch_id] = chan; __compiler_membar(); vmbus_rxbr_intr_unmask(&chan->ch_rxbr); /* * Kick start the interrupt task, just in case unmasking * interrupt races ISR. */ taskqueue_enqueue(chan->ch_tq, &chan->ch_task); } static __inline void vmbus_event_flags_proc(struct vmbus_softc *sc, volatile u_long *event_flags, int flag_cnt) { int f; for (f = 0; f < flag_cnt; ++f) { uint32_t chid_base; u_long flags; int chid_ofs; if (event_flags[f] == 0) continue; flags = atomic_swap_long(&event_flags[f], 0); chid_base = f << VMBUS_EVTFLAG_SHIFT; while ((chid_ofs = ffsl(flags)) != 0) { struct vmbus_channel *chan; --chid_ofs; /* NOTE: ffsl is 1-based */ flags &= ~(1UL << chid_ofs); chan = sc->vmbus_chmap[chid_base + chid_ofs]; if (__predict_false(chan == NULL)) { /* Channel is closed. */ continue; } __compiler_membar(); if (chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD) vmbus_rxbr_intr_mask(&chan->ch_rxbr); taskqueue_enqueue(chan->ch_tq, &chan->ch_task); } } } void vmbus_event_proc(struct vmbus_softc *sc, int cpu) { struct vmbus_evtflags *eventf; /* * On Host with Win8 or above, the event page can be checked directly * to get the id of the channel that has the pending interrupt. */ eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE; vmbus_event_flags_proc(sc, eventf->evt_flags, VMBUS_PCPU_GET(sc, event_flags_cnt, cpu)); } void vmbus_event_proc_compat(struct vmbus_softc *sc, int cpu) { struct vmbus_evtflags *eventf; eventf = VMBUS_PCPU_GET(sc, event_flags, cpu) + VMBUS_SINT_MESSAGE; if (atomic_testandclear_long(&eventf->evt_flags[0], 0)) { vmbus_event_flags_proc(sc, sc->vmbus_rx_evtflags, VMBUS_CHAN_MAX_COMPAT >> VMBUS_EVTFLAG_SHIFT); } } static void vmbus_chan_update_evtflagcnt(struct vmbus_softc *sc, const struct vmbus_channel *chan) { volatile int *flag_cnt_ptr; int flag_cnt; flag_cnt = (chan->ch_id / VMBUS_EVTFLAG_LEN) + 1; flag_cnt_ptr = VMBUS_PCPU_PTR(sc, event_flags_cnt, chan->ch_cpuid); for (;;) { int old_flag_cnt; old_flag_cnt = *flag_cnt_ptr; if (old_flag_cnt >= flag_cnt) break; if (atomic_cmpset_int(flag_cnt_ptr, old_flag_cnt, flag_cnt)) { if (bootverbose) { vmbus_chan_printf(chan, "chan%u update cpu%d flag_cnt to %d\n", chan->ch_id, chan->ch_cpuid, flag_cnt); } break; } } } static struct vmbus_channel * vmbus_chan_alloc(struct vmbus_softc *sc) { struct vmbus_channel *chan; chan = malloc(sizeof(*chan), M_DEVBUF, M_WAITOK | M_ZERO); - chan->ch_monprm = hyperv_dmamem_alloc(bus_get_dma_tag(sc->vmbus_dev), - HYPERCALL_PARAM_ALIGN, 0, sizeof(struct hyperv_mon_param), - &chan->ch_monprm_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); + chan->ch_monprm = contigmalloc(sizeof(struct hyperv_mon_param), + M_DEVBUF, M_WAITOK | M_ZERO, 0ul, ~0ul, HYPERCALL_PARAM_ALIGN, 0); if (chan->ch_monprm == NULL) { device_printf(sc->vmbus_dev, "monprm alloc failed\n"); free(chan, M_DEVBUF); return NULL; } chan->ch_refs = 1; chan->ch_vmbus = sc; mtx_init(&chan->ch_subchan_lock, "vmbus subchan", NULL, MTX_DEF); sx_init(&chan->ch_orphan_lock, "vmbus chorphan"); TAILQ_INIT(&chan->ch_subchans); vmbus_rxbr_init(&chan->ch_rxbr); vmbus_txbr_init(&chan->ch_txbr); TASK_INIT(&chan->ch_poll_task, 0, vmbus_chan_poll_task, chan); callout_init(&chan->ch_poll_timeo, 1); return chan; } static void vmbus_chan_free(struct vmbus_channel *chan) { KASSERT(TAILQ_EMPTY(&chan->ch_subchans) && chan->ch_subchan_cnt == 0, ("still owns sub-channels")); KASSERT((chan->ch_stflags & (VMBUS_CHAN_ST_OPENED | VMBUS_CHAN_ST_ONPRIL | VMBUS_CHAN_ST_ONSUBL | VMBUS_CHAN_ST_ONLIST)) == 0, ("free busy channel")); KASSERT(chan->ch_orphan_xact == NULL, ("still has orphan xact installed")); KASSERT(chan->ch_refs == 0, ("chan%u: invalid refcnt %d", chan->ch_id, chan->ch_refs)); KASSERT(chan->ch_poll_intvl == 0, ("chan%u: polling is activated", chan->ch_id)); - hyperv_dmamem_free(&chan->ch_monprm_dma, chan->ch_monprm); + contigfree(chan->ch_monprm, sizeof(struct hyperv_mon_param), M_DEVBUF); mtx_destroy(&chan->ch_subchan_lock); sx_destroy(&chan->ch_orphan_lock); vmbus_rxbr_deinit(&chan->ch_rxbr); vmbus_txbr_deinit(&chan->ch_txbr); free(chan, M_DEVBUF); } static int vmbus_chan_add(struct vmbus_channel *newchan) { struct vmbus_softc *sc = newchan->ch_vmbus; struct vmbus_channel *prichan; if (newchan->ch_id == 0) { /* * XXX * Chan0 will neither be processed nor should be offered; * skip it. */ device_printf(sc->vmbus_dev, "got chan0 offer, discard\n"); return EINVAL; } else if (newchan->ch_id >= VMBUS_CHAN_MAX) { device_printf(sc->vmbus_dev, "invalid chan%u offer\n", newchan->ch_id); return EINVAL; } mtx_lock(&sc->vmbus_prichan_lock); TAILQ_FOREACH(prichan, &sc->vmbus_prichans, ch_prilink) { /* * Sub-channel will have the same type GUID and instance * GUID as its primary channel. */ if (memcmp(&prichan->ch_guid_type, &newchan->ch_guid_type, sizeof(struct hyperv_guid)) == 0 && memcmp(&prichan->ch_guid_inst, &newchan->ch_guid_inst, sizeof(struct hyperv_guid)) == 0) break; } if (VMBUS_CHAN_ISPRIMARY(newchan)) { if (prichan == NULL) { /* Install the new primary channel */ vmbus_chan_ins_prilist(sc, newchan); mtx_unlock(&sc->vmbus_prichan_lock); goto done; } else { mtx_unlock(&sc->vmbus_prichan_lock); device_printf(sc->vmbus_dev, "duplicated primary chan%u\n", newchan->ch_id); return EINVAL; } } else { /* Sub-channel */ if (prichan == NULL) { mtx_unlock(&sc->vmbus_prichan_lock); device_printf(sc->vmbus_dev, "no primary chan for chan%u\n", newchan->ch_id); return EINVAL; } /* * Found the primary channel for this sub-channel and * move on. * * XXX refcnt prichan */ } mtx_unlock(&sc->vmbus_prichan_lock); /* * This is a sub-channel; link it with the primary channel. */ KASSERT(!VMBUS_CHAN_ISPRIMARY(newchan), ("new channel is not sub-channel")); KASSERT(prichan != NULL, ("no primary channel")); /* * Reference count this sub-channel; it will be dereferenced * when this sub-channel is closed. */ KASSERT(newchan->ch_refs == 1, ("chan%u: invalid refcnt %d", newchan->ch_id, newchan->ch_refs)); atomic_add_int(&newchan->ch_refs, 1); newchan->ch_prichan = prichan; newchan->ch_dev = prichan->ch_dev; mtx_lock(&prichan->ch_subchan_lock); vmbus_chan_ins_sublist(prichan, newchan); mtx_unlock(&prichan->ch_subchan_lock); /* * Notify anyone that is interested in this sub-channel, * after this sub-channel is setup. */ wakeup(prichan); done: /* * Hook this channel up for later revocation. */ mtx_lock(&sc->vmbus_chan_lock); vmbus_chan_ins_list(sc, newchan); mtx_unlock(&sc->vmbus_chan_lock); if (bootverbose) { vmbus_chan_printf(newchan, "chan%u subidx%u offer\n", newchan->ch_id, newchan->ch_subidx); } /* Select default cpu for this channel. */ vmbus_chan_cpu_default(newchan); return 0; } void vmbus_chan_cpu_set(struct vmbus_channel *chan, int cpu) { KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu %d", cpu)); if (chan->ch_vmbus->vmbus_version == VMBUS_VERSION_WS2008 || chan->ch_vmbus->vmbus_version == VMBUS_VERSION_WIN7) { /* Only cpu0 is supported */ cpu = 0; } chan->ch_cpuid = cpu; chan->ch_vcpuid = VMBUS_PCPU_GET(chan->ch_vmbus, vcpuid, cpu); if (bootverbose) { vmbus_chan_printf(chan, "chan%u assigned to cpu%u [vcpu%u]\n", chan->ch_id, chan->ch_cpuid, chan->ch_vcpuid); } } void vmbus_chan_cpu_rr(struct vmbus_channel *chan) { static uint32_t vmbus_chan_nextcpu; int cpu; cpu = atomic_fetchadd_int(&vmbus_chan_nextcpu, 1) % mp_ncpus; vmbus_chan_cpu_set(chan, cpu); } static void vmbus_chan_cpu_default(struct vmbus_channel *chan) { /* * By default, pin the channel to cpu0. Devices having * special channel-cpu mapping requirement should call * vmbus_chan_cpu_{set,rr}(). */ vmbus_chan_cpu_set(chan, 0); } static void vmbus_chan_msgproc_choffer(struct vmbus_softc *sc, const struct vmbus_message *msg) { const struct vmbus_chanmsg_choffer *offer; struct vmbus_channel *chan; task_fn_t *detach_fn, *attach_fn; int error; offer = (const struct vmbus_chanmsg_choffer *)msg->msg_data; chan = vmbus_chan_alloc(sc); if (chan == NULL) { device_printf(sc->vmbus_dev, "allocate chan%u failed\n", offer->chm_chanid); return; } chan->ch_id = offer->chm_chanid; chan->ch_subidx = offer->chm_subidx; chan->ch_guid_type = offer->chm_chtype; chan->ch_guid_inst = offer->chm_chinst; /* Batch reading is on by default */ chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD; chan->ch_monprm->mp_connid = VMBUS_CONNID_EVENT; if (sc->vmbus_version != VMBUS_VERSION_WS2008) chan->ch_monprm->mp_connid = offer->chm_connid; if (offer->chm_flags1 & VMBUS_CHOFFER_FLAG1_HASMNF) { int trig_idx; /* * Setup MNF stuffs. */ chan->ch_txflags |= VMBUS_CHAN_TXF_HASMNF; trig_idx = offer->chm_montrig / VMBUS_MONTRIG_LEN; if (trig_idx >= VMBUS_MONTRIGS_MAX) panic("invalid monitor trigger %u", offer->chm_montrig); chan->ch_montrig = &sc->vmbus_mnf2->mnf_trigs[trig_idx].mt_pending; chan->ch_montrig_mask = 1 << (offer->chm_montrig % VMBUS_MONTRIG_LEN); } if (offer->chm_chflags & VMBUS_CHAN_TLNPI_PROVIDER_OFFER) { /* This is HyperV socket channel */ chan->ch_is_hvs = true; /* The first byte != 0 means the host initiated connection. */ chan->ch_hvs_conn_from_host = offer->chm_udata.pipe.user_def[0]; if (bootverbose) { device_printf(sc->vmbus_dev, "chan%u is hyperv socket channel " "connected %s host\n", chan->ch_id, (chan->ch_hvs_conn_from_host != 0) ? "from" : "to"); } } else { chan->ch_is_hvs = false; } /* * Setup event flag. */ chan->ch_evtflag = &sc->vmbus_tx_evtflags[chan->ch_id >> VMBUS_EVTFLAG_SHIFT]; chan->ch_evtflag_mask = 1UL << (chan->ch_id & VMBUS_EVTFLAG_MASK); /* * Setup attach and detach tasks. */ if (VMBUS_CHAN_ISPRIMARY(chan)) { chan->ch_mgmt_tq = sc->vmbus_devtq; attach_fn = vmbus_prichan_attach_task; detach_fn = vmbus_prichan_detach_task; } else { chan->ch_mgmt_tq = sc->vmbus_subchtq; attach_fn = vmbus_subchan_attach_task; detach_fn = vmbus_subchan_detach_task; } TASK_INIT(&chan->ch_attach_task, 0, attach_fn, chan); TASK_INIT(&chan->ch_detach_task, 0, detach_fn, chan); error = vmbus_chan_add(chan); if (error) { device_printf(sc->vmbus_dev, "add chan%u failed: %d\n", chan->ch_id, error); atomic_subtract_int(&chan->ch_refs, 1); vmbus_chan_free(chan); return; } taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_attach_task); } static void vmbus_chan_msgproc_chrescind(struct vmbus_softc *sc, const struct vmbus_message *msg) { const struct vmbus_chanmsg_chrescind *note; struct vmbus_channel *chan; note = (const struct vmbus_chanmsg_chrescind *)msg->msg_data; if (note->chm_chanid > VMBUS_CHAN_MAX) { device_printf(sc->vmbus_dev, "invalid revoked chan%u\n", note->chm_chanid); return; } /* * Find and remove the target channel from the channel list. */ mtx_lock(&sc->vmbus_chan_lock); TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) { if (chan->ch_id == note->chm_chanid) break; } if (chan == NULL) { mtx_unlock(&sc->vmbus_chan_lock); device_printf(sc->vmbus_dev, "chan%u is not offered\n", note->chm_chanid); return; } vmbus_chan_rem_list(sc, chan); mtx_unlock(&sc->vmbus_chan_lock); if (VMBUS_CHAN_ISPRIMARY(chan)) { /* * The target channel is a primary channel; remove the * target channel from the primary channel list now, * instead of later, so that it will not be found by * other sub-channel offers, which are processed in * this thread. */ mtx_lock(&sc->vmbus_prichan_lock); vmbus_chan_rem_prilist(sc, chan); mtx_unlock(&sc->vmbus_prichan_lock); } /* * NOTE: * The following processing order is critical: * Set the REVOKED state flag before orphaning the installed xact. */ if (atomic_testandset_int(&chan->ch_stflags, VMBUS_CHAN_ST_REVOKED_SHIFT)) panic("channel has already been revoked"); sx_xlock(&chan->ch_orphan_lock); if (chan->ch_orphan_xact != NULL) vmbus_xact_ctx_orphan(chan->ch_orphan_xact); sx_xunlock(&chan->ch_orphan_lock); if (bootverbose) vmbus_chan_printf(chan, "chan%u revoked\n", note->chm_chanid); vmbus_chan_detach(chan); } static int vmbus_chan_release(struct vmbus_channel *chan) { struct vmbus_softc *sc = chan->ch_vmbus; struct vmbus_chanmsg_chfree *req; struct vmbus_msghc *mh; int error; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { vmbus_chan_printf(chan, "can not get msg hypercall for chfree(chan%u)\n", chan->ch_id); return (ENXIO); } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHFREE; req->chm_chanid = chan->ch_id; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); if (error) { vmbus_chan_printf(chan, "chfree(chan%u) msg hypercall exec failed: %d\n", chan->ch_id, error); } else { if (bootverbose) vmbus_chan_printf(chan, "chan%u freed\n", chan->ch_id); } return (error); } static void vmbus_prichan_detach_task(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; KASSERT(VMBUS_CHAN_ISPRIMARY(chan), ("chan%u is not primary channel", chan->ch_id)); /* Delete and detach the device associated with this channel. */ vmbus_delete_child(chan); /* Release this channel (back to vmbus). */ vmbus_chan_release(chan); /* Free this channel's resource. */ vmbus_chan_free(chan); } static void vmbus_subchan_detach_task(void *xchan, int pending __unused) { struct vmbus_channel *chan = xchan; struct vmbus_channel *pri_chan = chan->ch_prichan; KASSERT(!VMBUS_CHAN_ISPRIMARY(chan), ("chan%u is primary channel", chan->ch_id)); /* Release this channel (back to vmbus). */ vmbus_chan_release(chan); /* Unlink from its primary channel's sub-channel list. */ mtx_lock(&pri_chan->ch_subchan_lock); vmbus_chan_rem_sublist(pri_chan, chan); mtx_unlock(&pri_chan->ch_subchan_lock); /* Notify anyone that is waiting for this sub-channel to vanish. */ wakeup(pri_chan); /* Free this channel's resource. */ vmbus_chan_free(chan); } static void vmbus_prichan_attach_task(void *xchan, int pending __unused) { /* * Add device for this primary channel. */ vmbus_add_child(xchan); } static void vmbus_subchan_attach_task(void *xchan __unused, int pending __unused) { /* Nothing */ } void vmbus_chan_destroy_all(struct vmbus_softc *sc) { /* * Detach all devices and destroy the corresponding primary * channels. */ for (;;) { struct vmbus_channel *chan; mtx_lock(&sc->vmbus_chan_lock); TAILQ_FOREACH(chan, &sc->vmbus_chans, ch_link) { if (VMBUS_CHAN_ISPRIMARY(chan)) break; } if (chan == NULL) { /* No more primary channels; done. */ mtx_unlock(&sc->vmbus_chan_lock); break; } vmbus_chan_rem_list(sc, chan); mtx_unlock(&sc->vmbus_chan_lock); mtx_lock(&sc->vmbus_prichan_lock); vmbus_chan_rem_prilist(sc, chan); mtx_unlock(&sc->vmbus_prichan_lock); taskqueue_enqueue(chan->ch_mgmt_tq, &chan->ch_detach_task); } } struct vmbus_channel ** vmbus_subchan_get(struct vmbus_channel *pri_chan, int subchan_cnt) { struct vmbus_channel **ret, *chan; int i; KASSERT(subchan_cnt > 0, ("invalid sub-channel count %d", subchan_cnt)); ret = malloc(subchan_cnt * sizeof(struct vmbus_channel *), M_TEMP, M_WAITOK); mtx_lock(&pri_chan->ch_subchan_lock); while (pri_chan->ch_subchan_cnt < subchan_cnt) mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "subch", 0); i = 0; TAILQ_FOREACH(chan, &pri_chan->ch_subchans, ch_sublink) { /* TODO: refcnt chan */ ret[i] = chan; ++i; if (i == subchan_cnt) break; } KASSERT(i == subchan_cnt, ("invalid subchan count %d, should be %d", pri_chan->ch_subchan_cnt, subchan_cnt)); mtx_unlock(&pri_chan->ch_subchan_lock); return ret; } void vmbus_subchan_rel(struct vmbus_channel **subchan, int subchan_cnt __unused) { free(subchan, M_TEMP); } void vmbus_subchan_drain(struct vmbus_channel *pri_chan) { mtx_lock(&pri_chan->ch_subchan_lock); while (pri_chan->ch_subchan_cnt > 0) mtx_sleep(pri_chan, &pri_chan->ch_subchan_lock, 0, "dsubch", 0); mtx_unlock(&pri_chan->ch_subchan_lock); } void vmbus_chan_msgproc(struct vmbus_softc *sc, const struct vmbus_message *msg) { vmbus_chanmsg_proc_t msg_proc; uint32_t msg_type; msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type; KASSERT(msg_type < VMBUS_CHANMSG_TYPE_MAX, ("invalid message type %u", msg_type)); msg_proc = vmbus_chan_msgprocs[msg_type]; if (msg_proc != NULL) msg_proc(sc, msg); } void vmbus_chan_set_readbatch(struct vmbus_channel *chan, bool on) { if (!on) chan->ch_flags &= ~VMBUS_CHAN_FLAG_BATCHREAD; else chan->ch_flags |= VMBUS_CHAN_FLAG_BATCHREAD; } uint32_t vmbus_chan_id(const struct vmbus_channel *chan) { return chan->ch_id; } uint32_t vmbus_chan_subidx(const struct vmbus_channel *chan) { return chan->ch_subidx; } bool vmbus_chan_is_primary(const struct vmbus_channel *chan) { if (VMBUS_CHAN_ISPRIMARY(chan)) return true; else return false; } bool vmbus_chan_is_hvs(const struct vmbus_channel *chan) { return chan->ch_is_hvs; } bool vmbus_chan_is_hvs_conn_from_host(const struct vmbus_channel *chan) { KASSERT(vmbus_chan_is_hvs(chan) == true, ("Not a HyperV Socket channel %u", chan->ch_id)); if (chan->ch_hvs_conn_from_host != 0) return true; else return false; } struct hyperv_guid * vmbus_chan_guid_type(struct vmbus_channel *chan) { return &chan->ch_guid_type; } struct hyperv_guid * vmbus_chan_guid_inst(struct vmbus_channel *chan) { return &chan->ch_guid_inst; } int vmbus_chan_prplist_nelem(int br_size, int prpcnt_max, int dlen_max) { int elem_size; elem_size = __offsetof(struct vmbus_chanpkt_prplist, cp_range[0].gpa_page[prpcnt_max]); elem_size += dlen_max; elem_size = VMBUS_CHANPKT_TOTLEN(elem_size); return (vmbus_br_nelem(br_size, elem_size)); } bool vmbus_chan_tx_empty(const struct vmbus_channel *chan) { return (vmbus_txbr_empty(&chan->ch_txbr)); } bool vmbus_chan_rx_empty(const struct vmbus_channel *chan) { return (vmbus_rxbr_empty(&chan->ch_rxbr)); } static int vmbus_chan_printf(const struct vmbus_channel *chan, const char *fmt, ...) { va_list ap; device_t dev; int retval; if (chan->ch_dev == NULL || !device_is_alive(chan->ch_dev)) dev = chan->ch_vmbus->vmbus_dev; else dev = chan->ch_dev; retval = device_print_prettyname(dev); va_start(ap, fmt); retval += vprintf(fmt, ap); va_end(ap); return (retval); } void vmbus_chan_run_task(struct vmbus_channel *chan, struct task *task) { taskqueue_enqueue(chan->ch_tq, task); taskqueue_drain(chan->ch_tq, task); } struct taskqueue * vmbus_chan_mgmt_tq(const struct vmbus_channel *chan) { return (chan->ch_mgmt_tq); } bool vmbus_chan_is_revoked(const struct vmbus_channel *chan) { if (chan->ch_stflags & VMBUS_CHAN_ST_REVOKED) return (true); return (false); } void vmbus_chan_set_orphan(struct vmbus_channel *chan, struct vmbus_xact_ctx *xact) { sx_xlock(&chan->ch_orphan_lock); chan->ch_orphan_xact = xact; sx_xunlock(&chan->ch_orphan_lock); } void vmbus_chan_unset_orphan(struct vmbus_channel *chan) { sx_xlock(&chan->ch_orphan_lock); chan->ch_orphan_xact = NULL; sx_xunlock(&chan->ch_orphan_lock); } const void * vmbus_chan_xact_wait(const struct vmbus_channel *chan, struct vmbus_xact *xact, size_t *resp_len, bool can_sleep) { const void *ret; if (can_sleep) ret = vmbus_xact_wait(xact, resp_len); else ret = vmbus_xact_busywait(xact, resp_len); if (vmbus_chan_is_revoked(chan)) { /* * This xact probably is interrupted, and the * interruption can race the reply reception, * so we have to make sure that there are nothing * left on the RX bufring, i.e. this xact will * not be touched, once this function returns. * * Since the hypervisor will not put more data * onto the RX bufring once the channel is revoked, * the following loop will be terminated, once all * data are drained by the driver's channel * callback. */ while (!vmbus_chan_rx_empty(chan)) { if (can_sleep) pause("chxact", 1); else DELAY(1000); } } return (ret); } void vmbus_chan_poll_enable(struct vmbus_channel *chan, u_int pollhz) { struct vmbus_chan_pollarg arg; struct task poll_cfg; KASSERT(chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD, ("enable polling on non-batch chan%u", chan->ch_id)); KASSERT(pollhz >= VMBUS_CHAN_POLLHZ_MIN && pollhz <= VMBUS_CHAN_POLLHZ_MAX, ("invalid pollhz %u", pollhz)); arg.poll_chan = chan; arg.poll_hz = pollhz; TASK_INIT(&poll_cfg, 0, vmbus_chan_pollcfg_task, &arg); vmbus_chan_run_task(chan, &poll_cfg); } void vmbus_chan_poll_disable(struct vmbus_channel *chan) { struct task poll_dis; KASSERT(chan->ch_flags & VMBUS_CHAN_FLAG_BATCHREAD, ("disable polling on non-batch chan%u", chan->ch_id)); TASK_INIT(&poll_dis, 0, vmbus_chan_polldis_task, chan); vmbus_chan_run_task(chan, &poll_dis); } diff --git a/sys/dev/hyperv/vmbus/vmbus_chanvar.h b/sys/dev/hyperv/vmbus/vmbus_chanvar.h index b20b0119bc04..ba445ad2f82e 100644 --- a/sys/dev/hyperv/vmbus/vmbus_chanvar.h +++ b/sys/dev/hyperv/vmbus/vmbus_chanvar.h @@ -1,195 +1,193 @@ /*- * Copyright (c) 2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMBUS_CHANVAR_H_ #define _VMBUS_CHANVAR_H_ #include #include #include #include #include #include #include #include #include -#include #include #include struct vmbus_channel { /* * NOTE: * Fields before ch_txbr are only accessed on this channel's * target CPU. */ uint32_t ch_flags; /* VMBUS_CHAN_FLAG_ */ int ch_poll_flags; /* callout flags */ /* * RX bufring; immediately following ch_txbr. */ struct vmbus_rxbr ch_rxbr; struct taskqueue *ch_tq; struct task ch_task; struct task ch_poll_task; sbintime_t ch_poll_intvl; struct callout ch_poll_timeo; vmbus_chan_callback_t ch_cb; void *ch_cbarg; /* * TX bufring; at the beginning of ch_bufring. * * NOTE: * Put TX bufring and the following MNF/evtflag to a new * cacheline, since they will be accessed on all CPUs by * locking ch_txbr first. * * XXX * TX bufring and following MNF/evtflags do _not_ fit in * one 64B cacheline. */ struct vmbus_txbr ch_txbr __aligned(CACHE_LINE_SIZE); uint32_t ch_txflags; /* VMBUS_CHAN_TXF_ */ /* * These are based on the vmbus_chanmsg_choffer.chm_montrig. * Save it here for easy access. */ uint32_t ch_montrig_mask;/* MNF trig mask */ volatile uint32_t *ch_montrig; /* MNF trigger loc. */ /* * These are based on the vmbus_chanmsg_choffer.chm_chanid. * Save it here for easy access. */ u_long ch_evtflag_mask;/* event flag */ volatile u_long *ch_evtflag; /* event flag loc. */ /* * Rarely used fields. */ struct hyperv_mon_param *ch_monprm; - struct hyperv_dma ch_monprm_dma; uint32_t ch_id; /* channel id */ device_t ch_dev; struct vmbus_softc *ch_vmbus; int ch_cpuid; /* owner cpu */ /* * Virtual cpuid for ch_cpuid; it is used to communicate cpuid * related information w/ Hyper-V. If MSR_HV_VP_INDEX does not * exist, ch_vcpuid will always be 0 for compatibility. */ uint32_t ch_vcpuid; /* * If this is a primary channel, ch_subchan* fields * contain sub-channels belonging to this primary * channel. */ struct mtx ch_subchan_lock; TAILQ_HEAD(, vmbus_channel) ch_subchans; int ch_subchan_cnt; /* If this is a sub-channel */ TAILQ_ENTRY(vmbus_channel) ch_sublink; /* sub-channel link */ struct vmbus_channel *ch_prichan; /* owner primary chan */ void *ch_bufring; /* TX+RX bufrings */ - struct hyperv_dma ch_bufring_dma; + size_t ch_bufring_size; uint32_t ch_bufring_gpadl; struct task ch_attach_task; /* run in ch_mgmt_tq */ struct task ch_detach_task; /* run in ch_mgmt_tq */ struct taskqueue *ch_mgmt_tq; /* If this is a primary channel */ TAILQ_ENTRY(vmbus_channel) ch_prilink; /* primary chan link */ TAILQ_ENTRY(vmbus_channel) ch_link; /* channel link */ uint32_t ch_subidx; /* subchan index */ volatile uint32_t ch_stflags; /* atomic-op */ /* VMBUS_CHAN_ST_ */ struct hyperv_guid ch_guid_type; struct hyperv_guid ch_guid_inst; struct sx ch_orphan_lock; struct vmbus_xact_ctx *ch_orphan_xact; int ch_refs; /* * These are for HyperV socket channel only */ bool ch_is_hvs; uint8_t ch_hvs_conn_from_host; struct sysctl_ctx_list ch_sysctl_ctx; } __aligned(CACHE_LINE_SIZE); #define VMBUS_CHAN_ISPRIMARY(chan) ((chan)->ch_subidx == 0) /* * If this flag is set, this channel's interrupt will be masked in ISR, * and the RX bufring will be drained before this channel's interrupt is * unmasked. * * This flag is turned on by default. Drivers can turn it off according * to their own requirement. */ #define VMBUS_CHAN_FLAG_BATCHREAD 0x0002 #define VMBUS_CHAN_TXF_HASMNF 0x0001 #define VMBUS_CHAN_ST_OPENED_SHIFT 0 #define VMBUS_CHAN_ST_ONPRIL_SHIFT 1 #define VMBUS_CHAN_ST_ONSUBL_SHIFT 2 #define VMBUS_CHAN_ST_ONLIST_SHIFT 3 #define VMBUS_CHAN_ST_REVOKED_SHIFT 4 /* sticky */ #define VMBUS_CHAN_ST_OPENED (1 << VMBUS_CHAN_ST_OPENED_SHIFT) #define VMBUS_CHAN_ST_ONPRIL (1 << VMBUS_CHAN_ST_ONPRIL_SHIFT) #define VMBUS_CHAN_ST_ONSUBL (1 << VMBUS_CHAN_ST_ONSUBL_SHIFT) #define VMBUS_CHAN_ST_ONLIST (1 << VMBUS_CHAN_ST_ONLIST_SHIFT) #define VMBUS_CHAN_ST_REVOKED (1 << VMBUS_CHAN_ST_REVOKED_SHIFT) struct vmbus_softc; struct vmbus_message; void vmbus_event_proc(struct vmbus_softc *, int); void vmbus_event_proc_compat(struct vmbus_softc *, int); void vmbus_chan_msgproc(struct vmbus_softc *, const struct vmbus_message *); void vmbus_chan_destroy_all(struct vmbus_softc *); #endif /* !_VMBUS_CHANVAR_H_ */ diff --git a/sys/dev/hyperv/vmbus/vmbus_var.h b/sys/dev/hyperv/vmbus/vmbus_var.h index c89d26d7aaf5..c50eeaca956d 100644 --- a/sys/dev/hyperv/vmbus/vmbus_var.h +++ b/sys/dev/hyperv/vmbus/vmbus_var.h @@ -1,186 +1,180 @@ /*- * Copyright (c) 2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _VMBUS_VAR_H_ #define _VMBUS_VAR_H_ #include #include #include #include -#include #include #include /* * NOTE: DO NOT CHANGE THIS. */ #define VMBUS_SINT_MESSAGE 2 /* * NOTE: * - DO NOT set it to the same value as VMBUS_SINT_MESSAGE. * - DO NOT set it to 0. */ #define VMBUS_SINT_TIMER 4 /* * NOTE: DO NOT CHANGE THESE */ #define VMBUS_CONNID_MESSAGE 1 #define VMBUS_CONNID_EVENT 2 struct vmbus_message; struct vmbus_softc; typedef void (*vmbus_chanmsg_proc_t)(struct vmbus_softc *, const struct vmbus_message *); #define VMBUS_CHANMSG_PROC(name, func) \ [VMBUS_CHANMSG_TYPE_##name] = func #define VMBUS_CHANMSG_PROC_WAKEUP(name) \ VMBUS_CHANMSG_PROC(name, vmbus_msghc_wakeup) struct vmbus_pcpu_data { u_long *intr_cnt; /* Hyper-V interrupt counter */ struct vmbus_message *message; /* shared messages */ uint32_t vcpuid; /* virtual cpuid */ int event_flags_cnt;/* # of event flags */ struct vmbus_evtflags *event_flags; /* event flags from host */ /* Rarely used fields */ - struct hyperv_dma message_dma; /* busdma glue */ - struct hyperv_dma event_flags_dma;/* busdma glue */ struct taskqueue *event_tq; /* event taskq */ struct taskqueue *message_tq; /* message taskq */ struct task message_task; /* message task */ } __aligned(CACHE_LINE_SIZE); struct vmbus_softc { void (*vmbus_event_proc)(struct vmbus_softc *, int); u_long *vmbus_tx_evtflags; /* event flags to host */ struct vmbus_mnf *vmbus_mnf2; /* monitored by host */ u_long *vmbus_rx_evtflags; /* compat evtflgs from host */ struct vmbus_channel *volatile *vmbus_chmap; struct vmbus_xact_ctx *vmbus_xc; struct vmbus_pcpu_data vmbus_pcpu[MAXCPU]; /* * Rarely used fields */ device_t vmbus_dev; int vmbus_idtvec; uint32_t vmbus_flags; /* see VMBUS_FLAG_ */ uint32_t vmbus_version; uint32_t vmbus_gpadl; /* Shared memory for vmbus_{rx,tx}_evtflags */ void *vmbus_evtflags; - struct hyperv_dma vmbus_evtflags_dma; void *vmbus_mnf1; /* monitored by VM, unused */ - struct hyperv_dma vmbus_mnf1_dma; - struct hyperv_dma vmbus_mnf2_dma; bool vmbus_scandone; struct task vmbus_scandone_task; struct taskqueue *vmbus_devtq; /* for dev attach/detach */ struct taskqueue *vmbus_subchtq; /* for sub-chan attach/detach */ /* Primary channels */ struct mtx vmbus_prichan_lock; TAILQ_HEAD(, vmbus_channel) vmbus_prichans; /* Complete channel list */ struct mtx vmbus_chan_lock; TAILQ_HEAD(, vmbus_channel) vmbus_chans; struct intr_config_hook vmbus_intrhook; #ifdef NEW_PCIB /* The list of usable MMIO ranges for PCIe pass-through */ struct pcib_host_resources vmbus_mmio_res; #endif #if defined(__aarch64__) struct resource *ires; void *icookie; int vector; #endif }; #define VMBUS_FLAG_ATTACHED 0x0001 /* vmbus was attached */ #define VMBUS_FLAG_SYNIC 0x0002 /* SynIC was setup */ #define VMBUS_PCPU_GET(sc, field, cpu) (sc)->vmbus_pcpu[(cpu)].field #define VMBUS_PCPU_PTR(sc, field, cpu) &(sc)->vmbus_pcpu[(cpu)].field struct vmbus_channel; struct trapframe; struct vmbus_message; struct vmbus_msghc; void vmbus_handle_intr(struct trapframe *); int vmbus_add_child(struct vmbus_channel *); int vmbus_delete_child(struct vmbus_channel *); #if !defined(__aarch64__) void vmbus_et_intr(struct trapframe *); #endif uint32_t vmbus_gpadl_alloc(struct vmbus_softc *); struct vmbus_msghc * vmbus_msghc_get(struct vmbus_softc *, size_t); void vmbus_msghc_put(struct vmbus_softc *, struct vmbus_msghc *); void *vmbus_msghc_dataptr(struct vmbus_msghc *); int vmbus_msghc_exec_noresult(struct vmbus_msghc *); int vmbus_msghc_exec(struct vmbus_softc *, struct vmbus_msghc *); void vmbus_msghc_exec_cancel(struct vmbus_softc *, struct vmbus_msghc *); const struct vmbus_message * vmbus_msghc_wait_result(struct vmbus_softc *, struct vmbus_msghc *); const struct vmbus_message * vmbus_msghc_poll_result(struct vmbus_softc *, struct vmbus_msghc *); void vmbus_msghc_wakeup(struct vmbus_softc *, const struct vmbus_message *); void vmbus_msghc_reset(struct vmbus_msghc *, size_t); void vmbus_handle_timer_intr1(struct vmbus_message *msg_base, struct trapframe *frame); void vmbus_synic_setup1(void *xsc); void vmbus_synic_teardown1(void); int vmbus_setup_intr1(struct vmbus_softc *sc); void vmbus_intr_teardown1(struct vmbus_softc *sc); #endif /* !_VMBUS_VAR_H_ */ diff --git a/sys/dev/hyperv/vmbus/vmbus_xact.c b/sys/dev/hyperv/vmbus/vmbus_xact.c index 90bdba7e1058..df6d8b45f9b1 100644 --- a/sys/dev/hyperv/vmbus/vmbus_xact.c +++ b/sys/dev/hyperv/vmbus/vmbus_xact.c @@ -1,442 +1,444 @@ /*- * Copyright (c) 2016 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include -#include +#include +#include +#include + #include struct vmbus_xact { struct vmbus_xact_ctx *x_ctx; void *x_priv; void *x_req; - struct hyperv_dma x_req_dma; const void *x_resp; size_t x_resp_len; void *x_resp0; }; struct vmbus_xact_ctx { size_t xc_req_size; size_t xc_resp_size; size_t xc_priv_size; struct mtx xc_lock; /* * Protected by xc_lock. */ uint32_t xc_flags; /* VMBUS_XACT_CTXF_ */ struct vmbus_xact *xc_free; struct vmbus_xact *xc_active; struct vmbus_xact *xc_orphan; }; #define VMBUS_XACT_CTXF_DESTROY 0x0001 static struct vmbus_xact *vmbus_xact_alloc(struct vmbus_xact_ctx *, bus_dma_tag_t); static void vmbus_xact_free(struct vmbus_xact *); static struct vmbus_xact *vmbus_xact_get1(struct vmbus_xact_ctx *, uint32_t); static const void *vmbus_xact_wait1(struct vmbus_xact *, size_t *, bool); static const void *vmbus_xact_return(struct vmbus_xact *, size_t *); static void vmbus_xact_save_resp(struct vmbus_xact *, const void *, size_t); static void vmbus_xact_ctx_free(struct vmbus_xact_ctx *); static struct vmbus_xact * vmbus_xact_alloc(struct vmbus_xact_ctx *ctx, bus_dma_tag_t parent_dtag) { struct vmbus_xact *xact; xact = malloc(sizeof(*xact), M_DEVBUF, M_WAITOK | M_ZERO); xact->x_ctx = ctx; /* XXX assume that page aligned is enough */ - xact->x_req = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, - ctx->xc_req_size, &xact->x_req_dma, BUS_DMA_WAITOK); + xact->x_req = contigmalloc(ctx->xc_req_size, M_DEVBUF, + M_WAITOK | M_ZERO, 0ul, ~0ul, PAGE_SIZE, 0); if (xact->x_req == NULL) { free(xact, M_DEVBUF); return (NULL); } if (ctx->xc_priv_size != 0) xact->x_priv = malloc(ctx->xc_priv_size, M_DEVBUF, M_WAITOK); xact->x_resp0 = malloc(ctx->xc_resp_size, M_DEVBUF, M_WAITOK); return (xact); } static void vmbus_xact_free(struct vmbus_xact *xact) { - hyperv_dmamem_free(&xact->x_req_dma, xact->x_req); + contigfree(xact->x_req, xact->x_ctx->xc_req_size, M_DEVBUF); free(xact->x_resp0, M_DEVBUF); if (xact->x_priv != NULL) free(xact->x_priv, M_DEVBUF); free(xact, M_DEVBUF); } static struct vmbus_xact * vmbus_xact_get1(struct vmbus_xact_ctx *ctx, uint32_t dtor_flag) { struct vmbus_xact *xact; mtx_lock(&ctx->xc_lock); while ((ctx->xc_flags & dtor_flag) == 0 && ctx->xc_free == NULL) mtx_sleep(&ctx->xc_free, &ctx->xc_lock, 0, "gxact", 0); if (ctx->xc_flags & dtor_flag) { /* Being destroyed */ xact = NULL; } else { xact = ctx->xc_free; KASSERT(xact != NULL, ("no free xact")); KASSERT(xact->x_resp == NULL, ("xact has pending response")); ctx->xc_free = NULL; } mtx_unlock(&ctx->xc_lock); return (xact); } struct vmbus_xact_ctx * vmbus_xact_ctx_create(bus_dma_tag_t dtag, size_t req_size, size_t resp_size, size_t priv_size) { struct vmbus_xact_ctx *ctx; KASSERT(req_size > 0, ("request size is 0")); KASSERT(resp_size > 0, ("response size is 0")); ctx = malloc(sizeof(*ctx), M_DEVBUF, M_WAITOK | M_ZERO); ctx->xc_req_size = req_size; ctx->xc_resp_size = resp_size; ctx->xc_priv_size = priv_size; ctx->xc_free = vmbus_xact_alloc(ctx, dtag); if (ctx->xc_free == NULL) { free(ctx, M_DEVBUF); return (NULL); } mtx_init(&ctx->xc_lock, "vmbus xact", NULL, MTX_DEF); return (ctx); } bool vmbus_xact_ctx_orphan(struct vmbus_xact_ctx *ctx) { mtx_lock(&ctx->xc_lock); if (ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) { mtx_unlock(&ctx->xc_lock); return (false); } ctx->xc_flags |= VMBUS_XACT_CTXF_DESTROY; mtx_unlock(&ctx->xc_lock); wakeup(&ctx->xc_free); wakeup(&ctx->xc_active); ctx->xc_orphan = vmbus_xact_get1(ctx, 0); if (ctx->xc_orphan == NULL) panic("can't get xact"); return (true); } static void vmbus_xact_ctx_free(struct vmbus_xact_ctx *ctx) { KASSERT(ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY, ("xact ctx was not orphaned")); KASSERT(ctx->xc_orphan != NULL, ("no orphaned xact")); vmbus_xact_free(ctx->xc_orphan); mtx_destroy(&ctx->xc_lock); free(ctx, M_DEVBUF); } void vmbus_xact_ctx_destroy(struct vmbus_xact_ctx *ctx) { vmbus_xact_ctx_orphan(ctx); vmbus_xact_ctx_free(ctx); } struct vmbus_xact * vmbus_xact_get(struct vmbus_xact_ctx *ctx, size_t req_len) { struct vmbus_xact *xact; if (req_len > ctx->xc_req_size) panic("invalid request size %zu", req_len); xact = vmbus_xact_get1(ctx, VMBUS_XACT_CTXF_DESTROY); if (xact == NULL) return (NULL); memset(xact->x_req, 0, req_len); return (xact); } void vmbus_xact_put(struct vmbus_xact *xact) { struct vmbus_xact_ctx *ctx = xact->x_ctx; KASSERT(ctx->xc_active == NULL, ("pending active xact")); xact->x_resp = NULL; mtx_lock(&ctx->xc_lock); KASSERT(ctx->xc_free == NULL, ("has free xact")); ctx->xc_free = xact; mtx_unlock(&ctx->xc_lock); wakeup(&ctx->xc_free); } void * vmbus_xact_req_data(const struct vmbus_xact *xact) { return (xact->x_req); } bus_addr_t vmbus_xact_req_paddr(const struct vmbus_xact *xact) { - return (xact->x_req_dma.hv_paddr); + return (pmap_kextract((vm_offset_t)xact->x_req)); } void * vmbus_xact_priv(const struct vmbus_xact *xact, size_t priv_len) { if (priv_len > xact->x_ctx->xc_priv_size) panic("invalid priv size %zu", priv_len); return (xact->x_priv); } void vmbus_xact_activate(struct vmbus_xact *xact) { struct vmbus_xact_ctx *ctx = xact->x_ctx; KASSERT(xact->x_resp == NULL, ("xact has pending response")); mtx_lock(&ctx->xc_lock); KASSERT(ctx->xc_active == NULL, ("pending active xact")); ctx->xc_active = xact; mtx_unlock(&ctx->xc_lock); } void vmbus_xact_deactivate(struct vmbus_xact *xact) { struct vmbus_xact_ctx *ctx = xact->x_ctx; mtx_lock(&ctx->xc_lock); KASSERT(ctx->xc_active == xact, ("xact mismatch")); ctx->xc_active = NULL; mtx_unlock(&ctx->xc_lock); } static const void * vmbus_xact_return(struct vmbus_xact *xact, size_t *resp_len) { struct vmbus_xact_ctx *ctx = xact->x_ctx; const void *resp; mtx_assert(&ctx->xc_lock, MA_OWNED); KASSERT(ctx->xc_active == xact, ("xact trashed")); if ((ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) && xact->x_resp == NULL) { uint8_t b = 0; /* * Orphaned and no response was received yet; fake up * an one byte response. */ printf("vmbus: xact ctx was orphaned w/ pending xact\n"); vmbus_xact_save_resp(ctx->xc_active, &b, sizeof(b)); } KASSERT(xact->x_resp != NULL, ("no response")); ctx->xc_active = NULL; resp = xact->x_resp; *resp_len = xact->x_resp_len; return (resp); } static const void * vmbus_xact_wait1(struct vmbus_xact *xact, size_t *resp_len, bool can_sleep) { struct vmbus_xact_ctx *ctx = xact->x_ctx; const void *resp; mtx_lock(&ctx->xc_lock); KASSERT(ctx->xc_active == xact, ("xact mismatch")); while (xact->x_resp == NULL && (ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) == 0) { if (can_sleep) { mtx_sleep(&ctx->xc_active, &ctx->xc_lock, 0, "wxact", 0); } else { mtx_unlock(&ctx->xc_lock); DELAY(1000); mtx_lock(&ctx->xc_lock); } } resp = vmbus_xact_return(xact, resp_len); mtx_unlock(&ctx->xc_lock); return (resp); } const void * vmbus_xact_wait(struct vmbus_xact *xact, size_t *resp_len) { return (vmbus_xact_wait1(xact, resp_len, true /* can sleep */)); } const void * vmbus_xact_busywait(struct vmbus_xact *xact, size_t *resp_len) { return (vmbus_xact_wait1(xact, resp_len, false /* can't sleep */)); } const void * vmbus_xact_poll(struct vmbus_xact *xact, size_t *resp_len) { struct vmbus_xact_ctx *ctx = xact->x_ctx; const void *resp; mtx_lock(&ctx->xc_lock); KASSERT(ctx->xc_active == xact, ("xact mismatch")); if (xact->x_resp == NULL && (ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY) == 0) { mtx_unlock(&ctx->xc_lock); *resp_len = 0; return (NULL); } resp = vmbus_xact_return(xact, resp_len); mtx_unlock(&ctx->xc_lock); return (resp); } static void vmbus_xact_save_resp(struct vmbus_xact *xact, const void *data, size_t dlen) { struct vmbus_xact_ctx *ctx = xact->x_ctx; size_t cplen = dlen; mtx_assert(&ctx->xc_lock, MA_OWNED); if (cplen > ctx->xc_resp_size) { printf("vmbus: xact response truncated %zu -> %zu\n", cplen, ctx->xc_resp_size); cplen = ctx->xc_resp_size; } KASSERT(ctx->xc_active == xact, ("xact mismatch")); memcpy(xact->x_resp0, data, cplen); xact->x_resp_len = cplen; xact->x_resp = xact->x_resp0; } void vmbus_xact_wakeup(struct vmbus_xact *xact, const void *data, size_t dlen) { struct vmbus_xact_ctx *ctx = xact->x_ctx; int do_wakeup = 0; mtx_lock(&ctx->xc_lock); /* * NOTE: * xc_active could be NULL, if the ctx has been orphaned. */ if (ctx->xc_active != NULL) { vmbus_xact_save_resp(xact, data, dlen); do_wakeup = 1; } else { KASSERT(ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY, ("no active xact pending")); printf("vmbus: drop xact response\n"); } mtx_unlock(&ctx->xc_lock); if (do_wakeup) wakeup(&ctx->xc_active); } void vmbus_xact_ctx_wakeup(struct vmbus_xact_ctx *ctx, const void *data, size_t dlen) { int do_wakeup = 0; mtx_lock(&ctx->xc_lock); /* * NOTE: * xc_active could be NULL, if the ctx has been orphaned. */ if (ctx->xc_active != NULL) { vmbus_xact_save_resp(ctx->xc_active, data, dlen); do_wakeup = 1; } else { KASSERT(ctx->xc_flags & VMBUS_XACT_CTXF_DESTROY, ("no active xact pending")); printf("vmbus: drop xact response\n"); } mtx_unlock(&ctx->xc_lock); if (do_wakeup) wakeup(&ctx->xc_active); }