Index: head/sys/dev/hyperv/include/hyperv.h =================================================================== --- head/sys/dev/hyperv/include/hyperv.h (revision 322487) +++ head/sys/dev/hyperv/include/hyperv.h (revision 322488) @@ -1,96 +1,96 @@ /*- - * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _HYPERV_H_ #define _HYPERV_H_ #ifdef _KERNEL #include #include #define MSR_HV_TIME_REF_COUNT 0x40000020 #define CPUID_HV_MSR_TIME_REFCNT 0x0002 /* MSR_HV_TIME_REF_COUNT */ #define CPUID_HV_MSR_SYNIC 0x0004 /* MSRs for SynIC */ #define CPUID_HV_MSR_SYNTIMER 0x0008 /* MSRs for SynTimer */ #define CPUID_HV_MSR_APIC 0x0010 /* MSR_HV_{EOI,ICR,TPR} */ #define CPUID_HV_MSR_HYPERCALL 0x0020 /* MSR_HV_GUEST_OS_ID * MSR_HV_HYPERCALL */ #define CPUID_HV_MSR_VP_INDEX 0x0040 /* MSR_HV_VP_INDEX */ #define CPUID_HV_MSR_REFERENCE_TSC 0x0200 /* MSR_HV_REFERENCE_TSC */ #define CPUID_HV_MSR_GUEST_IDLE 0x0400 /* MSR_HV_GUEST_IDLE */ #ifndef NANOSEC #define NANOSEC 1000000000ULL #endif #define HYPERV_TIMER_NS_FACTOR 100ULL #define HYPERV_TIMER_FREQ (NANOSEC / HYPERV_TIMER_NS_FACTOR) #endif /* _KERNEL */ #define HYPERV_REFTSC_DEVNAME "hv_tsc" /* * Hyper-V Reference TSC */ struct hyperv_reftsc { volatile uint32_t tsc_seq; volatile uint32_t tsc_rsvd1; volatile uint64_t tsc_scale; volatile int64_t tsc_ofs; } __packed __aligned(PAGE_SIZE); #ifdef CTASSERT CTASSERT(sizeof(struct hyperv_reftsc) == PAGE_SIZE); #endif #ifdef _KERNEL struct hyperv_guid { uint8_t hv_guid[16]; } __packed; #define HYPERV_GUID_STRLEN 40 typedef uint64_t (*hyperv_tc64_t)(void); int hyperv_guid2str(const struct hyperv_guid *, char *, size_t); /* * hyperv_tc64 could be NULL, if there were no suitable Hyper-V * specific timecounter. */ extern hyperv_tc64_t hyperv_tc64; extern u_int hyperv_features; /* CPUID_HV_MSR_ */ #endif /* _KERNEL */ #endif /* _HYPERV_H_ */ Index: head/sys/dev/hyperv/netvsc/hn_nvs.c =================================================================== --- head/sys/dev/hyperv/netvsc/hn_nvs.c (revision 322487) +++ head/sys/dev/hyperv/netvsc/hn_nvs.c (revision 322488) @@ -1,740 +1,740 @@ /*- - * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Network Virtualization Service. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static int hn_nvs_conn_chim(struct hn_softc *); static int hn_nvs_conn_rxbuf(struct hn_softc *); static void hn_nvs_disconn_chim(struct hn_softc *); static void hn_nvs_disconn_rxbuf(struct hn_softc *); static int hn_nvs_conf_ndis(struct hn_softc *, int); static int hn_nvs_init_ndis(struct hn_softc *); static int hn_nvs_doinit(struct hn_softc *, uint32_t); static int hn_nvs_init(struct hn_softc *); static const void *hn_nvs_xact_execute(struct hn_softc *, struct vmbus_xact *, void *, int, size_t *, uint32_t); static void hn_nvs_sent_none(struct hn_nvs_sendctx *, struct hn_softc *, struct vmbus_channel *, const void *, int); struct hn_nvs_sendctx hn_nvs_sendctx_none = HN_NVS_SENDCTX_INITIALIZER(hn_nvs_sent_none, NULL); static const uint32_t hn_nvs_version[] = { HN_NVS_VERSION_5, HN_NVS_VERSION_4, HN_NVS_VERSION_2, HN_NVS_VERSION_1 }; static const void * hn_nvs_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, void *req, int reqlen, size_t *resplen0, uint32_t type) { struct hn_nvs_sendctx sndc; size_t resplen, min_resplen = *resplen0; const struct hn_nvs_hdr *hdr; int error; KASSERT(min_resplen >= sizeof(*hdr), ("invalid minimum response len %zu", min_resplen)); /* * Execute the xact setup by the caller. */ hn_nvs_sendctx_init(&sndc, hn_nvs_sent_xact, xact); vmbus_xact_activate(xact); error = hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_RC, req, reqlen, &sndc); if (error) { vmbus_xact_deactivate(xact); return (NULL); } hdr = vmbus_chan_xact_wait(sc->hn_prichan, xact, &resplen, HN_CAN_SLEEP(sc)); /* * Check this NVS response message. */ if (resplen < min_resplen) { if_printf(sc->hn_ifp, "invalid NVS resp len %zu\n", resplen); return (NULL); } if (hdr->nvs_type != type) { if_printf(sc->hn_ifp, "unexpected NVS resp 0x%08x, " "expect 0x%08x\n", hdr->nvs_type, type); return (NULL); } /* All pass! */ *resplen0 = resplen; return (hdr); } static __inline int hn_nvs_req_send(struct hn_softc *sc, void *req, int reqlen) { return (hn_nvs_send(sc->hn_prichan, VMBUS_CHANPKT_FLAG_NONE, req, reqlen, &hn_nvs_sendctx_none)); } static int hn_nvs_conn_rxbuf(struct hn_softc *sc) { struct vmbus_xact *xact = NULL; struct hn_nvs_rxbuf_conn *conn; const struct hn_nvs_rxbuf_connresp *resp; size_t resp_len; uint32_t status; int error, rxbuf_size; /* * Limit RXBUF size for old NVS. */ if (sc->hn_nvs_ver <= HN_NVS_VERSION_2) rxbuf_size = HN_RXBUF_SIZE_COMPAT; else rxbuf_size = HN_RXBUF_SIZE; /* * Connect the RXBUF GPADL to the primary channel. * * NOTE: * Only primary channel has RXBUF connected to it. Sub-channels * just share this RXBUF. */ error = vmbus_chan_gpadl_connect(sc->hn_prichan, sc->hn_rxbuf_dma.hv_paddr, rxbuf_size, &sc->hn_rxbuf_gpadl); if (error) { if_printf(sc->hn_ifp, "rxbuf gpadl conn failed: %d\n", error); goto cleanup; } /* * Connect RXBUF to NVS. */ xact = vmbus_xact_get(sc->hn_xact, sizeof(*conn)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for nvs rxbuf conn\n"); error = ENXIO; goto cleanup; } conn = vmbus_xact_req_data(xact); conn->nvs_type = HN_NVS_TYPE_RXBUF_CONN; conn->nvs_gpadl = sc->hn_rxbuf_gpadl; conn->nvs_sig = HN_NVS_RXBUF_SIG; resp_len = sizeof(*resp); resp = hn_nvs_xact_execute(sc, xact, conn, sizeof(*conn), &resp_len, HN_NVS_TYPE_RXBUF_CONNRESP); if (resp == NULL) { if_printf(sc->hn_ifp, "exec nvs rxbuf conn failed\n"); error = EIO; goto cleanup; } status = resp->nvs_status; vmbus_xact_put(xact); xact = NULL; if (status != HN_NVS_STATUS_OK) { if_printf(sc->hn_ifp, "nvs rxbuf conn failed: %x\n", status); error = EIO; goto cleanup; } sc->hn_flags |= HN_FLAG_RXBUF_CONNECTED; return (0); cleanup: if (xact != NULL) vmbus_xact_put(xact); hn_nvs_disconn_rxbuf(sc); return (error); } static int hn_nvs_conn_chim(struct hn_softc *sc) { struct vmbus_xact *xact = NULL; struct hn_nvs_chim_conn *chim; const struct hn_nvs_chim_connresp *resp; size_t resp_len; uint32_t status, sectsz; int error; /* * Connect chimney sending buffer GPADL to the primary channel. * * NOTE: * Only primary channel has chimney sending buffer connected to it. * Sub-channels just share this chimney sending buffer. */ error = vmbus_chan_gpadl_connect(sc->hn_prichan, sc->hn_chim_dma.hv_paddr, HN_CHIM_SIZE, &sc->hn_chim_gpadl); if (error) { if_printf(sc->hn_ifp, "chim gpadl conn failed: %d\n", error); goto cleanup; } /* * Connect chimney sending buffer to NVS */ xact = vmbus_xact_get(sc->hn_xact, sizeof(*chim)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for nvs chim conn\n"); error = ENXIO; goto cleanup; } chim = vmbus_xact_req_data(xact); chim->nvs_type = HN_NVS_TYPE_CHIM_CONN; chim->nvs_gpadl = sc->hn_chim_gpadl; chim->nvs_sig = HN_NVS_CHIM_SIG; resp_len = sizeof(*resp); resp = hn_nvs_xact_execute(sc, xact, chim, sizeof(*chim), &resp_len, HN_NVS_TYPE_CHIM_CONNRESP); if (resp == NULL) { if_printf(sc->hn_ifp, "exec nvs chim conn failed\n"); error = EIO; goto cleanup; } status = resp->nvs_status; sectsz = resp->nvs_sectsz; vmbus_xact_put(xact); xact = NULL; if (status != HN_NVS_STATUS_OK) { if_printf(sc->hn_ifp, "nvs chim conn failed: %x\n", status); error = EIO; goto cleanup; } if (sectsz == 0 || sectsz % sizeof(uint32_t) != 0) { /* * Can't use chimney sending buffer; done! */ if (sectsz == 0) { if_printf(sc->hn_ifp, "zero chimney sending buffer " "section size\n"); } else { if_printf(sc->hn_ifp, "misaligned chimney sending " "buffers, section size: %u\n", sectsz); } sc->hn_chim_szmax = 0; sc->hn_chim_cnt = 0; sc->hn_flags |= HN_FLAG_CHIM_CONNECTED; return (0); } sc->hn_chim_szmax = sectsz; sc->hn_chim_cnt = HN_CHIM_SIZE / sc->hn_chim_szmax; if (HN_CHIM_SIZE % sc->hn_chim_szmax != 0) { if_printf(sc->hn_ifp, "chimney sending sections are " "not properly aligned\n"); } if (sc->hn_chim_cnt % LONG_BIT != 0) { if_printf(sc->hn_ifp, "discard %d chimney sending sections\n", sc->hn_chim_cnt % LONG_BIT); } sc->hn_chim_bmap_cnt = sc->hn_chim_cnt / LONG_BIT; sc->hn_chim_bmap = malloc(sc->hn_chim_bmap_cnt * sizeof(u_long), M_DEVBUF, M_WAITOK | M_ZERO); /* Done! */ sc->hn_flags |= HN_FLAG_CHIM_CONNECTED; if (bootverbose) { if_printf(sc->hn_ifp, "chimney sending buffer %d/%d\n", sc->hn_chim_szmax, sc->hn_chim_cnt); } return (0); cleanup: if (xact != NULL) vmbus_xact_put(xact); hn_nvs_disconn_chim(sc); return (error); } static void hn_nvs_disconn_rxbuf(struct hn_softc *sc) { int error; if (sc->hn_flags & HN_FLAG_RXBUF_CONNECTED) { struct hn_nvs_rxbuf_disconn disconn; /* * Disconnect RXBUF from NVS. */ memset(&disconn, 0, sizeof(disconn)); disconn.nvs_type = HN_NVS_TYPE_RXBUF_DISCONN; disconn.nvs_sig = HN_NVS_RXBUF_SIG; /* NOTE: No response. */ error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); if (error) { if_printf(sc->hn_ifp, "send nvs rxbuf disconn failed: %d\n", error); /* * Fine for a revoked channel, since the hypervisor * does not drain TX bufring for a revoked channel. */ if (!vmbus_chan_is_revoked(sc->hn_prichan)) sc->hn_flags |= HN_FLAG_RXBUF_REF; } sc->hn_flags &= ~HN_FLAG_RXBUF_CONNECTED; /* * Wait for the hypervisor to receive this NVS request. * * NOTE: * The TX bufring will not be drained by the hypervisor, * if the primary channel is revoked. */ while (!vmbus_chan_tx_empty(sc->hn_prichan) && !vmbus_chan_is_revoked(sc->hn_prichan)) pause("waittx", 1); /* * Linger long enough for NVS to disconnect RXBUF. */ pause("lingtx", (200 * hz) / 1000); } if (sc->hn_rxbuf_gpadl != 0) { /* * Disconnect RXBUF from primary channel. */ error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, sc->hn_rxbuf_gpadl); if (error) { if_printf(sc->hn_ifp, "rxbuf gpadl disconn failed: %d\n", error); sc->hn_flags |= HN_FLAG_RXBUF_REF; } sc->hn_rxbuf_gpadl = 0; } } static void hn_nvs_disconn_chim(struct hn_softc *sc) { int error; if (sc->hn_flags & HN_FLAG_CHIM_CONNECTED) { struct hn_nvs_chim_disconn disconn; /* * Disconnect chimney sending buffer from NVS. */ memset(&disconn, 0, sizeof(disconn)); disconn.nvs_type = HN_NVS_TYPE_CHIM_DISCONN; disconn.nvs_sig = HN_NVS_CHIM_SIG; /* NOTE: No response. */ error = hn_nvs_req_send(sc, &disconn, sizeof(disconn)); if (error) { if_printf(sc->hn_ifp, "send nvs chim disconn failed: %d\n", error); /* * Fine for a revoked channel, since the hypervisor * does not drain TX bufring for a revoked channel. */ if (!vmbus_chan_is_revoked(sc->hn_prichan)) sc->hn_flags |= HN_FLAG_CHIM_REF; } sc->hn_flags &= ~HN_FLAG_CHIM_CONNECTED; /* * Wait for the hypervisor to receive this NVS request. * * NOTE: * The TX bufring will not be drained by the hypervisor, * if the primary channel is revoked. */ while (!vmbus_chan_tx_empty(sc->hn_prichan) && !vmbus_chan_is_revoked(sc->hn_prichan)) pause("waittx", 1); /* * Linger long enough for NVS to disconnect chimney * sending buffer. */ pause("lingtx", (200 * hz) / 1000); } if (sc->hn_chim_gpadl != 0) { /* * Disconnect chimney sending buffer from primary channel. */ error = vmbus_chan_gpadl_disconnect(sc->hn_prichan, sc->hn_chim_gpadl); if (error) { if_printf(sc->hn_ifp, "chim gpadl disconn failed: %d\n", error); sc->hn_flags |= HN_FLAG_CHIM_REF; } sc->hn_chim_gpadl = 0; } if (sc->hn_chim_bmap != NULL) { free(sc->hn_chim_bmap, M_DEVBUF); sc->hn_chim_bmap = NULL; sc->hn_chim_bmap_cnt = 0; } } static int hn_nvs_doinit(struct hn_softc *sc, uint32_t nvs_ver) { struct vmbus_xact *xact; struct hn_nvs_init *init; const struct hn_nvs_init_resp *resp; size_t resp_len; uint32_t status; xact = vmbus_xact_get(sc->hn_xact, sizeof(*init)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for nvs init\n"); return (ENXIO); } init = vmbus_xact_req_data(xact); init->nvs_type = HN_NVS_TYPE_INIT; init->nvs_ver_min = nvs_ver; init->nvs_ver_max = nvs_ver; resp_len = sizeof(*resp); resp = hn_nvs_xact_execute(sc, xact, init, sizeof(*init), &resp_len, HN_NVS_TYPE_INIT_RESP); if (resp == NULL) { if_printf(sc->hn_ifp, "exec init failed\n"); vmbus_xact_put(xact); return (EIO); } status = resp->nvs_status; vmbus_xact_put(xact); if (status != HN_NVS_STATUS_OK) { if (bootverbose) { /* * Caller may try another NVS version, and will log * error if there are no more NVS versions to try, * so don't bark out loud here. */ if_printf(sc->hn_ifp, "nvs init failed for ver 0x%x\n", nvs_ver); } return (EINVAL); } return (0); } /* * Configure MTU and enable VLAN. */ static int hn_nvs_conf_ndis(struct hn_softc *sc, int mtu) { struct hn_nvs_ndis_conf conf; int error; memset(&conf, 0, sizeof(conf)); conf.nvs_type = HN_NVS_TYPE_NDIS_CONF; conf.nvs_mtu = mtu; conf.nvs_caps = HN_NVS_NDIS_CONF_VLAN; if (sc->hn_nvs_ver >= HN_NVS_VERSION_5) conf.nvs_caps |= HN_NVS_NDIS_CONF_SRIOV; /* NOTE: No response. */ error = hn_nvs_req_send(sc, &conf, sizeof(conf)); if (error) { if_printf(sc->hn_ifp, "send nvs ndis conf failed: %d\n", error); return (error); } if (bootverbose) if_printf(sc->hn_ifp, "nvs ndis conf done\n"); sc->hn_caps |= HN_CAP_MTU | HN_CAP_VLAN; return (0); } static int hn_nvs_init_ndis(struct hn_softc *sc) { struct hn_nvs_ndis_init ndis; int error; memset(&ndis, 0, sizeof(ndis)); ndis.nvs_type = HN_NVS_TYPE_NDIS_INIT; ndis.nvs_ndis_major = HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver); ndis.nvs_ndis_minor = HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver); /* NOTE: No response. */ error = hn_nvs_req_send(sc, &ndis, sizeof(ndis)); if (error) if_printf(sc->hn_ifp, "send nvs ndis init failed: %d\n", error); return (error); } static int hn_nvs_init(struct hn_softc *sc) { int i, error; if (device_is_attached(sc->hn_dev)) { /* * NVS version and NDIS version MUST NOT be changed. */ if (bootverbose) { if_printf(sc->hn_ifp, "reinit NVS version 0x%x, " "NDIS version %u.%u\n", sc->hn_nvs_ver, HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); } error = hn_nvs_doinit(sc, sc->hn_nvs_ver); if (error) { if_printf(sc->hn_ifp, "reinit NVS version 0x%x " "failed: %d\n", sc->hn_nvs_ver, error); return (error); } goto done; } /* * Find the supported NVS version and set NDIS version accordingly. */ for (i = 0; i < nitems(hn_nvs_version); ++i) { error = hn_nvs_doinit(sc, hn_nvs_version[i]); if (!error) { sc->hn_nvs_ver = hn_nvs_version[i]; /* Set NDIS version according to NVS version. */ sc->hn_ndis_ver = HN_NDIS_VERSION_6_30; if (sc->hn_nvs_ver <= HN_NVS_VERSION_4) sc->hn_ndis_ver = HN_NDIS_VERSION_6_1; if (bootverbose) { if_printf(sc->hn_ifp, "NVS version 0x%x, " "NDIS version %u.%u\n", sc->hn_nvs_ver, HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); } goto done; } } if_printf(sc->hn_ifp, "no NVS available\n"); return (ENXIO); done: if (sc->hn_nvs_ver >= HN_NVS_VERSION_5) sc->hn_caps |= HN_CAP_HASHVAL; return (0); } int hn_nvs_attach(struct hn_softc *sc, int mtu) { int error; /* * Initialize NVS. */ error = hn_nvs_init(sc); if (error) return (error); if (sc->hn_nvs_ver >= HN_NVS_VERSION_2) { /* * Configure NDIS before initializing it. */ error = hn_nvs_conf_ndis(sc, mtu); if (error) return (error); } /* * Initialize NDIS. */ error = hn_nvs_init_ndis(sc); if (error) return (error); /* * Connect RXBUF. */ error = hn_nvs_conn_rxbuf(sc); if (error) return (error); /* * Connect chimney sending buffer. */ error = hn_nvs_conn_chim(sc); if (error) { hn_nvs_disconn_rxbuf(sc); return (error); } return (0); } void hn_nvs_detach(struct hn_softc *sc) { /* NOTE: there are no requests to stop the NVS. */ hn_nvs_disconn_rxbuf(sc); hn_nvs_disconn_chim(sc); } void hn_nvs_sent_xact(struct hn_nvs_sendctx *sndc, struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, const void *data, int dlen) { vmbus_xact_wakeup(sndc->hn_cbarg, data, dlen); } static void hn_nvs_sent_none(struct hn_nvs_sendctx *sndc __unused, struct hn_softc *sc __unused, struct vmbus_channel *chan __unused, const void *data __unused, int dlen __unused) { /* EMPTY */ } int hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch0) { struct vmbus_xact *xact; struct hn_nvs_subch_req *req; const struct hn_nvs_subch_resp *resp; int error, nsubch_req; uint32_t nsubch; size_t resp_len; nsubch_req = *nsubch0; KASSERT(nsubch_req > 0, ("invalid # of sub-channels %d", nsubch_req)); xact = vmbus_xact_get(sc->hn_xact, sizeof(*req)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for nvs subch alloc\n"); return (ENXIO); } req = vmbus_xact_req_data(xact); req->nvs_type = HN_NVS_TYPE_SUBCH_REQ; req->nvs_op = HN_NVS_SUBCH_OP_ALLOC; req->nvs_nsubch = nsubch_req; resp_len = sizeof(*resp); resp = hn_nvs_xact_execute(sc, xact, req, sizeof(*req), &resp_len, HN_NVS_TYPE_SUBCH_RESP); if (resp == NULL) { if_printf(sc->hn_ifp, "exec nvs subch alloc failed\n"); error = EIO; goto done; } if (resp->nvs_status != HN_NVS_STATUS_OK) { if_printf(sc->hn_ifp, "nvs subch alloc failed: %x\n", resp->nvs_status); error = EIO; goto done; } nsubch = resp->nvs_nsubch; if (nsubch > nsubch_req) { if_printf(sc->hn_ifp, "%u subchans are allocated, " "requested %d\n", nsubch, nsubch_req); nsubch = nsubch_req; } *nsubch0 = nsubch; error = 0; done: vmbus_xact_put(xact); return (error); } int hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan, struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) { return hn_nvs_send_rndis_sglist(chan, HN_NVS_RNDIS_MTYPE_CTRL, sndc, gpa, gpa_cnt); } void hn_nvs_set_datapath(struct hn_softc *sc, uint32_t path) { struct hn_nvs_datapath dp; memset(&dp, 0, sizeof(dp)); dp.nvs_type = HN_NVS_TYPE_SET_DATAPATH; dp.nvs_active_path = path; hn_nvs_req_send(sc, &dp, sizeof(dp)); } Index: head/sys/dev/hyperv/netvsc/hn_nvs.h =================================================================== --- head/sys/dev/hyperv/netvsc/hn_nvs.h (revision 322487) +++ head/sys/dev/hyperv/netvsc/hn_nvs.h (revision 322488) @@ -1,107 +1,107 @@ /*- - * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _HN_NVS_H_ #define _HN_NVS_H_ struct hn_nvs_sendctx; struct vmbus_channel; struct hn_softc; typedef void (*hn_nvs_sent_t) (struct hn_nvs_sendctx *, struct hn_softc *, struct vmbus_channel *, const void *, int); struct hn_nvs_sendctx { hn_nvs_sent_t hn_cb; void *hn_cbarg; }; #define HN_NVS_SENDCTX_INITIALIZER(cb, cbarg) \ { \ .hn_cb = cb, \ .hn_cbarg = cbarg \ } static __inline void hn_nvs_sendctx_init(struct hn_nvs_sendctx *sndc, hn_nvs_sent_t cb, void *cbarg) { sndc->hn_cb = cb; sndc->hn_cbarg = cbarg; } static __inline int hn_nvs_send(struct vmbus_channel *chan, uint16_t flags, void *nvs_msg, int nvs_msglen, struct hn_nvs_sendctx *sndc) { return (vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_INBAND, flags, nvs_msg, nvs_msglen, (uint64_t)(uintptr_t)sndc)); } static __inline int hn_nvs_send_sglist(struct vmbus_channel *chan, struct vmbus_gpa sg[], int sglen, void *nvs_msg, int nvs_msglen, struct hn_nvs_sendctx *sndc) { return (vmbus_chan_send_sglist(chan, sg, sglen, nvs_msg, nvs_msglen, (uint64_t)(uintptr_t)sndc)); } static __inline int hn_nvs_send_rndis_sglist(struct vmbus_channel *chan, uint32_t rndis_mtype, struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt) { struct hn_nvs_rndis rndis; rndis.nvs_type = HN_NVS_TYPE_RNDIS; rndis.nvs_rndis_mtype = rndis_mtype; rndis.nvs_chim_idx = HN_NVS_CHIM_IDX_INVALID; rndis.nvs_chim_sz = 0; return (hn_nvs_send_sglist(chan, gpa, gpa_cnt, &rndis, sizeof(rndis), sndc)); } int hn_nvs_attach(struct hn_softc *sc, int mtu); void hn_nvs_detach(struct hn_softc *sc); int hn_nvs_alloc_subchans(struct hn_softc *sc, int *nsubch); void hn_nvs_sent_xact(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, struct vmbus_channel *chan, const void *data, int dlen); int hn_nvs_send_rndis_ctrl(struct vmbus_channel *chan, struct hn_nvs_sendctx *sndc, struct vmbus_gpa *gpa, int gpa_cnt); void hn_nvs_set_datapath(struct hn_softc *sc, uint32_t path); extern struct hn_nvs_sendctx hn_nvs_sendctx_none; #endif /* !_HN_NVS_H_ */ Index: head/sys/dev/hyperv/netvsc/hn_rndis.c =================================================================== --- head/sys/dev/hyperv/netvsc/hn_rndis.c (revision 322487) +++ head/sys/dev/hyperv/netvsc/hn_rndis.c (revision 322488) @@ -1,1009 +1,1009 @@ /*- - * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define HN_RNDIS_RID_COMPAT_MASK 0xffff #define HN_RNDIS_RID_COMPAT_MAX HN_RNDIS_RID_COMPAT_MASK #define HN_RNDIS_XFER_SIZE 2048 #define HN_NDIS_TXCSUM_CAP_IP4 \ (NDIS_TXCSUM_CAP_IP4 | NDIS_TXCSUM_CAP_IP4OPT) #define HN_NDIS_TXCSUM_CAP_TCP4 \ (NDIS_TXCSUM_CAP_TCP4 | NDIS_TXCSUM_CAP_TCP4OPT) #define HN_NDIS_TXCSUM_CAP_TCP6 \ (NDIS_TXCSUM_CAP_TCP6 | NDIS_TXCSUM_CAP_TCP6OPT | \ NDIS_TXCSUM_CAP_IP6EXT) #define HN_NDIS_TXCSUM_CAP_UDP6 \ (NDIS_TXCSUM_CAP_UDP6 | NDIS_TXCSUM_CAP_IP6EXT) #define HN_NDIS_LSOV2_CAP_IP6 \ (NDIS_LSOV2_CAP_IP6EXT | NDIS_LSOV2_CAP_TCP6OPT) static const void *hn_rndis_xact_exec1(struct hn_softc *, struct vmbus_xact *, size_t, struct hn_nvs_sendctx *, size_t *); static const void *hn_rndis_xact_execute(struct hn_softc *, struct vmbus_xact *, uint32_t, size_t, size_t *, uint32_t); static int hn_rndis_query(struct hn_softc *, uint32_t, const void *, size_t, void *, size_t *); static int hn_rndis_query2(struct hn_softc *, uint32_t, const void *, size_t, void *, size_t *, size_t); static int hn_rndis_set(struct hn_softc *, uint32_t, const void *, size_t); static int hn_rndis_init(struct hn_softc *); static int hn_rndis_halt(struct hn_softc *); static int hn_rndis_conf_offload(struct hn_softc *, int); static int hn_rndis_query_hwcaps(struct hn_softc *, struct ndis_offload *); static __inline uint32_t hn_rndis_rid(struct hn_softc *sc) { uint32_t rid; again: rid = atomic_fetchadd_int(&sc->hn_rndis_rid, 1); if (rid == 0) goto again; /* Use upper 16 bits for non-compat RNDIS messages. */ return ((rid & 0xffff) << 16); } void hn_rndis_rx_ctrl(struct hn_softc *sc, const void *data, int dlen) { const struct rndis_comp_hdr *comp; const struct rndis_msghdr *hdr; KASSERT(dlen >= sizeof(*hdr), ("invalid RNDIS msg\n")); hdr = data; switch (hdr->rm_type) { case REMOTE_NDIS_INITIALIZE_CMPLT: case REMOTE_NDIS_QUERY_CMPLT: case REMOTE_NDIS_SET_CMPLT: case REMOTE_NDIS_KEEPALIVE_CMPLT: /* unused */ if (dlen < sizeof(*comp)) { if_printf(sc->hn_ifp, "invalid RNDIS cmplt\n"); return; } comp = data; KASSERT(comp->rm_rid > HN_RNDIS_RID_COMPAT_MAX, ("invalid RNDIS rid 0x%08x\n", comp->rm_rid)); vmbus_xact_ctx_wakeup(sc->hn_xact, comp, dlen); break; case REMOTE_NDIS_RESET_CMPLT: /* * Reset completed, no rid. * * NOTE: * RESET is not issued by hn(4), so this message should * _not_ be observed. */ if_printf(sc->hn_ifp, "RESET cmplt received\n"); break; default: if_printf(sc->hn_ifp, "unknown RNDIS msg 0x%x\n", hdr->rm_type); break; } } int hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr) { size_t eaddr_len; int error; eaddr_len = ETHER_ADDR_LEN; error = hn_rndis_query(sc, OID_802_3_PERMANENT_ADDRESS, NULL, 0, eaddr, &eaddr_len); if (error) return (error); if (eaddr_len != ETHER_ADDR_LEN) { if_printf(sc->hn_ifp, "invalid eaddr len %zu\n", eaddr_len); return (EINVAL); } return (0); } int hn_rndis_get_linkstatus(struct hn_softc *sc, uint32_t *link_status) { size_t size; int error; size = sizeof(*link_status); error = hn_rndis_query(sc, OID_GEN_MEDIA_CONNECT_STATUS, NULL, 0, link_status, &size); if (error) return (error); if (size != sizeof(uint32_t)) { if_printf(sc->hn_ifp, "invalid link status len %zu\n", size); return (EINVAL); } return (0); } static const void * hn_rndis_xact_exec1(struct hn_softc *sc, struct vmbus_xact *xact, size_t reqlen, struct hn_nvs_sendctx *sndc, size_t *comp_len) { struct vmbus_gpa gpa[HN_XACT_REQ_PGCNT]; int gpa_cnt, error; bus_addr_t paddr; KASSERT(reqlen <= HN_XACT_REQ_SIZE && reqlen > 0, ("invalid request length %zu", reqlen)); /* * Setup the SG list. */ paddr = vmbus_xact_req_paddr(xact); KASSERT((paddr & PAGE_MASK) == 0, ("vmbus xact request is not page aligned 0x%jx", (uintmax_t)paddr)); for (gpa_cnt = 0; gpa_cnt < HN_XACT_REQ_PGCNT; ++gpa_cnt) { int len = PAGE_SIZE; if (reqlen == 0) break; if (reqlen < len) len = reqlen; gpa[gpa_cnt].gpa_page = atop(paddr) + gpa_cnt; gpa[gpa_cnt].gpa_len = len; gpa[gpa_cnt].gpa_ofs = 0; reqlen -= len; } KASSERT(reqlen == 0, ("still have %zu request data left", reqlen)); /* * Send this RNDIS control message and wait for its completion * message. */ vmbus_xact_activate(xact); error = hn_nvs_send_rndis_ctrl(sc->hn_prichan, sndc, gpa, gpa_cnt); if (error) { vmbus_xact_deactivate(xact); if_printf(sc->hn_ifp, "RNDIS ctrl send failed: %d\n", error); return (NULL); } return (vmbus_chan_xact_wait(sc->hn_prichan, xact, comp_len, HN_CAN_SLEEP(sc))); } static const void * hn_rndis_xact_execute(struct hn_softc *sc, struct vmbus_xact *xact, uint32_t rid, size_t reqlen, size_t *comp_len0, uint32_t comp_type) { const struct rndis_comp_hdr *comp; size_t comp_len, min_complen = *comp_len0; KASSERT(rid > HN_RNDIS_RID_COMPAT_MAX, ("invalid rid %u\n", rid)); KASSERT(min_complen >= sizeof(*comp), ("invalid minimum complete len %zu", min_complen)); /* * Execute the xact setup by the caller. */ comp = hn_rndis_xact_exec1(sc, xact, reqlen, &hn_nvs_sendctx_none, &comp_len); if (comp == NULL) return (NULL); /* * Check this RNDIS complete message. */ if (comp_len < min_complen) { if (comp_len >= sizeof(*comp)) { /* rm_status field is valid */ if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu, " "status 0x%08x\n", comp_len, comp->rm_status); } else { if_printf(sc->hn_ifp, "invalid RNDIS comp len %zu\n", comp_len); } return (NULL); } if (comp->rm_len < min_complen) { if_printf(sc->hn_ifp, "invalid RNDIS comp msglen %u\n", comp->rm_len); return (NULL); } if (comp->rm_type != comp_type) { if_printf(sc->hn_ifp, "unexpected RNDIS comp 0x%08x, " "expect 0x%08x\n", comp->rm_type, comp_type); return (NULL); } if (comp->rm_rid != rid) { if_printf(sc->hn_ifp, "RNDIS comp rid mismatch %u, " "expect %u\n", comp->rm_rid, rid); return (NULL); } /* All pass! */ *comp_len0 = comp_len; return (comp); } static int hn_rndis_query(struct hn_softc *sc, uint32_t oid, const void *idata, size_t idlen, void *odata, size_t *odlen0) { return (hn_rndis_query2(sc, oid, idata, idlen, odata, odlen0, *odlen0)); } static int hn_rndis_query2(struct hn_softc *sc, uint32_t oid, const void *idata, size_t idlen, void *odata, size_t *odlen0, size_t min_odlen) { struct rndis_query_req *req; const struct rndis_query_comp *comp; struct vmbus_xact *xact; size_t reqlen, odlen = *odlen0, comp_len; int error, ofs; uint32_t rid; reqlen = sizeof(*req) + idlen; xact = vmbus_xact_get(sc->hn_xact, reqlen); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for RNDIS query 0x%08x\n", oid); return (ENXIO); } rid = hn_rndis_rid(sc); req = vmbus_xact_req_data(xact); req->rm_type = REMOTE_NDIS_QUERY_MSG; req->rm_len = reqlen; req->rm_rid = rid; req->rm_oid = oid; /* * XXX * This is _not_ RNDIS Spec conforming: * "This MUST be set to 0 when there is no input data * associated with the OID." * * If this field was set to 0 according to the RNDIS Spec, * Hyper-V would set non-SUCCESS status in the query * completion. */ req->rm_infobufoffset = RNDIS_QUERY_REQ_INFOBUFOFFSET; if (idlen > 0) { req->rm_infobuflen = idlen; /* Input data immediately follows RNDIS query. */ memcpy(req + 1, idata, idlen); } comp_len = sizeof(*comp) + min_odlen; comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len, REMOTE_NDIS_QUERY_CMPLT); if (comp == NULL) { if_printf(sc->hn_ifp, "exec RNDIS query 0x%08x failed\n", oid); error = EIO; goto done; } if (comp->rm_status != RNDIS_STATUS_SUCCESS) { if_printf(sc->hn_ifp, "RNDIS query 0x%08x failed: " "status 0x%08x\n", oid, comp->rm_status); error = EIO; goto done; } if (comp->rm_infobuflen == 0 || comp->rm_infobufoffset == 0) { /* No output data! */ if_printf(sc->hn_ifp, "RNDIS query 0x%08x, no data\n", oid); *odlen0 = 0; error = 0; goto done; } /* * Check output data length and offset. */ /* ofs is the offset from the beginning of comp. */ ofs = RNDIS_QUERY_COMP_INFOBUFOFFSET_ABS(comp->rm_infobufoffset); if (ofs < sizeof(*comp) || ofs + comp->rm_infobuflen > comp_len) { if_printf(sc->hn_ifp, "RNDIS query invalid comp ib off/len, " "%u/%u\n", comp->rm_infobufoffset, comp->rm_infobuflen); error = EINVAL; goto done; } /* * Save output data. */ if (comp->rm_infobuflen < odlen) odlen = comp->rm_infobuflen; memcpy(odata, ((const uint8_t *)comp) + ofs, odlen); *odlen0 = odlen; error = 0; done: vmbus_xact_put(xact); return (error); } int hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt0) { struct ndis_rss_caps in, caps; size_t caps_len; int error, indsz, rxr_cnt, hash_fnidx; uint32_t hash_func = 0, hash_types = 0; *rxr_cnt0 = 0; if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_20) return (EOPNOTSUPP); memset(&in, 0, sizeof(in)); in.ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_CAPS; in.ndis_hdr.ndis_rev = NDIS_RSS_CAPS_REV_2; in.ndis_hdr.ndis_size = NDIS_RSS_CAPS_SIZE; caps_len = NDIS_RSS_CAPS_SIZE; error = hn_rndis_query2(sc, OID_GEN_RECEIVE_SCALE_CAPABILITIES, &in, NDIS_RSS_CAPS_SIZE, &caps, &caps_len, NDIS_RSS_CAPS_SIZE_6_0); if (error) return (error); /* * Preliminary verification. */ if (caps.ndis_hdr.ndis_type != NDIS_OBJTYPE_RSS_CAPS) { if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n", caps.ndis_hdr.ndis_type); return (EINVAL); } if (caps.ndis_hdr.ndis_rev < NDIS_RSS_CAPS_REV_1) { if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n", caps.ndis_hdr.ndis_rev); return (EINVAL); } if (caps.ndis_hdr.ndis_size > caps_len) { if_printf(sc->hn_ifp, "invalid NDIS objsize %u, " "data size %zu\n", caps.ndis_hdr.ndis_size, caps_len); return (EINVAL); } else if (caps.ndis_hdr.ndis_size < NDIS_RSS_CAPS_SIZE_6_0) { if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n", caps.ndis_hdr.ndis_size); return (EINVAL); } /* * Save information for later RSS configuration. */ if (caps.ndis_nrxr == 0) { if_printf(sc->hn_ifp, "0 RX rings!?\n"); return (EINVAL); } if (bootverbose) if_printf(sc->hn_ifp, "%u RX rings\n", caps.ndis_nrxr); rxr_cnt = caps.ndis_nrxr; if (caps.ndis_hdr.ndis_size == NDIS_RSS_CAPS_SIZE && caps.ndis_hdr.ndis_rev >= NDIS_RSS_CAPS_REV_2) { if (caps.ndis_nind > NDIS_HASH_INDCNT) { if_printf(sc->hn_ifp, "too many RSS indirect table entries %u\n", caps.ndis_nind); return (EOPNOTSUPP); } if (!powerof2(caps.ndis_nind)) { if_printf(sc->hn_ifp, "RSS indirect table size is not " "power-of-2 %u\n", caps.ndis_nind); } if (bootverbose) { if_printf(sc->hn_ifp, "RSS indirect table size %u\n", caps.ndis_nind); } indsz = caps.ndis_nind; } else { indsz = NDIS_HASH_INDCNT; } if (indsz < rxr_cnt) { if_printf(sc->hn_ifp, "# of RX rings (%d) > " "RSS indirect table size %d\n", rxr_cnt, indsz); rxr_cnt = indsz; } /* * NOTE: * Toeplitz is at the lowest bit, and it is prefered; so ffs(), * instead of fls(), is used here. */ hash_fnidx = ffs(caps.ndis_caps & NDIS_RSS_CAP_HASHFUNC_MASK); if (hash_fnidx == 0) { if_printf(sc->hn_ifp, "no hash functions, caps 0x%08x\n", caps.ndis_caps); return (EOPNOTSUPP); } hash_func = 1 << (hash_fnidx - 1); /* ffs is 1-based */ if (caps.ndis_caps & NDIS_RSS_CAP_IPV4) hash_types |= NDIS_HASH_IPV4 | NDIS_HASH_TCP_IPV4; if (caps.ndis_caps & NDIS_RSS_CAP_IPV6) hash_types |= NDIS_HASH_IPV6 | NDIS_HASH_TCP_IPV6; if (caps.ndis_caps & NDIS_RSS_CAP_IPV6_EX) hash_types |= NDIS_HASH_IPV6_EX | NDIS_HASH_TCP_IPV6_EX; if (hash_types == 0) { if_printf(sc->hn_ifp, "no hash types, caps 0x%08x\n", caps.ndis_caps); return (EOPNOTSUPP); } /* Commit! */ sc->hn_rss_ind_size = indsz; sc->hn_rss_hash = hash_func | hash_types; *rxr_cnt0 = rxr_cnt; return (0); } static int hn_rndis_set(struct hn_softc *sc, uint32_t oid, const void *data, size_t dlen) { struct rndis_set_req *req; const struct rndis_set_comp *comp; struct vmbus_xact *xact; size_t reqlen, comp_len; uint32_t rid; int error; KASSERT(dlen > 0, ("invalid dlen %zu", dlen)); reqlen = sizeof(*req) + dlen; xact = vmbus_xact_get(sc->hn_xact, reqlen); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for RNDIS set 0x%08x\n", oid); return (ENXIO); } rid = hn_rndis_rid(sc); req = vmbus_xact_req_data(xact); req->rm_type = REMOTE_NDIS_SET_MSG; req->rm_len = reqlen; req->rm_rid = rid; req->rm_oid = oid; req->rm_infobuflen = dlen; req->rm_infobufoffset = RNDIS_SET_REQ_INFOBUFOFFSET; /* Data immediately follows RNDIS set. */ memcpy(req + 1, data, dlen); comp_len = sizeof(*comp); comp = hn_rndis_xact_execute(sc, xact, rid, reqlen, &comp_len, REMOTE_NDIS_SET_CMPLT); if (comp == NULL) { if_printf(sc->hn_ifp, "exec RNDIS set 0x%08x failed\n", oid); error = EIO; goto done; } if (comp->rm_status != RNDIS_STATUS_SUCCESS) { if_printf(sc->hn_ifp, "RNDIS set 0x%08x failed: " "status 0x%08x\n", oid, comp->rm_status); error = EIO; goto done; } error = 0; done: vmbus_xact_put(xact); return (error); } static int hn_rndis_conf_offload(struct hn_softc *sc, int mtu) { struct ndis_offload hwcaps; struct ndis_offload_params params; uint32_t caps = 0; size_t paramsz; int error, tso_maxsz, tso_minsg; error = hn_rndis_query_hwcaps(sc, &hwcaps); if (error) { if_printf(sc->hn_ifp, "hwcaps query failed: %d\n", error); return (error); } /* NOTE: 0 means "no change" */ memset(¶ms, 0, sizeof(params)); params.ndis_hdr.ndis_type = NDIS_OBJTYPE_DEFAULT; if (sc->hn_ndis_ver < HN_NDIS_VERSION_6_30) { params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_2; paramsz = NDIS_OFFLOAD_PARAMS_SIZE_6_1; } else { params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_3; paramsz = NDIS_OFFLOAD_PARAMS_SIZE; } params.ndis_hdr.ndis_size = paramsz; /* * TSO4/TSO6 setup. */ tso_maxsz = IP_MAXPACKET; tso_minsg = 2; if (hwcaps.ndis_lsov2.ndis_ip4_encap & NDIS_OFFLOAD_ENCAP_8023) { caps |= HN_CAP_TSO4; params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_ON; if (hwcaps.ndis_lsov2.ndis_ip4_maxsz < tso_maxsz) tso_maxsz = hwcaps.ndis_lsov2.ndis_ip4_maxsz; if (hwcaps.ndis_lsov2.ndis_ip4_minsg > tso_minsg) tso_minsg = hwcaps.ndis_lsov2.ndis_ip4_minsg; } if ((hwcaps.ndis_lsov2.ndis_ip6_encap & NDIS_OFFLOAD_ENCAP_8023) && (hwcaps.ndis_lsov2.ndis_ip6_opts & HN_NDIS_LSOV2_CAP_IP6) == HN_NDIS_LSOV2_CAP_IP6) { caps |= HN_CAP_TSO6; params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_ON; if (hwcaps.ndis_lsov2.ndis_ip6_maxsz < tso_maxsz) tso_maxsz = hwcaps.ndis_lsov2.ndis_ip6_maxsz; if (hwcaps.ndis_lsov2.ndis_ip6_minsg > tso_minsg) tso_minsg = hwcaps.ndis_lsov2.ndis_ip6_minsg; } sc->hn_ndis_tso_szmax = 0; sc->hn_ndis_tso_sgmin = 0; if (caps & (HN_CAP_TSO4 | HN_CAP_TSO6)) { KASSERT(tso_maxsz <= IP_MAXPACKET, ("invalid NDIS TSO maxsz %d", tso_maxsz)); KASSERT(tso_minsg >= 2, ("invalid NDIS TSO minsg %d", tso_minsg)); if (tso_maxsz < tso_minsg * mtu) { if_printf(sc->hn_ifp, "invalid NDIS TSO config: " "maxsz %d, minsg %d, mtu %d; " "disable TSO4 and TSO6\n", tso_maxsz, tso_minsg, mtu); caps &= ~(HN_CAP_TSO4 | HN_CAP_TSO6); params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_OFF; params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_OFF; } else { sc->hn_ndis_tso_szmax = tso_maxsz; sc->hn_ndis_tso_sgmin = tso_minsg; if (bootverbose) { if_printf(sc->hn_ifp, "NDIS TSO " "szmax %d sgmin %d\n", sc->hn_ndis_tso_szmax, sc->hn_ndis_tso_sgmin); } } } /* IPv4 checksum */ if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_IP4) == HN_NDIS_TXCSUM_CAP_IP4) { caps |= HN_CAP_IPCS; params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_IP4) { if (params.ndis_ip4csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_RX; } /* TCP4 checksum */ if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HN_NDIS_TXCSUM_CAP_TCP4) == HN_NDIS_TXCSUM_CAP_TCP4) { caps |= HN_CAP_TCP4CS; params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_TCP4) { if (params.ndis_tcp4csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_RX; } /* UDP4 checksum */ if (hwcaps.ndis_csum.ndis_ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) { caps |= HN_CAP_UDP4CS; params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_UDP4) { if (params.ndis_udp4csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_RX; } /* TCP6 checksum */ if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_TCP6) == HN_NDIS_TXCSUM_CAP_TCP6) { caps |= HN_CAP_TCP6CS; params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_TCP6) { if (params.ndis_tcp6csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_RX; } /* UDP6 checksum */ if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HN_NDIS_TXCSUM_CAP_UDP6) == HN_NDIS_TXCSUM_CAP_UDP6) { caps |= HN_CAP_UDP6CS; params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TX; } if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_UDP6) { if (params.ndis_udp6csum == NDIS_OFFLOAD_PARAM_TX) params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TXRX; else params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_RX; } if (bootverbose) { if_printf(sc->hn_ifp, "offload csum: " "ip4 %u, tcp4 %u, udp4 %u, tcp6 %u, udp6 %u\n", params.ndis_ip4csum, params.ndis_tcp4csum, params.ndis_udp4csum, params.ndis_tcp6csum, params.ndis_udp6csum); if_printf(sc->hn_ifp, "offload lsov2: ip4 %u, ip6 %u\n", params.ndis_lsov2_ip4, params.ndis_lsov2_ip6); } error = hn_rndis_set(sc, OID_TCP_OFFLOAD_PARAMETERS, ¶ms, paramsz); if (error) { if_printf(sc->hn_ifp, "offload config failed: %d\n", error); return (error); } if (bootverbose) if_printf(sc->hn_ifp, "offload config done\n"); sc->hn_caps |= caps; return (0); } int hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags) { struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; struct ndis_rss_params *prm = &rss->rss_params; int error, rss_size; /* * Only NDIS 6.20+ is supported: * We only support 4bytes element in indirect table, which has been * adopted since NDIS 6.20. */ KASSERT(sc->hn_ndis_ver >= HN_NDIS_VERSION_6_20, ("NDIS 6.20+ is required, NDIS version 0x%08x", sc->hn_ndis_ver)); /* XXX only one can be specified through, popcnt? */ KASSERT((sc->hn_rss_hash & NDIS_HASH_FUNCTION_MASK), ("no hash func")); KASSERT((sc->hn_rss_hash & NDIS_HASH_TYPE_MASK), ("no hash types")); KASSERT(sc->hn_rss_ind_size > 0, ("no indirect table size")); if (bootverbose) { if_printf(sc->hn_ifp, "RSS indirect table size %d, " "hash 0x%08x\n", sc->hn_rss_ind_size, sc->hn_rss_hash); } /* * NOTE: * DO NOT whack rss_key and rss_ind, which are setup by the caller. */ memset(prm, 0, sizeof(*prm)); rss_size = NDIS_RSSPRM_TOEPLITZ_SIZE(sc->hn_rss_ind_size); prm->ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_PARAMS; prm->ndis_hdr.ndis_rev = NDIS_RSS_PARAMS_REV_2; prm->ndis_hdr.ndis_size = rss_size; prm->ndis_flags = flags; prm->ndis_hash = sc->hn_rss_hash; prm->ndis_indsize = sizeof(rss->rss_ind[0]) * sc->hn_rss_ind_size; prm->ndis_indoffset = __offsetof(struct ndis_rssprm_toeplitz, rss_ind[0]); prm->ndis_keysize = sizeof(rss->rss_key); prm->ndis_keyoffset = __offsetof(struct ndis_rssprm_toeplitz, rss_key[0]); error = hn_rndis_set(sc, OID_GEN_RECEIVE_SCALE_PARAMETERS, rss, rss_size); if (error) { if_printf(sc->hn_ifp, "RSS config failed: %d\n", error); } else { if (bootverbose) if_printf(sc->hn_ifp, "RSS config done\n"); } return (error); } int hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter) { int error; error = hn_rndis_set(sc, OID_GEN_CURRENT_PACKET_FILTER, &filter, sizeof(filter)); if (error) { if_printf(sc->hn_ifp, "set RX filter 0x%08x failed: %d\n", filter, error); } else { if (bootverbose) { if_printf(sc->hn_ifp, "set RX filter 0x%08x done\n", filter); } } return (error); } static int hn_rndis_init(struct hn_softc *sc) { struct rndis_init_req *req; const struct rndis_init_comp *comp; struct vmbus_xact *xact; size_t comp_len; uint32_t rid; int error; xact = vmbus_xact_get(sc->hn_xact, sizeof(*req)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for RNDIS init\n"); return (ENXIO); } rid = hn_rndis_rid(sc); req = vmbus_xact_req_data(xact); req->rm_type = REMOTE_NDIS_INITIALIZE_MSG; req->rm_len = sizeof(*req); req->rm_rid = rid; req->rm_ver_major = RNDIS_VERSION_MAJOR; req->rm_ver_minor = RNDIS_VERSION_MINOR; req->rm_max_xfersz = HN_RNDIS_XFER_SIZE; comp_len = RNDIS_INIT_COMP_SIZE_MIN; comp = hn_rndis_xact_execute(sc, xact, rid, sizeof(*req), &comp_len, REMOTE_NDIS_INITIALIZE_CMPLT); if (comp == NULL) { if_printf(sc->hn_ifp, "exec RNDIS init failed\n"); error = EIO; goto done; } if (comp->rm_status != RNDIS_STATUS_SUCCESS) { if_printf(sc->hn_ifp, "RNDIS init failed: status 0x%08x\n", comp->rm_status); error = EIO; goto done; } sc->hn_rndis_agg_size = comp->rm_pktmaxsz; sc->hn_rndis_agg_pkts = comp->rm_pktmaxcnt; sc->hn_rndis_agg_align = 1U << comp->rm_align; if (sc->hn_rndis_agg_align < sizeof(uint32_t)) { /* * The RNDIS packet messsage encap assumes that the RNDIS * packet message is at least 4 bytes aligned. Fix up the * alignment here, if the remote side sets the alignment * too low. */ if_printf(sc->hn_ifp, "fixup RNDIS aggpkt align: %u -> %zu\n", sc->hn_rndis_agg_align, sizeof(uint32_t)); sc->hn_rndis_agg_align = sizeof(uint32_t); } if (bootverbose) { if_printf(sc->hn_ifp, "RNDIS ver %u.%u, " "aggpkt size %u, aggpkt cnt %u, aggpkt align %u\n", comp->rm_ver_major, comp->rm_ver_minor, sc->hn_rndis_agg_size, sc->hn_rndis_agg_pkts, sc->hn_rndis_agg_align); } error = 0; done: vmbus_xact_put(xact); return (error); } static int hn_rndis_halt(struct hn_softc *sc) { struct vmbus_xact *xact; struct rndis_halt_req *halt; struct hn_nvs_sendctx sndc; size_t comp_len; xact = vmbus_xact_get(sc->hn_xact, sizeof(*halt)); if (xact == NULL) { if_printf(sc->hn_ifp, "no xact for RNDIS halt\n"); return (ENXIO); } halt = vmbus_xact_req_data(xact); halt->rm_type = REMOTE_NDIS_HALT_MSG; halt->rm_len = sizeof(*halt); halt->rm_rid = hn_rndis_rid(sc); /* No RNDIS completion; rely on NVS message send completion */ hn_nvs_sendctx_init(&sndc, hn_nvs_sent_xact, xact); hn_rndis_xact_exec1(sc, xact, sizeof(*halt), &sndc, &comp_len); vmbus_xact_put(xact); if (bootverbose) if_printf(sc->hn_ifp, "RNDIS halt done\n"); return (0); } static int hn_rndis_query_hwcaps(struct hn_softc *sc, struct ndis_offload *caps) { struct ndis_offload in; size_t caps_len, size; int error; memset(&in, 0, sizeof(in)); in.ndis_hdr.ndis_type = NDIS_OBJTYPE_OFFLOAD; if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_30) { in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_3; size = NDIS_OFFLOAD_SIZE; } else if (sc->hn_ndis_ver >= HN_NDIS_VERSION_6_1) { in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_2; size = NDIS_OFFLOAD_SIZE_6_1; } else { in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_1; size = NDIS_OFFLOAD_SIZE_6_0; } in.ndis_hdr.ndis_size = size; caps_len = NDIS_OFFLOAD_SIZE; error = hn_rndis_query2(sc, OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES, &in, size, caps, &caps_len, NDIS_OFFLOAD_SIZE_6_0); if (error) return (error); /* * Preliminary verification. */ if (caps->ndis_hdr.ndis_type != NDIS_OBJTYPE_OFFLOAD) { if_printf(sc->hn_ifp, "invalid NDIS objtype 0x%02x\n", caps->ndis_hdr.ndis_type); return (EINVAL); } if (caps->ndis_hdr.ndis_rev < NDIS_OFFLOAD_REV_1) { if_printf(sc->hn_ifp, "invalid NDIS objrev 0x%02x\n", caps->ndis_hdr.ndis_rev); return (EINVAL); } if (caps->ndis_hdr.ndis_size > caps_len) { if_printf(sc->hn_ifp, "invalid NDIS objsize %u, " "data size %zu\n", caps->ndis_hdr.ndis_size, caps_len); return (EINVAL); } else if (caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE_6_0) { if_printf(sc->hn_ifp, "invalid NDIS objsize %u\n", caps->ndis_hdr.ndis_size); return (EINVAL); } if (bootverbose) { /* * NOTE: * caps->ndis_hdr.ndis_size MUST be checked before accessing * NDIS 6.1+ specific fields. */ if_printf(sc->hn_ifp, "hwcaps rev %u\n", caps->ndis_hdr.ndis_rev); if_printf(sc->hn_ifp, "hwcaps csum: " "ip4 tx 0x%x/0x%x rx 0x%x/0x%x, " "ip6 tx 0x%x/0x%x rx 0x%x/0x%x\n", caps->ndis_csum.ndis_ip4_txcsum, caps->ndis_csum.ndis_ip4_txenc, caps->ndis_csum.ndis_ip4_rxcsum, caps->ndis_csum.ndis_ip4_rxenc, caps->ndis_csum.ndis_ip6_txcsum, caps->ndis_csum.ndis_ip6_txenc, caps->ndis_csum.ndis_ip6_rxcsum, caps->ndis_csum.ndis_ip6_rxenc); if_printf(sc->hn_ifp, "hwcaps lsov2: " "ip4 maxsz %u minsg %u encap 0x%x, " "ip6 maxsz %u minsg %u encap 0x%x opts 0x%x\n", caps->ndis_lsov2.ndis_ip4_maxsz, caps->ndis_lsov2.ndis_ip4_minsg, caps->ndis_lsov2.ndis_ip4_encap, caps->ndis_lsov2.ndis_ip6_maxsz, caps->ndis_lsov2.ndis_ip6_minsg, caps->ndis_lsov2.ndis_ip6_encap, caps->ndis_lsov2.ndis_ip6_opts); } return (0); } int hn_rndis_attach(struct hn_softc *sc, int mtu, int *init_done) { int error; *init_done = 0; /* * Initialize RNDIS. */ error = hn_rndis_init(sc); if (error) return (error); *init_done = 1; /* * Configure NDIS offload settings. */ hn_rndis_conf_offload(sc, mtu); return (0); } void hn_rndis_detach(struct hn_softc *sc) { /* Halt the RNDIS. */ hn_rndis_halt(sc); } Index: head/sys/dev/hyperv/netvsc/hn_rndis.h =================================================================== --- head/sys/dev/hyperv/netvsc/hn_rndis.h (revision 322487) +++ head/sys/dev/hyperv/netvsc/hn_rndis.h (revision 322488) @@ -1,49 +1,49 @@ /*- - * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2010-2012 Citrix Inc. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _HN_RNDIS_H_ #define _HN_RNDIS_H_ struct hn_softc; int hn_rndis_attach(struct hn_softc *sc, int mtu, int *init_done); void hn_rndis_detach(struct hn_softc *sc); int hn_rndis_conf_rss(struct hn_softc *sc, uint16_t flags); int hn_rndis_query_rsscaps(struct hn_softc *sc, int *rxr_cnt); int hn_rndis_get_eaddr(struct hn_softc *sc, uint8_t *eaddr); /* link_status: NDIS_MEDIA_STATE_ */ int hn_rndis_get_linkstatus(struct hn_softc *sc, uint32_t *link_status); /* filter: NDIS_PACKET_TYPE_. */ int hn_rndis_set_rxfilter(struct hn_softc *sc, uint32_t filter); void hn_rndis_rx_ctrl(struct hn_softc *sc, const void *data, int dlen); #endif /* !_HN_RNDIS_H_ */ Index: head/sys/dev/hyperv/netvsc/if_hn.c =================================================================== --- head/sys/dev/hyperv/netvsc/if_hn.c (revision 322487) +++ head/sys/dev/hyperv/netvsc/if_hn.c (revision 322488) @@ -1,7007 +1,7007 @@ /*- * Copyright (c) 2010-2012 Citrix Inc. - * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /*- * Copyright (c) 2004-2006 Kip Macy * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include "opt_hn.h" #include "opt_inet6.h" #include "opt_inet.h" #include "opt_rss.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef RSS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmbus_if.h" #define HN_IFSTART_SUPPORT #define HN_RING_CNT_DEF_MAX 8 #define HN_VFMAP_SIZE_DEF 8 #define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */ /* YYY should get it from the underlying channel */ #define HN_TX_DESC_CNT 512 #define HN_RNDIS_PKT_LEN \ (sizeof(struct rndis_packet_msg) + \ HN_RNDIS_PKTINFO_SIZE(HN_NDIS_HASH_VALUE_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_VLAN_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ HN_RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) #define HN_RNDIS_PKT_BOUNDARY PAGE_SIZE #define HN_RNDIS_PKT_ALIGN CACHE_LINE_SIZE #define HN_TX_DATA_BOUNDARY PAGE_SIZE #define HN_TX_DATA_MAXSIZE IP_MAXPACKET #define HN_TX_DATA_SEGSIZE PAGE_SIZE /* -1 for RNDIS packet message */ #define HN_TX_DATA_SEGCNT_MAX (HN_GPACNT_MAX - 1) #define HN_DIRECT_TX_SIZE_DEF 128 #define HN_EARLY_TXEOF_THRESH 8 #define HN_PKTBUF_LEN_DEF (16 * 1024) #define HN_LROENT_CNT_DEF 128 #define HN_LRO_LENLIM_MULTIRX_DEF (12 * ETHERMTU) #define HN_LRO_LENLIM_DEF (25 * ETHERMTU) /* YYY 2*MTU is a bit rough, but should be good enough. */ #define HN_LRO_LENLIM_MIN(ifp) (2 * (ifp)->if_mtu) #define HN_LRO_ACKCNT_DEF 1 #define HN_LOCK_INIT(sc) \ sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) #define HN_LOCK(sc) \ do { \ while (sx_try_xlock(&(sc)->hn_lock) == 0) \ DELAY(1000); \ } while (0) #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) #define HN_CSUM_IP6_MASK (CSUM_IP6_TCP | CSUM_IP6_UDP) #define HN_CSUM_IP_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP_MASK) #define HN_CSUM_IP6_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) #define HN_PKTSIZE_MIN(align) \ roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ HN_RNDIS_PKT_LEN, (align)) #define HN_PKTSIZE(m, align) \ roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) #ifdef RSS #define HN_RING_IDX2CPU(sc, idx) rss_getcpu((idx) % rss_getnumbuckets()) #else #define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) #endif struct hn_txdesc { #ifndef HN_USE_TXDESC_BUFRING SLIST_ENTRY(hn_txdesc) link; #endif STAILQ_ENTRY(hn_txdesc) agg_link; /* Aggregated txdescs, in sending order. */ STAILQ_HEAD(, hn_txdesc) agg_list; /* The oldest packet, if transmission aggregation happens. */ struct mbuf *m; struct hn_tx_ring *txr; int refs; uint32_t flags; /* HN_TXD_FLAG_ */ struct hn_nvs_sendctx send_ctx; uint32_t chim_index; int chim_size; bus_dmamap_t data_dmap; bus_addr_t rndis_pkt_paddr; struct rndis_packet_msg *rndis_pkt; bus_dmamap_t rndis_pkt_dmap; }; #define HN_TXD_FLAG_ONLIST 0x0001 #define HN_TXD_FLAG_DMAMAP 0x0002 #define HN_TXD_FLAG_ONAGG 0x0004 struct hn_rxinfo { uint32_t vlan_info; uint32_t csum_info; uint32_t hash_info; uint32_t hash_value; }; struct hn_rxvf_setarg { struct hn_rx_ring *rxr; struct ifnet *vf_ifp; }; #define HN_RXINFO_VLAN 0x0001 #define HN_RXINFO_CSUM 0x0002 #define HN_RXINFO_HASHINF 0x0004 #define HN_RXINFO_HASHVAL 0x0008 #define HN_RXINFO_ALL \ (HN_RXINFO_VLAN | \ HN_RXINFO_CSUM | \ HN_RXINFO_HASHINF | \ HN_RXINFO_HASHVAL) #define HN_NDIS_VLAN_INFO_INVALID 0xffffffff #define HN_NDIS_RXCSUM_INFO_INVALID 0 #define HN_NDIS_HASH_INFO_INVALID 0 static int hn_probe(device_t); static int hn_attach(device_t); static int hn_detach(device_t); static int hn_shutdown(device_t); static void hn_chan_callback(struct vmbus_channel *, void *); static void hn_init(void *); static int hn_ioctl(struct ifnet *, u_long, caddr_t); #ifdef HN_IFSTART_SUPPORT static void hn_start(struct ifnet *); #endif static int hn_transmit(struct ifnet *, struct mbuf *); static void hn_xmit_qflush(struct ifnet *); static int hn_ifmedia_upd(struct ifnet *); static void hn_ifmedia_sts(struct ifnet *, struct ifmediareq *); static void hn_ifnet_event(void *, struct ifnet *, int); static void hn_ifaddr_event(void *, struct ifnet *); static void hn_ifnet_attevent(void *, struct ifnet *); static void hn_ifnet_detevent(void *, struct ifnet *); static void hn_ifnet_lnkevent(void *, struct ifnet *, int); static bool hn_ismyvf(const struct hn_softc *, const struct ifnet *); static void hn_rxvf_change(struct hn_softc *, struct ifnet *, bool); static void hn_rxvf_set(struct hn_softc *, struct ifnet *); static void hn_rxvf_set_task(void *, int); static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *); static int hn_xpnt_vf_iocsetflags(struct hn_softc *); static int hn_xpnt_vf_iocsetcaps(struct hn_softc *, struct ifreq *); static void hn_xpnt_vf_saveifflags(struct hn_softc *); static bool hn_xpnt_vf_isready(struct hn_softc *); static void hn_xpnt_vf_setready(struct hn_softc *); static void hn_xpnt_vf_init_taskfunc(void *, int); static void hn_xpnt_vf_init(struct hn_softc *); static void hn_xpnt_vf_setenable(struct hn_softc *); static void hn_xpnt_vf_setdisable(struct hn_softc *, bool); static int hn_rndis_rxinfo(const void *, int, struct hn_rxinfo *); static void hn_rndis_rx_data(struct hn_rx_ring *, const void *, int); static void hn_rndis_rx_status(struct hn_softc *, const void *, int); static void hn_rndis_init_fixat(struct hn_softc *, int); static void hn_nvs_handle_notify(struct hn_softc *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_handle_comp(struct hn_softc *, struct vmbus_channel *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_handle_rxbuf(struct hn_rx_ring *, struct vmbus_channel *, const struct vmbus_chanpkt_hdr *); static void hn_nvs_ack_rxbuf(struct hn_rx_ring *, struct vmbus_channel *, uint64_t); #if __FreeBSD_version >= 1100099 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS); static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS); static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS); #if __FreeBSD_version < 1100095 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS); #else static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS); static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS); static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS); static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS); static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); #ifndef RSS static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); #endif static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS); static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS); static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS); static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS); static void hn_stop(struct hn_softc *, bool); static void hn_init_locked(struct hn_softc *); static int hn_chan_attach(struct hn_softc *, struct vmbus_channel *); static void hn_chan_detach(struct hn_softc *, struct vmbus_channel *); static int hn_attach_subchans(struct hn_softc *); static void hn_detach_allchans(struct hn_softc *); static void hn_chan_rollup(struct hn_rx_ring *, struct hn_tx_ring *); static void hn_set_ring_inuse(struct hn_softc *, int); static int hn_synth_attach(struct hn_softc *, int); static void hn_synth_detach(struct hn_softc *); static int hn_synth_alloc_subchans(struct hn_softc *, int *); static bool hn_synth_attachable(const struct hn_softc *); static void hn_suspend(struct hn_softc *); static void hn_suspend_data(struct hn_softc *); static void hn_suspend_mgmt(struct hn_softc *); static void hn_resume(struct hn_softc *); static void hn_resume_data(struct hn_softc *); static void hn_resume_mgmt(struct hn_softc *); static void hn_suspend_mgmt_taskfunc(void *, int); static void hn_chan_drain(struct hn_softc *, struct vmbus_channel *); static void hn_disable_rx(struct hn_softc *); static void hn_drain_rxtx(struct hn_softc *, int); static void hn_polling(struct hn_softc *, u_int); static void hn_chan_polling(struct vmbus_channel *, u_int); static void hn_mtu_change_fixup(struct hn_softc *); static void hn_update_link_status(struct hn_softc *); static void hn_change_network(struct hn_softc *); static void hn_link_taskfunc(void *, int); static void hn_netchg_init_taskfunc(void *, int); static void hn_netchg_status_taskfunc(void *, int); static void hn_link_status(struct hn_softc *); static int hn_create_rx_data(struct hn_softc *, int); static void hn_destroy_rx_data(struct hn_softc *); static int hn_check_iplen(const struct mbuf *, int); static int hn_set_rxfilter(struct hn_softc *, uint32_t); static int hn_rxfilter_config(struct hn_softc *); #ifndef RSS static int hn_rss_reconfig(struct hn_softc *); #endif static void hn_rss_ind_fixup(struct hn_softc *); static int hn_rxpkt(struct hn_rx_ring *, const void *, int, const struct hn_rxinfo *); static int hn_tx_ring_create(struct hn_softc *, int); static void hn_tx_ring_destroy(struct hn_tx_ring *); static int hn_create_tx_data(struct hn_softc *, int); static void hn_fixup_tx_data(struct hn_softc *); static void hn_destroy_tx_data(struct hn_softc *); static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); static void hn_txdesc_gc(struct hn_tx_ring *, struct hn_txdesc *); static int hn_encap(struct ifnet *, struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, struct hn_txdesc *); static void hn_set_chim_size(struct hn_softc *, int); static void hn_set_tso_maxsize(struct hn_softc *, int, int); static bool hn_tx_ring_pending(struct hn_tx_ring *); static void hn_tx_ring_qflush(struct hn_tx_ring *); static void hn_resume_tx(struct hn_softc *, int); static void hn_set_txagg(struct hn_softc *); static void *hn_try_txagg(struct ifnet *, struct hn_tx_ring *, struct hn_txdesc *, int); static int hn_get_txswq_depth(const struct hn_tx_ring *); static void hn_txpkt_done(struct hn_nvs_sendctx *, struct hn_softc *, struct vmbus_channel *, const void *, int); static int hn_txpkt_sglist(struct hn_tx_ring *, struct hn_txdesc *); static int hn_txpkt_chim(struct hn_tx_ring *, struct hn_txdesc *); static int hn_xmit(struct hn_tx_ring *, int); static void hn_xmit_taskfunc(void *, int); static void hn_xmit_txeof(struct hn_tx_ring *); static void hn_xmit_txeof_taskfunc(void *, int); #ifdef HN_IFSTART_SUPPORT static int hn_start_locked(struct hn_tx_ring *, int); static void hn_start_taskfunc(void *, int); static void hn_start_txeof(struct hn_tx_ring *); static void hn_start_txeof_taskfunc(void *, int); #endif SYSCTL_NODE(_hw, OID_AUTO, hn, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V network interface"); /* Trust tcp segements verification on host side. */ static int hn_trust_hosttcp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hosttcp, CTLFLAG_RDTUN, &hn_trust_hosttcp, 0, "Trust tcp segement verification on host side, " "when csum info is missing (global setting)"); /* Trust udp datagrams verification on host side. */ static int hn_trust_hostudp = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostudp, CTLFLAG_RDTUN, &hn_trust_hostudp, 0, "Trust udp datagram verification on host side, " "when csum info is missing (global setting)"); /* Trust ip packets verification on host side. */ static int hn_trust_hostip = 1; SYSCTL_INT(_hw_hn, OID_AUTO, trust_hostip, CTLFLAG_RDTUN, &hn_trust_hostip, 0, "Trust ip packet verification on host side, " "when csum info is missing (global setting)"); /* Limit TSO burst size */ static int hn_tso_maxlen = IP_MAXPACKET; SYSCTL_INT(_hw_hn, OID_AUTO, tso_maxlen, CTLFLAG_RDTUN, &hn_tso_maxlen, 0, "TSO burst limit"); /* Limit chimney send size */ static int hn_tx_chimney_size = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_chimney_size, CTLFLAG_RDTUN, &hn_tx_chimney_size, 0, "Chimney send packet size limit"); /* Limit the size of packet for direct transmission */ static int hn_direct_tx_size = HN_DIRECT_TX_SIZE_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, direct_tx_size, CTLFLAG_RDTUN, &hn_direct_tx_size, 0, "Size of the packet for direct transmission"); /* # of LRO entries per RX ring */ #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 static int hn_lro_entry_count = HN_LROENT_CNT_DEF; SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, &hn_lro_entry_count, 0, "LRO entry count"); #endif #endif static int hn_tx_taskq_cnt = 1; SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); #define HN_TX_TASKQ_M_INDEP 0 #define HN_TX_TASKQ_M_GLOBAL 1 #define HN_TX_TASKQ_M_EVTTQ 2 static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, &hn_tx_taskq_mode, 0, "TX taskqueue modes: " "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); #ifndef HN_USE_TXDESC_BUFRING static int hn_use_txdesc_bufring = 0; #else static int hn_use_txdesc_bufring = 1; #endif SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); #ifdef HN_IFSTART_SUPPORT /* Use ifnet.if_start instead of ifnet.if_transmit */ static int hn_use_if_start = 0; SYSCTL_INT(_hw_hn, OID_AUTO, use_if_start, CTLFLAG_RDTUN, &hn_use_if_start, 0, "Use if_start TX method"); #endif /* # of channels to use */ static int hn_chan_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, &hn_chan_cnt, 0, "# of channels to use; each channel has one RX ring and one TX ring"); /* # of transmit rings to use */ static int hn_tx_ring_cnt = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_ring_cnt, CTLFLAG_RDTUN, &hn_tx_ring_cnt, 0, "# of TX rings to use"); /* Software TX ring deptch */ static int hn_tx_swq_depth = 0; SYSCTL_INT(_hw_hn, OID_AUTO, tx_swq_depth, CTLFLAG_RDTUN, &hn_tx_swq_depth, 0, "Depth of IFQ or BUFRING"); /* Enable sorted LRO, and the depth of the per-channel mbuf queue */ #if __FreeBSD_version >= 1100095 static u_int hn_lro_mbufq_depth = 0; SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); #endif /* Packet transmission aggregation size limit */ static int hn_tx_agg_size = -1; SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); /* Packet transmission aggregation count limit */ static int hn_tx_agg_pkts = -1; SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); /* VF list */ SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING, 0, 0, hn_vflist_sysctl, "A", "VF list"); /* VF mapping */ SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING, 0, 0, hn_vfmap_sysctl, "A", "VF mapping"); /* Transparent VF */ static int hn_xpnt_vf = 0; SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN, &hn_xpnt_vf, 0, "Transparent VF mod"); /* Accurate BPF support for Transparent VF */ static int hn_xpnt_vf_accbpf = 0; SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN, &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF"); /* Extra wait for transparent VF attach routing; unit seconds. */ static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN, &hn_xpnt_vf_attwait, 0, "Extra wait for transparent VF attach routing; unit: seconds"); static u_int hn_cpu_index; /* next CPU for channel */ static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ static struct rmlock hn_vfmap_lock; static int hn_vfmap_size; static struct ifnet **hn_vfmap; #ifndef RSS static const uint8_t hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa }; #endif /* !RSS */ static const struct hyperv_guid hn_guid = { .hv_guid = { 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46, 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e } }; static device_method_t hn_methods[] = { /* Device interface */ DEVMETHOD(device_probe, hn_probe), DEVMETHOD(device_attach, hn_attach), DEVMETHOD(device_detach, hn_detach), DEVMETHOD(device_shutdown, hn_shutdown), DEVMETHOD_END }; static driver_t hn_driver = { "hn", hn_methods, sizeof(struct hn_softc) }; static devclass_t hn_devclass; DRIVER_MODULE(hn, vmbus, hn_driver, hn_devclass, 0, 0); MODULE_VERSION(hn, 1); MODULE_DEPEND(hn, vmbus, 1, 1, 1); #if __FreeBSD_version >= 1100099 static void hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) { int i; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; } #endif static int hn_txpkt_sglist(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID && txd->chim_size == 0, ("invalid rndis sglist txd")); return (hn_nvs_send_rndis_sglist(txr->hn_chan, HN_NVS_RNDIS_MTYPE_DATA, &txd->send_ctx, txr->hn_gpa, txr->hn_gpa_cnt)); } static int hn_txpkt_chim(struct hn_tx_ring *txr, struct hn_txdesc *txd) { struct hn_nvs_rndis rndis; KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID && txd->chim_size > 0, ("invalid rndis chim txd")); rndis.nvs_type = HN_NVS_TYPE_RNDIS; rndis.nvs_rndis_mtype = HN_NVS_RNDIS_MTYPE_DATA; rndis.nvs_chim_idx = txd->chim_index; rndis.nvs_chim_sz = txd->chim_size; return (hn_nvs_send(txr->hn_chan, VMBUS_CHANPKT_FLAG_RC, &rndis, sizeof(rndis), &txd->send_ctx)); } static __inline uint32_t hn_chim_alloc(struct hn_softc *sc) { int i, bmap_cnt = sc->hn_chim_bmap_cnt; u_long *bmap = sc->hn_chim_bmap; uint32_t ret = HN_NVS_CHIM_IDX_INVALID; for (i = 0; i < bmap_cnt; ++i) { int idx; idx = ffsl(~bmap[i]); if (idx == 0) continue; --idx; /* ffsl is 1-based */ KASSERT(i * LONG_BIT + idx < sc->hn_chim_cnt, ("invalid i %d and idx %d", i, idx)); if (atomic_testandset_long(&bmap[i], idx)) continue; ret = i * LONG_BIT + idx; break; } return (ret); } static __inline void hn_chim_free(struct hn_softc *sc, uint32_t chim_idx) { u_long mask; uint32_t idx; idx = chim_idx / LONG_BIT; KASSERT(idx < sc->hn_chim_bmap_cnt, ("invalid chimney index 0x%x", chim_idx)); mask = 1UL << (chim_idx % LONG_BIT); KASSERT(sc->hn_chim_bmap[idx] & mask, ("index bitmap 0x%lx, chimney index %u, " "bitmap idx %d, bitmask 0x%lx", sc->hn_chim_bmap[idx], chim_idx, idx, mask)); atomic_clear_long(&sc->hn_chim_bmap[idx], mask); } #if defined(INET6) || defined(INET) #define PULLUP_HDR(m, len) \ do { \ if (__predict_false((m)->m_len < (len))) { \ (m) = m_pullup((m), (len)); \ if ((m) == NULL) \ return (NULL); \ } \ } while (0) /* * NOTE: If this function failed, the m_head would be freed. */ static __inline struct mbuf * hn_tso_fixup(struct mbuf *m_head) { struct ether_vlan_header *evl; struct tcphdr *th; int ehlen; KASSERT(M_WRITABLE(m_head), ("TSO mbuf not writable")); PULLUP_HDR(m_head, sizeof(*evl)); evl = mtod(m_head, struct ether_vlan_header *); if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ehlen = ETHER_HDR_LEN; #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { struct ip *ip; int iphlen; PULLUP_HDR(m_head, ehlen + sizeof(*ip)); ip = mtodo(m_head, ehlen); iphlen = ip->ip_hl << 2; PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); th = mtodo(m_head, ehlen + iphlen); ip->ip_len = 0; ip->ip_sum = 0; th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { struct ip6_hdr *ip6; PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); ip6 = mtodo(m_head, ehlen); if (ip6->ip6_nxt != IPPROTO_TCP) { m_freem(m_head); return (NULL); } PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); th = mtodo(m_head, ehlen + sizeof(*ip6)); ip6->ip6_plen = 0; th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); } #endif return (m_head); } /* * NOTE: If this function failed, the m_head would be freed. */ static __inline struct mbuf * hn_check_tcpsyn(struct mbuf *m_head, int *tcpsyn) { const struct ether_vlan_header *evl; const struct tcphdr *th; int ehlen; *tcpsyn = 0; PULLUP_HDR(m_head, sizeof(*evl)); evl = mtod(m_head, const struct ether_vlan_header *); if (evl->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; else ehlen = ETHER_HDR_LEN; #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TCP) { const struct ip *ip; int iphlen; PULLUP_HDR(m_head, ehlen + sizeof(*ip)); ip = mtodo(m_head, ehlen); iphlen = ip->ip_hl << 2; PULLUP_HDR(m_head, ehlen + iphlen + sizeof(*th)); th = mtodo(m_head, ehlen + iphlen); if (th->th_flags & TH_SYN) *tcpsyn = 1; } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { const struct ip6_hdr *ip6; PULLUP_HDR(m_head, ehlen + sizeof(*ip6)); ip6 = mtodo(m_head, ehlen); if (ip6->ip6_nxt != IPPROTO_TCP) return (m_head); PULLUP_HDR(m_head, ehlen + sizeof(*ip6) + sizeof(*th)); th = mtodo(m_head, ehlen + sizeof(*ip6)); if (th->th_flags & TH_SYN) *tcpsyn = 1; } #endif return (m_head); } #undef PULLUP_HDR #endif /* INET6 || INET */ static int hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) { int error = 0; HN_LOCK_ASSERT(sc); if (sc->hn_rx_filter != filter) { error = hn_rndis_set_rxfilter(sc, filter); if (!error) sc->hn_rx_filter = filter; } return (error); } static int hn_rxfilter_config(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; uint32_t filter; HN_LOCK_ASSERT(sc); /* * If the non-transparent mode VF is activated, we don't know how * its RX filter is configured, so stick the synthetic device in * the promiscous mode. */ if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) { filter = NDIS_PACKET_TYPE_PROMISCUOUS; } else { filter = NDIS_PACKET_TYPE_DIRECTED; if (ifp->if_flags & IFF_BROADCAST) filter |= NDIS_PACKET_TYPE_BROADCAST; /* TODO: support multicast list */ if ((ifp->if_flags & IFF_ALLMULTI) || !TAILQ_EMPTY(&ifp->if_multiaddrs)) filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; } return (hn_set_rxfilter(sc, filter)); } static void hn_set_txagg(struct hn_softc *sc) { uint32_t size, pkts; int i; /* * Setup aggregation size. */ if (sc->hn_agg_size < 0) size = UINT32_MAX; else size = sc->hn_agg_size; if (sc->hn_rndis_agg_size < size) size = sc->hn_rndis_agg_size; /* NOTE: We only aggregate packets using chimney sending buffers. */ if (size > (uint32_t)sc->hn_chim_szmax) size = sc->hn_chim_szmax; if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { /* Disable */ size = 0; pkts = 0; goto done; } /* NOTE: Type of the per TX ring setting is 'int'. */ if (size > INT_MAX) size = INT_MAX; /* * Setup aggregation packet count. */ if (sc->hn_agg_pkts < 0) pkts = UINT32_MAX; else pkts = sc->hn_agg_pkts; if (sc->hn_rndis_agg_pkts < pkts) pkts = sc->hn_rndis_agg_pkts; if (pkts <= 1) { /* Disable */ size = 0; pkts = 0; goto done; } /* NOTE: Type of the per TX ring setting is 'short'. */ if (pkts > SHRT_MAX) pkts = SHRT_MAX; done: /* NOTE: Type of the per TX ring setting is 'short'. */ if (sc->hn_rndis_agg_align > SHRT_MAX) { /* Disable */ size = 0; pkts = 0; } if (bootverbose) { if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", size, pkts, sc->hn_rndis_agg_align); } for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_agg_szmax = size; txr->hn_agg_pktmax = pkts; txr->hn_agg_align = sc->hn_rndis_agg_align; mtx_unlock(&txr->hn_tx_lock); } } static int hn_get_txswq_depth(const struct hn_tx_ring *txr) { KASSERT(txr->hn_txdesc_cnt > 0, ("tx ring is not setup yet")); if (hn_tx_swq_depth < txr->hn_txdesc_cnt) return txr->hn_txdesc_cnt; return hn_tx_swq_depth; } #ifndef RSS static int hn_rss_reconfig(struct hn_softc *sc) { int error; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return (ENXIO); /* * Disable RSS first. * * NOTE: * Direct reconfiguration by setting the UNCHG flags does * _not_ work properly. */ if (bootverbose) if_printf(sc->hn_ifp, "disable RSS\n"); error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_DISABLE); if (error) { if_printf(sc->hn_ifp, "RSS disable failed\n"); return (error); } /* * Reenable the RSS w/ the updated RSS key or indirect * table. */ if (bootverbose) if_printf(sc->hn_ifp, "reconfig RSS\n"); error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); if (error) { if_printf(sc->hn_ifp, "RSS reconfig failed\n"); return (error); } return (0); } #endif /* !RSS */ static void hn_rss_ind_fixup(struct hn_softc *sc) { struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int i, nchan; nchan = sc->hn_rx_ring_inuse; KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); /* * Check indirect table to make sure that all channels in it * can be used. */ for (i = 0; i < NDIS_HASH_INDCNT; ++i) { if (rss->rss_ind[i] >= nchan) { if_printf(sc->hn_ifp, "RSS indirect table %d fixup: %u -> %d\n", i, rss->rss_ind[i], nchan - 1); rss->rss_ind[i] = nchan - 1; } } } static int hn_ifmedia_upd(struct ifnet *ifp __unused) { return EOPNOTSUPP; } static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { struct hn_softc *sc = ifp->if_softc; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if ((sc->hn_link_flags & HN_LINK_FLAG_LINKUP) == 0) { ifmr->ifm_active |= IFM_NONE; return; } ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_10G_T | IFM_FDX; } static void hn_rxvf_set_task(void *xarg, int pending __unused) { struct hn_rxvf_setarg *arg = xarg; arg->rxr->hn_rxvf_ifp = arg->vf_ifp; } static void hn_rxvf_set(struct hn_softc *sc, struct ifnet *vf_ifp) { struct hn_rx_ring *rxr; struct hn_rxvf_setarg arg; struct task task; int i; HN_LOCK_ASSERT(sc); TASK_INIT(&task, 0, hn_rxvf_set_task, &arg); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; if (i < sc->hn_rx_ring_inuse) { arg.rxr = rxr; arg.vf_ifp = vf_ifp; vmbus_chan_run_task(rxr->hn_chan, &task); } else { rxr->hn_rxvf_ifp = vf_ifp; } } } static bool hn_ismyvf(const struct hn_softc *sc, const struct ifnet *ifp) { const struct ifnet *hn_ifp; hn_ifp = sc->hn_ifp; if (ifp == hn_ifp) return (false); if (ifp->if_alloctype != IFT_ETHER) return (false); /* Ignore lagg/vlan interfaces */ if (strcmp(ifp->if_dname, "lagg") == 0 || strcmp(ifp->if_dname, "vlan") == 0) return (false); if (bcmp(IF_LLADDR(ifp), IF_LLADDR(hn_ifp), ETHER_ADDR_LEN) != 0) return (false); return (true); } static void hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf) { struct ifnet *hn_ifp; HN_LOCK(sc); if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) goto out; if (!hn_ismyvf(sc, ifp)) goto out; hn_ifp = sc->hn_ifp; if (rxvf) { if (sc->hn_flags & HN_FLAG_RXVF) goto out; sc->hn_flags |= HN_FLAG_RXVF; hn_rxfilter_config(sc); } else { if (!(sc->hn_flags & HN_FLAG_RXVF)) goto out; sc->hn_flags &= ~HN_FLAG_RXVF; if (hn_ifp->if_drv_flags & IFF_DRV_RUNNING) hn_rxfilter_config(sc); else hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); } hn_nvs_set_datapath(sc, rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH); hn_rxvf_set(sc, rxvf ? ifp : NULL); if (rxvf) { hn_suspend_mgmt(sc); sc->hn_link_flags &= ~(HN_LINK_FLAG_LINKUP | HN_LINK_FLAG_NETCHG); if_link_state_change(hn_ifp, LINK_STATE_DOWN); } else { hn_resume_mgmt(sc); } devctl_notify("HYPERV_NIC_VF", hn_ifp->if_xname, rxvf ? "VF_UP" : "VF_DOWN", NULL); if (bootverbose) { if_printf(hn_ifp, "datapath is switched %s %s\n", rxvf ? "to" : "from", ifp->if_xname); } out: HN_UNLOCK(sc); } static void hn_ifnet_event(void *arg, struct ifnet *ifp, int event) { if (event != IFNET_EVENT_UP && event != IFNET_EVENT_DOWN) return; hn_rxvf_change(arg, ifp, event == IFNET_EVENT_UP); } static void hn_ifaddr_event(void *arg, struct ifnet *ifp) { hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP); } static int hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr) { struct ifnet *ifp, *vf_ifp; uint64_t tmp; int error; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; vf_ifp = sc->hn_vf_ifp; /* * Fix up requested capabilities w/ supported capabilities, * since the supported capabilities could have been changed. */ ifr->ifr_reqcap &= ifp->if_capabilities; /* Pass SIOCSIFCAP to VF. */ error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr); /* * NOTE: * The error will be propagated to the callers, however, it * is _not_ useful here. */ /* * Merge VF's enabled capabilities. */ ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities; tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc); if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= tmp; else ifp->if_hwassist &= ~tmp; tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc); if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= tmp; else ifp->if_hwassist &= ~tmp; tmp = vf_ifp->if_hwassist & CSUM_IP_TSO; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= tmp; else ifp->if_hwassist &= ~tmp; tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= tmp; else ifp->if_hwassist &= ~tmp; return (error); } static int hn_xpnt_vf_iocsetflags(struct hn_softc *sc) { struct ifnet *vf_ifp; struct ifreq ifr; HN_LOCK_ASSERT(sc); vf_ifp = sc->hn_vf_ifp; memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_flags = vf_ifp->if_flags & 0xffff; ifr.ifr_flagshigh = vf_ifp->if_flags >> 16; return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr)); } static void hn_xpnt_vf_saveifflags(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; int allmulti = 0; HN_LOCK_ASSERT(sc); /* XXX vlan(4) style mcast addr maintenance */ if (!TAILQ_EMPTY(&ifp->if_multiaddrs)) allmulti = IFF_ALLMULTI; /* Always set the VF's if_flags */ sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti; } static void hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m) { struct rm_priotracker pt; struct ifnet *hn_ifp = NULL; struct mbuf *mn; /* * XXX racy, if hn(4) ever detached. */ rm_rlock(&hn_vfmap_lock, &pt); if (vf_ifp->if_index < hn_vfmap_size) hn_ifp = hn_vfmap[vf_ifp->if_index]; rm_runlock(&hn_vfmap_lock, &pt); if (hn_ifp != NULL) { for (mn = m; mn != NULL; mn = mn->m_nextpkt) { /* * Allow tapping on the VF. */ ETHER_BPF_MTAP(vf_ifp, mn); /* * Update VF stats. */ if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) { if_inc_counter(vf_ifp, IFCOUNTER_IBYTES, mn->m_pkthdr.len); } /* * XXX IFCOUNTER_IMCAST * This stat updating is kinda invasive, since it * requires two checks on the mbuf: the length check * and the ethernet header check. As of this write, * all multicast packets go directly to hn(4), which * makes imcast stat updating in the VF a try in vian. */ /* * Fix up rcvif and increase hn(4)'s ipackets. */ mn->m_pkthdr.rcvif = hn_ifp; if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); } /* * Go through hn(4)'s if_input. */ hn_ifp->if_input(hn_ifp, m); } else { /* * In the middle of the transition; free this * mbuf chain. */ while (m != NULL) { mn = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); m = mn; } } } static void hn_mtu_change_fixup(struct hn_softc *sc) { struct ifnet *ifp; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu); #if __FreeBSD_version >= 1100099 if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp)) hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp)); #endif } static void hn_xpnt_vf_setready(struct hn_softc *sc) { struct ifnet *ifp, *vf_ifp; struct ifreq ifr; HN_LOCK_ASSERT(sc); ifp = sc->hn_ifp; vf_ifp = sc->hn_vf_ifp; /* * Mark the VF ready. */ sc->hn_vf_rdytick = 0; /* * Save information for restoration. */ sc->hn_saved_caps = ifp->if_capabilities; sc->hn_saved_tsomax = ifp->if_hw_tsomax; sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount; sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize; /* * Intersect supported/enabled capabilities. * * NOTE: * if_hwassist is not changed here. */ ifp->if_capabilities &= vf_ifp->if_capabilities; ifp->if_capenable &= ifp->if_capabilities; /* * Fix TSO settings. */ if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax) ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax; if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount) ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount; if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize) ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize; /* * Change VF's enabled capabilities. */ memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_reqcap = ifp->if_capenable; hn_xpnt_vf_iocsetcaps(sc, &ifr); if (ifp->if_mtu != ETHERMTU) { int error; /* * Change VF's MTU. */ memset(&ifr, 0, sizeof(ifr)); strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name)); ifr.ifr_mtu = ifp->if_mtu; error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr); if (error) { if_printf(ifp, "%s SIOCSIFMTU %u failed\n", vf_ifp->if_xname, ifp->if_mtu); if (ifp->if_mtu > ETHERMTU) { if_printf(ifp, "change MTU to %d\n", ETHERMTU); /* * XXX * No need to adjust the synthetic parts' MTU; * failure of the adjustment will cause us * infinite headache. */ ifp->if_mtu = ETHERMTU; hn_mtu_change_fixup(sc); } } } } static bool hn_xpnt_vf_isready(struct hn_softc *sc) { HN_LOCK_ASSERT(sc); if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL) return (false); if (sc->hn_vf_rdytick == 0) return (true); if (sc->hn_vf_rdytick > ticks) return (false); /* Mark VF as ready. */ hn_xpnt_vf_setready(sc); return (true); } static void hn_xpnt_vf_setenable(struct hn_softc *sc) { int i; HN_LOCK_ASSERT(sc); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED; rm_wunlock(&sc->hn_vf_lock); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF; } static void hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf) { int i; HN_LOCK_ASSERT(sc); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED; if (clear_vf) sc->hn_vf_ifp = NULL; rm_wunlock(&sc->hn_vf_lock); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF; } static void hn_xpnt_vf_init(struct hn_softc *sc) { int error; HN_LOCK_ASSERT(sc); KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); if (bootverbose) { if_printf(sc->hn_ifp, "try bringing up %s\n", sc->hn_vf_ifp->if_xname); } /* * Bring the VF up. */ hn_xpnt_vf_saveifflags(sc); sc->hn_vf_ifp->if_flags |= IFF_UP; error = hn_xpnt_vf_iocsetflags(sc); if (error) { if_printf(sc->hn_ifp, "bringing up %s failed: %d\n", sc->hn_vf_ifp->if_xname, error); return; } /* * NOTE: * Datapath setting must happen _after_ bringing the VF up. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); /* Mark transparent mode VF as enabled. */ hn_xpnt_vf_setenable(sc); } static void hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) goto done; if (sc->hn_vf_ifp == NULL) goto done; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) goto done; if (sc->hn_vf_rdytick != 0) { /* Mark VF as ready. */ hn_xpnt_vf_setready(sc); } if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) { /* * Delayed VF initialization. */ if (bootverbose) { if_printf(sc->hn_ifp, "delayed initialize %s\n", sc->hn_vf_ifp->if_xname); } hn_xpnt_vf_init(sc); } done: HN_UNLOCK(sc); } static void hn_ifnet_attevent(void *xsc, struct ifnet *ifp) { struct hn_softc *sc = xsc; HN_LOCK(sc); if (!(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) goto done; if (!hn_ismyvf(sc, ifp)) goto done; if (sc->hn_vf_ifp != NULL) { if_printf(sc->hn_ifp, "%s was attached as VF\n", sc->hn_vf_ifp->if_xname); goto done; } if (hn_xpnt_vf && ifp->if_start != NULL) { /* * ifnet.if_start is _not_ supported by transparent * mode VF; mainly due to the IFF_DRV_OACTIVE flag. */ if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported " "in transparent VF mode.\n", ifp->if_xname); goto done; } rm_wlock(&hn_vfmap_lock); if (ifp->if_index >= hn_vfmap_size) { struct ifnet **newmap; int newsize; newsize = ifp->if_index + HN_VFMAP_SIZE_DEF; newmap = malloc(sizeof(struct ifnet *) * newsize, M_DEVBUF, M_WAITOK | M_ZERO); memcpy(newmap, hn_vfmap, sizeof(struct ifnet *) * hn_vfmap_size); free(hn_vfmap, M_DEVBUF); hn_vfmap = newmap; hn_vfmap_size = newsize; } KASSERT(hn_vfmap[ifp->if_index] == NULL, ("%s: ifindex %d was mapped to %s", ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); hn_vfmap[ifp->if_index] = sc->hn_ifp; rm_wunlock(&hn_vfmap_lock); /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */ rm_wlock(&sc->hn_vf_lock); KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0, ("%s: transparent VF was enabled", sc->hn_ifp->if_xname)); sc->hn_vf_ifp = ifp; rm_wunlock(&sc->hn_vf_lock); if (hn_xpnt_vf) { int wait_ticks; /* * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp. * Save vf_ifp's current if_input for later restoration. */ sc->hn_vf_input = ifp->if_input; ifp->if_input = hn_xpnt_vf_input; /* * Stop link status management; use the VF's. */ hn_suspend_mgmt(sc); /* * Give VF sometime to complete its attach routing. */ wait_ticks = hn_xpnt_vf_attwait * hz; sc->hn_vf_rdytick = ticks + wait_ticks; taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init, wait_ticks); } done: HN_UNLOCK(sc); } static void hn_ifnet_detevent(void *xsc, struct ifnet *ifp) { struct hn_softc *sc = xsc; HN_LOCK(sc); if (sc->hn_vf_ifp == NULL) goto done; if (!hn_ismyvf(sc, ifp)) goto done; if (hn_xpnt_vf) { /* * Make sure that the delayed initialization is not running. * * NOTE: * - This lock _must_ be released, since the hn_vf_init task * will try holding this lock. * - It is safe to release this lock here, since the * hn_ifnet_attevent() is interlocked by the hn_vf_ifp. * * XXX racy, if hn(4) ever detached. */ HN_UNLOCK(sc); taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init); HN_LOCK(sc); KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved", sc->hn_ifp->if_xname)); ifp->if_input = sc->hn_vf_input; sc->hn_vf_input = NULL; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); if (sc->hn_vf_rdytick == 0) { /* * The VF was ready; restore some settings. */ sc->hn_ifp->if_capabilities = sc->hn_saved_caps; /* * NOTE: * There is _no_ need to fixup if_capenable and * if_hwassist, since the if_capabilities before * restoration was an intersection of the VF's * if_capabilites and the synthetic device's * if_capabilites. */ sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax; sc->hn_ifp->if_hw_tsomaxsegcount = sc->hn_saved_tsosegcnt; sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz; } /* * Resume link status management, which was suspended * by hn_ifnet_attevent(). */ hn_resume_mgmt(sc); } /* Mark transparent mode VF as disabled. */ hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */); rm_wlock(&hn_vfmap_lock); KASSERT(ifp->if_index < hn_vfmap_size, ("ifindex %d, vfmapsize %d", ifp->if_index, hn_vfmap_size)); if (hn_vfmap[ifp->if_index] != NULL) { KASSERT(hn_vfmap[ifp->if_index] == sc->hn_ifp, ("%s: ifindex %d was mapped to %s", ifp->if_xname, ifp->if_index, hn_vfmap[ifp->if_index]->if_xname)); hn_vfmap[ifp->if_index] = NULL; } rm_wunlock(&hn_vfmap_lock); done: HN_UNLOCK(sc); } static void hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state) { struct hn_softc *sc = xsc; if (sc->hn_vf_ifp == ifp) if_link_state_change(sc->hn_ifp, link_state); } static int hn_probe(device_t dev) { if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) { device_set_desc(dev, "Hyper-V Network Interface"); return BUS_PROBE_DEFAULT; } return ENXIO; } static int hn_attach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; uint8_t eaddr[ETHER_ADDR_LEN]; struct ifnet *ifp = NULL; int error, ring_cnt, tx_ring_cnt; sc->hn_dev = dev; sc->hn_prichan = vmbus_get_channel(dev); HN_LOCK_INIT(sc); rm_init(&sc->hn_vf_lock, "hnvf"); if (hn_xpnt_vf && hn_xpnt_vf_accbpf) sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; /* * Initialize these tunables once. */ sc->hn_agg_size = hn_tx_agg_size; sc->hn_agg_pkts = hn_tx_agg_pkts; /* * Setup taskqueue for transmission. */ if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { int i; sc->hn_tx_taskqs = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), M_DEVBUF, M_WAITOK); for (i = 0; i < hn_tx_taskq_cnt; ++i) { sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_tx_taskqs[i]); taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, "%s tx%d", device_get_nameunit(dev), i); } } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { sc->hn_tx_taskqs = hn_tx_taskque; } /* * Setup taskqueue for mangement tasks, e.g. link status. */ sc->hn_mgmt_taskq0 = taskqueue_create("hn_mgmt", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_mgmt_taskq0); taskqueue_start_threads(&sc->hn_mgmt_taskq0, 1, PI_NET, "%s mgmt", device_get_nameunit(dev)); TASK_INIT(&sc->hn_link_task, 0, hn_link_taskfunc, sc); TASK_INIT(&sc->hn_netchg_init, 0, hn_netchg_init_taskfunc, sc); TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0, hn_netchg_status_taskfunc, sc); if (hn_xpnt_vf) { /* * Setup taskqueue for VF tasks, e.g. delayed VF bringing up. */ sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK, taskqueue_thread_enqueue, &sc->hn_vf_taskq); taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf", device_get_nameunit(dev)); TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0, hn_xpnt_vf_init_taskfunc, sc); } /* * Allocate ifnet and setup its name earlier, so that if_printf * can be used by functions, which will be called after * ether_ifattach(). */ ifp = sc->hn_ifp = if_alloc(IFT_ETHER); ifp->if_softc = sc; if_initname(ifp, device_get_name(dev), device_get_unit(dev)); /* * Initialize ifmedia earlier so that it can be unconditionally * destroyed, if error happened later on. */ ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); /* * Figure out the # of RX rings (ring_cnt) and the # of TX rings * to use (tx_ring_cnt). * * NOTE: * The # of RX rings to use is same as the # of channels to use. */ ring_cnt = hn_chan_cnt; if (ring_cnt <= 0) { /* Default */ ring_cnt = mp_ncpus; if (ring_cnt > HN_RING_CNT_DEF_MAX) ring_cnt = HN_RING_CNT_DEF_MAX; } else if (ring_cnt > mp_ncpus) { ring_cnt = mp_ncpus; } #ifdef RSS if (ring_cnt > rss_getnumbuckets()) ring_cnt = rss_getnumbuckets(); #endif tx_ring_cnt = hn_tx_ring_cnt; if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt) tx_ring_cnt = ring_cnt; #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { /* ifnet.if_start only needs one TX ring. */ tx_ring_cnt = 1; } #endif /* * Set the leader CPU for channels. */ sc->hn_cpu = atomic_fetchadd_int(&hn_cpu_index, ring_cnt) % mp_ncpus; /* * Create enough TX/RX rings, even if only limited number of * channels can be allocated. */ error = hn_create_tx_data(sc, tx_ring_cnt); if (error) goto failed; error = hn_create_rx_data(sc, ring_cnt); if (error) goto failed; /* * Create transaction context for NVS and RNDIS transactions. */ sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); if (sc->hn_xact == NULL) { error = ENXIO; goto failed; } /* * Install orphan handler for the revocation of this device's * primary channel. * * NOTE: * The processing order is critical here: * Install the orphan handler, _before_ testing whether this * device's primary channel has been revoked or not. */ vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); if (vmbus_chan_is_revoked(sc->hn_prichan)) { error = ENXIO; goto failed; } /* * Attach the synthetic parts, i.e. NVS and RNDIS. */ error = hn_synth_attach(sc, ETHERMTU); if (error) goto failed; error = hn_rndis_get_eaddr(sc, eaddr); if (error) goto failed; #if __FreeBSD_version >= 1100099 if (sc->hn_rx_ring_inuse > 1) { /* * Reduce TCP segment aggregation limit for multiple * RX rings to increase ACK timeliness. */ hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MULTIRX_DEF); } #endif /* * Fixup TX stuffs after synthetic parts are attached. */ hn_fixup_tx_data(sc); ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "nvs_version", CTLFLAG_RD, &sc->hn_nvs_ver, 0, "NVS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "ndis_version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_ndis_version_sysctl, "A", "NDIS version"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "caps", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_caps_sysctl, "A", "capabilities"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_hwassist_sysctl, "A", "hwassist"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max", CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt", CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0, "max # of TSO segments"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz", CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0, "max size of TSO segment"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rxfilter_sysctl, "A", "rxfilter"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_hash", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rss_hash_sysctl, "A", "RSS hash"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rss_ind_size", CTLFLAG_RD, &sc->hn_rss_ind_size, 0, "RSS indirect entry count"); #ifndef RSS /* * Don't allow RSS key/indirect table changes, if RSS is defined. */ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_key", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_key_sysctl, "IU", "RSS key"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_ind_sysctl, "IU", "RSS indirect table"); #endif SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, "RNDIS offered packet transmission aggregation size limit"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, "RNDIS offered packet transmission aggregation count limit"); SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, "RNDIS packet transmission aggregation alignment"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_txagg_size_sysctl, "I", "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_txagg_pkts_sysctl, "I", "Packet transmission aggregation packets, " "0 -- disable, -1 -- auto"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_polling_sysctl, "I", "Polling frequency: [100,1000000], 0 disable polling"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_vf_sysctl, "A", "Virtual Function's name"); if (!hn_xpnt_vf) { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_rxvf_sysctl, "A", "activated Virtual Function's name"); } else { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_xpnt_vf_enabled_sysctl, "I", "Transparent VF enabled"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_xpnt_vf_accbpf_sysctl, "I", "Accurate BPF for transparent VF"); } /* * Setup the ifmedia, which has been initialized earlier. */ ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); /* XXX ifmedia_set really should do this for us */ sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; /* * Setup the ifnet for this interface. */ ifp->if_baudrate = IF_Gbps(10); ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = hn_ioctl; ifp->if_init = hn_init; #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { int qdepth = hn_get_txswq_depth(&sc->hn_tx_ring[0]); ifp->if_start = hn_start; IFQ_SET_MAXLEN(&ifp->if_snd, qdepth); ifp->if_snd.ifq_drv_maxlen = qdepth - 1; IFQ_SET_READY(&ifp->if_snd); } else #endif { ifp->if_transmit = hn_transmit; ifp->if_qflush = hn_xmit_qflush; } ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE; #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ ifp->if_capabilities |= IFCAP_RXCSUM_IPV6; #endif if (sc->hn_caps & HN_CAP_VLAN) { /* XXX not sure about VLAN_MTU. */ ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU; } ifp->if_hwassist = sc->hn_tx_ring[0].hn_csum_assist; if (ifp->if_hwassist & HN_CSUM_IP_MASK) ifp->if_capabilities |= IFCAP_TXCSUM; if (ifp->if_hwassist & HN_CSUM_IP6_MASK) ifp->if_capabilities |= IFCAP_TXCSUM_IPV6; if (sc->hn_caps & HN_CAP_TSO4) { ifp->if_capabilities |= IFCAP_TSO4; ifp->if_hwassist |= CSUM_IP_TSO; } if (sc->hn_caps & HN_CAP_TSO6) { ifp->if_capabilities |= IFCAP_TSO6; ifp->if_hwassist |= CSUM_IP6_TSO; } /* Enable all available capabilities by default. */ ifp->if_capenable = ifp->if_capabilities; /* * Disable IPv6 TSO and TXCSUM by default, they still can * be enabled through SIOCSIFCAP. */ ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { /* * Lock hn_set_tso_maxsize() to simplify its * internal logic. */ HN_LOCK(sc); hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); HN_UNLOCK(sc); ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; ifp->if_hw_tsomaxsegsize = PAGE_SIZE; } ether_ifattach(ifp, eaddr); if ((ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) && bootverbose) { if_printf(ifp, "TSO segcnt %u segsz %u\n", ifp->if_hw_tsomaxsegcount, ifp->if_hw_tsomaxsegsize); } /* Inform the upper layer about the long frame support. */ ifp->if_hdrlen = sizeof(struct ether_vlan_header); /* * Kick off link status check. */ sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; hn_update_link_status(sc); if (!hn_xpnt_vf) { sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event, hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY); sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event, hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY); } else { sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event, hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY); } /* * NOTE: * Subscribe ether_ifattach event, instead of ifnet_arrival event, * since interface's LLADDR is needed; interface LLADDR is not * available when ifnet_arrival event is triggered. */ sc->hn_ifnet_atthand = EVENTHANDLER_REGISTER(ether_ifattach_event, hn_ifnet_attevent, sc, EVENTHANDLER_PRI_ANY); sc->hn_ifnet_dethand = EVENTHANDLER_REGISTER(ifnet_departure_event, hn_ifnet_detevent, sc, EVENTHANDLER_PRI_ANY); return (0); failed: if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) hn_synth_detach(sc); hn_detach(dev); return (error); } static int hn_detach(device_t dev) { struct hn_softc *sc = device_get_softc(dev); struct ifnet *ifp = sc->hn_ifp, *vf_ifp; if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { /* * In case that the vmbus missed the orphan handler * installation. */ vmbus_xact_ctx_orphan(sc->hn_xact); } if (sc->hn_ifaddr_evthand != NULL) EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand); if (sc->hn_ifnet_evthand != NULL) EVENTHANDLER_DEREGISTER(ifnet_event, sc->hn_ifnet_evthand); if (sc->hn_ifnet_atthand != NULL) { EVENTHANDLER_DEREGISTER(ether_ifattach_event, sc->hn_ifnet_atthand); } if (sc->hn_ifnet_dethand != NULL) { EVENTHANDLER_DEREGISTER(ifnet_departure_event, sc->hn_ifnet_dethand); } if (sc->hn_ifnet_lnkhand != NULL) EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand); vf_ifp = sc->hn_vf_ifp; __compiler_membar(); if (vf_ifp != NULL) hn_ifnet_detevent(sc, vf_ifp); if (device_is_attached(dev)) { HN_LOCK(sc); if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_stop(sc, true); /* * NOTE: * hn_stop() only suspends data, so managment * stuffs have to be suspended manually here. */ hn_suspend_mgmt(sc); hn_synth_detach(sc); } HN_UNLOCK(sc); ether_ifdetach(ifp); } ifmedia_removeall(&sc->hn_media); hn_destroy_rx_data(sc); hn_destroy_tx_data(sc); if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { int i; for (i = 0; i < hn_tx_taskq_cnt; ++i) taskqueue_free(sc->hn_tx_taskqs[i]); free(sc->hn_tx_taskqs, M_DEVBUF); } taskqueue_free(sc->hn_mgmt_taskq0); if (sc->hn_vf_taskq != NULL) taskqueue_free(sc->hn_vf_taskq); if (sc->hn_xact != NULL) { /* * Uninstall the orphan handler _before_ the xact is * destructed. */ vmbus_chan_unset_orphan(sc->hn_prichan); vmbus_xact_ctx_destroy(sc->hn_xact); } if_free(ifp); HN_LOCK_DESTROY(sc); rm_destroy(&sc->hn_vf_lock); return (0); } static int hn_shutdown(device_t dev) { return (0); } static void hn_link_status(struct hn_softc *sc) { uint32_t link_status; int error; error = hn_rndis_get_linkstatus(sc, &link_status); if (error) { /* XXX what to do? */ return; } if (link_status == NDIS_MEDIA_STATE_CONNECTED) sc->hn_link_flags |= HN_LINK_FLAG_LINKUP; else sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; if_link_state_change(sc->hn_ifp, (sc->hn_link_flags & HN_LINK_FLAG_LINKUP) ? LINK_STATE_UP : LINK_STATE_DOWN); } static void hn_link_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) return; hn_link_status(sc); } static void hn_netchg_init_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; /* Prevent any link status checks from running. */ sc->hn_link_flags |= HN_LINK_FLAG_NETCHG; /* * Fake up a [link down --> link up] state change; 5 seconds * delay is used, which closely simulates miibus reaction * upon link down event. */ sc->hn_link_flags &= ~HN_LINK_FLAG_LINKUP; if_link_state_change(sc->hn_ifp, LINK_STATE_DOWN); taskqueue_enqueue_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 5 * hz); } static void hn_netchg_status_taskfunc(void *xsc, int pending __unused) { struct hn_softc *sc = xsc; /* Re-allow link status checks. */ sc->hn_link_flags &= ~HN_LINK_FLAG_NETCHG; hn_link_status(sc); } static void hn_update_link_status(struct hn_softc *sc) { if (sc->hn_mgmt_taskq != NULL) taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_link_task); } static void hn_change_network(struct hn_softc *sc) { if (sc->hn_mgmt_taskq != NULL) taskqueue_enqueue(sc->hn_mgmt_taskq, &sc->hn_netchg_init); } static __inline int hn_txdesc_dmamap_load(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head, bus_dma_segment_t *segs, int *nsegs) { struct mbuf *m = *m_head; int error; KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("txd uses chim")); error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); if (error == EFBIG) { struct mbuf *m_new; m_new = m_collapse(m, M_NOWAIT, HN_TX_DATA_SEGCNT_MAX); if (m_new == NULL) return ENOBUFS; else *m_head = m = m_new; txr->hn_tx_collapsed++; error = bus_dmamap_load_mbuf_sg(txr->hn_tx_data_dtag, txd->data_dmap, m, segs, nsegs, BUS_DMA_NOWAIT); } if (!error) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_PREWRITE); txd->flags |= HN_TXD_FLAG_DMAMAP; } return error; } static __inline int hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, ("put an onlist txd %#x", txd->flags)); KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("put an onagg txd %#x", txd->flags)); KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); if (atomic_fetchadd_int(&txd->refs, -1) != 1) return 0; if (!STAILQ_EMPTY(&txd->agg_list)) { struct hn_txdesc *tmp_txd; while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { int freed; KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), ("resursive aggregation on aggregated txdesc")); KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), ("not aggregated txdesc")); KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("aggregated txdesc uses dmamap")); KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("aggregated txdesc consumes " "chimney sending buffer")); KASSERT(tmp_txd->chim_size == 0, ("aggregated txdesc has non-zero " "chimney sending size")); STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; freed = hn_txdesc_put(txr, tmp_txd); KASSERT(freed, ("failed to free aggregated txdesc")); } } if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("chim txd uses dmamap")); hn_chim_free(txr->hn_sc, txd->chim_index); txd->chim_index = HN_NVS_CHIM_IDX_INVALID; txd->chim_size = 0; } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(txr->hn_tx_data_dtag, txd->data_dmap); txd->flags &= ~HN_TXD_FLAG_DMAMAP; } if (txd->m != NULL) { m_freem(txd->m); txd->m = NULL; } txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); KASSERT(txr->hn_txdesc_avail >= 0 && txr->hn_txdesc_avail < txr->hn_txdesc_cnt, ("txdesc_put: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail++; SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); mtx_unlock_spin(&txr->hn_txlist_spin); #else /* HN_USE_TXDESC_BUFRING */ #ifdef HN_DEBUG atomic_add_int(&txr->hn_txdesc_avail, 1); #endif buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif /* !HN_USE_TXDESC_BUFRING */ return 1; } static __inline struct hn_txdesc * hn_txdesc_get(struct hn_tx_ring *txr) { struct hn_txdesc *txd; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); txd = SLIST_FIRST(&txr->hn_txlist); if (txd != NULL) { KASSERT(txr->hn_txdesc_avail > 0, ("txdesc_get: invalid txd avail %d", txr->hn_txdesc_avail)); txr->hn_txdesc_avail--; SLIST_REMOVE_HEAD(&txr->hn_txlist, link); } mtx_unlock_spin(&txr->hn_txlist_spin); #else txd = buf_ring_dequeue_sc(txr->hn_txdesc_br); #endif if (txd != NULL) { #ifdef HN_USE_TXDESC_BUFRING #ifdef HN_DEBUG atomic_subtract_int(&txr->hn_txdesc_avail, 1); #endif #endif /* HN_USE_TXDESC_BUFRING */ KASSERT(txd->m == NULL && txd->refs == 0 && STAILQ_EMPTY(&txd->agg_list) && txd->chim_index == HN_NVS_CHIM_IDX_INVALID && txd->chim_size == 0 && (txd->flags & HN_TXD_FLAG_ONLIST) && (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); txd->flags &= ~HN_TXD_FLAG_ONLIST; txd->refs = 1; } return txd; } static __inline void hn_txdesc_hold(struct hn_txdesc *txd) { /* 0->1 transition will never work */ KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); atomic_add_int(&txd->refs, 1); } static __inline void hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) { KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("recursive aggregation on aggregating txdesc")); KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, ("already aggregated")); KASSERT(STAILQ_EMPTY(&txd->agg_list), ("recursive aggregation on to-be-aggregated txdesc")); txd->flags |= HN_TXD_FLAG_ONAGG; STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); } static bool hn_tx_ring_pending(struct hn_tx_ring *txr) { bool pending = false; #ifndef HN_USE_TXDESC_BUFRING mtx_lock_spin(&txr->hn_txlist_spin); if (txr->hn_txdesc_avail != txr->hn_txdesc_cnt) pending = true; mtx_unlock_spin(&txr->hn_txlist_spin); #else if (!buf_ring_full(txr->hn_txdesc_br)) pending = true; #endif return (pending); } static __inline void hn_txeof(struct hn_tx_ring *txr) { txr->hn_has_txeof = 0; txr->hn_txeof(txr); } static void hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, struct vmbus_channel *chan, const void *data __unused, int dlen __unused) { struct hn_txdesc *txd = sndc->hn_cbarg; struct hn_tx_ring *txr; txr = txd->txr; KASSERT(txr->hn_chan == chan, ("channel mismatch, on chan%u, should be chan%u", vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); txr->hn_has_txeof = 1; hn_txdesc_put(txr, txd); ++txr->hn_txdone_cnt; if (txr->hn_txdone_cnt >= HN_EARLY_TXEOF_THRESH) { txr->hn_txdone_cnt = 0; if (txr->hn_oactive) hn_txeof(txr); } } static void hn_chan_rollup(struct hn_rx_ring *rxr, struct hn_tx_ring *txr) { #if defined(INET) || defined(INET6) tcp_lro_flush_all(&rxr->hn_lro); #endif /* * NOTE: * 'txr' could be NULL, if multiple channels and * ifnet.if_start method are enabled. */ if (txr == NULL || !txr->hn_has_txeof) return; txr->hn_txdone_cnt = 0; hn_txeof(txr); } static __inline uint32_t hn_rndis_pktmsg_offset(uint32_t ofs) { KASSERT(ofs >= sizeof(struct rndis_packet_msg), ("invalid RNDIS packet msg offset %u", ofs)); return (ofs - __offsetof(struct rndis_packet_msg, rm_dataoffset)); } static __inline void * hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, size_t pi_dlen, uint32_t pi_type) { const size_t pi_size = HN_RNDIS_PKTINFO_SIZE(pi_dlen); struct rndis_pktinfo *pi; KASSERT((pi_size & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK) == 0, ("unaligned pktinfo size %zu, pktinfo dlen %zu", pi_size, pi_dlen)); /* * Per-packet-info does not move; it only grows. * * NOTE: * rm_pktinfooffset in this phase counts from the beginning * of rndis_packet_msg. */ KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <= pktsize, ("%u pktinfo overflows RNDIS packet msg", pi_type)); pi = (struct rndis_pktinfo *)((uint8_t *)pkt + pkt->rm_pktinfooffset + pkt->rm_pktinfolen); pkt->rm_pktinfolen += pi_size; pi->rm_size = pi_size; pi->rm_type = pi_type; pi->rm_pktinfooffset = RNDIS_PKTINFO_OFFSET; return (pi->rm_data); } static __inline int hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) { struct hn_txdesc *txd; struct mbuf *m; int error, pkts; txd = txr->hn_agg_txd; KASSERT(txd != NULL, ("no aggregate txdesc")); /* * Since hn_txpkt() will reset this temporary stat, save * it now, so that oerrors can be updated properly, if * hn_txpkt() ever fails. */ pkts = txr->hn_stat_pkts; /* * Since txd's mbuf will _not_ be freed upon hn_txpkt() * failure, save it for later freeing, if hn_txpkt() ever * fails. */ m = txd->m; error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m is not. */ m_freem(m); txr->hn_flush_failed++; if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); } /* Reset all aggregation states. */ txr->hn_agg_txd = NULL; txr->hn_agg_szleft = 0; txr->hn_agg_pktleft = 0; txr->hn_agg_prevpkt = NULL; return (error); } static void * hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, int pktsize) { void *chim; if (txr->hn_agg_txd != NULL) { if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { struct hn_txdesc *agg_txd = txr->hn_agg_txd; struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; int olen; /* * Update the previous RNDIS packet's total length, * it can be increased due to the mandatory alignment * padding for this RNDIS packet. And update the * aggregating txdesc's chimney sending buffer size * accordingly. * * XXX * Zero-out the padding, as required by the RNDIS spec. */ olen = pkt->rm_len; pkt->rm_len = roundup2(olen, txr->hn_agg_align); agg_txd->chim_size += pkt->rm_len - olen; /* Link this txdesc to the parent. */ hn_txdesc_agg(agg_txd, txd); chim = (uint8_t *)pkt + pkt->rm_len; /* Save the current packet for later fixup. */ txr->hn_agg_prevpkt = chim; txr->hn_agg_pktleft--; txr->hn_agg_szleft -= pktsize; if (txr->hn_agg_szleft <= HN_PKTSIZE_MIN(txr->hn_agg_align)) { /* * Probably can't aggregate more packets, * flush this aggregating txdesc proactively. */ txr->hn_agg_pktleft = 0; } /* Done! */ return (chim); } hn_flush_txagg(ifp, txr); } KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); txr->hn_tx_chimney_tried++; txd->chim_index = hn_chim_alloc(txr->hn_sc); if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) return (NULL); txr->hn_tx_chimney++; chim = txr->hn_sc->hn_chim + (txd->chim_index * txr->hn_sc->hn_chim_szmax); if (txr->hn_agg_pktmax > 1 && txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { txr->hn_agg_txd = txd; txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; txr->hn_agg_prevpkt = chim; } return (chim); } /* * NOTE: * If this function fails, then both txd and m_head0 will be freed. */ static int hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) { bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; int error, nsegs, i; struct mbuf *m_head = *m_head0; struct rndis_packet_msg *pkt; uint32_t *pi_data; void *chim = NULL; int pkt_hlen, pkt_size; pkt = txd->rndis_pkt; pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); if (pkt_size < txr->hn_chim_size) { chim = hn_try_txagg(ifp, txr, txd, pkt_size); if (chim != NULL) pkt = chim; } else { if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); } pkt->rm_type = REMOTE_NDIS_PACKET_MSG; pkt->rm_len = m_head->m_pkthdr.len; pkt->rm_dataoffset = 0; pkt->rm_datalen = m_head->m_pkthdr.len; pkt->rm_oobdataoffset = 0; pkt->rm_oobdatalen = 0; pkt->rm_oobdataelements = 0; pkt->rm_pktinfooffset = sizeof(*pkt); pkt->rm_pktinfolen = 0; pkt->rm_vchandle = 0; pkt->rm_reserved = 0; if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { /* * Set the hash value for this packet, so that the host could * dispatch the TX done event for this packet back to this TX * ring's channel. */ pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, HN_NDIS_HASH_VALUE_SIZE, HN_NDIS_PKTINFO_TYPE_HASHVAL); *pi_data = txr->hn_tx_idx; } if (m_head->m_flags & M_VLANTAG) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN); *pi_data = NDIS_VLAN_INFO_MAKE( EVL_VLANOFTAG(m_head->m_pkthdr.ether_vtag), EVL_PRIOFTAG(m_head->m_pkthdr.ether_vtag), EVL_CFIOFTAG(m_head->m_pkthdr.ether_vtag)); } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { #if defined(INET6) || defined(INET) pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_LSO2_INFO_SIZE, NDIS_PKTINFO_TYPE_LSO); #ifdef INET if (m_head->m_pkthdr.csum_flags & CSUM_IP_TSO) { *pi_data = NDIS_LSO2_INFO_MAKEIPV4(0, m_head->m_pkthdr.tso_segsz); } #endif #if defined(INET6) && defined(INET) else #endif #ifdef INET6 { *pi_data = NDIS_LSO2_INFO_MAKEIPV6(0, m_head->m_pkthdr.tso_segsz); } #endif #endif /* INET6 || INET */ } else if (m_head->m_pkthdr.csum_flags & txr->hn_csum_assist) { pi_data = hn_rndis_pktinfo_append(pkt, HN_RNDIS_PKT_LEN, NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM); if (m_head->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP)) { *pi_data = NDIS_TXCSUM_INFO_IPV6; } else { *pi_data = NDIS_TXCSUM_INFO_IPV4; if (m_head->m_pkthdr.csum_flags & CSUM_IP) *pi_data |= NDIS_TXCSUM_INFO_IPCS; } if (m_head->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) *pi_data |= NDIS_TXCSUM_INFO_TCPCS; else if (m_head->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_IP6_UDP)) *pi_data |= NDIS_TXCSUM_INFO_UDPCS; } pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; /* Fixup RNDIS packet message total length */ pkt->rm_len += pkt_hlen; /* Convert RNDIS packet message offsets */ pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt_hlen); pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); /* * Fast path: Chimney sending. */ if (chim != NULL) { struct hn_txdesc *tgt_txd = txd; if (txr->hn_agg_txd != NULL) { tgt_txd = txr->hn_agg_txd; #ifdef INVARIANTS *m_head0 = NULL; #endif } KASSERT(pkt == chim, ("RNDIS pkt not in chimney sending buffer")); KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, ("chimney sending buffer is not used")); tgt_txd->chim_size += pkt->rm_len; m_copydata(m_head, 0, m_head->m_pkthdr.len, ((uint8_t *)chim) + pkt_hlen); txr->hn_gpa_cnt = 0; txr->hn_sendpkt = hn_txpkt_chim; goto done; } KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("chimney buffer is used")); KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); if (__predict_false(error)) { int freed; /* * This mbuf is not linked w/ the txd yet, so free it now. */ m_freem(m_head); *m_head0 = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon txdma error")); txr->hn_txdma_failed++; if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return error; } *m_head0 = m_head; /* +1 RNDIS packet message */ txr->hn_gpa_cnt = nsegs + 1; /* send packet with page buffer */ txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; txr->hn_gpa[0].gpa_len = pkt_hlen; /* * Fill the page buffers with mbuf info after the page * buffer for RNDIS packet message. */ for (i = 0; i < nsegs; ++i) { struct vmbus_gpa *gpa = &txr->hn_gpa[i + 1]; gpa->gpa_page = atop(segs[i].ds_addr); gpa->gpa_ofs = segs[i].ds_addr & PAGE_MASK; gpa->gpa_len = segs[i].ds_len; } txd->chim_index = HN_NVS_CHIM_IDX_INVALID; txd->chim_size = 0; txr->hn_sendpkt = hn_txpkt_sglist; done: txd->m = m_head; /* Set the completion routine */ hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); /* Update temporary stats for later use. */ txr->hn_stat_pkts++; txr->hn_stat_size += m_head->m_pkthdr.len; if (m_head->m_flags & M_MCAST) txr->hn_stat_mcasts++; return 0; } /* * NOTE: * If this function fails, then txd will be freed, but the mbuf * associated w/ the txd will _not_ be freed. */ static int hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) { int error, send_failed = 0, has_bpf; again: has_bpf = bpf_peers_present(ifp->if_bpf); if (has_bpf) { /* * Make sure that this txd and any aggregated txds are not * freed before ETHER_BPF_MTAP. */ hn_txdesc_hold(txd); } error = txr->hn_sendpkt(txr, txd); if (!error) { if (has_bpf) { const struct hn_txdesc *tmp_txd; ETHER_BPF_MTAP(ifp, txd->m); STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) ETHER_BPF_MTAP(ifp, tmp_txd->m); } if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { if_inc_counter(ifp, IFCOUNTER_OBYTES, txr->hn_stat_size); if (txr->hn_stat_mcasts != 0) { if_inc_counter(ifp, IFCOUNTER_OMCASTS, txr->hn_stat_mcasts); } } txr->hn_pkts += txr->hn_stat_pkts; txr->hn_sends++; } if (has_bpf) hn_txdesc_put(txr, txd); if (__predict_false(error)) { int freed; /* * This should "really rarely" happen. * * XXX Too many RX to be acked or too many sideband * commands to run? Ask netvsc_channel_rollup() * to kick start later. */ txr->hn_has_txeof = 1; if (!send_failed) { txr->hn_send_failed++; send_failed = 1; /* * Try sending again after set hn_has_txeof; * in case that we missed the last * netvsc_channel_rollup(). */ goto again; } if_printf(ifp, "send failed\n"); /* * Caller will perform further processing on the * associated mbuf, so don't free it in hn_txdesc_put(); * only unload it from the DMA map in hn_txdesc_put(), * if it was loaded. */ txd->m = NULL; freed = hn_txdesc_put(txr, txd); KASSERT(freed != 0, ("fail to free txd upon send error")); txr->hn_send_failed++; } /* Reset temporary stats, after this sending is done. */ txr->hn_stat_size = 0; txr->hn_stat_pkts = 0; txr->hn_stat_mcasts = 0; return (error); } /* * Append the specified data to the indicated mbuf chain, * Extend the mbuf chain if the new data does not fit in * existing space. * * This is a minor rewrite of m_append() from sys/kern/uipc_mbuf.c. * There should be an equivalent in the kernel mbuf code, * but there does not appear to be one yet. * * Differs from m_append() in that additional mbufs are * allocated with cluster size MJUMPAGESIZE, and filled * accordingly. * * Return 1 if able to complete the job; otherwise 0. */ static int hv_m_append(struct mbuf *m0, int len, c_caddr_t cp) { struct mbuf *m, *n; int remainder, space; for (m = m0; m->m_next != NULL; m = m->m_next) ; remainder = len; space = M_TRAILINGSPACE(m); if (space > 0) { /* * Copy into available space. */ if (space > remainder) space = remainder; bcopy(cp, mtod(m, caddr_t) + m->m_len, space); m->m_len += space; cp += space; remainder -= space; } while (remainder > 0) { /* * Allocate a new mbuf; could check space * and allocate a cluster instead. */ n = m_getjcl(M_NOWAIT, m->m_type, 0, MJUMPAGESIZE); if (n == NULL) break; n->m_len = min(MJUMPAGESIZE, remainder); bcopy(cp, mtod(n, caddr_t), n->m_len); cp += n->m_len; remainder -= n->m_len; m->m_next = n; m = n; } if (m0->m_flags & M_PKTHDR) m0->m_pkthdr.len += len - remainder; return (remainder == 0); } #if defined(INET) || defined(INET6) static __inline int hn_lro_rx(struct lro_ctrl *lc, struct mbuf *m) { #if __FreeBSD_version >= 1100095 if (hn_lro_mbufq_depth) { tcp_lro_queue_mbuf(lc, m); return 0; } #endif return tcp_lro_rx(lc, m, 0); } #endif static int hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen, const struct hn_rxinfo *info) { struct ifnet *ifp, *hn_ifp = rxr->hn_ifp; struct mbuf *m_new; int size, do_lro = 0, do_csum = 1; int hash_type; /* * If the non-transparent mode VF is active, inject this packet * into the VF. */ ifp = rxr->hn_rxvf_ifp ? rxr->hn_rxvf_ifp : hn_ifp; if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { /* * NOTE: * See the NOTE of hn_rndis_init_fixat(). This * function can be reached, immediately after the * RNDIS is initialized but before the ifnet is * setup on the hn_attach() path; drop the unexpected * packets. */ return (0); } if (__predict_false(dlen < ETHER_HDR_LEN)) { if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1); return (0); } if (dlen <= MHLEN) { m_new = m_gethdr(M_NOWAIT, MT_DATA); if (m_new == NULL) { if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); return (0); } memcpy(mtod(m_new, void *), data, dlen); m_new->m_pkthdr.len = m_new->m_len = dlen; rxr->hn_small_pkts++; } else { /* * Get an mbuf with a cluster. For packets 2K or less, * get a standard 2K cluster. For anything larger, get a * 4K cluster. Any buffers larger than 4K can cause problems * if looped around to the Hyper-V TX channel, so avoid them. */ size = MCLBYTES; if (dlen > MCLBYTES) { /* 4096 */ size = MJUMPAGESIZE; } m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); if (m_new == NULL) { if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1); return (0); } hv_m_append(m_new, dlen, data); } m_new->m_pkthdr.rcvif = ifp; if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0)) do_csum = 0; /* receive side checksum offload */ if (info->csum_info != HN_NDIS_RXCSUM_INFO_INVALID) { /* IP csum offload */ if ((info->csum_info & NDIS_RXCSUM_INFO_IPCS_OK) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); rxr->hn_csum_ip++; } /* TCP/UDP csum offload */ if ((info->csum_info & (NDIS_RXCSUM_INFO_UDPCS_OK | NDIS_RXCSUM_INFO_TCPCS_OK)) && do_csum) { m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; if (info->csum_info & NDIS_RXCSUM_INFO_TCPCS_OK) rxr->hn_csum_tcp++; else rxr->hn_csum_udp++; } /* * XXX * As of this write (Oct 28th, 2016), host side will turn * on only TCPCS_OK and IPCS_OK even for UDP datagrams, so * the do_lro setting here is actually _not_ accurate. We * depend on the RSS hash type check to reset do_lro. */ if ((info->csum_info & (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) == (NDIS_RXCSUM_INFO_TCPCS_OK | NDIS_RXCSUM_INFO_IPCS_OK)) do_lro = 1; } else { const struct ether_header *eh; uint16_t etype; int hoff; hoff = sizeof(*eh); /* Checked at the beginning of this function. */ KASSERT(m_new->m_len >= hoff, ("not ethernet frame")); eh = mtod(m_new, struct ether_header *); etype = ntohs(eh->ether_type); if (etype == ETHERTYPE_VLAN) { const struct ether_vlan_header *evl; hoff = sizeof(*evl); if (m_new->m_len < hoff) goto skip; evl = mtod(m_new, struct ether_vlan_header *); etype = ntohs(evl->evl_proto); } if (etype == ETHERTYPE_IP) { int pr; pr = hn_check_iplen(m_new, hoff); if (pr == IPPROTO_TCP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_TCP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } do_lro = 1; } else if (pr == IPPROTO_UDP) { if (do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_UDP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; } } else if (pr != IPPROTO_DONE && do_csum && (rxr->hn_trust_hcsum & HN_TRUST_HCSUM_IP)) { rxr->hn_csum_trusted++; m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); } } } skip: if (info->vlan_info != HN_NDIS_VLAN_INFO_INVALID) { m_new->m_pkthdr.ether_vtag = EVL_MAKETAG( NDIS_VLAN_INFO_ID(info->vlan_info), NDIS_VLAN_INFO_PRI(info->vlan_info), NDIS_VLAN_INFO_CFI(info->vlan_info)); m_new->m_flags |= M_VLANTAG; } /* * If VF is activated (tranparent/non-transparent mode does not * matter here). * * - Don't setup mbuf hash, if 'options RSS' is set. * * In Azure, when VF is activated, TCP SYN and SYN|ACK go * through hn(4) while the rest of segments and ACKs belonging * to the same TCP 4-tuple go through the VF. So don't setup * mbuf hash, if a VF is activated and 'options RSS' is not * enabled. hn(4) and the VF may use neither the same RSS * hash key nor the same RSS hash function, so the hash value * for packets belonging to the same flow could be different! * * - Disable LRO * * hn(4) will only receive broadcast packets, multicast packets, * TCP SYN and SYN|ACK (in Azure), LRO is useless for these * packet types. * * For non-transparent, we definitely _cannot_ enable LRO at * all, since the LRO flush will use hn(4) as the receiving * interface; i.e. hn_ifp->if_input(hn_ifp, m). */ if (hn_ifp != ifp || (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF)) { do_lro = 0; /* disable LRO. */ #ifndef RSS goto skip_hash; /* skip mbuf hash setup */ #endif } if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) { rxr->hn_rss_pkts++; m_new->m_pkthdr.flowid = info->hash_value; hash_type = M_HASHTYPE_OPAQUE_HASH; if ((info->hash_info & NDIS_HASH_FUNCTION_MASK) == NDIS_HASH_FUNCTION_TOEPLITZ) { uint32_t type = (info->hash_info & NDIS_HASH_TYPE_MASK); /* * NOTE: * do_lro is resetted, if the hash types are not TCP * related. See the comment in the above csum_flags * setup section. */ switch (type) { case NDIS_HASH_IPV4: hash_type = M_HASHTYPE_RSS_IPV4; do_lro = 0; break; case NDIS_HASH_TCP_IPV4: hash_type = M_HASHTYPE_RSS_TCP_IPV4; break; case NDIS_HASH_IPV6: hash_type = M_HASHTYPE_RSS_IPV6; do_lro = 0; break; case NDIS_HASH_IPV6_EX: hash_type = M_HASHTYPE_RSS_IPV6_EX; do_lro = 0; break; case NDIS_HASH_TCP_IPV6: hash_type = M_HASHTYPE_RSS_TCP_IPV6; break; case NDIS_HASH_TCP_IPV6_EX: hash_type = M_HASHTYPE_RSS_TCP_IPV6_EX; break; } } } else { m_new->m_pkthdr.flowid = rxr->hn_rx_idx; hash_type = M_HASHTYPE_OPAQUE; } M_HASHTYPE_SET(m_new, hash_type); #ifndef RSS skip_hash: #endif if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1); if (hn_ifp != ifp) { const struct ether_header *eh; /* * Non-transparent mode VF is activated. */ /* * Allow tapping on hn(4). */ ETHER_BPF_MTAP(hn_ifp, m_new); /* * Update hn(4)'s stats. */ if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1); if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len); /* Checked at the beginning of this function. */ KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame")); eh = mtod(m_new, struct ether_header *); if (ETHER_IS_MULTICAST(eh->ether_dhost)) if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1); } rxr->hn_pkts++; if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) { #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxr->hn_lro; if (lro->lro_cnt) { rxr->hn_lro_tried++; if (hn_lro_rx(lro, m_new) == 0) { /* DONE! */ return 0; } } #endif } ifp->if_input(ifp, m_new); return (0); } static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { struct hn_softc *sc = ifp->if_softc; struct ifreq *ifr = (struct ifreq *)data, ifr_vf; struct ifnet *vf_ifp; int mask, error = 0; switch (cmd) { case SIOCSIFMTU: if (ifr->ifr_mtu > HN_MTU_MAX) { error = EINVAL; break; } HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if ((sc->hn_caps & HN_CAP_MTU) == 0) { /* Can't change MTU */ HN_UNLOCK(sc); error = EOPNOTSUPP; break; } if (ifp->if_mtu == ifr->ifr_mtu) { HN_UNLOCK(sc); break; } if (hn_xpnt_vf_isready(sc)) { vf_ifp = sc->hn_vf_ifp; ifr_vf = *ifr; strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname, sizeof(ifr_vf.ifr_name)); error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr_vf); if (error) { HN_UNLOCK(sc); if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n", vf_ifp->if_xname, ifr->ifr_mtu, error); break; } } /* * Suspend this interface before the synthetic parts * are ripped. */ hn_suspend(sc); /* * Detach the synthetics parts, i.e. NVS and RNDIS. */ hn_synth_detach(sc); /* * Reattach the synthetic parts, i.e. NVS and RNDIS, * with the new MTU setting. */ error = hn_synth_attach(sc, ifr->ifr_mtu); if (error) { HN_UNLOCK(sc); break; } /* * Commit the requested MTU, after the synthetic parts * have been successfully attached. */ ifp->if_mtu = ifr->ifr_mtu; /* * Synthetic parts' reattach may change the chimney * sending size; update it. */ if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax) hn_set_chim_size(sc, sc->hn_chim_szmax); /* * Make sure that various parameters based on MTU are * still valid, after the MTU change. */ hn_mtu_change_fixup(sc); /* * All done! Resume the interface now. */ hn_resume(sc); if ((sc->hn_flags & HN_FLAG_RXVF) || (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { /* * Since we have reattached the NVS part, * change the datapath to VF again; in case * that it is lost, after the NVS was detached. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF); } HN_UNLOCK(sc); break; case SIOCSIFFLAGS: HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (hn_xpnt_vf_isready(sc)) hn_xpnt_vf_saveifflags(sc); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { /* * Caller meight hold mutex, e.g. * bpf; use busy-wait for the RNDIS * reply. */ HN_NO_SLEEPING(sc); hn_rxfilter_config(sc); HN_SLEEPING_OK(sc); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) error = hn_xpnt_vf_iocsetflags(sc); } else { hn_init_locked(sc); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_stop(sc, false); } sc->hn_if_flags = ifp->if_flags; HN_UNLOCK(sc); break; case SIOCSIFCAP: HN_LOCK(sc); if (hn_xpnt_vf_isready(sc)) { ifr_vf = *ifr; strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname, sizeof(ifr_vf.ifr_name)); error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf); HN_UNLOCK(sc); break; } /* * Fix up requested capabilities w/ supported capabilities, * since the supported capabilities could have been changed. */ mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^ ifp->if_capenable; if (mask & IFCAP_TXCSUM) { ifp->if_capenable ^= IFCAP_TXCSUM; if (ifp->if_capenable & IFCAP_TXCSUM) ifp->if_hwassist |= HN_CSUM_IP_HWASSIST(sc); else ifp->if_hwassist &= ~HN_CSUM_IP_HWASSIST(sc); } if (mask & IFCAP_TXCSUM_IPV6) { ifp->if_capenable ^= IFCAP_TXCSUM_IPV6; if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) ifp->if_hwassist |= HN_CSUM_IP6_HWASSIST(sc); else ifp->if_hwassist &= ~HN_CSUM_IP6_HWASSIST(sc); } /* TODO: flip RNDIS offload parameters for RXCSUM. */ if (mask & IFCAP_RXCSUM) ifp->if_capenable ^= IFCAP_RXCSUM; #ifdef foo /* We can't diff IPv6 packets from IPv4 packets on RX path. */ if (mask & IFCAP_RXCSUM_IPV6) ifp->if_capenable ^= IFCAP_RXCSUM_IPV6; #endif if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; if (ifp->if_capenable & IFCAP_TSO4) ifp->if_hwassist |= CSUM_IP_TSO; else ifp->if_hwassist &= ~CSUM_IP_TSO; } if (mask & IFCAP_TSO6) { ifp->if_capenable ^= IFCAP_TSO6; if (ifp->if_capenable & IFCAP_TSO6) ifp->if_hwassist |= CSUM_IP6_TSO; else ifp->if_hwassist &= ~CSUM_IP6_TSO; } HN_UNLOCK(sc); break; case SIOCADDMULTI: case SIOCDELMULTI: HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } if (ifp->if_drv_flags & IFF_DRV_RUNNING) { /* * Multicast uses mutex; use busy-wait for * the RNDIS reply. */ HN_NO_SLEEPING(sc); hn_rxfilter_config(sc); HN_SLEEPING_OK(sc); } /* XXX vlan(4) style mcast addr maintenance */ if (hn_xpnt_vf_isready(sc)) { int old_if_flags; old_if_flags = sc->hn_vf_ifp->if_flags; hn_xpnt_vf_saveifflags(sc); if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) && ((old_if_flags ^ sc->hn_vf_ifp->if_flags) & IFF_ALLMULTI)) error = hn_xpnt_vf_iocsetflags(sc); } HN_UNLOCK(sc); break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: HN_LOCK(sc); if (hn_xpnt_vf_isready(sc)) { /* * SIOCGIFMEDIA expects ifmediareq, so don't * create and pass ifr_vf to the VF here; just * replace the ifr_name. */ vf_ifp = sc->hn_vf_ifp; strlcpy(ifr->ifr_name, vf_ifp->if_xname, sizeof(ifr->ifr_name)); error = vf_ifp->if_ioctl(vf_ifp, cmd, data); /* Restore the ifr_name. */ strlcpy(ifr->ifr_name, ifp->if_xname, sizeof(ifr->ifr_name)); HN_UNLOCK(sc); break; } HN_UNLOCK(sc); error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); break; } return (error); } static void hn_stop(struct hn_softc *sc, bool detaching) { struct ifnet *ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); /* Clear RUNNING bit ASAP. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); /* Disable polling. */ hn_polling(sc, 0); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { KASSERT(sc->hn_vf_ifp != NULL, ("%s: VF is not attached", ifp->if_xname)); /* Mark transparent mode VF as disabled. */ hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */); /* * NOTE: * Datapath setting must happen _before_ bringing * the VF down. */ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH); /* * Bring the VF down. */ hn_xpnt_vf_saveifflags(sc); sc->hn_vf_ifp->if_flags &= ~IFF_UP; hn_xpnt_vf_iocsetflags(sc); } /* Suspend data transfers. */ hn_suspend_data(sc); /* Clear OACTIVE bit. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; /* * If the non-transparent mode VF is active, make sure * that the RX filter still allows packet reception. */ if (!detaching && (sc->hn_flags & HN_FLAG_RXVF)) hn_rxfilter_config(sc); } static void hn_init_locked(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; int i; HN_LOCK_ASSERT(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) return; if (ifp->if_drv_flags & IFF_DRV_RUNNING) return; /* Configure RX filter */ hn_rxfilter_config(sc); /* Clear OACTIVE bit. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); for (i = 0; i < sc->hn_tx_ring_inuse; ++i) sc->hn_tx_ring[i].hn_oactive = 0; /* Clear TX 'suspended' bit. */ hn_resume_tx(sc, sc->hn_tx_ring_inuse); if (hn_xpnt_vf_isready(sc)) { /* Initialize transparent VF. */ hn_xpnt_vf_init(sc); } /* Everything is ready; unleash! */ atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); /* Re-enable polling if requested. */ if (sc->hn_pollhz > 0) hn_polling(sc, sc->hn_pollhz); } static void hn_init(void *xsc) { struct hn_softc *sc = xsc; HN_LOCK(sc); hn_init_locked(sc); HN_UNLOCK(sc); } #if __FreeBSD_version >= 1100099 static int hn_lro_lenlim_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; unsigned int lenlim; int error; lenlim = sc->hn_rx_ring[0].hn_lro.lro_length_lim; error = sysctl_handle_int(oidp, &lenlim, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); if (lenlim < HN_LRO_LENLIM_MIN(sc->hn_ifp) || lenlim > TCP_LRO_LENGTH_MAX) { HN_UNLOCK(sc); return EINVAL; } hn_set_lro_lenlim(sc, lenlim); HN_UNLOCK(sc); return 0; } static int hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ackcnt, error, i; /* * lro_ackcnt_lim is append count limit, * +1 to turn it into aggregation limit. */ ackcnt = sc->hn_rx_ring[0].hn_lro.lro_ackcnt_lim + 1; error = sysctl_handle_int(oidp, &ackcnt, 0, req); if (error || req->newptr == NULL) return error; if (ackcnt < 2 || ackcnt > (TCP_LRO_ACKCNT_MAX + 1)) return EINVAL; /* * Convert aggregation limit back to append * count limit. */ --ackcnt; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; HN_UNLOCK(sc); return 0; } #endif static int hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int hcsum = arg2; int on, error, i; on = 0; if (sc->hn_rx_ring[0].hn_trust_hcsum & hcsum) on = 1; error = sysctl_handle_int(oidp, &on, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (on) rxr->hn_trust_hcsum |= hcsum; else rxr->hn_trust_hcsum &= ~hcsum; } HN_UNLOCK(sc); return 0; } static int hn_chim_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int chim_size, error; chim_size = sc->hn_tx_ring[0].hn_chim_size; error = sysctl_handle_int(oidp, &chim_size, 0, req); if (error || req->newptr == NULL) return error; if (chim_size > sc->hn_chim_szmax || chim_size <= 0) return EINVAL; HN_LOCK(sc); hn_set_chim_size(sc, chim_size); HN_UNLOCK(sc); return 0; } #if __FreeBSD_version < 1100095 static int hn_rx_stat_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((int *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((int *)((uint8_t *)rxr + ofs)) = 0; } return 0; } #else static int hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; uint64_t stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((uint64_t *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_64(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; } return 0; } #endif static int hn_rx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_rx_ring *rxr; u_long stat; stat = 0; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((u_long *)((uint8_t *)rxr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((u_long *)((uint8_t *)rxr + ofs)) = 0; } return 0; } static int hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error; struct hn_tx_ring *txr; u_long stat; stat = 0; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; stat += *((u_long *)((uint8_t *)txr + ofs)); } error = sysctl_handle_long(oidp, &stat, 0, req); if (error || req->newptr == NULL) return error; /* Zero out this stat. */ for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; *((u_long *)((uint8_t *)txr + ofs)) = 0; } return 0; } static int hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int ofs = arg2, i, error, conf; struct hn_tx_ring *txr; txr = &sc->hn_tx_ring[0]; conf = *((int *)((uint8_t *)txr + ofs)); error = sysctl_handle_int(oidp, &conf, 0, req); if (error || req->newptr == NULL) return error; HN_LOCK(sc); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; *((int *)((uint8_t *)txr + ofs)) = conf; } HN_UNLOCK(sc); return 0; } static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, size; size = sc->hn_agg_size; error = sysctl_handle_int(oidp, &size, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); sc->hn_agg_size = size; hn_set_txagg(sc); HN_UNLOCK(sc); return (0); } static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, pkts; pkts = sc->hn_agg_pkts; error = sysctl_handle_int(oidp, &pkts, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); sc->hn_agg_pkts = pkts; hn_set_txagg(sc); HN_UNLOCK(sc); return (0); } static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int pkts; pkts = sc->hn_tx_ring[0].hn_agg_pktmax; return (sysctl_handle_int(oidp, &pkts, 0, req)); } static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int align; align = sc->hn_tx_ring[0].hn_agg_align; return (sysctl_handle_int(oidp, &align, 0, req)); } static void hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) { if (pollhz == 0) vmbus_chan_poll_disable(chan); else vmbus_chan_poll_enable(chan, pollhz); } static void hn_polling(struct hn_softc *sc, u_int pollhz) { int nsubch = sc->hn_rx_ring_inuse - 1; HN_LOCK_ASSERT(sc); if (nsubch > 0) { struct vmbus_channel **subch; int i; subch = vmbus_subchan_get(sc->hn_prichan, nsubch); for (i = 0; i < nsubch; ++i) hn_chan_polling(subch[i], pollhz); vmbus_subchan_rel(subch, nsubch); } hn_chan_polling(sc->hn_prichan, pollhz); } static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int pollhz, error; pollhz = sc->hn_pollhz; error = sysctl_handle_int(oidp, &pollhz, 0, req); if (error || req->newptr == NULL) return (error); if (pollhz != 0 && (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) return (EINVAL); HN_LOCK(sc); if (sc->hn_pollhz != pollhz) { sc->hn_pollhz = pollhz; if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) hn_polling(sc, sc->hn_pollhz); } HN_UNLOCK(sc); return (0); } static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char verstr[16]; snprintf(verstr, sizeof(verstr), "%u.%u", HN_NDIS_VERSION_MAJOR(sc->hn_ndis_ver), HN_NDIS_VERSION_MINOR(sc->hn_ndis_ver)); return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); } static int hn_caps_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char caps_str[128]; uint32_t caps; HN_LOCK(sc); caps = sc->hn_caps; HN_UNLOCK(sc); snprintf(caps_str, sizeof(caps_str), "%b", caps, HN_CAP_BITS); return sysctl_handle_string(oidp, caps_str, sizeof(caps_str), req); } static int hn_hwassist_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char assist_str[128]; uint32_t hwassist; HN_LOCK(sc); hwassist = sc->hn_ifp->if_hwassist; HN_UNLOCK(sc); snprintf(assist_str, sizeof(assist_str), "%b", hwassist, CSUM_BITS); return sysctl_handle_string(oidp, assist_str, sizeof(assist_str), req); } static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char filter_str[128]; uint32_t filter; HN_LOCK(sc); filter = sc->hn_rx_filter; HN_UNLOCK(sc); snprintf(filter_str, sizeof(filter_str), "%b", filter, NDIS_PACKET_TYPES); return sysctl_handle_string(oidp, filter_str, sizeof(filter_str), req); } #ifndef RSS static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error || req->newptr == NULL) goto back; error = SYSCTL_IN(req, sc->hn_rss.rss_key, sizeof(sc->hn_rss.rss_key)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSKEY; if (sc->hn_rx_ring_inuse > 1) { error = hn_rss_reconfig(sc); } else { /* Not RSS capable, at least for now; just save the RSS key. */ error = 0; } back: HN_UNLOCK(sc); return (error); } static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error; HN_LOCK(sc); error = SYSCTL_OUT(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error || req->newptr == NULL) goto back; /* * Don't allow RSS indirect table change, if this interface is not * RSS capable currently. */ if (sc->hn_rx_ring_inuse == 1) { error = EOPNOTSUPP; goto back; } error = SYSCTL_IN(req, sc->hn_rss.rss_ind, sizeof(sc->hn_rss.rss_ind)); if (error) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSIND; hn_rss_ind_fixup(sc); error = hn_rss_reconfig(sc); back: HN_UNLOCK(sc); return (error); } #endif /* !RSS */ static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char hash_str[128]; uint32_t hash; HN_LOCK(sc); hash = sc->hn_rss_hash; HN_UNLOCK(sc); snprintf(hash_str, sizeof(hash_str), "%b", hash, NDIS_HASH_BITS); return sysctl_handle_string(oidp, hash_str, sizeof(hash_str), req); } static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char vf_name[IFNAMSIZ + 1]; struct ifnet *vf_ifp; HN_LOCK(sc); vf_name[0] = '\0'; vf_ifp = sc->hn_vf_ifp; if (vf_ifp != NULL) snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); HN_UNLOCK(sc); return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); } static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; char vf_name[IFNAMSIZ + 1]; struct ifnet *vf_ifp; HN_LOCK(sc); vf_name[0] = '\0'; vf_ifp = sc->hn_rx_ring[0].hn_rxvf_ifp; if (vf_ifp != NULL) snprintf(vf_name, sizeof(vf_name), "%s", vf_ifp->if_xname); HN_UNLOCK(sc); return sysctl_handle_string(oidp, vf_name, sizeof(vf_name), req); } static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS) { struct rm_priotracker pt; struct sbuf *sb; int error, i; bool first; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); rm_rlock(&hn_vfmap_lock, &pt); first = true; for (i = 0; i < hn_vfmap_size; ++i) { struct ifnet *ifp; if (hn_vfmap[i] == NULL) continue; ifp = ifnet_byindex(i); if (ifp != NULL) { if (first) sbuf_printf(sb, "%s", ifp->if_xname); else sbuf_printf(sb, " %s", ifp->if_xname); first = false; } } rm_runlock(&hn_vfmap_lock, &pt); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS) { struct rm_priotracker pt; struct sbuf *sb; int error, i; bool first; error = sysctl_wire_old_buffer(req, 0); if (error != 0) return (error); sb = sbuf_new_for_sysctl(NULL, NULL, 128, req); if (sb == NULL) return (ENOMEM); rm_rlock(&hn_vfmap_lock, &pt); first = true; for (i = 0; i < hn_vfmap_size; ++i) { struct ifnet *ifp, *hn_ifp; hn_ifp = hn_vfmap[i]; if (hn_ifp == NULL) continue; ifp = ifnet_byindex(i); if (ifp != NULL) { if (first) { sbuf_printf(sb, "%s:%s", ifp->if_xname, hn_ifp->if_xname); } else { sbuf_printf(sb, " %s:%s", ifp->if_xname, hn_ifp->if_xname); } first = false; } } rm_runlock(&hn_vfmap_lock, &pt); error = sbuf_finish(sb); sbuf_delete(sb); return (error); } static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int error, onoff = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) onoff = 1; error = sysctl_handle_int(oidp, &onoff, 0, req); if (error || req->newptr == NULL) return (error); HN_LOCK(sc); /* NOTE: hn_vf_lock for hn_transmit() */ rm_wlock(&sc->hn_vf_lock); if (onoff) sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF; else sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF; rm_wunlock(&sc->hn_vf_lock); HN_UNLOCK(sc); return (0); } static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; int enabled = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) enabled = 1; return (sysctl_handle_int(oidp, &enabled, 0, req)); } static int hn_check_iplen(const struct mbuf *m, int hoff) { const struct ip *ip; int len, iphlen, iplen; const struct tcphdr *th; int thoff; /* TCP data offset */ len = hoff + sizeof(struct ip); /* The packet must be at least the size of an IP header. */ if (m->m_pkthdr.len < len) return IPPROTO_DONE; /* The fixed IP header must reside completely in the first mbuf. */ if (m->m_len < len) return IPPROTO_DONE; ip = mtodo(m, hoff); /* Bound check the packet's stated IP header length. */ iphlen = ip->ip_hl << 2; if (iphlen < sizeof(struct ip)) /* minimum header length */ return IPPROTO_DONE; /* The full IP header must reside completely in the one mbuf. */ if (m->m_len < hoff + iphlen) return IPPROTO_DONE; iplen = ntohs(ip->ip_len); /* * Check that the amount of data in the buffers is as * at least much as the IP header would have us expect. */ if (m->m_pkthdr.len < hoff + iplen) return IPPROTO_DONE; /* * Ignore IP fragments. */ if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) return IPPROTO_DONE; /* * The TCP/IP or UDP/IP header must be entirely contained within * the first fragment of a packet. */ switch (ip->ip_p) { case IPPROTO_TCP: if (iplen < iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) return IPPROTO_DONE; th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); thoff = th->th_off << 2; if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + thoff) return IPPROTO_DONE; break; case IPPROTO_UDP: if (iplen < iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) return IPPROTO_DONE; break; default: if (iplen < iphlen) return IPPROTO_DONE; break; } return ip->ip_p; } static int hn_create_rx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; device_t dev = sc->hn_dev; #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 int lroent_cnt; #endif #endif int i; /* * Create RXBUF for reception. * * NOTE: * - It is shared by all channels. * - A large enough buffer is allocated, certain version of NVSes * may further limit the usable space. */ sc->hn_rxbuf = hyperv_dmamem_alloc(bus_get_dma_tag(dev), PAGE_SIZE, 0, HN_RXBUF_SIZE, &sc->hn_rxbuf_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->hn_rxbuf == NULL) { device_printf(sc->hn_dev, "allocate rxbuf failed\n"); return (ENOMEM); } sc->hn_rx_ring_cnt = ring_cnt; sc->hn_rx_ring_inuse = sc->hn_rx_ring_cnt; sc->hn_rx_ring = malloc(sizeof(struct hn_rx_ring) * sc->hn_rx_ring_cnt, M_DEVBUF, M_WAITOK | M_ZERO); #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 lroent_cnt = hn_lro_entry_count; if (lroent_cnt < TCP_LRO_ENTRIES) lroent_cnt = TCP_LRO_ENTRIES; if (bootverbose) device_printf(dev, "LRO: entry count %d\n", lroent_cnt); #endif #endif /* INET || INET6 */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); /* Create dev.hn.UNIT.rx sysctl tree */ sc->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "rx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; rxr->hn_br = hyperv_dmamem_alloc(bus_get_dma_tag(dev), PAGE_SIZE, 0, HN_TXBR_SIZE + HN_RXBR_SIZE, &rxr->hn_br_dma, BUS_DMA_WAITOK); if (rxr->hn_br == NULL) { device_printf(dev, "allocate bufring failed\n"); return (ENOMEM); } if (hn_trust_hosttcp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_TCP; if (hn_trust_hostudp) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_UDP; if (hn_trust_hostip) rxr->hn_trust_hcsum |= HN_TRUST_HCSUM_IP; rxr->hn_ifp = sc->hn_ifp; if (i < sc->hn_tx_ring_cnt) rxr->hn_txr = &sc->hn_tx_ring[i]; rxr->hn_pktbuf_len = HN_PKTBUF_LEN_DEF; rxr->hn_pktbuf = malloc(rxr->hn_pktbuf_len, M_DEVBUF, M_WAITOK); rxr->hn_rx_idx = i; rxr->hn_rxbuf = sc->hn_rxbuf; /* * Initialize LRO. */ #if defined(INET) || defined(INET6) #if __FreeBSD_version >= 1100095 tcp_lro_init_args(&rxr->hn_lro, sc->hn_ifp, lroent_cnt, hn_lro_mbufq_depth); #else tcp_lro_init(&rxr->hn_lro); rxr->hn_lro.ifp = sc->hn_ifp; #endif #if __FreeBSD_version >= 1100099 rxr->hn_lro.lro_length_lim = HN_LRO_LENLIM_DEF; rxr->hn_lro.lro_ackcnt_lim = HN_LRO_ACKCNT_DEF; #endif #endif /* INET || INET6 */ if (sc->hn_rx_sysctl_tree != NULL) { char name[16]; /* * Create per RX ring sysctl tree: * dev.hn.UNIT.rx.RINGID */ snprintf(name, sizeof(name), "%d", i); rxr->hn_rx_sysctl_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(sc->hn_rx_sysctl_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (rxr->hn_rx_sysctl_tree != NULL) { SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "packets", CTLFLAG_RW, &rxr->hn_pkts, "# of packets received"); SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "rss_pkts", CTLFLAG_RW, &rxr->hn_rss_pkts, "# of packets w/ RSS info received"); SYSCTL_ADD_INT(ctx, SYSCTL_CHILDREN(rxr->hn_rx_sysctl_tree), OID_AUTO, "pktbuf_len", CTLFLAG_RD, &rxr->hn_pktbuf_len, 0, "Temporary channel packet buffer length"); } } } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_queued", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_queued), #if __FreeBSD_version < 1100095 hn_rx_stat_int_sysctl, #else hn_rx_stat_u64_sysctl, #endif "LU", "LRO queued"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_flushed", CTLTYPE_U64 | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro.lro_flushed), #if __FreeBSD_version < 1100095 hn_rx_stat_int_sysctl, #else hn_rx_stat_u64_sysctl, #endif "LU", "LRO flushed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_lro_tried), hn_rx_stat_ulong_sysctl, "LU", "# of LRO tries"); #if __FreeBSD_version >= 1100099 SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_length_lim", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_lenlim_sysctl, "IU", "Max # of data bytes to be aggregated by LRO"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_ackcnt_lim", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_lro_ackcnt_sysctl, "I", "Max # of ACKs to be aggregated by LRO"); #endif SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hosttcp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_TCP, hn_trust_hcsum_sysctl, "I", "Trust tcp segement verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostudp", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_UDP, hn_trust_hcsum_sysctl, "I", "Trust udp datagram verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "trust_hostip", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, HN_TRUST_HCSUM_IP, hn_trust_hcsum_sysctl, "I", "Trust ip packet verification on host side, " "when csum info is missing"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_ip", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_ip), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM IP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_tcp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_tcp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM TCP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_udp", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_udp), hn_rx_stat_ulong_sysctl, "LU", "RXCSUM UDP"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "csum_trusted", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_csum_trusted), hn_rx_stat_ulong_sysctl, "LU", "# of packets that we trust host's csum verification"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "small_pkts", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_small_pkts), hn_rx_stat_ulong_sysctl, "LU", "# of small packets received"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ack_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_rx_ring, hn_ack_failed), hn_rx_stat_ulong_sysctl, "LU", "# of RXBUF ack failures"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_cnt", CTLFLAG_RD, &sc->hn_rx_ring_cnt, 0, "# created RX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "rx_ring_inuse", CTLFLAG_RD, &sc->hn_rx_ring_inuse, 0, "# used RX rings"); return (0); } static void hn_destroy_rx_data(struct hn_softc *sc) { int i; if (sc->hn_rxbuf != NULL) { if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); else device_printf(sc->hn_dev, "RXBUF is referenced\n"); sc->hn_rxbuf = NULL; } if (sc->hn_rx_ring_cnt == 0) return; for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (rxr->hn_br == NULL) continue; if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); } else { device_printf(sc->hn_dev, "%dth channel bufring is referenced", i); } rxr->hn_br = NULL; #if defined(INET) || defined(INET6) tcp_lro_free(&rxr->hn_lro); #endif free(rxr->hn_pktbuf, M_DEVBUF); } free(sc->hn_rx_ring, M_DEVBUF); sc->hn_rx_ring = NULL; sc->hn_rx_ring_cnt = 0; sc->hn_rx_ring_inuse = 0; } static int hn_tx_ring_create(struct hn_softc *sc, int id) { struct hn_tx_ring *txr = &sc->hn_tx_ring[id]; device_t dev = sc->hn_dev; bus_dma_tag_t parent_dtag; int error, i; txr->hn_sc = sc; txr->hn_tx_idx = id; #ifndef HN_USE_TXDESC_BUFRING mtx_init(&txr->hn_txlist_spin, "hn txlist", NULL, MTX_SPIN); #endif mtx_init(&txr->hn_tx_lock, "hn tx", NULL, MTX_DEF); txr->hn_txdesc_cnt = HN_TX_DESC_CNT; txr->hn_txdesc = malloc(sizeof(struct hn_txdesc) * txr->hn_txdesc_cnt, M_DEVBUF, M_WAITOK | M_ZERO); #ifndef HN_USE_TXDESC_BUFRING SLIST_INIT(&txr->hn_txlist); #else txr->hn_txdesc_br = buf_ring_alloc(txr->hn_txdesc_cnt, M_DEVBUF, M_WAITOK, &txr->hn_tx_lock); #endif if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); } else { txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; } #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { txr->hn_txeof = hn_start_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_start_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_start_txeof_taskfunc, txr); } else #endif { int br_depth; txr->hn_txeof = hn_xmit_txeof; TASK_INIT(&txr->hn_tx_task, 0, hn_xmit_taskfunc, txr); TASK_INIT(&txr->hn_txeof_task, 0, hn_xmit_txeof_taskfunc, txr); br_depth = hn_get_txswq_depth(txr); txr->hn_mbuf_br = buf_ring_alloc(br_depth, M_DEVBUF, M_WAITOK, &txr->hn_tx_lock); } txr->hn_direct_tx_size = hn_direct_tx_size; /* * Always schedule transmission instead of trying to do direct * transmission. This one gives the best performance so far. */ txr->hn_sched_tx = 1; parent_dtag = bus_get_dma_tag(dev); /* DMA tag for RNDIS packet messages. */ error = bus_dma_tag_create(parent_dtag, /* parent */ HN_RNDIS_PKT_ALIGN, /* alignment */ HN_RNDIS_PKT_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_RNDIS_PKT_LEN, /* maxsize */ 1, /* nsegments */ HN_RNDIS_PKT_LEN, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_rndis_dtag); if (error) { device_printf(dev, "failed to create rndis dmatag\n"); return error; } /* DMA tag for data. */ error = bus_dma_tag_create(parent_dtag, /* parent */ 1, /* alignment */ HN_TX_DATA_BOUNDARY, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ HN_TX_DATA_MAXSIZE, /* maxsize */ HN_TX_DATA_SEGCNT_MAX, /* nsegments */ HN_TX_DATA_SEGSIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &txr->hn_tx_data_dtag); if (error) { device_printf(dev, "failed to create data dmatag\n"); return error; } for (i = 0; i < txr->hn_txdesc_cnt; ++i) { struct hn_txdesc *txd = &txr->hn_txdesc[i]; txd->txr = txr; txd->chim_index = HN_NVS_CHIM_IDX_INVALID; STAILQ_INIT(&txd->agg_list); /* * Allocate and load RNDIS packet message. */ error = bus_dmamem_alloc(txr->hn_tx_rndis_dtag, (void **)&txd->rndis_pkt, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, &txd->rndis_pkt_dmap); if (error) { device_printf(dev, "failed to allocate rndis_packet_msg, %d\n", i); return error; } error = bus_dmamap_load(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap, txd->rndis_pkt, HN_RNDIS_PKT_LEN, hyperv_dma_map_paddr, &txd->rndis_pkt_paddr, BUS_DMA_NOWAIT); if (error) { device_printf(dev, "failed to load rndis_packet_msg, %d\n", i); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } /* DMA map for TX data. */ error = bus_dmamap_create(txr->hn_tx_data_dtag, 0, &txd->data_dmap); if (error) { device_printf(dev, "failed to allocate tx data dmamap\n"); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); return error; } /* All set, put it to list */ txd->flags |= HN_TXD_FLAG_ONLIST; #ifndef HN_USE_TXDESC_BUFRING SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); #else buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif } txr->hn_txdesc_avail = txr->hn_txdesc_cnt; if (sc->hn_tx_sysctl_tree != NULL) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; char name[16]; /* * Create per TX ring sysctl tree: * dev.hn.UNIT.tx.RINGID */ ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(sc->hn_tx_sysctl_tree); snprintf(name, sizeof(name), "%d", id); txr->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (txr->hn_tx_sysctl_tree != NULL) { child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); #ifdef HN_DEBUG SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", CTLFLAG_RD, &txr->hn_txdesc_avail, 0, "# of available TX descs"); #endif #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { SYSCTL_ADD_INT(ctx, child, OID_AUTO, "oactive", CTLFLAG_RD, &txr->hn_oactive, 0, "over active"); } SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", CTLFLAG_RW, &txr->hn_pkts, "# of packets transmitted"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", CTLFLAG_RW, &txr->hn_sends, "# of sends"); } } return 0; } static void hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) { struct hn_tx_ring *txr = txd->txr; KASSERT(txd->m == NULL, ("still has mbuf installed")); KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("still dma mapped")); bus_dmamap_unload(txr->hn_tx_rndis_dtag, txd->rndis_pkt_dmap); bus_dmamem_free(txr->hn_tx_rndis_dtag, txd->rndis_pkt, txd->rndis_pkt_dmap); bus_dmamap_destroy(txr->hn_tx_data_dtag, txd->data_dmap); } static void hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) { KASSERT(txd->refs == 0 || txd->refs == 1, ("invalid txd refs %d", txd->refs)); /* Aggregated txds will be freed by their aggregating txd. */ if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { int freed; freed = hn_txdesc_put(txr, txd); KASSERT(freed, ("can't free txdesc")); } } static void hn_tx_ring_destroy(struct hn_tx_ring *txr) { int i; if (txr->hn_txdesc == NULL) return; /* * NOTE: * Because the freeing of aggregated txds will be deferred * to the aggregating txd, two passes are used here: * - The first pass GCes any pending txds. This GC is necessary, * since if the channels are revoked, hypervisor will not * deliver send-done for all pending txds. * - The second pass frees the busdma stuffs, i.e. after all txds * were freed. */ for (i = 0; i < txr->hn_txdesc_cnt; ++i) hn_txdesc_gc(txr, &txr->hn_txdesc[i]); for (i = 0; i < txr->hn_txdesc_cnt; ++i) hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); if (txr->hn_tx_data_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_data_dtag); if (txr->hn_tx_rndis_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_rndis_dtag); #ifdef HN_USE_TXDESC_BUFRING buf_ring_free(txr->hn_txdesc_br, M_DEVBUF); #endif free(txr->hn_txdesc, M_DEVBUF); txr->hn_txdesc = NULL; if (txr->hn_mbuf_br != NULL) buf_ring_free(txr->hn_mbuf_br, M_DEVBUF); #ifndef HN_USE_TXDESC_BUFRING mtx_destroy(&txr->hn_txlist_spin); #endif mtx_destroy(&txr->hn_tx_lock); } static int hn_create_tx_data(struct hn_softc *sc, int ring_cnt) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int i; /* * Create TXBUF for chimney sending. * * NOTE: It is shared by all channels. */ sc->hn_chim = hyperv_dmamem_alloc(bus_get_dma_tag(sc->hn_dev), PAGE_SIZE, 0, HN_CHIM_SIZE, &sc->hn_chim_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->hn_chim == NULL) { device_printf(sc->hn_dev, "allocate txbuf failed\n"); return (ENOMEM); } sc->hn_tx_ring_cnt = ring_cnt; sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_tx_ring = malloc(sizeof(struct hn_tx_ring) * sc->hn_tx_ring_cnt, M_DEVBUF, M_WAITOK | M_ZERO); ctx = device_get_sysctl_ctx(sc->hn_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->hn_dev)); /* Create dev.hn.UNIT.tx sysctl tree */ sc->hn_tx_sysctl_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "tx", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { int error; error = hn_tx_ring_create(sc, i); if (error) return error; } SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "no_txdescs", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_no_txdescs), hn_tx_stat_ulong_sysctl, "LU", "# of times short of TX descs"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "send_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_send_failed), hn_tx_stat_ulong_sysctl, "LU", "# of hyper-v sending failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "txdma_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_txdma_failed), hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_flush_failed), hn_tx_stat_ulong_sysctl, "LU", "# of packet transmission aggregation flush failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_collapsed), hn_tx_stat_ulong_sysctl, "LU", "# of TX mbuf collapsed"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_tried", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_chimney_tried), hn_tx_stat_ulong_sysctl, "LU", "# of chimney send tries"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_cnt", CTLFLAG_RD, &sc->hn_tx_ring[0].hn_txdesc_cnt, 0, "# of total TX descs"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_chimney_max", CTLFLAG_RD, &sc->hn_chim_szmax, 0, "Chimney send packet size upper boundary"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_chimney_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_chim_size_sysctl, "I", "Chimney send packet size limit"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "direct_tx_size", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_direct_tx_size), hn_tx_conf_int_sysctl, "I", "Size of the packet for direct transmission"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "sched_tx", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_sched_tx), hn_tx_conf_int_sysctl, "I", "Always schedule transmission " "instead of doing direct transmission"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_cnt", CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, "Applied packet transmission aggregation size"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_txagg_pktmax_sysctl, "I", "Applied packet transmission aggregation packets"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, hn_txagg_align_sysctl, "I", "Applied packet transmission aggregation alignment"); return 0; } static void hn_set_chim_size(struct hn_softc *sc, int chim_size) { int i; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_chim_size = chim_size; } static void hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu) { struct ifnet *ifp = sc->hn_ifp; u_int hw_tsomax; int tso_minlen; HN_LOCK_ASSERT(sc); if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0) return; KASSERT(sc->hn_ndis_tso_sgmin >= 2, ("invalid NDIS tso sgmin %d", sc->hn_ndis_tso_sgmin)); tso_minlen = sc->hn_ndis_tso_sgmin * mtu; KASSERT(sc->hn_ndis_tso_szmax >= tso_minlen && sc->hn_ndis_tso_szmax <= IP_MAXPACKET, ("invalid NDIS tso szmax %d", sc->hn_ndis_tso_szmax)); if (tso_maxlen < tso_minlen) tso_maxlen = tso_minlen; else if (tso_maxlen > IP_MAXPACKET) tso_maxlen = IP_MAXPACKET; if (tso_maxlen > sc->hn_ndis_tso_szmax) tso_maxlen = sc->hn_ndis_tso_szmax; hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); if (hn_xpnt_vf_isready(sc)) { if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax) hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax; } ifp->if_hw_tsomax = hw_tsomax; if (bootverbose) if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax); } static void hn_fixup_tx_data(struct hn_softc *sc) { uint64_t csum_assist; int i; hn_set_chim_size(sc, sc->hn_chim_szmax); if (hn_tx_chimney_size > 0 && hn_tx_chimney_size < sc->hn_chim_szmax) hn_set_chim_size(sc, hn_tx_chimney_size); csum_assist = 0; if (sc->hn_caps & HN_CAP_IPCS) csum_assist |= CSUM_IP; if (sc->hn_caps & HN_CAP_TCP4CS) csum_assist |= CSUM_IP_TCP; if (sc->hn_caps & HN_CAP_UDP4CS) csum_assist |= CSUM_IP_UDP; if (sc->hn_caps & HN_CAP_TCP6CS) csum_assist |= CSUM_IP6_TCP; if (sc->hn_caps & HN_CAP_UDP6CS) csum_assist |= CSUM_IP6_UDP; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_csum_assist = csum_assist; if (sc->hn_caps & HN_CAP_HASHVAL) { /* * Support HASHVAL pktinfo on TX path. */ if (bootverbose) if_printf(sc->hn_ifp, "support HASHVAL pktinfo\n"); for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_tx_flags |= HN_TX_FLAG_HASHVAL; } } static void hn_destroy_tx_data(struct hn_softc *sc) { int i; if (sc->hn_chim != NULL) { if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); } else { device_printf(sc->hn_dev, "chimney sending buffer is referenced"); } sc->hn_chim = NULL; } if (sc->hn_tx_ring_cnt == 0) return; for (i = 0; i < sc->hn_tx_ring_cnt; ++i) hn_tx_ring_destroy(&sc->hn_tx_ring[i]); free(sc->hn_tx_ring, M_DEVBUF); sc->hn_tx_ring = NULL; sc->hn_tx_ring_cnt = 0; sc->hn_tx_ring_inuse = 0; } #ifdef HN_IFSTART_SUPPORT static void hn_start_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static int hn_start_locked(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; int sched = 0; KASSERT(hn_use_if_start, ("hn_start_locked is called, when if_start is disabled")); KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); mtx_assert(&txr->hn_tx_lock, MA_OWNED); KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); if (__predict_false(txr->hn_suspended)) return (0); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) return (0); while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { struct hn_txdesc *txd; struct mbuf *m_head; int error; IFQ_DRV_DEQUEUE(&ifp->if_snd, m_head); if (m_head == NULL) break; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. */ IFQ_DRV_PREPEND(&ifp->if_snd, m_head); sched = 1; break; } #if defined(INET6) || defined(INET) if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { m_head = hn_tso_fixup(m_head); if (__predict_false(m_head == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); continue; } } #endif txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; IFQ_DRV_PREPEND(&ifp->if_snd, m_head); atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } error = hn_encap(ifp, txr, txd, &m_head); if (error) { /* Both txd and m_head are freed */ KASSERT(txr->hn_agg_txd == NULL, ("encap failed w/ pending aggregating txdesc")); continue; } if (txr->hn_agg_pktleft == 0) { if (txr->hn_agg_txd != NULL) { KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); error = hn_flush_txagg(ifp, txr); if (__predict_false(error)) { atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } } else { KASSERT(m_head != NULL, ("mbuf was freed")); error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ IFQ_DRV_PREPEND(&ifp->if_snd, m_head); atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); break; } } } #ifdef INVARIANTS else { KASSERT(txr->hn_agg_txd != NULL, ("no aggregating txdesc")); KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); } #endif } /* Flush pending aggerated transmission. */ if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); return (sched); } static void hn_start(struct ifnet *ifp) { struct hn_softc *sc = ifp->if_softc; struct hn_tx_ring *txr = &sc->hn_tx_ring[0]; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } static void hn_start_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); atomic_clear_int(&txr->hn_sc->hn_ifp->if_drv_flags, IFF_DRV_OACTIVE); hn_start_locked(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_start_txeof(struct hn_tx_ring *txr) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); sched = hn_start_locked(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the OACTIVE earlier, with the hope, that * others could catch up. The task will clear the * flag again with the hn_tx_lock to avoid possible * races. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } #endif /* HN_IFSTART_SUPPORT */ static int hn_xmit(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; struct mbuf *m_head; int sched = 0; mtx_assert(&txr->hn_tx_lock, MA_OWNED); #ifdef HN_IFSTART_SUPPORT KASSERT(hn_use_if_start == 0, ("hn_xmit is called, when if_start is enabled")); #endif KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); if (__predict_false(txr->hn_suspended)) return (0); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) return (0); while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { struct hn_txdesc *txd; int error; if (len > 0 && m_head->m_pkthdr.len > len) { /* * This sending could be time consuming; let callers * dispatch this packet sending (and sending of any * following up packets) to tx taskqueue. */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); sched = 1; break; } txd = hn_txdesc_get(txr); if (txd == NULL) { txr->hn_no_txdescs++; drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } error = hn_encap(ifp, txr, txd, &m_head); if (error) { /* Both txd and m_head are freed; discard */ KASSERT(txr->hn_agg_txd == NULL, ("encap failed w/ pending aggregating txdesc")); drbr_advance(ifp, txr->hn_mbuf_br); continue; } if (txr->hn_agg_pktleft == 0) { if (txr->hn_agg_txd != NULL) { KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); error = hn_flush_txagg(ifp, txr); if (__predict_false(error)) { txr->hn_oactive = 1; break; } } else { KASSERT(m_head != NULL, ("mbuf was freed")); error = hn_txpkt(ifp, txr, txd); if (__predict_false(error)) { /* txd is freed, but m_head is not */ drbr_putback(ifp, txr->hn_mbuf_br, m_head); txr->hn_oactive = 1; break; } } } #ifdef INVARIANTS else { KASSERT(txr->hn_agg_txd != NULL, ("no aggregating txdesc")); KASSERT(m_head == NULL, ("pending mbuf for aggregating txdesc")); } #endif /* Sent */ drbr_advance(ifp, txr->hn_mbuf_br); } /* Flush pending aggerated transmission. */ if (txr->hn_agg_txd != NULL) hn_flush_txagg(ifp, txr); return (sched); } static int hn_transmit(struct ifnet *ifp, struct mbuf *m) { struct hn_softc *sc = ifp->if_softc; struct hn_tx_ring *txr; int error, idx = 0; if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) { struct rm_priotracker pt; rm_rlock(&sc->hn_vf_lock, &pt); if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) { struct mbuf *m_bpf = NULL; int obytes, omcast; obytes = m->m_pkthdr.len; if (m->m_flags & M_MCAST) omcast = 1; if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) { if (bpf_peers_present(ifp->if_bpf)) { m_bpf = m_copypacket(m, M_NOWAIT); if (m_bpf == NULL) { /* * Failed to grab a shallow * copy; tap now. */ ETHER_BPF_MTAP(ifp, m); } } } else { ETHER_BPF_MTAP(ifp, m); } error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m); rm_runlock(&sc->hn_vf_lock, &pt); if (m_bpf != NULL) { if (!error) ETHER_BPF_MTAP(ifp, m_bpf); m_freem(m_bpf); } if (error == ENOBUFS) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); } else if (error) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } else { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes); if (omcast) { if_inc_counter(ifp, IFCOUNTER_OMCASTS, omcast); } } return (error); } rm_runlock(&sc->hn_vf_lock, &pt); } #if defined(INET6) || defined(INET) /* * Perform TSO packet header fixup now, since the TSO * packet header should be cache-hot. */ if (m->m_pkthdr.csum_flags & CSUM_TSO) { m = hn_tso_fixup(m); if (__predict_false(m == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return EIO; } } #endif /* * Select the TX ring based on flowid */ if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { #ifdef RSS uint32_t bid; if (rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), &bid) == 0) idx = bid % sc->hn_tx_ring_inuse; else #endif { #if defined(INET6) || defined(INET) int tcpsyn = 0; if (m->m_pkthdr.len < 128 && (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_IP6_TCP)) && (m->m_pkthdr.csum_flags & CSUM_TSO) == 0) { m = hn_check_tcpsyn(m, &tcpsyn); if (__predict_false(m == NULL)) { if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return (EIO); } } #else const int tcpsyn = 0; #endif if (tcpsyn) idx = 0; else idx = m->m_pkthdr.flowid % sc->hn_tx_ring_inuse; } } txr = &sc->hn_tx_ring[idx]; error = drbr_enqueue(ifp, txr->hn_mbuf_br, m); if (error) { if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1); return error; } if (txr->hn_oactive) return 0; if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (!sched) return 0; } do_sched: taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); return 0; } static void hn_tx_ring_qflush(struct hn_tx_ring *txr) { struct mbuf *m; mtx_lock(&txr->hn_tx_lock); while ((m = buf_ring_dequeue_sc(txr->hn_mbuf_br)) != NULL) m_freem(m); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_qflush(struct ifnet *ifp) { struct hn_softc *sc = ifp->if_softc; struct rm_priotracker pt; int i; for (i = 0; i < sc->hn_tx_ring_inuse; ++i) hn_tx_ring_qflush(&sc->hn_tx_ring[i]); if_qflush(ifp); rm_rlock(&sc->hn_vf_lock, &pt); if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp); rm_runlock(&sc->hn_vf_lock, &pt); } static void hn_xmit_txeof(struct hn_tx_ring *txr) { if (txr->hn_sched_tx) goto do_sched; if (mtx_trylock(&txr->hn_tx_lock)) { int sched; txr->hn_oactive = 0; sched = hn_xmit(txr, txr->hn_direct_tx_size); mtx_unlock(&txr->hn_tx_lock); if (sched) { taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_tx_task); } } else { do_sched: /* * Release the oactive earlier, with the hope, that * others could catch up. The task will clear the * oactive again with the hn_tx_lock to avoid possible * races. */ txr->hn_oactive = 0; taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_xmit_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static void hn_xmit_txeof_taskfunc(void *xtxr, int pending __unused) { struct hn_tx_ring *txr = xtxr; mtx_lock(&txr->hn_tx_lock); txr->hn_oactive = 0; hn_xmit(txr, 0); mtx_unlock(&txr->hn_tx_lock); } static int hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) { struct vmbus_chan_br cbr; struct hn_rx_ring *rxr; struct hn_tx_ring *txr = NULL; int idx, error; idx = vmbus_chan_subidx(chan); /* * Link this channel to RX/TX ring. */ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, ("RX ring %d already attached", idx)); rxr->hn_rx_flags |= HN_RX_FLAG_ATTACHED; rxr->hn_chan = chan; if (bootverbose) { if_printf(sc->hn_ifp, "link RX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } if (idx < sc->hn_tx_ring_inuse) { txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("TX ring %d already attached", idx)); txr->hn_tx_flags |= HN_TX_FLAG_ATTACHED; txr->hn_chan = chan; if (bootverbose) { if_printf(sc->hn_ifp, "link TX ring %d to chan%u\n", idx, vmbus_chan_id(chan)); } } /* Bind this channel to a proper CPU. */ vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); /* * Open this channel */ cbr.cbr = rxr->hn_br; cbr.cbr_paddr = rxr->hn_br_dma.hv_paddr; cbr.cbr_txsz = HN_TXBR_SIZE; cbr.cbr_rxsz = HN_RXBR_SIZE; error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); if (error) { if (error == EISCONN) { if_printf(sc->hn_ifp, "bufring is connected after " "chan%u open failure\n", vmbus_chan_id(chan)); rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; } else { if_printf(sc->hn_ifp, "open chan%u failed: %d\n", vmbus_chan_id(chan), error); } } return (error); } static void hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) { struct hn_rx_ring *rxr; int idx, error; idx = vmbus_chan_subidx(chan); /* * Link this channel to RX/TX ring. */ KASSERT(idx >= 0 && idx < sc->hn_rx_ring_inuse, ("invalid channel index %d, should > 0 && < %d", idx, sc->hn_rx_ring_inuse)); rxr = &sc->hn_rx_ring[idx]; KASSERT((rxr->hn_rx_flags & HN_RX_FLAG_ATTACHED), ("RX ring %d is not attached", idx)); rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; if (idx < sc->hn_tx_ring_inuse) { struct hn_tx_ring *txr = &sc->hn_tx_ring[idx]; KASSERT((txr->hn_tx_flags & HN_TX_FLAG_ATTACHED), ("TX ring %d is not attached attached", idx)); txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; } /* * Close this channel. * * NOTE: * Channel closing does _not_ destroy the target channel. */ error = vmbus_chan_close_direct(chan); if (error == EISCONN) { if_printf(sc->hn_ifp, "chan%u bufring is connected " "after being closed\n", vmbus_chan_id(chan)); rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; } else if (error) { if_printf(sc->hn_ifp, "chan%u close failed: %d\n", vmbus_chan_id(chan), error); } } static int hn_attach_subchans(struct hn_softc *sc) { struct vmbus_channel **subchans; int subchan_cnt = sc->hn_rx_ring_inuse - 1; int i, error = 0; KASSERT(subchan_cnt > 0, ("no sub-channels")); /* Attach the sub-channels. */ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) { int error1; error1 = hn_chan_attach(sc, subchans[i]); if (error1) { error = error1; /* Move on; all channels will be detached later. */ } } vmbus_subchan_rel(subchans, subchan_cnt); if (error) { if_printf(sc->hn_ifp, "sub-channels attach failed: %d\n", error); } else { if (bootverbose) { if_printf(sc->hn_ifp, "%d sub-channels attached\n", subchan_cnt); } } return (error); } static void hn_detach_allchans(struct hn_softc *sc) { struct vmbus_channel **subchans; int subchan_cnt = sc->hn_rx_ring_inuse - 1; int i; if (subchan_cnt == 0) goto back; /* Detach the sub-channels. */ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) hn_chan_detach(sc, subchans[i]); vmbus_subchan_rel(subchans, subchan_cnt); back: /* * Detach the primary channel, _after_ all sub-channels * are detached. */ hn_chan_detach(sc, sc->hn_prichan); /* Wait for sub-channels to be destroyed, if any. */ vmbus_subchan_drain(sc->hn_prichan); #ifdef INVARIANTS for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { KASSERT((sc->hn_rx_ring[i].hn_rx_flags & HN_RX_FLAG_ATTACHED) == 0, ("%dth RX ring is still attached", i)); } for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { KASSERT((sc->hn_tx_ring[i].hn_tx_flags & HN_TX_FLAG_ATTACHED) == 0, ("%dth TX ring is still attached", i)); } #endif } static int hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) { struct vmbus_channel **subchans; int nchan, rxr_cnt, error; nchan = *nsubch + 1; if (nchan == 1) { /* * Multiple RX/TX rings are not requested. */ *nsubch = 0; return (0); } /* * Query RSS capabilities, e.g. # of RX rings, and # of indirect * table entries. */ error = hn_rndis_query_rsscaps(sc, &rxr_cnt); if (error) { /* No RSS; this is benign. */ *nsubch = 0; return (0); } if (bootverbose) { if_printf(sc->hn_ifp, "RX rings offered %u, requested %d\n", rxr_cnt, nchan); } if (nchan > rxr_cnt) nchan = rxr_cnt; if (nchan == 1) { if_printf(sc->hn_ifp, "only 1 channel is supported, no vRSS\n"); *nsubch = 0; return (0); } /* * Allocate sub-channels from NVS. */ *nsubch = nchan - 1; error = hn_nvs_alloc_subchans(sc, nsubch); if (error || *nsubch == 0) { /* Failed to allocate sub-channels. */ *nsubch = 0; return (0); } /* * Wait for all sub-channels to become ready before moving on. */ subchans = vmbus_subchan_get(sc->hn_prichan, *nsubch); vmbus_subchan_rel(subchans, *nsubch); return (0); } static bool hn_synth_attachable(const struct hn_softc *sc) { int i; if (sc->hn_flags & HN_FLAG_ERRORS) return (false); for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) return (false); } return (true); } /* * Make sure that the RX filter is zero after the successful * RNDIS initialization. * * NOTE: * Under certain conditions on certain versions of Hyper-V, * the RNDIS rxfilter is _not_ zero on the hypervisor side * after the successful RNDIS initialization, which breaks * the assumption of any following code (well, it breaks the * RNDIS API contract actually). Clear the RNDIS rxfilter * explicitly, drain packets sneaking through, and drain the * interrupt taskqueues scheduled due to the stealth packets. */ static void hn_rndis_init_fixat(struct hn_softc *sc, int nchan) { hn_disable_rx(sc); hn_drain_rxtx(sc, nchan); } static int hn_synth_attach(struct hn_softc *sc, int mtu) { #define ATTACHED_NVS 0x0002 #define ATTACHED_RNDIS 0x0004 struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int error, nsubch, nchan = 1, i, rndis_inited; uint32_t old_caps, attached = 0; KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, ("synthetic parts were attached")); if (!hn_synth_attachable(sc)) return (ENXIO); /* Save capabilities for later verification. */ old_caps = sc->hn_caps; sc->hn_caps = 0; /* Clear RSS stuffs. */ sc->hn_rss_ind_size = 0; sc->hn_rss_hash = 0; /* * Attach the primary channel _before_ attaching NVS and RNDIS. */ error = hn_chan_attach(sc, sc->hn_prichan); if (error) goto failed; /* * Attach NVS. */ error = hn_nvs_attach(sc, mtu); if (error) goto failed; attached |= ATTACHED_NVS; /* * Attach RNDIS _after_ NVS is attached. */ error = hn_rndis_attach(sc, mtu, &rndis_inited); if (rndis_inited) attached |= ATTACHED_RNDIS; if (error) goto failed; /* * Make sure capabilities are not changed. */ if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", old_caps, sc->hn_caps); error = ENXIO; goto failed; } /* * Allocate sub-channels for multi-TX/RX rings. * * NOTE: * The # of RX rings that can be used is equivalent to the # of * channels to be requested. */ nsubch = sc->hn_rx_ring_cnt - 1; error = hn_synth_alloc_subchans(sc, &nsubch); if (error) goto failed; /* NOTE: _Full_ synthetic parts detach is required now. */ sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; /* * Set the # of TX/RX rings that could be used according to * the # of channels that NVS offered. */ nchan = nsubch + 1; hn_set_ring_inuse(sc, nchan); if (nchan == 1) { /* Only the primary channel can be used; done */ goto back; } /* * Attach the sub-channels. * * NOTE: hn_set_ring_inuse() _must_ have been called. */ error = hn_attach_subchans(sc); if (error) goto failed; /* * Configure RSS key and indirect table _after_ all sub-channels * are attached. */ if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { /* * RSS key is not set yet; set it to the default RSS key. */ if (bootverbose) if_printf(sc->hn_ifp, "setup default RSS key\n"); #ifdef RSS rss_getkey(rss->rss_key); #else memcpy(rss->rss_key, hn_rss_key_default, sizeof(rss->rss_key)); #endif sc->hn_flags |= HN_FLAG_HAS_RSSKEY; } if ((sc->hn_flags & HN_FLAG_HAS_RSSIND) == 0) { /* * RSS indirect table is not set yet; set it up in round- * robin fashion. */ if (bootverbose) { if_printf(sc->hn_ifp, "setup default RSS indirect " "table\n"); } for (i = 0; i < NDIS_HASH_INDCNT; ++i) { uint32_t subidx; #ifdef RSS subidx = rss_get_indirection_to_bucket(i); #else subidx = i; #endif rss->rss_ind[i] = subidx % nchan; } sc->hn_flags |= HN_FLAG_HAS_RSSIND; } else { /* * # of usable channels may be changed, so we have to * make sure that all entries in RSS indirect table * are valid. * * NOTE: hn_set_ring_inuse() _must_ have been called. */ hn_rss_ind_fixup(sc); } error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); if (error) goto failed; back: /* * Fixup transmission aggregation setup. */ hn_set_txagg(sc); hn_rndis_init_fixat(sc, nchan); return (0); failed: if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { hn_rndis_init_fixat(sc, nchan); hn_synth_detach(sc); } else { if (attached & ATTACHED_RNDIS) { hn_rndis_init_fixat(sc, nchan); hn_rndis_detach(sc); } if (attached & ATTACHED_NVS) hn_nvs_detach(sc); hn_chan_detach(sc, sc->hn_prichan); /* Restore old capabilities. */ sc->hn_caps = old_caps; } return (error); #undef ATTACHED_RNDIS #undef ATTACHED_NVS } /* * NOTE: * The interface must have been suspended though hn_suspend(), before * this function get called. */ static void hn_synth_detach(struct hn_softc *sc) { KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); /* Detach the RNDIS first. */ hn_rndis_detach(sc); /* Detach NVS. */ hn_nvs_detach(sc); /* Detach all of the channels. */ hn_detach_allchans(sc); sc->hn_flags &= ~HN_FLAG_SYNTH_ATTACHED; } static void hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) { KASSERT(ring_cnt > 0 && ring_cnt <= sc->hn_rx_ring_cnt, ("invalid ring count %d", ring_cnt)); if (sc->hn_tx_ring_cnt > ring_cnt) sc->hn_tx_ring_inuse = ring_cnt; else sc->hn_tx_ring_inuse = sc->hn_tx_ring_cnt; sc->hn_rx_ring_inuse = ring_cnt; #ifdef RSS if (sc->hn_rx_ring_inuse != rss_getnumbuckets()) { if_printf(sc->hn_ifp, "# of RX rings (%d) does not match " "# of RSS buckets (%d)\n", sc->hn_rx_ring_inuse, rss_getnumbuckets()); } #endif if (bootverbose) { if_printf(sc->hn_ifp, "%d TX ring, %d RX ring\n", sc->hn_tx_ring_inuse, sc->hn_rx_ring_inuse); } } static void hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) { /* * NOTE: * The TX bufring will not be drained by the hypervisor, * if the primary channel is revoked. */ while (!vmbus_chan_rx_empty(chan) || (!vmbus_chan_is_revoked(sc->hn_prichan) && !vmbus_chan_tx_empty(chan))) pause("waitch", 1); vmbus_chan_intr_drain(chan); } static void hn_disable_rx(struct hn_softc *sc) { /* * Disable RX by clearing RX filter forcefully. */ sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); /* ignore error */ /* * Give RNDIS enough time to flush all pending data packets. */ pause("waitrx", (200 * hz) / 1000); } /* * NOTE: * RX/TX _must_ have been suspended/disabled, before this function * is called. */ static void hn_drain_rxtx(struct hn_softc *sc, int nchan) { struct vmbus_channel **subch = NULL; int nsubch; /* * Drain RX/TX bufrings and interrupts. */ nsubch = nchan - 1; if (nsubch > 0) subch = vmbus_subchan_get(sc->hn_prichan, nsubch); if (subch != NULL) { int i; for (i = 0; i < nsubch; ++i) hn_chan_drain(sc, subch[i]); } hn_chan_drain(sc, sc->hn_prichan); if (subch != NULL) vmbus_subchan_rel(subch, nsubch); } static void hn_suspend_data(struct hn_softc *sc) { struct hn_tx_ring *txr; int i; HN_LOCK_ASSERT(sc); /* * Suspend TX. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 1; mtx_unlock(&txr->hn_tx_lock); /* No one is able send more packets now. */ /* * Wait for all pending sends to finish. * * NOTE: * We will _not_ receive all pending send-done, if the * primary channel is revoked. */ while (hn_tx_ring_pending(txr) && !vmbus_chan_is_revoked(sc->hn_prichan)) pause("hnwtx", 1 /* 1 tick */); } /* * Disable RX. */ hn_disable_rx(sc); /* * Drain RX/TX. */ hn_drain_rxtx(sc, sc->hn_rx_ring_inuse); /* * Drain any pending TX tasks. * * NOTE: * The above hn_drain_rxtx() can dispatch TX tasks, so the TX * tasks will have to be drained _after_ the above hn_drain_rxtx(). */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { txr = &sc->hn_tx_ring[i]; taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_suspend_mgmt_taskfunc(void *xsc, int pending __unused) { ((struct hn_softc *)xsc)->hn_mgmt_taskq = NULL; } static void hn_suspend_mgmt(struct hn_softc *sc) { struct task task; HN_LOCK_ASSERT(sc); /* * Make sure that hn_mgmt_taskq0 can nolonger be accessed * through hn_mgmt_taskq. */ TASK_INIT(&task, 0, hn_suspend_mgmt_taskfunc, sc); vmbus_chan_run_task(sc->hn_prichan, &task); /* * Make sure that all pending management tasks are completed. */ taskqueue_drain(sc->hn_mgmt_taskq0, &sc->hn_netchg_init); taskqueue_drain_timeout(sc->hn_mgmt_taskq0, &sc->hn_netchg_status); taskqueue_drain_all(sc->hn_mgmt_taskq0); } static void hn_suspend(struct hn_softc *sc) { /* Disable polling. */ hn_polling(sc, 0); /* * If the non-transparent mode VF is activated, the synthetic * device is receiving packets, so the data path of the * synthetic device must be suspended. */ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || (sc->hn_flags & HN_FLAG_RXVF)) hn_suspend_data(sc); hn_suspend_mgmt(sc); } static void hn_resume_tx(struct hn_softc *sc, int tx_ring_cnt) { int i; KASSERT(tx_ring_cnt <= sc->hn_tx_ring_cnt, ("invalid TX ring count %d", tx_ring_cnt)); for (i = 0; i < tx_ring_cnt; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 0; mtx_unlock(&txr->hn_tx_lock); } } static void hn_resume_data(struct hn_softc *sc) { int i; HN_LOCK_ASSERT(sc); /* * Re-enable RX. */ hn_rxfilter_config(sc); /* * Make sure to clear suspend status on "all" TX rings, * since hn_tx_ring_inuse can be changed after * hn_suspend_data(). */ hn_resume_tx(sc, sc->hn_tx_ring_cnt); #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { /* * Flush unused drbrs, since hn_tx_ring_inuse may be * reduced. */ for (i = sc->hn_tx_ring_inuse; i < sc->hn_tx_ring_cnt; ++i) hn_tx_ring_qflush(&sc->hn_tx_ring[i]); } /* * Kick start TX. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; /* * Use txeof task, so that any pending oactive can be * cleared properly. */ taskqueue_enqueue(txr->hn_tx_taskq, &txr->hn_txeof_task); } } static void hn_resume_mgmt(struct hn_softc *sc) { sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0; /* * Kick off network change detection, if it was pending. * If no network change was pending, start link status * checks, which is more lightweight than network change * detection. */ if (sc->hn_link_flags & HN_LINK_FLAG_NETCHG) hn_change_network(sc); else hn_update_link_status(sc); } static void hn_resume(struct hn_softc *sc) { /* * If the non-transparent mode VF is activated, the synthetic * device have to receive packets, so the data path of the * synthetic device must be resumed. */ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) || (sc->hn_flags & HN_FLAG_RXVF)) hn_resume_data(sc); /* * Don't resume link status change if VF is attached/activated. * - In the non-transparent VF mode, the synthetic device marks * link down until the VF is deactivated; i.e. VF is down. * - In transparent VF mode, VF's media status is used until * the VF is detached. */ if ((sc->hn_flags & HN_FLAG_RXVF) == 0 && !(hn_xpnt_vf && sc->hn_vf_ifp != NULL)) hn_resume_mgmt(sc); /* * Re-enable polling if this interface is running and * the polling is requested. */ if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) hn_polling(sc, sc->hn_pollhz); } static void hn_rndis_rx_status(struct hn_softc *sc, const void *data, int dlen) { const struct rndis_status_msg *msg; int ofs; if (dlen < sizeof(*msg)) { if_printf(sc->hn_ifp, "invalid RNDIS status\n"); return; } msg = data; switch (msg->rm_status) { case RNDIS_STATUS_MEDIA_CONNECT: case RNDIS_STATUS_MEDIA_DISCONNECT: hn_update_link_status(sc); break; case RNDIS_STATUS_TASK_OFFLOAD_CURRENT_CONFIG: case RNDIS_STATUS_LINK_SPEED_CHANGE: /* Not really useful; ignore. */ break; case RNDIS_STATUS_NETWORK_CHANGE: ofs = RNDIS_STBUFOFFSET_ABS(msg->rm_stbufoffset); if (dlen < ofs + msg->rm_stbuflen || msg->rm_stbuflen < sizeof(uint32_t)) { if_printf(sc->hn_ifp, "network changed\n"); } else { uint32_t change; memcpy(&change, ((const uint8_t *)msg) + ofs, sizeof(change)); if_printf(sc->hn_ifp, "network changed, change %u\n", change); } hn_change_network(sc); break; default: if_printf(sc->hn_ifp, "unknown RNDIS status 0x%08x\n", msg->rm_status); break; } } static int hn_rndis_rxinfo(const void *info_data, int info_dlen, struct hn_rxinfo *info) { const struct rndis_pktinfo *pi = info_data; uint32_t mask = 0; while (info_dlen != 0) { const void *data; uint32_t dlen; if (__predict_false(info_dlen < sizeof(*pi))) return (EINVAL); if (__predict_false(info_dlen < pi->rm_size)) return (EINVAL); info_dlen -= pi->rm_size; if (__predict_false(pi->rm_size & RNDIS_PKTINFO_SIZE_ALIGNMASK)) return (EINVAL); if (__predict_false(pi->rm_size < pi->rm_pktinfooffset)) return (EINVAL); dlen = pi->rm_size - pi->rm_pktinfooffset; data = pi->rm_data; switch (pi->rm_type) { case NDIS_PKTINFO_TYPE_VLAN: if (__predict_false(dlen < NDIS_VLAN_INFO_SIZE)) return (EINVAL); info->vlan_info = *((const uint32_t *)data); mask |= HN_RXINFO_VLAN; break; case NDIS_PKTINFO_TYPE_CSUM: if (__predict_false(dlen < NDIS_RXCSUM_INFO_SIZE)) return (EINVAL); info->csum_info = *((const uint32_t *)data); mask |= HN_RXINFO_CSUM; break; case HN_NDIS_PKTINFO_TYPE_HASHVAL: if (__predict_false(dlen < HN_NDIS_HASH_VALUE_SIZE)) return (EINVAL); info->hash_value = *((const uint32_t *)data); mask |= HN_RXINFO_HASHVAL; break; case HN_NDIS_PKTINFO_TYPE_HASHINF: if (__predict_false(dlen < HN_NDIS_HASH_INFO_SIZE)) return (EINVAL); info->hash_info = *((const uint32_t *)data); mask |= HN_RXINFO_HASHINF; break; default: goto next; } if (mask == HN_RXINFO_ALL) { /* All found; done */ break; } next: pi = (const struct rndis_pktinfo *) ((const uint8_t *)pi + pi->rm_size); } /* * Final fixup. * - If there is no hash value, invalidate the hash info. */ if ((mask & HN_RXINFO_HASHVAL) == 0) info->hash_info = HN_NDIS_HASH_INFO_INVALID; return (0); } static __inline bool hn_rndis_check_overlap(int off, int len, int check_off, int check_len) { if (off < check_off) { if (__predict_true(off + len <= check_off)) return (false); } else if (off > check_off) { if (__predict_true(check_off + check_len <= off)) return (false); } return (true); } static void hn_rndis_rx_data(struct hn_rx_ring *rxr, const void *data, int dlen) { const struct rndis_packet_msg *pkt; struct hn_rxinfo info; int data_off, pktinfo_off, data_len, pktinfo_len; /* * Check length. */ if (__predict_false(dlen < sizeof(*pkt))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg\n"); return; } pkt = data; if (__predict_false(dlen < pkt->rm_len)) { if_printf(rxr->hn_ifp, "truncated RNDIS packet msg, " "dlen %d, msglen %u\n", dlen, pkt->rm_len); return; } if (__predict_false(pkt->rm_len < pkt->rm_datalen + pkt->rm_oobdatalen + pkt->rm_pktinfolen)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msglen, " "msglen %u, data %u, oob %u, pktinfo %u\n", pkt->rm_len, pkt->rm_datalen, pkt->rm_oobdatalen, pkt->rm_pktinfolen); return; } if (__predict_false(pkt->rm_datalen == 0)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, no data\n"); return; } /* * Check offests. */ #define IS_OFFSET_INVALID(ofs) \ ((ofs) < RNDIS_PACKET_MSG_OFFSET_MIN || \ ((ofs) & RNDIS_PACKET_MSG_OFFSET_ALIGNMASK)) /* XXX Hyper-V does not meet data offset alignment requirement */ if (__predict_false(pkt->rm_dataoffset < RNDIS_PACKET_MSG_OFFSET_MIN)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "data offset %u\n", pkt->rm_dataoffset); return; } if (__predict_false(pkt->rm_oobdataoffset > 0 && IS_OFFSET_INVALID(pkt->rm_oobdataoffset))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob offset %u\n", pkt->rm_oobdataoffset); return; } if (__predict_true(pkt->rm_pktinfooffset > 0) && __predict_false(IS_OFFSET_INVALID(pkt->rm_pktinfooffset))) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo offset %u\n", pkt->rm_pktinfooffset); return; } #undef IS_OFFSET_INVALID data_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_dataoffset); data_len = pkt->rm_datalen; pktinfo_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_pktinfooffset); pktinfo_len = pkt->rm_pktinfolen; /* * Check OOB coverage. */ if (__predict_false(pkt->rm_oobdatalen != 0)) { int oob_off, oob_len; if_printf(rxr->hn_ifp, "got oobdata\n"); oob_off = RNDIS_PACKET_MSG_OFFSET_ABS(pkt->rm_oobdataoffset); oob_len = pkt->rm_oobdatalen; if (__predict_false(oob_off + oob_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overflow, msglen %u, oob abs %d len %d\n", pkt->rm_len, oob_off, oob_len); return; } /* * Check against data. */ if (hn_rndis_check_overlap(oob_off, oob_len, data_off, data_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overlaps data, oob abs %d len %d, " "data abs %d len %d\n", oob_off, oob_len, data_off, data_len); return; } /* * Check against pktinfo. */ if (pktinfo_len != 0 && hn_rndis_check_overlap(oob_off, oob_len, pktinfo_off, pktinfo_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "oob overlaps pktinfo, oob abs %d len %d, " "pktinfo abs %d len %d\n", oob_off, oob_len, pktinfo_off, pktinfo_len); return; } } /* * Check per-packet-info coverage and find useful per-packet-info. */ info.vlan_info = HN_NDIS_VLAN_INFO_INVALID; info.csum_info = HN_NDIS_RXCSUM_INFO_INVALID; info.hash_info = HN_NDIS_HASH_INFO_INVALID; if (__predict_true(pktinfo_len != 0)) { bool overlap; int error; if (__predict_false(pktinfo_off + pktinfo_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo overflow, msglen %u, " "pktinfo abs %d len %d\n", pkt->rm_len, pktinfo_off, pktinfo_len); return; } /* * Check packet info coverage. */ overlap = hn_rndis_check_overlap(pktinfo_off, pktinfo_len, data_off, data_len); if (__predict_false(overlap)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "pktinfo overlap data, pktinfo abs %d len %d, " "data abs %d len %d\n", pktinfo_off, pktinfo_len, data_off, data_len); return; } /* * Find useful per-packet-info. */ error = hn_rndis_rxinfo(((const uint8_t *)pkt) + pktinfo_off, pktinfo_len, &info); if (__predict_false(error)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg " "pktinfo\n"); return; } } if (__predict_false(data_off + data_len > pkt->rm_len)) { if_printf(rxr->hn_ifp, "invalid RNDIS packet msg, " "data overflow, msglen %u, data abs %d len %d\n", pkt->rm_len, data_off, data_len); return; } hn_rxpkt(rxr, ((const uint8_t *)pkt) + data_off, data_len, &info); } static __inline void hn_rndis_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen) { const struct rndis_msghdr *hdr; if (__predict_false(dlen < sizeof(*hdr))) { if_printf(rxr->hn_ifp, "invalid RNDIS msg\n"); return; } hdr = data; if (__predict_true(hdr->rm_type == REMOTE_NDIS_PACKET_MSG)) { /* Hot data path. */ hn_rndis_rx_data(rxr, data, dlen); /* Done! */ return; } if (hdr->rm_type == REMOTE_NDIS_INDICATE_STATUS_MSG) hn_rndis_rx_status(rxr->hn_ifp->if_softc, data, dlen); else hn_rndis_rx_ctrl(rxr->hn_ifp->if_softc, data, dlen); } static void hn_nvs_handle_notify(struct hn_softc *sc, const struct vmbus_chanpkt_hdr *pkt) { const struct hn_nvs_hdr *hdr; if (VMBUS_CHANPKT_DATALEN(pkt) < sizeof(*hdr)) { if_printf(sc->hn_ifp, "invalid nvs notify\n"); return; } hdr = VMBUS_CHANPKT_CONST_DATA(pkt); if (hdr->nvs_type == HN_NVS_TYPE_TXTBL_NOTE) { /* Useless; ignore */ return; } if_printf(sc->hn_ifp, "got notify, nvs type %u\n", hdr->nvs_type); } static void hn_nvs_handle_comp(struct hn_softc *sc, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkt) { struct hn_nvs_sendctx *sndc; sndc = (struct hn_nvs_sendctx *)(uintptr_t)pkt->cph_xactid; sndc->hn_cb(sndc, sc, chan, VMBUS_CHANPKT_CONST_DATA(pkt), VMBUS_CHANPKT_DATALEN(pkt)); /* * NOTE: * 'sndc' CAN NOT be accessed anymore, since it can be freed by * its callback. */ } static void hn_nvs_handle_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, const struct vmbus_chanpkt_hdr *pkthdr) { const struct vmbus_chanpkt_rxbuf *pkt; const struct hn_nvs_hdr *nvs_hdr; int count, i, hlen; if (__predict_false(VMBUS_CHANPKT_DATALEN(pkthdr) < sizeof(*nvs_hdr))) { if_printf(rxr->hn_ifp, "invalid nvs RNDIS\n"); return; } nvs_hdr = VMBUS_CHANPKT_CONST_DATA(pkthdr); /* Make sure that this is a RNDIS message. */ if (__predict_false(nvs_hdr->nvs_type != HN_NVS_TYPE_RNDIS)) { if_printf(rxr->hn_ifp, "nvs type %u, not RNDIS\n", nvs_hdr->nvs_type); return; } hlen = VMBUS_CHANPKT_GETLEN(pkthdr->cph_hlen); if (__predict_false(hlen < sizeof(*pkt))) { if_printf(rxr->hn_ifp, "invalid rxbuf chanpkt\n"); return; } pkt = (const struct vmbus_chanpkt_rxbuf *)pkthdr; if (__predict_false(pkt->cp_rxbuf_id != HN_NVS_RXBUF_SIG)) { if_printf(rxr->hn_ifp, "invalid rxbuf_id 0x%08x\n", pkt->cp_rxbuf_id); return; } count = pkt->cp_rxbuf_cnt; if (__predict_false(hlen < __offsetof(struct vmbus_chanpkt_rxbuf, cp_rxbuf[count]))) { if_printf(rxr->hn_ifp, "invalid rxbuf_cnt %d\n", count); return; } /* Each range represents 1 RNDIS pkt that contains 1 Ethernet frame */ for (i = 0; i < count; ++i) { int ofs, len; ofs = pkt->cp_rxbuf[i].rb_ofs; len = pkt->cp_rxbuf[i].rb_len; if (__predict_false(ofs + len > HN_RXBUF_SIZE)) { if_printf(rxr->hn_ifp, "%dth RNDIS msg overflow rxbuf, " "ofs %d, len %d\n", i, ofs, len); continue; } hn_rndis_rxpkt(rxr, rxr->hn_rxbuf + ofs, len); } /* * Ack the consumed RXBUF associated w/ this channel packet, * so that this RXBUF can be recycled by the hypervisor. */ hn_nvs_ack_rxbuf(rxr, chan, pkt->cp_hdr.cph_xactid); } static void hn_nvs_ack_rxbuf(struct hn_rx_ring *rxr, struct vmbus_channel *chan, uint64_t tid) { struct hn_nvs_rndis_ack ack; int retries, error; ack.nvs_type = HN_NVS_TYPE_RNDIS_ACK; ack.nvs_status = HN_NVS_STATUS_OK; retries = 0; again: error = vmbus_chan_send(chan, VMBUS_CHANPKT_TYPE_COMP, VMBUS_CHANPKT_FLAG_NONE, &ack, sizeof(ack), tid); if (__predict_false(error == EAGAIN)) { /* * NOTE: * This should _not_ happen in real world, since the * consumption of the TX bufring from the TX path is * controlled. */ if (rxr->hn_ack_failed == 0) if_printf(rxr->hn_ifp, "RXBUF ack retry\n"); rxr->hn_ack_failed++; retries++; if (retries < 10) { DELAY(100); goto again; } /* RXBUF leaks! */ if_printf(rxr->hn_ifp, "RXBUF ack failed\n"); } } static void hn_chan_callback(struct vmbus_channel *chan, void *xrxr) { struct hn_rx_ring *rxr = xrxr; struct hn_softc *sc = rxr->hn_ifp->if_softc; for (;;) { struct vmbus_chanpkt_hdr *pkt = rxr->hn_pktbuf; int error, pktlen; pktlen = rxr->hn_pktbuf_len; error = vmbus_chan_recv_pkt(chan, pkt, &pktlen); if (__predict_false(error == ENOBUFS)) { void *nbuf; int nlen; /* * Expand channel packet buffer. * * XXX * Use M_WAITOK here, since allocation failure * is fatal. */ nlen = rxr->hn_pktbuf_len * 2; while (nlen < pktlen) nlen *= 2; nbuf = malloc(nlen, M_DEVBUF, M_WAITOK); if_printf(rxr->hn_ifp, "expand pktbuf %d -> %d\n", rxr->hn_pktbuf_len, nlen); free(rxr->hn_pktbuf, M_DEVBUF); rxr->hn_pktbuf = nbuf; rxr->hn_pktbuf_len = nlen; /* Retry! */ continue; } else if (__predict_false(error == EAGAIN)) { /* No more channel packets; done! */ break; } KASSERT(!error, ("vmbus_chan_recv_pkt failed: %d", error)); switch (pkt->cph_type) { case VMBUS_CHANPKT_TYPE_COMP: hn_nvs_handle_comp(sc, chan, pkt); break; case VMBUS_CHANPKT_TYPE_RXBUF: hn_nvs_handle_rxbuf(rxr, chan, pkt); break; case VMBUS_CHANPKT_TYPE_INBAND: hn_nvs_handle_notify(sc, pkt); break; default: if_printf(rxr->hn_ifp, "unknown chan pkt %u\n", pkt->cph_type); break; } } hn_chan_rollup(rxr, rxr->hn_txr); } static void hn_sysinit(void *arg __unused) { int i; #ifdef HN_IFSTART_SUPPORT /* * Don't use ifnet.if_start if transparent VF mode is requested; * mainly due to the IFF_DRV_OACTIVE flag. */ if (hn_xpnt_vf && hn_use_if_start) { hn_use_if_start = 0; printf("hn: tranparent VF mode, if_transmit will be used, " "instead of if_start\n"); } #endif if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) { printf("hn: invalid transparent VF attach routing " "wait timeout %d, reset to %d\n", hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN); hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN; } /* * Initialize VF map. */ rm_init_flags(&hn_vfmap_lock, "hn_vfmap", RM_SLEEPABLE); hn_vfmap_size = HN_VFMAP_SIZE_DEF; hn_vfmap = malloc(sizeof(struct ifnet *) * hn_vfmap_size, M_DEVBUF, M_WAITOK | M_ZERO); /* * Fix the # of TX taskqueues. */ if (hn_tx_taskq_cnt <= 0) hn_tx_taskq_cnt = 1; else if (hn_tx_taskq_cnt > mp_ncpus) hn_tx_taskq_cnt = mp_ncpus; /* * Fix the TX taskqueue mode. */ switch (hn_tx_taskq_mode) { case HN_TX_TASKQ_M_INDEP: case HN_TX_TASKQ_M_GLOBAL: case HN_TX_TASKQ_M_EVTTQ: break; default: hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; break; } if (vm_guest != VM_GUEST_HV) return; if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) return; hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), M_DEVBUF, M_WAITOK); for (i = 0; i < hn_tx_taskq_cnt; ++i) { hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, taskqueue_thread_enqueue, &hn_tx_taskque[i]); taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, "hn tx%d", i); } } SYSINIT(hn_sysinit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysinit, NULL); static void hn_sysuninit(void *arg __unused) { if (hn_tx_taskque != NULL) { int i; for (i = 0; i < hn_tx_taskq_cnt; ++i) taskqueue_free(hn_tx_taskque[i]); free(hn_tx_taskque, M_DEVBUF); } if (hn_vfmap != NULL) free(hn_vfmap, M_DEVBUF); rm_destroy(&hn_vfmap_lock); } SYSUNINIT(hn_sysuninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_sysuninit, NULL); Index: head/sys/dev/hyperv/netvsc/if_hnreg.h =================================================================== --- head/sys/dev/hyperv/netvsc/if_hnreg.h (revision 322487) +++ head/sys/dev/hyperv/netvsc/if_hnreg.h (revision 322488) @@ -1,256 +1,256 @@ /*- - * Copyright (c) 2016 Microsoft Corp. + * Copyright (c) 2016-2017 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _IF_HNREG_H_ #define _IF_HNREG_H_ #include #include /* * NDIS protocol version numbers */ #define HN_NDIS_VERSION_6_1 0x00060001 #define HN_NDIS_VERSION_6_20 0x00060014 #define HN_NDIS_VERSION_6_30 0x0006001e #define HN_NDIS_VERSION_MAJOR(ver) (((ver) & 0xffff0000) >> 16) #define HN_NDIS_VERSION_MINOR(ver) ((ver) & 0xffff) /* * NVS versions. */ #define HN_NVS_VERSION_1 0x00002 #define HN_NVS_VERSION_2 0x30002 #define HN_NVS_VERSION_4 0x40000 #define HN_NVS_VERSION_5 0x50000 #define HN_NVS_RXBUF_SIG 0xcafe #define HN_NVS_CHIM_SIG 0xface #define HN_NVS_CHIM_IDX_INVALID 0xffffffff #define HN_NVS_RNDIS_MTYPE_DATA 0 #define HN_NVS_RNDIS_MTYPE_CTRL 1 /* * NVS message transacion status codes. */ #define HN_NVS_STATUS_OK 1 #define HN_NVS_STATUS_FAILED 2 /* * NVS request/response message types. */ #define HN_NVS_TYPE_INIT 1 #define HN_NVS_TYPE_INIT_RESP 2 #define HN_NVS_TYPE_NDIS_INIT 100 #define HN_NVS_TYPE_RXBUF_CONN 101 #define HN_NVS_TYPE_RXBUF_CONNRESP 102 #define HN_NVS_TYPE_RXBUF_DISCONN 103 #define HN_NVS_TYPE_CHIM_CONN 104 #define HN_NVS_TYPE_CHIM_CONNRESP 105 #define HN_NVS_TYPE_CHIM_DISCONN 106 #define HN_NVS_TYPE_RNDIS 107 #define HN_NVS_TYPE_RNDIS_ACK 108 #define HN_NVS_TYPE_NDIS_CONF 125 #define HN_NVS_TYPE_VFASSOC_NOTE 128 /* notification */ #define HN_NVS_TYPE_SET_DATAPATH 129 #define HN_NVS_TYPE_SUBCH_REQ 133 #define HN_NVS_TYPE_SUBCH_RESP 133 /* same as SUBCH_REQ */ #define HN_NVS_TYPE_TXTBL_NOTE 134 /* notification */ /* * Any size less than this one will _not_ work, e.g. hn_nvs_init * only has 12B valid data, however, if only 12B data were sent, * Hypervisor would never reply. */ #define HN_NVS_REQSIZE_MIN 32 /* NVS message common header */ struct hn_nvs_hdr { uint32_t nvs_type; } __packed; struct hn_nvs_init { uint32_t nvs_type; /* HN_NVS_TYPE_INIT */ uint32_t nvs_ver_min; uint32_t nvs_ver_max; uint8_t nvs_rsvd[20]; } __packed; CTASSERT(sizeof(struct hn_nvs_init) >= HN_NVS_REQSIZE_MIN); struct hn_nvs_init_resp { uint32_t nvs_type; /* HN_NVS_TYPE_INIT_RESP */ uint32_t nvs_ver; /* deprecated */ uint32_t nvs_rsvd; uint32_t nvs_status; /* HN_NVS_STATUS_ */ } __packed; /* No reponse */ struct hn_nvs_ndis_conf { uint32_t nvs_type; /* HN_NVS_TYPE_NDIS_CONF */ uint32_t nvs_mtu; uint32_t nvs_rsvd; uint64_t nvs_caps; /* HN_NVS_NDIS_CONF_ */ uint8_t nvs_rsvd1[12]; } __packed; CTASSERT(sizeof(struct hn_nvs_ndis_conf) >= HN_NVS_REQSIZE_MIN); #define HN_NVS_NDIS_CONF_SRIOV 0x0004 #define HN_NVS_NDIS_CONF_VLAN 0x0008 /* No response */ struct hn_nvs_ndis_init { uint32_t nvs_type; /* HN_NVS_TYPE_NDIS_INIT */ uint32_t nvs_ndis_major; /* NDIS_VERSION_MAJOR_ */ uint32_t nvs_ndis_minor; /* NDIS_VERSION_MINOR_ */ uint8_t nvs_rsvd[20]; } __packed; CTASSERT(sizeof(struct hn_nvs_ndis_init) >= HN_NVS_REQSIZE_MIN); #define HN_NVS_DATAPATH_SYNTH 0 #define HN_NVS_DATAPATH_VF 1 /* No response */ struct hn_nvs_datapath { uint32_t nvs_type; /* HN_NVS_TYPE_SET_DATAPATH */ uint32_t nvs_active_path;/* HN_NVS_DATAPATH_* */ uint32_t nvs_rsvd[6]; } __packed; CTASSERT(sizeof(struct hn_nvs_datapath) >= HN_NVS_REQSIZE_MIN); struct hn_nvs_rxbuf_conn { uint32_t nvs_type; /* HN_NVS_TYPE_RXBUF_CONN */ uint32_t nvs_gpadl; /* RXBUF vmbus GPADL */ uint16_t nvs_sig; /* HN_NVS_RXBUF_SIG */ uint8_t nvs_rsvd[22]; } __packed; CTASSERT(sizeof(struct hn_nvs_rxbuf_conn) >= HN_NVS_REQSIZE_MIN); struct hn_nvs_rxbuf_sect { uint32_t nvs_start; uint32_t nvs_slotsz; uint32_t nvs_slotcnt; uint32_t nvs_end; } __packed; struct hn_nvs_rxbuf_connresp { uint32_t nvs_type; /* HN_NVS_TYPE_RXBUF_CONNRESP */ uint32_t nvs_status; /* HN_NVS_STATUS_ */ uint32_t nvs_nsect; /* # of elem in nvs_sect */ struct hn_nvs_rxbuf_sect nvs_sect[]; } __packed; /* No response */ struct hn_nvs_rxbuf_disconn { uint32_t nvs_type; /* HN_NVS_TYPE_RXBUF_DISCONN */ uint16_t nvs_sig; /* HN_NVS_RXBUF_SIG */ uint8_t nvs_rsvd[26]; } __packed; CTASSERT(sizeof(struct hn_nvs_rxbuf_disconn) >= HN_NVS_REQSIZE_MIN); struct hn_nvs_chim_conn { uint32_t nvs_type; /* HN_NVS_TYPE_CHIM_CONN */ uint32_t nvs_gpadl; /* chimney buf vmbus GPADL */ uint16_t nvs_sig; /* NDIS_NVS_CHIM_SIG */ uint8_t nvs_rsvd[22]; } __packed; CTASSERT(sizeof(struct hn_nvs_chim_conn) >= HN_NVS_REQSIZE_MIN); struct hn_nvs_chim_connresp { uint32_t nvs_type; /* HN_NVS_TYPE_CHIM_CONNRESP */ uint32_t nvs_status; /* HN_NVS_STATUS_ */ uint32_t nvs_sectsz; /* section size */ } __packed; /* No response */ struct hn_nvs_chim_disconn { uint32_t nvs_type; /* HN_NVS_TYPE_CHIM_DISCONN */ uint16_t nvs_sig; /* HN_NVS_CHIM_SIG */ uint8_t nvs_rsvd[26]; } __packed; CTASSERT(sizeof(struct hn_nvs_chim_disconn) >= HN_NVS_REQSIZE_MIN); #define HN_NVS_SUBCH_OP_ALLOC 1 struct hn_nvs_subch_req { uint32_t nvs_type; /* HN_NVS_TYPE_SUBCH_REQ */ uint32_t nvs_op; /* HN_NVS_SUBCH_OP_ */ uint32_t nvs_nsubch; uint8_t nvs_rsvd[20]; } __packed; CTASSERT(sizeof(struct hn_nvs_subch_req) >= HN_NVS_REQSIZE_MIN); struct hn_nvs_subch_resp { uint32_t nvs_type; /* HN_NVS_TYPE_SUBCH_RESP */ uint32_t nvs_status; /* HN_NVS_STATUS_ */ uint32_t nvs_nsubch; } __packed; struct hn_nvs_rndis { uint32_t nvs_type; /* HN_NVS_TYPE_RNDIS */ uint32_t nvs_rndis_mtype;/* HN_NVS_RNDIS_MTYPE_ */ /* * Chimney sending buffer index and size. * * NOTE: * If nvs_chim_idx is set to HN_NVS_CHIM_IDX_INVALID * and nvs_chim_sz is set to 0, then chimney sending * buffer is _not_ used by this RNDIS message. */ uint32_t nvs_chim_idx; uint32_t nvs_chim_sz; uint8_t nvs_rsvd[16]; } __packed; CTASSERT(sizeof(struct hn_nvs_rndis) >= HN_NVS_REQSIZE_MIN); struct hn_nvs_rndis_ack { uint32_t nvs_type; /* HN_NVS_TYPE_RNDIS_ACK */ uint32_t nvs_status; /* HN_NVS_STATUS_ */ uint8_t nvs_rsvd[24]; } __packed; CTASSERT(sizeof(struct hn_nvs_rndis_ack) >= HN_NVS_REQSIZE_MIN); /* * RNDIS extension */ /* Per-packet hash info */ #define HN_NDIS_HASH_INFO_SIZE sizeof(uint32_t) #define HN_NDIS_PKTINFO_TYPE_HASHINF NDIS_PKTINFO_TYPE_ORIG_NBLIST /* NDIS_HASH_ */ /* Per-packet hash value */ #define HN_NDIS_HASH_VALUE_SIZE sizeof(uint32_t) #define HN_NDIS_PKTINFO_TYPE_HASHVAL NDIS_PKTINFO_TYPE_PKT_CANCELID /* Per-packet-info size */ #define HN_RNDIS_PKTINFO_SIZE(dlen) \ __offsetof(struct rndis_pktinfo, rm_data[dlen]) #endif /* !_IF_HNREG_H_ */ Index: head/sys/dev/hyperv/netvsc/if_hnvar.h =================================================================== --- head/sys/dev/hyperv/netvsc/if_hnvar.h (revision 322487) +++ head/sys/dev/hyperv/netvsc/if_hnvar.h (revision 322488) @@ -1,315 +1,315 @@ /*- - * Copyright (c) 2016 Microsoft Corp. + * Copyright (c) 2016-2017 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef _IF_HNVAR_H_ #define _IF_HNVAR_H_ #define HN_USE_TXDESC_BUFRING #define HN_CHIM_SIZE (15 * 1024 * 1024) #define HN_RXBUF_SIZE (16 * 1024 * 1024) #define HN_RXBUF_SIZE_COMPAT (15 * 1024 * 1024) /* Claimed to be 12232B */ #define HN_MTU_MAX (9 * 1024) #define HN_TXBR_SIZE (128 * PAGE_SIZE) #define HN_RXBR_SIZE (128 * PAGE_SIZE) #define HN_XACT_REQ_PGCNT 2 #define HN_XACT_RESP_PGCNT 2 #define HN_XACT_REQ_SIZE (HN_XACT_REQ_PGCNT * PAGE_SIZE) #define HN_XACT_RESP_SIZE (HN_XACT_RESP_PGCNT * PAGE_SIZE) #define HN_GPACNT_MAX 32 struct hn_txdesc; #ifndef HN_USE_TXDESC_BUFRING SLIST_HEAD(hn_txdesc_list, hn_txdesc); #else struct buf_ring; #endif struct hn_tx_ring; struct hn_rx_ring { struct ifnet *hn_ifp; struct ifnet *hn_rxvf_ifp; /* SR-IOV VF for RX */ struct hn_tx_ring *hn_txr; void *hn_pktbuf; int hn_pktbuf_len; int hn_rx_flags; /* HN_RX_FLAG_ */ uint8_t *hn_rxbuf; /* shadow sc->hn_rxbuf */ int hn_rx_idx; /* Trust csum verification on host side */ int hn_trust_hcsum; /* HN_TRUST_HCSUM_ */ struct lro_ctrl hn_lro; u_long hn_csum_ip; u_long hn_csum_tcp; u_long hn_csum_udp; u_long hn_csum_trusted; u_long hn_lro_tried; u_long hn_small_pkts; u_long hn_pkts; u_long hn_rss_pkts; u_long hn_ack_failed; /* Rarely used stuffs */ struct sysctl_oid *hn_rx_sysctl_tree; void *hn_br; /* TX/RX bufring */ struct hyperv_dma hn_br_dma; struct vmbus_channel *hn_chan; } __aligned(CACHE_LINE_SIZE); #define HN_TRUST_HCSUM_IP 0x0001 #define HN_TRUST_HCSUM_TCP 0x0002 #define HN_TRUST_HCSUM_UDP 0x0004 #define HN_RX_FLAG_ATTACHED 0x0001 #define HN_RX_FLAG_BR_REF 0x0002 #define HN_RX_FLAG_XPNT_VF 0x0004 struct hn_tx_ring { #ifndef HN_USE_TXDESC_BUFRING struct mtx hn_txlist_spin; struct hn_txdesc_list hn_txlist; #else struct buf_ring *hn_txdesc_br; #endif int hn_txdesc_cnt; int hn_txdesc_avail; u_short hn_has_txeof; u_short hn_txdone_cnt; int hn_sched_tx; void (*hn_txeof)(struct hn_tx_ring *); struct taskqueue *hn_tx_taskq; struct task hn_tx_task; struct task hn_txeof_task; struct buf_ring *hn_mbuf_br; int hn_oactive; int hn_tx_idx; int hn_tx_flags; struct mtx hn_tx_lock; struct hn_softc *hn_sc; struct vmbus_channel *hn_chan; int hn_direct_tx_size; int hn_chim_size; bus_dma_tag_t hn_tx_data_dtag; uint64_t hn_csum_assist; /* Applied packet transmission aggregation limits. */ int hn_agg_szmax; short hn_agg_pktmax; short hn_agg_align; /* Packet transmission aggregation states. */ struct hn_txdesc *hn_agg_txd; int hn_agg_szleft; short hn_agg_pktleft; struct rndis_packet_msg *hn_agg_prevpkt; /* Temporary stats for each sends. */ int hn_stat_size; short hn_stat_pkts; short hn_stat_mcasts; int (*hn_sendpkt)(struct hn_tx_ring *, struct hn_txdesc *); int hn_suspended; int hn_gpa_cnt; struct vmbus_gpa hn_gpa[HN_GPACNT_MAX]; u_long hn_no_txdescs; u_long hn_send_failed; u_long hn_txdma_failed; u_long hn_tx_collapsed; u_long hn_tx_chimney_tried; u_long hn_tx_chimney; u_long hn_pkts; u_long hn_sends; u_long hn_flush_failed; /* Rarely used stuffs */ struct hn_txdesc *hn_txdesc; bus_dma_tag_t hn_tx_rndis_dtag; struct sysctl_oid *hn_tx_sysctl_tree; } __aligned(CACHE_LINE_SIZE); #define HN_TX_FLAG_ATTACHED 0x0001 #define HN_TX_FLAG_HASHVAL 0x0002 /* support HASHVAL pktinfo */ /* * Device-specific softc structure */ struct hn_softc { struct ifnet *hn_ifp; struct ifmedia hn_media; device_t hn_dev; int hn_if_flags; struct sx hn_lock; struct vmbus_channel *hn_prichan; int hn_rx_ring_cnt; int hn_rx_ring_inuse; struct hn_rx_ring *hn_rx_ring; struct rmlock hn_vf_lock; struct ifnet *hn_vf_ifp; /* SR-IOV VF */ uint32_t hn_xvf_flags; /* transparent VF flags */ int hn_tx_ring_cnt; int hn_tx_ring_inuse; struct hn_tx_ring *hn_tx_ring; uint8_t *hn_chim; u_long *hn_chim_bmap; int hn_chim_bmap_cnt; int hn_chim_cnt; int hn_chim_szmax; int hn_cpu; struct taskqueue **hn_tx_taskqs; struct sysctl_oid *hn_tx_sysctl_tree; struct sysctl_oid *hn_rx_sysctl_tree; struct vmbus_xact_ctx *hn_xact; uint32_t hn_nvs_ver; uint32_t hn_rx_filter; /* Packet transmission aggregation user settings. */ int hn_agg_size; int hn_agg_pkts; struct taskqueue *hn_mgmt_taskq; struct taskqueue *hn_mgmt_taskq0; struct task hn_link_task; struct task hn_netchg_init; struct timeout_task hn_netchg_status; uint32_t hn_link_flags; /* HN_LINK_FLAG_ */ uint32_t hn_caps; /* HN_CAP_ */ uint32_t hn_flags; /* HN_FLAG_ */ u_int hn_pollhz; void *hn_rxbuf; uint32_t hn_rxbuf_gpadl; struct hyperv_dma hn_rxbuf_dma; uint32_t hn_chim_gpadl; struct hyperv_dma hn_chim_dma; uint32_t hn_rndis_rid; uint32_t hn_ndis_ver; int hn_ndis_tso_szmax; int hn_ndis_tso_sgmin; uint32_t hn_rndis_agg_size; uint32_t hn_rndis_agg_pkts; uint32_t hn_rndis_agg_align; int hn_rss_ind_size; uint32_t hn_rss_hash; /* NDIS_HASH_ */ struct ndis_rssprm_toeplitz hn_rss; eventhandler_tag hn_ifaddr_evthand; eventhandler_tag hn_ifnet_evthand; eventhandler_tag hn_ifnet_atthand; eventhandler_tag hn_ifnet_dethand; eventhandler_tag hn_ifnet_lnkhand; /* * Transparent VF delayed initialization. */ int hn_vf_rdytick; /* ticks, 0 == ready */ struct taskqueue *hn_vf_taskq; struct timeout_task hn_vf_init; /* * Saved information for VF under transparent mode. */ void (*hn_vf_input) (struct ifnet *, struct mbuf *); int hn_saved_caps; u_int hn_saved_tsomax; u_int hn_saved_tsosegcnt; u_int hn_saved_tsosegsz; }; #define HN_FLAG_RXBUF_CONNECTED 0x0001 #define HN_FLAG_CHIM_CONNECTED 0x0002 #define HN_FLAG_HAS_RSSKEY 0x0004 #define HN_FLAG_HAS_RSSIND 0x0008 #define HN_FLAG_SYNTH_ATTACHED 0x0010 #define HN_FLAG_NO_SLEEPING 0x0020 #define HN_FLAG_RXBUF_REF 0x0040 #define HN_FLAG_CHIM_REF 0x0080 #define HN_FLAG_RXVF 0x0100 #define HN_FLAG_ERRORS (HN_FLAG_RXBUF_REF | HN_FLAG_CHIM_REF) #define HN_XVFFLAG_ENABLED 0x0001 #define HN_XVFFLAG_ACCBPF 0x0002 #define HN_NO_SLEEPING(sc) \ do { \ (sc)->hn_flags |= HN_FLAG_NO_SLEEPING; \ } while (0) #define HN_SLEEPING_OK(sc) \ do { \ (sc)->hn_flags &= ~HN_FLAG_NO_SLEEPING; \ } while (0) #define HN_CAN_SLEEP(sc) \ (((sc)->hn_flags & HN_FLAG_NO_SLEEPING) == 0) #define HN_CAP_VLAN 0x0001 #define HN_CAP_MTU 0x0002 #define HN_CAP_IPCS 0x0004 #define HN_CAP_TCP4CS 0x0008 #define HN_CAP_TCP6CS 0x0010 #define HN_CAP_UDP4CS 0x0020 #define HN_CAP_UDP6CS 0x0040 #define HN_CAP_TSO4 0x0080 #define HN_CAP_TSO6 0x0100 #define HN_CAP_HASHVAL 0x0200 /* Capability description for use with printf(9) %b identifier. */ #define HN_CAP_BITS \ "\020\1VLAN\2MTU\3IPCS\4TCP4CS\5TCP6CS" \ "\6UDP4CS\7UDP6CS\10TSO4\11TSO6\12HASHVAL" #define HN_LINK_FLAG_LINKUP 0x0001 #define HN_LINK_FLAG_NETCHG 0x0002 #endif /* !_IF_HNVAR_H_ */ Index: head/sys/dev/hyperv/pcib/vmbus_pcib.c =================================================================== --- head/sys/dev/hyperv/pcib/vmbus_pcib.c (revision 322487) +++ head/sys/dev/hyperv/pcib/vmbus_pcib.c (revision 322488) @@ -1,1798 +1,1798 @@ /*- - * Copyright (c) 2016 Microsoft Corp. + * Copyright (c) 2016-2017 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #ifdef NEW_PCIB #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pcib_if.h" #include #include #include #include #include #include #include #include "vmbus_if.h" #if __FreeBSD_version < 1100000 typedef u_long rman_res_t; #define RM_MAX_END (~(rman_res_t)0) #endif struct completion { unsigned int done; struct mtx lock; }; static void init_completion(struct completion *c) { memset(c, 0, sizeof(*c)); mtx_init(&c->lock, "hvcmpl", NULL, MTX_DEF); c->done = 0; } static void free_completion(struct completion *c) { mtx_destroy(&c->lock); } static void complete(struct completion *c) { mtx_lock(&c->lock); c->done++; mtx_unlock(&c->lock); wakeup(c); } static void wait_for_completion(struct completion *c) { mtx_lock(&c->lock); while (c->done == 0) mtx_sleep(c, &c->lock, 0, "hvwfc", 0); c->done--; mtx_unlock(&c->lock); } #define PCI_MAKE_VERSION(major, minor) ((uint32_t)(((major) << 16) | (major))) enum { PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), PCI_PROTOCOL_VERSION_CURRENT = PCI_PROTOCOL_VERSION_1_1 }; #define PCI_CONFIG_MMIO_LENGTH 0x2000 #define CFG_PAGE_OFFSET 0x1000 #define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET) /* * Message Types */ enum pci_message_type { /* * Version 1.1 */ PCI_MESSAGE_BASE = 0x42490000, PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0, PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1, PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4, PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5, PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6, PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7, PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8, PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9, PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA, PCI_EJECT = PCI_MESSAGE_BASE + 0xB, PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC, PCI_REENABLE = PCI_MESSAGE_BASE + 0xD, PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE, PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF, PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10, PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11, PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12, PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13, PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14, PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15, PCI_MESSAGE_MAXIMUM }; /* * Structures defining the virtual PCI Express protocol. */ union pci_version { struct { uint16_t minor_version; uint16_t major_version; } parts; uint32_t version; } __packed; /* * This representation is the one used in Windows, which is * what is expected when sending this back and forth with * the Hyper-V parent partition. */ union win_slot_encoding { struct { uint32_t slot:5; uint32_t func:3; uint32_t reserved:24; } bits; uint32_t val; } __packed; struct pci_func_desc { uint16_t v_id; /* vendor ID */ uint16_t d_id; /* device ID */ uint8_t rev; uint8_t prog_intf; uint8_t subclass; uint8_t base_class; uint32_t subsystem_id; union win_slot_encoding wslot; uint32_t ser; /* serial number */ } __packed; struct hv_msi_desc { uint8_t vector; uint8_t delivery_mode; uint16_t vector_count; uint32_t reserved; uint64_t cpu_mask; } __packed; struct tran_int_desc { uint16_t reserved; uint16_t vector_count; uint32_t data; uint64_t address; } __packed; struct pci_message { uint32_t type; } __packed; struct pci_child_message { struct pci_message message_type; union win_slot_encoding wslot; } __packed; struct pci_incoming_message { struct vmbus_chanpkt_hdr hdr; struct pci_message message_type; } __packed; struct pci_response { struct vmbus_chanpkt_hdr hdr; int32_t status; /* negative values are failures */ } __packed; struct pci_packet { void (*completion_func)(void *context, struct pci_response *resp, int resp_packet_size); void *compl_ctxt; struct pci_message message[0]; }; /* * Specific message types supporting the PCI protocol. */ struct pci_version_request { struct pci_message message_type; uint32_t protocol_version; uint32_t is_last_attempt:1; uint32_t reservedz:31; } __packed; struct pci_bus_d0_entry { struct pci_message message_type; uint32_t reserved; uint64_t mmio_base; } __packed; struct pci_bus_relations { struct pci_incoming_message incoming; uint32_t device_count; struct pci_func_desc func[0]; } __packed; #define MAX_NUM_BARS (PCIR_MAX_BAR_0 + 1) struct pci_q_res_req_response { struct vmbus_chanpkt_hdr hdr; int32_t status; /* negative values are failures */ uint32_t probed_bar[MAX_NUM_BARS]; } __packed; struct pci_resources_assigned { struct pci_message message_type; union win_slot_encoding wslot; uint8_t memory_range[0x14][MAX_NUM_BARS]; /* unused here */ uint32_t msi_descriptors; uint32_t reserved[4]; } __packed; struct pci_create_interrupt { struct pci_message message_type; union win_slot_encoding wslot; struct hv_msi_desc int_desc; } __packed; struct pci_create_int_response { struct pci_response response; uint32_t reserved; struct tran_int_desc int_desc; } __packed; struct pci_delete_interrupt { struct pci_message message_type; union win_slot_encoding wslot; struct tran_int_desc int_desc; } __packed; struct pci_dev_incoming { struct pci_incoming_message incoming; union win_slot_encoding wslot; } __packed; struct pci_eject_response { struct pci_message message_type; union win_slot_encoding wslot; uint32_t status; } __packed; /* * Driver specific state. */ enum hv_pcibus_state { hv_pcibus_init = 0, hv_pcibus_installed, }; struct hv_pcibus { device_t pcib; device_t pci_bus; struct vmbus_pcib_softc *sc; uint16_t pci_domain; enum hv_pcibus_state state; struct resource *cfg_res; struct completion query_completion, *query_comp; struct mtx config_lock; /* Avoid two threads writing index page */ struct mtx device_list_lock; /* Protect lists below */ TAILQ_HEAD(, hv_pci_dev) children; TAILQ_HEAD(, hv_dr_state) dr_list; volatile int detaching; }; struct hv_pci_dev { TAILQ_ENTRY(hv_pci_dev) link; struct pci_func_desc desc; bool reported_missing; struct hv_pcibus *hbus; struct task eject_task; TAILQ_HEAD(, hv_irq_desc) irq_desc_list; /* * What would be observed if one wrote 0xFFFFFFFF to a BAR and then * read it back, for each of the BAR offsets within config space. */ uint32_t probed_bar[MAX_NUM_BARS]; }; /* * Tracks "Device Relations" messages from the host, which must be both * processed in order. */ struct hv_dr_work { struct task task; struct hv_pcibus *bus; }; struct hv_dr_state { TAILQ_ENTRY(hv_dr_state) link; uint32_t device_count; struct pci_func_desc func[0]; }; struct hv_irq_desc { TAILQ_ENTRY(hv_irq_desc) link; struct tran_int_desc desc; int irq; }; #define PCI_DEVFN(slot, func) ((((slot) & 0x1f) << 3) | ((func) & 0x07)) #define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) #define PCI_FUNC(devfn) ((devfn) & 0x07) static uint32_t devfn_to_wslot(unsigned int devfn) { union win_slot_encoding wslot; wslot.val = 0; wslot.bits.slot = PCI_SLOT(devfn); wslot.bits.func = PCI_FUNC(devfn); return (wslot.val); } static unsigned int wslot_to_devfn(uint32_t wslot) { union win_slot_encoding encoding; unsigned int slot; unsigned int func; encoding.val = wslot; slot = encoding.bits.slot; func = encoding.bits.func; return (PCI_DEVFN(slot, func)); } struct vmbus_pcib_softc { struct vmbus_channel *chan; void *rx_buf; struct taskqueue *taskq; struct hv_pcibus *hbus; }; /* {44C4F61D-4444-4400-9D52-802E27EDE19F} */ static const struct hyperv_guid g_pass_through_dev_type = { .hv_guid = {0x1D, 0xF6, 0xC4, 0x44, 0x44, 0x44, 0x00, 0x44, 0x9D, 0x52, 0x80, 0x2E, 0x27, 0xED, 0xE1, 0x9F} }; struct hv_pci_compl { struct completion host_event; int32_t completion_status; }; struct q_res_req_compl { struct completion host_event; struct hv_pci_dev *hpdev; }; struct compose_comp_ctxt { struct hv_pci_compl comp_pkt; struct tran_int_desc int_desc; }; static void hv_pci_generic_compl(void *context, struct pci_response *resp, int resp_packet_size) { struct hv_pci_compl *comp_pkt = context; if (resp_packet_size >= sizeof(struct pci_response)) comp_pkt->completion_status = resp->status; else comp_pkt->completion_status = -1; complete(&comp_pkt->host_event); } static void q_resource_requirements(void *context, struct pci_response *resp, int resp_packet_size) { struct q_res_req_compl *completion = context; struct pci_q_res_req_response *q_res_req = (struct pci_q_res_req_response *)resp; int i; if (resp->status < 0) { printf("vmbus_pcib: failed to query resource requirements\n"); } else { for (i = 0; i < MAX_NUM_BARS; i++) completion->hpdev->probed_bar[i] = q_res_req->probed_bar[i]; } complete(&completion->host_event); } static void hv_pci_compose_compl(void *context, struct pci_response *resp, int resp_packet_size) { struct compose_comp_ctxt *comp_pkt = context; struct pci_create_int_response *int_resp = (struct pci_create_int_response *)resp; comp_pkt->comp_pkt.completion_status = resp->status; comp_pkt->int_desc = int_resp->int_desc; complete(&comp_pkt->comp_pkt.host_event); } static void hv_int_desc_free(struct hv_pci_dev *hpdev, struct hv_irq_desc *hid) { struct pci_delete_interrupt *int_pkt; struct { struct pci_packet pkt; uint8_t buffer[sizeof(struct pci_delete_interrupt)]; } ctxt; memset(&ctxt, 0, sizeof(ctxt)); int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; int_pkt->message_type.type = PCI_DELETE_INTERRUPT_MESSAGE; int_pkt->wslot.val = hpdev->desc.wslot.val; int_pkt->int_desc = hid->desc; vmbus_chan_send(hpdev->hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, int_pkt, sizeof(*int_pkt), 0); free(hid, M_DEVBUF); } static void hv_pci_delete_device(struct hv_pci_dev *hpdev) { struct hv_pcibus *hbus = hpdev->hbus; struct hv_irq_desc *hid, *tmp_hid; device_t pci_dev; int devfn; devfn = wslot_to_devfn(hpdev->desc.wslot.val); mtx_lock(&Giant); pci_dev = pci_find_dbsf(hbus->pci_domain, 0, PCI_SLOT(devfn), PCI_FUNC(devfn)); if (pci_dev) device_delete_child(hbus->pci_bus, pci_dev); mtx_unlock(&Giant); mtx_lock(&hbus->device_list_lock); TAILQ_REMOVE(&hbus->children, hpdev, link); mtx_unlock(&hbus->device_list_lock); TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid) hv_int_desc_free(hpdev, hid); free(hpdev, M_DEVBUF); } static struct hv_pci_dev * new_pcichild_device(struct hv_pcibus *hbus, struct pci_func_desc *desc) { struct hv_pci_dev *hpdev; struct pci_child_message *res_req; struct q_res_req_compl comp_pkt; struct { struct pci_packet pkt; uint8_t buffer[sizeof(struct pci_child_message)]; } ctxt; int ret; hpdev = malloc(sizeof(*hpdev), M_DEVBUF, M_WAITOK | M_ZERO); hpdev->hbus = hbus; TAILQ_INIT(&hpdev->irq_desc_list); init_completion(&comp_pkt.host_event); comp_pkt.hpdev = hpdev; ctxt.pkt.compl_ctxt = &comp_pkt; ctxt.pkt.completion_func = q_resource_requirements; res_req = (struct pci_child_message *)&ctxt.pkt.message; res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS; res_req->wslot.val = desc->wslot.val; ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, res_req, sizeof(*res_req), (uint64_t)&ctxt.pkt); if (ret) goto err; wait_for_completion(&comp_pkt.host_event); free_completion(&comp_pkt.host_event); hpdev->desc = *desc; mtx_lock(&hbus->device_list_lock); if (TAILQ_EMPTY(&hbus->children)) hbus->pci_domain = desc->ser & 0xFFFF; TAILQ_INSERT_TAIL(&hbus->children, hpdev, link); mtx_unlock(&hbus->device_list_lock); return (hpdev); err: free_completion(&comp_pkt.host_event); free(hpdev, M_DEVBUF); return (NULL); } #if __FreeBSD_version < 1100000 /* Old versions don't have BUS_RESCAN(). Let's copy it from FreeBSD 11. */ static struct pci_devinfo * pci_identify_function(device_t pcib, device_t dev, int domain, int busno, int slot, int func, size_t dinfo_size) { struct pci_devinfo *dinfo; dinfo = pci_read_device(pcib, domain, busno, slot, func, dinfo_size); if (dinfo != NULL) pci_add_child(dev, dinfo); return (dinfo); } static int pci_rescan(device_t dev) { #define REG(n, w) PCIB_READ_CONFIG(pcib, busno, s, f, n, w) device_t pcib = device_get_parent(dev); struct pci_softc *sc; device_t child, *devlist, *unchanged; int devcount, error, i, j, maxslots, oldcount; int busno, domain, s, f, pcifunchigh; uint8_t hdrtype; /* No need to check for ARI on a rescan. */ error = device_get_children(dev, &devlist, &devcount); if (error) return (error); if (devcount != 0) { unchanged = malloc(devcount * sizeof(device_t), M_TEMP, M_NOWAIT | M_ZERO); if (unchanged == NULL) { free(devlist, M_TEMP); return (ENOMEM); } } else unchanged = NULL; sc = device_get_softc(dev); domain = pcib_get_domain(dev); busno = pcib_get_bus(dev); maxslots = PCIB_MAXSLOTS(pcib); for (s = 0; s <= maxslots; s++) { /* If function 0 is not present, skip to the next slot. */ f = 0; if (REG(PCIR_VENDOR, 2) == 0xffff) continue; pcifunchigh = 0; hdrtype = REG(PCIR_HDRTYPE, 1); if ((hdrtype & PCIM_HDRTYPE) > PCI_MAXHDRTYPE) continue; if (hdrtype & PCIM_MFDEV) pcifunchigh = PCIB_MAXFUNCS(pcib); for (f = 0; f <= pcifunchigh; f++) { if (REG(PCIR_VENDOR, 2) == 0xffff) continue; /* * Found a valid function. Check if a * device_t for this device already exists. */ for (i = 0; i < devcount; i++) { child = devlist[i]; if (child == NULL) continue; if (pci_get_slot(child) == s && pci_get_function(child) == f) { unchanged[i] = child; goto next_func; } } pci_identify_function(pcib, dev, domain, busno, s, f, sizeof(struct pci_devinfo)); next_func:; } } /* Remove devices that are no longer present. */ for (i = 0; i < devcount; i++) { if (unchanged[i] != NULL) continue; device_delete_child(dev, devlist[i]); } free(devlist, M_TEMP); oldcount = devcount; /* Try to attach the devices just added. */ error = device_get_children(dev, &devlist, &devcount); if (error) { free(unchanged, M_TEMP); return (error); } for (i = 0; i < devcount; i++) { for (j = 0; j < oldcount; j++) { if (devlist[i] == unchanged[j]) goto next_device; } device_probe_and_attach(devlist[i]); next_device:; } free(unchanged, M_TEMP); free(devlist, M_TEMP); return (0); #undef REG } #else static int pci_rescan(device_t dev) { return (BUS_RESCAN(dev)); } #endif static void pci_devices_present_work(void *arg, int pending __unused) { struct hv_dr_work *dr_wrk = arg; struct hv_dr_state *dr = NULL; struct hv_pcibus *hbus; uint32_t child_no; bool found; struct pci_func_desc *new_desc; struct hv_pci_dev *hpdev, *tmp_hpdev; struct completion *query_comp; bool need_rescan = false; hbus = dr_wrk->bus; free(dr_wrk, M_DEVBUF); /* Pull this off the queue and process it if it was the last one. */ mtx_lock(&hbus->device_list_lock); while (!TAILQ_EMPTY(&hbus->dr_list)) { dr = TAILQ_FIRST(&hbus->dr_list); TAILQ_REMOVE(&hbus->dr_list, dr, link); /* Throw this away if the list still has stuff in it. */ if (!TAILQ_EMPTY(&hbus->dr_list)) { free(dr, M_DEVBUF); continue; } } mtx_unlock(&hbus->device_list_lock); if (!dr) return; /* First, mark all existing children as reported missing. */ mtx_lock(&hbus->device_list_lock); TAILQ_FOREACH(hpdev, &hbus->children, link) hpdev->reported_missing = true; mtx_unlock(&hbus->device_list_lock); /* Next, add back any reported devices. */ for (child_no = 0; child_no < dr->device_count; child_no++) { found = false; new_desc = &dr->func[child_no]; mtx_lock(&hbus->device_list_lock); TAILQ_FOREACH(hpdev, &hbus->children, link) { if ((hpdev->desc.wslot.val == new_desc->wslot.val) && (hpdev->desc.v_id == new_desc->v_id) && (hpdev->desc.d_id == new_desc->d_id) && (hpdev->desc.ser == new_desc->ser)) { hpdev->reported_missing = false; found = true; break; } } mtx_unlock(&hbus->device_list_lock); if (!found) { if (!need_rescan) need_rescan = true; hpdev = new_pcichild_device(hbus, new_desc); if (!hpdev) printf("vmbus_pcib: failed to add a child\n"); } } /* Remove missing device(s), if any */ TAILQ_FOREACH_SAFE(hpdev, &hbus->children, link, tmp_hpdev) { if (hpdev->reported_missing) hv_pci_delete_device(hpdev); } /* Rescan the bus to find any new device, if necessary. */ if (hbus->state == hv_pcibus_installed && need_rescan) pci_rescan(hbus->pci_bus); /* Wake up hv_pci_query_relations(), if it's waiting. */ query_comp = hbus->query_comp; if (query_comp) { hbus->query_comp = NULL; complete(query_comp); } free(dr, M_DEVBUF); } static struct hv_pci_dev * get_pcichild_wslot(struct hv_pcibus *hbus, uint32_t wslot) { struct hv_pci_dev *hpdev, *ret = NULL; mtx_lock(&hbus->device_list_lock); TAILQ_FOREACH(hpdev, &hbus->children, link) { if (hpdev->desc.wslot.val == wslot) { ret = hpdev; break; } } mtx_unlock(&hbus->device_list_lock); return (ret); } static void hv_pci_devices_present(struct hv_pcibus *hbus, struct pci_bus_relations *relations) { struct hv_dr_state *dr; struct hv_dr_work *dr_wrk; unsigned long dr_size; if (hbus->detaching && relations->device_count > 0) return; dr_size = offsetof(struct hv_dr_state, func) + (sizeof(struct pci_func_desc) * relations->device_count); dr = malloc(dr_size, M_DEVBUF, M_WAITOK | M_ZERO); dr->device_count = relations->device_count; if (dr->device_count != 0) memcpy(dr->func, relations->func, sizeof(struct pci_func_desc) * dr->device_count); mtx_lock(&hbus->device_list_lock); TAILQ_INSERT_TAIL(&hbus->dr_list, dr, link); mtx_unlock(&hbus->device_list_lock); dr_wrk = malloc(sizeof(*dr_wrk), M_DEVBUF, M_WAITOK | M_ZERO); dr_wrk->bus = hbus; TASK_INIT(&dr_wrk->task, 0, pci_devices_present_work, dr_wrk); taskqueue_enqueue(hbus->sc->taskq, &dr_wrk->task); } static void hv_eject_device_work(void *arg, int pending __unused) { struct hv_pci_dev *hpdev = arg; union win_slot_encoding wslot = hpdev->desc.wslot; struct hv_pcibus *hbus = hpdev->hbus; struct pci_eject_response *eject_pkt; struct { struct pci_packet pkt; uint8_t buffer[sizeof(struct pci_eject_response)]; } ctxt; hv_pci_delete_device(hpdev); memset(&ctxt, 0, sizeof(ctxt)); eject_pkt = (struct pci_eject_response *)&ctxt.pkt.message; eject_pkt->message_type.type = PCI_EJECTION_COMPLETE; eject_pkt->wslot.val = wslot.val; vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, eject_pkt, sizeof(*eject_pkt), 0); } static void hv_pci_eject_device(struct hv_pci_dev *hpdev) { struct hv_pcibus *hbus = hpdev->hbus; struct taskqueue *taskq; if (hbus->detaching) return; /* * Push this task into the same taskqueue on which * vmbus_pcib_attach() runs, so we're sure this task can't run * concurrently with vmbus_pcib_attach(). */ TASK_INIT(&hpdev->eject_task, 0, hv_eject_device_work, hpdev); taskq = vmbus_chan_mgmt_tq(hbus->sc->chan); taskqueue_enqueue(taskq, &hpdev->eject_task); } #define PCIB_PACKET_SIZE 0x100 static void vmbus_pcib_on_channel_callback(struct vmbus_channel *chan, void *arg) { struct vmbus_pcib_softc *sc = arg; struct hv_pcibus *hbus = sc->hbus; void *buffer; int bufferlen = PCIB_PACKET_SIZE; struct pci_packet *comp_packet; struct pci_response *response; struct pci_incoming_message *new_msg; struct pci_bus_relations *bus_rel; struct pci_dev_incoming *dev_msg; struct hv_pci_dev *hpdev; buffer = sc->rx_buf; do { struct vmbus_chanpkt_hdr *pkt = buffer; uint32_t bytes_rxed; int ret; bytes_rxed = bufferlen; ret = vmbus_chan_recv_pkt(chan, pkt, &bytes_rxed); if (ret == ENOBUFS) { /* Handle large packet */ if (bufferlen > PCIB_PACKET_SIZE) { free(buffer, M_DEVBUF); buffer = NULL; } /* alloc new buffer */ buffer = malloc(bytes_rxed, M_DEVBUF, M_WAITOK | M_ZERO); bufferlen = bytes_rxed; continue; } if (ret != 0) { /* ignore EIO or EAGAIN */ break; } if (bytes_rxed <= sizeof(struct pci_response)) continue; switch (pkt->cph_type) { case VMBUS_CHANPKT_TYPE_COMP: comp_packet = (struct pci_packet *)pkt->cph_xactid; response = (struct pci_response *)pkt; comp_packet->completion_func(comp_packet->compl_ctxt, response, bytes_rxed); break; case VMBUS_CHANPKT_TYPE_INBAND: new_msg = (struct pci_incoming_message *)buffer; switch (new_msg->message_type.type) { case PCI_BUS_RELATIONS: bus_rel = (struct pci_bus_relations *)buffer; if (bus_rel->device_count == 0) break; if (bytes_rxed < offsetof(struct pci_bus_relations, func) + (sizeof(struct pci_func_desc) * (bus_rel->device_count))) break; hv_pci_devices_present(hbus, bus_rel); break; case PCI_EJECT: dev_msg = (struct pci_dev_incoming *)buffer; hpdev = get_pcichild_wslot(hbus, dev_msg->wslot.val); if (hpdev) hv_pci_eject_device(hpdev); break; default: printf("vmbus_pcib: Unknown msg type 0x%x\n", new_msg->message_type.type); break; } break; default: printf("vmbus_pcib: Unknown VMBus msg type %hd\n", pkt->cph_type); break; } } while (1); if (bufferlen > PCIB_PACKET_SIZE) free(buffer, M_DEVBUF); } static int hv_pci_protocol_negotiation(struct hv_pcibus *hbus) { struct pci_version_request *version_req; struct hv_pci_compl comp_pkt; struct { struct pci_packet pkt; uint8_t buffer[sizeof(struct pci_version_request)]; } ctxt; int ret; init_completion(&comp_pkt.host_event); ctxt.pkt.completion_func = hv_pci_generic_compl; ctxt.pkt.compl_ctxt = &comp_pkt; version_req = (struct pci_version_request *)&ctxt.pkt.message; version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION; version_req->protocol_version = PCI_PROTOCOL_VERSION_CURRENT; version_req->is_last_attempt = 1; ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, version_req, sizeof(*version_req), (uint64_t)&ctxt.pkt); if (ret) goto out; wait_for_completion(&comp_pkt.host_event); if (comp_pkt.completion_status < 0) { device_printf(hbus->pcib, "vmbus_pcib version negotiation failed: %x\n", comp_pkt.completion_status); ret = EPROTO; } else { ret = 0; } out: free_completion(&comp_pkt.host_event); return (ret); } /* Ask the host to send along the list of child devices */ static int hv_pci_query_relations(struct hv_pcibus *hbus) { struct pci_message message; int ret; message.type = PCI_QUERY_BUS_RELATIONS; ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, &message, sizeof(message), 0); return (ret); } static int hv_pci_enter_d0(struct hv_pcibus *hbus) { struct pci_bus_d0_entry *d0_entry; struct hv_pci_compl comp_pkt; struct { struct pci_packet pkt; uint8_t buffer[sizeof(struct pci_bus_d0_entry)]; } ctxt; int ret; /* * Tell the host that the bus is ready to use, and moved into the * powered-on state. This includes telling the host which region * of memory-mapped I/O space has been chosen for configuration space * access. */ init_completion(&comp_pkt.host_event); ctxt.pkt.completion_func = hv_pci_generic_compl; ctxt.pkt.compl_ctxt = &comp_pkt; d0_entry = (struct pci_bus_d0_entry *)&ctxt.pkt.message; memset(d0_entry, 0, sizeof(*d0_entry)); d0_entry->message_type.type = PCI_BUS_D0ENTRY; d0_entry->mmio_base = rman_get_start(hbus->cfg_res); ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, d0_entry, sizeof(*d0_entry), (uint64_t)&ctxt.pkt); if (ret) goto out; wait_for_completion(&comp_pkt.host_event); if (comp_pkt.completion_status < 0) { device_printf(hbus->pcib, "vmbus_pcib failed to enable D0\n"); ret = EPROTO; } else { ret = 0; } out: free_completion(&comp_pkt.host_event); return (ret); } /* * It looks this is only needed by Windows VM, but let's send the message too * just to make the host happy. */ static int hv_send_resources_allocated(struct hv_pcibus *hbus) { struct pci_resources_assigned *res_assigned; struct hv_pci_compl comp_pkt; struct hv_pci_dev *hpdev; struct pci_packet *pkt; uint32_t wslot; int ret = 0; pkt = malloc(sizeof(*pkt) + sizeof(*res_assigned), M_DEVBUF, M_WAITOK | M_ZERO); for (wslot = 0; wslot < 256; wslot++) { hpdev = get_pcichild_wslot(hbus, wslot); if (!hpdev) continue; init_completion(&comp_pkt.host_event); memset(pkt, 0, sizeof(*pkt) + sizeof(*res_assigned)); pkt->completion_func = hv_pci_generic_compl; pkt->compl_ctxt = &comp_pkt; res_assigned = (struct pci_resources_assigned *)&pkt->message; res_assigned->message_type.type = PCI_RESOURCES_ASSIGNED; res_assigned->wslot.val = hpdev->desc.wslot.val; ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, &pkt->message, sizeof(*res_assigned), (uint64_t)pkt); if (ret) { free_completion(&comp_pkt.host_event); break; } wait_for_completion(&comp_pkt.host_event); free_completion(&comp_pkt.host_event); if (comp_pkt.completion_status < 0) { ret = EPROTO; device_printf(hbus->pcib, "failed to send PCI_RESOURCES_ASSIGNED\n"); break; } } free(pkt, M_DEVBUF); return (ret); } static int hv_send_resources_released(struct hv_pcibus *hbus) { struct pci_child_message pkt; struct hv_pci_dev *hpdev; uint32_t wslot; int ret; for (wslot = 0; wslot < 256; wslot++) { hpdev = get_pcichild_wslot(hbus, wslot); if (!hpdev) continue; pkt.message_type.type = PCI_RESOURCES_RELEASED; pkt.wslot.val = hpdev->desc.wslot.val; ret = vmbus_chan_send(hbus->sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, &pkt, sizeof(pkt), 0); if (ret) return (ret); } return (0); } #define hv_cfg_read(x, s) \ static inline uint##x##_t hv_cfg_read_##s(struct hv_pcibus *bus, \ bus_size_t offset) \ { \ return (bus_read_##s(bus->cfg_res, offset)); \ } #define hv_cfg_write(x, s) \ static inline void hv_cfg_write_##s(struct hv_pcibus *bus, \ bus_size_t offset, uint##x##_t val) \ { \ return (bus_write_##s(bus->cfg_res, offset, val)); \ } hv_cfg_read(8, 1) hv_cfg_read(16, 2) hv_cfg_read(32, 4) hv_cfg_write(8, 1) hv_cfg_write(16, 2) hv_cfg_write(32, 4) static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where, int size, uint32_t *val) { struct hv_pcibus *hbus = hpdev->hbus; bus_size_t addr = CFG_PAGE_OFFSET + where; /* * If the attempt is to read the IDs or the ROM BAR, simulate that. */ if (where + size <= PCIR_COMMAND) { memcpy(val, ((uint8_t *)&hpdev->desc.v_id) + where, size); } else if (where >= PCIR_REVID && where + size <= PCIR_CACHELNSZ) { memcpy(val, ((uint8_t *)&hpdev->desc.rev) + where - PCIR_REVID, size); } else if (where >= PCIR_SUBVEND_0 && where + size <= PCIR_BIOS) { memcpy(val, (uint8_t *)&hpdev->desc.subsystem_id + where - PCIR_SUBVEND_0, size); } else if (where >= PCIR_BIOS && where + size <= PCIR_CAP_PTR) { /* ROM BARs are unimplemented */ *val = 0; } else if ((where >= PCIR_INTLINE && where + size <= PCIR_INTPIN) ||(where == PCIR_INTPIN && size == 1)) { /* * Interrupt Line and Interrupt PIN are hard-wired to zero * because this front-end only supports message-signaled * interrupts. */ *val = 0; } else if (where + size <= CFG_PAGE_SIZE) { mtx_lock(&hbus->config_lock); /* Choose the function to be read. */ hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val); /* Make sure the function was chosen before we start reading.*/ mb(); /* Read from that function's config space. */ switch (size) { case 1: *((uint8_t *)val) = hv_cfg_read_1(hbus, addr); break; case 2: *((uint16_t *)val) = hv_cfg_read_2(hbus, addr); break; default: *((uint32_t *)val) = hv_cfg_read_4(hbus, addr); break; } /* * Make sure the write was done before we release the lock, * allowing consecutive reads/writes. */ mb(); mtx_unlock(&hbus->config_lock); } else { /* Invalid config read: it's unlikely to reach here. */ memset(val, 0, size); } } static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where, int size, uint32_t val) { struct hv_pcibus *hbus = hpdev->hbus; bus_size_t addr = CFG_PAGE_OFFSET + where; /* SSIDs and ROM BARs are read-only */ if (where >= PCIR_SUBVEND_0 && where + size <= PCIR_CAP_PTR) return; if (where >= PCIR_COMMAND && where + size <= CFG_PAGE_SIZE) { mtx_lock(&hbus->config_lock); /* Choose the function to be written. */ hv_cfg_write_4(hbus, 0, hpdev->desc.wslot.val); /* Make sure the function was chosen before we start writing.*/ wmb(); /* Write to that function's config space. */ switch (size) { case 1: hv_cfg_write_1(hbus, addr, (uint8_t)val); break; case 2: hv_cfg_write_2(hbus, addr, (uint16_t)val); break; default: hv_cfg_write_4(hbus, addr, (uint32_t)val); break; } /* * Make sure the write was done before we release the lock, * allowing consecutive reads/writes. */ mb(); mtx_unlock(&hbus->config_lock); } else { /* Invalid config write: it's unlikely to reach here. */ return; } } static void vmbus_pcib_set_detaching(void *arg, int pending __unused) { struct hv_pcibus *hbus = arg; atomic_set_int(&hbus->detaching, 1); } static void vmbus_pcib_pre_detach(struct hv_pcibus *hbus) { struct task task; TASK_INIT(&task, 0, vmbus_pcib_set_detaching, hbus); /* * Make sure the channel callback won't push any possible new * PCI_BUS_RELATIONS and PCI_EJECT tasks to sc->taskq. */ vmbus_chan_run_task(hbus->sc->chan, &task); taskqueue_drain_all(hbus->sc->taskq); } /* * Standard probe entry point. * */ static int vmbus_pcib_probe(device_t dev) { if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &g_pass_through_dev_type) == 0) { device_set_desc(dev, "Hyper-V PCI Express Pass Through"); return (BUS_PROBE_DEFAULT); } return (ENXIO); } /* * Standard attach entry point. * */ static int vmbus_pcib_attach(device_t dev) { const int pci_ring_size = (4 * PAGE_SIZE); const struct hyperv_guid *inst_guid; struct vmbus_channel *channel; struct vmbus_pcib_softc *sc; struct hv_pcibus *hbus; int rid = 0; int ret; hbus = malloc(sizeof(*hbus), M_DEVBUF, M_WAITOK | M_ZERO); hbus->pcib = dev; channel = vmbus_get_channel(dev); inst_guid = vmbus_chan_guid_inst(channel); hbus->pci_domain = inst_guid->hv_guid[9] | (inst_guid->hv_guid[8] << 8); mtx_init(&hbus->config_lock, "hbcfg", NULL, MTX_DEF); mtx_init(&hbus->device_list_lock, "hbdl", NULL, MTX_DEF); TAILQ_INIT(&hbus->children); TAILQ_INIT(&hbus->dr_list); hbus->cfg_res = bus_alloc_resource(dev, SYS_RES_MEMORY, &rid, 0, RM_MAX_END, PCI_CONFIG_MMIO_LENGTH, RF_ACTIVE | rman_make_alignment_flags(PAGE_SIZE)); if (!hbus->cfg_res) { device_printf(dev, "failed to get resource for cfg window\n"); ret = ENXIO; goto free_bus; } sc = device_get_softc(dev); sc->chan = channel; sc->rx_buf = malloc(PCIB_PACKET_SIZE, M_DEVBUF, M_WAITOK | M_ZERO); sc->hbus = hbus; /* * The taskq is used to handle PCI_BUS_RELATIONS and PCI_EJECT * messages. NB: we can't handle the messages in the channel callback * directly, because the message handlers need to send new messages * to the host and waits for the host's completion messages, which * must also be handled by the channel callback. */ sc->taskq = taskqueue_create("vmbus_pcib_tq", M_WAITOK, taskqueue_thread_enqueue, &sc->taskq); taskqueue_start_threads(&sc->taskq, 1, PI_NET, "vmbus_pcib_tq"); hbus->sc = sc; init_completion(&hbus->query_completion); hbus->query_comp = &hbus->query_completion; ret = vmbus_chan_open(sc->chan, pci_ring_size, pci_ring_size, NULL, 0, vmbus_pcib_on_channel_callback, sc); if (ret) goto free_res; ret = hv_pci_protocol_negotiation(hbus); if (ret) goto vmbus_close; ret = hv_pci_query_relations(hbus); if (ret) goto vmbus_close; wait_for_completion(hbus->query_comp); ret = hv_pci_enter_d0(hbus); if (ret) goto vmbus_close; ret = hv_send_resources_allocated(hbus); if (ret) goto vmbus_close; hbus->pci_bus = device_add_child(dev, "pci", -1); if (!hbus->pci_bus) { device_printf(dev, "failed to create pci bus\n"); ret = ENXIO; goto vmbus_close; } bus_generic_attach(dev); hbus->state = hv_pcibus_installed; return (0); vmbus_close: vmbus_pcib_pre_detach(hbus); vmbus_chan_close(sc->chan); free_res: taskqueue_free(sc->taskq); free_completion(&hbus->query_completion); free(sc->rx_buf, M_DEVBUF); bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res); free_bus: mtx_destroy(&hbus->device_list_lock); mtx_destroy(&hbus->config_lock); free(hbus, M_DEVBUF); return (ret); } /* * Standard detach entry point */ static int vmbus_pcib_detach(device_t dev) { struct vmbus_pcib_softc *sc = device_get_softc(dev); struct hv_pcibus *hbus = sc->hbus; struct pci_message teardown_packet; struct pci_bus_relations relations; int ret; vmbus_pcib_pre_detach(hbus); if (hbus->state == hv_pcibus_installed) bus_generic_detach(dev); /* Delete any children which might still exist. */ memset(&relations, 0, sizeof(relations)); hv_pci_devices_present(hbus, &relations); ret = hv_send_resources_released(hbus); if (ret) device_printf(dev, "failed to send PCI_RESOURCES_RELEASED\n"); teardown_packet.type = PCI_BUS_D0EXIT; ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND, 0, &teardown_packet, sizeof(struct pci_message), 0); if (ret) device_printf(dev, "failed to send PCI_BUS_D0EXIT\n"); taskqueue_drain_all(hbus->sc->taskq); vmbus_chan_close(sc->chan); taskqueue_free(sc->taskq); free_completion(&hbus->query_completion); free(sc->rx_buf, M_DEVBUF); bus_release_resource(dev, SYS_RES_MEMORY, 0, hbus->cfg_res); mtx_destroy(&hbus->device_list_lock); mtx_destroy(&hbus->config_lock); free(hbus, M_DEVBUF); return (0); } static int vmbus_pcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *val) { struct vmbus_pcib_softc *sc = device_get_softc(dev); switch (which) { case PCIB_IVAR_DOMAIN: *val = sc->hbus->pci_domain; return (0); case PCIB_IVAR_BUS: /* There is only bus 0. */ *val = 0; return (0); } return (ENOENT); } static int vmbus_pcib_write_ivar(device_t dev, device_t child, int which, uintptr_t val) { return (ENOENT); } static struct resource * vmbus_pcib_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { unsigned int bar_no; struct hv_pci_dev *hpdev; struct vmbus_pcib_softc *sc = device_get_softc(dev); struct resource *res; unsigned int devfn; if (type == PCI_RES_BUS) return (pci_domain_alloc_bus(sc->hbus->pci_domain, child, rid, start, end, count, flags)); /* Devices with port I/O BAR are not supported. */ if (type == SYS_RES_IOPORT) return (NULL); if (type == SYS_RES_MEMORY) { devfn = PCI_DEVFN(pci_get_slot(child), pci_get_function(child)); hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn)); if (!hpdev) return (NULL); bar_no = PCI_RID2BAR(*rid); if (bar_no >= MAX_NUM_BARS) return (NULL); /* Make sure a 32-bit BAR gets a 32-bit address */ if (!(hpdev->probed_bar[bar_no] & PCIM_BAR_MEM_64)) end = ulmin(end, 0xFFFFFFFF); } res = bus_generic_alloc_resource(dev, child, type, rid, start, end, count, flags); /* * If this is a request for a specific range, assume it is * correct and pass it up to the parent. */ if (res == NULL && start + count - 1 == end) res = bus_generic_alloc_resource(dev, child, type, rid, start, end, count, flags); return (res); } static int vmbus_pcib_release_resource(device_t dev, device_t child, int type, int rid, struct resource *r) { struct vmbus_pcib_softc *sc = device_get_softc(dev); if (type == PCI_RES_BUS) return (pci_domain_release_bus(sc->hbus->pci_domain, child, rid, r)); if (type == SYS_RES_IOPORT) return (EINVAL); return (bus_generic_release_resource(dev, child, type, rid, r)); } #if __FreeBSD_version >= 1100000 static int vmbus_pcib_get_cpus(device_t pcib, device_t dev, enum cpu_sets op, size_t setsize, cpuset_t *cpuset) { return (bus_get_cpus(pcib, op, setsize, cpuset)); } #endif static uint32_t vmbus_pcib_read_config(device_t dev, u_int bus, u_int slot, u_int func, u_int reg, int bytes) { struct vmbus_pcib_softc *sc = device_get_softc(dev); struct hv_pci_dev *hpdev; unsigned int devfn = PCI_DEVFN(slot, func); uint32_t data = 0; KASSERT(bus == 0, ("bus should be 0, but is %u", bus)); hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn)); if (!hpdev) return (~0); _hv_pcifront_read_config(hpdev, reg, bytes, &data); return (data); } static void vmbus_pcib_write_config(device_t dev, u_int bus, u_int slot, u_int func, u_int reg, uint32_t data, int bytes) { struct vmbus_pcib_softc *sc = device_get_softc(dev); struct hv_pci_dev *hpdev; unsigned int devfn = PCI_DEVFN(slot, func); KASSERT(bus == 0, ("bus should be 0, but is %u", bus)); hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn)); if (!hpdev) return; _hv_pcifront_write_config(hpdev, reg, bytes, data); } static int vmbus_pcib_route_intr(device_t pcib, device_t dev, int pin) { /* We only support MSI/MSI-X and don't support INTx interrupt. */ return (PCI_INVALID_IRQ); } static int vmbus_pcib_alloc_msi(device_t pcib, device_t dev, int count, int maxcount, int *irqs) { return (PCIB_ALLOC_MSI(device_get_parent(pcib), dev, count, maxcount, irqs)); } static int vmbus_pcib_release_msi(device_t pcib, device_t dev, int count, int *irqs) { return (PCIB_RELEASE_MSI(device_get_parent(pcib), dev, count, irqs)); } static int vmbus_pcib_alloc_msix(device_t pcib, device_t dev, int *irq) { return (PCIB_ALLOC_MSIX(device_get_parent(pcib), dev, irq)); } static int vmbus_pcib_release_msix(device_t pcib, device_t dev, int irq) { return (PCIB_RELEASE_MSIX(device_get_parent(pcib), dev, irq)); } #define MSI_INTEL_ADDR_DEST 0x000ff000 #define MSI_INTEL_DATA_INTVEC IOART_INTVEC /* Interrupt vector. */ #define MSI_INTEL_DATA_DELFIXED IOART_DELFIXED static int vmbus_pcib_map_msi(device_t pcib, device_t child, int irq, uint64_t *addr, uint32_t *data) { unsigned int devfn; struct hv_pci_dev *hpdev; uint64_t v_addr; uint32_t v_data; struct hv_irq_desc *hid, *tmp_hid; unsigned int cpu, vcpu_id; unsigned int vector; struct vmbus_pcib_softc *sc = device_get_softc(pcib); struct pci_create_interrupt *int_pkt; struct compose_comp_ctxt comp; struct { struct pci_packet pkt; uint8_t buffer[sizeof(struct pci_create_interrupt)]; } ctxt; int ret; devfn = PCI_DEVFN(pci_get_slot(child), pci_get_function(child)); hpdev = get_pcichild_wslot(sc->hbus, devfn_to_wslot(devfn)); if (!hpdev) return (ENOENT); ret = PCIB_MAP_MSI(device_get_parent(pcib), child, irq, &v_addr, &v_data); if (ret) return (ret); TAILQ_FOREACH_SAFE(hid, &hpdev->irq_desc_list, link, tmp_hid) { if (hid->irq == irq) { TAILQ_REMOVE(&hpdev->irq_desc_list, hid, link); hv_int_desc_free(hpdev, hid); break; } } cpu = (v_addr & MSI_INTEL_ADDR_DEST) >> 12; vcpu_id = VMBUS_GET_VCPU_ID(device_get_parent(pcib), pcib, cpu); vector = v_data & MSI_INTEL_DATA_INTVEC; init_completion(&comp.comp_pkt.host_event); memset(&ctxt, 0, sizeof(ctxt)); ctxt.pkt.completion_func = hv_pci_compose_compl; ctxt.pkt.compl_ctxt = ∁ int_pkt = (struct pci_create_interrupt *)&ctxt.pkt.message; int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; int_pkt->wslot.val = hpdev->desc.wslot.val; int_pkt->int_desc.vector = vector; int_pkt->int_desc.vector_count = 1; int_pkt->int_desc.delivery_mode = MSI_INTEL_DATA_DELFIXED; int_pkt->int_desc.cpu_mask = 1ULL << vcpu_id; ret = vmbus_chan_send(sc->chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, int_pkt, sizeof(*int_pkt), (uint64_t)&ctxt.pkt); if (ret) { free_completion(&comp.comp_pkt.host_event); return (ret); } wait_for_completion(&comp.comp_pkt.host_event); free_completion(&comp.comp_pkt.host_event); if (comp.comp_pkt.completion_status < 0) return (EPROTO); *addr = comp.int_desc.address; *data = comp.int_desc.data; hid = malloc(sizeof(struct hv_irq_desc), M_DEVBUF, M_WAITOK | M_ZERO); hid->irq = irq; hid->desc = comp.int_desc; TAILQ_INSERT_TAIL(&hpdev->irq_desc_list, hid, link); return (0); } static device_method_t vmbus_pcib_methods[] = { /* Device interface */ DEVMETHOD(device_probe, vmbus_pcib_probe), DEVMETHOD(device_attach, vmbus_pcib_attach), DEVMETHOD(device_detach, vmbus_pcib_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), /* Bus interface */ DEVMETHOD(bus_read_ivar, vmbus_pcib_read_ivar), DEVMETHOD(bus_write_ivar, vmbus_pcib_write_ivar), DEVMETHOD(bus_alloc_resource, vmbus_pcib_alloc_resource), DEVMETHOD(bus_release_resource, vmbus_pcib_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), #if __FreeBSD_version >= 1100000 DEVMETHOD(bus_get_cpus, vmbus_pcib_get_cpus), #endif /* pcib interface */ DEVMETHOD(pcib_maxslots, pcib_maxslots), DEVMETHOD(pcib_read_config, vmbus_pcib_read_config), DEVMETHOD(pcib_write_config, vmbus_pcib_write_config), DEVMETHOD(pcib_route_interrupt, vmbus_pcib_route_intr), DEVMETHOD(pcib_alloc_msi, vmbus_pcib_alloc_msi), DEVMETHOD(pcib_release_msi, vmbus_pcib_release_msi), DEVMETHOD(pcib_alloc_msix, vmbus_pcib_alloc_msix), DEVMETHOD(pcib_release_msix, vmbus_pcib_release_msix), DEVMETHOD(pcib_map_msi, vmbus_pcib_map_msi), DEVMETHOD(pcib_request_feature, pcib_request_feature_allow), DEVMETHOD_END }; static devclass_t pcib_devclass; DEFINE_CLASS_0(pcib, vmbus_pcib_driver, vmbus_pcib_methods, sizeof(struct vmbus_pcib_softc)); DRIVER_MODULE(vmbus_pcib, vmbus, vmbus_pcib_driver, pcib_devclass, 0, 0); MODULE_DEPEND(vmbus_pcib, vmbus, 1, 1, 1); MODULE_DEPEND(vmbus_pcib, pci, 1, 1, 1); #endif /* NEW_PCIB */ Index: head/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c =================================================================== --- head/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c (revision 322487) +++ head/sys/dev/hyperv/storvsc/hv_storvsc_drv_freebsd.c (revision 322488) @@ -1,2401 +1,2401 @@ /*- - * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /** * StorVSC driver for Hyper-V. This driver presents a SCSI HBA interface * to the Comman Access Method (CAM) layer. CAM control blocks (CCBs) are * converted into VSCSI protocol messages which are delivered to the parent * partition StorVSP driver over the Hyper-V VMBUS. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "hv_vstorage.h" #include "vmbus_if.h" #define STORVSC_MAX_LUNS_PER_TARGET (64) #define STORVSC_MAX_IO_REQUESTS (STORVSC_MAX_LUNS_PER_TARGET * 2) #define BLKVSC_MAX_IDE_DISKS_PER_TARGET (1) #define BLKVSC_MAX_IO_REQUESTS STORVSC_MAX_IO_REQUESTS #define STORVSC_MAX_TARGETS (2) #define VSTOR_PKT_SIZE (sizeof(struct vstor_packet) - vmscsi_size_delta) /* * 33 segments are needed to allow 128KB maxio, in case the data * in the first page is _not_ PAGE_SIZE aligned, e.g. * * |<----------- 128KB ----------->| * | | * 0 2K 4K 8K 16K 124K 128K 130K * | | | | | | | | * +--+--+-----+-----+.......+-----+--+--+ * | | | | | | | | | DATA * | | | | | | | | | * +--+--+-----+-----+.......------+--+--+ * | | | | * | 1| 31 | 1| ...... # of segments */ #define STORVSC_DATA_SEGCNT_MAX 33 #define STORVSC_DATA_SEGSZ_MAX PAGE_SIZE #define STORVSC_DATA_SIZE_MAX \ ((STORVSC_DATA_SEGCNT_MAX - 1) * STORVSC_DATA_SEGSZ_MAX) struct storvsc_softc; struct hv_sgl_node { LIST_ENTRY(hv_sgl_node) link; struct sglist *sgl_data; }; struct hv_sgl_page_pool{ LIST_HEAD(, hv_sgl_node) in_use_sgl_list; LIST_HEAD(, hv_sgl_node) free_sgl_list; boolean_t is_init; } g_hv_sgl_page_pool; enum storvsc_request_type { WRITE_TYPE, READ_TYPE, UNKNOWN_TYPE }; SYSCTL_NODE(_hw, OID_AUTO, storvsc, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Hyper-V storage interface"); static u_int hv_storvsc_use_win8ext_flags = 1; SYSCTL_UINT(_hw_storvsc, OID_AUTO, use_win8ext_flags, CTLFLAG_RW, &hv_storvsc_use_win8ext_flags, 0, "Use win8 extension flags or not"); static u_int hv_storvsc_use_pim_unmapped = 1; SYSCTL_UINT(_hw_storvsc, OID_AUTO, use_pim_unmapped, CTLFLAG_RDTUN, &hv_storvsc_use_pim_unmapped, 0, "Optimize storvsc by using unmapped I/O"); static u_int hv_storvsc_ringbuffer_size = (64 * PAGE_SIZE); SYSCTL_UINT(_hw_storvsc, OID_AUTO, ringbuffer_size, CTLFLAG_RDTUN, &hv_storvsc_ringbuffer_size, 0, "Hyper-V storage ringbuffer size"); static u_int hv_storvsc_max_io = 512; SYSCTL_UINT(_hw_storvsc, OID_AUTO, max_io, CTLFLAG_RDTUN, &hv_storvsc_max_io, 0, "Hyper-V storage max io limit"); static int hv_storvsc_chan_cnt = 0; SYSCTL_INT(_hw_storvsc, OID_AUTO, chan_cnt, CTLFLAG_RDTUN, &hv_storvsc_chan_cnt, 0, "# of channels to use"); #define STORVSC_MAX_IO \ vmbus_chan_prplist_nelem(hv_storvsc_ringbuffer_size, \ STORVSC_DATA_SEGCNT_MAX, VSTOR_PKT_SIZE) struct hv_storvsc_sysctl { u_long data_bio_cnt; u_long data_vaddr_cnt; u_long data_sg_cnt; u_long chan_send_cnt[MAXCPU]; }; struct storvsc_gpa_range { struct vmbus_gpa_range gpa_range; uint64_t gpa_page[STORVSC_DATA_SEGCNT_MAX]; } __packed; struct hv_storvsc_request { LIST_ENTRY(hv_storvsc_request) link; struct vstor_packet vstor_packet; int prp_cnt; struct storvsc_gpa_range prp_list; void *sense_data; uint8_t sense_info_len; uint8_t retries; union ccb *ccb; struct storvsc_softc *softc; struct callout callout; struct sema synch_sema; /*Synchronize the request/response if needed */ struct sglist *bounce_sgl; unsigned int bounce_sgl_count; uint64_t not_aligned_seg_bits; bus_dmamap_t data_dmap; }; struct storvsc_softc { struct vmbus_channel *hs_chan; LIST_HEAD(, hv_storvsc_request) hs_free_list; struct mtx hs_lock; struct storvsc_driver_props *hs_drv_props; int hs_unit; uint32_t hs_frozen; struct cam_sim *hs_sim; struct cam_path *hs_path; uint32_t hs_num_out_reqs; boolean_t hs_destroy; boolean_t hs_drain_notify; struct sema hs_drain_sema; struct hv_storvsc_request hs_init_req; struct hv_storvsc_request hs_reset_req; device_t hs_dev; bus_dma_tag_t storvsc_req_dtag; struct hv_storvsc_sysctl sysctl_data; uint32_t hs_nchan; struct vmbus_channel *hs_sel_chan[MAXCPU]; }; static eventhandler_tag storvsc_handler_tag; /* * The size of the vmscsi_request has changed in win8. The * additional size is for the newly added elements in the * structure. These elements are valid only when we are talking * to a win8 host. * Track the correct size we need to apply. */ static int vmscsi_size_delta = sizeof(struct vmscsi_win8_extension); /** * HyperV storvsc timeout testing cases: * a. IO returned after first timeout; * b. IO returned after second timeout and queue freeze; * c. IO returned while timer handler is running * The first can be tested by "sg_senddiag -vv /dev/daX", * and the second and third can be done by * "sg_wr_mode -v -p 08 -c 0,1a -m 0,ff /dev/daX". */ #define HVS_TIMEOUT_TEST 0 /* * Bus/adapter reset functionality on the Hyper-V host is * buggy and it will be disabled until * it can be further tested. */ #define HVS_HOST_RESET 0 struct storvsc_driver_props { char *drv_name; char *drv_desc; uint8_t drv_max_luns_per_target; uint32_t drv_max_ios_per_target; uint32_t drv_ringbuffer_size; }; enum hv_storage_type { DRIVER_BLKVSC, DRIVER_STORVSC, DRIVER_UNKNOWN }; #define HS_MAX_ADAPTERS 10 #define HV_STORAGE_SUPPORTS_MULTI_CHANNEL 0x1 /* {ba6163d9-04a1-4d29-b605-72e2ffb1dc7f} */ static const struct hyperv_guid gStorVscDeviceType={ .hv_guid = {0xd9, 0x63, 0x61, 0xba, 0xa1, 0x04, 0x29, 0x4d, 0xb6, 0x05, 0x72, 0xe2, 0xff, 0xb1, 0xdc, 0x7f} }; /* {32412632-86cb-44a2-9b5c-50d1417354f5} */ static const struct hyperv_guid gBlkVscDeviceType={ .hv_guid = {0x32, 0x26, 0x41, 0x32, 0xcb, 0x86, 0xa2, 0x44, 0x9b, 0x5c, 0x50, 0xd1, 0x41, 0x73, 0x54, 0xf5} }; static struct storvsc_driver_props g_drv_props_table[] = { {"blkvsc", "Hyper-V IDE", BLKVSC_MAX_IDE_DISKS_PER_TARGET, BLKVSC_MAX_IO_REQUESTS, 20*PAGE_SIZE}, {"storvsc", "Hyper-V SCSI", STORVSC_MAX_LUNS_PER_TARGET, STORVSC_MAX_IO_REQUESTS, 20*PAGE_SIZE} }; /* * Sense buffer size changed in win8; have a run-time * variable to track the size we should use. */ static int sense_buffer_size = PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE; /* * The storage protocol version is determined during the * initial exchange with the host. It will indicate which * storage functionality is available in the host. */ static int vmstor_proto_version; struct vmstor_proto { int proto_version; int sense_buffer_size; int vmscsi_size_delta; }; static const struct vmstor_proto vmstor_proto_list[] = { { VMSTOR_PROTOCOL_VERSION_WIN10, POST_WIN7_STORVSC_SENSE_BUFFER_SIZE, 0 }, { VMSTOR_PROTOCOL_VERSION_WIN8_1, POST_WIN7_STORVSC_SENSE_BUFFER_SIZE, 0 }, { VMSTOR_PROTOCOL_VERSION_WIN8, POST_WIN7_STORVSC_SENSE_BUFFER_SIZE, 0 }, { VMSTOR_PROTOCOL_VERSION_WIN7, PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE, sizeof(struct vmscsi_win8_extension), }, { VMSTOR_PROTOCOL_VERSION_WIN6, PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE, sizeof(struct vmscsi_win8_extension), } }; /* static functions */ static int storvsc_probe(device_t dev); static int storvsc_attach(device_t dev); static int storvsc_detach(device_t dev); static void storvsc_poll(struct cam_sim * sim); static void storvsc_action(struct cam_sim * sim, union ccb * ccb); static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp); static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp); static enum hv_storage_type storvsc_get_storage_type(device_t dev); static void hv_storvsc_rescan_target(struct storvsc_softc *sc); static void hv_storvsc_on_channel_callback(struct vmbus_channel *chan, void *xsc); static void hv_storvsc_on_iocompletion( struct storvsc_softc *sc, struct vstor_packet *vstor_packet, struct hv_storvsc_request *request); static int hv_storvsc_connect_vsp(struct storvsc_softc *); static void storvsc_io_done(struct hv_storvsc_request *reqp); static void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, bus_dma_segment_t *orig_sgl, unsigned int orig_sgl_count, uint64_t seg_bits); void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, unsigned int dest_sgl_count, struct sglist* src_sgl, uint64_t seg_bits); static device_method_t storvsc_methods[] = { /* Device interface */ DEVMETHOD(device_probe, storvsc_probe), DEVMETHOD(device_attach, storvsc_attach), DEVMETHOD(device_detach, storvsc_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD_END }; static driver_t storvsc_driver = { "storvsc", storvsc_methods, sizeof(struct storvsc_softc), }; static devclass_t storvsc_devclass; DRIVER_MODULE(storvsc, vmbus, storvsc_driver, storvsc_devclass, 0, 0); MODULE_VERSION(storvsc, 1); MODULE_DEPEND(storvsc, vmbus, 1, 1, 1); static void storvsc_subchan_attach(struct storvsc_softc *sc, struct vmbus_channel *new_channel) { struct vmstor_chan_props props; int ret = 0; memset(&props, 0, sizeof(props)); vmbus_chan_cpu_rr(new_channel); ret = vmbus_chan_open(new_channel, sc->hs_drv_props->drv_ringbuffer_size, sc->hs_drv_props->drv_ringbuffer_size, (void *)&props, sizeof(struct vmstor_chan_props), hv_storvsc_on_channel_callback, sc); } /** * @brief Send multi-channel creation request to host * * @param device a Hyper-V device pointer * @param max_chans the max channels supported by vmbus */ static void storvsc_send_multichannel_request(struct storvsc_softc *sc, int max_subch) { struct vmbus_channel **subchan; struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; int request_subch; int ret, i; /* get sub-channel count that need to create */ request_subch = MIN(max_subch, mp_ncpus - 1); request = &sc->hs_init_req; /* request the host to create multi-channel */ memset(request, 0, sizeof(struct hv_storvsc_request)); sema_init(&request->synch_sema, 0, ("stor_synch_sema")); vstor_packet = &request->vstor_packet; vstor_packet->operation = VSTOR_OPERATION_CREATE_MULTI_CHANNELS; vstor_packet->flags = REQUEST_COMPLETION_FLAG; vstor_packet->u.multi_channels_cnt = request_subch; ret = vmbus_chan_send(sc->hs_chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); sema_wait(&request->synch_sema); if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || vstor_packet->status != 0) { printf("Storvsc_error: create multi-channel invalid operation " "(%d) or statue (%u)\n", vstor_packet->operation, vstor_packet->status); return; } /* Update channel count */ sc->hs_nchan = request_subch + 1; /* Wait for sub-channels setup to complete. */ subchan = vmbus_subchan_get(sc->hs_chan, request_subch); /* Attach the sub-channels. */ for (i = 0; i < request_subch; ++i) storvsc_subchan_attach(sc, subchan[i]); /* Release the sub-channels. */ vmbus_subchan_rel(subchan, request_subch); if (bootverbose) printf("Storvsc create multi-channel success!\n"); } /** * @brief initialize channel connection to parent partition * * @param dev a Hyper-V device pointer * @returns 0 on success, non-zero error on failure */ static int hv_storvsc_channel_init(struct storvsc_softc *sc) { int ret = 0, i; struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; uint16_t max_subch; boolean_t support_multichannel; uint32_t version; max_subch = 0; support_multichannel = FALSE; request = &sc->hs_init_req; memset(request, 0, sizeof(struct hv_storvsc_request)); vstor_packet = &request->vstor_packet; request->softc = sc; /** * Initiate the vsc/vsp initialization protocol on the open channel */ sema_init(&request->synch_sema, 0, ("stor_synch_sema")); vstor_packet->operation = VSTOR_OPERATION_BEGININITIALIZATION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; ret = vmbus_chan_send(sc->hs_chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); if (ret != 0) goto cleanup; sema_wait(&request->synch_sema); if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || vstor_packet->status != 0) { goto cleanup; } for (i = 0; i < nitems(vmstor_proto_list); i++) { /* reuse the packet for version range supported */ memset(vstor_packet, 0, sizeof(struct vstor_packet)); vstor_packet->operation = VSTOR_OPERATION_QUERYPROTOCOLVERSION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; vstor_packet->u.version.major_minor = vmstor_proto_list[i].proto_version; /* revision is only significant for Windows guests */ vstor_packet->u.version.revision = 0; ret = vmbus_chan_send(sc->hs_chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); if (ret != 0) goto cleanup; sema_wait(&request->synch_sema); if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO) { ret = EINVAL; goto cleanup; } if (vstor_packet->status == 0) { vmstor_proto_version = vmstor_proto_list[i].proto_version; sense_buffer_size = vmstor_proto_list[i].sense_buffer_size; vmscsi_size_delta = vmstor_proto_list[i].vmscsi_size_delta; break; } } if (vstor_packet->status != 0) { ret = EINVAL; goto cleanup; } /** * Query channel properties */ memset(vstor_packet, 0, sizeof(struct vstor_packet)); vstor_packet->operation = VSTOR_OPERATION_QUERYPROPERTIES; vstor_packet->flags = REQUEST_COMPLETION_FLAG; ret = vmbus_chan_send(sc->hs_chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); if ( ret != 0) goto cleanup; sema_wait(&request->synch_sema); /* TODO: Check returned version */ if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || vstor_packet->status != 0) { goto cleanup; } max_subch = vstor_packet->u.chan_props.max_channel_cnt; if (hv_storvsc_chan_cnt > 0 && hv_storvsc_chan_cnt < (max_subch + 1)) max_subch = hv_storvsc_chan_cnt - 1; /* multi-channels feature is supported by WIN8 and above version */ version = VMBUS_GET_VERSION(device_get_parent(sc->hs_dev), sc->hs_dev); if (version != VMBUS_VERSION_WIN7 && version != VMBUS_VERSION_WS2008 && (vstor_packet->u.chan_props.flags & HV_STORAGE_SUPPORTS_MULTI_CHANNEL)) { support_multichannel = TRUE; } if (bootverbose) { device_printf(sc->hs_dev, "max chans %d%s\n", max_subch + 1, support_multichannel ? ", multi-chan capable" : ""); } memset(vstor_packet, 0, sizeof(struct vstor_packet)); vstor_packet->operation = VSTOR_OPERATION_ENDINITIALIZATION; vstor_packet->flags = REQUEST_COMPLETION_FLAG; ret = vmbus_chan_send(sc->hs_chan, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); if (ret != 0) { goto cleanup; } sema_wait(&request->synch_sema); if (vstor_packet->operation != VSTOR_OPERATION_COMPLETEIO || vstor_packet->status != 0) goto cleanup; /* * If multi-channel is supported, send multichannel create * request to host. */ if (support_multichannel && max_subch > 0) storvsc_send_multichannel_request(sc, max_subch); cleanup: sema_destroy(&request->synch_sema); return (ret); } /** * @brief Open channel connection to paraent partition StorVSP driver * * Open and initialize channel connection to parent partition StorVSP driver. * * @param pointer to a Hyper-V device * @returns 0 on success, non-zero error on failure */ static int hv_storvsc_connect_vsp(struct storvsc_softc *sc) { int ret = 0; struct vmstor_chan_props props; memset(&props, 0, sizeof(struct vmstor_chan_props)); /* * Open the channel */ vmbus_chan_cpu_rr(sc->hs_chan); ret = vmbus_chan_open( sc->hs_chan, sc->hs_drv_props->drv_ringbuffer_size, sc->hs_drv_props->drv_ringbuffer_size, (void *)&props, sizeof(struct vmstor_chan_props), hv_storvsc_on_channel_callback, sc); if (ret != 0) { return ret; } ret = hv_storvsc_channel_init(sc); return (ret); } #if HVS_HOST_RESET static int hv_storvsc_host_reset(struct storvsc_softc *sc) { int ret = 0; struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; request = &sc->hs_reset_req; request->softc = sc; vstor_packet = &request->vstor_packet; sema_init(&request->synch_sema, 0, "stor synch sema"); vstor_packet->operation = VSTOR_OPERATION_RESETBUS; vstor_packet->flags = REQUEST_COMPLETION_FLAG; ret = vmbus_chan_send(dev->channel, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)&sc->hs_reset_req); if (ret != 0) { goto cleanup; } sema_wait(&request->synch_sema); /* * At this point, all outstanding requests in the adapter * should have been flushed out and return to us */ cleanup: sema_destroy(&request->synch_sema); return (ret); } #endif /* HVS_HOST_RESET */ /** * @brief Function to initiate an I/O request * * @param device Hyper-V device pointer * @param request pointer to a request structure * @returns 0 on success, non-zero error on failure */ static int hv_storvsc_io_request(struct storvsc_softc *sc, struct hv_storvsc_request *request) { struct vstor_packet *vstor_packet = &request->vstor_packet; struct vmbus_channel* outgoing_channel = NULL; int ret = 0, ch_sel; vstor_packet->flags |= REQUEST_COMPLETION_FLAG; vstor_packet->u.vm_srb.length = sizeof(struct vmscsi_req) - vmscsi_size_delta; vstor_packet->u.vm_srb.sense_info_len = sense_buffer_size; vstor_packet->u.vm_srb.transfer_len = request->prp_list.gpa_range.gpa_len; vstor_packet->operation = VSTOR_OPERATION_EXECUTESRB; ch_sel = (vstor_packet->u.vm_srb.lun + curcpu) % sc->hs_nchan; outgoing_channel = sc->hs_sel_chan[ch_sel]; mtx_unlock(&request->softc->hs_lock); if (request->prp_list.gpa_range.gpa_len) { ret = vmbus_chan_send_prplist(outgoing_channel, &request->prp_list.gpa_range, request->prp_cnt, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); } else { ret = vmbus_chan_send(outgoing_channel, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC, vstor_packet, VSTOR_PKT_SIZE, (uint64_t)(uintptr_t)request); } /* statistic for successful request sending on each channel */ if (!ret) { sc->sysctl_data.chan_send_cnt[ch_sel]++; } mtx_lock(&request->softc->hs_lock); if (ret != 0) { printf("Unable to send packet %p ret %d", vstor_packet, ret); } else { atomic_add_int(&sc->hs_num_out_reqs, 1); } return (ret); } /** * Process IO_COMPLETION_OPERATION and ready * the result to be completed for upper layer * processing by the CAM layer. */ static void hv_storvsc_on_iocompletion(struct storvsc_softc *sc, struct vstor_packet *vstor_packet, struct hv_storvsc_request *request) { struct vmscsi_req *vm_srb; vm_srb = &vstor_packet->u.vm_srb; /* * Copy some fields of the host's response into the request structure, * because the fields will be used later in storvsc_io_done(). */ request->vstor_packet.u.vm_srb.scsi_status = vm_srb->scsi_status; request->vstor_packet.u.vm_srb.srb_status = vm_srb->srb_status; request->vstor_packet.u.vm_srb.transfer_len = vm_srb->transfer_len; if (((vm_srb->scsi_status & 0xFF) == SCSI_STATUS_CHECK_COND) && (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID)) { /* Autosense data available */ KASSERT(vm_srb->sense_info_len <= request->sense_info_len, ("vm_srb->sense_info_len <= " "request->sense_info_len")); memcpy(request->sense_data, vm_srb->u.sense_data, vm_srb->sense_info_len); request->sense_info_len = vm_srb->sense_info_len; } /* Complete request by passing to the CAM layer */ storvsc_io_done(request); atomic_subtract_int(&sc->hs_num_out_reqs, 1); if (sc->hs_drain_notify && (sc->hs_num_out_reqs == 0)) { sema_post(&sc->hs_drain_sema); } } static void hv_storvsc_rescan_target(struct storvsc_softc *sc) { path_id_t pathid; target_id_t targetid; union ccb *ccb; pathid = cam_sim_path(sc->hs_sim); targetid = CAM_TARGET_WILDCARD; /* * Allocate a CCB and schedule a rescan. */ ccb = xpt_alloc_ccb_nowait(); if (ccb == NULL) { printf("unable to alloc CCB for rescan\n"); return; } if (xpt_create_path(&ccb->ccb_h.path, NULL, pathid, targetid, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { printf("unable to create path for rescan, pathid: %u," "targetid: %u\n", pathid, targetid); xpt_free_ccb(ccb); return; } if (targetid == CAM_TARGET_WILDCARD) ccb->ccb_h.func_code = XPT_SCAN_BUS; else ccb->ccb_h.func_code = XPT_SCAN_TGT; xpt_rescan(ccb); } static void hv_storvsc_on_channel_callback(struct vmbus_channel *channel, void *xsc) { int ret = 0; struct storvsc_softc *sc = xsc; uint32_t bytes_recvd; uint64_t request_id; uint8_t packet[roundup2(sizeof(struct vstor_packet), 8)]; struct hv_storvsc_request *request; struct vstor_packet *vstor_packet; bytes_recvd = roundup2(VSTOR_PKT_SIZE, 8); ret = vmbus_chan_recv(channel, packet, &bytes_recvd, &request_id); KASSERT(ret != ENOBUFS, ("storvsc recvbuf is not large enough")); /* XXX check bytes_recvd to make sure that it contains enough data */ while ((ret == 0) && (bytes_recvd > 0)) { request = (struct hv_storvsc_request *)(uintptr_t)request_id; if ((request == &sc->hs_init_req) || (request == &sc->hs_reset_req)) { memcpy(&request->vstor_packet, packet, sizeof(struct vstor_packet)); sema_post(&request->synch_sema); } else { vstor_packet = (struct vstor_packet *)packet; switch(vstor_packet->operation) { case VSTOR_OPERATION_COMPLETEIO: if (request == NULL) panic("VMBUS: storvsc received a " "packet with NULL request id in " "COMPLETEIO operation."); hv_storvsc_on_iocompletion(sc, vstor_packet, request); break; case VSTOR_OPERATION_REMOVEDEVICE: printf("VMBUS: storvsc operation %d not " "implemented.\n", vstor_packet->operation); /* TODO: implement */ break; case VSTOR_OPERATION_ENUMERATE_BUS: hv_storvsc_rescan_target(sc); break; default: break; } } bytes_recvd = roundup2(VSTOR_PKT_SIZE, 8), ret = vmbus_chan_recv(channel, packet, &bytes_recvd, &request_id); KASSERT(ret != ENOBUFS, ("storvsc recvbuf is not large enough")); /* * XXX check bytes_recvd to make sure that it contains * enough data */ } } /** * @brief StorVSC probe function * * Device probe function. Returns 0 if the input device is a StorVSC * device. Otherwise, a ENXIO is returned. If the input device is * for BlkVSC (paravirtual IDE) device and this support is disabled in * favor of the emulated ATA/IDE device, return ENXIO. * * @param a device * @returns 0 on success, ENXIO if not a matcing StorVSC device */ static int storvsc_probe(device_t dev) { int ret = ENXIO; switch (storvsc_get_storage_type(dev)) { case DRIVER_BLKVSC: if(bootverbose) device_printf(dev, "Enlightened ATA/IDE detected\n"); device_set_desc(dev, g_drv_props_table[DRIVER_BLKVSC].drv_desc); ret = BUS_PROBE_DEFAULT; break; case DRIVER_STORVSC: if(bootverbose) device_printf(dev, "Enlightened SCSI device detected\n"); device_set_desc(dev, g_drv_props_table[DRIVER_STORVSC].drv_desc); ret = BUS_PROBE_DEFAULT; break; default: ret = ENXIO; } return (ret); } static void storvsc_create_chan_sel(struct storvsc_softc *sc) { struct vmbus_channel **subch; int i, nsubch; sc->hs_sel_chan[0] = sc->hs_chan; nsubch = sc->hs_nchan - 1; if (nsubch == 0) return; subch = vmbus_subchan_get(sc->hs_chan, nsubch); for (i = 0; i < nsubch; i++) sc->hs_sel_chan[i + 1] = subch[i]; vmbus_subchan_rel(subch, nsubch); } static int storvsc_init_requests(device_t dev) { struct storvsc_softc *sc = device_get_softc(dev); struct hv_storvsc_request *reqp; int error, i; LIST_INIT(&sc->hs_free_list); error = bus_dma_tag_create( bus_get_dma_tag(dev), /* parent */ 1, /* alignment */ PAGE_SIZE, /* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ STORVSC_DATA_SIZE_MAX, /* maxsize */ STORVSC_DATA_SEGCNT_MAX, /* nsegments */ STORVSC_DATA_SEGSZ_MAX, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &sc->storvsc_req_dtag); if (error) { device_printf(dev, "failed to create storvsc dma tag\n"); return (error); } for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; ++i) { reqp = malloc(sizeof(struct hv_storvsc_request), M_DEVBUF, M_WAITOK|M_ZERO); reqp->softc = sc; error = bus_dmamap_create(sc->storvsc_req_dtag, 0, &reqp->data_dmap); if (error) { device_printf(dev, "failed to allocate storvsc " "data dmamap\n"); goto cleanup; } LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link); } return (0); cleanup: while ((reqp = LIST_FIRST(&sc->hs_free_list)) != NULL) { LIST_REMOVE(reqp, link); bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap); free(reqp, M_DEVBUF); } return (error); } static void storvsc_sysctl(device_t dev) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; struct sysctl_oid *ch_tree, *chid_tree; struct storvsc_softc *sc; char name[16]; int i; sc = device_get_softc(dev); ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_bio_cnt", CTLFLAG_RW, &sc->sysctl_data.data_bio_cnt, "# of bio data block"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_vaddr_cnt", CTLFLAG_RW, &sc->sysctl_data.data_vaddr_cnt, "# of vaddr data block"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "data_sg_cnt", CTLFLAG_RW, &sc->sysctl_data.data_sg_cnt, "# of sg data block"); /* dev.storvsc.UNIT.channel */ ch_tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "channel", CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (ch_tree == NULL) return; for (i = 0; i < sc->hs_nchan; i++) { uint32_t ch_id; ch_id = vmbus_chan_id(sc->hs_sel_chan[i]); snprintf(name, sizeof(name), "%d", ch_id); /* dev.storvsc.UNIT.channel.CHID */ chid_tree = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(ch_tree), OID_AUTO, name, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, ""); if (chid_tree == NULL) return; /* dev.storvsc.UNIT.channel.CHID.send_req */ SYSCTL_ADD_ULONG(ctx, SYSCTL_CHILDREN(chid_tree), OID_AUTO, "send_req", CTLFLAG_RD, &sc->sysctl_data.chan_send_cnt[i], "# of request sending from this channel"); } } /** * @brief StorVSC attach function * * Function responsible for allocating per-device structures, * setting up CAM interfaces and scanning for available LUNs to * be used for SCSI device peripherals. * * @param a device * @returns 0 on success or an error on failure */ static int storvsc_attach(device_t dev) { enum hv_storage_type stor_type; struct storvsc_softc *sc; struct cam_devq *devq; int ret, i, j; struct hv_storvsc_request *reqp; struct root_hold_token *root_mount_token = NULL; struct hv_sgl_node *sgl_node = NULL; void *tmp_buff = NULL; /* * We need to serialize storvsc attach calls. */ root_mount_token = root_mount_hold("storvsc"); sc = device_get_softc(dev); sc->hs_nchan = 1; sc->hs_chan = vmbus_get_channel(dev); stor_type = storvsc_get_storage_type(dev); if (stor_type == DRIVER_UNKNOWN) { ret = ENODEV; goto cleanup; } /* fill in driver specific properties */ sc->hs_drv_props = &g_drv_props_table[stor_type]; sc->hs_drv_props->drv_ringbuffer_size = hv_storvsc_ringbuffer_size; sc->hs_drv_props->drv_max_ios_per_target = MIN(STORVSC_MAX_IO, hv_storvsc_max_io); if (bootverbose) { printf("storvsc ringbuffer size: %d, max_io: %d\n", sc->hs_drv_props->drv_ringbuffer_size, sc->hs_drv_props->drv_max_ios_per_target); } /* fill in device specific properties */ sc->hs_unit = device_get_unit(dev); sc->hs_dev = dev; mtx_init(&sc->hs_lock, "hvslck", NULL, MTX_DEF); ret = storvsc_init_requests(dev); if (ret != 0) goto cleanup; /* create sg-list page pool */ if (FALSE == g_hv_sgl_page_pool.is_init) { g_hv_sgl_page_pool.is_init = TRUE; LIST_INIT(&g_hv_sgl_page_pool.in_use_sgl_list); LIST_INIT(&g_hv_sgl_page_pool.free_sgl_list); /* * Pre-create SG list, each SG list with * STORVSC_DATA_SEGCNT_MAX segments, each * segment has one page buffer */ for (i = 0; i < sc->hs_drv_props->drv_max_ios_per_target; i++) { sgl_node = malloc(sizeof(struct hv_sgl_node), M_DEVBUF, M_WAITOK|M_ZERO); sgl_node->sgl_data = sglist_alloc(STORVSC_DATA_SEGCNT_MAX, M_WAITOK|M_ZERO); for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++) { tmp_buff = malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK|M_ZERO); sgl_node->sgl_data->sg_segs[j].ss_paddr = (vm_paddr_t)tmp_buff; } LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); } } sc->hs_destroy = FALSE; sc->hs_drain_notify = FALSE; sema_init(&sc->hs_drain_sema, 0, "Store Drain Sema"); ret = hv_storvsc_connect_vsp(sc); if (ret != 0) { goto cleanup; } /* Construct cpu to channel mapping */ storvsc_create_chan_sel(sc); /* * Create the device queue. * Hyper-V maps each target to one SCSI HBA */ devq = cam_simq_alloc(sc->hs_drv_props->drv_max_ios_per_target); if (devq == NULL) { device_printf(dev, "Failed to alloc device queue\n"); ret = ENOMEM; goto cleanup; } sc->hs_sim = cam_sim_alloc(storvsc_action, storvsc_poll, sc->hs_drv_props->drv_name, sc, sc->hs_unit, &sc->hs_lock, 1, sc->hs_drv_props->drv_max_ios_per_target, devq); if (sc->hs_sim == NULL) { device_printf(dev, "Failed to alloc sim\n"); cam_simq_free(devq); ret = ENOMEM; goto cleanup; } mtx_lock(&sc->hs_lock); /* bus_id is set to 0, need to get it from VMBUS channel query? */ if (xpt_bus_register(sc->hs_sim, dev, 0) != CAM_SUCCESS) { cam_sim_free(sc->hs_sim, /*free_devq*/TRUE); mtx_unlock(&sc->hs_lock); device_printf(dev, "Unable to register SCSI bus\n"); ret = ENXIO; goto cleanup; } if (xpt_create_path(&sc->hs_path, /*periph*/NULL, cam_sim_path(sc->hs_sim), CAM_TARGET_WILDCARD, CAM_LUN_WILDCARD) != CAM_REQ_CMP) { xpt_bus_deregister(cam_sim_path(sc->hs_sim)); cam_sim_free(sc->hs_sim, /*free_devq*/TRUE); mtx_unlock(&sc->hs_lock); device_printf(dev, "Unable to create path\n"); ret = ENXIO; goto cleanup; } mtx_unlock(&sc->hs_lock); storvsc_sysctl(dev); root_mount_rel(root_mount_token); return (0); cleanup: root_mount_rel(root_mount_token); while (!LIST_EMPTY(&sc->hs_free_list)) { reqp = LIST_FIRST(&sc->hs_free_list); LIST_REMOVE(reqp, link); bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap); free(reqp, M_DEVBUF); } while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); LIST_REMOVE(sgl_node, link); for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++) { if (NULL != (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) { free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); } } sglist_free(sgl_node->sgl_data); free(sgl_node, M_DEVBUF); } return (ret); } /** * @brief StorVSC device detach function * * This function is responsible for safely detaching a * StorVSC device. This includes waiting for inbound responses * to complete and freeing associated per-device structures. * * @param dev a device * returns 0 on success */ static int storvsc_detach(device_t dev) { struct storvsc_softc *sc = device_get_softc(dev); struct hv_storvsc_request *reqp = NULL; struct hv_sgl_node *sgl_node = NULL; int j = 0; sc->hs_destroy = TRUE; /* * At this point, all outbound traffic should be disabled. We * only allow inbound traffic (responses) to proceed so that * outstanding requests can be completed. */ sc->hs_drain_notify = TRUE; sema_wait(&sc->hs_drain_sema); sc->hs_drain_notify = FALSE; /* * Since we have already drained, we don't need to busy wait. * The call to close the channel will reset the callback * under the protection of the incoming channel lock. */ vmbus_chan_close(sc->hs_chan); mtx_lock(&sc->hs_lock); while (!LIST_EMPTY(&sc->hs_free_list)) { reqp = LIST_FIRST(&sc->hs_free_list); LIST_REMOVE(reqp, link); bus_dmamap_destroy(sc->storvsc_req_dtag, reqp->data_dmap); free(reqp, M_DEVBUF); } mtx_unlock(&sc->hs_lock); while (!LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); LIST_REMOVE(sgl_node, link); for (j = 0; j < STORVSC_DATA_SEGCNT_MAX; j++){ if (NULL != (void*)sgl_node->sgl_data->sg_segs[j].ss_paddr) { free((void*)sgl_node->sgl_data->sg_segs[j].ss_paddr, M_DEVBUF); } } sglist_free(sgl_node->sgl_data); free(sgl_node, M_DEVBUF); } return (0); } #if HVS_TIMEOUT_TEST /** * @brief unit test for timed out operations * * This function provides unit testing capability to simulate * timed out operations. Recompilation with HV_TIMEOUT_TEST=1 * is required. * * @param reqp pointer to a request structure * @param opcode SCSI operation being performed * @param wait if 1, wait for I/O to complete */ static void storvsc_timeout_test(struct hv_storvsc_request *reqp, uint8_t opcode, int wait) { int ret; union ccb *ccb = reqp->ccb; struct storvsc_softc *sc = reqp->softc; if (reqp->vstor_packet.vm_srb.cdb[0] != opcode) { return; } if (wait) { mtx_lock(&reqp->event.mtx); } ret = hv_storvsc_io_request(sc, reqp); if (ret != 0) { if (wait) { mtx_unlock(&reqp->event.mtx); } printf("%s: io_request failed with %d.\n", __func__, ret); ccb->ccb_h.status = CAM_PROVIDE_FAIL; mtx_lock(&sc->hs_lock); storvsc_free_request(sc, reqp); xpt_done(ccb); mtx_unlock(&sc->hs_lock); return; } if (wait) { xpt_print(ccb->ccb_h.path, "%u: %s: waiting for IO return.\n", ticks, __func__); ret = cv_timedwait(&reqp->event.cv, &reqp->event.mtx, 60*hz); mtx_unlock(&reqp->event.mtx); xpt_print(ccb->ccb_h.path, "%u: %s: %s.\n", ticks, __func__, (ret == 0)? "IO return detected" : "IO return not detected"); /* * Now both the timer handler and io done are running * simultaneously. We want to confirm the io done always * finishes after the timer handler exits. So reqp used by * timer handler is not freed or stale. Do busy loop for * another 1/10 second to make sure io done does * wait for the timer handler to complete. */ DELAY(100*1000); mtx_lock(&sc->hs_lock); xpt_print(ccb->ccb_h.path, "%u: %s: finishing, queue frozen %d, " "ccb status 0x%x scsi_status 0x%x.\n", ticks, __func__, sc->hs_frozen, ccb->ccb_h.status, ccb->csio.scsi_status); mtx_unlock(&sc->hs_lock); } } #endif /* HVS_TIMEOUT_TEST */ #ifdef notyet /** * @brief timeout handler for requests * * This function is called as a result of a callout expiring. * * @param arg pointer to a request */ static void storvsc_timeout(void *arg) { struct hv_storvsc_request *reqp = arg; struct storvsc_softc *sc = reqp->softc; union ccb *ccb = reqp->ccb; if (reqp->retries == 0) { mtx_lock(&sc->hs_lock); xpt_print(ccb->ccb_h.path, "%u: IO timed out (req=0x%p), wait for another %u secs.\n", ticks, reqp, ccb->ccb_h.timeout / 1000); cam_error_print(ccb, CAM_ESF_ALL, CAM_EPF_ALL); mtx_unlock(&sc->hs_lock); reqp->retries++; callout_reset_sbt(&reqp->callout, SBT_1MS * ccb->ccb_h.timeout, 0, storvsc_timeout, reqp, 0); #if HVS_TIMEOUT_TEST storvsc_timeout_test(reqp, SEND_DIAGNOSTIC, 0); #endif return; } mtx_lock(&sc->hs_lock); xpt_print(ccb->ccb_h.path, "%u: IO (reqp = 0x%p) did not return for %u seconds, %s.\n", ticks, reqp, ccb->ccb_h.timeout * (reqp->retries+1) / 1000, (sc->hs_frozen == 0)? "freezing the queue" : "the queue is already frozen"); if (sc->hs_frozen == 0) { sc->hs_frozen = 1; xpt_freeze_simq(xpt_path_sim(ccb->ccb_h.path), 1); } mtx_unlock(&sc->hs_lock); #if HVS_TIMEOUT_TEST storvsc_timeout_test(reqp, MODE_SELECT_10, 1); #endif } #endif /** * @brief StorVSC device poll function * * This function is responsible for servicing requests when * interrupts are disabled (i.e when we are dumping core.) * * @param sim a pointer to a CAM SCSI interface module */ static void storvsc_poll(struct cam_sim *sim) { struct storvsc_softc *sc = cam_sim_softc(sim); mtx_assert(&sc->hs_lock, MA_OWNED); mtx_unlock(&sc->hs_lock); hv_storvsc_on_channel_callback(sc->hs_chan, sc); mtx_lock(&sc->hs_lock); } /** * @brief StorVSC device action function * * This function is responsible for handling SCSI operations which * are passed from the CAM layer. The requests are in the form of * CAM control blocks which indicate the action being performed. * Not all actions require converting the request to a VSCSI protocol * message - these actions can be responded to by this driver. * Requests which are destined for a backend storage device are converted * to a VSCSI protocol message and sent on the channel connection associated * with this device. * * @param sim pointer to a CAM SCSI interface module * @param ccb pointer to a CAM control block */ static void storvsc_action(struct cam_sim *sim, union ccb *ccb) { struct storvsc_softc *sc = cam_sim_softc(sim); int res; mtx_assert(&sc->hs_lock, MA_OWNED); switch (ccb->ccb_h.func_code) { case XPT_PATH_INQ: { struct ccb_pathinq *cpi = &ccb->cpi; cpi->version_num = 1; cpi->hba_inquiry = PI_TAG_ABLE|PI_SDTR_ABLE; cpi->target_sprt = 0; cpi->hba_misc = PIM_NOBUSRESET; if (hv_storvsc_use_pim_unmapped) cpi->hba_misc |= PIM_UNMAPPED; cpi->maxio = STORVSC_DATA_SIZE_MAX; cpi->hba_eng_cnt = 0; cpi->max_target = STORVSC_MAX_TARGETS; cpi->max_lun = sc->hs_drv_props->drv_max_luns_per_target; cpi->initiator_id = cpi->max_target; cpi->bus_id = cam_sim_bus(sim); cpi->base_transfer_speed = 300000; cpi->transport = XPORT_SAS; cpi->transport_version = 0; cpi->protocol = PROTO_SCSI; cpi->protocol_version = SCSI_REV_SPC2; strlcpy(cpi->sim_vid, "FreeBSD", SIM_IDLEN); strlcpy(cpi->hba_vid, sc->hs_drv_props->drv_name, HBA_IDLEN); strlcpy(cpi->dev_name, cam_sim_name(sim), DEV_IDLEN); cpi->unit_number = cam_sim_unit(sim); ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); return; } case XPT_GET_TRAN_SETTINGS: { struct ccb_trans_settings *cts = &ccb->cts; cts->transport = XPORT_SAS; cts->transport_version = 0; cts->protocol = PROTO_SCSI; cts->protocol_version = SCSI_REV_SPC2; /* enable tag queuing and disconnected mode */ cts->proto_specific.valid = CTS_SCSI_VALID_TQ; cts->proto_specific.scsi.valid = CTS_SCSI_VALID_TQ; cts->proto_specific.scsi.flags = CTS_SCSI_FLAGS_TAG_ENB; cts->xport_specific.valid = CTS_SPI_VALID_DISC; cts->xport_specific.spi.flags = CTS_SPI_FLAGS_DISC_ENB; ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); return; } case XPT_SET_TRAN_SETTINGS: { ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); return; } case XPT_CALC_GEOMETRY:{ cam_calc_geometry(&ccb->ccg, 1); xpt_done(ccb); return; } case XPT_RESET_BUS: case XPT_RESET_DEV:{ #if HVS_HOST_RESET if ((res = hv_storvsc_host_reset(sc)) != 0) { xpt_print(ccb->ccb_h.path, "hv_storvsc_host_reset failed with %d\n", res); ccb->ccb_h.status = CAM_PROVIDE_FAIL; xpt_done(ccb); return; } ccb->ccb_h.status = CAM_REQ_CMP; xpt_done(ccb); return; #else xpt_print(ccb->ccb_h.path, "%s reset not supported.\n", (ccb->ccb_h.func_code == XPT_RESET_BUS)? "bus" : "dev"); ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(ccb); return; #endif /* HVS_HOST_RESET */ } case XPT_SCSI_IO: case XPT_IMMED_NOTIFY: { struct hv_storvsc_request *reqp = NULL; bus_dmamap_t dmap_saved; if (ccb->csio.cdb_len == 0) { panic("cdl_len is 0\n"); } if (LIST_EMPTY(&sc->hs_free_list)) { ccb->ccb_h.status = CAM_REQUEUE_REQ; if (sc->hs_frozen == 0) { sc->hs_frozen = 1; xpt_freeze_simq(sim, /* count*/1); } xpt_done(ccb); return; } reqp = LIST_FIRST(&sc->hs_free_list); LIST_REMOVE(reqp, link); /* Save the data_dmap before reset request */ dmap_saved = reqp->data_dmap; /* XXX this is ugly */ bzero(reqp, sizeof(struct hv_storvsc_request)); /* Restore necessary bits */ reqp->data_dmap = dmap_saved; reqp->softc = sc; ccb->ccb_h.status |= CAM_SIM_QUEUED; if ((res = create_storvsc_request(ccb, reqp)) != 0) { ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(ccb); return; } #ifdef notyet if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) { callout_init(&reqp->callout, 1); callout_reset_sbt(&reqp->callout, SBT_1MS * ccb->ccb_h.timeout, 0, storvsc_timeout, reqp, 0); #if HVS_TIMEOUT_TEST cv_init(&reqp->event.cv, "storvsc timeout cv"); mtx_init(&reqp->event.mtx, "storvsc timeout mutex", NULL, MTX_DEF); switch (reqp->vstor_packet.vm_srb.cdb[0]) { case MODE_SELECT_10: case SEND_DIAGNOSTIC: /* To have timer send the request. */ return; default: break; } #endif /* HVS_TIMEOUT_TEST */ } #endif if ((res = hv_storvsc_io_request(sc, reqp)) != 0) { xpt_print(ccb->ccb_h.path, "hv_storvsc_io_request failed with %d\n", res); ccb->ccb_h.status = CAM_PROVIDE_FAIL; storvsc_free_request(sc, reqp); xpt_done(ccb); return; } return; } default: ccb->ccb_h.status = CAM_REQ_INVALID; xpt_done(ccb); return; } } /** * @brief destroy bounce buffer * * This function is responsible for destroy a Scatter/Gather list * that create by storvsc_create_bounce_buffer() * * @param sgl- the Scatter/Gather need be destroy * @param sg_count- page count of the SG list. * */ static void storvsc_destroy_bounce_buffer(struct sglist *sgl) { struct hv_sgl_node *sgl_node = NULL; if (LIST_EMPTY(&g_hv_sgl_page_pool.in_use_sgl_list)) { printf("storvsc error: not enough in use sgl\n"); return; } sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.in_use_sgl_list); LIST_REMOVE(sgl_node, link); sgl_node->sgl_data = sgl; LIST_INSERT_HEAD(&g_hv_sgl_page_pool.free_sgl_list, sgl_node, link); } /** * @brief create bounce buffer * * This function is responsible for create a Scatter/Gather list, * which hold several pages that can be aligned with page size. * * @param seg_count- SG-list segments count * @param write - if WRITE_TYPE, set SG list page used size to 0, * otherwise set used size to page size. * * return NULL if create failed */ static struct sglist * storvsc_create_bounce_buffer(uint16_t seg_count, int write) { int i = 0; struct sglist *bounce_sgl = NULL; unsigned int buf_len = ((write == WRITE_TYPE) ? 0 : PAGE_SIZE); struct hv_sgl_node *sgl_node = NULL; /* get struct sglist from free_sgl_list */ if (LIST_EMPTY(&g_hv_sgl_page_pool.free_sgl_list)) { printf("storvsc error: not enough free sgl\n"); return NULL; } sgl_node = LIST_FIRST(&g_hv_sgl_page_pool.free_sgl_list); LIST_REMOVE(sgl_node, link); bounce_sgl = sgl_node->sgl_data; LIST_INSERT_HEAD(&g_hv_sgl_page_pool.in_use_sgl_list, sgl_node, link); bounce_sgl->sg_maxseg = seg_count; if (write == WRITE_TYPE) bounce_sgl->sg_nseg = 0; else bounce_sgl->sg_nseg = seg_count; for (i = 0; i < seg_count; i++) bounce_sgl->sg_segs[i].ss_len = buf_len; return bounce_sgl; } /** * @brief copy data from SG list to bounce buffer * * This function is responsible for copy data from one SG list's segments * to another SG list which used as bounce buffer. * * @param bounce_sgl - the destination SG list * @param orig_sgl - the segment of the source SG list. * @param orig_sgl_count - the count of segments. * @param orig_sgl_count - indicate which segment need bounce buffer, * set 1 means need. * */ static void storvsc_copy_sgl_to_bounce_buf(struct sglist *bounce_sgl, bus_dma_segment_t *orig_sgl, unsigned int orig_sgl_count, uint64_t seg_bits) { int src_sgl_idx = 0; for (src_sgl_idx = 0; src_sgl_idx < orig_sgl_count; src_sgl_idx++) { if (seg_bits & (1 << src_sgl_idx)) { memcpy((void*)bounce_sgl->sg_segs[src_sgl_idx].ss_paddr, (void*)orig_sgl[src_sgl_idx].ds_addr, orig_sgl[src_sgl_idx].ds_len); bounce_sgl->sg_segs[src_sgl_idx].ss_len = orig_sgl[src_sgl_idx].ds_len; } } } /** * @brief copy data from SG list which used as bounce to another SG list * * This function is responsible for copy data from one SG list with bounce * buffer to another SG list's segments. * * @param dest_sgl - the destination SG list's segments * @param dest_sgl_count - the count of destination SG list's segment. * @param src_sgl - the source SG list. * @param seg_bits - indicate which segment used bounce buffer of src SG-list. * */ void storvsc_copy_from_bounce_buf_to_sgl(bus_dma_segment_t *dest_sgl, unsigned int dest_sgl_count, struct sglist* src_sgl, uint64_t seg_bits) { int sgl_idx = 0; for (sgl_idx = 0; sgl_idx < dest_sgl_count; sgl_idx++) { if (seg_bits & (1 << sgl_idx)) { memcpy((void*)(dest_sgl[sgl_idx].ds_addr), (void*)(src_sgl->sg_segs[sgl_idx].ss_paddr), src_sgl->sg_segs[sgl_idx].ss_len); } } } /** * @brief check SG list with bounce buffer or not * * This function is responsible for check if need bounce buffer for SG list. * * @param sgl - the SG list's segments * @param sg_count - the count of SG list's segment. * @param bits - segmengs number that need bounce buffer * * return -1 if SG list needless bounce buffer */ static int storvsc_check_bounce_buffer_sgl(bus_dma_segment_t *sgl, unsigned int sg_count, uint64_t *bits) { int i = 0; int offset = 0; uint64_t phys_addr = 0; uint64_t tmp_bits = 0; boolean_t found_hole = FALSE; boolean_t pre_aligned = TRUE; if (sg_count < 2){ return -1; } *bits = 0; phys_addr = vtophys(sgl[0].ds_addr); offset = phys_addr - trunc_page(phys_addr); if (offset != 0) { pre_aligned = FALSE; tmp_bits |= 1; } for (i = 1; i < sg_count; i++) { phys_addr = vtophys(sgl[i].ds_addr); offset = phys_addr - trunc_page(phys_addr); if (offset == 0) { if (FALSE == pre_aligned){ /* * This segment is aligned, if the previous * one is not aligned, find a hole */ found_hole = TRUE; } pre_aligned = TRUE; } else { tmp_bits |= 1ULL << i; if (!pre_aligned) { if (phys_addr != vtophys(sgl[i-1].ds_addr + sgl[i-1].ds_len)) { /* * Check whether connect to previous * segment,if not, find the hole */ found_hole = TRUE; } } else { found_hole = TRUE; } pre_aligned = FALSE; } } if (!found_hole) { return (-1); } else { *bits = tmp_bits; return 0; } } /** * Copy bus_dma segments to multiple page buffer, which requires * the pages are compact composed except for the 1st and last pages. */ static void storvsc_xferbuf_prepare(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { struct hv_storvsc_request *reqp = arg; union ccb *ccb = reqp->ccb; struct ccb_scsiio *csio = &ccb->csio; struct storvsc_gpa_range *prplist; int i; prplist = &reqp->prp_list; prplist->gpa_range.gpa_len = csio->dxfer_len; prplist->gpa_range.gpa_ofs = segs[0].ds_addr & PAGE_MASK; for (i = 0; i < nsegs; i++) { #ifdef INVARIANTS if (nsegs > 1) { if (i == 0) { KASSERT((segs[i].ds_addr & PAGE_MASK) + segs[i].ds_len == PAGE_SIZE, ("invalid 1st page, ofs 0x%jx, len %zu", (uintmax_t)segs[i].ds_addr, segs[i].ds_len)); } else if (i == nsegs - 1) { KASSERT((segs[i].ds_addr & PAGE_MASK) == 0, ("invalid last page, ofs 0x%jx", (uintmax_t)segs[i].ds_addr)); } else { KASSERT((segs[i].ds_addr & PAGE_MASK) == 0 && segs[i].ds_len == PAGE_SIZE, ("not a full page, ofs 0x%jx, len %zu", (uintmax_t)segs[i].ds_addr, segs[i].ds_len)); } } #endif prplist->gpa_page[i] = atop(segs[i].ds_addr); } reqp->prp_cnt = nsegs; } /** * @brief Fill in a request structure based on a CAM control block * * Fills in a request structure based on the contents of a CAM control * block. The request structure holds the payload information for * VSCSI protocol request. * * @param ccb pointer to a CAM contorl block * @param reqp pointer to a request structure */ static int create_storvsc_request(union ccb *ccb, struct hv_storvsc_request *reqp) { struct ccb_scsiio *csio = &ccb->csio; uint64_t phys_addr; uint32_t pfn; uint64_t not_aligned_seg_bits = 0; int error; /* refer to struct vmscsi_req for meanings of these two fields */ reqp->vstor_packet.u.vm_srb.port = cam_sim_unit(xpt_path_sim(ccb->ccb_h.path)); reqp->vstor_packet.u.vm_srb.path_id = cam_sim_bus(xpt_path_sim(ccb->ccb_h.path)); reqp->vstor_packet.u.vm_srb.target_id = ccb->ccb_h.target_id; reqp->vstor_packet.u.vm_srb.lun = ccb->ccb_h.target_lun; reqp->vstor_packet.u.vm_srb.cdb_len = csio->cdb_len; if(ccb->ccb_h.flags & CAM_CDB_POINTER) { memcpy(&reqp->vstor_packet.u.vm_srb.u.cdb, csio->cdb_io.cdb_ptr, csio->cdb_len); } else { memcpy(&reqp->vstor_packet.u.vm_srb.u.cdb, csio->cdb_io.cdb_bytes, csio->cdb_len); } if (hv_storvsc_use_win8ext_flags) { reqp->vstor_packet.u.vm_srb.win8_extension.time_out_value = 60; reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |= SRB_FLAGS_DISABLE_SYNCH_TRANSFER; } switch (ccb->ccb_h.flags & CAM_DIR_MASK) { case CAM_DIR_OUT: reqp->vstor_packet.u.vm_srb.data_in = WRITE_TYPE; if (hv_storvsc_use_win8ext_flags) { reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |= SRB_FLAGS_DATA_OUT; } break; case CAM_DIR_IN: reqp->vstor_packet.u.vm_srb.data_in = READ_TYPE; if (hv_storvsc_use_win8ext_flags) { reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |= SRB_FLAGS_DATA_IN; } break; case CAM_DIR_NONE: reqp->vstor_packet.u.vm_srb.data_in = UNKNOWN_TYPE; if (hv_storvsc_use_win8ext_flags) { reqp->vstor_packet.u.vm_srb.win8_extension.srb_flags |= SRB_FLAGS_NO_DATA_TRANSFER; } break; default: printf("Error: unexpected data direction: 0x%x\n", ccb->ccb_h.flags & CAM_DIR_MASK); return (EINVAL); } reqp->sense_data = &csio->sense_data; reqp->sense_info_len = csio->sense_len; reqp->ccb = ccb; if (0 == csio->dxfer_len) { return (0); } switch (ccb->ccb_h.flags & CAM_DATA_MASK) { case CAM_DATA_BIO: case CAM_DATA_VADDR: error = bus_dmamap_load_ccb(reqp->softc->storvsc_req_dtag, reqp->data_dmap, ccb, storvsc_xferbuf_prepare, reqp, BUS_DMA_NOWAIT); if (error) { xpt_print(ccb->ccb_h.path, "bus_dmamap_load_ccb failed: %d\n", error); return (error); } if ((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO) reqp->softc->sysctl_data.data_bio_cnt++; else reqp->softc->sysctl_data.data_vaddr_cnt++; break; case CAM_DATA_SG: { struct storvsc_gpa_range *prplist; int i = 0; int offset = 0; int ret; bus_dma_segment_t *storvsc_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr; u_int16_t storvsc_sg_count = ccb->csio.sglist_cnt; prplist = &reqp->prp_list; prplist->gpa_range.gpa_len = csio->dxfer_len; printf("Storvsc: get SG I/O operation, %d\n", reqp->vstor_packet.u.vm_srb.data_in); if (storvsc_sg_count > STORVSC_DATA_SEGCNT_MAX){ printf("Storvsc: %d segments is too much, " "only support %d segments\n", storvsc_sg_count, STORVSC_DATA_SEGCNT_MAX); return (EINVAL); } /* * We create our own bounce buffer function currently. Idealy * we should use BUS_DMA(9) framework. But with current BUS_DMA * code there is no callback API to check the page alignment of * middle segments before busdma can decide if a bounce buffer * is needed for particular segment. There is callback, * "bus_dma_filter_t *filter", but the parrameters are not * sufficient for storvsc driver. * TODO: * Add page alignment check in BUS_DMA(9) callback. Once * this is complete, switch the following code to use * BUS_DMA(9) for storvsc bounce buffer support. */ /* check if we need to create bounce buffer */ ret = storvsc_check_bounce_buffer_sgl(storvsc_sglist, storvsc_sg_count, ¬_aligned_seg_bits); if (ret != -1) { reqp->bounce_sgl = storvsc_create_bounce_buffer(storvsc_sg_count, reqp->vstor_packet.u.vm_srb.data_in); if (NULL == reqp->bounce_sgl) { printf("Storvsc_error: " "create bounce buffer failed.\n"); return (ENOMEM); } reqp->bounce_sgl_count = storvsc_sg_count; reqp->not_aligned_seg_bits = not_aligned_seg_bits; /* * if it is write, we need copy the original data *to bounce buffer */ if (WRITE_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { storvsc_copy_sgl_to_bounce_buf( reqp->bounce_sgl, storvsc_sglist, storvsc_sg_count, reqp->not_aligned_seg_bits); } /* transfer virtual address to physical frame number */ if (reqp->not_aligned_seg_bits & 0x1){ phys_addr = vtophys(reqp->bounce_sgl->sg_segs[0].ss_paddr); }else{ phys_addr = vtophys(storvsc_sglist[0].ds_addr); } prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK; pfn = phys_addr >> PAGE_SHIFT; prplist->gpa_page[0] = pfn; for (i = 1; i < storvsc_sg_count; i++) { if (reqp->not_aligned_seg_bits & (1 << i)) { phys_addr = vtophys(reqp->bounce_sgl->sg_segs[i].ss_paddr); } else { phys_addr = vtophys(storvsc_sglist[i].ds_addr); } pfn = phys_addr >> PAGE_SHIFT; prplist->gpa_page[i] = pfn; } reqp->prp_cnt = i; } else { phys_addr = vtophys(storvsc_sglist[0].ds_addr); prplist->gpa_range.gpa_ofs = phys_addr & PAGE_MASK; for (i = 0; i < storvsc_sg_count; i++) { phys_addr = vtophys(storvsc_sglist[i].ds_addr); pfn = phys_addr >> PAGE_SHIFT; prplist->gpa_page[i] = pfn; } reqp->prp_cnt = i; /* check the last segment cross boundary or not */ offset = phys_addr & PAGE_MASK; if (offset) { /* Add one more PRP entry */ phys_addr = vtophys(storvsc_sglist[i-1].ds_addr + PAGE_SIZE - offset); pfn = phys_addr >> PAGE_SHIFT; prplist->gpa_page[i] = pfn; reqp->prp_cnt++; } reqp->bounce_sgl_count = 0; } reqp->softc->sysctl_data.data_sg_cnt++; break; } default: printf("Unknow flags: %d\n", ccb->ccb_h.flags); return(EINVAL); } return(0); } static uint32_t is_scsi_valid(const struct scsi_inquiry_data *inq_data) { u_int8_t type; type = SID_TYPE(inq_data); if (type == T_NODEVICE) return (0); if (SID_QUAL(inq_data) == SID_QUAL_BAD_LU) return (0); return (1); } /** * @brief completion function before returning to CAM * * I/O process has been completed and the result needs * to be passed to the CAM layer. * Free resources related to this request. * * @param reqp pointer to a request structure */ static void storvsc_io_done(struct hv_storvsc_request *reqp) { union ccb *ccb = reqp->ccb; struct ccb_scsiio *csio = &ccb->csio; struct storvsc_softc *sc = reqp->softc; struct vmscsi_req *vm_srb = &reqp->vstor_packet.u.vm_srb; bus_dma_segment_t *ori_sglist = NULL; int ori_sg_count = 0; const struct scsi_generic *cmd; /* destroy bounce buffer if it is used */ if (reqp->bounce_sgl_count) { ori_sglist = (bus_dma_segment_t *)ccb->csio.data_ptr; ori_sg_count = ccb->csio.sglist_cnt; /* * If it is READ operation, we should copy back the data * to original SG list. */ if (READ_TYPE == reqp->vstor_packet.u.vm_srb.data_in) { storvsc_copy_from_bounce_buf_to_sgl(ori_sglist, ori_sg_count, reqp->bounce_sgl, reqp->not_aligned_seg_bits); } storvsc_destroy_bounce_buffer(reqp->bounce_sgl); reqp->bounce_sgl_count = 0; } if (reqp->retries > 0) { mtx_lock(&sc->hs_lock); #if HVS_TIMEOUT_TEST xpt_print(ccb->ccb_h.path, "%u: IO returned after timeout, " "waking up timer handler if any.\n", ticks); mtx_lock(&reqp->event.mtx); cv_signal(&reqp->event.cv); mtx_unlock(&reqp->event.mtx); #endif reqp->retries = 0; xpt_print(ccb->ccb_h.path, "%u: IO returned after timeout, " "stopping timer if any.\n", ticks); mtx_unlock(&sc->hs_lock); } #ifdef notyet /* * callout_drain() will wait for the timer handler to finish * if it is running. So we don't need any lock to synchronize * between this routine and the timer handler. * Note that we need to make sure reqp is not freed when timer * handler is using or will use it. */ if (ccb->ccb_h.timeout != CAM_TIME_INFINITY) { callout_drain(&reqp->callout); } #endif cmd = (const struct scsi_generic *) ((ccb->ccb_h.flags & CAM_CDB_POINTER) ? csio->cdb_io.cdb_ptr : csio->cdb_io.cdb_bytes); ccb->ccb_h.status &= ~CAM_SIM_QUEUED; ccb->ccb_h.status &= ~CAM_STATUS_MASK; int srb_status = SRB_STATUS(vm_srb->srb_status); if (vm_srb->scsi_status == SCSI_STATUS_OK) { if (srb_status != SRB_STATUS_SUCCESS) { /* * If there are errors, for example, invalid LUN, * host will inform VM through SRB status. */ if (bootverbose) { if (srb_status == SRB_STATUS_INVALID_LUN) { xpt_print(ccb->ccb_h.path, "invalid LUN %d for op: %s\n", vm_srb->lun, scsi_op_desc(cmd->opcode, NULL)); } else { xpt_print(ccb->ccb_h.path, "Unknown SRB flag: %d for op: %s\n", srb_status, scsi_op_desc(cmd->opcode, NULL)); } } /* * XXX For a selection timeout, all of the LUNs * on the target will be gone. It works for SCSI * disks, but does not work for IDE disks. * * For CAM_DEV_NOT_THERE, CAM will only get * rid of the device(s) specified by the path. */ if (storvsc_get_storage_type(sc->hs_dev) == DRIVER_STORVSC) ccb->ccb_h.status |= CAM_SEL_TIMEOUT; else ccb->ccb_h.status |= CAM_DEV_NOT_THERE; } else { ccb->ccb_h.status |= CAM_REQ_CMP; } if (cmd->opcode == INQUIRY && srb_status == SRB_STATUS_SUCCESS) { int resp_xfer_len, resp_buf_len, data_len; uint8_t *resp_buf = (uint8_t *)csio->data_ptr; struct scsi_inquiry_data *inq_data = (struct scsi_inquiry_data *)csio->data_ptr; /* Get the buffer length reported by host */ resp_xfer_len = vm_srb->transfer_len; /* Get the available buffer length */ resp_buf_len = resp_xfer_len >= 5 ? resp_buf[4] + 5 : 0; data_len = (resp_buf_len < resp_xfer_len) ? resp_buf_len : resp_xfer_len; if (bootverbose && data_len >= 5) { xpt_print(ccb->ccb_h.path, "storvsc inquiry " "(%d) [%x %x %x %x %x ... ]\n", data_len, resp_buf[0], resp_buf[1], resp_buf[2], resp_buf[3], resp_buf[4]); } /* * XXX: Hyper-V (since win2012r2) responses inquiry with * unknown version (0) for GEN-2 DVD device. * Manually set the version number to SPC3 in order to * ask CAM to continue probing with "PROBE_REPORT_LUNS". * see probedone() in scsi_xpt.c */ if (SID_TYPE(inq_data) == T_CDROM && inq_data->version == 0 && (vmstor_proto_version >= VMSTOR_PROTOCOL_VERSION_WIN8)) { inq_data->version = SCSI_REV_SPC3; if (bootverbose) { xpt_print(ccb->ccb_h.path, "set version from 0 to %d\n", inq_data->version); } } /* * XXX: Manually fix the wrong response returned from WS2012 */ if (!is_scsi_valid(inq_data) && (vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8_1 || vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8 || vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN7)) { if (data_len >= 4 && (resp_buf[2] == 0 || resp_buf[3] == 0)) { resp_buf[2] = SCSI_REV_SPC3; resp_buf[3] = 2; // resp fmt must be 2 if (bootverbose) xpt_print(ccb->ccb_h.path, "fix version and resp fmt for 0x%x\n", vmstor_proto_version); } } else if (data_len >= SHORT_INQUIRY_LENGTH) { char vendor[16]; cam_strvis(vendor, inq_data->vendor, sizeof(inq_data->vendor), sizeof(vendor)); /* * XXX: Upgrade SPC2 to SPC3 if host is WIN8 or * WIN2012 R2 in order to support UNMAP feature. */ if (!strncmp(vendor, "Msft", 4) && SID_ANSI_REV(inq_data) == SCSI_REV_SPC2 && (vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8_1 || vmstor_proto_version == VMSTOR_PROTOCOL_VERSION_WIN8)) { inq_data->version = SCSI_REV_SPC3; if (bootverbose) { xpt_print(ccb->ccb_h.path, "storvsc upgrades " "SPC2 to SPC3\n"); } } } } } else { /** * On Some Windows hosts TEST_UNIT_READY command can return * SRB_STATUS_ERROR and sense data, for example, asc=0x3a,1 * "(Medium not present - tray closed)". This error can be * ignored since it will be sent to host periodically. */ boolean_t unit_not_ready = \ vm_srb->scsi_status == SCSI_STATUS_CHECK_COND && cmd->opcode == TEST_UNIT_READY && srb_status == SRB_STATUS_ERROR; if (!unit_not_ready && bootverbose) { mtx_lock(&sc->hs_lock); xpt_print(ccb->ccb_h.path, "storvsc scsi_status = %d, srb_status = %d\n", vm_srb->scsi_status, srb_status); mtx_unlock(&sc->hs_lock); } ccb->ccb_h.status |= CAM_SCSI_STATUS_ERROR; } ccb->csio.scsi_status = (vm_srb->scsi_status & 0xFF); ccb->csio.resid = ccb->csio.dxfer_len - vm_srb->transfer_len; if (reqp->sense_info_len != 0) { csio->sense_resid = csio->sense_len - reqp->sense_info_len; ccb->ccb_h.status |= CAM_AUTOSNS_VALID; } mtx_lock(&sc->hs_lock); if (reqp->softc->hs_frozen == 1) { xpt_print(ccb->ccb_h.path, "%u: storvsc unfreezing softc 0x%p.\n", ticks, reqp->softc); ccb->ccb_h.status |= CAM_RELEASE_SIMQ; reqp->softc->hs_frozen = 0; } storvsc_free_request(sc, reqp); mtx_unlock(&sc->hs_lock); xpt_done_direct(ccb); } /** * @brief Free a request structure * * Free a request structure by returning it to the free list * * @param sc pointer to a softc * @param reqp pointer to a request structure */ static void storvsc_free_request(struct storvsc_softc *sc, struct hv_storvsc_request *reqp) { LIST_INSERT_HEAD(&sc->hs_free_list, reqp, link); } /** * @brief Determine type of storage device from GUID * * Using the type GUID, determine if this is a StorVSC (paravirtual * SCSI or BlkVSC (paravirtual IDE) device. * * @param dev a device * returns an enum */ static enum hv_storage_type storvsc_get_storage_type(device_t dev) { device_t parent = device_get_parent(dev); if (VMBUS_PROBE_GUID(parent, dev, &gBlkVscDeviceType) == 0) return DRIVER_BLKVSC; if (VMBUS_PROBE_GUID(parent, dev, &gStorVscDeviceType) == 0) return DRIVER_STORVSC; return DRIVER_UNKNOWN; } #define PCI_VENDOR_INTEL 0x8086 #define PCI_PRODUCT_PIIX4 0x7111 static void storvsc_ada_probe_veto(void *arg __unused, struct cam_path *path, struct ata_params *ident_buf __unused, int *veto) { /* * The ATA disks are shared with the controllers managed * by this driver, so veto the ATA disks' attachment; the * ATA disks will be attached as SCSI disks once this driver * attached. */ if (path->device->protocol == PROTO_ATA) { struct ccb_pathinq cpi; bzero(&cpi, sizeof(cpi)); xpt_setup_ccb(&cpi.ccb_h, path, CAM_PRIORITY_NONE); cpi.ccb_h.func_code = XPT_PATH_INQ; xpt_action((union ccb *)&cpi); if (cpi.ccb_h.status == CAM_REQ_CMP && cpi.hba_vendor == PCI_VENDOR_INTEL && cpi.hba_device == PCI_PRODUCT_PIIX4) { (*veto)++; if (bootverbose) { xpt_print(path, "Disable ATA disks on " "simulated ATA controller (0x%04x%04x)\n", cpi.hba_device, cpi.hba_vendor); } } } } static void storvsc_sysinit(void *arg __unused) { if (vm_guest == VM_GUEST_HV) { storvsc_handler_tag = EVENTHANDLER_REGISTER(ada_probe_veto, storvsc_ada_probe_veto, NULL, EVENTHANDLER_PRI_ANY); } } SYSINIT(storvsc_sys_init, SI_SUB_DRIVERS, SI_ORDER_SECOND, storvsc_sysinit, NULL); static void storvsc_sysuninit(void *arg __unused) { if (storvsc_handler_tag != NULL) EVENTHANDLER_DEREGISTER(ada_probe_veto, storvsc_handler_tag); } SYSUNINIT(storvsc_sys_uninit, SI_SUB_DRIVERS, SI_ORDER_SECOND, storvsc_sysuninit, NULL); Index: head/sys/dev/hyperv/storvsc/hv_vstorage.h =================================================================== --- head/sys/dev/hyperv/storvsc/hv_vstorage.h (revision 322487) +++ head/sys/dev/hyperv/storvsc/hv_vstorage.h (revision 322488) @@ -1,286 +1,286 @@ /*- - * Copyright (c) 2009-2012 Microsoft Corp. + * Copyright (c) 2009-2012,2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __HV_VSTORAGE_H__ #define __HV_VSTORAGE_H__ /* * Major/minor macros. Minor version is in LSB, meaning that earlier flat * version numbers will be interpreted as "0.x" (i.e., 1 becomes 0.1). */ #define VMSTOR_PROTOCOL_MAJOR(VERSION_) (((VERSION_) >> 8) & 0xff) #define VMSTOR_PROTOCOL_MINOR(VERSION_) (((VERSION_) ) & 0xff) #define VMSTOR_PROTOCOL_VERSION(MAJOR_, MINOR_) ((((MAJOR_) & 0xff) << 8) | \ (((MINOR_) & 0xff) )) #define VMSTOR_PROTOCOL_VERSION_WIN6 VMSTOR_PROTOCOL_VERSION(2, 0) #define VMSTOR_PROTOCOL_VERSION_WIN7 VMSTOR_PROTOCOL_VERSION(4, 2) #define VMSTOR_PROTOCOL_VERSION_WIN8 VMSTOR_PROTOCOL_VERSION(5, 1) #define VMSTOR_PROTOCOL_VERSION_WIN8_1 VMSTOR_PROTOCOL_VERSION(6, 0) #define VMSTOR_PROTOCOL_VERSION_WIN10 VMSTOR_PROTOCOL_VERSION(6, 2) /* * Invalid version. */ #define VMSTOR_INVALID_PROTOCOL_VERSION -1 /* * Version history: * V1 Beta 0.1 * V1 RC < 2008/1/31 1.0 * V1 RC > 2008/1/31 2.0 * Win7: 4.2 * Win8: 5.1 */ #define VMSTOR_PROTOCOL_VERSION_CURRENT VMSTOR_PROTOCOL_VERSION(5, 1) /** * Packet structure ops describing virtual storage requests. */ enum vstor_packet_ops { VSTOR_OPERATION_COMPLETEIO = 1, VSTOR_OPERATION_REMOVEDEVICE = 2, VSTOR_OPERATION_EXECUTESRB = 3, VSTOR_OPERATION_RESETLUN = 4, VSTOR_OPERATION_RESETADAPTER = 5, VSTOR_OPERATION_RESETBUS = 6, VSTOR_OPERATION_BEGININITIALIZATION = 7, VSTOR_OPERATION_ENDINITIALIZATION = 8, VSTOR_OPERATION_QUERYPROTOCOLVERSION = 9, VSTOR_OPERATION_QUERYPROPERTIES = 10, VSTOR_OPERATION_ENUMERATE_BUS = 11, VSTOR_OPERATION_FCHBA_DATA = 12, VSTOR_OPERATION_CREATE_MULTI_CHANNELS = 13, VSTOR_OPERATION_MAXIMUM = 13 }; /* * Platform neutral description of a scsi request - * this remains the same across the write regardless of 32/64 bit * note: it's patterned off the Windows DDK SCSI_PASS_THROUGH structure */ #define CDB16GENERIC_LENGTH 0x10 #define SENSE_BUFFER_SIZE 0x14 #define MAX_DATA_BUFFER_LENGTH_WITH_PADDING 0x14 #define POST_WIN7_STORVSC_SENSE_BUFFER_SIZE 0x14 #define PRE_WIN8_STORVSC_SENSE_BUFFER_SIZE 0x12 struct vmscsi_win8_extension { /* * The following were added in Windows 8 */ uint16_t reserve; uint8_t queue_tag; uint8_t queue_action; uint32_t srb_flags; uint32_t time_out_value; uint32_t queue_sort_ey; } __packed; struct vmscsi_req { uint16_t length; uint8_t srb_status; uint8_t scsi_status; /* HBA number, set to the order number detected by initiator. */ uint8_t port; /* SCSI bus number or bus_id, different from CAM's path_id. */ uint8_t path_id; uint8_t target_id; uint8_t lun; uint8_t cdb_len; uint8_t sense_info_len; uint8_t data_in; uint8_t reserved; uint32_t transfer_len; union { uint8_t cdb[CDB16GENERIC_LENGTH]; uint8_t sense_data[SENSE_BUFFER_SIZE]; uint8_t reserved_array[MAX_DATA_BUFFER_LENGTH_WITH_PADDING]; } u; /* * The following was added in win8. */ struct vmscsi_win8_extension win8_extension; } __packed; /** * This structure is sent during the initialization phase to get the different * properties of the channel. */ struct vmstor_chan_props { uint16_t proto_ver; uint8_t path_id; uint8_t target_id; uint16_t max_channel_cnt; /** * Note: port number is only really known on the client side */ uint16_t port; uint32_t flags; uint32_t max_transfer_bytes; /** * This id is unique for each channel and will correspond with * vendor specific data in the inquiry_ata */ uint64_t unique_id; } __packed; /** * This structure is sent during the storage protocol negotiations. */ struct vmstor_proto_ver { /** * Major (MSW) and minor (LSW) version numbers. */ uint16_t major_minor; uint16_t revision; /* always zero */ } __packed; /** * Channel Property Flags */ #define STORAGE_CHANNEL_REMOVABLE_FLAG 0x1 #define STORAGE_CHANNEL_EMULATED_IDE_FLAG 0x2 struct vstor_packet { /** * Requested operation type */ enum vstor_packet_ops operation; /* * Flags - see below for values */ uint32_t flags; /** * Status of the request returned from the server side. */ uint32_t status; union { /** * Structure used to forward SCSI commands from the client to * the server. */ struct vmscsi_req vm_srb; /** * Structure used to query channel properties. */ struct vmstor_chan_props chan_props; /** * Used during version negotiations. */ struct vmstor_proto_ver version; /** * Number of multichannels to create */ uint16_t multi_channels_cnt; } u; } __packed; /** * SRB (SCSI Request Block) Status Codes */ #define SRB_STATUS_PENDING 0x00 #define SRB_STATUS_SUCCESS 0x01 #define SRB_STATUS_ABORTED 0x02 #define SRB_STATUS_ERROR 0x04 #define SRB_STATUS_INVALID_LUN 0x20 /** * SRB Status Masks (can be combined with above status codes) */ #define SRB_STATUS_QUEUE_FROZEN 0x40 #define SRB_STATUS_AUTOSENSE_VALID 0x80 #define SRB_STATUS(status) \ ((status) & ~(SRB_STATUS_AUTOSENSE_VALID | SRB_STATUS_QUEUE_FROZEN)) /* * SRB Flag Bits */ #define SRB_FLAGS_QUEUE_ACTION_ENABLE 0x00000002 #define SRB_FLAGS_DISABLE_DISCONNECT 0x00000004 #define SRB_FLAGS_DISABLE_SYNCH_TRANSFER 0x00000008 #define SRB_FLAGS_BYPASS_FROZEN_QUEUE 0x00000010 #define SRB_FLAGS_DISABLE_AUTOSENSE 0x00000020 #define SRB_FLAGS_DATA_IN 0x00000040 #define SRB_FLAGS_DATA_OUT 0x00000080 #define SRB_FLAGS_NO_DATA_TRANSFER 0x00000000 #define SRB_FLAGS_UNSPECIFIED_DIRECTION (SRB_FLAGS_DATA_IN | SRB_FLAGS_DATA_OUT) #define SRB_FLAGS_NO_QUEUE_FREEZE 0x00000100 #define SRB_FLAGS_ADAPTER_CACHE_ENABLE 0x00000200 #define SRB_FLAGS_FREE_SENSE_BUFFER 0x00000400 /** * Packet flags */ /** * This flag indicates that the server should send back a completion for this * packet. */ #define REQUEST_COMPLETION_FLAG 0x1 /** * This is the set of flags that the vsc can set in any packets it sends */ #define VSC_LEGAL_FLAGS (REQUEST_COMPLETION_FLAG) #endif /* __HV_VSTORAGE_H__ */ Index: head/sys/dev/hyperv/utilities/hv_kvp.c =================================================================== --- head/sys/dev/hyperv/utilities/hv_kvp.c (revision 322487) +++ head/sys/dev/hyperv/utilities/hv_kvp.c (revision 322488) @@ -1,920 +1,920 @@ /*- - * Copyright (c) 2014,2016 Microsoft Corp. + * Copyright (c) 2014,2016-2017 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * Author: Sainath Varanasi. * Date: 4/2012 * Email: bsdic@microsoft.com */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "unicode.h" #include "hv_kvp.h" #include "vmbus_if.h" /* hv_kvp defines */ #define BUFFERSIZE sizeof(struct hv_kvp_msg) #define kvp_hdr hdr.kvp_hdr #define KVP_FWVER_MAJOR 3 #define KVP_FWVER VMBUS_IC_VERSION(KVP_FWVER_MAJOR, 0) #define KVP_MSGVER_MAJOR 4 #define KVP_MSGVER VMBUS_IC_VERSION(KVP_MSGVER_MAJOR, 0) /* hv_kvp debug control */ static int hv_kvp_log = 0; #define hv_kvp_log_error(...) do { \ if (hv_kvp_log > 0) \ log(LOG_ERR, "hv_kvp: " __VA_ARGS__); \ } while (0) #define hv_kvp_log_info(...) do { \ if (hv_kvp_log > 1) \ log(LOG_INFO, "hv_kvp: " __VA_ARGS__); \ } while (0) static const struct vmbus_ic_desc vmbus_kvp_descs[] = { { .ic_guid = { .hv_guid = { 0xe7, 0xf4, 0xa0, 0xa9, 0x45, 0x5a, 0x96, 0x4d, 0xb8, 0x27, 0x8a, 0x84, 0x1e, 0x8c, 0x3, 0xe6 } }, .ic_desc = "Hyper-V KVP" }, VMBUS_IC_DESC_END }; /* character device prototypes */ static d_open_t hv_kvp_dev_open; static d_close_t hv_kvp_dev_close; static d_read_t hv_kvp_dev_daemon_read; static d_write_t hv_kvp_dev_daemon_write; static d_poll_t hv_kvp_dev_daemon_poll; /* hv_kvp character device structure */ static struct cdevsw hv_kvp_cdevsw = { .d_version = D_VERSION, .d_open = hv_kvp_dev_open, .d_close = hv_kvp_dev_close, .d_read = hv_kvp_dev_daemon_read, .d_write = hv_kvp_dev_daemon_write, .d_poll = hv_kvp_dev_daemon_poll, .d_name = "hv_kvp_dev", }; /* * Global state to track and synchronize multiple * KVP transaction requests from the host. */ typedef struct hv_kvp_sc { struct vmbus_ic_softc util_sc; device_t dev; /* Unless specified the pending mutex should be * used to alter the values of the following parameters: * 1. req_in_progress * 2. req_timed_out */ struct mtx pending_mutex; struct task task; /* To track if transaction is active or not */ boolean_t req_in_progress; /* Tracks if daemon did not reply back in time */ boolean_t req_timed_out; /* Tracks if daemon is serving a request currently */ boolean_t daemon_busy; /* Length of host message */ uint32_t host_msg_len; /* Host message id */ uint64_t host_msg_id; /* Current kvp message from the host */ struct hv_kvp_msg *host_kvp_msg; /* Current kvp message for daemon */ struct hv_kvp_msg daemon_kvp_msg; /* Rcv buffer for communicating with the host*/ uint8_t *rcv_buf; /* Device semaphore to control communication */ struct sema dev_sema; /* Indicates if daemon registered with driver */ boolean_t register_done; /* Character device status */ boolean_t dev_accessed; struct cdev *hv_kvp_dev; struct proc *daemon_task; struct selinfo hv_kvp_selinfo; } hv_kvp_sc; /* hv_kvp prototypes */ static int hv_kvp_req_in_progress(hv_kvp_sc *sc); static void hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t, uint64_t, uint8_t *); static void hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc); static void hv_kvp_process_request(void *context, int pending); /* * hv_kvp low level functions */ /* * Check if kvp transaction is in progres */ static int hv_kvp_req_in_progress(hv_kvp_sc *sc) { return (sc->req_in_progress); } /* * This routine is called whenever a message is received from the host */ static void hv_kvp_transaction_init(hv_kvp_sc *sc, uint32_t rcv_len, uint64_t request_id, uint8_t *rcv_buf) { /* Store all the relevant message details in the global structure */ /* Do not need to use mutex for req_in_progress here */ sc->req_in_progress = true; sc->host_msg_len = rcv_len; sc->host_msg_id = request_id; sc->rcv_buf = rcv_buf; sc->host_kvp_msg = (struct hv_kvp_msg *)&rcv_buf[ sizeof(struct hv_vmbus_pipe_hdr) + sizeof(struct hv_vmbus_icmsg_hdr)]; } /* * Convert ip related info in umsg from utf8 to utf16 and store in hmsg */ static int hv_kvp_convert_utf8_ipinfo_to_utf16(struct hv_kvp_msg *umsg, struct hv_kvp_ip_msg *host_ip_msg) { int err_ip, err_subnet, err_gway, err_dns, err_adap; int UNUSED_FLAG = 1; utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.ip_addr, MAX_IP_ADDR_SIZE, (char *)umsg->body.kvp_ip_val.ip_addr, strlen((char *)umsg->body.kvp_ip_val.ip_addr), UNUSED_FLAG, &err_ip); utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.sub_net, MAX_IP_ADDR_SIZE, (char *)umsg->body.kvp_ip_val.sub_net, strlen((char *)umsg->body.kvp_ip_val.sub_net), UNUSED_FLAG, &err_subnet); utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.gate_way, MAX_GATEWAY_SIZE, (char *)umsg->body.kvp_ip_val.gate_way, strlen((char *)umsg->body.kvp_ip_val.gate_way), UNUSED_FLAG, &err_gway); utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.dns_addr, MAX_IP_ADDR_SIZE, (char *)umsg->body.kvp_ip_val.dns_addr, strlen((char *)umsg->body.kvp_ip_val.dns_addr), UNUSED_FLAG, &err_dns); utf8_to_utf16((uint16_t *)host_ip_msg->kvp_ip_val.adapter_id, MAX_ADAPTER_ID_SIZE, (char *)umsg->body.kvp_ip_val.adapter_id, strlen((char *)umsg->body.kvp_ip_val.adapter_id), UNUSED_FLAG, &err_adap); host_ip_msg->kvp_ip_val.dhcp_enabled = umsg->body.kvp_ip_val.dhcp_enabled; host_ip_msg->kvp_ip_val.addr_family = umsg->body.kvp_ip_val.addr_family; return (err_ip | err_subnet | err_gway | err_dns | err_adap); } /* * Convert ip related info in hmsg from utf16 to utf8 and store in umsg */ static int hv_kvp_convert_utf16_ipinfo_to_utf8(struct hv_kvp_ip_msg *host_ip_msg, struct hv_kvp_msg *umsg) { int err_ip, err_subnet, err_gway, err_dns, err_adap; int UNUSED_FLAG = 1; device_t *devs; int devcnt; /* IP Address */ utf16_to_utf8((char *)umsg->body.kvp_ip_val.ip_addr, MAX_IP_ADDR_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.ip_addr, MAX_IP_ADDR_SIZE, UNUSED_FLAG, &err_ip); /* Adapter ID : GUID */ utf16_to_utf8((char *)umsg->body.kvp_ip_val.adapter_id, MAX_ADAPTER_ID_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.adapter_id, MAX_ADAPTER_ID_SIZE, UNUSED_FLAG, &err_adap); if (devclass_get_devices(devclass_find("hn"), &devs, &devcnt) == 0) { for (devcnt = devcnt - 1; devcnt >= 0; devcnt--) { device_t dev = devs[devcnt]; struct vmbus_channel *chan; char buf[HYPERV_GUID_STRLEN]; int n; chan = vmbus_get_channel(dev); n = hyperv_guid2str(vmbus_chan_guid_inst(chan), buf, sizeof(buf)); /* * The string in the 'kvp_ip_val.adapter_id' has * braces around the GUID; skip the leading brace * in 'kvp_ip_val.adapter_id'. */ if (strncmp(buf, ((char *)&umsg->body.kvp_ip_val.adapter_id) + 1, n) == 0) { strlcpy((char *)umsg->body.kvp_ip_val.adapter_id, device_get_nameunit(dev), MAX_ADAPTER_ID_SIZE); break; } } free(devs, M_TEMP); } /* Address Family , DHCP , SUBNET, Gateway, DNS */ umsg->kvp_hdr.operation = host_ip_msg->operation; umsg->body.kvp_ip_val.addr_family = host_ip_msg->kvp_ip_val.addr_family; umsg->body.kvp_ip_val.dhcp_enabled = host_ip_msg->kvp_ip_val.dhcp_enabled; utf16_to_utf8((char *)umsg->body.kvp_ip_val.sub_net, MAX_IP_ADDR_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.sub_net, MAX_IP_ADDR_SIZE, UNUSED_FLAG, &err_subnet); utf16_to_utf8((char *)umsg->body.kvp_ip_val.gate_way, MAX_GATEWAY_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.gate_way, MAX_GATEWAY_SIZE, UNUSED_FLAG, &err_gway); utf16_to_utf8((char *)umsg->body.kvp_ip_val.dns_addr, MAX_IP_ADDR_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.dns_addr, MAX_IP_ADDR_SIZE, UNUSED_FLAG, &err_dns); return (err_ip | err_subnet | err_gway | err_dns | err_adap); } /* * Prepare a user kvp msg based on host kvp msg (utf16 to utf8) * Ensure utf16_utf8 takes care of the additional string terminating char!! */ static void hv_kvp_convert_hostmsg_to_usermsg(struct hv_kvp_msg *hmsg, struct hv_kvp_msg *umsg) { int utf_err = 0; uint32_t value_type; struct hv_kvp_ip_msg *host_ip_msg; host_ip_msg = (struct hv_kvp_ip_msg*)hmsg; memset(umsg, 0, sizeof(struct hv_kvp_msg)); umsg->kvp_hdr.operation = hmsg->kvp_hdr.operation; umsg->kvp_hdr.pool = hmsg->kvp_hdr.pool; switch (umsg->kvp_hdr.operation) { case HV_KVP_OP_SET_IP_INFO: hv_kvp_convert_utf16_ipinfo_to_utf8(host_ip_msg, umsg); break; case HV_KVP_OP_GET_IP_INFO: utf16_to_utf8((char *)umsg->body.kvp_ip_val.adapter_id, MAX_ADAPTER_ID_SIZE, (uint16_t *)host_ip_msg->kvp_ip_val.adapter_id, MAX_ADAPTER_ID_SIZE, 1, &utf_err); umsg->body.kvp_ip_val.addr_family = host_ip_msg->kvp_ip_val.addr_family; break; case HV_KVP_OP_SET: value_type = hmsg->body.kvp_set.data.value_type; switch (value_type) { case HV_REG_SZ: umsg->body.kvp_set.data.value_size = utf16_to_utf8( (char *)umsg->body.kvp_set.data.msg_value.value, HV_KVP_EXCHANGE_MAX_VALUE_SIZE - 1, (uint16_t *)hmsg->body.kvp_set.data.msg_value.value, hmsg->body.kvp_set.data.value_size, 1, &utf_err); /* utf8 encoding */ umsg->body.kvp_set.data.value_size = umsg->body.kvp_set.data.value_size / 2; break; case HV_REG_U32: umsg->body.kvp_set.data.value_size = sprintf(umsg->body.kvp_set.data.msg_value.value, "%d", hmsg->body.kvp_set.data.msg_value.value_u32) + 1; break; case HV_REG_U64: umsg->body.kvp_set.data.value_size = sprintf(umsg->body.kvp_set.data.msg_value.value, "%llu", (unsigned long long) hmsg->body.kvp_set.data.msg_value.value_u64) + 1; break; } umsg->body.kvp_set.data.key_size = utf16_to_utf8( umsg->body.kvp_set.data.key, HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1, (uint16_t *)hmsg->body.kvp_set.data.key, hmsg->body.kvp_set.data.key_size, 1, &utf_err); /* utf8 encoding */ umsg->body.kvp_set.data.key_size = umsg->body.kvp_set.data.key_size / 2; break; case HV_KVP_OP_GET: umsg->body.kvp_get.data.key_size = utf16_to_utf8(umsg->body.kvp_get.data.key, HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1, (uint16_t *)hmsg->body.kvp_get.data.key, hmsg->body.kvp_get.data.key_size, 1, &utf_err); /* utf8 encoding */ umsg->body.kvp_get.data.key_size = umsg->body.kvp_get.data.key_size / 2; break; case HV_KVP_OP_DELETE: umsg->body.kvp_delete.key_size = utf16_to_utf8(umsg->body.kvp_delete.key, HV_KVP_EXCHANGE_MAX_KEY_SIZE - 1, (uint16_t *)hmsg->body.kvp_delete.key, hmsg->body.kvp_delete.key_size, 1, &utf_err); /* utf8 encoding */ umsg->body.kvp_delete.key_size = umsg->body.kvp_delete.key_size / 2; break; case HV_KVP_OP_ENUMERATE: umsg->body.kvp_enum_data.index = hmsg->body.kvp_enum_data.index; break; default: hv_kvp_log_info("%s: daemon_kvp_msg: Invalid operation : %d\n", __func__, umsg->kvp_hdr.operation); } } /* * Prepare a host kvp msg based on user kvp msg (utf8 to utf16) */ static int hv_kvp_convert_usermsg_to_hostmsg(struct hv_kvp_msg *umsg, struct hv_kvp_msg *hmsg) { int hkey_len = 0, hvalue_len = 0, utf_err = 0; struct hv_kvp_exchg_msg_value *host_exchg_data; char *key_name, *value; struct hv_kvp_ip_msg *host_ip_msg = (struct hv_kvp_ip_msg *)hmsg; switch (hmsg->kvp_hdr.operation) { case HV_KVP_OP_GET_IP_INFO: return (hv_kvp_convert_utf8_ipinfo_to_utf16(umsg, host_ip_msg)); case HV_KVP_OP_SET_IP_INFO: case HV_KVP_OP_SET: case HV_KVP_OP_DELETE: return (0); case HV_KVP_OP_ENUMERATE: host_exchg_data = &hmsg->body.kvp_enum_data.data; key_name = umsg->body.kvp_enum_data.data.key; hkey_len = utf8_to_utf16((uint16_t *)host_exchg_data->key, ((HV_KVP_EXCHANGE_MAX_KEY_SIZE / 2) - 2), key_name, strlen(key_name), 1, &utf_err); /* utf16 encoding */ host_exchg_data->key_size = 2 * (hkey_len + 1); value = umsg->body.kvp_enum_data.data.msg_value.value; hvalue_len = utf8_to_utf16( (uint16_t *)host_exchg_data->msg_value.value, ((HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2) - 2), value, strlen(value), 1, &utf_err); host_exchg_data->value_size = 2 * (hvalue_len + 1); host_exchg_data->value_type = HV_REG_SZ; if ((hkey_len < 0) || (hvalue_len < 0)) return (EINVAL); return (0); case HV_KVP_OP_GET: host_exchg_data = &hmsg->body.kvp_get.data; value = umsg->body.kvp_get.data.msg_value.value; hvalue_len = utf8_to_utf16( (uint16_t *)host_exchg_data->msg_value.value, ((HV_KVP_EXCHANGE_MAX_VALUE_SIZE / 2) - 2), value, strlen(value), 1, &utf_err); /* Convert value size to uft16 */ host_exchg_data->value_size = 2 * (hvalue_len + 1); /* Use values by string */ host_exchg_data->value_type = HV_REG_SZ; if (hvalue_len < 0) return (EINVAL); return (0); default: return (EINVAL); } } /* * Send the response back to the host. */ static void hv_kvp_respond_host(hv_kvp_sc *sc, uint32_t error) { struct hv_vmbus_icmsg_hdr *hv_icmsg_hdrp; hv_icmsg_hdrp = (struct hv_vmbus_icmsg_hdr *) &sc->rcv_buf[sizeof(struct hv_vmbus_pipe_hdr)]; hv_icmsg_hdrp->status = error; hv_icmsg_hdrp->icflags = HV_ICMSGHDRFLAG_TRANSACTION | HV_ICMSGHDRFLAG_RESPONSE; error = vmbus_chan_send(vmbus_get_channel(sc->dev), VMBUS_CHANPKT_TYPE_INBAND, 0, sc->rcv_buf, sc->host_msg_len, sc->host_msg_id); if (error) hv_kvp_log_info("%s: hv_kvp_respond_host: sendpacket error:%d\n", __func__, error); } /* * This is the main kvp kernel process that interacts with both user daemon * and the host */ static void hv_kvp_send_msg_to_daemon(hv_kvp_sc *sc) { struct hv_kvp_msg *hmsg = sc->host_kvp_msg; struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg; /* Prepare kvp_msg to be sent to user */ hv_kvp_convert_hostmsg_to_usermsg(hmsg, umsg); /* Send the msg to user via function deamon_read - setting sema */ sema_post(&sc->dev_sema); /* We should wake up the daemon, in case it's doing poll() */ selwakeup(&sc->hv_kvp_selinfo); } /* * Function to read the kvp request buffer from host * and interact with daemon */ static void hv_kvp_process_request(void *context, int pending) { uint8_t *kvp_buf; struct vmbus_channel *channel; uint32_t recvlen = 0; uint64_t requestid; struct hv_vmbus_icmsg_hdr *icmsghdrp; int ret = 0, error; hv_kvp_sc *sc; hv_kvp_log_info("%s: entering hv_kvp_process_request\n", __func__); sc = (hv_kvp_sc*)context; kvp_buf = sc->util_sc.ic_buf; channel = vmbus_get_channel(sc->dev); recvlen = sc->util_sc.ic_buflen; ret = vmbus_chan_recv(channel, kvp_buf, &recvlen, &requestid); KASSERT(ret != ENOBUFS, ("hvkvp recvbuf is not large enough")); /* XXX check recvlen to make sure that it contains enough data */ while ((ret == 0) && (recvlen > 0)) { icmsghdrp = (struct hv_vmbus_icmsg_hdr *) &kvp_buf[sizeof(struct hv_vmbus_pipe_hdr)]; hv_kvp_transaction_init(sc, recvlen, requestid, kvp_buf); if (icmsghdrp->icmsgtype == HV_ICMSGTYPE_NEGOTIATE) { error = vmbus_ic_negomsg(&sc->util_sc, kvp_buf, &recvlen, KVP_FWVER, KVP_MSGVER); /* XXX handle vmbus_ic_negomsg failure. */ if (!error) hv_kvp_respond_host(sc, HV_S_OK); else hv_kvp_respond_host(sc, HV_E_FAIL); /* * It is ok to not acquire the mutex before setting * req_in_progress here because negotiation is the * first thing that happens and hence there is no * chance of a race condition. */ sc->req_in_progress = false; hv_kvp_log_info("%s :version negotiated\n", __func__); } else { if (!sc->daemon_busy) { hv_kvp_log_info("%s: issuing qury to daemon\n", __func__); mtx_lock(&sc->pending_mutex); sc->req_timed_out = false; sc->daemon_busy = true; mtx_unlock(&sc->pending_mutex); hv_kvp_send_msg_to_daemon(sc); hv_kvp_log_info("%s: waiting for daemon\n", __func__); } /* Wait 5 seconds for daemon to respond back */ tsleep(sc, 0, "kvpworkitem", 5 * hz); hv_kvp_log_info("%s: came out of wait\n", __func__); } mtx_lock(&sc->pending_mutex); /* Notice that once req_timed_out is set to true * it will remain true until the next request is * sent to the daemon. The response from daemon * is forwarded to host only when this flag is * false. */ sc->req_timed_out = true; /* * Cancel request if so need be. */ if (hv_kvp_req_in_progress(sc)) { hv_kvp_log_info("%s: request was still active after wait so failing\n", __func__); hv_kvp_respond_host(sc, HV_E_FAIL); sc->req_in_progress = false; } mtx_unlock(&sc->pending_mutex); /* * Try reading next buffer */ recvlen = sc->util_sc.ic_buflen; ret = vmbus_chan_recv(channel, kvp_buf, &recvlen, &requestid); KASSERT(ret != ENOBUFS, ("hvkvp recvbuf is not large enough")); /* XXX check recvlen to make sure that it contains enough data */ hv_kvp_log_info("%s: read: context %p, ret =%d, recvlen=%d\n", __func__, context, ret, recvlen); } } /* * Callback routine that gets called whenever there is a message from host */ static void hv_kvp_callback(struct vmbus_channel *chan __unused, void *context) { hv_kvp_sc *sc = (hv_kvp_sc*)context; /* The first request from host will not be handled until daemon is registered. when callback is triggered without a registered daemon, callback just return. When a new daemon gets regsitered, this callbcak is trigged from _write op. */ if (sc->register_done) { hv_kvp_log_info("%s: Queuing work item\n", __func__); taskqueue_enqueue(taskqueue_thread, &sc->task); } } static int hv_kvp_dev_open(struct cdev *dev, int oflags, int devtype, struct thread *td) { hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; hv_kvp_log_info("%s: Opened device \"hv_kvp_device\" successfully.\n", __func__); if (sc->dev_accessed) return (-EBUSY); sc->daemon_task = curproc; sc->dev_accessed = true; sc->daemon_busy = false; return (0); } static int hv_kvp_dev_close(struct cdev *dev __unused, int fflag __unused, int devtype __unused, struct thread *td __unused) { hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; hv_kvp_log_info("%s: Closing device \"hv_kvp_device\".\n", __func__); sc->dev_accessed = false; sc->register_done = false; return (0); } /* * hv_kvp_daemon read invokes this function * acts as a send to daemon */ static int hv_kvp_dev_daemon_read(struct cdev *dev, struct uio *uio, int ioflag __unused) { size_t amt; int error = 0; struct hv_kvp_msg *hv_kvp_dev_buf; hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; /* Read is not allowed util registering is done. */ if (!sc->register_done) return (EPERM); sema_wait(&sc->dev_sema); hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK); memcpy(hv_kvp_dev_buf, &sc->daemon_kvp_msg, sizeof(struct hv_kvp_msg)); amt = MIN(uio->uio_resid, uio->uio_offset >= BUFFERSIZE + 1 ? 0 : BUFFERSIZE + 1 - uio->uio_offset); if ((error = uiomove(hv_kvp_dev_buf, amt, uio)) != 0) hv_kvp_log_info("%s: hv_kvp uiomove read failed!\n", __func__); free(hv_kvp_dev_buf, M_TEMP); return (error); } /* * hv_kvp_daemon write invokes this function * acts as a receive from daemon */ static int hv_kvp_dev_daemon_write(struct cdev *dev, struct uio *uio, int ioflag __unused) { size_t amt; int error = 0; struct hv_kvp_msg *hv_kvp_dev_buf; hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; uio->uio_offset = 0; hv_kvp_dev_buf = malloc(sizeof(*hv_kvp_dev_buf), M_TEMP, M_WAITOK); amt = MIN(uio->uio_resid, BUFFERSIZE); error = uiomove(hv_kvp_dev_buf, amt, uio); if (error != 0) { free(hv_kvp_dev_buf, M_TEMP); return (error); } memcpy(&sc->daemon_kvp_msg, hv_kvp_dev_buf, sizeof(struct hv_kvp_msg)); free(hv_kvp_dev_buf, M_TEMP); if (sc->register_done == false) { if (sc->daemon_kvp_msg.kvp_hdr.operation == HV_KVP_OP_REGISTER) { sc->register_done = true; hv_kvp_callback(vmbus_get_channel(sc->dev), dev->si_drv1); } else { hv_kvp_log_info("%s, KVP Registration Failed\n", __func__); return (EINVAL); } } else { mtx_lock(&sc->pending_mutex); if(!sc->req_timed_out) { struct hv_kvp_msg *hmsg = sc->host_kvp_msg; struct hv_kvp_msg *umsg = &sc->daemon_kvp_msg; error = hv_kvp_convert_usermsg_to_hostmsg(umsg, hmsg); hv_kvp_respond_host(sc, umsg->hdr.error); wakeup(sc); sc->req_in_progress = false; if (umsg->hdr.error != HV_S_OK) hv_kvp_log_info("%s, Error 0x%x from daemon\n", __func__, umsg->hdr.error); if (error) hv_kvp_log_info("%s, Error from convert\n", __func__); } sc->daemon_busy = false; mtx_unlock(&sc->pending_mutex); } return (error); } /* * hv_kvp_daemon poll invokes this function to check if data is available * for daemon to read. */ static int hv_kvp_dev_daemon_poll(struct cdev *dev, int events, struct thread *td) { int revents = 0; hv_kvp_sc *sc = (hv_kvp_sc*)dev->si_drv1; mtx_lock(&sc->pending_mutex); /* * We check global flag daemon_busy for the data availiability for * userland to read. Deamon_busy is set to true before driver has data * for daemon to read. It is set to false after daemon sends * then response back to driver. */ if (sc->daemon_busy == true) revents = POLLIN; else selrecord(td, &sc->hv_kvp_selinfo); mtx_unlock(&sc->pending_mutex); return (revents); } static int hv_kvp_probe(device_t dev) { return (vmbus_ic_probe(dev, vmbus_kvp_descs)); } static int hv_kvp_attach(device_t dev) { int error; struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev); sc->dev = dev; sema_init(&sc->dev_sema, 0, "hv_kvp device semaphore"); mtx_init(&sc->pending_mutex, "hv-kvp pending mutex", NULL, MTX_DEF); ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "hv_kvp_log", CTLFLAG_RWTUN, &hv_kvp_log, 0, "Hyperv KVP service log level"); TASK_INIT(&sc->task, 0, hv_kvp_process_request, sc); /* create character device */ error = make_dev_p(MAKEDEV_CHECKNAME | MAKEDEV_WAITOK, &sc->hv_kvp_dev, &hv_kvp_cdevsw, 0, UID_ROOT, GID_WHEEL, 0640, "hv_kvp_dev"); if (error != 0) return (error); sc->hv_kvp_dev->si_drv1 = sc; return (vmbus_ic_attach(dev, hv_kvp_callback)); } static int hv_kvp_detach(device_t dev) { hv_kvp_sc *sc = (hv_kvp_sc*)device_get_softc(dev); if (sc->daemon_task != NULL) { PROC_LOCK(sc->daemon_task); kern_psignal(sc->daemon_task, SIGKILL); PROC_UNLOCK(sc->daemon_task); } destroy_dev(sc->hv_kvp_dev); return (vmbus_ic_detach(dev)); } static device_method_t kvp_methods[] = { /* Device interface */ DEVMETHOD(device_probe, hv_kvp_probe), DEVMETHOD(device_attach, hv_kvp_attach), DEVMETHOD(device_detach, hv_kvp_detach), { 0, 0 } }; static driver_t kvp_driver = { "hvkvp", kvp_methods, sizeof(hv_kvp_sc)}; static devclass_t kvp_devclass; DRIVER_MODULE(hv_kvp, vmbus, kvp_driver, kvp_devclass, NULL, NULL); MODULE_VERSION(hv_kvp, 1); MODULE_DEPEND(hv_kvp, vmbus, 1, 1, 1); Index: head/sys/dev/hyperv/utilities/vmbus_timesync.c =================================================================== --- head/sys/dev/hyperv/utilities/vmbus_timesync.c (revision 322487) +++ head/sys/dev/hyperv/utilities/vmbus_timesync.c (revision 322488) @@ -1,260 +1,260 @@ /*- - * Copyright (c) 2014,2016 Microsoft Corp. + * Copyright (c) 2014,2016-2017 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #define VMBUS_TIMESYNC_FWVER_MAJOR 3 #define VMBUS_TIMESYNC_FWVER \ VMBUS_IC_VERSION(VMBUS_TIMESYNC_FWVER_MAJOR, 0) #define VMBUS_TIMESYNC_MSGVER_MAJOR 4 #define VMBUS_TIMESYNC_MSGVER \ VMBUS_IC_VERSION(VMBUS_TIMESYNC_MSGVER_MAJOR, 0) #define VMBUS_TIMESYNC_MSGVER4(sc) \ VMBUS_ICVER_LE(VMBUS_IC_VERSION(4, 0), (sc)->ic_msgver) #define VMBUS_TIMESYNC_DORTT(sc) \ (VMBUS_TIMESYNC_MSGVER4((sc)) && hyperv_tc64 != NULL) static int vmbus_timesync_probe(device_t); static int vmbus_timesync_attach(device_t); static const struct vmbus_ic_desc vmbus_timesync_descs[] = { { .ic_guid = { .hv_guid = { 0x30, 0xe6, 0x27, 0x95, 0xae, 0xd0, 0x7b, 0x49, 0xad, 0xce, 0xe8, 0x0a, 0xb0, 0x17, 0x5c, 0xaf } }, .ic_desc = "Hyper-V Timesync" }, VMBUS_IC_DESC_END }; static device_method_t vmbus_timesync_methods[] = { /* Device interface */ DEVMETHOD(device_probe, vmbus_timesync_probe), DEVMETHOD(device_attach, vmbus_timesync_attach), DEVMETHOD(device_detach, vmbus_ic_detach), DEVMETHOD_END }; static driver_t vmbus_timesync_driver = { "hvtimesync", vmbus_timesync_methods, sizeof(struct vmbus_ic_softc) }; static devclass_t vmbus_timesync_devclass; DRIVER_MODULE(hv_timesync, vmbus, vmbus_timesync_driver, vmbus_timesync_devclass, NULL, NULL); MODULE_VERSION(hv_timesync, 1); MODULE_DEPEND(hv_timesync, vmbus, 1, 1, 1); SYSCTL_NODE(_hw, OID_AUTO, hvtimesync, CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Hyper-V timesync interface"); static int vmbus_ts_ignore_sync = 0; SYSCTL_INT(_hw_hvtimesync, OID_AUTO, ignore_sync, CTLFLAG_RWTUN, &vmbus_ts_ignore_sync, 0, "Ignore the sync request."); /* * Trigger sample sync when drift exceeds threshold (ms). * Ignore the sample request when set to 0. */ static int vmbus_ts_sample_thresh = 100; SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_thresh, CTLFLAG_RWTUN, &vmbus_ts_sample_thresh, 0, "Threshold that makes sample request trigger the sync (unit: ms)."); static int vmbus_ts_sample_verbose = 0; SYSCTL_INT(_hw_hvtimesync, OID_AUTO, sample_verbose, CTLFLAG_RWTUN, &vmbus_ts_sample_verbose, 0, "Increase sample request verbosity."); static void vmbus_timesync(struct vmbus_ic_softc *sc, uint64_t hvtime, uint64_t sent_tc, uint8_t tsflags) { struct timespec vm_ts; uint64_t hv_ns, vm_ns, rtt = 0; if (VMBUS_TIMESYNC_DORTT(sc)) rtt = hyperv_tc64() - sent_tc; hv_ns = (hvtime - VMBUS_ICMSG_TS_BASE + rtt) * HYPERV_TIMER_NS_FACTOR; nanotime(&vm_ts); vm_ns = (vm_ts.tv_sec * NANOSEC) + vm_ts.tv_nsec; if ((tsflags & VMBUS_ICMSG_TS_FLAG_SYNC) && !vmbus_ts_ignore_sync) { struct timespec hv_ts; if (bootverbose) { device_printf(sc->ic_dev, "apply sync request, " "hv: %ju, vm: %ju\n", (uintmax_t)hv_ns, (uintmax_t)vm_ns); } hv_ts.tv_sec = hv_ns / NANOSEC; hv_ts.tv_nsec = hv_ns % NANOSEC; kern_clock_settime(curthread, CLOCK_REALTIME, &hv_ts); /* Done! */ return; } if ((tsflags & VMBUS_ICMSG_TS_FLAG_SAMPLE) && vmbus_ts_sample_thresh >= 0) { int64_t diff; if (vmbus_ts_sample_verbose) { device_printf(sc->ic_dev, "sample request, " "hv: %ju, vm: %ju\n", (uintmax_t)hv_ns, (uintmax_t)vm_ns); } if (hv_ns > vm_ns) diff = hv_ns - vm_ns; else diff = vm_ns - hv_ns; /* nanosec -> millisec */ diff /= 1000000; if (diff > vmbus_ts_sample_thresh) { struct timespec hv_ts; if (bootverbose) { device_printf(sc->ic_dev, "apply sample request, hv: %ju, vm: %ju\n", (uintmax_t)hv_ns, (uintmax_t)vm_ns); } hv_ts.tv_sec = hv_ns / NANOSEC; hv_ts.tv_nsec = hv_ns % NANOSEC; kern_clock_settime(curthread, CLOCK_REALTIME, &hv_ts); } /* Done */ return; } } static void vmbus_timesync_cb(struct vmbus_channel *chan, void *xsc) { struct vmbus_ic_softc *sc = xsc; struct vmbus_icmsg_hdr *hdr; int dlen, error; uint64_t xactid; void *data; /* * Receive request. */ data = sc->ic_buf; dlen = sc->ic_buflen; error = vmbus_chan_recv(chan, data, &dlen, &xactid); KASSERT(error != ENOBUFS, ("icbuf is not large enough")); if (error) return; if (dlen < sizeof(*hdr)) { device_printf(sc->ic_dev, "invalid data len %d\n", dlen); return; } hdr = data; /* * Update request, which will be echoed back as response. */ switch (hdr->ic_type) { case VMBUS_ICMSG_TYPE_NEGOTIATE: error = vmbus_ic_negomsg(sc, data, &dlen, VMBUS_TIMESYNC_FWVER, VMBUS_TIMESYNC_MSGVER); if (error) return; if (VMBUS_TIMESYNC_DORTT(sc)) device_printf(sc->ic_dev, "RTT\n"); break; case VMBUS_ICMSG_TYPE_TIMESYNC: if (VMBUS_TIMESYNC_MSGVER4(sc)) { const struct vmbus_icmsg_timesync4 *msg4; if (dlen < sizeof(*msg4)) { device_printf(sc->ic_dev, "invalid timesync4 " "len %d\n", dlen); return; } msg4 = data; vmbus_timesync(sc, msg4->ic_hvtime, msg4->ic_sent_tc, msg4->ic_tsflags); } else { const struct vmbus_icmsg_timesync *msg; if (dlen < sizeof(*msg)) { device_printf(sc->ic_dev, "invalid timesync " "len %d\n", dlen); return; } msg = data; vmbus_timesync(sc, msg->ic_hvtime, 0, msg->ic_tsflags); } break; default: device_printf(sc->ic_dev, "got 0x%08x icmsg\n", hdr->ic_type); break; } /* * Send response by echoing the request back. */ vmbus_ic_sendresp(sc, chan, data, dlen, xactid); } static int vmbus_timesync_probe(device_t dev) { return (vmbus_ic_probe(dev, vmbus_timesync_descs)); } static int vmbus_timesync_attach(device_t dev) { return (vmbus_ic_attach(dev, vmbus_timesync_cb)); } Index: head/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c =================================================================== --- head/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c (revision 322487) +++ head/sys/dev/hyperv/vmbus/amd64/hyperv_machdep.c (revision 322488) @@ -1,233 +1,233 @@ /*- - * Copyright (c) 2016 Microsoft Corp. + * Copyright (c) 2016-2017 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct hyperv_reftsc_ctx { struct hyperv_reftsc *tsc_ref; struct hyperv_dma tsc_ref_dma; }; static uint32_t hyperv_tsc_vdso_timehands( struct vdso_timehands *, struct timecounter *); static d_open_t hyperv_tsc_open; static d_mmap_t hyperv_tsc_mmap; static struct timecounter hyperv_tsc_timecounter = { .tc_get_timecount = NULL, /* based on CPU vendor. */ .tc_counter_mask = 0xffffffff, .tc_frequency = HYPERV_TIMER_FREQ, .tc_name = "Hyper-V-TSC", .tc_quality = 3000, .tc_fill_vdso_timehands = hyperv_tsc_vdso_timehands, }; static struct cdevsw hyperv_tsc_cdevsw = { .d_version = D_VERSION, .d_open = hyperv_tsc_open, .d_mmap = hyperv_tsc_mmap, .d_name = HYPERV_REFTSC_DEVNAME }; static struct hyperv_reftsc_ctx hyperv_ref_tsc; uint64_t hypercall_md(volatile void *hc_addr, uint64_t in_val, uint64_t in_paddr, uint64_t out_paddr) { uint64_t status; __asm__ __volatile__ ("mov %0, %%r8" : : "r" (out_paddr): "r8"); __asm__ __volatile__ ("call *%3" : "=a" (status) : "c" (in_val), "d" (in_paddr), "m" (hc_addr)); return (status); } static int hyperv_tsc_open(struct cdev *dev __unused, int oflags, int devtype __unused, struct thread *td __unused) { if (oflags & FWRITE) return (EPERM); return (0); } static int hyperv_tsc_mmap(struct cdev *dev __unused, vm_ooffset_t offset, vm_paddr_t *paddr, int nprot __unused, vm_memattr_t *memattr __unused) { KASSERT(hyperv_ref_tsc.tsc_ref != NULL, ("reftsc has not been setup")); /* * NOTE: * 'nprot' does not contain information interested to us; * WR-open is blocked by d_open. */ if (offset != 0) return (EOPNOTSUPP); *paddr = hyperv_ref_tsc.tsc_ref_dma.hv_paddr; return (0); } static uint32_t hyperv_tsc_vdso_timehands(struct vdso_timehands *vdso_th, struct timecounter *tc __unused) { vdso_th->th_algo = VDSO_TH_ALGO_X86_HVTSC; vdso_th->th_x86_shift = 0; vdso_th->th_x86_hpet_idx = 0; bzero(vdso_th->th_res, sizeof(vdso_th->th_res)); return (1); } #define HYPERV_TSC_TIMECOUNT(fence) \ static uint64_t \ hyperv_tc64_tsc_##fence(void) \ { \ struct hyperv_reftsc *tsc_ref = hyperv_ref_tsc.tsc_ref; \ uint32_t seq; \ \ while ((seq = atomic_load_acq_int(&tsc_ref->tsc_seq)) != 0) { \ uint64_t disc, ret, tsc; \ uint64_t scale = tsc_ref->tsc_scale; \ int64_t ofs = tsc_ref->tsc_ofs; \ \ fence(); \ tsc = rdtsc(); \ \ /* ret = ((tsc * scale) >> 64) + ofs */ \ __asm__ __volatile__ ("mulq %3" : \ "=d" (ret), "=a" (disc) : \ "a" (tsc), "r" (scale)); \ ret += ofs; \ \ atomic_thread_fence_acq(); \ if (tsc_ref->tsc_seq == seq) \ return (ret); \ \ /* Sequence changed; re-sync. */ \ } \ /* Fallback to the generic timecounter, i.e. rdmsr. */ \ return (rdmsr(MSR_HV_TIME_REF_COUNT)); \ } \ \ static u_int \ hyperv_tsc_timecount_##fence(struct timecounter *tc __unused) \ { \ \ return (hyperv_tc64_tsc_##fence()); \ } \ struct __hack HYPERV_TSC_TIMECOUNT(lfence); HYPERV_TSC_TIMECOUNT(mfence); static void hyperv_tsc_tcinit(void *dummy __unused) { hyperv_tc64_t tc64 = NULL; uint64_t val, orig; if ((hyperv_features & (CPUID_HV_MSR_TIME_REFCNT | CPUID_HV_MSR_REFERENCE_TSC)) != (CPUID_HV_MSR_TIME_REFCNT | CPUID_HV_MSR_REFERENCE_TSC) || (cpu_feature & CPUID_SSE2) == 0) /* SSE2 for mfence/lfence */ return; switch (cpu_vendor_id) { case CPU_VENDOR_AMD: hyperv_tsc_timecounter.tc_get_timecount = hyperv_tsc_timecount_mfence; tc64 = hyperv_tc64_tsc_mfence; break; case CPU_VENDOR_INTEL: hyperv_tsc_timecounter.tc_get_timecount = hyperv_tsc_timecount_lfence; tc64 = hyperv_tc64_tsc_lfence; break; default: /* Unsupport CPU vendors. */ return; } hyperv_ref_tsc.tsc_ref = hyperv_dmamem_alloc(NULL, PAGE_SIZE, 0, sizeof(struct hyperv_reftsc), &hyperv_ref_tsc.tsc_ref_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (hyperv_ref_tsc.tsc_ref == NULL) { printf("hyperv: reftsc page allocation failed\n"); return; } orig = rdmsr(MSR_HV_REFERENCE_TSC); val = MSR_HV_REFTSC_ENABLE | (orig & MSR_HV_REFTSC_RSVD_MASK) | ((hyperv_ref_tsc.tsc_ref_dma.hv_paddr >> PAGE_SHIFT) << MSR_HV_REFTSC_PGSHIFT); wrmsr(MSR_HV_REFERENCE_TSC, val); /* Register "enlightened" timecounter. */ tc_init(&hyperv_tsc_timecounter); /* Install 64 bits timecounter method for other modules to use. */ KASSERT(tc64 != NULL, ("tc64 is not set")); hyperv_tc64 = tc64; /* Add device for mmap(2). */ make_dev(&hyperv_tsc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0444, HYPERV_REFTSC_DEVNAME); } SYSINIT(hyperv_tsc_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, hyperv_tsc_tcinit, NULL); Index: head/sys/dev/hyperv/vmbus/hyperv.c =================================================================== --- head/sys/dev/hyperv/vmbus/hyperv.c (revision 322487) +++ head/sys/dev/hyperv/vmbus/hyperv.c (revision 322488) @@ -1,334 +1,334 @@ /*- - * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /** * Implements low-level interactions with Hypver-V/Azure */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define HYPERV_FREEBSD_BUILD 0ULL #define HYPERV_FREEBSD_VERSION ((uint64_t)__FreeBSD_version) #define HYPERV_FREEBSD_OSID 0ULL #define MSR_HV_GUESTID_BUILD_FREEBSD \ (HYPERV_FREEBSD_BUILD & MSR_HV_GUESTID_BUILD_MASK) #define MSR_HV_GUESTID_VERSION_FREEBSD \ ((HYPERV_FREEBSD_VERSION << MSR_HV_GUESTID_VERSION_SHIFT) & \ MSR_HV_GUESTID_VERSION_MASK) #define MSR_HV_GUESTID_OSID_FREEBSD \ ((HYPERV_FREEBSD_OSID << MSR_HV_GUESTID_OSID_SHIFT) & \ MSR_HV_GUESTID_OSID_MASK) #define MSR_HV_GUESTID_FREEBSD \ (MSR_HV_GUESTID_BUILD_FREEBSD | \ MSR_HV_GUESTID_VERSION_FREEBSD | \ MSR_HV_GUESTID_OSID_FREEBSD | \ MSR_HV_GUESTID_OSTYPE_FREEBSD) struct hypercall_ctx { void *hc_addr; vm_paddr_t hc_paddr; }; static u_int hyperv_get_timecount(struct timecounter *); static bool hyperv_identify(void); static void hypercall_memfree(void); u_int hyperv_features; u_int hyperv_recommends; static u_int hyperv_pm_features; static u_int hyperv_features3; hyperv_tc64_t hyperv_tc64; static struct timecounter hyperv_timecounter = { .tc_get_timecount = hyperv_get_timecount, .tc_poll_pps = NULL, .tc_counter_mask = 0xffffffff, .tc_frequency = HYPERV_TIMER_FREQ, .tc_name = "Hyper-V", .tc_quality = 2000, .tc_flags = 0, .tc_priv = NULL }; static struct hypercall_ctx hypercall_context; static u_int hyperv_get_timecount(struct timecounter *tc __unused) { return rdmsr(MSR_HV_TIME_REF_COUNT); } static uint64_t hyperv_tc64_rdmsr(void) { return (rdmsr(MSR_HV_TIME_REF_COUNT)); } uint64_t hypercall_post_message(bus_addr_t msg_paddr) { return hypercall_md(hypercall_context.hc_addr, HYPERCALL_POST_MESSAGE, msg_paddr, 0); } uint64_t hypercall_signal_event(bus_addr_t monprm_paddr) { return hypercall_md(hypercall_context.hc_addr, HYPERCALL_SIGNAL_EVENT, monprm_paddr, 0); } int hyperv_guid2str(const struct hyperv_guid *guid, char *buf, size_t sz) { const uint8_t *d = guid->hv_guid; return snprintf(buf, sz, "%02x%02x%02x%02x-" "%02x%02x-%02x%02x-%02x%02x-" "%02x%02x%02x%02x%02x%02x", d[3], d[2], d[1], d[0], d[5], d[4], d[7], d[6], d[8], d[9], d[10], d[11], d[12], d[13], d[14], d[15]); } static bool hyperv_identify(void) { u_int regs[4]; unsigned int maxleaf; if (vm_guest != VM_GUEST_HV) return (false); do_cpuid(CPUID_LEAF_HV_MAXLEAF, regs); maxleaf = regs[0]; if (maxleaf < CPUID_LEAF_HV_LIMITS) return (false); do_cpuid(CPUID_LEAF_HV_INTERFACE, regs); if (regs[0] != CPUID_HV_IFACE_HYPERV) return (false); do_cpuid(CPUID_LEAF_HV_FEATURES, regs); if ((regs[0] & CPUID_HV_MSR_HYPERCALL) == 0) { /* * Hyper-V w/o Hypercall is impossible; someone * is faking Hyper-V. */ return (false); } hyperv_features = regs[0]; hyperv_pm_features = regs[2]; hyperv_features3 = regs[3]; do_cpuid(CPUID_LEAF_HV_IDENTITY, regs); printf("Hyper-V Version: %d.%d.%d [SP%d]\n", regs[1] >> 16, regs[1] & 0xffff, regs[0], regs[2]); printf(" Features=0x%b\n", hyperv_features, "\020" "\001VPRUNTIME" /* MSR_HV_VP_RUNTIME */ "\002TMREFCNT" /* MSR_HV_TIME_REF_COUNT */ "\003SYNIC" /* MSRs for SynIC */ "\004SYNTM" /* MSRs for SynTimer */ "\005APIC" /* MSR_HV_{EOI,ICR,TPR} */ "\006HYPERCALL" /* MSR_HV_{GUEST_OS_ID,HYPERCALL} */ "\007VPINDEX" /* MSR_HV_VP_INDEX */ "\010RESET" /* MSR_HV_RESET */ "\011STATS" /* MSR_HV_STATS_ */ "\012REFTSC" /* MSR_HV_REFERENCE_TSC */ "\013IDLE" /* MSR_HV_GUEST_IDLE */ "\014TMFREQ" /* MSR_HV_{TSC,APIC}_FREQUENCY */ "\015DEBUG"); /* MSR_HV_SYNTH_DEBUG_ */ printf(" PM Features=0x%b [C%u]\n", (hyperv_pm_features & ~CPUPM_HV_CSTATE_MASK), "\020" "\005C3HPET", /* HPET is required for C3 state */ CPUPM_HV_CSTATE(hyperv_pm_features)); printf(" Features3=0x%b\n", hyperv_features3, "\020" "\001MWAIT" /* MWAIT */ "\002DEBUG" /* guest debug support */ "\003PERFMON" /* performance monitor */ "\004PCPUDPE" /* physical CPU dynamic partition event */ "\005XMMHC" /* hypercall input through XMM regs */ "\006IDLE" /* guest idle support */ "\007SLEEP" /* hypervisor sleep support */ "\010NUMA" /* NUMA distance query support */ "\011TMFREQ" /* timer frequency query (TSC, LAPIC) */ "\012SYNCMC" /* inject synthetic machine checks */ "\013CRASH" /* MSRs for guest crash */ "\014DEBUGMSR" /* MSRs for guest debug */ "\015NPIEP" /* NPIEP */ "\016HVDIS"); /* disabling hypervisor */ do_cpuid(CPUID_LEAF_HV_RECOMMENDS, regs); hyperv_recommends = regs[0]; if (bootverbose) printf(" Recommends: %08x %08x\n", regs[0], regs[1]); do_cpuid(CPUID_LEAF_HV_LIMITS, regs); if (bootverbose) { printf(" Limits: Vcpu:%d Lcpu:%d Int:%d\n", regs[0], regs[1], regs[2]); } if (maxleaf >= CPUID_LEAF_HV_HWFEATURES) { do_cpuid(CPUID_LEAF_HV_HWFEATURES, regs); if (bootverbose) { printf(" HW Features: %08x, AMD: %08x\n", regs[0], regs[3]); } } return (true); } static void hyperv_init(void *dummy __unused) { if (!hyperv_identify()) { /* Not Hyper-V; reset guest id to the generic one. */ if (vm_guest == VM_GUEST_HV) vm_guest = VM_GUEST_VM; return; } /* Set guest id */ wrmsr(MSR_HV_GUEST_OS_ID, MSR_HV_GUESTID_FREEBSD); if (hyperv_features & CPUID_HV_MSR_TIME_REFCNT) { /* Register Hyper-V timecounter */ tc_init(&hyperv_timecounter); /* * Install 64 bits timecounter method for other modules * to use. */ hyperv_tc64 = hyperv_tc64_rdmsr; } } SYSINIT(hyperv_initialize, SI_SUB_HYPERVISOR, SI_ORDER_FIRST, hyperv_init, NULL); static void hypercall_memfree(void) { kmem_free(kernel_arena, (vm_offset_t)hypercall_context.hc_addr, PAGE_SIZE); hypercall_context.hc_addr = NULL; } static void hypercall_create(void *arg __unused) { uint64_t hc, hc_orig; if (vm_guest != VM_GUEST_HV) return; /* * NOTE: * - busdma(9), i.e. hyperv_dmamem APIs, can _not_ be used due to * the NX bit. * - Assume kmem_malloc() returns properly aligned memory. */ hypercall_context.hc_addr = (void *)kmem_malloc(kernel_arena, PAGE_SIZE, M_WAITOK); hypercall_context.hc_paddr = vtophys(hypercall_context.hc_addr); /* Get the 'reserved' bits, which requires preservation. */ hc_orig = rdmsr(MSR_HV_HYPERCALL); /* * Setup the Hypercall page. * * NOTE: 'reserved' bits MUST be preserved. */ hc = ((hypercall_context.hc_paddr >> PAGE_SHIFT) << MSR_HV_HYPERCALL_PGSHIFT) | (hc_orig & MSR_HV_HYPERCALL_RSVD_MASK) | MSR_HV_HYPERCALL_ENABLE; wrmsr(MSR_HV_HYPERCALL, hc); /* * Confirm that Hypercall page did get setup. */ hc = rdmsr(MSR_HV_HYPERCALL); if ((hc & MSR_HV_HYPERCALL_ENABLE) == 0) { printf("hyperv: Hypercall setup failed\n"); hypercall_memfree(); /* Can't perform any Hyper-V specific actions */ vm_guest = VM_GUEST_VM; return; } if (bootverbose) printf("hyperv: Hypercall created\n"); } SYSINIT(hypercall_ctor, SI_SUB_DRIVERS, SI_ORDER_FIRST, hypercall_create, NULL); static void hypercall_destroy(void *arg __unused) { uint64_t hc; if (hypercall_context.hc_addr == NULL) return; /* Disable Hypercall */ hc = rdmsr(MSR_HV_HYPERCALL); wrmsr(MSR_HV_HYPERCALL, (hc & MSR_HV_HYPERCALL_RSVD_MASK)); hypercall_memfree(); if (bootverbose) printf("hyperv: Hypercall destroyed\n"); } SYSUNINIT(hypercall_dtor, SI_SUB_DRIVERS, SI_ORDER_FIRST, hypercall_destroy, NULL); Index: head/sys/dev/hyperv/vmbus/vmbus.c =================================================================== --- head/sys/dev/hyperv/vmbus/vmbus.c (revision 322487) +++ head/sys/dev/hyperv/vmbus/vmbus.c (revision 322488) @@ -1,1538 +1,1538 @@ /*- - * Copyright (c) 2009-2012,2016 Microsoft Corp. + * Copyright (c) 2009-2012,2016-2017 Microsoft Corp. * Copyright (c) 2012 NetApp Inc. * Copyright (c) 2012 Citrix Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ /* * VM Bus Driver Implementation */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "acpi_if.h" #include "pcib_if.h" #include "vmbus_if.h" #define VMBUS_GPADL_START 0xe1e10 struct vmbus_msghc { struct vmbus_xact *mh_xact; struct hypercall_postmsg_in mh_inprm_save; }; static void vmbus_identify(driver_t *, device_t); static int vmbus_probe(device_t); static int vmbus_attach(device_t); static int vmbus_detach(device_t); static int vmbus_read_ivar(device_t, device_t, int, uintptr_t *); static int vmbus_child_pnpinfo_str(device_t, device_t, char *, size_t); static struct resource *vmbus_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags); static int vmbus_alloc_msi(device_t bus, device_t dev, int count, int maxcount, int *irqs); static int vmbus_release_msi(device_t bus, device_t dev, int count, int *irqs); static int vmbus_alloc_msix(device_t bus, device_t dev, int *irq); static int vmbus_release_msix(device_t bus, device_t dev, int irq); static int vmbus_map_msi(device_t bus, device_t dev, int irq, uint64_t *addr, uint32_t *data); static uint32_t vmbus_get_version_method(device_t, device_t); static int vmbus_probe_guid_method(device_t, device_t, const struct hyperv_guid *); static uint32_t vmbus_get_vcpu_id_method(device_t bus, device_t dev, int cpu); static struct taskqueue *vmbus_get_eventtq_method(device_t, device_t, int); #ifdef EARLY_AP_STARTUP static void vmbus_intrhook(void *); #endif static int vmbus_init(struct vmbus_softc *); static int vmbus_connect(struct vmbus_softc *, uint32_t); static int vmbus_req_channels(struct vmbus_softc *sc); static void vmbus_disconnect(struct vmbus_softc *); static int vmbus_scan(struct vmbus_softc *); static void vmbus_scan_teardown(struct vmbus_softc *); static void vmbus_scan_done(struct vmbus_softc *, const struct vmbus_message *); static void vmbus_chanmsg_handle(struct vmbus_softc *, const struct vmbus_message *); static void vmbus_msg_task(void *, int); static void vmbus_synic_setup(void *); static void vmbus_synic_teardown(void *); static int vmbus_sysctl_version(SYSCTL_HANDLER_ARGS); static int vmbus_dma_alloc(struct vmbus_softc *); static void vmbus_dma_free(struct vmbus_softc *); static int vmbus_intr_setup(struct vmbus_softc *); static void vmbus_intr_teardown(struct vmbus_softc *); static int vmbus_doattach(struct vmbus_softc *); static void vmbus_event_proc_dummy(struct vmbus_softc *, int); static struct vmbus_softc *vmbus_sc; extern inthand_t IDTVEC(vmbus_isr); static const uint32_t vmbus_version[] = { VMBUS_VERSION_WIN8_1, VMBUS_VERSION_WIN8, VMBUS_VERSION_WIN7, VMBUS_VERSION_WS2008 }; static const vmbus_chanmsg_proc_t vmbus_chanmsg_handlers[VMBUS_CHANMSG_TYPE_MAX] = { VMBUS_CHANMSG_PROC(CHOFFER_DONE, vmbus_scan_done), VMBUS_CHANMSG_PROC_WAKEUP(CONNECT_RESP) }; static device_method_t vmbus_methods[] = { /* Device interface */ DEVMETHOD(device_identify, vmbus_identify), DEVMETHOD(device_probe, vmbus_probe), DEVMETHOD(device_attach, vmbus_attach), DEVMETHOD(device_detach, vmbus_detach), DEVMETHOD(device_shutdown, bus_generic_shutdown), DEVMETHOD(device_suspend, bus_generic_suspend), DEVMETHOD(device_resume, bus_generic_resume), /* Bus interface */ DEVMETHOD(bus_add_child, bus_generic_add_child), DEVMETHOD(bus_print_child, bus_generic_print_child), DEVMETHOD(bus_read_ivar, vmbus_read_ivar), DEVMETHOD(bus_child_pnpinfo_str, vmbus_child_pnpinfo_str), DEVMETHOD(bus_alloc_resource, vmbus_alloc_resource), DEVMETHOD(bus_release_resource, bus_generic_release_resource), DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), #if __FreeBSD_version >= 1100000 DEVMETHOD(bus_get_cpus, bus_generic_get_cpus), #endif /* pcib interface */ DEVMETHOD(pcib_alloc_msi, vmbus_alloc_msi), DEVMETHOD(pcib_release_msi, vmbus_release_msi), DEVMETHOD(pcib_alloc_msix, vmbus_alloc_msix), DEVMETHOD(pcib_release_msix, vmbus_release_msix), DEVMETHOD(pcib_map_msi, vmbus_map_msi), /* Vmbus interface */ DEVMETHOD(vmbus_get_version, vmbus_get_version_method), DEVMETHOD(vmbus_probe_guid, vmbus_probe_guid_method), DEVMETHOD(vmbus_get_vcpu_id, vmbus_get_vcpu_id_method), DEVMETHOD(vmbus_get_event_taskq, vmbus_get_eventtq_method), DEVMETHOD_END }; static driver_t vmbus_driver = { "vmbus", vmbus_methods, sizeof(struct vmbus_softc) }; static devclass_t vmbus_devclass; DRIVER_MODULE(vmbus, pcib, vmbus_driver, vmbus_devclass, NULL, NULL); DRIVER_MODULE(vmbus, acpi_syscontainer, vmbus_driver, vmbus_devclass, NULL, NULL); MODULE_DEPEND(vmbus, acpi, 1, 1, 1); MODULE_DEPEND(vmbus, pci, 1, 1, 1); MODULE_VERSION(vmbus, 1); static __inline struct vmbus_softc * vmbus_get_softc(void) { return vmbus_sc; } void vmbus_msghc_reset(struct vmbus_msghc *mh, size_t dsize) { struct hypercall_postmsg_in *inprm; if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX) panic("invalid data size %zu", dsize); inprm = vmbus_xact_req_data(mh->mh_xact); memset(inprm, 0, HYPERCALL_POSTMSGIN_SIZE); inprm->hc_connid = VMBUS_CONNID_MESSAGE; inprm->hc_msgtype = HYPERV_MSGTYPE_CHANNEL; inprm->hc_dsize = dsize; } struct vmbus_msghc * vmbus_msghc_get(struct vmbus_softc *sc, size_t dsize) { struct vmbus_msghc *mh; struct vmbus_xact *xact; if (dsize > HYPERCALL_POSTMSGIN_DSIZE_MAX) panic("invalid data size %zu", dsize); xact = vmbus_xact_get(sc->vmbus_xc, dsize + __offsetof(struct hypercall_postmsg_in, hc_data[0])); if (xact == NULL) return (NULL); mh = vmbus_xact_priv(xact, sizeof(*mh)); mh->mh_xact = xact; vmbus_msghc_reset(mh, dsize); return (mh); } void vmbus_msghc_put(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { vmbus_xact_put(mh->mh_xact); } void * vmbus_msghc_dataptr(struct vmbus_msghc *mh) { struct hypercall_postmsg_in *inprm; inprm = vmbus_xact_req_data(mh->mh_xact); return (inprm->hc_data); } int vmbus_msghc_exec_noresult(struct vmbus_msghc *mh) { sbintime_t time = SBT_1MS; struct hypercall_postmsg_in *inprm; bus_addr_t inprm_paddr; int i; inprm = vmbus_xact_req_data(mh->mh_xact); inprm_paddr = vmbus_xact_req_paddr(mh->mh_xact); /* * Save the input parameter so that we could restore the input * parameter if the Hypercall failed. * * XXX * Is this really necessary?! i.e. Will the Hypercall ever * overwrite the input parameter? */ memcpy(&mh->mh_inprm_save, inprm, HYPERCALL_POSTMSGIN_SIZE); /* * In order to cope with transient failures, e.g. insufficient * resources on host side, we retry the post message Hypercall * several times. 20 retries seem sufficient. */ #define HC_RETRY_MAX 20 for (i = 0; i < HC_RETRY_MAX; ++i) { uint64_t status; status = hypercall_post_message(inprm_paddr); if (status == HYPERCALL_STATUS_SUCCESS) return 0; pause_sbt("hcpmsg", time, 0, C_HARDCLOCK); if (time < SBT_1S * 2) time *= 2; /* Restore input parameter and try again */ memcpy(inprm, &mh->mh_inprm_save, HYPERCALL_POSTMSGIN_SIZE); } #undef HC_RETRY_MAX return EIO; } int vmbus_msghc_exec(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { int error; vmbus_xact_activate(mh->mh_xact); error = vmbus_msghc_exec_noresult(mh); if (error) vmbus_xact_deactivate(mh->mh_xact); return error; } void vmbus_msghc_exec_cancel(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { vmbus_xact_deactivate(mh->mh_xact); } const struct vmbus_message * vmbus_msghc_wait_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { size_t resp_len; return (vmbus_xact_wait(mh->mh_xact, &resp_len)); } const struct vmbus_message * vmbus_msghc_poll_result(struct vmbus_softc *sc __unused, struct vmbus_msghc *mh) { size_t resp_len; return (vmbus_xact_poll(mh->mh_xact, &resp_len)); } void vmbus_msghc_wakeup(struct vmbus_softc *sc, const struct vmbus_message *msg) { vmbus_xact_ctx_wakeup(sc->vmbus_xc, msg, sizeof(*msg)); } uint32_t vmbus_gpadl_alloc(struct vmbus_softc *sc) { uint32_t gpadl; again: gpadl = atomic_fetchadd_int(&sc->vmbus_gpadl, 1); if (gpadl == 0) goto again; return (gpadl); } static int vmbus_connect(struct vmbus_softc *sc, uint32_t version) { struct vmbus_chanmsg_connect *req; const struct vmbus_message *msg; struct vmbus_msghc *mh; int error, done = 0; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) return ENXIO; req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CONNECT; req->chm_ver = version; req->chm_evtflags = sc->vmbus_evtflags_dma.hv_paddr; req->chm_mnf1 = sc->vmbus_mnf1_dma.hv_paddr; req->chm_mnf2 = sc->vmbus_mnf2_dma.hv_paddr; error = vmbus_msghc_exec(sc, mh); if (error) { vmbus_msghc_put(sc, mh); return error; } msg = vmbus_msghc_wait_result(sc, mh); done = ((const struct vmbus_chanmsg_connect_resp *) msg->msg_data)->chm_done; vmbus_msghc_put(sc, mh); return (done ? 0 : EOPNOTSUPP); } static int vmbus_init(struct vmbus_softc *sc) { int i; for (i = 0; i < nitems(vmbus_version); ++i) { int error; error = vmbus_connect(sc, vmbus_version[i]); if (!error) { sc->vmbus_version = vmbus_version[i]; device_printf(sc->vmbus_dev, "version %u.%u\n", VMBUS_VERSION_MAJOR(sc->vmbus_version), VMBUS_VERSION_MINOR(sc->vmbus_version)); return 0; } } return ENXIO; } static void vmbus_disconnect(struct vmbus_softc *sc) { struct vmbus_chanmsg_disconnect *req; struct vmbus_msghc *mh; int error; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) { device_printf(sc->vmbus_dev, "can not get msg hypercall for disconnect\n"); return; } req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_DISCONNECT; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); if (error) { device_printf(sc->vmbus_dev, "disconnect msg hypercall failed\n"); } } static int vmbus_req_channels(struct vmbus_softc *sc) { struct vmbus_chanmsg_chrequest *req; struct vmbus_msghc *mh; int error; mh = vmbus_msghc_get(sc, sizeof(*req)); if (mh == NULL) return ENXIO; req = vmbus_msghc_dataptr(mh); req->chm_hdr.chm_type = VMBUS_CHANMSG_TYPE_CHREQUEST; error = vmbus_msghc_exec_noresult(mh); vmbus_msghc_put(sc, mh); return error; } static void vmbus_scan_done_task(void *xsc, int pending __unused) { struct vmbus_softc *sc = xsc; mtx_lock(&Giant); sc->vmbus_scandone = true; mtx_unlock(&Giant); wakeup(&sc->vmbus_scandone); } static void vmbus_scan_done(struct vmbus_softc *sc, const struct vmbus_message *msg __unused) { taskqueue_enqueue(sc->vmbus_devtq, &sc->vmbus_scandone_task); } static int vmbus_scan(struct vmbus_softc *sc) { int error; /* * Identify, probe and attach for non-channel devices. */ bus_generic_probe(sc->vmbus_dev); bus_generic_attach(sc->vmbus_dev); /* * This taskqueue serializes vmbus devices' attach and detach * for channel offer and rescind messages. */ sc->vmbus_devtq = taskqueue_create("vmbus dev", M_WAITOK, taskqueue_thread_enqueue, &sc->vmbus_devtq); taskqueue_start_threads(&sc->vmbus_devtq, 1, PI_NET, "vmbusdev"); TASK_INIT(&sc->vmbus_scandone_task, 0, vmbus_scan_done_task, sc); /* * This taskqueue handles sub-channel detach, so that vmbus * device's detach running in vmbus_devtq can drain its sub- * channels. */ sc->vmbus_subchtq = taskqueue_create("vmbus subch", M_WAITOK, taskqueue_thread_enqueue, &sc->vmbus_subchtq); taskqueue_start_threads(&sc->vmbus_subchtq, 1, PI_NET, "vmbussch"); /* * Start vmbus scanning. */ error = vmbus_req_channels(sc); if (error) { device_printf(sc->vmbus_dev, "channel request failed: %d\n", error); return (error); } /* * Wait for all vmbus devices from the initial channel offers to be * attached. */ GIANT_REQUIRED; while (!sc->vmbus_scandone) mtx_sleep(&sc->vmbus_scandone, &Giant, 0, "vmbusdev", 0); if (bootverbose) { device_printf(sc->vmbus_dev, "device scan, probe and attach " "done\n"); } return (0); } static void vmbus_scan_teardown(struct vmbus_softc *sc) { GIANT_REQUIRED; if (sc->vmbus_devtq != NULL) { mtx_unlock(&Giant); taskqueue_free(sc->vmbus_devtq); mtx_lock(&Giant); sc->vmbus_devtq = NULL; } if (sc->vmbus_subchtq != NULL) { mtx_unlock(&Giant); taskqueue_free(sc->vmbus_subchtq); mtx_lock(&Giant); sc->vmbus_subchtq = NULL; } } static void vmbus_chanmsg_handle(struct vmbus_softc *sc, const struct vmbus_message *msg) { vmbus_chanmsg_proc_t msg_proc; uint32_t msg_type; msg_type = ((const struct vmbus_chanmsg_hdr *)msg->msg_data)->chm_type; if (msg_type >= VMBUS_CHANMSG_TYPE_MAX) { device_printf(sc->vmbus_dev, "unknown message type 0x%x\n", msg_type); return; } msg_proc = vmbus_chanmsg_handlers[msg_type]; if (msg_proc != NULL) msg_proc(sc, msg); /* Channel specific processing */ vmbus_chan_msgproc(sc, msg); } static void vmbus_msg_task(void *xsc, int pending __unused) { struct vmbus_softc *sc = xsc; volatile struct vmbus_message *msg; msg = VMBUS_PCPU_GET(sc, message, curcpu) + VMBUS_SINT_MESSAGE; for (;;) { if (msg->msg_type == HYPERV_MSGTYPE_NONE) { /* No message */ break; } else if (msg->msg_type == HYPERV_MSGTYPE_CHANNEL) { /* Channel message */ vmbus_chanmsg_handle(sc, __DEVOLATILE(const struct vmbus_message *, msg)); } msg->msg_type = HYPERV_MSGTYPE_NONE; /* * Make sure the write to msg_type (i.e. set to * HYPERV_MSGTYPE_NONE) happens before we read the * msg_flags and EOMing. Otherwise, the EOMing will * not deliver any more messages since there is no * empty slot * * NOTE: * mb() is used here, since atomic_thread_fence_seq_cst() * will become compiler fence on UP kernel. */ mb(); if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) { /* * This will cause message queue rescan to possibly * deliver another msg from the hypervisor */ wrmsr(MSR_HV_EOM, 0); } } } static __inline int vmbus_handle_intr1(struct vmbus_softc *sc, struct trapframe *frame, int cpu) { volatile struct vmbus_message *msg; struct vmbus_message *msg_base; msg_base = VMBUS_PCPU_GET(sc, message, cpu); /* * Check event timer. * * TODO: move this to independent IDT vector. */ msg = msg_base + VMBUS_SINT_TIMER; if (msg->msg_type == HYPERV_MSGTYPE_TIMER_EXPIRED) { msg->msg_type = HYPERV_MSGTYPE_NONE; vmbus_et_intr(frame); /* * Make sure the write to msg_type (i.e. set to * HYPERV_MSGTYPE_NONE) happens before we read the * msg_flags and EOMing. Otherwise, the EOMing will * not deliver any more messages since there is no * empty slot * * NOTE: * mb() is used here, since atomic_thread_fence_seq_cst() * will become compiler fence on UP kernel. */ mb(); if (msg->msg_flags & VMBUS_MSGFLAG_PENDING) { /* * This will cause message queue rescan to possibly * deliver another msg from the hypervisor */ wrmsr(MSR_HV_EOM, 0); } } /* * Check events. Hot path for network and storage I/O data; high rate. * * NOTE: * As recommended by the Windows guest fellows, we check events before * checking messages. */ sc->vmbus_event_proc(sc, cpu); /* * Check messages. Mainly management stuffs; ultra low rate. */ msg = msg_base + VMBUS_SINT_MESSAGE; if (__predict_false(msg->msg_type != HYPERV_MSGTYPE_NONE)) { taskqueue_enqueue(VMBUS_PCPU_GET(sc, message_tq, cpu), VMBUS_PCPU_PTR(sc, message_task, cpu)); } return (FILTER_HANDLED); } void vmbus_handle_intr(struct trapframe *trap_frame) { struct vmbus_softc *sc = vmbus_get_softc(); int cpu = curcpu; /* * Disable preemption. */ critical_enter(); /* * Do a little interrupt counting. */ (*VMBUS_PCPU_GET(sc, intr_cnt, cpu))++; vmbus_handle_intr1(sc, trap_frame, cpu); /* * Enable preemption. */ critical_exit(); } static void vmbus_synic_setup(void *xsc) { struct vmbus_softc *sc = xsc; int cpu = curcpu; uint64_t val, orig; uint32_t sint; if (hyperv_features & CPUID_HV_MSR_VP_INDEX) { /* Save virtual processor id. */ VMBUS_PCPU_GET(sc, vcpuid, cpu) = rdmsr(MSR_HV_VP_INDEX); } else { /* Set virtual processor id to 0 for compatibility. */ VMBUS_PCPU_GET(sc, vcpuid, cpu) = 0; } /* * Setup the SynIC message. */ orig = rdmsr(MSR_HV_SIMP); val = MSR_HV_SIMP_ENABLE | (orig & MSR_HV_SIMP_RSVD_MASK) | ((VMBUS_PCPU_GET(sc, message_dma.hv_paddr, cpu) >> PAGE_SHIFT) << MSR_HV_SIMP_PGSHIFT); wrmsr(MSR_HV_SIMP, val); /* * Setup the SynIC event flags. */ orig = rdmsr(MSR_HV_SIEFP); val = MSR_HV_SIEFP_ENABLE | (orig & MSR_HV_SIEFP_RSVD_MASK) | ((VMBUS_PCPU_GET(sc, event_flags_dma.hv_paddr, cpu) >> PAGE_SHIFT) << MSR_HV_SIEFP_PGSHIFT); wrmsr(MSR_HV_SIEFP, val); /* * Configure and unmask SINT for message and event flags. */ sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE; orig = rdmsr(sint); val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI | (orig & MSR_HV_SINT_RSVD_MASK); wrmsr(sint, val); /* * Configure and unmask SINT for timer. */ sint = MSR_HV_SINT0 + VMBUS_SINT_TIMER; orig = rdmsr(sint); val = sc->vmbus_idtvec | MSR_HV_SINT_AUTOEOI | (orig & MSR_HV_SINT_RSVD_MASK); wrmsr(sint, val); /* * All done; enable SynIC. */ orig = rdmsr(MSR_HV_SCONTROL); val = MSR_HV_SCTRL_ENABLE | (orig & MSR_HV_SCTRL_RSVD_MASK); wrmsr(MSR_HV_SCONTROL, val); } static void vmbus_synic_teardown(void *arg) { uint64_t orig; uint32_t sint; /* * Disable SynIC. */ orig = rdmsr(MSR_HV_SCONTROL); wrmsr(MSR_HV_SCONTROL, (orig & MSR_HV_SCTRL_RSVD_MASK)); /* * Mask message and event flags SINT. */ sint = MSR_HV_SINT0 + VMBUS_SINT_MESSAGE; orig = rdmsr(sint); wrmsr(sint, orig | MSR_HV_SINT_MASKED); /* * Mask timer SINT. */ sint = MSR_HV_SINT0 + VMBUS_SINT_TIMER; orig = rdmsr(sint); wrmsr(sint, orig | MSR_HV_SINT_MASKED); /* * Teardown SynIC message. */ orig = rdmsr(MSR_HV_SIMP); wrmsr(MSR_HV_SIMP, (orig & MSR_HV_SIMP_RSVD_MASK)); /* * Teardown SynIC event flags. */ orig = rdmsr(MSR_HV_SIEFP); wrmsr(MSR_HV_SIEFP, (orig & MSR_HV_SIEFP_RSVD_MASK)); } static int vmbus_dma_alloc(struct vmbus_softc *sc) { bus_dma_tag_t parent_dtag; uint8_t *evtflags; int cpu; parent_dtag = bus_get_dma_tag(sc->vmbus_dev); CPU_FOREACH(cpu) { void *ptr; /* * Per-cpu messages and event flags. */ ptr = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE, VMBUS_PCPU_PTR(sc, message_dma, cpu), BUS_DMA_WAITOK | BUS_DMA_ZERO); if (ptr == NULL) return ENOMEM; VMBUS_PCPU_GET(sc, message, cpu) = ptr; ptr = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE, VMBUS_PCPU_PTR(sc, event_flags_dma, cpu), BUS_DMA_WAITOK | BUS_DMA_ZERO); if (ptr == NULL) return ENOMEM; VMBUS_PCPU_GET(sc, event_flags, cpu) = ptr; } evtflags = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE, &sc->vmbus_evtflags_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (evtflags == NULL) return ENOMEM; sc->vmbus_rx_evtflags = (u_long *)evtflags; sc->vmbus_tx_evtflags = (u_long *)(evtflags + (PAGE_SIZE / 2)); sc->vmbus_evtflags = evtflags; sc->vmbus_mnf1 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, PAGE_SIZE, &sc->vmbus_mnf1_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->vmbus_mnf1 == NULL) return ENOMEM; sc->vmbus_mnf2 = hyperv_dmamem_alloc(parent_dtag, PAGE_SIZE, 0, sizeof(struct vmbus_mnf), &sc->vmbus_mnf2_dma, BUS_DMA_WAITOK | BUS_DMA_ZERO); if (sc->vmbus_mnf2 == NULL) return ENOMEM; return 0; } static void vmbus_dma_free(struct vmbus_softc *sc) { int cpu; if (sc->vmbus_evtflags != NULL) { hyperv_dmamem_free(&sc->vmbus_evtflags_dma, sc->vmbus_evtflags); sc->vmbus_evtflags = NULL; sc->vmbus_rx_evtflags = NULL; sc->vmbus_tx_evtflags = NULL; } if (sc->vmbus_mnf1 != NULL) { hyperv_dmamem_free(&sc->vmbus_mnf1_dma, sc->vmbus_mnf1); sc->vmbus_mnf1 = NULL; } if (sc->vmbus_mnf2 != NULL) { hyperv_dmamem_free(&sc->vmbus_mnf2_dma, sc->vmbus_mnf2); sc->vmbus_mnf2 = NULL; } CPU_FOREACH(cpu) { if (VMBUS_PCPU_GET(sc, message, cpu) != NULL) { hyperv_dmamem_free( VMBUS_PCPU_PTR(sc, message_dma, cpu), VMBUS_PCPU_GET(sc, message, cpu)); VMBUS_PCPU_GET(sc, message, cpu) = NULL; } if (VMBUS_PCPU_GET(sc, event_flags, cpu) != NULL) { hyperv_dmamem_free( VMBUS_PCPU_PTR(sc, event_flags_dma, cpu), VMBUS_PCPU_GET(sc, event_flags, cpu)); VMBUS_PCPU_GET(sc, event_flags, cpu) = NULL; } } } static int vmbus_intr_setup(struct vmbus_softc *sc) { int cpu; CPU_FOREACH(cpu) { char buf[MAXCOMLEN + 1]; cpuset_t cpu_mask; /* Allocate an interrupt counter for Hyper-V interrupt */ snprintf(buf, sizeof(buf), "cpu%d:hyperv", cpu); intrcnt_add(buf, VMBUS_PCPU_PTR(sc, intr_cnt, cpu)); /* * Setup taskqueue to handle events. Task will be per- * channel. */ VMBUS_PCPU_GET(sc, event_tq, cpu) = taskqueue_create_fast( "hyperv event", M_WAITOK, taskqueue_thread_enqueue, VMBUS_PCPU_PTR(sc, event_tq, cpu)); CPU_SETOF(cpu, &cpu_mask); taskqueue_start_threads_cpuset( VMBUS_PCPU_PTR(sc, event_tq, cpu), 1, PI_NET, &cpu_mask, "hvevent%d", cpu); /* * Setup tasks and taskqueues to handle messages. */ VMBUS_PCPU_GET(sc, message_tq, cpu) = taskqueue_create_fast( "hyperv msg", M_WAITOK, taskqueue_thread_enqueue, VMBUS_PCPU_PTR(sc, message_tq, cpu)); CPU_SETOF(cpu, &cpu_mask); taskqueue_start_threads_cpuset( VMBUS_PCPU_PTR(sc, message_tq, cpu), 1, PI_NET, &cpu_mask, "hvmsg%d", cpu); TASK_INIT(VMBUS_PCPU_PTR(sc, message_task, cpu), 0, vmbus_msg_task, sc); } /* * All Hyper-V ISR required resources are setup, now let's find a * free IDT vector for Hyper-V ISR and set it up. */ sc->vmbus_idtvec = lapic_ipi_alloc(IDTVEC(vmbus_isr)); if (sc->vmbus_idtvec < 0) { device_printf(sc->vmbus_dev, "cannot find free IDT vector\n"); return ENXIO; } if (bootverbose) { device_printf(sc->vmbus_dev, "vmbus IDT vector %d\n", sc->vmbus_idtvec); } return 0; } static void vmbus_intr_teardown(struct vmbus_softc *sc) { int cpu; if (sc->vmbus_idtvec >= 0) { lapic_ipi_free(sc->vmbus_idtvec); sc->vmbus_idtvec = -1; } CPU_FOREACH(cpu) { if (VMBUS_PCPU_GET(sc, event_tq, cpu) != NULL) { taskqueue_free(VMBUS_PCPU_GET(sc, event_tq, cpu)); VMBUS_PCPU_GET(sc, event_tq, cpu) = NULL; } if (VMBUS_PCPU_GET(sc, message_tq, cpu) != NULL) { taskqueue_drain(VMBUS_PCPU_GET(sc, message_tq, cpu), VMBUS_PCPU_PTR(sc, message_task, cpu)); taskqueue_free(VMBUS_PCPU_GET(sc, message_tq, cpu)); VMBUS_PCPU_GET(sc, message_tq, cpu) = NULL; } } } static int vmbus_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) { return (ENOENT); } static int vmbus_child_pnpinfo_str(device_t dev, device_t child, char *buf, size_t buflen) { const struct vmbus_channel *chan; char guidbuf[HYPERV_GUID_STRLEN]; chan = vmbus_get_channel(child); if (chan == NULL) { /* Event timer device, which does not belong to a channel */ return (0); } strlcat(buf, "classid=", buflen); hyperv_guid2str(&chan->ch_guid_type, guidbuf, sizeof(guidbuf)); strlcat(buf, guidbuf, buflen); strlcat(buf, " deviceid=", buflen); hyperv_guid2str(&chan->ch_guid_inst, guidbuf, sizeof(guidbuf)); strlcat(buf, guidbuf, buflen); return (0); } int vmbus_add_child(struct vmbus_channel *chan) { struct vmbus_softc *sc = chan->ch_vmbus; device_t parent = sc->vmbus_dev; mtx_lock(&Giant); chan->ch_dev = device_add_child(parent, NULL, -1); if (chan->ch_dev == NULL) { mtx_unlock(&Giant); device_printf(parent, "device_add_child for chan%u failed\n", chan->ch_id); return (ENXIO); } device_set_ivars(chan->ch_dev, chan); device_probe_and_attach(chan->ch_dev); mtx_unlock(&Giant); return (0); } int vmbus_delete_child(struct vmbus_channel *chan) { int error = 0; mtx_lock(&Giant); if (chan->ch_dev != NULL) { error = device_delete_child(chan->ch_vmbus->vmbus_dev, chan->ch_dev); chan->ch_dev = NULL; } mtx_unlock(&Giant); return (error); } static int vmbus_sysctl_version(SYSCTL_HANDLER_ARGS) { struct vmbus_softc *sc = arg1; char verstr[16]; snprintf(verstr, sizeof(verstr), "%u.%u", VMBUS_VERSION_MAJOR(sc->vmbus_version), VMBUS_VERSION_MINOR(sc->vmbus_version)); return sysctl_handle_string(oidp, verstr, sizeof(verstr), req); } /* * We need the function to make sure the MMIO resource is allocated from the * ranges found in _CRS. * * For the release function, we can use bus_generic_release_resource(). */ static struct resource * vmbus_alloc_resource(device_t dev, device_t child, int type, int *rid, rman_res_t start, rman_res_t end, rman_res_t count, u_int flags) { device_t parent = device_get_parent(dev); struct resource *res; #ifdef NEW_PCIB if (type == SYS_RES_MEMORY) { struct vmbus_softc *sc = device_get_softc(dev); res = pcib_host_res_alloc(&sc->vmbus_mmio_res, child, type, rid, start, end, count, flags); } else #endif { res = BUS_ALLOC_RESOURCE(parent, child, type, rid, start, end, count, flags); } return (res); } static int vmbus_alloc_msi(device_t bus, device_t dev, int count, int maxcount, int *irqs) { return (PCIB_ALLOC_MSI(device_get_parent(bus), dev, count, maxcount, irqs)); } static int vmbus_release_msi(device_t bus, device_t dev, int count, int *irqs) { return (PCIB_RELEASE_MSI(device_get_parent(bus), dev, count, irqs)); } static int vmbus_alloc_msix(device_t bus, device_t dev, int *irq) { return (PCIB_ALLOC_MSIX(device_get_parent(bus), dev, irq)); } static int vmbus_release_msix(device_t bus, device_t dev, int irq) { return (PCIB_RELEASE_MSIX(device_get_parent(bus), dev, irq)); } static int vmbus_map_msi(device_t bus, device_t dev, int irq, uint64_t *addr, uint32_t *data) { return (PCIB_MAP_MSI(device_get_parent(bus), dev, irq, addr, data)); } static uint32_t vmbus_get_version_method(device_t bus, device_t dev) { struct vmbus_softc *sc = device_get_softc(bus); return sc->vmbus_version; } static int vmbus_probe_guid_method(device_t bus, device_t dev, const struct hyperv_guid *guid) { const struct vmbus_channel *chan = vmbus_get_channel(dev); if (memcmp(&chan->ch_guid_type, guid, sizeof(struct hyperv_guid)) == 0) return 0; return ENXIO; } static uint32_t vmbus_get_vcpu_id_method(device_t bus, device_t dev, int cpu) { const struct vmbus_softc *sc = device_get_softc(bus); return (VMBUS_PCPU_GET(sc, vcpuid, cpu)); } static struct taskqueue * vmbus_get_eventtq_method(device_t bus, device_t dev __unused, int cpu) { const struct vmbus_softc *sc = device_get_softc(bus); KASSERT(cpu >= 0 && cpu < mp_ncpus, ("invalid cpu%d", cpu)); return (VMBUS_PCPU_GET(sc, event_tq, cpu)); } #ifdef NEW_PCIB #define VTPM_BASE_ADDR 0xfed40000 #define FOUR_GB (1ULL << 32) enum parse_pass { parse_64, parse_32 }; struct parse_context { device_t vmbus_dev; enum parse_pass pass; }; static ACPI_STATUS parse_crs(ACPI_RESOURCE *res, void *ctx) { const struct parse_context *pc = ctx; device_t vmbus_dev = pc->vmbus_dev; struct vmbus_softc *sc = device_get_softc(vmbus_dev); UINT64 start, end; switch (res->Type) { case ACPI_RESOURCE_TYPE_ADDRESS32: start = res->Data.Address32.Address.Minimum; end = res->Data.Address32.Address.Maximum; break; case ACPI_RESOURCE_TYPE_ADDRESS64: start = res->Data.Address64.Address.Minimum; end = res->Data.Address64.Address.Maximum; break; default: /* Unused types. */ return (AE_OK); } /* * We don't use <1MB addresses. */ if (end < 0x100000) return (AE_OK); /* Don't conflict with vTPM. */ if (end >= VTPM_BASE_ADDR && start < VTPM_BASE_ADDR) end = VTPM_BASE_ADDR - 1; if ((pc->pass == parse_32 && start < FOUR_GB) || (pc->pass == parse_64 && start >= FOUR_GB)) pcib_host_res_decodes(&sc->vmbus_mmio_res, SYS_RES_MEMORY, start, end, 0); return (AE_OK); } static void vmbus_get_crs(device_t dev, device_t vmbus_dev, enum parse_pass pass) { struct parse_context pc; ACPI_STATUS status; if (bootverbose) device_printf(dev, "walking _CRS, pass=%d\n", pass); pc.vmbus_dev = vmbus_dev; pc.pass = pass; status = AcpiWalkResources(acpi_get_handle(dev), "_CRS", parse_crs, &pc); if (bootverbose && ACPI_FAILURE(status)) device_printf(dev, "_CRS: not found, pass=%d\n", pass); } static void vmbus_get_mmio_res_pass(device_t dev, enum parse_pass pass) { device_t acpi0, parent; parent = device_get_parent(dev); acpi0 = device_get_parent(parent); if (strcmp("acpi0", device_get_nameunit(acpi0)) == 0) { device_t *children; int count; /* * Try to locate VMBUS resources and find _CRS on them. */ if (device_get_children(acpi0, &children, &count) == 0) { int i; for (i = 0; i < count; ++i) { if (!device_is_attached(children[i])) continue; if (strcmp("vmbus_res", device_get_name(children[i])) == 0) vmbus_get_crs(children[i], dev, pass); } free(children, M_TEMP); } /* * Try to find _CRS on acpi. */ vmbus_get_crs(acpi0, dev, pass); } else { device_printf(dev, "not grandchild of acpi\n"); } /* * Try to find _CRS on parent. */ vmbus_get_crs(parent, dev, pass); } static void vmbus_get_mmio_res(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); /* * We walk the resources twice to make sure that: in the resource * list, the 32-bit resources appear behind the 64-bit resources. * NB: resource_list_add() uses INSERT_TAIL. This way, when we * iterate through the list to find a range for a 64-bit BAR in * vmbus_alloc_resource(), we can make sure we try to use >4GB * ranges first. */ pcib_host_res_init(dev, &sc->vmbus_mmio_res); vmbus_get_mmio_res_pass(dev, parse_64); vmbus_get_mmio_res_pass(dev, parse_32); } static void vmbus_free_mmio_res(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); pcib_host_res_free(dev, &sc->vmbus_mmio_res); } #endif /* NEW_PCIB */ static void vmbus_identify(driver_t *driver, device_t parent) { if (device_get_unit(parent) != 0 || vm_guest != VM_GUEST_HV || (hyperv_features & CPUID_HV_MSR_SYNIC) == 0) return; device_add_child(parent, "vmbus", -1); } static int vmbus_probe(device_t dev) { if (device_get_unit(dev) != 0 || vm_guest != VM_GUEST_HV || (hyperv_features & CPUID_HV_MSR_SYNIC) == 0) return (ENXIO); device_set_desc(dev, "Hyper-V Vmbus"); return (BUS_PROBE_DEFAULT); } /** * @brief Main vmbus driver initialization routine. * * Here, we * - initialize the vmbus driver context * - setup various driver entry points * - invoke the vmbus hv main init routine * - get the irq resource * - invoke the vmbus to add the vmbus root device * - setup the vmbus root device * - retrieve the channel offers */ static int vmbus_doattach(struct vmbus_softc *sc) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; int ret; if (sc->vmbus_flags & VMBUS_FLAG_ATTACHED) return (0); #ifdef NEW_PCIB vmbus_get_mmio_res(sc->vmbus_dev); #endif sc->vmbus_flags |= VMBUS_FLAG_ATTACHED; sc->vmbus_gpadl = VMBUS_GPADL_START; mtx_init(&sc->vmbus_prichan_lock, "vmbus prichan", NULL, MTX_DEF); TAILQ_INIT(&sc->vmbus_prichans); mtx_init(&sc->vmbus_chan_lock, "vmbus channel", NULL, MTX_DEF); TAILQ_INIT(&sc->vmbus_chans); sc->vmbus_chmap = malloc( sizeof(struct vmbus_channel *) * VMBUS_CHAN_MAX, M_DEVBUF, M_WAITOK | M_ZERO); /* * Create context for "post message" Hypercalls */ sc->vmbus_xc = vmbus_xact_ctx_create(bus_get_dma_tag(sc->vmbus_dev), HYPERCALL_POSTMSGIN_SIZE, VMBUS_MSG_SIZE, sizeof(struct vmbus_msghc)); if (sc->vmbus_xc == NULL) { ret = ENXIO; goto cleanup; } /* * Allocate DMA stuffs. */ ret = vmbus_dma_alloc(sc); if (ret != 0) goto cleanup; /* * Setup interrupt. */ ret = vmbus_intr_setup(sc); if (ret != 0) goto cleanup; /* * Setup SynIC. */ if (bootverbose) device_printf(sc->vmbus_dev, "smp_started = %d\n", smp_started); smp_rendezvous(NULL, vmbus_synic_setup, NULL, sc); sc->vmbus_flags |= VMBUS_FLAG_SYNIC; /* * Initialize vmbus, e.g. connect to Hypervisor. */ ret = vmbus_init(sc); if (ret != 0) goto cleanup; if (sc->vmbus_version == VMBUS_VERSION_WS2008 || sc->vmbus_version == VMBUS_VERSION_WIN7) sc->vmbus_event_proc = vmbus_event_proc_compat; else sc->vmbus_event_proc = vmbus_event_proc; ret = vmbus_scan(sc); if (ret != 0) goto cleanup; ctx = device_get_sysctl_ctx(sc->vmbus_dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->vmbus_dev)); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "version", CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, vmbus_sysctl_version, "A", "vmbus version"); return (ret); cleanup: vmbus_scan_teardown(sc); vmbus_intr_teardown(sc); vmbus_dma_free(sc); if (sc->vmbus_xc != NULL) { vmbus_xact_ctx_destroy(sc->vmbus_xc); sc->vmbus_xc = NULL; } free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF); mtx_destroy(&sc->vmbus_prichan_lock); mtx_destroy(&sc->vmbus_chan_lock); return (ret); } static void vmbus_event_proc_dummy(struct vmbus_softc *sc __unused, int cpu __unused) { } #ifdef EARLY_AP_STARTUP static void vmbus_intrhook(void *xsc) { struct vmbus_softc *sc = xsc; if (bootverbose) device_printf(sc->vmbus_dev, "intrhook\n"); vmbus_doattach(sc); config_intrhook_disestablish(&sc->vmbus_intrhook); } #endif /* EARLY_AP_STARTUP */ static int vmbus_attach(device_t dev) { vmbus_sc = device_get_softc(dev); vmbus_sc->vmbus_dev = dev; vmbus_sc->vmbus_idtvec = -1; /* * Event processing logic will be configured: * - After the vmbus protocol version negotiation. * - Before we request channel offers. */ vmbus_sc->vmbus_event_proc = vmbus_event_proc_dummy; #ifdef EARLY_AP_STARTUP /* * Defer the real attach until the pause(9) works as expected. */ vmbus_sc->vmbus_intrhook.ich_func = vmbus_intrhook; vmbus_sc->vmbus_intrhook.ich_arg = vmbus_sc; config_intrhook_establish(&vmbus_sc->vmbus_intrhook); #else /* !EARLY_AP_STARTUP */ /* * If the system has already booted and thread * scheduling is possible indicated by the global * cold set to zero, we just call the driver * initialization directly. */ if (!cold) vmbus_doattach(vmbus_sc); #endif /* EARLY_AP_STARTUP */ return (0); } static int vmbus_detach(device_t dev) { struct vmbus_softc *sc = device_get_softc(dev); bus_generic_detach(dev); vmbus_chan_destroy_all(sc); vmbus_scan_teardown(sc); vmbus_disconnect(sc); if (sc->vmbus_flags & VMBUS_FLAG_SYNIC) { sc->vmbus_flags &= ~VMBUS_FLAG_SYNIC; smp_rendezvous(NULL, vmbus_synic_teardown, NULL, NULL); } vmbus_intr_teardown(sc); vmbus_dma_free(sc); if (sc->vmbus_xc != NULL) { vmbus_xact_ctx_destroy(sc->vmbus_xc); sc->vmbus_xc = NULL; } free(__DEVOLATILE(void *, sc->vmbus_chmap), M_DEVBUF); mtx_destroy(&sc->vmbus_prichan_lock); mtx_destroy(&sc->vmbus_chan_lock); #ifdef NEW_PCIB vmbus_free_mmio_res(dev); #endif return (0); } #ifndef EARLY_AP_STARTUP static void vmbus_sysinit(void *arg __unused) { struct vmbus_softc *sc = vmbus_get_softc(); if (vm_guest != VM_GUEST_HV || sc == NULL) return; /* * If the system has already booted and thread * scheduling is possible, as indicated by the * global cold set to zero, we just call the driver * initialization directly. */ if (!cold) vmbus_doattach(sc); } /* * NOTE: * We have to start as the last step of SI_SUB_SMP, i.e. after SMP is * initialized. */ SYSINIT(vmbus_initialize, SI_SUB_SMP, SI_ORDER_ANY, vmbus_sysinit, NULL); #endif /* !EARLY_AP_STARTUP */ Index: head/sys/dev/hyperv/vmbus/vmbus_et.c =================================================================== --- head/sys/dev/hyperv/vmbus/vmbus_et.c (revision 322487) +++ head/sys/dev/hyperv/vmbus/vmbus_et.c (revision 322488) @@ -1,201 +1,201 @@ /*- - * Copyright (c) 2015,2016 Microsoft Corp. + * Copyright (c) 2015,2016-2017 Microsoft Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #define VMBUS_ET_NAME "hvet" #define MSR_HV_STIMER0_CFG_SINT \ ((((uint64_t)VMBUS_SINT_TIMER) << MSR_HV_STIMER_CFG_SINT_SHIFT) & \ MSR_HV_STIMER_CFG_SINT_MASK) /* * Additionally required feature: * - SynIC is needed for interrupt generation. */ #define CPUID_HV_ET_MASK (CPUID_HV_MSR_SYNIC | \ CPUID_HV_MSR_SYNTIMER) static void vmbus_et_identify(driver_t *, device_t); static int vmbus_et_probe(device_t); static int vmbus_et_attach(device_t); static int vmbus_et_detach(device_t); static int vmbus_et_start(struct eventtimer *, sbintime_t, sbintime_t); static struct eventtimer vmbus_et; static device_method_t vmbus_et_methods[] = { DEVMETHOD(device_identify, vmbus_et_identify), DEVMETHOD(device_probe, vmbus_et_probe), DEVMETHOD(device_attach, vmbus_et_attach), DEVMETHOD(device_detach, vmbus_et_detach), DEVMETHOD_END }; static driver_t vmbus_et_driver = { VMBUS_ET_NAME, vmbus_et_methods, 0 }; static devclass_t vmbus_et_devclass; DRIVER_MODULE(hv_et, vmbus, vmbus_et_driver, vmbus_et_devclass, NULL, NULL); MODULE_VERSION(hv_et, 1); static __inline uint64_t hyperv_sbintime2count(sbintime_t time) { struct timespec val; val = sbttots(time); return (val.tv_sec * HYPERV_TIMER_FREQ) + (val.tv_nsec / HYPERV_TIMER_NS_FACTOR); } static int vmbus_et_start(struct eventtimer *et __unused, sbintime_t first, sbintime_t period __unused) { uint64_t current; current = hyperv_tc64(); current += hyperv_sbintime2count(first); wrmsr(MSR_HV_STIMER0_COUNT, current); return (0); } void vmbus_et_intr(struct trapframe *frame) { struct trapframe *oldframe; struct thread *td; if (vmbus_et.et_active) { td = curthread; td->td_intr_nesting_level++; oldframe = td->td_intr_frame; td->td_intr_frame = frame; vmbus_et.et_event_cb(&vmbus_et, vmbus_et.et_arg); td->td_intr_frame = oldframe; td->td_intr_nesting_level--; } } static void vmbus_et_identify(driver_t *driver, device_t parent) { if (device_get_unit(parent) != 0 || device_find_child(parent, VMBUS_ET_NAME, -1) != NULL || (hyperv_features & CPUID_HV_ET_MASK) != CPUID_HV_ET_MASK || hyperv_tc64 == NULL) return; device_add_child(parent, VMBUS_ET_NAME, -1); } static int vmbus_et_probe(device_t dev) { if (resource_disabled(VMBUS_ET_NAME, 0)) return (ENXIO); device_set_desc(dev, "Hyper-V event timer"); return (BUS_PROBE_NOWILDCARD); } static void vmbus_et_config(void *arg __unused) { /* * Make sure that STIMER0 is really disabled before writing * to STIMER0_CONFIG. * * "Writing to the configuration register of a timer that * is already enabled may result in undefined behaviour." */ for (;;) { uint64_t val; /* Stop counting, and this also implies disabling STIMER0 */ wrmsr(MSR_HV_STIMER0_COUNT, 0); val = rdmsr(MSR_HV_STIMER0_CONFIG); if ((val & MSR_HV_STIMER_CFG_ENABLE) == 0) break; cpu_spinwait(); } wrmsr(MSR_HV_STIMER0_CONFIG, MSR_HV_STIMER_CFG_AUTOEN | MSR_HV_STIMER0_CFG_SINT); } static int vmbus_et_attach(device_t dev) { /* TODO: use independent IDT vector */ vmbus_et.et_name = "Hyper-V"; vmbus_et.et_flags = ET_FLAGS_ONESHOT | ET_FLAGS_PERCPU; vmbus_et.et_quality = 1000; vmbus_et.et_frequency = HYPERV_TIMER_FREQ; vmbus_et.et_min_period = (0x00000001ULL << 32) / HYPERV_TIMER_FREQ; vmbus_et.et_max_period = (0xfffffffeULL << 32) / HYPERV_TIMER_FREQ; vmbus_et.et_start = vmbus_et_start; /* * Delay a bit to make sure that hyperv_tc64 will not return 0, * since writing 0 to STIMER0_COUNT will disable STIMER0. */ DELAY(100); smp_rendezvous(NULL, vmbus_et_config, NULL, NULL); return (et_register(&vmbus_et)); } static int vmbus_et_detach(device_t dev) { return (et_deregister(&vmbus_et)); }