Index: stable/12/sys/amd64/vmm/amd/amdvi_hw.c =================================================================== --- stable/12/sys/amd64/vmm/amd/amdvi_hw.c (revision 353104) +++ stable/12/sys/amd64/vmm/amd/amdvi_hw.c (revision 353105) @@ -1,1461 +1,1461 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice unmodified, this list of conditions, and the following * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "pcib_if.h" #include "io/iommu.h" #include "amdvi_priv.h" SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW, NULL, NULL); #define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s))) #define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s))) /* Print RID or device ID in PCI string format. */ #define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d) static void amdvi_dump_cmds(struct amdvi_softc *softc); static void amdvi_print_dev_cap(struct amdvi_softc *softc); MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi"); extern device_t *ivhd_devs; extern int ivhd_count; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count, 0, NULL); static int amdvi_enable_user = 0; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN, &amdvi_enable_user, 0, NULL); TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user); #ifdef AMDVI_ATS_ENABLE /* XXX: ATS is not tested. */ static int amdvi_enable_iotlb = 1; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN, &amdvi_enable_iotlb, 0, NULL); TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb); #endif static int amdvi_host_ptp = 1; /* Use page tables for host. */ SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN, &amdvi_host_ptp, 0, NULL); TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp); /* Page table level used <= supported by h/w[v1=7]. */ -static int amdvi_ptp_level = 4; +int amdvi_ptp_level = 4; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN, &amdvi_ptp_level, 0, NULL); TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level); /* Disable fault event reporting. */ static int amdvi_disable_io_fault = 0; SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN, &amdvi_disable_io_fault, 0, NULL); TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault); static uint32_t amdvi_dom_id = 0; /* 0 is reserved for host. */ SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD, &amdvi_dom_id, 0, NULL); /* * Device table entry. * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes). * = 256 * 2 * PAGE_SIZE. */ static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE); CTASSERT(PCI_NUM_DEV_MAX == 0x10000); CTASSERT(sizeof(amdvi_dte) == 0x200000); static SLIST_HEAD (, amdvi_domain) dom_head; static inline uint32_t amdvi_pci_read(struct amdvi_softc *softc, int off) { return (pci_cfgregread(PCI_RID2BUS(softc->pci_rid), PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid), off, 4)); } #ifdef AMDVI_ATS_ENABLE /* XXX: Should be in pci.c */ /* * Check if device has ATS capability and its enabled. * If ATS is absent or disabled, return (-1), otherwise ATS * queue length. */ static int amdvi_find_ats_qlen(uint16_t devid) { device_t dev; uint32_t off, cap; int qlen = -1; dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid), PCI_RID2FUNC(devid)); if (!dev) { return (-1); } #define PCIM_ATS_EN BIT(31) if (pci_find_extcap(dev, PCIZ_ATS, &off) == 0) { cap = pci_read_config(dev, off + 4, 4); qlen = (cap & 0x1F); qlen = qlen ? qlen : 32; printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n", RID2PCI_STR(devid), (cap & PCIM_ATS_EN) ? "enabled" : "Disabled", qlen); qlen = (cap & PCIM_ATS_EN) ? qlen : -1; } return (qlen); } /* * Check if an endpoint device support device IOTLB or ATS. */ static inline bool amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid) { struct ivhd_dev_cfg *cfg; int qlen, i; bool pci_ats, ivhd_ats; qlen = amdvi_find_ats_qlen(devid); if (qlen < 0) return (false); KASSERT(softc, ("softc is NULL")); cfg = softc->dev_cfg; ivhd_ats = false; for (i = 0; i < softc->dev_cfg_cnt; i++) { if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) { ivhd_ats = cfg->enable_ats; break; } cfg++; } pci_ats = (qlen < 0) ? false : true; if (pci_ats != ivhd_ats) device_printf(softc->dev, "BIOS bug: mismatch in ATS setting for %d.%d.%d," "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen); /* Ignore IVRS setting and respect PCI setting. */ return (pci_ats); } #endif /* Enable IOTLB support for IOMMU if its supported. */ static inline void amdvi_hw_enable_iotlb(struct amdvi_softc *softc) { #ifndef AMDVI_ATS_ENABLE softc->iotlb = false; #else bool supported; supported = (softc->ivhd_flag & IVHD_FLAG_IOTLB) ? true : false; if (softc->pci_cap & AMDVI_PCI_CAP_IOTLB) { if (!supported) device_printf(softc->dev, "IOTLB disabled by BIOS.\n"); if (supported && !amdvi_enable_iotlb) { device_printf(softc->dev, "IOTLB disabled by user.\n"); supported = false; } } else supported = false; softc->iotlb = supported; #endif } static int amdvi_init_cmd(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl = softc->ctrl; ctrl->cmd.len = 8; /* Use 256 command buffer entries. */ softc->cmd_max = 1 << ctrl->cmd.len; softc->cmd = malloc(sizeof(struct amdvi_cmd) * softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO); if ((uintptr_t)softc->cmd & PAGE_MASK) panic("AMDVi: Command buffer not aligned on page boundary."); ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE; /* * XXX: Reset the h/w pointers in case IOMMU is restarting, * h/w doesn't clear these pointers based on empirical data. */ ctrl->cmd_tail = 0; ctrl->cmd_head = 0; return (0); } /* * Note: Update tail pointer after we have written the command since tail * pointer update cause h/w to execute new commands, see section 3.3 * of AMD IOMMU spec ver 2.0. */ /* Get the command tail pointer w/o updating it. */ static struct amdvi_cmd * amdvi_get_cmd_tail(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; struct amdvi_cmd *tail; KASSERT(softc, ("softc is NULL")); KASSERT(softc->cmd != NULL, ("cmd is NULL")); ctrl = softc->ctrl; KASSERT(ctrl != NULL, ("ctrl is NULL")); tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd + ctrl->cmd_tail); return (tail); } /* * Update the command tail pointer which will start command execution. */ static void amdvi_update_cmd_tail(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; int size; size = sizeof(struct amdvi_cmd); KASSERT(softc->cmd != NULL, ("cmd is NULL")); ctrl = softc->ctrl; KASSERT(ctrl != NULL, ("ctrl is NULL")); ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max); softc->total_cmd++; #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "cmd_tail: %s Tail:0x%x, Head:0x%x.\n", ctrl->cmd_tail, ctrl->cmd_head); #endif } /* * Various commands supported by IOMMU. */ /* Completion wait command. */ static void amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data) { struct amdvi_cmd *cmd; uint64_t pa; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); pa = vtophys(&softc->cmp_data); cmd->opcode = AMDVI_CMP_WAIT_OPCODE; cmd->word0 = (pa & 0xFFFFFFF8) | (AMDVI_CMP_WAIT_STORE); //(AMDVI_CMP_WAIT_FLUSH | AMDVI_CMP_WAIT_STORE); cmd->word1 = (pa >> 32) & 0xFFFFF; cmd->addr = data; amdvi_update_cmd_tail(softc); } /* Invalidate device table entry. */ static void amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); cmd->opcode = AMDVI_INVD_DTE_OPCODE; cmd->word0 = devid; amdvi_update_cmd_tail(softc); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid); #endif } /* Invalidate IOMMU page, use for invalidation of domain. */ static void amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id, uint64_t addr, bool guest_nested, bool pde, bool page) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); cmd->opcode = AMDVI_INVD_PAGE_OPCODE; cmd->word1 = domain_id; /* * Invalidate all addresses for this domain. */ cmd->addr = addr; cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0; cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0; amdvi_update_cmd_tail(softc); } #ifdef AMDVI_ATS_ENABLE /* Invalidate device IOTLB. */ static void amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid) { struct amdvi_cmd *cmd; int qlen; if (!softc->iotlb) return; qlen = amdvi_find_ats_qlen(devid); if (qlen < 0) { panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n", qlen, RID2PCI_STR(devid)); } cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidate IOTLB devID 0x%x" " Qlen:%d\n", devid, qlen); #endif cmd->opcode = AMDVI_INVD_IOTLB_OPCODE; cmd->word0 = devid; cmd->word1 = qlen; cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR | AMDVI_INVD_IOTLB_S; amdvi_update_cmd_tail(softc); } #endif #ifdef notyet /* For Interrupt Remap. */ static void amdvi_cmd_inv_intr_map(struct amdvi_softc *softc, uint16_t devid) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); cmd->opcode = AMDVI_INVD_INTR_OPCODE; cmd->word0 = devid; amdvi_update_cmd_tail(softc); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid); #endif } #endif /* Invalidate domain using INVALIDATE_IOMMU_PAGES command. */ static void amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id) { struct amdvi_cmd *cmd; cmd = amdvi_get_cmd_tail(softc); KASSERT(cmd != NULL, ("Cmd is NULL")); /* * See section 3.3.3 of IOMMU spec rev 2.0, software note * for invalidating domain. */ amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR, false, true, true); #ifdef AMDVI_DEBUG_CMD device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id); #endif } static bool amdvi_cmp_wait(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; const uint64_t VERIFY = 0xA5A5; volatile uint64_t *read; int i; bool status; ctrl = softc->ctrl; read = &softc->cmp_data; *read = 0; amdvi_cmd_cmp(softc, VERIFY); /* Wait for h/w to update completion data. */ for (i = 0; i < 100 && (*read != VERIFY); i++) { DELAY(1000); /* 1 ms */ } status = (VERIFY == softc->cmp_data) ? true : false; #ifdef AMDVI_DEBUG_CMD if (status) device_printf(softc->dev, "CMD completion DONE Tail:0x%x, " "Head:0x%x, loop:%d.\n", ctrl->cmd_tail, ctrl->cmd_head, loop); #endif return (status); } static void amdvi_wait(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; int i; KASSERT(softc, ("softc is NULL")); ctrl = softc->ctrl; KASSERT(ctrl != NULL, ("ctrl is NULL")); /* Don't wait if h/w is not enabled. */ if ((ctrl->control & AMDVI_CTRL_EN) == 0) return; for (i = 0; i < 10; i++) { if (amdvi_cmp_wait(softc)) return; } device_printf(softc->dev, "Error: completion failed" " tail:0x%x, head:0x%x.\n", ctrl->cmd_tail, ctrl->cmd_head); amdvi_dump_cmds(softc); } static void amdvi_dump_cmds(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; struct amdvi_cmd *cmd; int off, i; ctrl = softc->ctrl; device_printf(softc->dev, "Dump all the commands:\n"); /* * If h/w is stuck in completion, it is the previous command, * start dumping from previous command onward. */ off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd), softc->cmd_max); for (i = 0; off != ctrl->cmd_tail && i < softc->cmd_max; i++) { cmd = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off); printf(" [CMD%d, off:0x%x] opcode= 0x%x 0x%x" " 0x%x 0x%lx\n", i, off, cmd->opcode, cmd->word0, cmd->word1, cmd->addr); off = (off + sizeof(struct amdvi_cmd)) % (softc->cmd_max * sizeof(struct amdvi_cmd)); } } static int amdvi_init_event(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; ctrl = softc->ctrl; ctrl->event.len = 8; softc->event_max = 1 << ctrl->event.len; softc->event = malloc(sizeof(struct amdvi_event) * softc->event_max, M_AMDVI, M_WAITOK | M_ZERO); if ((uintptr_t)softc->event & PAGE_MASK) { device_printf(softc->dev, "Event buffer not aligned on page."); return (false); } ctrl->event.base = vtophys(softc->event) / PAGE_SIZE; /* Reset the pointers. */ ctrl->evt_head = 0; ctrl->evt_tail = 0; return (0); } static inline void amdvi_decode_evt_flag(uint16_t flag) { flag &= AMDVI_EVENT_FLAG_MASK; printf(" 0x%b]\n", flag, "\020" "\001GN" "\002NX" "\003US" "\004I" "\005PR" "\006RW" "\007PE" "\010RZ" "\011TR" ); } /* See section 2.5.4 of AMD IOMMU spec ver 2.62.*/ static inline void amdvi_decode_evt_flag_type(uint8_t type) { switch (AMDVI_EVENT_FLAG_TYPE(type)) { case 0: printf("RSVD\n"); break; case 1: printf("Master Abort\n"); break; case 2: printf("Target Abort\n"); break; case 3: printf("Data Err\n"); break; default: break; } } static void amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); } static void amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); } static void amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); amdvi_decode_evt_flag_type(flag); } static void amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, uint16_t flag) { printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" " Addr:0x%lx", devid, domid, addr); amdvi_decode_evt_flag(flag); amdvi_decode_evt_flag_type(AMDVI_EVENT_FLAG_TYPE(flag)); } static void amdvi_decode_evt(struct amdvi_event *evt) { struct amdvi_cmd *cmd; switch (evt->opcode) { case AMDVI_EVENT_INVALID_DTE: amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_PFAULT: amdvi_decode_pf_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_DTE_HW_ERROR: amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_PAGE_HW_ERROR: amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid, evt->addr, evt->flag); break; case AMDVI_EVENT_ILLEGAL_CMD: /* FALL THROUGH */ case AMDVI_EVENT_CMD_HW_ERROR: printf("\t[%s EVT]\n", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ? "ILLEGAL CMD" : "CMD HW ERR"); cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr); printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n", cmd->opcode, cmd->word0, cmd->word1, cmd->addr); break; case AMDVI_EVENT_IOTLB_TIMEOUT: printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n", evt->devid, evt->addr); break; case AMDVI_EVENT_INVALID_DTE_REQ: printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n", evt->devid, evt->addr, evt->flag >> 9, (evt->flag >> 8) & 1); break; case AMDVI_EVENT_INVALID_PPR_REQ: case AMDVI_EVENT_COUNTER_ZERO: printf("AMD-Vi: v2 events.\n"); break; default: printf("Unsupported AMD-Vi event:%d\n", evt->opcode); } } static void amdvi_print_events(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; struct amdvi_event *event; int i, size; ctrl = softc->ctrl; size = sizeof(struct amdvi_event); for (i = 0; i < softc->event_max; i++) { event = &softc->event[ctrl->evt_head / size]; if (!event->opcode) break; device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n", i, ctrl->evt_head, ctrl->evt_tail); amdvi_decode_evt(event); ctrl->evt_head = MOD_INC(ctrl->evt_head, size, softc->event_max); } } static int amdvi_init_dte(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; ctrl = softc->ctrl; ctrl->dte.base = vtophys(amdvi_dte) / PAGE_SIZE; ctrl->dte.size = 0x1FF; /* 2MB device table. */ return (0); } /* * Not all capabilities of IOMMU are available in ACPI IVHD flag * or EFR entry, read directly from device. */ static int amdvi_print_pci_cap(device_t dev) { struct amdvi_softc *softc; uint32_t off, cap; softc = device_get_softc(dev); off = softc->cap_off; /* * Section 3.7.1 of IOMMU sepc rev 2.0. * Read capability from device. */ cap = amdvi_pci_read(softc, off); /* Make sure capability type[18:16] is 3. */ KASSERT((((cap >> 16) & 0x7) == 0x3), ("Not a IOMMU capability 0x%x@0x%x", cap, off)); softc->pci_cap = cap >> 24; device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n", cap, off, softc->pci_cap, "\20\1IOTLB\2HT\3NPCache\4EFR\5CapExt"); return (0); } static void amdvi_event_intr(void *arg) { struct amdvi_softc *softc; struct amdvi_ctrl *ctrl; softc = (struct amdvi_softc *)arg; ctrl = softc->ctrl; device_printf(softc->dev, "EVT INTR %ld Status:0x%x" " EVT Head:0x%x Tail:0x%x]\n", softc->event_intr_cnt++, ctrl->status, ctrl->evt_head, ctrl->evt_tail); printf(" [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n", softc->total_cmd, ctrl->cmd_tail, ctrl->cmd_head); amdvi_print_events(softc); ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; } static void amdvi_free_evt_intr_res(device_t dev) { struct amdvi_softc *softc; softc = device_get_softc(dev); if (softc->event_tag != NULL) { bus_teardown_intr(dev, softc->event_res, softc->event_tag); } if (softc->event_res != NULL) { bus_release_resource(dev, SYS_RES_IRQ, softc->event_rid, softc->event_res); } bus_delete_resource(dev, SYS_RES_IRQ, softc->event_rid); PCIB_RELEASE_MSI(device_get_parent(device_get_parent(dev)), dev, 1, &softc->event_irq); } static bool amdvi_alloc_intr_resources(struct amdvi_softc *softc) { struct amdvi_ctrl *ctrl; device_t dev, pcib; device_t mmio_dev; uint64_t msi_addr; uint32_t msi_data; int err; dev = softc->dev; pcib = device_get_parent(device_get_parent(dev)); mmio_dev = pci_find_bsf(PCI_RID2BUS(softc->pci_rid), PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid)); if (device_is_attached(mmio_dev)) { device_printf(dev, "warning: IOMMU device is claimed by another driver %s\n", device_get_driver(mmio_dev)->name); } softc->event_irq = -1; softc->event_rid = 0; /* * Section 3.7.1 of IOMMU rev 2.0. With MSI, there is only one * interrupt. XXX: Enable MSI/X support. */ err = PCIB_ALLOC_MSI(pcib, dev, 1, 1, &softc->event_irq); if (err) { device_printf(dev, "Couldn't find event MSI IRQ resource.\n"); return (ENOENT); } err = bus_set_resource(dev, SYS_RES_IRQ, softc->event_rid, softc->event_irq, 1); if (err) { device_printf(dev, "Couldn't set event MSI resource.\n"); return (ENXIO); } softc->event_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &softc->event_rid, RF_ACTIVE); if (!softc->event_res) { device_printf(dev, "Unable to allocate event INTR resource.\n"); return (ENOMEM); } if (bus_setup_intr(dev, softc->event_res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, amdvi_event_intr, softc, &softc->event_tag)) { device_printf(dev, "Fail to setup event intr\n"); bus_release_resource(softc->dev, SYS_RES_IRQ, softc->event_rid, softc->event_res); softc->event_res = NULL; return (ENXIO); } bus_describe_intr(dev, softc->event_res, softc->event_tag, "fault"); err = PCIB_MAP_MSI(pcib, dev, softc->event_irq, &msi_addr, &msi_data); if (err) { device_printf(dev, "Event interrupt config failed, err=%d.\n", err); amdvi_free_evt_intr_res(softc->dev); return (err); } /* Clear interrupt status bits. */ ctrl = softc->ctrl; ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; /* Now enable MSI interrupt. */ pci_enable_msi(mmio_dev, msi_addr, msi_data); return (0); } static void amdvi_print_dev_cap(struct amdvi_softc *softc) { struct ivhd_dev_cfg *cfg; int i; cfg = softc->dev_cfg; for (i = 0; i < softc->dev_cfg_cnt; i++) { device_printf(softc->dev, "device [0x%x - 0x%x]" "config:%b%s\n", cfg->start_id, cfg->end_id, cfg->data, "\020\001INIT\002ExtInt\003NMI" "\007LINT0\008LINT1", cfg->enable_ats ? "ATS enabled" : ""); cfg++; } } static int amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS) { struct amdvi_softc *softc; int result, type, error = 0; softc = (struct amdvi_softc *)arg1; type = arg2; switch (type) { case 0: result = softc->ctrl->cmd_head; error = sysctl_handle_int(oidp, &result, 0, req); break; case 1: result = softc->ctrl->cmd_tail; error = sysctl_handle_int(oidp, &result, 0, req); break; case 2: result = softc->ctrl->evt_head; error = sysctl_handle_int(oidp, &result, 0, req); break; case 3: result = softc->ctrl->evt_tail; error = sysctl_handle_int(oidp, &result, 0, req); break; default: device_printf(softc->dev, "Unknown sysctl:%d\n", type); } return (error); } static void amdvi_add_sysctl(struct amdvi_softc *softc) { struct sysctl_oid_list *child; struct sysctl_ctx_list *ctx; device_t dev; dev = softc->dev; ctx = device_get_sysctl_ctx(dev); child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD, &softc->event_intr_cnt, "Event interrupt count"); SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD, &softc->total_cmd, "Command submitted count"); SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD, &softc->pci_rid, 0, "IOMMU RID"); SYSCTL_ADD_U16(ctx, child, OID_AUTO, "start_dev_rid", CTLFLAG_RD, &softc->start_dev_rid, 0, "Start of device under this IOMMU"); SYSCTL_ADD_U16(ctx, child, OID_AUTO, "end_dev_rid", CTLFLAG_RD, &softc->end_dev_rid, 0, "End of device under this IOMMU"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head", CTLTYPE_UINT | CTLFLAG_RD, softc, 0, amdvi_handle_sysctl, "IU", "Command head"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail", CTLTYPE_UINT | CTLFLAG_RD, softc, 1, amdvi_handle_sysctl, "IU", "Command tail"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head", CTLTYPE_UINT | CTLFLAG_RD, softc, 2, amdvi_handle_sysctl, "IU", "Command head"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail", CTLTYPE_UINT | CTLFLAG_RD, softc, 3, amdvi_handle_sysctl, "IU", "Command tail"); } int amdvi_setup_hw(struct amdvi_softc *softc) { device_t dev; int status; dev = softc->dev; amdvi_hw_enable_iotlb(softc); amdvi_print_dev_cap(softc); if ((status = amdvi_print_pci_cap(dev)) != 0) { device_printf(dev, "PCI capability.\n"); return (status); } if ((status = amdvi_init_cmd(softc)) != 0) { device_printf(dev, "Couldn't configure command buffer.\n"); return (status); } if ((status = amdvi_init_event(softc)) != 0) { device_printf(dev, "Couldn't configure event buffer.\n"); return (status); } if ((status = amdvi_init_dte(softc)) != 0) { device_printf(dev, "Couldn't configure device table.\n"); return (status); } if ((status = amdvi_alloc_intr_resources(softc)) != 0) { return (status); } amdvi_add_sysctl(softc); return (0); } int amdvi_teardown_hw(struct amdvi_softc *softc) { device_t dev; dev = softc->dev; /* * Called after disable, h/w is stopped by now, free all the resources. */ amdvi_free_evt_intr_res(dev); if (softc->cmd) free(softc->cmd, M_AMDVI); if (softc->event) free(softc->event, M_AMDVI); return (0); } /*********** bhyve interfaces *********************/ static int amdvi_init(void) { if (!ivhd_count) { return (EIO); } if (!amdvi_enable_user && ivhd_count) { printf("bhyve: Found %d AMD-Vi/IOMMU device(s), " "use hw.vmm.amdvi.enable=1 to enable pass-through.\n", ivhd_count); return (EINVAL); } return (0); } static void amdvi_cleanup(void) { /* Nothing. */ } static uint16_t amdvi_domainId(void) { /* * If we hit maximum domain limit, rollover leaving host * domain(0). * XXX: make sure that this domain is not used. */ if (amdvi_dom_id == AMDVI_MAX_DOMAIN) amdvi_dom_id = 1; return ((uint16_t)amdvi_dom_id++); } static void amdvi_do_inv_domain(uint16_t domain_id, bool create) { struct amdvi_softc *softc; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); KASSERT(softc, ("softc is NULL")); /* * If not present pages are cached, invalidate page after * creating domain. */ #if 0 if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0)) continue; #endif amdvi_inv_domain(softc, domain_id); amdvi_wait(softc); } } static void * amdvi_create_domain(vm_paddr_t maxaddr) { struct amdvi_domain *dom; dom = malloc(sizeof(struct amdvi_domain), M_AMDVI, M_ZERO | M_WAITOK); dom->id = amdvi_domainId(); //dom->maxaddr = maxaddr; #ifdef AMDVI_DEBUG_CMD printf("Created domain #%d\n", dom->id); #endif /* * Host domain(#0) don't create translation table. */ if (dom->id || amdvi_host_ptp) dom->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); dom->ptp_level = amdvi_ptp_level; amdvi_do_inv_domain(dom->id, true); SLIST_INSERT_HEAD(&dom_head, dom, next); return (dom); } static void amdvi_free_ptp(uint64_t *ptp, int level) { int i; if (level < 1) return; for (i = 0; i < NPTEPG ; i++) { if ((ptp[i] & AMDVI_PT_PRESENT) == 0) continue; /* XXX: Add super-page or PTE mapping > 4KB. */ #ifdef notyet /* Super-page mapping. */ if (AMDVI_PD_SUPER(ptp[i])) continue; #endif amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i] & AMDVI_PT_MASK), level - 1); } free(ptp, M_AMDVI); } static void amdvi_destroy_domain(void *arg) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; KASSERT(domain, ("domain is NULL")); #ifdef AMDVI_DEBUG_CMD printf("Destroying domain %d\n", domain->id); #endif if (domain->ptp) amdvi_free_ptp(domain->ptp, domain->ptp_level); amdvi_do_inv_domain(domain->id, false); SLIST_REMOVE(&dom_head, domain, amdvi_domain, next); free(domain, M_AMDVI); } static uint64_t amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t pg_size, bool create) { uint64_t *page, pa; int shift, index; const int PT_SHIFT = 9; const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1; /* Based on PT_SHIFT */ if (!pg_size) return (0); if (hpa & (pg_size - 1)) { printf("HPA is not size aligned.\n"); return (0); } if (gpa & (pg_size - 1)) { printf("HPA is not size aligned.\n"); return (0); } shift = PML4SHIFT; while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) { index = (gpa >> shift) & PT_INDEX_MASK; if ((pt[index] == 0) && create) { page = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); pa = vtophys(page); pt[index] = pa | AMDVI_PT_PRESENT | AMDVI_PT_RW | ((level - 1) << AMDVI_PD_LEVEL_SHIFT); } #ifdef AMDVI_DEBUG_PTE if ((gpa % 0x1000000) == 0) printf("[level%d, shift = %d]PTE:0x%lx\n", level, shift, pt[index]); #endif #define PTE2PA(x) ((uint64_t)(x) & AMDVI_PT_MASK) pa = PTE2PA(pt[index]); pt = (uint64_t *)PHYS_TO_DMAP(pa); shift -= PT_SHIFT; level--; } /* Leaf entry. */ index = (gpa >> shift) & PT_INDEX_MASK; if (create) { pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT; } else pt[index] = 0; #ifdef AMDVI_DEBUG_PTE if ((gpa % 0x1000000) == 0) printf("[Last level%d, shift = %d]PTE:0x%lx\n", level, shift, pt[index]); #endif return (1ULL << shift); } static uint64_t amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t size, bool create) { uint64_t mapped, *ptp, len; int level; KASSERT(domain, ("domain is NULL")); level = domain->ptp_level; KASSERT(level, ("Page table level is 0")); ptp = domain->ptp; KASSERT(ptp, ("PTP is NULL")); mapped = 0; while (mapped < size) { len = amdvi_set_pt(ptp, level, gpa + mapped, hpa + mapped, PAGE_SIZE, create); if (!len) { printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n", hpa, gpa); return (0); } mapped += len; } return (mapped); } static uint64_t amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; if (domain->id && !domain->ptp) { printf("ptp is NULL"); return (-1); } /* * If host domain is created w/o page table, skip IOMMU page * table set-up. */ if (domain->ptp) return (amdvi_update_mapping(domain, gpa, hpa, len, true)); else return (len); } static uint64_t amdvi_destroy_mapping(void *arg, vm_paddr_t gpa, uint64_t len) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; /* * If host domain is created w/o page table, skip IOMMU page * table set-up. */ if (domain->ptp) return (amdvi_update_mapping(domain, gpa, 0, len, false)); return (len); } static struct amdvi_softc * amdvi_find_iommu(uint16_t devid) { struct amdvi_softc *softc; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); if ((devid >= softc->start_dev_rid) && (devid <= softc->end_dev_rid)) return (softc); } /* * XXX: BIOS bug, device not in IVRS table, assume its from first IOMMU. */ printf("BIOS bug device(%d.%d.%d) doesn't have IVHD entry.\n", RID2PCI_STR(devid)); return (device_get_softc(ivhd_devs[0])); } /* * Set-up device table entry. * IOMMU spec Rev 2.0, section 3.2.2.2, some of the fields must * be set concurrently, e.g. read and write bits. */ static void amdvi_set_dte(struct amdvi_domain *domain, uint16_t devid, bool enable) { struct amdvi_softc *softc; struct amdvi_dte* temp; KASSERT(domain, ("domain is NULL for pci_rid:0x%x\n", devid)); softc = amdvi_find_iommu(devid); KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid)); temp = &amdvi_dte[devid]; #ifdef AMDVI_ATS_ENABLE /* If IOMMU and device support IOTLB, enable it. */ if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb) temp->iotlb_enable = 1; #endif /* Avoid duplicate I/O faults. */ temp->sup_second_io_fault = 1; temp->sup_all_io_fault = amdvi_disable_io_fault; temp->dt_valid = 1; temp->domain_id = domain->id; if (enable) { if (domain->ptp) { temp->pt_base = vtophys(domain->ptp) >> 12; temp->pt_level = amdvi_ptp_level; } /* * XXX: Page table valid[TV] bit must be set even if host domain * page tables are not enabled. */ temp->pt_valid = 1; temp->read_allow = 1; temp->write_allow = 1; } } static void amdvi_inv_device(uint16_t devid) { struct amdvi_softc *softc; softc = amdvi_find_iommu(devid); KASSERT(softc, ("softc is NULL")); amdvi_cmd_inv_dte(softc, devid); #ifdef AMDVI_ATS_ENABLE if (amdvi_dev_support_iotlb(softc, devid)) amdvi_cmd_inv_iotlb(softc, devid); #endif amdvi_wait(softc); } static void amdvi_add_device(void *arg, uint16_t devid) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; KASSERT(domain != NULL, ("domain is NULL")); #ifdef AMDVI_DEBUG_CMD printf("Assigning device(%d.%d.%d) to domain:%d\n", RID2PCI_STR(devid), domain->id); #endif amdvi_set_dte(domain, devid, true); amdvi_inv_device(devid); } static void amdvi_remove_device(void *arg, uint16_t devid) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; #ifdef AMDVI_DEBUG_CMD printf("Remove device(0x%x) from domain:%d\n", devid, domain->id); #endif amdvi_set_dte(domain, devid, false); amdvi_inv_device(devid); } static void amdvi_enable(void) { struct amdvi_ctrl *ctrl; struct amdvi_softc *softc; uint64_t val; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); KASSERT(softc, ("softc is NULL\n")); ctrl = softc->ctrl; KASSERT(ctrl, ("ctrl is NULL\n")); val = ( AMDVI_CTRL_EN | AMDVI_CTRL_CMD | AMDVI_CTRL_ELOG | AMDVI_CTRL_ELOGINT | AMDVI_CTRL_INV_TO_1S); if (softc->ivhd_flag & IVHD_FLAG_COH) val |= AMDVI_CTRL_COH; if (softc->ivhd_flag & IVHD_FLAG_HTT) val |= AMDVI_CTRL_HTT; if (softc->ivhd_flag & IVHD_FLAG_RPPW) val |= AMDVI_CTRL_RPPW; if (softc->ivhd_flag & IVHD_FLAG_PPW) val |= AMDVI_CTRL_PPW; if (softc->ivhd_flag & IVHD_FLAG_ISOC) val |= AMDVI_CTRL_ISOC; ctrl->control = val; } } static void amdvi_disable(void) { struct amdvi_ctrl *ctrl; struct amdvi_softc *softc; int i; for (i = 0; i < ivhd_count; i++) { softc = device_get_softc(ivhd_devs[i]); KASSERT(softc, ("softc is NULL\n")); ctrl = softc->ctrl; KASSERT(ctrl, ("ctrl is NULL\n")); ctrl->control = 0; } } static void amdvi_inv_tlb(void *arg) { struct amdvi_domain *domain; domain = (struct amdvi_domain *)arg; KASSERT(domain, ("domain is NULL")); amdvi_do_inv_domain(domain->id, false); } struct iommu_ops iommu_ops_amd = { amdvi_init, amdvi_cleanup, amdvi_enable, amdvi_disable, amdvi_create_domain, amdvi_destroy_domain, amdvi_create_mapping, amdvi_destroy_mapping, amdvi_add_device, amdvi_remove_device, amdvi_inv_tlb }; Index: stable/12/sys/amd64/vmm/intel/vmx.c =================================================================== --- stable/12/sys/amd64/vmm/intel/vmx.c (revision 353104) +++ stable/12/sys/amd64/vmm/intel/vmx.c (revision 353105) @@ -1,3809 +1,3809 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * Copyright (c) 2018 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "vmm_lapic.h" #include "vmm_host.h" #include "vmm_ioport.h" #include "vmm_ktr.h" #include "vmm_stat.h" #include "vatpic.h" #include "vlapic.h" #include "vlapic_priv.h" #include "ept.h" #include "vmx_cpufunc.h" #include "vmx.h" #include "vmx_msr.h" #include "x86.h" #include "vmx_controls.h" #define PINBASED_CTLS_ONE_SETTING \ (PINBASED_EXTINT_EXITING | \ PINBASED_NMI_EXITING | \ PINBASED_VIRTUAL_NMI) #define PINBASED_CTLS_ZERO_SETTING 0 #define PROCBASED_CTLS_WINDOW_SETTING \ (PROCBASED_INT_WINDOW_EXITING | \ PROCBASED_NMI_WINDOW_EXITING) #define PROCBASED_CTLS_ONE_SETTING \ (PROCBASED_SECONDARY_CONTROLS | \ PROCBASED_MWAIT_EXITING | \ PROCBASED_MONITOR_EXITING | \ PROCBASED_IO_EXITING | \ PROCBASED_MSR_BITMAPS | \ PROCBASED_CTLS_WINDOW_SETTING | \ PROCBASED_CR8_LOAD_EXITING | \ PROCBASED_CR8_STORE_EXITING) #define PROCBASED_CTLS_ZERO_SETTING \ (PROCBASED_CR3_LOAD_EXITING | \ PROCBASED_CR3_STORE_EXITING | \ PROCBASED_IO_BITMAPS) #define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT #define PROCBASED_CTLS2_ZERO_SETTING 0 #define VM_EXIT_CTLS_ONE_SETTING \ (VM_EXIT_SAVE_DEBUG_CONTROLS | \ VM_EXIT_HOST_LMA | \ VM_EXIT_SAVE_EFER | \ VM_EXIT_LOAD_EFER | \ VM_EXIT_ACKNOWLEDGE_INTERRUPT) #define VM_EXIT_CTLS_ZERO_SETTING 0 #define VM_ENTRY_CTLS_ONE_SETTING \ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ VM_ENTRY_LOAD_EFER) #define VM_ENTRY_CTLS_ZERO_SETTING \ (VM_ENTRY_INTO_SMM | \ VM_ENTRY_DEACTIVATE_DUAL_MONITOR) #define HANDLED 1 #define UNHANDLED 0 static MALLOC_DEFINE(M_VMX, "vmx", "vmx"); static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; static uint32_t exit_ctls, entry_ctls; static uint64_t cr0_ones_mask, cr0_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD, &cr0_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD, &cr0_zeros_mask, 0, NULL); static uint64_t cr4_ones_mask, cr4_zeros_mask; SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD, &cr4_ones_mask, 0, NULL); SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD, &cr4_zeros_mask, 0, NULL); static int vmx_initialized; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, &vmx_initialized, 0, "Intel VMX initialized"); /* * Optional capabilities */ static SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); static int cap_halt_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, "HLT triggers a VM-exit"); static int cap_pause_exit; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, 0, "PAUSE triggers a VM-exit"); static int cap_unrestricted_guest; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, &cap_unrestricted_guest, 0, "Unrestricted guests"); static int cap_monitor_trap; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, &cap_monitor_trap, 0, "Monitor trap flag"); static int cap_invpcid; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, 0, "Guests are allowed to use INVPCID"); static int virtual_interrupt_delivery; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); static int posted_interrupts; SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, &posted_interrupts, 0, "APICv posted interrupt support"); static int pirvec = -1; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, &pirvec, 0, "APICv posted interrupt vector"); static struct unrhdr *vpid_unr; static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, &vpid_alloc_failed, 0, NULL); -static int guest_l1d_flush; +int guest_l1d_flush; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD, &guest_l1d_flush, 0, NULL); -static int guest_l1d_flush_sw; +int guest_l1d_flush_sw; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD, &guest_l1d_flush_sw, 0, NULL); static struct msr_entry msr_load_list[1] __aligned(16); /* * The definitions of SDT probes for VMX. */ SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, "struct vmx *", "int", "struct vm_exit *", "uint64_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, "struct vmx *", "int", "struct vm_exit *", "uint64_t"); SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, "struct vmx *", "int", "struct vm_exit *"); SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, "struct vmx *", "int", "struct vm_exit *", "uint32_t"); SDT_PROBE_DEFINE4(vmm, vmx, exit, return, "struct vmx *", "int", "struct vm_exit *", "int"); /* * Use the last page below 4GB as the APIC access address. This address is * occupied by the boot firmware so it is guaranteed that it will not conflict * with a page in system memory. */ #define APIC_ACCESS_ADDRESS 0xFFFFF000 static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); static void vmx_inject_pir(struct vlapic *vlapic); #ifdef KTR static const char * exit_reason_to_str(int reason) { static char reasonbuf[32]; switch (reason) { case EXIT_REASON_EXCEPTION: return "exception"; case EXIT_REASON_EXT_INTR: return "extint"; case EXIT_REASON_TRIPLE_FAULT: return "triplefault"; case EXIT_REASON_INIT: return "init"; case EXIT_REASON_SIPI: return "sipi"; case EXIT_REASON_IO_SMI: return "iosmi"; case EXIT_REASON_SMI: return "smi"; case EXIT_REASON_INTR_WINDOW: return "intrwindow"; case EXIT_REASON_NMI_WINDOW: return "nmiwindow"; case EXIT_REASON_TASK_SWITCH: return "taskswitch"; case EXIT_REASON_CPUID: return "cpuid"; case EXIT_REASON_GETSEC: return "getsec"; case EXIT_REASON_HLT: return "hlt"; case EXIT_REASON_INVD: return "invd"; case EXIT_REASON_INVLPG: return "invlpg"; case EXIT_REASON_RDPMC: return "rdpmc"; case EXIT_REASON_RDTSC: return "rdtsc"; case EXIT_REASON_RSM: return "rsm"; case EXIT_REASON_VMCALL: return "vmcall"; case EXIT_REASON_VMCLEAR: return "vmclear"; case EXIT_REASON_VMLAUNCH: return "vmlaunch"; case EXIT_REASON_VMPTRLD: return "vmptrld"; case EXIT_REASON_VMPTRST: return "vmptrst"; case EXIT_REASON_VMREAD: return "vmread"; case EXIT_REASON_VMRESUME: return "vmresume"; case EXIT_REASON_VMWRITE: return "vmwrite"; case EXIT_REASON_VMXOFF: return "vmxoff"; case EXIT_REASON_VMXON: return "vmxon"; case EXIT_REASON_CR_ACCESS: return "craccess"; case EXIT_REASON_DR_ACCESS: return "draccess"; case EXIT_REASON_INOUT: return "inout"; case EXIT_REASON_RDMSR: return "rdmsr"; case EXIT_REASON_WRMSR: return "wrmsr"; case EXIT_REASON_INVAL_VMCS: return "invalvmcs"; case EXIT_REASON_INVAL_MSR: return "invalmsr"; case EXIT_REASON_MWAIT: return "mwait"; case EXIT_REASON_MTF: return "mtf"; case EXIT_REASON_MONITOR: return "monitor"; case EXIT_REASON_PAUSE: return "pause"; case EXIT_REASON_MCE_DURING_ENTRY: return "mce-during-entry"; case EXIT_REASON_TPR: return "tpr"; case EXIT_REASON_APIC_ACCESS: return "apic-access"; case EXIT_REASON_GDTR_IDTR: return "gdtridtr"; case EXIT_REASON_LDTR_TR: return "ldtrtr"; case EXIT_REASON_EPT_FAULT: return "eptfault"; case EXIT_REASON_EPT_MISCONFIG: return "eptmisconfig"; case EXIT_REASON_INVEPT: return "invept"; case EXIT_REASON_RDTSCP: return "rdtscp"; case EXIT_REASON_VMX_PREEMPT: return "vmxpreempt"; case EXIT_REASON_INVVPID: return "invvpid"; case EXIT_REASON_WBINVD: return "wbinvd"; case EXIT_REASON_XSETBV: return "xsetbv"; case EXIT_REASON_APIC_WRITE: return "apic-write"; default: snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason); return (reasonbuf); } } #endif /* KTR */ static int vmx_allow_x2apic_msrs(struct vmx *vmx) { int i, error; error = 0; /* * Allow readonly access to the following x2APIC MSRs from the guest. */ error += guest_msr_ro(vmx, MSR_APIC_ID); error += guest_msr_ro(vmx, MSR_APIC_VERSION); error += guest_msr_ro(vmx, MSR_APIC_LDR); error += guest_msr_ro(vmx, MSR_APIC_SVR); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); error += guest_msr_ro(vmx, MSR_APIC_ESR); error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER); error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL); error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0); error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1); error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR); error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER); error += guest_msr_ro(vmx, MSR_APIC_ICR); /* * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest. * * These registers get special treatment described in the section * "Virtualizing MSR-Based APIC Accesses". */ error += guest_msr_rw(vmx, MSR_APIC_TPR); error += guest_msr_rw(vmx, MSR_APIC_EOI); error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI); return (error); } u_long vmx_fix_cr0(u_long cr0) { return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask); } u_long vmx_fix_cr4(u_long cr4) { return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask); } static void vpid_free(int vpid) { if (vpid < 0 || vpid > 0xffff) panic("vpid_free: invalid vpid %d", vpid); /* * VPIDs [0,VM_MAXCPU] are special and are not allocated from * the unit number allocator. */ if (vpid > VM_MAXCPU) free_unr(vpid_unr, vpid); } static void vpid_alloc(uint16_t *vpid, int num) { int i, x; if (num <= 0 || num > VM_MAXCPU) panic("invalid number of vpids requested: %d", num); /* * If the "enable vpid" execution control is not enabled then the * VPID is required to be 0 for all vcpus. */ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) { for (i = 0; i < num; i++) vpid[i] = 0; return; } /* * Allocate a unique VPID for each vcpu from the unit number allocator. */ for (i = 0; i < num; i++) { x = alloc_unr(vpid_unr); if (x == -1) break; else vpid[i] = x; } if (i < num) { atomic_add_int(&vpid_alloc_failed, 1); /* * If the unit number allocator does not have enough unique * VPIDs then we need to allocate from the [1,VM_MAXCPU] range. * * These VPIDs are not be unique across VMs but this does not * affect correctness because the combined mappings are also * tagged with the EP4TA which is unique for each VM. * * It is still sub-optimal because the invvpid will invalidate * combined mappings for a particular VPID across all EP4TAs. */ while (i-- > 0) vpid_free(vpid[i]); for (i = 0; i < num; i++) vpid[i] = i + 1; } } static void vpid_init(void) { /* * VPID 0 is required when the "enable VPID" execution control is * disabled. * * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the * unit number allocator does not have sufficient unique VPIDs to * satisfy the allocation. * * The remaining VPIDs are managed by the unit number allocator. */ vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); } static void vmx_disable(void *arg __unused) { struct invvpid_desc invvpid_desc = { 0 }; struct invept_desc invept_desc = { 0 }; if (vmxon_enabled[curcpu]) { /* * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b. * * VMXON or VMXOFF are not required to invalidate any TLB * caching structures. This prevents potential retention of * cached information in the TLB between distinct VMX episodes. */ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc); invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc); vmxoff(); } load_cr4(rcr4() & ~CR4_VMXE); } static int vmx_cleanup(void) { if (pirvec >= 0) lapic_ipi_free(pirvec); if (vpid_unr != NULL) { delete_unrhdr(vpid_unr); vpid_unr = NULL; } if (nmi_flush_l1d_sw == 1) nmi_flush_l1d_sw = 0; smp_rendezvous(NULL, vmx_disable, NULL, NULL); return (0); } static void vmx_enable(void *arg __unused) { int error; uint64_t feature_control; feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 || (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | IA32_FEATURE_CONTROL_VMX_EN | IA32_FEATURE_CONTROL_LOCK); } load_cr4(rcr4() | CR4_VMXE); *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); error = vmxon(vmxon_region[curcpu]); if (error == 0) vmxon_enabled[curcpu] = 1; } static void vmx_restore(void) { if (vmxon_enabled[curcpu]) vmxon(vmxon_region[curcpu]); } static int vmx_init(int ipinum) { int error, use_tpr_shadow; uint64_t basic, fixed0, fixed1, feature_control; uint32_t tmp, procbased2_vid_bits; /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ if (!(cpu_feature2 & CPUID2_VMX)) { printf("vmx_init: processor does not support VMX operation\n"); return (ENXIO); } /* * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits * are set (bits 0 and 2 respectively). */ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL); if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 && (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) { printf("vmx_init: VMX operation disabled by BIOS\n"); return (ENXIO); } /* * Verify capabilities MSR_VMX_BASIC: * - bit 54 indicates support for INS/OUTS decoding */ basic = rdmsr(MSR_VMX_BASIC); if ((basic & (1UL << 54)) == 0) { printf("vmx_init: processor does not support desired basic " "capabilities\n"); return (EINVAL); } /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING, PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls); if (error) { printf("vmx_init: processor does not support desired primary " "processor-based controls\n"); return (error); } /* Clear the processor-based ctl bits that are set on demand */ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING; /* Check support for secondary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING, PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2); if (error) { printf("vmx_init: processor does not support desired secondary " "processor-based controls\n"); return (error); } /* Check support for VPID */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_VPID, 0, &tmp); if (error == 0) procbased_ctls2 |= PROCBASED2_ENABLE_VPID; /* Check support for pin-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING, PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls); if (error) { printf("vmx_init: processor does not support desired " "pin-based controls\n"); return (error); } /* Check support for VM-exit controls */ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &exit_ctls); if (error) { printf("vmx_init: processor does not support desired " "exit controls\n"); return (error); } /* Check support for VM-entry controls */ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &entry_ctls); if (error) { printf("vmx_init: processor does not support desired " "entry controls\n"); return (error); } /* * Check support for optional features by testing them * as individual bits */ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_HLT_EXITING, 0, &tmp) == 0); cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_PROCBASED_CTLS, PROCBASED_MTF, 0, &tmp) == 0); cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_PAUSE_EXITING, 0, &tmp) == 0); cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp) == 0); cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, &tmp) == 0); /* * Check support for virtual interrupt delivery. */ procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | PROCBASED2_VIRTUALIZE_X2APIC_MODE | PROCBASED2_APIC_REGISTER_VIRTUALIZATION | PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, &tmp) == 0); error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, procbased2_vid_bits, 0, &tmp); if (error == 0 && use_tpr_shadow) { virtual_interrupt_delivery = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", &virtual_interrupt_delivery); } if (virtual_interrupt_delivery) { procbased_ctls |= PROCBASED_USE_TPR_SHADOW; procbased_ctls2 |= procbased2_vid_bits; procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; /* * No need to emulate accesses to %CR8 if virtual * interrupt delivery is enabled. */ procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; /* * Check for Posted Interrupts only if Virtual Interrupt * Delivery is enabled. */ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, &tmp); if (error == 0) { pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : &IDTVEC(justreturn)); if (pirvec < 0) { if (bootverbose) { printf("vmx_init: unable to allocate " "posted interrupt vector\n"); } } else { posted_interrupts = 1; TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", &posted_interrupts); } } } if (posted_interrupts) pinbased_ctls |= PINBASED_POSTED_INTERRUPT; /* Initialize EPT */ error = ept_init(ipinum); if (error) { printf("vmx_init: ept initialization failed (%d)\n", error); return (error); } guest_l1d_flush = (cpu_ia32_arch_caps & IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0; TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); /* * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when * available. Otherwise fall back to the software flush * method which loads enough data from the kernel text to * flush existing L1D content, both on VMX entry and on NMI * return. */ if (guest_l1d_flush) { if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { guest_l1d_flush_sw = 1; TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", &guest_l1d_flush_sw); } if (guest_l1d_flush_sw) { if (nmi_flush_l1d_sw <= 1) nmi_flush_l1d_sw = 1; } else { msr_load_list[0].index = MSR_IA32_FLUSH_CMD; msr_load_list[0].val = IA32_FLUSH_CMD_L1D; } } /* * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 */ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0); fixed1 = rdmsr(MSR_VMX_CR0_FIXED1); cr0_ones_mask = fixed0 & fixed1; cr0_zeros_mask = ~fixed0 & ~fixed1; /* * CR0_PE and CR0_PG can be set to zero in VMX non-root operation * if unrestricted guest execution is allowed. */ if (cap_unrestricted_guest) cr0_ones_mask &= ~(CR0_PG | CR0_PE); /* * Do not allow the guest to set CR0_NW or CR0_CD. */ cr0_zeros_mask |= (CR0_NW | CR0_CD); fixed0 = rdmsr(MSR_VMX_CR4_FIXED0); fixed1 = rdmsr(MSR_VMX_CR4_FIXED1); cr4_ones_mask = fixed0 & fixed1; cr4_zeros_mask = ~fixed0 & ~fixed1; vpid_init(); vmx_msr_init(); /* enable VMX operation */ smp_rendezvous(NULL, vmx_enable, NULL, NULL); vmx_initialized = 1; return (0); } static void vmx_trigger_hostintr(int vector) { uintptr_t func; struct gate_descriptor *gd; gd = &idt[vector]; KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " "invalid vector %d", vector)); KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", vector)); KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " "has invalid type %d", vector, gd->gd_type)); KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " "has invalid dpl %d", vector, gd->gd_dpl)); KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " "for vector %d has invalid selector %d", vector, gd->gd_selector)); KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " "IST %d", vector, gd->gd_ist)); func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); vmx_call_isr(func); } static int vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) { int error, mask_ident, shadow_ident; uint64_t mask_value; if (which != 0 && which != 4) panic("vmx_setup_cr_shadow: unknown cr%d", which); if (which == 0) { mask_ident = VMCS_CR0_MASK; mask_value = cr0_ones_mask | cr0_zeros_mask; shadow_ident = VMCS_CR0_SHADOW; } else { mask_ident = VMCS_CR4_MASK; mask_value = cr4_ones_mask | cr4_zeros_mask; shadow_ident = VMCS_CR4_SHADOW; } error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value); if (error) return (error); error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial); if (error) return (error); return (0); } #define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init)) #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) static void * vmx_vminit(struct vm *vm, pmap_t pmap) { uint16_t vpid[VM_MAXCPU]; int i, error; struct vmx *vmx; struct vmcs *vmcs; uint32_t exc_bitmap; uint16_t maxcpus; vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); if ((uintptr_t)vmx & PAGE_MASK) { panic("malloc of struct vmx not aligned on %d byte boundary", PAGE_SIZE); } vmx->vm = vm; vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); /* * Clean up EPTP-tagged guest physical and combined mappings * * VMX transitions are not required to invalidate any guest physical * mappings. So, it may be possible for stale guest physical mappings * to be present in the processor TLBs. * * Combined mappings for this EP4TA are also invalidated for all VPIDs. */ ept_invalidate_mappings(vmx->eptp); msr_bitmap_initialize(vmx->msr_bitmap); /* * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. * The guest FSBASE and GSBASE are saved and restored during * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are * always restored from the vmcs host state area on vm-exit. * * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in * how they are saved/restored so can be directly accessed by the * guest. * * MSR_EFER is saved and restored in the guest VMCS area on a * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. * * The TSC MSR is exposed read-only. Writes are disallowed as * that will impact the host TSC. If the guest does a write * the "use TSC offsetting" execution control is enabled and the * difference between the host TSC and the guest TSC is written * into the TSC offset in the VMCS. */ if (guest_msr_rw(vmx, MSR_GSBASE) || guest_msr_rw(vmx, MSR_FSBASE) || guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_EFER) || guest_msr_ro(vmx, MSR_TSC)) panic("vmx_vminit: error setting guest msr access"); vpid_alloc(vpid, VM_MAXCPU); if (virtual_interrupt_delivery) { error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, APIC_ACCESS_ADDRESS); /* XXX this should really return an error to the caller */ KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); } maxcpus = vm_get_maxcpus(vm); for (i = 0; i < maxcpus; i++) { vmcs = &vmx->vmcs[i]; vmcs->identifier = vmx_revision(); error = vmclear(vmcs); if (error != 0) { panic("vmx_vminit: vmclear error %d on vcpu %d\n", error, i); } vmx_msr_guest_init(vmx, i); error = vmcs_init(vmcs); KASSERT(error == 0, ("vmcs_init error %d", error)); VMPTRLD(vmcs); error = 0; error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); error += vmwrite(VMCS_EPTP, vmx->eptp); error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); error += vmwrite(VMCS_VPID, vpid[i]); if (guest_l1d_flush && !guest_l1d_flush_sw) { vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( (vm_offset_t)&msr_load_list[0])); vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, nitems(msr_load_list)); vmcs_write(VMCS_EXIT_MSR_STORE, 0); vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); } /* exception bitmap */ if (vcpu_trace_exceptions(vm, i)) exc_bitmap = 0xffffffff; else exc_bitmap = 1 << IDT_MC; error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); if (virtual_interrupt_delivery) { error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); error += vmwrite(VMCS_VIRTUAL_APIC, vtophys(&vmx->apic_page[i])); error += vmwrite(VMCS_EOI_EXIT0, 0); error += vmwrite(VMCS_EOI_EXIT1, 0); error += vmwrite(VMCS_EOI_EXIT2, 0); error += vmwrite(VMCS_EOI_EXIT3, 0); } if (posted_interrupts) { error += vmwrite(VMCS_PIR_VECTOR, pirvec); error += vmwrite(VMCS_PIR_DESC, vtophys(&vmx->pir_desc[i])); } VMCLEAR(vmcs); KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); vmx->cap[i].set = 0; vmx->cap[i].proc_ctls = procbased_ctls; vmx->cap[i].proc_ctls2 = procbased_ctls2; vmx->state[i].nextrip = ~0; vmx->state[i].lastcpu = NOCPU; vmx->state[i].vpid = vpid[i]; /* * Set up the CR0/4 shadows, and init the read shadow * to the power-on register value from the Intel Sys Arch. * CR0 - 0x60000010 * CR4 - 0 */ error = vmx_setup_cr0_shadow(vmcs, 0x60000010); if (error != 0) panic("vmx_setup_cr0_shadow %d", error); error = vmx_setup_cr4_shadow(vmcs, 0); if (error != 0) panic("vmx_setup_cr4_shadow %d", error); vmx->ctx[i].pmap = pmap; } return (vmx); } static int vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) { int handled, func; func = vmxctx->guest_rax; handled = x86_emulate_cpuid(vm, vcpu, (uint32_t*)(&vmxctx->guest_rax), (uint32_t*)(&vmxctx->guest_rbx), (uint32_t*)(&vmxctx->guest_rcx), (uint32_t*)(&vmxctx->guest_rdx)); return (handled); } static __inline void vmx_run_trace(struct vmx *vmx, int vcpu) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip()); #endif } static __inline void vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, int handled) { #ifdef KTR VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx", handled ? "handled" : "unhandled", exit_reason_to_str(exit_reason), rip); #endif } static __inline void vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) { #ifdef KTR VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip); #endif } static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); /* * Invalidate guest mappings identified by its vpid from the TLB. */ static __inline void vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) { struct vmxstate *vmxstate; struct invvpid_desc invvpid_desc; vmxstate = &vmx->state[vcpu]; if (vmxstate->vpid == 0) return; if (!running) { /* * Set the 'lastcpu' to an invalid host cpu. * * This will invalidate TLB entries tagged with the vcpu's * vpid the next time it runs via vmx_set_pcpu_defaults(). */ vmxstate->lastcpu = NOCPU; return; } KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " "critical section", __func__, vcpu)); /* * Invalidate all mappings tagged with 'vpid' * * We do this because this vcpu was executing on a different host * cpu when it last ran. We do not track whether it invalidated * mappings associated with its 'vpid' during that run. So we must * assume that the mappings associated with 'vpid' on 'curcpu' are * stale and invalidate them. * * Note that we incur this penalty only when the scheduler chooses to * move the thread associated with this vcpu between host cpus. * * Note also that this will invalidate mappings tagged with 'vpid' * for "all" EP4TAs. */ if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { invvpid_desc._res1 = 0; invvpid_desc._res2 = 0; invvpid_desc.vpid = vmxstate->vpid; invvpid_desc.linear_addr = 0; invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); } else { /* * The invvpid can be skipped if an invept is going to * be performed before entering the guest. The invept * will invalidate combined mappings tagged with * 'vmx->eptp' for all vpids. */ vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); } } static void vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) { struct vmxstate *vmxstate; vmxstate = &vmx->state[vcpu]; if (vmxstate->lastcpu == curcpu) return; vmxstate->lastcpu = curcpu; vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); vmx_invvpid(vmx, vcpu, pmap, 1); } /* * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. */ CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); static void __inline vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting"); } } static void __inline vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); } static void __inline vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); } } static void __inline vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) { KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); } int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) { int error; if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) { vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting"); } error = vmwrite(VMCS_TSC_OFFSET, offset); return (error); } #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) static void vmx_inject_nmi(struct vmx *vmx, int vcpu) { uint32_t gi, info; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " "interruptibility-state %#x", gi)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " "VM-entry interruption information %#x", info)); /* * Inject the virtual NMI. The vector must be the NMI IDT entry * or the VMCS entry check will fail. */ info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; vmcs_write(VMCS_ENTRY_INTR_INFO, info); VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); /* Clear the request */ vm_nmi_clear(vmx->vm, vcpu); } static void vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, uint64_t guestrip) { int vector, need_nmi_exiting, extint_pending; uint64_t rflags, entryinfo; uint32_t gi, info; if (vmx->state[vcpu].nextrip != guestrip) { gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " "cleared due to rip change: %#lx/%#lx", vmx->state[vcpu].nextrip, guestrip); gi &= ~HWINTR_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } } if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " "intinfo is not valid: %#lx", __func__, entryinfo)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " "pending exception: %#lx/%#x", __func__, entryinfo, info)); info = entryinfo; vector = info & 0xff; if (vector == IDT_BP || vector == IDT_OF) { /* * VT-x requires #BP and #OF to be injected as software * exceptions. */ info &= ~VMCS_INTR_T_MASK; info |= VMCS_INTR_T_SWEXCEPTION; } if (info & VMCS_INTR_DEL_ERRCODE) vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); vmcs_write(VMCS_ENTRY_INTR_INFO, info); } if (vm_nmi_pending(vmx->vm, vcpu)) { /* * If there are no conditions blocking NMI injection then * inject it directly here otherwise enable "NMI window * exiting" to inject it as soon as we can. * * We also check for STI_BLOCKING because some implementations * don't allow NMI injection in this case. If we are running * on a processor that doesn't have this restriction it will * immediately exit and the NMI will be injected in the * "NMI window exiting" handler. */ need_nmi_exiting = 1; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { info = vmcs_read(VMCS_ENTRY_INTR_INFO); if ((info & VMCS_INTR_VALID) == 0) { vmx_inject_nmi(vmx, vcpu); need_nmi_exiting = 0; } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " "due to VM-entry intr info %#x", info); } } else { VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " "Guest Interruptibility-state %#x", gi); } if (need_nmi_exiting) vmx_set_nmi_window_exiting(vmx, vcpu); } extint_pending = vm_extint_pending(vmx->vm, vcpu); if (!extint_pending && virtual_interrupt_delivery) { vmx_inject_pir(vlapic); return; } /* * If interrupt-window exiting is already in effect then don't bother * checking for pending interrupts. This is just an optimization and * not needed for correctness. */ if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) { VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to " "pending int_window_exiting"); return; } if (!extint_pending) { /* Ask the local apic for a vector to inject */ if (!vlapic_pending_intr(vlapic, &vector)) return; /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [16,255] can be delivered * through the local APIC. */ KASSERT(vector >= 16 && vector <= 255, ("invalid vector %d from local APIC", vector)); } else { /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(vmx->vm, &vector); /* * From the Intel SDM, Volume 3, Section "Maskable * Hardware Interrupts": * - maskable interrupt vectors [0,255] can be delivered * through the INTR pin. */ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); } /* Check RFLAGS.IF and the interruptibility state of the guest */ rflags = vmcs_read(VMCS_GUEST_RFLAGS); if ((rflags & PSL_I) == 0) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "rflags %#lx", vector, rflags); goto cantinject; } gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); if (gi & HWINTR_BLOCKING) { VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "Guest Interruptibility-state %#x", vector, gi); goto cantinject; } info = vmcs_read(VMCS_ENTRY_INTR_INFO); if (info & VMCS_INTR_VALID) { /* * This is expected and could happen for multiple reasons: * - A vectoring VM-entry was aborted due to astpending * - A VM-exit happened during event injection. * - An exception was injected above. * - An NMI was injected above or after "NMI window exiting" */ VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " "VM-entry intr info %#x", vector, info); goto cantinject; } /* Inject the interrupt */ info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; info |= vector; vmcs_write(VMCS_ENTRY_INTR_INFO, info); if (!extint_pending) { /* Update the Local APIC ISR */ vlapic_intr_accepted(vlapic, vector); } else { vm_extint_clear(vmx->vm, vcpu); vatpic_intr_accepted(vmx->vm, vector); /* * After we accepted the current ExtINT the PIC may * have posted another one. If that is the case, set * the Interrupt Window Exiting execution control so * we can inject that one too. * * Also, interrupt window exiting allows us to inject any * pending APIC vector that was preempted by the ExtINT * as soon as possible. This applies both for the software * emulated vlapic and the hardware assisted virtual APIC. */ vmx_set_int_window_exiting(vmx, vcpu); } VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); return; cantinject: /* * Set the Interrupt Window Exiting execution control so we can inject * the interrupt as soon as blocking condition goes away. */ vmx_set_int_window_exiting(vmx, vcpu); } /* * If the Virtual NMIs execution control is '1' then the logical processor * tracks virtual-NMI blocking in the Guest Interruptibility-state field of * the VMCS. An IRET instruction in VMX non-root operation will remove any * virtual-NMI blocking. * * This unblocking occurs even if the IRET causes a fault. In this case the * hypervisor needs to restore virtual-NMI blocking before resuming the guest. */ static void vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking"); gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } static void vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) { uint32_t gi; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, ("NMI blocking is not in effect %#x", gi)); } static int vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { struct vmxctx *vmxctx; uint64_t xcrval; const struct xsave_limits *limits; vmxctx = &vmx->ctx[vcpu]; limits = vmm_get_xsave_limits(); /* * Note that the processor raises a GP# fault on its own if * xsetbv is executed for CPL != 0, so we do not have to * emulate that fault here. */ /* Only xcr0 is supported. */ if (vmxctx->guest_rcx != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { vm_inject_ud(vmx->vm, vcpu); return (HANDLED); } xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); if ((xcrval & ~limits->xcr0_allowed) != 0) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } if (!(xcrval & XFEATURE_ENABLED_X87)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* AVX (YMM_Hi128) requires SSE. */ if (xcrval & XFEATURE_ENABLED_AVX && (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, * ZMM_Hi256, and Hi16_ZMM. */ if (xcrval & XFEATURE_AVX512 && (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != (XFEATURE_AVX512 | XFEATURE_AVX)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * Intel MPX requires both bound register state flags to be * set. */ if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { vm_inject_gp(vmx->vm, vcpu); return (HANDLED); } /* * This runs "inside" vmrun() with the guest's FPU state, so * modifying xcr0 directly modifies the guest's xcr0, not the * host's. */ load_xcr(0, xcrval); return (HANDLED); } static uint64_t vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) { const struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: return (vmxctx->guest_rax); case 1: return (vmxctx->guest_rcx); case 2: return (vmxctx->guest_rdx); case 3: return (vmxctx->guest_rbx); case 4: return (vmcs_read(VMCS_GUEST_RSP)); case 5: return (vmxctx->guest_rbp); case 6: return (vmxctx->guest_rsi); case 7: return (vmxctx->guest_rdi); case 8: return (vmxctx->guest_r8); case 9: return (vmxctx->guest_r9); case 10: return (vmxctx->guest_r10); case 11: return (vmxctx->guest_r11); case 12: return (vmxctx->guest_r12); case 13: return (vmxctx->guest_r13); case 14: return (vmxctx->guest_r14); case 15: return (vmxctx->guest_r15); default: panic("invalid vmx register %d", ident); } } static void vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval) { struct vmxctx *vmxctx; vmxctx = &vmx->ctx[vcpu]; switch (ident) { case 0: vmxctx->guest_rax = regval; break; case 1: vmxctx->guest_rcx = regval; break; case 2: vmxctx->guest_rdx = regval; break; case 3: vmxctx->guest_rbx = regval; break; case 4: vmcs_write(VMCS_GUEST_RSP, regval); break; case 5: vmxctx->guest_rbp = regval; break; case 6: vmxctx->guest_rsi = regval; break; case 7: vmxctx->guest_rdi = regval; break; case 8: vmxctx->guest_r8 = regval; break; case 9: vmxctx->guest_r9 = regval; break; case 10: vmxctx->guest_r10 = regval; break; case 11: vmxctx->guest_r11 = regval; break; case 12: vmxctx->guest_r12 = regval; break; case 13: vmxctx->guest_r13 = regval; break; case 14: vmxctx->guest_r14 = regval; break; case 15: vmxctx->guest_r15 = regval; break; default: panic("invalid vmx register %d", ident); } } static int vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr0 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR0_SHADOW, regval); crval = regval | cr0_ones_mask; crval &= ~cr0_zeros_mask; vmcs_write(VMCS_GUEST_CR0, crval); if (regval & CR0_PG) { uint64_t efer, entry_ctls; /* * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and * the "IA-32e mode guest" bit in VM-entry control must be * equal. */ efer = vmcs_read(VMCS_GUEST_IA32_EFER); if (efer & EFER_LME) { efer |= EFER_LMA; vmcs_write(VMCS_GUEST_IA32_EFER, efer); entry_ctls = vmcs_read(VMCS_ENTRY_CTLS); entry_ctls |= VM_ENTRY_GUEST_LMA; vmcs_write(VMCS_ENTRY_CTLS, entry_ctls); } } return (HANDLED); } static int vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { uint64_t crval, regval; /* We only handle mov to %cr4 at this time */ if ((exitqual & 0xf0) != 0x00) return (UNHANDLED); regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf); vmcs_write(VMCS_CR4_SHADOW, regval); crval = regval | cr4_ones_mask; crval &= ~cr4_zeros_mask; vmcs_write(VMCS_GUEST_CR4, crval); return (HANDLED); } static int vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual) { struct vlapic *vlapic; uint64_t cr8; int regnum; /* We only handle mov %cr8 to/from a register at this time. */ if ((exitqual & 0xe0) != 0x00) { return (UNHANDLED); } vlapic = vm_lapic(vmx->vm, vcpu); regnum = (exitqual >> 8) & 0xf; if (exitqual & 0x10) { cr8 = vlapic_get_cr8(vlapic); vmx_set_guest_reg(vmx, vcpu, regnum, cr8); } else { cr8 = vmx_get_guest_reg(vmx, vcpu, regnum); vlapic_set_cr8(vlapic, cr8); } return (HANDLED); } /* * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL */ static int vmx_cpl(void) { uint32_t ssar; ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS); return ((ssar >> 5) & 0x3); } static enum vm_cpu_mode vmx_cpu_mode(void) { uint32_t csar; if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) { csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); if (csar & 0x2000) return (CPU_MODE_64BIT); /* CS.L = 1 */ else return (CPU_MODE_COMPATIBILITY); } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) { return (CPU_MODE_PROTECTED); } else { return (CPU_MODE_REAL); } } static enum vm_paging_mode vmx_paging_mode(void) { if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG)) return (PAGING_MODE_FLAT); if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE)) return (PAGING_MODE_32); if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME) return (PAGING_MODE_64); else return (PAGING_MODE_PAE); } static uint64_t inout_str_index(struct vmx *vmx, int vcpuid, int in) { uint64_t val; int error; enum vm_reg_name reg; reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; error = vmx_getreg(vmx, vcpuid, reg, &val); KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); return (val); } static uint64_t inout_str_count(struct vmx *vmx, int vcpuid, int rep) { uint64_t val; int error; if (rep) { error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); } else { val = 1; } return (val); } static int inout_str_addrsize(uint32_t inst_info) { uint32_t size; size = (inst_info >> 7) & 0x7; switch (size) { case 0: return (2); /* 16 bit */ case 1: return (4); /* 32 bit */ case 2: return (8); /* 64 bit */ default: panic("%s: invalid size encoding %d", __func__, size); } } static void inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, struct vm_inout_str *vis) { int error, s; if (in) { vis->seg_name = VM_REG_GUEST_ES; } else { s = (inst_info >> 15) & 0x7; vis->seg_name = vm_segment_name(s); } error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); } static void vmx_paging_info(struct vm_guest_paging *paging) { paging->cr3 = vmcs_guest_cr3(); paging->cpl = vmx_cpl(); paging->cpu_mode = vmx_cpu_mode(); paging->paging_mode = vmx_paging_mode(); } static void vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) { struct vm_guest_paging *paging; uint32_t csar; paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; vmexit->inst_length = 0; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = gla; vmx_paging_info(paging); switch (paging->cpu_mode) { case CPU_MODE_REAL: vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); vmexit->u.inst_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); break; default: vmexit->u.inst_emul.cs_base = 0; vmexit->u.inst_emul.cs_d = 0; break; } vie_init(&vmexit->u.inst_emul.vie, NULL, 0); } static int ept_fault_type(uint64_t ept_qual) { int fault_type; if (ept_qual & EPT_VIOLATION_DATA_WRITE) fault_type = VM_PROT_WRITE; else if (ept_qual & EPT_VIOLATION_INST_FETCH) fault_type = VM_PROT_EXECUTE; else fault_type= VM_PROT_READ; return (fault_type); } static bool ept_emulation_fault(uint64_t ept_qual) { int read, write; /* EPT fault on an instruction fetch doesn't make sense here */ if (ept_qual & EPT_VIOLATION_INST_FETCH) return (false); /* EPT fault must be a read fault or a write fault */ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0; write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0; if ((read | write) == 0) return (false); /* * The EPT violation must have been caused by accessing a * guest-physical address that is a translation of a guest-linear * address. */ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 || (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) { return (false); } return (true); } static __inline int apic_access_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); } static __inline int x2apic_virtualization(struct vmx *vmx, int vcpuid) { uint32_t proc_ctls2; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); } static int vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, uint64_t qual) { int error, handled, offset; uint32_t *apic_regs, vector; bool retu; handled = HANDLED; offset = APIC_WRITE_OFFSET(qual); if (!apic_access_virtualization(vmx, vcpuid)) { /* * In general there should not be any APIC write VM-exits * unless APIC-access virtualization is enabled. * * However self-IPI virtualization can legitimately trigger * an APIC-write VM-exit so treat it specially. */ if (x2apic_virtualization(vmx, vcpuid) && offset == APIC_OFFSET_SELF_IPI) { apic_regs = (uint32_t *)(vlapic->apic_page); vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; vlapic_self_ipi_handler(vlapic, vector); return (HANDLED); } else return (UNHANDLED); } switch (offset) { case APIC_OFFSET_ID: vlapic_id_write_handler(vlapic); break; case APIC_OFFSET_LDR: vlapic_ldr_write_handler(vlapic); break; case APIC_OFFSET_DFR: vlapic_dfr_write_handler(vlapic); break; case APIC_OFFSET_SVR: vlapic_svr_write_handler(vlapic); break; case APIC_OFFSET_ESR: vlapic_esr_write_handler(vlapic); break; case APIC_OFFSET_ICR_LOW: retu = false; error = vlapic_icrlo_write_handler(vlapic, &retu); if (error != 0 || retu) handled = UNHANDLED; break; case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT: vlapic_lvt_write_handler(vlapic, offset); break; case APIC_OFFSET_TIMER_ICR: vlapic_icrtmr_write_handler(vlapic); break; case APIC_OFFSET_TIMER_DCR: vlapic_dcr_write_handler(vlapic); break; default: handled = UNHANDLED; break; } return (handled); } static bool apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) { if (apic_access_virtualization(vmx, vcpuid) && (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) return (true); else return (false); } static int vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint64_t qual; int access_type, offset, allowed; if (!apic_access_virtualization(vmx, vcpuid)) return (UNHANDLED); qual = vmexit->u.vmx.exit_qualification; access_type = APIC_ACCESS_TYPE(qual); offset = APIC_ACCESS_OFFSET(qual); allowed = 0; if (access_type == 0) { /* * Read data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } else if (access_type == 1) { /* * Write data access to the following registers is expected. */ switch (offset) { case APIC_OFFSET_VER: case APIC_OFFSET_APR: case APIC_OFFSET_PPR: case APIC_OFFSET_RRR: case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: case APIC_OFFSET_CMCI_LVT: case APIC_OFFSET_TIMER_CCR: allowed = 1; break; default: break; } } if (allowed) { vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, VIE_INVALID_GLA); } /* * Regardless of whether the APIC-access is allowed this handler * always returns UNHANDLED: * - if the access is allowed then it is handled by emulating the * instruction that caused the VM-exit (outside the critical section) * - if the access is not allowed then it will be converted to an * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. */ return (UNHANDLED); } static enum task_switch_reason vmx_task_switch_reason(uint64_t qual) { int reason; reason = (qual >> 30) & 0x3; switch (reason) { case 0: return (TSR_CALL); case 1: return (TSR_IRET); case 2: return (TSR_JMP); case 3: return (TSR_IDT_GATE); default: panic("%s: invalid reason %d", __func__, reason); } } static int emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { int error; if (lapic_msr(num)) error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu); else error = vmx_wrmsr(vmx, vcpuid, num, val, retu); return (error); } static int emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) { struct vmxctx *vmxctx; uint64_t result; uint32_t eax, edx; int error; if (lapic_msr(num)) error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu); else error = vmx_rdmsr(vmx, vcpuid, num, &result, retu); if (error == 0) { eax = result; vmxctx = &vmx->ctx[vcpuid]; error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax); KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error)); edx = result >> 32; error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx); KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error)); } return (error); } static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { int error, errcode, errcode_valid, handled, in; struct vmxctx *vmxctx; struct vlapic *vlapic; struct vm_inout_str *vis; struct vm_task_switch *ts; uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; uint32_t intr_type, intr_vec, reason; uint64_t exitintinfo, qual, gpa; bool retu; CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); handled = UNHANDLED; vmxctx = &vmx->ctx[vcpu]; qual = vmexit->u.vmx.exit_qualification; reason = vmexit->u.vmx.exit_reason; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); /* * VM-entry failures during or after loading guest state. * * These VM-exits are uncommon but must be handled specially * as most VM-exit fields are not populated as usual. */ if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); __asm __volatile("int $18"); return (1); } /* * VM exits that can be triggered during event delivery need to * be handled specially by re-injecting the event if the IDT * vectoring information field's valid bit is set. * * See "Information for VM Exits During Event Delivery" in Intel SDM * for details. */ idtvec_info = vmcs_idt_vectoring_info(); if (idtvec_info & VMCS_IDT_VEC_VALID) { idtvec_info &= ~(1 << 12); /* clear undefined bit */ exitintinfo = idtvec_info; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { idtvec_err = vmcs_idt_vectoring_err(); exitintinfo |= (uint64_t)idtvec_err << 32; } error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); KASSERT(error == 0, ("%s: vm_set_intinfo error %d", __func__, error)); /* * If 'virtual NMIs' are being used and the VM-exit * happened while injecting an NMI during the previous * VM-entry, then clear "blocking by NMI" in the * Guest Interruptibility-State so the NMI can be * reinjected on the subsequent VM-entry. * * However, if the NMI was being delivered through a task * gate, then the new task must start execution with NMIs * blocked so don't clear NMI blocking in this case. */ intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type == VMCS_INTR_T_NMI) { if (reason != EXIT_REASON_TASK_SWITCH) vmx_clear_nmi_blocking(vmx, vcpu); else vmx_assert_nmi_blocking(vmx, vcpu); } /* * Update VM-entry instruction length if the event being * delivered was a software interrupt or software exception. */ if (intr_type == VMCS_INTR_T_SWINTR || intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || intr_type == VMCS_INTR_T_SWEXCEPTION) { vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); } } switch (reason) { case EXIT_REASON_TASK_SWITCH: ts = &vmexit->u.task_switch; ts->tsssel = qual & 0xffff; ts->reason = vmx_task_switch_reason(qual); ts->ext = 0; ts->errcode_valid = 0; vmx_paging_info(&ts->paging); /* * If the task switch was due to a CALL, JMP, IRET, software * interrupt (INT n) or software exception (INT3, INTO), * then the saved %rip references the instruction that caused * the task switch. The instruction length field in the VMCS * is valid in this case. * * In all other cases (e.g., NMI, hardware exception) the * saved %rip is one that would have been saved in the old TSS * had the task switch completed normally so the instruction * length field is not needed in this case and is explicitly * set to 0. */ if (ts->reason == TSR_IDT_GATE) { KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, ("invalid idtvec_info %#x for IDT task switch", idtvec_info)); intr_type = idtvec_info & VMCS_INTR_T_MASK; if (intr_type != VMCS_INTR_T_SWINTR && intr_type != VMCS_INTR_T_SWEXCEPTION && intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { /* Task switch triggered by external event */ ts->ext = 1; vmexit->inst_length = 0; if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { ts->errcode_valid = 1; ts->errcode = vmcs_idt_vectoring_err(); } } } vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " "%s errcode 0x%016lx", ts->reason, ts->tsssel, ts->ext ? "external" : "internal", ((uint64_t)ts->errcode << 32) | ts->errcode_valid); break; case EXIT_REASON_CR_ACCESS: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); switch (qual & 0xf) { case 0: handled = vmx_emulate_cr0_access(vmx, vcpu, qual); break; case 4: handled = vmx_emulate_cr4_access(vmx, vcpu, qual); break; case 8: handled = vmx_emulate_cr8_access(vmx, vcpu, qual); break; } break; case EXIT_REASON_RDMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1); retu = false; ecx = vmxctx->guest_rcx; VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); error = emulate_rdmsr(vmx, vcpu, ecx, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; vmexit->u.msr.code = ecx; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_rdmsr retu with bogus exitcode")); } break; case EXIT_REASON_WRMSR: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1); retu = false; eax = vmxctx->guest_rax; ecx = vmxctx->guest_rcx; edx = vmxctx->guest_rdx; VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", ecx, (uint64_t)edx << 32 | eax); SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, (uint64_t)edx << 32 | eax); error = emulate_wrmsr(vmx, vcpu, ecx, (uint64_t)edx << 32 | eax, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_WRMSR; vmexit->u.msr.code = ecx; vmexit->u.msr.wval = (uint64_t)edx << 32 | eax; } else if (!retu) { handled = HANDLED; } else { /* Return to userspace with a valid exitcode */ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, ("emulate_wrmsr retu with bogus exitcode")); } break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); if (virtual_interrupt_delivery) vmexit->u.hlt.intr_status = vmcs_read(VMCS_GUEST_INTR_STATUS); else vmexit->u.hlt.intr_status = 0; break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MTRAP; vmexit->inst_length = 0; break; case EXIT_REASON_PAUSE: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_PAUSE; break; case EXIT_REASON_INTR_WINDOW: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); vmx_clear_int_window_exiting(vmx, vcpu); return (1); case EXIT_REASON_EXT_INTR: /* * External interrupts serve only to cause VM exits and allow * the host interrupt handler to run. * * If this external interrupt triggers a virtual interrupt * to a VM, then that state will be recorded by the * host interrupt handler in the VM's softc. We will inject * this virtual interrupt during the subsequent VM enter. */ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); SDT_PROBE4(vmm, vmx, exit, interrupt, vmx, vcpu, vmexit, intr_info); /* * XXX: Ignore this exit if VMCS_INTR_VALID is not set. * This appears to be a bug in VMware Fusion? */ if (!(intr_info & VMCS_INTR_VALID)) return (1); KASSERT((intr_info & VMCS_INTR_VALID) != 0 && (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, ("VM exit interruption info invalid: %#x", intr_info)); vmx_trigger_hostintr(intr_info & 0xff); /* * This is special. We want to treat this as an 'handled' * VM-exit but not increment the instruction pointer. */ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); return (1); case EXIT_REASON_NMI_WINDOW: SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); /* Exit to allow the pending virtual NMI to be injected */ if (vm_nmi_pending(vmx->vm, vcpu)) vmx_inject_nmi(vmx, vcpu); vmx_clear_nmi_window_exiting(vmx, vcpu); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1); return (1); case EXIT_REASON_INOUT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); vmexit->exitcode = VM_EXITCODE_INOUT; vmexit->u.inout.bytes = (qual & 0x7) + 1; vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0; vmexit->u.inout.port = (uint16_t)(qual >> 16); vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); if (vmexit->u.inout.string) { inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); vmexit->exitcode = VM_EXITCODE_INOUT_STR; vis = &vmexit->u.inout_str; vmx_paging_info(&vis->paging); vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); vis->cr0 = vmcs_read(VMCS_GUEST_CR0); vis->index = inout_str_index(vmx, vcpu, in); vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); vis->addrsize = inout_str_addrsize(inst_info); inout_str_seginfo(vmx, vcpu, inst_info, in, vis); } SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); break; case EXIT_REASON_CPUID: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EXCEPTION: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); intr_vec = intr_info & 0xff; intr_type = intr_info & VMCS_INTR_T_MASK; /* * If Virtual NMIs control is 1 and the VM-exit is due to a * fault encountered during the execution of IRET then we must * restore the state of "virtual-NMI blocking" before resuming * the guest. * * See "Resuming Guest Software after Handling an Exception". * See "Information for VM Exits Due to Vectored Events". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (intr_vec != IDT_DF) && (intr_info & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); /* * The NMI has already been handled in vmx_exit_handle_nmi(). */ if (intr_type == VMCS_INTR_T_NMI) return (1); /* * Call the machine check handler by hand. Also don't reflect * the machine check back into the guest. */ if (intr_vec == IDT_MC) { VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); __asm __volatile("int $18"); return (1); } if (intr_vec == IDT_PF) { error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", __func__, error)); } /* * Software exceptions exhibit trap-like behavior. This in * turn requires populating the VM-entry instruction length * so that the %rip in the trap frame is past the INT3/INTO * instruction. */ if (intr_type == VMCS_INTR_T_SWEXCEPTION) vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); /* Reflect all other exceptions back into the guest */ errcode_valid = errcode = 0; if (intr_info & VMCS_INTR_DEL_ERRCODE) { errcode_valid = 1; errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); } VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " "the guest", intr_vec, errcode); SDT_PROBE5(vmm, vmx, exit, exception, vmx, vcpu, vmexit, intr_vec, errcode); error = vm_inject_exception(vmx->vm, vcpu, intr_vec, errcode_valid, errcode, 0); KASSERT(error == 0, ("%s: vm_inject_exception error %d", __func__, error)); return (1); case EXIT_REASON_EPT_FAULT: /* * If 'gpa' lies within the address space allocated to * memory then this must be a nested page fault otherwise * this must be an instruction that accesses MMIO space. */ gpa = vmcs_gpa(); if (vm_mem_allocated(vmx->vm, vcpu, gpa) || apic_access_fault(vmx, vcpu, gpa)) { vmexit->exitcode = VM_EXITCODE_PAGING; vmexit->inst_length = 0; vmexit->u.paging.gpa = gpa; vmexit->u.paging.fault_type = ept_fault_type(qual); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); SDT_PROBE5(vmm, vmx, exit, nestedfault, vmx, vcpu, vmexit, gpa, qual); } else if (ept_emulation_fault(qual)) { vmexit_inst_emul(vmexit, gpa, vmcs_gla()); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); SDT_PROBE4(vmm, vmx, exit, mmiofault, vmx, vcpu, vmexit, gpa); } /* * If Virtual NMIs control is 1 and the VM-exit is due to an * EPT fault during the execution of IRET then we must restore * the state of "virtual-NMI blocking" before resuming. * * See description of "NMI unblocking due to IRET" in * "Exit Qualification for EPT Violations". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && (qual & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); break; case EXIT_REASON_VIRTUALIZED_EOI: vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; vmexit->u.ioapic_eoi.vector = qual & 0xFF; SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); vmexit->inst_length = 0; /* trap-like */ break; case EXIT_REASON_APIC_ACCESS: SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); handled = vmx_handle_apic_access(vmx, vcpu, vmexit); break; case EXIT_REASON_APIC_WRITE: /* * APIC-write VM exit is trap-like so the %rip is already * pointing to the next instruction. */ vmexit->inst_length = 0; vlapic = vm_lapic(vmx->vm, vcpu); SDT_PROBE4(vmm, vmx, exit, apicwrite, vmx, vcpu, vmexit, vlapic); handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); break; case EXIT_REASON_XSETBV: SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); break; case EXIT_REASON_MONITOR: SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MONITOR; break; case EXIT_REASON_MWAIT: SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MWAIT; break; case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMXOFF: case EXIT_REASON_VMXON: SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_VMINSN; break; default: SDT_PROBE4(vmm, vmx, exit, unknown, vmx, vcpu, vmexit, reason); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } if (handled) { /* * It is possible that control is returned to userland * even though we were able to handle the VM exit in the * kernel. * * In such a case we want to make sure that the userland * restarts guest execution at the instruction *after* * the one we just processed. Therefore we update the * guest rip in the VMCS and in 'vmexit'. */ vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; vmcs_write(VMCS_GUEST_RIP, vmexit->rip); } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* * If this VM exit was not claimed by anybody then * treat it as a generic VMX exit. */ vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = VM_SUCCESS; vmexit->u.vmx.inst_type = 0; vmexit->u.vmx.inst_error = 0; } else { /* * The exitcode and collateral have been populated. * The VM exit will be processed further in userland. */ } } SDT_PROBE4(vmm, vmx, exit, return, vmx, vcpu, vmexit, handled); return (handled); } static __inline void vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) { KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, ("vmx_exit_inst_error: invalid inst_fail_status %d", vmxctx->inst_fail_status)); vmexit->inst_length = 0; vmexit->exitcode = VM_EXITCODE_VMX; vmexit->u.vmx.status = vmxctx->inst_fail_status; vmexit->u.vmx.inst_error = vmcs_instruction_error(); vmexit->u.vmx.exit_reason = ~0; vmexit->u.vmx.exit_qualification = ~0; switch (rc) { case VMX_VMRESUME_ERROR: case VMX_VMLAUNCH_ERROR: case VMX_INVEPT_ERROR: vmexit->u.vmx.inst_type = rc; break; default: panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); } } /* * If the NMI-exiting VM execution control is set to '1' then an NMI in * non-root operation causes a VM-exit. NMI blocking is in effect so it is * sufficient to simply vector to the NMI handler via a software interrupt. * However, this must be done before maskable interrupts are enabled * otherwise the "iret" issued by an interrupt handler will incorrectly * clear NMI blocking. */ static __inline void vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint32_t intr_info; KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) return; intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " "to NMI has invalid vector: %#x", intr_info)); VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); __asm __volatile("int $2"); } } static __inline void vmx_dr_enter_guest(struct vmxctx *vmxctx) { register_t rflags; /* Save host control debug registers. */ vmxctx->host_dr7 = rdr7(); vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); /* * Disable debugging in DR7 and DEBUGCTL to avoid triggering * exceptions in the host based on the guest DRx values. The * guest DR7 and DEBUGCTL are saved/restored in the VMCS. */ load_dr7(0); wrmsr(MSR_DEBUGCTLMSR, 0); /* * Disable single stepping the kernel to avoid corrupting the * guest DR6. A debugger might still be able to corrupt the * guest DR6 by setting a breakpoint after this point and then * single stepping. */ rflags = read_rflags(); vmxctx->host_tf = rflags & PSL_T; write_rflags(rflags & ~PSL_T); /* Save host debug registers. */ vmxctx->host_dr0 = rdr0(); vmxctx->host_dr1 = rdr1(); vmxctx->host_dr2 = rdr2(); vmxctx->host_dr3 = rdr3(); vmxctx->host_dr6 = rdr6(); /* Restore guest debug registers. */ load_dr0(vmxctx->guest_dr0); load_dr1(vmxctx->guest_dr1); load_dr2(vmxctx->guest_dr2); load_dr3(vmxctx->guest_dr3); load_dr6(vmxctx->guest_dr6); } static __inline void vmx_dr_leave_guest(struct vmxctx *vmxctx) { /* Save guest debug registers. */ vmxctx->guest_dr0 = rdr0(); vmxctx->guest_dr1 = rdr1(); vmxctx->guest_dr2 = rdr2(); vmxctx->guest_dr3 = rdr3(); vmxctx->guest_dr6 = rdr6(); /* * Restore host debug registers. Restore DR7, DEBUGCTL, and * PSL_T last. */ load_dr0(vmxctx->host_dr0); load_dr1(vmxctx->host_dr1); load_dr2(vmxctx->host_dr2); load_dr3(vmxctx->host_dr3); load_dr6(vmxctx->host_dr6); wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); load_dr7(vmxctx->host_dr7); write_rflags(read_rflags() | vmxctx->host_tf); } static int vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, struct vm_eventinfo *evinfo) { int rc, handled, launched; struct vmx *vmx; struct vm *vm; struct vmxctx *vmxctx; struct vmcs *vmcs; struct vm_exit *vmexit; struct vlapic *vlapic; uint32_t exit_reason; struct region_descriptor gdtr, idtr; uint16_t ldt_sel; vmx = arg; vm = vmx->vm; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; vlapic = vm_lapic(vm, vcpu); vmexit = vm_exitinfo(vm, vcpu); launched = 0; KASSERT(vmxctx->pmap == pmap, ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); vmx_msr_guest_enter(vmx, vcpu); VMPTRLD(vmcs); /* * XXX * We do this every time because we may setup the virtual machine * from a different process than the one that actually runs it. * * If the life of a virtual machine was spent entirely in the context * of a single process we could do this once in vmx_vminit(). */ vmcs_write(VMCS_HOST_CR3, rcr3()); vmcs_write(VMCS_GUEST_RIP, rip); vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); handled = UNHANDLED; /* * Interrupts are disabled from this point on until the * guest starts executing. This is done for the following * reasons: * * If an AST is asserted on this thread after the check below, * then the IPI_AST notification will not be lost, because it * will cause a VM exit due to external interrupt as soon as * the guest state is loaded. * * A posted interrupt after 'vmx_inject_interrupts()' will * not be "lost" because it will be held pending in the host * APIC because interrupts are disabled. The pending interrupt * will be recognized as soon as the guest state is loaded. * * The same reasoning applies to the IPI generated by * pmap_invalidate_ept(). */ disable_intr(); vmx_inject_interrupts(vmx, vcpu, vlapic, rip); /* * Check for vcpu suspension after injecting events because * vmx_inject_interrupts() can suspend the vcpu due to a * triple fault. */ if (vcpu_suspended(evinfo)) { enable_intr(); vm_exit_suspended(vmx->vm, vcpu, rip); break; } if (vcpu_rendezvous_pending(evinfo)) { enable_intr(); vm_exit_rendezvous(vmx->vm, vcpu, rip); break; } if (vcpu_reqidle(evinfo)) { enable_intr(); vm_exit_reqidle(vmx->vm, vcpu, rip); break; } if (vcpu_should_yield(vm, vcpu)) { enable_intr(); vm_exit_astpending(vmx->vm, vcpu, rip); vmx_astpending_trace(vmx, vcpu, rip); handled = HANDLED; break; } if (vcpu_debugged(vm, vcpu)) { enable_intr(); vm_exit_debug(vmx->vm, vcpu, rip); break; } /* * VM exits restore the base address but not the * limits of GDTR and IDTR. The VMCS only stores the * base address, so VM exits set the limits to 0xffff. * Save and restore the full GDTR and IDTR to restore * the limits. * * The VMCS does not save the LDTR at all, and VM * exits clear LDTR as if a NULL selector were loaded. * The userspace hypervisor probably doesn't use a * LDT, but save and restore it to be safe. */ sgdt(&gdtr); sidt(&idtr); ldt_sel = sldt(); vmx_run_trace(vmx, vcpu); vmx_dr_enter_guest(vmxctx); rc = vmx_enter_guest(vmxctx, vmx, launched); vmx_dr_leave_guest(vmxctx); bare_lgdt(&gdtr); lidt(&idtr); lldt(ldt_sel); /* Collect some information for VM exit processing */ vmexit->rip = rip = vmcs_guest_rip(); vmexit->inst_length = vmexit_instruction_length(); vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); vmexit->u.vmx.exit_qualification = vmcs_exit_qualification(); /* Update 'nextrip' */ vmx->state[vcpu].nextrip = rip; if (rc == VMX_GUEST_VMEXIT) { vmx_exit_handle_nmi(vmx, vcpu, vmexit); enable_intr(); handled = vmx_exit_process(vmx, vcpu, vmexit); } else { enable_intr(); vmx_exit_inst_error(vmxctx, rc, vmexit); } launched = 1; vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); rip = vmexit->rip; } while (handled); /* * If a VM exit has been handled then the exitcode must be BOGUS * If a VM exit is not handled then the exitcode must not be BOGUS */ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { panic("Mismatch between handled (%d) and exitcode (%d)", handled, vmexit->exitcode); } if (!handled) vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); VMCLEAR(vmcs); vmx_msr_guest_exit(vmx, vcpu); return (0); } static void vmx_vmcleanup(void *arg) { int i; struct vmx *vmx = arg; uint16_t maxcpus; if (apic_access_virtualization(vmx, 0)) vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); maxcpus = vm_get_maxcpus(vmx->vm); for (i = 0; i < maxcpus; i++) vpid_free(vmx->state[i].vpid); free(vmx, M_VMX); return; } static register_t * vmxctx_regptr(struct vmxctx *vmxctx, int reg) { switch (reg) { case VM_REG_GUEST_RAX: return (&vmxctx->guest_rax); case VM_REG_GUEST_RBX: return (&vmxctx->guest_rbx); case VM_REG_GUEST_RCX: return (&vmxctx->guest_rcx); case VM_REG_GUEST_RDX: return (&vmxctx->guest_rdx); case VM_REG_GUEST_RSI: return (&vmxctx->guest_rsi); case VM_REG_GUEST_RDI: return (&vmxctx->guest_rdi); case VM_REG_GUEST_RBP: return (&vmxctx->guest_rbp); case VM_REG_GUEST_R8: return (&vmxctx->guest_r8); case VM_REG_GUEST_R9: return (&vmxctx->guest_r9); case VM_REG_GUEST_R10: return (&vmxctx->guest_r10); case VM_REG_GUEST_R11: return (&vmxctx->guest_r11); case VM_REG_GUEST_R12: return (&vmxctx->guest_r12); case VM_REG_GUEST_R13: return (&vmxctx->guest_r13); case VM_REG_GUEST_R14: return (&vmxctx->guest_r14); case VM_REG_GUEST_R15: return (&vmxctx->guest_r15); case VM_REG_GUEST_CR2: return (&vmxctx->guest_cr2); case VM_REG_GUEST_DR0: return (&vmxctx->guest_dr0); case VM_REG_GUEST_DR1: return (&vmxctx->guest_dr1); case VM_REG_GUEST_DR2: return (&vmxctx->guest_dr2); case VM_REG_GUEST_DR3: return (&vmxctx->guest_dr3); case VM_REG_GUEST_DR6: return (&vmxctx->guest_dr6); default: break; } return (NULL); } static int vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *retval = *regp; return (0); } else return (EINVAL); } static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) { register_t *regp; if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) { *regp = val; return (0); } else return (EINVAL); } static int vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) { uint64_t gi; int error; error = vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); *retval = (gi & HWINTR_BLOCKING) ? 1 : 0; return (error); } static int vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) { struct vmcs *vmcs; uint64_t gi; int error, ident; /* * Forcing the vcpu into an interrupt shadow is not supported. */ if (val) { error = EINVAL; goto done; } vmcs = &vmx->vmcs[vcpu]; ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); error = vmcs_getreg(vmcs, running, ident, &gi); if (error == 0) { gi &= ~HWINTR_BLOCKING; error = vmcs_setreg(vmcs, running, ident, gi); } done: VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, error ? "failed" : "succeeded"); return (error); } static int vmx_shadow_reg(int reg) { int shreg; shreg = -1; switch (reg) { case VM_REG_GUEST_CR0: shreg = VMCS_CR0_SHADOW; break; case VM_REG_GUEST_CR4: shreg = VMCS_CR4_SHADOW; break; default: break; } return (shreg); } static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) { int running, hostcpu; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); if (reg == VM_REG_GUEST_INTR_SHADOW) return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) return (0); return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval)); } static int vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) { int error, hostcpu, running, shadow; uint64_t ctls; pmap_t pmap; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); if (reg == VM_REG_GUEST_INTR_SHADOW) return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) return (0); error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val); if (error == 0) { /* * If the "load EFER" VM-entry control is 1 then the * value of EFER.LMA must be identical to "IA-32e mode guest" * bit in the VM-entry control. */ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 && (reg == VM_REG_GUEST_EFER)) { vmcs_getreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls); if (val & EFER_LMA) ctls |= VM_ENTRY_GUEST_LMA; else ctls &= ~VM_ENTRY_GUEST_LMA; vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(VMCS_ENTRY_CTLS), ctls); } shadow = vmx_shadow_reg(reg); if (shadow > 0) { /* * Store the unmodified value in the shadow */ error = vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(shadow), val); } if (reg == VM_REG_GUEST_CR3) { /* * Invalidate the guest vcpu's TLB mappings to emulate * the behavior of updating %cr3. * * XXX the processor retains global mappings when %cr3 * is updated but vmx_invvpid() does not. */ pmap = vmx->ctx[vcpu].pmap; vmx_invvpid(vmx, vcpu, pmap, running); } } return (error); } static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu); return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc)); } static int vmx_getcap(void *arg, int vcpu, int type, int *retval) { struct vmx *vmx = arg; int vcap; int ret; ret = ENOENT; vcap = vmx->cap[vcpu].set; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) ret = 0; break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) ret = 0; break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) ret = 0; break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) ret = 0; break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) ret = 0; break; default: break; } if (ret == 0) *retval = (vcap & (1 << type)) ? 1 : 0; return (ret); } static int vmx_setcap(void *arg, int vcpu, int type, int val) { struct vmx *vmx = arg; struct vmcs *vmcs = &vmx->vmcs[vcpu]; uint32_t baseval; uint32_t *pptr; int error; int flag; int reg; int retval; retval = ENOENT; pptr = NULL; switch (type) { case VM_CAP_HALT_EXIT: if (cap_halt_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_HLT_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_MTRAP_EXIT: if (cap_monitor_trap) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_MTF; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_PAUSE_EXIT: if (cap_pause_exit) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls; baseval = *pptr; flag = PROCBASED_PAUSE_EXITING; reg = VMCS_PRI_PROC_BASED_CTLS; } break; case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_UNRESTRICTED_GUEST; reg = VMCS_SEC_PROC_BASED_CTLS; } break; case VM_CAP_ENABLE_INVPCID: if (cap_invpcid) { retval = 0; pptr = &vmx->cap[vcpu].proc_ctls2; baseval = *pptr; flag = PROCBASED2_ENABLE_INVPCID; reg = VMCS_SEC_PROC_BASED_CTLS; } break; default: break; } if (retval == 0) { if (val) { baseval |= flag; } else { baseval &= ~flag; } VMPTRLD(vmcs); error = vmwrite(reg, baseval); VMCLEAR(vmcs); if (error) { retval = error; } else { /* * Update optional stored flags, and record * setting */ if (pptr != NULL) { *pptr = baseval; } if (val) { vmx->cap[vcpu].set |= (1 << type); } else { vmx->cap[vcpu].set &= ~(1 << type); } } } return (retval); } struct vlapic_vtx { struct vlapic vlapic; struct pir_desc *pir_desc; struct vmx *vmx; u_int pending_prio; }; #define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ do { \ VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ level ? "level" : "edge", vector); \ VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ } while (0) /* * vlapic->ops handlers that utilize the APICv hardware assist described in * Chapter 29 of the Intel SDM. */ static int vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; uint64_t mask; int idx, notify = 0; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; /* * Keep track of interrupt requests in the PIR descriptor. This is * because the virtual APIC page pointed to by the VMCS cannot be * modified if the vcpu is running. */ idx = vector / 64; mask = 1UL << (vector % 64); atomic_set_long(&pir_desc->pir[idx], mask); /* * A notification is required whenever the 'pending' bit makes a * transition from 0->1. * * Even if the 'pending' bit is already asserted, notification about * the incoming interrupt may still be necessary. For example, if a * vCPU is HLTed with a high PPR, a low priority interrupt would cause * the 0->1 'pending' transition with a notification, but the vCPU * would ignore the interrupt for the time being. The same vCPU would * need to then be notified if a high-priority interrupt arrived which * satisfied the PPR. * * The priorities of interrupts injected while 'pending' is asserted * are tracked in a custom bitfield 'pending_prio'. Should the * to-be-injected interrupt exceed the priorities already present, the * notification is sent. The priorities recorded in 'pending_prio' are * cleared whenever the 'pending' bit makes another 0->1 transition. */ if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { notify = 1; vlapic_vtx->pending_prio = 0; } else { const u_int old_prio = vlapic_vtx->pending_prio; const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); notify = 1; } } VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, level, "vmx_set_intr_ready"); return (notify); } static int vmx_pending_intr(struct vlapic *vlapic, int *vecptr) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t pending, pirval; uint32_t ppr, vpr; int i; /* * This function is only expected to be called from the 'HLT' exit * handler which does not care about the vector that is pending. */ KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; pending = atomic_load_acq_long(&pir_desc->pending); if (!pending) { /* * While a virtual interrupt may have already been * processed the actual delivery maybe pending the * interruptibility of the guest. Recognize a pending * interrupt by reevaluating virtual interrupts * following Section 29.2.1 in the Intel SDM Volume 3. */ struct vm_exit *vmexit; uint8_t rvi, ppr; vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); KASSERT(vmexit->exitcode == VM_EXITCODE_HLT, ("vmx_pending_intr: exitcode not 'HLT'")); rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; lapic = vlapic->apic_page; ppr = lapic->ppr & APIC_TPR_INT; if (rvi > ppr) { return (1); } return (0); } /* * If there is an interrupt pending then it will be recognized only * if its priority is greater than the processor priority. * * Special case: if the processor priority is zero then any pending * interrupt will be recognized. */ lapic = vlapic->apic_page; ppr = lapic->ppr & APIC_TPR_INT; if (ppr == 0) return (1); VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", lapic->ppr); vpr = 0; for (i = 3; i >= 0; i--) { pirval = pir_desc->pir[i]; if (pirval != 0) { vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; break; } } /* * If the highest-priority pending interrupt falls short of the * processor priority of this vCPU, ensure that 'pending_prio' does not * have any stale bits which would preclude a higher-priority interrupt * from incurring a notification later. */ if (vpr <= ppr) { const u_int prio_bit = VPR_PRIO_BIT(vpr); const u_int old = vlapic_vtx->pending_prio; if (old > prio_bit && (old & prio_bit) == 0) { vlapic_vtx->pending_prio = prio_bit; } return (0); } return (1); } static void vmx_intr_accepted(struct vlapic *vlapic, int vector) { panic("vmx_intr_accepted: not expected to be called"); } static void vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct vmx *vmx; struct vmcs *vmcs; uint64_t mask, val; KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), ("vmx_set_tmr: vcpu cannot be running")); vlapic_vtx = (struct vlapic_vtx *)vlapic; vmx = vlapic_vtx->vmx; vmcs = &vmx->vmcs[vlapic->vcpuid]; mask = 1UL << (vector % 64); VMPTRLD(vmcs); val = vmcs_read(VMCS_EOI_EXIT(vector)); if (level) val |= mask; else val &= ~mask; vmcs_write(VMCS_EOI_EXIT(vector), val); VMCLEAR(vmcs); } static void vmx_enable_x2apic_mode(struct vlapic *vlapic) { struct vmx *vmx; struct vmcs *vmcs; uint32_t proc_ctls2; int vcpuid, error; vcpuid = vlapic->vcpuid; vmx = ((struct vlapic_vtx *)vlapic)->vmx; vmcs = &vmx->vmcs[vcpuid]; proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; VMPTRLD(vmcs); vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); VMCLEAR(vmcs); if (vlapic->vcpuid == 0) { /* * The nested page table mappings are shared by all vcpus * so unmap the APIC access page just once. */ error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", __func__, error)); /* * The MSR bitmap is shared by all vcpus so modify it only * once in the context of vcpu 0. */ error = vmx_allow_x2apic_msrs(vmx); KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", __func__, error)); } } static void vmx_post_intr(struct vlapic *vlapic, int hostcpu) { ipi_cpu(hostcpu, pirvec); } /* * Transfer the pending interrupts in the PIR descriptor to the IRR * in the virtual APIC page. */ static void vmx_inject_pir(struct vlapic *vlapic) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; uint64_t val, pirval; int rvi, pirbase = -1; uint16_t intr_status_old, intr_status_new; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "no posted interrupt pending"); return; } pirval = 0; pirbase = -1; lapic = vlapic->apic_page; val = atomic_readandclear_long(&pir_desc->pir[0]); if (val != 0) { lapic->irr0 |= val; lapic->irr1 |= val >> 32; pirbase = 0; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[1]); if (val != 0) { lapic->irr2 |= val; lapic->irr3 |= val >> 32; pirbase = 64; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[2]); if (val != 0) { lapic->irr4 |= val; lapic->irr5 |= val >> 32; pirbase = 128; pirval = val; } val = atomic_readandclear_long(&pir_desc->pir[3]); if (val != 0) { lapic->irr6 |= val; lapic->irr7 |= val >> 32; pirbase = 192; pirval = val; } VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); /* * Update RVI so the processor can evaluate pending virtual * interrupts on VM-entry. * * It is possible for pirval to be 0 here, even though the * pending bit has been set. The scenario is: * CPU-Y is sending a posted interrupt to CPU-X, which * is running a guest and processing posted interrupts in h/w. * CPU-X will eventually exit and the state seen in s/w is * the pending bit set, but no PIR bits set. * * CPU-X CPU-Y * (vm running) (host running) * rx posted interrupt * CLEAR pending bit * SET PIR bit * READ/CLEAR PIR bits * SET pending bit * (vm exit) * pending bit set, PIR 0 */ if (pirval != 0) { rvi = pirbase + flsl(pirval) - 1; intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); intr_status_new = (intr_status_old & 0xFF00) | rvi; if (intr_status_new > intr_status_old) { vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " "guest_intr_status changed from 0x%04x to 0x%04x", intr_status_old, intr_status_new); } } } static struct vlapic * vmx_vlapic_init(void *arg, int vcpuid) { struct vmx *vmx; struct vlapic *vlapic; struct vlapic_vtx *vlapic_vtx; vmx = arg; vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); vlapic->vm = vmx->vm; vlapic->vcpuid = vcpuid; vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid]; vlapic_vtx = (struct vlapic_vtx *)vlapic; vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid]; vlapic_vtx->vmx = vmx; if (virtual_interrupt_delivery) { vlapic->ops.set_intr_ready = vmx_set_intr_ready; vlapic->ops.pending_intr = vmx_pending_intr; vlapic->ops.intr_accepted = vmx_intr_accepted; vlapic->ops.set_tmr = vmx_set_tmr; vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; } if (posted_interrupts) vlapic->ops.post_intr = vmx_post_intr; vlapic_init(vlapic); return (vlapic); } static void vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) { vlapic_cleanup(vlapic); free(vlapic, M_VLAPIC); } struct vmm_ops vmm_ops_intel = { .init = vmx_init, .cleanup = vmx_cleanup, .resume = vmx_restore, .vminit = vmx_vminit, .vmrun = vmx_run, .vmcleanup = vmx_vmcleanup, .vmgetreg = vmx_getreg, .vmsetreg = vmx_setreg, .vmgetdesc = vmx_getdesc, .vmsetdesc = vmx_setdesc, .vmgetcap = vmx_getcap, .vmsetcap = vmx_setcap, .vmspace_alloc = ept_vmspace_alloc, .vmspace_free = ept_vmspace_free, .vlapic_init = vmx_vlapic_init, .vlapic_cleanup = vmx_vlapic_cleanup, }; Index: stable/12/sys/dev/qlnx/qlnxe/qlnx_os.c =================================================================== --- stable/12/sys/dev/qlnx/qlnxe/qlnx_os.c (revision 353104) +++ stable/12/sys/dev/qlnx/qlnxe/qlnx_os.c (revision 353105) @@ -1,8659 +1,8658 @@ /* * Copyright (c) 2017-2018 Cavium, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * File: qlnx_os.c * Author : David C Somayajulu, Cavium, Inc., San Jose, CA 95131. */ #include __FBSDID("$FreeBSD$"); #include "qlnx_os.h" #include "bcm_osal.h" #include "reg_addr.h" #include "ecore_gtt_reg_addr.h" #include "ecore.h" #include "ecore_chain.h" #include "ecore_status.h" #include "ecore_hw.h" #include "ecore_rt_defs.h" #include "ecore_init_ops.h" #include "ecore_int.h" #include "ecore_cxt.h" #include "ecore_spq.h" #include "ecore_init_fw_funcs.h" #include "ecore_sp_commands.h" #include "ecore_dev_api.h" #include "ecore_l2_api.h" #include "ecore_mcp.h" #include "ecore_hw_defs.h" #include "mcp_public.h" #include "ecore_iro.h" #include "nvm_cfg.h" #include "ecore_dev_api.h" #include "ecore_dbg_fw_funcs.h" #include "ecore_iov_api.h" #include "ecore_vf_api.h" #include "qlnx_ioctl.h" #include "qlnx_def.h" #include "qlnx_ver.h" #ifdef QLNX_ENABLE_IWARP #include "qlnx_rdma.h" #endif /* #ifdef QLNX_ENABLE_IWARP */ #include /* * static functions */ /* * ioctl related functions */ static void qlnx_add_sysctls(qlnx_host_t *ha); /* * main driver */ static void qlnx_release(qlnx_host_t *ha); static void qlnx_fp_isr(void *arg); static void qlnx_init_ifnet(device_t dev, qlnx_host_t *ha); static void qlnx_init(void *arg); static void qlnx_init_locked(qlnx_host_t *ha); static int qlnx_set_multi(qlnx_host_t *ha, uint32_t add_multi); static int qlnx_set_promisc(qlnx_host_t *ha); static int qlnx_set_allmulti(qlnx_host_t *ha); static int qlnx_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int qlnx_media_change(struct ifnet *ifp); static void qlnx_media_status(struct ifnet *ifp, struct ifmediareq *ifmr); static void qlnx_stop(qlnx_host_t *ha); static int qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct mbuf **m_headp); static int qlnx_get_ifq_snd_maxlen(qlnx_host_t *ha); static uint32_t qlnx_get_optics(qlnx_host_t *ha, struct qlnx_link_output *if_link); static int qlnx_transmit(struct ifnet *ifp, struct mbuf *mp); static int qlnx_transmit_locked(struct ifnet *ifp, struct qlnx_fastpath *fp, struct mbuf *mp); static void qlnx_qflush(struct ifnet *ifp); static int qlnx_alloc_parent_dma_tag(qlnx_host_t *ha); static void qlnx_free_parent_dma_tag(qlnx_host_t *ha); static int qlnx_alloc_tx_dma_tag(qlnx_host_t *ha); static void qlnx_free_tx_dma_tag(qlnx_host_t *ha); static int qlnx_alloc_rx_dma_tag(qlnx_host_t *ha); static void qlnx_free_rx_dma_tag(qlnx_host_t *ha); static int qlnx_get_mfw_version(qlnx_host_t *ha, uint32_t *mfw_ver); static int qlnx_get_flash_size(qlnx_host_t *ha, uint32_t *flash_size); static int qlnx_nic_setup(struct ecore_dev *cdev, struct ecore_pf_params *func_params); static int qlnx_nic_start(struct ecore_dev *cdev); static int qlnx_slowpath_start(qlnx_host_t *ha); static int qlnx_slowpath_stop(qlnx_host_t *ha); static int qlnx_init_hw(qlnx_host_t *ha); static void qlnx_set_id(struct ecore_dev *cdev, char name[NAME_SIZE], char ver_str[VER_SIZE]); static void qlnx_unload(qlnx_host_t *ha); static int qlnx_load(qlnx_host_t *ha); static void qlnx_hw_set_multi(qlnx_host_t *ha, uint8_t *mta, uint32_t mcnt, uint32_t add_mac); static void qlnx_dump_buf8(qlnx_host_t *ha, const char *msg, void *dbuf, uint32_t len); static int qlnx_alloc_rx_buffer(qlnx_host_t *ha, struct qlnx_rx_queue *rxq); static void qlnx_reuse_rx_data(struct qlnx_rx_queue *rxq); static void qlnx_update_rx_prod(struct ecore_hwfn *p_hwfn, struct qlnx_rx_queue *rxq); static int qlnx_set_rx_accept_filter(qlnx_host_t *ha, uint8_t filter); static int qlnx_grc_dumpsize(qlnx_host_t *ha, uint32_t *num_dwords, int hwfn_index); static int qlnx_idle_chk_size(qlnx_host_t *ha, uint32_t *num_dwords, int hwfn_index); static void qlnx_timer(void *arg); static int qlnx_alloc_tx_br(qlnx_host_t *ha, struct qlnx_fastpath *fp); static void qlnx_free_tx_br(qlnx_host_t *ha, struct qlnx_fastpath *fp); static void qlnx_trigger_dump(qlnx_host_t *ha); static uint16_t qlnx_num_tx_compl(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct qlnx_tx_queue *txq); static void qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct qlnx_tx_queue *txq); static int qlnx_rx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp, int budget, int lro_enable); static void qlnx_fp_taskqueue(void *context, int pending); static void qlnx_sample_storm_stats(qlnx_host_t *ha); static int qlnx_alloc_tpa_mbuf(qlnx_host_t *ha, uint16_t rx_buf_size, struct qlnx_agg_info *tpa); static void qlnx_free_tpa_mbuf(qlnx_host_t *ha, struct qlnx_agg_info *tpa); #if __FreeBSD_version >= 1100000 static uint64_t qlnx_get_counter(if_t ifp, ift_counter cnt); #endif /* * Hooks to the Operating Systems */ static int qlnx_pci_probe (device_t); static int qlnx_pci_attach (device_t); static int qlnx_pci_detach (device_t); #ifndef QLNX_VF #ifdef CONFIG_ECORE_SRIOV static int qlnx_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *params); static void qlnx_iov_uninit(device_t dev); static int qlnx_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params); static void qlnx_initialize_sriov(qlnx_host_t *ha); static void qlnx_pf_taskqueue(void *context, int pending); static int qlnx_create_pf_taskqueues(qlnx_host_t *ha); static void qlnx_destroy_pf_taskqueues(qlnx_host_t *ha); static void qlnx_inform_vf_link_state(struct ecore_hwfn *p_hwfn, qlnx_host_t *ha); #endif /* #ifdef CONFIG_ECORE_SRIOV */ static device_method_t qlnx_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, qlnx_pci_probe), DEVMETHOD(device_attach, qlnx_pci_attach), DEVMETHOD(device_detach, qlnx_pci_detach), #ifdef CONFIG_ECORE_SRIOV DEVMETHOD(pci_iov_init, qlnx_iov_init), DEVMETHOD(pci_iov_uninit, qlnx_iov_uninit), DEVMETHOD(pci_iov_add_vf, qlnx_iov_add_vf), #endif /* #ifdef CONFIG_ECORE_SRIOV */ { 0, 0 } }; static driver_t qlnx_pci_driver = { "ql", qlnx_pci_methods, sizeof (qlnx_host_t), }; static devclass_t qlnx_devclass; MODULE_VERSION(if_qlnxe,1); DRIVER_MODULE(if_qlnxe, pci, qlnx_pci_driver, qlnx_devclass, 0, 0); MODULE_DEPEND(if_qlnxe, pci, 1, 1, 1); MODULE_DEPEND(if_qlnxe, ether, 1, 1, 1); #else static device_method_t qlnxv_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, qlnx_pci_probe), DEVMETHOD(device_attach, qlnx_pci_attach), DEVMETHOD(device_detach, qlnx_pci_detach), { 0, 0 } }; static driver_t qlnxv_pci_driver = { "ql", qlnxv_pci_methods, sizeof (qlnx_host_t), }; static devclass_t qlnxv_devclass; MODULE_VERSION(if_qlnxev,1); DRIVER_MODULE(if_qlnxev, pci, qlnxv_pci_driver, qlnxv_devclass, 0, 0); MODULE_DEPEND(if_qlnxev, pci, 1, 1, 1); MODULE_DEPEND(if_qlnxev, ether, 1, 1, 1); #endif /* #ifdef QLNX_VF */ MALLOC_DEFINE(M_QLNXBUF, "qlnxbuf", "Buffers for qlnx driver"); - -static char qlnx_dev_str[128]; -static char qlnx_ver_str[VER_SIZE]; -static char qlnx_name_str[NAME_SIZE]; +char qlnx_dev_str[128]; +char qlnx_ver_str[VER_SIZE]; +char qlnx_name_str[NAME_SIZE]; /* * Some PCI Configuration Space Related Defines */ #ifndef PCI_VENDOR_QLOGIC #define PCI_VENDOR_QLOGIC 0x1077 #endif /* 40G Adapter QLE45xxx*/ #ifndef QLOGIC_PCI_DEVICE_ID_1634 #define QLOGIC_PCI_DEVICE_ID_1634 0x1634 #endif /* 100G Adapter QLE45xxx*/ #ifndef QLOGIC_PCI_DEVICE_ID_1644 #define QLOGIC_PCI_DEVICE_ID_1644 0x1644 #endif /* 25G Adapter QLE45xxx*/ #ifndef QLOGIC_PCI_DEVICE_ID_1656 #define QLOGIC_PCI_DEVICE_ID_1656 0x1656 #endif /* 50G Adapter QLE45xxx*/ #ifndef QLOGIC_PCI_DEVICE_ID_1654 #define QLOGIC_PCI_DEVICE_ID_1654 0x1654 #endif /* 10G/25G/40G Adapter QLE41xxx*/ #ifndef QLOGIC_PCI_DEVICE_ID_8070 #define QLOGIC_PCI_DEVICE_ID_8070 0x8070 #endif /* SRIOV Device (All Speeds) Adapter QLE41xxx*/ #ifndef QLOGIC_PCI_DEVICE_ID_8090 #define QLOGIC_PCI_DEVICE_ID_8090 0x8090 #endif SYSCTL_NODE(_hw, OID_AUTO, qlnxe, CTLFLAG_RD, 0, "qlnxe driver parameters"); /* Number of Queues: 0 (Auto) or 1 to 32 (fixed queue number) */ static int qlnxe_queue_count = QLNX_DEFAULT_RSS; #if __FreeBSD_version < 1100000 TUNABLE_INT("hw.qlnxe.queue_count", &qlnxe_queue_count); #endif SYSCTL_INT(_hw_qlnxe, OID_AUTO, queue_count, CTLFLAG_RDTUN, &qlnxe_queue_count, 0, "Multi-Queue queue count"); /* * Note on RDMA personality setting * * Read the personality configured in NVRAM * If the personality is ETH_ONLY, ETH_IWARP or ETH_ROCE and * the configured personality in sysctl is QLNX_PERSONALITY_DEFAULT * use the personality in NVRAM. * Otherwise use t the personality configured in sysctl. * */ #define QLNX_PERSONALITY_DEFAULT 0x0 /* use personality in NVRAM */ #define QLNX_PERSONALITY_ETH_ONLY 0x1 /* Override with ETH_ONLY */ #define QLNX_PERSONALITY_ETH_IWARP 0x2 /* Override with ETH_IWARP */ #define QLNX_PERSONALITY_ETH_ROCE 0x3 /* Override with ETH_ROCE */ #define QLNX_PERSONALITY_BITS_PER_FUNC 4 #define QLNX_PERSONALIY_MASK 0xF /* RDMA configuration; 64bit field allows setting for 16 physical functions*/ static uint64_t qlnxe_rdma_configuration = 0x22222222; #if __FreeBSD_version < 1100000 TUNABLE_QUAD("hw.qlnxe.rdma_configuration", &qlnxe_rdma_configuration); SYSCTL_UQUAD(_hw_qlnxe, OID_AUTO, rdma_configuration, CTLFLAG_RDTUN, &qlnxe_rdma_configuration, 0, "RDMA Configuration"); #else SYSCTL_U64(_hw_qlnxe, OID_AUTO, rdma_configuration, CTLFLAG_RDTUN, &qlnxe_rdma_configuration, 0, "RDMA Configuration"); #endif /* #if __FreeBSD_version < 1100000 */ int qlnx_vf_device(qlnx_host_t *ha) { uint16_t device_id; device_id = ha->device_id; if (device_id == QLOGIC_PCI_DEVICE_ID_8090) return 0; return -1; } static int qlnx_valid_device(qlnx_host_t *ha) { uint16_t device_id; device_id = ha->device_id; #ifndef QLNX_VF if ((device_id == QLOGIC_PCI_DEVICE_ID_1634) || (device_id == QLOGIC_PCI_DEVICE_ID_1644) || (device_id == QLOGIC_PCI_DEVICE_ID_1656) || (device_id == QLOGIC_PCI_DEVICE_ID_1654) || (device_id == QLOGIC_PCI_DEVICE_ID_8070)) return 0; #else if (device_id == QLOGIC_PCI_DEVICE_ID_8090) return 0; #endif /* #ifndef QLNX_VF */ return -1; } #ifdef QLNX_ENABLE_IWARP static int qlnx_rdma_supported(struct qlnx_host *ha) { uint16_t device_id; device_id = pci_get_device(ha->pci_dev); if ((device_id == QLOGIC_PCI_DEVICE_ID_1634) || (device_id == QLOGIC_PCI_DEVICE_ID_1656) || (device_id == QLOGIC_PCI_DEVICE_ID_1654) || (device_id == QLOGIC_PCI_DEVICE_ID_8070)) return (0); return (-1); } #endif /* #ifdef QLNX_ENABLE_IWARP */ /* * Name: qlnx_pci_probe * Function: Validate the PCI device to be a QLA80XX device */ static int qlnx_pci_probe(device_t dev) { snprintf(qlnx_ver_str, sizeof(qlnx_ver_str), "v%d.%d.%d", QLNX_VERSION_MAJOR, QLNX_VERSION_MINOR, QLNX_VERSION_BUILD); snprintf(qlnx_name_str, sizeof(qlnx_name_str), "qlnx"); if (pci_get_vendor(dev) != PCI_VENDOR_QLOGIC) { return (ENXIO); } switch (pci_get_device(dev)) { #ifndef QLNX_VF case QLOGIC_PCI_DEVICE_ID_1644: snprintf(qlnx_dev_str, sizeof(qlnx_dev_str), "%s v%d.%d.%d", "Qlogic 100GbE PCI CNA Adapter-Ethernet Function", QLNX_VERSION_MAJOR, QLNX_VERSION_MINOR, QLNX_VERSION_BUILD); device_set_desc_copy(dev, qlnx_dev_str); break; case QLOGIC_PCI_DEVICE_ID_1634: snprintf(qlnx_dev_str, sizeof(qlnx_dev_str), "%s v%d.%d.%d", "Qlogic 40GbE PCI CNA Adapter-Ethernet Function", QLNX_VERSION_MAJOR, QLNX_VERSION_MINOR, QLNX_VERSION_BUILD); device_set_desc_copy(dev, qlnx_dev_str); break; case QLOGIC_PCI_DEVICE_ID_1656: snprintf(qlnx_dev_str, sizeof(qlnx_dev_str), "%s v%d.%d.%d", "Qlogic 25GbE PCI CNA Adapter-Ethernet Function", QLNX_VERSION_MAJOR, QLNX_VERSION_MINOR, QLNX_VERSION_BUILD); device_set_desc_copy(dev, qlnx_dev_str); break; case QLOGIC_PCI_DEVICE_ID_1654: snprintf(qlnx_dev_str, sizeof(qlnx_dev_str), "%s v%d.%d.%d", "Qlogic 50GbE PCI CNA Adapter-Ethernet Function", QLNX_VERSION_MAJOR, QLNX_VERSION_MINOR, QLNX_VERSION_BUILD); device_set_desc_copy(dev, qlnx_dev_str); break; case QLOGIC_PCI_DEVICE_ID_8070: snprintf(qlnx_dev_str, sizeof(qlnx_dev_str), "%s v%d.%d.%d", "Qlogic 10GbE/25GbE/40GbE PCI CNA (AH)" " Adapter-Ethernet Function", QLNX_VERSION_MAJOR, QLNX_VERSION_MINOR, QLNX_VERSION_BUILD); device_set_desc_copy(dev, qlnx_dev_str); break; #else case QLOGIC_PCI_DEVICE_ID_8090: snprintf(qlnx_dev_str, sizeof(qlnx_dev_str), "%s v%d.%d.%d", "Qlogic SRIOV PCI CNA (AH) " "Adapter-Ethernet Function", QLNX_VERSION_MAJOR, QLNX_VERSION_MINOR, QLNX_VERSION_BUILD); device_set_desc_copy(dev, qlnx_dev_str); break; #endif /* #ifndef QLNX_VF */ default: return (ENXIO); } #ifdef QLNX_ENABLE_IWARP qlnx_rdma_init(); #endif /* #ifdef QLNX_ENABLE_IWARP */ return (BUS_PROBE_DEFAULT); } static uint16_t qlnx_num_tx_compl(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct qlnx_tx_queue *txq) { u16 hw_bd_cons; u16 ecore_cons_idx; uint16_t diff; hw_bd_cons = le16toh(*txq->hw_cons_ptr); ecore_cons_idx = ecore_chain_get_cons_idx(&txq->tx_pbl); if (hw_bd_cons < ecore_cons_idx) { diff = (1 << 16) - (ecore_cons_idx - hw_bd_cons); } else { diff = hw_bd_cons - ecore_cons_idx; } return diff; } static void qlnx_sp_intr(void *arg) { struct ecore_hwfn *p_hwfn; qlnx_host_t *ha; int i; p_hwfn = arg; if (p_hwfn == NULL) { printf("%s: spurious slowpath intr\n", __func__); return; } ha = (qlnx_host_t *)p_hwfn->p_dev; QL_DPRINT2(ha, "enter\n"); for (i = 0; i < ha->cdev.num_hwfns; i++) { if (&ha->cdev.hwfns[i] == p_hwfn) { taskqueue_enqueue(ha->sp_taskqueue[i], &ha->sp_task[i]); break; } } QL_DPRINT2(ha, "exit\n"); return; } static void qlnx_sp_taskqueue(void *context, int pending) { struct ecore_hwfn *p_hwfn; p_hwfn = context; if (p_hwfn != NULL) { qlnx_sp_isr(p_hwfn); } return; } static int qlnx_create_sp_taskqueues(qlnx_host_t *ha) { int i; uint8_t tq_name[32]; for (i = 0; i < ha->cdev.num_hwfns; i++) { struct ecore_hwfn *p_hwfn = &ha->cdev.hwfns[i]; bzero(tq_name, sizeof (tq_name)); snprintf(tq_name, sizeof (tq_name), "ql_sp_tq_%d", i); TASK_INIT(&ha->sp_task[i], 0, qlnx_sp_taskqueue, p_hwfn); ha->sp_taskqueue[i] = taskqueue_create(tq_name, M_NOWAIT, taskqueue_thread_enqueue, &ha->sp_taskqueue[i]); if (ha->sp_taskqueue[i] == NULL) return (-1); taskqueue_start_threads(&ha->sp_taskqueue[i], 1, PI_NET, "%s", tq_name); QL_DPRINT1(ha, "%p\n", ha->sp_taskqueue[i]); } return (0); } static void qlnx_destroy_sp_taskqueues(qlnx_host_t *ha) { int i; for (i = 0; i < ha->cdev.num_hwfns; i++) { if (ha->sp_taskqueue[i] != NULL) { taskqueue_drain(ha->sp_taskqueue[i], &ha->sp_task[i]); taskqueue_free(ha->sp_taskqueue[i]); } } return; } static void qlnx_fp_taskqueue(void *context, int pending) { struct qlnx_fastpath *fp; qlnx_host_t *ha; struct ifnet *ifp; fp = context; if (fp == NULL) return; ha = (qlnx_host_t *)fp->edev; ifp = ha->ifp; if(ifp->if_drv_flags & IFF_DRV_RUNNING) { if (!drbr_empty(ifp, fp->tx_br)) { if(mtx_trylock(&fp->tx_mtx)) { #ifdef QLNX_TRACE_PERF_DATA tx_pkts = fp->tx_pkts_transmitted; tx_compl = fp->tx_pkts_completed; #endif qlnx_transmit_locked(ifp, fp, NULL); #ifdef QLNX_TRACE_PERF_DATA fp->tx_pkts_trans_fp += (fp->tx_pkts_transmitted - tx_pkts); fp->tx_pkts_compl_fp += (fp->tx_pkts_completed - tx_compl); #endif mtx_unlock(&fp->tx_mtx); } } } QL_DPRINT2(ha, "exit \n"); return; } static int qlnx_create_fp_taskqueues(qlnx_host_t *ha) { int i; uint8_t tq_name[32]; struct qlnx_fastpath *fp; for (i = 0; i < ha->num_rss; i++) { fp = &ha->fp_array[i]; bzero(tq_name, sizeof (tq_name)); snprintf(tq_name, sizeof (tq_name), "ql_fp_tq_%d", i); TASK_INIT(&fp->fp_task, 0, qlnx_fp_taskqueue, fp); fp->fp_taskqueue = taskqueue_create(tq_name, M_NOWAIT, taskqueue_thread_enqueue, &fp->fp_taskqueue); if (fp->fp_taskqueue == NULL) return (-1); taskqueue_start_threads(&fp->fp_taskqueue, 1, PI_NET, "%s", tq_name); QL_DPRINT1(ha, "%p\n",fp->fp_taskqueue); } return (0); } static void qlnx_destroy_fp_taskqueues(qlnx_host_t *ha) { int i; struct qlnx_fastpath *fp; for (i = 0; i < ha->num_rss; i++) { fp = &ha->fp_array[i]; if (fp->fp_taskqueue != NULL) { taskqueue_drain(fp->fp_taskqueue, &fp->fp_task); taskqueue_free(fp->fp_taskqueue); fp->fp_taskqueue = NULL; } } return; } static void qlnx_drain_fp_taskqueues(qlnx_host_t *ha) { int i; struct qlnx_fastpath *fp; for (i = 0; i < ha->num_rss; i++) { fp = &ha->fp_array[i]; if (fp->fp_taskqueue != NULL) { QLNX_UNLOCK(ha); taskqueue_drain(fp->fp_taskqueue, &fp->fp_task); QLNX_LOCK(ha); } } return; } static void qlnx_get_params(qlnx_host_t *ha) { if ((qlnxe_queue_count < 0) || (qlnxe_queue_count > QLNX_MAX_RSS)) { device_printf(ha->pci_dev, "invalid queue_count value (%d)\n", qlnxe_queue_count); qlnxe_queue_count = 0; } return; } static void qlnx_error_recovery_taskqueue(void *context, int pending) { qlnx_host_t *ha; ha = context; QL_DPRINT2(ha, "enter\n"); QLNX_LOCK(ha); qlnx_stop(ha); QLNX_UNLOCK(ha); #ifdef QLNX_ENABLE_IWARP qlnx_rdma_dev_remove(ha); #endif /* #ifdef QLNX_ENABLE_IWARP */ qlnx_slowpath_stop(ha); qlnx_slowpath_start(ha); #ifdef QLNX_ENABLE_IWARP qlnx_rdma_dev_add(ha); #endif /* #ifdef QLNX_ENABLE_IWARP */ qlnx_init(ha); callout_reset(&ha->qlnx_callout, hz, qlnx_timer, ha); QL_DPRINT2(ha, "exit\n"); return; } static int qlnx_create_error_recovery_taskqueue(qlnx_host_t *ha) { uint8_t tq_name[32]; bzero(tq_name, sizeof (tq_name)); snprintf(tq_name, sizeof (tq_name), "ql_err_tq"); TASK_INIT(&ha->err_task, 0, qlnx_error_recovery_taskqueue, ha); ha->err_taskqueue = taskqueue_create(tq_name, M_NOWAIT, taskqueue_thread_enqueue, &ha->err_taskqueue); if (ha->err_taskqueue == NULL) return (-1); taskqueue_start_threads(&ha->err_taskqueue, 1, PI_NET, "%s", tq_name); QL_DPRINT1(ha, "%p\n",ha->err_taskqueue); return (0); } static void qlnx_destroy_error_recovery_taskqueue(qlnx_host_t *ha) { if (ha->err_taskqueue != NULL) { taskqueue_drain(ha->err_taskqueue, &ha->err_task); taskqueue_free(ha->err_taskqueue); } ha->err_taskqueue = NULL; return; } /* * Name: qlnx_pci_attach * Function: attaches the device to the operating system */ static int qlnx_pci_attach(device_t dev) { qlnx_host_t *ha = NULL; uint32_t rsrc_len_reg = 0; uint32_t rsrc_len_dbells = 0; uint32_t rsrc_len_msix = 0; int i; uint32_t mfw_ver; uint32_t num_sp_msix = 0; uint32_t num_rdma_irqs = 0; if ((ha = device_get_softc(dev)) == NULL) { device_printf(dev, "cannot get softc\n"); return (ENOMEM); } memset(ha, 0, sizeof (qlnx_host_t)); ha->device_id = pci_get_device(dev); if (qlnx_valid_device(ha) != 0) { device_printf(dev, "device is not valid device\n"); return (ENXIO); } ha->pci_func = pci_get_function(dev); ha->pci_dev = dev; mtx_init(&ha->hw_lock, "qlnx_hw_lock", MTX_NETWORK_LOCK, MTX_DEF); ha->flags.lock_init = 1; pci_enable_busmaster(dev); /* * map the PCI BARs */ ha->reg_rid = PCIR_BAR(0); ha->pci_reg = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &ha->reg_rid, RF_ACTIVE); if (ha->pci_reg == NULL) { device_printf(dev, "unable to map BAR0\n"); goto qlnx_pci_attach_err; } rsrc_len_reg = (uint32_t) bus_get_resource_count(dev, SYS_RES_MEMORY, ha->reg_rid); ha->dbells_rid = PCIR_BAR(2); rsrc_len_dbells = (uint32_t) bus_get_resource_count(dev, SYS_RES_MEMORY, ha->dbells_rid); if (rsrc_len_dbells) { ha->pci_dbells = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &ha->dbells_rid, RF_ACTIVE); if (ha->pci_dbells == NULL) { device_printf(dev, "unable to map BAR1\n"); goto qlnx_pci_attach_err; } ha->dbells_phys_addr = (uint64_t) bus_get_resource_start(dev, SYS_RES_MEMORY, ha->dbells_rid); ha->dbells_size = rsrc_len_dbells; } else { if (qlnx_vf_device(ha) != 0) { device_printf(dev, " BAR1 size is zero\n"); goto qlnx_pci_attach_err; } } ha->msix_rid = PCIR_BAR(4); ha->msix_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &ha->msix_rid, RF_ACTIVE); if (ha->msix_bar == NULL) { device_printf(dev, "unable to map BAR2\n"); goto qlnx_pci_attach_err; } rsrc_len_msix = (uint32_t) bus_get_resource_count(dev, SYS_RES_MEMORY, ha->msix_rid); ha->dbg_level = 0x0000; QL_DPRINT1(ha, "\n\t\t\t" "pci_dev = %p pci_reg = %p, reg_len = 0x%08x reg_rid = 0x%08x" "\n\t\t\tdbells = %p, dbells_len = 0x%08x dbells_rid = 0x%08x" "\n\t\t\tmsix = %p, msix_len = 0x%08x msix_rid = 0x%08x" " msix_avail = 0x%x " "\n\t\t\t[ncpus = %d]\n", ha->pci_dev, ha->pci_reg, rsrc_len_reg, ha->reg_rid, ha->pci_dbells, rsrc_len_dbells, ha->dbells_rid, ha->msix_bar, rsrc_len_msix, ha->msix_rid, pci_msix_count(dev), mp_ncpus); /* * allocate dma tags */ if (qlnx_alloc_parent_dma_tag(ha)) goto qlnx_pci_attach_err; if (qlnx_alloc_tx_dma_tag(ha)) goto qlnx_pci_attach_err; if (qlnx_alloc_rx_dma_tag(ha)) goto qlnx_pci_attach_err; if (qlnx_init_hw(ha) != 0) goto qlnx_pci_attach_err; ha->flags.hw_init = 1; qlnx_get_params(ha); if((pci_get_device(dev) == QLOGIC_PCI_DEVICE_ID_1644) && (qlnxe_queue_count == QLNX_DEFAULT_RSS)) { qlnxe_queue_count = QLNX_MAX_RSS; } /* * Allocate MSI-x vectors */ if (qlnx_vf_device(ha) != 0) { if (qlnxe_queue_count == 0) ha->num_rss = QLNX_DEFAULT_RSS; else ha->num_rss = qlnxe_queue_count; num_sp_msix = ha->cdev.num_hwfns; } else { uint8_t max_rxq; uint8_t max_txq; ecore_vf_get_num_rxqs(&ha->cdev.hwfns[0], &max_rxq); ecore_vf_get_num_rxqs(&ha->cdev.hwfns[0], &max_txq); if (max_rxq < max_txq) ha->num_rss = max_rxq; else ha->num_rss = max_txq; if (ha->num_rss > QLNX_MAX_VF_RSS) ha->num_rss = QLNX_MAX_VF_RSS; num_sp_msix = 0; } if (ha->num_rss > mp_ncpus) ha->num_rss = mp_ncpus; ha->num_tc = QLNX_MAX_TC; ha->msix_count = pci_msix_count(dev); #ifdef QLNX_ENABLE_IWARP num_rdma_irqs = qlnx_rdma_get_num_irqs(ha); #endif /* #ifdef QLNX_ENABLE_IWARP */ if (!ha->msix_count || (ha->msix_count < (num_sp_msix + 1 + num_rdma_irqs))) { device_printf(dev, "%s: msix_count[%d] not enough\n", __func__, ha->msix_count); goto qlnx_pci_attach_err; } if (ha->msix_count > (ha->num_rss + num_sp_msix + num_rdma_irqs)) ha->msix_count = ha->num_rss + num_sp_msix + num_rdma_irqs; else ha->num_rss = ha->msix_count - (num_sp_msix + num_rdma_irqs); QL_DPRINT1(ha, "\n\t\t\t" "pci_reg = %p, reg_len = 0x%08x reg_rid = 0x%08x" "\n\t\t\tdbells = %p, dbells_len = 0x%08x dbells_rid = 0x%08x" "\n\t\t\tmsix = %p, msix_len = 0x%08x msix_rid = 0x%08x" " msix_avail = 0x%x msix_alloc = 0x%x" "\n\t\t\t[ncpus = %d][num_rss = 0x%x] [num_tc = 0x%x]\n", ha->pci_reg, rsrc_len_reg, ha->reg_rid, ha->pci_dbells, rsrc_len_dbells, ha->dbells_rid, ha->msix_bar, rsrc_len_msix, ha->msix_rid, pci_msix_count(dev), ha->msix_count, mp_ncpus, ha->num_rss, ha->num_tc); if (pci_alloc_msix(dev, &ha->msix_count)) { device_printf(dev, "%s: pci_alloc_msix[%d] failed\n", __func__, ha->msix_count); ha->msix_count = 0; goto qlnx_pci_attach_err; } /* * Initialize slow path interrupt and task queue */ if (num_sp_msix) { if (qlnx_create_sp_taskqueues(ha) != 0) goto qlnx_pci_attach_err; for (i = 0; i < ha->cdev.num_hwfns; i++) { struct ecore_hwfn *p_hwfn = &ha->cdev.hwfns[i]; ha->sp_irq_rid[i] = i + 1; ha->sp_irq[i] = bus_alloc_resource_any(dev, SYS_RES_IRQ, &ha->sp_irq_rid[i], (RF_ACTIVE | RF_SHAREABLE)); if (ha->sp_irq[i] == NULL) { device_printf(dev, "could not allocate mbx interrupt\n"); goto qlnx_pci_attach_err; } if (bus_setup_intr(dev, ha->sp_irq[i], (INTR_TYPE_NET | INTR_MPSAFE), NULL, qlnx_sp_intr, p_hwfn, &ha->sp_handle[i])) { device_printf(dev, "could not setup slow path interrupt\n"); goto qlnx_pci_attach_err; } QL_DPRINT1(ha, "p_hwfn [%p] sp_irq_rid %d" " sp_irq %p sp_handle %p\n", p_hwfn, ha->sp_irq_rid[i], ha->sp_irq[i], ha->sp_handle[i]); } } /* * initialize fast path interrupt */ if (qlnx_create_fp_taskqueues(ha) != 0) goto qlnx_pci_attach_err; for (i = 0; i < ha->num_rss; i++) { ha->irq_vec[i].rss_idx = i; ha->irq_vec[i].ha = ha; ha->irq_vec[i].irq_rid = (1 + num_sp_msix) + i; ha->irq_vec[i].irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &ha->irq_vec[i].irq_rid, (RF_ACTIVE | RF_SHAREABLE)); if (ha->irq_vec[i].irq == NULL) { device_printf(dev, "could not allocate interrupt[%d] irq_rid = %d\n", i, ha->irq_vec[i].irq_rid); goto qlnx_pci_attach_err; } if (qlnx_alloc_tx_br(ha, &ha->fp_array[i])) { device_printf(dev, "could not allocate tx_br[%d]\n", i); goto qlnx_pci_attach_err; } } if (qlnx_vf_device(ha) != 0) { callout_init(&ha->qlnx_callout, 1); ha->flags.callout_init = 1; for (i = 0; i < ha->cdev.num_hwfns; i++) { if (qlnx_grc_dumpsize(ha, &ha->grcdump_size[i], i) != 0) goto qlnx_pci_attach_err; if (ha->grcdump_size[i] == 0) goto qlnx_pci_attach_err; ha->grcdump_size[i] = ha->grcdump_size[i] << 2; QL_DPRINT1(ha, "grcdump_size[%d] = 0x%08x\n", i, ha->grcdump_size[i]); ha->grcdump[i] = qlnx_zalloc(ha->grcdump_size[i]); if (ha->grcdump[i] == NULL) { device_printf(dev, "grcdump alloc[%d] failed\n", i); goto qlnx_pci_attach_err; } if (qlnx_idle_chk_size(ha, &ha->idle_chk_size[i], i) != 0) goto qlnx_pci_attach_err; if (ha->idle_chk_size[i] == 0) goto qlnx_pci_attach_err; ha->idle_chk_size[i] = ha->idle_chk_size[i] << 2; QL_DPRINT1(ha, "idle_chk_size[%d] = 0x%08x\n", i, ha->idle_chk_size[i]); ha->idle_chk[i] = qlnx_zalloc(ha->idle_chk_size[i]); if (ha->idle_chk[i] == NULL) { device_printf(dev, "idle_chk alloc failed\n"); goto qlnx_pci_attach_err; } } if (qlnx_create_error_recovery_taskqueue(ha) != 0) goto qlnx_pci_attach_err; } if (qlnx_slowpath_start(ha) != 0) goto qlnx_pci_attach_err; else ha->flags.slowpath_start = 1; if (qlnx_vf_device(ha) != 0) { if (qlnx_get_flash_size(ha, &ha->flash_size) != 0) { qlnx_mdelay(__func__, 1000); qlnx_trigger_dump(ha); goto qlnx_pci_attach_err0; } if (qlnx_get_mfw_version(ha, &mfw_ver) != 0) { qlnx_mdelay(__func__, 1000); qlnx_trigger_dump(ha); goto qlnx_pci_attach_err0; } } else { struct ecore_hwfn *p_hwfn = &ha->cdev.hwfns[0]; ecore_mcp_get_mfw_ver(p_hwfn, NULL, &mfw_ver, NULL); } snprintf(ha->mfw_ver, sizeof(ha->mfw_ver), "%d.%d.%d.%d", ((mfw_ver >> 24) & 0xFF), ((mfw_ver >> 16) & 0xFF), ((mfw_ver >> 8) & 0xFF), (mfw_ver & 0xFF)); snprintf(ha->stormfw_ver, sizeof(ha->stormfw_ver), "%d.%d.%d.%d", FW_MAJOR_VERSION, FW_MINOR_VERSION, FW_REVISION_VERSION, FW_ENGINEERING_VERSION); QL_DPRINT1(ha, "STORM_FW version %s MFW version %s\n", ha->stormfw_ver, ha->mfw_ver); qlnx_init_ifnet(dev, ha); /* * add sysctls */ qlnx_add_sysctls(ha); qlnx_pci_attach_err0: /* * create ioctl device interface */ if (qlnx_vf_device(ha) != 0) { if (qlnx_make_cdev(ha)) { device_printf(dev, "%s: ql_make_cdev failed\n", __func__); goto qlnx_pci_attach_err; } #ifdef QLNX_ENABLE_IWARP qlnx_rdma_dev_add(ha); #endif /* #ifdef QLNX_ENABLE_IWARP */ } #ifndef QLNX_VF #ifdef CONFIG_ECORE_SRIOV if (qlnx_vf_device(ha) != 0) qlnx_initialize_sriov(ha); #endif /* #ifdef CONFIG_ECORE_SRIOV */ #endif /* #ifdef QLNX_VF */ QL_DPRINT2(ha, "success\n"); return (0); qlnx_pci_attach_err: qlnx_release(ha); return (ENXIO); } /* * Name: qlnx_pci_detach * Function: Unhooks the device from the operating system */ static int qlnx_pci_detach(device_t dev) { qlnx_host_t *ha = NULL; if ((ha = device_get_softc(dev)) == NULL) { device_printf(dev, "%s: cannot get softc\n", __func__); return (ENOMEM); } if (qlnx_vf_device(ha) != 0) { #ifdef CONFIG_ECORE_SRIOV int ret; ret = pci_iov_detach(dev); if (ret) { device_printf(dev, "%s: SRIOV in use\n", __func__); return (ret); } #endif /* #ifdef CONFIG_ECORE_SRIOV */ #ifdef QLNX_ENABLE_IWARP if (qlnx_rdma_dev_remove(ha) != 0) return (EBUSY); #endif /* #ifdef QLNX_ENABLE_IWARP */ } QLNX_LOCK(ha); qlnx_stop(ha); QLNX_UNLOCK(ha); qlnx_release(ha); return (0); } #ifdef QLNX_ENABLE_IWARP static uint8_t qlnx_get_personality(uint8_t pci_func) { uint8_t personality; personality = (qlnxe_rdma_configuration >> (pci_func * QLNX_PERSONALITY_BITS_PER_FUNC)) & QLNX_PERSONALIY_MASK; return (personality); } static void qlnx_set_personality(qlnx_host_t *ha) { struct ecore_hwfn *p_hwfn; uint8_t personality; p_hwfn = &ha->cdev.hwfns[0]; personality = qlnx_get_personality(ha->pci_func); switch (personality) { case QLNX_PERSONALITY_DEFAULT: device_printf(ha->pci_dev, "%s: DEFAULT\n", __func__); ha->personality = ECORE_PCI_DEFAULT; break; case QLNX_PERSONALITY_ETH_ONLY: device_printf(ha->pci_dev, "%s: ETH_ONLY\n", __func__); ha->personality = ECORE_PCI_ETH; break; case QLNX_PERSONALITY_ETH_IWARP: device_printf(ha->pci_dev, "%s: ETH_IWARP\n", __func__); ha->personality = ECORE_PCI_ETH_IWARP; break; case QLNX_PERSONALITY_ETH_ROCE: device_printf(ha->pci_dev, "%s: ETH_ROCE\n", __func__); ha->personality = ECORE_PCI_ETH_ROCE; break; } return; } #endif /* #ifdef QLNX_ENABLE_IWARP */ static int qlnx_init_hw(qlnx_host_t *ha) { int rval = 0; struct ecore_hw_prepare_params params; ecore_init_struct(&ha->cdev); /* ha->dp_module = ECORE_MSG_PROBE | ECORE_MSG_INTR | ECORE_MSG_SP | ECORE_MSG_LINK | ECORE_MSG_SPQ | ECORE_MSG_RDMA; ha->dp_level = ECORE_LEVEL_VERBOSE;*/ //ha->dp_module = ECORE_MSG_RDMA | ECORE_MSG_INTR | ECORE_MSG_LL2; ha->dp_level = ECORE_LEVEL_NOTICE; //ha->dp_level = ECORE_LEVEL_VERBOSE; ecore_init_dp(&ha->cdev, ha->dp_module, ha->dp_level, ha->pci_dev); ha->cdev.regview = ha->pci_reg; ha->personality = ECORE_PCI_DEFAULT; if (qlnx_vf_device(ha) == 0) { ha->cdev.b_is_vf = true; if (ha->pci_dbells != NULL) { ha->cdev.doorbells = ha->pci_dbells; ha->cdev.db_phys_addr = ha->dbells_phys_addr; ha->cdev.db_size = ha->dbells_size; } else { ha->pci_dbells = ha->pci_reg; } } else { ha->cdev.doorbells = ha->pci_dbells; ha->cdev.db_phys_addr = ha->dbells_phys_addr; ha->cdev.db_size = ha->dbells_size; #ifdef QLNX_ENABLE_IWARP if (qlnx_rdma_supported(ha) == 0) qlnx_set_personality(ha); #endif /* #ifdef QLNX_ENABLE_IWARP */ } QL_DPRINT2(ha, "%s: %s\n", __func__, (ha->personality == ECORE_PCI_ETH_IWARP ? "iwarp": "ethernet")); bzero(¶ms, sizeof (struct ecore_hw_prepare_params)); params.personality = ha->personality; params.drv_resc_alloc = false; params.chk_reg_fifo = false; params.initiate_pf_flr = true; params.epoch = 0; ecore_hw_prepare(&ha->cdev, ¶ms); qlnx_set_id(&ha->cdev, qlnx_name_str, qlnx_ver_str); QL_DPRINT1(ha, "ha = %p cdev = %p p_hwfn = %p\n", ha, &ha->cdev, &ha->cdev.hwfns[0]); return (rval); } static void qlnx_release(qlnx_host_t *ha) { device_t dev; int i; dev = ha->pci_dev; QL_DPRINT2(ha, "enter\n"); for (i = 0; i < QLNX_MAX_HW_FUNCS; i++) { if (ha->idle_chk[i] != NULL) { free(ha->idle_chk[i], M_QLNXBUF); ha->idle_chk[i] = NULL; } if (ha->grcdump[i] != NULL) { free(ha->grcdump[i], M_QLNXBUF); ha->grcdump[i] = NULL; } } if (ha->flags.callout_init) callout_drain(&ha->qlnx_callout); if (ha->flags.slowpath_start) { qlnx_slowpath_stop(ha); } if (ha->flags.hw_init) ecore_hw_remove(&ha->cdev); qlnx_del_cdev(ha); if (ha->ifp != NULL) ether_ifdetach(ha->ifp); qlnx_free_tx_dma_tag(ha); qlnx_free_rx_dma_tag(ha); qlnx_free_parent_dma_tag(ha); if (qlnx_vf_device(ha) != 0) { qlnx_destroy_error_recovery_taskqueue(ha); } for (i = 0; i < ha->num_rss; i++) { struct qlnx_fastpath *fp = &ha->fp_array[i]; if (ha->irq_vec[i].handle) { (void)bus_teardown_intr(dev, ha->irq_vec[i].irq, ha->irq_vec[i].handle); } if (ha->irq_vec[i].irq) { (void)bus_release_resource(dev, SYS_RES_IRQ, ha->irq_vec[i].irq_rid, ha->irq_vec[i].irq); } qlnx_free_tx_br(ha, fp); } qlnx_destroy_fp_taskqueues(ha); for (i = 0; i < ha->cdev.num_hwfns; i++) { if (ha->sp_handle[i]) (void)bus_teardown_intr(dev, ha->sp_irq[i], ha->sp_handle[i]); if (ha->sp_irq[i]) (void) bus_release_resource(dev, SYS_RES_IRQ, ha->sp_irq_rid[i], ha->sp_irq[i]); } qlnx_destroy_sp_taskqueues(ha); if (ha->msix_count) pci_release_msi(dev); if (ha->flags.lock_init) { mtx_destroy(&ha->hw_lock); } if (ha->pci_reg) (void) bus_release_resource(dev, SYS_RES_MEMORY, ha->reg_rid, ha->pci_reg); if (ha->dbells_size && ha->pci_dbells) (void) bus_release_resource(dev, SYS_RES_MEMORY, ha->dbells_rid, ha->pci_dbells); if (ha->msix_bar) (void) bus_release_resource(dev, SYS_RES_MEMORY, ha->msix_rid, ha->msix_bar); QL_DPRINT2(ha, "exit\n"); return; } static void qlnx_trigger_dump(qlnx_host_t *ha) { int i; if (ha->ifp != NULL) ha->ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE | IFF_DRV_RUNNING); QL_DPRINT2(ha, "enter\n"); if (qlnx_vf_device(ha) == 0) return; ha->error_recovery = 1; for (i = 0; i < ha->cdev.num_hwfns; i++) { qlnx_grc_dump(ha, &ha->grcdump_dwords[i], i); qlnx_idle_chk(ha, &ha->idle_chk_dwords[i], i); } QL_DPRINT2(ha, "exit\n"); return; } static int qlnx_trigger_dump_sysctl(SYSCTL_HANDLER_ARGS) { int err, ret = 0; qlnx_host_t *ha; err = sysctl_handle_int(oidp, &ret, 0, req); if (err || !req->newptr) return (err); if (ret == 1) { ha = (qlnx_host_t *)arg1; qlnx_trigger_dump(ha); } return (err); } static int qlnx_set_tx_coalesce(SYSCTL_HANDLER_ARGS) { int err, i, ret = 0, usecs = 0; qlnx_host_t *ha; struct ecore_hwfn *p_hwfn; struct qlnx_fastpath *fp; err = sysctl_handle_int(oidp, &usecs, 0, req); if (err || !req->newptr || !usecs || (usecs > 255)) return (err); ha = (qlnx_host_t *)arg1; if (qlnx_vf_device(ha) == 0) return (-1); for (i = 0; i < ha->num_rss; i++) { p_hwfn = &ha->cdev.hwfns[(i % ha->cdev.num_hwfns)]; fp = &ha->fp_array[i]; if (fp->txq[0]->handle != NULL) { ret = ecore_set_queue_coalesce(p_hwfn, 0, (uint16_t)usecs, fp->txq[0]->handle); } } if (!ret) ha->tx_coalesce_usecs = (uint8_t)usecs; return (err); } static int qlnx_set_rx_coalesce(SYSCTL_HANDLER_ARGS) { int err, i, ret = 0, usecs = 0; qlnx_host_t *ha; struct ecore_hwfn *p_hwfn; struct qlnx_fastpath *fp; err = sysctl_handle_int(oidp, &usecs, 0, req); if (err || !req->newptr || !usecs || (usecs > 255)) return (err); ha = (qlnx_host_t *)arg1; if (qlnx_vf_device(ha) == 0) return (-1); for (i = 0; i < ha->num_rss; i++) { p_hwfn = &ha->cdev.hwfns[(i % ha->cdev.num_hwfns)]; fp = &ha->fp_array[i]; if (fp->rxq->handle != NULL) { ret = ecore_set_queue_coalesce(p_hwfn, (uint16_t)usecs, 0, fp->rxq->handle); } } if (!ret) ha->rx_coalesce_usecs = (uint8_t)usecs; return (err); } static void qlnx_add_sp_stats_sysctls(qlnx_host_t *ha) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; struct sysctl_oid *ctx_oid; ctx = device_get_sysctl_ctx(ha->pci_dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(ha->pci_dev)); ctx_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "spstat", CTLFLAG_RD, NULL, "spstat"); children = SYSCTL_CHILDREN(ctx_oid); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "sp_interrupts", CTLFLAG_RD, &ha->sp_interrupts, "No. of slowpath interrupts"); return; } static void qlnx_add_fp_stats_sysctls(qlnx_host_t *ha) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; struct sysctl_oid_list *node_children; struct sysctl_oid *ctx_oid; int i, j; uint8_t name_str[16]; ctx = device_get_sysctl_ctx(ha->pci_dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(ha->pci_dev)); ctx_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "fpstat", CTLFLAG_RD, NULL, "fpstat"); children = SYSCTL_CHILDREN(ctx_oid); for (i = 0; i < ha->num_rss; i++) { bzero(name_str, (sizeof(uint8_t) * sizeof(name_str))); snprintf(name_str, sizeof(name_str), "%d", i); ctx_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, name_str, CTLFLAG_RD, NULL, name_str); node_children = SYSCTL_CHILDREN(ctx_oid); /* Tx Related */ SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_pkts_processed", CTLFLAG_RD, &ha->fp_array[i].tx_pkts_processed, "No. of packets processed for transmission"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_pkts_freed", CTLFLAG_RD, &ha->fp_array[i].tx_pkts_freed, "No. of freed packets"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_pkts_transmitted", CTLFLAG_RD, &ha->fp_array[i].tx_pkts_transmitted, "No. of transmitted packets"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_pkts_completed", CTLFLAG_RD, &ha->fp_array[i].tx_pkts_completed, "No. of transmit completions"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_non_tso_pkts", CTLFLAG_RD, &ha->fp_array[i].tx_non_tso_pkts, "No. of non LSO transmited packets"); #ifdef QLNX_TRACE_PERF_DATA SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_pkts_trans_ctx", CTLFLAG_RD, &ha->fp_array[i].tx_pkts_trans_ctx, "No. of transmitted packets in transmit context"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_pkts_compl_ctx", CTLFLAG_RD, &ha->fp_array[i].tx_pkts_compl_ctx, "No. of transmit completions in transmit context"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_pkts_trans_fp", CTLFLAG_RD, &ha->fp_array[i].tx_pkts_trans_fp, "No. of transmitted packets in taskqueue"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_pkts_compl_fp", CTLFLAG_RD, &ha->fp_array[i].tx_pkts_compl_fp, "No. of transmit completions in taskqueue"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_pkts_compl_intr", CTLFLAG_RD, &ha->fp_array[i].tx_pkts_compl_intr, "No. of transmit completions in interrupt ctx"); #endif SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_tso_pkts", CTLFLAG_RD, &ha->fp_array[i].tx_tso_pkts, "No. of LSO transmited packets"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_lso_wnd_min_len", CTLFLAG_RD, &ha->fp_array[i].tx_lso_wnd_min_len, "tx_lso_wnd_min_len"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_defrag", CTLFLAG_RD, &ha->fp_array[i].tx_defrag, "tx_defrag"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tx_nsegs_gt_elem_left", CTLFLAG_RD, &ha->fp_array[i].tx_nsegs_gt_elem_left, "tx_nsegs_gt_elem_left"); SYSCTL_ADD_UINT(ctx, node_children, OID_AUTO, "tx_tso_max_nsegs", CTLFLAG_RD, &ha->fp_array[i].tx_tso_max_nsegs, ha->fp_array[i].tx_tso_max_nsegs, "tx_tso_max_nsegs"); SYSCTL_ADD_UINT(ctx, node_children, OID_AUTO, "tx_tso_min_nsegs", CTLFLAG_RD, &ha->fp_array[i].tx_tso_min_nsegs, ha->fp_array[i].tx_tso_min_nsegs, "tx_tso_min_nsegs"); SYSCTL_ADD_UINT(ctx, node_children, OID_AUTO, "tx_tso_max_pkt_len", CTLFLAG_RD, &ha->fp_array[i].tx_tso_max_pkt_len, ha->fp_array[i].tx_tso_max_pkt_len, "tx_tso_max_pkt_len"); SYSCTL_ADD_UINT(ctx, node_children, OID_AUTO, "tx_tso_min_pkt_len", CTLFLAG_RD, &ha->fp_array[i].tx_tso_min_pkt_len, ha->fp_array[i].tx_tso_min_pkt_len, "tx_tso_min_pkt_len"); for (j = 0; j < QLNX_FP_MAX_SEGS; j++) { bzero(name_str, (sizeof(uint8_t) * sizeof(name_str))); snprintf(name_str, sizeof(name_str), "tx_pkts_nseg_%02d", (j+1)); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, name_str, CTLFLAG_RD, &ha->fp_array[i].tx_pkts[j], name_str); } #ifdef QLNX_TRACE_PERF_DATA for (j = 0; j < 18; j++) { bzero(name_str, (sizeof(uint8_t) * sizeof(name_str))); snprintf(name_str, sizeof(name_str), "tx_pkts_hist_%02d", (j+1)); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, name_str, CTLFLAG_RD, &ha->fp_array[i].tx_pkts_hist[j], name_str); } for (j = 0; j < 5; j++) { bzero(name_str, (sizeof(uint8_t) * sizeof(name_str))); snprintf(name_str, sizeof(name_str), "tx_comInt_%02d", (j+1)); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, name_str, CTLFLAG_RD, &ha->fp_array[i].tx_comInt[j], name_str); } for (j = 0; j < 18; j++) { bzero(name_str, (sizeof(uint8_t) * sizeof(name_str))); snprintf(name_str, sizeof(name_str), "tx_pkts_q_%02d", (j+1)); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, name_str, CTLFLAG_RD, &ha->fp_array[i].tx_pkts_q[j], name_str); } #endif SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_tx_nsegs_gt_elem_left", CTLFLAG_RD, &ha->fp_array[i].err_tx_nsegs_gt_elem_left, "err_tx_nsegs_gt_elem_left"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_tx_dmamap_create", CTLFLAG_RD, &ha->fp_array[i].err_tx_dmamap_create, "err_tx_dmamap_create"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_tx_defrag_dmamap_load", CTLFLAG_RD, &ha->fp_array[i].err_tx_defrag_dmamap_load, "err_tx_defrag_dmamap_load"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_tx_non_tso_max_seg", CTLFLAG_RD, &ha->fp_array[i].err_tx_non_tso_max_seg, "err_tx_non_tso_max_seg"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_tx_dmamap_load", CTLFLAG_RD, &ha->fp_array[i].err_tx_dmamap_load, "err_tx_dmamap_load"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_tx_defrag", CTLFLAG_RD, &ha->fp_array[i].err_tx_defrag, "err_tx_defrag"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_tx_free_pkt_null", CTLFLAG_RD, &ha->fp_array[i].err_tx_free_pkt_null, "err_tx_free_pkt_null"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_tx_cons_idx_conflict", CTLFLAG_RD, &ha->fp_array[i].err_tx_cons_idx_conflict, "err_tx_cons_idx_conflict"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "lro_cnt_64", CTLFLAG_RD, &ha->fp_array[i].lro_cnt_64, "lro_cnt_64"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "lro_cnt_128", CTLFLAG_RD, &ha->fp_array[i].lro_cnt_128, "lro_cnt_128"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "lro_cnt_256", CTLFLAG_RD, &ha->fp_array[i].lro_cnt_256, "lro_cnt_256"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "lro_cnt_512", CTLFLAG_RD, &ha->fp_array[i].lro_cnt_512, "lro_cnt_512"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "lro_cnt_1024", CTLFLAG_RD, &ha->fp_array[i].lro_cnt_1024, "lro_cnt_1024"); /* Rx Related */ SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "rx_pkts", CTLFLAG_RD, &ha->fp_array[i].rx_pkts, "No. of received packets"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tpa_start", CTLFLAG_RD, &ha->fp_array[i].tpa_start, "No. of tpa_start packets"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tpa_cont", CTLFLAG_RD, &ha->fp_array[i].tpa_cont, "No. of tpa_cont packets"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "tpa_end", CTLFLAG_RD, &ha->fp_array[i].tpa_end, "No. of tpa_end packets"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_m_getcl", CTLFLAG_RD, &ha->fp_array[i].err_m_getcl, "err_m_getcl"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_m_getjcl", CTLFLAG_RD, &ha->fp_array[i].err_m_getjcl, "err_m_getjcl"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_rx_hw_errors", CTLFLAG_RD, &ha->fp_array[i].err_rx_hw_errors, "err_rx_hw_errors"); SYSCTL_ADD_QUAD(ctx, node_children, OID_AUTO, "err_rx_alloc_errors", CTLFLAG_RD, &ha->fp_array[i].err_rx_alloc_errors, "err_rx_alloc_errors"); } return; } static void qlnx_add_hw_stats_sysctls(qlnx_host_t *ha) { struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; struct sysctl_oid *ctx_oid; ctx = device_get_sysctl_ctx(ha->pci_dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(ha->pci_dev)); ctx_oid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO, "hwstat", CTLFLAG_RD, NULL, "hwstat"); children = SYSCTL_CHILDREN(ctx_oid); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "no_buff_discards", CTLFLAG_RD, &ha->hw_stats.common.no_buff_discards, "No. of packets discarded due to lack of buffer"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "packet_too_big_discard", CTLFLAG_RD, &ha->hw_stats.common.packet_too_big_discard, "No. of packets discarded because packet was too big"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "ttl0_discard", CTLFLAG_RD, &ha->hw_stats.common.ttl0_discard, "ttl0_discard"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_ucast_bytes", CTLFLAG_RD, &ha->hw_stats.common.rx_ucast_bytes, "rx_ucast_bytes"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_mcast_bytes", CTLFLAG_RD, &ha->hw_stats.common.rx_mcast_bytes, "rx_mcast_bytes"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_bcast_bytes", CTLFLAG_RD, &ha->hw_stats.common.rx_bcast_bytes, "rx_bcast_bytes"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_ucast_pkts", CTLFLAG_RD, &ha->hw_stats.common.rx_ucast_pkts, "rx_ucast_pkts"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_mcast_pkts", CTLFLAG_RD, &ha->hw_stats.common.rx_mcast_pkts, "rx_mcast_pkts"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_bcast_pkts", CTLFLAG_RD, &ha->hw_stats.common.rx_bcast_pkts, "rx_bcast_pkts"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "mftag_filter_discards", CTLFLAG_RD, &ha->hw_stats.common.mftag_filter_discards, "mftag_filter_discards"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "mac_filter_discards", CTLFLAG_RD, &ha->hw_stats.common.mac_filter_discards, "mac_filter_discards"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_ucast_bytes", CTLFLAG_RD, &ha->hw_stats.common.tx_ucast_bytes, "tx_ucast_bytes"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_mcast_bytes", CTLFLAG_RD, &ha->hw_stats.common.tx_mcast_bytes, "tx_mcast_bytes"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_bcast_bytes", CTLFLAG_RD, &ha->hw_stats.common.tx_bcast_bytes, "tx_bcast_bytes"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_ucast_pkts", CTLFLAG_RD, &ha->hw_stats.common.tx_ucast_pkts, "tx_ucast_pkts"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_mcast_pkts", CTLFLAG_RD, &ha->hw_stats.common.tx_mcast_pkts, "tx_mcast_pkts"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_bcast_pkts", CTLFLAG_RD, &ha->hw_stats.common.tx_bcast_pkts, "tx_bcast_pkts"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_err_drop_pkts", CTLFLAG_RD, &ha->hw_stats.common.tx_err_drop_pkts, "tx_err_drop_pkts"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tpa_coalesced_pkts", CTLFLAG_RD, &ha->hw_stats.common.tpa_coalesced_pkts, "tpa_coalesced_pkts"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tpa_coalesced_events", CTLFLAG_RD, &ha->hw_stats.common.tpa_coalesced_events, "tpa_coalesced_events"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tpa_aborts_num", CTLFLAG_RD, &ha->hw_stats.common.tpa_aborts_num, "tpa_aborts_num"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tpa_not_coalesced_pkts", CTLFLAG_RD, &ha->hw_stats.common.tpa_not_coalesced_pkts, "tpa_not_coalesced_pkts"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tpa_coalesced_bytes", CTLFLAG_RD, &ha->hw_stats.common.tpa_coalesced_bytes, "tpa_coalesced_bytes"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_64_byte_packets", CTLFLAG_RD, &ha->hw_stats.common.rx_64_byte_packets, "rx_64_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_65_to_127_byte_packets", CTLFLAG_RD, &ha->hw_stats.common.rx_65_to_127_byte_packets, "rx_65_to_127_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_128_to_255_byte_packets", CTLFLAG_RD, &ha->hw_stats.common.rx_128_to_255_byte_packets, "rx_128_to_255_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_256_to_511_byte_packets", CTLFLAG_RD, &ha->hw_stats.common.rx_256_to_511_byte_packets, "rx_256_to_511_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_512_to_1023_byte_packets", CTLFLAG_RD, &ha->hw_stats.common.rx_512_to_1023_byte_packets, "rx_512_to_1023_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_1024_to_1518_byte_packets", CTLFLAG_RD, &ha->hw_stats.common.rx_1024_to_1518_byte_packets, "rx_1024_to_1518_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_1519_to_1522_byte_packets", CTLFLAG_RD, &ha->hw_stats.bb.rx_1519_to_1522_byte_packets, "rx_1519_to_1522_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_1523_to_2047_byte_packets", CTLFLAG_RD, &ha->hw_stats.bb.rx_1519_to_2047_byte_packets, "rx_1523_to_2047_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_2048_to_4095_byte_packets", CTLFLAG_RD, &ha->hw_stats.bb.rx_2048_to_4095_byte_packets, "rx_2048_to_4095_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_4096_to_9216_byte_packets", CTLFLAG_RD, &ha->hw_stats.bb.rx_4096_to_9216_byte_packets, "rx_4096_to_9216_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_9217_to_16383_byte_packets", CTLFLAG_RD, &ha->hw_stats.bb.rx_9217_to_16383_byte_packets, "rx_9217_to_16383_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_crc_errors", CTLFLAG_RD, &ha->hw_stats.common.rx_crc_errors, "rx_crc_errors"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_mac_crtl_frames", CTLFLAG_RD, &ha->hw_stats.common.rx_mac_crtl_frames, "rx_mac_crtl_frames"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_pause_frames", CTLFLAG_RD, &ha->hw_stats.common.rx_pause_frames, "rx_pause_frames"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_pfc_frames", CTLFLAG_RD, &ha->hw_stats.common.rx_pfc_frames, "rx_pfc_frames"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_align_errors", CTLFLAG_RD, &ha->hw_stats.common.rx_align_errors, "rx_align_errors"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_carrier_errors", CTLFLAG_RD, &ha->hw_stats.common.rx_carrier_errors, "rx_carrier_errors"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_oversize_packets", CTLFLAG_RD, &ha->hw_stats.common.rx_oversize_packets, "rx_oversize_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_jabbers", CTLFLAG_RD, &ha->hw_stats.common.rx_jabbers, "rx_jabbers"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_undersize_packets", CTLFLAG_RD, &ha->hw_stats.common.rx_undersize_packets, "rx_undersize_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_fragments", CTLFLAG_RD, &ha->hw_stats.common.rx_fragments, "rx_fragments"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_64_byte_packets", CTLFLAG_RD, &ha->hw_stats.common.tx_64_byte_packets, "tx_64_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_65_to_127_byte_packets", CTLFLAG_RD, &ha->hw_stats.common.tx_65_to_127_byte_packets, "tx_65_to_127_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_128_to_255_byte_packets", CTLFLAG_RD, &ha->hw_stats.common.tx_128_to_255_byte_packets, "tx_128_to_255_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_256_to_511_byte_packets", CTLFLAG_RD, &ha->hw_stats.common.tx_256_to_511_byte_packets, "tx_256_to_511_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_512_to_1023_byte_packets", CTLFLAG_RD, &ha->hw_stats.common.tx_512_to_1023_byte_packets, "tx_512_to_1023_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_1024_to_1518_byte_packets", CTLFLAG_RD, &ha->hw_stats.common.tx_1024_to_1518_byte_packets, "tx_1024_to_1518_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_1519_to_2047_byte_packets", CTLFLAG_RD, &ha->hw_stats.bb.tx_1519_to_2047_byte_packets, "tx_1519_to_2047_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_2048_to_4095_byte_packets", CTLFLAG_RD, &ha->hw_stats.bb.tx_2048_to_4095_byte_packets, "tx_2048_to_4095_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_4096_to_9216_byte_packets", CTLFLAG_RD, &ha->hw_stats.bb.tx_4096_to_9216_byte_packets, "tx_4096_to_9216_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_9217_to_16383_byte_packets", CTLFLAG_RD, &ha->hw_stats.bb.tx_9217_to_16383_byte_packets, "tx_9217_to_16383_byte_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_pause_frames", CTLFLAG_RD, &ha->hw_stats.common.tx_pause_frames, "tx_pause_frames"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_pfc_frames", CTLFLAG_RD, &ha->hw_stats.common.tx_pfc_frames, "tx_pfc_frames"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_lpi_entry_count", CTLFLAG_RD, &ha->hw_stats.bb.tx_lpi_entry_count, "tx_lpi_entry_count"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_total_collisions", CTLFLAG_RD, &ha->hw_stats.bb.tx_total_collisions, "tx_total_collisions"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "brb_truncates", CTLFLAG_RD, &ha->hw_stats.common.brb_truncates, "brb_truncates"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "brb_discards", CTLFLAG_RD, &ha->hw_stats.common.brb_discards, "brb_discards"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_mac_bytes", CTLFLAG_RD, &ha->hw_stats.common.rx_mac_bytes, "rx_mac_bytes"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_mac_uc_packets", CTLFLAG_RD, &ha->hw_stats.common.rx_mac_uc_packets, "rx_mac_uc_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_mac_mc_packets", CTLFLAG_RD, &ha->hw_stats.common.rx_mac_mc_packets, "rx_mac_mc_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_mac_bc_packets", CTLFLAG_RD, &ha->hw_stats.common.rx_mac_bc_packets, "rx_mac_bc_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "rx_mac_frames_ok", CTLFLAG_RD, &ha->hw_stats.common.rx_mac_frames_ok, "rx_mac_frames_ok"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_mac_bytes", CTLFLAG_RD, &ha->hw_stats.common.tx_mac_bytes, "tx_mac_bytes"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_mac_uc_packets", CTLFLAG_RD, &ha->hw_stats.common.tx_mac_uc_packets, "tx_mac_uc_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_mac_mc_packets", CTLFLAG_RD, &ha->hw_stats.common.tx_mac_mc_packets, "tx_mac_mc_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_mac_bc_packets", CTLFLAG_RD, &ha->hw_stats.common.tx_mac_bc_packets, "tx_mac_bc_packets"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "tx_mac_ctrl_frames", CTLFLAG_RD, &ha->hw_stats.common.tx_mac_ctrl_frames, "tx_mac_ctrl_frames"); return; } static void qlnx_add_sysctls(qlnx_host_t *ha) { device_t dev = ha->pci_dev; struct sysctl_ctx_list *ctx; struct sysctl_oid_list *children; ctx = device_get_sysctl_ctx(dev); children = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); qlnx_add_fp_stats_sysctls(ha); qlnx_add_sp_stats_sysctls(ha); if (qlnx_vf_device(ha) != 0) qlnx_add_hw_stats_sysctls(ha); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "Driver_Version", CTLFLAG_RD, qlnx_ver_str, 0, "Driver Version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "STORMFW_Version", CTLFLAG_RD, ha->stormfw_ver, 0, "STORM Firmware Version"); SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "MFW_Version", CTLFLAG_RD, ha->mfw_ver, 0, "Management Firmware Version"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "personality", CTLFLAG_RD, &ha->personality, ha->personality, "\tpersonality = 0 => Ethernet Only\n" "\tpersonality = 3 => Ethernet and RoCE\n" "\tpersonality = 4 => Ethernet and iWARP\n" "\tpersonality = 6 => Default in Shared Memory\n"); ha->dbg_level = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "debug", CTLFLAG_RW, &ha->dbg_level, ha->dbg_level, "Debug Level"); ha->dp_level = 0x01; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "dp_level", CTLFLAG_RW, &ha->dp_level, ha->dp_level, "DP Level"); ha->dbg_trace_lro_cnt = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "dbg_trace_lro_cnt", CTLFLAG_RW, &ha->dbg_trace_lro_cnt, ha->dbg_trace_lro_cnt, "Trace LRO Counts"); ha->dbg_trace_tso_pkt_len = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "dbg_trace_tso_pkt_len", CTLFLAG_RW, &ha->dbg_trace_tso_pkt_len, ha->dbg_trace_tso_pkt_len, "Trace TSO packet lengths"); ha->dp_module = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "dp_module", CTLFLAG_RW, &ha->dp_module, ha->dp_module, "DP Module"); ha->err_inject = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "err_inject", CTLFLAG_RW, &ha->err_inject, ha->err_inject, "Error Inject"); ha->storm_stats_enable = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "storm_stats_enable", CTLFLAG_RW, &ha->storm_stats_enable, ha->storm_stats_enable, "Enable Storm Statistics Gathering"); ha->storm_stats_index = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "storm_stats_index", CTLFLAG_RD, &ha->storm_stats_index, ha->storm_stats_index, "Enable Storm Statistics Gathering Current Index"); ha->grcdump_taken = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "grcdump_taken", CTLFLAG_RD, &ha->grcdump_taken, ha->grcdump_taken, "grcdump_taken"); ha->idle_chk_taken = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "idle_chk_taken", CTLFLAG_RD, &ha->idle_chk_taken, ha->idle_chk_taken, "idle_chk_taken"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_coalesce_usecs", CTLFLAG_RD, &ha->rx_coalesce_usecs, ha->rx_coalesce_usecs, "rx_coalesce_usecs"); SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "tx_coalesce_usecs", CTLFLAG_RD, &ha->tx_coalesce_usecs, ha->tx_coalesce_usecs, "tx_coalesce_usecs"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "trigger_dump", (CTLTYPE_INT | CTLFLAG_RW), (void *)ha, 0, qlnx_trigger_dump_sysctl, "I", "trigger_dump"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "set_rx_coalesce_usecs", (CTLTYPE_INT | CTLFLAG_RW), (void *)ha, 0, qlnx_set_rx_coalesce, "I", "rx interrupt coalesce period microseconds"); SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "set_tx_coalesce_usecs", (CTLTYPE_INT | CTLFLAG_RW), (void *)ha, 0, qlnx_set_tx_coalesce, "I", "tx interrupt coalesce period microseconds"); ha->rx_pkt_threshold = 128; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_pkt_threshold", CTLFLAG_RW, &ha->rx_pkt_threshold, ha->rx_pkt_threshold, "No. of Rx Pkts to process at a time"); ha->rx_jumbo_buf_eq_mtu = 0; SYSCTL_ADD_UINT(ctx, children, OID_AUTO, "rx_jumbo_buf_eq_mtu", CTLFLAG_RW, &ha->rx_jumbo_buf_eq_mtu, ha->rx_jumbo_buf_eq_mtu, "== 0 => Rx Jumbo buffers are capped to 4Kbytes\n" "otherwise Rx Jumbo buffers are set to >= MTU size\n"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "err_illegal_intr", CTLFLAG_RD, &ha->err_illegal_intr, "err_illegal_intr"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "err_fp_null", CTLFLAG_RD, &ha->err_fp_null, "err_fp_null"); SYSCTL_ADD_QUAD(ctx, children, OID_AUTO, "err_get_proto_invalid_type", CTLFLAG_RD, &ha->err_get_proto_invalid_type, "err_get_proto_invalid_type"); return; } /***************************************************************************** * Operating System Network Interface Functions *****************************************************************************/ static void qlnx_init_ifnet(device_t dev, qlnx_host_t *ha) { uint16_t device_id; struct ifnet *ifp; ifp = ha->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) panic("%s: cannot if_alloc()\n", device_get_nameunit(dev)); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); device_id = pci_get_device(ha->pci_dev); #if __FreeBSD_version >= 1000000 if (device_id == QLOGIC_PCI_DEVICE_ID_1634) ifp->if_baudrate = IF_Gbps(40); else if ((device_id == QLOGIC_PCI_DEVICE_ID_1656) || (device_id == QLOGIC_PCI_DEVICE_ID_8070)) ifp->if_baudrate = IF_Gbps(25); else if (device_id == QLOGIC_PCI_DEVICE_ID_1654) ifp->if_baudrate = IF_Gbps(50); else if (device_id == QLOGIC_PCI_DEVICE_ID_1644) ifp->if_baudrate = IF_Gbps(100); ifp->if_capabilities = IFCAP_LINKSTATE; #else ifp->if_mtu = ETHERMTU; ifp->if_baudrate = (1 * 1000 * 1000 *1000); #endif /* #if __FreeBSD_version >= 1000000 */ ifp->if_init = qlnx_init; ifp->if_softc = ha; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = qlnx_ioctl; ifp->if_transmit = qlnx_transmit; ifp->if_qflush = qlnx_qflush; IFQ_SET_MAXLEN(&ifp->if_snd, qlnx_get_ifq_snd_maxlen(ha)); ifp->if_snd.ifq_drv_maxlen = qlnx_get_ifq_snd_maxlen(ha); IFQ_SET_READY(&ifp->if_snd); #if __FreeBSD_version >= 1100036 if_setgetcounterfn(ifp, qlnx_get_counter); #endif ha->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; memcpy(ha->primary_mac, qlnx_get_mac_addr(ha), ETH_ALEN); if (!ha->primary_mac[0] && !ha->primary_mac[1] && !ha->primary_mac[2] && !ha->primary_mac[3] && !ha->primary_mac[4] && !ha->primary_mac[5]) { uint32_t rnd; rnd = arc4random(); ha->primary_mac[0] = 0x00; ha->primary_mac[1] = 0x0e; ha->primary_mac[2] = 0x1e; ha->primary_mac[3] = rnd & 0xFF; ha->primary_mac[4] = (rnd >> 8) & 0xFF; ha->primary_mac[5] = (rnd >> 16) & 0xFF; } ether_ifattach(ifp, ha->primary_mac); bcopy(IF_LLADDR(ha->ifp), ha->primary_mac, ETHER_ADDR_LEN); ifp->if_capabilities = IFCAP_HWCSUM; ifp->if_capabilities |= IFCAP_JUMBO_MTU; ifp->if_capabilities |= IFCAP_VLAN_MTU; ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING; ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; ifp->if_capabilities |= IFCAP_VLAN_HWCSUM; ifp->if_capabilities |= IFCAP_VLAN_HWTSO; ifp->if_capabilities |= IFCAP_TSO4; ifp->if_capabilities |= IFCAP_TSO6; ifp->if_capabilities |= IFCAP_LRO; ifp->if_hw_tsomax = QLNX_MAX_TSO_FRAME_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); ifp->if_hw_tsomaxsegcount = QLNX_MAX_SEGMENTS - 1 /* hdr */; ifp->if_hw_tsomaxsegsize = QLNX_MAX_TX_MBUF_SIZE; ifp->if_capenable = ifp->if_capabilities; ifp->if_hwassist = CSUM_IP; ifp->if_hwassist |= CSUM_TCP | CSUM_UDP; ifp->if_hwassist |= CSUM_TCP_IPV6 | CSUM_UDP_IPV6; ifp->if_hwassist |= CSUM_TSO; ifp->if_hdrlen = sizeof(struct ether_vlan_header); ifmedia_init(&ha->media, IFM_IMASK, qlnx_media_change,\ qlnx_media_status); if (device_id == QLOGIC_PCI_DEVICE_ID_1634) { ifmedia_add(&ha->media, (IFM_ETHER | IFM_40G_LR4), 0, NULL); ifmedia_add(&ha->media, (IFM_ETHER | IFM_40G_SR4), 0, NULL); ifmedia_add(&ha->media, (IFM_ETHER | IFM_40G_CR4), 0, NULL); } else if ((device_id == QLOGIC_PCI_DEVICE_ID_1656) || (device_id == QLOGIC_PCI_DEVICE_ID_8070)) { ifmedia_add(&ha->media, (IFM_ETHER | QLNX_IFM_25G_SR), 0, NULL); ifmedia_add(&ha->media, (IFM_ETHER | QLNX_IFM_25G_CR), 0, NULL); } else if (device_id == QLOGIC_PCI_DEVICE_ID_1654) { ifmedia_add(&ha->media, (IFM_ETHER | IFM_50G_KR2), 0, NULL); ifmedia_add(&ha->media, (IFM_ETHER | IFM_50G_CR2), 0, NULL); } else if (device_id == QLOGIC_PCI_DEVICE_ID_1644) { ifmedia_add(&ha->media, (IFM_ETHER | QLNX_IFM_100G_LR4), 0, NULL); ifmedia_add(&ha->media, (IFM_ETHER | QLNX_IFM_100G_SR4), 0, NULL); ifmedia_add(&ha->media, (IFM_ETHER | QLNX_IFM_100G_CR4), 0, NULL); } ifmedia_add(&ha->media, (IFM_ETHER | IFM_FDX), 0, NULL); ifmedia_add(&ha->media, (IFM_ETHER | IFM_AUTO), 0, NULL); ifmedia_set(&ha->media, (IFM_ETHER | IFM_AUTO)); QL_DPRINT2(ha, "exit\n"); return; } static void qlnx_init_locked(qlnx_host_t *ha) { struct ifnet *ifp = ha->ifp; QL_DPRINT1(ha, "Driver Initialization start \n"); qlnx_stop(ha); if (qlnx_load(ha) == 0) { ifp->if_drv_flags |= IFF_DRV_RUNNING; ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; #ifdef QLNX_ENABLE_IWARP if (qlnx_vf_device(ha) != 0) { qlnx_rdma_dev_open(ha); } #endif /* #ifdef QLNX_ENABLE_IWARP */ } return; } static void qlnx_init(void *arg) { qlnx_host_t *ha; ha = (qlnx_host_t *)arg; QL_DPRINT2(ha, "enter\n"); QLNX_LOCK(ha); qlnx_init_locked(ha); QLNX_UNLOCK(ha); QL_DPRINT2(ha, "exit\n"); return; } static int qlnx_config_mcast_mac_addr(qlnx_host_t *ha, uint8_t *mac_addr, uint32_t add_mac) { struct ecore_filter_mcast *mcast; struct ecore_dev *cdev; int rc; cdev = &ha->cdev; mcast = &ha->ecore_mcast; bzero(mcast, sizeof(struct ecore_filter_mcast)); if (add_mac) mcast->opcode = ECORE_FILTER_ADD; else mcast->opcode = ECORE_FILTER_REMOVE; mcast->num_mc_addrs = 1; memcpy(mcast->mac, mac_addr, ETH_ALEN); rc = ecore_filter_mcast_cmd(cdev, mcast, ECORE_SPQ_MODE_CB, NULL); return (rc); } static int qlnx_hw_add_mcast(qlnx_host_t *ha, uint8_t *mta) { int i; for (i = 0; i < QLNX_MAX_NUM_MULTICAST_ADDRS; i++) { if (QL_MAC_CMP(ha->mcast[i].addr, mta) == 0) return 0; /* its been already added */ } for (i = 0; i < QLNX_MAX_NUM_MULTICAST_ADDRS; i++) { if ((ha->mcast[i].addr[0] == 0) && (ha->mcast[i].addr[1] == 0) && (ha->mcast[i].addr[2] == 0) && (ha->mcast[i].addr[3] == 0) && (ha->mcast[i].addr[4] == 0) && (ha->mcast[i].addr[5] == 0)) { if (qlnx_config_mcast_mac_addr(ha, mta, 1)) return (-1); bcopy(mta, ha->mcast[i].addr, ETH_ALEN); ha->nmcast++; return 0; } } return 0; } static int qlnx_hw_del_mcast(qlnx_host_t *ha, uint8_t *mta) { int i; for (i = 0; i < QLNX_MAX_NUM_MULTICAST_ADDRS; i++) { if (QL_MAC_CMP(ha->mcast[i].addr, mta) == 0) { if (qlnx_config_mcast_mac_addr(ha, mta, 0)) return (-1); ha->mcast[i].addr[0] = 0; ha->mcast[i].addr[1] = 0; ha->mcast[i].addr[2] = 0; ha->mcast[i].addr[3] = 0; ha->mcast[i].addr[4] = 0; ha->mcast[i].addr[5] = 0; ha->nmcast--; return 0; } } return 0; } /* * Name: qls_hw_set_multi * Function: Sets the Multicast Addresses provided the host O.S into the * hardware (for the given interface) */ static void qlnx_hw_set_multi(qlnx_host_t *ha, uint8_t *mta, uint32_t mcnt, uint32_t add_mac) { int i; for (i = 0; i < mcnt; i++) { if (add_mac) { if (qlnx_hw_add_mcast(ha, mta)) break; } else { if (qlnx_hw_del_mcast(ha, mta)) break; } mta += ETHER_HDR_LEN; } return; } #define QLNX_MCAST_ADDRS_SIZE (QLNX_MAX_NUM_MULTICAST_ADDRS * ETHER_HDR_LEN) static int qlnx_set_multi(qlnx_host_t *ha, uint32_t add_multi) { uint8_t mta[QLNX_MCAST_ADDRS_SIZE]; struct ifmultiaddr *ifma; int mcnt = 0; struct ifnet *ifp = ha->ifp; int ret = 0; if (qlnx_vf_device(ha) == 0) return (0); if_maddr_rlock(ifp); CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { if (ifma->ifma_addr->sa_family != AF_LINK) continue; if (mcnt == QLNX_MAX_NUM_MULTICAST_ADDRS) break; bcopy(LLADDR((struct sockaddr_dl *) ifma->ifma_addr), &mta[mcnt * ETHER_HDR_LEN], ETHER_HDR_LEN); mcnt++; } if_maddr_runlock(ifp); QLNX_LOCK(ha); qlnx_hw_set_multi(ha, mta, mcnt, add_multi); QLNX_UNLOCK(ha); return (ret); } static int qlnx_set_promisc(qlnx_host_t *ha) { int rc = 0; uint8_t filter; if (qlnx_vf_device(ha) == 0) return (0); filter = ha->filter; filter |= ECORE_ACCEPT_MCAST_UNMATCHED; filter |= ECORE_ACCEPT_UCAST_UNMATCHED; rc = qlnx_set_rx_accept_filter(ha, filter); return (rc); } static int qlnx_set_allmulti(qlnx_host_t *ha) { int rc = 0; uint8_t filter; if (qlnx_vf_device(ha) == 0) return (0); filter = ha->filter; filter |= ECORE_ACCEPT_MCAST_UNMATCHED; rc = qlnx_set_rx_accept_filter(ha, filter); return (rc); } static int qlnx_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { int ret = 0, mask; struct ifreq *ifr = (struct ifreq *)data; struct ifaddr *ifa = (struct ifaddr *)data; qlnx_host_t *ha; ha = (qlnx_host_t *)ifp->if_softc; switch (cmd) { case SIOCSIFADDR: QL_DPRINT4(ha, "SIOCSIFADDR (0x%lx)\n", cmd); if (ifa->ifa_addr->sa_family == AF_INET) { ifp->if_flags |= IFF_UP; if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) { QLNX_LOCK(ha); qlnx_init_locked(ha); QLNX_UNLOCK(ha); } QL_DPRINT4(ha, "SIOCSIFADDR (0x%lx) ipv4 [0x%08x]\n", cmd, ntohl(IA_SIN(ifa)->sin_addr.s_addr)); arp_ifinit(ifp, ifa); } else { ether_ioctl(ifp, cmd, data); } break; case SIOCSIFMTU: QL_DPRINT4(ha, "SIOCSIFMTU (0x%lx)\n", cmd); if (ifr->ifr_mtu > QLNX_MAX_MTU) { ret = EINVAL; } else { QLNX_LOCK(ha); ifp->if_mtu = ifr->ifr_mtu; ha->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { qlnx_init_locked(ha); } QLNX_UNLOCK(ha); } break; case SIOCSIFFLAGS: QL_DPRINT4(ha, "SIOCSIFFLAGS (0x%lx)\n", cmd); QLNX_LOCK(ha); if (ifp->if_flags & IFF_UP) { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if ((ifp->if_flags ^ ha->if_flags) & IFF_PROMISC) { ret = qlnx_set_promisc(ha); } else if ((ifp->if_flags ^ ha->if_flags) & IFF_ALLMULTI) { ret = qlnx_set_allmulti(ha); } } else { ha->max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN; qlnx_init_locked(ha); } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) qlnx_stop(ha); ha->if_flags = ifp->if_flags; } QLNX_UNLOCK(ha); break; case SIOCADDMULTI: QL_DPRINT4(ha, "%s (0x%lx)\n", "SIOCADDMULTI", cmd); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if (qlnx_set_multi(ha, 1)) ret = EINVAL; } break; case SIOCDELMULTI: QL_DPRINT4(ha, "%s (0x%lx)\n", "SIOCDELMULTI", cmd); if (ifp->if_drv_flags & IFF_DRV_RUNNING) { if (qlnx_set_multi(ha, 0)) ret = EINVAL; } break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: QL_DPRINT4(ha, "SIOCSIFMEDIA/SIOCGIFMEDIA (0x%lx)\n", cmd); ret = ifmedia_ioctl(ifp, ifr, &ha->media, cmd); break; case SIOCSIFCAP: mask = ifr->ifr_reqcap ^ ifp->if_capenable; QL_DPRINT4(ha, "SIOCSIFCAP (0x%lx)\n", cmd); if (mask & IFCAP_HWCSUM) ifp->if_capenable ^= IFCAP_HWCSUM; if (mask & IFCAP_TSO4) ifp->if_capenable ^= IFCAP_TSO4; if (mask & IFCAP_TSO6) ifp->if_capenable ^= IFCAP_TSO6; if (mask & IFCAP_VLAN_HWTAGGING) ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; if (mask & IFCAP_VLAN_HWTSO) ifp->if_capenable ^= IFCAP_VLAN_HWTSO; if (mask & IFCAP_LRO) ifp->if_capenable ^= IFCAP_LRO; QLNX_LOCK(ha); if (ifp->if_drv_flags & IFF_DRV_RUNNING) qlnx_init_locked(ha); QLNX_UNLOCK(ha); VLAN_CAPABILITIES(ifp); break; #if (__FreeBSD_version >= 1100101) case SIOCGI2C: { struct ifi2creq i2c; struct ecore_hwfn *p_hwfn = &ha->cdev.hwfns[0]; struct ecore_ptt *p_ptt; ret = copyin(ifr_data_get_ptr(ifr), &i2c, sizeof(i2c)); if (ret) break; if ((i2c.len > sizeof (i2c.data)) || (i2c.dev_addr != 0xA0 && i2c.dev_addr != 0xA2)) { ret = EINVAL; break; } p_ptt = ecore_ptt_acquire(p_hwfn); if (!p_ptt) { QL_DPRINT1(ha, "ecore_ptt_acquire failed\n"); ret = -1; break; } ret = ecore_mcp_phy_sfp_read(p_hwfn, p_ptt, (ha->pci_func & 0x1), i2c.dev_addr, i2c.offset, i2c.len, &i2c.data[0]); ecore_ptt_release(p_hwfn, p_ptt); if (ret) { ret = -1; break; } ret = copyout(&i2c, ifr_data_get_ptr(ifr), sizeof(i2c)); QL_DPRINT8(ha, "SIOCGI2C copyout ret = %d \ len = %d addr = 0x%02x offset = 0x%04x \ data[0..7]=0x%02x 0x%02x 0x%02x 0x%02x 0x%02x \ 0x%02x 0x%02x 0x%02x\n", ret, i2c.len, i2c.dev_addr, i2c.offset, i2c.data[0], i2c.data[1], i2c.data[2], i2c.data[3], i2c.data[4], i2c.data[5], i2c.data[6], i2c.data[7]); break; } #endif /* #if (__FreeBSD_version >= 1100101) */ default: QL_DPRINT4(ha, "default (0x%lx)\n", cmd); ret = ether_ioctl(ifp, cmd, data); break; } return (ret); } static int qlnx_media_change(struct ifnet *ifp) { qlnx_host_t *ha; struct ifmedia *ifm; int ret = 0; ha = (qlnx_host_t *)ifp->if_softc; QL_DPRINT2(ha, "enter\n"); ifm = &ha->media; if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) ret = EINVAL; QL_DPRINT2(ha, "exit\n"); return (ret); } static void qlnx_media_status(struct ifnet *ifp, struct ifmediareq *ifmr) { qlnx_host_t *ha; ha = (qlnx_host_t *)ifp->if_softc; QL_DPRINT2(ha, "enter\n"); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (ha->link_up) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= (IFM_FDX | qlnx_get_optics(ha, &ha->if_link)); if (ha->if_link.link_partner_caps & (QLNX_LINK_CAP_Pause | QLNX_LINK_CAP_Asym_Pause)) ifmr->ifm_active |= (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE); } QL_DPRINT2(ha, "exit (%s)\n", (ha->link_up ? "link_up" : "link_down")); return; } static void qlnx_free_tx_pkt(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct qlnx_tx_queue *txq) { u16 idx; struct mbuf *mp; bus_dmamap_t map; int i; struct eth_tx_bd *tx_data_bd; struct eth_tx_1st_bd *first_bd; int nbds = 0; idx = txq->sw_tx_cons; mp = txq->sw_tx_ring[idx].mp; map = txq->sw_tx_ring[idx].map; if ((mp == NULL) || QL_ERR_INJECT(ha, QL_ERR_INJCT_TX_INT_MBUF_NULL)){ QL_RESET_ERR_INJECT(ha, QL_ERR_INJCT_TX_INT_MBUF_NULL); QL_DPRINT1(ha, "(mp == NULL) " " tx_idx = 0x%x" " ecore_prod_idx = 0x%x" " ecore_cons_idx = 0x%x" " hw_bd_cons = 0x%x" " txq_db_last = 0x%x" " elem_left = 0x%x\n", fp->rss_id, ecore_chain_get_prod_idx(&txq->tx_pbl), ecore_chain_get_cons_idx(&txq->tx_pbl), le16toh(*txq->hw_cons_ptr), txq->tx_db.raw, ecore_chain_get_elem_left(&txq->tx_pbl)); fp->err_tx_free_pkt_null++; //DEBUG qlnx_trigger_dump(ha); return; } else { QLNX_INC_OPACKETS((ha->ifp)); QLNX_INC_OBYTES((ha->ifp), (mp->m_pkthdr.len)); bus_dmamap_sync(ha->tx_tag, map, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(ha->tx_tag, map); fp->tx_pkts_freed++; fp->tx_pkts_completed++; m_freem(mp); } first_bd = (struct eth_tx_1st_bd *)ecore_chain_consume(&txq->tx_pbl); nbds = first_bd->data.nbds; // BD_SET_UNMAP_ADDR_LEN(first_bd, 0, 0); for (i = 1; i < nbds; i++) { tx_data_bd = ecore_chain_consume(&txq->tx_pbl); // BD_SET_UNMAP_ADDR_LEN(tx_data_bd, 0, 0); } txq->sw_tx_ring[idx].flags = 0; txq->sw_tx_ring[idx].mp = NULL; txq->sw_tx_ring[idx].map = (bus_dmamap_t)0; return; } static void qlnx_tx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct qlnx_tx_queue *txq) { u16 hw_bd_cons; u16 ecore_cons_idx; uint16_t diff; uint16_t idx, idx2; hw_bd_cons = le16toh(*txq->hw_cons_ptr); while (hw_bd_cons != (ecore_cons_idx = ecore_chain_get_cons_idx(&txq->tx_pbl))) { if (hw_bd_cons < ecore_cons_idx) { diff = (1 << 16) - (ecore_cons_idx - hw_bd_cons); } else { diff = hw_bd_cons - ecore_cons_idx; } if ((diff > TX_RING_SIZE) || QL_ERR_INJECT(ha, QL_ERR_INJCT_TX_INT_DIFF)){ QL_RESET_ERR_INJECT(ha, QL_ERR_INJCT_TX_INT_DIFF); QL_DPRINT1(ha, "(diff = 0x%x) " " tx_idx = 0x%x" " ecore_prod_idx = 0x%x" " ecore_cons_idx = 0x%x" " hw_bd_cons = 0x%x" " txq_db_last = 0x%x" " elem_left = 0x%x\n", diff, fp->rss_id, ecore_chain_get_prod_idx(&txq->tx_pbl), ecore_chain_get_cons_idx(&txq->tx_pbl), le16toh(*txq->hw_cons_ptr), txq->tx_db.raw, ecore_chain_get_elem_left(&txq->tx_pbl)); fp->err_tx_cons_idx_conflict++; //DEBUG qlnx_trigger_dump(ha); } idx = (txq->sw_tx_cons + 1) & (TX_RING_SIZE - 1); idx2 = (txq->sw_tx_cons + 2) & (TX_RING_SIZE - 1); prefetch(txq->sw_tx_ring[idx].mp); prefetch(txq->sw_tx_ring[idx2].mp); qlnx_free_tx_pkt(ha, fp, txq); txq->sw_tx_cons = (txq->sw_tx_cons + 1) & (TX_RING_SIZE - 1); } return; } static int qlnx_transmit_locked(struct ifnet *ifp,struct qlnx_fastpath *fp, struct mbuf *mp) { int ret = 0; struct qlnx_tx_queue *txq; qlnx_host_t * ha; uint16_t elem_left; txq = fp->txq[0]; ha = (qlnx_host_t *)fp->edev; if ((!(ifp->if_drv_flags & IFF_DRV_RUNNING)) || (!ha->link_up)) { if(mp != NULL) ret = drbr_enqueue(ifp, fp->tx_br, mp); return (ret); } if(mp != NULL) ret = drbr_enqueue(ifp, fp->tx_br, mp); mp = drbr_peek(ifp, fp->tx_br); while (mp != NULL) { if (qlnx_send(ha, fp, &mp)) { if (mp != NULL) { drbr_putback(ifp, fp->tx_br, mp); } else { fp->tx_pkts_processed++; drbr_advance(ifp, fp->tx_br); } goto qlnx_transmit_locked_exit; } else { drbr_advance(ifp, fp->tx_br); fp->tx_pkts_transmitted++; fp->tx_pkts_processed++; } mp = drbr_peek(ifp, fp->tx_br); } qlnx_transmit_locked_exit: if((qlnx_num_tx_compl(ha,fp, fp->txq[0]) > QLNX_TX_COMPL_THRESH) || ((int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl)) < QLNX_TX_ELEM_MAX_THRESH)) (void)qlnx_tx_int(ha, fp, fp->txq[0]); QL_DPRINT2(ha, "%s: exit ret = %d\n", __func__, ret); return ret; } static int qlnx_transmit(struct ifnet *ifp, struct mbuf *mp) { qlnx_host_t *ha = (qlnx_host_t *)ifp->if_softc; struct qlnx_fastpath *fp; int rss_id = 0, ret = 0; #ifdef QLNX_TRACEPERF_DATA uint64_t tx_pkts = 0, tx_compl = 0; #endif QL_DPRINT2(ha, "enter\n"); #if __FreeBSD_version >= 1100000 if (M_HASHTYPE_GET(mp) != M_HASHTYPE_NONE) #else if (mp->m_flags & M_FLOWID) #endif rss_id = (mp->m_pkthdr.flowid % ECORE_RSS_IND_TABLE_SIZE) % ha->num_rss; fp = &ha->fp_array[rss_id]; if (fp->tx_br == NULL) { ret = EINVAL; goto qlnx_transmit_exit; } if (mtx_trylock(&fp->tx_mtx)) { #ifdef QLNX_TRACEPERF_DATA tx_pkts = fp->tx_pkts_transmitted; tx_compl = fp->tx_pkts_completed; #endif ret = qlnx_transmit_locked(ifp, fp, mp); #ifdef QLNX_TRACEPERF_DATA fp->tx_pkts_trans_ctx += (fp->tx_pkts_transmitted - tx_pkts); fp->tx_pkts_compl_ctx += (fp->tx_pkts_completed - tx_compl); #endif mtx_unlock(&fp->tx_mtx); } else { if (mp != NULL && (fp->fp_taskqueue != NULL)) { ret = drbr_enqueue(ifp, fp->tx_br, mp); taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task); } } qlnx_transmit_exit: QL_DPRINT2(ha, "exit ret = %d\n", ret); return ret; } static void qlnx_qflush(struct ifnet *ifp) { int rss_id; struct qlnx_fastpath *fp; struct mbuf *mp; qlnx_host_t *ha; ha = (qlnx_host_t *)ifp->if_softc; QL_DPRINT2(ha, "enter\n"); for (rss_id = 0; rss_id < ha->num_rss; rss_id++) { fp = &ha->fp_array[rss_id]; if (fp == NULL) continue; if (fp->tx_br) { mtx_lock(&fp->tx_mtx); while ((mp = drbr_dequeue(ifp, fp->tx_br)) != NULL) { fp->tx_pkts_freed++; m_freem(mp); } mtx_unlock(&fp->tx_mtx); } } QL_DPRINT2(ha, "exit\n"); return; } static void qlnx_txq_doorbell_wr32(qlnx_host_t *ha, void *reg_addr, uint32_t value) { struct ecore_dev *cdev; uint32_t offset; cdev = &ha->cdev; offset = (uint32_t)((uint8_t *)reg_addr - (uint8_t *)ha->pci_dbells); bus_write_4(ha->pci_dbells, offset, value); bus_barrier(ha->pci_reg, 0, 0, BUS_SPACE_BARRIER_READ); bus_barrier(ha->pci_dbells, 0, 0, BUS_SPACE_BARRIER_READ); return; } static uint32_t qlnx_tcp_offset(qlnx_host_t *ha, struct mbuf *mp) { struct ether_vlan_header *eh = NULL; struct ip *ip = NULL; struct ip6_hdr *ip6 = NULL; struct tcphdr *th = NULL; uint32_t ehdrlen = 0, ip_hlen = 0, offset = 0; uint16_t etype = 0; device_t dev; uint8_t buf[sizeof(struct ip6_hdr)]; dev = ha->pci_dev; eh = mtod(mp, struct ether_vlan_header *); if (eh->evl_encap_proto == htons(ETHERTYPE_VLAN)) { ehdrlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; etype = ntohs(eh->evl_proto); } else { ehdrlen = ETHER_HDR_LEN; etype = ntohs(eh->evl_encap_proto); } switch (etype) { case ETHERTYPE_IP: ip = (struct ip *)(mp->m_data + ehdrlen); ip_hlen = sizeof (struct ip); if (mp->m_len < (ehdrlen + ip_hlen)) { m_copydata(mp, ehdrlen, sizeof(struct ip), buf); ip = (struct ip *)buf; } th = (struct tcphdr *)(ip + 1); offset = ip_hlen + ehdrlen + (th->th_off << 2); break; case ETHERTYPE_IPV6: ip6 = (struct ip6_hdr *)(mp->m_data + ehdrlen); ip_hlen = sizeof(struct ip6_hdr); if (mp->m_len < (ehdrlen + ip_hlen)) { m_copydata(mp, ehdrlen, sizeof (struct ip6_hdr), buf); ip6 = (struct ip6_hdr *)buf; } th = (struct tcphdr *)(ip6 + 1); offset = ip_hlen + ehdrlen + (th->th_off << 2); break; default: break; } return (offset); } static __inline int qlnx_tso_check(struct qlnx_fastpath *fp, bus_dma_segment_t *segs, int nsegs, uint32_t offset) { int i; uint32_t sum, nbds_in_hdr = 1; uint32_t window; bus_dma_segment_t *s_seg; /* If the header spans mulitple segments, skip those segments */ if (nsegs < ETH_TX_LSO_WINDOW_BDS_NUM) return (0); i = 0; while ((i < nsegs) && (offset >= segs->ds_len)) { offset = offset - segs->ds_len; segs++; i++; nbds_in_hdr++; } window = ETH_TX_LSO_WINDOW_BDS_NUM - nbds_in_hdr; nsegs = nsegs - i; while (nsegs >= window) { sum = 0; s_seg = segs; for (i = 0; i < window; i++){ sum += s_seg->ds_len; s_seg++; } if (sum < ETH_TX_LSO_WINDOW_MIN_LEN) { fp->tx_lso_wnd_min_len++; return (-1); } nsegs = nsegs - 1; segs++; } return (0); } static int qlnx_send(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct mbuf **m_headp) { bus_dma_segment_t *segs; bus_dmamap_t map = 0; uint32_t nsegs = 0; int ret = -1; struct mbuf *m_head = *m_headp; uint16_t idx = 0; uint16_t elem_left; uint8_t nbd = 0; struct qlnx_tx_queue *txq; struct eth_tx_1st_bd *first_bd; struct eth_tx_2nd_bd *second_bd; struct eth_tx_3rd_bd *third_bd; struct eth_tx_bd *tx_data_bd; int seg_idx = 0; uint32_t nbds_in_hdr = 0; uint32_t offset = 0; #ifdef QLNX_TRACE_PERF_DATA uint16_t bd_used; #endif QL_DPRINT8(ha, "enter[%d]\n", fp->rss_id); if (!ha->link_up) return (-1); first_bd = NULL; second_bd = NULL; third_bd = NULL; tx_data_bd = NULL; txq = fp->txq[0]; if ((int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl)) < QLNX_TX_ELEM_MIN_THRESH) { fp->tx_nsegs_gt_elem_left++; fp->err_tx_nsegs_gt_elem_left++; return (ENOBUFS); } idx = txq->sw_tx_prod; map = txq->sw_tx_ring[idx].map; segs = txq->segs; ret = bus_dmamap_load_mbuf_sg(ha->tx_tag, map, m_head, segs, &nsegs, BUS_DMA_NOWAIT); if (ha->dbg_trace_tso_pkt_len) { if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { if (!fp->tx_tso_min_pkt_len) { fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len; fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len; } else { if (fp->tx_tso_min_pkt_len > m_head->m_pkthdr.len) fp->tx_tso_min_pkt_len = m_head->m_pkthdr.len; if (fp->tx_tso_max_pkt_len < m_head->m_pkthdr.len) fp->tx_tso_max_pkt_len = m_head->m_pkthdr.len; } } } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) offset = qlnx_tcp_offset(ha, m_head); if ((ret == EFBIG) || ((nsegs > QLNX_MAX_SEGMENTS_NON_TSO) && ( (!(m_head->m_pkthdr.csum_flags & CSUM_TSO)) || ((m_head->m_pkthdr.csum_flags & CSUM_TSO) && qlnx_tso_check(fp, segs, nsegs, offset))))) { struct mbuf *m; QL_DPRINT8(ha, "EFBIG [%d]\n", m_head->m_pkthdr.len); fp->tx_defrag++; m = m_defrag(m_head, M_NOWAIT); if (m == NULL) { fp->err_tx_defrag++; fp->tx_pkts_freed++; m_freem(m_head); *m_headp = NULL; QL_DPRINT1(ha, "m_defrag() = NULL [%d]\n", ret); return (ENOBUFS); } m_head = m; *m_headp = m_head; if ((ret = bus_dmamap_load_mbuf_sg(ha->tx_tag, map, m_head, segs, &nsegs, BUS_DMA_NOWAIT))) { fp->err_tx_defrag_dmamap_load++; QL_DPRINT1(ha, "bus_dmamap_load_mbuf_sg failed0 [%d, %d]\n", ret, m_head->m_pkthdr.len); fp->tx_pkts_freed++; m_freem(m_head); *m_headp = NULL; return (ret); } if ((nsegs > QLNX_MAX_SEGMENTS_NON_TSO) && !(m_head->m_pkthdr.csum_flags & CSUM_TSO)) { fp->err_tx_non_tso_max_seg++; QL_DPRINT1(ha, "(%d) nsegs too many for non-TSO [%d, %d]\n", ret, nsegs, m_head->m_pkthdr.len); fp->tx_pkts_freed++; m_freem(m_head); *m_headp = NULL; return (ret); } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) offset = qlnx_tcp_offset(ha, m_head); } else if (ret) { fp->err_tx_dmamap_load++; QL_DPRINT1(ha, "bus_dmamap_load_mbuf_sg failed1 [%d, %d]\n", ret, m_head->m_pkthdr.len); fp->tx_pkts_freed++; m_freem(m_head); *m_headp = NULL; return (ret); } QL_ASSERT(ha, (nsegs != 0), ("qlnx_send: empty packet")); if (ha->dbg_trace_tso_pkt_len) { if (nsegs < QLNX_FP_MAX_SEGS) fp->tx_pkts[(nsegs - 1)]++; else fp->tx_pkts[(QLNX_FP_MAX_SEGS - 1)]++; } #ifdef QLNX_TRACE_PERF_DATA if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { if(m_head->m_pkthdr.len <= 2048) fp->tx_pkts_hist[0]++; else if((m_head->m_pkthdr.len > 2048) && (m_head->m_pkthdr.len <= 4096)) fp->tx_pkts_hist[1]++; else if((m_head->m_pkthdr.len > 4096) && (m_head->m_pkthdr.len <= 8192)) fp->tx_pkts_hist[2]++; else if((m_head->m_pkthdr.len > 8192) && (m_head->m_pkthdr.len <= 12288 )) fp->tx_pkts_hist[3]++; else if((m_head->m_pkthdr.len > 11288) && (m_head->m_pkthdr.len <= 16394)) fp->tx_pkts_hist[4]++; else if((m_head->m_pkthdr.len > 16384) && (m_head->m_pkthdr.len <= 20480)) fp->tx_pkts_hist[5]++; else if((m_head->m_pkthdr.len > 20480) && (m_head->m_pkthdr.len <= 24576)) fp->tx_pkts_hist[6]++; else if((m_head->m_pkthdr.len > 24576) && (m_head->m_pkthdr.len <= 28672)) fp->tx_pkts_hist[7]++; else if((m_head->m_pkthdr.len > 28762) && (m_head->m_pkthdr.len <= 32768)) fp->tx_pkts_hist[8]++; else if((m_head->m_pkthdr.len > 32768) && (m_head->m_pkthdr.len <= 36864)) fp->tx_pkts_hist[9]++; else if((m_head->m_pkthdr.len > 36864) && (m_head->m_pkthdr.len <= 40960)) fp->tx_pkts_hist[10]++; else if((m_head->m_pkthdr.len > 40960) && (m_head->m_pkthdr.len <= 45056)) fp->tx_pkts_hist[11]++; else if((m_head->m_pkthdr.len > 45056) && (m_head->m_pkthdr.len <= 49152)) fp->tx_pkts_hist[12]++; else if((m_head->m_pkthdr.len > 49512) && m_head->m_pkthdr.len <= 53248)) fp->tx_pkts_hist[13]++; else if((m_head->m_pkthdr.len > 53248) && (m_head->m_pkthdr.len <= 57344)) fp->tx_pkts_hist[14]++; else if((m_head->m_pkthdr.len > 53248) && (m_head->m_pkthdr.len <= 57344)) fp->tx_pkts_hist[15]++; else if((m_head->m_pkthdr.len > 57344) && (m_head->m_pkthdr.len <= 61440)) fp->tx_pkts_hist[16]++; else fp->tx_pkts_hist[17]++; } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { elem_left = ecore_chain_get_elem_left(&txq->tx_pbl); bd_used = TX_RING_SIZE - elem_left; if(bd_used <= 100) fp->tx_pkts_q[0]++; else if((bd_used > 100) && (bd_used <= 500)) fp->tx_pkts_q[1]++; else if((bd_used > 500) && (bd_used <= 1000)) fp->tx_pkts_q[2]++; else if((bd_used > 1000) && (bd_used <= 2000)) fp->tx_pkts_q[3]++; else if((bd_used > 3000) && (bd_used <= 4000)) fp->tx_pkts_q[4]++; else if((bd_used > 4000) && (bd_used <= 5000)) fp->tx_pkts_q[5]++; else if((bd_used > 6000) && (bd_used <= 7000)) fp->tx_pkts_q[6]++; else if((bd_used > 7000) && (bd_used <= 8000)) fp->tx_pkts_q[7]++; else if((bd_used > 8000) && (bd_used <= 9000)) fp->tx_pkts_q[8]++; else if((bd_used > 9000) && (bd_used <= 10000)) fp->tx_pkts_q[9]++; else if((bd_used > 10000) && (bd_used <= 11000)) fp->tx_pkts_q[10]++; else if((bd_used > 11000) && (bd_used <= 12000)) fp->tx_pkts_q[11]++; else if((bd_used > 12000) && (bd_used <= 13000)) fp->tx_pkts_q[12]++; else if((bd_used > 13000) && (bd_used <= 14000)) fp->tx_pkts_q[13]++; else if((bd_used > 14000) && (bd_used <= 15000)) fp->tx_pkts_q[14]++; else if((bd_used > 15000) && (bd_used <= 16000)) fp->tx_pkts_q[15]++; else fp->tx_pkts_q[16]++; } #endif /* end of QLNX_TRACE_PERF_DATA */ if ((nsegs + QLNX_TX_ELEM_RESERVE) > (int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl))) { QL_DPRINT1(ha, "(%d, 0x%x) insuffient BDs" " in chain[%d] trying to free packets\n", nsegs, elem_left, fp->rss_id); fp->tx_nsegs_gt_elem_left++; (void)qlnx_tx_int(ha, fp, txq); if ((nsegs + QLNX_TX_ELEM_RESERVE) > (int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl))) { QL_DPRINT1(ha, "(%d, 0x%x) insuffient BDs in chain[%d]\n", nsegs, elem_left, fp->rss_id); fp->err_tx_nsegs_gt_elem_left++; fp->tx_ring_full = 1; if (ha->storm_stats_enable) ha->storm_stats_gather = 1; return (ENOBUFS); } } bus_dmamap_sync(ha->tx_tag, map, BUS_DMASYNC_PREWRITE); txq->sw_tx_ring[idx].mp = m_head; first_bd = (struct eth_tx_1st_bd *)ecore_chain_produce(&txq->tx_pbl); memset(first_bd, 0, sizeof(*first_bd)); first_bd->data.bd_flags.bitfields = 1 << ETH_TX_1ST_BD_FLAGS_START_BD_SHIFT; BD_SET_UNMAP_ADDR_LEN(first_bd, segs->ds_addr, segs->ds_len); nbd++; if (m_head->m_pkthdr.csum_flags & CSUM_IP) { first_bd->data.bd_flags.bitfields |= (1 << ETH_TX_1ST_BD_FLAGS_IP_CSUM_SHIFT); } if (m_head->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_TCP | CSUM_TCP_IPV6 | CSUM_UDP_IPV6)) { first_bd->data.bd_flags.bitfields |= (1 << ETH_TX_1ST_BD_FLAGS_L4_CSUM_SHIFT); } if (m_head->m_flags & M_VLANTAG) { first_bd->data.vlan = m_head->m_pkthdr.ether_vtag; first_bd->data.bd_flags.bitfields |= (1 << ETH_TX_1ST_BD_FLAGS_VLAN_INSERTION_SHIFT); } if (m_head->m_pkthdr.csum_flags & CSUM_TSO) { first_bd->data.bd_flags.bitfields |= (1 << ETH_TX_1ST_BD_FLAGS_LSO_SHIFT); first_bd->data.bd_flags.bitfields |= (1 << ETH_TX_1ST_BD_FLAGS_IP_CSUM_SHIFT); nbds_in_hdr = 1; if (offset == segs->ds_len) { BD_SET_UNMAP_ADDR_LEN(first_bd, segs->ds_addr, offset); segs++; seg_idx++; second_bd = (struct eth_tx_2nd_bd *) ecore_chain_produce(&txq->tx_pbl); memset(second_bd, 0, sizeof(*second_bd)); nbd++; if (seg_idx < nsegs) { BD_SET_UNMAP_ADDR_LEN(second_bd, \ (segs->ds_addr), (segs->ds_len)); segs++; seg_idx++; } third_bd = (struct eth_tx_3rd_bd *) ecore_chain_produce(&txq->tx_pbl); memset(third_bd, 0, sizeof(*third_bd)); third_bd->data.lso_mss = m_head->m_pkthdr.tso_segsz; third_bd->data.bitfields |= (nbds_in_hdr<ds_addr), (segs->ds_len)); segs++; seg_idx++; } for (; seg_idx < nsegs; seg_idx++) { tx_data_bd = (struct eth_tx_bd *) ecore_chain_produce(&txq->tx_pbl); memset(tx_data_bd, 0, sizeof(*tx_data_bd)); BD_SET_UNMAP_ADDR_LEN(tx_data_bd, \ segs->ds_addr,\ segs->ds_len); segs++; nbd++; } } else if (offset < segs->ds_len) { BD_SET_UNMAP_ADDR_LEN(first_bd, segs->ds_addr, offset); second_bd = (struct eth_tx_2nd_bd *) ecore_chain_produce(&txq->tx_pbl); memset(second_bd, 0, sizeof(*second_bd)); BD_SET_UNMAP_ADDR_LEN(second_bd, \ (segs->ds_addr + offset),\ (segs->ds_len - offset)); nbd++; segs++; third_bd = (struct eth_tx_3rd_bd *) ecore_chain_produce(&txq->tx_pbl); memset(third_bd, 0, sizeof(*third_bd)); BD_SET_UNMAP_ADDR_LEN(third_bd, \ segs->ds_addr,\ segs->ds_len); third_bd->data.lso_mss = m_head->m_pkthdr.tso_segsz; third_bd->data.bitfields |= (nbds_in_hdr<tx_pbl); memset(tx_data_bd, 0, sizeof(*tx_data_bd)); BD_SET_UNMAP_ADDR_LEN(tx_data_bd, \ segs->ds_addr,\ segs->ds_len); segs++; nbd++; } } else { offset = offset - segs->ds_len; segs++; for (seg_idx = 1; seg_idx < nsegs; seg_idx++) { if (offset) nbds_in_hdr++; tx_data_bd = (struct eth_tx_bd *) ecore_chain_produce(&txq->tx_pbl); memset(tx_data_bd, 0, sizeof(*tx_data_bd)); if (second_bd == NULL) { second_bd = (struct eth_tx_2nd_bd *) tx_data_bd; } else if (third_bd == NULL) { third_bd = (struct eth_tx_3rd_bd *) tx_data_bd; } if (offset && (offset < segs->ds_len)) { BD_SET_UNMAP_ADDR_LEN(tx_data_bd,\ segs->ds_addr, offset); tx_data_bd = (struct eth_tx_bd *) ecore_chain_produce(&txq->tx_pbl); memset(tx_data_bd, 0, sizeof(*tx_data_bd)); if (second_bd == NULL) { second_bd = (struct eth_tx_2nd_bd *)tx_data_bd; } else if (third_bd == NULL) { third_bd = (struct eth_tx_3rd_bd *)tx_data_bd; } BD_SET_UNMAP_ADDR_LEN(tx_data_bd,\ (segs->ds_addr + offset), \ (segs->ds_len - offset)); nbd++; offset = 0; } else { if (offset) offset = offset - segs->ds_len; BD_SET_UNMAP_ADDR_LEN(tx_data_bd,\ segs->ds_addr, segs->ds_len); } segs++; nbd++; } if (third_bd == NULL) { third_bd = (struct eth_tx_3rd_bd *) ecore_chain_produce(&txq->tx_pbl); memset(third_bd, 0, sizeof(*third_bd)); } third_bd->data.lso_mss = m_head->m_pkthdr.tso_segsz; third_bd->data.bitfields |= (nbds_in_hdr<tx_tso_pkts++; } else { segs++; for (seg_idx = 1; seg_idx < nsegs; seg_idx++) { tx_data_bd = (struct eth_tx_bd *) ecore_chain_produce(&txq->tx_pbl); memset(tx_data_bd, 0, sizeof(*tx_data_bd)); BD_SET_UNMAP_ADDR_LEN(tx_data_bd, segs->ds_addr,\ segs->ds_len); segs++; nbd++; } first_bd->data.bitfields = (m_head->m_pkthdr.len & ETH_TX_DATA_1ST_BD_PKT_LEN_MASK) << ETH_TX_DATA_1ST_BD_PKT_LEN_SHIFT; first_bd->data.bitfields = htole16(first_bd->data.bitfields); fp->tx_non_tso_pkts++; } first_bd->data.nbds = nbd; if (ha->dbg_trace_tso_pkt_len) { if (fp->tx_tso_max_nsegs < nsegs) fp->tx_tso_max_nsegs = nsegs; if ((nsegs < fp->tx_tso_min_nsegs) || (!fp->tx_tso_min_nsegs)) fp->tx_tso_min_nsegs = nsegs; } txq->sw_tx_ring[idx].nsegs = nsegs; txq->sw_tx_prod = (txq->sw_tx_prod + 1) & (TX_RING_SIZE - 1); txq->tx_db.data.bd_prod = htole16(ecore_chain_get_prod_idx(&txq->tx_pbl)); qlnx_txq_doorbell_wr32(ha, txq->doorbell_addr, txq->tx_db.raw); QL_DPRINT8(ha, "exit[%d]\n", fp->rss_id); return (0); } static void qlnx_stop(qlnx_host_t *ha) { struct ifnet *ifp = ha->ifp; device_t dev; int i; dev = ha->pci_dev; ifp->if_drv_flags &= ~(IFF_DRV_OACTIVE | IFF_DRV_RUNNING); /* * We simply lock and unlock each fp->tx_mtx to * propagate the if_drv_flags * state to each tx thread */ QL_DPRINT1(ha, "QLNX STATE = %d\n",ha->state); if (ha->state == QLNX_STATE_OPEN) { for (i = 0; i < ha->num_rss; i++) { struct qlnx_fastpath *fp = &ha->fp_array[i]; mtx_lock(&fp->tx_mtx); mtx_unlock(&fp->tx_mtx); if (fp->fp_taskqueue != NULL) taskqueue_enqueue(fp->fp_taskqueue, &fp->fp_task); } } #ifdef QLNX_ENABLE_IWARP if (qlnx_vf_device(ha) != 0) { qlnx_rdma_dev_close(ha); } #endif /* #ifdef QLNX_ENABLE_IWARP */ qlnx_unload(ha); return; } static int qlnx_get_ifq_snd_maxlen(qlnx_host_t *ha) { return(TX_RING_SIZE - 1); } uint8_t * qlnx_get_mac_addr(qlnx_host_t *ha) { struct ecore_hwfn *p_hwfn; unsigned char mac[ETHER_ADDR_LEN]; uint8_t p_is_forced; p_hwfn = &ha->cdev.hwfns[0]; if (qlnx_vf_device(ha) != 0) return (p_hwfn->hw_info.hw_mac_addr); ecore_vf_read_bulletin(p_hwfn, &p_is_forced); if (ecore_vf_bulletin_get_forced_mac(p_hwfn, mac, &p_is_forced) == true) { device_printf(ha->pci_dev, "%s: p_is_forced = %d" " mac_addr = %02x:%02x:%02x:%02x:%02x:%02x\n", __func__, p_is_forced, mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); memcpy(ha->primary_mac, mac, ETH_ALEN); } return (ha->primary_mac); } static uint32_t qlnx_get_optics(qlnx_host_t *ha, struct qlnx_link_output *if_link) { uint32_t ifm_type = 0; switch (if_link->media_type) { case MEDIA_MODULE_FIBER: case MEDIA_UNSPECIFIED: if (if_link->speed == (100 * 1000)) ifm_type = QLNX_IFM_100G_SR4; else if (if_link->speed == (40 * 1000)) ifm_type = IFM_40G_SR4; else if (if_link->speed == (25 * 1000)) ifm_type = QLNX_IFM_25G_SR; else if (if_link->speed == (10 * 1000)) ifm_type = (IFM_10G_LR | IFM_10G_SR); else if (if_link->speed == (1 * 1000)) ifm_type = (IFM_1000_SX | IFM_1000_LX); break; case MEDIA_DA_TWINAX: if (if_link->speed == (100 * 1000)) ifm_type = QLNX_IFM_100G_CR4; else if (if_link->speed == (40 * 1000)) ifm_type = IFM_40G_CR4; else if (if_link->speed == (25 * 1000)) ifm_type = QLNX_IFM_25G_CR; else if (if_link->speed == (10 * 1000)) ifm_type = IFM_10G_TWINAX; break; default : ifm_type = IFM_UNKNOWN; break; } return (ifm_type); } /***************************************************************************** * Interrupt Service Functions *****************************************************************************/ static int qlnx_rx_jumbo_chain(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct mbuf *mp_head, uint16_t len) { struct mbuf *mp, *mpf, *mpl; struct sw_rx_data *sw_rx_data; struct qlnx_rx_queue *rxq; uint16_t len_in_buffer; rxq = fp->rxq; mpf = mpl = mp = NULL; while (len) { rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); sw_rx_data = &rxq->sw_rx_ring[rxq->sw_rx_cons]; mp = sw_rx_data->data; if (mp == NULL) { QL_DPRINT1(ha, "mp = NULL\n"); fp->err_rx_mp_null++; rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); if (mpf != NULL) m_freem(mpf); return (-1); } bus_dmamap_sync(ha->rx_tag, sw_rx_data->map, BUS_DMASYNC_POSTREAD); if (qlnx_alloc_rx_buffer(ha, rxq) != 0) { QL_DPRINT1(ha, "New buffer allocation failed, dropping" " incoming packet and reusing its buffer\n"); qlnx_reuse_rx_data(rxq); fp->err_rx_alloc_errors++; if (mpf != NULL) m_freem(mpf); return (-1); } ecore_chain_consume(&rxq->rx_bd_ring); if (len > rxq->rx_buf_size) len_in_buffer = rxq->rx_buf_size; else len_in_buffer = len; len = len - len_in_buffer; mp->m_flags &= ~M_PKTHDR; mp->m_next = NULL; mp->m_len = len_in_buffer; if (mpf == NULL) mpf = mpl = mp; else { mpl->m_next = mp; mpl = mp; } } if (mpf != NULL) mp_head->m_next = mpf; return (0); } static void qlnx_tpa_start(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct qlnx_rx_queue *rxq, struct eth_fast_path_rx_tpa_start_cqe *cqe) { uint32_t agg_index; struct ifnet *ifp = ha->ifp; struct mbuf *mp; struct mbuf *mpf = NULL, *mpl = NULL, *mpc = NULL; struct sw_rx_data *sw_rx_data; dma_addr_t addr; bus_dmamap_t map; struct eth_rx_bd *rx_bd; int i; device_t dev; #if __FreeBSD_version >= 1100000 uint8_t hash_type; #endif /* #if __FreeBSD_version >= 1100000 */ dev = ha->pci_dev; agg_index = cqe->tpa_agg_index; QL_DPRINT7(ha, "[rss_id = %d]: enter\n \ \t type = 0x%x\n \ \t bitfields = 0x%x\n \ \t seg_len = 0x%x\n \ \t pars_flags = 0x%x\n \ \t vlan_tag = 0x%x\n \ \t rss_hash = 0x%x\n \ \t len_on_first_bd = 0x%x\n \ \t placement_offset = 0x%x\n \ \t tpa_agg_index = 0x%x\n \ \t header_len = 0x%x\n \ \t ext_bd_len_list[0] = 0x%x\n \ \t ext_bd_len_list[1] = 0x%x\n \ \t ext_bd_len_list[2] = 0x%x\n \ \t ext_bd_len_list[3] = 0x%x\n \ \t ext_bd_len_list[4] = 0x%x\n", fp->rss_id, cqe->type, cqe->bitfields, cqe->seg_len, cqe->pars_flags.flags, cqe->vlan_tag, cqe->rss_hash, cqe->len_on_first_bd, cqe->placement_offset, cqe->tpa_agg_index, cqe->header_len, cqe->ext_bd_len_list[0], cqe->ext_bd_len_list[1], cqe->ext_bd_len_list[2], cqe->ext_bd_len_list[3], cqe->ext_bd_len_list[4]); if (agg_index >= ETH_TPA_MAX_AGGS_NUM) { fp->err_rx_tpa_invalid_agg_num++; return; } sw_rx_data = &rxq->sw_rx_ring[rxq->sw_rx_cons]; bus_dmamap_sync(ha->rx_tag, sw_rx_data->map, BUS_DMASYNC_POSTREAD); mp = sw_rx_data->data; QL_DPRINT7(ha, "[rss_id = %d]: mp = %p \n ", fp->rss_id, mp); if (mp == NULL) { QL_DPRINT7(ha, "[%d]: mp = NULL\n", fp->rss_id); fp->err_rx_mp_null++; rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); return; } if ((le16toh(cqe->pars_flags.flags)) & CQE_FLAGS_ERR) { QL_DPRINT7(ha, "[%d]: CQE in CONS = %u has error," " flags = %x, dropping incoming packet\n", fp->rss_id, rxq->sw_rx_cons, le16toh(cqe->pars_flags.flags)); fp->err_rx_hw_errors++; qlnx_reuse_rx_data(rxq); QLNX_INC_IERRORS(ifp); return; } if (qlnx_alloc_rx_buffer(ha, rxq) != 0) { QL_DPRINT7(ha, "[%d]: New buffer allocation failed," " dropping incoming packet and reusing its buffer\n", fp->rss_id); fp->err_rx_alloc_errors++; QLNX_INC_IQDROPS(ifp); /* * Load the tpa mbuf into the rx ring and save the * posted mbuf */ map = sw_rx_data->map; addr = sw_rx_data->dma_addr; sw_rx_data = &rxq->sw_rx_ring[rxq->sw_rx_prod]; sw_rx_data->data = rxq->tpa_info[agg_index].rx_buf.data; sw_rx_data->dma_addr = rxq->tpa_info[agg_index].rx_buf.dma_addr; sw_rx_data->map = rxq->tpa_info[agg_index].rx_buf.map; rxq->tpa_info[agg_index].rx_buf.data = mp; rxq->tpa_info[agg_index].rx_buf.dma_addr = addr; rxq->tpa_info[agg_index].rx_buf.map = map; rx_bd = (struct eth_rx_bd *) ecore_chain_produce(&rxq->rx_bd_ring); rx_bd->addr.hi = htole32(U64_HI(sw_rx_data->dma_addr)); rx_bd->addr.lo = htole32(U64_LO(sw_rx_data->dma_addr)); bus_dmamap_sync(ha->rx_tag, sw_rx_data->map, BUS_DMASYNC_PREREAD); rxq->sw_rx_prod = (rxq->sw_rx_prod + 1) & (RX_RING_SIZE - 1); rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); ecore_chain_consume(&rxq->rx_bd_ring); /* Now reuse any buffers posted in ext_bd_len_list */ for (i = 0; i < ETH_TPA_CQE_START_LEN_LIST_SIZE; i++) { if (cqe->ext_bd_len_list[i] == 0) break; qlnx_reuse_rx_data(rxq); } rxq->tpa_info[agg_index].agg_state = QLNX_AGG_STATE_ERROR; return; } if (rxq->tpa_info[agg_index].agg_state != QLNX_AGG_STATE_NONE) { QL_DPRINT7(ha, "[%d]: invalid aggregation state," " dropping incoming packet and reusing its buffer\n", fp->rss_id); QLNX_INC_IQDROPS(ifp); /* if we already have mbuf head in aggregation free it */ if (rxq->tpa_info[agg_index].mpf) { m_freem(rxq->tpa_info[agg_index].mpf); rxq->tpa_info[agg_index].mpl = NULL; } rxq->tpa_info[agg_index].mpf = mp; rxq->tpa_info[agg_index].mpl = NULL; rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); ecore_chain_consume(&rxq->rx_bd_ring); /* Now reuse any buffers posted in ext_bd_len_list */ for (i = 0; i < ETH_TPA_CQE_START_LEN_LIST_SIZE; i++) { if (cqe->ext_bd_len_list[i] == 0) break; qlnx_reuse_rx_data(rxq); } rxq->tpa_info[agg_index].agg_state = QLNX_AGG_STATE_ERROR; return; } /* * first process the ext_bd_len_list * if this fails then we simply drop the packet */ ecore_chain_consume(&rxq->rx_bd_ring); rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); for (i = 0; i < ETH_TPA_CQE_START_LEN_LIST_SIZE; i++) { QL_DPRINT7(ha, "[%d]: 4\n ", fp->rss_id); if (cqe->ext_bd_len_list[i] == 0) break; sw_rx_data = &rxq->sw_rx_ring[rxq->sw_rx_cons]; bus_dmamap_sync(ha->rx_tag, sw_rx_data->map, BUS_DMASYNC_POSTREAD); mpc = sw_rx_data->data; if (mpc == NULL) { QL_DPRINT7(ha, "[%d]: mpc = NULL\n", fp->rss_id); fp->err_rx_mp_null++; if (mpf != NULL) m_freem(mpf); mpf = mpl = NULL; rxq->tpa_info[agg_index].agg_state = QLNX_AGG_STATE_ERROR; ecore_chain_consume(&rxq->rx_bd_ring); rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); continue; } if (qlnx_alloc_rx_buffer(ha, rxq) != 0) { QL_DPRINT7(ha, "[%d]: New buffer allocation failed," " dropping incoming packet and reusing its" " buffer\n", fp->rss_id); qlnx_reuse_rx_data(rxq); if (mpf != NULL) m_freem(mpf); mpf = mpl = NULL; rxq->tpa_info[agg_index].agg_state = QLNX_AGG_STATE_ERROR; ecore_chain_consume(&rxq->rx_bd_ring); rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); continue; } mpc->m_flags &= ~M_PKTHDR; mpc->m_next = NULL; mpc->m_len = cqe->ext_bd_len_list[i]; if (mpf == NULL) { mpf = mpl = mpc; } else { mpl->m_len = ha->rx_buf_size; mpl->m_next = mpc; mpl = mpc; } ecore_chain_consume(&rxq->rx_bd_ring); rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); } if (rxq->tpa_info[agg_index].agg_state != QLNX_AGG_STATE_NONE) { QL_DPRINT7(ha, "[%d]: invalid aggregation state, dropping" " incoming packet and reusing its buffer\n", fp->rss_id); QLNX_INC_IQDROPS(ifp); rxq->tpa_info[agg_index].mpf = mp; rxq->tpa_info[agg_index].mpl = NULL; return; } rxq->tpa_info[agg_index].placement_offset = cqe->placement_offset; if (mpf != NULL) { mp->m_len = ha->rx_buf_size; mp->m_next = mpf; rxq->tpa_info[agg_index].mpf = mp; rxq->tpa_info[agg_index].mpl = mpl; } else { mp->m_len = cqe->len_on_first_bd + cqe->placement_offset; rxq->tpa_info[agg_index].mpf = mp; rxq->tpa_info[agg_index].mpl = mp; mp->m_next = NULL; } mp->m_flags |= M_PKTHDR; /* assign packet to this interface interface */ mp->m_pkthdr.rcvif = ifp; /* assume no hardware checksum has complated */ mp->m_pkthdr.csum_flags = 0; //mp->m_pkthdr.flowid = fp->rss_id; mp->m_pkthdr.flowid = cqe->rss_hash; #if __FreeBSD_version >= 1100000 hash_type = cqe->bitfields & (ETH_FAST_PATH_RX_REG_CQE_RSS_HASH_TYPE_MASK << ETH_FAST_PATH_RX_REG_CQE_RSS_HASH_TYPE_SHIFT); switch (hash_type) { case RSS_HASH_TYPE_IPV4: M_HASHTYPE_SET(mp, M_HASHTYPE_RSS_IPV4); break; case RSS_HASH_TYPE_TCP_IPV4: M_HASHTYPE_SET(mp, M_HASHTYPE_RSS_TCP_IPV4); break; case RSS_HASH_TYPE_IPV6: M_HASHTYPE_SET(mp, M_HASHTYPE_RSS_IPV6); break; case RSS_HASH_TYPE_TCP_IPV6: M_HASHTYPE_SET(mp, M_HASHTYPE_RSS_TCP_IPV6); break; default: M_HASHTYPE_SET(mp, M_HASHTYPE_OPAQUE); break; } #else mp->m_flags |= M_FLOWID; #endif mp->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); mp->m_pkthdr.csum_data = 0xFFFF; if (CQE_HAS_VLAN(cqe->pars_flags.flags)) { mp->m_pkthdr.ether_vtag = le16toh(cqe->vlan_tag); mp->m_flags |= M_VLANTAG; } rxq->tpa_info[agg_index].agg_state = QLNX_AGG_STATE_START; QL_DPRINT7(ha, "[%d]: 5\n\tagg_state = %d\n\t mpf = %p mpl = %p\n", fp->rss_id, rxq->tpa_info[agg_index].agg_state, rxq->tpa_info[agg_index].mpf, rxq->tpa_info[agg_index].mpl); return; } static void qlnx_tpa_cont(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct qlnx_rx_queue *rxq, struct eth_fast_path_rx_tpa_cont_cqe *cqe) { struct sw_rx_data *sw_rx_data; int i; struct mbuf *mpf = NULL, *mpl = NULL, *mpc = NULL; struct mbuf *mp; uint32_t agg_index; device_t dev; dev = ha->pci_dev; QL_DPRINT7(ha, "[%d]: enter\n \ \t type = 0x%x\n \ \t tpa_agg_index = 0x%x\n \ \t len_list[0] = 0x%x\n \ \t len_list[1] = 0x%x\n \ \t len_list[2] = 0x%x\n \ \t len_list[3] = 0x%x\n \ \t len_list[4] = 0x%x\n \ \t len_list[5] = 0x%x\n", fp->rss_id, cqe->type, cqe->tpa_agg_index, cqe->len_list[0], cqe->len_list[1], cqe->len_list[2], cqe->len_list[3], cqe->len_list[4], cqe->len_list[5]); agg_index = cqe->tpa_agg_index; if (agg_index >= ETH_TPA_MAX_AGGS_NUM) { QL_DPRINT7(ha, "[%d]: 0\n ", fp->rss_id); fp->err_rx_tpa_invalid_agg_num++; return; } for (i = 0; i < ETH_TPA_CQE_CONT_LEN_LIST_SIZE; i++) { QL_DPRINT7(ha, "[%d]: 1\n ", fp->rss_id); if (cqe->len_list[i] == 0) break; if (rxq->tpa_info[agg_index].agg_state != QLNX_AGG_STATE_START) { qlnx_reuse_rx_data(rxq); continue; } sw_rx_data = &rxq->sw_rx_ring[rxq->sw_rx_cons]; bus_dmamap_sync(ha->rx_tag, sw_rx_data->map, BUS_DMASYNC_POSTREAD); mpc = sw_rx_data->data; if (mpc == NULL) { QL_DPRINT7(ha, "[%d]: mpc = NULL\n", fp->rss_id); fp->err_rx_mp_null++; if (mpf != NULL) m_freem(mpf); mpf = mpl = NULL; rxq->tpa_info[agg_index].agg_state = QLNX_AGG_STATE_ERROR; ecore_chain_consume(&rxq->rx_bd_ring); rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); continue; } if (qlnx_alloc_rx_buffer(ha, rxq) != 0) { QL_DPRINT7(ha, "[%d]: New buffer allocation failed," " dropping incoming packet and reusing its" " buffer\n", fp->rss_id); qlnx_reuse_rx_data(rxq); if (mpf != NULL) m_freem(mpf); mpf = mpl = NULL; rxq->tpa_info[agg_index].agg_state = QLNX_AGG_STATE_ERROR; ecore_chain_consume(&rxq->rx_bd_ring); rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); continue; } mpc->m_flags &= ~M_PKTHDR; mpc->m_next = NULL; mpc->m_len = cqe->len_list[i]; if (mpf == NULL) { mpf = mpl = mpc; } else { mpl->m_len = ha->rx_buf_size; mpl->m_next = mpc; mpl = mpc; } ecore_chain_consume(&rxq->rx_bd_ring); rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); } QL_DPRINT7(ha, "[%d]: 2\n" "\tmpf = %p mpl = %p\n", fp->rss_id, mpf, mpl); if (mpf != NULL) { mp = rxq->tpa_info[agg_index].mpl; mp->m_len = ha->rx_buf_size; mp->m_next = mpf; rxq->tpa_info[agg_index].mpl = mpl; } return; } static int qlnx_tpa_end(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct qlnx_rx_queue *rxq, struct eth_fast_path_rx_tpa_end_cqe *cqe) { struct sw_rx_data *sw_rx_data; int i; struct mbuf *mpf = NULL, *mpl = NULL, *mpc = NULL; struct mbuf *mp; uint32_t agg_index; uint32_t len = 0; struct ifnet *ifp = ha->ifp; device_t dev; dev = ha->pci_dev; QL_DPRINT7(ha, "[%d]: enter\n \ \t type = 0x%x\n \ \t tpa_agg_index = 0x%x\n \ \t total_packet_len = 0x%x\n \ \t num_of_bds = 0x%x\n \ \t end_reason = 0x%x\n \ \t num_of_coalesced_segs = 0x%x\n \ \t ts_delta = 0x%x\n \ \t len_list[0] = 0x%x\n \ \t len_list[1] = 0x%x\n \ \t len_list[2] = 0x%x\n \ \t len_list[3] = 0x%x\n", fp->rss_id, cqe->type, cqe->tpa_agg_index, cqe->total_packet_len, cqe->num_of_bds, cqe->end_reason, cqe->num_of_coalesced_segs, cqe->ts_delta, cqe->len_list[0], cqe->len_list[1], cqe->len_list[2], cqe->len_list[3]); agg_index = cqe->tpa_agg_index; if (agg_index >= ETH_TPA_MAX_AGGS_NUM) { QL_DPRINT7(ha, "[%d]: 0\n ", fp->rss_id); fp->err_rx_tpa_invalid_agg_num++; return (0); } for (i = 0; i < ETH_TPA_CQE_END_LEN_LIST_SIZE; i++) { QL_DPRINT7(ha, "[%d]: 1\n ", fp->rss_id); if (cqe->len_list[i] == 0) break; if (rxq->tpa_info[agg_index].agg_state != QLNX_AGG_STATE_START) { QL_DPRINT7(ha, "[%d]: 2\n ", fp->rss_id); qlnx_reuse_rx_data(rxq); continue; } sw_rx_data = &rxq->sw_rx_ring[rxq->sw_rx_cons]; bus_dmamap_sync(ha->rx_tag, sw_rx_data->map, BUS_DMASYNC_POSTREAD); mpc = sw_rx_data->data; if (mpc == NULL) { QL_DPRINT7(ha, "[%d]: mpc = NULL\n", fp->rss_id); fp->err_rx_mp_null++; if (mpf != NULL) m_freem(mpf); mpf = mpl = NULL; rxq->tpa_info[agg_index].agg_state = QLNX_AGG_STATE_ERROR; ecore_chain_consume(&rxq->rx_bd_ring); rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); continue; } if (qlnx_alloc_rx_buffer(ha, rxq) != 0) { QL_DPRINT7(ha, "[%d]: New buffer allocation failed," " dropping incoming packet and reusing its" " buffer\n", fp->rss_id); qlnx_reuse_rx_data(rxq); if (mpf != NULL) m_freem(mpf); mpf = mpl = NULL; rxq->tpa_info[agg_index].agg_state = QLNX_AGG_STATE_ERROR; ecore_chain_consume(&rxq->rx_bd_ring); rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); continue; } mpc->m_flags &= ~M_PKTHDR; mpc->m_next = NULL; mpc->m_len = cqe->len_list[i]; if (mpf == NULL) { mpf = mpl = mpc; } else { mpl->m_len = ha->rx_buf_size; mpl->m_next = mpc; mpl = mpc; } ecore_chain_consume(&rxq->rx_bd_ring); rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); } QL_DPRINT7(ha, "[%d]: 5\n ", fp->rss_id); if (mpf != NULL) { QL_DPRINT7(ha, "[%d]: 6\n ", fp->rss_id); mp = rxq->tpa_info[agg_index].mpl; mp->m_len = ha->rx_buf_size; mp->m_next = mpf; } if (rxq->tpa_info[agg_index].agg_state != QLNX_AGG_STATE_START) { QL_DPRINT7(ha, "[%d]: 7\n ", fp->rss_id); if (rxq->tpa_info[agg_index].mpf != NULL) m_freem(rxq->tpa_info[agg_index].mpf); rxq->tpa_info[agg_index].mpf = NULL; rxq->tpa_info[agg_index].mpl = NULL; rxq->tpa_info[agg_index].agg_state = QLNX_AGG_STATE_NONE; return (0); } mp = rxq->tpa_info[agg_index].mpf; m_adj(mp, rxq->tpa_info[agg_index].placement_offset); mp->m_pkthdr.len = cqe->total_packet_len; if (mp->m_next == NULL) mp->m_len = mp->m_pkthdr.len; else { /* compute the total packet length */ mpf = mp; while (mpf != NULL) { len += mpf->m_len; mpf = mpf->m_next; } if (cqe->total_packet_len > len) { mpl = rxq->tpa_info[agg_index].mpl; mpl->m_len += (cqe->total_packet_len - len); } } QLNX_INC_IPACKETS(ifp); QLNX_INC_IBYTES(ifp, (cqe->total_packet_len)); QL_DPRINT7(ha, "[%d]: 8 csum_data = 0x%x csum_flags = 0x%" PRIu64 "\n \ m_len = 0x%x m_pkthdr_len = 0x%x\n", fp->rss_id, mp->m_pkthdr.csum_data, (uint64_t)mp->m_pkthdr.csum_flags, mp->m_len, mp->m_pkthdr.len); (*ifp->if_input)(ifp, mp); rxq->tpa_info[agg_index].mpf = NULL; rxq->tpa_info[agg_index].mpl = NULL; rxq->tpa_info[agg_index].agg_state = QLNX_AGG_STATE_NONE; return (cqe->num_of_coalesced_segs); } static int qlnx_rx_int(qlnx_host_t *ha, struct qlnx_fastpath *fp, int budget, int lro_enable) { uint16_t hw_comp_cons, sw_comp_cons; int rx_pkt = 0; struct qlnx_rx_queue *rxq = fp->rxq; struct ifnet *ifp = ha->ifp; struct ecore_dev *cdev = &ha->cdev; struct ecore_hwfn *p_hwfn; #ifdef QLNX_SOFT_LRO struct lro_ctrl *lro; lro = &rxq->lro; #endif /* #ifdef QLNX_SOFT_LRO */ hw_comp_cons = le16toh(*rxq->hw_cons_ptr); sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring); p_hwfn = &ha->cdev.hwfns[(fp->rss_id % cdev->num_hwfns)]; /* Memory barrier to prevent the CPU from doing speculative reads of CQE * / BD in the while-loop before reading hw_comp_cons. If the CQE is * read before it is written by FW, then FW writes CQE and SB, and then * the CPU reads the hw_comp_cons, it will use an old CQE. */ /* Loop to complete all indicated BDs */ while (sw_comp_cons != hw_comp_cons) { union eth_rx_cqe *cqe; struct eth_fast_path_rx_reg_cqe *fp_cqe; struct sw_rx_data *sw_rx_data; register struct mbuf *mp; enum eth_rx_cqe_type cqe_type; uint16_t len, pad, len_on_first_bd; uint8_t *data; #if __FreeBSD_version >= 1100000 uint8_t hash_type; #endif /* #if __FreeBSD_version >= 1100000 */ /* Get the CQE from the completion ring */ cqe = (union eth_rx_cqe *) ecore_chain_consume(&rxq->rx_comp_ring); cqe_type = cqe->fast_path_regular.type; if (cqe_type == ETH_RX_CQE_TYPE_SLOW_PATH) { QL_DPRINT3(ha, "Got a slowath CQE\n"); ecore_eth_cqe_completion(p_hwfn, (struct eth_slow_path_rx_cqe *)cqe); goto next_cqe; } if (cqe_type != ETH_RX_CQE_TYPE_REGULAR) { switch (cqe_type) { case ETH_RX_CQE_TYPE_TPA_START: qlnx_tpa_start(ha, fp, rxq, &cqe->fast_path_tpa_start); fp->tpa_start++; break; case ETH_RX_CQE_TYPE_TPA_CONT: qlnx_tpa_cont(ha, fp, rxq, &cqe->fast_path_tpa_cont); fp->tpa_cont++; break; case ETH_RX_CQE_TYPE_TPA_END: rx_pkt += qlnx_tpa_end(ha, fp, rxq, &cqe->fast_path_tpa_end); fp->tpa_end++; break; default: break; } goto next_cqe; } /* Get the data from the SW ring */ sw_rx_data = &rxq->sw_rx_ring[rxq->sw_rx_cons]; mp = sw_rx_data->data; if (mp == NULL) { QL_DPRINT1(ha, "mp = NULL\n"); fp->err_rx_mp_null++; rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); goto next_cqe; } bus_dmamap_sync(ha->rx_tag, sw_rx_data->map, BUS_DMASYNC_POSTREAD); /* non GRO */ fp_cqe = &cqe->fast_path_regular;/* MK CR TPA check assembly */ len = le16toh(fp_cqe->pkt_len); pad = fp_cqe->placement_offset; #if 0 QL_DPRINT3(ha, "CQE type = %x, flags = %x, vlan = %x," " len %u, parsing flags = %d pad = %d\n", cqe_type, fp_cqe->bitfields, le16toh(fp_cqe->vlan_tag), len, le16toh(fp_cqe->pars_flags.flags), pad); #endif data = mtod(mp, uint8_t *); data = data + pad; if (0) qlnx_dump_buf8(ha, __func__, data, len); /* For every Rx BD consumed, we allocate a new BD so the BD ring * is always with a fixed size. If allocation fails, we take the * consumed BD and return it to the ring in the PROD position. * The packet that was received on that BD will be dropped (and * not passed to the upper stack). */ /* If this is an error packet then drop it */ if ((le16toh(cqe->fast_path_regular.pars_flags.flags)) & CQE_FLAGS_ERR) { QL_DPRINT1(ha, "CQE in CONS = %u has error, flags = %x," " dropping incoming packet\n", sw_comp_cons, le16toh(cqe->fast_path_regular.pars_flags.flags)); fp->err_rx_hw_errors++; qlnx_reuse_rx_data(rxq); QLNX_INC_IERRORS(ifp); goto next_cqe; } if (qlnx_alloc_rx_buffer(ha, rxq) != 0) { QL_DPRINT1(ha, "New buffer allocation failed, dropping" " incoming packet and reusing its buffer\n"); qlnx_reuse_rx_data(rxq); fp->err_rx_alloc_errors++; QLNX_INC_IQDROPS(ifp); goto next_cqe; } ecore_chain_consume(&rxq->rx_bd_ring); len_on_first_bd = fp_cqe->len_on_first_bd; m_adj(mp, pad); mp->m_pkthdr.len = len; if ((len > 60 ) && (len > len_on_first_bd)) { mp->m_len = len_on_first_bd; if (qlnx_rx_jumbo_chain(ha, fp, mp, (len - len_on_first_bd)) != 0) { m_freem(mp); QLNX_INC_IQDROPS(ifp); goto next_cqe; } } else if (len_on_first_bd < len) { fp->err_rx_jumbo_chain_pkts++; } else { mp->m_len = len; } mp->m_flags |= M_PKTHDR; /* assign packet to this interface interface */ mp->m_pkthdr.rcvif = ifp; /* assume no hardware checksum has complated */ mp->m_pkthdr.csum_flags = 0; mp->m_pkthdr.flowid = fp_cqe->rss_hash; #if __FreeBSD_version >= 1100000 hash_type = fp_cqe->bitfields & (ETH_FAST_PATH_RX_REG_CQE_RSS_HASH_TYPE_MASK << ETH_FAST_PATH_RX_REG_CQE_RSS_HASH_TYPE_SHIFT); switch (hash_type) { case RSS_HASH_TYPE_IPV4: M_HASHTYPE_SET(mp, M_HASHTYPE_RSS_IPV4); break; case RSS_HASH_TYPE_TCP_IPV4: M_HASHTYPE_SET(mp, M_HASHTYPE_RSS_TCP_IPV4); break; case RSS_HASH_TYPE_IPV6: M_HASHTYPE_SET(mp, M_HASHTYPE_RSS_IPV6); break; case RSS_HASH_TYPE_TCP_IPV6: M_HASHTYPE_SET(mp, M_HASHTYPE_RSS_TCP_IPV6); break; default: M_HASHTYPE_SET(mp, M_HASHTYPE_OPAQUE); break; } #else mp->m_flags |= M_FLOWID; #endif if (CQE_L3_PACKET(fp_cqe->pars_flags.flags)) { mp->m_pkthdr.csum_flags |= CSUM_IP_CHECKED; } if (!(CQE_IP_HDR_ERR(fp_cqe->pars_flags.flags))) { mp->m_pkthdr.csum_flags |= CSUM_IP_VALID; } if (CQE_L4_HAS_CSUM(fp_cqe->pars_flags.flags)) { mp->m_pkthdr.csum_data = 0xFFFF; mp->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); } if (CQE_HAS_VLAN(fp_cqe->pars_flags.flags)) { mp->m_pkthdr.ether_vtag = le16toh(fp_cqe->vlan_tag); mp->m_flags |= M_VLANTAG; } QLNX_INC_IPACKETS(ifp); QLNX_INC_IBYTES(ifp, len); #ifdef QLNX_SOFT_LRO if (lro_enable) { #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) tcp_lro_queue_mbuf(lro, mp); #else if (tcp_lro_rx(lro, mp, 0)) (*ifp->if_input)(ifp, mp); #endif /* #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) */ } else { (*ifp->if_input)(ifp, mp); } #else (*ifp->if_input)(ifp, mp); #endif /* #ifdef QLNX_SOFT_LRO */ rx_pkt++; rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); next_cqe: /* don't consume bd rx buffer */ ecore_chain_recycle_consumed(&rxq->rx_comp_ring); sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring); /* CR TPA - revisit how to handle budget in TPA perhaps increase on "end" */ if (rx_pkt == budget) break; } /* repeat while sw_comp_cons != hw_comp_cons... */ /* Update producers */ qlnx_update_rx_prod(p_hwfn, rxq); return rx_pkt; } /* * fast path interrupt */ static void qlnx_fp_isr(void *arg) { qlnx_ivec_t *ivec = arg; qlnx_host_t *ha; struct qlnx_fastpath *fp = NULL; int idx; ha = ivec->ha; if (ha->state != QLNX_STATE_OPEN) { return; } idx = ivec->rss_idx; if ((idx = ivec->rss_idx) >= ha->num_rss) { QL_DPRINT1(ha, "illegal interrupt[%d]\n", idx); ha->err_illegal_intr++; return; } fp = &ha->fp_array[idx]; if (fp == NULL) { ha->err_fp_null++; } else { int rx_int = 0, total_rx_count = 0; int lro_enable, tc; struct qlnx_tx_queue *txq; uint16_t elem_left; lro_enable = ha->ifp->if_capenable & IFCAP_LRO; ecore_sb_ack(fp->sb_info, IGU_INT_DISABLE, 0); do { for (tc = 0; tc < ha->num_tc; tc++) { txq = fp->txq[tc]; if((int)(elem_left = ecore_chain_get_elem_left(&txq->tx_pbl)) < QLNX_TX_ELEM_THRESH) { if (mtx_trylock(&fp->tx_mtx)) { #ifdef QLNX_TRACE_PERF_DATA tx_compl = fp->tx_pkts_completed; #endif qlnx_tx_int(ha, fp, fp->txq[tc]); #ifdef QLNX_TRACE_PERF_DATA fp->tx_pkts_compl_intr += (fp->tx_pkts_completed - tx_compl); if ((fp->tx_pkts_completed - tx_compl) <= 32) fp->tx_comInt[0]++; else if (((fp->tx_pkts_completed - tx_compl) > 32) && ((fp->tx_pkts_completed - tx_compl) <= 64)) fp->tx_comInt[1]++; else if(((fp->tx_pkts_completed - tx_compl) > 64) && ((fp->tx_pkts_completed - tx_compl) <= 128)) fp->tx_comInt[2]++; else if(((fp->tx_pkts_completed - tx_compl) > 128)) fp->tx_comInt[3]++; #endif mtx_unlock(&fp->tx_mtx); } } } rx_int = qlnx_rx_int(ha, fp, ha->rx_pkt_threshold, lro_enable); if (rx_int) { fp->rx_pkts += rx_int; total_rx_count += rx_int; } } while (rx_int); #ifdef QLNX_SOFT_LRO { struct lro_ctrl *lro; lro = &fp->rxq->lro; if (lro_enable && total_rx_count) { #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) #ifdef QLNX_TRACE_LRO_CNT if (lro->lro_mbuf_count & ~1023) fp->lro_cnt_1024++; else if (lro->lro_mbuf_count & ~511) fp->lro_cnt_512++; else if (lro->lro_mbuf_count & ~255) fp->lro_cnt_256++; else if (lro->lro_mbuf_count & ~127) fp->lro_cnt_128++; else if (lro->lro_mbuf_count & ~63) fp->lro_cnt_64++; #endif /* #ifdef QLNX_TRACE_LRO_CNT */ tcp_lro_flush_all(lro); #else struct lro_entry *queued; while ((!SLIST_EMPTY(&lro->lro_active))) { queued = SLIST_FIRST(&lro->lro_active); SLIST_REMOVE_HEAD(&lro->lro_active, \ next); tcp_lro_flush(lro, queued); } #endif /* #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) */ } } #endif /* #ifdef QLNX_SOFT_LRO */ ecore_sb_update_sb_idx(fp->sb_info); rmb(); ecore_sb_ack(fp->sb_info, IGU_INT_ENABLE, 1); } return; } /* * slow path interrupt processing function * can be invoked in polled mode or in interrupt mode via taskqueue. */ void qlnx_sp_isr(void *arg) { struct ecore_hwfn *p_hwfn; qlnx_host_t *ha; p_hwfn = arg; ha = (qlnx_host_t *)p_hwfn->p_dev; ha->sp_interrupts++; QL_DPRINT2(ha, "enter\n"); ecore_int_sp_dpc(p_hwfn); QL_DPRINT2(ha, "exit\n"); return; } /***************************************************************************** * Support Functions for DMA'able Memory *****************************************************************************/ static void qlnx_dmamap_callback(void *arg, bus_dma_segment_t *segs, int nsegs, int error) { *((bus_addr_t *)arg) = 0; if (error) { printf("%s: bus_dmamap_load failed (%d)\n", __func__, error); return; } *((bus_addr_t *)arg) = segs[0].ds_addr; return; } static int qlnx_alloc_dmabuf(qlnx_host_t *ha, qlnx_dma_t *dma_buf) { int ret = 0; device_t dev; bus_addr_t b_addr; dev = ha->pci_dev; ret = bus_dma_tag_create( ha->parent_tag,/* parent */ dma_buf->alignment, ((bus_size_t)(1ULL << 32)),/* boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ dma_buf->size, /* maxsize */ 1, /* nsegments */ dma_buf->size, /* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &dma_buf->dma_tag); if (ret) { QL_DPRINT1(ha, "could not create dma tag\n"); goto qlnx_alloc_dmabuf_exit; } ret = bus_dmamem_alloc(dma_buf->dma_tag, (void **)&dma_buf->dma_b, (BUS_DMA_ZERO | BUS_DMA_COHERENT | BUS_DMA_NOWAIT), &dma_buf->dma_map); if (ret) { bus_dma_tag_destroy(dma_buf->dma_tag); QL_DPRINT1(ha, "bus_dmamem_alloc failed\n"); goto qlnx_alloc_dmabuf_exit; } ret = bus_dmamap_load(dma_buf->dma_tag, dma_buf->dma_map, dma_buf->dma_b, dma_buf->size, qlnx_dmamap_callback, &b_addr, BUS_DMA_NOWAIT); if (ret || !b_addr) { bus_dma_tag_destroy(dma_buf->dma_tag); bus_dmamem_free(dma_buf->dma_tag, dma_buf->dma_b, dma_buf->dma_map); ret = -1; goto qlnx_alloc_dmabuf_exit; } dma_buf->dma_addr = b_addr; qlnx_alloc_dmabuf_exit: return ret; } static void qlnx_free_dmabuf(qlnx_host_t *ha, qlnx_dma_t *dma_buf) { bus_dmamap_unload(dma_buf->dma_tag, dma_buf->dma_map); bus_dmamem_free(dma_buf->dma_tag, dma_buf->dma_b, dma_buf->dma_map); bus_dma_tag_destroy(dma_buf->dma_tag); return; } void * qlnx_dma_alloc_coherent(void *ecore_dev, bus_addr_t *phys, uint32_t size) { qlnx_dma_t dma_buf; qlnx_dma_t *dma_p; qlnx_host_t *ha; device_t dev; ha = (qlnx_host_t *)ecore_dev; dev = ha->pci_dev; size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1); memset(&dma_buf, 0, sizeof (qlnx_dma_t)); dma_buf.size = size + PAGE_SIZE; dma_buf.alignment = 8; if (qlnx_alloc_dmabuf((qlnx_host_t *)ecore_dev, &dma_buf) != 0) return (NULL); bzero((uint8_t *)dma_buf.dma_b, dma_buf.size); *phys = dma_buf.dma_addr; dma_p = (qlnx_dma_t *)((uint8_t *)dma_buf.dma_b + size); memcpy(dma_p, &dma_buf, sizeof(qlnx_dma_t)); QL_DPRINT5(ha, "[%p %p %p %p 0x%08x ]\n", (void *)dma_buf.dma_map, (void *)dma_buf.dma_tag, dma_buf.dma_b, (void *)dma_buf.dma_addr, size); return (dma_buf.dma_b); } void qlnx_dma_free_coherent(void *ecore_dev, void *v_addr, bus_addr_t phys, uint32_t size) { qlnx_dma_t dma_buf, *dma_p; qlnx_host_t *ha; device_t dev; ha = (qlnx_host_t *)ecore_dev; dev = ha->pci_dev; if (v_addr == NULL) return; size = (size + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1); dma_p = (qlnx_dma_t *)((uint8_t *)v_addr + size); QL_DPRINT5(ha, "[%p %p %p %p 0x%08x ]\n", (void *)dma_p->dma_map, (void *)dma_p->dma_tag, dma_p->dma_b, (void *)dma_p->dma_addr, size); dma_buf = *dma_p; if (!ha->qlnxr_debug) qlnx_free_dmabuf((qlnx_host_t *)ecore_dev, &dma_buf); return; } static int qlnx_alloc_parent_dma_tag(qlnx_host_t *ha) { int ret; device_t dev; dev = ha->pci_dev; /* * Allocate parent DMA Tag */ ret = bus_dma_tag_create( bus_get_dma_tag(dev), /* parent */ 1,((bus_size_t)(1ULL << 32)),/* alignment, boundary */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ BUS_SPACE_MAXSIZE_32BIT,/* maxsize */ 0, /* nsegments */ BUS_SPACE_MAXSIZE_32BIT,/* maxsegsize */ 0, /* flags */ NULL, NULL, /* lockfunc, lockarg */ &ha->parent_tag); if (ret) { QL_DPRINT1(ha, "could not create parent dma tag\n"); return (-1); } ha->flags.parent_tag = 1; return (0); } static void qlnx_free_parent_dma_tag(qlnx_host_t *ha) { if (ha->parent_tag != NULL) { bus_dma_tag_destroy(ha->parent_tag); ha->parent_tag = NULL; } return; } static int qlnx_alloc_tx_dma_tag(qlnx_host_t *ha) { if (bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ QLNX_MAX_TSO_FRAME_SIZE, /* maxsize */ QLNX_MAX_SEGMENTS, /* nsegments */ QLNX_MAX_TX_MBUF_SIZE, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &ha->tx_tag)) { QL_DPRINT1(ha, "tx_tag alloc failed\n"); return (-1); } return (0); } static void qlnx_free_tx_dma_tag(qlnx_host_t *ha) { if (ha->tx_tag != NULL) { bus_dma_tag_destroy(ha->tx_tag); ha->tx_tag = NULL; } return; } static int qlnx_alloc_rx_dma_tag(qlnx_host_t *ha) { if (bus_dma_tag_create(NULL, /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MJUM9BYTES, /* maxsize */ 1, /* nsegments */ MJUM9BYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockfuncarg */ &ha->rx_tag)) { QL_DPRINT1(ha, " rx_tag alloc failed\n"); return (-1); } return (0); } static void qlnx_free_rx_dma_tag(qlnx_host_t *ha) { if (ha->rx_tag != NULL) { bus_dma_tag_destroy(ha->rx_tag); ha->rx_tag = NULL; } return; } /********************************* * Exported functions *********************************/ uint32_t qlnx_pci_bus_get_bar_size(void *ecore_dev, uint8_t bar_id) { uint32_t bar_size; bar_id = bar_id * 2; bar_size = bus_get_resource_count(((qlnx_host_t *)ecore_dev)->pci_dev, SYS_RES_MEMORY, PCIR_BAR(bar_id)); return (bar_size); } uint32_t qlnx_pci_read_config_byte(void *ecore_dev, uint32_t pci_reg, uint8_t *reg_value) { *reg_value = pci_read_config(((qlnx_host_t *)ecore_dev)->pci_dev, pci_reg, 1); return 0; } uint32_t qlnx_pci_read_config_word(void *ecore_dev, uint32_t pci_reg, uint16_t *reg_value) { *reg_value = pci_read_config(((qlnx_host_t *)ecore_dev)->pci_dev, pci_reg, 2); return 0; } uint32_t qlnx_pci_read_config_dword(void *ecore_dev, uint32_t pci_reg, uint32_t *reg_value) { *reg_value = pci_read_config(((qlnx_host_t *)ecore_dev)->pci_dev, pci_reg, 4); return 0; } void qlnx_pci_write_config_byte(void *ecore_dev, uint32_t pci_reg, uint8_t reg_value) { pci_write_config(((qlnx_host_t *)ecore_dev)->pci_dev, pci_reg, reg_value, 1); return; } void qlnx_pci_write_config_word(void *ecore_dev, uint32_t pci_reg, uint16_t reg_value) { pci_write_config(((qlnx_host_t *)ecore_dev)->pci_dev, pci_reg, reg_value, 2); return; } void qlnx_pci_write_config_dword(void *ecore_dev, uint32_t pci_reg, uint32_t reg_value) { pci_write_config(((qlnx_host_t *)ecore_dev)->pci_dev, pci_reg, reg_value, 4); return; } int qlnx_pci_find_capability(void *ecore_dev, int cap) { int reg; qlnx_host_t *ha; ha = ecore_dev; if (pci_find_cap(ha->pci_dev, PCIY_EXPRESS, ®) == 0) return reg; else { QL_DPRINT1(ha, "failed\n"); return 0; } } int qlnx_pci_find_ext_capability(void *ecore_dev, int ext_cap) { int reg; qlnx_host_t *ha; ha = ecore_dev; if (pci_find_extcap(ha->pci_dev, ext_cap, ®) == 0) return reg; else { QL_DPRINT1(ha, "failed\n"); return 0; } } uint32_t qlnx_reg_rd32(void *hwfn, uint32_t reg_addr) { uint32_t data32; struct ecore_hwfn *p_hwfn; p_hwfn = hwfn; data32 = bus_read_4(((qlnx_host_t *)p_hwfn->p_dev)->pci_reg, \ (bus_size_t)(p_hwfn->reg_offset + reg_addr)); return (data32); } void qlnx_reg_wr32(void *hwfn, uint32_t reg_addr, uint32_t value) { struct ecore_hwfn *p_hwfn = hwfn; bus_write_4(((qlnx_host_t *)p_hwfn->p_dev)->pci_reg, \ (bus_size_t)(p_hwfn->reg_offset + reg_addr), value); return; } void qlnx_reg_wr16(void *hwfn, uint32_t reg_addr, uint16_t value) { struct ecore_hwfn *p_hwfn = hwfn; bus_write_2(((qlnx_host_t *)p_hwfn->p_dev)->pci_reg, \ (bus_size_t)(p_hwfn->reg_offset + reg_addr), value); return; } void qlnx_dbell_wr32_db(void *hwfn, void *reg_addr, uint32_t value) { struct ecore_dev *cdev; struct ecore_hwfn *p_hwfn; uint32_t offset; p_hwfn = hwfn; cdev = p_hwfn->p_dev; offset = (uint32_t)((uint8_t *)reg_addr - (uint8_t *)(p_hwfn->doorbells)); bus_write_4(((qlnx_host_t *)cdev)->pci_dbells, offset, value); return; } void qlnx_dbell_wr32(void *hwfn, uint32_t reg_addr, uint32_t value) { struct ecore_hwfn *p_hwfn = hwfn; bus_write_4(((qlnx_host_t *)p_hwfn->p_dev)->pci_dbells, \ (bus_size_t)(p_hwfn->db_offset + reg_addr), value); return; } uint32_t qlnx_direct_reg_rd32(void *p_hwfn, uint32_t *reg_addr) { uint32_t data32; bus_size_t offset; struct ecore_dev *cdev; cdev = ((struct ecore_hwfn *)p_hwfn)->p_dev; offset = (bus_size_t)((uint8_t *)reg_addr - (uint8_t *)(cdev->regview)); data32 = bus_read_4(((qlnx_host_t *)cdev)->pci_reg, offset); return (data32); } void qlnx_direct_reg_wr32(void *p_hwfn, void *reg_addr, uint32_t value) { bus_size_t offset; struct ecore_dev *cdev; cdev = ((struct ecore_hwfn *)p_hwfn)->p_dev; offset = (bus_size_t)((uint8_t *)reg_addr - (uint8_t *)(cdev->regview)); bus_write_4(((qlnx_host_t *)cdev)->pci_reg, offset, value); return; } void qlnx_direct_reg_wr64(void *p_hwfn, void *reg_addr, uint64_t value) { bus_size_t offset; struct ecore_dev *cdev; cdev = ((struct ecore_hwfn *)p_hwfn)->p_dev; offset = (bus_size_t)((uint8_t *)reg_addr - (uint8_t *)(cdev->regview)); bus_write_8(((qlnx_host_t *)cdev)->pci_reg, offset, value); return; } void * qlnx_zalloc(uint32_t size) { caddr_t va; va = malloc((unsigned long)size, M_QLNXBUF, M_NOWAIT); bzero(va, size); return ((void *)va); } void qlnx_barrier(void *p_hwfn) { qlnx_host_t *ha; ha = (qlnx_host_t *)((struct ecore_hwfn *)p_hwfn)->p_dev; bus_barrier(ha->pci_reg, 0, 0, BUS_SPACE_BARRIER_WRITE); } void qlnx_link_update(void *p_hwfn) { qlnx_host_t *ha; int prev_link_state; ha = (qlnx_host_t *)((struct ecore_hwfn *)p_hwfn)->p_dev; qlnx_fill_link(ha, p_hwfn, &ha->if_link); prev_link_state = ha->link_up; ha->link_up = ha->if_link.link_up; if (prev_link_state != ha->link_up) { if (ha->link_up) { if_link_state_change(ha->ifp, LINK_STATE_UP); } else { if_link_state_change(ha->ifp, LINK_STATE_DOWN); } } #ifndef QLNX_VF #ifdef CONFIG_ECORE_SRIOV if (qlnx_vf_device(ha) != 0) { if (ha->sriov_initialized) qlnx_inform_vf_link_state(p_hwfn, ha); } #endif /* #ifdef CONFIG_ECORE_SRIOV */ #endif /* #ifdef QLNX_VF */ return; } static void __qlnx_osal_vf_fill_acquire_resc_req(struct ecore_hwfn *p_hwfn, struct ecore_vf_acquire_sw_info *p_sw_info) { p_sw_info->driver_version = (QLNX_VERSION_MAJOR << 24) | (QLNX_VERSION_MINOR << 16) | QLNX_VERSION_BUILD; p_sw_info->os_type = VFPF_ACQUIRE_OS_FREEBSD; return; } void qlnx_osal_vf_fill_acquire_resc_req(void *p_hwfn, void *p_resc_req, void *p_sw_info) { __qlnx_osal_vf_fill_acquire_resc_req(p_hwfn, p_sw_info); return; } void qlnx_fill_link(qlnx_host_t *ha, struct ecore_hwfn *hwfn, struct qlnx_link_output *if_link) { struct ecore_mcp_link_params link_params; struct ecore_mcp_link_state link_state; uint8_t p_change; struct ecore_ptt *p_ptt = NULL; memset(if_link, 0, sizeof(*if_link)); memset(&link_params, 0, sizeof(struct ecore_mcp_link_params)); memset(&link_state, 0, sizeof(struct ecore_mcp_link_state)); ha = (qlnx_host_t *)hwfn->p_dev; /* Prepare source inputs */ /* we only deal with physical functions */ if (qlnx_vf_device(ha) != 0) { p_ptt = ecore_ptt_acquire(hwfn); if (p_ptt == NULL) { QL_DPRINT1(ha, "ecore_ptt_acquire failed\n"); return; } ecore_mcp_get_media_type(hwfn, p_ptt, &if_link->media_type); ecore_ptt_release(hwfn, p_ptt); memcpy(&link_params, ecore_mcp_get_link_params(hwfn), sizeof(link_params)); memcpy(&link_state, ecore_mcp_get_link_state(hwfn), sizeof(link_state)); } else { ecore_mcp_get_media_type(hwfn, NULL, &if_link->media_type); ecore_vf_read_bulletin(hwfn, &p_change); ecore_vf_get_link_params(hwfn, &link_params); ecore_vf_get_link_state(hwfn, &link_state); } /* Set the link parameters to pass to protocol driver */ if (link_state.link_up) { if_link->link_up = true; if_link->speed = link_state.speed; } if_link->supported_caps = QLNX_LINK_CAP_FIBRE; if (link_params.speed.autoneg) if_link->supported_caps |= QLNX_LINK_CAP_Autoneg; if (link_params.pause.autoneg || (link_params.pause.forced_rx && link_params.pause.forced_tx)) if_link->supported_caps |= QLNX_LINK_CAP_Asym_Pause; if (link_params.pause.autoneg || link_params.pause.forced_rx || link_params.pause.forced_tx) if_link->supported_caps |= QLNX_LINK_CAP_Pause; if (link_params.speed.advertised_speeds & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G) if_link->supported_caps |= QLNX_LINK_CAP_1000baseT_Half | QLNX_LINK_CAP_1000baseT_Full; if (link_params.speed.advertised_speeds & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_10G) if_link->supported_caps |= QLNX_LINK_CAP_10000baseKR_Full; if (link_params.speed.advertised_speeds & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_25G) if_link->supported_caps |= QLNX_LINK_CAP_25000baseKR_Full; if (link_params.speed.advertised_speeds & NVM_CFG1_PORT_DRV_LINK_SPEED_40G) if_link->supported_caps |= QLNX_LINK_CAP_40000baseLR4_Full; if (link_params.speed.advertised_speeds & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_50G) if_link->supported_caps |= QLNX_LINK_CAP_50000baseKR2_Full; if (link_params.speed.advertised_speeds & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_BB_100G) if_link->supported_caps |= QLNX_LINK_CAP_100000baseKR4_Full; if_link->advertised_caps = if_link->supported_caps; if_link->autoneg = link_params.speed.autoneg; if_link->duplex = QLNX_LINK_DUPLEX; /* Link partner capabilities */ if (link_state.partner_adv_speed & ECORE_LINK_PARTNER_SPEED_1G_HD) if_link->link_partner_caps |= QLNX_LINK_CAP_1000baseT_Half; if (link_state.partner_adv_speed & ECORE_LINK_PARTNER_SPEED_1G_FD) if_link->link_partner_caps |= QLNX_LINK_CAP_1000baseT_Full; if (link_state.partner_adv_speed & ECORE_LINK_PARTNER_SPEED_10G) if_link->link_partner_caps |= QLNX_LINK_CAP_10000baseKR_Full; if (link_state.partner_adv_speed & ECORE_LINK_PARTNER_SPEED_25G) if_link->link_partner_caps |= QLNX_LINK_CAP_25000baseKR_Full; if (link_state.partner_adv_speed & ECORE_LINK_PARTNER_SPEED_40G) if_link->link_partner_caps |= QLNX_LINK_CAP_40000baseLR4_Full; if (link_state.partner_adv_speed & ECORE_LINK_PARTNER_SPEED_50G) if_link->link_partner_caps |= QLNX_LINK_CAP_50000baseKR2_Full; if (link_state.partner_adv_speed & ECORE_LINK_PARTNER_SPEED_100G) if_link->link_partner_caps |= QLNX_LINK_CAP_100000baseKR4_Full; if (link_state.an_complete) if_link->link_partner_caps |= QLNX_LINK_CAP_Autoneg; if (link_state.partner_adv_pause) if_link->link_partner_caps |= QLNX_LINK_CAP_Pause; if ((link_state.partner_adv_pause == ECORE_LINK_PARTNER_ASYMMETRIC_PAUSE) || (link_state.partner_adv_pause == ECORE_LINK_PARTNER_BOTH_PAUSE)) if_link->link_partner_caps |= QLNX_LINK_CAP_Asym_Pause; return; } void qlnx_schedule_recovery(void *p_hwfn) { qlnx_host_t *ha; ha = (qlnx_host_t *)((struct ecore_hwfn *)p_hwfn)->p_dev; if (qlnx_vf_device(ha) != 0) { taskqueue_enqueue(ha->err_taskqueue, &ha->err_task); } return; } static int qlnx_nic_setup(struct ecore_dev *cdev, struct ecore_pf_params *func_params) { int rc, i; for (i = 0; i < cdev->num_hwfns; i++) { struct ecore_hwfn *p_hwfn = &cdev->hwfns[i]; p_hwfn->pf_params = *func_params; #ifdef QLNX_ENABLE_IWARP if (qlnx_vf_device((qlnx_host_t *)cdev) != 0) { p_hwfn->using_ll2 = true; } #endif /* #ifdef QLNX_ENABLE_IWARP */ } rc = ecore_resc_alloc(cdev); if (rc) goto qlnx_nic_setup_exit; ecore_resc_setup(cdev); qlnx_nic_setup_exit: return rc; } static int qlnx_nic_start(struct ecore_dev *cdev) { int rc; struct ecore_hw_init_params params; bzero(¶ms, sizeof (struct ecore_hw_init_params)); params.p_tunn = NULL; params.b_hw_start = true; params.int_mode = cdev->int_mode; params.allow_npar_tx_switch = true; params.bin_fw_data = NULL; rc = ecore_hw_init(cdev, ¶ms); if (rc) { ecore_resc_free(cdev); return rc; } return 0; } static int qlnx_slowpath_start(qlnx_host_t *ha) { struct ecore_dev *cdev; struct ecore_pf_params pf_params; int rc; memset(&pf_params, 0, sizeof(struct ecore_pf_params)); pf_params.eth_pf_params.num_cons = (ha->num_rss) * (ha->num_tc + 1); #ifdef QLNX_ENABLE_IWARP if (qlnx_vf_device(ha) != 0) { if(ha->personality == ECORE_PCI_ETH_IWARP) { device_printf(ha->pci_dev, "setting parameters required by iWARP dev\n"); pf_params.rdma_pf_params.num_qps = 1024; pf_params.rdma_pf_params.num_srqs = 1024; pf_params.rdma_pf_params.gl_pi = ECORE_ROCE_PROTOCOL_INDEX; pf_params.rdma_pf_params.rdma_protocol = ECORE_RDMA_PROTOCOL_IWARP; } else if(ha->personality == ECORE_PCI_ETH_ROCE) { device_printf(ha->pci_dev, "setting parameters required by RoCE dev\n"); pf_params.rdma_pf_params.num_qps = 8192; pf_params.rdma_pf_params.num_srqs = 8192; //pf_params.rdma_pf_params.min_dpis = 0; pf_params.rdma_pf_params.min_dpis = 8; pf_params.rdma_pf_params.roce_edpm_mode = 0; pf_params.rdma_pf_params.gl_pi = ECORE_ROCE_PROTOCOL_INDEX; pf_params.rdma_pf_params.rdma_protocol = ECORE_RDMA_PROTOCOL_ROCE; } } #endif /* #ifdef QLNX_ENABLE_IWARP */ cdev = &ha->cdev; rc = qlnx_nic_setup(cdev, &pf_params); if (rc) goto qlnx_slowpath_start_exit; cdev->int_mode = ECORE_INT_MODE_MSIX; cdev->int_coalescing_mode = ECORE_COAL_MODE_ENABLE; #ifdef QLNX_MAX_COALESCE cdev->rx_coalesce_usecs = 255; cdev->tx_coalesce_usecs = 255; #endif rc = qlnx_nic_start(cdev); ha->rx_coalesce_usecs = cdev->rx_coalesce_usecs; ha->tx_coalesce_usecs = cdev->tx_coalesce_usecs; #ifdef QLNX_USER_LLDP (void)qlnx_set_lldp_tlvx(ha, NULL); #endif /* #ifdef QLNX_USER_LLDP */ qlnx_slowpath_start_exit: return (rc); } static int qlnx_slowpath_stop(qlnx_host_t *ha) { struct ecore_dev *cdev; device_t dev = ha->pci_dev; int i; cdev = &ha->cdev; ecore_hw_stop(cdev); for (i = 0; i < ha->cdev.num_hwfns; i++) { if (ha->sp_handle[i]) (void)bus_teardown_intr(dev, ha->sp_irq[i], ha->sp_handle[i]); ha->sp_handle[i] = NULL; if (ha->sp_irq[i]) (void) bus_release_resource(dev, SYS_RES_IRQ, ha->sp_irq_rid[i], ha->sp_irq[i]); ha->sp_irq[i] = NULL; } ecore_resc_free(cdev); return 0; } static void qlnx_set_id(struct ecore_dev *cdev, char name[NAME_SIZE], char ver_str[VER_SIZE]) { int i; memcpy(cdev->name, name, NAME_SIZE); for_each_hwfn(cdev, i) { snprintf(cdev->hwfns[i].name, NAME_SIZE, "%s-%d", name, i); } cdev->drv_type = DRV_ID_DRV_TYPE_FREEBSD; return ; } void qlnx_get_protocol_stats(void *cdev, int proto_type, void *proto_stats) { enum ecore_mcp_protocol_type type; union ecore_mcp_protocol_stats *stats; struct ecore_eth_stats eth_stats; qlnx_host_t *ha; ha = cdev; stats = proto_stats; type = proto_type; switch (type) { case ECORE_MCP_LAN_STATS: ecore_get_vport_stats((struct ecore_dev *)cdev, ð_stats); stats->lan_stats.ucast_rx_pkts = eth_stats.common.rx_ucast_pkts; stats->lan_stats.ucast_tx_pkts = eth_stats.common.tx_ucast_pkts; stats->lan_stats.fcs_err = -1; break; default: ha->err_get_proto_invalid_type++; QL_DPRINT1(ha, "invalid protocol type 0x%x\n", type); break; } return; } static int qlnx_get_mfw_version(qlnx_host_t *ha, uint32_t *mfw_ver) { struct ecore_hwfn *p_hwfn; struct ecore_ptt *p_ptt; p_hwfn = &ha->cdev.hwfns[0]; p_ptt = ecore_ptt_acquire(p_hwfn); if (p_ptt == NULL) { QL_DPRINT1(ha, "ecore_ptt_acquire failed\n"); return (-1); } ecore_mcp_get_mfw_ver(p_hwfn, p_ptt, mfw_ver, NULL); ecore_ptt_release(p_hwfn, p_ptt); return (0); } static int qlnx_get_flash_size(qlnx_host_t *ha, uint32_t *flash_size) { struct ecore_hwfn *p_hwfn; struct ecore_ptt *p_ptt; p_hwfn = &ha->cdev.hwfns[0]; p_ptt = ecore_ptt_acquire(p_hwfn); if (p_ptt == NULL) { QL_DPRINT1(ha,"ecore_ptt_acquire failed\n"); return (-1); } ecore_mcp_get_flash_size(p_hwfn, p_ptt, flash_size); ecore_ptt_release(p_hwfn, p_ptt); return (0); } static int qlnx_alloc_mem_arrays(qlnx_host_t *ha) { struct ecore_dev *cdev; cdev = &ha->cdev; bzero(&ha->txq_array[0], (sizeof(struct qlnx_tx_queue) * QLNX_MAX_RSS)); bzero(&ha->rxq_array[0], (sizeof(struct qlnx_rx_queue) * QLNX_MAX_RSS)); bzero(&ha->sb_array[0], (sizeof(struct ecore_sb_info) * QLNX_MAX_RSS)); return 0; } static void qlnx_init_fp(qlnx_host_t *ha) { int rss_id, txq_array_index, tc; for (rss_id = 0; rss_id < ha->num_rss; rss_id++) { struct qlnx_fastpath *fp = &ha->fp_array[rss_id]; fp->rss_id = rss_id; fp->edev = ha; fp->sb_info = &ha->sb_array[rss_id]; fp->rxq = &ha->rxq_array[rss_id]; fp->rxq->rxq_id = rss_id; for (tc = 0; tc < ha->num_tc; tc++) { txq_array_index = tc * ha->num_rss + rss_id; fp->txq[tc] = &ha->txq_array[txq_array_index]; fp->txq[tc]->index = txq_array_index; } snprintf(fp->name, sizeof(fp->name), "%s-fp-%d", qlnx_name_str, rss_id); fp->tx_ring_full = 0; /* reset all the statistics counters */ fp->tx_pkts_processed = 0; fp->tx_pkts_freed = 0; fp->tx_pkts_transmitted = 0; fp->tx_pkts_completed = 0; #ifdef QLNX_TRACE_PERF_DATA fp->tx_pkts_trans_ctx = 0; fp->tx_pkts_compl_ctx = 0; fp->tx_pkts_trans_fp = 0; fp->tx_pkts_compl_fp = 0; fp->tx_pkts_compl_intr = 0; #endif fp->tx_lso_wnd_min_len = 0; fp->tx_defrag = 0; fp->tx_nsegs_gt_elem_left = 0; fp->tx_tso_max_nsegs = 0; fp->tx_tso_min_nsegs = 0; fp->err_tx_nsegs_gt_elem_left = 0; fp->err_tx_dmamap_create = 0; fp->err_tx_defrag_dmamap_load = 0; fp->err_tx_non_tso_max_seg = 0; fp->err_tx_dmamap_load = 0; fp->err_tx_defrag = 0; fp->err_tx_free_pkt_null = 0; fp->err_tx_cons_idx_conflict = 0; fp->rx_pkts = 0; fp->err_m_getcl = 0; fp->err_m_getjcl = 0; } return; } void qlnx_free_mem_sb(qlnx_host_t *ha, struct ecore_sb_info *sb_info) { struct ecore_dev *cdev; cdev = &ha->cdev; if (sb_info->sb_virt) { OSAL_DMA_FREE_COHERENT(cdev, ((void *)sb_info->sb_virt), (sb_info->sb_phys), (sizeof(*sb_info->sb_virt))); sb_info->sb_virt = NULL; } } static int qlnx_sb_init(struct ecore_dev *cdev, struct ecore_sb_info *sb_info, void *sb_virt_addr, bus_addr_t sb_phy_addr, u16 sb_id) { struct ecore_hwfn *p_hwfn; int hwfn_index, rc; u16 rel_sb_id; hwfn_index = sb_id % cdev->num_hwfns; p_hwfn = &cdev->hwfns[hwfn_index]; rel_sb_id = sb_id / cdev->num_hwfns; QL_DPRINT2(((qlnx_host_t *)cdev), "hwfn_index = %d p_hwfn = %p sb_id = 0x%x rel_sb_id = 0x%x \ sb_info = %p sb_virt_addr = %p sb_phy_addr = %p\n", hwfn_index, p_hwfn, sb_id, rel_sb_id, sb_info, sb_virt_addr, (void *)sb_phy_addr); rc = ecore_int_sb_init(p_hwfn, p_hwfn->p_main_ptt, sb_info, sb_virt_addr, sb_phy_addr, rel_sb_id); return rc; } /* This function allocates fast-path status block memory */ int qlnx_alloc_mem_sb(qlnx_host_t *ha, struct ecore_sb_info *sb_info, u16 sb_id) { struct status_block_e4 *sb_virt; bus_addr_t sb_phys; int rc; uint32_t size; struct ecore_dev *cdev; cdev = &ha->cdev; size = sizeof(*sb_virt); sb_virt = OSAL_DMA_ALLOC_COHERENT(cdev, (&sb_phys), size); if (!sb_virt) { QL_DPRINT1(ha, "Status block allocation failed\n"); return -ENOMEM; } rc = qlnx_sb_init(cdev, sb_info, sb_virt, sb_phys, sb_id); if (rc) { OSAL_DMA_FREE_COHERENT(cdev, sb_virt, sb_phys, size); } return rc; } static void qlnx_free_rx_buffers(qlnx_host_t *ha, struct qlnx_rx_queue *rxq) { int i; struct sw_rx_data *rx_buf; for (i = 0; i < rxq->num_rx_buffers; i++) { rx_buf = &rxq->sw_rx_ring[i]; if (rx_buf->data != NULL) { if (rx_buf->map != NULL) { bus_dmamap_unload(ha->rx_tag, rx_buf->map); bus_dmamap_destroy(ha->rx_tag, rx_buf->map); rx_buf->map = NULL; } m_freem(rx_buf->data); rx_buf->data = NULL; } } return; } static void qlnx_free_mem_rxq(qlnx_host_t *ha, struct qlnx_rx_queue *rxq) { struct ecore_dev *cdev; int i; cdev = &ha->cdev; qlnx_free_rx_buffers(ha, rxq); for (i = 0; i < ETH_TPA_MAX_AGGS_NUM; i++) { qlnx_free_tpa_mbuf(ha, &rxq->tpa_info[i]); if (rxq->tpa_info[i].mpf != NULL) m_freem(rxq->tpa_info[i].mpf); } bzero((void *)&rxq->sw_rx_ring[0], (sizeof (struct sw_rx_data) * RX_RING_SIZE)); /* Free the real RQ ring used by FW */ if (rxq->rx_bd_ring.p_virt_addr) { ecore_chain_free(cdev, &rxq->rx_bd_ring); rxq->rx_bd_ring.p_virt_addr = NULL; } /* Free the real completion ring used by FW */ if (rxq->rx_comp_ring.p_virt_addr && rxq->rx_comp_ring.pbl_sp.p_virt_table) { ecore_chain_free(cdev, &rxq->rx_comp_ring); rxq->rx_comp_ring.p_virt_addr = NULL; rxq->rx_comp_ring.pbl_sp.p_virt_table = NULL; } #ifdef QLNX_SOFT_LRO { struct lro_ctrl *lro; lro = &rxq->lro; tcp_lro_free(lro); } #endif /* #ifdef QLNX_SOFT_LRO */ return; } static int qlnx_alloc_rx_buffer(qlnx_host_t *ha, struct qlnx_rx_queue *rxq) { register struct mbuf *mp; uint16_t rx_buf_size; struct sw_rx_data *sw_rx_data; struct eth_rx_bd *rx_bd; dma_addr_t dma_addr; bus_dmamap_t map; bus_dma_segment_t segs[1]; int nsegs; int ret; struct ecore_dev *cdev; cdev = &ha->cdev; rx_buf_size = rxq->rx_buf_size; mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx_buf_size); if (mp == NULL) { QL_DPRINT1(ha, "Failed to allocate Rx data\n"); return -ENOMEM; } mp->m_len = mp->m_pkthdr.len = rx_buf_size; map = (bus_dmamap_t)0; ret = bus_dmamap_load_mbuf_sg(ha->rx_tag, map, mp, segs, &nsegs, BUS_DMA_NOWAIT); dma_addr = segs[0].ds_addr; if (ret || !dma_addr || (nsegs != 1)) { m_freem(mp); QL_DPRINT1(ha, "bus_dmamap_load failed[%d, 0x%016llx, %d]\n", ret, (long long unsigned int)dma_addr, nsegs); return -ENOMEM; } sw_rx_data = &rxq->sw_rx_ring[rxq->sw_rx_prod]; sw_rx_data->data = mp; sw_rx_data->dma_addr = dma_addr; sw_rx_data->map = map; /* Advance PROD and get BD pointer */ rx_bd = (struct eth_rx_bd *)ecore_chain_produce(&rxq->rx_bd_ring); rx_bd->addr.hi = htole32(U64_HI(dma_addr)); rx_bd->addr.lo = htole32(U64_LO(dma_addr)); bus_dmamap_sync(ha->rx_tag, map, BUS_DMASYNC_PREREAD); rxq->sw_rx_prod = (rxq->sw_rx_prod + 1) & (RX_RING_SIZE - 1); return 0; } static int qlnx_alloc_tpa_mbuf(qlnx_host_t *ha, uint16_t rx_buf_size, struct qlnx_agg_info *tpa) { struct mbuf *mp; dma_addr_t dma_addr; bus_dmamap_t map; bus_dma_segment_t segs[1]; int nsegs; int ret; struct sw_rx_data *rx_buf; mp = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rx_buf_size); if (mp == NULL) { QL_DPRINT1(ha, "Failed to allocate Rx data\n"); return -ENOMEM; } mp->m_len = mp->m_pkthdr.len = rx_buf_size; map = (bus_dmamap_t)0; ret = bus_dmamap_load_mbuf_sg(ha->rx_tag, map, mp, segs, &nsegs, BUS_DMA_NOWAIT); dma_addr = segs[0].ds_addr; if (ret || !dma_addr || (nsegs != 1)) { m_freem(mp); QL_DPRINT1(ha, "bus_dmamap_load failed[%d, 0x%016llx, %d]\n", ret, (long long unsigned int)dma_addr, nsegs); return -ENOMEM; } rx_buf = &tpa->rx_buf; memset(rx_buf, 0, sizeof (struct sw_rx_data)); rx_buf->data = mp; rx_buf->dma_addr = dma_addr; rx_buf->map = map; bus_dmamap_sync(ha->rx_tag, map, BUS_DMASYNC_PREREAD); return (0); } static void qlnx_free_tpa_mbuf(qlnx_host_t *ha, struct qlnx_agg_info *tpa) { struct sw_rx_data *rx_buf; rx_buf = &tpa->rx_buf; if (rx_buf->data != NULL) { if (rx_buf->map != NULL) { bus_dmamap_unload(ha->rx_tag, rx_buf->map); bus_dmamap_destroy(ha->rx_tag, rx_buf->map); rx_buf->map = NULL; } m_freem(rx_buf->data); rx_buf->data = NULL; } return; } /* This function allocates all memory needed per Rx queue */ static int qlnx_alloc_mem_rxq(qlnx_host_t *ha, struct qlnx_rx_queue *rxq) { int i, rc, num_allocated; struct ifnet *ifp; struct ecore_dev *cdev; cdev = &ha->cdev; ifp = ha->ifp; rxq->num_rx_buffers = RX_RING_SIZE; rxq->rx_buf_size = ha->rx_buf_size; /* Allocate the parallel driver ring for Rx buffers */ bzero((void *)&rxq->sw_rx_ring[0], (sizeof (struct sw_rx_data) * RX_RING_SIZE)); /* Allocate FW Rx ring */ rc = ecore_chain_alloc(cdev, ECORE_CHAIN_USE_TO_CONSUME_PRODUCE, ECORE_CHAIN_MODE_NEXT_PTR, ECORE_CHAIN_CNT_TYPE_U16, RX_RING_SIZE, sizeof(struct eth_rx_bd), &rxq->rx_bd_ring, NULL); if (rc) goto err; /* Allocate FW completion ring */ rc = ecore_chain_alloc(cdev, ECORE_CHAIN_USE_TO_CONSUME, ECORE_CHAIN_MODE_PBL, ECORE_CHAIN_CNT_TYPE_U16, RX_RING_SIZE, sizeof(union eth_rx_cqe), &rxq->rx_comp_ring, NULL); if (rc) goto err; /* Allocate buffers for the Rx ring */ for (i = 0; i < ETH_TPA_MAX_AGGS_NUM; i++) { rc = qlnx_alloc_tpa_mbuf(ha, rxq->rx_buf_size, &rxq->tpa_info[i]); if (rc) break; } for (i = 0; i < rxq->num_rx_buffers; i++) { rc = qlnx_alloc_rx_buffer(ha, rxq); if (rc) break; } num_allocated = i; if (!num_allocated) { QL_DPRINT1(ha, "Rx buffers allocation failed\n"); goto err; } else if (num_allocated < rxq->num_rx_buffers) { QL_DPRINT1(ha, "Allocated less buffers than" " desired (%d allocated)\n", num_allocated); } #ifdef QLNX_SOFT_LRO { struct lro_ctrl *lro; lro = &rxq->lro; #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) if (tcp_lro_init_args(lro, ifp, 0, rxq->num_rx_buffers)) { QL_DPRINT1(ha, "tcp_lro_init[%d] failed\n", rxq->rxq_id); goto err; } #else if (tcp_lro_init(lro)) { QL_DPRINT1(ha, "tcp_lro_init[%d] failed\n", rxq->rxq_id); goto err; } #endif /* #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) */ lro->ifp = ha->ifp; } #endif /* #ifdef QLNX_SOFT_LRO */ return 0; err: qlnx_free_mem_rxq(ha, rxq); return -ENOMEM; } static void qlnx_free_mem_txq(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct qlnx_tx_queue *txq) { struct ecore_dev *cdev; cdev = &ha->cdev; bzero((void *)&txq->sw_tx_ring[0], (sizeof (struct sw_tx_bd) * TX_RING_SIZE)); /* Free the real RQ ring used by FW */ if (txq->tx_pbl.p_virt_addr) { ecore_chain_free(cdev, &txq->tx_pbl); txq->tx_pbl.p_virt_addr = NULL; } return; } /* This function allocates all memory needed per Tx queue */ static int qlnx_alloc_mem_txq(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct qlnx_tx_queue *txq) { int ret = ECORE_SUCCESS; union eth_tx_bd_types *p_virt; struct ecore_dev *cdev; cdev = &ha->cdev; bzero((void *)&txq->sw_tx_ring[0], (sizeof (struct sw_tx_bd) * TX_RING_SIZE)); /* Allocate the real Tx ring to be used by FW */ ret = ecore_chain_alloc(cdev, ECORE_CHAIN_USE_TO_CONSUME_PRODUCE, ECORE_CHAIN_MODE_PBL, ECORE_CHAIN_CNT_TYPE_U16, TX_RING_SIZE, sizeof(*p_virt), &txq->tx_pbl, NULL); if (ret != ECORE_SUCCESS) { goto err; } txq->num_tx_buffers = TX_RING_SIZE; return 0; err: qlnx_free_mem_txq(ha, fp, txq); return -ENOMEM; } static void qlnx_free_tx_br(qlnx_host_t *ha, struct qlnx_fastpath *fp) { struct mbuf *mp; struct ifnet *ifp = ha->ifp; if (mtx_initialized(&fp->tx_mtx)) { if (fp->tx_br != NULL) { mtx_lock(&fp->tx_mtx); while ((mp = drbr_dequeue(ifp, fp->tx_br)) != NULL) { fp->tx_pkts_freed++; m_freem(mp); } mtx_unlock(&fp->tx_mtx); buf_ring_free(fp->tx_br, M_DEVBUF); fp->tx_br = NULL; } mtx_destroy(&fp->tx_mtx); } return; } static void qlnx_free_mem_fp(qlnx_host_t *ha, struct qlnx_fastpath *fp) { int tc; qlnx_free_mem_sb(ha, fp->sb_info); qlnx_free_mem_rxq(ha, fp->rxq); for (tc = 0; tc < ha->num_tc; tc++) qlnx_free_mem_txq(ha, fp, fp->txq[tc]); return; } static int qlnx_alloc_tx_br(qlnx_host_t *ha, struct qlnx_fastpath *fp) { snprintf(fp->tx_mtx_name, sizeof(fp->tx_mtx_name), "qlnx%d_fp%d_tx_mq_lock", ha->dev_unit, fp->rss_id); mtx_init(&fp->tx_mtx, fp->tx_mtx_name, NULL, MTX_DEF); fp->tx_br = buf_ring_alloc(TX_RING_SIZE, M_DEVBUF, M_NOWAIT, &fp->tx_mtx); if (fp->tx_br == NULL) { QL_DPRINT1(ha, "buf_ring_alloc failed for fp[%d, %d]\n", ha->dev_unit, fp->rss_id); return -ENOMEM; } return 0; } static int qlnx_alloc_mem_fp(qlnx_host_t *ha, struct qlnx_fastpath *fp) { int rc, tc; rc = qlnx_alloc_mem_sb(ha, fp->sb_info, fp->rss_id); if (rc) goto err; if (ha->rx_jumbo_buf_eq_mtu) { if (ha->max_frame_size <= MCLBYTES) ha->rx_buf_size = MCLBYTES; else if (ha->max_frame_size <= MJUMPAGESIZE) ha->rx_buf_size = MJUMPAGESIZE; else if (ha->max_frame_size <= MJUM9BYTES) ha->rx_buf_size = MJUM9BYTES; else if (ha->max_frame_size <= MJUM16BYTES) ha->rx_buf_size = MJUM16BYTES; } else { if (ha->max_frame_size <= MCLBYTES) ha->rx_buf_size = MCLBYTES; else ha->rx_buf_size = MJUMPAGESIZE; } rc = qlnx_alloc_mem_rxq(ha, fp->rxq); if (rc) goto err; for (tc = 0; tc < ha->num_tc; tc++) { rc = qlnx_alloc_mem_txq(ha, fp, fp->txq[tc]); if (rc) goto err; } return 0; err: qlnx_free_mem_fp(ha, fp); return -ENOMEM; } static void qlnx_free_mem_load(qlnx_host_t *ha) { int i; struct ecore_dev *cdev; cdev = &ha->cdev; for (i = 0; i < ha->num_rss; i++) { struct qlnx_fastpath *fp = &ha->fp_array[i]; qlnx_free_mem_fp(ha, fp); } return; } static int qlnx_alloc_mem_load(qlnx_host_t *ha) { int rc = 0, rss_id; for (rss_id = 0; rss_id < ha->num_rss; rss_id++) { struct qlnx_fastpath *fp = &ha->fp_array[rss_id]; rc = qlnx_alloc_mem_fp(ha, fp); if (rc) break; } return (rc); } static int qlnx_start_vport(struct ecore_dev *cdev, u8 vport_id, u16 mtu, u8 drop_ttl0_flg, u8 inner_vlan_removal_en_flg, u8 tx_switching, u8 hw_lro_enable) { int rc, i; struct ecore_sp_vport_start_params vport_start_params = { 0 }; qlnx_host_t *ha; ha = (qlnx_host_t *)cdev; vport_start_params.remove_inner_vlan = inner_vlan_removal_en_flg; vport_start_params.tx_switching = 0; vport_start_params.handle_ptp_pkts = 0; vport_start_params.only_untagged = 0; vport_start_params.drop_ttl0 = drop_ttl0_flg; vport_start_params.tpa_mode = (hw_lro_enable ? ECORE_TPA_MODE_RSC : ECORE_TPA_MODE_NONE); vport_start_params.max_buffers_per_cqe = QLNX_TPA_MAX_AGG_BUFFERS; vport_start_params.vport_id = vport_id; vport_start_params.mtu = mtu; QL_DPRINT2(ha, "Setting mtu to %d and VPORT ID = %d\n", mtu, vport_id); for_each_hwfn(cdev, i) { struct ecore_hwfn *p_hwfn = &cdev->hwfns[i]; vport_start_params.concrete_fid = p_hwfn->hw_info.concrete_fid; vport_start_params.opaque_fid = p_hwfn->hw_info.opaque_fid; rc = ecore_sp_vport_start(p_hwfn, &vport_start_params); if (rc) { QL_DPRINT1(ha, "Failed to start VPORT V-PORT %d" " with MTU %d\n" , vport_id, mtu); return -ENOMEM; } ecore_hw_start_fastpath(p_hwfn); QL_DPRINT2(ha, "Started V-PORT %d with MTU %d\n", vport_id, mtu); } return 0; } static int qlnx_update_vport(struct ecore_dev *cdev, struct qlnx_update_vport_params *params) { struct ecore_sp_vport_update_params sp_params; int rc, i, j, fp_index; struct ecore_hwfn *p_hwfn; struct ecore_rss_params *rss; qlnx_host_t *ha = (qlnx_host_t *)cdev; struct qlnx_fastpath *fp; memset(&sp_params, 0, sizeof(sp_params)); /* Translate protocol params into sp params */ sp_params.vport_id = params->vport_id; sp_params.update_vport_active_rx_flg = params->update_vport_active_rx_flg; sp_params.vport_active_rx_flg = params->vport_active_rx_flg; sp_params.update_vport_active_tx_flg = params->update_vport_active_tx_flg; sp_params.vport_active_tx_flg = params->vport_active_tx_flg; sp_params.update_inner_vlan_removal_flg = params->update_inner_vlan_removal_flg; sp_params.inner_vlan_removal_flg = params->inner_vlan_removal_flg; sp_params.sge_tpa_params = params->sge_tpa_params; /* RSS - is a bit tricky, since upper-layer isn't familiar with hwfns. * We need to re-fix the rss values per engine for CMT. */ if (params->rss_params->update_rss_config) sp_params.rss_params = params->rss_params; else sp_params.rss_params = NULL; for_each_hwfn(cdev, i) { p_hwfn = &cdev->hwfns[i]; if ((cdev->num_hwfns > 1) && params->rss_params->update_rss_config && params->rss_params->rss_enable) { rss = params->rss_params; for (j = 0; j < ECORE_RSS_IND_TABLE_SIZE; j++) { fp_index = ((cdev->num_hwfns * j) + i) % ha->num_rss; fp = &ha->fp_array[fp_index]; rss->rss_ind_table[j] = fp->rxq->handle; } for (j = 0; j < ECORE_RSS_IND_TABLE_SIZE;) { QL_DPRINT3(ha, "%p %p %p %p %p %p %p %p \n", rss->rss_ind_table[j], rss->rss_ind_table[j+1], rss->rss_ind_table[j+2], rss->rss_ind_table[j+3], rss->rss_ind_table[j+4], rss->rss_ind_table[j+5], rss->rss_ind_table[j+6], rss->rss_ind_table[j+7]); j += 8; } } sp_params.opaque_fid = p_hwfn->hw_info.opaque_fid; QL_DPRINT1(ha, "Update sp vport ID=%d\n", params->vport_id); rc = ecore_sp_vport_update(p_hwfn, &sp_params, ECORE_SPQ_MODE_EBLOCK, NULL); if (rc) { QL_DPRINT1(ha, "Failed to update VPORT\n"); return rc; } QL_DPRINT2(ha, "Updated V-PORT %d: tx_active_flag %d, \ rx_active_flag %d [tx_update %d], [rx_update %d]\n", params->vport_id, params->vport_active_tx_flg, params->vport_active_rx_flg, params->update_vport_active_tx_flg, params->update_vport_active_rx_flg); } return 0; } static void qlnx_reuse_rx_data(struct qlnx_rx_queue *rxq) { struct eth_rx_bd *rx_bd_cons = ecore_chain_consume(&rxq->rx_bd_ring); struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq->rx_bd_ring); struct sw_rx_data *sw_rx_data_cons = &rxq->sw_rx_ring[rxq->sw_rx_cons]; struct sw_rx_data *sw_rx_data_prod = &rxq->sw_rx_ring[rxq->sw_rx_prod]; sw_rx_data_prod->data = sw_rx_data_cons->data; memcpy(rx_bd_prod, rx_bd_cons, sizeof(struct eth_rx_bd)); rxq->sw_rx_cons = (rxq->sw_rx_cons + 1) & (RX_RING_SIZE - 1); rxq->sw_rx_prod = (rxq->sw_rx_prod + 1) & (RX_RING_SIZE - 1); return; } static void qlnx_update_rx_prod(struct ecore_hwfn *p_hwfn, struct qlnx_rx_queue *rxq) { uint16_t bd_prod; uint16_t cqe_prod; union { struct eth_rx_prod_data rx_prod_data; uint32_t data32; } rx_prods; bd_prod = ecore_chain_get_prod_idx(&rxq->rx_bd_ring); cqe_prod = ecore_chain_get_prod_idx(&rxq->rx_comp_ring); /* Update producers */ rx_prods.rx_prod_data.bd_prod = htole16(bd_prod); rx_prods.rx_prod_data.cqe_prod = htole16(cqe_prod); /* Make sure that the BD and SGE data is updated before updating the * producers since FW might read the BD/SGE right after the producer * is updated. */ wmb(); internal_ram_wr(p_hwfn, rxq->hw_rxq_prod_addr, sizeof(rx_prods), &rx_prods.data32); /* mmiowb is needed to synchronize doorbell writes from more than one * processor. It guarantees that the write arrives to the device before * the napi lock is released and another qlnx_poll is called (possibly * on another CPU). Without this barrier, the next doorbell can bypass * this doorbell. This is applicable to IA64/Altix systems. */ wmb(); return; } static uint32_t qlnx_hash_key[] = { ((0x6d << 24)|(0x5a << 16)|(0x56 << 8)|0xda), ((0x25 << 24)|(0x5b << 16)|(0x0e << 8)|0xc2), ((0x41 << 24)|(0x67 << 16)|(0x25 << 8)|0x3d), ((0x43 << 24)|(0xa3 << 16)|(0x8f << 8)|0xb0), ((0xd0 << 24)|(0xca << 16)|(0x2b << 8)|0xcb), ((0xae << 24)|(0x7b << 16)|(0x30 << 8)|0xb4), ((0x77 << 24)|(0xcb << 16)|(0x2d << 8)|0xa3), ((0x80 << 24)|(0x30 << 16)|(0xf2 << 8)|0x0c), ((0x6a << 24)|(0x42 << 16)|(0xb7 << 8)|0x3b), ((0xbe << 24)|(0xac << 16)|(0x01 << 8)|0xfa)}; static int qlnx_start_queues(qlnx_host_t *ha) { int rc, tc, i, vport_id = 0, drop_ttl0_flg = 1, vlan_removal_en = 1, tx_switching = 0, hw_lro_enable = 0; struct ecore_dev *cdev = &ha->cdev; struct ecore_rss_params *rss_params = &ha->rss_params; struct qlnx_update_vport_params vport_update_params; struct ifnet *ifp; struct ecore_hwfn *p_hwfn; struct ecore_sge_tpa_params tpa_params; struct ecore_queue_start_common_params qparams; struct qlnx_fastpath *fp; ifp = ha->ifp; QL_DPRINT1(ha, "Num RSS = %d\n", ha->num_rss); if (!ha->num_rss) { QL_DPRINT1(ha, "Cannot update V-VPORT as active as there" " are no Rx queues\n"); return -EINVAL; } #ifndef QLNX_SOFT_LRO hw_lro_enable = ifp->if_capenable & IFCAP_LRO; #endif /* #ifndef QLNX_SOFT_LRO */ rc = qlnx_start_vport(cdev, vport_id, ifp->if_mtu, drop_ttl0_flg, vlan_removal_en, tx_switching, hw_lro_enable); if (rc) { QL_DPRINT1(ha, "Start V-PORT failed %d\n", rc); return rc; } QL_DPRINT2(ha, "Start vport ramrod passed, " "vport_id = %d, MTU = %d, vlan_removal_en = %d\n", vport_id, (int)(ifp->if_mtu + 0xe), vlan_removal_en); for_each_rss(i) { struct ecore_rxq_start_ret_params rx_ret_params; struct ecore_txq_start_ret_params tx_ret_params; fp = &ha->fp_array[i]; p_hwfn = &cdev->hwfns[(fp->rss_id % cdev->num_hwfns)]; bzero(&qparams, sizeof(struct ecore_queue_start_common_params)); bzero(&rx_ret_params, sizeof (struct ecore_rxq_start_ret_params)); qparams.queue_id = i ; qparams.vport_id = vport_id; qparams.stats_id = vport_id; qparams.p_sb = fp->sb_info; qparams.sb_idx = RX_PI; rc = ecore_eth_rx_queue_start(p_hwfn, p_hwfn->hw_info.opaque_fid, &qparams, fp->rxq->rx_buf_size, /* bd_max_bytes */ /* bd_chain_phys_addr */ fp->rxq->rx_bd_ring.p_phys_addr, /* cqe_pbl_addr */ ecore_chain_get_pbl_phys(&fp->rxq->rx_comp_ring), /* cqe_pbl_size */ ecore_chain_get_page_cnt(&fp->rxq->rx_comp_ring), &rx_ret_params); if (rc) { QL_DPRINT1(ha, "Start RXQ #%d failed %d\n", i, rc); return rc; } fp->rxq->hw_rxq_prod_addr = rx_ret_params.p_prod; fp->rxq->handle = rx_ret_params.p_handle; fp->rxq->hw_cons_ptr = &fp->sb_info->sb_virt->pi_array[RX_PI]; qlnx_update_rx_prod(p_hwfn, fp->rxq); for (tc = 0; tc < ha->num_tc; tc++) { struct qlnx_tx_queue *txq = fp->txq[tc]; bzero(&qparams, sizeof(struct ecore_queue_start_common_params)); bzero(&tx_ret_params, sizeof (struct ecore_txq_start_ret_params)); qparams.queue_id = txq->index / cdev->num_hwfns ; qparams.vport_id = vport_id; qparams.stats_id = vport_id; qparams.p_sb = fp->sb_info; qparams.sb_idx = TX_PI(tc); rc = ecore_eth_tx_queue_start(p_hwfn, p_hwfn->hw_info.opaque_fid, &qparams, tc, /* bd_chain_phys_addr */ ecore_chain_get_pbl_phys(&txq->tx_pbl), ecore_chain_get_page_cnt(&txq->tx_pbl), &tx_ret_params); if (rc) { QL_DPRINT1(ha, "Start TXQ #%d failed %d\n", txq->index, rc); return rc; } txq->doorbell_addr = tx_ret_params.p_doorbell; txq->handle = tx_ret_params.p_handle; txq->hw_cons_ptr = &fp->sb_info->sb_virt->pi_array[TX_PI(tc)]; SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_DEST, DB_DEST_XCM); SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_AGG_CMD, DB_AGG_CMD_SET); SET_FIELD(txq->tx_db.data.params, ETH_DB_DATA_AGG_VAL_SEL, DQ_XCM_ETH_TX_BD_PROD_CMD); txq->tx_db.data.agg_flags = DQ_XCM_ETH_DQ_CF_CMD; } } /* Fill struct with RSS params */ if (ha->num_rss > 1) { rss_params->update_rss_config = 1; rss_params->rss_enable = 1; rss_params->update_rss_capabilities = 1; rss_params->update_rss_ind_table = 1; rss_params->update_rss_key = 1; rss_params->rss_caps = ECORE_RSS_IPV4 | ECORE_RSS_IPV6 | ECORE_RSS_IPV4_TCP | ECORE_RSS_IPV6_TCP; rss_params->rss_table_size_log = 7; /* 2^7 = 128 */ for (i = 0; i < ECORE_RSS_IND_TABLE_SIZE; i++) { fp = &ha->fp_array[(i % ha->num_rss)]; rss_params->rss_ind_table[i] = fp->rxq->handle; } for (i = 0; i < ECORE_RSS_KEY_SIZE; i++) rss_params->rss_key[i] = (__le32)qlnx_hash_key[i]; } else { memset(rss_params, 0, sizeof(*rss_params)); } /* Prepare and send the vport enable */ memset(&vport_update_params, 0, sizeof(vport_update_params)); vport_update_params.vport_id = vport_id; vport_update_params.update_vport_active_tx_flg = 1; vport_update_params.vport_active_tx_flg = 1; vport_update_params.update_vport_active_rx_flg = 1; vport_update_params.vport_active_rx_flg = 1; vport_update_params.rss_params = rss_params; vport_update_params.update_inner_vlan_removal_flg = 1; vport_update_params.inner_vlan_removal_flg = 1; if (hw_lro_enable) { memset(&tpa_params, 0, sizeof (struct ecore_sge_tpa_params)); tpa_params.max_buffers_per_cqe = QLNX_TPA_MAX_AGG_BUFFERS; tpa_params.update_tpa_en_flg = 1; tpa_params.tpa_ipv4_en_flg = 1; tpa_params.tpa_ipv6_en_flg = 1; tpa_params.update_tpa_param_flg = 1; tpa_params.tpa_pkt_split_flg = 0; tpa_params.tpa_hdr_data_split_flg = 0; tpa_params.tpa_gro_consistent_flg = 0; tpa_params.tpa_max_aggs_num = ETH_TPA_MAX_AGGS_NUM; tpa_params.tpa_max_size = (uint16_t)(-1); tpa_params.tpa_min_size_to_start = ifp->if_mtu/2; tpa_params.tpa_min_size_to_cont = ifp->if_mtu/2; vport_update_params.sge_tpa_params = &tpa_params; } rc = qlnx_update_vport(cdev, &vport_update_params); if (rc) { QL_DPRINT1(ha, "Update V-PORT failed %d\n", rc); return rc; } return 0; } static int qlnx_drain_txq(qlnx_host_t *ha, struct qlnx_fastpath *fp, struct qlnx_tx_queue *txq) { uint16_t hw_bd_cons; uint16_t ecore_cons_idx; QL_DPRINT2(ha, "enter\n"); hw_bd_cons = le16toh(*txq->hw_cons_ptr); while (hw_bd_cons != (ecore_cons_idx = ecore_chain_get_cons_idx(&txq->tx_pbl))) { mtx_lock(&fp->tx_mtx); (void)qlnx_tx_int(ha, fp, txq); mtx_unlock(&fp->tx_mtx); qlnx_mdelay(__func__, 2); hw_bd_cons = le16toh(*txq->hw_cons_ptr); } QL_DPRINT2(ha, "[%d, %d]: done\n", fp->rss_id, txq->index); return 0; } static int qlnx_stop_queues(qlnx_host_t *ha) { struct qlnx_update_vport_params vport_update_params; struct ecore_dev *cdev; struct qlnx_fastpath *fp; int rc, tc, i; cdev = &ha->cdev; /* Disable the vport */ memset(&vport_update_params, 0, sizeof(vport_update_params)); vport_update_params.vport_id = 0; vport_update_params.update_vport_active_tx_flg = 1; vport_update_params.vport_active_tx_flg = 0; vport_update_params.update_vport_active_rx_flg = 1; vport_update_params.vport_active_rx_flg = 0; vport_update_params.rss_params = &ha->rss_params; vport_update_params.rss_params->update_rss_config = 0; vport_update_params.rss_params->rss_enable = 0; vport_update_params.update_inner_vlan_removal_flg = 0; vport_update_params.inner_vlan_removal_flg = 0; QL_DPRINT1(ha, "Update vport ID= %d\n", vport_update_params.vport_id); rc = qlnx_update_vport(cdev, &vport_update_params); if (rc) { QL_DPRINT1(ha, "Failed to update vport\n"); return rc; } /* Flush Tx queues. If needed, request drain from MCP */ for_each_rss(i) { fp = &ha->fp_array[i]; for (tc = 0; tc < ha->num_tc; tc++) { struct qlnx_tx_queue *txq = fp->txq[tc]; rc = qlnx_drain_txq(ha, fp, txq); if (rc) return rc; } } /* Stop all Queues in reverse order*/ for (i = ha->num_rss - 1; i >= 0; i--) { struct ecore_hwfn *p_hwfn = &cdev->hwfns[(i % cdev->num_hwfns)]; fp = &ha->fp_array[i]; /* Stop the Tx Queue(s)*/ for (tc = 0; tc < ha->num_tc; tc++) { int tx_queue_id; tx_queue_id = tc * ha->num_rss + i; rc = ecore_eth_tx_queue_stop(p_hwfn, fp->txq[tc]->handle); if (rc) { QL_DPRINT1(ha, "Failed to stop TXQ #%d\n", tx_queue_id); return rc; } } /* Stop the Rx Queue*/ rc = ecore_eth_rx_queue_stop(p_hwfn, fp->rxq->handle, false, false); if (rc) { QL_DPRINT1(ha, "Failed to stop RXQ #%d\n", i); return rc; } } /* Stop the vport */ for_each_hwfn(cdev, i) { struct ecore_hwfn *p_hwfn = &cdev->hwfns[i]; rc = ecore_sp_vport_stop(p_hwfn, p_hwfn->hw_info.opaque_fid, 0); if (rc) { QL_DPRINT1(ha, "Failed to stop VPORT\n"); return rc; } } return rc; } static int qlnx_set_ucast_rx_mac(qlnx_host_t *ha, enum ecore_filter_opcode opcode, unsigned char mac[ETH_ALEN]) { struct ecore_filter_ucast ucast; struct ecore_dev *cdev; int rc; cdev = &ha->cdev; bzero(&ucast, sizeof(struct ecore_filter_ucast)); ucast.opcode = opcode; ucast.type = ECORE_FILTER_MAC; ucast.is_rx_filter = 1; ucast.vport_to_add_to = 0; memcpy(&ucast.mac[0], mac, ETH_ALEN); rc = ecore_filter_ucast_cmd(cdev, &ucast, ECORE_SPQ_MODE_CB, NULL); return (rc); } static int qlnx_remove_all_ucast_mac(qlnx_host_t *ha) { struct ecore_filter_ucast ucast; struct ecore_dev *cdev; int rc; bzero(&ucast, sizeof(struct ecore_filter_ucast)); ucast.opcode = ECORE_FILTER_REPLACE; ucast.type = ECORE_FILTER_MAC; ucast.is_rx_filter = 1; cdev = &ha->cdev; rc = ecore_filter_ucast_cmd(cdev, &ucast, ECORE_SPQ_MODE_CB, NULL); return (rc); } static int qlnx_remove_all_mcast_mac(qlnx_host_t *ha) { struct ecore_filter_mcast *mcast; struct ecore_dev *cdev; int rc, i; cdev = &ha->cdev; mcast = &ha->ecore_mcast; bzero(mcast, sizeof(struct ecore_filter_mcast)); mcast->opcode = ECORE_FILTER_REMOVE; for (i = 0; i < QLNX_MAX_NUM_MULTICAST_ADDRS; i++) { if (ha->mcast[i].addr[0] || ha->mcast[i].addr[1] || ha->mcast[i].addr[2] || ha->mcast[i].addr[3] || ha->mcast[i].addr[4] || ha->mcast[i].addr[5]) { memcpy(&mcast->mac[i][0], &ha->mcast[i].addr[0], ETH_ALEN); mcast->num_mc_addrs++; } } mcast = &ha->ecore_mcast; rc = ecore_filter_mcast_cmd(cdev, mcast, ECORE_SPQ_MODE_CB, NULL); bzero(ha->mcast, (sizeof(qlnx_mcast_t) * QLNX_MAX_NUM_MULTICAST_ADDRS)); ha->nmcast = 0; return (rc); } static int qlnx_clean_filters(qlnx_host_t *ha) { int rc = 0; /* Remove all unicast macs */ rc = qlnx_remove_all_ucast_mac(ha); if (rc) return rc; /* Remove all multicast macs */ rc = qlnx_remove_all_mcast_mac(ha); if (rc) return rc; rc = qlnx_set_ucast_rx_mac(ha, ECORE_FILTER_FLUSH, ha->primary_mac); return (rc); } static int qlnx_set_rx_accept_filter(qlnx_host_t *ha, uint8_t filter) { struct ecore_filter_accept_flags accept; int rc = 0; struct ecore_dev *cdev; cdev = &ha->cdev; bzero(&accept, sizeof(struct ecore_filter_accept_flags)); accept.update_rx_mode_config = 1; accept.rx_accept_filter = filter; accept.update_tx_mode_config = 1; accept.tx_accept_filter = ECORE_ACCEPT_UCAST_MATCHED | ECORE_ACCEPT_MCAST_MATCHED | ECORE_ACCEPT_BCAST; rc = ecore_filter_accept_cmd(cdev, 0, accept, false, false, ECORE_SPQ_MODE_CB, NULL); return (rc); } static int qlnx_set_rx_mode(qlnx_host_t *ha) { int rc = 0; uint8_t filter; rc = qlnx_set_ucast_rx_mac(ha, ECORE_FILTER_REPLACE, ha->primary_mac); if (rc) return rc; rc = qlnx_remove_all_mcast_mac(ha); if (rc) return rc; filter = ECORE_ACCEPT_UCAST_MATCHED | ECORE_ACCEPT_MCAST_MATCHED | ECORE_ACCEPT_BCAST; if (qlnx_vf_device(ha) == 0) { filter |= ECORE_ACCEPT_UCAST_UNMATCHED; filter |= ECORE_ACCEPT_MCAST_UNMATCHED; } ha->filter = filter; rc = qlnx_set_rx_accept_filter(ha, filter); return (rc); } static int qlnx_set_link(qlnx_host_t *ha, bool link_up) { int i, rc = 0; struct ecore_dev *cdev; struct ecore_hwfn *hwfn; struct ecore_ptt *ptt; if (qlnx_vf_device(ha) == 0) return (0); cdev = &ha->cdev; for_each_hwfn(cdev, i) { hwfn = &cdev->hwfns[i]; ptt = ecore_ptt_acquire(hwfn); if (!ptt) return -EBUSY; rc = ecore_mcp_set_link(hwfn, ptt, link_up); ecore_ptt_release(hwfn, ptt); if (rc) return rc; } return (rc); } #if __FreeBSD_version >= 1100000 static uint64_t qlnx_get_counter(if_t ifp, ift_counter cnt) { qlnx_host_t *ha; uint64_t count; ha = (qlnx_host_t *)if_getsoftc(ifp); switch (cnt) { case IFCOUNTER_IPACKETS: count = ha->hw_stats.common.rx_ucast_pkts + ha->hw_stats.common.rx_mcast_pkts + ha->hw_stats.common.rx_bcast_pkts; break; case IFCOUNTER_IERRORS: count = ha->hw_stats.common.rx_crc_errors + ha->hw_stats.common.rx_align_errors + ha->hw_stats.common.rx_oversize_packets + ha->hw_stats.common.rx_undersize_packets; break; case IFCOUNTER_OPACKETS: count = ha->hw_stats.common.tx_ucast_pkts + ha->hw_stats.common.tx_mcast_pkts + ha->hw_stats.common.tx_bcast_pkts; break; case IFCOUNTER_OERRORS: count = ha->hw_stats.common.tx_err_drop_pkts; break; case IFCOUNTER_COLLISIONS: return (0); case IFCOUNTER_IBYTES: count = ha->hw_stats.common.rx_ucast_bytes + ha->hw_stats.common.rx_mcast_bytes + ha->hw_stats.common.rx_bcast_bytes; break; case IFCOUNTER_OBYTES: count = ha->hw_stats.common.tx_ucast_bytes + ha->hw_stats.common.tx_mcast_bytes + ha->hw_stats.common.tx_bcast_bytes; break; case IFCOUNTER_IMCASTS: count = ha->hw_stats.common.rx_mcast_bytes; break; case IFCOUNTER_OMCASTS: count = ha->hw_stats.common.tx_mcast_bytes; break; case IFCOUNTER_IQDROPS: case IFCOUNTER_OQDROPS: case IFCOUNTER_NOPROTO: default: return (if_get_counter_default(ifp, cnt)); } return (count); } #endif static void qlnx_timer(void *arg) { qlnx_host_t *ha; ha = (qlnx_host_t *)arg; if (ha->error_recovery) { ha->error_recovery = 0; taskqueue_enqueue(ha->err_taskqueue, &ha->err_task); return; } ecore_get_vport_stats(&ha->cdev, &ha->hw_stats); if (ha->storm_stats_gather) qlnx_sample_storm_stats(ha); callout_reset(&ha->qlnx_callout, hz, qlnx_timer, ha); return; } static int qlnx_load(qlnx_host_t *ha) { int i; int rc = 0; struct ecore_dev *cdev; device_t dev; cdev = &ha->cdev; dev = ha->pci_dev; QL_DPRINT2(ha, "enter\n"); rc = qlnx_alloc_mem_arrays(ha); if (rc) goto qlnx_load_exit0; qlnx_init_fp(ha); rc = qlnx_alloc_mem_load(ha); if (rc) goto qlnx_load_exit1; QL_DPRINT2(ha, "Allocated %d RSS queues on %d TC/s\n", ha->num_rss, ha->num_tc); for (i = 0; i < ha->num_rss; i++) { if ((rc = bus_setup_intr(dev, ha->irq_vec[i].irq, (INTR_TYPE_NET | INTR_MPSAFE), NULL, qlnx_fp_isr, &ha->irq_vec[i], &ha->irq_vec[i].handle))) { QL_DPRINT1(ha, "could not setup interrupt\n"); goto qlnx_load_exit2; } QL_DPRINT2(ha, "rss_id = %d irq_rid %d \ irq %p handle %p\n", i, ha->irq_vec[i].irq_rid, ha->irq_vec[i].irq, ha->irq_vec[i].handle); bus_bind_intr(dev, ha->irq_vec[i].irq, (i % mp_ncpus)); } rc = qlnx_start_queues(ha); if (rc) goto qlnx_load_exit2; QL_DPRINT2(ha, "Start VPORT, RXQ and TXQ succeeded\n"); /* Add primary mac and set Rx filters */ rc = qlnx_set_rx_mode(ha); if (rc) goto qlnx_load_exit2; /* Ask for link-up using current configuration */ qlnx_set_link(ha, true); if (qlnx_vf_device(ha) == 0) qlnx_link_update(&ha->cdev.hwfns[0]); ha->state = QLNX_STATE_OPEN; bzero(&ha->hw_stats, sizeof(struct ecore_eth_stats)); if (ha->flags.callout_init) callout_reset(&ha->qlnx_callout, hz, qlnx_timer, ha); goto qlnx_load_exit0; qlnx_load_exit2: qlnx_free_mem_load(ha); qlnx_load_exit1: ha->num_rss = 0; qlnx_load_exit0: QL_DPRINT2(ha, "exit [%d]\n", rc); return rc; } static void qlnx_drain_soft_lro(qlnx_host_t *ha) { #ifdef QLNX_SOFT_LRO struct ifnet *ifp; int i; ifp = ha->ifp; if (ifp->if_capenable & IFCAP_LRO) { for (i = 0; i < ha->num_rss; i++) { struct qlnx_fastpath *fp = &ha->fp_array[i]; struct lro_ctrl *lro; lro = &fp->rxq->lro; #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) tcp_lro_flush_all(lro); #else struct lro_entry *queued; while ((!SLIST_EMPTY(&lro->lro_active))){ queued = SLIST_FIRST(&lro->lro_active); SLIST_REMOVE_HEAD(&lro->lro_active, next); tcp_lro_flush(lro, queued); } #endif /* #if (__FreeBSD_version >= 1100101) || (defined QLNX_QSORT_LRO) */ } } #endif /* #ifdef QLNX_SOFT_LRO */ return; } static void qlnx_unload(qlnx_host_t *ha) { struct ecore_dev *cdev; device_t dev; int i; cdev = &ha->cdev; dev = ha->pci_dev; QL_DPRINT2(ha, "enter\n"); QL_DPRINT1(ha, " QLNX STATE = %d\n",ha->state); if (ha->state == QLNX_STATE_OPEN) { qlnx_set_link(ha, false); qlnx_clean_filters(ha); qlnx_stop_queues(ha); ecore_hw_stop_fastpath(cdev); for (i = 0; i < ha->num_rss; i++) { if (ha->irq_vec[i].handle) { (void)bus_teardown_intr(dev, ha->irq_vec[i].irq, ha->irq_vec[i].handle); ha->irq_vec[i].handle = NULL; } } qlnx_drain_fp_taskqueues(ha); qlnx_drain_soft_lro(ha); qlnx_free_mem_load(ha); } if (ha->flags.callout_init) callout_drain(&ha->qlnx_callout); qlnx_mdelay(__func__, 1000); ha->state = QLNX_STATE_CLOSED; QL_DPRINT2(ha, "exit\n"); return; } static int qlnx_grc_dumpsize(qlnx_host_t *ha, uint32_t *num_dwords, int hwfn_index) { int rval = -1; struct ecore_hwfn *p_hwfn; struct ecore_ptt *p_ptt; ecore_dbg_set_app_ver(ecore_dbg_get_fw_func_ver()); p_hwfn = &ha->cdev.hwfns[hwfn_index]; p_ptt = ecore_ptt_acquire(p_hwfn); if (!p_ptt) { QL_DPRINT1(ha, "ecore_ptt_acquire failed\n"); return (rval); } rval = ecore_dbg_grc_get_dump_buf_size(p_hwfn, p_ptt, num_dwords); if (rval == DBG_STATUS_OK) rval = 0; else { QL_DPRINT1(ha, "ecore_dbg_grc_get_dump_buf_size failed" "[0x%x]\n", rval); } ecore_ptt_release(p_hwfn, p_ptt); return (rval); } static int qlnx_idle_chk_size(qlnx_host_t *ha, uint32_t *num_dwords, int hwfn_index) { int rval = -1; struct ecore_hwfn *p_hwfn; struct ecore_ptt *p_ptt; ecore_dbg_set_app_ver(ecore_dbg_get_fw_func_ver()); p_hwfn = &ha->cdev.hwfns[hwfn_index]; p_ptt = ecore_ptt_acquire(p_hwfn); if (!p_ptt) { QL_DPRINT1(ha, "ecore_ptt_acquire failed\n"); return (rval); } rval = ecore_dbg_idle_chk_get_dump_buf_size(p_hwfn, p_ptt, num_dwords); if (rval == DBG_STATUS_OK) rval = 0; else { QL_DPRINT1(ha, "ecore_dbg_idle_chk_get_dump_buf_size failed" " [0x%x]\n", rval); } ecore_ptt_release(p_hwfn, p_ptt); return (rval); } static void qlnx_sample_storm_stats(qlnx_host_t *ha) { int i, index; struct ecore_dev *cdev; qlnx_storm_stats_t *s_stats; uint32_t reg; struct ecore_ptt *p_ptt; struct ecore_hwfn *hwfn; if (ha->storm_stats_index >= QLNX_STORM_STATS_SAMPLES_PER_HWFN) { ha->storm_stats_gather = 0; return; } cdev = &ha->cdev; for_each_hwfn(cdev, i) { hwfn = &cdev->hwfns[i]; p_ptt = ecore_ptt_acquire(hwfn); if (!p_ptt) return; index = ha->storm_stats_index + (i * QLNX_STORM_STATS_SAMPLES_PER_HWFN); s_stats = &ha->storm_stats[index]; /* XSTORM */ reg = XSEM_REG_FAST_MEMORY + SEM_FAST_REG_STORM_ACTIVE_CYCLES_BB_K2; s_stats->xstorm_active_cycles = ecore_rd(hwfn, p_ptt, reg); reg = XSEM_REG_FAST_MEMORY + SEM_FAST_REG_STORM_STALL_CYCLES_BB_K2; s_stats->xstorm_stall_cycles = ecore_rd(hwfn, p_ptt, reg); reg = XSEM_REG_FAST_MEMORY + SEM_FAST_REG_IDLE_SLEEPING_CYCLES_BB_K2; s_stats->xstorm_sleeping_cycles = ecore_rd(hwfn, p_ptt, reg); reg = XSEM_REG_FAST_MEMORY + SEM_FAST_REG_IDLE_INACTIVE_CYCLES_BB_K2; s_stats->xstorm_inactive_cycles = ecore_rd(hwfn, p_ptt, reg); /* YSTORM */ reg = YSEM_REG_FAST_MEMORY + SEM_FAST_REG_STORM_ACTIVE_CYCLES_BB_K2; s_stats->ystorm_active_cycles = ecore_rd(hwfn, p_ptt, reg); reg = YSEM_REG_FAST_MEMORY + SEM_FAST_REG_STORM_STALL_CYCLES_BB_K2; s_stats->ystorm_stall_cycles = ecore_rd(hwfn, p_ptt, reg); reg = YSEM_REG_FAST_MEMORY + SEM_FAST_REG_IDLE_SLEEPING_CYCLES_BB_K2; s_stats->ystorm_sleeping_cycles = ecore_rd(hwfn, p_ptt, reg); reg = YSEM_REG_FAST_MEMORY + SEM_FAST_REG_IDLE_INACTIVE_CYCLES_BB_K2; s_stats->ystorm_inactive_cycles = ecore_rd(hwfn, p_ptt, reg); /* PSTORM */ reg = PSEM_REG_FAST_MEMORY + SEM_FAST_REG_STORM_ACTIVE_CYCLES_BB_K2; s_stats->pstorm_active_cycles = ecore_rd(hwfn, p_ptt, reg); reg = PSEM_REG_FAST_MEMORY + SEM_FAST_REG_STORM_STALL_CYCLES_BB_K2; s_stats->pstorm_stall_cycles = ecore_rd(hwfn, p_ptt, reg); reg = PSEM_REG_FAST_MEMORY + SEM_FAST_REG_IDLE_SLEEPING_CYCLES_BB_K2; s_stats->pstorm_sleeping_cycles = ecore_rd(hwfn, p_ptt, reg); reg = PSEM_REG_FAST_MEMORY + SEM_FAST_REG_IDLE_INACTIVE_CYCLES_BB_K2; s_stats->pstorm_inactive_cycles = ecore_rd(hwfn, p_ptt, reg); /* TSTORM */ reg = TSEM_REG_FAST_MEMORY + SEM_FAST_REG_STORM_ACTIVE_CYCLES_BB_K2; s_stats->tstorm_active_cycles = ecore_rd(hwfn, p_ptt, reg); reg = TSEM_REG_FAST_MEMORY + SEM_FAST_REG_STORM_STALL_CYCLES_BB_K2; s_stats->tstorm_stall_cycles = ecore_rd(hwfn, p_ptt, reg); reg = TSEM_REG_FAST_MEMORY + SEM_FAST_REG_IDLE_SLEEPING_CYCLES_BB_K2; s_stats->tstorm_sleeping_cycles = ecore_rd(hwfn, p_ptt, reg); reg = TSEM_REG_FAST_MEMORY + SEM_FAST_REG_IDLE_INACTIVE_CYCLES_BB_K2; s_stats->tstorm_inactive_cycles = ecore_rd(hwfn, p_ptt, reg); /* MSTORM */ reg = MSEM_REG_FAST_MEMORY + SEM_FAST_REG_STORM_ACTIVE_CYCLES_BB_K2; s_stats->mstorm_active_cycles = ecore_rd(hwfn, p_ptt, reg); reg = MSEM_REG_FAST_MEMORY + SEM_FAST_REG_STORM_STALL_CYCLES_BB_K2; s_stats->mstorm_stall_cycles = ecore_rd(hwfn, p_ptt, reg); reg = MSEM_REG_FAST_MEMORY + SEM_FAST_REG_IDLE_SLEEPING_CYCLES_BB_K2; s_stats->mstorm_sleeping_cycles = ecore_rd(hwfn, p_ptt, reg); reg = MSEM_REG_FAST_MEMORY + SEM_FAST_REG_IDLE_INACTIVE_CYCLES_BB_K2; s_stats->mstorm_inactive_cycles = ecore_rd(hwfn, p_ptt, reg); /* USTORM */ reg = USEM_REG_FAST_MEMORY + SEM_FAST_REG_STORM_ACTIVE_CYCLES_BB_K2; s_stats->ustorm_active_cycles = ecore_rd(hwfn, p_ptt, reg); reg = USEM_REG_FAST_MEMORY + SEM_FAST_REG_STORM_STALL_CYCLES_BB_K2; s_stats->ustorm_stall_cycles = ecore_rd(hwfn, p_ptt, reg); reg = USEM_REG_FAST_MEMORY + SEM_FAST_REG_IDLE_SLEEPING_CYCLES_BB_K2; s_stats->ustorm_sleeping_cycles = ecore_rd(hwfn, p_ptt, reg); reg = USEM_REG_FAST_MEMORY + SEM_FAST_REG_IDLE_INACTIVE_CYCLES_BB_K2; s_stats->ustorm_inactive_cycles = ecore_rd(hwfn, p_ptt, reg); ecore_ptt_release(hwfn, p_ptt); } ha->storm_stats_index++; return; } /* * Name: qlnx_dump_buf8 * Function: dumps a buffer as bytes */ static void qlnx_dump_buf8(qlnx_host_t *ha, const char *msg, void *dbuf, uint32_t len) { device_t dev; uint32_t i = 0; uint8_t *buf; dev = ha->pci_dev; buf = dbuf; device_printf(dev, "%s: %s 0x%x dump start\n", __func__, msg, len); while (len >= 16) { device_printf(dev,"0x%08x:" " %02x %02x %02x %02x %02x %02x %02x %02x" " %02x %02x %02x %02x %02x %02x %02x %02x\n", i, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9], buf[10], buf[11], buf[12], buf[13], buf[14], buf[15]); i += 16; len -= 16; buf += 16; } switch (len) { case 1: device_printf(dev,"0x%08x: %02x\n", i, buf[0]); break; case 2: device_printf(dev,"0x%08x: %02x %02x\n", i, buf[0], buf[1]); break; case 3: device_printf(dev,"0x%08x: %02x %02x %02x\n", i, buf[0], buf[1], buf[2]); break; case 4: device_printf(dev,"0x%08x: %02x %02x %02x %02x\n", i, buf[0], buf[1], buf[2], buf[3]); break; case 5: device_printf(dev,"0x%08x:" " %02x %02x %02x %02x %02x\n", i, buf[0], buf[1], buf[2], buf[3], buf[4]); break; case 6: device_printf(dev,"0x%08x:" " %02x %02x %02x %02x %02x %02x\n", i, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5]); break; case 7: device_printf(dev,"0x%08x:" " %02x %02x %02x %02x %02x %02x %02x\n", i, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6]); break; case 8: device_printf(dev,"0x%08x:" " %02x %02x %02x %02x %02x %02x %02x %02x\n", i, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]); break; case 9: device_printf(dev,"0x%08x:" " %02x %02x %02x %02x %02x %02x %02x %02x" " %02x\n", i, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8]); break; case 10: device_printf(dev,"0x%08x:" " %02x %02x %02x %02x %02x %02x %02x %02x" " %02x %02x\n", i, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9]); break; case 11: device_printf(dev,"0x%08x:" " %02x %02x %02x %02x %02x %02x %02x %02x" " %02x %02x %02x\n", i, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9], buf[10]); break; case 12: device_printf(dev,"0x%08x:" " %02x %02x %02x %02x %02x %02x %02x %02x" " %02x %02x %02x %02x\n", i, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9], buf[10], buf[11]); break; case 13: device_printf(dev,"0x%08x:" " %02x %02x %02x %02x %02x %02x %02x %02x" " %02x %02x %02x %02x %02x\n", i, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9], buf[10], buf[11], buf[12]); break; case 14: device_printf(dev,"0x%08x:" " %02x %02x %02x %02x %02x %02x %02x %02x" " %02x %02x %02x %02x %02x %02x\n", i, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9], buf[10], buf[11], buf[12], buf[13]); break; case 15: device_printf(dev,"0x%08x:" " %02x %02x %02x %02x %02x %02x %02x %02x" " %02x %02x %02x %02x %02x %02x %02x\n", i, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7], buf[8], buf[9], buf[10], buf[11], buf[12], buf[13], buf[14]); break; default: break; } device_printf(dev, "%s: %s dump end\n", __func__, msg); return; } #ifdef CONFIG_ECORE_SRIOV static void __qlnx_osal_iov_vf_cleanup(struct ecore_hwfn *p_hwfn, uint8_t rel_vf_id) { struct ecore_public_vf_info *vf_info; vf_info = ecore_iov_get_public_vf_info(p_hwfn, rel_vf_id, false); if (!vf_info) return; /* Clear the VF mac */ memset(vf_info->forced_mac, 0, ETH_ALEN); vf_info->forced_vlan = 0; return; } void qlnx_osal_iov_vf_cleanup(void *p_hwfn, uint8_t relative_vf_id) { __qlnx_osal_iov_vf_cleanup(p_hwfn, relative_vf_id); return; } static int __qlnx_iov_chk_ucast(struct ecore_hwfn *p_hwfn, int vfid, struct ecore_filter_ucast *params) { struct ecore_public_vf_info *vf; if (!ecore_iov_vf_has_vport_instance(p_hwfn, vfid)) { QL_DPRINT1(((qlnx_host_t *)p_hwfn->p_dev), "VF[%d] vport not initialized\n", vfid); return ECORE_INVAL; } vf = ecore_iov_get_public_vf_info(p_hwfn, vfid, true); if (!vf) return -EINVAL; /* No real decision to make; Store the configured MAC */ if (params->type == ECORE_FILTER_MAC || params->type == ECORE_FILTER_MAC_VLAN) memcpy(params->mac, vf->forced_mac, ETH_ALEN); return 0; } int qlnx_iov_chk_ucast(void *p_hwfn, int vfid, void *params) { return (__qlnx_iov_chk_ucast(p_hwfn, vfid, params)); } static int __qlnx_iov_update_vport(struct ecore_hwfn *hwfn, uint8_t vfid, struct ecore_sp_vport_update_params *params, uint16_t * tlvs) { uint8_t mask; struct ecore_filter_accept_flags *flags; if (!ecore_iov_vf_has_vport_instance(hwfn, vfid)) { QL_DPRINT1(((qlnx_host_t *)hwfn->p_dev), "VF[%d] vport not initialized\n", vfid); return ECORE_INVAL; } /* Untrusted VFs can't even be trusted to know that fact. * Simply indicate everything is configured fine, and trace * configuration 'behind their back'. */ mask = ECORE_ACCEPT_UCAST_UNMATCHED | ECORE_ACCEPT_MCAST_UNMATCHED; flags = ¶ms->accept_flags; if (!(*tlvs & BIT(ECORE_IOV_VP_UPDATE_ACCEPT_PARAM))) return 0; return 0; } int qlnx_iov_update_vport(void *hwfn, uint8_t vfid, void *params, uint16_t *tlvs) { return(__qlnx_iov_update_vport(hwfn, vfid, params, tlvs)); } static int qlnx_find_hwfn_index(struct ecore_hwfn *p_hwfn) { int i; struct ecore_dev *cdev; cdev = p_hwfn->p_dev; for (i = 0; i < cdev->num_hwfns; i++) { if (&cdev->hwfns[i] == p_hwfn) break; } if (i >= cdev->num_hwfns) return (-1); return (i); } static int __qlnx_pf_vf_msg(struct ecore_hwfn *p_hwfn, uint16_t rel_vf_id) { qlnx_host_t *ha = (qlnx_host_t *)p_hwfn->p_dev; int i; QL_DPRINT2(ha, "ha = %p cdev = %p p_hwfn = %p rel_vf_id = %d\n", ha, p_hwfn->p_dev, p_hwfn, rel_vf_id); if ((i = qlnx_find_hwfn_index(p_hwfn)) == -1) return (-1); if (ha->sriov_task[i].pf_taskqueue != NULL) { atomic_testandset_32(&ha->sriov_task[i].flags, QLNX_SRIOV_TASK_FLAGS_VF_PF_MSG); taskqueue_enqueue(ha->sriov_task[i].pf_taskqueue, &ha->sriov_task[i].pf_task); } return (ECORE_SUCCESS); } int qlnx_pf_vf_msg(void *p_hwfn, uint16_t relative_vf_id) { return (__qlnx_pf_vf_msg(p_hwfn, relative_vf_id)); } static void __qlnx_vf_flr_update(struct ecore_hwfn *p_hwfn) { qlnx_host_t *ha = (qlnx_host_t *)p_hwfn->p_dev; int i; if (!ha->sriov_initialized) return; QL_DPRINT2(ha, "ha = %p cdev = %p p_hwfn = %p \n", ha, p_hwfn->p_dev, p_hwfn); if ((i = qlnx_find_hwfn_index(p_hwfn)) == -1) return; if (ha->sriov_task[i].pf_taskqueue != NULL) { atomic_testandset_32(&ha->sriov_task[i].flags, QLNX_SRIOV_TASK_FLAGS_VF_FLR_UPDATE); taskqueue_enqueue(ha->sriov_task[i].pf_taskqueue, &ha->sriov_task[i].pf_task); } return; } void qlnx_vf_flr_update(void *p_hwfn) { __qlnx_vf_flr_update(p_hwfn); return; } #ifndef QLNX_VF static void qlnx_vf_bulleting_update(struct ecore_hwfn *p_hwfn) { qlnx_host_t *ha = (qlnx_host_t *)p_hwfn->p_dev; int i; QL_DPRINT2(ha, "ha = %p cdev = %p p_hwfn = %p \n", ha, p_hwfn->p_dev, p_hwfn); if ((i = qlnx_find_hwfn_index(p_hwfn)) == -1) return; QL_DPRINT2(ha, "ha = %p cdev = %p p_hwfn = %p i = %d\n", ha, p_hwfn->p_dev, p_hwfn, i); if (ha->sriov_task[i].pf_taskqueue != NULL) { atomic_testandset_32(&ha->sriov_task[i].flags, QLNX_SRIOV_TASK_FLAGS_BULLETIN_UPDATE); taskqueue_enqueue(ha->sriov_task[i].pf_taskqueue, &ha->sriov_task[i].pf_task); } } static void qlnx_initialize_sriov(qlnx_host_t *ha) { device_t dev; nvlist_t *pf_schema, *vf_schema; int iov_error; dev = ha->pci_dev; pf_schema = pci_iov_schema_alloc_node(); vf_schema = pci_iov_schema_alloc_node(); pci_iov_schema_add_unicast_mac(vf_schema, "mac-addr", 0, NULL); pci_iov_schema_add_bool(vf_schema, "allow-set-mac", IOV_SCHEMA_HASDEFAULT, FALSE); pci_iov_schema_add_bool(vf_schema, "allow-promisc", IOV_SCHEMA_HASDEFAULT, FALSE); pci_iov_schema_add_uint16(vf_schema, "num-queues", IOV_SCHEMA_HASDEFAULT, 1); iov_error = pci_iov_attach(dev, pf_schema, vf_schema); if (iov_error != 0) { ha->sriov_initialized = 0; } else { device_printf(dev, "SRIOV initialized\n"); ha->sriov_initialized = 1; } return; } static void qlnx_sriov_disable(qlnx_host_t *ha) { struct ecore_dev *cdev; int i, j; cdev = &ha->cdev; ecore_iov_set_vfs_to_disable(cdev, true); for_each_hwfn(cdev, i) { struct ecore_hwfn *hwfn = &cdev->hwfns[i]; struct ecore_ptt *ptt = ecore_ptt_acquire(hwfn); if (!ptt) { QL_DPRINT1(ha, "Failed to acquire ptt\n"); return; } /* Clean WFQ db and configure equal weight for all vports */ ecore_clean_wfq_db(hwfn, ptt); ecore_for_each_vf(hwfn, j) { int k = 0; if (!ecore_iov_is_valid_vfid(hwfn, j, true, false)) continue; if (ecore_iov_is_vf_started(hwfn, j)) { /* Wait until VF is disabled before releasing */ for (k = 0; k < 100; k++) { if (!ecore_iov_is_vf_stopped(hwfn, j)) { qlnx_mdelay(__func__, 10); } else break; } } if (k < 100) ecore_iov_release_hw_for_vf(&cdev->hwfns[i], ptt, j); else { QL_DPRINT1(ha, "Timeout waiting for VF's FLR to end\n"); } } ecore_ptt_release(hwfn, ptt); } ecore_iov_set_vfs_to_disable(cdev, false); return; } static void qlnx_sriov_enable_qid_config(struct ecore_hwfn *hwfn, u16 vfid, struct ecore_iov_vf_init_params *params) { u16 base, i; /* Since we have an equal resource distribution per-VF, and we assume * PF has acquired the ECORE_PF_L2_QUE first queues, we start setting * sequentially from there. */ base = FEAT_NUM(hwfn, ECORE_PF_L2_QUE) + vfid * params->num_queues; params->rel_vf_id = vfid; for (i = 0; i < params->num_queues; i++) { params->req_rx_queue[i] = base + i; params->req_tx_queue[i] = base + i; } /* PF uses indices 0 for itself; Set vport/RSS afterwards */ params->vport_id = vfid + 1; params->rss_eng_id = vfid + 1; return; } static int qlnx_iov_init(device_t dev, uint16_t num_vfs, const nvlist_t *nvlist_params) { qlnx_host_t *ha; struct ecore_dev *cdev; struct ecore_iov_vf_init_params params; int ret, j, i; uint32_t max_vfs; if ((ha = device_get_softc(dev)) == NULL) { device_printf(dev, "%s: cannot get softc\n", __func__); return (-1); } if (qlnx_create_pf_taskqueues(ha) != 0) goto qlnx_iov_init_err0; cdev = &ha->cdev; max_vfs = RESC_NUM(&cdev->hwfns[0], ECORE_VPORT); QL_DPRINT2(ha," dev = %p enter num_vfs = %d max_vfs = %d\n", dev, num_vfs, max_vfs); if (num_vfs >= max_vfs) { QL_DPRINT1(ha, "Can start at most %d VFs\n", (RESC_NUM(&cdev->hwfns[0], ECORE_VPORT) - 1)); goto qlnx_iov_init_err0; } ha->vf_attr = malloc(((sizeof (qlnx_vf_attr_t) * num_vfs)), M_QLNXBUF, M_NOWAIT); if (ha->vf_attr == NULL) goto qlnx_iov_init_err0; memset(¶ms, 0, sizeof(params)); /* Initialize HW for VF access */ for_each_hwfn(cdev, j) { struct ecore_hwfn *hwfn = &cdev->hwfns[j]; struct ecore_ptt *ptt = ecore_ptt_acquire(hwfn); /* Make sure not to use more than 16 queues per VF */ params.num_queues = min_t(int, (FEAT_NUM(hwfn, ECORE_VF_L2_QUE) / num_vfs), 16); if (!ptt) { QL_DPRINT1(ha, "Failed to acquire ptt\n"); goto qlnx_iov_init_err1; } for (i = 0; i < num_vfs; i++) { if (!ecore_iov_is_valid_vfid(hwfn, i, false, true)) continue; qlnx_sriov_enable_qid_config(hwfn, i, ¶ms); ret = ecore_iov_init_hw_for_vf(hwfn, ptt, ¶ms); if (ret) { QL_DPRINT1(ha, "Failed to enable VF[%d]\n", i); ecore_ptt_release(hwfn, ptt); goto qlnx_iov_init_err1; } } ecore_ptt_release(hwfn, ptt); } ha->num_vfs = num_vfs; qlnx_inform_vf_link_state(&cdev->hwfns[0], ha); QL_DPRINT2(ha," dev = %p exit num_vfs = %d\n", dev, num_vfs); return (0); qlnx_iov_init_err1: qlnx_sriov_disable(ha); qlnx_iov_init_err0: qlnx_destroy_pf_taskqueues(ha); ha->num_vfs = 0; return (-1); } static void qlnx_iov_uninit(device_t dev) { qlnx_host_t *ha; if ((ha = device_get_softc(dev)) == NULL) { device_printf(dev, "%s: cannot get softc\n", __func__); return; } QL_DPRINT2(ha," dev = %p enter\n", dev); qlnx_sriov_disable(ha); qlnx_destroy_pf_taskqueues(ha); free(ha->vf_attr, M_QLNXBUF); ha->vf_attr = NULL; ha->num_vfs = 0; QL_DPRINT2(ha," dev = %p exit\n", dev); return; } static int qlnx_iov_add_vf(device_t dev, uint16_t vfnum, const nvlist_t *params) { qlnx_host_t *ha; qlnx_vf_attr_t *vf_attr; unsigned const char *mac; size_t size; struct ecore_hwfn *p_hwfn; if ((ha = device_get_softc(dev)) == NULL) { device_printf(dev, "%s: cannot get softc\n", __func__); return (-1); } QL_DPRINT2(ha," dev = %p enter vfnum = %d\n", dev, vfnum); if (vfnum > (ha->num_vfs - 1)) { QL_DPRINT1(ha, " VF[%d] is greater than max allowed [%d]\n", vfnum, (ha->num_vfs - 1)); } vf_attr = &ha->vf_attr[vfnum]; if (nvlist_exists_binary(params, "mac-addr")) { mac = nvlist_get_binary(params, "mac-addr", &size); bcopy(mac, vf_attr->mac_addr, ETHER_ADDR_LEN); device_printf(dev, "%s: mac_addr = %02x:%02x:%02x:%02x:%02x:%02x\n", __func__, vf_attr->mac_addr[0], vf_attr->mac_addr[1], vf_attr->mac_addr[2], vf_attr->mac_addr[3], vf_attr->mac_addr[4], vf_attr->mac_addr[5]); p_hwfn = &ha->cdev.hwfns[0]; ecore_iov_bulletin_set_mac(p_hwfn, vf_attr->mac_addr, vfnum); } QL_DPRINT2(ha," dev = %p exit vfnum = %d\n", dev, vfnum); return (0); } static void qlnx_handle_vf_msg(qlnx_host_t *ha, struct ecore_hwfn *p_hwfn) { uint64_t events[ECORE_VF_ARRAY_LENGTH]; struct ecore_ptt *ptt; int i; ptt = ecore_ptt_acquire(p_hwfn); if (!ptt) { QL_DPRINT1(ha, "Can't acquire PTT; re-scheduling\n"); __qlnx_pf_vf_msg(p_hwfn, 0); return; } ecore_iov_pf_get_pending_events(p_hwfn, events); QL_DPRINT2(ha, "Event mask of VF events:" "0x%" PRIu64 "0x%" PRIu64 " 0x%" PRIu64 "\n", events[0], events[1], events[2]); ecore_for_each_vf(p_hwfn, i) { /* Skip VFs with no pending messages */ if (!(events[i / 64] & (1ULL << (i % 64)))) continue; QL_DPRINT2(ha, "Handling VF message from VF 0x%02x [Abs 0x%02x]\n", i, p_hwfn->p_dev->p_iov_info->first_vf_in_pf + i); /* Copy VF's message to PF's request buffer for that VF */ if (ecore_iov_copy_vf_msg(p_hwfn, ptt, i)) continue; ecore_iov_process_mbx_req(p_hwfn, ptt, i); } ecore_ptt_release(p_hwfn, ptt); return; } static void qlnx_handle_vf_flr_update(qlnx_host_t *ha, struct ecore_hwfn *p_hwfn) { struct ecore_ptt *ptt; int ret; ptt = ecore_ptt_acquire(p_hwfn); if (!ptt) { QL_DPRINT1(ha, "Can't acquire PTT; re-scheduling\n"); __qlnx_vf_flr_update(p_hwfn); return; } ret = ecore_iov_vf_flr_cleanup(p_hwfn, ptt); if (ret) { QL_DPRINT1(ha, "ecore_iov_vf_flr_cleanup failed; re-scheduling\n"); } ecore_ptt_release(p_hwfn, ptt); return; } static void qlnx_handle_bulletin_update(qlnx_host_t *ha, struct ecore_hwfn *p_hwfn) { struct ecore_ptt *ptt; int i; ptt = ecore_ptt_acquire(p_hwfn); if (!ptt) { QL_DPRINT1(ha, "Can't acquire PTT; re-scheduling\n"); qlnx_vf_bulleting_update(p_hwfn); return; } ecore_for_each_vf(p_hwfn, i) { QL_DPRINT1(ha, "ecore_iov_post_vf_bulletin[%p, %d]\n", p_hwfn, i); ecore_iov_post_vf_bulletin(p_hwfn, i, ptt); } ecore_ptt_release(p_hwfn, ptt); return; } static void qlnx_pf_taskqueue(void *context, int pending) { struct ecore_hwfn *p_hwfn; qlnx_host_t *ha; int i; p_hwfn = context; if (p_hwfn == NULL) return; ha = (qlnx_host_t *)(p_hwfn->p_dev); if ((i = qlnx_find_hwfn_index(p_hwfn)) == -1) return; if (atomic_testandclear_32(&ha->sriov_task[i].flags, QLNX_SRIOV_TASK_FLAGS_VF_PF_MSG)) qlnx_handle_vf_msg(ha, p_hwfn); if (atomic_testandclear_32(&ha->sriov_task[i].flags, QLNX_SRIOV_TASK_FLAGS_VF_FLR_UPDATE)) qlnx_handle_vf_flr_update(ha, p_hwfn); if (atomic_testandclear_32(&ha->sriov_task[i].flags, QLNX_SRIOV_TASK_FLAGS_BULLETIN_UPDATE)) qlnx_handle_bulletin_update(ha, p_hwfn); return; } static int qlnx_create_pf_taskqueues(qlnx_host_t *ha) { int i; uint8_t tq_name[32]; for (i = 0; i < ha->cdev.num_hwfns; i++) { struct ecore_hwfn *p_hwfn = &ha->cdev.hwfns[i]; bzero(tq_name, sizeof (tq_name)); snprintf(tq_name, sizeof (tq_name), "ql_pf_tq_%d", i); TASK_INIT(&ha->sriov_task[i].pf_task, 0, qlnx_pf_taskqueue, p_hwfn); ha->sriov_task[i].pf_taskqueue = taskqueue_create(tq_name, M_NOWAIT, taskqueue_thread_enqueue, &ha->sriov_task[i].pf_taskqueue); if (ha->sriov_task[i].pf_taskqueue == NULL) return (-1); taskqueue_start_threads(&ha->sriov_task[i].pf_taskqueue, 1, PI_NET, "%s", tq_name); QL_DPRINT1(ha, "%p\n", ha->sriov_task[i].pf_taskqueue); } return (0); } static void qlnx_destroy_pf_taskqueues(qlnx_host_t *ha) { int i; for (i = 0; i < ha->cdev.num_hwfns; i++) { if (ha->sriov_task[i].pf_taskqueue != NULL) { taskqueue_drain(ha->sriov_task[i].pf_taskqueue, &ha->sriov_task[i].pf_task); taskqueue_free(ha->sriov_task[i].pf_taskqueue); ha->sriov_task[i].pf_taskqueue = NULL; } } return; } static void qlnx_inform_vf_link_state(struct ecore_hwfn *p_hwfn, qlnx_host_t *ha) { struct ecore_mcp_link_capabilities caps; struct ecore_mcp_link_params params; struct ecore_mcp_link_state link; int i; if (!p_hwfn->pf_iov_info) return; memset(¶ms, 0, sizeof(struct ecore_mcp_link_params)); memset(&link, 0, sizeof(struct ecore_mcp_link_state)); memset(&caps, 0, sizeof(struct ecore_mcp_link_capabilities)); memcpy(&caps, ecore_mcp_get_link_capabilities(p_hwfn), sizeof(caps)); memcpy(&link, ecore_mcp_get_link_state(p_hwfn), sizeof(link)); memcpy(¶ms, ecore_mcp_get_link_params(p_hwfn), sizeof(params)); QL_DPRINT2(ha, "called\n"); /* Update bulletin of all future possible VFs with link configuration */ for (i = 0; i < p_hwfn->p_dev->p_iov_info->total_vfs; i++) { /* Modify link according to the VF's configured link state */ link.link_up = false; if (ha->link_up) { link.link_up = true; /* Set speed according to maximum supported by HW. * that is 40G for regular devices and 100G for CMT * mode devices. */ link.speed = (p_hwfn->p_dev->num_hwfns > 1) ? 100000 : link.speed; } QL_DPRINT2(ha, "link [%d] = %d\n", i, link.link_up); ecore_iov_set_link(p_hwfn, i, ¶ms, &link, &caps); } qlnx_vf_bulleting_update(p_hwfn); return; } #endif /* #ifndef QLNX_VF */ #endif /* #ifdef CONFIG_ECORE_SRIOV */ Index: stable/12 =================================================================== --- stable/12 (revision 353104) +++ stable/12 (revision 353105) Property changes on: stable/12 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r352823