diff --git a/sys/dev/ice/ice_rdma.c b/sys/dev/ice/ice_rdma.c
index 8443cab1e98e..411bfb41dcec 100644
--- a/sys/dev/ice/ice_rdma.c
+++ b/sys/dev/ice/ice_rdma.c
@@ -1,867 +1,916 @@
/* SPDX-License-Identifier: BSD-3-Clause */
/* Copyright (c) 2023, Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * @file ice_rdma.c
 * @brief RDMA client driver interface
 *
 * Functions to interface with the RDMA client driver, for enabling RDMA
 * functionality for the ice driver.
 *
 * The RDMA client interface is based on a simple kobject interface which is
 * defined by the irdma_if.m and irdma_di_if.m interfaces.
 *
 * The ice device driver provides the irdma_di_if.m interface methods, while
 * the client RDMA driver provides the irdma_if.m interface methods as an
 * extension on top of the irdma_di_if kobject.
 *
 * The initial connection between drivers is done via the RDMA client driver
 * calling ice_rdma_register.
 */

#include "ice_iflib.h"
#include "ice_rdma_internal.h"
#include "irdma_if.h"
#include "irdma_di_if.h"

/**
 * @var ice_rdma
 * @brief global RDMA driver state
 *
 * Contains global state the driver uses to connect to a client RDMA interface
 * driver.
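 *
 * As a minimal sketch of how that connection is made (illustration only,
 * not part of this change), a client such as the irdma driver fills a
 * struct ice_rdma_info with the interface version it was built against
 * plus its kobject class, then calls ice_rdma_register() at module load:
 *
 *	static struct ice_rdma_info irdma_info = {
 *		.major_version = ICE_RDMA_MAJOR_VERSION,
 *		.minor_version = ICE_RDMA_MINOR_VERSION,
 *		.patch_version = ICE_RDMA_PATCH_VERSION,
 *		.rdma_class = &irdma_class,
 *	};
 *	error = ice_rdma_register(&irdma_info);
 *
 * irdma_class is a placeholder name for whatever kobject class the client
 * defines on top of ice_rdma_di_class.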
*/ static struct ice_rdma_state ice_rdma; /* * Helper function prototypes */ static int ice_rdma_pf_attach_locked(struct ice_softc *sc); static void ice_rdma_pf_detach_locked(struct ice_softc *sc); static int ice_rdma_check_version(struct ice_rdma_info *info); static void ice_rdma_cp_qos_info(struct ice_hw *hw, struct ice_dcbx_cfg *dcbx_cfg, struct ice_qos_params *qos_info); /* * RDMA Device Interface prototypes */ static int ice_rdma_pf_reset(struct ice_rdma_peer *peer); static int ice_rdma_pf_msix_init(struct ice_rdma_peer *peer, struct ice_rdma_msix_mapping *msix_info); static int ice_rdma_qset_register_request(struct ice_rdma_peer *peer, struct ice_rdma_qset_update *res); static int ice_rdma_update_vsi_filter(struct ice_rdma_peer *peer_dev, bool enable); static void ice_rdma_request_handler(struct ice_rdma_peer *peer, struct ice_rdma_request *req); /** * @var ice_rdma_di_methods * @brief RDMA driver interface methods * * Kobject methods implementing the driver-side interface for the RDMA peer * clients. This method table contains the operations which the client can * request from the driver. * * The client driver will then extend this kobject class with methods that the * driver can request from the client. */ static kobj_method_t ice_rdma_di_methods[] = { KOBJMETHOD(irdma_di_reset, ice_rdma_pf_reset), KOBJMETHOD(irdma_di_msix_init, ice_rdma_pf_msix_init), KOBJMETHOD(irdma_di_qset_register_request, ice_rdma_qset_register_request), KOBJMETHOD(irdma_di_vsi_filter_update, ice_rdma_update_vsi_filter), KOBJMETHOD(irdma_di_req_handler, ice_rdma_request_handler), KOBJMETHOD_END }; /* Define ice_rdma_di class which will be extended by the iRDMA driver */ DEFINE_CLASS_0(ice_rdma_di, ice_rdma_di_class, ice_rdma_di_methods, sizeof(struct ice_rdma_peer)); /** * ice_rdma_pf_reset - RDMA client interface requested a reset * @peer: the RDMA peer client structure * * Implements IRDMA_DI_RESET, called by the RDMA client driver to request * a reset of an ice driver device. */ static int ice_rdma_pf_reset(struct ice_rdma_peer *peer) { struct ice_softc *sc = ice_rdma_peer_to_sc(peer); - /* - * Request that the driver re-initialize by bringing the interface - * down and up. - */ - ice_request_stack_reinit(sc); + /* Tell the base driver that RDMA is requesting a PFR */ + ice_set_state(&sc->state, ICE_STATE_RESET_PFR_REQ); + + /* XXX: Base driver will notify RDMA when it's done */ return (0); } /** * ice_rdma_pf_msix_init - RDMA client interface request MSI-X initialization * @peer: the RDMA peer client structure * @msix_info: requested MSI-X mapping * * Implements IRDMA_DI_MSIX_INIT, called by the RDMA client driver to * initialize the MSI-X resources required for RDMA functionality. 
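 *
 * For illustration only (the handler below is still a stub), a client
 * could describe the vector layout it wants with a struct
 * ice_rdma_msix_mapping before invoking this method through the
 * irdma_di_if.m wrapper, for example:
 *
 *	int ceq_vec = peer->msix.base + 1;
 *	struct ice_rdma_msix_mapping map = {
 *		.itr_indx = 0,
 *		.aeq_vector = peer->msix.base,
 *		.ceq_cnt = 1,
 *		.ceq_vector = &ceq_vec,
 *	};
 *	error = IRDMA_DI_MSIX_INIT(peer, &map);
 *
 * The ITR index and vector assignments above are assumptions made for the
 * sketch; a real client derives them from the MSI-X range reserved for it
 * in peer->msix.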
*/ static int ice_rdma_pf_msix_init(struct ice_rdma_peer *peer, struct ice_rdma_msix_mapping __unused *msix_info) { struct ice_softc *sc = ice_rdma_peer_to_sc(peer); MPASS(msix_info != NULL); device_printf(sc->dev, "%s: iRDMA MSI-X initialization request is not yet implemented\n", __func__); /* TODO: implement MSI-X initialization for RDMA */ return (ENOSYS); } /** * ice_rdma_register_request - RDMA client interface request qset * registration or unregistration * @peer: the RDMA peer client structure * @res: resources to be registered or unregistered */ static int ice_rdma_qset_register_request(struct ice_rdma_peer *peer, struct ice_rdma_qset_update *res) { struct ice_softc *sc = ice_rdma_peer_to_sc(peer); struct ice_vsi *vsi = NULL; struct ice_dcbx_cfg *dcbx_cfg; struct ice_hw *hw = &sc->hw; enum ice_status status; int count, i, ret = 0; uint32_t *qset_teid; uint16_t *qs_handle; uint16_t max_rdmaqs[ICE_MAX_TRAFFIC_CLASS]; uint16_t vsi_id; uint8_t ena_tc = 0; if (!res) return -EINVAL; if (res->cnt_req > ICE_MAX_TXQ_PER_TXQG) return -EINVAL; switch(res->res_type) { case ICE_RDMA_QSET_ALLOC: count = res->cnt_req; vsi_id = peer->pf_vsi_num; break; case ICE_RDMA_QSET_FREE: count = res->res_allocated; vsi_id = res->qsets.vsi_id; break; default: return -EINVAL; } qset_teid = (uint32_t *)ice_calloc(hw, count, sizeof(*qset_teid)); if (!qset_teid) return -ENOMEM; qs_handle = (uint16_t *)ice_calloc(hw, count, sizeof(*qs_handle)); if (!qs_handle) { ice_free(hw, qset_teid); return -ENOMEM; } ice_for_each_traffic_class(i) max_rdmaqs[i] = 0; for (i = 0; i < sc->num_available_vsi; i++) { if (sc->all_vsi[i] && ice_get_hw_vsi_num(hw, sc->all_vsi[i]->idx) == vsi_id) { vsi = sc->all_vsi[i]; break; } } if (!vsi) { ice_debug(hw, ICE_DBG_RDMA, "RDMA QSet invalid VSI\n"); ret = -EINVAL; goto out; } if (sc != vsi->sc) { ice_debug(hw, ICE_DBG_RDMA, "VSI is tied to unexpected device\n"); ret = -EXDEV; goto out; } for (i = 0; i < count; i++) { struct ice_rdma_qset_params *qset; qset = &res->qsets; if (qset->vsi_id != peer->pf_vsi_num) { ice_debug(hw, ICE_DBG_RDMA, "RDMA QSet invalid VSI requested %d %d\n", qset->vsi_id, peer->pf_vsi_num); ret = -EINVAL; goto out; } max_rdmaqs[qset->tc]++; qs_handle[i] = qset->qs_handle; qset_teid[i] = qset->teid; } switch(res->res_type) { case ICE_RDMA_QSET_ALLOC: dcbx_cfg = &hw->port_info->qos_cfg.local_dcbx_cfg; ena_tc = ice_dcb_get_tc_map(dcbx_cfg); ice_debug(hw, ICE_DBG_RDMA, "%s:%d ena_tc=%x\n", __func__, __LINE__, ena_tc); status = ice_cfg_vsi_rdma(hw->port_info, vsi->idx, ena_tc, max_rdmaqs); if (status) { ice_debug(hw, ICE_DBG_RDMA, "Failed VSI RDMA qset config\n"); ret = -EINVAL; goto out; } for (i = 0; i < count; i++) { struct ice_rdma_qset_params *qset; qset = &res->qsets; status = ice_ena_vsi_rdma_qset(hw->port_info, vsi->idx, qset->tc, &qs_handle[i], 1, &qset_teid[i]); if (status) { ice_debug(hw, ICE_DBG_RDMA, "Failed VSI RDMA qset enable\n"); ret = -EINVAL; goto out; } qset->teid = qset_teid[i]; } break; case ICE_RDMA_QSET_FREE: status = ice_dis_vsi_rdma_qset(hw->port_info, count, qset_teid, qs_handle); if (status) ret = -EINVAL; break; default: ret = -EINVAL; break; } out: ice_free(hw, qs_handle); ice_free(hw, qset_teid); return ret; } /** * ice_rdma_update_vsi_filter - configure vsi information * when opening or closing rdma driver * @peer: the RDMA peer client structure * @enable: enable or disable the rdma filter */ static int ice_rdma_update_vsi_filter(struct ice_rdma_peer *peer, bool enable) { struct ice_softc *sc = ice_rdma_peer_to_sc(peer); struct ice_vsi *vsi; 
int ret; vsi = &sc->pf_vsi; if (!vsi) return -EINVAL; ret = ice_cfg_iwarp_fltr(&sc->hw, vsi->idx, enable); if (ret) { device_printf(sc->dev, "Failed to %sable iWARP filtering\n", enable ? "en" : "dis"); } else { if (enable) vsi->info.q_opt_flags |= ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; else vsi->info.q_opt_flags &= ~ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; } return ret; } /** * ice_rdma_request_handler - handle requests incoming from RDMA driver * @peer: the RDMA peer client structure * @req: structure containing request */ static void ice_rdma_request_handler(struct ice_rdma_peer *peer, struct ice_rdma_request *req) { if (!req || !peer) { log(LOG_WARNING, "%s: peer or req are not valid\n", __func__); return; } switch(req->type) { case ICE_RDMA_EVENT_RESET: + ice_rdma_pf_reset(peer); break; case ICE_RDMA_EVENT_QSET_REGISTER: ice_rdma_qset_register_request(peer, &req->res); break; case ICE_RDMA_EVENT_VSI_FILTER_UPDATE: ice_rdma_update_vsi_filter(peer, req->enable_filter); break; default: log(LOG_WARNING, "%s: Event %d not supported\n", __func__, req->type); break; } } /** * ice_rdma_cp_qos_info - gather current QOS/DCB settings in LAN to pass * to RDMA driver * @hw: ice hw structure * @dcbx_cfg: current DCB settings in ice driver * @qos_info: destination of the DCB settings */ static void ice_rdma_cp_qos_info(struct ice_hw *hw, struct ice_dcbx_cfg *dcbx_cfg, struct ice_qos_params *qos_info) { u32 up2tc; u8 j; u8 num_tc = 0; u8 val_tc = 0; /* number of TC for validation */ u8 cnt_tc = 0; /* setup qos_info fields with defaults */ qos_info->num_apps = 0; qos_info->num_tc = 1; for (j = 0; j < ICE_TC_MAX_USER_PRIORITY; j++) qos_info->up2tc[j] = 0; qos_info->tc_info[0].rel_bw = 100; for (j = 1; j < IEEE_8021QAZ_MAX_TCS; j++) qos_info->tc_info[j].rel_bw = 0; /* gather current values */ up2tc = rd32(hw, PRTDCB_TUP2TC); qos_info->num_apps = dcbx_cfg->numapps; for (j = 0; j < ICE_MAX_TRAFFIC_CLASS; j++) { num_tc |= BIT(dcbx_cfg->etscfg.prio_table[j]); } for (j = 0; j < ICE_MAX_TRAFFIC_CLASS; j++) { if (num_tc & BIT(j)) { cnt_tc++; val_tc |= BIT(j); } else { break; } } qos_info->num_tc = (val_tc == num_tc && num_tc != 0) ? cnt_tc : 1; for (j = 0; j < ICE_TC_MAX_USER_PRIORITY; j++) qos_info->up2tc[j] = (up2tc >> (j * 3)) & 0x7; for (j = 0; j < IEEE_8021QAZ_MAX_TCS; j++) qos_info->tc_info[j].rel_bw = dcbx_cfg->etscfg.tcbwtable[j]; for (j = 0; j < qos_info->num_apps; j++) { qos_info->apps[j].priority = dcbx_cfg->app[j].priority; qos_info->apps[j].prot_id = dcbx_cfg->app[j].prot_id; qos_info->apps[j].selector = dcbx_cfg->app[j].selector; } /* Gather DSCP-to-TC mapping and QoS/PFC mode */ memcpy(qos_info->dscp_map, dcbx_cfg->dscp_map, sizeof(qos_info->dscp_map)); qos_info->pfc_mode = dcbx_cfg->pfc_mode; } /** * ice_rdma_check_version - Check that the provided RDMA version is compatible * @info: the RDMA client information structure * * Verify that the client RDMA driver provided a version that is compatible * with the driver interface. */ static int ice_rdma_check_version(struct ice_rdma_info *info) { /* Make sure the MAJOR version matches */ if (info->major_version != ICE_RDMA_MAJOR_VERSION) { log(LOG_WARNING, "%s: the iRDMA driver requested version %d.%d.%d, but this driver only supports major version %d.x.x\n", __func__, info->major_version, info->minor_version, info->patch_version, ICE_RDMA_MAJOR_VERSION); return (ENOTSUP); } /* * Make sure that the MINOR version is compatible. 
* * This means that the RDMA client driver version MUST not be greater * than the version provided by the driver, as it would indicate that * the RDMA client expects features which are not supported by the * main driver. */ if (info->minor_version > ICE_RDMA_MINOR_VERSION) { log(LOG_WARNING, "%s: the iRDMA driver requested version %d.%d.%d, but this driver only supports up to minor version %d.%d.x\n", __func__, info->major_version, info->minor_version, info->patch_version, ICE_RDMA_MAJOR_VERSION, ICE_RDMA_MINOR_VERSION); return (ENOTSUP); } /* * Make sure that the PATCH version is compatible. * * This means that the RDMA client version MUST not be greater than * the version provided by the driver, as it may indicate that the * RDMA client expects certain backwards compatible bug fixes which * are not implemented by this version of the main driver. */ if ((info->minor_version == ICE_RDMA_MINOR_VERSION) && (info->patch_version > ICE_RDMA_PATCH_VERSION)) { log(LOG_WARNING, "%s: the iRDMA driver requested version %d.%d.%d, but this driver only supports up to patch version %d.%d.%d\n", __func__, info->major_version, info->minor_version, info->patch_version, ICE_RDMA_MAJOR_VERSION, ICE_RDMA_MINOR_VERSION, ICE_RDMA_PATCH_VERSION); return (ENOTSUP); } /* Make sure that the kobject class is initialized */ if (info->rdma_class == NULL) { log(LOG_WARNING, "%s: the iRDMA driver did not specify a kobject interface\n", __func__); return (EINVAL); } return (0); } /** * ice_rdma_register - Register an RDMA client driver * @info: the RDMA client information structure * * Called by the RDMA client driver on load. Used to initialize the RDMA * client driver interface and enable interop between the ice driver and the * RDMA client driver. * * The RDMA client driver must provide the version number it expects, along * with a pointer to a kobject class that extends the irdma_di_if class, and * implements the irdma_if class interface. */ int ice_rdma_register(struct ice_rdma_info *info) { struct ice_rdma_entry *entry; struct ice_softc *sc; int err = 0; sx_xlock(&ice_rdma.mtx); if (!ice_enable_irdma) { log(LOG_INFO, "%s: The iRDMA driver interface has been disabled\n", __func__); err = (ECONNREFUSED); goto return_unlock; } if (ice_rdma.registered) { log(LOG_WARNING, "%s: iRDMA driver already registered\n", __func__); err = (EBUSY); goto return_unlock; } /* Make sure the iRDMA version is compatible */ err = ice_rdma_check_version(info); if (err) goto return_unlock; log(LOG_INFO, "%s: iRDMA driver registered using version %d.%d.%d\n", __func__, info->major_version, info->minor_version, info->patch_version); ice_rdma.peer_class = info->rdma_class; /* * Initialize the kobject interface and notify the RDMA client of each * existing PF interface. */ LIST_FOREACH(entry, &ice_rdma.peers, node) { kobj_init((kobj_t)&entry->peer, ice_rdma.peer_class); /* Gather DCB/QOS info into peer */ sc = __containerof(entry, struct ice_softc, rdma_entry); memset(&entry->peer.initial_qos_info, 0, sizeof(entry->peer.initial_qos_info)); ice_rdma_cp_qos_info(&sc->hw, &sc->hw.port_info->qos_cfg.local_dcbx_cfg, &entry->peer.initial_qos_info); IRDMA_PROBE(&entry->peer); if (entry->initiated) IRDMA_OPEN(&entry->peer); } ice_rdma.registered = true; return_unlock: sx_xunlock(&ice_rdma.mtx); return (err); } /** * ice_rdma_unregister - Unregister an RDMA client driver * * Called by the RDMA client driver on unload. 
Used to de-initialize the RDMA * client driver interface and shut down communication between the ice driver * and the RDMA client driver. */ int ice_rdma_unregister(void) { struct ice_rdma_entry *entry; sx_xlock(&ice_rdma.mtx); if (!ice_rdma.registered) { log(LOG_WARNING, "%s: iRDMA driver was not previously registered\n", __func__); sx_xunlock(&ice_rdma.mtx); return (ENOENT); } log(LOG_INFO, "%s: iRDMA driver unregistered\n", __func__); ice_rdma.registered = false; ice_rdma.peer_class = NULL; /* * Release the kobject interface for each of the existing PF * interfaces. Note that we do not notify the client about removing * each PF, as it is assumed that the client will have already cleaned * up any associated resources when it is unregistered. */ LIST_FOREACH(entry, &ice_rdma.peers, node) kobj_delete((kobj_t)&entry->peer, NULL); sx_xunlock(&ice_rdma.mtx); return (0); } /** * ice_rdma_init - RDMA driver init routine * * Called during ice driver module initialization to setup the RDMA client * interface mutex and RDMA peer structure list. */ void ice_rdma_init(void) { LIST_INIT(&ice_rdma.peers); sx_init_flags(&ice_rdma.mtx, "ice rdma interface", SX_DUPOK); ice_rdma.registered = false; ice_rdma.peer_class = NULL; } /** * ice_rdma_exit - RDMA driver exit routine * * Called during ice driver module exit to shutdown the RDMA client interface * mutex. */ void ice_rdma_exit(void) { MPASS(LIST_EMPTY(&ice_rdma.peers)); sx_destroy(&ice_rdma.mtx); } /** * ice_rdma_pf_attach_locked - Prepare a PF for RDMA connections * @sc: the ice driver softc * * Initialize a peer entry for this PF and add it to the RDMA interface list. * Notify the client RDMA driver of a new PF device. * * @pre must be called while holding the ice_rdma mutex. */ static int ice_rdma_pf_attach_locked(struct ice_softc *sc) { struct ice_rdma_entry *entry; /* Do not attach the PF unless RDMA is supported */ if (!ice_is_bit_set(sc->feat_cap, ICE_FEATURE_RDMA)) return (0); entry = &sc->rdma_entry; if (entry->attached) { device_printf(sc->dev, "iRDMA peer entry already exists\n"); return (EEXIST); } entry->attached = true; entry->peer.dev = sc->dev; entry->peer.ifp = sc->ifp; entry->peer.pf_id = sc->hw.pf_id; entry->peer.pci_mem = sc->bar0.res; entry->peer.pf_vsi_num = ice_get_hw_vsi_num(&sc->hw, sc->pf_vsi.idx); if (sc->rdma_imap && sc->rdma_imap[0] != ICE_INVALID_RES_IDX && sc->irdma_vectors > 0) { entry->peer.msix.base = sc->rdma_imap[0]; entry->peer.msix.count = sc->irdma_vectors; } /* Gather DCB/QOS info into peer */ memset(&entry->peer.initial_qos_info, 0, sizeof(entry->peer.initial_qos_info)); ice_rdma_cp_qos_info(&sc->hw, &sc->hw.port_info->qos_cfg.local_dcbx_cfg, &entry->peer.initial_qos_info); /* * If the RDMA client driver has already registered, initialize the * kobject and notify the client of a new PF */ if (ice_rdma.registered) { kobj_init((kobj_t)&entry->peer, ice_rdma.peer_class); IRDMA_PROBE(&entry->peer); } LIST_INSERT_HEAD(&ice_rdma.peers, entry, node); ice_set_bit(ICE_FEATURE_RDMA, sc->feat_en); return (0); } /** * ice_rdma_pf_attach - Notify the RDMA client of a new PF * @sc: the ice driver softc * * Called during PF attach to notify the RDMA client of a new PF. 
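 *
 * In terms of the per-PF lifecycle described throughout this file, the
 * calls are expected to arrive roughly in this order:
 *
 *	ice_rdma_init()       - driver module load
 *	ice_rdma_pf_attach()  - PF attach (this function)
 *	ice_rdma_pf_init()    - PF brought up, IRDMA_OPEN
 *	ice_rdma_pf_stop()    - PF stopped, IRDMA_CLOSE
 *	ice_rdma_pf_detach()  - PF detach
 *	ice_rdma_exit()       - driver module unload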
*/ int ice_rdma_pf_attach(struct ice_softc *sc) { int err; sx_xlock(&ice_rdma.mtx); err = ice_rdma_pf_attach_locked(sc); sx_xunlock(&ice_rdma.mtx); return (err); } /** * ice_rdma_pf_detach_locked - Notify the RDMA client on PF detach * @sc: the ice driver softc * * Notify the RDMA peer client driver of removal of a PF, and release any * RDMA-specific resources associated with that PF. Remove the PF from the * list of available RDMA entries. * * @pre must be called while holding the ice_rdma mutex. */ static void ice_rdma_pf_detach_locked(struct ice_softc *sc) { struct ice_rdma_entry *entry; /* No need to detach the PF if RDMA is not enabled */ if (!ice_is_bit_set(sc->feat_en, ICE_FEATURE_RDMA)) return; entry = &sc->rdma_entry; if (!entry->attached) { device_printf(sc->dev, "iRDMA peer entry was not attached\n"); return; } /* * If the RDMA client driver is registered, notify the client that * a PF has been removed, and release the kobject reference. */ if (ice_rdma.registered) { IRDMA_REMOVE(&entry->peer); kobj_delete((kobj_t)&entry->peer, NULL); } LIST_REMOVE(entry, node); entry->attached = false; ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_en); } /** * ice_rdma_pf_detach - Notify the RDMA client of a PF detaching * @sc: the ice driver softc * * Take the ice_rdma mutex and then notify the RDMA client that a PF has been * removed. */ void ice_rdma_pf_detach(struct ice_softc *sc) { sx_xlock(&ice_rdma.mtx); ice_rdma_pf_detach_locked(sc); sx_xunlock(&ice_rdma.mtx); } /** * ice_rdma_pf_init - Notify the RDMA client that a PF has initialized * @sc: the ice driver softc * * Called by the ice driver when a PF has been initialized. Notifies the RDMA * client that a PF is up and ready to operate. */ int ice_rdma_pf_init(struct ice_softc *sc) { struct ice_rdma_peer *peer = &sc->rdma_entry.peer; sx_xlock(&ice_rdma.mtx); /* Update the MTU */ peer->mtu = sc->ifp->if_mtu; sc->rdma_entry.initiated = true; if (sc->rdma_entry.attached && ice_rdma.registered) { sx_xunlock(&ice_rdma.mtx); return IRDMA_OPEN(peer); } sx_xunlock(&ice_rdma.mtx); return (0); } /** * ice_rdma_pf_stop - Notify the RDMA client of a stopped PF device * @sc: the ice driver softc * * Called by the ice driver when a PF is stopped. Notifies the RDMA client * driver that the PF has stopped and is not ready to operate. */ int ice_rdma_pf_stop(struct ice_softc *sc) { sx_xlock(&ice_rdma.mtx); sc->rdma_entry.initiated = false; if (sc->rdma_entry.attached && ice_rdma.registered) { sx_xunlock(&ice_rdma.mtx); return IRDMA_CLOSE(&sc->rdma_entry.peer); } sx_xunlock(&ice_rdma.mtx); return (0); } /** * ice_rdma_link_change - Notify RDMA client of a change in link status * @sc: the ice driver softc * @linkstate: the link status * @baudrate: the link rate in bits per second * * Notify the RDMA client of a link status change, by sending it the new link * state and baudrate. * * The link state is represented the same was as in the ifnet structure. It * should be LINK_STATE_UNKNOWN, LINK_STATE_DOWN, or LINK_STATE_UP. 
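 *
 * As a hedged sketch of the receiving side (not part of this change), a
 * client's event handler, invoked via IRDMA_EVENT_HANDLER(), might consume
 * the notification like this:
 *
 *	static void
 *	my_event_handler(struct ice_rdma_peer *peer,
 *			 struct ice_rdma_event *event)
 *	{
 *		if (event->type == ICE_RDMA_EVENT_LINK_CHANGE &&
 *		    event->linkstate == LINK_STATE_UP)
 *			device_printf(peer->dev, "link up, %ju bps\n",
 *			    (uintmax_t)event->baudrate);
 *	}
 *
 * my_event_handler is a placeholder name; the real client implements the
 * irdma_if.m event handler method and may react however it needs to.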
*/ void ice_rdma_link_change(struct ice_softc *sc, int linkstate, uint64_t baudrate) { struct ice_rdma_peer *peer = &sc->rdma_entry.peer; struct ice_rdma_event event; memset(&event, 0, sizeof(struct ice_rdma_event)); event.type = ICE_RDMA_EVENT_LINK_CHANGE; event.linkstate = linkstate; event.baudrate = baudrate; sx_xlock(&ice_rdma.mtx); if (sc->rdma_entry.attached && ice_rdma.registered) IRDMA_EVENT_HANDLER(peer, &event); sx_xunlock(&ice_rdma.mtx); } /** * ice_rdma_notify_dcb_qos_change - notify RDMA driver to pause traffic * @sc: the ice driver softc * * Notify the RDMA driver that QOS/DCB settings are about to change. * Once the function return, all the QPs should be suspended. */ void ice_rdma_notify_dcb_qos_change(struct ice_softc *sc) { struct ice_rdma_peer *peer = &sc->rdma_entry.peer; struct ice_rdma_event event; memset(&event, 0, sizeof(struct ice_rdma_event)); event.type = ICE_RDMA_EVENT_TC_CHANGE; /* pre-event */ event.prep = true; sx_xlock(&ice_rdma.mtx); if (sc->rdma_entry.attached && ice_rdma.registered) IRDMA_EVENT_HANDLER(peer, &event); sx_xunlock(&ice_rdma.mtx); } /** * ice_rdma_dcb_qos_update - pass the changed dcb settings to RDMA driver * @sc: the ice driver softc * @pi: the port info structure * * Pass the changed DCB settings to RDMA traffic. This function should be * called only after ice_rdma_notify_dcb_qos_change has been called and * returned before. After the function returns, all the RDMA traffic * should be resumed. */ void ice_rdma_dcb_qos_update(struct ice_softc *sc, struct ice_port_info *pi) { struct ice_rdma_peer *peer = &sc->rdma_entry.peer; struct ice_rdma_event event; memset(&event, 0, sizeof(struct ice_rdma_event)); event.type = ICE_RDMA_EVENT_TC_CHANGE; /* post-event */ event.prep = false; /* gather current configuration */ ice_rdma_cp_qos_info(&sc->hw, &pi->qos_cfg.local_dcbx_cfg, &event.port_qos); sx_xlock(&ice_rdma.mtx); if (sc->rdma_entry.attached && ice_rdma.registered) IRDMA_EVENT_HANDLER(peer, &event); sx_xunlock(&ice_rdma.mtx); } + +/** + * ice_rdma_notify_pe_intr - notify irdma on incoming interrupts regarding PE + * @sc: the ice driver softc + * @oicr: interrupt cause + * + * Pass the information about received interrupt to RDMA driver if it was + * relating to PE. Specifically PE_CRITERR and HMC_ERR. + * The irdma driver shall decide what should be done upon these interrupts. + */ +void +ice_rdma_notify_pe_intr(struct ice_softc *sc, uint32_t oicr) +{ + struct ice_rdma_peer *peer = &sc->rdma_entry.peer; + struct ice_rdma_event event; + + memset(&event, 0, sizeof(struct ice_rdma_event)); + event.type = ICE_RDMA_EVENT_CRIT_ERR; + event.oicr_reg = oicr; + + sx_xlock(&ice_rdma.mtx); + if (sc->rdma_entry.attached && ice_rdma.registered) + IRDMA_EVENT_HANDLER(peer, &event); + sx_xunlock(&ice_rdma.mtx); +} + +/** + * ice_rdma_notify_reset - notify irdma on incoming pf-reset + * @sc: the ice driver softc + * + * Inform irdma driver of an incoming PF reset. + * The irdma driver shall set its state to reset, and avoid using CQP + * anymore. Next step should be to call ice_rdma_pf_stop in order to + * remove resources. 
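+ *
+ * A rough sketch of the reset handshake, as far as this file shows it
+ * (the exact call site of this notification in the base driver is not
+ * visible in this change and is assumed here):
+ *
+ *	client requests a reset (IRDMA_DI_RESET / ICE_RDMA_EVENT_RESET)
+ *	  -> ice_rdma_pf_reset() sets ICE_STATE_RESET_PFR_REQ
+ *	base driver starts handling the PFR and
+ *	  -> ice_rdma_notify_reset(): client stops issuing CQP commands
+ *	  -> ice_rdma_pf_stop(): RDMA resources are released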
+ */ +void +ice_rdma_notify_reset(struct ice_softc *sc) +{ + struct ice_rdma_peer *peer = &sc->rdma_entry.peer; + struct ice_rdma_event event; + + memset(&event, 0, sizeof(struct ice_rdma_event)); + event.type = ICE_RDMA_EVENT_RESET; + + sx_xlock(&ice_rdma.mtx); + if (sc->rdma_entry.attached && ice_rdma.registered) + IRDMA_EVENT_HANDLER(peer, &event); + sx_xunlock(&ice_rdma.mtx); +} diff --git a/sys/dev/ice/ice_rdma.h b/sys/dev/ice/ice_rdma.h index f83c30b33f6c..38e2ef491e8e 100644 --- a/sys/dev/ice/ice_rdma.h +++ b/sys/dev/ice/ice_rdma.h @@ -1,310 +1,311 @@ /* SPDX-License-Identifier: BSD-3-Clause */ /* Copyright (c) 2023, Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. Neither the name of the Intel Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /** * @file ice_rdma.h * @brief header file for RDMA client interface functions * * Contains definitions and function calls shared by the ice driver and the * RDMA client interface driver. * * Since these definitions are shared between drivers it is important that any * changes are considered carefully for backwards compatibility. */ #ifndef _ICE_RDMA_H_ #define _ICE_RDMA_H_ /* * The RDMA client interface version is used to help determine * incompatibilities between the interface definition shared between the main * driver and the client driver. * * It will follows the semantic version guidelines, that is: * Given the version number MAJOR.MINOR.PATCH, increment the: * * MAJOR version when you make incompatible changes, * MINOR version when you add functionality in a backwards-compatible manner, and * PATCH version when you make backwards-compatible bug fixes. * * Any change to this file, or one of the kobject interface files must come * with an associated change in one of the MAJOR, MINOR, or PATCH versions, * and care must be taken that backwards incompatible changes MUST increment * the MAJOR version. * * Note: Until the MAJOR version is set to at least 1, the above semantic * version guarantees may not hold, and this interface should not be * considered stable. 
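 *
 * As a concrete example, this change only adds functionality (such as the
 * oicr_reg payload used with ICE_RDMA_EVENT_CRIT_ERR) in a
 * backwards-compatible way, so the MINOR version below moves from 0 to 1
 * while the MAJOR and PATCH versions are unchanged.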
*/ #define ICE_RDMA_MAJOR_VERSION 1 -#define ICE_RDMA_MINOR_VERSION 0 +#define ICE_RDMA_MINOR_VERSION 1 #define ICE_RDMA_PATCH_VERSION 0 /** * @def ICE_RDMA_MAX_MSIX * @brief Maximum number of MSI-X vectors that will be reserved * * Defines the maximum number of MSI-X vectors that an RDMA interface will * have reserved in advance. Does not guarantee that many vectors have * actually been enabled. */ #define ICE_RDMA_MAX_MSIX 64 /** * @struct ice_rdma_info * @brief RDMA information from the client driver * * The RDMA client driver will fill in this structure and pass its contents * back to the main driver using the ice_rdma_register function. * * It should fill the version in with the ICE_RDMA_* versions as defined in * the ice_rdma.h header. * * Additionally it must provide a pointer to a kobject class which extends the * ice_rdma_di_class with the operations defined in the rdma_if.m interface. * * If the version specified is not compatible, then the registration will * of the RDMA driver will fail. */ struct ice_rdma_info { uint16_t major_version; uint16_t minor_version; uint16_t patch_version; kobj_class_t rdma_class; }; #define ICE_RDMA_MAX_USER_PRIORITY 8 #define ICE_RDMA_MAX_MSIX 64 /* Declare the ice_rdma_di kobject class */ DECLARE_CLASS(ice_rdma_di_class); /** * @struct ice_rdma_msix_mapping * @brief MSI-X mapping requested by the peer RDMA driver * * Defines a mapping for MSI-X vectors being requested by the peer RDMA driver * for a given PF. */ struct ice_rdma_msix_mapping { uint8_t itr_indx; int aeq_vector; int ceq_cnt; int *ceq_vector; }; /** * @struct ice_rdma_msix * @brief RDMA MSI-X vectors reserved for the peer RDMA driver * * Defines the segment of the MSI-X vectors for use by the RDMA driver. These * are reserved by the PF when it initializes. */ struct ice_rdma_msix { int base; int count; }; /** * @struct ice_qos_info * @brief QoS information to be shared with RDMA driver */ struct ice_qos_info { uint64_t tc_ctx; uint8_t rel_bw; uint8_t prio_type; uint8_t egress_virt_up; uint8_t ingress_virt_up; }; /** * @struct ice_qos_app_priority_table * @brief Application priority data */ struct ice_qos_app_priority_table { uint16_t prot_id; uint8_t priority; uint8_t selector; }; #define IEEE_8021QAZ_MAX_TCS 8 #define ICE_TC_MAX_USER_PRIORITY 8 #define ICE_QOS_MAX_APPS 32 #define ICE_QOS_DSCP_NUM_VAL 64 /** * @struct ice_qos_params * @brief Holds all necessary data for RDMA to work with DCB * * Struct to hold QoS info */ struct ice_qos_params { struct ice_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; uint8_t up2tc[ICE_TC_MAX_USER_PRIORITY]; uint8_t vsi_relative_bw; uint8_t vsi_priority_type; uint32_t num_apps; uint8_t pfc_mode; uint8_t dscp_map[ICE_QOS_DSCP_NUM_VAL]; struct ice_qos_app_priority_table apps[ICE_QOS_MAX_APPS]; uint8_t num_tc; }; /** * @struct ice_rdma_peer * @brief RDMA driver information * * Shared structure used by the RDMA client driver when talking with the main * device driver. * * Because the definition of this structure is shared between the two drivers, * its ABI should be handled carefully. */ struct ice_rdma_peer { /** * The KOBJ_FIELDS macro must come first, in order for it to be used * as a kobject. */ KOBJ_FIELDS; struct ifnet *ifp; device_t dev; struct resource *pci_mem; struct ice_qos_params initial_qos_info; struct ice_rdma_msix msix; uint16_t mtu; uint16_t pf_vsi_num; uint8_t pf_id; }; /** * @enum ice_res_type * @brief enum for type of resource registration * * enum for type of resource registration. 
* created for plausible compatibility with IDC */ enum ice_res_type { ICE_INVAL_RES = 0x0, ICE_RDMA_QSET_ALLOC = 0x8, ICE_RDMA_QSET_FREE = 0x18, }; /** * @struct ice_rdma_qset_params * @brief struct to hold per RDMA Qset info */ struct ice_rdma_qset_params { uint32_t teid; /* qset TEID */ uint16_t qs_handle; /* RDMA driver provides this */ uint16_t vsi_id; /* VSI index */ uint8_t tc; /* TC branch the QSet should belong to */ uint8_t reserved[3]; }; #define ICE_MAX_TXQ_PER_TXQG 128 /** * @struct ice_rdma_qset_update * @brief struct used to register and unregister qsets for RDMA driver */ struct ice_rdma_qset_update { enum ice_res_type res_type; uint16_t cnt_req; uint16_t res_allocated; uint32_t res_handle; struct ice_rdma_qset_params qsets; }; /** * @enum ice_rdma_event_type * @brief enum for type of event from base driver */ enum ice_rdma_event_type { ICE_RDMA_EVENT_NONE = 0, ICE_RDMA_EVENT_LINK_CHANGE, ICE_RDMA_EVENT_MTU_CHANGE, ICE_RDMA_EVENT_TC_CHANGE, ICE_RDMA_EVENT_API_CHANGE, ICE_RDMA_EVENT_CRIT_ERR, ICE_RDMA_EVENT_RESET, ICE_RDMA_EVENT_QSET_REGISTER, ICE_RDMA_EVENT_VSI_FILTER_UPDATE, ICE_RDMA_EVENT_LAST }; /** * @struct ice_rdma_event * @brief struct for event information to pass to RDMA driver */ struct ice_rdma_event { enum ice_rdma_event_type type; union { /* link change event */ struct { int linkstate; uint64_t baudrate; }; /* MTU change event */ - struct { - int mtu; - }; + int mtu; /* * TC/QoS/DCB change event - * RESET event use prep variable only * prep: if true, this is a pre-event, post-event otherwise */ struct { struct ice_qos_params port_qos; bool prep; }; + /* + * CRIT_ERR event + */ + uint32_t oicr_reg; }; }; /** * @struct ice_rdma_request * @brief struct with data for a request from the RDMA driver */ struct ice_rdma_request { enum ice_rdma_event_type type; union { struct { struct ice_rdma_qset_update res; }; struct { bool enable_filter; }; }; }; int ice_rdma_register(struct ice_rdma_info *info); int ice_rdma_unregister(void); #endif diff --git a/sys/dev/ice/ice_rdma_internal.h b/sys/dev/ice/ice_rdma_internal.h index c80d4540194a..b36544609f45 100644 --- a/sys/dev/ice/ice_rdma_internal.h +++ b/sys/dev/ice/ice_rdma_internal.h @@ -1,101 +1,103 @@ /* SPDX-License-Identifier: BSD-3-Clause */ /* Copyright (c) 2023, Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. Neither the name of the Intel Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /** * @file ice_rdma_internal.h * @brief internal header for the RMDA driver interface setup * * Contains the definitions and functions used by the ice driver to setup the * RDMA driver interface. Functions and definitions in this file are not * shared with the RDMA client driver. */ #ifndef _ICE_RDMA_INTERNAL_H_ #define _ICE_RDMA_INTERNAL_H_ #include "ice_rdma.h" /* Forward declare the softc structure */ struct ice_softc; /* Global sysctl variable indicating if the RDMA client interface is enabled */ extern bool ice_enable_irdma; /** * @struct ice_rdma_entry * @brief RDMA peer list node * * Structure used to store peer entries for each PF in a linked list. */ struct ice_rdma_entry { LIST_ENTRY(ice_rdma_entry) node; struct ice_rdma_peer peer; bool attached; bool initiated; }; #define ice_rdma_peer_to_entry(p) __containerof(p, struct ice_rdma_entry, peer) #define ice_rdma_entry_to_sc(e) __containerof(e, struct ice_softc, rdma_entry) #define ice_rdma_peer_to_sc(p) ice_rdma_entry_to_sc(ice_rdma_peer_to_entry(p)) /** * @struct ice_rdma_peers * @brief Head list structure for the RDMA entry list * * Type defining the head of the linked list of RDMA entries. */ LIST_HEAD(ice_rdma_peers, ice_rdma_entry); /** * @struct ice_rdma_state * @brief global driver state for RDMA * * Contains global state shared across all PFs by the device driver, such as * the kobject class of the currently connected peer driver, and the linked * list of peer entries for each PF. */ struct ice_rdma_state { bool registered; kobj_class_t peer_class; struct sx mtx; struct ice_rdma_peers peers; }; void ice_rdma_init(void); void ice_rdma_exit(void); int ice_rdma_pf_attach(struct ice_softc *sc); void ice_rdma_pf_detach(struct ice_softc *sc); int ice_rdma_pf_init(struct ice_softc *sc); int ice_rdma_pf_stop(struct ice_softc *sc); void ice_rdma_link_change(struct ice_softc *sc, int linkstate, uint64_t baudrate); void ice_rdma_notify_dcb_qos_change(struct ice_softc *sc); void ice_rdma_dcb_qos_update(struct ice_softc *sc, struct ice_port_info *pi); +void ice_rdma_notify_pe_intr(struct ice_softc *sc, uint32_t oicr); +void ice_rdma_notify_reset(struct ice_softc *sc); #endif diff --git a/sys/dev/ice/if_ice_iflib.c b/sys/dev/ice/if_ice_iflib.c index be1aa86234c7..d8542e383564 100644 --- a/sys/dev/ice/if_ice_iflib.c +++ b/sys/dev/ice/if_ice_iflib.c @@ -1,3106 +1,3105 @@ /* SPDX-License-Identifier: BSD-3-Clause */ /* Copyright (c) 2023, Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * 3. 
Neither the name of the Intel Corporation nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /** * @file if_ice_iflib.c * @brief iflib driver implementation * * Contains the main entry point for the iflib driver implementation. It * implements the various ifdi driver methods, and sets up the module and * driver values to load an iflib driver. */ #include "ice_iflib.h" #include "ice_drv_info.h" #include "ice_switch.h" #include "ice_sched.h" #include #include #include #include #include /* * Device method prototypes */ static void *ice_register(device_t); static int ice_if_attach_pre(if_ctx_t); static int ice_attach_pre_recovery_mode(struct ice_softc *sc); static int ice_if_attach_post(if_ctx_t); static void ice_attach_post_recovery_mode(struct ice_softc *sc); static int ice_if_detach(if_ctx_t); static int ice_if_tx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs, int ntxqs, int ntxqsets); static int ice_if_rx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs, int nqs, int nqsets); static int ice_if_msix_intr_assign(if_ctx_t ctx, int msix); static void ice_if_queues_free(if_ctx_t ctx); static int ice_if_mtu_set(if_ctx_t ctx, uint32_t mtu); static void ice_if_intr_enable(if_ctx_t ctx); static void ice_if_intr_disable(if_ctx_t ctx); static int ice_if_rx_queue_intr_enable(if_ctx_t ctx, uint16_t rxqid); static int ice_if_tx_queue_intr_enable(if_ctx_t ctx, uint16_t txqid); static int ice_if_promisc_set(if_ctx_t ctx, int flags); static void ice_if_media_status(if_ctx_t ctx, struct ifmediareq *ifmr); static int ice_if_media_change(if_ctx_t ctx); static void ice_if_init(if_ctx_t ctx); static void ice_if_timer(if_ctx_t ctx, uint16_t qid); static void ice_if_update_admin_status(if_ctx_t ctx); static void ice_if_multi_set(if_ctx_t ctx); static void ice_if_vlan_register(if_ctx_t ctx, u16 vtag); static void ice_if_vlan_unregister(if_ctx_t ctx, u16 vtag); static void ice_if_stop(if_ctx_t ctx); static uint64_t ice_if_get_counter(if_ctx_t ctx, ift_counter counter); static int ice_if_priv_ioctl(if_ctx_t ctx, u_long command, caddr_t data); static int ice_if_i2c_req(if_ctx_t ctx, struct ifi2creq *req); static int ice_if_suspend(if_ctx_t ctx); static int ice_if_resume(if_ctx_t ctx); static bool ice_if_needs_restart(if_ctx_t, enum iflib_restart_event); static int ice_msix_que(void *arg); static int ice_msix_admin(void *arg); /* * Helper function prototypes */ static int ice_pci_mapping(struct ice_softc *sc); static void ice_free_pci_mapping(struct ice_softc *sc); static void ice_update_link_status(struct ice_softc *sc, bool update_media); static void ice_init_device_features(struct ice_softc *sc); static void 
ice_init_tx_tracking(struct ice_vsi *vsi); static void ice_handle_reset_event(struct ice_softc *sc); static void ice_handle_pf_reset_request(struct ice_softc *sc); static void ice_prepare_for_reset(struct ice_softc *sc); static int ice_rebuild_pf_vsi_qmap(struct ice_softc *sc); static void ice_rebuild(struct ice_softc *sc); static void ice_rebuild_recovery_mode(struct ice_softc *sc); static void ice_free_irqvs(struct ice_softc *sc); static void ice_update_rx_mbuf_sz(struct ice_softc *sc); static void ice_poll_for_media_avail(struct ice_softc *sc); static void ice_setup_scctx(struct ice_softc *sc); static int ice_allocate_msix(struct ice_softc *sc); static void ice_admin_timer(void *arg); static void ice_transition_recovery_mode(struct ice_softc *sc); static void ice_transition_safe_mode(struct ice_softc *sc); /* * Device Interface Declaration */ /** * @var ice_methods * @brief ice driver method entry points * * List of device methods implementing the generic device interface used by * the device stack to interact with the ice driver. Since this is an iflib * driver, most of the methods point to the generic iflib implementation. */ static device_method_t ice_methods[] = { /* Device interface */ DEVMETHOD(device_register, ice_register), DEVMETHOD(device_probe, iflib_device_probe_vendor), DEVMETHOD(device_attach, iflib_device_attach), DEVMETHOD(device_detach, iflib_device_detach), DEVMETHOD(device_shutdown, iflib_device_shutdown), DEVMETHOD(device_suspend, iflib_device_suspend), DEVMETHOD(device_resume, iflib_device_resume), DEVMETHOD_END }; /** * @var ice_iflib_methods * @brief iflib method entry points * * List of device methods used by the iflib stack to interact with this * driver. These are the real main entry points used to interact with this * driver. */ static device_method_t ice_iflib_methods[] = { DEVMETHOD(ifdi_attach_pre, ice_if_attach_pre), DEVMETHOD(ifdi_attach_post, ice_if_attach_post), DEVMETHOD(ifdi_detach, ice_if_detach), DEVMETHOD(ifdi_tx_queues_alloc, ice_if_tx_queues_alloc), DEVMETHOD(ifdi_rx_queues_alloc, ice_if_rx_queues_alloc), DEVMETHOD(ifdi_msix_intr_assign, ice_if_msix_intr_assign), DEVMETHOD(ifdi_queues_free, ice_if_queues_free), DEVMETHOD(ifdi_mtu_set, ice_if_mtu_set), DEVMETHOD(ifdi_intr_enable, ice_if_intr_enable), DEVMETHOD(ifdi_intr_disable, ice_if_intr_disable), DEVMETHOD(ifdi_rx_queue_intr_enable, ice_if_rx_queue_intr_enable), DEVMETHOD(ifdi_tx_queue_intr_enable, ice_if_tx_queue_intr_enable), DEVMETHOD(ifdi_promisc_set, ice_if_promisc_set), DEVMETHOD(ifdi_media_status, ice_if_media_status), DEVMETHOD(ifdi_media_change, ice_if_media_change), DEVMETHOD(ifdi_init, ice_if_init), DEVMETHOD(ifdi_stop, ice_if_stop), DEVMETHOD(ifdi_timer, ice_if_timer), DEVMETHOD(ifdi_update_admin_status, ice_if_update_admin_status), DEVMETHOD(ifdi_multi_set, ice_if_multi_set), DEVMETHOD(ifdi_vlan_register, ice_if_vlan_register), DEVMETHOD(ifdi_vlan_unregister, ice_if_vlan_unregister), DEVMETHOD(ifdi_get_counter, ice_if_get_counter), DEVMETHOD(ifdi_priv_ioctl, ice_if_priv_ioctl), DEVMETHOD(ifdi_i2c_req, ice_if_i2c_req), DEVMETHOD(ifdi_suspend, ice_if_suspend), DEVMETHOD(ifdi_resume, ice_if_resume), DEVMETHOD(ifdi_needs_restart, ice_if_needs_restart), DEVMETHOD_END }; /** * @var ice_driver * @brief driver structure for the generic device stack * * driver_t definition used to setup the generic device methods. 
*/ static driver_t ice_driver = { .name = "ice", .methods = ice_methods, .size = sizeof(struct ice_softc), }; /** * @var ice_iflib_driver * @brief driver structure for the iflib stack * * driver_t definition used to setup the iflib device methods. */ static driver_t ice_iflib_driver = { .name = "ice", .methods = ice_iflib_methods, .size = sizeof(struct ice_softc), }; extern struct if_txrx ice_txrx; extern struct if_txrx ice_recovery_txrx; /** * @var ice_sctx * @brief ice driver shared context * * Structure defining shared values (context) that is used by all instances of * the device. Primarily used to setup details about how the iflib stack * should treat this driver. Also defines the default, minimum, and maximum * number of descriptors in each ring. */ static struct if_shared_ctx ice_sctx = { .isc_magic = IFLIB_MAGIC, .isc_q_align = PAGE_SIZE, .isc_tx_maxsize = ICE_MAX_FRAME_SIZE, /* We could technically set this as high as ICE_MAX_DMA_SEG_SIZE, but * that doesn't make sense since that would be larger than the maximum * size of a single packet. */ .isc_tx_maxsegsize = ICE_MAX_FRAME_SIZE, /* XXX: This is only used by iflib to ensure that * scctx->isc_tx_tso_size_max + the VLAN header is a valid size. */ .isc_tso_maxsize = ICE_TSO_SIZE + sizeof(struct ether_vlan_header), /* XXX: This is used by iflib to set the number of segments in the TSO * DMA tag. However, scctx->isc_tx_tso_segsize_max is used to set the * related ifnet parameter. */ .isc_tso_maxsegsize = ICE_MAX_DMA_SEG_SIZE, .isc_rx_maxsize = ICE_MAX_FRAME_SIZE, .isc_rx_nsegments = ICE_MAX_RX_SEGS, .isc_rx_maxsegsize = ICE_MAX_FRAME_SIZE, .isc_nfl = 1, .isc_ntxqs = 1, .isc_nrxqs = 1, .isc_admin_intrcnt = 1, .isc_vendor_info = ice_vendor_info_array, .isc_driver_version = __DECONST(char *, ice_driver_version), .isc_driver = &ice_iflib_driver, /* * IFLIB_NEED_SCRATCH ensures that mbufs have scratch space available * for hardware checksum offload * * IFLIB_TSO_INIT_IP ensures that the TSO packets have zeroed out the * IP sum field, required by our hardware to calculate valid TSO * checksums. * * IFLIB_ADMIN_ALWAYS_RUN ensures that the administrative task runs * even when the interface is down. * * IFLIB_SKIP_MSIX allows the driver to handle allocating MSI-X * vectors manually instead of relying on iflib code to do this. */ .isc_flags = IFLIB_NEED_SCRATCH | IFLIB_TSO_INIT_IP | IFLIB_ADMIN_ALWAYS_RUN | IFLIB_SKIP_MSIX, .isc_nrxd_min = {ICE_MIN_DESC_COUNT}, .isc_ntxd_min = {ICE_MIN_DESC_COUNT}, .isc_nrxd_max = {ICE_IFLIB_MAX_DESC_COUNT}, .isc_ntxd_max = {ICE_IFLIB_MAX_DESC_COUNT}, .isc_nrxd_default = {ICE_DEFAULT_DESC_COUNT}, .isc_ntxd_default = {ICE_DEFAULT_DESC_COUNT}, }; /** * @var ice_devclass * @brief ice driver device class * * device class used to setup the ice driver module kobject class. */ devclass_t ice_devclass; DRIVER_MODULE(ice, pci, ice_driver, ice_devclass, ice_module_event_handler, 0); MODULE_VERSION(ice, 1); MODULE_DEPEND(ice, pci, 1, 1, 1); MODULE_DEPEND(ice, ether, 1, 1, 1); MODULE_DEPEND(ice, iflib, 1, 1, 1); IFLIB_PNP_INFO(pci, ice, ice_vendor_info_array); /* Static driver-wide sysctls */ #include "ice_iflib_sysctls.h" /** * ice_pci_mapping - Map PCI BAR memory * @sc: device private softc * * Map PCI BAR 0 for device operation. 
*/ static int ice_pci_mapping(struct ice_softc *sc) { int rc; /* Map BAR0 */ rc = ice_map_bar(sc->dev, &sc->bar0, 0); if (rc) return rc; return 0; } /** * ice_free_pci_mapping - Release PCI BAR memory * @sc: device private softc * * Release PCI BARs which were previously mapped by ice_pci_mapping(). */ static void ice_free_pci_mapping(struct ice_softc *sc) { /* Free BAR0 */ ice_free_bar(sc->dev, &sc->bar0); } /* * Device methods */ /** * ice_register - register device method callback * @dev: the device being registered * * Returns a pointer to the shared context structure, which is used by iflib. */ static void * ice_register(device_t dev __unused) { return &ice_sctx; } /* ice_register */ /** * ice_setup_scctx - Setup the iflib softc context structure * @sc: the device private structure * * Setup the parameters in if_softc_ctx_t structure used by the iflib stack * when loading. */ static void ice_setup_scctx(struct ice_softc *sc) { if_softc_ctx_t scctx = sc->scctx; struct ice_hw *hw = &sc->hw; bool safe_mode, recovery_mode; safe_mode = ice_is_bit_set(sc->feat_en, ICE_FEATURE_SAFE_MODE); recovery_mode = ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE); /* * If the driver loads in Safe mode or Recovery mode, limit iflib to * a single queue pair. */ if (safe_mode || recovery_mode) { scctx->isc_ntxqsets = scctx->isc_nrxqsets = 1; scctx->isc_ntxqsets_max = 1; scctx->isc_nrxqsets_max = 1; } else { /* * iflib initially sets the isc_ntxqsets and isc_nrxqsets to * the values of the override sysctls. Cache these initial * values so that the driver can be aware of what the iflib * sysctl value is when setting up MSI-X vectors. */ sc->ifc_sysctl_ntxqs = scctx->isc_ntxqsets; sc->ifc_sysctl_nrxqs = scctx->isc_nrxqsets; if (scctx->isc_ntxqsets == 0) scctx->isc_ntxqsets = hw->func_caps.common_cap.rss_table_size; if (scctx->isc_nrxqsets == 0) scctx->isc_nrxqsets = hw->func_caps.common_cap.rss_table_size; scctx->isc_ntxqsets_max = hw->func_caps.common_cap.num_txq; scctx->isc_nrxqsets_max = hw->func_caps.common_cap.num_rxq; /* * Sanity check that the iflib sysctl values are within the * maximum supported range. */ if (sc->ifc_sysctl_ntxqs > scctx->isc_ntxqsets_max) sc->ifc_sysctl_ntxqs = scctx->isc_ntxqsets_max; if (sc->ifc_sysctl_nrxqs > scctx->isc_nrxqsets_max) sc->ifc_sysctl_nrxqs = scctx->isc_nrxqsets_max; } scctx->isc_txqsizes[0] = roundup2(scctx->isc_ntxd[0] * sizeof(struct ice_tx_desc), DBA_ALIGN); scctx->isc_rxqsizes[0] = roundup2(scctx->isc_nrxd[0] * sizeof(union ice_32b_rx_flex_desc), DBA_ALIGN); scctx->isc_tx_nsegments = ICE_MAX_TX_SEGS; scctx->isc_tx_tso_segments_max = ICE_MAX_TSO_SEGS; scctx->isc_tx_tso_size_max = ICE_TSO_SIZE; scctx->isc_tx_tso_segsize_max = ICE_MAX_DMA_SEG_SIZE; scctx->isc_msix_bar = PCIR_BAR(ICE_MSIX_BAR); scctx->isc_rss_table_size = hw->func_caps.common_cap.rss_table_size; /* * If the driver loads in recovery mode, disable Tx/Rx functionality */ if (recovery_mode) scctx->isc_txrx = &ice_recovery_txrx; else scctx->isc_txrx = &ice_txrx; /* * If the driver loads in Safe mode or Recovery mode, disable * advanced features including hardware offloads. */ if (safe_mode || recovery_mode) { scctx->isc_capenable = ICE_SAFE_CAPS; scctx->isc_tx_csum_flags = 0; } else { scctx->isc_capenable = ICE_FULL_CAPS; scctx->isc_tx_csum_flags = ICE_CSUM_OFFLOAD; } scctx->isc_capabilities = scctx->isc_capenable; } /* ice_setup_scctx */ /** * ice_if_attach_pre - Early device attach logic * @ctx: the iflib context structure * * Called by iflib during the attach process. 
Earliest main driver entry * point which performs necessary hardware and driver initialization. Called * before the Tx and Rx queues are allocated. */ static int ice_if_attach_pre(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); enum ice_fw_modes fw_mode; enum ice_status status; if_softc_ctx_t scctx; struct ice_hw *hw; device_t dev; int err; device_printf(iflib_get_dev(ctx), "Loading the iflib ice driver\n"); ice_set_state(&sc->state, ICE_STATE_ATTACHING); sc->ctx = ctx; sc->media = iflib_get_media(ctx); sc->sctx = iflib_get_sctx(ctx); sc->iflib_ctx_lock = iflib_ctx_lock_get(ctx); dev = sc->dev = iflib_get_dev(ctx); scctx = sc->scctx = iflib_get_softc_ctx(ctx); hw = &sc->hw; hw->back = sc; snprintf(sc->admin_mtx_name, sizeof(sc->admin_mtx_name), "%s:admin", device_get_nameunit(dev)); mtx_init(&sc->admin_mtx, sc->admin_mtx_name, NULL, MTX_DEF); callout_init_mtx(&sc->admin_timer, &sc->admin_mtx, 0); ASSERT_CTX_LOCKED(sc); if (ice_pci_mapping(sc)) { err = (ENXIO); goto destroy_admin_timer; } /* Save off the PCI information */ ice_save_pci_info(hw, dev); /* create tunables as early as possible */ ice_add_device_tunables(sc); /* Setup ControlQ lengths */ ice_set_ctrlq_len(hw); reinit_hw: fw_mode = ice_get_fw_mode(hw); if (fw_mode == ICE_FW_MODE_REC) { device_printf(dev, "Firmware recovery mode detected. Limiting functionality. Refer to Intel(R) Ethernet Adapters and Devices User Guide for details on firmware recovery mode.\n"); err = ice_attach_pre_recovery_mode(sc); if (err) goto free_pci_mapping; return (0); } /* Initialize the hw data structure */ status = ice_init_hw(hw); if (status) { if (status == ICE_ERR_FW_API_VER) { /* Enter recovery mode, so that the driver remains * loaded. This way, if the system administrator * cannot update the driver, they may still attempt to * downgrade the NVM. */ err = ice_attach_pre_recovery_mode(sc); if (err) goto free_pci_mapping; return (0); } else { err = EIO; device_printf(dev, "Unable to initialize hw, err %s aq_err %s\n", ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status)); } goto free_pci_mapping; } ice_init_device_features(sc); /* Notify firmware of the device driver version */ err = ice_send_version(sc); if (err) goto deinit_hw; /* * Success indicates a change was made that requires a reinitialization * of the hardware */ err = ice_load_pkg_file(sc); if (err == ICE_SUCCESS) { ice_deinit_hw(hw); goto reinit_hw; } err = ice_init_link_events(sc); if (err) { device_printf(dev, "ice_init_link_events failed: %s\n", ice_err_str(err)); goto deinit_hw; } /* Initialize VLAN mode in FW; if dual VLAN mode is supported by the package * and firmware, this will force them to use single VLAN mode. 
*/ status = ice_set_vlan_mode(hw); if (status) { err = EIO; device_printf(dev, "Unable to initialize VLAN mode, err %s aq_err %s\n", ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status)); goto deinit_hw; } ice_print_nvm_version(sc); /* Setup the MAC address */ iflib_set_mac(ctx, hw->port_info->mac.lan_addr); /* Setup the iflib softc context structure */ ice_setup_scctx(sc); /* Initialize the Tx queue manager */ err = ice_resmgr_init(&sc->tx_qmgr, hw->func_caps.common_cap.num_txq); if (err) { device_printf(dev, "Unable to initialize Tx queue manager: %s\n", ice_err_str(err)); goto deinit_hw; } /* Initialize the Rx queue manager */ err = ice_resmgr_init(&sc->rx_qmgr, hw->func_caps.common_cap.num_rxq); if (err) { device_printf(dev, "Unable to initialize Rx queue manager: %s\n", ice_err_str(err)); goto free_tx_qmgr; } /* Initialize the interrupt resource manager */ err = ice_alloc_intr_tracking(sc); if (err) /* Errors are already printed */ goto free_rx_qmgr; /* Determine maximum number of VSIs we'll prepare for */ sc->num_available_vsi = min(ICE_MAX_VSI_AVAILABLE, hw->func_caps.guar_num_vsi); if (!sc->num_available_vsi) { err = EIO; device_printf(dev, "No VSIs allocated to host\n"); goto free_intr_tracking; } /* Allocate storage for the VSI pointers */ sc->all_vsi = (struct ice_vsi **) malloc(sizeof(struct ice_vsi *) * sc->num_available_vsi, M_ICE, M_WAITOK | M_ZERO); if (!sc->all_vsi) { err = ENOMEM; device_printf(dev, "Unable to allocate VSI array\n"); goto free_intr_tracking; } /* * Prepare the statically allocated primary PF VSI in the softc * structure. Other VSIs will be dynamically allocated as needed. */ ice_setup_pf_vsi(sc); err = ice_alloc_vsi_qmap(&sc->pf_vsi, scctx->isc_ntxqsets_max, scctx->isc_nrxqsets_max); if (err) { device_printf(dev, "Unable to allocate VSI Queue maps\n"); goto free_main_vsi; } /* Allocate MSI-X vectors (due to isc_flags IFLIB_SKIP_MSIX) */ err = ice_allocate_msix(sc); if (err) goto free_main_vsi; return 0; free_main_vsi: /* ice_release_vsi will free the queue maps if they were allocated */ ice_release_vsi(&sc->pf_vsi); free(sc->all_vsi, M_ICE); sc->all_vsi = NULL; free_intr_tracking: ice_free_intr_tracking(sc); free_rx_qmgr: ice_resmgr_destroy(&sc->rx_qmgr); free_tx_qmgr: ice_resmgr_destroy(&sc->tx_qmgr); deinit_hw: ice_deinit_hw(hw); free_pci_mapping: ice_free_pci_mapping(sc); destroy_admin_timer: mtx_lock(&sc->admin_mtx); callout_stop(&sc->admin_timer); mtx_unlock(&sc->admin_mtx); mtx_destroy(&sc->admin_mtx); return err; } /* ice_if_attach_pre */ /** * ice_attach_pre_recovery_mode - Limited driver attach_pre for FW recovery * @sc: the device private softc * * Loads the device driver in limited Firmware Recovery mode, intended to * allow users to update the firmware to attempt to recover the device. * * @remark We may enter recovery mode in case either (a) the firmware is * detected to be in an invalid state and must be re-programmed, or (b) the * driver detects that the loaded firmware has a non-compatible API version * that the driver cannot operate with. 
*/ static int ice_attach_pre_recovery_mode(struct ice_softc *sc) { ice_set_state(&sc->state, ICE_STATE_RECOVERY_MODE); /* Setup the iflib softc context */ ice_setup_scctx(sc); /* Setup the PF VSI back pointer */ sc->pf_vsi.sc = sc; /* * We still need to allocate MSI-X vectors since we need one vector to * run the administrative admin interrupt */ return ice_allocate_msix(sc); } /** * ice_update_link_status - notify OS of link state change * @sc: device private softc structure * @update_media: true if we should update media even if link didn't change * * Called to notify iflib core of link status changes. Should be called once * during attach_post, and whenever link status changes during runtime. * * This call only updates the currently supported media types if the link * status changed, or if update_media is set to true. */ static void ice_update_link_status(struct ice_softc *sc, bool update_media) { struct ice_hw *hw = &sc->hw; enum ice_status status; /* Never report link up when in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return; /* Report link status to iflib only once each time it changes */ if (!ice_testandset_state(&sc->state, ICE_STATE_LINK_STATUS_REPORTED)) { if (sc->link_up) { /* link is up */ uint64_t baudrate = ice_aq_speed_to_rate(sc->hw.port_info); ice_set_default_local_lldp_mib(sc); iflib_link_state_change(sc->ctx, LINK_STATE_UP, baudrate); ice_rdma_link_change(sc, LINK_STATE_UP, baudrate); ice_link_up_msg(sc); } else { /* link is down */ iflib_link_state_change(sc->ctx, LINK_STATE_DOWN, 0); ice_rdma_link_change(sc, LINK_STATE_DOWN, 0); } update_media = true; } /* Update the supported media types */ if (update_media) { status = ice_add_media_types(sc, sc->media); if (status) device_printf(sc->dev, "Error adding device media types: %s aq_err %s\n", ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status)); } } /** * ice_if_attach_post - Late device attach logic * @ctx: the iflib context structure * * Called by iflib to finish up attaching the device. Performs any attach * logic which must wait until after the Tx and Rx queues have been * allocated. */ static int ice_if_attach_post(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); if_t ifp = iflib_get_ifp(ctx); int err; ASSERT_CTX_LOCKED(sc); /* We don't yet support loading if MSI-X is not supported */ if (sc->scctx->isc_intr != IFLIB_INTR_MSIX) { device_printf(sc->dev, "The ice driver does not support loading without MSI-X\n"); return (ENOTSUP); } /* The ifnet structure hasn't yet been initialized when the attach_pre * handler is called, so wait until attach_post to setup the * isc_max_frame_size. */ sc->ifp = ifp; sc->scctx->isc_max_frame_size = ifp->if_mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + ETHER_VLAN_ENCAP_LEN; /* * If we are in recovery mode, only perform a limited subset of * initialization to support NVM recovery. 
*/ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) { ice_attach_post_recovery_mode(sc); return (0); } sc->pf_vsi.max_frame_size = sc->scctx->isc_max_frame_size; err = ice_initialize_vsi(&sc->pf_vsi); if (err) { device_printf(sc->dev, "Unable to initialize Main VSI: %s\n", ice_err_str(err)); return err; } /* Enable FW health event reporting */ ice_init_health_events(sc); /* Configure the main PF VSI for RSS */ err = ice_config_rss(&sc->pf_vsi); if (err) { device_printf(sc->dev, "Unable to configure RSS for the main VSI, err %s\n", ice_err_str(err)); return err; } /* Configure switch to drop transmitted LLDP and PAUSE frames */ err = ice_cfg_pf_ethertype_filters(sc); if (err) return err; ice_get_and_print_bus_info(sc); ice_set_link_management_mode(sc); ice_init_saved_phy_cfg(sc); ice_cfg_pba_num(sc); ice_add_device_sysctls(sc); /* Get DCBX/LLDP state and start DCBX agent */ ice_init_dcb_setup(sc); /* Setup link configuration parameters */ ice_init_link_configuration(sc); ice_update_link_status(sc, true); /* Configure interrupt causes for the administrative interrupt */ ice_configure_misc_interrupts(sc); /* Enable ITR 0 right away, so that we can handle admin interrupts */ ice_enable_intr(&sc->hw, sc->irqvs[0].me); err = ice_rdma_pf_attach(sc); if (err) return (err); /* Start the admin timer */ mtx_lock(&sc->admin_mtx); callout_reset(&sc->admin_timer, hz/2, ice_admin_timer, sc); mtx_unlock(&sc->admin_mtx); ice_clear_state(&sc->state, ICE_STATE_ATTACHING); return 0; } /* ice_if_attach_post */ /** * ice_attach_post_recovery_mode - Limited driver attach_post for FW recovery * @sc: the device private softc * * Performs minimal work to prepare the driver to recover an NVM in case the * firmware is in recovery mode. */ static void ice_attach_post_recovery_mode(struct ice_softc *sc) { /* Configure interrupt causes for the administrative interrupt */ ice_configure_misc_interrupts(sc); /* Enable ITR 0 right away, so that we can handle admin interrupts */ ice_enable_intr(&sc->hw, sc->irqvs[0].me); /* Start the admin timer */ mtx_lock(&sc->admin_mtx); callout_reset(&sc->admin_timer, hz/2, ice_admin_timer, sc); mtx_unlock(&sc->admin_mtx); ice_clear_state(&sc->state, ICE_STATE_ATTACHING); } /** * ice_free_irqvs - Free IRQ vector memory * @sc: the device private softc structure * * Free IRQ vector memory allocated during ice_if_msix_intr_assign. */ static void ice_free_irqvs(struct ice_softc *sc) { struct ice_vsi *vsi = &sc->pf_vsi; if_ctx_t ctx = sc->ctx; int i; /* If the irqvs array is NULL, then there are no vectors to free */ if (sc->irqvs == NULL) return; /* Free the IRQ vectors */ for (i = 0; i < sc->num_irq_vectors; i++) iflib_irq_free(ctx, &sc->irqvs[i].irq); /* Clear the irqv pointers */ for (i = 0; i < vsi->num_rx_queues; i++) vsi->rx_queues[i].irqv = NULL; for (i = 0; i < vsi->num_tx_queues; i++) vsi->tx_queues[i].irqv = NULL; /* Release the vector array memory */ free(sc->irqvs, M_ICE); sc->irqvs = NULL; sc->num_irq_vectors = 0; } /** * ice_if_detach - Device driver detach logic * @ctx: iflib context structure * * Perform device shutdown logic to detach the device driver. * * Note that there is no guarantee of the ordering of ice_if_queues_free() and * ice_if_detach(). It is possible for the functions to be called in either * order, and they must not assume to have a strict ordering. 
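 *
 * As an illustrative sketch (not something introduced by this change), both
 * teardown paths therefore rely on the same idempotent-free idiom, so that
 * whichever of the two runs second simply finds nothing left to release:
 *
 *	if (vsi->tx_queues != NULL) {
 *		free(vsi->tx_queues, M_ICE);
 *		vsi->tx_queues = NULL;
 *	}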
*/ static int ice_if_detach(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); struct ice_vsi *vsi = &sc->pf_vsi; int i; ASSERT_CTX_LOCKED(sc); /* Indicate that we're detaching */ ice_set_state(&sc->state, ICE_STATE_DETACHING); /* Stop the admin timer */ mtx_lock(&sc->admin_mtx); callout_stop(&sc->admin_timer); mtx_unlock(&sc->admin_mtx); mtx_destroy(&sc->admin_mtx); ice_rdma_pf_detach(sc); /* Free allocated media types */ ifmedia_removeall(sc->media); /* Free the Tx and Rx sysctl contexts, and assign NULL to the node * pointers. Note, the calls here and those in ice_if_queues_free() * are *BOTH* necessary, as we cannot guarantee which path will be * run first */ ice_vsi_del_txqs_ctx(vsi); ice_vsi_del_rxqs_ctx(vsi); /* Release MSI-X resources */ ice_free_irqvs(sc); for (i = 0; i < sc->num_available_vsi; i++) { if (sc->all_vsi[i]) ice_release_vsi(sc->all_vsi[i]); } if (sc->all_vsi) { free(sc->all_vsi, M_ICE); sc->all_vsi = NULL; } /* Release MSI-X memory */ pci_release_msi(sc->dev); if (sc->msix_table != NULL) { bus_release_resource(sc->dev, SYS_RES_MEMORY, rman_get_rid(sc->msix_table), sc->msix_table); sc->msix_table = NULL; } ice_free_intr_tracking(sc); /* Destroy the queue managers */ ice_resmgr_destroy(&sc->tx_qmgr); ice_resmgr_destroy(&sc->rx_qmgr); if (!ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) ice_deinit_hw(&sc->hw); ice_free_pci_mapping(sc); return 0; } /* ice_if_detach */ /** * ice_if_tx_queues_alloc - Allocate Tx queue memory * @ctx: iflib context structure * @vaddrs: virtual addresses for the queue memory * @paddrs: physical addresses for the queue memory * @ntxqs: the number of Tx queues per set (should always be 1) * @ntxqsets: the number of Tx queue sets to allocate * * Called by iflib to allocate Tx queues for the device. Allocates driver * memory to track each queue, the status arrays used for descriptor * status reporting, and Tx queue sysctls. 
*/ static int ice_if_tx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs, int __invariant_only ntxqs, int ntxqsets) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); struct ice_vsi *vsi = &sc->pf_vsi; struct ice_tx_queue *txq; int err, i, j; MPASS(ntxqs == 1); MPASS(sc->scctx->isc_ntxd[0] <= ICE_MAX_DESC_COUNT); ASSERT_CTX_LOCKED(sc); /* Do not bother allocating queues if we're in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return (0); /* Allocate queue structure memory */ if (!(vsi->tx_queues = (struct ice_tx_queue *) malloc(sizeof(struct ice_tx_queue) * ntxqsets, M_ICE, M_NOWAIT | M_ZERO))) { device_printf(sc->dev, "Unable to allocate Tx queue memory\n"); return (ENOMEM); } /* Allocate report status arrays */ for (i = 0, txq = vsi->tx_queues; i < ntxqsets; i++, txq++) { if (!(txq->tx_rsq = (uint16_t *) malloc(sizeof(uint16_t) * sc->scctx->isc_ntxd[0], M_ICE, M_NOWAIT))) { device_printf(sc->dev, "Unable to allocate tx_rsq memory\n"); err = ENOMEM; goto free_tx_queues; } /* Initialize report status array */ for (j = 0; j < sc->scctx->isc_ntxd[0]; j++) txq->tx_rsq[j] = QIDX_INVALID; } /* Assign queues from PF space to the main VSI */ err = ice_resmgr_assign_contiguous(&sc->tx_qmgr, vsi->tx_qmap, ntxqsets); if (err) { device_printf(sc->dev, "Unable to assign PF queues: %s\n", ice_err_str(err)); goto free_tx_queues; } vsi->qmap_type = ICE_RESMGR_ALLOC_CONTIGUOUS; /* Add Tx queue sysctls context */ ice_vsi_add_txqs_ctx(vsi); for (i = 0, txq = vsi->tx_queues; i < ntxqsets; i++, txq++) { /* q_handle == me when only one TC */ txq->me = txq->q_handle = i; txq->vsi = vsi; /* store the queue size for easier access */ txq->desc_count = sc->scctx->isc_ntxd[0]; /* get the virtual and physical address of the hardware queues */ txq->tail = QTX_COMM_DBELL(vsi->tx_qmap[i]); txq->tx_base = (struct ice_tx_desc *)vaddrs[i]; txq->tx_paddr = paddrs[i]; ice_add_txq_sysctls(txq); } vsi->num_tx_queues = ntxqsets; return (0); free_tx_queues: for (i = 0, txq = vsi->tx_queues; i < ntxqsets; i++, txq++) { if (txq->tx_rsq != NULL) { free(txq->tx_rsq, M_ICE); txq->tx_rsq = NULL; } } free(vsi->tx_queues, M_ICE); vsi->tx_queues = NULL; return err; } /** * ice_if_rx_queues_alloc - Allocate Rx queue memory * @ctx: iflib context structure * @vaddrs: virtual addresses for the queue memory * @paddrs: physical addresses for the queue memory * @nrxqs: number of Rx queues per set (should always be 1) * @nrxqsets: number of Rx queue sets to allocate * * Called by iflib to allocate Rx queues for the device. Allocates driver * memory to track each queue, as well as sets up the Rx queue sysctls. 
*/ static int ice_if_rx_queues_alloc(if_ctx_t ctx, caddr_t *vaddrs, uint64_t *paddrs, int __invariant_only nrxqs, int nrxqsets) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); struct ice_vsi *vsi = &sc->pf_vsi; struct ice_rx_queue *rxq; int err, i; MPASS(nrxqs == 1); MPASS(sc->scctx->isc_nrxd[0] <= ICE_MAX_DESC_COUNT); ASSERT_CTX_LOCKED(sc); /* Do not bother allocating queues if we're in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return (0); /* Allocate queue structure memory */ if (!(vsi->rx_queues = (struct ice_rx_queue *) malloc(sizeof(struct ice_rx_queue) * nrxqsets, M_ICE, M_NOWAIT | M_ZERO))) { device_printf(sc->dev, "Unable to allocate Rx queue memory\n"); return (ENOMEM); } /* Assign queues from PF space to the main VSI */ err = ice_resmgr_assign_contiguous(&sc->rx_qmgr, vsi->rx_qmap, nrxqsets); if (err) { device_printf(sc->dev, "Unable to assign PF queues: %s\n", ice_err_str(err)); goto free_rx_queues; } vsi->qmap_type = ICE_RESMGR_ALLOC_CONTIGUOUS; /* Add Rx queue sysctls context */ ice_vsi_add_rxqs_ctx(vsi); for (i = 0, rxq = vsi->rx_queues; i < nrxqsets; i++, rxq++) { rxq->me = i; rxq->vsi = vsi; /* store the queue size for easier access */ rxq->desc_count = sc->scctx->isc_nrxd[0]; /* get the virtual and physical address of the hardware queues */ rxq->tail = QRX_TAIL(vsi->rx_qmap[i]); rxq->rx_base = (union ice_32b_rx_flex_desc *)vaddrs[i]; rxq->rx_paddr = paddrs[i]; ice_add_rxq_sysctls(rxq); } vsi->num_rx_queues = nrxqsets; return (0); free_rx_queues: free(vsi->rx_queues, M_ICE); vsi->rx_queues = NULL; return err; } /** * ice_if_queues_free - Free queue memory * @ctx: the iflib context structure * * Free queue memory allocated by ice_if_tx_queues_alloc() and * ice_if_rx_queues_alloc(). * * There is no guarantee that ice_if_queues_free() and ice_if_detach() will be * called in the same order. It's possible for ice_if_queues_free() to be * called prior to ice_if_detach(), and vice versa. * * For this reason, the main VSI is a static member of the ice_softc, which is * not free'd until after iflib finishes calling both of these functions. * * Thus, care must be taken in how we manage the memory being freed by this * function, and in what tasks it can and must perform. */ static void ice_if_queues_free(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); struct ice_vsi *vsi = &sc->pf_vsi; struct ice_tx_queue *txq; int i; /* Free the Tx and Rx sysctl contexts, and assign NULL to the node * pointers. Note, the calls here and those in ice_if_detach() * are *BOTH* necessary, as we cannot guarantee which path will be * run first */ ice_vsi_del_txqs_ctx(vsi); ice_vsi_del_rxqs_ctx(vsi); /* Release MSI-X IRQ vectors, if not yet released in ice_if_detach */ ice_free_irqvs(sc); if (vsi->tx_queues != NULL) { /* free the tx_rsq arrays */ for (i = 0, txq = vsi->tx_queues; i < vsi->num_tx_queues; i++, txq++) { if (txq->tx_rsq != NULL) { free(txq->tx_rsq, M_ICE); txq->tx_rsq = NULL; } } free(vsi->tx_queues, M_ICE); vsi->tx_queues = NULL; vsi->num_tx_queues = 0; } if (vsi->rx_queues != NULL) { free(vsi->rx_queues, M_ICE); vsi->rx_queues = NULL; vsi->num_rx_queues = 0; } } /** * ice_msix_que - Fast interrupt handler for MSI-X receive queues * @arg: The Rx queue memory * * Interrupt filter function for iflib MSI-X interrupts. Called by iflib when * an MSI-X interrupt for a given queue is triggered. Currently this just asks * iflib to schedule the main Rx thread. 
*/ static int ice_msix_que(void *arg) { struct ice_rx_queue __unused *rxq = (struct ice_rx_queue *)arg; /* TODO: dynamic ITR algorithm?? */ return (FILTER_SCHEDULE_THREAD); } /** * ice_msix_admin - Fast interrupt handler for MSI-X admin interrupt * @arg: pointer to device softc memory * * Called by iflib when an administrative interrupt occurs. Should perform any * fast logic for handling the interrupt cause, and then indicate whether the * admin task needs to be queued. */ static int ice_msix_admin(void *arg) { struct ice_softc *sc = (struct ice_softc *)arg; struct ice_hw *hw = &sc->hw; device_t dev = sc->dev; u32 oicr; /* There is no safe way to modify the enabled miscellaneous causes of * the OICR vector at runtime, as doing so would be prone to race * conditions. Reading PFINT_OICR will unmask the associated interrupt * causes and allow future interrupts to occur. The admin interrupt * vector will not be re-enabled until after we exit this function, * but any delayed tasks must be resilient against possible "late * arrival" interrupts that occur while we're already handling the * task. This is done by using state bits and serializing these * delayed tasks via the admin status task function. */ oicr = rd32(hw, PFINT_OICR); /* Processing multiple controlq interrupts on a single vector does not * provide an indication of which controlq triggered the interrupt. * We might try reading the INTEVENT bit of the respective PFINT_*_CTL * registers. However, the INTEVENT bit is not guaranteed to be set as * it gets automatically cleared when the hardware acknowledges the * interrupt. * * This means we don't really have a good indication of whether or * which controlq triggered this interrupt. We'll just notify the * admin task that it should check all the controlqs. */ ice_set_state(&sc->state, ICE_STATE_CONTROLQ_EVENT_PENDING); if (oicr & PFINT_OICR_VFLR_M) { ice_set_state(&sc->state, ICE_STATE_VFLR_PENDING); } if (oicr & PFINT_OICR_MAL_DETECT_M) { ice_set_state(&sc->state, ICE_STATE_MDD_PENDING); } if (oicr & PFINT_OICR_GRST_M) { u32 reset; reset = (rd32(hw, GLGEN_RSTAT) & GLGEN_RSTAT_RESET_TYPE_M) >> GLGEN_RSTAT_RESET_TYPE_S; if (reset == ICE_RESET_CORER) sc->soft_stats.corer_count++; else if (reset == ICE_RESET_GLOBR) sc->soft_stats.globr_count++; else sc->soft_stats.empr_count++; /* There are a couple of bits at play for handling resets. * First, the ICE_STATE_RESET_OICR_RECV bit is used to * indicate that the driver has received an OICR with a reset * bit active, indicating that a CORER/GLOBR/EMPR is about to * happen. Second, we set hw->reset_ongoing to indicate that * the hardware is in reset. We will set this back to false as * soon as the driver has determined that the hardware is out * of reset. * * If the driver wishes to trigger a request, it can set one of * the ICE_STATE_RESET_*_REQ bits, which will trigger the * correct type of reset. 
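		 *
		 * As a hedged illustration (not new logic in this change), a
		 * code path that wants a PF reset only has to mark the
		 * request and let the admin task act on it, roughly:
		 *
		 *	ice_set_state(&sc->state, ICE_STATE_RESET_PFR_REQ);
		 *	iflib_admin_intr_deferred(sc->ctx);
		 *
		 * ice_handle_pf_reset_request() then performs the actual
		 * prepare, reset, and rebuild sequence from the admin task.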
		 */
		if (!ice_testandset_state(&sc->state, ICE_STATE_RESET_OICR_RECV))
			hw->reset_ongoing = true;
	}

	if (oicr & PFINT_OICR_ECC_ERR_M) {
		device_printf(dev, "ECC Error detected!\n");
		ice_set_state(&sc->state, ICE_STATE_RESET_PFR_REQ);
	}

-	if (oicr & PFINT_OICR_PE_CRITERR_M) {
-		device_printf(dev, "Critical Protocol Engine Error detected!\n");
-		ice_set_state(&sc->state, ICE_STATE_RESET_PFR_REQ);
+	if (oicr & (PFINT_OICR_PE_CRITERR_M | PFINT_OICR_HMC_ERR_M)) {
+		if (oicr & PFINT_OICR_HMC_ERR_M)
+			/* Log the HMC errors */
+			ice_log_hmc_error(hw, dev);
+		ice_rdma_notify_pe_intr(sc, oicr);
	}

	if (oicr & PFINT_OICR_PCI_EXCEPTION_M) {
		device_printf(dev, "PCI Exception detected!\n");
		ice_set_state(&sc->state, ICE_STATE_RESET_PFR_REQ);
	}

-	if (oicr & PFINT_OICR_HMC_ERR_M) {
-		/* Log the HMC errors, but don't disable the interrupt cause */
-		ice_log_hmc_error(hw, dev);
-	}
-
	return (FILTER_SCHEDULE_THREAD);
}

/**
 * ice_allocate_msix - Allocate MSI-X vectors for the interface
 * @sc: the device private softc
 *
 * Map the MSI-X bar, and then request MSI-X vectors in a two-stage process.
 *
 * First, determine a suitable total number of vectors based on the number
 * of CPUs, RSS buckets, the administrative vector, and other demands such as
 * RDMA.
 *
 * Request the desired amount of vectors, and see how many we obtain. If we
 * don't obtain as many as desired, reduce the demands by lowering the number
 * of requested queues or reducing the demand from other features such as
 * RDMA.
 *
 * @remark This function is required because the driver sets the
 * IFLIB_SKIP_MSIX flag indicating that the driver will manage MSI-X vectors
 * manually.
 *
 * @remark This driver will only use MSI-X vectors. If this is not possible,
 * neither MSI nor legacy interrupts will be tried.
 *
 * @post on success this function must set the following scctx parameters:
 * isc_vectors, isc_nrxqsets, isc_ntxqsets, and isc_intr.
 *
 * @returns zero on success or an error code on failure.
 */
static int
ice_allocate_msix(struct ice_softc *sc)
{
	bool iflib_override_queue_count = false;
	if_softc_ctx_t scctx = sc->scctx;
	device_t dev = sc->dev;
	cpuset_t cpus;
	int bar, queues, vectors, requested;
	int err = 0;
	int rdma;

	/* Allocate the MSI-X bar */
	bar = scctx->isc_msix_bar;
	sc->msix_table = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &bar,
	    RF_ACTIVE);
	if (!sc->msix_table) {
		device_printf(dev, "Unable to map MSI-X table\n");
		return (ENOMEM);
	}

	/* Check if the iflib queue count sysctls have been set */
	if (sc->ifc_sysctl_ntxqs || sc->ifc_sysctl_nrxqs)
		iflib_override_queue_count = true;

	err = bus_get_cpus(dev, INTR_CPUS, sizeof(cpus), &cpus);
	if (err) {
		device_printf(dev, "%s: Unable to fetch the CPU list: %s\n",
		    __func__, ice_err_str(err));
		CPU_COPY(&all_cpus, &cpus);
	}

	/* Attempt to mimic behavior of iflib_msix_init */
	if (iflib_override_queue_count) {
		/*
		 * If the override sysctls have been set, limit the queues to
		 * the number of logical CPUs.
		 */
		queues = mp_ncpus;
	} else {
		/*
		 * Otherwise, limit the queue count to the CPUs associated
		 * with the NUMA node the device is associated with.
		 */
		queues = CPU_COUNT(&cpus);
	}

	/* Clamp to the number of RSS buckets */
	queues = imin(queues, rss_getnumbuckets());

	/*
	 * Clamp the number of queue pairs to the minimum of the requested Tx
	 * and Rx queues.
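	 *
	 * Illustrative numbers only (not taken from this change): with 8
	 * usable CPUs and an RDMA reservation that works out to 4 vectors,
	 * the driver requests rdma + queues + 1 = 4 + 8 + 1 = 13 vectors
	 * (the +1 being the admin vector). If the OS grants only 10, the
	 * 3-vector shortfall is absorbed by the RDMA reservation first
	 * (leaving rdma = 1); the LAN queue count is reduced only when RDMA
	 * alone cannot cover the difference.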
*/ queues = imin(queues, sc->ifc_sysctl_ntxqs ?: scctx->isc_ntxqsets); queues = imin(queues, sc->ifc_sysctl_nrxqs ?: scctx->isc_nrxqsets); if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_RDMA)) { /* * Choose a number of RDMA vectors based on the number of CPUs * up to a maximum */ rdma = min(CPU_COUNT(&cpus), ICE_RDMA_MAX_MSIX); /* Further limit by the user configurable tunable */ rdma = min(rdma, ice_rdma_max_msix); } else { rdma = 0; } /* * Determine the number of vectors to request. Note that we also need * to allocate one vector for administrative tasks. */ requested = rdma + queues + 1; vectors = requested; err = pci_alloc_msix(dev, &vectors); if (err) { device_printf(dev, "Failed to allocate %d MSI-X vectors, err %s\n", vectors, ice_err_str(err)); goto err_free_msix_table; } /* If we don't receive enough vectors, reduce demands */ if (vectors < requested) { int diff = requested - vectors; device_printf(dev, "Requested %d MSI-X vectors, but got only %d\n", requested, vectors); /* * The OS didn't grant us the requested number of vectors. * Check to see if we can reduce demands by limiting the * number of vectors allocated to certain features. */ if (rdma >= diff) { /* Reduce the number of RDMA vectors we reserve */ rdma -= diff; diff = 0; } else { /* Disable RDMA and reduce the difference */ ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap); diff -= rdma; rdma = 0; } /* * If we still have a difference, we need to reduce the number * of queue pairs. * * However, we still need at least one vector for the admin * interrupt and one queue pair. */ if (queues <= diff) { device_printf(dev, "Unable to allocate sufficient MSI-X vectors\n"); err = (ERANGE); goto err_pci_release_msi; } queues -= diff; } device_printf(dev, "Using %d Tx and Rx queues\n", queues); if (rdma) device_printf(dev, "Reserving %d MSI-X interrupts for iRDMA\n", rdma); device_printf(dev, "Using MSI-X interrupts with %d vectors\n", vectors); scctx->isc_vectors = vectors; scctx->isc_nrxqsets = queues; scctx->isc_ntxqsets = queues; scctx->isc_intr = IFLIB_INTR_MSIX; sc->irdma_vectors = rdma; /* Interrupt allocation tracking isn't required in recovery mode, * since neither RDMA nor VFs are enabled. */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return (0); /* Keep track of which interrupt indices are being used for what */ sc->lan_vectors = vectors - rdma; err = ice_resmgr_assign_contiguous(&sc->imgr, sc->pf_imap, sc->lan_vectors); if (err) { device_printf(dev, "Unable to assign PF interrupt mapping: %s\n", ice_err_str(err)); goto err_pci_release_msi; } err = ice_resmgr_assign_contiguous(&sc->imgr, sc->rdma_imap, rdma); if (err) { device_printf(dev, "Unable to assign PF RDMA interrupt mapping: %s\n", ice_err_str(err)); ice_resmgr_release_map(&sc->imgr, sc->pf_imap, sc->lan_vectors); goto err_pci_release_msi; } return (0); err_pci_release_msi: pci_release_msi(dev); err_free_msix_table: if (sc->msix_table != NULL) { bus_release_resource(sc->dev, SYS_RES_MEMORY, rman_get_rid(sc->msix_table), sc->msix_table); sc->msix_table = NULL; } return (err); } /** * ice_if_msix_intr_assign - Assign MSI-X interrupt vectors to queues * @ctx: the iflib context structure * @msix: the number of vectors we were assigned * * Called by iflib to assign MSI-X vectors to queues. Currently requires that * we get at least the same number of vectors as we have queues, and that we * always have the same number of Tx and Rx queues. * * Tx queues use a softirq instead of using their own hardware interrupt. 
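 *
 * Sketch of the resulting layout, assuming a hypothetical 4 queue pairs:
 *
 *	vector 0 -> admin/OICR interrupt (ice_msix_admin)
 *	vector 1 -> rxq0, with txq0 sharing it via a softirq
 *	vector 2 -> rxq1 and txq1
 *	vector 3 -> rxq2 and txq2
 *	vector 4 -> rxq3 and txq3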
*/ static int ice_if_msix_intr_assign(if_ctx_t ctx, int msix) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); struct ice_vsi *vsi = &sc->pf_vsi; int err, i, vector; ASSERT_CTX_LOCKED(sc); if (vsi->num_rx_queues != vsi->num_tx_queues) { device_printf(sc->dev, "iflib requested %d Tx queues, and %d Rx queues, but the driver isn't able to support a differing number of Tx and Rx queues\n", vsi->num_tx_queues, vsi->num_rx_queues); return (EOPNOTSUPP); } if (msix < (vsi->num_rx_queues + 1)) { device_printf(sc->dev, "Not enough MSI-X vectors to assign one vector to each queue pair\n"); return (EOPNOTSUPP); } /* Save the number of vectors for future use */ sc->num_irq_vectors = vsi->num_rx_queues + 1; /* Allocate space to store the IRQ vector data */ if (!(sc->irqvs = (struct ice_irq_vector *) malloc(sizeof(struct ice_irq_vector) * (sc->num_irq_vectors), M_ICE, M_NOWAIT))) { device_printf(sc->dev, "Unable to allocate irqv memory\n"); return (ENOMEM); } /* Administrative interrupt events will use vector 0 */ err = iflib_irq_alloc_generic(ctx, &sc->irqvs[0].irq, 1, IFLIB_INTR_ADMIN, ice_msix_admin, sc, 0, "admin"); if (err) { device_printf(sc->dev, "Failed to register Admin queue handler: %s\n", ice_err_str(err)); goto free_irqvs; } sc->irqvs[0].me = 0; /* Do not allocate queue interrupts when in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return (0); for (i = 0, vector = 1; i < vsi->num_rx_queues; i++, vector++) { struct ice_rx_queue *rxq = &vsi->rx_queues[i]; struct ice_tx_queue *txq = &vsi->tx_queues[i]; int rid = vector + 1; char irq_name[16]; snprintf(irq_name, sizeof(irq_name), "rxq%d", i); err = iflib_irq_alloc_generic(ctx, &sc->irqvs[vector].irq, rid, IFLIB_INTR_RXTX, ice_msix_que, rxq, rxq->me, irq_name); if (err) { device_printf(sc->dev, "Failed to allocate q int %d err: %s\n", i, ice_err_str(err)); vector--; i--; goto fail; } sc->irqvs[vector].me = vector; rxq->irqv = &sc->irqvs[vector]; bzero(irq_name, sizeof(irq_name)); snprintf(irq_name, sizeof(irq_name), "txq%d", i); iflib_softirq_alloc_generic(ctx, &sc->irqvs[vector].irq, IFLIB_INTR_TX, txq, txq->me, irq_name); txq->irqv = &sc->irqvs[vector]; } return (0); fail: for (; i >= 0; i--, vector--) iflib_irq_free(ctx, &sc->irqvs[vector].irq); iflib_irq_free(ctx, &sc->irqvs[0].irq); free_irqvs: free(sc->irqvs, M_ICE); sc->irqvs = NULL; return err; } /** * ice_if_mtu_set - Set the device MTU * @ctx: iflib context structure * @mtu: the MTU requested * * Called by iflib to configure the device's Maximum Transmission Unit (MTU). * * @pre assumes the caller holds the iflib CTX lock */ static int ice_if_mtu_set(if_ctx_t ctx, uint32_t mtu) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); ASSERT_CTX_LOCKED(sc); /* Do not support configuration when in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return (ENOSYS); if (mtu < ICE_MIN_MTU || mtu > ICE_MAX_MTU) return (EINVAL); sc->scctx->isc_max_frame_size = mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + ETHER_VLAN_ENCAP_LEN; sc->pf_vsi.max_frame_size = sc->scctx->isc_max_frame_size; return (0); } /** * ice_if_intr_enable - Enable device interrupts * @ctx: iflib context structure * * Called by iflib to request enabling device interrupts. 
*/ static void ice_if_intr_enable(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); struct ice_vsi *vsi = &sc->pf_vsi; struct ice_hw *hw = &sc->hw; ASSERT_CTX_LOCKED(sc); /* Enable ITR 0 */ ice_enable_intr(hw, sc->irqvs[0].me); /* Do not enable queue interrupts in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return; /* Enable all queue interrupts */ for (int i = 0; i < vsi->num_rx_queues; i++) ice_enable_intr(hw, vsi->rx_queues[i].irqv->me); } /** * ice_if_intr_disable - Disable device interrupts * @ctx: iflib context structure * * Called by iflib to request disabling device interrupts. */ static void ice_if_intr_disable(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); struct ice_hw *hw = &sc->hw; unsigned int i; ASSERT_CTX_LOCKED(sc); /* IFDI_INTR_DISABLE may be called prior to interrupts actually being * assigned to queues. Instead of assuming that the interrupt * assignment in the rx_queues structure is valid, just disable all * possible interrupts * * Note that we choose not to disable ITR 0 because this handles the * AdminQ interrupts, and we want to keep processing these even when * the interface is offline. */ for (i = 1; i < hw->func_caps.common_cap.num_msix_vectors; i++) ice_disable_intr(hw, i); } /** * ice_if_rx_queue_intr_enable - Enable a specific Rx queue interrupt * @ctx: iflib context structure * @rxqid: the Rx queue to enable * * Enable a specific Rx queue interrupt. * * This function is not protected by the iflib CTX lock. */ static int ice_if_rx_queue_intr_enable(if_ctx_t ctx, uint16_t rxqid) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); struct ice_vsi *vsi = &sc->pf_vsi; struct ice_hw *hw = &sc->hw; /* Do not enable queue interrupts in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return (ENOSYS); ice_enable_intr(hw, vsi->rx_queues[rxqid].irqv->me); return (0); } /** * ice_if_tx_queue_intr_enable - Enable a specific Tx queue interrupt * @ctx: iflib context structure * @txqid: the Tx queue to enable * * Enable a specific Tx queue interrupt. * * This function is not protected by the iflib CTX lock. */ static int ice_if_tx_queue_intr_enable(if_ctx_t ctx, uint16_t txqid) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); struct ice_vsi *vsi = &sc->pf_vsi; struct ice_hw *hw = &sc->hw; /* Do not enable queue interrupts in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return (ENOSYS); ice_enable_intr(hw, vsi->tx_queues[txqid].irqv->me); return (0); } /** * ice_if_promisc_set - Set device promiscuous mode * @ctx: iflib context structure * @flags: promiscuous flags to configure * * Called by iflib to configure device promiscuous mode. 
* * @remark Calls to this function will always overwrite the previous setting */ static int ice_if_promisc_set(if_ctx_t ctx, int flags) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); struct ice_hw *hw = &sc->hw; device_t dev = sc->dev; enum ice_status status; bool promisc_enable = flags & IFF_PROMISC; bool multi_enable = flags & IFF_ALLMULTI; /* Do not support configuration when in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return (ENOSYS); if (multi_enable) return (EOPNOTSUPP); if (promisc_enable) { status = ice_set_vsi_promisc(hw, sc->pf_vsi.idx, ICE_VSI_PROMISC_MASK, 0); if (status && status != ICE_ERR_ALREADY_EXISTS) { device_printf(dev, "Failed to enable promiscuous mode for PF VSI, err %s aq_err %s\n", ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status)); return (EIO); } } else { status = ice_clear_vsi_promisc(hw, sc->pf_vsi.idx, ICE_VSI_PROMISC_MASK, 0); if (status) { device_printf(dev, "Failed to disable promiscuous mode for PF VSI, err %s aq_err %s\n", ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status)); return (EIO); } } return (0); } /** * ice_if_media_change - Change device media * @ctx: device ctx structure * * Called by iflib when a media change is requested. This operation is not * supported by the hardware, so we just return an error code. */ static int ice_if_media_change(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); device_printf(sc->dev, "Media change is not supported.\n"); return (ENODEV); } /** * ice_if_media_status - Report current device media * @ctx: iflib context structure * @ifmr: ifmedia request structure to update * * Updates the provided ifmr with current device media status, including link * status and media type. */ static void ice_if_media_status(if_ctx_t ctx, struct ifmediareq *ifmr) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); struct ice_link_status *li = &sc->hw.port_info->phy.link_info; ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; /* Never report link up or media types when in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return; if (!sc->link_up) return; ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_FDX; if (li->phy_type_low) ifmr->ifm_active |= ice_get_phy_type_low(li->phy_type_low); else if (li->phy_type_high) ifmr->ifm_active |= ice_get_phy_type_high(li->phy_type_high); else ifmr->ifm_active |= IFM_UNKNOWN; /* Report flow control status as well */ if (li->an_info & ICE_AQ_LINK_PAUSE_TX) ifmr->ifm_active |= IFM_ETH_TXPAUSE; if (li->an_info & ICE_AQ_LINK_PAUSE_RX) ifmr->ifm_active |= IFM_ETH_RXPAUSE; } /** * ice_init_tx_tracking - Initialize Tx queue software tracking values * @vsi: the VSI to initialize * * Initialize Tx queue software tracking values, including the Report Status * queue, and related software tracking values. */ static void ice_init_tx_tracking(struct ice_vsi *vsi) { struct ice_tx_queue *txq; size_t j; int i; for (i = 0, txq = vsi->tx_queues; i < vsi->num_tx_queues; i++, txq++) { txq->tx_rs_cidx = txq->tx_rs_pidx = 0; /* Initialize the last processed descriptor to be the end of * the ring, rather than the start, so that we avoid an * off-by-one error in ice_ift_txd_credits_update for the * first packet. 
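		 *
		 * Worked example (generic ring arithmetic, not a quote of
		 * that function): with a hypothetical 1024-entry ring the
		 * index starts at 1023, one slot "behind" descriptor 0 in
		 * ring order. When descriptor 0 is the first descriptor
		 * completed, the consumed count is (0 - 1023) mod 1024 = 1;
		 * starting at 0 instead would yield 0 and under-count the
		 * first packet.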
*/ txq->tx_cidx_processed = txq->desc_count - 1; for (j = 0; j < txq->desc_count; j++) txq->tx_rsq[j] = QIDX_INVALID; } } /** * ice_update_rx_mbuf_sz - Update the Rx buffer size for all queues * @sc: the device softc * * Called to update the Rx queue mbuf_sz parameter for configuring the receive * buffer sizes when programming hardware. */ static void ice_update_rx_mbuf_sz(struct ice_softc *sc) { uint32_t mbuf_sz = iflib_get_rx_mbuf_sz(sc->ctx); struct ice_vsi *vsi = &sc->pf_vsi; MPASS(mbuf_sz <= UINT16_MAX); vsi->mbuf_sz = mbuf_sz; } /** * ice_if_init - Initialize the device * @ctx: iflib ctx structure * * Called by iflib to bring the device up, i.e. ifconfig ice0 up. Initializes * device filters and prepares the Tx and Rx engines. * * @pre assumes the caller holds the iflib CTX lock */ static void ice_if_init(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); device_t dev = sc->dev; int err; ASSERT_CTX_LOCKED(sc); /* * We've seen an issue with 11.3/12.1 where sideband routines are * called after detach is called. This would call routines after * if_stop, causing issues with the teardown process. This has * seemingly been fixed in STABLE snapshots, but it seems like a * good idea to have this guard here regardless. */ if (ice_driver_is_detaching(sc)) return; if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return; if (ice_test_state(&sc->state, ICE_STATE_RESET_FAILED)) { device_printf(sc->dev, "request to start interface cannot be completed as the device failed to reset\n"); return; } if (ice_test_state(&sc->state, ICE_STATE_PREPARED_FOR_RESET)) { device_printf(sc->dev, "request to start interface while device is prepared for impending reset\n"); return; } ice_update_rx_mbuf_sz(sc); /* Update the MAC address... User might use a LAA */ err = ice_update_laa_mac(sc); if (err) { device_printf(dev, "LAA address change failed, err %s\n", ice_err_str(err)); return; } /* Initialize software Tx tracking values */ ice_init_tx_tracking(&sc->pf_vsi); err = ice_cfg_vsi_for_tx(&sc->pf_vsi); if (err) { device_printf(dev, "Unable to configure the main VSI for Tx: %s\n", ice_err_str(err)); return; } err = ice_cfg_vsi_for_rx(&sc->pf_vsi); if (err) { device_printf(dev, "Unable to configure the main VSI for Rx: %s\n", ice_err_str(err)); goto err_cleanup_tx; } err = ice_control_all_rx_queues(&sc->pf_vsi, true); if (err) { device_printf(dev, "Unable to enable Rx rings for transmit: %s\n", ice_err_str(err)); goto err_cleanup_tx; } err = ice_cfg_pf_default_mac_filters(sc); if (err) { device_printf(dev, "Unable to configure default MAC filters: %s\n", ice_err_str(err)); goto err_stop_rx; } /* We use software interrupts for Tx, so we only program the hardware * interrupts for Rx. */ ice_configure_all_rxq_interrupts(&sc->pf_vsi); ice_configure_rx_itr(&sc->pf_vsi); /* Configure promiscuous mode */ ice_if_promisc_set(ctx, if_getflags(sc->ifp)); ice_rdma_pf_init(sc); ice_set_state(&sc->state, ICE_STATE_DRIVER_INITIALIZED); return; err_stop_rx: ice_control_all_rx_queues(&sc->pf_vsi, false); err_cleanup_tx: ice_vsi_disable_tx(&sc->pf_vsi); } /** * ice_poll_for_media_avail - Re-enable link if media is detected * @sc: device private structure * * Intended to be called from the driver's timer function, this function * sends the Get Link Status AQ command and re-enables HW link if the * command says that media is available. 
* * If the driver doesn't have the "NO_MEDIA" state set, then this does nothing, * since media removal events are supposed to be sent to the driver through * a link status event. */ static void ice_poll_for_media_avail(struct ice_softc *sc) { struct ice_hw *hw = &sc->hw; struct ice_port_info *pi = hw->port_info; if (ice_test_state(&sc->state, ICE_STATE_NO_MEDIA)) { pi->phy.get_link_info = true; ice_get_link_status(pi, &sc->link_up); if (pi->phy.link_info.link_info & ICE_AQ_MEDIA_AVAILABLE) { enum ice_status status; /* Re-enable link and re-apply user link settings */ ice_apply_saved_phy_cfg(sc, ICE_APPLY_LS_FEC_FC); /* Update the OS about changes in media capability */ status = ice_add_media_types(sc, sc->media); if (status) device_printf(sc->dev, "Error adding device media types: %s aq_err %s\n", ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status)); ice_clear_state(&sc->state, ICE_STATE_NO_MEDIA); } } } /** * ice_if_timer - called by iflib periodically * @ctx: iflib ctx structure * @qid: the queue this timer was called for * * This callback is triggered by iflib periodically. We use it to update the * hw statistics. * * @remark this function is not protected by the iflib CTX lock. */ static void ice_if_timer(if_ctx_t ctx, uint16_t qid) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); uint64_t prev_link_xoff_rx = sc->stats.cur.link_xoff_rx; if (qid != 0) return; /* Do not attempt to update stats when in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return; /* Update device statistics */ ice_update_pf_stats(sc); /* * For proper watchdog management, the iflib stack needs to know if * we've been paused during the last interval. Check if the * link_xoff_rx stat changed, and set the isc_pause_frames, if so. */ if (sc->stats.cur.link_xoff_rx != prev_link_xoff_rx) sc->scctx->isc_pause_frames = 1; /* Update the primary VSI stats */ ice_update_vsi_hw_stats(&sc->pf_vsi); } /** * ice_admin_timer - called periodically to trigger the admin task * @arg: callout(9) argument pointing to the device private softc structure * * Timer function used as part of a callout(9) timer that will periodically * trigger the admin task, even when the interface is down. * * @remark this function is not called by iflib and is not protected by the * iflib CTX lock. * * @remark because this is a callout function, it cannot sleep and should not * attempt taking the iflib CTX lock. */ static void ice_admin_timer(void *arg) { struct ice_softc *sc = (struct ice_softc *)arg; /* * There is a point where callout routines are no longer * cancelable. So there exists a window of time where the * driver enters detach() and tries to cancel the callout, but the * callout routine has passed the cancellation point. The detach() * routine is unaware of this and tries to free resources that the * callout routine needs. So we check for the detach state flag to * at least shrink the window of opportunity. */ if (ice_driver_is_detaching(sc)) return; /* Fire off the admin task */ iflib_admin_intr_deferred(sc->ctx); /* Reschedule the admin timer */ callout_schedule(&sc->admin_timer, hz/2); } /** * ice_transition_recovery_mode - Transition to recovery mode * @sc: the device private softc * * Called when the driver detects that the firmware has entered recovery mode * at run time. */ static void ice_transition_recovery_mode(struct ice_softc *sc) { struct ice_vsi *vsi = &sc->pf_vsi; int i; device_printf(sc->dev, "Firmware recovery mode detected. Limiting functionality. 
Refer to Intel(R) Ethernet Adapters and Devices User Guide for details on firmware recovery mode.\n"); /* Tell the stack that the link has gone down */ iflib_link_state_change(sc->ctx, LINK_STATE_DOWN, 0); /* Request that the device be re-initialized */ ice_request_stack_reinit(sc); ice_rdma_pf_detach(sc); ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap); ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en); ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap); ice_vsi_del_txqs_ctx(vsi); ice_vsi_del_rxqs_ctx(vsi); for (i = 0; i < sc->num_available_vsi; i++) { if (sc->all_vsi[i]) ice_release_vsi(sc->all_vsi[i]); } sc->num_available_vsi = 0; if (sc->all_vsi) { free(sc->all_vsi, M_ICE); sc->all_vsi = NULL; } /* Destroy the interrupt manager */ ice_resmgr_destroy(&sc->imgr); /* Destroy the queue managers */ ice_resmgr_destroy(&sc->tx_qmgr); ice_resmgr_destroy(&sc->rx_qmgr); ice_deinit_hw(&sc->hw); } /** * ice_transition_safe_mode - Transition to safe mode * @sc: the device private softc * * Called when the driver attempts to reload the DDP package during a device * reset, and the new download fails. If so, we must transition to safe mode * at run time. * * @remark although safe mode normally allocates only a single queue, we can't * change the number of queues dynamically when using iflib. Due to this, we * do not attempt to reduce the number of queues. */ static void ice_transition_safe_mode(struct ice_softc *sc) { /* Indicate that we are in Safe mode */ ice_set_bit(ICE_FEATURE_SAFE_MODE, sc->feat_cap); ice_set_bit(ICE_FEATURE_SAFE_MODE, sc->feat_en); ice_rdma_pf_detach(sc); ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap); ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en); ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap); ice_clear_bit(ICE_FEATURE_RSS, sc->feat_cap); ice_clear_bit(ICE_FEATURE_RSS, sc->feat_en); } /** * ice_if_update_admin_status - update admin status * @ctx: iflib ctx structure * * Called by iflib to update the admin status. For our purposes, this means * check the adminq, and update the link status. It's ultimately triggered by * our admin interrupt, or by the ice_if_timer periodically. * * @pre assumes the caller holds the iflib CTX lock */ static void ice_if_update_admin_status(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); enum ice_fw_modes fw_mode; bool reschedule = false; u16 pending = 0; ASSERT_CTX_LOCKED(sc); /* Check if the firmware entered recovery mode at run time */ fw_mode = ice_get_fw_mode(&sc->hw); if (fw_mode == ICE_FW_MODE_REC) { if (!ice_testandset_state(&sc->state, ICE_STATE_RECOVERY_MODE)) { /* If we just entered recovery mode, log a warning to * the system administrator and deinit driver state * that is no longer functional. */ ice_transition_recovery_mode(sc); } } else if (fw_mode == ICE_FW_MODE_ROLLBACK) { if (!ice_testandset_state(&sc->state, ICE_STATE_ROLLBACK_MODE)) { /* Rollback mode isn't fatal, but we don't want to * repeatedly post a message about it. */ ice_print_rollback_msg(&sc->hw); } } /* Handle global reset events */ ice_handle_reset_event(sc); /* Handle PF reset requests */ ice_handle_pf_reset_request(sc); /* Handle MDD events */ ice_handle_mdd_event(sc); if (ice_test_state(&sc->state, ICE_STATE_RESET_FAILED) || ice_test_state(&sc->state, ICE_STATE_PREPARED_FOR_RESET) || ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) { /* * If we know the control queues are disabled, skip processing * the control queues entirely. 
		 */
		;
	} else if (ice_testandclear_state(&sc->state, ICE_STATE_CONTROLQ_EVENT_PENDING)) {
		ice_process_ctrlq(sc, ICE_CTL_Q_ADMIN, &pending);
		if (pending > 0)
			reschedule = true;

		ice_process_ctrlq(sc, ICE_CTL_Q_MAILBOX, &pending);
		if (pending > 0)
			reschedule = true;
	}

	/* Poll for link up */
	ice_poll_for_media_avail(sc);

	/* Check and update link status */
	ice_update_link_status(sc, false);

	/*
	 * If there are still messages to process, we need to reschedule
	 * ourselves. Otherwise, we can just re-enable the interrupt. We'll be
	 * woken up at the next interrupt or timer event.
	 */
	if (reschedule) {
		ice_set_state(&sc->state, ICE_STATE_CONTROLQ_EVENT_PENDING);
		iflib_admin_intr_deferred(ctx);
	} else {
		ice_enable_intr(&sc->hw, sc->irqvs[0].me);
	}
}

/**
 * ice_prepare_for_reset - Prepare device for an impending reset
 * @sc: The device private softc
 *
 * Prepare the driver for an impending reset, shutting down VSIs, clearing the
 * scheduler setup, and shutting down controlqs. Uses the
 * ICE_STATE_PREPARED_FOR_RESET to indicate whether we've already prepared the
 * driver for reset or not.
 */
static void
ice_prepare_for_reset(struct ice_softc *sc)
{
	struct ice_hw *hw = &sc->hw;

	/* If we're already prepared, there's nothing to do */
	if (ice_testandset_state(&sc->state, ICE_STATE_PREPARED_FOR_RESET))
		return;

	log(LOG_INFO, "%s: preparing to reset device logic\n",
	    sc->ifp->if_xname);

	/* In recovery mode, hardware is not initialized */
	if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE))
		return;

+	/* inform the RDMA client */
+	ice_rdma_notify_reset(sc);
	/* stop the RDMA client */
	ice_rdma_pf_stop(sc);

	/* Release the main PF VSI queue mappings */
	ice_resmgr_release_map(&sc->tx_qmgr, sc->pf_vsi.tx_qmap,
	    sc->pf_vsi.num_tx_queues);
	ice_resmgr_release_map(&sc->rx_qmgr, sc->pf_vsi.rx_qmap,
	    sc->pf_vsi.num_rx_queues);

	ice_clear_hw_tbls(hw);

	if (hw->port_info)
		ice_sched_clear_port(hw->port_info);

	ice_shutdown_all_ctrlq(hw, false);
}

/**
 * ice_rebuild_pf_vsi_qmap - Rebuild the main PF VSI queue mapping
 * @sc: the device softc pointer
 *
 * Loops over the Tx and Rx queues for the main PF VSI and reassigns the queue
 * mapping after a reset occurred.
*/ static int ice_rebuild_pf_vsi_qmap(struct ice_softc *sc) { struct ice_vsi *vsi = &sc->pf_vsi; struct ice_tx_queue *txq; struct ice_rx_queue *rxq; int err, i; /* Re-assign Tx queues from PF space to the main VSI */ err = ice_resmgr_assign_contiguous(&sc->tx_qmgr, vsi->tx_qmap, vsi->num_tx_queues); if (err) { device_printf(sc->dev, "Unable to re-assign PF Tx queues: %s\n", ice_err_str(err)); return (err); } /* Re-assign Rx queues from PF space to this VSI */ err = ice_resmgr_assign_contiguous(&sc->rx_qmgr, vsi->rx_qmap, vsi->num_rx_queues); if (err) { device_printf(sc->dev, "Unable to re-assign PF Rx queues: %s\n", ice_err_str(err)); goto err_release_tx_queues; } vsi->qmap_type = ICE_RESMGR_ALLOC_CONTIGUOUS; /* Re-assign Tx queue tail pointers */ for (i = 0, txq = vsi->tx_queues; i < vsi->num_tx_queues; i++, txq++) txq->tail = QTX_COMM_DBELL(vsi->tx_qmap[i]); /* Re-assign Rx queue tail pointers */ for (i = 0, rxq = vsi->rx_queues; i < vsi->num_rx_queues; i++, rxq++) rxq->tail = QRX_TAIL(vsi->rx_qmap[i]); return (0); err_release_tx_queues: ice_resmgr_release_map(&sc->tx_qmgr, sc->pf_vsi.tx_qmap, sc->pf_vsi.num_tx_queues); return (err); } /* determine if the iflib context is active */ #define CTX_ACTIVE(ctx) ((if_getdrvflags(iflib_get_ifp(ctx)) & IFF_DRV_RUNNING)) /** * ice_rebuild_recovery_mode - Rebuild driver state while in recovery mode * @sc: The device private softc * * Handle a driver rebuild while in recovery mode. This will only rebuild the * limited functionality supported while in recovery mode. */ static void ice_rebuild_recovery_mode(struct ice_softc *sc) { device_t dev = sc->dev; /* enable PCIe bus master */ pci_enable_busmaster(dev); /* Configure interrupt causes for the administrative interrupt */ ice_configure_misc_interrupts(sc); /* Enable ITR 0 right away, so that we can handle admin interrupts */ ice_enable_intr(&sc->hw, sc->irqvs[0].me); /* Now that the rebuild is finished, we're no longer prepared to reset */ ice_clear_state(&sc->state, ICE_STATE_PREPARED_FOR_RESET); log(LOG_INFO, "%s: device rebuild successful\n", sc->ifp->if_xname); /* In order to completely restore device functionality, the iflib core * needs to be reset. We need to request an iflib reset. Additionally, * because the state of IFC_DO_RESET is cached within task_fn_admin in * the iflib core, we also want re-run the admin task so that iflib * resets immediately instead of waiting for the next interrupt. */ ice_request_stack_reinit(sc); return; } /** * ice_rebuild - Rebuild driver state post reset * @sc: The device private softc * * Restore driver state after a reset occurred. Restart the controlqs, setup * the hardware port, and re-enable the VSIs. */ static void ice_rebuild(struct ice_softc *sc) { struct ice_hw *hw = &sc->hw; device_t dev = sc->dev; enum ice_ddp_state pkg_state; enum ice_status status; int err; sc->rebuild_ticks = ticks; /* If we're rebuilding, then a reset has succeeded. */ ice_clear_state(&sc->state, ICE_STATE_RESET_FAILED); /* * If the firmware is in recovery mode, only restore the limited * functionality supported by recovery mode. 
*/ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) { ice_rebuild_recovery_mode(sc); return; } /* enable PCIe bus master */ pci_enable_busmaster(dev); status = ice_init_all_ctrlq(hw); if (status) { device_printf(dev, "failed to re-init controlqs, err %s\n", ice_status_str(status)); goto err_shutdown_ctrlq; } /* Query the allocated resources for Tx scheduler */ status = ice_sched_query_res_alloc(hw); if (status) { device_printf(dev, "Failed to query scheduler resources, err %s aq_err %s\n", ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status)); goto err_shutdown_ctrlq; } /* Re-enable FW logging. Keep going even if this fails */ status = ice_fwlog_set(hw, &hw->fwlog_cfg); if (!status) { /* * We should have the most updated cached copy of the * configuration, regardless of whether we're rebuilding * or not. So we'll simply check to see if logging was * enabled pre-rebuild. */ if (hw->fwlog_cfg.options & ICE_FWLOG_OPTION_IS_REGISTERED) { status = ice_fwlog_register(hw); if (status) device_printf(dev, "failed to re-register fw logging, err %s aq_err %s\n", ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status)); } } else device_printf(dev, "failed to rebuild fw logging configuration, err %s aq_err %s\n", ice_status_str(status), ice_aq_str(hw->adminq.sq_last_status)); err = ice_send_version(sc); if (err) goto err_shutdown_ctrlq; err = ice_init_link_events(sc); if (err) { device_printf(dev, "ice_init_link_events failed: %s\n", ice_err_str(err)); goto err_shutdown_ctrlq; } status = ice_clear_pf_cfg(hw); if (status) { device_printf(dev, "failed to clear PF configuration, err %s\n", ice_status_str(status)); goto err_shutdown_ctrlq; } ice_clear_pxe_mode(hw); status = ice_get_caps(hw); if (status) { device_printf(dev, "failed to get capabilities, err %s\n", ice_status_str(status)); goto err_shutdown_ctrlq; } status = ice_sched_init_port(hw->port_info); if (status) { device_printf(dev, "failed to initialize port, err %s\n", ice_status_str(status)); goto err_sched_cleanup; } /* If we previously loaded the package, it needs to be reloaded now */ if (!ice_is_bit_set(sc->feat_en, ICE_FEATURE_SAFE_MODE)) { pkg_state = ice_init_pkg(hw, hw->pkg_copy, hw->pkg_size); if (!ice_is_init_pkg_successful(pkg_state)) { ice_log_pkg_init(sc, pkg_state); ice_transition_safe_mode(sc); } } ice_reset_pf_stats(sc); err = ice_rebuild_pf_vsi_qmap(sc); if (err) { device_printf(sc->dev, "Unable to re-assign main VSI queues, err %s\n", ice_err_str(err)); goto err_sched_cleanup; } err = ice_initialize_vsi(&sc->pf_vsi); if (err) { device_printf(sc->dev, "Unable to re-initialize Main VSI, err %s\n", ice_err_str(err)); goto err_release_queue_allocations; } /* Replay all VSI configuration */ err = ice_replay_all_vsi_cfg(sc); if (err) goto err_deinit_pf_vsi; /* Re-enable FW health event reporting */ ice_init_health_events(sc); /* Reconfigure the main PF VSI for RSS */ err = ice_config_rss(&sc->pf_vsi); if (err) { device_printf(sc->dev, "Unable to reconfigure RSS for the main VSI, err %s\n", ice_err_str(err)); goto err_deinit_pf_vsi; } /* Refresh link status */ ice_clear_state(&sc->state, ICE_STATE_LINK_STATUS_REPORTED); sc->hw.port_info->phy.get_link_info = true; ice_get_link_status(sc->hw.port_info, &sc->link_up); ice_update_link_status(sc, true); /* RDMA interface will be restarted by the stack re-init */ /* Configure interrupt causes for the administrative interrupt */ ice_configure_misc_interrupts(sc); /* Enable ITR 0 right away, so that we can handle admin interrupts */ ice_enable_intr(&sc->hw, 
sc->irqvs[0].me); /* Now that the rebuild is finished, we're no longer prepared to reset */ ice_clear_state(&sc->state, ICE_STATE_PREPARED_FOR_RESET); log(LOG_INFO, "%s: device rebuild successful\n", sc->ifp->if_xname); /* In order to completely restore device functionality, the iflib core * needs to be reset. We need to request an iflib reset. Additionally, * because the state of IFC_DO_RESET is cached within task_fn_admin in * the iflib core, we also want re-run the admin task so that iflib * resets immediately instead of waiting for the next interrupt. */ ice_request_stack_reinit(sc); return; err_deinit_pf_vsi: ice_deinit_vsi(&sc->pf_vsi); err_release_queue_allocations: ice_resmgr_release_map(&sc->tx_qmgr, sc->pf_vsi.tx_qmap, sc->pf_vsi.num_tx_queues); ice_resmgr_release_map(&sc->rx_qmgr, sc->pf_vsi.rx_qmap, sc->pf_vsi.num_rx_queues); err_sched_cleanup: ice_sched_cleanup_all(hw); err_shutdown_ctrlq: ice_shutdown_all_ctrlq(hw, false); ice_clear_state(&sc->state, ICE_STATE_PREPARED_FOR_RESET); ice_set_state(&sc->state, ICE_STATE_RESET_FAILED); device_printf(dev, "Driver rebuild failed, please reload the device driver\n"); } /** * ice_handle_reset_event - Handle reset events triggered by OICR * @sc: The device private softc * * Handle reset events triggered by an OICR notification. This includes CORER, * GLOBR, and EMPR resets triggered by software on this or any other PF or by * firmware. * * @pre assumes the iflib context lock is held, and will unlock it while * waiting for the hardware to finish reset. */ static void ice_handle_reset_event(struct ice_softc *sc) { struct ice_hw *hw = &sc->hw; enum ice_status status; device_t dev = sc->dev; /* When a CORER, GLOBR, or EMPR is about to happen, the hardware will * trigger an OICR interrupt. Our OICR handler will determine when * this occurs and set the ICE_STATE_RESET_OICR_RECV bit as * appropriate. */ if (!ice_testandclear_state(&sc->state, ICE_STATE_RESET_OICR_RECV)) return; ice_prepare_for_reset(sc); /* * Release the iflib context lock and wait for the device to finish * resetting. */ IFLIB_CTX_UNLOCK(sc); status = ice_check_reset(hw); IFLIB_CTX_LOCK(sc); if (status) { device_printf(dev, "Device never came out of reset, err %s\n", ice_status_str(status)); ice_set_state(&sc->state, ICE_STATE_RESET_FAILED); return; } /* We're done with the reset, so we can rebuild driver state */ sc->hw.reset_ongoing = false; ice_rebuild(sc); /* In the unlikely event that a PF reset request occurs at the same * time as a global reset, clear the request now. This avoids * resetting a second time right after we reset due to a global event. */ if (ice_testandclear_state(&sc->state, ICE_STATE_RESET_PFR_REQ)) device_printf(dev, "Ignoring PFR request that occurred while a reset was ongoing\n"); } /** * ice_handle_pf_reset_request - Initiate PF reset requested by software * @sc: The device private softc * * Initiate a PF reset requested by software. We handle this in the admin task * so that only one thread actually handles driver preparation and cleanup, * rather than having multiple threads possibly attempt to run this code * simultaneously. * * @pre assumes the iflib context lock is held and will unlock it while * waiting for the PF reset to complete. 
*/ static void ice_handle_pf_reset_request(struct ice_softc *sc) { struct ice_hw *hw = &sc->hw; enum ice_status status; /* Check for PF reset requests */ if (!ice_testandclear_state(&sc->state, ICE_STATE_RESET_PFR_REQ)) return; /* Make sure we're prepared for reset */ ice_prepare_for_reset(sc); /* * Release the iflib context lock and wait for the device to finish * resetting. */ IFLIB_CTX_UNLOCK(sc); status = ice_reset(hw, ICE_RESET_PFR); IFLIB_CTX_LOCK(sc); if (status) { device_printf(sc->dev, "device PF reset failed, err %s\n", ice_status_str(status)); ice_set_state(&sc->state, ICE_STATE_RESET_FAILED); return; } sc->soft_stats.pfr_count++; ice_rebuild(sc); } /** * ice_init_device_features - Init device driver features * @sc: driver softc structure * * @pre assumes that the function capabilities bits have been set up by * ice_init_hw(). */ static void ice_init_device_features(struct ice_softc *sc) { /* Set capabilities that all devices support */ ice_set_bit(ICE_FEATURE_SRIOV, sc->feat_cap); ice_set_bit(ICE_FEATURE_RSS, sc->feat_cap); ice_set_bit(ICE_FEATURE_RDMA, sc->feat_cap); ice_set_bit(ICE_FEATURE_LENIENT_LINK_MODE, sc->feat_cap); ice_set_bit(ICE_FEATURE_LINK_MGMT_VER_1, sc->feat_cap); ice_set_bit(ICE_FEATURE_LINK_MGMT_VER_2, sc->feat_cap); ice_set_bit(ICE_FEATURE_HEALTH_STATUS, sc->feat_cap); ice_set_bit(ICE_FEATURE_FW_LOGGING, sc->feat_cap); ice_set_bit(ICE_FEATURE_HAS_PBA, sc->feat_cap); ice_set_bit(ICE_FEATURE_DCB, sc->feat_cap); ice_set_bit(ICE_FEATURE_TX_BALANCE, sc->feat_cap); /* Disable features due to hardware limitations... */ if (!sc->hw.func_caps.common_cap.rss_table_size) ice_clear_bit(ICE_FEATURE_RSS, sc->feat_cap); if (!sc->hw.func_caps.common_cap.iwarp || !ice_enable_irdma) ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap); if (!sc->hw.func_caps.common_cap.dcb) ice_clear_bit(ICE_FEATURE_DCB, sc->feat_cap); /* Disable features due to firmware limitations... */ if (!ice_is_fw_health_report_supported(&sc->hw)) ice_clear_bit(ICE_FEATURE_HEALTH_STATUS, sc->feat_cap); if (!ice_fwlog_supported(&sc->hw)) ice_clear_bit(ICE_FEATURE_FW_LOGGING, sc->feat_cap); if (sc->hw.fwlog_cfg.options & ICE_FWLOG_OPTION_IS_REGISTERED) { if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_FW_LOGGING)) ice_set_bit(ICE_FEATURE_FW_LOGGING, sc->feat_en); else ice_fwlog_unregister(&sc->hw); } /* Disable capabilities not supported by the OS */ ice_disable_unsupported_features(sc->feat_cap); /* RSS is always enabled for iflib */ if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_RSS)) ice_set_bit(ICE_FEATURE_RSS, sc->feat_en); /* Disable features based on sysctl settings */ if (!ice_tx_balance_en) ice_clear_bit(ICE_FEATURE_TX_BALANCE, sc->feat_cap); } /** * ice_if_multi_set - Callback to update Multicast filters in HW * @ctx: iflib ctx structure * * Called by iflib in response to SIOCDELMULTI and SIOCADDMULTI. Must search * the if_multiaddrs list and determine which filters have been added or * removed from the list, and update HW programming to reflect the new list. 
* * @pre assumes the caller holds the iflib CTX lock */ static void ice_if_multi_set(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); int err; ASSERT_CTX_LOCKED(sc); /* Do not handle multicast configuration in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return; err = ice_sync_multicast_filters(sc); if (err) { device_printf(sc->dev, "Failed to synchronize multicast filter list: %s\n", ice_err_str(err)); return; } } /** * ice_if_vlan_register - Register a VLAN with the hardware * @ctx: iflib ctx pointer * @vtag: VLAN to add * * Programs the main PF VSI with a hardware filter for the given VLAN. * * @pre assumes the caller holds the iflib CTX lock */ static void ice_if_vlan_register(if_ctx_t ctx, u16 vtag) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); enum ice_status status; ASSERT_CTX_LOCKED(sc); /* Do not handle VLAN configuration in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return; status = ice_add_vlan_hw_filter(&sc->pf_vsi, vtag); if (status) { device_printf(sc->dev, "Failure adding VLAN %d to main VSI, err %s aq_err %s\n", vtag, ice_status_str(status), ice_aq_str(sc->hw.adminq.sq_last_status)); } } /** * ice_if_vlan_unregister - Remove a VLAN filter from the hardware * @ctx: iflib ctx pointer * @vtag: VLAN to remove * * Removes the previously programmed VLAN filter from the main PF VSI. * * @pre assumes the caller holds the iflib CTX lock */ static void ice_if_vlan_unregister(if_ctx_t ctx, u16 vtag) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); enum ice_status status; ASSERT_CTX_LOCKED(sc); /* Do not handle VLAN configuration in recovery mode */ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return; status = ice_remove_vlan_hw_filter(&sc->pf_vsi, vtag); if (status) { device_printf(sc->dev, "Failure removing VLAN %d from main VSI, err %s aq_err %s\n", vtag, ice_status_str(status), ice_aq_str(sc->hw.adminq.sq_last_status)); } } /** * ice_if_stop - Stop the device * @ctx: iflib context structure * * Called by iflib to stop the device and bring it down. (i.e. ifconfig ice0 * down) * * @pre assumes the caller holds the iflib CTX lock */ static void ice_if_stop(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); ASSERT_CTX_LOCKED(sc); /* * The iflib core may call IFDI_STOP prior to the first call to * IFDI_INIT. This will cause us to attempt to remove MAC filters we * don't have, and disable Tx queues which aren't yet configured. * Although it is likely these extra operations are harmless, they do * cause spurious warning messages to be displayed, which may confuse * users. * * To avoid these messages, we use a state bit indicating if we've * been initialized. It will be set when ice_if_init is called, and * cleared here in ice_if_stop. */ if (!ice_testandclear_state(&sc->state, ICE_STATE_DRIVER_INITIALIZED)) return; if (ice_test_state(&sc->state, ICE_STATE_RESET_FAILED)) { device_printf(sc->dev, "request to stop interface cannot be completed as the device failed to reset\n"); return; } if (ice_test_state(&sc->state, ICE_STATE_PREPARED_FOR_RESET)) { device_printf(sc->dev, "request to stop interface while device is prepared for impending reset\n"); return; } ice_rdma_pf_stop(sc); /* Remove the MAC filters, stop Tx, and stop Rx. We don't check the * return of these functions because there's nothing we can really do * if they fail, and the functions already print error messages. * Just try to shut down as much as we can.
*/ ice_rm_pf_default_mac_filters(sc); /* Dissociate the Tx and Rx queues from the interrupts */ ice_flush_txq_interrupts(&sc->pf_vsi); ice_flush_rxq_interrupts(&sc->pf_vsi); /* Disable the Tx and Rx queues */ ice_vsi_disable_tx(&sc->pf_vsi); ice_control_all_rx_queues(&sc->pf_vsi, false); } /** * ice_if_get_counter - Get current value of an ifnet statistic * @ctx: iflib context pointer * @counter: ifnet counter to read * * Reads the current value of an ifnet counter for the device. * * This function is not protected by the iflib CTX lock. */ static uint64_t ice_if_get_counter(if_ctx_t ctx, ift_counter counter) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); /* Return the counter for the main PF VSI */ return ice_get_ifnet_counter(&sc->pf_vsi, counter); } /** * ice_request_stack_reinit - Request that iflib re-initialize * @sc: the device private softc * * Request that the device be brought down and up, to re-initialize. For * example, this may be called when a device reset occurs, or when Tx and Rx * queues need to be re-initialized. * * This is required because the iflib state is outside the driver, and must be * re-initialized if we need to restart Tx and Rx queues. */ void ice_request_stack_reinit(struct ice_softc *sc) { if (CTX_ACTIVE(sc->ctx)) { iflib_request_reset(sc->ctx); iflib_admin_intr_deferred(sc->ctx); } } /** * ice_driver_is_detaching - Check if the driver is detaching/unloading * @sc: device private softc * * Returns true if the driver is detaching, false otherwise. * * @remark on newer kernels, take advantage of iflib_in_detach in order to * report detachment correctly as early as possible. * * @remark this function is used by various code paths that want to avoid * running if the driver is about to be removed. This includes sysctls and * other driver access points. Note that it does not fully resolve * detach-based race conditions as it is possible for a thread to race with * iflib_in_detach. */ bool ice_driver_is_detaching(struct ice_softc *sc) { return (ice_test_state(&sc->state, ICE_STATE_DETACHING) || iflib_in_detach(sc->ctx)); } /** * ice_if_priv_ioctl - Device private ioctl handler * @ctx: iflib context pointer * @command: The ioctl command issued * @data: ioctl specific data * * iflib callback for handling custom driver specific ioctls. * * @pre Assumes that the iflib context lock is held. */ static int ice_if_priv_ioctl(if_ctx_t ctx, u_long command, caddr_t data) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); struct ifdrv *ifd; device_t dev = sc->dev; if (data == NULL) return (EINVAL); ASSERT_CTX_LOCKED(sc); /* Make sure the command type is valid */ switch (command) { case SIOCSDRVSPEC: case SIOCGDRVSPEC: /* Accepted commands */ break; case SIOCGPRIVATE_0: /* * Although we do not support this ioctl command, it's * expected that iflib will forward it to the IFDI_PRIV_IOCTL * handler. Do not print a message in this case */ return (ENOTSUP); default: /* * If we get a different command for this function, it's * definitely unexpected, so log a message indicating what * command we got for debugging purposes.
*/ device_printf(dev, "%s: unexpected ioctl command %08lx\n", __func__, command); return (EINVAL); } ifd = (struct ifdrv *)data; switch (ifd->ifd_cmd) { case ICE_NVM_ACCESS: return ice_handle_nvm_access_ioctl(sc, ifd); case ICE_DEBUG_DUMP: return ice_handle_debug_dump_ioctl(sc, ifd); default: return EINVAL; } } /** * ice_if_i2c_req - I2C request handler for iflib * @ctx: iflib context pointer * @req: The I2C parameters to use * * Read from the port's I2C eeprom using the parameters from the ioctl. * * @remark The iflib-only part is pretty simple. */ static int ice_if_i2c_req(if_ctx_t ctx, struct ifi2creq *req) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); return ice_handle_i2c_req(sc, req); } /** * ice_if_suspend - PCI device suspend handler for iflib * @ctx: iflib context pointer * * Deinitializes the driver and clears HW resources in preparation for * suspend or an FLR. * * @returns 0; this return value is ignored */ static int ice_if_suspend(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); /* At least a PFR is always going to happen after this; * either via FLR or during the D3->D0 transition. */ ice_clear_state(&sc->state, ICE_STATE_RESET_PFR_REQ); ice_prepare_for_reset(sc); return (0); } /** * ice_if_resume - PCI device resume handler for iflib * @ctx: iflib context pointer * * Reinitializes the driver and the HW after PCI resume or after * an FLR. An init is performed by iflib after this function is finished. * * @returns 0; this return value is ignored */ static int ice_if_resume(if_ctx_t ctx) { struct ice_softc *sc = (struct ice_softc *)iflib_get_softc(ctx); ice_rebuild(sc); return (0); } /* ice_if_needs_restart - Tell iflib when the driver needs to be reinitialized * @ctx: iflib context * @event: event code to check * * Defaults to returning false for unknown events. * * @returns true if iflib needs to reinit the interface */ static bool ice_if_needs_restart(if_ctx_t ctx __unused, enum iflib_restart_event event) { switch (event) { case IFLIB_RESTART_VLAN_CONFIG: default: return (false); } } diff --git a/sys/dev/irdma/icrdma.c b/sys/dev/irdma/icrdma.c index 848f8126e57c..8efd9d5c6034 100644 --- a/sys/dev/irdma/icrdma.c +++ b/sys/dev/irdma/icrdma.c @@ -1,820 +1,816 @@ /*- * SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB * * Copyright (c) 2021 - 2023 Intel Corporation * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenFabrics.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include "ice_rdma.h" #include "irdma_main.h" #include "icrdma_hw.h" #include "irdma_if.h" #include "irdma_di_if.h" /** * Driver version */ char irdma_driver_version[] = "1.2.17-k"; /** * irdma_init_tunable - prepare tunables * @rf: RDMA PCI function * @pf_id: id of the pf */ static void irdma_init_tunable(struct irdma_pci_f *rf, uint8_t pf_id) { struct sysctl_oid_list *irdma_oid_list; struct irdma_tunable_info *t_info = &rf->tun_info; char pf_name[16]; snprintf(pf_name, 15, "irdma%d", pf_id); sysctl_ctx_init(&t_info->irdma_sysctl_ctx); t_info->irdma_sysctl_tree = SYSCTL_ADD_NODE(&t_info->irdma_sysctl_ctx, SYSCTL_STATIC_CHILDREN(_dev), OID_AUTO, pf_name, CTLFLAG_RD, NULL, ""); irdma_oid_list = SYSCTL_CHILDREN(t_info->irdma_sysctl_tree); t_info->sws_sysctl_tree = SYSCTL_ADD_NODE(&t_info->irdma_sysctl_ctx, irdma_oid_list, OID_AUTO, "sw_stats", CTLFLAG_RD, NULL, ""); /* * debug mask setting */ SYSCTL_ADD_S32(&t_info->irdma_sysctl_ctx, irdma_oid_list, OID_AUTO, "debug", CTLFLAG_RWTUN, &rf->sc_dev.debug_mask, 0, "irdma debug"); /* * RoCEv2/iWARP setting RoCEv2 the default mode */ t_info->roce_ena = 1; SYSCTL_ADD_U8(&t_info->irdma_sysctl_ctx, irdma_oid_list, OID_AUTO, "roce_enable", CTLFLAG_RDTUN, &t_info->roce_ena, 0, "RoCEv2 mode enable"); rf->protocol_used = IRDMA_IWARP_PROTOCOL_ONLY; if (t_info->roce_ena == 1) rf->protocol_used = IRDMA_ROCE_PROTOCOL_ONLY; else if (t_info->roce_ena != 0) printf("%s:%d wrong roce_enable value (%d), using iWARP\n", __func__, __LINE__, t_info->roce_ena); printf("%s:%d protocol: %s, roce_enable value: %d\n", __func__, __LINE__, (rf->protocol_used == IRDMA_IWARP_PROTOCOL_ONLY) ? 
"iWARP" : "RoCEv2", t_info->roce_ena); snprintf(t_info->drv_ver, IRDMA_VER_LEN, "%s", irdma_driver_version); SYSCTL_ADD_STRING(&t_info->irdma_sysctl_ctx, irdma_oid_list, OID_AUTO, "drv_ver", CTLFLAG_RDTUN, t_info->drv_ver, IRDMA_VER_LEN, "driver version"); irdma_dcqcn_tunables_init(rf); irdma_sysctl_settings(rf); } /** * irdma_find_handler - obtain hdl object to identify pf * @p_dev: the peer interface structure */ static struct irdma_handler * irdma_find_handler(struct ice_rdma_peer *p_dev) { struct irdma_handler *hdl; unsigned long flags; spin_lock_irqsave(&irdma_handler_lock, flags); list_for_each_entry(hdl, &irdma_handlers, list) { if (!hdl->iwdev->rf->peer_info) continue; if (hdl->iwdev->rf->peer_info->dev == p_dev->dev) { spin_unlock_irqrestore(&irdma_handler_lock, flags); return hdl; } } spin_unlock_irqrestore(&irdma_handler_lock, flags); return NULL; } /** * peer_to_iwdev - return iwdev based on peer * @peer: the peer interface structure */ static struct irdma_device * peer_to_iwdev(struct ice_rdma_peer *peer) { struct irdma_handler *hdl; hdl = irdma_find_handler(peer); if (!hdl) { printf("%s:%d rdma handler not found\n", __func__, __LINE__); return NULL; } return hdl->iwdev; } /** * irdma_get_qos_info - save qos info from parameters to internal struct * @l2params: destination, qos, tc, mtu info structure * @qos_info: source, DCB settings structure */ static void irdma_get_qos_info(struct irdma_pci_f *rf, struct irdma_l2params *l2params, struct ice_qos_params *qos_info) { int i; char txt[7][128] = {"", "", "", "", "", "", ""}; u8 len; l2params->num_tc = qos_info->num_tc; l2params->num_apps = qos_info->num_apps; l2params->vsi_prio_type = qos_info->vsi_priority_type; l2params->vsi_rel_bw = qos_info->vsi_relative_bw; for (i = 0; i < l2params->num_tc; i++) { l2params->tc_info[i].egress_virt_up = qos_info->tc_info[i].egress_virt_up; l2params->tc_info[i].ingress_virt_up = qos_info->tc_info[i].ingress_virt_up; l2params->tc_info[i].prio_type = qos_info->tc_info[i].prio_type; l2params->tc_info[i].rel_bw = qos_info->tc_info[i].rel_bw; l2params->tc_info[i].tc_ctx = qos_info->tc_info[i].tc_ctx; } for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) l2params->up2tc[i] = qos_info->up2tc[i]; if (qos_info->pfc_mode == IRDMA_QOS_MODE_DSCP) { l2params->dscp_mode = true; memcpy(l2params->dscp_map, qos_info->dscp_map, sizeof(l2params->dscp_map)); } if (!(rf->sc_dev.debug_mask & IRDMA_DEBUG_DCB)) return; for (i = 0; i < l2params->num_tc; i++) { len = strlen(txt[0]); snprintf(txt[0] + len, sizeof(txt[0]) - 5, " %d", l2params->tc_info[i].egress_virt_up); len = strlen(txt[1]); snprintf(txt[1] + len, sizeof(txt[1]) - 5, " %d", l2params->tc_info[i].ingress_virt_up); len = strlen(txt[2]); snprintf(txt[2] + len, sizeof(txt[2]) - 5, " %d", l2params->tc_info[i].prio_type); len = strlen(txt[3]); snprintf(txt[3] + len, sizeof(txt[3]) - 5, " %d", l2params->tc_info[i].rel_bw); len = strlen(txt[4]); snprintf(txt[4] + len, sizeof(txt[4]) - 5, " %lu", l2params->tc_info[i].tc_ctx); } len = strlen(txt[5]); for (i = 0; i < IRDMA_MAX_USER_PRIORITY; i++) len += snprintf(txt[5] + len, sizeof(txt[5]) - 5, " %d", l2params->up2tc[i]); len = strlen(txt[6]); for (i = 0; i < IRDMA_DSCP_NUM_VAL; i++) len += snprintf(txt[6] + len, sizeof(txt[6]) - 5, " %d", l2params->dscp_map[i]); irdma_debug(&rf->sc_dev, IRDMA_DEBUG_DCB, "num_tc: %d\n", l2params->num_tc); irdma_debug(&rf->sc_dev, IRDMA_DEBUG_DCB, "num_apps: %d\n", l2params->num_apps); irdma_debug(&rf->sc_dev, IRDMA_DEBUG_DCB, "vsi_prio_type: %d\n", l2params->vsi_prio_type); 
irdma_debug(&rf->sc_dev, IRDMA_DEBUG_DCB, "vsi_rel_bw: %d\n", l2params->vsi_rel_bw); irdma_debug(&rf->sc_dev, IRDMA_DEBUG_DCB, "egress_virt_up: %s\n", txt[0]); irdma_debug(&rf->sc_dev, IRDMA_DEBUG_DCB, "ingress_virt_up:%s\n", txt[1]); irdma_debug(&rf->sc_dev, IRDMA_DEBUG_DCB, "prio_type: %s\n", txt[2]); irdma_debug(&rf->sc_dev, IRDMA_DEBUG_DCB, "rel_bw: %s\n", txt[3]); irdma_debug(&rf->sc_dev, IRDMA_DEBUG_DCB, "tc_ctx: %s\n", txt[4]); irdma_debug(&rf->sc_dev, IRDMA_DEBUG_DCB, "up2tc: %s\n", txt[5]); irdma_debug(&rf->sc_dev, IRDMA_DEBUG_DCB, "dscp_mode: %s\n", txt[6]); irdma_debug_buf(&rf->sc_dev, IRDMA_DEBUG_DCB, "l2params", l2params, sizeof(*l2params)); } /** * irdma_log_invalid_mtu - check mtu setting validity * @mtu: mtu value * @dev: hardware control device structure */ static void irdma_log_invalid_mtu(u16 mtu, struct irdma_sc_dev *dev) { if (mtu < IRDMA_MIN_MTU_IPV4) irdma_dev_warn(to_ibdev(dev), "MTU setting [%d] too low for RDMA traffic. Minimum MTU is 576 for IPv4\n", mtu); else if (mtu < IRDMA_MIN_MTU_IPV6) irdma_dev_warn(to_ibdev(dev), "MTU setting [%d] too low for RDMA traffic. Minimum MTU is 1280 for IPv6\n", mtu); } /** * irdma_get_event_name - convert type enum to string * @type: event type enum */ static const char * irdma_get_event_name(enum ice_rdma_event_type type) { switch (type) { case ICE_RDMA_EVENT_LINK_CHANGE: return "LINK CHANGE"; case ICE_RDMA_EVENT_MTU_CHANGE: return "MTU CHANGE"; case ICE_RDMA_EVENT_TC_CHANGE: return "TC CHANGE"; case ICE_RDMA_EVENT_API_CHANGE: return "API CHANGE"; case ICE_RDMA_EVENT_CRIT_ERR: return "CRITICAL ERROR"; case ICE_RDMA_EVENT_RESET: return "RESET"; case ICE_RDMA_EVENT_QSET_REGISTER: return "QSET REGISTER"; case ICE_RDMA_EVENT_VSI_FILTER_UPDATE: return "VSI FILTER UPDATE"; default: return "UNKNOWN"; } } /** * irdma_event_handler - handling events from lan driver * @peer: the peer interface structure * @event: event info structure */ static void irdma_event_handler(struct ice_rdma_peer *peer, struct ice_rdma_event *event) { struct irdma_device *iwdev; struct irdma_l2params l2params = {}; printf("%s:%d event_handler %s (%x) on pf %d (%d)\n", __func__, __LINE__, irdma_get_event_name(event->type), event->type, peer->pf_id, if_getdunit(peer->ifp)); iwdev = peer_to_iwdev(peer); if (!iwdev) { printf("%s:%d rdma device not found\n", __func__, __LINE__); return; } switch (event->type) { case ICE_RDMA_EVENT_LINK_CHANGE: printf("%s:%d PF: %x (%x), state: %d, speed: %lu\n", __func__, __LINE__, peer->pf_id, if_getdunit(peer->ifp), event->linkstate, event->baudrate); break; case ICE_RDMA_EVENT_MTU_CHANGE: if (iwdev->vsi.mtu != event->mtu) { l2params.mtu = event->mtu; l2params.mtu_changed = true; irdma_log_invalid_mtu(l2params.mtu, &iwdev->rf->sc_dev); irdma_change_l2params(&iwdev->vsi, &l2params); } break; case ICE_RDMA_EVENT_TC_CHANGE: /* * 1. check if it is pre or post 2. check if it is currently being done */ if (event->prep == iwdev->vsi.tc_change_pending) { printf("%s:%d can't process %s TC change if TC change is %spending\n", __func__, __LINE__, event->prep ? "pre" : "post", event->prep ? 
" " : "not "); goto done; } if (!atomic_inc_not_zero(&iwdev->rf->dev_ctx.event_rfcnt)) { printf("%s:%d (%d) EVENT_TC_CHANGE received, but not processed %d\n", __func__, __LINE__, if_getdunit(peer->ifp), atomic_read(&iwdev->rf->dev_ctx.event_rfcnt)); break; } if (event->prep) { iwdev->vsi.tc_change_pending = true; irdma_sc_suspend_resume_qps(&iwdev->vsi, IRDMA_OP_SUSPEND); wait_event_timeout(iwdev->suspend_wq, !atomic_read(&iwdev->vsi.qp_suspend_reqs), IRDMA_EVENT_TIMEOUT_MS * 10); irdma_ws_reset(&iwdev->vsi); printf("%s:%d TC change preparation done\n", __func__, __LINE__); } else { l2params.tc_changed = true; irdma_get_qos_info(iwdev->rf, &l2params, &event->port_qos); if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) iwdev->dcb_vlan_mode = l2params.num_tc > 1 && !l2params.dscp_mode; irdma_check_fc_for_tc_update(&iwdev->vsi, &l2params); irdma_change_l2params(&iwdev->vsi, &l2params); printf("%s:%d TC change done\n", __func__, __LINE__); } atomic_dec(&iwdev->rf->dev_ctx.event_rfcnt); break; case ICE_RDMA_EVENT_CRIT_ERR: -#ifdef EVNT_HNDLR_CRITERR if (event->oicr_reg & IRDMAPFINT_OICR_PE_CRITERR_M) { u32 pe_criterr; #define IRDMA_Q1_RESOURCE_ERR 0x0001024d pe_criterr = readl(iwdev->rf->sc_dev.hw_regs[IRDMA_GLPE_CRITERR]); if (pe_criterr != IRDMA_Q1_RESOURCE_ERR) { irdma_pr_err("critical PE Error, GLPE_CRITERR=0x%08x\n", pe_criterr); iwdev->rf->reset = true; } else { irdma_dev_warn(to_ibdev(&iwdev->rf->sc_dev), "Q1 Resource Check\n"); } } if (event->oicr_reg & IRDMAPFINT_OICR_HMC_ERR_M) { irdma_pr_err("HMC Error\n"); iwdev->rf->reset = true; } if (iwdev->rf->reset) iwdev->rf->gen_ops.request_reset(iwdev->rf); -#else - printf("%s:%d event type received: %d\n", __func__, __LINE__, event->type); -#endif break; case ICE_RDMA_EVENT_RESET: iwdev->rf->reset = true; break; default: printf("%s:%d event type unsupported: %d\n", __func__, __LINE__, event->type); } done: return; } /** * irdma_link_change - Callback for link state change * @peer: the peer interface structure * @linkstate: state of the link * @baudrate: speed of the link */ static void irdma_link_change(struct ice_rdma_peer *peer, int linkstate, uint64_t baudrate) { printf("%s:%d PF: %x (%x), state: %d, speed: %lu\n", __func__, __LINE__, peer->pf_id, if_getdunit(peer->ifp), linkstate, baudrate); } /** * irdma_finalize_task - Finish open or close phase in a separate thread * @context: instance holding peer and iwdev information * * Triggered from irdma_open or irdma_close to perform rt_init_hw or * rt_deinit_hw respectively. Does registration and unregistration of * the device. 
*/ static void irdma_finalize_task(void *context, int pending) { struct irdma_task_arg *task_arg = (struct irdma_task_arg *)context; struct irdma_device *iwdev = task_arg->iwdev; struct irdma_pci_f *rf = iwdev->rf; struct ice_rdma_peer *peer = task_arg->peer; struct irdma_l2params l2params = {{{0}}}; struct ice_rdma_request req = {0}; int status = 0; if (iwdev->iw_status) { irdma_debug(&rf->sc_dev, IRDMA_DEBUG_INIT, "Starting deferred closing %d (%d)\n", rf->peer_info->pf_id, if_getdunit(peer->ifp)); atomic_dec(&rf->dev_ctx.event_rfcnt); wait_event_timeout(iwdev->suspend_wq, !atomic_read(&rf->dev_ctx.event_rfcnt), IRDMA_MAX_TIMEOUT); if (atomic_read(&rf->dev_ctx.event_rfcnt) != 0) { printf("%s:%d (%d) waiting for event_rfcnt (%d) timeout, proceed with unload\n", __func__, __LINE__, if_getdunit(peer->ifp), atomic_read(&rf->dev_ctx.event_rfcnt)); } irdma_dereg_ipaddr_event_cb(rf); irdma_ib_unregister_device(iwdev); req.type = ICE_RDMA_EVENT_VSI_FILTER_UPDATE; req.enable_filter = false; IRDMA_DI_REQ_HANDLER(peer, &req); irdma_cleanup_dead_qps(&iwdev->vsi); irdma_rt_deinit_hw(iwdev); } else { irdma_debug(&rf->sc_dev, IRDMA_DEBUG_INIT, "Starting deferred opening %d (%d)\n", rf->peer_info->pf_id, if_getdunit(peer->ifp)); irdma_get_qos_info(iwdev->rf, &l2params, &peer->initial_qos_info); if (iwdev->rf->protocol_used != IRDMA_IWARP_PROTOCOL_ONLY) iwdev->dcb_vlan_mode = l2params.num_tc > 1 && !l2params.dscp_mode; l2params.mtu = peer->mtu; status = irdma_rt_init_hw(iwdev, &l2params); if (status) { irdma_pr_err("RT init failed %d\n", status); ib_dealloc_device(&iwdev->ibdev); return; } status = irdma_ib_register_device(iwdev); if (status) { irdma_pr_err("Registration failed %d\n", status); irdma_rt_deinit_hw(iwdev); ib_dealloc_device(&iwdev->ibdev); } irdma_sw_stats_tunables_init(rf); req.type = ICE_RDMA_EVENT_VSI_FILTER_UPDATE; req.enable_filter = true; IRDMA_DI_REQ_HANDLER(peer, &req); irdma_reg_ipaddr_event_cb(rf); atomic_inc(&rf->dev_ctx.event_rfcnt); irdma_debug(&rf->sc_dev, IRDMA_DEBUG_INIT, "Deferred opening finished %d (%d)\n", rf->peer_info->pf_id, if_getdunit(peer->ifp)); } } /** * irdma_alloc_pcidev - allocate memory for pcidev and populate data * @peer: the new peer interface structure * @rf: RDMA PCI function */ static int irdma_alloc_pcidev(struct ice_rdma_peer *peer, struct irdma_pci_f *rf) { rf->pcidev = kzalloc(sizeof(struct pci_dev), GFP_KERNEL); if (!rf->pcidev) { return -ENOMEM; } if (linux_pci_attach_device(rf->dev_ctx.dev, NULL, NULL, rf->pcidev)) return -ENOMEM; return 0; } /** * irdma_dealloc_pcidev - deallocate memory for pcidev * @rf: RDMA PCI function */ static void irdma_dealloc_pcidev(struct irdma_pci_f *rf) { linux_pci_detach_device(rf->pcidev); kfree(rf->pcidev); } /** * irdma_fill_device_info - assign initial values to rf variables * @iwdev: irdma device * @peer: the peer interface structure */ static void irdma_fill_device_info(struct irdma_device *iwdev, struct ice_rdma_peer *peer) { struct irdma_pci_f *rf = iwdev->rf; rf->peer_info = peer; rf->gen_ops.register_qset = irdma_register_qset; rf->gen_ops.unregister_qset = irdma_unregister_qset; rf->rdma_ver = IRDMA_GEN_2; rf->sc_dev.hw_attrs.uk_attrs.hw_rev = IRDMA_GEN_2; rf->rsrc_profile = IRDMA_HMC_PROFILE_DEFAULT; rf->rst_to = IRDMA_RST_TIMEOUT_HZ; rf->check_fc = irdma_check_fc_for_qp; rf->gen_ops.request_reset = irdma_request_reset; irdma_set_rf_user_cfg_params(rf); rf->default_vsi.vsi_idx = peer->pf_vsi_num; rf->dev_ctx.dev = peer->dev; rf->dev_ctx.mem_bus_space_tag = rman_get_bustag(peer->pci_mem); 
rf->dev_ctx.mem_bus_space_handle = rman_get_bushandle(peer->pci_mem); rf->dev_ctx.mem_bus_space_size = rman_get_size(peer->pci_mem); rf->hw.dev_context = &rf->dev_ctx; rf->hw.hw_addr = (u8 *)rman_get_virtual(peer->pci_mem); rf->msix_count = peer->msix.count; rf->msix_info.entry = peer->msix.base; rf->msix_info.vector = peer->msix.count; printf("%s:%d msix_info: %d %d %d\n", __func__, __LINE__, rf->msix_count, rf->msix_info.entry, rf->msix_info.vector); rf->iwdev = iwdev; iwdev->netdev = peer->ifp; iwdev->init_state = INITIAL_STATE; iwdev->vsi_num = peer->pf_vsi_num; iwdev->rcv_wnd = IRDMA_CM_DEFAULT_RCV_WND_SCALED; iwdev->rcv_wscale = IRDMA_CM_DEFAULT_RCV_WND_SCALE; iwdev->roce_cwnd = IRDMA_ROCE_CWND_DEFAULT; iwdev->roce_ackcreds = IRDMA_ROCE_ACKCREDS_DEFAULT; iwdev->roce_rtomin = 5; if (rf->protocol_used == IRDMA_ROCE_PROTOCOL_ONLY) { iwdev->roce_mode = true; } } /** * irdma_probe - Callback to probe a new RDMA peer device * @peer: the new peer interface structure * * Callback implementing the RDMA_PROBE function. Called by the ice driver to * notify the RDMA client driver that a new device has been created */ static int irdma_probe(struct ice_rdma_peer *peer) { struct irdma_device *iwdev; struct irdma_pci_f *rf; struct irdma_handler *hdl; int err = 0; irdma_pr_info("probe: irdma-%s peer=%p, peer->pf_id=%d, peer->ifp=%p, peer->ifp->if_dunit=%d, peer->pci_mem->r_bustag=%p\n", irdma_driver_version, peer, peer->pf_id, peer->ifp, if_getdunit(peer->ifp), (void *)(uintptr_t)peer->pci_mem->r_bustag); hdl = irdma_find_handler(peer); if (hdl) return -EBUSY; hdl = kzalloc(sizeof(*hdl), GFP_KERNEL); if (!hdl) return -ENOMEM; iwdev = (struct irdma_device *)ib_alloc_device(sizeof(*iwdev)); if (!iwdev) { kfree(hdl); return -ENOMEM; } iwdev->rf = kzalloc(sizeof(*rf), GFP_KERNEL); if (!iwdev->rf) { ib_dealloc_device(&iwdev->ibdev); kfree(hdl); return -ENOMEM; } hdl->iwdev = iwdev; iwdev->hdl = hdl; irdma_init_tunable(iwdev->rf, if_getdunit(peer->ifp)); irdma_fill_device_info(iwdev, peer); rf = iwdev->rf; if (irdma_alloc_pcidev(peer, rf)) goto err_pcidev; irdma_add_handler(hdl); if (irdma_ctrl_init_hw(rf)) { err = -EIO; goto err_ctrl_init; } rf->dev_ctx.task_arg.peer = peer; rf->dev_ctx.task_arg.iwdev = iwdev; rf->dev_ctx.task_arg.peer = peer; TASK_INIT(&hdl->deferred_task, 0, irdma_finalize_task, &rf->dev_ctx.task_arg); hdl->deferred_tq = taskqueue_create_fast("irdma_defer", M_NOWAIT, taskqueue_thread_enqueue, &hdl->deferred_tq); taskqueue_start_threads(&hdl->deferred_tq, 1, PI_NET, "irdma_defer_t"); taskqueue_enqueue(hdl->deferred_tq, &hdl->deferred_task); return 0; err_ctrl_init: irdma_del_handler(hdl); irdma_dealloc_pcidev(rf); err_pcidev: kfree(iwdev->rf); ib_dealloc_device(&iwdev->ibdev); kfree(hdl); return err; } /** * irdma_remove - Callback to remove an RDMA peer device * @peer: the new peer interface structure * * Callback implementing the RDMA_REMOVE function. 
Called by the ice driver to * notify the RDMA client driver that the device will be deleted */ static int irdma_remove(struct ice_rdma_peer *peer) { struct irdma_handler *hdl; struct irdma_device *iwdev; irdma_debug((struct irdma_sc_dev *)NULL, IRDMA_DEBUG_INIT, "removing %s irdma%d\n", __func__, if_getdunit(peer->ifp)); hdl = irdma_find_handler(peer); if (!hdl) return 0; iwdev = hdl->iwdev; if (iwdev->vsi.tc_change_pending) { iwdev->vsi.tc_change_pending = false; irdma_sc_suspend_resume_qps(&iwdev->vsi, IRDMA_OP_RESUME); } taskqueue_enqueue(hdl->deferred_tq, &hdl->deferred_task); taskqueue_drain(hdl->deferred_tq, &hdl->deferred_task); taskqueue_free(hdl->deferred_tq); hdl->iwdev->rf->dev_ctx.task_arg.iwdev = NULL; hdl->iwdev->rf->dev_ctx.task_arg.peer = NULL; sysctl_ctx_free(&iwdev->rf->tun_info.irdma_sysctl_ctx); hdl->iwdev->rf->tun_info.irdma_sysctl_tree = NULL; hdl->iwdev->rf->tun_info.sws_sysctl_tree = NULL; irdma_ctrl_deinit_hw(iwdev->rf); irdma_dealloc_pcidev(iwdev->rf); irdma_del_handler(iwdev->hdl); kfree(iwdev->hdl); kfree(iwdev->rf); ib_dealloc_device(&iwdev->ibdev); irdma_pr_info("IRDMA hardware deinitialization complete irdma%d\n", if_getdunit(peer->ifp)); return 0; } /** * irdma_open - Callback for operation open for RDMA device * @peer: the new peer interface structure * * Callback implementing the RDMA_OPEN function. Called by the ice driver to * notify the RDMA client driver that a new device has been initialized. */ static int irdma_open(struct ice_rdma_peer *peer) { struct irdma_device *iwdev; struct ice_rdma_event event = {0}; iwdev = peer_to_iwdev(peer); if (iwdev) { event.type = ICE_RDMA_EVENT_MTU_CHANGE; event.mtu = peer->mtu; irdma_event_handler(peer, &event); } else { irdma_probe(peer); } return 0; } /** * irdma_close - Callback to notify that a peer device is down * @peer: the RDMA peer device being stopped * * Callback implementing the RDMA_CLOSE function. Called by the ice driver to * notify the RDMA client driver that a peer device is being stopped. */ static int irdma_close(struct ice_rdma_peer *peer) { /* * This is called when ifconfig down or pf-reset is about to happen.
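 * Only a pending reset (rf->reset) leads to a full teardown via irdma_remove();
 * a plain interface down leaves the RDMA device registered.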
*/ struct irdma_device *iwdev; iwdev = peer_to_iwdev(peer); if (iwdev && iwdev->rf->reset) irdma_remove(peer); return 0; } /** * irdma_prep_for_unregister - ensure the driver is ready to unregister */ static void irdma_prep_for_unregister(void) { struct irdma_handler *hdl; unsigned long flags; bool hdl_valid; do { hdl_valid = false; spin_lock_irqsave(&irdma_handler_lock, flags); list_for_each_entry(hdl, &irdma_handlers, list) { if (!hdl->iwdev->rf->peer_info) continue; hdl_valid = true; break; } spin_unlock_irqrestore(&irdma_handler_lock, flags); if (!hdl || !hdl_valid) break; IRDMA_CLOSE(hdl->iwdev->rf->peer_info); IRDMA_REMOVE(hdl->iwdev->rf->peer_info); } while (1); } static kobj_method_t irdma_methods[] = { KOBJMETHOD(irdma_probe, irdma_probe), KOBJMETHOD(irdma_open, irdma_open), KOBJMETHOD(irdma_close, irdma_close), KOBJMETHOD(irdma_remove, irdma_remove), KOBJMETHOD(irdma_link_change, irdma_link_change), KOBJMETHOD(irdma_event_handler, irdma_event_handler), KOBJMETHOD_END }; /* declare irdma_class which extends the ice_rdma_di class */ DEFINE_CLASS_1(irdma, irdma_class, irdma_methods, sizeof(struct ice_rdma_peer), ice_rdma_di_class); static struct ice_rdma_info irdma_info = { .major_version = ICE_RDMA_MAJOR_VERSION, .minor_version = ICE_RDMA_MINOR_VERSION, .patch_version = ICE_RDMA_PATCH_VERSION, .rdma_class = &irdma_class, }; /** * irdma_module_event_handler - Module event handler callback * @mod: unused mod argument * @what: the module event to handle * @arg: unused module event argument * * Callback used by the FreeBSD module stack to notify the driver of module * events. Used to implement custom handling for certain module events such as * load and unload. */ static int irdma_module_event_handler(module_t __unused mod, int what, void __unused * arg) { switch (what) { case MOD_LOAD: printf("Loading irdma module\n"); return ice_rdma_register(&irdma_info); case MOD_UNLOAD: printf("Unloading irdma module\n"); irdma_prep_for_unregister(); ice_rdma_unregister(); return (0); default: return (EOPNOTSUPP); } return (0); } static moduledata_t irdma_moduledata = { "irdma", irdma_module_event_handler, NULL }; DECLARE_MODULE(irdma, irdma_moduledata, SI_SUB_LAST, SI_ORDER_ANY); MODULE_VERSION(irdma, 1); MODULE_DEPEND(irdma, ice, 1, 1, 1); MODULE_DEPEND(irdma, ibcore, 1, 1, 1);