Index: sys/conf/files.amd64 =================================================================== --- sys/conf/files.amd64 +++ sys/conf/files.amd64 @@ -173,6 +173,12 @@ compile-with "${NORMAL_C} -I$S/dev/ice" dev/ice/ice_fwlog.c optional ice pci \ compile-with "${NORMAL_C} -I$S/dev/ice" +dev/ice/ice_rdma.c optional ice pci \ + compile-with "${NORMAL_C} -I$S/dev/ice" +dev/ice/irdma_if.m optional ice pci \ + compile-with "${NORMAL_M} -I$S/dev/ice" +dev/ice/irdma_di_if.m optional ice pci \ + compile-with "${NORMAL_M} -I$S/dev/ice" ice_ddp.c optional ice_ddp \ compile-with "${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01031b00 -mice_ddp -c${.TARGET}" \ no-ctfconvert no-implicit-rule before-depend local \ Index: sys/conf/files.arm64 =================================================================== --- sys/conf/files.arm64 +++ sys/conf/files.arm64 @@ -230,6 +230,12 @@ compile-with "${NORMAL_C} -I$S/dev/ice" dev/ice/ice_fwlog.c optional ice pci \ compile-with "${NORMAL_C} -I$S/dev/ice" +dev/ice/ice_rdma.c optional ice pci \ + compile-with "${NORMAL_C} -I$S/dev/ice" +dev/ice/irdma_if.m optional ice pci \ + compile-with "${NORMAL_M} -I$S/dev/ice" +dev/ice/irdma_di_if.m optional ice pci \ + compile-with "${NORMAL_M} -I$S/dev/ice" ice_ddp.c optional ice_ddp \ compile-with "${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01031b00 -mice_ddp -c${.TARGET}" \ no-ctfconvert no-implicit-rule before-depend local \ Index: sys/conf/files.powerpc =================================================================== --- sys/conf/files.powerpc +++ sys/conf/files.powerpc @@ -75,6 +75,12 @@ compile-with "${NORMAL_C} -I$S/dev/ice" dev/ice/ice_fwlog.c optional ice pci powerpc64 \ compile-with "${NORMAL_C} -I$S/dev/ice" +dev/ice/ice_rdma.c optional ice pci powerpc64 \ + compile-with "${NORMAL_C} -I$S/dev/ice" +dev/ice/irdma_if.m optional ice pci powerpc64 \ + compile-with "${NORMAL_M} -I$S/dev/ice" +dev/ice/irdma_di_if.m optional ice pci powerpc64 \ + compile-with "${NORMAL_M} 
-I$S/dev/ice" ice_ddp.c optional ice_ddp powerpc64 \ compile-with "${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01031b00 -mice_ddp -c${.TARGET}" \ no-ctfconvert no-implicit-rule before-depend local \ Index: sys/dev/ice/ice_adminq_cmd.h =================================================================== --- sys/dev/ice/ice_adminq_cmd.h +++ sys/dev/ice/ice_adminq_cmd.h @@ -2547,6 +2547,57 @@ struct ice_aqc_move_txqs_elem txqs[STRUCT_HACK_VAR_LEN]; }; +/* Add Tx RDMA Queue Set (indirect 0x0C33) */ +struct ice_aqc_add_rdma_qset { + u8 num_qset_grps; + u8 reserved[7]; + __le32 addr_high; + __le32 addr_low; +}; + +/* This is the descriptor of each qset entry for the Add Tx RDMA Queue Set + * command (0x0C33). Only used within struct ice_aqc_add_rdma_qset. + */ +struct ice_aqc_add_tx_rdma_qset_entry { + __le16 tx_qset_id; + u8 rsvd[2]; + __le32 qset_teid; + struct ice_aqc_txsched_elem info; +}; + +/* The format of the command buffer for Add Tx RDMA Queue Set(0x0C33) + * is an array of the following structs. Please note that the length of + * each struct ice_aqc_add_rdma_qset is variable due to the variable + * number of queues in each group! 
+ */ +struct ice_aqc_add_rdma_qset_data { + __le32 parent_teid; + __le16 num_qsets; + u8 rsvd[2]; + struct ice_aqc_add_tx_rdma_qset_entry rdma_qsets[STRUCT_HACK_VAR_LEN]; +}; + +/* Move RDMA Queue Set (indirect 0x0C34) */ +struct ice_aqc_move_rdma_qset_cmd { + u8 num_rdma_qset; /* Used by commands and response */ + u8 flags; + u8 reserved[6]; + __le32 addr_high; + __le32 addr_low; +}; + +/* Buffer */ +struct ice_aqc_move_rdma_qset_buffer_desc { + __le16 tx_qset_id; + __le16 qset_teid; +}; + +struct ice_aqc_move_rdma_qset_buffer { + __le32 src_parent_teid; + __le32 dest_parent_teid; + struct ice_aqc_move_rdma_qset_buffer_desc descs[STRUCT_HACK_VAR_LEN]; +}; + /* Download Package (indirect 0x0C40) */ /* Also used for Update Package (indirect 0x0C42 and 0x0C41) */ struct ice_aqc_download_pkg { @@ -2897,6 +2948,7 @@ struct ice_aqc_add_txqs add_txqs; struct ice_aqc_dis_txqs dis_txqs; struct ice_aqc_move_txqs move_txqs; + struct ice_aqc_add_rdma_qset add_rdma_qset; struct ice_aqc_txqs_cleanup txqs_cleanup; struct ice_aqc_add_get_update_free_vsi vsi_cmd; struct ice_aqc_add_update_free_vsi_resp add_update_free_vsi_res; @@ -3156,6 +3208,8 @@ ice_aqc_opc_dis_txqs = 0x0C31, ice_aqc_opc_txqs_cleanup = 0x0C31, ice_aqc_opc_move_recfg_txqs = 0x0C32, + ice_aqc_opc_add_rdma_qset = 0x0C33, + ice_aqc_opc_move_rdma_qset = 0x0C34, /* package commands */ ice_aqc_opc_download_pkg = 0x0C40, Index: sys/dev/ice/ice_common.h =================================================================== --- sys/dev/ice/ice_common.h +++ sys/dev/ice/ice_common.h @@ -147,6 +147,11 @@ struct ice_aqc_move_txqs_data *buf, u16 buf_size, u8 *txqs_moved, struct ice_sq_cd *cd); +enum ice_status +ice_aq_add_rdma_qsets(struct ice_hw *hw, u8 num_qset_grps, + struct ice_aqc_add_rdma_qset_data *qset_list, + u16 buf_size, struct ice_sq_cd *cd); + bool ice_check_sq_alive(struct ice_hw *hw, struct ice_ctl_q_info *cq); enum ice_status ice_aq_q_shutdown(struct ice_hw *hw, bool unloading); void 
ice_fill_dflt_direct_cmd_desc(struct ice_aq_desc *desc, u16 opcode); @@ -257,6 +262,15 @@ enum ice_status __ice_write_sr_buf(struct ice_hw *hw, u32 offset, u16 words, const u16 *data); enum ice_status +ice_cfg_vsi_rdma(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap, + u16 *max_rdmaqs); +enum ice_status +ice_ena_vsi_rdma_qset(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 *rdma_qset, u16 num_qsets, u32 *qset_teid); +enum ice_status +ice_dis_vsi_rdma_qset(struct ice_port_info *pi, u16 count, u32 *qset_teid, + u16 *q_id); +enum ice_status ice_dis_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u8 num_queues, u16 *q_handle, u16 *q_ids, u32 *q_teids, enum ice_disq_rst_src rst_src, u16 vmvf_num, Index: sys/dev/ice/ice_common.c =================================================================== --- sys/dev/ice/ice_common.c +++ sys/dev/ice/ice_common.c @@ -1198,7 +1198,8 @@ GLNVM_ULD_POR_DONE_1_M |\ GLNVM_ULD_PCIER_DONE_2_M) - uld_mask = ICE_RESET_DONE_MASK; + uld_mask = ICE_RESET_DONE_MASK | (hw->func_caps.common_cap.iwarp ? + GLNVM_ULD_PE_DONE_M : 0); /* Device is Active; check Global Reset processes are done */ for (cnt = 0; cnt < ICE_PF_RESET_WAIT_COUNT; cnt++) { @@ -2364,6 +2365,10 @@ ice_debug(hw, ICE_DBG_INIT, "%s: mgmt_cem = %d\n", prefix, caps->mgmt_cem); break; + case ICE_AQC_CAPS_IWARP: + caps->iwarp = (number == 1); + ice_debug(hw, ICE_DBG_INIT, "%s: iwarp = %d\n", prefix, caps->iwarp); + break; case ICE_AQC_CAPS_LED: if (phys_id < ICE_MAX_SUPPORTED_GPIO_LED) { caps->led[phys_id] = true; @@ -2481,6 +2486,16 @@ caps->maxtc = 4; ice_debug(hw, ICE_DBG_INIT, "reducing maxtc to %d (based on #ports)\n", caps->maxtc); + if (caps->iwarp) { + ice_debug(hw, ICE_DBG_INIT, "forcing RDMA off\n"); + caps->iwarp = 0; + } + + /* print message only when processing device capabilities + * during initialization. 
+ */ + if (caps == &hw->dev_caps.common_cap) + ice_info(hw, "RDMA functionality is not available with the current device configuration.\n"); } } @@ -4338,6 +4353,56 @@ return status; } +/** + * ice_aq_add_rdma_qsets + * @hw: pointer to the hardware structure + * @num_qset_grps: number of RDMA Qset groups in @qset_list (at most ICE_LAN_TXQ_MAX_QGRPS) + * @qset_list: variable-length list of qset groups to be added; must not be NULL + * @buf_size: size of @qset_list; must equal the summed size of all its groups + * @cd: pointer to command details structure or NULL + * + * Add Tx RDMA Qsets (0x0C33). Returns ICE_ERR_PARAM if any argument check fails. + */ +enum ice_status +ice_aq_add_rdma_qsets(struct ice_hw *hw, u8 num_qset_grps, + struct ice_aqc_add_rdma_qset_data *qset_list, + u16 buf_size, struct ice_sq_cd *cd) +{ + struct ice_aqc_add_rdma_qset_data *list; + struct ice_aqc_add_rdma_qset *cmd; + struct ice_aq_desc desc; + u16 i, sum_size = 0; + + ice_debug(hw, ICE_DBG_TRACE, "%s\n", __func__); + + cmd = &desc.params.add_rdma_qset; + + ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_rdma_qset); + + if (!qset_list) + return ICE_ERR_PARAM; + + if (num_qset_grps > ICE_LAN_TXQ_MAX_QGRPS) + return ICE_ERR_PARAM; + + for (i = 0, list = qset_list; i < num_qset_grps; i++) { + u16 num_qsets = LE16_TO_CPU(list->num_qsets); + + sum_size += ice_struct_size(list, rdma_qsets, num_qsets); + list = (struct ice_aqc_add_rdma_qset_data *)(list->rdma_qsets + + num_qsets); + } + + if (buf_size != sum_size) + return ICE_ERR_PARAM; + + desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD); + + cmd->num_qset_grps = num_qset_grps; + + return ice_aq_send_cmd(hw, &desc, qset_list, buf_size, cd); +} + /* End of FW Admin Queue command wrappers */ /** @@ -5100,6 +5165,158 @@ ICE_SCHED_NODE_OWNER_LAN); } +/** + * ice_cfg_vsi_rdma - configure the VSI RDMA queues + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc_bitmap: TC bitmap + * @max_rdmaqs: max RDMA queues array per TC + * + * This function adds/updates the VSI RDMA queues per TC. 
+ */ +enum ice_status +ice_cfg_vsi_rdma(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap, + u16 *max_rdmaqs) +{ + return ice_cfg_vsi_qs(pi, vsi_handle, tc_bitmap, max_rdmaqs, + ICE_SCHED_NODE_OWNER_RDMA); +} + +/** + * ice_ena_vsi_rdma_qset + * @pi: port information structure + * @vsi_handle: software VSI handle + * @tc: TC number + * @rdma_qset: pointer to RDMA qset + * @num_qsets: number of RDMA qsets + * @qset_teid: pointer to qset node teids + * + * This function adds RDMA qset + */ +enum ice_status +ice_ena_vsi_rdma_qset(struct ice_port_info *pi, u16 vsi_handle, u8 tc, + u16 *rdma_qset, u16 num_qsets, u32 *qset_teid) +{ + struct ice_aqc_txsched_elem_data node = { 0 }; + struct ice_aqc_add_rdma_qset_data *buf; + struct ice_sched_node *parent; + enum ice_status status; + struct ice_hw *hw; + u16 i, buf_size; + + if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) + return ICE_ERR_CFG; + hw = pi->hw; + + if (!ice_is_vsi_valid(hw, vsi_handle)) + return ICE_ERR_PARAM; + + buf_size = ice_struct_size(buf, rdma_qsets, num_qsets); + buf = (struct ice_aqc_add_rdma_qset_data *)ice_malloc(hw, buf_size); + if (!buf) + return ICE_ERR_NO_MEMORY; + ice_acquire_lock(&pi->sched_lock); + + parent = ice_sched_get_free_qparent(pi, vsi_handle, tc, + ICE_SCHED_NODE_OWNER_RDMA); + if (!parent) { + status = ICE_ERR_PARAM; + goto rdma_error_exit; + } + buf->parent_teid = parent->info.node_teid; + node.parent_teid = parent->info.node_teid; + + buf->num_qsets = CPU_TO_LE16(num_qsets); + for (i = 0; i < num_qsets; i++) { + buf->rdma_qsets[i].tx_qset_id = CPU_TO_LE16(rdma_qset[i]); + buf->rdma_qsets[i].info.valid_sections = + ICE_AQC_ELEM_VALID_GENERIC | ICE_AQC_ELEM_VALID_CIR | + ICE_AQC_ELEM_VALID_EIR; + buf->rdma_qsets[i].info.generic = 0; + buf->rdma_qsets[i].info.cir_bw.bw_profile_idx = + CPU_TO_LE16(ICE_SCHED_DFLT_RL_PROF_ID); + buf->rdma_qsets[i].info.cir_bw.bw_alloc = + CPU_TO_LE16(ICE_SCHED_DFLT_BW_WT); + buf->rdma_qsets[i].info.eir_bw.bw_profile_idx = + 
+ CPU_TO_LE16(ICE_SCHED_DFLT_RL_PROF_ID); + buf->rdma_qsets[i].info.eir_bw.bw_alloc = + CPU_TO_LE16(ICE_SCHED_DFLT_BW_WT); + } + status = ice_aq_add_rdma_qsets(hw, 1, buf, buf_size, NULL); + if (status != ICE_SUCCESS) { + ice_debug(hw, ICE_DBG_RDMA, "add RDMA qset failed\n"); + goto rdma_error_exit; + } + node.data.elem_type = ICE_AQC_ELEM_TYPE_LEAF; + for (i = 0; i < num_qsets; i++) { + node.node_teid = buf->rdma_qsets[i].qset_teid; + status = ice_sched_add_node(pi, hw->num_tx_sched_layers - 1, + &node); + if (status) + break; + qset_teid[i] = LE32_TO_CPU(node.node_teid); + } +rdma_error_exit: + ice_release_lock(&pi->sched_lock); + ice_free(hw, buf); + return status; +} + +/** + * ice_dis_vsi_rdma_qset - free RDMA resources + * @pi: port_info struct + * @count: number of RDMA qsets to free + * @qset_teid: array of @count qset node TEIDs; TEIDs with no scheduler node are skipped + * @q_id: array of @count qset IDs being disabled, indexed in parallel with @qset_teid + */ +enum ice_status +ice_dis_vsi_rdma_qset(struct ice_port_info *pi, u16 count, u32 *qset_teid, + u16 *q_id) +{ + struct ice_aqc_dis_txq_item *qg_list; + enum ice_status status = ICE_SUCCESS; + struct ice_hw *hw; + u16 qg_size; + int i; + + if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY) + return ICE_ERR_CFG; + + hw = pi->hw; + + qg_size = ice_struct_size(qg_list, q_id, 1); + qg_list = (struct ice_aqc_dis_txq_item *)ice_malloc(hw, qg_size); + if (!qg_list) + return ICE_ERR_NO_MEMORY; + + ice_acquire_lock(&pi->sched_lock); + + for (i = 0; i < count; i++) { + struct ice_sched_node *node; + + node = ice_sched_find_node_by_teid(pi->root, qset_teid[i]); + if (!node) + continue; + + qg_list->parent_teid = node->info.parent_teid; + qg_list->num_qs = 1; + qg_list->q_id[0] = + CPU_TO_LE16(q_id[i] | + ICE_AQC_Q_DIS_BUF_ELEM_TYPE_RDMA_QSET); + + status = ice_aq_dis_lan_txq(hw, 1, qg_list, qg_size, + ICE_NO_RESET, 0, NULL); + if (status) + break; + + ice_free_sched_node(pi, node); + } + + ice_release_lock(&pi->sched_lock); + ice_free(hw, qg_list); + return status; +} + /** * ice_is_main_vsi - checks 
whether the VSI is main VSI * @hw: pointer to the HW struct Index: sys/dev/ice/ice_common_sysctls.h =================================================================== --- sys/dev/ice/ice_common_sysctls.h +++ sys/dev/ice/ice_common_sysctls.h @@ -45,6 +45,15 @@ #include +/** + * @var ice_enable_irdma + * @brief boolean indicating if the iRDMA client interface is enabled + * + * Global sysctl variable indicating whether the RDMA client interface feature + * is enabled. + */ +bool ice_enable_irdma = true; + /** * @var ice_enable_tx_fc_filter * @brief boolean indicating if the Tx Flow Control filter should be enabled @@ -85,6 +94,15 @@ */ bool ice_enable_health_events = true; +/** + * @var ice_rdma_max_msix + * @brief maximum number of MSI-X vectors to reserve for RDMA interface + * + * Global sysctl variable indicating the maximum number of MSI-X vectors to + * reserve for a single RDMA interface. + */ +static uint16_t ice_rdma_max_msix = ICE_RDMA_MAX_MSIX; + /* sysctls marked as tunable, (i.e. with the CTLFLAG_TUN set) will * automatically load tunable values, without the need to manually create the * TUNABLE definition. 
@@ -105,6 +123,12 @@ &ice_enable_health_events, 0, "Enable FW health event reporting globally"); +SYSCTL_BOOL(_hw_ice, OID_AUTO, irdma, CTLFLAG_RDTUN, &ice_enable_irdma, 0, + "Enable iRDMA client interface"); + +SYSCTL_U16(_hw_ice, OID_AUTO, rdma_max_msix, CTLFLAG_RDTUN, &ice_rdma_max_msix, + 0, "Maximum number of MSI-X vectors to reserve per RDMA interface"); + SYSCTL_BOOL(_hw_ice_debug, OID_AUTO, enable_tx_fc_filter, CTLFLAG_RDTUN, &ice_enable_tx_fc_filter, 0, "Drop Ethertype 0x8808 control frames originating from non-HW sources"); Index: sys/dev/ice/ice_iflib.h =================================================================== --- sys/dev/ice/ice_iflib.h +++ sys/dev/ice/ice_iflib.h @@ -236,6 +236,11 @@ struct mtx admin_mtx; /* mutex to protect the admin timer */ struct callout admin_timer; /* timer to trigger admin task */ + /* iRDMA peer interface */ + struct ice_rdma_entry rdma_entry; + int irdma_vectors; + u16 *rdma_imap; + struct ice_vsi **all_vsi; /* Array of VSI pointers */ u16 num_available_vsi; /* Size of VSI array */ Index: sys/dev/ice/ice_lib.h =================================================================== --- sys/dev/ice/ice_lib.h +++ sys/dev/ice/ice_lib.h @@ -65,6 +65,8 @@ #include "ice_sched.h" #include "ice_resmgr.h" +#include "ice_rdma_internal.h" + #include "ice_rss.h" /* Hide debug sysctls unless INVARIANTS is enabled */ Index: sys/dev/ice/ice_lib.c =================================================================== --- sys/dev/ice/ice_lib.c +++ sys/dev/ice/ice_lib.c @@ -3984,6 +3984,11 @@ local_dcbx_cfg->pfc.willing = 0; local_dcbx_cfg->pfc.mbc = 0; + /* Warn if PFC is being disabled with RoCE v2 in use */ + if (new_mode == 0 && sc->rdma_entry.attached) + device_printf(dev, + "WARNING: Recommended that Priority Flow Control is enabled when RoCEv2 is in use\n"); + status = ice_set_dcb_cfg(pi); if (status) { device_printf(dev, @@ -7800,6 +7805,8 @@ pi = sc->hw.port_info; local_dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg; + 
ice_rdma_notify_dcb_qos_change(sc); + /* Set state when there's more than one TC */ tc_map = ice_dcb_get_tc_map(local_dcbx_cfg); if (ice_dcb_num_tc(tc_map) > 1) { @@ -7826,6 +7833,9 @@ /* Change PF VSI configuration */ ice_dcb_recfg(sc); + /* Send new configuration to RDMA client driver */ + ice_rdma_dcb_qos_update(sc, pi); + ice_request_stack_reinit(sc); } @@ -8663,6 +8673,7 @@ static int ice_module_init(void) { + ice_rdma_init(); return (0); } @@ -8679,6 +8690,7 @@ static int ice_module_exit(void) { + ice_rdma_exit(); return (0); } @@ -9029,8 +9041,17 @@ err = ENOMEM; goto free_imgr; } + if (!(sc->rdma_imap = + (u16 *)malloc(sizeof(u16) * hw->func_caps.common_cap.num_msix_vectors, + M_ICE, M_NOWAIT))) { + device_printf(dev, "Unable to allocate RDMA imap memory\n"); + err = ENOMEM; + free(sc->pf_imap, M_ICE); + goto free_imgr; + } for (u32 i = 0; i < hw->func_caps.common_cap.num_msix_vectors; i++) { sc->pf_imap[i] = ICE_INVALID_RES_IDX; + sc->rdma_imap[i] = ICE_INVALID_RES_IDX; } return (0); @@ -9058,6 +9079,12 @@ free(sc->pf_imap, M_ICE); sc->pf_imap = NULL; } + if (sc->rdma_imap) { + ice_resmgr_release_map(&sc->imgr, sc->rdma_imap, + sc->lan_vectors); + free(sc->rdma_imap, M_ICE); + sc->rdma_imap = NULL; + } ice_resmgr_destroy(&sc->imgr); } Index: sys/dev/ice/ice_rdma.h =================================================================== --- /dev/null +++ sys/dev/ice/ice_rdma.h @@ -0,0 +1,311 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2021, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/*$FreeBSD$*/ + +/** + * @file ice_rdma.h + * @brief header file for RDMA client interface functions + * + * Contains definitions and function calls shared by the ice driver and the + * RDMA client interface driver. + * + * Since these definitions are shared between drivers it is important that any + * changes are considered carefully for backwards compatibility. + */ +#ifndef _ICE_RDMA_H_ +#define _ICE_RDMA_H_ + +/* + * The RDMA client interface version is used to help determine + * incompatibilities between the interface definition shared between the main + * driver and the client driver. 
+ * + * It follows the semantic version guidelines, that is: + * Given the version number MAJOR.MINOR.PATCH, increment the: + * + * MAJOR version when you make incompatible changes, + * MINOR version when you add functionality in a backwards-compatible manner, and + * PATCH version when you make backwards-compatible bug fixes. + * + * Any change to this file, or one of the kobject interface files, must come + * with an associated change in one of the MAJOR, MINOR, or PATCH versions, + * and care must be taken that backwards incompatible changes MUST increment + * the MAJOR version. + * + * Note: Until the MAJOR version is set to at least 1, the above semantic + * version guarantees may not hold, and this interface should not be + * considered stable. + */ +#define ICE_RDMA_MAJOR_VERSION 1 +#define ICE_RDMA_MINOR_VERSION 0 +#define ICE_RDMA_PATCH_VERSION 0 + +/** + * @def ICE_RDMA_MAX_MSIX + * @brief Maximum number of MSI-X vectors that will be reserved + * + * Defines the maximum number of MSI-X vectors that an RDMA interface will + * have reserved in advance. Does not guarantee that many vectors have + * actually been enabled. + */ +#define ICE_RDMA_MAX_MSIX 64 + +/** + * @struct ice_rdma_info + * @brief RDMA information from the client driver + * + * The RDMA client driver will fill in this structure and pass its contents + * back to the main driver using the ice_rdma_register function. + * + * It should fill the version in with the ICE_RDMA_* versions as defined in + * the ice_rdma.h header. + * + * Additionally it must provide a pointer to a kobject class which extends the + * ice_rdma_di_class with the operations defined in the irdma_if.m interface. + * + * If the version specified is not compatible, then the registration + * of the RDMA driver will fail. 
+ */ +struct ice_rdma_info { + uint16_t major_version; + uint16_t minor_version; + uint16_t patch_version; + + kobj_class_t rdma_class; +}; + +#define ICE_RDMA_MAX_USER_PRIORITY 8 +#define ICE_RDMA_MAX_MSIX 64 + +/* Declare the ice_rdma_di kobject class */ +DECLARE_CLASS(ice_rdma_di_class); + +/** + * @struct ice_rdma_msix_mapping + * @brief MSI-X mapping requested by the peer RDMA driver + * + * Defines a mapping for MSI-X vectors being requested by the peer RDMA driver + * for a given PF. + */ +struct ice_rdma_msix_mapping { + uint8_t itr_indx; + int aeq_vector; + int ceq_cnt; + int *ceq_vector; +}; + +/** + * @struct ice_rdma_msix + * @brief RDMA MSI-X vectors reserved for the peer RDMA driver + * + * Defines the segment of the MSI-X vectors for use by the RDMA driver. These + * are reserved by the PF when it initializes. + */ +struct ice_rdma_msix { + int base; + int count; +}; + +/** + * @struct ice_qos_info + * @brief QoS information to be shared with RDMA driver + */ +struct ice_qos_info { + uint64_t tc_ctx; + uint8_t rel_bw; + uint8_t prio_type; + uint8_t egress_virt_up; + uint8_t ingress_virt_up; +}; + +/** + * @struct ice_qos_app_priority_table + * @brief Application priority data + */ +struct ice_qos_app_priority_table { + uint16_t prot_id; + uint8_t priority; + uint8_t selector; +}; + +#define IEEE_8021QAZ_MAX_TCS 8 +#define ICE_TC_MAX_USER_PRIORITY 8 +#define ICE_QOS_MAX_APPS 32 +#define ICE_QOS_DSCP_NUM_VAL 64 + +/** + * @struct ice_qos_params + * @brief Holds all necessary data for RDMA to work with DCB + * + * Struct to hold QoS info + */ +struct ice_qos_params { + struct ice_qos_info tc_info[IEEE_8021QAZ_MAX_TCS]; + uint8_t up2tc[ICE_TC_MAX_USER_PRIORITY]; + uint8_t vsi_relative_bw; + uint8_t vsi_priority_type; + uint32_t num_apps; + uint8_t pfc_mode; + uint8_t dscp_map[ICE_QOS_DSCP_NUM_VAL]; + struct ice_qos_app_priority_table apps[ICE_QOS_MAX_APPS]; + uint8_t num_tc; +}; + +/** + * @struct ice_rdma_peer + * @brief RDMA driver information + * + * 
Shared structure used by the RDMA client driver when talking with the main + * device driver. + * + * Because the definition of this structure is shared between the two drivers, + * its ABI should be handled carefully. + */ +struct ice_rdma_peer { + /** + * The KOBJ_FIELDS macro must come first, in order for it to be used + * as a kobject. + */ + KOBJ_FIELDS; + + struct ifnet *ifp; + device_t dev; + struct resource *pci_mem; + struct ice_qos_params initial_qos_info; + struct ice_rdma_msix msix; + uint16_t mtu; + uint16_t pf_vsi_num; + uint8_t pf_id; +}; + +/** + * @enum ice_res_type + * @brief enum for type of resource registration + * + * enum for type of resource registration. + * created for plausible compatibility with IDC + */ +enum ice_res_type { + ICE_INVAL_RES = 0x0, + ICE_RDMA_QSET_ALLOC = 0x8, + ICE_RDMA_QSET_FREE = 0x18, +}; + +/** + * @struct ice_rdma_qset_params + * @brief struct to hold per RDMA Qset info + */ +struct ice_rdma_qset_params { + uint32_t teid; /* qset TEID */ + uint16_t qs_handle; /* RDMA driver provides this */ + uint16_t vsi_id; /* VSI index */ + uint8_t tc; /* TC branch the QSet should belong to */ + uint8_t reserved[3]; +}; + +#define ICE_MAX_TXQ_PER_TXQG 128 +/** + * @struct ice_rdma_qset_update + * @brief struct used to register and unregister qsets for RDMA driver + */ +struct ice_rdma_qset_update { + enum ice_res_type res_type; + uint16_t cnt_req; + uint16_t res_allocated; + uint32_t res_handle; + struct ice_rdma_qset_params qsets; +}; + +/** + * @enum ice_rdma_event_type + * @brief enum for type of event from base driver + */ +enum ice_rdma_event_type { + ICE_RDMA_EVENT_NONE = 0, + ICE_RDMA_EVENT_LINK_CHANGE, + ICE_RDMA_EVENT_MTU_CHANGE, + ICE_RDMA_EVENT_TC_CHANGE, + ICE_RDMA_EVENT_API_CHANGE, + ICE_RDMA_EVENT_CRIT_ERR, + ICE_RDMA_EVENT_RESET, + ICE_RDMA_EVENT_QSET_REGISTER, + ICE_RDMA_EVENT_VSI_FILTER_UPDATE, + ICE_RDMA_EVENT_LAST +}; + +/** + * @struct ice_rdma_event + * @brief struct for event information to pass to RDMA 
driver + */ +struct ice_rdma_event { + enum ice_rdma_event_type type; + union { + /* link change event */ + struct { + int linkstate; + uint64_t baudrate; + }; + /* MTU change event */ + struct { + int mtu; + }; + /* + * TC/QoS/DCB change event + * RESET event use prep variable only + * prep: if true, this is a pre-event, post-event otherwise + */ + struct { + struct ice_qos_params port_qos; + bool prep; + }; + }; +}; + +/** + * @struct ice_rdma_request + * @brief struct with data for a request from the RDMA driver + */ +struct ice_rdma_request { + enum ice_rdma_event_type type; + union { + struct { + struct ice_rdma_qset_update res; + }; + struct { + bool enable_filter; + }; + }; +}; + +int ice_rdma_register(struct ice_rdma_info *info); +int ice_rdma_unregister(void); + +#endif Index: sys/dev/ice/ice_rdma.c =================================================================== --- /dev/null +++ sys/dev/ice/ice_rdma.c @@ -0,0 +1,859 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2022, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/*$FreeBSD$*/ + +/** + * @file ice_rdma.c + * @brief RDMA client driver interface + * + * Functions to interface with the RDMA client driver, for enabling RDMA + * functionality for the ice driver. + * + * The RDMA client interface is based on a simple kobject interface which is + * defined by the irdma_if.m and irdma_di_if.m interfaces. + * + * The ice device driver provides the irdma_di_if.m interface methods, while + * the client RDMA driver provides the irdma_if.m interface methods as an + * extension on top of the irdma_di_if kobject. + * + * The initial connection between drivers is done via the RDMA client driver + * calling ice_rdma_register. + */ + +#include "ice_iflib.h" +#include "ice_rdma_internal.h" + +#include "irdma_if.h" +#include "irdma_di_if.h" + +/** + * @var ice_rdma + * @brief global RDMA driver state + * + * Contains global state the driver uses to connect to a client RDMA interface + * driver. 
+ */ +static struct ice_rdma_state ice_rdma; + +/* + * Helper function prototypes + */ +static int ice_rdma_pf_attach_locked(struct ice_softc *sc); +static void ice_rdma_pf_detach_locked(struct ice_softc *sc); +static int ice_rdma_check_version(struct ice_rdma_info *info); +static void ice_rdma_cp_qos_info(struct ice_hw *hw, + struct ice_dcbx_cfg *dcbx_cfg, + struct ice_qos_params *qos_info); + +/* + * RDMA Device Interface prototypes + */ +static int ice_rdma_pf_reset(struct ice_rdma_peer *peer); +static int ice_rdma_pf_msix_init(struct ice_rdma_peer *peer, + struct ice_rdma_msix_mapping *msix_info); +static int ice_rdma_qset_register_request(struct ice_rdma_peer *peer, + struct ice_rdma_qset_update *res); +static int ice_rdma_update_vsi_filter(struct ice_rdma_peer *peer_dev, + bool enable); +static void ice_rdma_request_handler(struct ice_rdma_peer *peer, + struct ice_rdma_request *req); + + +/** + * @var ice_rdma_di_methods + * @brief RDMA driver interface methods + * + * Kobject methods implementing the driver-side interface for the RDMA peer + * clients. This method table contains the operations which the client can + * request from the driver. + * + * The client driver will then extend this kobject class with methods that the + * driver can request from the client. 
+ */ +static kobj_method_t ice_rdma_di_methods[] = { + KOBJMETHOD(irdma_di_reset, ice_rdma_pf_reset), + KOBJMETHOD(irdma_di_msix_init, ice_rdma_pf_msix_init), + KOBJMETHOD(irdma_di_qset_register_request, ice_rdma_qset_register_request), + KOBJMETHOD(irdma_di_vsi_filter_update, ice_rdma_update_vsi_filter), + KOBJMETHOD(irdma_di_req_handler, ice_rdma_request_handler), + KOBJMETHOD_END +}; + +/* Define ice_rdma_di class which will be extended by the iRDMA driver */ +DEFINE_CLASS_0(ice_rdma_di, ice_rdma_di_class, ice_rdma_di_methods, sizeof(struct ice_rdma_peer)); + +/** + * ice_rdma_pf_reset - RDMA client interface requested a reset + * @peer: the RDMA peer client structure + * + * Implements IRDMA_DI_RESET, called by the RDMA client driver to request + * a reset of an ice driver device. + */ +static int +ice_rdma_pf_reset(struct ice_rdma_peer *peer) +{ + struct ice_softc *sc = ice_rdma_peer_to_sc(peer); + + /* + * Request that the driver re-initialize by bringing the interface + * down and up. + */ + ice_request_stack_reinit(sc); + + return (0); +} + +/** + * ice_rdma_pf_msix_init - RDMA client interface request MSI-X initialization + * @peer: the RDMA peer client structure + * @msix_info: requested MSI-X mapping + * + * Implements IRDMA_DI_MSIX_INIT, called by the RDMA client driver to + * initialize the MSI-X resources required for RDMA functionality. 
+ */ +static int +ice_rdma_pf_msix_init(struct ice_rdma_peer *peer, + struct ice_rdma_msix_mapping __unused *msix_info) +{ + struct ice_softc *sc = ice_rdma_peer_to_sc(peer); + + MPASS(msix_info != NULL); + + device_printf(sc->dev, "%s: iRDMA MSI-X initialization request is not yet implemented\n", __func__); + + /* TODO: implement MSI-X initialization for RDMA */ + return (ENOSYS); +} + +/** + * ice_rdma_qset_register_request - RDMA client interface request qset + * registration or unregistration + * @peer: the RDMA peer client structure + * @res: qset update request; res_type selects registration (ALLOC) or unregistration (FREE) + */ +static int +ice_rdma_qset_register_request(struct ice_rdma_peer *peer, struct ice_rdma_qset_update *res) +{ + struct ice_softc *sc = ice_rdma_peer_to_sc(peer); + struct ice_vsi *vsi = NULL; + struct ice_dcbx_cfg *dcbx_cfg; + struct ice_hw *hw = &sc->hw; + enum ice_status status; + int count, i, ret = 0; + uint32_t *qset_teid; + uint16_t *qs_handle; + uint16_t max_rdmaqs[ICE_MAX_TRAFFIC_CLASS]; + uint16_t vsi_id; + uint8_t ena_tc = 0; + + if (!res) + return -EINVAL; + + if (res->cnt_req > ICE_MAX_TXQ_PER_TXQG) + return -EINVAL; + + switch(res->res_type) { + case ICE_RDMA_QSET_ALLOC: + count = res->cnt_req; + vsi_id = peer->pf_vsi_num; + break; + case ICE_RDMA_QSET_FREE: + count = res->res_allocated; + vsi_id = res->qsets.vsi_id; + break; + default: + return -EINVAL; + } + qset_teid = (uint32_t *)ice_calloc(hw, count, sizeof(*qset_teid)); + if (!qset_teid) + return -ENOMEM; + + qs_handle = (uint16_t *)ice_calloc(hw, count, sizeof(*qs_handle)); + if (!qs_handle) { + ice_free(hw, qset_teid); + return -ENOMEM; + } + + ice_for_each_traffic_class(i) + max_rdmaqs[i] = 0; + for (i = 0; i < sc->num_available_vsi; i++) { + if (sc->all_vsi[i] && + ice_get_hw_vsi_num(hw, sc->all_vsi[i]->idx) == vsi_id) { + vsi = sc->all_vsi[i]; + break; + } + } + + if (!vsi) { + ice_debug(hw, ICE_DBG_RDMA, "RDMA QSet invalid VSI\n"); + ret = -EINVAL; + goto out; + } + if (sc != vsi->sc) { + ice_debug(hw, 
ICE_DBG_RDMA, "VSI is tied to unexpected device\n"); + ret = -EXDEV; + goto out; + } + + for (i = 0; i < count; i++) { + struct ice_rdma_qset_params *qset; + + qset = &res->qsets; + if (qset->vsi_id != peer->pf_vsi_num) { + ice_debug(hw, ICE_DBG_RDMA, "RDMA QSet invalid VSI requested %d %d\n", + qset->vsi_id, peer->pf_vsi_num); + ret = -EINVAL; + goto out; + } + max_rdmaqs[qset->tc]++; /* NOTE(review): qset->tc is not range-checked against ICE_MAX_TRAFFIC_CLASS before indexing */ + qs_handle[i] = qset->qs_handle; + qset_teid[i] = qset->teid; + } + + switch(res->res_type) { + case ICE_RDMA_QSET_ALLOC: + dcbx_cfg = &hw->port_info->qos_cfg.local_dcbx_cfg; + for (i = 0; i < ICE_MAX_TRAFFIC_CLASS; i++) { + ena_tc |= BIT(dcbx_cfg->etscfg.prio_table[i]); + } + + ice_debug(hw, ICE_DBG_RDMA, "%s:%d ena_tc=%x\n", __func__, __LINE__, ena_tc); + status = ice_cfg_vsi_rdma(hw->port_info, vsi->idx, ena_tc, + max_rdmaqs); + if (status) { + ice_debug(hw, ICE_DBG_RDMA, "Failed VSI RDMA qset config\n"); + ret = -EINVAL; + goto out; + } + + for (i = 0; i < count; i++) { + struct ice_rdma_qset_params *qset; + + qset = &res->qsets; + status = ice_ena_vsi_rdma_qset(hw->port_info, vsi->idx, + qset->tc, &qs_handle[i], 1, + &qset_teid[i]); + if (status) { + ice_debug(hw, ICE_DBG_RDMA, "Failed VSI RDMA qset enable\n"); + ret = -EINVAL; + goto out; + } + qset->teid = qset_teid[i]; + } + break; + case ICE_RDMA_QSET_FREE: + status = ice_dis_vsi_rdma_qset(hw->port_info, count, qset_teid, qs_handle); + if (status) + ret = -EINVAL; + break; + default: + ret = -EINVAL; + break; + } + +out: + ice_free(hw, qs_handle); + ice_free(hw, qset_teid); + + return ret; +} + +/** + * ice_rdma_update_vsi_filter - configure vsi information + * when opening or closing rdma driver + * @peer: the RDMA peer client structure + * @enable: enable or disable the rdma filter + */ +static int +ice_rdma_update_vsi_filter(struct ice_rdma_peer *peer, + bool enable) +{ + struct ice_softc *sc = ice_rdma_peer_to_sc(peer); + struct ice_vsi *vsi; + int ret; + + vsi = &sc->pf_vsi; + if (!vsi) + return -EINVAL; + + ret = 
ice_cfg_iwarp_fltr(&sc->hw, vsi->idx, enable); + if (ret) { + device_printf(sc->dev, "Failed to %sable iWARP filtering\n", + enable ? "en" : "dis"); + } else { + if (enable) + vsi->info.q_opt_flags |= ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; + else + vsi->info.q_opt_flags &= ~ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; + } + + return ret; +} + +/** + * ice_rdma_request_handler - handle requests incoming from RDMA driver + * @peer: the RDMA peer client structure + * @req: structure containing request + */ +static void +ice_rdma_request_handler(struct ice_rdma_peer *peer, + struct ice_rdma_request *req) +{ + if (!req || !peer) { + log(LOG_WARNING, "%s: peer or req are not valid\n", __func__); + return; + } + + switch(req->type) { + case ICE_RDMA_EVENT_RESET: + break; + case ICE_RDMA_EVENT_QSET_REGISTER: + ice_rdma_qset_register_request(peer, &req->res); + break; + case ICE_RDMA_EVENT_VSI_FILTER_UPDATE: + ice_rdma_update_vsi_filter(peer, req->enable_filter); + break; + default: + log(LOG_WARNING, "%s: Event %d not supported\n", __func__, req->type); + break; + } +} + +/** + * ice_rdma_cp_qos_info - gather current QOS/DCB settings in LAN to pass + * to RDMA driver + * @hw: ice hw structure + * @dcbx_cfg: current DCB settings in ice driver + * @qos_info: destination of the DCB settings + */ +static void +ice_rdma_cp_qos_info(struct ice_hw *hw, struct ice_dcbx_cfg *dcbx_cfg, + struct ice_qos_params *qos_info) +{ + u32 up2tc; + u8 j; + u8 num_tc = 0; + u8 val_tc = 0; /* number of TC for validation */ + u8 cnt_tc = 0; + + /* setup qos_info fields with defaults */ + qos_info->num_apps = 0; + qos_info->num_tc = 1; + + for (j = 0; j < ICE_TC_MAX_USER_PRIORITY; j++) + qos_info->up2tc[j] = 0; + + qos_info->tc_info[0].rel_bw = 100; + for (j = 1; j < IEEE_8021QAZ_MAX_TCS; j++) + qos_info->tc_info[j].rel_bw = 0; + + /* gather current values */ + up2tc = rd32(hw, PRTDCB_TUP2TC); + qos_info->num_apps = dcbx_cfg->numapps; + + for (j = 0; j < ICE_MAX_TRAFFIC_CLASS; j++) { + num_tc |= 
BIT(dcbx_cfg->etscfg.prio_table[j]); + } + for (j = 0; j < ICE_MAX_TRAFFIC_CLASS; j++) { + if (num_tc & BIT(j)) { + cnt_tc++; + val_tc |= BIT(j); + } else { + break; + } + } + qos_info->num_tc = (val_tc == num_tc && num_tc != 0) ? cnt_tc : 1; + for (j = 0; j < ICE_TC_MAX_USER_PRIORITY; j++) + qos_info->up2tc[j] = (up2tc >> (j * 3)) & 0x7; + + for (j = 0; j < IEEE_8021QAZ_MAX_TCS; j++) + qos_info->tc_info[j].rel_bw = dcbx_cfg->etscfg.tcbwtable[j]; + for (j = 0; j < qos_info->num_apps; j++) { + qos_info->apps[j].priority = dcbx_cfg->app[j].priority; + qos_info->apps[j].prot_id = dcbx_cfg->app[j].prot_id; + qos_info->apps[j].selector = dcbx_cfg->app[j].selector; + } +} + +/** + * ice_rdma_check_version - Check that the provided RDMA version is compatible + * @info: the RDMA client information structure + * + * Verify that the client RDMA driver provided a version that is compatible + * with the driver interface. + */ +static int +ice_rdma_check_version(struct ice_rdma_info *info) +{ + /* Make sure the MAJOR version matches */ + if (info->major_version != ICE_RDMA_MAJOR_VERSION) { + log(LOG_WARNING, "%s: the iRDMA driver requested version %d.%d.%d, but this driver only supports major version %d.x.x\n", + __func__, + info->major_version, info->minor_version, info->patch_version, + ICE_RDMA_MAJOR_VERSION); + return (ENOTSUP); + } + + /* + * Make sure that the MINOR version is compatible. + * + * This means that the RDMA client driver version MUST not be greater + * than the version provided by the driver, as it would indicate that + * the RDMA client expects features which are not supported by the + * main driver. 
+ */ + if (info->minor_version > ICE_RDMA_MINOR_VERSION) { + log(LOG_WARNING, "%s: the iRDMA driver requested version %d.%d.%d, but this driver only supports up to minor version %d.%d.x\n", + __func__, + info->major_version, info->minor_version, info->patch_version, + ICE_RDMA_MAJOR_VERSION, ICE_RDMA_MINOR_VERSION); + return (ENOTSUP); + } + + /* + * Make sure that the PATCH version is compatible. + * + * This means that the RDMA client version MUST not be greater than + * the version provided by the driver, as it may indicate that the + * RDMA client expects certain backwards compatible bug fixes which + * are not implemented by this version of the main driver. + */ + if ((info->minor_version == ICE_RDMA_MINOR_VERSION) && + (info->patch_version > ICE_RDMA_PATCH_VERSION)) { + log(LOG_WARNING, "%s: the iRDMA driver requested version %d.%d.%d, but this driver only supports up to patch version %d.%d.%d\n", + __func__, + info->major_version, info->minor_version, info->patch_version, + ICE_RDMA_MAJOR_VERSION, ICE_RDMA_MINOR_VERSION, ICE_RDMA_PATCH_VERSION); + return (ENOTSUP); + } + + /* Make sure that the kobject class is initialized */ + if (info->rdma_class == NULL) { + log(LOG_WARNING, "%s: the iRDMA driver did not specify a kobject interface\n", + __func__); + return (EINVAL); + } + + return (0); +} + +/** + * ice_rdma_register - Register an RDMA client driver + * @info: the RDMA client information structure + * + * Called by the RDMA client driver on load. Used to initialize the RDMA + * client driver interface and enable interop between the ice driver and the + * RDMA client driver. + * + * The RDMA client driver must provide the version number it expects, along + * with a pointer to a kobject class that extends the irdma_di_if class, and + * implements the irdma_if class interface. 
+ */ +int +ice_rdma_register(struct ice_rdma_info *info) +{ + struct ice_rdma_entry *entry; + int err = 0; + + sx_xlock(&ice_rdma.mtx); + + if (!ice_enable_irdma) { + log(LOG_INFO, "%s: The iRDMA driver interface has been disabled\n", __func__); + err = (ECONNREFUSED); + goto return_unlock; + } + + if (ice_rdma.registered) { + log(LOG_WARNING, "%s: iRDMA driver already registered\n", __func__); + err = (EBUSY); + goto return_unlock; + } + + /* Make sure the iRDMA version is compatible */ + err = ice_rdma_check_version(info); + if (err) + goto return_unlock; + + log(LOG_INFO, "%s: iRDMA driver registered using version %d.%d.%d\n", + __func__, info->major_version, info->minor_version, info->patch_version); + + ice_rdma.peer_class = info->rdma_class; + + /* + * Initialize the kobject interface and notify the RDMA client of each + * existing PF interface. + */ + LIST_FOREACH(entry, &ice_rdma.peers, node) { + kobj_init((kobj_t)&entry->peer, ice_rdma.peer_class); + IRDMA_PROBE(&entry->peer); + if (entry->initiated) + IRDMA_OPEN(&entry->peer); + } + ice_rdma.registered = true; + +return_unlock: + sx_xunlock(&ice_rdma.mtx); + + return (err); +} + +/** + * ice_rdma_unregister - Unregister an RDMA client driver + * + * Called by the RDMA client driver on unload. Used to de-initialize the RDMA + * client driver interface and shut down communication between the ice driver + * and the RDMA client driver. + */ +int +ice_rdma_unregister(void) +{ + struct ice_rdma_entry *entry; + + sx_xlock(&ice_rdma.mtx); + + if (!ice_rdma.registered) { + log(LOG_WARNING, "%s: iRDMA driver was not previously registered\n", + __func__); + sx_xunlock(&ice_rdma.mtx); + return (ENOENT); + } + + log(LOG_INFO, "%s: iRDMA driver unregistered\n", __func__); + ice_rdma.registered = false; + ice_rdma.peer_class = NULL; + + /* + * Release the kobject interface for each of the existing PF + * interfaces. 
Note that we do not notify the client about removing + * each PF, as it is assumed that the client will have already cleaned + * up any associated resources when it is unregistered. + */ + LIST_FOREACH(entry, &ice_rdma.peers, node) + kobj_delete((kobj_t)&entry->peer, NULL); + + sx_xunlock(&ice_rdma.mtx); + + return (0); +} + +/** + * ice_rdma_init - RDMA driver init routine + * + * Called during ice driver module initialization to setup the RDMA client + * interface mutex and RDMA peer structure list. + */ +void +ice_rdma_init(void) +{ + LIST_INIT(&ice_rdma.peers); + sx_init_flags(&ice_rdma.mtx, "ice rdma interface", SX_DUPOK); + + ice_rdma.registered = false; + ice_rdma.peer_class = NULL; +} + +/** + * ice_rdma_exit - RDMA driver exit routine + * + * Called during ice driver module exit to shutdown the RDMA client interface + * mutex. + */ +void +ice_rdma_exit(void) +{ + MPASS(LIST_EMPTY(&ice_rdma.peers)); + sx_destroy(&ice_rdma.mtx); +} + +/** + * ice_rdma_pf_attach_locked - Prepare a PF for RDMA connections + * @sc: the ice driver softc + * + * Initialize a peer entry for this PF and add it to the RDMA interface list. + * Notify the client RDMA driver of a new PF device. + * + * @pre must be called while holding the ice_rdma mutex. 
+ */ +static int +ice_rdma_pf_attach_locked(struct ice_softc *sc) +{ + struct ice_rdma_entry *entry; + + /* Do not attach the PF unless RDMA is supported */ + if (!ice_is_bit_set(sc->feat_cap, ICE_FEATURE_RDMA)) + return (0); + + entry = &sc->rdma_entry; + if (entry->attached) { + device_printf(sc->dev, "iRDMA peer entry already exists\n"); + return (EEXIST); + } + + entry->attached = true; + entry->peer.dev = sc->dev; + entry->peer.ifp = sc->ifp; + entry->peer.pf_id = sc->hw.pf_id; + entry->peer.pci_mem = sc->bar0.res; + entry->peer.pf_vsi_num = ice_get_hw_vsi_num(&sc->hw, sc->pf_vsi.idx); + if (sc->rdma_imap && sc->rdma_imap[0] != ICE_INVALID_RES_IDX && + sc->irdma_vectors > 0) { + entry->peer.msix.base = sc->rdma_imap[0]; + entry->peer.msix.count = sc->irdma_vectors; + } + + /* Gather DCB/QOS info into peer */ + memset(&entry->peer.initial_qos_info, 0, sizeof(entry->peer.initial_qos_info)); + ice_rdma_cp_qos_info(&sc->hw, &sc->hw.port_info->qos_cfg.local_dcbx_cfg, + &entry->peer.initial_qos_info); + + /* + * If the RDMA client driver has already registered, initialize the + * kobject and notify the client of a new PF + */ + if (ice_rdma.registered) { + kobj_init((kobj_t)&entry->peer, ice_rdma.peer_class); + IRDMA_PROBE(&entry->peer); + } + + LIST_INSERT_HEAD(&ice_rdma.peers, entry, node); + + ice_set_bit(ICE_FEATURE_RDMA, sc->feat_en); + + return (0); +} + +/** + * ice_rdma_pf_attach - Notify the RDMA client of a new PF + * @sc: the ice driver softc + * + * Called during PF attach to notify the RDMA client of a new PF. + */ +int +ice_rdma_pf_attach(struct ice_softc *sc) +{ + int err; + + sx_xlock(&ice_rdma.mtx); + err = ice_rdma_pf_attach_locked(sc); + sx_xunlock(&ice_rdma.mtx); + + return (err); +} + +/** + * ice_rdma_pf_detach_locked - Notify the RDMA client on PF detach + * @sc: the ice driver softc + * + * Notify the RDMA peer client driver of removal of a PF, and release any + * RDMA-specific resources associated with that PF. 
Remove the PF from the + * list of available RDMA entries. + * + * @pre must be called while holding the ice_rdma mutex. + */ +static void +ice_rdma_pf_detach_locked(struct ice_softc *sc) +{ + struct ice_rdma_entry *entry; + + /* No need to detach the PF if RDMA is not enabled */ + if (!ice_is_bit_set(sc->feat_en, ICE_FEATURE_RDMA)) + return; + + entry = &sc->rdma_entry; + if (!entry->attached) { + device_printf(sc->dev, "iRDMA peer entry was not attached\n"); + return; + } + + /* + * If the RDMA client driver is registered, notify the client that + * a PF has been removed, and release the kobject reference. + */ + if (ice_rdma.registered) { + IRDMA_REMOVE(&entry->peer); + kobj_delete((kobj_t)&entry->peer, NULL); + } + + LIST_REMOVE(entry, node); + entry->attached = false; + + ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_en); +} + +/** + * ice_rdma_pf_detach - Notify the RDMA client of a PF detaching + * @sc: the ice driver softc + * + * Take the ice_rdma mutex and then notify the RDMA client that a PF has been + * removed. + */ +void +ice_rdma_pf_detach(struct ice_softc *sc) +{ + sx_xlock(&ice_rdma.mtx); + ice_rdma_pf_detach_locked(sc); + sx_xunlock(&ice_rdma.mtx); +} + +/** + * ice_rdma_pf_init - Notify the RDMA client that a PF has initialized + * @sc: the ice driver softc + * + * Called by the ice driver when a PF has been initialized. Notifies the RDMA + * client that a PF is up and ready to operate. + */ +int +ice_rdma_pf_init(struct ice_softc *sc) +{ + struct ice_rdma_peer *peer = &sc->rdma_entry.peer; + + sx_xlock(&ice_rdma.mtx); + + /* Update the MTU */ + peer->mtu = sc->ifp->if_mtu; + sc->rdma_entry.initiated = true; + + if (sc->rdma_entry.attached && ice_rdma.registered) { + sx_xunlock(&ice_rdma.mtx); + return IRDMA_OPEN(peer); + } + + sx_xunlock(&ice_rdma.mtx); + + return (0); +} + +/** + * ice_rdma_pf_stop - Notify the RDMA client of a stopped PF device + * @sc: the ice driver softc + * + * Called by the ice driver when a PF is stopped. 
Notifies the RDMA client + * driver that the PF has stopped and is not ready to operate. + */ +int +ice_rdma_pf_stop(struct ice_softc *sc) +{ + sx_xlock(&ice_rdma.mtx); + + sc->rdma_entry.initiated = false; + if (sc->rdma_entry.attached && ice_rdma.registered) { + sx_xunlock(&ice_rdma.mtx); + return IRDMA_CLOSE(&sc->rdma_entry.peer); + } + + sx_xunlock(&ice_rdma.mtx); + + return (0); +} + +/** + * ice_rdma_link_change - Notify RDMA client of a change in link status + * @sc: the ice driver softc + * @linkstate: the link status + * @baudrate: the link rate in bits per second + * + * Notify the RDMA client of a link status change, by sending it the new link + * state and baudrate. + * + * The link state is represented the same way as in the ifnet structure. It + * should be LINK_STATE_UNKNOWN, LINK_STATE_DOWN, or LINK_STATE_UP. + */ +void +ice_rdma_link_change(struct ice_softc *sc, int linkstate, uint64_t baudrate) +{ + struct ice_rdma_peer *peer = &sc->rdma_entry.peer; + struct ice_rdma_event event; + + memset(&event, 0, sizeof(struct ice_rdma_event)); + event.type = ICE_RDMA_EVENT_LINK_CHANGE; + event.linkstate = linkstate; + event.baudrate = baudrate; + + sx_xlock(&ice_rdma.mtx); + + if (sc->rdma_entry.attached && ice_rdma.registered) + IRDMA_EVENT_HANDLER(peer, &event); + + sx_xunlock(&ice_rdma.mtx); +} + +/** + * ice_rdma_notify_dcb_qos_change - notify RDMA driver to pause traffic + * @sc: the ice driver softc + * + * Notify the RDMA driver that QOS/DCB settings are about to change. + * Once the function returns, all the QPs should be suspended.
+ */ +void +ice_rdma_notify_dcb_qos_change(struct ice_softc *sc) +{ + struct ice_rdma_peer *peer = &sc->rdma_entry.peer; + struct ice_rdma_event event; + + memset(&event, 0, sizeof(struct ice_rdma_event)); + event.type = ICE_RDMA_EVENT_TC_CHANGE; + /* pre-event */ + event.prep = true; + + sx_xlock(&ice_rdma.mtx); + if (sc->rdma_entry.attached && ice_rdma.registered) + IRDMA_EVENT_HANDLER(peer, &event); + sx_xunlock(&ice_rdma.mtx); +} + +/** + * ice_rdma_dcb_qos_update - pass the changed dcb settings to RDMA driver + * @sc: the ice driver softc + * @pi: the port info structure + * + * Pass the changed DCB settings to RDMA traffic. This function should be + * called only after ice_rdma_notify_dcb_qos_change has been called and + * returned before. After the function returns, all the RDMA traffic + * should be resumed. + */ +void +ice_rdma_dcb_qos_update(struct ice_softc *sc, struct ice_port_info *pi) +{ + struct ice_rdma_peer *peer = &sc->rdma_entry.peer; + struct ice_rdma_event event; + + memset(&event, 0, sizeof(struct ice_rdma_event)); + event.type = ICE_RDMA_EVENT_TC_CHANGE; + /* post-event */ + event.prep = false; + + /* gather current configuration */ + ice_rdma_cp_qos_info(&sc->hw, &pi->qos_cfg.local_dcbx_cfg, &event.port_qos); + sx_xlock(&ice_rdma.mtx); + if (sc->rdma_entry.attached && ice_rdma.registered) + IRDMA_EVENT_HANDLER(peer, &event); + sx_xunlock(&ice_rdma.mtx); +} Index: sys/dev/ice/ice_rdma_internal.h =================================================================== --- /dev/null +++ sys/dev/ice/ice_rdma_internal.h @@ -0,0 +1,102 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* Copyright (c) 2022, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/*$FreeBSD$*/ + +/** + * @file ice_rdma_internal.h + * @brief internal header for the RDMA driver interface setup + * + * Contains the definitions and functions used by the ice driver to set up the + * RDMA driver interface. Functions and definitions in this file are not + * shared with the RDMA client driver. + */ +#ifndef _ICE_RDMA_INTERNAL_H_ +#define _ICE_RDMA_INTERNAL_H_ + +#include "ice_rdma.h" + +/* Forward declare the softc structure */ +struct ice_softc; + +/* Global sysctl variable indicating if the RDMA client interface is enabled */ +extern bool ice_enable_irdma; + +/** + * @struct ice_rdma_entry + * @brief RDMA peer list node + * + * Structure used to store peer entries for each PF in a linked list.
+ */ +struct ice_rdma_entry { + LIST_ENTRY(ice_rdma_entry) node; + struct ice_rdma_peer peer; + bool attached; + bool initiated; +}; + +#define ice_rdma_peer_to_entry(p) __containerof(p, struct ice_rdma_entry, peer) +#define ice_rdma_entry_to_sc(e) __containerof(e, struct ice_softc, rdma_entry) +#define ice_rdma_peer_to_sc(p) ice_rdma_entry_to_sc(ice_rdma_peer_to_entry(p)) + +/** + * @struct ice_rdma_peers + * @brief Head list structure for the RDMA entry list + * + * Type defining the head of the linked list of RDMA entries. + */ +LIST_HEAD(ice_rdma_peers, ice_rdma_entry); + +/** + * @struct ice_rdma_state + * @brief global driver state for RDMA + * + * Contains global state shared across all PFs by the device driver, such as + * the kobject class of the currently connected peer driver, and the linked + * list of peer entries for each PF. + */ +struct ice_rdma_state { + bool registered; + kobj_class_t peer_class; + struct sx mtx; + struct ice_rdma_peers peers; +}; + +void ice_rdma_init(void); +void ice_rdma_exit(void); + +int ice_rdma_pf_attach(struct ice_softc *sc); +void ice_rdma_pf_detach(struct ice_softc *sc); +int ice_rdma_pf_init(struct ice_softc *sc); +int ice_rdma_pf_stop(struct ice_softc *sc); +void ice_rdma_link_change(struct ice_softc *sc, int linkstate, uint64_t baudrate); +void ice_rdma_notify_dcb_qos_change(struct ice_softc *sc); +void ice_rdma_dcb_qos_update(struct ice_softc *sc, struct ice_port_info *pi); +#endif Index: sys/dev/ice/ice_sched.h =================================================================== --- sys/dev/ice/ice_sched.h +++ sys/dev/ice/ice_sched.h @@ -139,6 +139,7 @@ ice_sched_cfg_vsi(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u16 maxqs, u8 owner, bool enable); enum ice_status ice_rm_vsi_lan_cfg(struct ice_port_info *pi, u16 vsi_handle); +enum ice_status ice_rm_vsi_rdma_cfg(struct ice_port_info *pi, u16 vsi_handle); struct ice_sched_node * ice_sched_get_vsi_node(struct ice_port_info *pi, struct ice_sched_node *tc_node, u16 
vsi_handle); Index: sys/dev/ice/ice_sched.c =================================================================== --- sys/dev/ice/ice_sched.c +++ sys/dev/ice/ice_sched.c @@ -620,6 +620,48 @@ return ICE_SUCCESS; } +/** + * ice_alloc_rdma_q_ctx - allocate RDMA queue contexts for the given VSI and TC + * @hw: pointer to the HW struct + * @vsi_handle: VSI handle + * @tc: TC number + * @new_numqs: number of queues + */ +static enum ice_status +ice_alloc_rdma_q_ctx(struct ice_hw *hw, u16 vsi_handle, u8 tc, u16 new_numqs) +{ + struct ice_vsi_ctx *vsi_ctx; + struct ice_q_ctx *q_ctx; + + vsi_ctx = ice_get_vsi_ctx(hw, vsi_handle); + if (!vsi_ctx) + return ICE_ERR_PARAM; + /* allocate RDMA queue contexts */ + if (!vsi_ctx->rdma_q_ctx[tc]) { + vsi_ctx->rdma_q_ctx[tc] = (struct ice_q_ctx *) + ice_calloc(hw, new_numqs, sizeof(*q_ctx)); + if (!vsi_ctx->rdma_q_ctx[tc]) + return ICE_ERR_NO_MEMORY; + vsi_ctx->num_rdma_q_entries[tc] = new_numqs; + return ICE_SUCCESS; + } + /* num queues are increased, update the queue contexts */ + if (new_numqs > vsi_ctx->num_rdma_q_entries[tc]) { + u16 prev_num = vsi_ctx->num_rdma_q_entries[tc]; + + q_ctx = (struct ice_q_ctx *) + ice_calloc(hw, new_numqs, sizeof(*q_ctx)); + if (!q_ctx) + return ICE_ERR_NO_MEMORY; + ice_memcpy(q_ctx, vsi_ctx->rdma_q_ctx[tc], + prev_num * sizeof(*q_ctx), ICE_DMA_TO_NONDMA); + ice_free(hw, vsi_ctx->rdma_q_ctx[tc]); + vsi_ctx->rdma_q_ctx[tc] = q_ctx; + vsi_ctx->num_rdma_q_entries[tc] = new_numqs; + } + return ICE_SUCCESS; +} + /** * ice_aq_rl_profile - performs a rate limiting task * @hw: pointer to the HW struct @@ -1904,13 +1946,22 @@ if (!vsi_ctx) return ICE_ERR_PARAM; - prev_numqs = vsi_ctx->sched.max_lanq[tc]; + if (owner == ICE_SCHED_NODE_OWNER_LAN) + prev_numqs = vsi_ctx->sched.max_lanq[tc]; + else + prev_numqs = vsi_ctx->sched.max_rdmaq[tc]; /* num queues are not changed or less than the previous number */ if (new_numqs <= prev_numqs) return status; - status = ice_alloc_lan_q_ctx(hw, vsi_handle, tc, new_numqs); - 
if (status) - return status; + if (owner == ICE_SCHED_NODE_OWNER_LAN) { + status = ice_alloc_lan_q_ctx(hw, vsi_handle, tc, new_numqs); + if (status) + return status; + } else { + status = ice_alloc_rdma_q_ctx(hw, vsi_handle, tc, new_numqs); + if (status) + return status; + } if (new_numqs) ice_sched_calc_vsi_child_nodes(hw, new_numqs, new_num_nodes); @@ -1925,7 +1976,10 @@ new_num_nodes, owner); if (status) return status; - vsi_ctx->sched.max_lanq[tc] = new_numqs; + if (owner == ICE_SCHED_NODE_OWNER_LAN) + vsi_ctx->sched.max_lanq[tc] = new_numqs; + else + vsi_ctx->sched.max_rdmaq[tc] = new_numqs; return ICE_SUCCESS; } @@ -1991,6 +2045,7 @@ * recreate the child nodes all the time in these cases. */ vsi_ctx->sched.max_lanq[tc] = 0; + vsi_ctx->sched.max_rdmaq[tc] = 0; } /* update the VSI child nodes */ @@ -2121,6 +2176,8 @@ } if (owner == ICE_SCHED_NODE_OWNER_LAN) vsi_ctx->sched.max_lanq[i] = 0; + else + vsi_ctx->sched.max_rdmaq[i] = 0; } status = ICE_SUCCESS; @@ -2142,6 +2199,19 @@ return ice_sched_rm_vsi_cfg(pi, vsi_handle, ICE_SCHED_NODE_OWNER_LAN); } +/** + * ice_rm_vsi_rdma_cfg - remove VSI and its RDMA children nodes + * @pi: port information structure + * @vsi_handle: software VSI handle + * + * This function clears the VSI and its RDMA children nodes from scheduler tree + * for all TCs. 
+ */ +enum ice_status ice_rm_vsi_rdma_cfg(struct ice_port_info *pi, u16 vsi_handle) +{ + return ice_sched_rm_vsi_cfg(pi, vsi_handle, ICE_SCHED_NODE_OWNER_RDMA); +} + /** * ice_sched_is_tree_balanced - Check tree nodes are identical or not * @hw: pointer to the HW struct Index: sys/dev/ice/ice_switch.h =================================================================== --- sys/dev/ice/ice_switch.h +++ sys/dev/ice/ice_switch.h @@ -77,6 +77,8 @@ u8 vf_num; u16 num_lan_q_entries[ICE_MAX_TRAFFIC_CLASS]; struct ice_q_ctx *lan_q_ctx[ICE_MAX_TRAFFIC_CLASS]; + u16 num_rdma_q_entries[ICE_MAX_TRAFFIC_CLASS]; + struct ice_q_ctx *rdma_q_ctx[ICE_MAX_TRAFFIC_CLASS]; }; /* This is to be used by add/update mirror rule Admin Queue command */ @@ -452,6 +454,8 @@ ice_add_eth_mac(struct ice_hw *hw, struct LIST_HEAD_TYPE *em_list); enum ice_status ice_remove_eth_mac(struct ice_hw *hw, struct LIST_HEAD_TYPE *em_list); +enum ice_status +ice_cfg_iwarp_fltr(struct ice_hw *hw, u16 vsi_handle, bool enable); enum ice_status ice_add_mac_with_sw_marker(struct ice_hw *hw, struct ice_fltr_info *f_info, Index: sys/dev/ice/ice_switch.c =================================================================== --- sys/dev/ice/ice_switch.c +++ sys/dev/ice/ice_switch.c @@ -537,6 +537,10 @@ ice_free(hw, vsi->lan_q_ctx[i]); vsi->lan_q_ctx[i] = NULL; } + if (vsi->rdma_q_ctx[i]) { + ice_free(hw, vsi->rdma_q_ctx[i]); + vsi->rdma_q_ctx[i] = NULL; + } } } @@ -658,6 +662,47 @@ return ice_aq_update_vsi(hw, vsi_ctx, cd); } +/** + * ice_cfg_iwarp_fltr - enable/disable iWARP filtering on VSI + * @hw: pointer to HW struct + * @vsi_handle: VSI SW index + * @enable: boolean for enable/disable + */ +enum ice_status +ice_cfg_iwarp_fltr(struct ice_hw *hw, u16 vsi_handle, bool enable) +{ + struct ice_vsi_ctx *ctx, *cached_ctx; + enum ice_status status; + + cached_ctx = ice_get_vsi_ctx(hw, vsi_handle); + if (!cached_ctx) + return ICE_ERR_DOES_NOT_EXIST; + + ctx = (struct ice_vsi_ctx *)ice_calloc(hw, 1, sizeof(*ctx)); + if (!ctx) 
+ return ICE_ERR_NO_MEMORY; + + ctx->info.q_opt_rss = cached_ctx->info.q_opt_rss; + ctx->info.q_opt_tc = cached_ctx->info.q_opt_tc; + ctx->info.q_opt_flags = cached_ctx->info.q_opt_flags; + + ctx->info.valid_sections = CPU_TO_LE16(ICE_AQ_VSI_PROP_Q_OPT_VALID); + + if (enable) + ctx->info.q_opt_flags |= ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; + else + ctx->info.q_opt_flags &= ~ICE_AQ_VSI_Q_OPT_PE_FLTR_EN; + + status = ice_update_vsi(hw, vsi_handle, ctx, NULL); + if (!status) { + cached_ctx->info.q_opt_flags = ctx->info.q_opt_flags; + cached_ctx->info.valid_sections |= ctx->info.valid_sections; + } + + ice_free(hw, ctx); + return status; +} + /** * ice_aq_get_vsi_params * @hw: pointer to the HW struct Index: sys/dev/ice/ice_type.h =================================================================== --- sys/dev/ice/ice_type.h +++ sys/dev/ice/ice_type.h @@ -153,6 +153,7 @@ #define ICE_DBG_SW BIT_ULL(13) #define ICE_DBG_SCHED BIT_ULL(14) +#define ICE_DBG_RDMA BIT_ULL(15) #define ICE_DBG_PKG BIT_ULL(16) #define ICE_DBG_RES BIT_ULL(17) #define ICE_DBG_AQ_MSG BIT_ULL(24) @@ -404,6 +405,7 @@ u8 dcb; u8 iscsi; u8 mgmt_cem; + u8 iwarp; /* WoL and APM support */ #define ICE_WOL_SUPPORT_M BIT(0) @@ -774,6 +776,7 @@ struct ice_sched_node *vsi_node[ICE_MAX_TRAFFIC_CLASS]; struct ice_sched_node *ag_node[ICE_MAX_TRAFFIC_CLASS]; u16 max_lanq[ICE_MAX_TRAFFIC_CLASS]; + u16 max_rdmaq[ICE_MAX_TRAFFIC_CLASS]; /* bw_t_info saves VSI BW information */ struct ice_bw_type_info bw_t_info[ICE_MAX_TRAFFIC_CLASS]; }; Index: sys/dev/ice/if_ice_iflib.c =================================================================== --- sys/dev/ice/if_ice_iflib.c +++ sys/dev/ice/if_ice_iflib.c @@ -682,12 +682,14 @@ ice_set_default_local_lldp_mib(sc); iflib_link_state_change(sc->ctx, LINK_STATE_UP, baudrate); + ice_rdma_link_change(sc, LINK_STATE_UP, baudrate); ice_link_up_msg(sc); update_media = true; } else { /* link is down */ iflib_link_state_change(sc->ctx, LINK_STATE_DOWN, 0); + ice_rdma_link_change(sc, 
LINK_STATE_DOWN, 0); update_media = true; } @@ -795,6 +797,10 @@ /* Enable ITR 0 right away, so that we can handle admin interrupts */ ice_enable_intr(&sc->hw, sc->irqvs[0].me); + err = ice_rdma_pf_attach(sc); + if (err) + return (err); + /* Start the admin timer */ mtx_lock(&sc->admin_mtx); callout_reset(&sc->admin_timer, hz/2, ice_admin_timer, sc); @@ -891,6 +897,8 @@ mtx_unlock(&sc->admin_mtx); mtx_destroy(&sc->admin_mtx); + ice_rdma_pf_detach(sc); + /* Free allocated media types */ ifmedia_removeall(sc->media); @@ -1315,6 +1323,7 @@ cpuset_t cpus; int bar, queues, vectors, requested; int err = 0; + int rdma; /* Allocate the MSI-X bar */ bar = scctx->isc_msix_bar; @@ -1360,11 +1369,24 @@ queues = imin(queues, sc->ifc_sysctl_ntxqs ?: scctx->isc_ntxqsets); queues = imin(queues, sc->ifc_sysctl_nrxqs ?: scctx->isc_nrxqsets); + if (ice_is_bit_set(sc->feat_cap, ICE_FEATURE_RDMA)) { + /* + * Choose a number of RDMA vectors based on the number of CPUs + * up to a maximum + */ + rdma = min(CPU_COUNT(&cpus), ICE_RDMA_MAX_MSIX); + + /* Further limit by the user configurable tunable */ + rdma = min(rdma, ice_rdma_max_msix); + } else { + rdma = 0; + } + /* * Determine the number of vectors to request. Note that we also need * to allocate one vector for administrative tasks. */ - requested = queues + 1; + requested = rdma + queues + 1; vectors = requested; @@ -1382,6 +1404,23 @@ device_printf(dev, "Requested %d MSI-X vectors, but got only %d\n", requested, vectors); + /* + * The OS didn't grant us the requested number of vectors. + * Check to see if we can reduce demands by limiting the + * number of vectors allocated to certain features. + */ + + if (rdma >= diff) { + /* Reduce the number of RDMA vectors we reserve */ + rdma -= diff; + diff = 0; + } else { + /* Disable RDMA and reduce the difference */ + ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap); + diff -= rdma; + rdma = 0; + } + /* * If we still have a difference, we need to reduce the number * of queue pairs. 
@@ -1399,6 +1438,9 @@ } device_printf(dev, "Using %d Tx and Rx queues\n", queues); + if (rdma) + device_printf(dev, "Reserving %d MSI-X interrupts for iRDMA\n", + rdma); device_printf(dev, "Using MSI-X interrupts with %d vectors\n", vectors); @@ -1407,6 +1449,8 @@ scctx->isc_ntxqsets = queues; scctx->isc_intr = IFLIB_INTR_MSIX; + sc->irdma_vectors = rdma; + /* Interrupt allocation tracking isn't required in recovery mode, * since neither RDMA nor VFs are enabled. */ @@ -1414,13 +1458,21 @@ return (0); /* Keep track of which interrupt indices are being used for what */ - sc->lan_vectors = vectors; + sc->lan_vectors = vectors - rdma; err = ice_resmgr_assign_contiguous(&sc->imgr, sc->pf_imap, sc->lan_vectors); if (err) { device_printf(dev, "Unable to assign PF interrupt mapping: %s\n", ice_err_str(err)); goto err_pci_release_msi; } + err = ice_resmgr_assign_contiguous(&sc->imgr, sc->rdma_imap, rdma); + if (err) { + device_printf(dev, "Unable to assign PF RDMA interrupt mapping: %s\n", + ice_err_str(err)); + ice_resmgr_release_map(&sc->imgr, sc->pf_imap, + sc->lan_vectors); + goto err_pci_release_msi; + } return (0); @@ -1923,6 +1975,8 @@ /* Configure promiscuous mode */ ice_if_promisc_set(ctx, if_getflags(sc->ifp)); + ice_rdma_pf_init(sc); + ice_set_state(&sc->state, ICE_STATE_DRIVER_INITIALIZED); return; @@ -2068,6 +2122,9 @@ /* Request that the device be re-initialized */ ice_request_stack_reinit(sc); + ice_rdma_pf_detach(sc); + ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap); + ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en); ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap); @@ -2113,6 +2170,9 @@ ice_set_bit(ICE_FEATURE_SAFE_MODE, sc->feat_cap); ice_set_bit(ICE_FEATURE_SAFE_MODE, sc->feat_en); + ice_rdma_pf_detach(sc); + ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap); + ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_en); ice_clear_bit(ICE_FEATURE_SRIOV, sc->feat_cap); @@ -2229,6 +2289,9 @@ if (ice_test_state(&sc->state, ICE_STATE_RECOVERY_MODE)) return; + /* stop the RDMA 
client */ + ice_rdma_pf_stop(sc); + /* Release the main PF VSI queue mappings */ ice_resmgr_release_map(&sc->tx_qmgr, sc->pf_vsi.tx_qmap, sc->pf_vsi.num_tx_queues); @@ -2487,6 +2550,8 @@ ice_get_link_status(sc->hw.port_info, &sc->link_up); ice_update_link_status(sc, true); + /* RDMA interface will be restarted by the stack re-init */ + /* Configure interrupt causes for the administrative interrupt */ ice_configure_misc_interrupts(sc); @@ -2640,6 +2705,7 @@ /* Set capabilities that all devices support */ ice_set_bit(ICE_FEATURE_SRIOV, sc->feat_cap); ice_set_bit(ICE_FEATURE_RSS, sc->feat_cap); + ice_set_bit(ICE_FEATURE_RDMA, sc->feat_cap); ice_set_bit(ICE_FEATURE_LENIENT_LINK_MODE, sc->feat_cap); ice_set_bit(ICE_FEATURE_LINK_MGMT_VER_1, sc->feat_cap); ice_set_bit(ICE_FEATURE_LINK_MGMT_VER_2, sc->feat_cap); @@ -2650,6 +2716,8 @@ /* Disable features due to hardware limitations... */ if (!sc->hw.func_caps.common_cap.rss_table_size) ice_clear_bit(ICE_FEATURE_RSS, sc->feat_cap); + if (!sc->hw.func_caps.common_cap.iwarp || !ice_enable_irdma) + ice_clear_bit(ICE_FEATURE_RDMA, sc->feat_cap); /* Disable features due to firmware limitations... */ if (!ice_is_fw_health_report_supported(&sc->hw)) ice_clear_bit(ICE_FEATURE_HEALTH_STATUS, sc->feat_cap); @@ -2802,6 +2870,8 @@ return; } + ice_rdma_pf_stop(sc); + /* Remove the MAC filters, stop Tx, and stop Rx. We don't check the * return of these functions because there's nothing we can really do * if they fail, and the functions already print error messages. Index: sys/dev/ice/irdma_di_if.m =================================================================== --- /dev/null +++ sys/dev/ice/irdma_di_if.m @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2021, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. 
Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# $FreeBSD$ + +/** + * @file irdma_di_if.m + * @brief RDMA client kobject driver interface + * + * KObject methods implemented by the ice driver. These functions are called + * by the RDMA client driver to connect with the ice driver and request + * operations or notify the driver of RDMA events. + */ +#include "ice_rdma.h" + +INTERFACE irdma_di; + +/** + * reset - Request the ice driver to perform a reset + * @peer: the RDMA peer structure + * + * Called by the RDMA client driver to request a reset of the ice device. 
+ */ +METHOD int reset { + struct ice_rdma_peer *peer; +}; + +/** + * msix_init - Initialize MSI-X resources for the RDMA driver + * @peer: the RDMA peer structure + * @msix_info: the requested MSI-X mapping + * + * Called by the RDMA client driver to request initialization of the MSI-X + * resources used for RDMA functionality. + */ +METHOD int msix_init { + struct ice_rdma_peer *peer; + struct ice_rdma_msix_mapping *msix_info; +}; + +/** + * qset_register_request - RDMA client interface request qset + * registration or deregistration + * @peer: the RDMA peer client structure + * @res: resources to be registered or unregistered + */ +METHOD int qset_register_request { + struct ice_rdma_peer *peer; + struct ice_rdma_qset_update *res; +}; + +/** + * vsi_filter_update - configure VSI information + * when opening or closing the RDMA driver + * @peer: the RDMA peer client structure + * @enable: enable or disable the RDMA filter + */ +METHOD int vsi_filter_update { + struct ice_rdma_peer *peer; + bool enable; +}; + +/** + * req_handler - handle requests incoming from the RDMA driver + * @peer: the RDMA peer client structure + * @req: structure containing the request + */ +METHOD void req_handler { + struct ice_rdma_peer *peer; + struct ice_rdma_request *req; +}; Index: sys/dev/ice/irdma_if.m =================================================================== --- /dev/null +++ sys/dev/ice/irdma_if.m @@ -0,0 +1,106 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright (c) 2021, Intel Corporation +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# 3. Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# $FreeBSD$ + +/** + * @file irdma_if.m + * @brief RDMA client kobject interface + * + * KObject methods implemented by the RDMA client driver. These functions will + * be called from the ice driver to notify the RDMA client driver of device + * driver events. + */ +#include "ice_rdma.h" + +INTERFACE irdma; + +/** + * probe - Notify the RDMA client driver that a peer device has been created + * @peer: the RDMA peer structure + * + * Called by the ice driver during attach to notify the RDMA client driver + * that a new PF has been initialized. 
+ */ +METHOD int probe { + struct ice_rdma_peer *peer; +}; + +/** + * open - Notify the RDMA client driver that a peer device has been opened + * @peer: the RDMA peer structure + * + * Called by the ice driver during the if_init routine to notify the RDMA + * client driver that a PF has been activated. + */ +METHOD int open { + struct ice_rdma_peer *peer; +}; + +/** + * close - Notify the RDMA client driver that a peer device has closed + * @peer: the RDMA peer structure + * + * Called by the ice driver during the if_stop routine to notify the RDMA + * client driver that a PF has been deactivated. + */ +METHOD int close { + struct ice_rdma_peer *peer; +}; + +/** + * remove - Notify the RDMA client driver that a peer device has been removed + * @peer: the RDMA peer structure + * + * Called by the ice driver during detach to notify the RDMA client driver + * that a PF has been removed. + */ +METHOD int remove { + struct ice_rdma_peer *peer; +}; + +/** + * link_change - Notify the RDMA client driver that link status has changed + * @peer: the RDMA peer structure + * @linkstate: link status + * @baudrate: link rate in bits per second + * + * Called by the ice driver when link status changes to notify the RDMA client + * driver of the new status. 
+ */ +METHOD void link_change { + struct ice_rdma_peer *peer; + int linkstate; + uint64_t baudrate; +}; + +METHOD void event_handler { + struct ice_rdma_peer *peer; + struct ice_rdma_event *event; +}; Index: sys/modules/ice/Makefile =================================================================== --- sys/modules/ice/Makefile +++ sys/modules/ice/Makefile @@ -3,12 +3,25 @@ .PATH: ${SRCTOP}/sys/dev/ice KMOD = if_ice + +# Interface headers SRCS = device_if.h bus_if.h pci_if.h ifdi_if.h +SRCS += irdma_di_if.h irdma_if.h + +# Option headers SRCS += opt_inet.h opt_inet6.h opt_rss.h opt_iflib.h + +# Core source SRCS += ice_lib.c ice_osdep.c ice_resmgr.c ice_strings.c SRCS += ice_iflib_recovery_txrx.c ice_iflib_txrx.c if_ice_iflib.c SRCS += ice_fw_logging.c +# RDMA Client interface +# TODO: Is this the right way to compile this? +SRCS += irdma_di_if.c irdma_if.c +CFLAGS.irdma_di_if.c += -I${SRCTOP}/sys/dev/ice +CFLAGS.irdma_if.c += -I${SRCTOP}/sys/dev/ice + # Shared source SRCS += ice_common.c ice_controlq.c ice_dcb.c ice_flex_pipe.c ice_flow.c SRCS += ice_nvm.c ice_sched.c ice_switch.c ice_vlan_mode.c ice_fwlog.c