diff --git a/sys/conf/files.x86 b/sys/conf/files.x86 --- a/sys/conf/files.x86 +++ b/sys/conf/files.x86 @@ -264,6 +264,12 @@ dev/isci/scil/scif_sas_timer.c optional isci dev/itwd/itwd.c optional itwd dev/kvm_clock/kvm_clock.c optional kvm_clock +dev/mana/gdma_main.c optional mana +dev/mana/mana_en.c optional mana +dev/mana/mana_sysctl.c optional mana +dev/mana/shm_channel.c optional mana +dev/mana/hw_channel.c optional mana +dev/mana/gdma_util.c optional mana dev/qat/qat.c optional qat dev/qat/qat_ae.c optional qat dev/qat/qat_c2xxx.c optional qat diff --git a/sys/dev/mana/gdma.h b/sys/dev/mana/gdma.h new file mode 100644 --- /dev/null +++ b/sys/dev/mana/gdma.h @@ -0,0 +1,744 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef _GDMA_H +#define _GDMA_H + +#include +#include +#include +#include +#include + +#include "gdma_util.h" +#include "shm_channel.h" + +/* Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. 
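+ * For example, struct gdma_sge below is a uint64_t followed by two
+ * uint32_t members: every field already sits on a boundary that is a
+ * multiple of its own size, so the compiler adds no padding and
+ * sizeof(struct gdma_sge) == 16, which is exactly the layout the
+ * hardware expects.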
+ */ + +#define GDMA_BAR0 0 + +#define GDMA_IRQNAME_SZ 40 + +struct gdma_bus { + bus_space_handle_t bar0_h; + bus_space_tag_t bar0_t; +}; + +struct gdma_msix_entry { + int entry; + int vector; +}; + +enum gdma_request_type { + GDMA_VERIFY_VF_DRIVER_VERSION = 1, + GDMA_QUERY_MAX_RESOURCES = 2, + GDMA_LIST_DEVICES = 3, + GDMA_REGISTER_DEVICE = 4, + GDMA_DEREGISTER_DEVICE = 5, + GDMA_GENERATE_TEST_EQE = 10, + GDMA_CREATE_QUEUE = 12, + GDMA_DISABLE_QUEUE = 13, + GDMA_CREATE_DMA_REGION = 25, + GDMA_DMA_REGION_ADD_PAGES = 26, + GDMA_DESTROY_DMA_REGION = 27, +}; + +enum gdma_queue_type { + GDMA_INVALID_QUEUE, + GDMA_SQ, + GDMA_RQ, + GDMA_CQ, + GDMA_EQ, +}; + +enum gdma_work_request_flags { + GDMA_WR_NONE = 0, + GDMA_WR_OOB_IN_SGL = BIT(0), + GDMA_WR_PAD_BY_SGE0 = BIT(1), +}; + +enum gdma_eqe_type { + GDMA_EQE_COMPLETION = 3, + GDMA_EQE_TEST_EVENT = 64, + GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, + GDMA_EQE_HWC_INIT_DATA = 130, + GDMA_EQE_HWC_INIT_DONE = 131, +}; + +enum { + GDMA_DEVICE_NONE = 0, + GDMA_DEVICE_HWC = 1, + GDMA_DEVICE_MANA = 2, +}; + + +struct gdma_resource { + /* Protect the bitmap */ + struct mtx lock_spin; + + /* The bitmap size in bits. */ + uint32_t size; + + /* The bitmap tracks the resources. */ + unsigned long *map; +}; + +union gdma_doorbell_entry { + uint64_t as_uint64; + + struct { + uint64_t id : 24; + uint64_t reserved : 8; + uint64_t tail_ptr : 31; + uint64_t arm : 1; + } cq; + + struct { + uint64_t id : 24; + uint64_t wqe_cnt : 8; + uint64_t tail_ptr : 32; + } rq; + + struct { + uint64_t id : 24; + uint64_t reserved : 8; + uint64_t tail_ptr : 32; + } sq; + + struct { + uint64_t id : 16; + uint64_t reserved : 16; + uint64_t tail_ptr : 31; + uint64_t arm : 1; + } eq; +}; /* HW DATA */ + +struct gdma_msg_hdr { + uint32_t hdr_type; + uint32_t msg_type; + uint16_t msg_version; + uint16_t hwc_msg_id; + uint32_t msg_size; +}; /* HW DATA */ + +struct gdma_dev_id { + union { + struct { + uint16_t type; + uint16_t instance; + }; + + uint32_t as_uint32; + }; +}; /* HW DATA */ + +struct gdma_req_hdr { + struct gdma_msg_hdr req; + struct gdma_msg_hdr resp; /* The expected response */ + struct gdma_dev_id dev_id; + uint32_t activity_id; +}; /* HW DATA */ + +struct gdma_resp_hdr { + struct gdma_msg_hdr response; + struct gdma_dev_id dev_id; + uint32_t activity_id; + uint32_t status; + uint32_t reserved; +}; /* HW DATA */ + +struct gdma_general_req { + struct gdma_req_hdr hdr; +}; /* HW DATA */ + +#define GDMA_MESSAGE_V1 1 + +struct gdma_general_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +#define GDMA_STANDARD_HEADER_TYPE 0 + +static inline void +mana_gd_init_req_hdr(struct gdma_req_hdr *hdr, uint32_t code, + uint32_t req_size, uint32_t resp_size) +{ + hdr->req.hdr_type = GDMA_STANDARD_HEADER_TYPE; + hdr->req.msg_type = code; + hdr->req.msg_version = GDMA_MESSAGE_V1; + hdr->req.msg_size = req_size; + + hdr->resp.hdr_type = GDMA_STANDARD_HEADER_TYPE; + hdr->resp.msg_type = code; + hdr->resp.msg_version = GDMA_MESSAGE_V1; + hdr->resp.msg_size = resp_size; +} + +/* The 16-byte struct is part of the GDMA work queue entry (WQE). 
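+ * A posted WQE consists of a struct gdma_wqe header, an optional inline
+ * OOB area and then num_sge of these SGEs, rounded up to the 32-byte
+ * Basic Unit (see mana_gd_post_work_request()).  For example, a send WQE
+ * with an 8-byte inline OOB and two SGEs takes
+ * ALIGN(8 + 8 + 2 * 16, 32) = 64 bytes, i.e. two BUs.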
*/ +struct gdma_sge { + uint64_t address; + uint32_t mem_key; + uint32_t size; +}; /* HW DATA */ + +struct gdma_wqe_request { + struct gdma_sge *sgl; + uint32_t num_sge; + + uint32_t inline_oob_size; + const void *inline_oob_data; + + uint32_t flags; + uint32_t client_data_unit; +}; + +enum gdma_page_type { + GDMA_PAGE_TYPE_4K, +}; + +#define GDMA_INVALID_DMA_REGION 0 + +struct gdma_mem_info { + device_t dev; + + bus_dma_tag_t dma_tag; + bus_dmamap_t dma_map; + bus_addr_t dma_handle; /* Physical address */ + void *virt_addr; /* Virtual address */ + uint64_t length; + + /* Allocated by the PF driver */ + uint64_t gdma_region; +}; + +#define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8 + +struct gdma_dev { + struct gdma_context *gdma_context; + + struct gdma_dev_id dev_id; + + uint32_t pdid; + uint32_t doorbell; + uint32_t gpa_mkey; + + /* GDMA driver specific pointer */ + void *driver_data; +}; + +#define MINIMUM_SUPPORTED_PAGE_SIZE PAGE_SIZE + +#define GDMA_CQE_SIZE 64 +#define GDMA_EQE_SIZE 16 +#define GDMA_MAX_SQE_SIZE 512 +#define GDMA_MAX_RQE_SIZE 256 + +#define GDMA_COMP_DATA_SIZE 0x3C + +#define GDMA_EVENT_DATA_SIZE 0xC + +/* The WQE size must be a multiple of the Basic Unit, which is 32 bytes. */ +#define GDMA_WQE_BU_SIZE 32 + +#define INVALID_PDID UINT_MAX +#define INVALID_DOORBELL UINT_MAX +#define INVALID_MEM_KEY UINT_MAX +#define INVALID_QUEUE_ID UINT_MAX +#define INVALID_PCI_MSIX_INDEX UINT_MAX + +struct gdma_comp { + uint32_t cqe_data[GDMA_COMP_DATA_SIZE / 4]; + uint32_t wq_num; + bool is_sq; +}; + +struct gdma_event { + uint32_t details[GDMA_EVENT_DATA_SIZE / 4]; + uint8_t type; +}; + +struct gdma_queue; + +#define CQE_POLLING_BUFFER 512 + +typedef void gdma_eq_callback(void *context, struct gdma_queue *q, + struct gdma_event *e); + +typedef void gdma_cq_callback(void *context, struct gdma_queue *q); + +/* The 'head' is the producer index. For SQ/RQ, when the driver posts a WQE + * (Note: the WQE size must be a multiple of the 32-byte Basic Unit), the + * driver increases the 'head' in BUs rather than in bytes, and notifies + * the HW of the updated head. For EQ/CQ, the driver uses the 'head' to track + * the HW head, and increases the 'head' by 1 for every processed EQE/CQE. + * + * The 'tail' is the consumer index for SQ/RQ. After the CQE of the SQ/RQ is + * processed, the driver increases the 'tail' to indicate that WQEs have + * been consumed by the HW, so the driver can post new WQEs into the SQ/RQ. + * + * The driver doesn't use the 'tail' for EQ/CQ, because the driver ensures + * that the EQ/CQ is big enough so they can't overflow, and the driver uses + * the owner bits mechanism to detect if the queue has become empty. + */ +struct gdma_queue { + struct gdma_dev *gdma_dev; + + enum gdma_queue_type type; + uint32_t id; + + struct gdma_mem_info mem_info; + + void *queue_mem_ptr; + uint32_t queue_size; + + bool monitor_avl_buf; + + uint32_t head; + uint32_t tail; + + /* Extra fields specific to EQ/CQ. */ + union { + struct { + bool disable_needed; + + gdma_eq_callback *callback; + void *context; + + unsigned int msix_index; + + uint32_t log2_throttle_limit; + + struct task cleanup_task; + struct taskqueue *cleanup_tq; + int cpu; + bool do_not_ring_db; + + int work_done; + int budget; + } eq; + + struct { + gdma_cq_callback *callback; + void *context; + + /* For CQ/EQ relationship */ + struct gdma_queue *parent; + } cq; + }; +}; + +struct gdma_queue_spec { + enum gdma_queue_type type; + bool monitor_avl_buf; + unsigned int queue_size; + + /* Extra fields specific to EQ/CQ. 
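+	 * For an EQ the spec supplies the event callback, the throttle limit
+	 * and (for the MANA device) the ifnet used when binding the cleanup
+	 * taskqueue; for a CQ it only names the parent EQ that delivers its
+	 * completion events.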
*/ + union { + struct { + gdma_eq_callback *callback; + void *context; + + unsigned long log2_throttle_limit; + + /* Only used by the MANA device. */ + struct ifnet *ndev; + } eq; + + struct { + gdma_cq_callback *callback; + void *context; + + struct gdma_queue *parent_eq; + + } cq; + }; +}; + +struct mana_eq { + struct gdma_queue *eq; + struct gdma_comp cqe_poll[CQE_POLLING_BUFFER]; +}; + +struct gdma_irq_context { + struct gdma_msix_entry msix_e; + struct resource *res; + driver_intr_t *handler; + void *arg; + void *cookie; + bool requested; + int cpu; + char name[GDMA_IRQNAME_SZ]; +}; + +struct gdma_context { + device_t dev; + + struct gdma_bus gd_bus; + + /* Per-vPort max number of queues */ + unsigned int max_num_queues; + unsigned int max_num_msix; + unsigned int num_msix_usable; + struct gdma_resource msix_resource; + struct gdma_irq_context *irq_contexts; + + /* This maps a CQ index to the queue structure. */ + unsigned int max_num_cqs; + struct gdma_queue **cq_table; + + /* Protect eq_test_event and test_event_eq_id */ + struct sx eq_test_event_sx; + struct completion eq_test_event; + uint32_t test_event_eq_id; + + struct resource *bar0; + struct resource *msix; + int msix_rid; + void __iomem *shm_base; + void __iomem *db_page_base; + uint32_t db_page_size; + + /* Shared memory chanenl (used to bootstrap HWC) */ + struct shm_channel shm_channel; + + /* Hardware communication channel (HWC) */ + struct gdma_dev hwc; + + /* Azure network adapter */ + struct gdma_dev mana; +}; + +#define MAX_NUM_GDMA_DEVICES 4 + +static inline bool mana_gd_is_mana(struct gdma_dev *gd) +{ + return gd->dev_id.type == GDMA_DEVICE_MANA; +} + +static inline bool mana_gd_is_hwc(struct gdma_dev *gd) +{ + return gd->dev_id.type == GDMA_DEVICE_HWC; +} + +uint8_t *mana_gd_get_wqe_ptr(const struct gdma_queue *wq, uint32_t wqe_offset); +uint32_t mana_gd_wq_avail_space(struct gdma_queue *wq); + +int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq); + +int mana_gd_create_hwc_queue(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +int mana_gd_create_mana_eq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue); + +int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe); + +void mana_gd_arm_cq(struct gdma_queue *cq); + +struct gdma_wqe { + uint32_t reserved :24; + uint32_t last_vbytes :8; + + union { + uint32_t flags; + + struct { + uint32_t num_sge :8; + uint32_t inline_oob_size_div4 :3; + uint32_t client_oob_in_sgl :1; + uint32_t reserved1 :4; + uint32_t client_data_unit :14; + uint32_t reserved2 :2; + }; + }; +}; /* HW DATA */ + +#define INLINE_OOB_SMALL_SIZE 8 +#define INLINE_OOB_LARGE_SIZE 24 + +#define MAX_TX_WQE_SIZE 512 +#define MAX_RX_WQE_SIZE 256 + +struct gdma_cqe { + uint32_t cqe_data[GDMA_COMP_DATA_SIZE / 4]; + + union { + uint32_t as_uint32; + + struct { + uint32_t wq_num :24; + uint32_t is_sq :1; + uint32_t reserved :4; + uint32_t owner_bits :3; + }; + } cqe_info; +}; /* HW DATA */ + +#define GDMA_CQE_OWNER_BITS 3 + +#define GDMA_CQE_OWNER_MASK ((1 << GDMA_CQE_OWNER_BITS) - 1) + +#define SET_ARM_BIT 1 + +#define GDMA_EQE_OWNER_BITS 3 + +union gdma_eqe_info { + uint32_t as_uint32; + + struct { + uint32_t type : 8; + uint32_t reserved1 : 8; + uint32_t client_id : 2; + uint32_t 
reserved2 : 11; + uint32_t owner_bits : 3; + }; +}; /* HW DATA */ + +#define GDMA_EQE_OWNER_MASK ((1 << GDMA_EQE_OWNER_BITS) - 1) +#define INITIALIZED_OWNER_BIT(log2_num_entries) (1UL << (log2_num_entries)) + +struct gdma_eqe { + uint32_t details[GDMA_EVENT_DATA_SIZE / 4]; + uint32_t eqe_info; +}; /* HW DATA */ + +#define GDMA_REG_DB_PAGE_OFFSET 8 +#define GDMA_REG_DB_PAGE_SIZE 0x10 +#define GDMA_REG_SHM_OFFSET 0x18 + +struct gdma_posted_wqe_info { + uint32_t wqe_size_in_bu; +}; + +/* GDMA_GENERATE_TEST_EQE */ +struct gdma_generate_test_event_req { + struct gdma_req_hdr hdr; + uint32_t queue_index; +}; /* HW DATA */ + +/* GDMA_VERIFY_VF_DRIVER_VERSION */ +enum { + GDMA_PROTOCOL_V1 = 1, + GDMA_PROTOCOL_FIRST = GDMA_PROTOCOL_V1, + GDMA_PROTOCOL_LAST = GDMA_PROTOCOL_V1, +}; + +struct gdma_verify_ver_req { + struct gdma_req_hdr hdr; + + /* Mandatory fields required for protocol establishment */ + uint64_t protocol_ver_min; + uint64_t protocol_ver_max; + uint64_t drv_cap_flags1; + uint64_t drv_cap_flags2; + uint64_t drv_cap_flags3; + uint64_t drv_cap_flags4; + + /* Advisory fields */ + uint64_t drv_ver; + uint32_t os_type; /* Linux = 0x10; Windows = 0x20; Other = 0x30 */ + uint32_t reserved; + uint32_t os_ver_major; + uint32_t os_ver_minor; + uint32_t os_ver_build; + uint32_t os_ver_platform; + uint64_t reserved_2; + uint8_t os_ver_str1[128]; + uint8_t os_ver_str2[128]; + uint8_t os_ver_str3[128]; + uint8_t os_ver_str4[128]; +}; /* HW DATA */ + +struct gdma_verify_ver_resp { + struct gdma_resp_hdr hdr; + uint64_t gdma_protocol_ver; + uint64_t pf_cap_flags1; + uint64_t pf_cap_flags2; + uint64_t pf_cap_flags3; + uint64_t pf_cap_flags4; +}; /* HW DATA */ + +/* GDMA_QUERY_MAX_RESOURCES */ +struct gdma_query_max_resources_resp { + struct gdma_resp_hdr hdr; + uint32_t status; + uint32_t max_sq; + uint32_t max_rq; + uint32_t max_cq; + uint32_t max_eq; + uint32_t max_db; + uint32_t max_mst; + uint32_t max_cq_mod_ctx; + uint32_t max_mod_cq; + uint32_t max_msix; +}; /* HW DATA */ + +/* GDMA_LIST_DEVICES */ +struct gdma_list_devices_resp { + struct gdma_resp_hdr hdr; + uint32_t num_of_devs; + uint32_t reserved; + struct gdma_dev_id devs[64]; +}; /* HW DATA */ + +/* GDMA_REGISTER_DEVICE */ +struct gdma_register_device_resp { + struct gdma_resp_hdr hdr; + uint32_t pdid; + uint32_t gpa_mkey; + uint32_t db_id; +}; /* HW DATA */ + +/* GDMA_CREATE_QUEUE */ +struct gdma_create_queue_req { + struct gdma_req_hdr hdr; + uint32_t type; + uint32_t reserved1; + uint32_t pdid; + uint32_t doolbell_id; + uint64_t gdma_region; + uint32_t reserved2; + uint32_t queue_size; + uint32_t log2_throttle_limit; + uint32_t eq_pci_msix_index; + uint32_t cq_mod_ctx_id; + uint32_t cq_parent_eq_id; + uint8_t rq_drop_on_overrun; + uint8_t rq_err_on_wqe_overflow; + uint8_t rq_chain_rec_wqes; + uint8_t sq_hw_db; + uint32_t reserved3; +}; /* HW DATA */ + +struct gdma_create_queue_resp { + struct gdma_resp_hdr hdr; + uint32_t queue_index; +}; /* HW DATA */ + +/* GDMA_DISABLE_QUEUE */ +struct gdma_disable_queue_req { + struct gdma_req_hdr hdr; + uint32_t type; + uint32_t queue_index; + uint32_t alloc_res_id_on_creation; +}; /* HW DATA */ + +/* GDMA_CREATE_DMA_REGION */ +struct gdma_create_dma_region_req { + struct gdma_req_hdr hdr; + + /* The total size of the DMA region */ + uint64_t length; + + /* The offset in the first page */ + uint32_t offset_in_page; + + /* enum gdma_page_type */ + uint32_t gdma_page_type; + + /* The total number of pages */ + uint32_t page_count; + + /* If page_addr_list_len is smaller than page_count, + * the 
remaining page addresses will be added via the + * message GDMA_DMA_REGION_ADD_PAGES. + */ + uint32_t page_addr_list_len; + uint64_t page_addr_list[]; +}; /* HW DATA */ + +struct gdma_create_dma_region_resp { + struct gdma_resp_hdr hdr; + uint64_t gdma_region; +}; /* HW DATA */ + +/* GDMA_DMA_REGION_ADD_PAGES */ +struct gdma_dma_region_add_pages_req { + struct gdma_req_hdr hdr; + + uint64_t gdma_region; + + uint32_t page_addr_list_len; + uint32_t reserved3; + + uint64_t page_addr_list[]; +}; /* HW DATA */ + +/* GDMA_DESTROY_DMA_REGION */ +struct gdma_destroy_dma_region_req { + struct gdma_req_hdr hdr; + + uint64_t gdma_region; +}; /* HW DATA */ + +int mana_gd_verify_vf_version(device_t dev); + +int mana_gd_register_device(struct gdma_dev *gd); +int mana_gd_deregister_device(struct gdma_dev *gd); + +int mana_gd_post_work_request(struct gdma_queue *wq, + const struct gdma_wqe_request *wqe_req, + struct gdma_posted_wqe_info *wqe_info); + +int mana_gd_post_and_ring(struct gdma_queue *queue, + const struct gdma_wqe_request *wqe, + struct gdma_posted_wqe_info *wqe_info); + +int mana_gd_alloc_res_map(uint32_t res_avil, struct gdma_resource *r, + const char *lock_name); +void mana_gd_free_res_map(struct gdma_resource *r); + +void mana_gd_wq_ring_doorbell(struct gdma_context *gc, + struct gdma_queue *queue); + +int mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length, + struct gdma_mem_info *gmi); + +void mana_gd_free_memory(struct gdma_mem_info *gmi); + +void mana_gd_dma_map_paddr(void *arg, bus_dma_segment_t *segs, + int nseg, int error); + +int mana_gd_send_request(struct gdma_context *gc, uint32_t req_len, + const void *req, uint32_t resp_len, void *resp); +#endif /* _GDMA_H */ diff --git a/sys/dev/mana/gdma_main.c b/sys/dev/mana/gdma_main.c new file mode 100644 --- /dev/null +++ b/sys/dev/mana/gdma_main.c @@ -0,0 +1,1961 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#include "gdma_util.h" +#include "mana.h" + + +static mana_vendor_id_t mana_id_table[] = { + { PCI_VENDOR_ID_MICROSOFT, PCI_DEV_ID_MANA_VF}, + /* Last entry */ + { 0, 0} +}; + +static inline uint32_t +mana_gd_r32(struct gdma_context *g, uint64_t offset) +{ + uint32_t v = bus_space_read_4(g->gd_bus.bar0_t, + g->gd_bus.bar0_h, offset); + rmb(); + return (v); +} + +#if defined(__amd64__) +static inline uint64_t +mana_gd_r64(struct gdma_context *g, uint64_t offset) +{ + uint64_t v = bus_space_read_8(g->gd_bus.bar0_t, + g->gd_bus.bar0_h, offset); + rmb(); + return (v); +} +#else +static inline uint64_t +mana_gd_r64(struct gdma_context *g, uint64_t offset) +{ + uint64_t v; + uint32_t *vp = (uint32_t *)&v; + + *vp = mana_gd_r32(g, offset); + *(vp + 1) = mana_gd_r32(g, offset + 4); + rmb(); + return (v); +} +#endif + +static int +mana_gd_query_max_resources(device_t dev) +{ + struct gdma_context *gc = device_get_softc(dev); + struct gdma_query_max_resources_resp resp = {}; + struct gdma_general_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_QUERY_MAX_RESOURCES, + sizeof(req), sizeof(resp)); + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "Failed to query resource info: %d, 0x%x\n", + err, resp.hdr.status); + return err ? err : EPROTO; + } + + mana_dbg(NULL, "max_msix %u, max_eq %u, max_cq %u, " + "max_sq %u, max_rq %u\n", + resp.max_msix, resp.max_eq, resp.max_cq, + resp.max_sq, resp.max_rq); + + if (gc->num_msix_usable > resp.max_msix) + gc->num_msix_usable = resp.max_msix; + + if (gc->num_msix_usable <= 1) + return ENOSPC; + + gc->max_num_queues = mp_ncpus; + if (gc->max_num_queues > MANA_MAX_NUM_QUEUES) + gc->max_num_queues = MANA_MAX_NUM_QUEUES; + + if (gc->max_num_queues > resp.max_eq) + gc->max_num_queues = resp.max_eq; + + if (gc->max_num_queues > resp.max_cq) + gc->max_num_queues = resp.max_cq; + + if (gc->max_num_queues > resp.max_sq) + gc->max_num_queues = resp.max_sq; + + if (gc->max_num_queues > resp.max_rq) + gc->max_num_queues = resp.max_rq; + + return 0; +} + +static int +mana_gd_detect_devices(device_t dev) +{ + struct gdma_context *gc = device_get_softc(dev); + struct gdma_list_devices_resp resp = {}; + struct gdma_general_req req = {}; + struct gdma_dev_id gd_dev; + uint32_t i, max_num_devs; + uint16_t dev_type; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_LIST_DEVICES, sizeof(req), + sizeof(resp)); + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "Failed to detect devices: %d, 0x%x\n", err, + resp.hdr.status); + return err ? err : EPROTO; + } + + max_num_devs = min_t(uint32_t, MAX_NUM_GDMA_DEVICES, resp.num_of_devs); + + for (i = 0; i < max_num_devs; i++) { + gd_dev = resp.devs[i]; + dev_type = gd_dev.type; + + mana_dbg(NULL, "gdma dev %d, type %u\n", + i, dev_type); + + /* HWC is already detected in mana_hwc_create_channel(). */ + if (dev_type == GDMA_DEVICE_HWC) + continue; + + if (dev_type == GDMA_DEVICE_MANA) { + gc->mana.gdma_context = gc; + gc->mana.dev_id = gd_dev; + } + } + + return gc->mana.dev_id.type == 0 ? 
ENODEV : 0; +} + +int +mana_gd_send_request(struct gdma_context *gc, uint32_t req_len, + const void *req, uint32_t resp_len, void *resp) +{ + struct hw_channel_context *hwc = gc->hwc.driver_data; + + return mana_hwc_send_request(hwc, req_len, req, resp_len, resp); +} + +void +mana_gd_dma_map_paddr(void *arg, bus_dma_segment_t *segs, int nseg, int error) +{ + bus_addr_t *paddr = arg; + + if (error) + return; + + KASSERT(nseg == 1, ("too many segments %d!", nseg)); + *paddr = segs->ds_addr; +} + +int +mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length, + struct gdma_mem_info *gmi) +{ + bus_addr_t dma_handle; + void *buf; + int err; + + if (!gc || !gmi) + return EINVAL; + + if (length < PAGE_SIZE || (length != roundup_pow_of_two(length))) + return EINVAL; + + err = bus_dma_tag_create(bus_get_dma_tag(gc->dev), /* parent */ + PAGE_SIZE, 0, /* alignment, boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + length, /* maxsize */ + 1, /* nsegments */ + length, /* maxsegsize */ + 0, /* flags */ + NULL, NULL, /* lockfunc, lockfuncarg*/ + &gmi->dma_tag); + if (err) { + device_printf(gc->dev, + "failed to create dma tag, err: %d\n", err); + return (err); + } + + /* + * Must have BUS_DMA_ZERO flag to clear the dma memory. + * Otherwise the queue overflow detection mechanism does + * not work. + */ + err = bus_dmamem_alloc(gmi->dma_tag, &buf, + BUS_DMA_NOWAIT | BUS_DMA_COHERENT | BUS_DMA_ZERO, &gmi->dma_map); + if (err) { + device_printf(gc->dev, + "failed to alloc dma mem, err: %d\n", err); + bus_dma_tag_destroy(gmi->dma_tag); + return (err); + } + + err = bus_dmamap_load(gmi->dma_tag, gmi->dma_map, buf, + length, mana_gd_dma_map_paddr, &dma_handle, BUS_DMA_NOWAIT); + if (err) { + device_printf(gc->dev, + "failed to load dma mem, err: %d\n", err); + bus_dmamem_free(gmi->dma_tag, buf, gmi->dma_map); + bus_dma_tag_destroy(gmi->dma_tag); + return (err); + } + + gmi->dev = gc->dev; + gmi->dma_handle = dma_handle; + gmi->virt_addr = buf; + gmi->length = length; + + return 0; +} + +void +mana_gd_free_memory(struct gdma_mem_info *gmi) +{ + bus_dmamap_unload(gmi->dma_tag, gmi->dma_map); + bus_dmamem_free(gmi->dma_tag, gmi->virt_addr, gmi->dma_map); + bus_dma_tag_destroy(gmi->dma_tag); +} + +static int +mana_gd_create_hw_eq(struct gdma_context *gc, + struct gdma_queue *queue) +{ + struct gdma_create_queue_resp resp = {}; + struct gdma_create_queue_req req = {}; + int err; + + if (queue->type != GDMA_EQ) + return EINVAL; + + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_QUEUE, + sizeof(req), sizeof(resp)); + + req.hdr.dev_id = queue->gdma_dev->dev_id; + req.type = queue->type; + req.pdid = queue->gdma_dev->pdid; + req.doolbell_id = queue->gdma_dev->doorbell; + req.gdma_region = queue->mem_info.gdma_region; + req.queue_size = queue->queue_size; + req.log2_throttle_limit = queue->eq.log2_throttle_limit; + req.eq_pci_msix_index = queue->eq.msix_index; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "Failed to create queue: %d, 0x%x\n", + err, resp.hdr.status); + return err ? 
err : EPROTO; + } + + queue->id = resp.queue_index; + queue->eq.disable_needed = true; + queue->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + return 0; +} + +static +int mana_gd_disable_queue(struct gdma_queue *queue) +{ + struct gdma_context *gc = queue->gdma_dev->gdma_context; + struct gdma_disable_queue_req req = {}; + struct gdma_general_resp resp = {}; + int err; + + if (queue->type != GDMA_EQ) + mana_warn(NULL, "Not event queue type 0x%x\n", + queue->type); + + mana_gd_init_req_hdr(&req.hdr, GDMA_DISABLE_QUEUE, + sizeof(req), sizeof(resp)); + + req.hdr.dev_id = queue->gdma_dev->dev_id; + req.type = queue->type; + req.queue_index = queue->id; + req.alloc_res_id_on_creation = 1; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "Failed to disable queue: %d, 0x%x\n", err, + resp.hdr.status); + return err ? err : EPROTO; + } + + return 0; +} + +#define DOORBELL_OFFSET_SQ 0x0 +#define DOORBELL_OFFSET_RQ 0x400 +#define DOORBELL_OFFSET_CQ 0x800 +#define DOORBELL_OFFSET_EQ 0xFF8 + +static void +mana_gd_ring_doorbell(struct gdma_context *gc, uint32_t db_index, + enum gdma_queue_type q_type, uint32_t qid, + uint32_t tail_ptr, uint8_t num_req) +{ + union gdma_doorbell_entry e = {}; + void __iomem *addr; + + addr = (char *)gc->db_page_base + gc->db_page_size * db_index; + switch (q_type) { + case GDMA_EQ: + e.eq.id = qid; + e.eq.tail_ptr = tail_ptr; + e.eq.arm = num_req; + + addr = (char *)addr + DOORBELL_OFFSET_EQ; + break; + + case GDMA_CQ: + e.cq.id = qid; + e.cq.tail_ptr = tail_ptr; + e.cq.arm = num_req; + + addr = (char *)addr + DOORBELL_OFFSET_CQ; + break; + + case GDMA_RQ: + e.rq.id = qid; + e.rq.tail_ptr = tail_ptr; + e.rq.wqe_cnt = num_req; + + addr = (char *)addr + DOORBELL_OFFSET_RQ; + break; + + case GDMA_SQ: + e.sq.id = qid; + e.sq.tail_ptr = tail_ptr; + + addr = (char *)addr + DOORBELL_OFFSET_SQ; + break; + + default: + mana_warn(NULL, "Invalid queue type 0x%x\n", q_type); + return; + } + + /* Ensure all writes are done before ring doorbell */ + wmb(); + +#if defined(__amd64__) + writeq(addr, e.as_uint64); +#else + uint32_t *p = (uint32_t *)&e.as_uint64; + writel(addr, *p); + writel((char *)addr + 4, *(p + 1)); +#endif +} + +void +mana_gd_wq_ring_doorbell(struct gdma_context *gc, struct gdma_queue *queue) +{ + mana_gd_ring_doorbell(gc, queue->gdma_dev->doorbell, queue->type, + queue->id, queue->head * GDMA_WQE_BU_SIZE, 1); +} + +void +mana_gd_arm_cq(struct gdma_queue *cq) +{ + struct gdma_context *gc = cq->gdma_dev->gdma_context; + + uint32_t num_cqe = cq->queue_size / GDMA_CQE_SIZE; + + uint32_t head = cq->head % (num_cqe << GDMA_CQE_OWNER_BITS); + + mana_gd_ring_doorbell(gc, cq->gdma_dev->doorbell, cq->type, cq->id, + head, SET_ARM_BIT); +} + +static void +mana_gd_process_eqe(struct gdma_queue *eq) +{ + uint32_t head = eq->head % (eq->queue_size / GDMA_EQE_SIZE); + struct gdma_context *gc = eq->gdma_dev->gdma_context; + struct gdma_eqe *eq_eqe_ptr = eq->queue_mem_ptr; + union gdma_eqe_info eqe_info; + enum gdma_eqe_type type; + struct gdma_event event; + struct gdma_queue *cq; + struct gdma_eqe *eqe; + uint32_t cq_id; + + eqe = &eq_eqe_ptr[head]; + eqe_info.as_uint32 = eqe->eqe_info; + type = eqe_info.type; + + switch (type) { + case GDMA_EQE_COMPLETION: + cq_id = eqe->details[0] & 0xFFFFFF; + if (cq_id >= gc->max_num_cqs) { + mana_warn(NULL, + "failed: cq_id %u > max_num_cqs %u\n", + cq_id, gc->max_num_cqs); + break; + } + + cq = gc->cq_table[cq_id]; + if (!cq || cq->type != GDMA_CQ || 
cq->id != cq_id) { + mana_warn(NULL, + "failed: invalid cq_id %u\n", cq_id); + break; + } + + if (cq->cq.callback) + cq->cq.callback(cq->cq.context, cq); + + break; + + case GDMA_EQE_TEST_EVENT: + gc->test_event_eq_id = eq->id; + + mana_dbg(NULL, + "EQE TEST EVENT received for EQ %u\n", eq->id); + + complete(&gc->eq_test_event); + break; + + case GDMA_EQE_HWC_INIT_EQ_ID_DB: + case GDMA_EQE_HWC_INIT_DATA: + case GDMA_EQE_HWC_INIT_DONE: + if (!eq->eq.callback) + break; + + event.type = type; + memcpy(&event.details, &eqe->details, GDMA_EVENT_DATA_SIZE); + eq->eq.callback(eq->eq.context, eq, &event); + break; + + default: + break; + } +} + +static void +mana_gd_process_eq_events(void *arg) +{ + uint32_t owner_bits, new_bits, old_bits; + union gdma_eqe_info eqe_info; + struct gdma_eqe *eq_eqe_ptr; + struct gdma_queue *eq = arg; + struct gdma_context *gc; + uint32_t head, num_eqe; + struct gdma_eqe *eqe; + unsigned int arm_bit; + int i, j; + + gc = eq->gdma_dev->gdma_context; + + num_eqe = eq->queue_size / GDMA_EQE_SIZE; + eq_eqe_ptr = eq->queue_mem_ptr; + + bus_dmamap_sync(eq->mem_info.dma_tag, eq->mem_info.dma_map, + BUS_DMASYNC_POSTREAD); + + /* Process up to 5 EQEs at a time, and update the HW head. */ + for (i = 0; i < 5; i++) { + eqe = &eq_eqe_ptr[eq->head % num_eqe]; + eqe_info.as_uint32 = eqe->eqe_info; + owner_bits = eqe_info.owner_bits; + + old_bits = (eq->head / num_eqe - 1) & GDMA_EQE_OWNER_MASK; + + /* No more entries */ + if (owner_bits == old_bits) + break; + + new_bits = (eq->head / num_eqe) & GDMA_EQE_OWNER_MASK; + if (owner_bits != new_bits) { + /* Something wrong. Log for debugging purpose */ + device_printf(gc->dev, + "EQ %d: overflow detected, " + "i = %d, eq->head = %u " + "got owner_bits = %u, new_bits = %u " + "eqe addr %p, eqe->eqe_info 0x%x, " + "eqe type = %x, reserved1 = %x, client_id = %x, " + "reserved2 = %x, owner_bits = %x\n", + eq->id, i, eq->head, + owner_bits, new_bits, + eqe, eqe->eqe_info, + eqe_info.type, eqe_info.reserved1, + eqe_info.client_id, eqe_info.reserved2, + eqe_info.owner_bits); + + uint32_t *eqe_dump = (uint32_t *) eq_eqe_ptr; + for (j = 0; j < 20; j++) { + device_printf(gc->dev, "%p: %x\t%x\t%x\t%x\n", + &eqe_dump[j * 4], eqe_dump[j * 4], eqe_dump[j * 4 + 1], + eqe_dump[j * 4 + 2], eqe_dump[j * 4 + 3]); + } + break; + } + + mana_gd_process_eqe(eq); + + eq->head++; + } + + bus_dmamap_sync(eq->mem_info.dma_tag, eq->mem_info.dma_map, + BUS_DMASYNC_PREREAD); + + /* Always rearm the EQ for HWC. */ + if (mana_gd_is_hwc(eq->gdma_dev)) { + arm_bit = SET_ARM_BIT; + } else if (eq->eq.work_done < eq->eq.budget && + eq->eq.do_not_ring_db == false) { + arm_bit = SET_ARM_BIT; + } else { + arm_bit = 0; + } + + head = eq->head % (num_eqe << GDMA_EQE_OWNER_BITS); + + mana_gd_ring_doorbell(gc, eq->gdma_dev->doorbell, eq->type, eq->id, + head, arm_bit); +} + +#define MANA_POLL_BUDGET 8 +#define MANA_RX_BUDGET 256 + +static void +mana_poll(void *arg, int pending) +{ + struct gdma_queue *eq = arg; + int i; + + eq->eq.work_done = 0; + eq->eq.budget = MANA_RX_BUDGET; + + for (i = 0; i < MANA_POLL_BUDGET; i++) { + /* + * If this is the last loop, set the budget big enough + * so it will arm the EQ any way. 
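+		 * mana_gd_process_eq_events() only re-arms the EQ when
+		 * eq->eq.work_done < eq->eq.budget (and do_not_ring_db is
+		 * false), so raising the budget to CQE_POLLING_BUFFER + 1 on
+		 * the final pass makes that test true and guarantees the
+		 * doorbell is written with SET_ARM_BIT before the poll task
+		 * returns.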
+ */ + if (i == (MANA_POLL_BUDGET - 1)) + eq->eq.budget = CQE_POLLING_BUFFER + 1; + + mana_gd_process_eq_events(eq); + + if (eq->eq.work_done < eq->eq.budget) + break; + + eq->eq.work_done = 0; + } +} + +static void +mana_gd_schedule_task(void *arg) +{ + struct gdma_queue *eq = arg; + + taskqueue_enqueue(eq->eq.cleanup_tq, &eq->eq.cleanup_task); +} + +static int +mana_gd_register_irq(struct gdma_queue *queue, + const struct gdma_queue_spec *spec) +{ + static int mana_last_bind_cpu = -1; + struct gdma_dev *gd = queue->gdma_dev; + bool is_mana = mana_gd_is_mana(gd); + struct gdma_irq_context *gic; + struct gdma_context *gc; + struct gdma_resource *r; + unsigned int msi_index; + int err; + + gc = gd->gdma_context; + r = &gc->msix_resource; + + mtx_lock_spin(&r->lock_spin); + + msi_index = find_first_zero_bit(r->map, r->size); + if (msi_index >= r->size) { + err = ENOSPC; + } else { + bitmap_set(r->map, msi_index, 1); + queue->eq.msix_index = msi_index; + err = 0; + } + + mtx_unlock_spin(&r->lock_spin); + + if (err) + return err; + + if (unlikely(msi_index >= gc->num_msix_usable)) { + device_printf(gc->dev, + "chose an invalid msix index %d, usable %d\n", + msi_index, gc->num_msix_usable); + return ENOSPC; + } + + gic = &gc->irq_contexts[msi_index]; + + if (is_mana) { + struct mana_port_context *apc = if_getsoftc(spec->eq.ndev); + queue->eq.do_not_ring_db = false; + + NET_TASK_INIT(&queue->eq.cleanup_task, 0, mana_poll, queue); + queue->eq.cleanup_tq = + taskqueue_create_fast("mana eq cleanup", + M_WAITOK, taskqueue_thread_enqueue, + &queue->eq.cleanup_tq); + + if (mana_last_bind_cpu < 0) + mana_last_bind_cpu = CPU_FIRST(); + queue->eq.cpu = mana_last_bind_cpu; + mana_last_bind_cpu = CPU_NEXT(mana_last_bind_cpu); + + /* XXX Name is not optimal. However we have to start + * the task here. Otherwise, test eq will have no + * handler. + */ + if (apc->bind_cleanup_thread_cpu) { + cpuset_t cpu_mask; + CPU_SETOF(queue->eq.cpu, &cpu_mask); + taskqueue_start_threads_cpuset(&queue->eq.cleanup_tq, + 1, PI_NET, &cpu_mask, + "mana eq poll msix %u on cpu %d", + msi_index, queue->eq.cpu); + } else { + + taskqueue_start_threads(&queue->eq.cleanup_tq, 1, + PI_NET, "mana eq poll on msix %u", msi_index); + } + } + + if (unlikely(gic->handler || gic->arg)) { + device_printf(gc->dev, + "interrupt handler or arg already assigned, " + "msix index: %d\n", msi_index); + } + + gic->arg = queue; + + if (is_mana) + gic->handler = mana_gd_schedule_task; + else + gic->handler = mana_gd_process_eq_events; + + mana_dbg(NULL, "registered msix index %d vector %d irq %ju\n", + msi_index, gic->msix_e.vector, rman_get_start(gic->res)); + + return 0; +} + +static void +mana_gd_deregiser_irq(struct gdma_queue *queue) +{ + struct gdma_dev *gd = queue->gdma_dev; + struct gdma_irq_context *gic; + struct gdma_context *gc; + struct gdma_resource *r; + unsigned int msix_index; + + gc = gd->gdma_context; + r = &gc->msix_resource; + + /* At most num_online_cpus() + 1 interrupts are used. 
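+	 * (On FreeBSD the actual budget is computed in mana_gd_setup_irqs():
+	 * up to mp_ncpus queues per port, capped at MANA_MAX_NUM_QUEUES,
+	 * times MAX_PORTS_IN_MANA_DEV, plus the single vector reserved for
+	 * the HWC.)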
*/ + msix_index = queue->eq.msix_index; + if (unlikely(msix_index >= gc->num_msix_usable)) + return; + + gic = &gc->irq_contexts[msix_index]; + gic->handler = NULL; + gic->arg = NULL; + + mtx_lock_spin(&r->lock_spin); + bitmap_clear(r->map, msix_index, 1); + mtx_unlock_spin(&r->lock_spin); + + queue->eq.msix_index = INVALID_PCI_MSIX_INDEX; + + mana_dbg(NULL, "deregistered msix index %d vector %d irq %ju\n", + msix_index, gic->msix_e.vector, rman_get_start(gic->res)); +} + +int +mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq) +{ + struct gdma_generate_test_event_req req = {}; + struct gdma_general_resp resp = {}; + device_t dev = gc->dev; + int err; + + sx_xlock(&gc->eq_test_event_sx); + + init_completion(&gc->eq_test_event); + gc->test_event_eq_id = INVALID_QUEUE_ID; + + mana_gd_init_req_hdr(&req.hdr, GDMA_GENERATE_TEST_EQE, + sizeof(req), sizeof(resp)); + + req.hdr.dev_id = eq->gdma_dev->dev_id; + req.queue_index = eq->id; + + err = mana_gd_send_request(gc, sizeof(req), &req, + sizeof(resp), &resp); + if (err) { + device_printf(dev, "test_eq failed: %d\n", err); + goto out; + } + + err = EPROTO; + + if (resp.hdr.status) { + device_printf(dev, "test_eq failed: 0x%x\n", + resp.hdr.status); + goto out; + } + + if (wait_for_completion_timeout(&gc->eq_test_event, 30 * hz)) { + device_printf(dev, "test_eq timed out on queue %d\n", + eq->id); + goto out; + } + + if (eq->id != gc->test_event_eq_id) { + device_printf(dev, + "test_eq got an event on wrong queue %d (%d)\n", + gc->test_event_eq_id, eq->id); + goto out; + } + + err = 0; +out: + sx_xunlock(&gc->eq_test_event_sx); + return err; +} + +static void +mana_gd_destroy_eq(struct gdma_context *gc, bool flush_evenets, + struct gdma_queue *queue) +{ + int err; + + if (flush_evenets) { + err = mana_gd_test_eq(gc, queue); + if (err) + device_printf(gc->dev, + "Failed to flush EQ: %d\n", err); + } + + mana_gd_deregiser_irq(queue); + + if (mana_gd_is_mana(queue->gdma_dev)) { + while (taskqueue_cancel(queue->eq.cleanup_tq, + &queue->eq.cleanup_task, NULL)) + taskqueue_drain(queue->eq.cleanup_tq, + &queue->eq.cleanup_task); + + taskqueue_free(queue->eq.cleanup_tq); + } + + if (queue->eq.disable_needed) + mana_gd_disable_queue(queue); +} + +static int mana_gd_create_eq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + bool create_hwq, struct gdma_queue *queue) +{ + struct gdma_context *gc = gd->gdma_context; + device_t dev = gc->dev; + uint32_t log2_num_entries; + int err; + + queue->eq.msix_index = INVALID_PCI_MSIX_INDEX; + + log2_num_entries = ilog2(queue->queue_size / GDMA_EQE_SIZE); + + if (spec->eq.log2_throttle_limit > log2_num_entries) { + device_printf(dev, + "EQ throttling limit (%lu) > maximum EQE (%u)\n", + spec->eq.log2_throttle_limit, log2_num_entries); + return EINVAL; + } + + err = mana_gd_register_irq(queue, spec); + if (err) { + device_printf(dev, "Failed to register irq: %d\n", err); + return err; + } + + queue->eq.callback = spec->eq.callback; + queue->eq.context = spec->eq.context; + queue->head |= INITIALIZED_OWNER_BIT(log2_num_entries); + queue->eq.log2_throttle_limit = spec->eq.log2_throttle_limit ?: 1; + + if (create_hwq) { + err = mana_gd_create_hw_eq(gc, queue); + if (err) + goto out; + + err = mana_gd_test_eq(gc, queue); + if (err) + goto out; + } + + return 0; +out: + device_printf(dev, "Failed to create EQ: %d\n", err); + mana_gd_destroy_eq(gc, false, queue); + return err; +} + +static void +mana_gd_create_cq(const struct gdma_queue_spec *spec, + struct gdma_queue *queue) +{ + uint32_t 
log2_num_entries = ilog2(spec->queue_size / GDMA_CQE_SIZE); + + queue->head |= INITIALIZED_OWNER_BIT(log2_num_entries); + queue->cq.parent = spec->cq.parent_eq; + queue->cq.context = spec->cq.context; + queue->cq.callback = spec->cq.callback; +} + +static void +mana_gd_destroy_cq(struct gdma_context *gc, + struct gdma_queue *queue) +{ + uint32_t id = queue->id; + + if (id >= gc->max_num_cqs) + return; + + if (!gc->cq_table[id]) + return; + + gc->cq_table[id] = NULL; +} + +int mana_gd_create_hwc_queue(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr) +{ + struct gdma_context *gc = gd->gdma_context; + struct gdma_mem_info *gmi; + struct gdma_queue *queue; + int err; + + queue = malloc(sizeof(*queue), M_DEVBUF, M_WAITOK | M_ZERO); + if (!queue) + return ENOMEM; + + gmi = &queue->mem_info; + err = mana_gd_alloc_memory(gc, spec->queue_size, gmi); + if (err) + goto free_q; + + queue->head = 0; + queue->tail = 0; + queue->queue_mem_ptr = gmi->virt_addr; + queue->queue_size = spec->queue_size; + queue->monitor_avl_buf = spec->monitor_avl_buf; + queue->type = spec->type; + queue->gdma_dev = gd; + + if (spec->type == GDMA_EQ) + err = mana_gd_create_eq(gd, spec, false, queue); + else if (spec->type == GDMA_CQ) + mana_gd_create_cq(spec, queue); + + if (err) + goto out; + + *queue_ptr = queue; + return 0; +out: + mana_gd_free_memory(gmi); +free_q: + free(queue, M_DEVBUF); + return err; +} + +static void +mana_gd_destroy_dma_region(struct gdma_context *gc, uint64_t gdma_region) +{ + struct gdma_destroy_dma_region_req req = {}; + struct gdma_general_resp resp = {}; + int err; + + if (gdma_region == GDMA_INVALID_DMA_REGION) + return; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_DMA_REGION, sizeof(req), + sizeof(resp)); + req.gdma_region = gdma_region; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), + &resp); + if (err || resp.hdr.status) + device_printf(gc->dev, + "Failed to destroy DMA region: %d, 0x%x\n", + err, resp.hdr.status); +} + +static int +mana_gd_create_dma_region(struct gdma_dev *gd, + struct gdma_mem_info *gmi) +{ + unsigned int num_page = gmi->length / PAGE_SIZE; + struct gdma_create_dma_region_req *req = NULL; + struct gdma_create_dma_region_resp resp = {}; + struct gdma_context *gc = gd->gdma_context; + struct hw_channel_context *hwc; + uint32_t length = gmi->length; + uint32_t req_msg_size; + int err; + int i; + + if (length < PAGE_SIZE || !is_power_of_2(length)) { + mana_err(NULL, "gmi size incorrect: %u\n", length); + return EINVAL; + } + + if (offset_in_page((uint64_t)gmi->virt_addr) != 0) { + mana_err(NULL, "gmi not page aligned: %p\n", + gmi->virt_addr); + return EINVAL; + } + + hwc = gc->hwc.driver_data; + req_msg_size = sizeof(*req) + num_page * sizeof(uint64_t); + if (req_msg_size > hwc->max_req_msg_size) { + mana_err(NULL, "req msg size too large: %u, %u\n", + req_msg_size, hwc->max_req_msg_size); + return EINVAL; + } + + req = malloc(req_msg_size, M_DEVBUF, M_WAITOK | M_ZERO); + if (!req) + return ENOMEM; + + mana_gd_init_req_hdr(&req->hdr, GDMA_CREATE_DMA_REGION, + req_msg_size, sizeof(resp)); + req->length = length; + req->offset_in_page = 0; + req->gdma_page_type = GDMA_PAGE_TYPE_4K; + req->page_count = num_page; + req->page_addr_list_len = num_page; + + for (i = 0; i < num_page; i++) + req->page_addr_list[i] = gmi->dma_handle + i * PAGE_SIZE; + + err = mana_gd_send_request(gc, req_msg_size, req, sizeof(resp), &resp); + if (err) + goto out; + + if (resp.hdr.status || resp.gdma_region == GDMA_INVALID_DMA_REGION) 
{ + device_printf(gc->dev, "Failed to create DMA region: 0x%x\n", + resp.hdr.status); + err = EPROTO; + goto out; + } + + gmi->gdma_region = resp.gdma_region; +out: + free(req, M_DEVBUF); + return err; +} + +int +mana_gd_create_mana_eq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr) +{ + struct gdma_context *gc = gd->gdma_context; + struct gdma_mem_info *gmi; + struct gdma_queue *queue; + int err; + + if (spec->type != GDMA_EQ) + return EINVAL; + + queue = malloc(sizeof(*queue), M_DEVBUF, M_WAITOK | M_ZERO); + if (!queue) + return ENOMEM; + + gmi = &queue->mem_info; + err = mana_gd_alloc_memory(gc, spec->queue_size, gmi); + if (err) + goto free_q; + + err = mana_gd_create_dma_region(gd, gmi); + if (err) + goto out; + + queue->head = 0; + queue->tail = 0; + queue->queue_mem_ptr = gmi->virt_addr; + queue->queue_size = spec->queue_size; + queue->monitor_avl_buf = spec->monitor_avl_buf; + queue->type = spec->type; + queue->gdma_dev = gd; + + err = mana_gd_create_eq(gd, spec, true, queue); + if (err) + goto out; + + *queue_ptr = queue; + return 0; + +out: + mana_gd_free_memory(gmi); +free_q: + free(queue, M_DEVBUF); + return err; +} + +int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr) +{ + struct gdma_context *gc = gd->gdma_context; + struct gdma_mem_info *gmi; + struct gdma_queue *queue; + int err; + + if (spec->type != GDMA_CQ && spec->type != GDMA_SQ && + spec->type != GDMA_RQ) + return EINVAL; + + queue = malloc(sizeof(*queue), M_DEVBUF, M_WAITOK | M_ZERO); + if (!queue) + return ENOMEM; + + gmi = &queue->mem_info; + err = mana_gd_alloc_memory(gc, spec->queue_size, gmi); + if (err) + goto free_q; + + err = mana_gd_create_dma_region(gd, gmi); + if (err) + goto out; + + queue->head = 0; + queue->tail = 0; + queue->queue_mem_ptr = gmi->virt_addr; + queue->queue_size = spec->queue_size; + queue->monitor_avl_buf = spec->monitor_avl_buf; + queue->type = spec->type; + queue->gdma_dev = gd; + + if (spec->type == GDMA_CQ) + mana_gd_create_cq(spec, queue); + + *queue_ptr = queue; + return 0; + +out: + mana_gd_free_memory(gmi); +free_q: + free(queue, M_DEVBUF); + return err; +} + +void +mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue) +{ + struct gdma_mem_info *gmi = &queue->mem_info; + + switch (queue->type) { + case GDMA_EQ: + mana_gd_destroy_eq(gc, queue->eq.disable_needed, queue); + break; + + case GDMA_CQ: + mana_gd_destroy_cq(gc, queue); + break; + + case GDMA_RQ: + break; + + case GDMA_SQ: + break; + + default: + device_printf(gc->dev, + "Can't destroy unknown queue: type = %d\n", + queue->type); + return; + } + + mana_gd_destroy_dma_region(gc, gmi->gdma_region); + mana_gd_free_memory(gmi); + free(queue, M_DEVBUF); +} + +int +mana_gd_verify_vf_version(device_t dev) +{ + struct gdma_context *gc = device_get_softc(dev); + struct gdma_verify_ver_resp resp = {}; + struct gdma_verify_ver_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_VERIFY_VF_DRIVER_VERSION, + sizeof(req), sizeof(resp)); + + req.protocol_ver_min = GDMA_PROTOCOL_FIRST; + req.protocol_ver_max = GDMA_PROTOCOL_LAST; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "VfVerifyVersionOutput: %d, status=0x%x\n", + err, resp.hdr.status); + return err ? 
err : EPROTO; + } + + return 0; +} + +int +mana_gd_register_device(struct gdma_dev *gd) +{ + struct gdma_context *gc = gd->gdma_context; + struct gdma_register_device_resp resp = {}; + struct gdma_general_req req = {}; + int err; + + gd->pdid = INVALID_PDID; + gd->doorbell = INVALID_DOORBELL; + gd->gpa_mkey = INVALID_MEM_KEY; + + mana_gd_init_req_hdr(&req.hdr, GDMA_REGISTER_DEVICE, sizeof(req), + sizeof(resp)); + + req.hdr.dev_id = gd->dev_id; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "gdma_register_device_resp failed: %d, 0x%x\n", + err, resp.hdr.status); + return err ? err : -EPROTO; + } + + gd->pdid = resp.pdid; + gd->gpa_mkey = resp.gpa_mkey; + gd->doorbell = resp.db_id; + + mana_dbg(NULL, "mana device pdid %u, gpa_mkey %u, doorbell %u \n", + gd->pdid, gd->gpa_mkey, gd->doorbell); + + return 0; +} + +int +mana_gd_deregister_device(struct gdma_dev *gd) +{ + struct gdma_context *gc = gd->gdma_context; + struct gdma_general_resp resp = {}; + struct gdma_general_req req = {}; + int err; + + if (gd->pdid == INVALID_PDID) + return EINVAL; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DEREGISTER_DEVICE, sizeof(req), + sizeof(resp)); + + req.hdr.dev_id = gd->dev_id; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + device_printf(gc->dev, + "Failed to deregister device: %d, 0x%x\n", + err, resp.hdr.status); + if (!err) + err = EPROTO; + } + + gd->pdid = INVALID_PDID; + gd->doorbell = INVALID_DOORBELL; + gd->gpa_mkey = INVALID_MEM_KEY; + + return err; +} + +uint32_t +mana_gd_wq_avail_space(struct gdma_queue *wq) +{ + uint32_t used_space = (wq->head - wq->tail) * GDMA_WQE_BU_SIZE; + uint32_t wq_size = wq->queue_size; + + if (used_space > wq_size) { + mana_warn(NULL, "failed: used space %u > queue size %u\n", + used_space, wq_size); + } + + return wq_size - used_space; +} + +uint8_t * +mana_gd_get_wqe_ptr(const struct gdma_queue *wq, uint32_t wqe_offset) +{ + uint32_t offset = + (wqe_offset * GDMA_WQE_BU_SIZE) & (wq->queue_size - 1); + + if ((offset + GDMA_WQE_BU_SIZE) > wq->queue_size) { + mana_warn(NULL, "failed: write end out of queue bound %u, " + "queue size %u\n", + offset + GDMA_WQE_BU_SIZE, wq->queue_size); + } + + return (uint8_t *)wq->queue_mem_ptr + offset; +} + +static uint32_t +mana_gd_write_client_oob(const struct gdma_wqe_request *wqe_req, + enum gdma_queue_type q_type, + uint32_t client_oob_size, uint32_t sgl_data_size, + uint8_t *wqe_ptr) +{ + bool oob_in_sgl = !!(wqe_req->flags & GDMA_WR_OOB_IN_SGL); + bool pad_data = !!(wqe_req->flags & GDMA_WR_PAD_BY_SGE0); + struct gdma_wqe *header = (struct gdma_wqe *)wqe_ptr; + uint8_t *ptr; + + memset(header, 0, sizeof(struct gdma_wqe)); + header->num_sge = wqe_req->num_sge; + header->inline_oob_size_div4 = client_oob_size / sizeof(uint32_t); + + if (oob_in_sgl) { + if (!pad_data || wqe_req->num_sge < 2) { + mana_warn(NULL, "no pad_data or num_sge < 2\n"); + } + + header->client_oob_in_sgl = 1; + + if (pad_data) + header->last_vbytes = wqe_req->sgl[0].size; + } + + if (q_type == GDMA_SQ) + header->client_data_unit = wqe_req->client_data_unit; + + /* + * The size of gdma_wqe + client_oob_size must be less than or equal + * to one Basic Unit (i.e. 32 bytes), so the pointer can't go beyond + * the queue memory buffer boundary. 
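+	 * Concretely, sizeof(struct gdma_wqe) is 8 bytes and client_oob_size
+	 * is at most INLINE_OOB_LARGE_SIZE (24 bytes), so the header plus
+	 * inline OOB fits in one 32-byte BU.  Since every WQE starts on a BU
+	 * boundary and the queue size is a power-of-two multiple of the BU,
+	 * this prefix never wraps around the end of the ring; only the SGL
+	 * may wrap and is handled by mana_gd_write_sgl().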
+ */ + ptr = wqe_ptr + sizeof(header); + + if (wqe_req->inline_oob_data && wqe_req->inline_oob_size > 0) { + memcpy(ptr, wqe_req->inline_oob_data, wqe_req->inline_oob_size); + + if (client_oob_size > wqe_req->inline_oob_size) + memset(ptr + wqe_req->inline_oob_size, 0, + client_oob_size - wqe_req->inline_oob_size); + } + + return sizeof(header) + client_oob_size; +} + +static void +mana_gd_write_sgl(struct gdma_queue *wq, uint8_t *wqe_ptr, + const struct gdma_wqe_request *wqe_req) +{ + uint32_t sgl_size = sizeof(struct gdma_sge) * wqe_req->num_sge; + const uint8_t *address = (uint8_t *)wqe_req->sgl; + uint8_t *base_ptr, *end_ptr; + uint32_t size_to_end; + + base_ptr = wq->queue_mem_ptr; + end_ptr = base_ptr + wq->queue_size; + size_to_end = (uint32_t)(end_ptr - wqe_ptr); + + if (size_to_end < sgl_size) { + memcpy(wqe_ptr, address, size_to_end); + + wqe_ptr = base_ptr; + address += size_to_end; + sgl_size -= size_to_end; + } + + memcpy(wqe_ptr, address, sgl_size); +} + +int +mana_gd_post_work_request(struct gdma_queue *wq, + const struct gdma_wqe_request *wqe_req, + struct gdma_posted_wqe_info *wqe_info) +{ + uint32_t client_oob_size = wqe_req->inline_oob_size; + struct gdma_context *gc; + uint32_t sgl_data_size; + uint32_t max_wqe_size; + uint32_t wqe_size; + uint8_t *wqe_ptr; + + if (wqe_req->num_sge == 0) + return EINVAL; + + if (wq->type == GDMA_RQ) { + if (client_oob_size != 0) + return EINVAL; + + client_oob_size = INLINE_OOB_SMALL_SIZE; + + max_wqe_size = GDMA_MAX_RQE_SIZE; + } else { + if (client_oob_size != INLINE_OOB_SMALL_SIZE && + client_oob_size != INLINE_OOB_LARGE_SIZE) + return EINVAL; + + max_wqe_size = GDMA_MAX_SQE_SIZE; + } + + sgl_data_size = sizeof(struct gdma_sge) * wqe_req->num_sge; + wqe_size = ALIGN(sizeof(struct gdma_wqe) + client_oob_size + + sgl_data_size, GDMA_WQE_BU_SIZE); + if (wqe_size > max_wqe_size) + return EINVAL; + + if (wq->monitor_avl_buf && wqe_size > mana_gd_wq_avail_space(wq)) { + gc = wq->gdma_dev->gdma_context; + device_printf(gc->dev, "unsuccessful flow control!\n"); + return ENOSPC; + } + + if (wqe_info) + wqe_info->wqe_size_in_bu = wqe_size / GDMA_WQE_BU_SIZE; + + wqe_ptr = mana_gd_get_wqe_ptr(wq, wq->head); + wqe_ptr += mana_gd_write_client_oob(wqe_req, wq->type, client_oob_size, + sgl_data_size, wqe_ptr); + if (wqe_ptr >= (uint8_t *)wq->queue_mem_ptr + wq->queue_size) + wqe_ptr -= wq->queue_size; + + mana_gd_write_sgl(wq, wqe_ptr, wqe_req); + + wq->head += wqe_size / GDMA_WQE_BU_SIZE; + + bus_dmamap_sync(wq->mem_info.dma_tag, wq->mem_info.dma_map, + BUS_DMASYNC_PREWRITE); + + return 0; +} + +int +mana_gd_post_and_ring(struct gdma_queue *queue, + const struct gdma_wqe_request *wqe_req, + struct gdma_posted_wqe_info *wqe_info) +{ + struct gdma_context *gc = queue->gdma_dev->gdma_context; + int err; + + err = mana_gd_post_work_request(queue, wqe_req, wqe_info); + if (err) + return err; + + mana_gd_wq_ring_doorbell(gc, queue); + + return 0; +} + +static int +mana_gd_read_cqe(struct gdma_queue *cq, struct gdma_comp *comp) +{ + unsigned int num_cqe = cq->queue_size / sizeof(struct gdma_cqe); + struct gdma_cqe *cq_cqe = cq->queue_mem_ptr; + uint32_t owner_bits, new_bits, old_bits; + struct gdma_cqe *cqe; + + cqe = &cq_cqe[cq->head % num_cqe]; + owner_bits = cqe->cqe_info.owner_bits; + + old_bits = (cq->head / num_cqe - 1) & GDMA_CQE_OWNER_MASK; + /* Return 0 if no more entries. */ + if (owner_bits == old_bits) + return 0; + + new_bits = (cq->head / num_cqe) & GDMA_CQE_OWNER_MASK; + /* Return -1 if overflow detected. 
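+	 * The 3-bit owner field of a CQE is advanced by the hardware each
+	 * time it wraps the ring.  old_bits is the owner value this slot had
+	 * on the previous lap and new_bits is the value expected on the
+	 * current lap: seeing old_bits means the slot has not been rewritten
+	 * yet, seeing new_bits means exactly one new completion, and any
+	 * other value means the hardware has lapped the driver, i.e. the
+	 * queue overflowed.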
*/ + if (owner_bits != new_bits) + return -1; + + comp->wq_num = cqe->cqe_info.wq_num; + comp->is_sq = cqe->cqe_info.is_sq; + memcpy(comp->cqe_data, cqe->cqe_data, GDMA_COMP_DATA_SIZE); + + return 1; +} + +int +mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe) +{ + int cqe_idx; + int ret; + + bus_dmamap_sync(cq->mem_info.dma_tag, cq->mem_info.dma_map, + BUS_DMASYNC_POSTREAD); + + for (cqe_idx = 0; cqe_idx < num_cqe; cqe_idx++) { + ret = mana_gd_read_cqe(cq, &comp[cqe_idx]); + + if (ret < 0) { + cq->head -= cqe_idx; + return ret; + } + + if (ret == 0) + break; + + cq->head++; + } + + return cqe_idx; +} + +static void +mana_gd_intr(void *arg) +{ + struct gdma_irq_context *gic = arg; + + if (gic->handler) { + gic->handler(gic->arg); + } +} + +int +mana_gd_alloc_res_map(uint32_t res_avail, + struct gdma_resource *r, const char *lock_name) +{ + int n = howmany(res_avail, BITS_PER_LONG); + + r->map = + malloc(n * sizeof(unsigned long), M_DEVBUF, M_WAITOK | M_ZERO); + if (!r->map) + return ENOMEM; + + r->size = res_avail; + mtx_init(&r->lock_spin, lock_name, NULL, MTX_SPIN); + + mana_dbg(NULL, + "total res %u, total number of unsigned longs %u\n", + r->size, n); + return (0); +} + +void +mana_gd_free_res_map(struct gdma_resource *r) +{ + if (!r || !r->map) + return; + + free(r->map, M_DEVBUF); + r->map = NULL; + r->size = 0; +} + +static void +mana_gd_init_registers(struct gdma_context *gc) +{ + uint64_t bar0_va = rman_get_bushandle(gc->bar0); + + gc->db_page_size = mana_gd_r32(gc, GDMA_REG_DB_PAGE_SIZE) & 0xFFFF; + + gc->db_page_base = + (void *) (bar0_va + mana_gd_r64(gc, GDMA_REG_DB_PAGE_OFFSET)); + + gc->shm_base = + (void *) (bar0_va + mana_gd_r64(gc, GDMA_REG_SHM_OFFSET)); + + mana_dbg(NULL, "db_page_size 0x%xx, db_page_base %p," + " shm_base %p\n", + gc->db_page_size, gc->db_page_base, gc->shm_base); +} + +static struct resource * +mana_gd_alloc_bar(device_t dev, int bar) +{ + struct resource *res = NULL; + struct pci_map *pm; + int rid, type; + + if (bar < 0 || bar > PCIR_MAX_BAR_0) + goto alloc_bar_out; + + pm = pci_find_bar(dev, PCIR_BAR(bar)); + if (!pm) + goto alloc_bar_out; + + if (PCI_BAR_IO(pm->pm_value)) + type = SYS_RES_IOPORT; + else + type = SYS_RES_MEMORY; + if (type < 0) + goto alloc_bar_out; + + rid = PCIR_BAR(bar); + res = bus_alloc_resource_any(dev, type, &rid, RF_ACTIVE); +#if defined(__amd64__) + if (res) + mana_dbg(NULL, "bar %d: rid 0x%x, type 0x%jx," + " handle 0x%jx\n", + bar, rid, res->r_bustag, res->r_bushandle); +#endif + +alloc_bar_out: + return (res); +} + +static void +mana_gd_free_pci_res(struct gdma_context *gc) +{ + if (!gc || gc->dev) + return; + + if (gc->bar0 != NULL) { + bus_release_resource(gc->dev, SYS_RES_MEMORY, + PCIR_BAR(GDMA_BAR0), gc->bar0); + } + + if (gc->msix != NULL) { + bus_release_resource(gc->dev, SYS_RES_MEMORY, + gc->msix_rid, gc->msix); + } +} + +static int +mana_gd_setup_irqs(device_t dev) +{ + unsigned int max_queues_per_port = mp_ncpus; + struct gdma_context *gc = device_get_softc(dev); + struct gdma_irq_context *gic; + unsigned int max_irqs; + int nvec; + int rc, rcc, i; + + if (max_queues_per_port > MANA_MAX_NUM_QUEUES) + max_queues_per_port = MANA_MAX_NUM_QUEUES; + + max_irqs = max_queues_per_port * MAX_PORTS_IN_MANA_DEV; + + /* Need 1 interrupt for the Hardware communication Channel (HWC) */ + max_irqs++; + + nvec = max_irqs; + rc = pci_alloc_msix(dev, &nvec); + if (unlikely(rc != 0)) { + device_printf(dev, + "Failed to allocate MSIX, vectors %d, error: %d\n", + nvec, rc); + rc = ENOSPC; + goto 
err_setup_irq_alloc; + } + + if (nvec != max_irqs) { + if (nvec == 1) { + device_printf(dev, + "Not enough number of MSI-x allocated: %d\n", + nvec); + rc = ENOSPC; + goto err_setup_irq_release; + } + device_printf(dev, "Allocated only %d MSI-x (%d requested)\n", + nvec, max_irqs); + } + + gc->irq_contexts = malloc(nvec * sizeof(struct gdma_irq_context), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!gc->irq_contexts) { + rc = ENOMEM; + goto err_setup_irq_release; + } + + for (i = 0; i < nvec; i++) { + gic = &gc->irq_contexts[i]; + gic->msix_e.entry = i; + /* Vector starts from 1. */ + gic->msix_e.vector = i + 1; + gic->handler = NULL; + gic->arg = NULL; + + gic->res = bus_alloc_resource_any(dev, SYS_RES_IRQ, + &gic->msix_e.vector, RF_ACTIVE | RF_SHAREABLE); + if (unlikely(gic->res == NULL)) { + rc = ENOMEM; + device_printf(dev, "could not allocate resource " + "for irq vector %d\n", gic->msix_e.vector); + goto err_setup_irq; + } + + rc = bus_setup_intr(dev, gic->res, + INTR_TYPE_NET | INTR_MPSAFE, NULL, mana_gd_intr, + gic, &gic->cookie); + if (unlikely(rc != 0)) { + device_printf(dev, "failed to register interrupt " + "handler for irq %ju vector %d: error %d\n", + rman_get_start(gic->res), gic->msix_e.vector, rc); + goto err_setup_irq; + } + gic->requested = true; + + mana_dbg(NULL, "added msix vector %d irq %ju\n", + gic->msix_e.vector, rman_get_start(gic->res)); + } + + rc = mana_gd_alloc_res_map(nvec, &gc->msix_resource, + "gdma msix res lock"); + if (rc != 0) { + device_printf(dev, "failed to allocate memory " + "for msix bitmap\n"); + goto err_setup_irq; + } + + gc->max_num_msix = nvec; + gc->num_msix_usable = nvec; + + mana_dbg(NULL, "setup %d msix interrupts\n", nvec); + + return (0); + +err_setup_irq: + for (; i >= 0; i--) { + gic = &gc->irq_contexts[i]; + rcc = 0; + + /* + * If gic->requested is true, we need to free both intr and + * resources. 
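+		 * The unwind loop starts at the vector that failed (i is left
+		 * at the failing index) and walks back to 0, so an entry whose
+		 * resource was allocated but whose handler was never set up is
+		 * released as well.  The handler is torn down before its
+		 * SYS_RES_IRQ resource is released.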
+ */ + if (gic->requested) + rcc = bus_teardown_intr(dev, gic->res, gic->cookie); + if (unlikely(rcc != 0)) + device_printf(dev, "could not release " + "irq vector %d, error: %d\n", + gic->msix_e.vector, rcc); + + rcc = 0; + if (gic->res != NULL) { + rcc = bus_release_resource(dev, SYS_RES_IRQ, + gic->msix_e.vector, gic->res); + } + if (unlikely(rcc != 0)) + device_printf(dev, "dev has no parent while " + "releasing resource for irq vector %d\n", + gic->msix_e.vector); + gic->requested = false; + gic->res = NULL; + } + + free(gc->irq_contexts, M_DEVBUF); + gc->irq_contexts = NULL; +err_setup_irq_release: + pci_release_msi(dev); +err_setup_irq_alloc: + return (rc); +} + +static void +mana_gd_remove_irqs(device_t dev) +{ + struct gdma_context *gc = device_get_softc(dev); + struct gdma_irq_context *gic; + int rc, i; + + mana_gd_free_res_map(&gc->msix_resource); + + for (i = 0; i < gc->max_num_msix; i++) { + gic = &gc->irq_contexts[i]; + if (gic->requested) { + rc = bus_teardown_intr(dev, gic->res, gic->cookie); + if (unlikely(rc != 0)) { + device_printf(dev, "failed to tear down " + "irq vector %d, error: %d\n", + gic->msix_e.vector, rc); + } + gic->requested = false; + } + + if (gic->res != NULL) { + rc = bus_release_resource(dev, SYS_RES_IRQ, + gic->msix_e.vector, gic->res); + if (unlikely(rc != 0)) { + device_printf(dev, "dev has no parent while " + "releasing resource for irq vector %d\n", + gic->msix_e.vector); + } + gic->res = NULL; + } + } + + gc->max_num_msix = 0; + gc->num_msix_usable = 0; + free(gc->irq_contexts, M_DEVBUF); + gc->irq_contexts = NULL; + + pci_release_msi(dev); +} + +static int +mana_gd_probe(device_t dev) +{ + mana_vendor_id_t *ent; + char adapter_name[60]; + uint16_t pci_vendor_id = 0; + uint16_t pci_device_id = 0; + + pci_vendor_id = pci_get_vendor(dev); + pci_device_id = pci_get_device(dev); + + ent = mana_id_table; + while (ent->vendor_id != 0) { + if ((pci_vendor_id == ent->vendor_id) && + (pci_device_id == ent->device_id)) { + mana_dbg(NULL, "vendor=%x device=%x\n", + pci_vendor_id, pci_device_id); + + sprintf(adapter_name, DEVICE_DESC); + device_set_desc_copy(dev, adapter_name); + return (BUS_PROBE_DEFAULT); + } + + ent++; + } + + return (ENXIO); +} + +/** + * mana_attach - Device Initialization Routine + * @dev: device information struct + * + * Returns 0 on success, otherwise on failure. + * + * mana_attach initializes a GDMA adapter identified by a device structure. 
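+ *
+ * The attach path maps BAR0 and the MSI-X table, sets up the MSI-X
+ * interrupts, creates the hardware communication channel (HWC) and
+ * then probes the MANA ports.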
+ **/ +static int +mana_gd_attach(device_t dev) +{ + struct gdma_context *gc; + int msix_rid; + int rc; + + gc = device_get_softc(dev); + gc->dev = dev; + + pci_enable_io(dev, SYS_RES_IOPORT); + pci_enable_io(dev, SYS_RES_MEMORY); + + pci_enable_busmaster(dev); + + gc->bar0 = mana_gd_alloc_bar(dev, GDMA_BAR0); + if (unlikely(gc->bar0 == NULL)) { + device_printf(dev, + "unable to allocate bus resource for bar0!\n"); + rc = ENOMEM; + goto err_disable_dev; + } + + /* Store bar0 tage and handle for quick access */ + gc->gd_bus.bar0_t = rman_get_bustag(gc->bar0); + gc->gd_bus.bar0_h = rman_get_bushandle(gc->bar0); + + /* Map MSI-x vector table */ + msix_rid = pci_msix_table_bar(dev); + + mana_dbg(NULL, "msix_rid 0x%x\n", msix_rid); + + gc->msix = bus_alloc_resource_any(dev, SYS_RES_MEMORY, + &msix_rid, RF_ACTIVE); + if (unlikely(gc->msix == NULL)) { + device_printf(dev, + "unable to allocate bus resource for msix!\n"); + rc = ENOMEM; + goto err_free_pci_res; + } + gc->msix_rid = msix_rid; + + if (unlikely(gc->gd_bus.bar0_h == 0)) { + device_printf(dev, "failed to map bar0!\n"); + rc = ENXIO; + goto err_free_pci_res; + } + + mana_gd_init_registers(gc); + + mana_smc_init(&gc->shm_channel, gc->dev, gc->shm_base); + + rc = mana_gd_setup_irqs(dev); + if (rc) { + goto err_free_pci_res; + } + + sx_init(&gc->eq_test_event_sx, "gdma test event sx"); + + rc = mana_hwc_create_channel(gc); + if (rc) { + mana_dbg(NULL, "Failed to create hwc channel\n"); + if (rc == EIO) + goto err_clean_up_gdma; + else + goto err_remove_irq; + } + + rc = mana_gd_verify_vf_version(dev); + if (rc) { + mana_dbg(NULL, "Failed to verify vf\n"); + goto err_clean_up_gdma; + } + + rc = mana_gd_query_max_resources(dev); + if (rc) { + mana_dbg(NULL, "Failed to query max resources\n"); + goto err_clean_up_gdma; + } + + rc = mana_gd_detect_devices(dev); + if (rc) { + mana_dbg(NULL, "Failed to detect mana device\n"); + goto err_clean_up_gdma; + } + + rc = mana_probe(&gc->mana); + if (rc) { + mana_dbg(NULL, "Failed to probe mana device\n"); + goto err_clean_up_gdma; + } + + return (0); + +err_clean_up_gdma: + mana_hwc_destroy_channel(gc); + if (gc->cq_table) + free(gc->cq_table, M_DEVBUF); + gc->cq_table = NULL; +err_remove_irq: + mana_gd_remove_irqs(dev); +err_free_pci_res: + mana_gd_free_pci_res(gc); +err_disable_dev: + pci_disable_busmaster(dev); + + return(rc); +} + +/** + * mana_detach - Device Removal Routine + * @pdev: device information struct + * + * mana_detach is called by the device subsystem to alert the driver + * that it should release a PCI device. 
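+ *
+ * Teardown happens in the reverse order of attach: the MANA ports,
+ * the HWC, the interrupts and finally the PCI resources are released.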
+ **/ +static int +mana_gd_detach(device_t dev) +{ + struct gdma_context *gc = device_get_softc(dev); + + mana_remove(&gc->mana); + + mana_hwc_destroy_channel(gc); + free(gc->cq_table, M_DEVBUF); + gc->cq_table = NULL; + + mana_gd_remove_irqs(dev); + + mana_gd_free_pci_res(gc); + + pci_disable_busmaster(dev); + + return (bus_generic_detach(dev)); +} + + +/********************************************************************* + * FreeBSD Device Interface Entry Points + *********************************************************************/ + +static device_method_t mana_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, mana_gd_probe), + DEVMETHOD(device_attach, mana_gd_attach), + DEVMETHOD(device_detach, mana_gd_detach), + DEVMETHOD_END +}; + +static driver_t mana_driver = { + "mana", mana_methods, sizeof(struct gdma_context), +}; + +devclass_t mana_devclass; +DRIVER_MODULE(mana, pci, mana_driver, mana_devclass, 0, 0); +MODULE_PNP_INFO("U16:vendor;U16:device", pci, mana, mana_id_table, + nitems(mana_id_table) - 1); +MODULE_DEPEND(mana, pci, 1, 1, 1); +MODULE_DEPEND(mana, ether, 1, 1, 1); + +/*********************************************************************/ diff --git a/sys/dev/mana/gdma_util.h b/sys/dev/mana/gdma_util.h new file mode 100644 --- /dev/null +++ b/sys/dev/mana/gdma_util.h @@ -0,0 +1,206 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef _GDMA_UTIL_H_ +#define _GDMA_UTIL_H_ + +#include +#include + +/* Log Levels */ +#define MANA_ALERT (1 << 0) /* Alerts are providing more error info. */ +#define MANA_WARNING (1 << 1) /* Driver output is more error sensitive. */ +#define MANA_INFO (1 << 2) /* Provides additional driver info. */ +#define MANA_DBG (1 << 3) /* Driver output for debugging. */ + +extern int mana_log_level; + +#define mana_trace_raw(ctx, level, fmt, args...) \ + do { \ + ((void)(ctx)); \ + if (((level) & mana_log_level) != (level)) \ + break; \ + printf(fmt, ##args); \ + } while (0) + +#define mana_trace(ctx, level, fmt, args...) 
\
+	mana_trace_raw(ctx, level, "%s() [TID:%d]: "			\
+	    fmt, __func__, curthread->td_tid, ##args)
+
+
+#define mana_dbg(ctx, format, arg...)		\
+	mana_trace(ctx, MANA_DBG, format, ##arg)
+#define mana_info(ctx, format, arg...)		\
+	mana_trace(ctx, MANA_INFO, format, ##arg)
+#define mana_warn(ctx, format, arg...)		\
+	mana_trace(ctx, MANA_WARNING, format, ##arg)
+#define mana_err(ctx, format, arg...)		\
+	mana_trace(ctx, MANA_ALERT, format, ##arg)
+
+#define unlikely(x)	__predict_false(!!(x))
+#define likely(x)	__predict_true(!!(x))
+
+
+#define BITS_PER_LONG			(sizeof(long) * NBBY)
+
+#define BITMAP_FIRST_WORD_MASK(start)	(~0UL << ((start) % BITS_PER_LONG))
+#define BITMAP_LAST_WORD_MASK(n)	(~0UL >> (BITS_PER_LONG - (n)))
+#define BITS_TO_LONGS(n)		howmany((n), BITS_PER_LONG)
+#define BIT_MASK(nr)			(1UL << ((nr) & (BITS_PER_LONG - 1)))
+#define BIT_WORD(nr)			((nr) / BITS_PER_LONG)
+
+#undef ALIGN
+#define ALIGN(x, y)		roundup2((x), (y))
+#define IS_ALIGNED(x, a)	(((x) & ((__typeof(x))(a) - 1)) == 0)
+
+#define BIT(n)			(1ULL << (n))
+
+#define PHYS_PFN(x)		((unsigned long)((x) >> PAGE_SHIFT))
+#define offset_in_page(x)	((x) & PAGE_MASK)
+
+#define min_t(type, _x, _y)						\
+    ((type)(_x) < (type)(_y) ? (type)(_x) : (type)(_y))
+
+#define test_bit(i, a)							\
+    ((((volatile const unsigned long *)(a))[BIT_WORD(i)]) & BIT_MASK(i))
+
+typedef volatile uint32_t atomic_t;
+
+#define atomic_add_return(v, p)		(atomic_fetchadd_int(p, v) + (v))
+#define atomic_sub_return(v, p)		(atomic_fetchadd_int(p, -(v)) - (v))
+#define atomic_inc_return(p)		atomic_add_return(1, p)
+#define atomic_dec_return(p)		atomic_sub_return(1, p)
+#define atomic_read(p)			atomic_add_return(0, p)
+
+#define usleep_range(_1, _2)						\
+    pause_sbt("gdma-usleep-range", SBT_1US * _1, SBT_1US * 1, C_ABSOLUTE)
+
+static inline void
+gdma_msleep(unsigned int ms)
+{
+	if (ms == 0)
+		ms = 1;
+	pause_sbt("gdma-msleep", mstosbt(ms), 0, C_HARDCLOCK);
+}
+
+static inline void
+bitmap_set(unsigned long *map, unsigned int start, int nr)
+{
+	const unsigned int size = start + nr;
+	int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+	unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+
+	map += BIT_WORD(start);
+
+	while (nr - bits_to_set >= 0) {
+		*map |= mask_to_set;
+		nr -= bits_to_set;
+		bits_to_set = BITS_PER_LONG;
+		mask_to_set = ~0UL;
+		map++;
+	}
+
+	if (nr) {
+		mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+		*map |= mask_to_set;
+	}
+}
+
+static inline void
+bitmap_clear(unsigned long *map, unsigned int start, int nr)
+{
+	const unsigned int size = start + nr;
+	int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+	unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+	map += BIT_WORD(start);
+
+	while (nr - bits_to_clear >= 0) {
+		*map &= ~mask_to_clear;
+		nr -= bits_to_clear;
+		bits_to_clear = BITS_PER_LONG;
+		mask_to_clear = ~0UL;
+		map++;
+	}
+
+	if (nr) {
+		mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+		*map &= ~mask_to_clear;
+	}
+}
+
+static inline unsigned long
+find_first_zero_bit(const unsigned long *p, unsigned long max)
+{
+	unsigned long i, n;
+
+	for (i = 0; i < max / BITS_PER_LONG + 1; i++) {
+		n = ~p[i];
+		if (n != 0)
+			return (i * BITS_PER_LONG + ffsl(n) - 1);
+	}
+	return (max);
+}
+
+static inline unsigned long
+ilog2(unsigned long x)
+{
+	unsigned long log = 0;
+
+	while (x >>= 1)
+		log++;
+	return (log);
+}
+
+static inline unsigned long
+roundup_pow_of_two(unsigned long x)
+{
+	return (1UL << flsl(x - 1));
+}
+
+static inline int
+is_power_of_2(unsigned long n)
+{
+	return (n == roundup_pow_of_two(n));
+}
+
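+/*
+ * Minimal Linux-style completion built on a mutex plus wakeup(9) and
+ * mtx_sleep(9); the implementation is in gdma_util.c.  A sketch of
+ * typical use (hypothetical caller, timeout given in ticks):
+ *
+ *	struct completion c;
+ *
+ *	init_completion(&c);
+ *	... start asynchronous work that ends with complete(&c) ...
+ *	if (wait_for_completion_timeout(&c, 30 * hz))
+ *		printf("timed out\n");
+ *	free_completion(&c);
+ */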
+struct completion { + unsigned int done; + struct mtx lock; +}; + +void init_completion(struct completion *c); +void free_completion(struct completion *c); +void complete(struct completion *c); +void wait_for_completion(struct completion *c); +int wait_for_completion_timeout(struct completion *c, int timeout); +#endif /* _GDMA_UTIL_H_ */ diff --git a/sys/dev/mana/gdma_util.c b/sys/dev/mana/gdma_util.c new file mode 100644 --- /dev/null +++ b/sys/dev/mana/gdma_util.c @@ -0,0 +1,96 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include "gdma_util.h" + + +void +init_completion(struct completion *c) +{ + memset(c, 0, sizeof(*c)); + mtx_init(&c->lock, "gdma_completion", NULL, MTX_DEF); + c->done = 0; +} + +void +free_completion(struct completion *c) +{ + mtx_destroy(&c->lock); +} + +void +complete(struct completion *c) +{ + mtx_lock(&c->lock); + c->done++; + mtx_unlock(&c->lock); + wakeup(c); +} + +void +wait_for_completion(struct completion *c) +{ + mtx_lock(&c->lock); + while (c->done == 0) + mtx_sleep(c, &c->lock, 0, "gdma_wfc", 0); + c->done--; + mtx_unlock(&c->lock); +} + +/* + * Return: 0 if completed, a non-zero value if timed out. + */ +int +wait_for_completion_timeout(struct completion *c, int timeout) +{ + int ret; + + mtx_lock(&c->lock); + + if (c->done == 0) + mtx_sleep(c, &c->lock, 0, "gdma_wfc", timeout); + + if (c->done > 0) { + c->done--; + ret = 0; + } else { + ret = 1; + } + + mtx_unlock(&c->lock); + + return (ret); +} diff --git a/sys/dev/mana/hw_channel.h b/sys/dev/mana/hw_channel.h new file mode 100644 --- /dev/null +++ b/sys/dev/mana/hw_channel.h @@ -0,0 +1,222 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef _HW_CHANNEL_H +#define _HW_CHANNEL_H + +#include + +#define DEFAULT_LOG2_THROTTLING_FOR_ERROR_EQ 4 + +#define HW_CHANNEL_MAX_REQUEST_SIZE 0x1000 +#define HW_CHANNEL_MAX_RESPONSE_SIZE 0x1000 + +#define HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH 1 + +#define HWC_INIT_DATA_CQID 1 +#define HWC_INIT_DATA_RQID 2 +#define HWC_INIT_DATA_SQID 3 +#define HWC_INIT_DATA_QUEUE_DEPTH 4 +#define HWC_INIT_DATA_MAX_REQUEST 5 +#define HWC_INIT_DATA_MAX_RESPONSE 6 +#define HWC_INIT_DATA_MAX_NUM_CQS 7 +#define HWC_INIT_DATA_PDID 8 +#define HWC_INIT_DATA_GPA_MKEY 9 + +/* Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. + */ + +union hwc_init_eq_id_db { + uint32_t as_uint32; + + struct { + uint32_t eq_id : 16; + uint32_t doorbell: 16; + }; +}; /* HW DATA */ + +union hwc_init_type_data { + uint32_t as_uint32; + + struct { + uint32_t value : 24; + uint32_t type : 8; + }; +}; /* HW DATA */ + +struct hwc_rx_oob { + uint32_t type : 6; + uint32_t eom : 1; + uint32_t som : 1; + uint32_t vendor_err : 8; + uint32_t reserved1 : 16; + + uint32_t src_virt_wq : 24; + uint32_t src_vfid : 8; + + uint32_t reserved2; + + union { + uint32_t wqe_addr_low; + uint32_t wqe_offset; + }; + + uint32_t wqe_addr_high; + + uint32_t client_data_unit : 14; + uint32_t reserved3 : 18; + + uint32_t tx_oob_data_size; + + uint32_t chunk_offset : 21; + uint32_t reserved4 : 11; +}; /* HW DATA */ + +struct hwc_tx_oob { + uint32_t reserved1; + + uint32_t reserved2; + + uint32_t vrq_id : 24; + uint32_t dest_vfid : 8; + + uint32_t vrcq_id : 24; + uint32_t reserved3 : 8; + + uint32_t vscq_id : 24; + uint32_t loopback : 1; + uint32_t lso_override: 1; + uint32_t dest_pf : 1; + uint32_t reserved4 : 5; + + uint32_t vsq_id : 24; + uint32_t reserved5 : 8; +}; /* HW DATA */ + +struct hwc_work_request { + void *buf_va; + void *buf_sge_addr; + uint32_t buf_len; + uint32_t msg_size; + + struct gdma_wqe_request wqe_req; + struct hwc_tx_oob tx_oob; + + struct gdma_sge sge; +}; + +/* hwc_dma_buf represents the array of in-flight WQEs. + * mem_info as know as the GDMA mapped memory is partitioned and used by + * in-flight WQEs. + * The number of WQEs is determined by the number of in-flight messages. 
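+ * Each in-flight request owns a max_msg_size slice of that region; see
+ * mana_hwc_alloc_dma_buf() in hw_channel.c.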
+ */ +struct hwc_dma_buf { + struct gdma_mem_info mem_info; + + uint32_t gpa_mkey; + + uint32_t num_reqs; + struct hwc_work_request reqs[]; +}; + +typedef void hwc_rx_event_handler_t(void *ctx, uint32_t gdma_rxq_id, + const struct hwc_rx_oob *rx_oob); + +typedef void hwc_tx_event_handler_t(void *ctx, uint32_t gdma_txq_id, + const struct hwc_rx_oob *rx_oob); + +struct hwc_cq { + struct hw_channel_context *hwc; + + struct gdma_queue *gdma_cq; + struct gdma_queue *gdma_eq; + struct gdma_comp *comp_buf; + uint16_t queue_depth; + + hwc_rx_event_handler_t *rx_event_handler; + void *rx_event_ctx; + + hwc_tx_event_handler_t *tx_event_handler; + void *tx_event_ctx; +}; + +struct hwc_wq { + struct hw_channel_context *hwc; + + struct gdma_queue *gdma_wq; + struct hwc_dma_buf *msg_buf; + uint16_t queue_depth; + + struct hwc_cq *hwc_cq; +}; + +struct hwc_caller_ctx { + struct completion comp_event; + void *output_buf; + uint32_t output_buflen; + + uint32_t error; /* Error code */ + uint32_t status_code; +}; + +struct hw_channel_context { + struct gdma_dev *gdma_dev; + device_t dev; + + uint16_t num_inflight_msg; + uint32_t max_req_msg_size; + + uint16_t hwc_init_q_depth_max; + uint32_t hwc_init_max_req_msg_size; + uint32_t hwc_init_max_resp_msg_size; + + struct completion hwc_init_eqe_comp; + + struct hwc_wq *rxq; + struct hwc_wq *txq; + struct hwc_cq *cq; + + struct sema sema; + struct gdma_resource inflight_msg_res; + + struct hwc_caller_ctx *caller_ctx; +}; + +int mana_hwc_create_channel(struct gdma_context *gc); +void mana_hwc_destroy_channel(struct gdma_context *gc); + +int mana_hwc_send_request(struct hw_channel_context *hwc, uint32_t req_len, + const void *req, uint32_t resp_len, void *resp); + +#endif /* _HW_CHANNEL_H */ diff --git a/sys/dev/mana/hw_channel.c b/sys/dev/mana/hw_channel.c new file mode 100644 --- /dev/null +++ b/sys/dev/mana/hw_channel.c @@ -0,0 +1,950 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mana.h" +#include "hw_channel.h" + +static int +mana_hwc_get_msg_index(struct hw_channel_context *hwc, uint16_t *msg_id) +{ + struct gdma_resource *r = &hwc->inflight_msg_res; + uint32_t index; + + sema_wait(&hwc->sema); + + mtx_lock_spin(&r->lock_spin); + + index = find_first_zero_bit(hwc->inflight_msg_res.map, + hwc->inflight_msg_res.size); + + bitmap_set(hwc->inflight_msg_res.map, index, 1); + + mtx_unlock_spin(&r->lock_spin); + + *msg_id = index; + + return 0; +} + +static void +mana_hwc_put_msg_index(struct hw_channel_context *hwc, uint16_t msg_id) +{ + struct gdma_resource *r = &hwc->inflight_msg_res; + + mtx_lock_spin(&r->lock_spin); + bitmap_clear(hwc->inflight_msg_res.map, msg_id, 1); + mtx_unlock_spin(&r->lock_spin); + + sema_post(&hwc->sema); +} + +static int +mana_hwc_verify_resp_msg(const struct hwc_caller_ctx *caller_ctx, + const struct gdma_resp_hdr *resp_msg, + uint32_t resp_len) +{ + if (resp_len < sizeof(*resp_msg)) + return EPROTO; + + if (resp_len > caller_ctx->output_buflen) + return EPROTO; + + return 0; +} + +static void +mana_hwc_handle_resp(struct hw_channel_context *hwc, uint32_t resp_len, + const struct gdma_resp_hdr *resp_msg) +{ + struct hwc_caller_ctx *ctx; + int err; + + if (!test_bit(resp_msg->response.hwc_msg_id, + hwc->inflight_msg_res.map)) { + device_printf(hwc->dev, "hwc_rx: invalid msg_id = %u\n", + resp_msg->response.hwc_msg_id); + return; + } + + ctx = hwc->caller_ctx + resp_msg->response.hwc_msg_id; + err = mana_hwc_verify_resp_msg(ctx, resp_msg, resp_len); + if (err) + goto out; + + ctx->status_code = resp_msg->status; + + memcpy(ctx->output_buf, resp_msg, resp_len); +out: + ctx->error = err; + complete(&ctx->comp_event); +} + +static int +mana_hwc_post_rx_wqe(const struct hwc_wq *hwc_rxq, + struct hwc_work_request *req) +{ + device_t dev = hwc_rxq->hwc->dev; + struct gdma_sge *sge; + int err; + + sge = &req->sge; + sge->address = (uint64_t)req->buf_sge_addr; + sge->mem_key = hwc_rxq->msg_buf->gpa_mkey; + sge->size = req->buf_len; + + memset(&req->wqe_req, 0, sizeof(struct gdma_wqe_request)); + req->wqe_req.sgl = sge; + req->wqe_req.num_sge = 1; + req->wqe_req.client_data_unit = 0; + + err = mana_gd_post_and_ring(hwc_rxq->gdma_wq, &req->wqe_req, NULL); + if (err) + device_printf(dev, + "Failed to post WQE on HWC RQ: %d\n", err); + return err; +} + +static void +mana_hwc_init_event_handler(void *ctx, struct gdma_queue *q_self, + struct gdma_event *event) +{ + struct hw_channel_context *hwc = ctx; + struct gdma_dev *gd = hwc->gdma_dev; + union hwc_init_type_data type_data; + union hwc_init_eq_id_db eq_db; + uint32_t type, val; + + switch (event->type) { + case GDMA_EQE_HWC_INIT_EQ_ID_DB: + eq_db.as_uint32 = event->details[0]; + hwc->cq->gdma_eq->id = eq_db.eq_id; + gd->doorbell = eq_db.doorbell; + break; + + case GDMA_EQE_HWC_INIT_DATA: + type_data.as_uint32 = event->details[0]; + type = type_data.type; + val = type_data.value; + + switch (type) { + case HWC_INIT_DATA_CQID: + hwc->cq->gdma_cq->id = val; + break; + + case HWC_INIT_DATA_RQID: + hwc->rxq->gdma_wq->id = val; + break; + + case HWC_INIT_DATA_SQID: + hwc->txq->gdma_wq->id = val; + break; + + case HWC_INIT_DATA_QUEUE_DEPTH: + hwc->hwc_init_q_depth_max = (uint16_t)val; + break; + + case HWC_INIT_DATA_MAX_REQUEST: + hwc->hwc_init_max_req_msg_size = val; + break; + + case HWC_INIT_DATA_MAX_RESPONSE: + hwc->hwc_init_max_resp_msg_size = val; + 
break; + + case HWC_INIT_DATA_MAX_NUM_CQS: + gd->gdma_context->max_num_cqs = val; + break; + + case HWC_INIT_DATA_PDID: + hwc->gdma_dev->pdid = val; + break; + + case HWC_INIT_DATA_GPA_MKEY: + hwc->rxq->msg_buf->gpa_mkey = val; + hwc->txq->msg_buf->gpa_mkey = val; + break; + } + + break; + + case GDMA_EQE_HWC_INIT_DONE: + complete(&hwc->hwc_init_eqe_comp); + break; + + default: + /* Ignore unknown events, which should never happen. */ + break; + } +} + +static void +mana_hwc_rx_event_handler(void *ctx, uint32_t gdma_rxq_id, + const struct hwc_rx_oob *rx_oob) +{ + struct hw_channel_context *hwc = ctx; + struct hwc_wq *hwc_rxq = hwc->rxq; + struct hwc_work_request *rx_req; + struct gdma_resp_hdr *resp; + struct gdma_wqe *dma_oob; + struct gdma_queue *rq; + struct gdma_sge *sge; + uint64_t rq_base_addr; + uint64_t rx_req_idx; + uint8_t *wqe; + + if (hwc_rxq->gdma_wq->id != gdma_rxq_id) { + mana_warn(NULL, "unmatched rx queue %u != %u\n", + hwc_rxq->gdma_wq->id, gdma_rxq_id); + return; + } + + + rq = hwc_rxq->gdma_wq; + wqe = mana_gd_get_wqe_ptr(rq, rx_oob->wqe_offset / GDMA_WQE_BU_SIZE); + dma_oob = (struct gdma_wqe *)wqe; + + bus_dmamap_sync(rq->mem_info.dma_tag, rq->mem_info.dma_map, + BUS_DMASYNC_POSTREAD); + + sge = (struct gdma_sge *)(wqe + 8 + dma_oob->inline_oob_size_div4 * 4); + + /* Select the RX work request for virtual address and for reposting. */ + rq_base_addr = hwc_rxq->msg_buf->mem_info.dma_handle; + rx_req_idx = (sge->address - rq_base_addr) / hwc->max_req_msg_size; + + bus_dmamap_sync(hwc_rxq->msg_buf->mem_info.dma_tag, + hwc_rxq->msg_buf->mem_info.dma_map, + BUS_DMASYNC_POSTREAD); + + rx_req = &hwc_rxq->msg_buf->reqs[rx_req_idx]; + resp = (struct gdma_resp_hdr *)rx_req->buf_va; + + if (resp->response.hwc_msg_id >= hwc->num_inflight_msg) { + device_printf(hwc->dev, "HWC RX: wrong msg_id=%u\n", + resp->response.hwc_msg_id); + return; + } + + mana_hwc_handle_resp(hwc, rx_oob->tx_oob_data_size, resp); + + /* Do no longer use 'resp', because the buffer is posted to the HW + * in the below mana_hwc_post_rx_wqe(). 
+ */ + resp = NULL; + + bus_dmamap_sync(hwc_rxq->msg_buf->mem_info.dma_tag, + hwc_rxq->msg_buf->mem_info.dma_map, + BUS_DMASYNC_PREREAD); + + mana_hwc_post_rx_wqe(hwc_rxq, rx_req); +} + +static void +mana_hwc_tx_event_handler(void *ctx, uint32_t gdma_txq_id, + const struct hwc_rx_oob *rx_oob) +{ + struct hw_channel_context *hwc = ctx; + struct hwc_wq *hwc_txq = hwc->txq; + + if (!hwc_txq || hwc_txq->gdma_wq->id != gdma_txq_id) { + mana_warn(NULL, "unmatched tx queue %u != %u\n", + hwc_txq->gdma_wq->id, gdma_txq_id); + } + + bus_dmamap_sync(hwc_txq->gdma_wq->mem_info.dma_tag, + hwc_txq->gdma_wq->mem_info.dma_map, + BUS_DMASYNC_POSTWRITE); +} + +static int +mana_hwc_create_gdma_wq(struct hw_channel_context *hwc, + enum gdma_queue_type type, uint64_t queue_size, + struct gdma_queue **queue) +{ + struct gdma_queue_spec spec = {}; + + if (type != GDMA_SQ && type != GDMA_RQ) + return EINVAL; + + spec.type = type; + spec.monitor_avl_buf = false; + spec.queue_size = queue_size; + + return mana_gd_create_hwc_queue(hwc->gdma_dev, &spec, queue); +} + +static int +mana_hwc_create_gdma_cq(struct hw_channel_context *hwc, + uint64_t queue_size, + void *ctx, gdma_cq_callback *cb, + struct gdma_queue *parent_eq, + struct gdma_queue **queue) +{ + struct gdma_queue_spec spec = {}; + + spec.type = GDMA_CQ; + spec.monitor_avl_buf = false; + spec.queue_size = queue_size; + spec.cq.context = ctx; + spec.cq.callback = cb; + spec.cq.parent_eq = parent_eq; + + return mana_gd_create_hwc_queue(hwc->gdma_dev, &spec, queue); +} + +static int +mana_hwc_create_gdma_eq(struct hw_channel_context *hwc, + uint64_t queue_size, + void *ctx, gdma_eq_callback *cb, + struct gdma_queue **queue) +{ + struct gdma_queue_spec spec = {}; + + spec.type = GDMA_EQ; + spec.monitor_avl_buf = false; + spec.queue_size = queue_size; + spec.eq.context = ctx; + spec.eq.callback = cb; + spec.eq.log2_throttle_limit = DEFAULT_LOG2_THROTTLING_FOR_ERROR_EQ; + + return mana_gd_create_hwc_queue(hwc->gdma_dev, &spec, queue); +} + +static void +mana_hwc_comp_event(void *ctx, struct gdma_queue *q_self) +{ + struct hwc_rx_oob comp_data = {}; + struct gdma_comp *completions; + struct hwc_cq *hwc_cq = ctx; + int comp_read, i; + + completions = hwc_cq->comp_buf; + comp_read = mana_gd_poll_cq(q_self, completions, hwc_cq->queue_depth); + + for (i = 0; i < comp_read; ++i) { + comp_data = *(struct hwc_rx_oob *)completions[i].cqe_data; + + if (completions[i].is_sq) + hwc_cq->tx_event_handler(hwc_cq->tx_event_ctx, + completions[i].wq_num, + &comp_data); + else + hwc_cq->rx_event_handler(hwc_cq->rx_event_ctx, + completions[i].wq_num, + &comp_data); + } + + bus_dmamap_sync(q_self->mem_info.dma_tag, q_self->mem_info.dma_map, + BUS_DMASYNC_POSTREAD); + + mana_gd_arm_cq(q_self); +} + +static void +mana_hwc_destroy_cq(struct gdma_context *gc, struct hwc_cq *hwc_cq) +{ + if (!hwc_cq) + return; + + if (hwc_cq->comp_buf) + free(hwc_cq->comp_buf, M_DEVBUF); + + if (hwc_cq->gdma_cq) + mana_gd_destroy_queue(gc, hwc_cq->gdma_cq); + + if (hwc_cq->gdma_eq) + mana_gd_destroy_queue(gc, hwc_cq->gdma_eq); + + free(hwc_cq, M_DEVBUF); +} + +static int +mana_hwc_create_cq(struct hw_channel_context *hwc, + uint16_t q_depth, + gdma_eq_callback *callback, void *ctx, + hwc_rx_event_handler_t *rx_ev_hdlr, void *rx_ev_ctx, + hwc_tx_event_handler_t *tx_ev_hdlr, void *tx_ev_ctx, + struct hwc_cq **hwc_cq_ptr) +{ + struct gdma_queue *eq, *cq; + struct gdma_comp *comp_buf; + struct hwc_cq *hwc_cq; + uint32_t eq_size, cq_size; + int err; + + eq_size = roundup_pow_of_two(GDMA_EQE_SIZE * q_depth); + 
if (eq_size < MINIMUM_SUPPORTED_PAGE_SIZE) + eq_size = MINIMUM_SUPPORTED_PAGE_SIZE; + + cq_size = roundup_pow_of_two(GDMA_CQE_SIZE * q_depth); + if (cq_size < MINIMUM_SUPPORTED_PAGE_SIZE) + cq_size = MINIMUM_SUPPORTED_PAGE_SIZE; + + hwc_cq = malloc(sizeof(*hwc_cq), M_DEVBUF, M_WAITOK | M_ZERO); + if (!hwc_cq) + return ENOMEM; + + err = mana_hwc_create_gdma_eq(hwc, eq_size, ctx, callback, &eq); + if (err) { + device_printf(hwc->dev, + "Failed to create HWC EQ for RQ: %d\n", err); + goto out; + } + hwc_cq->gdma_eq = eq; + + err = mana_hwc_create_gdma_cq(hwc, cq_size, hwc_cq, + mana_hwc_comp_event, eq, &cq); + if (err) { + device_printf(hwc->dev, + "Failed to create HWC CQ for RQ: %d\n", err); + goto out; + } + hwc_cq->gdma_cq = cq; + + comp_buf = mallocarray(q_depth, sizeof(struct gdma_comp), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!comp_buf) { + err = ENOMEM; + goto out; + } + + hwc_cq->hwc = hwc; + hwc_cq->comp_buf = comp_buf; + hwc_cq->queue_depth = q_depth; + hwc_cq->rx_event_handler = rx_ev_hdlr; + hwc_cq->rx_event_ctx = rx_ev_ctx; + hwc_cq->tx_event_handler = tx_ev_hdlr; + hwc_cq->tx_event_ctx = tx_ev_ctx; + + *hwc_cq_ptr = hwc_cq; + return 0; +out: + mana_hwc_destroy_cq(hwc->gdma_dev->gdma_context, hwc_cq); + return err; +} + +static int +mana_hwc_alloc_dma_buf(struct hw_channel_context *hwc, uint16_t q_depth, + uint32_t max_msg_size, + struct hwc_dma_buf **dma_buf_ptr) +{ + struct gdma_context *gc = hwc->gdma_dev->gdma_context; + struct hwc_work_request *hwc_wr; + struct hwc_dma_buf *dma_buf; + struct gdma_mem_info *gmi; + uint32_t buf_size; + uint8_t *base_pa; + void *virt_addr; + uint16_t i; + int err; + + dma_buf = malloc(sizeof(*dma_buf) + + q_depth * sizeof(struct hwc_work_request), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!dma_buf) + return ENOMEM; + + dma_buf->num_reqs = q_depth; + + buf_size = ALIGN(q_depth * max_msg_size, PAGE_SIZE); + + gmi = &dma_buf->mem_info; + err = mana_gd_alloc_memory(gc, buf_size, gmi); + if (err) { + device_printf(hwc->dev, + "Failed to allocate DMA buffer: %d\n", err); + goto out; + } + + virt_addr = dma_buf->mem_info.virt_addr; + base_pa = (uint8_t *)dma_buf->mem_info.dma_handle; + + for (i = 0; i < q_depth; i++) { + hwc_wr = &dma_buf->reqs[i]; + + hwc_wr->buf_va = (char *)virt_addr + i * max_msg_size; + hwc_wr->buf_sge_addr = base_pa + i * max_msg_size; + + hwc_wr->buf_len = max_msg_size; + } + + *dma_buf_ptr = dma_buf; + return 0; +out: + free(dma_buf, M_DEVBUF); + return err; +} + +static void +mana_hwc_dealloc_dma_buf(struct hw_channel_context *hwc, + struct hwc_dma_buf *dma_buf) +{ + if (!dma_buf) + return; + + mana_gd_free_memory(&dma_buf->mem_info); + + free(dma_buf, M_DEVBUF); +} + +static void +mana_hwc_destroy_wq(struct hw_channel_context *hwc, + struct hwc_wq *hwc_wq) +{ + if (!hwc_wq) + return; + + mana_hwc_dealloc_dma_buf(hwc, hwc_wq->msg_buf); + + if (hwc_wq->gdma_wq) + mana_gd_destroy_queue(hwc->gdma_dev->gdma_context, + hwc_wq->gdma_wq); + + free(hwc_wq, M_DEVBUF); +} + +static int +mana_hwc_create_wq(struct hw_channel_context *hwc, + enum gdma_queue_type q_type, uint16_t q_depth, + uint32_t max_msg_size, struct hwc_cq *hwc_cq, + struct hwc_wq **hwc_wq_ptr) +{ + struct gdma_queue *queue; + struct hwc_wq *hwc_wq; + uint32_t queue_size; + int err; + + if (q_type != GDMA_SQ && q_type != GDMA_RQ) { + /* XXX should fail and return error? 
*/ + mana_warn(NULL, "Invalid q_type %u\n", q_type); + } + + if (q_type == GDMA_RQ) + queue_size = roundup_pow_of_two(GDMA_MAX_RQE_SIZE * q_depth); + else + queue_size = roundup_pow_of_two(GDMA_MAX_SQE_SIZE * q_depth); + + if (queue_size < MINIMUM_SUPPORTED_PAGE_SIZE) + queue_size = MINIMUM_SUPPORTED_PAGE_SIZE; + + hwc_wq = malloc(sizeof(*hwc_wq), M_DEVBUF, M_WAITOK | M_ZERO); + if (!hwc_wq) + return ENOMEM; + + err = mana_hwc_create_gdma_wq(hwc, q_type, queue_size, &queue); + if (err) + goto out; + + err = mana_hwc_alloc_dma_buf(hwc, q_depth, max_msg_size, + &hwc_wq->msg_buf); + if (err) + goto out; + + hwc_wq->hwc = hwc; + hwc_wq->gdma_wq = queue; + hwc_wq->queue_depth = q_depth; + hwc_wq->hwc_cq = hwc_cq; + + *hwc_wq_ptr = hwc_wq; + return 0; +out: + if (err) + mana_hwc_destroy_wq(hwc, hwc_wq); + return err; +} + +static int +mana_hwc_post_tx_wqe(const struct hwc_wq *hwc_txq, + struct hwc_work_request *req, + uint32_t dest_virt_rq_id, uint32_t dest_virt_rcq_id, + bool dest_pf) +{ + device_t dev = hwc_txq->hwc->dev; + struct hwc_tx_oob *tx_oob; + struct gdma_sge *sge; + int err; + + if (req->msg_size == 0 || req->msg_size > req->buf_len) { + device_printf(dev, "wrong msg_size: %u, buf_len: %u\n", + req->msg_size, req->buf_len); + return EINVAL; + } + + tx_oob = &req->tx_oob; + + tx_oob->vrq_id = dest_virt_rq_id; + tx_oob->dest_vfid = 0; + tx_oob->vrcq_id = dest_virt_rcq_id; + tx_oob->vscq_id = hwc_txq->hwc_cq->gdma_cq->id; + tx_oob->loopback = false; + tx_oob->lso_override = false; + tx_oob->dest_pf = dest_pf; + tx_oob->vsq_id = hwc_txq->gdma_wq->id; + + sge = &req->sge; + sge->address = (uint64_t)req->buf_sge_addr; + sge->mem_key = hwc_txq->msg_buf->gpa_mkey; + sge->size = req->msg_size; + + memset(&req->wqe_req, 0, sizeof(struct gdma_wqe_request)); + req->wqe_req.sgl = sge; + req->wqe_req.num_sge = 1; + req->wqe_req.inline_oob_size = sizeof(struct hwc_tx_oob); + req->wqe_req.inline_oob_data = tx_oob; + req->wqe_req.client_data_unit = 0; + + err = mana_gd_post_and_ring(hwc_txq->gdma_wq, &req->wqe_req, NULL); + if (err) + device_printf(dev, + "Failed to post WQE on HWC SQ: %d\n", err); + return err; +} + +static int +mana_hwc_init_inflight_msg(struct hw_channel_context *hwc, uint16_t num_msg) +{ + int err; + + sema_init(&hwc->sema, num_msg, "gdma hwc sema"); + + err = mana_gd_alloc_res_map(num_msg, &hwc->inflight_msg_res, + "gdma hwc res lock"); + if (err) + device_printf(hwc->dev, + "Failed to init inflight_msg_res: %d\n", err); + + return (err); +} + +static int +mana_hwc_test_channel(struct hw_channel_context *hwc, uint16_t q_depth, + uint32_t max_req_msg_size, uint32_t max_resp_msg_size) +{ + struct gdma_context *gc = hwc->gdma_dev->gdma_context; + struct hwc_wq *hwc_rxq = hwc->rxq; + struct hwc_work_request *req; + struct hwc_caller_ctx *ctx; + int err; + int i; + + /* Post all WQEs on the RQ */ + for (i = 0; i < q_depth; i++) { + req = &hwc_rxq->msg_buf->reqs[i]; + err = mana_hwc_post_rx_wqe(hwc_rxq, req); + if (err) + return err; + } + + ctx = malloc(q_depth * sizeof(struct hwc_caller_ctx), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!ctx) + return ENOMEM; + + for (i = 0; i < q_depth; ++i) + init_completion(&ctx[i].comp_event); + + hwc->caller_ctx = ctx; + + return mana_gd_test_eq(gc, hwc->cq->gdma_eq); +} + +static int +mana_hwc_establish_channel(struct gdma_context *gc, uint16_t *q_depth, + uint32_t *max_req_msg_size, + uint32_t *max_resp_msg_size) +{ + struct hw_channel_context *hwc = gc->hwc.driver_data; + struct gdma_queue *rq = hwc->rxq->gdma_wq; + struct gdma_queue *sq = 
hwc->txq->gdma_wq; + struct gdma_queue *eq = hwc->cq->gdma_eq; + struct gdma_queue *cq = hwc->cq->gdma_cq; + int err; + + init_completion(&hwc->hwc_init_eqe_comp); + + err = mana_smc_setup_hwc(&gc->shm_channel, false, + eq->mem_info.dma_handle, + cq->mem_info.dma_handle, + rq->mem_info.dma_handle, + sq->mem_info.dma_handle, + eq->eq.msix_index); + if (err) + return err; + + if (wait_for_completion_timeout(&hwc->hwc_init_eqe_comp, 60 * hz)) + return ETIMEDOUT; + + *q_depth = hwc->hwc_init_q_depth_max; + *max_req_msg_size = hwc->hwc_init_max_req_msg_size; + *max_resp_msg_size = hwc->hwc_init_max_resp_msg_size; + + if (cq->id >= gc->max_num_cqs) { + mana_warn(NULL, "invalid cq id %u > %u\n", + cq->id, gc->max_num_cqs); + return EPROTO; + } + + gc->cq_table = malloc(gc->max_num_cqs * sizeof(struct gdma_queue *), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!gc->cq_table) + return ENOMEM; + + gc->cq_table[cq->id] = cq; + + return 0; +} + +static int +mana_hwc_init_queues(struct hw_channel_context *hwc, uint16_t q_depth, + uint32_t max_req_msg_size, uint32_t max_resp_msg_size) +{ + struct hwc_wq *hwc_rxq = NULL; + struct hwc_wq *hwc_txq = NULL; + struct hwc_cq *hwc_cq = NULL; + int err; + + err = mana_hwc_init_inflight_msg(hwc, q_depth); + if (err) + return err; + + /* CQ is shared by SQ and RQ, so CQ's queue depth is the sum of SQ + * queue depth and RQ queue depth. + */ + err = mana_hwc_create_cq(hwc, q_depth * 2, + mana_hwc_init_event_handler, hwc, + mana_hwc_rx_event_handler, hwc, + mana_hwc_tx_event_handler, hwc, &hwc_cq); + if (err) { + device_printf(hwc->dev, "Failed to create HWC CQ: %d\n", err); + goto out; + } + hwc->cq = hwc_cq; + + err = mana_hwc_create_wq(hwc, GDMA_RQ, q_depth, max_req_msg_size, + hwc_cq, &hwc_rxq); + if (err) { + device_printf(hwc->dev, "Failed to create HWC RQ: %d\n", err); + goto out; + } + hwc->rxq = hwc_rxq; + + err = mana_hwc_create_wq(hwc, GDMA_SQ, q_depth, max_resp_msg_size, + hwc_cq, &hwc_txq); + if (err) { + device_printf(hwc->dev, "Failed to create HWC SQ: %d\n", err); + goto out; + } + hwc->txq = hwc_txq; + + hwc->num_inflight_msg = q_depth; + hwc->max_req_msg_size = max_req_msg_size; + + return 0; +out: + if (hwc_txq) + mana_hwc_destroy_wq(hwc, hwc_txq); + + if (hwc_rxq) + mana_hwc_destroy_wq(hwc, hwc_rxq); + + if (hwc_cq) + mana_hwc_destroy_cq(hwc->gdma_dev->gdma_context, hwc_cq); + + mana_gd_free_res_map(&hwc->inflight_msg_res); + return err; +} + +int +mana_hwc_create_channel(struct gdma_context *gc) +{ + uint32_t max_req_msg_size, max_resp_msg_size; + struct gdma_dev *gd = &gc->hwc; + struct hw_channel_context *hwc; + uint16_t q_depth_max; + int err; + + hwc = malloc(sizeof(*hwc), M_DEVBUF, M_WAITOK | M_ZERO); + if (!hwc) + return ENOMEM; + + gd->gdma_context = gc; + gd->driver_data = hwc; + hwc->gdma_dev = gd; + hwc->dev = gc->dev; + + /* HWC's instance number is always 0. 
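+	 * Only the device type is filled in below; the instance stays 0.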
*/ + gd->dev_id.as_uint32 = 0; + gd->dev_id.type = GDMA_DEVICE_HWC; + + gd->pdid = INVALID_PDID; + gd->doorbell = INVALID_DOORBELL; + + err = mana_hwc_init_queues(hwc, HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH, + HW_CHANNEL_MAX_REQUEST_SIZE, + HW_CHANNEL_MAX_RESPONSE_SIZE); + if (err) { + device_printf(hwc->dev, "Failed to initialize HWC: %d\n", + err); + goto out; + } + + err = mana_hwc_establish_channel(gc, &q_depth_max, &max_req_msg_size, + &max_resp_msg_size); + if (err) { + device_printf(hwc->dev, "Failed to establish HWC: %d\n", err); + goto out; + } + + err = mana_hwc_test_channel(gc->hwc.driver_data, + HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH, + max_req_msg_size, max_resp_msg_size); + if (err) { + /* Test failed, but the channel has been established */ + device_printf(hwc->dev, "Failed to test HWC: %d\n", err); + return EIO; + } + + return 0; +out: + free(hwc, M_DEVBUF); + return (err); +} + +void +mana_hwc_destroy_channel(struct gdma_context *gc) +{ + struct hw_channel_context *hwc = gc->hwc.driver_data; + struct hwc_caller_ctx *ctx; + + mana_smc_teardown_hwc(&gc->shm_channel, false); + + ctx = hwc->caller_ctx; + free(ctx, M_DEVBUF); + hwc->caller_ctx = NULL; + + mana_hwc_destroy_wq(hwc, hwc->txq); + hwc->txq = NULL; + + mana_hwc_destroy_wq(hwc, hwc->rxq); + hwc->rxq = NULL; + + mana_hwc_destroy_cq(hwc->gdma_dev->gdma_context, hwc->cq); + hwc->cq = NULL; + + mana_gd_free_res_map(&hwc->inflight_msg_res); + + hwc->num_inflight_msg = 0; + + if (hwc->gdma_dev->pdid != INVALID_PDID) { + hwc->gdma_dev->doorbell = INVALID_DOORBELL; + hwc->gdma_dev->pdid = INVALID_PDID; + } + + free(hwc, M_DEVBUF); + gc->hwc.driver_data = NULL; + gc->hwc.gdma_context = NULL; +} + +int +mana_hwc_send_request(struct hw_channel_context *hwc, uint32_t req_len, + const void *req, uint32_t resp_len, void *resp) +{ + struct hwc_work_request *tx_wr; + struct hwc_wq *txq = hwc->txq; + struct gdma_req_hdr *req_msg; + struct hwc_caller_ctx *ctx; + uint16_t msg_id; + int err; + + mana_hwc_get_msg_index(hwc, &msg_id); + + tx_wr = &txq->msg_buf->reqs[msg_id]; + + if (req_len > tx_wr->buf_len) { + device_printf(hwc->dev, + "HWC: req msg size: %d > %d\n", req_len, + tx_wr->buf_len); + err = EINVAL; + goto out; + } + + ctx = hwc->caller_ctx + msg_id; + ctx->output_buf = resp; + ctx->output_buflen = resp_len; + + req_msg = (struct gdma_req_hdr *)tx_wr->buf_va; + if (req) + memcpy(req_msg, req, req_len); + + req_msg->req.hwc_msg_id = msg_id; + + tx_wr->msg_size = req_len; + + err = mana_hwc_post_tx_wqe(txq, tx_wr, 0, 0, false); + if (err) { + device_printf(hwc->dev, + "HWC: Failed to post send WQE: %d\n", err); + goto out; + } + + if (wait_for_completion_timeout(&ctx->comp_event, 30 * hz)) { + device_printf(hwc->dev, "HWC: Request timed out!\n"); + err = ETIMEDOUT; + goto out; + } + + if (ctx->error) { + err = ctx->error; + goto out; + } + + if (ctx->status_code) { + device_printf(hwc->dev, + "HWC: Failed hw_channel req: 0x%x\n", ctx->status_code); + err = EPROTO; + goto out; + } +out: + mana_hwc_put_msg_index(hwc, msg_id); + return err; +} diff --git a/sys/dev/mana/mana.h b/sys/dev/mana/mana.h new file mode 100644 --- /dev/null +++ b/sys/dev/mana/mana.h @@ -0,0 +1,689 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef _MANA_H +#define _MANA_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "gdma.h" +#include "hw_channel.h" + + +/* Microsoft Azure Network Adapter (MANA)'s definitions + * + * Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. + */ +/* MANA protocol version */ +#define MANA_MAJOR_VERSION 0 +#define MANA_MINOR_VERSION 1 +#define MANA_MICRO_VERSION 1 + +#define DRV_MODULE_NAME "mana" + +#ifndef DRV_MODULE_VERSION +#define DRV_MODULE_VERSION \ + __XSTRING(MANA_MAJOR_VERSION) "." \ + __XSTRING(MANA_MINOR_VERSION) "." \ + __XSTRING(MANA_MICRO_VERSION) +#endif +#define DEVICE_NAME "Microsoft Azure Network Adapter (MANA)" +#define DEVICE_DESC "MANA adapter" + +/* + * Supported PCI vendor and devices IDs + */ +#ifndef PCI_VENDOR_ID_MICROSOFT +#define PCI_VENDOR_ID_MICROSOFT 0x1414 +#endif + +#define PCI_DEV_ID_MANA_VF 0x00ba + +typedef struct _mana_vendor_id_t { + uint16_t vendor_id; + uint16_t device_id; +} mana_vendor_id_t; + +typedef uint64_t mana_handle_t; +#define INVALID_MANA_HANDLE ((mana_handle_t)-1) + +enum TRI_STATE { + TRI_STATE_UNKNOWN = -1, + TRI_STATE_FALSE = 0, + TRI_STATE_TRUE = 1 +}; + +/* Number of entries for hardware indirection table must be in power of 2 */ +#define MANA_INDIRECT_TABLE_SIZE 64 +#define MANA_INDIRECT_TABLE_MASK (MANA_INDIRECT_TABLE_SIZE - 1) + +/* The Toeplitz hash key's length in bytes: should be multiple of 8 */ +#define MANA_HASH_KEY_SIZE 40 + +#define COMP_ENTRY_SIZE 64 + +#define MIN_FRAME_SIZE 146 +#define ADAPTER_MTU_SIZE 1500 +#define DEFAULT_FRAME_SIZE (ADAPTER_MTU_SIZE + 14) +#define MAX_FRAME_SIZE 4096 + +#define RX_BUFFERS_PER_QUEUE 512 + +#define MAX_SEND_BUFFERS_PER_QUEUE 256 + +#define EQ_SIZE (8 * PAGE_SIZE) +#define LOG2_EQ_THROTTLE 3 + +#if 1 /* XXX */ +#define MAX_PORTS_IN_MANA_DEV 1 +#else +#define MAX_PORTS_IN_MANA_DEV 16 +#endif + +struct mana_send_buf_info { + struct mbuf *mbuf; + bus_dmamap_t dma_map; + + /* Required to store the result of mana_gd_post_work_request. + * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the + * work queue when the WQE is consumed. 
+ */ + struct gdma_posted_wqe_info wqe_inf; +}; + +struct mana_stats { + counter_u64_t packets; /* rx, tx */ + counter_u64_t bytes; /* rx, tx */ + counter_u64_t stop; /* tx */ + counter_u64_t wakeup; /* tx */ + counter_u64_t collapse; /* tx */ + counter_u64_t collapse_err; /* tx */ + counter_u64_t dma_mapping_err; /* rx, tx */ + counter_u64_t mbuf_alloc_fail; /* rx */ + counter_u64_t alt_chg; /* tx */ + counter_u64_t alt_reset; /* tx */ +}; + +struct mana_txq { + struct gdma_queue *gdma_sq; + + union { + uint32_t gdma_txq_id; + struct { + uint32_t reserved1 :10; + uint32_t vsq_frame :14; + uint32_t reserved2 :8; + }; + }; + + uint16_t vp_offset; + + struct ifnet *ndev; + /* Store index to the array of tx_qp in port structure */ + int idx; + /* The alternative txq idx when this txq is under heavy load */ + int alt_txq_idx; + + /* The mbufs are sent to the HW and we are waiting for the CQEs. */ + struct mana_send_buf_info *tx_buf_info; + uint16_t next_to_use; + uint16_t next_to_complete; + + atomic_t pending_sends; + + struct buf_ring *txq_br; + struct mtx txq_mtx; + char txq_mtx_name[16]; + + struct task enqueue_task; + struct taskqueue *enqueue_tq; + + struct mana_stats stats; +}; + + +/* + * Max WQE size is 512B. The first 8B is for GDMA Out of Band (OOB), + * next is the Client OOB can be either 8B or 24B. Thus, the max + * space for SGL entries in a singel WQE is 512 - 8 - 8 = 496B. Since each + * SGL is 16B in size, the max number of SGLs in a WQE is 496/16 = 31. + * Save one for emergency use, set the MAX_MBUF_FRAGS allowed to 30. + */ +#define MAX_MBUF_FRAGS 30 +#define MANA_TSO_MAXSEG_SZ PAGE_SIZE + +/* mbuf data and frags dma mappings */ +struct mana_mbuf_head { + bus_addr_t dma_handle[MAX_MBUF_FRAGS + 1]; + + uint32_t size[MAX_MBUF_FRAGS + 1]; +}; + +#define MANA_HEADROOM sizeof(struct mana_mbuf_head) + +enum mana_tx_pkt_format { + MANA_SHORT_PKT_FMT = 0, + MANA_LONG_PKT_FMT = 1, +}; + +struct mana_tx_short_oob { + uint32_t pkt_fmt :2; + uint32_t is_outer_ipv4 :1; + uint32_t is_outer_ipv6 :1; + uint32_t comp_iphdr_csum :1; + uint32_t comp_tcp_csum :1; + uint32_t comp_udp_csum :1; + uint32_t supress_txcqe_gen :1; + uint32_t vcq_num :24; + + uint32_t trans_off :10; /* Transport header offset */ + uint32_t vsq_frame :14; + uint32_t short_vp_offset :8; +}; /* HW DATA */ + +struct mana_tx_long_oob { + uint32_t is_encap :1; + uint32_t inner_is_ipv6 :1; + uint32_t inner_tcp_opt :1; + uint32_t inject_vlan_pri_tag :1; + uint32_t reserved1 :12; + uint32_t pcp :3; /* 802.1Q */ + uint32_t dei :1; /* 802.1Q */ + uint32_t vlan_id :12; /* 802.1Q */ + + uint32_t inner_frame_offset :10; + uint32_t inner_ip_rel_offset :6; + uint32_t long_vp_offset :12; + uint32_t reserved2 :4; + + uint32_t reserved3; + uint32_t reserved4; +}; /* HW DATA */ + +struct mana_tx_oob { + struct mana_tx_short_oob s_oob; + struct mana_tx_long_oob l_oob; +}; /* HW DATA */ + +enum mana_cq_type { + MANA_CQ_TYPE_RX, + MANA_CQ_TYPE_TX, +}; + +enum mana_cqe_type { + CQE_INVALID = 0, + CQE_RX_OKAY = 1, + CQE_RX_COALESCED_4 = 2, + CQE_RX_OBJECT_FENCE = 3, + CQE_RX_TRUNCATED = 4, + + CQE_TX_OKAY = 32, + CQE_TX_SA_DROP = 33, + CQE_TX_MTU_DROP = 34, + CQE_TX_INVALID_OOB = 35, + CQE_TX_INVALID_ETH_TYPE = 36, + CQE_TX_HDR_PROCESSING_ERROR = 37, + CQE_TX_VF_DISABLED = 38, + CQE_TX_VPORT_IDX_OUT_OF_RANGE = 39, + CQE_TX_VPORT_DISABLED = 40, + CQE_TX_VLAN_TAGGING_VIOLATION = 41, +}; + +#define MANA_CQE_COMPLETION 1 + +struct mana_cqe_header { + uint32_t cqe_type :6; + uint32_t client_type :2; + uint32_t vendor_err :24; +}; /* HW DATA */ + 
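+/*
+ * The 9-bit rx_hashtype field in the RX completion OOB below is a
+ * bitmask of these NDIS hash types.
+ */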
+/* NDIS HASH Types */ +#define NDIS_HASH_IPV4 BIT(0) +#define NDIS_HASH_TCP_IPV4 BIT(1) +#define NDIS_HASH_UDP_IPV4 BIT(2) +#define NDIS_HASH_IPV6 BIT(3) +#define NDIS_HASH_TCP_IPV6 BIT(4) +#define NDIS_HASH_UDP_IPV6 BIT(5) +#define NDIS_HASH_IPV6_EX BIT(6) +#define NDIS_HASH_TCP_IPV6_EX BIT(7) +#define NDIS_HASH_UDP_IPV6_EX BIT(8) + +#define MANA_HASH_L3 (NDIS_HASH_IPV4 | NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX) +#define MANA_HASH_L4 \ + (NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4 | NDIS_HASH_TCP_IPV6 | \ + NDIS_HASH_UDP_IPV6 | NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX) + +#define NDIS_HASH_IPV4_L3_MASK (NDIS_HASH_IPV4) +#define NDIS_HASH_IPV4_L4_MASK (NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4) +#define NDIS_HASH_IPV6_L3_MASK (NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX) +#define NDIS_HASH_IPV6_L4_MASK \ + (NDIS_HASH_TCP_IPV6 | NDIS_HASH_UDP_IPV6 | \ + NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX) +#define NDIS_HASH_IPV4_MASK \ + (NDIS_HASH_IPV4_L3_MASK | NDIS_HASH_IPV4_L4_MASK) +#define NDIS_HASH_IPV6_MASK \ + (NDIS_HASH_IPV6_L3_MASK | NDIS_HASH_IPV6_L4_MASK) + + +struct mana_rxcomp_perpkt_info { + uint32_t pkt_len :16; + uint32_t reserved1 :16; + uint32_t reserved2; + uint32_t pkt_hash; +}; /* HW DATA */ + +#define MANA_RXCOMP_OOB_NUM_PPI 4 + +/* Receive completion OOB */ +struct mana_rxcomp_oob { + struct mana_cqe_header cqe_hdr; + + uint32_t rx_vlan_id :12; + uint32_t rx_vlantag_present :1; + uint32_t rx_outer_iphdr_csum_succeed :1; + uint32_t rx_outer_iphdr_csum_fail :1; + uint32_t reserved1 :1; + uint32_t rx_hashtype :9; + uint32_t rx_iphdr_csum_succeed :1; + uint32_t rx_iphdr_csum_fail :1; + uint32_t rx_tcp_csum_succeed :1; + uint32_t rx_tcp_csum_fail :1; + uint32_t rx_udp_csum_succeed :1; + uint32_t rx_udp_csum_fail :1; + uint32_t reserved2 :1; + + struct mana_rxcomp_perpkt_info ppi[MANA_RXCOMP_OOB_NUM_PPI]; + + uint32_t rx_wqe_offset; +}; /* HW DATA */ + +struct mana_tx_comp_oob { + struct mana_cqe_header cqe_hdr; + + uint32_t tx_data_offset; + + uint32_t tx_sgl_offset :5; + uint32_t tx_wqe_offset :27; + + uint32_t reserved[12]; +}; /* HW DATA */ + +struct mana_rxq; + +struct mana_cq { + struct gdma_queue *gdma_cq; + + /* Cache the CQ id (used to verify if each CQE comes to the right CQ. */ + uint32_t gdma_id; + + /* Type of the CQ: TX or RX */ + enum mana_cq_type type; + + /* Pointer to the mana_rxq that is pushing RX CQEs to the queue. + * Only and must be non-NULL if type is MANA_CQ_TYPE_RX. + */ + struct mana_rxq *rxq; + + /* Pointer to the mana_txq that is pushing TX CQEs to the queue. + * Only and must be non-NULL if type is MANA_CQ_TYPE_TX. + */ + struct mana_txq *txq; + + /* Pointer to a buffer which the CQ handler can copy the CQE's into. */ + struct gdma_comp *gdma_comp_buf; +}; + +#define GDMA_MAX_RQE_SGES 15 + +struct mana_recv_buf_oob { + /* A valid GDMA work request representing the data buffer. */ + struct gdma_wqe_request wqe_req; + + struct mbuf *mbuf; + bus_dmamap_t dma_map; + + /* SGL of the buffer going to be sent as part of the work request. */ + uint32_t num_sge; + struct gdma_sge sgl[GDMA_MAX_RQE_SGES]; + + /* Required to store the result of mana_gd_post_work_request. + * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the + * work queue when the WQE is consumed. 
+ */ + struct gdma_posted_wqe_info wqe_inf; +}; + +struct mana_rxq { + struct gdma_queue *gdma_rq; + /* Cache the gdma receive queue id */ + uint32_t gdma_id; + + /* Index of RQ in the vPort, not gdma receive queue id */ + uint32_t rxq_idx; + + uint32_t datasize; + + mana_handle_t rxobj; + + struct mana_cq rx_cq; + + struct ifnet *ndev; + struct lro_ctrl lro; + + /* Total number of receive buffers to be allocated */ + uint32_t num_rx_buf; + + uint32_t buf_index; + + struct mana_stats stats; + + /* MUST BE THE LAST MEMBER: + * Each receive buffer has an associated mana_recv_buf_oob. + */ + struct mana_recv_buf_oob rx_oobs[]; +}; + +struct mana_tx_qp { + struct mana_txq txq; + + struct mana_cq tx_cq; + + mana_handle_t tx_object; +}; + +struct mana_port_stats { + counter_u64_t rx_packets; + counter_u64_t tx_packets; + + counter_u64_t rx_bytes; + counter_u64_t tx_bytes; + + counter_u64_t rx_drops; + counter_u64_t tx_drops; + + counter_u64_t stop_queue; + counter_u64_t wake_queue; +}; + +struct mana_context { + struct gdma_dev *gdma_dev; + + uint16_t num_ports; + + struct ifnet *ports[MAX_PORTS_IN_MANA_DEV]; +}; + +struct mana_port_context { + struct mana_context *ac; + struct ifnet *ndev; + struct ifmedia media; + + struct sx apc_lock; + + /* DMA tag used for queue bufs of the entire port */ + bus_dma_tag_t rx_buf_tag; + bus_dma_tag_t tx_buf_tag; + + uint8_t mac_addr[ETHER_ADDR_LEN]; + + struct mana_eq *eqs; + + enum TRI_STATE rss_state; + + mana_handle_t default_rxobj; + bool tx_shortform_allowed; + uint16_t tx_vp_offset; + + struct mana_tx_qp *tx_qp; + + /* Indirection Table for RX & TX. The values are queue indexes */ + uint32_t indir_table[MANA_INDIRECT_TABLE_SIZE]; + + /* Indirection table containing RxObject Handles */ + mana_handle_t rxobj_table[MANA_INDIRECT_TABLE_SIZE]; + + /* Hash key used by the NIC */ + uint8_t hashkey[MANA_HASH_KEY_SIZE]; + + /* This points to an array of num_queues of RQ pointers. */ + struct mana_rxq **rxqs; + + /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. 
*/ + unsigned int max_queues; + unsigned int num_queues; + + mana_handle_t port_handle; + + uint16_t port_idx; + + uint16_t frame_size; + + bool port_is_up; + bool port_st_save; /* Saved port state */ + + bool enable_tx_altq; + bool bind_cleanup_thread_cpu; + + struct mana_port_stats port_stats; + + struct sysctl_oid_list *port_list; + struct sysctl_ctx_list que_sysctl_ctx; +}; + +#define MANA_APC_LOCK_INIT(apc) \ + sx_init(&(apc)->apc_lock, "MANA port lock") +#define MANA_APC_LOCK_DESTROY(apc) sx_destroy(&(apc)->apc_lock) +#define MANA_APC_LOCK_LOCK(apc) sx_xlock(&(apc)->apc_lock) +#define MANA_APC_LOCK_UNLOCK(apc) sx_unlock(&(apc)->apc_lock) + +int mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx, + bool update_hash, bool update_tab); + +int mana_alloc_queues(struct ifnet *ndev); +int mana_attach(struct ifnet *ndev); +int mana_detach(struct ifnet *ndev); + +int mana_probe(struct gdma_dev *gd); +void mana_remove(struct gdma_dev *gd); + +struct mana_obj_spec { + uint32_t queue_index; + uint64_t gdma_region; + uint32_t queue_size; + uint32_t attached_eq; + uint32_t modr_ctx_id; +}; + +enum mana_command_code { + MANA_QUERY_DEV_CONFIG = 0x20001, + MANA_QUERY_GF_STAT = 0x20002, + MANA_CONFIG_VPORT_TX = 0x20003, + MANA_CREATE_WQ_OBJ = 0x20004, + MANA_DESTROY_WQ_OBJ = 0x20005, + MANA_FENCE_RQ = 0x20006, + MANA_CONFIG_VPORT_RX = 0x20007, + MANA_QUERY_VPORT_CONFIG = 0x20008, +}; + +/* Query Device Configuration */ +struct mana_query_device_cfg_req { + struct gdma_req_hdr hdr; + + /* Driver Capability flags */ + uint64_t drv_cap_flags1; + uint64_t drv_cap_flags2; + uint64_t drv_cap_flags3; + uint64_t drv_cap_flags4; + + uint32_t proto_major_ver; + uint32_t proto_minor_ver; + uint32_t proto_micro_ver; + + uint32_t reserved; +}; /* HW DATA */ + +struct mana_query_device_cfg_resp { + struct gdma_resp_hdr hdr; + + uint64_t pf_cap_flags1; + uint64_t pf_cap_flags2; + uint64_t pf_cap_flags3; + uint64_t pf_cap_flags4; + + uint16_t max_num_vports; + uint16_t reserved; + uint32_t max_num_eqs; +}; /* HW DATA */ + +/* Query vPort Configuration */ +struct mana_query_vport_cfg_req { + struct gdma_req_hdr hdr; + uint32_t vport_index; +}; /* HW DATA */ + +struct mana_query_vport_cfg_resp { + struct gdma_resp_hdr hdr; + uint32_t max_num_sq; + uint32_t max_num_rq; + uint32_t num_indirection_ent; + uint32_t reserved1; + uint8_t mac_addr[6]; + uint8_t reserved2[2]; + mana_handle_t vport; +}; /* HW DATA */ + +/* Configure vPort */ +struct mana_config_vport_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + uint32_t pdid; + uint32_t doorbell_pageid; +}; /* HW DATA */ + +struct mana_config_vport_resp { + struct gdma_resp_hdr hdr; + uint16_t tx_vport_offset; + uint8_t short_form_allowed; + uint8_t reserved; +}; /* HW DATA */ + +/* Create WQ Object */ +struct mana_create_wqobj_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + uint32_t wq_type; + uint32_t reserved; + uint64_t wq_gdma_region; + uint64_t cq_gdma_region; + uint32_t wq_size; + uint32_t cq_size; + uint32_t cq_moderation_ctx_id; + uint32_t cq_parent_qid; +}; /* HW DATA */ + +struct mana_create_wqobj_resp { + struct gdma_resp_hdr hdr; + uint32_t wq_id; + uint32_t cq_id; + mana_handle_t wq_obj; +}; /* HW DATA */ + +/* Destroy WQ Object */ +struct mana_destroy_wqobj_req { + struct gdma_req_hdr hdr; + uint32_t wq_type; + uint32_t reserved; + mana_handle_t wq_obj_handle; +}; /* HW DATA */ + +struct mana_destroy_wqobj_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Fence RQ */ +struct mana_fence_rq_req { + struct gdma_req_hdr 
hdr; + mana_handle_t wq_obj_handle; +}; /* HW DATA */ + +struct mana_fence_rq_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Configure vPort Rx Steering */ +struct mana_cfg_rx_steer_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + uint16_t num_indir_entries; + uint16_t indir_tab_offset; + uint32_t rx_enable; + uint32_t rss_enable; + uint8_t update_default_rxobj; + uint8_t update_hashkey; + uint8_t update_indir_tab; + uint8_t reserved; + mana_handle_t default_rxobj; + uint8_t hashkey[MANA_HASH_KEY_SIZE]; +}; /* HW DATA */ + +struct mana_cfg_rx_steer_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +#define MANA_MAX_NUM_QUEUES 16 + +#define MANA_SHORT_VPORT_OFFSET_MAX ((1U << 8) - 1) + +struct mana_tx_package { + struct gdma_wqe_request wqe_req; + struct gdma_sge sgl_array[MAX_MBUF_FRAGS]; + + struct mana_tx_oob tx_oob; + + struct gdma_posted_wqe_info wqe_info; +}; + +int mana_restart(struct mana_port_context *apc); + +#endif /* _MANA_H */ diff --git a/sys/dev/mana/mana_en.c b/sys/dev/mana/mana_en.c new file mode 100644 --- /dev/null +++ b/sys/dev/mana/mana_en.c @@ -0,0 +1,2699 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#ifdef RSS +#include +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "mana.h" +#include "mana_sysctl.h" + +static int mana_up(struct mana_port_context *apc); +static int mana_down(struct mana_port_context *apc); + +static void +mana_rss_key_fill(void *k, size_t size) +{ + static bool rss_key_generated = false; + static uint8_t rss_key[MANA_HASH_KEY_SIZE]; + + KASSERT(size <= MANA_HASH_KEY_SIZE, + ("Request more buytes than MANA RSS key can hold")); + + if (!rss_key_generated) { + arc4random_buf(rss_key, MANA_HASH_KEY_SIZE); + rss_key_generated = true; + } + memcpy(k, rss_key, size); +} + +static int +mana_ifmedia_change(struct ifnet *ifp __unused) +{ + return EOPNOTSUPP; +} + +static void +mana_ifmedia_status(struct ifnet *ifp, struct ifmediareq *ifmr) +{ + struct mana_port_context *apc = if_getsoftc(ifp); + + if (!apc) { + if_printf(ifp, "Port not available\n"); + return; + } + + MANA_APC_LOCK_LOCK(apc); + + ifmr->ifm_status = IFM_AVALID; + ifmr->ifm_active = IFM_ETHER; + + if (!apc->port_is_up) { + MANA_APC_LOCK_UNLOCK(apc); + mana_info(NULL, "Port %u link is down\n", apc->port_idx); + return; + } + + ifmr->ifm_status |= IFM_ACTIVE; + ifmr->ifm_active |= IFM_100G_DR | IFM_FDX; + + MANA_APC_LOCK_UNLOCK(apc); +} + +static uint64_t +mana_get_counter(struct ifnet *ifp, ift_counter cnt) +{ + struct mana_port_context *apc = if_getsoftc(ifp); + struct mana_port_stats *stats = &apc->port_stats; + + switch (cnt) { + case IFCOUNTER_IPACKETS: + return (counter_u64_fetch(stats->rx_packets)); + case IFCOUNTER_OPACKETS: + return (counter_u64_fetch(stats->tx_packets)); + case IFCOUNTER_IBYTES: + return (counter_u64_fetch(stats->rx_bytes)); + case IFCOUNTER_OBYTES: + return (counter_u64_fetch(stats->tx_bytes)); + case IFCOUNTER_IQDROPS: + return (counter_u64_fetch(stats->rx_drops)); + case IFCOUNTER_OQDROPS: + return (counter_u64_fetch(stats->tx_drops)); + default: + return (if_get_counter_default(ifp, cnt)); + } +} + +static void +mana_drain_eq_task(struct gdma_queue *queue) +{ + if (!queue || !queue->eq.cleanup_tq) + return; + + while (taskqueue_cancel(queue->eq.cleanup_tq, + &queue->eq.cleanup_task, NULL)) { + taskqueue_drain(queue->eq.cleanup_tq, + &queue->eq.cleanup_task); + } +} + +static void +mana_qflush(struct ifnet *ifp) +{ + if_qflush(ifp); +} + +int +mana_restart(struct mana_port_context *apc) +{ + int rc = 0; + + MANA_APC_LOCK_LOCK(apc); + if (apc->port_is_up) + mana_down(apc); + + rc = mana_up(apc); + MANA_APC_LOCK_UNLOCK(apc); + + return (rc); +} + +static int +mana_ioctl(struct ifnet *ifp, u_long command, caddr_t data) +{ + struct mana_port_context *apc = if_getsoftc(ifp); + struct ifrsskey *ifrk; + struct ifrsshash *ifrh; + struct ifreq *ifr; + uint16_t new_mtu; + int rc = 0; + + switch (command) { + case SIOCSIFMTU: + ifr = (struct ifreq *)data; + new_mtu = ifr->ifr_mtu; + if (ifp->if_mtu == new_mtu) + break; + if ((new_mtu + 18 > MAX_FRAME_SIZE) || + (new_mtu + 18 < MIN_FRAME_SIZE)) { + if_printf(ifp, "Invalid MTU. 
new_mtu: %d, " + "max allowed: %d, min allowed: %d\n", + new_mtu, MAX_FRAME_SIZE - 18, MIN_FRAME_SIZE - 18); + return EINVAL; + } + MANA_APC_LOCK_LOCK(apc); + if (apc->port_is_up) + mana_down(apc); + + apc->frame_size = new_mtu + 18; + if_setmtu(ifp, new_mtu); + mana_dbg(NULL, "Set MTU to %d\n", new_mtu); + + rc = mana_up(apc); + MANA_APC_LOCK_UNLOCK(apc); + break; + + case SIOCSIFFLAGS: + if (ifp->if_flags & IFF_UP) { + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + MANA_APC_LOCK_LOCK(apc); + if (!apc->port_is_up) + rc = mana_up(apc); + MANA_APC_LOCK_UNLOCK(apc); + } + } else { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + MANA_APC_LOCK_LOCK(apc); + if (apc->port_is_up) + mana_down(apc); + MANA_APC_LOCK_UNLOCK(apc); + } + } + break; + + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + case SIOCGIFXMEDIA: + ifr = (struct ifreq *)data; + rc = ifmedia_ioctl(ifp, ifr, &apc->media, command); + break; + + case SIOCGIFRSSKEY: + ifrk = (struct ifrsskey *)data; + ifrk->ifrk_func = RSS_FUNC_TOEPLITZ; + ifrk->ifrk_keylen = MANA_HASH_KEY_SIZE; + memcpy(ifrk->ifrk_key, apc->hashkey, MANA_HASH_KEY_SIZE); + break; + + case SIOCGIFRSSHASH: + ifrh = (struct ifrsshash *)data; + ifrh->ifrh_func = RSS_FUNC_TOEPLITZ; + ifrh->ifrh_types = + RSS_TYPE_TCP_IPV4 | + RSS_TYPE_UDP_IPV4 | + RSS_TYPE_TCP_IPV6 | + RSS_TYPE_UDP_IPV6; + break; + + default: + rc = ether_ioctl(ifp, command, data); + break; + } + + return (rc); +} + +static inline void +mana_alloc_counters(counter_u64_t *begin, int size) +{ + counter_u64_t *end = (counter_u64_t *)((char *)begin + size); + + for (; begin < end; ++begin) + *begin = counter_u64_alloc(M_WAITOK); +} + +static inline void +mana_free_counters(counter_u64_t *begin, int size) +{ + counter_u64_t *end = (counter_u64_t *)((char *)begin + size); + + for (; begin < end; ++begin) + counter_u64_free(*begin); +} + +static inline void +mana_reset_counters(counter_u64_t *begin, int size) +{ + counter_u64_t *end = (counter_u64_t *)((char *)begin + size); + + for (; begin < end; ++begin) + counter_u64_zero(*begin); +} + +static bool +mana_can_tx(struct gdma_queue *wq) +{ + return mana_gd_wq_avail_space(wq) >= MAX_TX_WQE_SIZE; +} + +static inline int +mana_tx_map_mbuf(struct mana_port_context *apc, + struct mana_send_buf_info *tx_info, + struct mbuf **m_head, struct mana_tx_package *tp, + struct mana_stats *tx_stats) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + bus_dma_segment_t segs[MAX_MBUF_FRAGS]; + struct mbuf *m = *m_head; + int err, nsegs, i; + + err = bus_dmamap_load_mbuf_sg(apc->tx_buf_tag, tx_info->dma_map, + m, segs, &nsegs, BUS_DMA_NOWAIT); + if (err == EFBIG) { + struct mbuf *m_new; + + counter_u64_add(tx_stats->collapse, 1); + m_new = m_collapse(m, M_NOWAIT, MAX_MBUF_FRAGS); + if (unlikely(m_new == NULL)) { + counter_u64_add(tx_stats->collapse_err, 1); + return ENOBUFS; + } else { + *m_head = m = m_new; + } + + mana_warn(NULL, + "Too many segs in orig mbuf, m_collapse called\n"); + + err = bus_dmamap_load_mbuf_sg(apc->tx_buf_tag, + tx_info->dma_map, m, segs, &nsegs, BUS_DMA_NOWAIT); + } + if (!err) { + for (i = 0; i < nsegs; i++) { + tp->wqe_req.sgl[i].address = segs[i].ds_addr; + tp->wqe_req.sgl[i].mem_key = gd->gpa_mkey; + tp->wqe_req.sgl[i].size = segs[i].ds_len; + } + tp->wqe_req.num_sge = nsegs; + + tx_info->mbuf = *m_head; + + bus_dmamap_sync(apc->tx_buf_tag, tx_info->dma_map, + BUS_DMASYNC_PREWRITE); + } + + return err; +} + +static inline void +mana_tx_unmap_mbuf(struct mana_port_context *apc, + struct mana_send_buf_info *tx_info) +{ + bus_dmamap_sync(apc->tx_buf_tag, 
tx_info->dma_map, + BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(apc->tx_buf_tag, tx_info->dma_map); + if (tx_info->mbuf) { + m_freem(tx_info->mbuf); + tx_info->mbuf = NULL; + } +} + +static inline int +mana_load_rx_mbuf(struct mana_port_context *apc, struct mana_rxq *rxq, + struct mana_recv_buf_oob *rx_oob, bool alloc_mbuf) +{ + bus_dma_segment_t segs[1]; + struct mbuf *mbuf; + int nsegs, err; + uint32_t mlen; + + if (alloc_mbuf) { + mbuf = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rxq->datasize); + if (unlikely(mbuf == NULL)) { + mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); + if (unlikely(mbuf == NULL)) { + return ENOMEM; + } + mlen = MCLBYTES; + } else { + mlen = rxq->datasize; + } + + mbuf->m_pkthdr.len = mbuf->m_len = mlen; + } else { + if (rx_oob->mbuf) { + mbuf = rx_oob->mbuf; + mlen = rx_oob->mbuf->m_pkthdr.len; + } else { + return ENOMEM; + } + } + + err = bus_dmamap_load_mbuf_sg(apc->rx_buf_tag, rx_oob->dma_map, + mbuf, segs, &nsegs, BUS_DMA_NOWAIT); + + if (unlikely((err != 0) || (nsegs != 1))) { + mana_warn(NULL, "Failed to map mbuf, error: %d, " + "nsegs: %d\n", err, nsegs); + counter_u64_add(rxq->stats.dma_mapping_err, 1); + goto error; + } + + bus_dmamap_sync(apc->rx_buf_tag, rx_oob->dma_map, + BUS_DMASYNC_PREREAD); + + rx_oob->mbuf = mbuf; + rx_oob->num_sge = 1; + rx_oob->sgl[0].address = segs[0].ds_addr; + rx_oob->sgl[0].size = mlen; + rx_oob->sgl[0].mem_key = apc->ac->gdma_dev->gpa_mkey; + + return 0; + +error: + m_freem(mbuf); + return EFAULT; +} + +static inline void +mana_unload_rx_mbuf(struct mana_port_context *apc, struct mana_rxq *rxq, + struct mana_recv_buf_oob *rx_oob, bool free_mbuf) +{ + bus_dmamap_sync(apc->rx_buf_tag, rx_oob->dma_map, + BUS_DMASYNC_POSTREAD); + bus_dmamap_unload(apc->rx_buf_tag, rx_oob->dma_map); + + if (free_mbuf && rx_oob->mbuf) { + m_freem(rx_oob->mbuf); + rx_oob->mbuf = NULL; + } +} + + +/* Use couple mbuf PH_loc spaces for l3 and l4 protocal type */ +#define MANA_L3_PROTO(_mbuf) ((_mbuf)->m_pkthdr.PH_loc.sixteen[0]) +#define MANA_L4_PROTO(_mbuf) ((_mbuf)->m_pkthdr.PH_loc.sixteen[1]) + +#define MANA_TXQ_FULL (IFF_DRV_RUNNING | IFF_DRV_OACTIVE) + +static void +mana_xmit(struct mana_txq *txq) +{ + enum mana_tx_pkt_format pkt_fmt = MANA_SHORT_PKT_FMT; + struct mana_send_buf_info *tx_info; + struct ifnet *ndev = txq->ndev; + struct mbuf *mbuf; + struct mana_port_context *apc = if_getsoftc(ndev); + struct mana_port_stats *port_stats = &apc->port_stats; + struct gdma_dev *gd = apc->ac->gdma_dev; + uint64_t packets, bytes; + uint16_t next_to_use; + struct mana_tx_package pkg = {}; + struct mana_stats *tx_stats; + struct gdma_queue *gdma_sq; + struct gdma_queue *gdma_eq; + struct mana_cq *cq; + int err, len; + + gdma_sq = txq->gdma_sq; + cq = &apc->tx_qp[txq->idx].tx_cq; + gdma_eq = cq->gdma_cq->cq.parent; + tx_stats = &txq->stats; + + packets = 0; + bytes = 0; + next_to_use = txq->next_to_use; + + while ((mbuf = drbr_peek(ndev, txq->txq_br)) != NULL) { + if (!apc->port_is_up || + (if_getdrvflags(ndev) & MANA_TXQ_FULL) != IFF_DRV_RUNNING) { + drbr_putback(ndev, txq->txq_br, mbuf); + break; + } + + if (!mana_can_tx(gdma_sq)) { + /* SQ is full. 
Set the IFF_DRV_OACTIVE flag */ + if_setdrvflagbits(apc->ndev, IFF_DRV_OACTIVE, 0); + counter_u64_add(tx_stats->stop, 1); + uint64_t stops = counter_u64_fetch(tx_stats->stop); + uint64_t wakeups = counter_u64_fetch(tx_stats->wakeup); +#define MANA_TXQ_STOP_THRESHOLD 50 + if (stops > MANA_TXQ_STOP_THRESHOLD && wakeups > 0 && + stops > wakeups && txq->alt_txq_idx == txq->idx) { + txq->alt_txq_idx = + (txq->idx + (stops / wakeups)) + % apc->num_queues; + counter_u64_add(tx_stats->alt_chg, 1); + } + + drbr_putback(ndev, txq->txq_br, mbuf); + + taskqueue_enqueue(gdma_eq->eq.cleanup_tq, + &gdma_eq->eq.cleanup_task); + break; + } + + tx_info = &txq->tx_buf_info[next_to_use]; + + memset(&pkg, 0, sizeof(struct mana_tx_package)); + pkg.wqe_req.sgl = pkg.sgl_array; + + err = mana_tx_map_mbuf(apc, tx_info, &mbuf, &pkg, tx_stats); + if (unlikely(err)) { + mana_dbg(NULL, + "Failed to map tx mbuf, err %d\n", err); + + counter_u64_add(tx_stats->dma_mapping_err, 1); + + /* The mbuf is still there. Free it */ + m_freem(mbuf); + /* Advance the drbr queue */ + drbr_advance(ndev, txq->txq_br); + continue; + } + + pkg.tx_oob.s_oob.vcq_num = cq->gdma_id; + pkg.tx_oob.s_oob.vsq_frame = txq->vsq_frame; + + if (txq->vp_offset > MANA_SHORT_VPORT_OFFSET_MAX) { + pkg.tx_oob.l_oob.long_vp_offset = txq->vp_offset; + pkt_fmt = MANA_LONG_PKT_FMT; + } else { + pkg.tx_oob.s_oob.short_vp_offset = txq->vp_offset; + } + + pkg.tx_oob.s_oob.pkt_fmt = pkt_fmt; + + if (pkt_fmt == MANA_SHORT_PKT_FMT) + pkg.wqe_req.inline_oob_size = sizeof(struct mana_tx_short_oob); + else + pkg.wqe_req.inline_oob_size = sizeof(struct mana_tx_oob); + + pkg.wqe_req.inline_oob_data = &pkg.tx_oob; + pkg.wqe_req.flags = 0; + pkg.wqe_req.client_data_unit = 0; + + if (mbuf->m_pkthdr.csum_flags & CSUM_TSO) { + if (MANA_L3_PROTO(mbuf) == ETHERTYPE_IP) + pkg.tx_oob.s_oob.is_outer_ipv4 = 1; + else + pkg.tx_oob.s_oob.is_outer_ipv6 = 1; + + pkg.tx_oob.s_oob.comp_iphdr_csum = 1; + pkg.tx_oob.s_oob.comp_tcp_csum = 1; + pkg.tx_oob.s_oob.trans_off = mbuf->m_pkthdr.l3hlen; + + pkg.wqe_req.client_data_unit = mbuf->m_pkthdr.tso_segsz; + pkg.wqe_req.flags = GDMA_WR_OOB_IN_SGL | GDMA_WR_PAD_BY_SGE0; + } else if (mbuf->m_pkthdr.csum_flags & + (CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP6_UDP | CSUM_IP6_TCP)) { + if (MANA_L3_PROTO(mbuf) == ETHERTYPE_IP) { + pkg.tx_oob.s_oob.is_outer_ipv4 = 1; + pkg.tx_oob.s_oob.comp_iphdr_csum = 1; + } else { + pkg.tx_oob.s_oob.is_outer_ipv6 = 1; + } + + if (MANA_L4_PROTO(mbuf) == IPPROTO_TCP) { + pkg.tx_oob.s_oob.comp_tcp_csum = 1; + pkg.tx_oob.s_oob.trans_off = + mbuf->m_pkthdr.l3hlen; + } else { + pkg.tx_oob.s_oob.comp_udp_csum = 1; + } + } else if (mbuf->m_pkthdr.csum_flags & CSUM_IP) { + pkg.tx_oob.s_oob.is_outer_ipv4 = 1; + pkg.tx_oob.s_oob.comp_iphdr_csum = 1; + } else { + if (MANA_L3_PROTO(mbuf) == ETHERTYPE_IP) + pkg.tx_oob.s_oob.is_outer_ipv4 = 1; + else if (MANA_L3_PROTO(mbuf) == ETHERTYPE_IPV6) + pkg.tx_oob.s_oob.is_outer_ipv6 = 1; + } + + len = mbuf->m_pkthdr.len; + + err = mana_gd_post_work_request(gdma_sq, &pkg.wqe_req, + (struct gdma_posted_wqe_info *)&tx_info->wqe_inf); + if (unlikely(err)) { + /* Should not happen */ + if_printf(ndev, "Failed to post TX OOB: %d\n", err); + + mana_tx_unmap_mbuf(apc, tx_info); + + drbr_advance(ndev, txq->txq_br); + continue; + } + + next_to_use = + (next_to_use + 1) % MAX_SEND_BUFFERS_PER_QUEUE; + + atomic_inc_return(&txq->pending_sends); + + drbr_advance(ndev, txq->txq_br); + + mana_gd_wq_ring_doorbell(gd->gdma_context, gdma_sq); + + packets++; + bytes += len; + } + + counter_enter(); + 
counter_u64_add_protected(tx_stats->packets, packets); + counter_u64_add_protected(port_stats->tx_packets, packets); + counter_u64_add_protected(tx_stats->bytes, bytes); + counter_u64_add_protected(port_stats->tx_bytes, bytes); + counter_exit(); + + txq->next_to_use = next_to_use; +} + +static void +mana_xmit_taskfunc(void *arg, int pending) +{ + struct mana_txq *txq = (struct mana_txq *)arg; + struct ifnet *ndev = txq->ndev; + struct mana_port_context *apc = if_getsoftc(ndev); + + while (!drbr_empty(ndev, txq->txq_br) && apc->port_is_up && + (if_getdrvflags(ndev) & MANA_TXQ_FULL) == IFF_DRV_RUNNING) { + mtx_lock(&txq->txq_mtx); + mana_xmit(txq); + mtx_unlock(&txq->txq_mtx); + } +} + +#define PULLUP_HDR(m, len) \ +do { \ + if (unlikely((m)->m_len < (len))) { \ + (m) = m_pullup((m), (len)); \ + if ((m) == NULL) \ + return (NULL); \ + } \ +} while (0) + +/* + * If this function failed, the mbuf would be freed. + */ +static inline struct mbuf * +mana_tso_fixup(struct mbuf *mbuf) +{ + struct ether_vlan_header *eh = mtod(mbuf, struct ether_vlan_header *); + struct tcphdr *th; + uint16_t etype; + int ehlen; + + if (eh->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) { + etype = ntohs(eh->evl_proto); + ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; + } else { + etype = ntohs(eh->evl_encap_proto); + ehlen = ETHER_HDR_LEN; + } + + if (etype == ETHERTYPE_IP) { + struct ip *ip; + int iphlen; + + PULLUP_HDR(mbuf, ehlen + sizeof(*ip)); + ip = mtodo(mbuf, ehlen); + iphlen = ip->ip_hl << 2; + mbuf->m_pkthdr.l3hlen = ehlen + iphlen; + + PULLUP_HDR(mbuf, ehlen + iphlen + sizeof(*th)); + th = mtodo(mbuf, ehlen + iphlen); + + ip->ip_len = 0; + ip->ip_sum = 0; + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(IPPROTO_TCP)); + } else if (etype == ETHERTYPE_IPV6) { + struct ip6_hdr *ip6; + + PULLUP_HDR(mbuf, ehlen + sizeof(*ip6) + sizeof(*th)); + ip6 = mtodo(mbuf, ehlen); + if (ip6->ip6_nxt != IPPROTO_TCP) { + /* Realy something wrong, just return */ + mana_dbg(NULL, "TSO mbuf not TCP, freed.\n"); + m_freem(mbuf); + return NULL; + } + mbuf->m_pkthdr.l3hlen = ehlen + sizeof(*ip6); + + th = mtodo(mbuf, ehlen + sizeof(*ip6)); + + ip6->ip6_plen = 0; + th->th_sum = in6_cksum_pseudo(ip6, 0, IPPROTO_TCP, 0); + } else { + /* CSUM_TSO is set but not IP protocol. */ + mana_warn(NULL, "TSO mbuf not right, freed.\n"); + m_freem(mbuf); + return NULL; + } + + MANA_L3_PROTO(mbuf) = etype; + + return (mbuf); +} + +/* + * If this function failed, the mbuf would be freed. 
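+ * As written below, it always returns the mbuf; it only records the
+ * L3/L4 protocol info in the mbuf pkthdr for use on the transmit path.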
+ */ +static inline struct mbuf * +mana_mbuf_csum_check(struct mbuf *mbuf) +{ + struct ether_vlan_header *eh = mtod(mbuf, struct ether_vlan_header *); + struct mbuf *mbuf_next; + uint16_t etype; + int offset; + int ehlen; + + if (eh->evl_encap_proto == ntohs(ETHERTYPE_VLAN)) { + etype = ntohs(eh->evl_proto); + ehlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN; + } else { + etype = ntohs(eh->evl_encap_proto); + ehlen = ETHER_HDR_LEN; + } + + mbuf_next = m_getptr(mbuf, ehlen, &offset); + + MANA_L4_PROTO(mbuf) = 0; + if (etype == ETHERTYPE_IP) { + const struct ip *ip; + int iphlen; + + ip = (struct ip *)(mtodo(mbuf_next, offset)); + iphlen = ip->ip_hl << 2; + mbuf->m_pkthdr.l3hlen = ehlen + iphlen; + + MANA_L4_PROTO(mbuf) = ip->ip_p; + } else if (etype == ETHERTYPE_IPV6) { + const struct ip6_hdr *ip6; + + ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset)); + mbuf->m_pkthdr.l3hlen = ehlen + sizeof(*ip6); + + MANA_L4_PROTO(mbuf) = ip6->ip6_nxt; + } else { + MANA_L4_PROTO(mbuf) = 0; + } + + MANA_L3_PROTO(mbuf) = etype; + + return (mbuf); +} + +static int +mana_start_xmit(struct ifnet *ifp, struct mbuf *m) +{ + struct mana_port_context *apc = if_getsoftc(ifp); + struct mana_txq *txq; + int is_drbr_empty; + uint16_t txq_id; + int err; + + if (unlikely((!apc->port_is_up) || + (if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0)) + return ENODEV; + + if (m->m_pkthdr.csum_flags & CSUM_TSO) { + m = mana_tso_fixup(m); + if (unlikely(m == NULL)) { + counter_enter(); + counter_u64_add_protected(apc->port_stats.tx_drops, 1); + counter_exit(); + return EIO; + } + } else { + m = mana_mbuf_csum_check(m); + if (unlikely(m == NULL)) { + counter_enter(); + counter_u64_add_protected(apc->port_stats.tx_drops, 1); + counter_exit(); + return EIO; + } + } + + if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) { + uint32_t hash = m->m_pkthdr.flowid; + txq_id = apc->indir_table[(hash) & MANA_INDIRECT_TABLE_MASK] % + apc->num_queues; + } else { + txq_id = m->m_pkthdr.flowid % apc->num_queues; + } + + if (apc->enable_tx_altq) + txq_id = apc->tx_qp[txq_id].txq.alt_txq_idx; + + txq = &apc->tx_qp[txq_id].txq; + + is_drbr_empty = drbr_empty(ifp, txq->txq_br); + err = drbr_enqueue(ifp, txq->txq_br, m); + if (unlikely(err)) { + mana_warn(NULL, "txq %u failed to enqueue: %d\n", + txq_id, err); + taskqueue_enqueue(txq->enqueue_tq, &txq->enqueue_task); + return err; + } + + if (is_drbr_empty && mtx_trylock(&txq->txq_mtx)) { + mana_xmit(txq); + mtx_unlock(&txq->txq_mtx); + } else { + taskqueue_enqueue(txq->enqueue_tq, &txq->enqueue_task); + } + + return 0; +} + +static void +mana_cleanup_port_context(struct mana_port_context *apc) +{ + bus_dma_tag_destroy(apc->tx_buf_tag); + bus_dma_tag_destroy(apc->rx_buf_tag); + apc->rx_buf_tag = NULL; + + free(apc->rxqs, M_DEVBUF); + apc->rxqs = NULL; + + mana_free_counters((counter_u64_t *)&apc->port_stats, + sizeof(struct mana_port_stats)); +} + +static int +mana_init_port_context(struct mana_port_context *apc) +{ + device_t dev = apc->ac->gdma_dev->gdma_context->dev; + uint32_t tso_maxsize; + int err; + + tso_maxsize = MAX_MBUF_FRAGS * MANA_TSO_MAXSEG_SZ - + (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + + /* Create DMA tag for tx bufs */ + err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ + 1, 0, /* alignment, boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + tso_maxsize, /* maxsize */ + MAX_MBUF_FRAGS, /* nsegments */ + tso_maxsize, /* maxsegsize */ + 0, /* flags */ + NULL, NULL, /* lockfunc, lockfuncarg*/ + &apc->tx_buf_tag); + if 
(unlikely(err)) { + device_printf(dev, "Feiled to create TX DMA tag\n"); + return err; + } + + /* Create DMA tag for rx bufs */ + err = bus_dma_tag_create(bus_get_dma_tag(dev), /* parent */ + 64, 0, /* alignment, boundary */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + MJUMPAGESIZE, /* maxsize */ + 1, /* nsegments */ + MJUMPAGESIZE, /* maxsegsize */ + 0, /* flags */ + NULL, NULL, /* lockfunc, lockfuncarg*/ + &apc->rx_buf_tag); + if (unlikely(err)) { + device_printf(dev, "Feiled to create RX DMA tag\n"); + return err; + } + + apc->rxqs = mallocarray(apc->num_queues, sizeof(struct mana_rxq *), + M_DEVBUF, M_WAITOK | M_ZERO); + + if (!apc->rxqs) { + bus_dma_tag_destroy(apc->tx_buf_tag); + bus_dma_tag_destroy(apc->rx_buf_tag); + apc->rx_buf_tag = NULL; + return ENOMEM; + } + + return 0; +} + +static int +mana_send_request(struct mana_context *ac, void *in_buf, + uint32_t in_len, void *out_buf, uint32_t out_len) +{ + struct gdma_context *gc = ac->gdma_dev->gdma_context; + struct gdma_resp_hdr *resp = out_buf; + struct gdma_req_hdr *req = in_buf; + device_t dev = gc->dev; + static atomic_t activity_id; + int err; + + req->dev_id = gc->mana.dev_id; + req->activity_id = atomic_inc_return(&activity_id); + + mana_dbg(NULL, "activity_id = %u\n", activity_id); + + err = mana_gd_send_request(gc, in_len, in_buf, out_len, + out_buf); + if (err || resp->status) { + device_printf(dev, "Failed to send mana message: %d, 0x%x\n", + err, resp->status); + return err ? err : EPROTO; + } + + if (req->dev_id.as_uint32 != resp->dev_id.as_uint32 || + req->activity_id != resp->activity_id) { + device_printf(dev, + "Unexpected mana message response: %x,%x,%x,%x\n", + req->dev_id.as_uint32, resp->dev_id.as_uint32, + req->activity_id, resp->activity_id); + return EPROTO; + } + + return 0; +} + +static int +mana_verify_resp_hdr(const struct gdma_resp_hdr *resp_hdr, + const enum mana_command_code expected_code, + const uint32_t min_size) +{ + if (resp_hdr->response.msg_type != expected_code) + return EPROTO; + + if (resp_hdr->response.msg_version < GDMA_MESSAGE_V1) + return EPROTO; + + if (resp_hdr->response.msg_size < min_size) + return EPROTO; + + return 0; +} + +static int +mana_query_device_cfg(struct mana_context *ac, uint32_t proto_major_ver, + uint32_t proto_minor_ver, uint32_t proto_micro_ver, + uint16_t *max_num_vports) +{ + struct gdma_context *gc = ac->gdma_dev->gdma_context; + struct mana_query_device_cfg_resp resp = {}; + struct mana_query_device_cfg_req req = {}; + device_t dev = gc->dev; + int err = 0; + + mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_DEV_CONFIG, + sizeof(req), sizeof(resp)); + req.proto_major_ver = proto_major_ver; + req.proto_minor_ver = proto_minor_ver; + req.proto_micro_ver = proto_micro_ver; + + err = mana_send_request(ac, &req, sizeof(req), &resp, sizeof(resp)); + if (err) { + device_printf(dev, "Failed to query config: %d", err); + return err; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_DEV_CONFIG, + sizeof(resp)); + if (err || resp.hdr.status) { + device_printf(dev, "Invalid query result: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = EPROTO; + return err; + } + + *max_num_vports = resp.max_num_vports; + + mana_dbg(NULL, "mana max_num_vports from device = %d\n", + *max_num_vports); + + return 0; +} + +static int +mana_query_vport_cfg(struct mana_port_context *apc, uint32_t vport_index, + uint32_t *max_sq, uint32_t *max_rq, uint32_t *num_indir_entry) +{ + struct mana_query_vport_cfg_resp resp = {}; 
+ struct mana_query_vport_cfg_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_VPORT_CONFIG, + sizeof(req), sizeof(resp)); + + req.vport_index = vport_index; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + if (err) + return err; + + err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_VPORT_CONFIG, + sizeof(resp)); + if (err) + return err; + + if (resp.hdr.status) + return EPROTO; + + *max_sq = resp.max_num_sq; + *max_rq = resp.max_num_rq; + *num_indir_entry = resp.num_indirection_ent; + + apc->port_handle = resp.vport; + memcpy(apc->mac_addr, resp.mac_addr, ETHER_ADDR_LEN); + + return 0; +} + +static int +mana_cfg_vport(struct mana_port_context *apc, uint32_t protection_dom_id, + uint32_t doorbell_pg_id) +{ + struct mana_config_vport_resp resp = {}; + struct mana_config_vport_req req = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_CONFIG_VPORT_TX, + sizeof(req), sizeof(resp)); + req.vport = apc->port_handle; + req.pdid = protection_dom_id; + req.doorbell_pageid = doorbell_pg_id; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + if (err) { + if_printf(apc->ndev, "Failed to configure vPort: %d\n", err); + goto out; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_CONFIG_VPORT_TX, + sizeof(resp)); + if (err || resp.hdr.status) { + if_printf(apc->ndev, "Failed to configure vPort: %d, 0x%x\n", + err, resp.hdr.status); + if (!err) + err = EPROTO; + + goto out; + } + + apc->tx_shortform_allowed = resp.short_form_allowed; + apc->tx_vp_offset = resp.tx_vport_offset; +out: + return err; +} + +static int +mana_cfg_vport_steering(struct mana_port_context *apc, + enum TRI_STATE rx, + bool update_default_rxobj, bool update_key, + bool update_tab) +{ + uint16_t num_entries = MANA_INDIRECT_TABLE_SIZE; + struct mana_cfg_rx_steer_req *req = NULL; + struct mana_cfg_rx_steer_resp resp = {}; + struct ifnet *ndev = apc->ndev; + mana_handle_t *req_indir_tab; + uint32_t req_buf_size; + int err; + + req_buf_size = sizeof(*req) + sizeof(mana_handle_t) * num_entries; + req = malloc(req_buf_size, M_DEVBUF, M_WAITOK | M_ZERO); + if (!req) + return ENOMEM; + + mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size, + sizeof(resp)); + + req->vport = apc->port_handle; + req->num_indir_entries = num_entries; + req->indir_tab_offset = sizeof(*req); + req->rx_enable = rx; + req->rss_enable = apc->rss_state; + req->update_default_rxobj = update_default_rxobj; + req->update_hashkey = update_key; + req->update_indir_tab = update_tab; + req->default_rxobj = apc->default_rxobj; + + if (update_key) + memcpy(&req->hashkey, apc->hashkey, MANA_HASH_KEY_SIZE); + + if (update_tab) { + req_indir_tab = (mana_handle_t *)(req + 1); + memcpy(req_indir_tab, apc->rxobj_table, + req->num_indir_entries * sizeof(mana_handle_t)); + } + + err = mana_send_request(apc->ac, req, req_buf_size, &resp, + sizeof(resp)); + if (err) { + if_printf(ndev, "Failed to configure vPort RX: %d\n", err); + goto out; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_CONFIG_VPORT_RX, + sizeof(resp)); + if (err) { + if_printf(ndev, "vPort RX configuration failed: %d\n", err); + goto out; + } + + if (resp.hdr.status) { + if_printf(ndev, "vPort RX configuration failed: 0x%x\n", + resp.hdr.status); + err = EPROTO; + } +out: + free(req, M_DEVBUF); + return err; +} + +static int +mana_create_wq_obj(struct mana_port_context *apc, + mana_handle_t vport, + uint32_t wq_type, struct mana_obj_spec *wq_spec, + struct mana_obj_spec *cq_spec, + mana_handle_t *wq_obj) +{ + 
struct mana_create_wqobj_resp resp = {}; + struct mana_create_wqobj_req req = {}; + struct ifnet *ndev = apc->ndev; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_CREATE_WQ_OBJ, + sizeof(req), sizeof(resp)); + req.vport = vport; + req.wq_type = wq_type; + req.wq_gdma_region = wq_spec->gdma_region; + req.cq_gdma_region = cq_spec->gdma_region; + req.wq_size = wq_spec->queue_size; + req.cq_size = cq_spec->queue_size; + req.cq_moderation_ctx_id = cq_spec->modr_ctx_id; + req.cq_parent_qid = cq_spec->attached_eq; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + if (err) { + if_printf(ndev, "Failed to create WQ object: %d\n", err); + goto out; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_CREATE_WQ_OBJ, + sizeof(resp)); + if (err || resp.hdr.status) { + if_printf(ndev, "Failed to create WQ object: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = EPROTO; + goto out; + } + + if (resp.wq_obj == INVALID_MANA_HANDLE) { + if_printf(ndev, "Got an invalid WQ object handle\n"); + err = EPROTO; + goto out; + } + + *wq_obj = resp.wq_obj; + wq_spec->queue_index = resp.wq_id; + cq_spec->queue_index = resp.cq_id; + + return 0; +out: + return err; +} + +static void +mana_destroy_wq_obj(struct mana_port_context *apc, uint32_t wq_type, + mana_handle_t wq_obj) +{ + struct mana_destroy_wqobj_resp resp = {}; + struct mana_destroy_wqobj_req req = {}; + struct ifnet *ndev = apc->ndev; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_DESTROY_WQ_OBJ, + sizeof(req), sizeof(resp)); + req.wq_type = wq_type; + req.wq_obj_handle = wq_obj; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + if (err) { + if_printf(ndev, "Failed to destroy WQ object: %d\n", err); + return; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_DESTROY_WQ_OBJ, + sizeof(resp)); + if (err || resp.hdr.status) + if_printf(ndev, "Failed to destroy WQ object: %d, 0x%x\n", + err, resp.hdr.status); +} + +static void +mana_init_cqe_poll_buf(struct gdma_comp *cqe_poll_buf) +{ + int i; + + for (i = 0; i < CQE_POLLING_BUFFER; i++) + memset(&cqe_poll_buf[i], 0, sizeof(struct gdma_comp)); +} + +static void +mana_destroy_eq(struct gdma_context *gc, struct mana_port_context *apc) +{ + struct gdma_queue *eq; + int i; + + if (!apc->eqs) + return; + + for (i = 0; i < apc->num_queues; i++) { + eq = apc->eqs[i].eq; + if (!eq) + continue; + + mana_gd_destroy_queue(gc, eq); + } + + free(apc->eqs, M_DEVBUF); + apc->eqs = NULL; +} + +static int +mana_create_eq(struct mana_port_context *apc) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + struct gdma_queue_spec spec = {}; + int err; + int i; + + apc->eqs = mallocarray(apc->num_queues, sizeof(struct mana_eq), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!apc->eqs) + return ENOMEM; + + spec.type = GDMA_EQ; + spec.monitor_avl_buf = false; + spec.queue_size = EQ_SIZE; + spec.eq.callback = NULL; + spec.eq.context = apc->eqs; + spec.eq.log2_throttle_limit = LOG2_EQ_THROTTLE; + spec.eq.ndev = apc->ndev; + + for (i = 0; i < apc->num_queues; i++) { + mana_init_cqe_poll_buf(apc->eqs[i].cqe_poll); + + err = mana_gd_create_mana_eq(gd, &spec, &apc->eqs[i].eq); + if (err) + goto out; + } + + return 0; +out: + mana_destroy_eq(gd->gdma_context, apc); + return err; +} + +static int +mana_move_wq_tail(struct gdma_queue *wq, uint32_t num_units) +{ + uint32_t used_space_old; + uint32_t used_space_new; + + used_space_old = wq->head - wq->tail; + used_space_new = wq->head - (wq->tail + num_units); + + if (used_space_new > used_space_old) { + mana_err(NULL, + "WARNING: new 
used space %u greater than old one %u\n", + used_space_new, used_space_old); + return ERANGE; + } + + wq->tail += num_units; + return 0; +} + +static void +mana_poll_tx_cq(struct mana_cq *cq) +{ + struct gdma_comp *completions = cq->gdma_comp_buf; + struct gdma_posted_wqe_info *wqe_info; + struct mana_send_buf_info *tx_info; + unsigned int pkt_transmitted = 0; + unsigned int wqe_unit_cnt = 0; + struct mana_txq *txq = cq->txq; + struct mana_port_context *apc; + uint16_t next_to_complete; + struct ifnet *ndev; + int comp_read; + int txq_idx = txq->idx;; + int i; + int sa_drop = 0; + + struct gdma_queue *gdma_wq; + unsigned int avail_space; + bool txq_full = false; + + ndev = txq->ndev; + apc = if_getsoftc(ndev); + + comp_read = mana_gd_poll_cq(cq->gdma_cq, completions, + CQE_POLLING_BUFFER); + + next_to_complete = txq->next_to_complete; + + for (i = 0; i < comp_read; i++) { + struct mana_tx_comp_oob *cqe_oob; + + if (!completions[i].is_sq) { + mana_err(NULL, "WARNING: Not for SQ\n"); + return; + } + + cqe_oob = (struct mana_tx_comp_oob *)completions[i].cqe_data; + if (cqe_oob->cqe_hdr.client_type != + MANA_CQE_COMPLETION) { + mana_err(NULL, + "WARNING: Invalid CQE client type %u\n", + cqe_oob->cqe_hdr.client_type); + return; + } + + switch (cqe_oob->cqe_hdr.cqe_type) { + case CQE_TX_OKAY: + break; + + case CQE_TX_SA_DROP: + case CQE_TX_MTU_DROP: + case CQE_TX_INVALID_OOB: + case CQE_TX_INVALID_ETH_TYPE: + case CQE_TX_HDR_PROCESSING_ERROR: + case CQE_TX_VF_DISABLED: + case CQE_TX_VPORT_IDX_OUT_OF_RANGE: + case CQE_TX_VPORT_DISABLED: + case CQE_TX_VLAN_TAGGING_VIOLATION: + sa_drop ++; + mana_err(NULL, + "TX: txq %d CQE error %d, ntc = %d, " + "pending sends = %d: err ignored.\n", + txq_idx, cqe_oob->cqe_hdr.cqe_type, + next_to_complete, txq->pending_sends); + break; + + default: + /* If the CQE type is unexpected, log an error, + * and go through the error path. + */ + mana_err(NULL, + "ERROR: TX: Unexpected CQE type %d: HW BUG?\n", + cqe_oob->cqe_hdr.cqe_type); + return; + } + if (txq->gdma_txq_id != completions[i].wq_num) { + mana_dbg(NULL, + "txq gdma id not match completion wq num: " + "%d != %d\n", + txq->gdma_txq_id, completions[i].wq_num); + break; + } + + tx_info = &txq->tx_buf_info[next_to_complete]; + if (!tx_info->mbuf) { + mana_err(NULL, + "WARNING: txq %d Empty mbuf on tx_info: %u, " + "ntu = %u, pending_sends = %d, " + "transmitted = %d, sa_drop = %d, i = %d, comp_read = %d\n", + txq_idx, next_to_complete, txq->next_to_use, + txq->pending_sends, pkt_transmitted, sa_drop, + i, comp_read); + continue; + } + + wqe_info = &tx_info->wqe_inf; + wqe_unit_cnt += wqe_info->wqe_size_in_bu; + + mana_tx_unmap_mbuf(apc, tx_info); + mb(); + + next_to_complete = + (next_to_complete + 1) % MAX_SEND_BUFFERS_PER_QUEUE; + + pkt_transmitted++; + } + + txq->next_to_complete = next_to_complete; + + if (wqe_unit_cnt == 0) { + mana_err(NULL, + "WARNING: TX ring not proceeding!\n"); + return; + } + + mana_move_wq_tail(txq->gdma_sq, wqe_unit_cnt); + + /* Ensure tail updated before checking q stop */ + wmb(); + + gdma_wq = txq->gdma_sq; + avail_space = mana_gd_wq_avail_space(gdma_wq); + + + if ((if_getdrvflags(ndev) & MANA_TXQ_FULL) == MANA_TXQ_FULL) { + txq_full = true; + } + + /* Ensure checking txq_full before apc->port_is_up. 
*/ + rmb(); + + if (txq_full && apc->port_is_up && avail_space >= MAX_TX_WQE_SIZE) { + /* Grab the txq lock and re-test */ + mtx_lock(&txq->txq_mtx); + avail_space = mana_gd_wq_avail_space(gdma_wq); + + if ((if_getdrvflags(ndev) & MANA_TXQ_FULL) == MANA_TXQ_FULL && + apc->port_is_up && avail_space >= MAX_TX_WQE_SIZE) { + /* Clear the Q full flag */ + if_setdrvflagbits(apc->ndev, IFF_DRV_RUNNING, + IFF_DRV_OACTIVE); + counter_u64_add(txq->stats.wakeup, 1); + if (txq->alt_txq_idx != txq->idx) { + uint64_t stops = counter_u64_fetch(txq->stats.stop); + uint64_t wakeups = counter_u64_fetch(txq->stats.wakeup); + /* Reset alt_txq_idx back if it is not overloaded */ + if (stops < wakeups) { + txq->alt_txq_idx = txq->idx; + counter_u64_add(txq->stats.alt_reset, 1); + } + } + rmb(); + /* Schedule a tx enqueue task */ + taskqueue_enqueue(txq->enqueue_tq, &txq->enqueue_task); + } + mtx_unlock(&txq->txq_mtx); + } + + if (atomic_sub_return(pkt_transmitted, &txq->pending_sends) < 0) + mana_err(NULL, + "WARNING: TX %d pending_sends error: %d\n", + txq->idx, txq->pending_sends); +} + +static void +mana_post_pkt_rxq(struct mana_rxq *rxq) +{ + struct mana_recv_buf_oob *recv_buf_oob; + uint32_t curr_index; + int err; + + curr_index = rxq->buf_index++; + if (rxq->buf_index == rxq->num_rx_buf) + rxq->buf_index = 0; + + recv_buf_oob = &rxq->rx_oobs[curr_index]; + + err = mana_gd_post_and_ring(rxq->gdma_rq, &recv_buf_oob->wqe_req, + &recv_buf_oob->wqe_inf); + if (err) { + mana_err(NULL, "WARNING: rxq %u post pkt err %d\n", + rxq->rxq_idx, err); + return; + } + + if (recv_buf_oob->wqe_inf.wqe_size_in_bu != 1) { + mana_err(NULL, "WARNING: rxq %u wqe_size_in_bu %u\n", + rxq->rxq_idx, recv_buf_oob->wqe_inf.wqe_size_in_bu); + } +} + +static void +mana_rx_mbuf(struct mbuf *mbuf, struct mana_rxcomp_oob *cqe, + struct mana_rxq *rxq) +{ + struct mana_stats *rx_stats = &rxq->stats; + struct ifnet *ndev = rxq->ndev; + uint32_t pkt_len = cqe->ppi[0].pkt_len; + uint16_t rxq_idx = rxq->rxq_idx; + struct mana_port_context *apc; + struct gdma_queue *eq; + bool do_lro = false; + bool do_if_input; + + apc = if_getsoftc(ndev); + eq = apc->eqs[rxq_idx].eq; + eq->eq.work_done++; + + if (!mbuf) { + return; + } + + mbuf->m_flags |= M_PKTHDR; + mbuf->m_pkthdr.len = pkt_len; + mbuf->m_len = pkt_len; + mbuf->m_pkthdr.rcvif = ndev; + + if ((ndev->if_capenable & IFCAP_RXCSUM || + ndev->if_capenable & IFCAP_RXCSUM_IPV6) && + (cqe->rx_iphdr_csum_succeed)) { + mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED; + mbuf->m_pkthdr.csum_flags |= CSUM_IP_VALID; + if (cqe->rx_tcp_csum_succeed || cqe->rx_udp_csum_succeed) { + mbuf->m_pkthdr.csum_flags |= + (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + mbuf->m_pkthdr.csum_data = 0xffff; + + if (cqe->rx_tcp_csum_succeed) + do_lro = true; + } + } + + if (cqe->rx_hashtype != 0) { + mbuf->m_pkthdr.flowid = cqe->ppi[0].pkt_hash; + + uint16_t hashtype = cqe->rx_hashtype; + if (hashtype & NDIS_HASH_IPV4_MASK) { + hashtype &= NDIS_HASH_IPV4_MASK; + switch (hashtype) { + case NDIS_HASH_TCP_IPV4: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4); + break; + case NDIS_HASH_UDP_IPV4: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4); + break; + default: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4); + } + } else if (hashtype & NDIS_HASH_IPV6_MASK) { + hashtype &= NDIS_HASH_IPV6_MASK; + switch (hashtype) { + case NDIS_HASH_TCP_IPV6: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6); + break; + case NDIS_HASH_TCP_IPV6_EX: + M_HASHTYPE_SET(mbuf, + M_HASHTYPE_RSS_TCP_IPV6_EX); + break; + case NDIS_HASH_UDP_IPV6: + M_HASHTYPE_SET(mbuf, 
M_HASHTYPE_RSS_UDP_IPV6); + break; + case NDIS_HASH_UDP_IPV6_EX: + M_HASHTYPE_SET(mbuf, + M_HASHTYPE_RSS_UDP_IPV6_EX); + break; + default: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6); + } + } else { + M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH); + } + } else { + mbuf->m_pkthdr.flowid = rxq_idx; + M_HASHTYPE_SET(mbuf, M_HASHTYPE_NONE); + } + + do_if_input = true; + if ((ndev->if_capenable & IFCAP_LRO) && do_lro) { + if (rxq->lro.lro_cnt != 0 && + tcp_lro_rx(&rxq->lro, mbuf, 0) == 0) + do_if_input = false; + } + if (do_if_input) { + ndev->if_input(ndev, mbuf); + } + + counter_enter(); + counter_u64_add_protected(rx_stats->packets, 1); + counter_u64_add_protected(apc->port_stats.rx_packets, 1); + counter_u64_add_protected(rx_stats->bytes, pkt_len); + counter_u64_add_protected(apc->port_stats.rx_bytes, pkt_len); + counter_exit(); +} + +static void +mana_process_rx_cqe(struct mana_rxq *rxq, struct mana_cq *cq, + struct gdma_comp *cqe) +{ + struct mana_rxcomp_oob *oob = (struct mana_rxcomp_oob *)cqe->cqe_data; + struct mana_recv_buf_oob *rxbuf_oob; + struct ifnet *ndev = rxq->ndev; + struct mana_port_context *apc; + struct mbuf *old_mbuf; + uint32_t curr, pktlen; + int err; + + switch (oob->cqe_hdr.cqe_type) { + case CQE_RX_OKAY: + break; + + case CQE_RX_TRUNCATED: + if_printf(ndev, "Dropped a truncated packet\n"); + return; + + case CQE_RX_COALESCED_4: + if_printf(ndev, "RX coalescing is unsupported\n"); + return; + + case CQE_RX_OBJECT_FENCE: + if_printf(ndev, "RX Fencing is unsupported\n"); + return; + + default: + if_printf(ndev, "Unknown RX CQE type = %d\n", + oob->cqe_hdr.cqe_type); + return; + } + + if (oob->cqe_hdr.cqe_type != CQE_RX_OKAY) + return; + + pktlen = oob->ppi[0].pkt_len; + + if (pktlen == 0) { + /* data packets should never have packetlength of zero */ +#if defined(__amd64__) + if_printf(ndev, "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%lx\n", + rxq->gdma_id, cq->gdma_id, rxq->rxobj); +#else + if_printf(ndev, "RX pkt len=0, rq=%u, cq=%u, rxobj=0x%llx\n", + rxq->gdma_id, cq->gdma_id, rxq->rxobj); +#endif + return; + } + + curr = rxq->buf_index; + rxbuf_oob = &rxq->rx_oobs[curr]; + if (rxbuf_oob->wqe_inf.wqe_size_in_bu != 1) { + mana_err(NULL, "WARNING: Rx Incorrect complete " + "WQE size %u\n", + rxbuf_oob->wqe_inf.wqe_size_in_bu); + } + + apc = if_getsoftc(ndev); + + old_mbuf = rxbuf_oob->mbuf; + + /* Unload DMA map for the old mbuf */ + mana_unload_rx_mbuf(apc, rxq, rxbuf_oob, false); + + /* Load a new mbuf to replace the old one */ + err = mana_load_rx_mbuf(apc, rxq, rxbuf_oob, true); + if (err) { + mana_dbg(NULL, + "failed to load rx mbuf, err = %d, packet dropped.\n", + err); + counter_u64_add(rxq->stats.mbuf_alloc_fail, 1); + /* + * Failed to load new mbuf, rxbuf_oob->mbuf is still + * pointing to the old one. Drop the packet. 
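+ * The old mbuf is re-mapped below so this RQ slot can
+ * still be reposted to the hardware.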
+ */ + old_mbuf = NULL; + /* Reload the existing mbuf */ + mana_load_rx_mbuf(apc, rxq, rxbuf_oob, false); + } + + mana_rx_mbuf(old_mbuf, oob, rxq); + + mana_move_wq_tail(rxq->gdma_rq, rxbuf_oob->wqe_inf.wqe_size_in_bu); + + mana_post_pkt_rxq(rxq); +} + +static void +mana_poll_rx_cq(struct mana_cq *cq) +{ + struct gdma_comp *comp = cq->gdma_comp_buf; + int comp_read, i; + + comp_read = mana_gd_poll_cq(cq->gdma_cq, comp, CQE_POLLING_BUFFER); + KASSERT(comp_read <= CQE_POLLING_BUFFER, + ("comp_read %d great than buf size %d", + comp_read, CQE_POLLING_BUFFER)); + + for (i = 0; i < comp_read; i++) { + if (comp[i].is_sq == true) { + mana_err(NULL, + "WARNING: CQE not for receive queue\n"); + return; + } + + /* verify recv cqe references the right rxq */ + if (comp[i].wq_num != cq->rxq->gdma_id) { + mana_err(NULL, + "WARNING: Received CQE %d not for " + "this receive queue %d\n", + comp[i].wq_num, cq->rxq->gdma_id); + return; + } + + mana_process_rx_cqe(cq->rxq, cq, &comp[i]); + } + + tcp_lro_flush_all(&cq->rxq->lro); +} + +static void +mana_cq_handler(void *context, struct gdma_queue *gdma_queue) +{ + struct mana_cq *cq = context; + + KASSERT(cq->gdma_cq == gdma_queue, + ("cq do not match %p, %p", cq->gdma_cq, gdma_queue)); + + if (cq->type == MANA_CQ_TYPE_RX) { + mana_poll_rx_cq(cq); + } else { + mana_poll_tx_cq(cq); + } + + mana_gd_arm_cq(gdma_queue); +} + +static void +mana_deinit_cq(struct mana_port_context *apc, struct mana_cq *cq) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + + if (!cq->gdma_cq) + return; + + mana_gd_destroy_queue(gd->gdma_context, cq->gdma_cq); +} + +static void +mana_deinit_txq(struct mana_port_context *apc, struct mana_txq *txq) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + struct mana_send_buf_info *txbuf_info; + uint32_t pending_sends; + int i; + + if (!txq->gdma_sq) + return; + + if ((pending_sends = atomic_read(&txq->pending_sends)) > 0) { + mana_err(NULL, + "WARNING: txq pending sends not zero: %u\n", + pending_sends); + } + + if (txq->next_to_use != txq->next_to_complete) { + mana_err(NULL, + "WARNING: txq buf not completed, " + "next use %u, next complete %u\n", + txq->next_to_use, txq->next_to_complete); + } + + /* Flush buf ring. 
Grab txq mtx lock */ + if (txq->txq_br) { + mtx_lock(&txq->txq_mtx); + drbr_flush(apc->ndev, txq->txq_br); + mtx_unlock(&txq->txq_mtx); + buf_ring_free(txq->txq_br, M_DEVBUF); + } + + /* Drain taskqueue */ + if (txq->enqueue_tq) { + while (taskqueue_cancel(txq->enqueue_tq, + &txq->enqueue_task, NULL)) { + taskqueue_drain(txq->enqueue_tq, + &txq->enqueue_task); + } + + taskqueue_free(txq->enqueue_tq); + } + + if (txq->tx_buf_info) { + /* Free all mbufs which are still in-flight */ + for (i = 0; i < MAX_SEND_BUFFERS_PER_QUEUE; i++) { + txbuf_info = &txq->tx_buf_info[i]; + if (txbuf_info->mbuf) { + mana_tx_unmap_mbuf(apc, txbuf_info); + } + } + + free(txq->tx_buf_info, M_DEVBUF); + } + + mana_free_counters((counter_u64_t *)&txq->stats, + sizeof(txq->stats)); + + mana_gd_destroy_queue(gd->gdma_context, txq->gdma_sq); + + mtx_destroy(&txq->txq_mtx); +} + +static void +mana_destroy_txq(struct mana_port_context *apc) +{ + int i; + + if (!apc->tx_qp) + return; + + for (i = 0; i < apc->num_queues; i++) { + mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i].tx_object); + + mana_deinit_cq(apc, &apc->tx_qp[i].tx_cq); + + mana_deinit_txq(apc, &apc->tx_qp[i].txq); + } + + free(apc->tx_qp, M_DEVBUF); + apc->tx_qp = NULL; +} + +static int +mana_create_txq(struct mana_port_context *apc, struct ifnet *net) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + struct mana_obj_spec wq_spec; + struct mana_obj_spec cq_spec; + struct gdma_queue_spec spec; + struct gdma_context *gc; + struct mana_txq *txq; + struct mana_cq *cq; + uint32_t txq_size; + uint32_t cq_size; + int err; + int i; + + apc->tx_qp = mallocarray(apc->num_queues, sizeof(struct mana_tx_qp), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!apc->tx_qp) + return ENOMEM; + + /* The minimum size of the WQE is 32 bytes, hence + * MAX_SEND_BUFFERS_PER_QUEUE represents the maximum number of WQEs + * the SQ can store. This value is then used to size other queues + * to prevent overflow. 
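+ * For example, a MAX_SEND_BUFFERS_PER_QUEUE of 256 would make the
+ * txq_size below 256 * 32 = 8192 bytes, keeping it PAGE_SIZE aligned.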
+ */ + txq_size = MAX_SEND_BUFFERS_PER_QUEUE * 32; + KASSERT(IS_ALIGNED(txq_size, PAGE_SIZE), + ("txq size not page aligned")); + + cq_size = MAX_SEND_BUFFERS_PER_QUEUE * COMP_ENTRY_SIZE; + cq_size = ALIGN(cq_size, PAGE_SIZE); + + gc = gd->gdma_context; + + for (i = 0; i < apc->num_queues; i++) { + apc->tx_qp[i].tx_object = INVALID_MANA_HANDLE; + + /* Create SQ */ + txq = &apc->tx_qp[i].txq; + + txq->ndev = net; + txq->vp_offset = apc->tx_vp_offset; + txq->idx = i; + txq->alt_txq_idx = i; + + memset(&spec, 0, sizeof(spec)); + spec.type = GDMA_SQ; + spec.monitor_avl_buf = true; + spec.queue_size = txq_size; + err = mana_gd_create_mana_wq_cq(gd, &spec, &txq->gdma_sq); + if (err) + goto out; + + /* Create SQ's CQ */ + cq = &apc->tx_qp[i].tx_cq; + cq->gdma_comp_buf = apc->eqs[i].cqe_poll; + cq->type = MANA_CQ_TYPE_TX; + + cq->txq = txq; + + memset(&spec, 0, sizeof(spec)); + spec.type = GDMA_CQ; + spec.monitor_avl_buf = false; + spec.queue_size = cq_size; + spec.cq.callback = mana_cq_handler; + spec.cq.parent_eq = apc->eqs[i].eq; + spec.cq.context = cq; + err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq); + if (err) + goto out; + + memset(&wq_spec, 0, sizeof(wq_spec)); + memset(&cq_spec, 0, sizeof(cq_spec)); + + wq_spec.gdma_region = txq->gdma_sq->mem_info.gdma_region; + wq_spec.queue_size = txq->gdma_sq->queue_size; + + cq_spec.gdma_region = cq->gdma_cq->mem_info.gdma_region; + cq_spec.queue_size = cq->gdma_cq->queue_size; + cq_spec.modr_ctx_id = 0; + cq_spec.attached_eq = cq->gdma_cq->cq.parent->id; + + err = mana_create_wq_obj(apc, apc->port_handle, GDMA_SQ, + &wq_spec, &cq_spec, &apc->tx_qp[i].tx_object); + + if (err) + goto out; + + txq->gdma_sq->id = wq_spec.queue_index; + cq->gdma_cq->id = cq_spec.queue_index; + + txq->gdma_sq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + cq->gdma_cq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + + txq->gdma_txq_id = txq->gdma_sq->id; + + cq->gdma_id = cq->gdma_cq->id; + + mana_dbg(NULL, + "txq %d, txq gdma id %d, txq cq gdma id %d\n", + i, txq->gdma_txq_id, cq->gdma_id);; + + if (cq->gdma_id >= gc->max_num_cqs) { + if_printf(net, "CQ id %u too large.\n", cq->gdma_id); + return EINVAL; + } + + gc->cq_table[cq->gdma_id] = cq->gdma_cq; + + /* Initialize tx specific data */ + txq->tx_buf_info = malloc(MAX_SEND_BUFFERS_PER_QUEUE * + sizeof(struct mana_send_buf_info), + M_DEVBUF, M_WAITOK | M_ZERO); + if (unlikely(txq->tx_buf_info == NULL)) { + if_printf(net, + "Failed to allocate tx buf info for SQ %u\n", + txq->gdma_sq->id); + err = ENOMEM; + goto out; + } + + + snprintf(txq->txq_mtx_name, nitems(txq->txq_mtx_name), + "mana:tx(%d)", i); + mtx_init(&txq->txq_mtx, txq->txq_mtx_name, NULL, MTX_DEF); + + txq->txq_br = buf_ring_alloc(4 * MAX_SEND_BUFFERS_PER_QUEUE, + M_DEVBUF, M_WAITOK, &txq->txq_mtx); + if (unlikely(txq->txq_br == NULL)) { + if_printf(net, + "Failed to allocate buf ring for SQ %u\n", + txq->gdma_sq->id); + err = ENOMEM; + goto out; + } + + /* Allocate taskqueue for deferred send */ + TASK_INIT(&txq->enqueue_task, 0, mana_xmit_taskfunc, txq); + txq->enqueue_tq = taskqueue_create_fast("mana_tx_enque", + M_NOWAIT, taskqueue_thread_enqueue, &txq->enqueue_tq); + if (unlikely(txq->enqueue_tq == NULL)) { + if_printf(net, + "Unable to create tx %d enqueue task queue\n", i); + err = ENOMEM; + goto out; + } + taskqueue_start_threads(&txq->enqueue_tq, 1, PI_NET, + "mana txq %d", i); + + mana_alloc_counters((counter_u64_t *)&txq->stats, + sizeof(txq->stats)); + + mana_gd_arm_cq(cq->gdma_cq); + } + + return 0; +out: + mana_destroy_txq(apc); 
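+ /* Everything created so far was torn down by mana_destroy_txq() above. */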
+ return err; +} + +static void +mana_destroy_rxq(struct mana_port_context *apc, struct mana_rxq *rxq, + bool validate_state) +{ + struct gdma_context *gc = apc->ac->gdma_dev->gdma_context; + struct mana_recv_buf_oob *rx_oob; + int i; + + if (!rxq) + return; + + if (validate_state) { + /* + * XXX Cancel and drain cleanup task queue here. + */ + ; + } + + mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); + + mana_deinit_cq(apc, &rxq->rx_cq); + + mana_free_counters((counter_u64_t *)&rxq->stats, + sizeof(rxq->stats)); + + /* Free LRO resources */ + tcp_lro_free(&rxq->lro); + + for (i = 0; i < rxq->num_rx_buf; i++) { + rx_oob = &rxq->rx_oobs[i]; + + if (rx_oob->mbuf) + mana_unload_rx_mbuf(apc, rxq, rx_oob, true); + + bus_dmamap_destroy(apc->rx_buf_tag, rx_oob->dma_map); + } + + if (rxq->gdma_rq) + mana_gd_destroy_queue(gc, rxq->gdma_rq); + + free(rxq, M_DEVBUF); +} + +#define MANA_WQE_HEADER_SIZE 16 +#define MANA_WQE_SGE_SIZE 16 + +static int +mana_alloc_rx_wqe(struct mana_port_context *apc, + struct mana_rxq *rxq, uint32_t *rxq_size, uint32_t *cq_size) +{ + struct mana_recv_buf_oob *rx_oob; + uint32_t buf_idx; + int err; + + if (rxq->datasize == 0 || rxq->datasize > PAGE_SIZE) { + mana_err(NULL, + "WARNING: Invalid rxq datasize %u\n", rxq->datasize); + } + + *rxq_size = 0; + *cq_size = 0; + + for (buf_idx = 0; buf_idx < rxq->num_rx_buf; buf_idx++) { + rx_oob = &rxq->rx_oobs[buf_idx]; + memset(rx_oob, 0, sizeof(*rx_oob)); + + err = bus_dmamap_create(apc->rx_buf_tag, 0, + &rx_oob->dma_map); + if (err) { + mana_err(NULL, + "Failed to create rx DMA map for buf %d\n", + buf_idx); + return err; + } + + err = mana_load_rx_mbuf(apc, rxq, rx_oob, true); + if (err) { + mana_err(NULL, + "Failed to create rx DMA map for buf %d\n", + buf_idx); + bus_dmamap_destroy(apc->rx_buf_tag, rx_oob->dma_map); + return err; + } + + rx_oob->wqe_req.sgl = rx_oob->sgl; + rx_oob->wqe_req.num_sge = rx_oob->num_sge; + rx_oob->wqe_req.inline_oob_size = 0; + rx_oob->wqe_req.inline_oob_data = NULL; + rx_oob->wqe_req.flags = 0; + rx_oob->wqe_req.client_data_unit = 0; + + *rxq_size += ALIGN(MANA_WQE_HEADER_SIZE + + MANA_WQE_SGE_SIZE * rx_oob->num_sge, 32); + *cq_size += COMP_ENTRY_SIZE; + } + + return 0; +} + +static int +mana_push_wqe(struct mana_rxq *rxq) +{ + struct mana_recv_buf_oob *rx_oob; + uint32_t buf_idx; + int err; + + for (buf_idx = 0; buf_idx < rxq->num_rx_buf; buf_idx++) { + rx_oob = &rxq->rx_oobs[buf_idx]; + + err = mana_gd_post_and_ring(rxq->gdma_rq, &rx_oob->wqe_req, + &rx_oob->wqe_inf); + if (err) + return ENOSPC; + } + + return 0; +} + +static struct mana_rxq * +mana_create_rxq(struct mana_port_context *apc, uint32_t rxq_idx, + struct mana_eq *eq, struct ifnet *ndev) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + struct mana_obj_spec wq_spec; + struct mana_obj_spec cq_spec; + struct gdma_queue_spec spec; + struct mana_cq *cq = NULL; + uint32_t cq_size, rq_size; + struct gdma_context *gc; + struct mana_rxq *rxq; + int err; + + gc = gd->gdma_context; + + rxq = malloc(sizeof(*rxq) + + RX_BUFFERS_PER_QUEUE * sizeof(struct mana_recv_buf_oob), + M_DEVBUF, M_WAITOK | M_ZERO); + if (!rxq) + return NULL; + + rxq->ndev = ndev; + rxq->num_rx_buf = RX_BUFFERS_PER_QUEUE; + rxq->rxq_idx = rxq_idx; + /* + * Minimum size is MCLBYTES(2048) bytes for a mbuf cluster. + * Now we just allow maxium size of 4096. 
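+ * The frame size is rounded up to a whole mbuf cluster and capped
+ * at MAX_FRAME_SIZE below.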
+ */ + rxq->datasize = ALIGN(apc->frame_size, MCLBYTES); + if (rxq->datasize > MAX_FRAME_SIZE) + rxq->datasize = MAX_FRAME_SIZE; + + mana_dbg(NULL, "Setting rxq %d datasize %d\n", + rxq_idx, rxq->datasize); + + rxq->rxobj = INVALID_MANA_HANDLE; + + err = mana_alloc_rx_wqe(apc, rxq, &rq_size, &cq_size); + if (err) + goto out; + + /* Create LRO for the RQ */ + if (ndev->if_capenable & IFCAP_LRO) { + err = tcp_lro_init(&rxq->lro); + if (err) { + if_printf(ndev, "Failed to create LRO for rxq %d\n", + rxq_idx); + } else { + rxq->lro.ifp = ndev; + } + } + + mana_alloc_counters((counter_u64_t *)&rxq->stats, + sizeof(rxq->stats)); + + rq_size = ALIGN(rq_size, PAGE_SIZE); + cq_size = ALIGN(cq_size, PAGE_SIZE); + + /* Create RQ */ + memset(&spec, 0, sizeof(spec)); + spec.type = GDMA_RQ; + spec.monitor_avl_buf = true; + spec.queue_size = rq_size; + err = mana_gd_create_mana_wq_cq(gd, &spec, &rxq->gdma_rq); + if (err) + goto out; + + /* Create RQ's CQ */ + cq = &rxq->rx_cq; + cq->gdma_comp_buf = eq->cqe_poll; + cq->type = MANA_CQ_TYPE_RX; + cq->rxq = rxq; + + memset(&spec, 0, sizeof(spec)); + spec.type = GDMA_CQ; + spec.monitor_avl_buf = false; + spec.queue_size = cq_size; + spec.cq.callback = mana_cq_handler; + spec.cq.parent_eq = eq->eq; + spec.cq.context = cq; + err = mana_gd_create_mana_wq_cq(gd, &spec, &cq->gdma_cq); + if (err) + goto out; + + memset(&wq_spec, 0, sizeof(wq_spec)); + memset(&cq_spec, 0, sizeof(cq_spec)); + wq_spec.gdma_region = rxq->gdma_rq->mem_info.gdma_region; + wq_spec.queue_size = rxq->gdma_rq->queue_size; + + cq_spec.gdma_region = cq->gdma_cq->mem_info.gdma_region; + cq_spec.queue_size = cq->gdma_cq->queue_size; + cq_spec.modr_ctx_id = 0; + cq_spec.attached_eq = cq->gdma_cq->cq.parent->id; + + err = mana_create_wq_obj(apc, apc->port_handle, GDMA_RQ, + &wq_spec, &cq_spec, &rxq->rxobj); + if (err) + goto out; + + rxq->gdma_rq->id = wq_spec.queue_index; + cq->gdma_cq->id = cq_spec.queue_index; + + rxq->gdma_rq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + cq->gdma_cq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + + rxq->gdma_id = rxq->gdma_rq->id; + cq->gdma_id = cq->gdma_cq->id; + + err = mana_push_wqe(rxq); + if (err) + goto out; + + if (cq->gdma_id >= gc->max_num_cqs) + goto out; + + gc->cq_table[cq->gdma_id] = cq->gdma_cq; + + mana_gd_arm_cq(cq->gdma_cq); +out: + if (!err) + return rxq; + + if_printf(ndev, "Failed to create RXQ: err = %d\n", err); + + mana_destroy_rxq(apc, rxq, false); + + if (cq) + mana_deinit_cq(apc, cq); + + return NULL; +} + +static int +mana_add_rx_queues(struct mana_port_context *apc, struct ifnet *ndev) +{ + struct mana_rxq *rxq; + int err = 0; + int i; + + for (i = 0; i < apc->num_queues; i++) { + rxq = mana_create_rxq(apc, i, &apc->eqs[i], ndev); + if (!rxq) { + err = ENOMEM; + goto out; + } + + apc->rxqs[i] = rxq; + } + + apc->default_rxobj = apc->rxqs[0]->rxobj; +out: + return err; +} + +static void +mana_destroy_vport(struct mana_port_context *apc) +{ + struct mana_rxq *rxq; + uint32_t rxq_idx; + struct mana_cq *rx_cq; + struct gdma_queue *cq, *eq; + + for (rxq_idx = 0; rxq_idx < apc->num_queues; rxq_idx++) { + rxq = apc->rxqs[rxq_idx]; + if (!rxq) + continue; + + rx_cq = &rxq->rx_cq; + if ((cq = rx_cq->gdma_cq) != NULL) { + eq = cq->cq.parent; + mana_drain_eq_task(eq); + } + + mana_destroy_rxq(apc, rxq, true); + apc->rxqs[rxq_idx] = NULL; + } + + mana_destroy_txq(apc); +} + +static int +mana_create_vport(struct mana_port_context *apc, struct ifnet *net) +{ + struct gdma_dev *gd = apc->ac->gdma_dev; + int err; + + apc->default_rxobj = 
INVALID_MANA_HANDLE; + + err = mana_cfg_vport(apc, gd->pdid, gd->doorbell); + if (err) + return err; + + return mana_create_txq(apc, net); +} + + +static void mana_rss_table_init(struct mana_port_context *apc) +{ + int i; + + for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) + apc->indir_table[i] = i % apc->num_queues; +} + +int mana_config_rss(struct mana_port_context *apc, enum TRI_STATE rx, + bool update_hash, bool update_tab) +{ + uint32_t queue_idx; + int i; + + if (update_tab) { + for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) { + queue_idx = apc->indir_table[i]; + apc->rxobj_table[i] = apc->rxqs[queue_idx]->rxobj; + } + } + + return mana_cfg_vport_steering(apc, rx, true, update_hash, update_tab); +} + +static int +mana_init_port(struct ifnet *ndev) +{ + struct mana_port_context *apc = if_getsoftc(ndev); + uint32_t max_txq, max_rxq, max_queues; + int port_idx = apc->port_idx; + uint32_t num_indirect_entries; + int err; + + err = mana_init_port_context(apc); + if (err) + return err; + + err = mana_query_vport_cfg(apc, port_idx, &max_txq, &max_rxq, + &num_indirect_entries); + if (err) { + if_printf(ndev, "Failed to query info for vPort 0\n"); + goto reset_apc; + } + + max_queues = min_t(uint32_t, max_txq, max_rxq); + if (apc->max_queues > max_queues) + apc->max_queues = max_queues; + + if (apc->num_queues > apc->max_queues) + apc->num_queues = apc->max_queues; + + return 0; + +reset_apc: + bus_dma_tag_destroy(apc->rx_buf_tag); + apc->rx_buf_tag = NULL; + free(apc->rxqs, M_DEVBUF); + apc->rxqs = NULL; + return err; +} + +int +mana_alloc_queues(struct ifnet *ndev) +{ + struct mana_port_context *apc = if_getsoftc(ndev); + struct gdma_dev *gd = apc->ac->gdma_dev; + int err; + + err = mana_create_eq(apc); + if (err) + return err; + + err = mana_create_vport(apc, ndev); + if (err) + goto destroy_eq; + + err = mana_add_rx_queues(apc, ndev); + if (err) + goto destroy_vport; + + apc->rss_state = apc->num_queues > 1 ? TRI_STATE_TRUE : TRI_STATE_FALSE; + + mana_rss_table_init(apc); + + err = mana_config_rss(apc, TRI_STATE_TRUE, true, true); + if (err) + goto destroy_vport; + + return 0; + +destroy_vport: + mana_destroy_vport(apc); +destroy_eq: + mana_destroy_eq(gd->gdma_context, apc); + return err; +} + +static int +mana_up(struct mana_port_context *apc) +{ + int err; + + mana_dbg(NULL, "mana_up called\n"); + + err = mana_alloc_queues(apc->ndev); + if (err) { + mana_err(NULL, "Faile alloc mana queues: %d\n", err); + return err; + } + + /* Add queue specific sysctl */ + mana_sysctl_add_queues(apc); + + apc->port_is_up = true; + + /* Ensure port state updated before txq state */ + wmb(); + + if_link_state_change(apc->ndev, LINK_STATE_UP); + if_setdrvflagbits(apc->ndev, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); + + return 0; +} + + +static void +mana_init(void *arg) +{ + struct mana_port_context *apc = (struct mana_port_context *)arg; + + MANA_APC_LOCK_LOCK(apc); + if (!apc->port_is_up) { + mana_up(apc); + } + MANA_APC_LOCK_UNLOCK(apc); +} + +static int +mana_dealloc_queues(struct ifnet *ndev) +{ + struct mana_port_context *apc = if_getsoftc(ndev); + struct mana_txq *txq; + int i, err; + + if (apc->port_is_up) + return EINVAL; + + /* No packet can be transmitted now since apc->port_is_up is false. + * There is still a tiny chance that mana_poll_tx_cq() can re-enable + * a txq because it may not timely see apc->port_is_up being cleared + * to false, but it doesn't matter since mana_start_xmit() drops any + * new packets due to apc->port_is_up being false. 
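mana_rss_table_init() above simply round-robins the receive queues across the RSS indirection table, and mana_config_rss() then resolves each slot to the matching RX object handle before pushing the configuration to the vPort. A tiny standalone illustration of that distribution, with MANA_INDIRECT_TABLE_SIZE assumed to be 64 and an eight-queue port chosen purely as an example:

#include <stdio.h>

#define MANA_INDIRECT_TABLE_SIZE	64	/* assumed for illustration */

int
main(void)
{
	unsigned int indir_table[MANA_INDIRECT_TABLE_SIZE];
	unsigned int num_queues = 8;	/* hypothetical RX queue count */

	/* Same round-robin fill as mana_rss_table_init(). */
	for (int i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++)
		indir_table[i] = i % num_queues;

	for (int i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++)
		printf("hash bucket %2d -> rxq %u\n", i, indir_table[i]);
	return (0);
}

As long as the flow hash is reasonably uniform this spreads receive traffic evenly across the queues; note that mana_config_rss() only rebuilds the handle table when update_tab is true.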
+ * + * Drain all the in-flight TX packets + */ + for (i = 0; i < apc->num_queues; i++) { + txq = &apc->tx_qp[i].txq; + + struct mana_cq *tx_cq = &apc->tx_qp[i].tx_cq; + struct gdma_queue *eq = NULL; + if (tx_cq->gdma_cq) + eq = tx_cq->gdma_cq->cq.parent; + if (eq) { + /* Stop EQ interrupt */ + eq->eq.do_not_ring_db = true; + /* Schedule a cleanup task */ + taskqueue_enqueue(eq->eq.cleanup_tq, + &eq->eq.cleanup_task); + } + + while (atomic_read(&txq->pending_sends) > 0) + usleep_range(1000, 2000); + } + + /* We're 100% sure the queues can no longer be woken up, because + * we're sure now mana_poll_tx_cq() can't be running. + */ + + apc->rss_state = TRI_STATE_FALSE; + err = mana_config_rss(apc, TRI_STATE_FALSE, false, false); + if (err) { + if_printf(ndev, "Failed to disable vPort: %d\n", err); + return err; + } + + /* TODO: Implement RX fencing */ + gdma_msleep(1000); + + mana_destroy_vport(apc); + + mana_destroy_eq(apc->ac->gdma_dev->gdma_context, apc); + + return 0; +} + +static int +mana_down(struct mana_port_context *apc) +{ + int err = 0; + + apc->port_st_save = apc->port_is_up; + apc->port_is_up = false; + + /* Ensure port state updated before txq state */ + wmb(); + + if (apc->port_st_save) { + if_setdrvflagbits(apc->ndev, IFF_DRV_OACTIVE, + IFF_DRV_RUNNING); + if_link_state_change(apc->ndev, LINK_STATE_DOWN); + + mana_sysctl_free_queues(apc); + + err = mana_dealloc_queues(apc->ndev); + if (err) { + if_printf(apc->ndev, + "Failed to bring down mana interface: %d\n", err); + } + } + + return err; +} + +int +mana_detach(struct ifnet *ndev) +{ + struct mana_port_context *apc = if_getsoftc(ndev); + int err; + + ether_ifdetach(ndev); + + if (!apc) + return 0; + + MANA_APC_LOCK_LOCK(apc); + err = mana_down(apc); + MANA_APC_LOCK_UNLOCK(apc); + + mana_cleanup_port_context(apc); + + MANA_APC_LOCK_DESTROY(apc); + + free(apc, M_DEVBUF); + + return err; +} + +static int +mana_probe_port(struct mana_context *ac, int port_idx, + struct ifnet **ndev_storage) +{ + struct gdma_context *gc = ac->gdma_dev->gdma_context; + struct mana_port_context *apc; + struct ifnet *ndev; + int err; + + ndev = if_alloc_dev(IFT_ETHER, gc->dev); + if (!ndev) { + mana_err(NULL, "Failed to allocate ifnet struct\n"); + return ENOMEM; + } + + *ndev_storage = ndev; + + apc = malloc(sizeof(*apc), M_DEVBUF, M_WAITOK | M_ZERO); + if (!apc) { + mana_err(NULL, "Failed to allocate port context\n"); + err = ENOMEM; + goto free_net; + } + + apc->ac = ac; + apc->ndev = ndev; + apc->max_queues = gc->max_num_queues; + apc->num_queues = min_t(unsigned int, + gc->max_num_queues, MANA_MAX_NUM_QUEUES); + apc->port_handle = INVALID_MANA_HANDLE; + apc->port_idx = port_idx; + apc->frame_size = DEFAULT_FRAME_SIZE; + + MANA_APC_LOCK_INIT(apc); + + if_initname(ndev, device_get_name(gc->dev), port_idx); + if_setdev(ndev,gc->dev); + if_setsoftc(ndev, apc); + + if_setflags(ndev, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); + if_setinitfn(ndev, mana_init); + if_settransmitfn(ndev, mana_start_xmit); + if_setqflushfn(ndev, mana_qflush); + if_setioctlfn(ndev, mana_ioctl); + if_setgetcounterfn(ndev, mana_get_counter); + + if_setmtu(ndev, ETHERMTU); + if_setbaudrate(ndev, IF_Gbps(100)); + + mana_rss_key_fill(apc->hashkey, MANA_HASH_KEY_SIZE); + + err = mana_init_port(ndev); + if (err) + goto reset_apc; + + ndev->if_capabilities |= IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6; + ndev->if_capabilities |= IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6; + ndev->if_capabilities |= IFCAP_TSO4 | IFCAP_TSO6; + + ndev->if_capabilities |= IFCAP_LRO | IFCAP_LINKSTATE; + + /* Enable all 
available capabilities by default. */ + ndev->if_capenable = ndev->if_capabilities; + + /* TSO parameters */ + ndev->if_hw_tsomax = MAX_MBUF_FRAGS * MANA_TSO_MAXSEG_SZ - + (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN); + ndev->if_hw_tsomaxsegcount = MAX_MBUF_FRAGS; + ndev->if_hw_tsomaxsegsize = PAGE_SIZE; + + ifmedia_init(&apc->media, IFM_IMASK, + mana_ifmedia_change, mana_ifmedia_status); + ifmedia_add(&apc->media, IFM_ETHER | IFM_AUTO, 0, NULL); + ifmedia_set(&apc->media, IFM_ETHER | IFM_AUTO); + + ether_ifattach(ndev, apc->mac_addr); + + /* Initialize statistics */ + mana_alloc_counters((counter_u64_t *)&apc->port_stats, + sizeof(struct mana_port_stats)); + mana_sysctl_add_port(apc); + + /* Tell the stack that the interface is not active */ + if_setdrvflagbits(ndev, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); + + return 0; + +reset_apc: + free(apc, M_DEVBUF); +free_net: + *ndev_storage = NULL; + if_printf(ndev, "Failed to probe vPort %d: %d\n", port_idx, err); + if_free(ndev); + return err; +} + +int mana_probe(struct gdma_dev *gd) +{ + struct gdma_context *gc = gd->gdma_context; + device_t dev = gc->dev; + struct mana_context *ac; + int err; + int i; + + device_printf(dev, "%s protocol version: %d.%d.%d\n", DEVICE_NAME, + MANA_MAJOR_VERSION, MANA_MINOR_VERSION, MANA_MICRO_VERSION); + + err = mana_gd_register_device(gd); + if (err) + return err; + + ac = malloc(sizeof(*ac), M_DEVBUF, M_WAITOK | M_ZERO); + if (!ac) + return ENOMEM; + + ac->gdma_dev = gd; + ac->num_ports = 1; + gd->driver_data = ac; + + err = mana_query_device_cfg(ac, MANA_MAJOR_VERSION, MANA_MINOR_VERSION, + MANA_MICRO_VERSION, &ac->num_ports); + if (err) + goto out; + + if (ac->num_ports > MAX_PORTS_IN_MANA_DEV) + ac->num_ports = MAX_PORTS_IN_MANA_DEV; + + for (i = 0; i < ac->num_ports; i++) { + err = mana_probe_port(ac, i, &ac->ports[i]); + if (err) { + device_printf(dev, + "Failed to probe mana port %d\n", i); + break; + } + } + +out: + if (err) + mana_remove(gd); + + return err; +} + +void +mana_remove(struct gdma_dev *gd) +{ + struct gdma_context *gc = gd->gdma_context; + struct mana_context *ac = gd->driver_data; + device_t dev = gc->dev; + struct ifnet *ndev; + int i; + + for (i = 0; i < ac->num_ports; i++) { + ndev = ac->ports[i]; + if (!ndev) { + if (i == 0) + device_printf(dev, "No net device to remove\n"); + goto out; + } + + mana_detach(ndev); + + if_free(ndev); + } +out: + mana_gd_deregister_device(gd); + gd->driver_data = NULL; + gd->gdma_context = NULL; + free(ac, M_DEVBUF); +} diff --git a/sys/dev/mana/mana_sysctl.h b/sys/dev/mana/mana_sysctl.h new file mode 100644 --- /dev/null +++ b/sys/dev/mana/mana_sysctl.h @@ -0,0 +1,48 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2015-2020 Amazon.com, Inc. or its affiliates. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
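The TSO limits programmed in mana_probe_port() above are a short calculation: the largest offloaded payload the stack may hand down is the maximum number of mbuf fragments times the per-segment size, minus the Ethernet plus VLAN header room. A hedged sketch of that arithmetic; MAX_MBUF_FRAGS = 30 and MANA_TSO_MAXSEG_SZ = 4096 are illustrative guesses here, since both macros are defined in mana.h outside this hunk:

#include <stdio.h>

/* Illustrative assumptions; the real values come from mana.h. */
#define MAX_MBUF_FRAGS		30
#define MANA_TSO_MAXSEG_SZ	4096
#define ETHER_HDR_LEN		14
#define ETHER_VLAN_ENCAP_LEN	4

int
main(void)
{
	unsigned int tsomax = MAX_MBUF_FRAGS * MANA_TSO_MAXSEG_SZ -
	    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);

	printf("if_hw_tsomax         %u bytes\n", tsomax);
	printf("if_hw_tsomaxsegcount %u\n", MAX_MBUF_FRAGS);
	printf("if_hw_tsomaxsegsize  %u\n", MANA_TSO_MAXSEG_SZ);
	return (0);
}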
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef MANA_SYSCTL_H +#define MANA_SYSCTL_H + +#include +#include +#include +#include + +#include "mana.h" + +void mana_sysctl_add_port(struct mana_port_context *apc); +void mana_sysctl_add_queues(struct mana_port_context *apc); +void mana_sysctl_free_queues(struct mana_port_context *apc); + +#endif /* !(MANA_SYSCTL_H) */ diff --git a/sys/dev/mana/mana_sysctl.c b/sys/dev/mana/mana_sysctl.c new file mode 100644 --- /dev/null +++ b/sys/dev/mana/mana_sysctl.c @@ -0,0 +1,219 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
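The three entry points declared above are tied to the port lifecycle elsewhere in this patch: the per-port tree is created once at probe time, while the per-queue tree is rebuilt on every bring-up and freed on every bring-down so that stale queue pointers never linger in sysctl. A comment-only sketch of that ordering, taken from the calls in mana_probe_port(), mana_up() and mana_down():

/*
 * mana_probe_port()
 *     mana_sysctl_add_port(apc);    - static per-port nodes, kept for the
 *                                     lifetime of the device
 * mana_up()
 *     mana_sysctl_add_queues(apc);  - queueN nodes under the device's
 *                                     portN sysctl tree
 * mana_down()
 *     mana_sysctl_free_queues(apc); - sysctl_ctx_free() of the queue context
 *     mana_dealloc_queues(ndev);    - only then are the queues destroyed
 */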
+ */ +#include +__FBSDID("$FreeBSD$"); + + +#include "mana_sysctl.h" + +static int mana_sysctl_cleanup_thread_cpu(SYSCTL_HANDLER_ARGS); + +int mana_log_level = MANA_ALERT | MANA_WARNING | MANA_INFO; + +SYSCTL_NODE(_hw, OID_AUTO, mana, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "MANA driver parameters"); + +/* + * Logging level for changing verbosity of the output + */ +SYSCTL_INT(_hw_mana, OID_AUTO, log_level, CTLFLAG_RWTUN, + &mana_log_level, 0, "Logging level indicating verbosity of the logs"); + +SYSCTL_CONST_STRING(_hw_mana, OID_AUTO, driver_version, CTLFLAG_RD, + DRV_MODULE_VERSION, "MANA driver version"); + +void +mana_sysctl_add_port(struct mana_port_context *apc) +{ + struct gdma_context *gc = apc->ac->gdma_dev->gdma_context; + device_t dev = gc->dev; + struct sysctl_ctx_list *ctx; + struct sysctl_oid *tree; + struct sysctl_oid_list *child; + struct mana_port_stats *port_stats; + char node_name[32]; + + struct sysctl_oid *port_node, *stats_node; + struct sysctl_oid_list *stats_list; + + ctx = device_get_sysctl_ctx(dev); + tree = device_get_sysctl_tree(dev); + child = SYSCTL_CHILDREN(tree); + + port_stats = &apc->port_stats; + + snprintf(node_name, 32, "port%d", apc->port_idx); + + port_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, + node_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Port Name"); + apc->port_list = SYSCTL_CHILDREN(port_node); + + SYSCTL_ADD_BOOL(ctx, apc->port_list, OID_AUTO, + "enable_altq", CTLFLAG_RW, &apc->enable_tx_altq, 0, + "Choose alternative txq under heavy load"); + + SYSCTL_ADD_PROC(ctx, apc->port_list, OID_AUTO, + "bind_cleanup_thread_cpu", + CTLTYPE_U8 | CTLFLAG_RW | CTLFLAG_MPSAFE, + apc, 0, mana_sysctl_cleanup_thread_cpu, "I", + "Bind cleanup thread to a cpu. 0 disables it."); + + stats_node = SYSCTL_ADD_NODE(ctx, apc->port_list, OID_AUTO, + "port_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, + "Statistics of port"); + stats_list = SYSCTL_CHILDREN(stats_node); + + SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "rx_packets", + CTLFLAG_RD, &port_stats->rx_packets, "Packets received"); + SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "tx_packets", + CTLFLAG_RD, &port_stats->tx_packets, "Packets transmitted"); + SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "rx_bytes", + CTLFLAG_RD, &port_stats->rx_bytes, "Bytes received"); + SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "tx_bytes", + CTLFLAG_RD, &port_stats->tx_bytes, "Bytes transmitted"); + SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "rx_drops", + CTLFLAG_RD, &port_stats->rx_drops, "Receive packet drops"); + SYSCTL_ADD_COUNTER_U64(ctx, stats_list, OID_AUTO, "tx_drops", + CTLFLAG_RD, &port_stats->tx_drops, "Transmit packet drops"); +} + +void +mana_sysctl_add_queues(struct mana_port_context *apc) +{ + struct sysctl_ctx_list *ctx = &apc->que_sysctl_ctx; + struct sysctl_oid_list *child = apc->port_list; + + struct sysctl_oid *queue_node, *tx_node, *rx_node; + struct sysctl_oid_list *queue_list, *tx_list, *rx_list; + struct mana_txq *txq; + struct mana_rxq *rxq; + struct mana_stats *tx_stats, *rx_stats; + char que_name[32]; + int i; + + sysctl_ctx_init(ctx); + + for (i = 0; i < apc->num_queues; i++) { + rxq = apc->rxqs[i]; + txq = &apc->tx_qp[i].txq; + + snprintf(que_name, 32, "queue%d", i); + + queue_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, + que_name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Queue Name"); + queue_list = SYSCTL_CHILDREN(queue_node); + + /* TX stats */ + tx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO, + "txq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "TX queue"); + tx_list = 
SYSCTL_CHILDREN(tx_node); + + tx_stats = &txq->stats; + + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "count", + CTLFLAG_RD, &tx_stats->packets, "Packets sent"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "bytes", + CTLFLAG_RD, &tx_stats->bytes, "Bytes sent"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "queue_wakeups", + CTLFLAG_RD, &tx_stats->wakeup, "Queue wakeups"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "queue_stops", + CTLFLAG_RD, &tx_stats->stop, "Queue stops"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "mbuf_collapse", + CTLFLAG_RD, &tx_stats->collapse, "Mbuf collapse count"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "mbuf_collapse_err", CTLFLAG_RD, + &tx_stats->collapse_err, "Mbuf collapse failures"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "dma_mapping_err", CTLFLAG_RD, + &tx_stats->dma_mapping_err, "DMA mapping failures"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "alt_chg", CTLFLAG_RD, + &tx_stats->alt_chg, "Switch to alternative txq"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "alt_reset", CTLFLAG_RD, + &tx_stats->alt_reset, "Reset to self txq"); + + /* RX stats */ + rx_node = SYSCTL_ADD_NODE(ctx, queue_list, OID_AUTO, + "rxq", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "RX queue"); + rx_list = SYSCTL_CHILDREN(rx_node); + + rx_stats = &rxq->stats; + + SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "count", + CTLFLAG_RD, &rx_stats->packets, "Packets received"); + SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, "bytes", + CTLFLAG_RD, &rx_stats->bytes, "Bytes received"); + SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, + "mbuf_alloc_fail", CTLFLAG_RD, + &rx_stats->mbuf_alloc_fail, "Failed mbuf allocs"); + SYSCTL_ADD_COUNTER_U64(ctx, rx_list, OID_AUTO, + "dma_mapping_err", CTLFLAG_RD, + &rx_stats->dma_mapping_err, "DMA mapping errors"); + } +} + +/* + * Free all queues' sysctl trees attached to the port's tree. + */ +void +mana_sysctl_free_queues(struct mana_port_context *apc) +{ + sysctl_ctx_free(&apc->que_sysctl_ctx); +} + +static int +mana_sysctl_cleanup_thread_cpu(SYSCTL_HANDLER_ARGS) +{ + struct mana_port_context *apc = arg1; + bool bind_cpu = false; + uint8_t val; + int err; + + val = 0; + err = sysctl_wire_old_buffer(req, sizeof(val)); + if (err == 0) { + val = apc->bind_cleanup_thread_cpu; + err = sysctl_handle_8(oidp, &val, 0, req); + } + + if (err != 0 || req->newptr == NULL) + return (err); + + if (val != 0) + bind_cpu = true; + + if (bind_cpu != apc->bind_cleanup_thread_cpu) { + apc->bind_cleanup_thread_cpu = bind_cpu; + err = mana_restart(apc); + } + + return (err); +} diff --git a/sys/dev/mana/shm_channel.h b/sys/dev/mana/shm_channel.h new file mode 100644 --- /dev/null +++ b/sys/dev/mana/shm_channel.h @@ -0,0 +1,52 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
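mana_sysctl_cleanup_thread_cpu() above follows the standard FreeBSD read-modify-write handler shape: publish the current value, let sysctl_handle_8() perform the user copy, return early when the request was a plain read (req->newptr == NULL) or the copy failed, and only then act on the new value. A stripped-down skeleton of that pattern, reusing only calls and fields visible in the handler above; the wiring of the old buffer is omitted for brevity and the function name is hypothetical:

static int
mana_sysctl_u8_skeleton(SYSCTL_HANDLER_ARGS)
{
	struct mana_port_context *apc = arg1;
	uint8_t val;
	int err;

	/* Read phase: report the current setting. */
	val = apc->bind_cleanup_thread_cpu;
	err = sysctl_handle_8(oidp, &val, 0, req);

	/* Plain read, or the copy failed: nothing left to do. */
	if (err != 0 || req->newptr == NULL)
		return (err);

	/* Write phase: apply the change, restarting the port if needed. */
	if ((val != 0) != apc->bind_cleanup_thread_cpu) {
		apc->bind_cleanup_thread_cpu = (val != 0);
		err = mana_restart(apc);
	}
	return (err);
}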
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef _SHM_CHANNEL_H +#define _SHM_CHANNEL_H + +#define __iomem + +struct shm_channel { + device_t dev; + void __iomem *base; +}; + +void mana_smc_init(struct shm_channel *sc, device_t dev, void __iomem *base); + +int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, + uint64_t eq_addr, uint64_t cq_addr, uint64_t rq_addr, uint64_t sq_addr, + uint32_t eq_msix_index); + +int mana_smc_teardown_hwc(struct shm_channel *sc, bool reset_vf); + +#endif /* _SHM_CHANNEL_H */ diff --git a/sys/dev/mana/shm_channel.c b/sys/dev/mana/shm_channel.c new file mode 100644 --- /dev/null +++ b/sys/dev/mana/shm_channel.c @@ -0,0 +1,337 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2021 Microsoft Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
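A hedged sketch of how the channel API above is driven during hardware-channel bring-up (the real caller lives in hw_channel.c/gdma_main.c, not in this hunk): the four queue base addresses must be page-aligned physical addresses and the MSI-X index must fit in 16 bits, otherwise mana_smc_setup_hwc() rejects the request with EINVAL. All variable names below are hypothetical:

/* Sketch only: error handling trimmed, names hypothetical. */
static int
hwc_shm_lifecycle_sketch(struct shm_channel *sc, device_t dev,
    void *shmem_base, uint64_t eq_pa, uint64_t cq_pa, uint64_t rq_pa,
    uint64_t sq_pa, uint32_t eq_msix_index)
{
	int err;

	/* Point the channel at the BAR-mapped shared memory once. */
	mana_smc_init(sc, dev, shmem_base);

	/* Hand the four page-aligned queue frames to the PF. */
	err = mana_smc_setup_hwc(sc, false, eq_pa, cq_pa, rq_pa, sq_pa,
	    eq_msix_index);
	if (err != 0)
		return (err);

	/* ... hardware channel is usable here ... */

	/* Tear down before the queue memory is released. */
	return (mana_smc_teardown_hwc(sc, false));
}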
+ */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include "mana.h" +#include "shm_channel.h" +#include "gdma_util.h" + +#define PAGE_FRAME_L48_WIDTH_BYTES 6 +#define PAGE_FRAME_L48_WIDTH_BITS (PAGE_FRAME_L48_WIDTH_BYTES * 8) +#define PAGE_FRAME_L48_MASK 0x0000FFFFFFFFFFFF +#define PAGE_FRAME_H4_WIDTH_BITS 4 +#define VECTOR_MASK 0xFFFF +#define SHMEM_VF_RESET_STATE ((uint32_t)-1) + +#define SMC_MSG_TYPE_ESTABLISH_HWC 1 +#define SMC_MSG_TYPE_ESTABLISH_HWC_VERSION 0 + +#define SMC_MSG_TYPE_DESTROY_HWC 2 +#define SMC_MSG_TYPE_DESTROY_HWC_VERSION 0 + +#define SMC_MSG_DIRECTION_REQUEST 0 +#define SMC_MSG_DIRECTION_RESPONSE 1 + +/* Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. + */ + +/* Shared memory channel protocol header + * + * msg_type: set on request and response; response matches request. + * msg_version: newer PF writes back older response (matching request) + * older PF acts on latest version known and sets that version in result + * (less than request). + * direction: 0 for request, VF->PF; 1 for response, PF->VF. + * status: 0 on request, + * operation result on response (success = 0, failure = 1 or greater). + * reset_vf: If set on either establish or destroy request, indicates perform + * FLR before/after the operation. + * owner_is_pf: 1 indicates PF owned, 0 indicates VF owned. + */ +union smc_proto_hdr { + uint32_t as_uint32; + + struct { + uint8_t msg_type : 3; + uint8_t msg_version : 3; + uint8_t reserved_1 : 1; + uint8_t direction : 1; + + uint8_t status; + + uint8_t reserved_2; + + uint8_t reset_vf : 1; + uint8_t reserved_3 : 6; + uint8_t owner_is_pf : 1; + }; +}; /* HW DATA */ + +#define SMC_APERTURE_BITS 256 +#define SMC_BASIC_UNIT (sizeof(uint32_t)) +#define SMC_APERTURE_DWORDS (SMC_APERTURE_BITS / (SMC_BASIC_UNIT * 8)) +#define SMC_LAST_DWORD (SMC_APERTURE_DWORDS - 1) + +static int +mana_smc_poll_register(void __iomem *base, bool reset) +{ + void __iomem *ptr = (uint8_t *)base + SMC_LAST_DWORD * SMC_BASIC_UNIT; + volatile uint32_t last_dword; + int i; + + /* Poll the hardware for the ownership bit. This should be pretty fast, + * but let's do it in a loop just in case the hardware or the PF + * driver are temporarily busy. + */ + for (i = 0; i < 20 * 1000; i++) { + last_dword = readl(ptr); + + /* shmem reads as 0xFFFFFFFF in the reset case */ + if (reset && last_dword == SHMEM_VF_RESET_STATE) + return 0; + + /* If bit_31 is set, the PF currently owns the SMC. */ + if (!(last_dword & BIT(31))) + return 0; + + DELAY(1000); + } + + return ETIMEDOUT; +} + +static int +mana_smc_read_response(struct shm_channel *sc, uint32_t msg_type, + uint32_t msg_version, bool reset_vf) +{ + void __iomem *base = sc->base; + union smc_proto_hdr hdr; + int err; + + /* Wait for PF to respond. 
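Because the header is read and written as a single 32-bit dword, the bitfield layout matters: with the LSB-first allocation used by the usual x86 compilers, msg_type occupies bits 0-2, direction bit 7, status bits 8-15, reset_vf bit 24 and owner_is_pf bit 31, the same bit mana_smc_poll_register() above tests with BIT(31). A small userland check of that layout; it leans on the compiler's LSB-first, little-endian bitfield packing, which is an assumption the driver shares rather than something the C standard guarantees:

#include <stdint.h>
#include <stdio.h>

union smc_proto_hdr_demo {
	uint32_t as_uint32;
	struct {
		uint8_t msg_type	: 3;
		uint8_t msg_version	: 3;
		uint8_t reserved_1	: 1;
		uint8_t direction	: 1;
		uint8_t status;
		uint8_t reserved_2;
		uint8_t reset_vf	: 1;
		uint8_t reserved_3	: 6;
		uint8_t owner_is_pf	: 1;
	};
};

int
main(void)
{
	union smc_proto_hdr_demo hdr = { .as_uint32 = 0 };

	hdr.msg_type = 2;	/* SMC_MSG_TYPE_DESTROY_HWC */
	hdr.direction = 0;	/* request, VF -> PF */
	hdr.reset_vf = 1;

	/* Expect 0x01000002 on a typical little-endian build. */
	printf("request dword: 0x%08x\n", hdr.as_uint32);
	printf("PF owns SMC:   %s\n",
	    (hdr.as_uint32 & (1U << 31)) ? "yes" : "no");
	return (0);
}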
*/ + err = mana_smc_poll_register(base, reset_vf); + if (err) + return err; + + hdr.as_uint32 = + readl((uint8_t *)base + SMC_LAST_DWORD * SMC_BASIC_UNIT); + mana_dbg(NULL, "shm response 0x%x\n", hdr.as_uint32); + + if (reset_vf && hdr.as_uint32 == SHMEM_VF_RESET_STATE) + return 0; + + /* Validate protocol fields from the PF driver */ + if (hdr.msg_type != msg_type || hdr.msg_version > msg_version || + hdr.direction != SMC_MSG_DIRECTION_RESPONSE) { + device_printf(sc->dev, + "Wrong SMC response 0x%x, type=%d, ver=%d\n", + hdr.as_uint32, msg_type, msg_version); + return EPROTO; + } + + /* Validate the operation result */ + if (hdr.status != 0) { + device_printf(sc->dev, + "SMC operation failed: 0x%x\n", hdr.status); + return EPROTO; + } + + return 0; +} + +void +mana_smc_init(struct shm_channel *sc, device_t dev, void __iomem *base) +{ + sc->dev = dev; + sc->base = base; +} + +int +mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, uint64_t eq_addr, + uint64_t cq_addr, uint64_t rq_addr, uint64_t sq_addr, + uint32_t eq_msix_index) +{ + union smc_proto_hdr *hdr; + uint16_t all_addr_h4bits = 0; + uint16_t frame_addr_seq = 0; + uint64_t frame_addr = 0; + uint8_t shm_buf[32]; + uint64_t *shmem; + uint32_t *dword; + uint8_t *ptr; + int err; + int i; + + /* Ensure VF already has possession of shared memory */ + err = mana_smc_poll_register(sc->base, false); + if (err) { + device_printf(sc->dev, + "Timeout when setting up HWC: %d\n", err); + return err; + } + + if (!IS_ALIGNED(eq_addr, PAGE_SIZE) || + !IS_ALIGNED(cq_addr, PAGE_SIZE) || + !IS_ALIGNED(rq_addr, PAGE_SIZE) || + !IS_ALIGNED(sq_addr, PAGE_SIZE)) + return EINVAL; + + if ((eq_msix_index & VECTOR_MASK) != eq_msix_index) + return EINVAL; + + /* Scheme for packing four addresses and extra info into 256 bits. + * + * Addresses must be page frame aligned, so only frame address bits + * are transferred. + * + * 52-bit frame addresses are split into the lower 48 bits and upper + * 4 bits. Lower 48 bits of 4 address are written sequentially from + * the start of the 256-bit shared memory region followed by 16 bits + * containing the upper 4 bits of the 4 addresses in sequence. + * + * A 16 bit EQ vector number fills out the next-to-last 32-bit dword. + * + * The final 32-bit dword is used for protocol control information as + * defined in smc_proto_hdr. 
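A standalone sketch of that packing, following the description above: the four page-frame numbers contribute their low 48 bits back to back, their upper 4 bits are collected into a single 16-bit word, the EQ MSI-X vector fills the next 16 bits, and the final dword is left for the protocol header. PAGE_SHIFT = 12 and the test addresses are illustrative assumptions, and, like the driver, the byte-wise copies assume a little-endian host:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SHIFT			12	/* assumed 4 KiB pages */
#define PAGE_FRAME_L48_WIDTH_BYTES	6
#define PAGE_FRAME_L48_WIDTH_BITS	(PAGE_FRAME_L48_WIDTH_BYTES * 8)
#define PAGE_FRAME_L48_MASK		0x0000FFFFFFFFFFFFULL
#define PAGE_FRAME_H4_WIDTH_BITS	4

int
main(void)
{
	/* Hypothetical page-aligned physical addresses of EQ/CQ/RQ/SQ. */
	uint64_t addrs[4] = {
		0x000123456000ULL, 0x000123457000ULL,
		0x000123458000ULL, 0x000123459000ULL,
	};
	uint8_t shm_buf[32] = { 0 };
	uint8_t *ptr = shm_buf;
	uint16_t all_addr_h4bits = 0;
	uint16_t eq_msix_index = 3;

	for (int i = 0; i < 4; i++) {
		uint64_t pfn = addrs[i] >> PAGE_SHIFT;
		uint64_t low48 = pfn & PAGE_FRAME_L48_MASK;

		/* Low 48 bits of the frame number, 6 bytes each. */
		memcpy(ptr, &low48, PAGE_FRAME_L48_WIDTH_BYTES);
		ptr += PAGE_FRAME_L48_WIDTH_BYTES;

		/* Upper 4 bits, packed 4-at-a-time into one 16-bit word. */
		all_addr_h4bits |= (pfn >> PAGE_FRAME_L48_WIDTH_BITS) <<
		    (i * PAGE_FRAME_H4_WIDTH_BITS);
	}

	memcpy(ptr, &all_addr_h4bits, sizeof(all_addr_h4bits));
	ptr += sizeof(all_addr_h4bits);
	memcpy(ptr, &eq_msix_index, sizeof(eq_msix_index));
	ptr += sizeof(eq_msix_index);
	/* Remaining 4 bytes: the smc_proto_hdr request dword, written last. */

	for (int i = 0; i < 32; i += 4)
		printf("dword %d: 0x%02x%02x%02x%02x\n", i / 4,
		    shm_buf[i + 3], shm_buf[i + 2], shm_buf[i + 1],
		    shm_buf[i]);
	return (0);
}

The driver's version writes through uint64_t pointers and advances by six bytes, letting the next store overwrite the two masked-off upper bytes; memcpy() is used here only to keep the sketch free of alignment concerns.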
+ */ + + memset(shm_buf, 0, sizeof(shm_buf)); + ptr = shm_buf; + + /* EQ addr: low 48 bits of frame address */ + shmem = (uint64_t *)ptr; + frame_addr = PHYS_PFN(eq_addr); + *shmem = frame_addr & PAGE_FRAME_L48_MASK; + all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << + (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); + ptr += PAGE_FRAME_L48_WIDTH_BYTES; + + /* CQ addr: low 48 bits of frame address */ + shmem = (uint64_t *)ptr; + frame_addr = PHYS_PFN(cq_addr); + *shmem = frame_addr & PAGE_FRAME_L48_MASK; + all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << + (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); + ptr += PAGE_FRAME_L48_WIDTH_BYTES; + + /* RQ addr: low 48 bits of frame address */ + shmem = (uint64_t *)ptr; + frame_addr = PHYS_PFN(rq_addr); + *shmem = frame_addr & PAGE_FRAME_L48_MASK; + all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << + (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); + ptr += PAGE_FRAME_L48_WIDTH_BYTES; + + /* SQ addr: low 48 bits of frame address */ + shmem = (uint64_t *)ptr; + frame_addr = PHYS_PFN(sq_addr); + *shmem = frame_addr & PAGE_FRAME_L48_MASK; + all_addr_h4bits |= (frame_addr >> PAGE_FRAME_L48_WIDTH_BITS) << + (frame_addr_seq++ * PAGE_FRAME_H4_WIDTH_BITS); + ptr += PAGE_FRAME_L48_WIDTH_BYTES; + + /* High 4 bits of the four frame addresses */ + *((uint16_t *)ptr) = all_addr_h4bits; + ptr += sizeof(uint16_t); + + /* EQ MSIX vector number */ + *((uint16_t *)ptr) = (uint16_t)eq_msix_index; + ptr += sizeof(uint16_t); + + /* 32-bit protocol header in final dword */ + *((uint32_t *)ptr) = 0; + + hdr = (union smc_proto_hdr *)ptr; + hdr->msg_type = SMC_MSG_TYPE_ESTABLISH_HWC; + hdr->msg_version = SMC_MSG_TYPE_ESTABLISH_HWC_VERSION; + hdr->direction = SMC_MSG_DIRECTION_REQUEST; + hdr->reset_vf = reset_vf; + + /* Write 256-message buffer to shared memory (final 32-bit write + * triggers HW to set possession bit to PF). + */ + dword = (uint32_t *)shm_buf; + for (i = 0; i < SMC_APERTURE_DWORDS; i++) { + mana_dbg(NULL, "write shm_buf %d, val: 0x%x\n", + i, *dword); + writel((char *)sc->base + i * SMC_BASIC_UNIT, *dword++); + } + + /* Read shmem response (polling for VF possession) and validate. + * For setup, waiting for response on shared memory is not strictly + * necessary, since wait occurs later for results to appear in EQE's. + */ + err = mana_smc_read_response(sc, SMC_MSG_TYPE_ESTABLISH_HWC, + SMC_MSG_TYPE_ESTABLISH_HWC_VERSION, reset_vf); + if (err) { + device_printf(sc->dev, + "Error when setting up HWC: %d\n", err); + return err; + } + + return 0; +} + +int +mana_smc_teardown_hwc(struct shm_channel *sc, bool reset_vf) +{ + union smc_proto_hdr hdr = {}; + int err; + + /* Ensure already has possession of shared memory */ + err = mana_smc_poll_register(sc->base, false); + if (err) { + device_printf(sc->dev, "Timeout when tearing down HWC\n"); + return err; + } + + /* Set up protocol header for HWC destroy message */ + hdr.msg_type = SMC_MSG_TYPE_DESTROY_HWC; + hdr.msg_version = SMC_MSG_TYPE_DESTROY_HWC_VERSION; + hdr.direction = SMC_MSG_DIRECTION_REQUEST; + hdr.reset_vf = reset_vf; + + /* Write message in high 32 bits of 256-bit shared memory, causing HW + * to set possession bit to PF. + */ + writel((char *)sc->base + SMC_LAST_DWORD * SMC_BASIC_UNIT, + hdr.as_uint32); + + /* Read shmem response (polling for VF possession) and validate. + * For teardown, waiting for response is required to ensure hardware + * invalidates MST entries before software frees memory. 
+ */ + err = mana_smc_read_response(sc, SMC_MSG_TYPE_DESTROY_HWC, + SMC_MSG_TYPE_DESTROY_HWC_VERSION, reset_vf); + if (err) { + device_printf(sc->dev, + "Error when tearing down HWC: %d\n", err); + return err; + } + + return 0; +} diff --git a/sys/modules/Makefile b/sys/modules/Makefile --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -231,6 +231,7 @@ mac_stub \ mac_test \ ${_malo} \ + ${_mana} \ md \ mdio \ mem \ @@ -648,6 +649,7 @@ .if ${MK_SOURCELESS_UCODE} != "no" _lio= lio .endif +_mana= mana _nctgpio= nctgpio _ntb= ntb _ocs_fc= ocs_fc diff --git a/sys/modules/mana/Makefile b/sys/modules/mana/Makefile new file mode 100644 --- /dev/null +++ b/sys/modules/mana/Makefile @@ -0,0 +1,12 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/dev/mana + +KMOD = if_mana +SRCS = gdma_main.c mana_sysctl.c shm_channel.c +SRCS += mana_en.c gdma_util.c hw_channel.c +SRCS += device_if.h bus_if.h pci_if.h + +CFLAGS += -I${SRCTOP}/sys/dev/mana + +.include
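With the build glue above, the driver can be compiled either statically via the new files.x86 entries or as the standalone if_mana.ko module from sys/modules/mana. A hedged example of the two usual ways to enable it; the exact kernel-config keyword is an assumption based on the "optional mana" entries rather than something shown in this patch:

# Static build: kernel configuration file
device		mana

# Module: load once by hand, or from the boot loader
#   kldload if_mana
#   echo 'if_mana_load="YES"' >> /boot/loader.conf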