diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h index 39965c8669cf..bf15eb3ccabc 100644 --- a/sys/dev/gve/gve.h +++ b/sys/dev/gve/gve.h @@ -1,701 +1,701 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _GVE_FBSD_H #define _GVE_FBSD_H #include "gve_desc.h" #include "gve_plat.h" #include "gve_register.h" #ifndef PCI_VENDOR_ID_GOOGLE #define PCI_VENDOR_ID_GOOGLE 0x1ae0 #endif #define PCI_DEV_ID_GVNIC 0x0042 #define GVE_REGISTER_BAR 0 #define GVE_DOORBELL_BAR 2 /* Driver can alloc up to 2 segments for the header and 2 for the payload. */ #define GVE_TX_MAX_DESCS 4 #define GVE_TX_BUFRING_ENTRIES 4096 #define ADMINQ_SIZE PAGE_SIZE #define GVE_DEFAULT_RX_BUFFER_SIZE 2048 /* Each RX bounce buffer page can fit two packet buffers. */ #define GVE_DEFAULT_RX_BUFFER_OFFSET (PAGE_SIZE / 2) /* PTYPEs are always 10 bits. */ #define GVE_NUM_PTYPES 1024 /* * Number of descriptors per queue page list. * Page count AKA QPL size can be derived by dividing the number of elements in * a page by the number of descriptors available. 
*/ #define GVE_QPL_DIVISOR 16 static MALLOC_DEFINE(M_GVE, "gve", "gve allocations"); struct gve_dma_handle { bus_addr_t bus_addr; void *cpu_addr; bus_dma_tag_t tag; bus_dmamap_t map; }; union gve_tx_desc { struct gve_tx_pkt_desc pkt; /* first desc for a packet */ struct gve_tx_mtd_desc mtd; /* optional metadata descriptor */ struct gve_tx_seg_desc seg; /* subsequent descs for a packet */ }; /* Tracks the memory in the fifo occupied by a segment of a packet */ struct gve_tx_iovec { uint32_t iov_offset; /* offset into this segment */ uint32_t iov_len; /* length */ uint32_t iov_padding; /* padding associated with this segment */ }; /* Tracks allowed and current queue settings */ struct gve_queue_config { uint16_t max_queues; uint16_t num_queues; /* current */ }; struct gve_irq_db { __be32 index; } __aligned(CACHE_LINE_SIZE); /* * GVE_QUEUE_FORMAT_UNSPECIFIED must be zero since 0 is the default value * when the entire configure_device_resources command is zeroed out and the * queue_format is not specified. */ enum gve_queue_format { GVE_QUEUE_FORMAT_UNSPECIFIED = 0x0, GVE_GQI_RDA_FORMAT = 0x1, GVE_GQI_QPL_FORMAT = 0x2, GVE_DQO_RDA_FORMAT = 0x3, GVE_DQO_QPL_FORMAT = 0x4, }; enum gve_state_flags_bit { GVE_STATE_FLAG_ADMINQ_OK, GVE_STATE_FLAG_RESOURCES_OK, GVE_STATE_FLAG_QPLREG_OK, GVE_STATE_FLAG_RX_RINGS_OK, GVE_STATE_FLAG_TX_RINGS_OK, GVE_STATE_FLAG_QUEUES_UP, GVE_STATE_FLAG_LINK_UP, GVE_STATE_FLAG_DO_RESET, GVE_STATE_FLAG_IN_RESET, GVE_NUM_STATE_FLAGS /* Not part of the enum space */ }; BITSET_DEFINE(gve_state_flags, GVE_NUM_STATE_FLAGS); #define GVE_DEVICE_STATUS_RESET (0x1 << 1) #define GVE_DEVICE_STATUS_LINK_STATUS (0x1 << 2) #define GVE_RING_LOCK(ring) mtx_lock(&(ring)->ring_mtx) #define GVE_RING_TRYLOCK(ring) mtx_trylock(&(ring)->ring_mtx) #define GVE_RING_UNLOCK(ring) mtx_unlock(&(ring)->ring_mtx) #define GVE_RING_ASSERT(ring) mtx_assert(&(ring)->ring_mtx, MA_OWNED) #define GVE_IFACE_LOCK_INIT(lock) sx_init(&lock, "gve interface lock") #define GVE_IFACE_LOCK_DESTROY(lock) sx_destroy(&lock) #define GVE_IFACE_LOCK_LOCK(lock) sx_xlock(&lock) #define GVE_IFACE_LOCK_UNLOCK(lock) sx_unlock(&lock) #define GVE_IFACE_LOCK_ASSERT(lock) sx_assert(&lock, SA_XLOCKED) struct gve_queue_page_list { uint32_t id; uint32_t num_dmas; uint32_t num_pages; vm_offset_t kva; vm_page_t *pages; struct gve_dma_handle *dmas; }; struct gve_irq { struct resource *res; void *cookie; }; struct gve_rx_slot_page_info { void *page_address; vm_page_t page; uint32_t page_offset; uint16_t pad; }; /* * A single received packet split across multiple buffers may be * reconstructed using the information in this structure. */ struct gve_rx_ctx { /* head and tail of mbuf chain for the current packet */ struct mbuf *mbuf_head; struct mbuf *mbuf_tail; uint32_t total_size; uint8_t frag_cnt; bool is_tcp; bool drop_pkt; }; struct gve_ring_com { struct gve_priv *priv; uint32_t id; /* * BAR2 offset for this ring's doorbell and the * counter-array offset for this ring's counter. * Acquired from the device individually for each * queue in the queue_create adminq command. */ struct gve_queue_resources *q_resources; struct gve_dma_handle q_resources_mem; /* Byte offset into BAR2 where this ring's 4-byte irq doorbell lies. */ uint32_t irq_db_offset; /* Byte offset into BAR2 where this ring's 4-byte doorbell lies. */ uint32_t db_offset; /* * Index, not byte-offset, into the counter array where this ring's * 4-byte counter lies. */ uint32_t counter_idx; /* * The index of the MSIX vector that was assigned to * this ring in `gve_alloc_irqs`. 
* * It is passed to the device in the queue_create adminq * command. * * Additionally, this also serves as the index into * `priv->irq_db_indices` where this ring's irq doorbell's * BAR2 offset, `irq_db_idx`, can be found. */ int ntfy_id; /* * The fixed bounce buffer for this ring. * Once allocated, has to be offered to the device * over the register-page-list adminq command. */ struct gve_queue_page_list *qpl; struct task cleanup_task; struct taskqueue *cleanup_tq; } __aligned(CACHE_LINE_SIZE); struct gve_rxq_stats { counter_u64_t rbytes; counter_u64_t rpackets; counter_u64_t rx_dropped_pkt; counter_u64_t rx_copybreak_cnt; counter_u64_t rx_frag_flip_cnt; counter_u64_t rx_frag_copy_cnt; counter_u64_t rx_dropped_pkt_desc_err; counter_u64_t rx_dropped_pkt_buf_post_fail; counter_u64_t rx_dropped_pkt_mbuf_alloc_fail; counter_u64_t rx_mbuf_dmamap_err; counter_u64_t rx_mbuf_mclget_null; }; #define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t)) union gve_rx_qpl_buf_id_dqo { struct { uint16_t buf_id:11; /* Index into rx->dqo.bufs */ uint8_t frag_num:5; /* Which frag in the QPL page */ }; uint16_t all; } __packed; _Static_assert(sizeof(union gve_rx_qpl_buf_id_dqo) == 2, "gve: bad dqo qpl rx buf id length"); struct gve_rx_buf_dqo { union { /* RDA */ struct { struct mbuf *mbuf; bus_dmamap_t dmamap; uint64_t addr; bool mapped; }; /* QPL */ struct { uint8_t num_nic_frags; /* number of pending completions */ uint8_t next_idx; /* index of the next frag to post */ /* for chaining rx->dqo.used_bufs */ STAILQ_ENTRY(gve_rx_buf_dqo) stailq_entry; }; }; /* for chaining rx->dqo.free_bufs */ SLIST_ENTRY(gve_rx_buf_dqo) slist_entry; }; /* power-of-2 sized receive ring */ struct gve_rx_ring { struct gve_ring_com com; struct gve_dma_handle desc_ring_mem; uint32_t cnt; /* free-running total number of completed packets */ uint32_t fill_cnt; /* free-running total number of descs and buffs posted */ union { /* GQI-only fields */ struct { struct gve_dma_handle data_ring_mem; /* accessed in the GQ receive hot path */ struct gve_rx_desc *desc_ring; union gve_rx_data_slot *data_ring; struct gve_rx_slot_page_info *page_info; uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */ uint8_t seq_no; /* helps traverse the descriptor ring */ }; /* DQO-only fields */ struct { struct gve_dma_handle compl_ring_mem; struct gve_rx_compl_desc_dqo *compl_ring; struct gve_rx_desc_dqo *desc_ring; struct gve_rx_buf_dqo *bufs; /* Parking place for posted buffers */ bus_dma_tag_t buf_dmatag; /* To dmamap posted mbufs with */ uint32_t buf_cnt; /* Size of the bufs array */ uint32_t mask; /* One less than the sizes of the desc and compl rings */ uint32_t head; /* The index at which to post the next buffer at */ uint32_t tail; /* The index at which to receive the next compl at */ uint8_t cur_gen_bit; /* Gets flipped on every cycle of the compl ring */ SLIST_HEAD(, gve_rx_buf_dqo) free_bufs; /* * Only used in QPL mode. Pages referred to by if_input-ed mbufs * stay parked here till their wire count comes back to 1. * Pages are moved here after there aren't any pending completions. */ STAILQ_HEAD(, gve_rx_buf_dqo) used_bufs; } dqo; }; struct lro_ctrl lro; struct gve_rx_ctx ctx; struct gve_rxq_stats stats; } __aligned(CACHE_LINE_SIZE); /* * A contiguous representation of the pages composing the Tx bounce buffer. * The xmit taskqueue and the completion taskqueue both simultaneously use it. * Both operate on `available`: the xmit tq lowers it and the completion tq * raises it. 
`head` is the last location written at and so only the xmit tq * uses it. */ struct gve_tx_fifo { vm_offset_t base; /* address of base of FIFO */ uint32_t size; /* total size */ volatile int available; /* how much space is still available */ uint32_t head; /* offset to write at */ }; struct gve_tx_buffer_state { struct mbuf *mbuf; struct gve_tx_iovec iov[GVE_TX_MAX_DESCS]; }; struct gve_txq_stats { counter_u64_t tbytes; counter_u64_t tpackets; counter_u64_t tso_packet_cnt; counter_u64_t tx_dropped_pkt; counter_u64_t tx_delayed_pkt_nospace_device; counter_u64_t tx_dropped_pkt_nospace_bufring; counter_u64_t tx_delayed_pkt_nospace_descring; counter_u64_t tx_delayed_pkt_nospace_compring; counter_u64_t tx_delayed_pkt_nospace_qpl_bufs; counter_u64_t tx_delayed_pkt_tsoerr; counter_u64_t tx_dropped_pkt_vlan; counter_u64_t tx_mbuf_collapse; counter_u64_t tx_mbuf_defrag; counter_u64_t tx_mbuf_defrag_err; counter_u64_t tx_mbuf_dmamap_enomem_err; counter_u64_t tx_mbuf_dmamap_err; }; #define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t)) struct gve_tx_pending_pkt_dqo { struct mbuf *mbuf; union { /* RDA */ bus_dmamap_t dmamap; /* QPL */ struct { /* * A linked list of entries from qpl_bufs that served * as the bounce buffer for this packet. */ int32_t qpl_buf_head; uint32_t num_qpl_bufs; }; }; uint8_t state; /* the gve_packet_state enum */ int next; /* To chain the free_pending_pkts lists */ }; /* power-of-2 sized transmit ring */ struct gve_tx_ring { struct gve_ring_com com; struct gve_dma_handle desc_ring_mem; struct task xmit_task; struct taskqueue *xmit_tq; bool stopped; /* Accessed when writing descriptors */ struct buf_ring *br; struct mtx ring_mtx; uint32_t req; /* free-running total number of packets written to the nic */ uint32_t done; /* free-running total number of completed packets */ union { /* GQI specific stuff */ struct { union gve_tx_desc *desc_ring; struct gve_tx_buffer_state *info; struct gve_tx_fifo fifo; uint32_t mask; /* masks the req and done to the size of the ring */ }; /* DQO specific stuff */ struct { struct gve_dma_handle compl_ring_mem; /* Accessed when writing descriptors */ struct { union gve_tx_desc_dqo *desc_ring; uint32_t desc_mask; /* masks head and tail to the size of desc_ring */ uint32_t desc_head; /* last desc read by NIC, cached value of hw_tx_head */ uint32_t desc_tail; /* last desc written by driver */ uint32_t last_re_idx; /* desc which last had "report event" set */ /* * The head index of a singly linked list containing pending packet objects * to park mbufs till the NIC sends completions. Once this list is depleted, * the "_prd" suffixed producer list, grown by the completion taskqueue, * is stolen. */ int32_t free_pending_pkts_csm; /* * The head index of a singly linked list representing QPL page fragments * to copy mbuf payload into for the NIC to see. Once this list is depleted, * the "_prd" suffixed producer list, grown by the completion taskqueue, * is stolen. * * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist. */ int32_t free_qpl_bufs_csm; uint32_t qpl_bufs_consumed; /* Allows quickly checking for buf availability */ uint32_t qpl_bufs_produced_cached; /* Cached value of qpl_bufs_produced */ /* DMA params for mapping Tx mbufs. Only used in RDA mode. 
*/ bus_dma_tag_t buf_dmatag; } __aligned(CACHE_LINE_SIZE); /* Accessed when processing completions */ struct { struct gve_tx_compl_desc_dqo *compl_ring; uint32_t compl_mask; /* masks head to the size of compl_ring */ uint32_t compl_head; /* last completion read by driver */ uint8_t cur_gen_bit; /* NIC flips a bit on every pass */ uint32_t hw_tx_head; /* last desc read by NIC */ /* * The completion taskqueue moves pending-packet objects to this * list after freeing the mbuf. The "_prd" denotes that this is * a producer list. The transmit taskqueue steals this list once * its consumer list, with the "_csm" suffix, is depleted. */ int32_t free_pending_pkts_prd; /* * The completion taskqueue moves the QPL pages corresponding to a * completed packet into this list. It is only used in QPL mode. * The "_prd" denotes that this is a producer list. The transmit * taskqueue steals this list once its consumer list, with the "_csm" * suffix, is depleted. * * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist. */ int32_t free_qpl_bufs_prd; uint32_t qpl_bufs_produced; } __aligned(CACHE_LINE_SIZE); /* Accessed by both the completion and xmit loops */ struct { /* completion tags index into this array */ struct gve_tx_pending_pkt_dqo *pending_pkts; uint16_t num_pending_pkts; /* * Represents QPL page fragments. An index into this array * always represents the same QPL page fragment. The value * is also an index into this array and servers as a means * to chain buffers into linked lists whose heads are * either free_qpl_bufs_prd or free_qpl_bufs_csm or * qpl_bufs_head. */ int32_t *qpl_bufs; } __aligned(CACHE_LINE_SIZE); } dqo; }; struct gve_txq_stats stats; } __aligned(CACHE_LINE_SIZE); enum gve_packet_state { /* * Packet does not yet have a dmamap created. * This should always be zero since state is not explicitly initialized. */ GVE_PACKET_STATE_UNALLOCATED, /* Packet has a dmamap and is in free list, available to be allocated. 
*/ GVE_PACKET_STATE_FREE, /* Packet is expecting a regular data completion */ GVE_PACKET_STATE_PENDING_DATA_COMPL, }; struct gve_ptype { uint8_t l3_type; /* `gve_l3_type` in gve_adminq.h */ uint8_t l4_type; /* `gve_l4_type` in gve_adminq.h */ }; struct gve_ptype_lut { struct gve_ptype ptypes[GVE_NUM_PTYPES]; }; struct gve_priv { if_t ifp; device_t dev; struct ifmedia media; uint8_t mac[ETHER_ADDR_LEN]; struct gve_dma_handle aq_mem; struct resource *reg_bar; /* BAR0 */ struct resource *db_bar; /* BAR2 */ struct resource *msix_table; uint32_t mgmt_msix_idx; uint32_t rx_copybreak; uint16_t num_event_counters; uint16_t default_num_queues; uint16_t tx_desc_cnt; uint16_t rx_desc_cnt; uint16_t rx_pages_per_qpl; uint64_t max_registered_pages; uint64_t num_registered_pages; uint32_t supported_features; uint16_t max_mtu; struct gve_dma_handle counter_array_mem; __be32 *counters; struct gve_dma_handle irqs_db_mem; struct gve_irq_db *irq_db_indices; enum gve_queue_format queue_format; - struct gve_queue_page_list *qpls; struct gve_queue_config tx_cfg; struct gve_queue_config rx_cfg; uint32_t num_queues; struct gve_irq *irq_tbl; struct gve_tx_ring *tx; struct gve_rx_ring *rx; struct gve_ptype_lut *ptype_lut_dqo; /* * Admin queue - see gve_adminq.h * Since AQ cmds do not run in steady state, 32 bit counters suffice */ struct gve_adminq_command *adminq; vm_paddr_t adminq_bus_addr; uint32_t adminq_mask; /* masks prod_cnt to adminq size */ uint32_t adminq_prod_cnt; /* free-running count of AQ cmds executed */ uint32_t adminq_cmd_fail; /* free-running count of AQ cmds failed */ uint32_t adminq_timeouts; /* free-running count of AQ cmds timeouts */ /* free-running count of each distinct AQ cmd executed */ uint32_t adminq_describe_device_cnt; uint32_t adminq_cfg_device_resources_cnt; uint32_t adminq_register_page_list_cnt; uint32_t adminq_unregister_page_list_cnt; uint32_t adminq_create_tx_queue_cnt; uint32_t adminq_create_rx_queue_cnt; uint32_t adminq_destroy_tx_queue_cnt; uint32_t adminq_destroy_rx_queue_cnt; uint32_t adminq_dcfg_device_resources_cnt; uint32_t adminq_set_driver_parameter_cnt; uint32_t adminq_verify_driver_compatibility_cnt; uint32_t adminq_get_ptype_map_cnt; uint32_t interface_up_cnt; uint32_t interface_down_cnt; uint32_t reset_cnt; struct task service_task; struct taskqueue *service_tq; struct gve_state_flags state_flags; struct sx gve_iface_lock; }; static inline bool gve_get_state_flag(struct gve_priv *priv, int pos) { return (BIT_ISSET(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags)); } static inline void gve_set_state_flag(struct gve_priv *priv, int pos) { BIT_SET_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); } static inline void gve_clear_state_flag(struct gve_priv *priv, int pos) { BIT_CLR_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); } static inline bool gve_is_gqi(struct gve_priv *priv) { return (priv->queue_format == GVE_GQI_QPL_FORMAT); } static inline bool gve_is_qpl(struct gve_priv *priv) { return (priv->queue_format == GVE_GQI_QPL_FORMAT || priv->queue_format == GVE_DQO_QPL_FORMAT); } /* Defined in gve_main.c */ void gve_schedule_reset(struct gve_priv *priv); /* Register access functions defined in gve_utils.c */ uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset); void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); void gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); /* QPL (Queue Page List) functions 
defined in gve_qpl.c */ -int gve_alloc_qpls(struct gve_priv *priv); -void gve_free_qpls(struct gve_priv *priv); +struct gve_queue_page_list *gve_alloc_qpl(struct gve_priv *priv, uint32_t id, + int npages, bool single_kva); +void gve_free_qpl(struct gve_priv *priv, struct gve_queue_page_list *qpl); int gve_register_qpls(struct gve_priv *priv); int gve_unregister_qpls(struct gve_priv *priv); void gve_mextadd_free(struct mbuf *mbuf); /* TX functions defined in gve_tx.c */ int gve_alloc_tx_rings(struct gve_priv *priv); void gve_free_tx_rings(struct gve_priv *priv); int gve_create_tx_rings(struct gve_priv *priv); int gve_destroy_tx_rings(struct gve_priv *priv); int gve_tx_intr(void *arg); int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf); void gve_qflush(if_t ifp); void gve_xmit_tq(void *arg, int pending); void gve_tx_cleanup_tq(void *arg, int pending); /* TX functions defined in gve_tx_dqo.c */ int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i); void gve_tx_free_ring_dqo(struct gve_priv *priv, int i); void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i); int gve_tx_intr_dqo(void *arg); int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr); int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf); void gve_tx_cleanup_tq_dqo(void *arg, int pending); /* RX functions defined in gve_rx.c */ int gve_alloc_rx_rings(struct gve_priv *priv); void gve_free_rx_rings(struct gve_priv *priv); int gve_create_rx_rings(struct gve_priv *priv); int gve_destroy_rx_rings(struct gve_priv *priv); int gve_rx_intr(void *arg); void gve_rx_cleanup_tq(void *arg, int pending); /* RX functions defined in gve_rx_dqo.c */ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i); void gve_rx_free_ring_dqo(struct gve_priv *priv, int i); void gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx); void gve_clear_rx_ring_dqo(struct gve_priv *priv, int i); int gve_rx_intr_dqo(void *arg); void gve_rx_cleanup_tq_dqo(void *arg, int pending); /* DMA functions defined in gve_utils.c */ int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma); void gve_dma_free_coherent(struct gve_dma_handle *dma); int gve_dmamap_create(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma); void gve_dmamap_destroy(struct gve_dma_handle *dma); /* IRQ functions defined in gve_utils.c */ void gve_free_irqs(struct gve_priv *priv); int gve_alloc_irqs(struct gve_priv *priv); void gve_unmask_all_queue_irqs(struct gve_priv *priv); void gve_mask_all_queue_irqs(struct gve_priv *priv); /* Systcl functions defined in gve_sysctl.c */ extern bool gve_disable_hw_lro; extern char gve_queue_format[8]; extern char gve_version[8]; void gve_setup_sysctl(struct gve_priv *priv); void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets, uint64_t *tbytes, uint64_t *tx_dropped_pkt); /* Stats functions defined in gve_utils.c */ void gve_alloc_counters(counter_u64_t *stat, int num_stats); void gve_free_counters(counter_u64_t *stat, int num_stats); #endif /* _GVE_FBSD_H_ */ diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c index 8e764f9660d7..72e7fc2e3f89 100644 --- a/sys/dev/gve/gve_main.c +++ b/sys/dev/gve/gve_main.c @@ -1,957 +1,949 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" #define GVE_DRIVER_VERSION "GVE-FBSD-1.3.2\n" #define GVE_VERSION_MAJOR 1 #define GVE_VERSION_MINOR 3 #define GVE_VERSION_SUB 2 #define GVE_DEFAULT_RX_COPYBREAK 256 /* Devices supported by this driver. */ static struct gve_dev { uint16_t vendor_id; uint16_t device_id; const char *name; } gve_devs[] = { { PCI_VENDOR_ID_GOOGLE, PCI_DEV_ID_GVNIC, "gVNIC" } }; struct sx gve_global_lock; static int gve_verify_driver_compatibility(struct gve_priv *priv) { int err; struct gve_driver_info *driver_info; struct gve_dma_handle driver_info_mem; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_driver_info), PAGE_SIZE, &driver_info_mem); if (err != 0) return (ENOMEM); driver_info = driver_info_mem.cpu_addr; *driver_info = (struct gve_driver_info) { .os_type = 3, /* Freebsd */ .driver_major = GVE_VERSION_MAJOR, .driver_minor = GVE_VERSION_MINOR, .driver_sub = GVE_VERSION_SUB, .os_version_major = htobe32(FBSD_VERSION_MAJOR), .os_version_minor = htobe32(FBSD_VERSION_MINOR), .os_version_sub = htobe32(FBSD_VERSION_PATCH), .driver_capability_flags = { htobe64(GVE_DRIVER_CAPABILITY_FLAGS1), htobe64(GVE_DRIVER_CAPABILITY_FLAGS2), htobe64(GVE_DRIVER_CAPABILITY_FLAGS3), htobe64(GVE_DRIVER_CAPABILITY_FLAGS4), }, }; snprintf(driver_info->os_version_str1, sizeof(driver_info->os_version_str1), "FreeBSD %u", __FreeBSD_version); bus_dmamap_sync(driver_info_mem.tag, driver_info_mem.map, BUS_DMASYNC_PREREAD); err = gve_adminq_verify_driver_compatibility(priv, sizeof(struct gve_driver_info), driver_info_mem.bus_addr); /* It's ok if the device doesn't support this */ if (err == EOPNOTSUPP) err = 0; gve_dma_free_coherent(&driver_info_mem); return (err); } static int gve_up(struct gve_priv *priv) { if_t ifp = priv->ifp; int err; GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); if (device_is_attached(priv->dev) == 0) { device_printf(priv->dev, "Cannot bring the iface up when detached\n"); return (ENXIO); } if (gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return (0); if_clearhwassist(ifp); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, CSUM_TCP | CSUM_UDP, 0); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, CSUM_IP6_TCP | 
CSUM_IP6_UDP, 0); if (if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, CSUM_IP_TSO, 0); if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); if (gve_is_qpl(priv)) { err = gve_register_qpls(priv); if (err != 0) goto reset; } err = gve_create_rx_rings(priv); if (err != 0) goto reset; err = gve_create_tx_rings(priv); if (err != 0) goto reset; if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); if (!gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(ifp, LINK_STATE_UP); gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } gve_unmask_all_queue_irqs(priv); gve_set_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_up_cnt++; return (0); reset: gve_schedule_reset(priv); return (err); } static void gve_down(struct gve_priv *priv) { GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return; if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); if (gve_destroy_rx_rings(priv) != 0) goto reset; if (gve_destroy_tx_rings(priv) != 0) goto reset; if (gve_is_qpl(priv)) { if (gve_unregister_qpls(priv) != 0) goto reset; } if (gve_is_gqi(priv)) gve_mask_all_queue_irqs(priv); gve_clear_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_down_cnt++; return; reset: gve_schedule_reset(priv); } static int gve_set_mtu(if_t ifp, uint32_t new_mtu) { struct gve_priv *priv = if_getsoftc(ifp); const uint32_t max_problem_range = 8227; const uint32_t min_problem_range = 7822; int err; if ((new_mtu > priv->max_mtu) || (new_mtu < ETHERMIN)) { device_printf(priv->dev, "Invalid new MTU setting. new mtu: %d max mtu: %d min mtu: %d\n", new_mtu, priv->max_mtu, ETHERMIN); return (EINVAL); } /* * When hardware LRO is enabled in DQ mode, MTUs within the range * [7822, 8227] trigger hardware issues which cause a drastic drop * in throughput. 
*/ if (!gve_is_gqi(priv) && !gve_disable_hw_lro && new_mtu >= min_problem_range && new_mtu <= max_problem_range) { device_printf(priv->dev, "Cannot set to MTU to %d within the range [%d, %d] while hardware LRO is enabled\n", new_mtu, min_problem_range, max_problem_range); return (EINVAL); } err = gve_adminq_set_mtu(priv, new_mtu); if (err == 0) { if (bootverbose) device_printf(priv->dev, "MTU set to %d\n", new_mtu); if_setmtu(ifp, new_mtu); } else { device_printf(priv->dev, "Failed to set MTU to %d\n", new_mtu); } return (err); } static void gve_init(void *arg) { struct gve_priv *priv = (struct gve_priv *)arg; if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } static int gve_ioctl(if_t ifp, u_long command, caddr_t data) { struct gve_priv *priv; struct ifreq *ifr; int rc = 0; priv = if_getsoftc(ifp); ifr = (struct ifreq *)data; switch (command) { case SIOCSIFMTU: if (if_getmtu(ifp) == ifr->ifr_mtu) break; GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); gve_set_mtu(ifp, ifr->ifr_mtu); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); break; case SIOCSIFFLAGS: if ((if_getflags(ifp) & IFF_UP) != 0) { if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } else { if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } break; case SIOCSIFCAP: if (ifr->ifr_reqcap == if_getcapenable(ifp)) break; GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); if_setcapenable(ifp, ifr->ifr_reqcap); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); break; case SIOCSIFMEDIA: /* FALLTHROUGH */ case SIOCGIFMEDIA: rc = ifmedia_ioctl(ifp, ifr, &priv->media, command); break; default: rc = ether_ioctl(ifp, command, data); break; } return (rc); } static int gve_media_change(if_t ifp) { struct gve_priv *priv = if_getsoftc(ifp); device_printf(priv->dev, "Media change not supported\n"); return (0); } static void gve_media_status(if_t ifp, struct ifmediareq *ifmr) { struct gve_priv *priv = if_getsoftc(ifp); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_AUTO; } else { ifmr->ifm_active |= IFM_NONE; } GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } static uint64_t gve_get_counter(if_t ifp, ift_counter cnt) { struct gve_priv *priv; uint64_t rpackets = 0; uint64_t tpackets = 0; uint64_t rbytes = 0; uint64_t tbytes = 0; uint64_t rx_dropped_pkt = 0; uint64_t tx_dropped_pkt = 0; priv = if_getsoftc(ifp); gve_accum_stats(priv, &rpackets, &rbytes, &rx_dropped_pkt, &tpackets, &tbytes, &tx_dropped_pkt); switch (cnt) { case IFCOUNTER_IPACKETS: return (rpackets); case IFCOUNTER_OPACKETS: return (tpackets); case IFCOUNTER_IBYTES: return (rbytes); case IFCOUNTER_OBYTES: return (tbytes); case IFCOUNTER_IQDROPS: return (rx_dropped_pkt); case IFCOUNTER_OQDROPS: return (tx_dropped_pkt); default: return (if_get_counter_default(ifp, cnt)); } } static void gve_setup_ifnet(device_t dev, struct gve_priv *priv) { int caps = 0; if_t ifp; ifp = priv->ifp = if_alloc(IFT_ETHER); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setsoftc(ifp, priv); if_setdev(ifp, dev); if_setinitfn(ifp, gve_init); 
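	/*
	 * gve_init and gve_ioctl above serialize interface state changes
	 * under gve_iface_lock before calling gve_up()/gve_down();
	 * gve_xmit_ifp and gve_qflush registered below are the transmit
	 * entry points implemented in gve_tx.c.
	 */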
if_setioctlfn(ifp, gve_ioctl); if_settransmitfn(ifp, gve_xmit_ifp); if_setqflushfn(ifp, gve_qflush); /* * Set TSO limits, must match the arguments to bus_dma_tag_create * when creating tx->dqo.buf_dmatag. Only applies to the RDA mode * because in QPL we copy the entire packet into the bounce buffer * and thus it does not matter how fragmented the mbuf is. */ if (!gve_is_gqi(priv) && !gve_is_qpl(priv)) { if_sethwtsomaxsegcount(ifp, GVE_TX_MAX_DATA_DESCS_DQO); if_sethwtsomaxsegsize(ifp, GVE_TX_MAX_BUF_SIZE_DQO); } if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO); #if __FreeBSD_version >= 1400086 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); #else if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | IFF_KNOWSEPOCH); #endif ifmedia_init(&priv->media, IFM_IMASK, gve_media_change, gve_media_status); if_setgetcounterfn(ifp, gve_get_counter); caps = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6 | IFCAP_TSO | IFCAP_LRO; if ((priv->supported_features & GVE_SUP_JUMBO_FRAMES_MASK) != 0) caps |= IFCAP_JUMBO_MTU; if_setcapabilities(ifp, caps); if_setcapenable(ifp, caps); if (bootverbose) device_printf(priv->dev, "Setting initial MTU to %d\n", priv->max_mtu); if_setmtu(ifp, priv->max_mtu); ether_ifattach(ifp, priv->mac); ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO); } static int gve_alloc_counter_array(struct gve_priv *priv) { int err; err = gve_dma_alloc_coherent(priv, sizeof(uint32_t) * priv->num_event_counters, PAGE_SIZE, &priv->counter_array_mem); if (err != 0) return (err); priv->counters = priv->counter_array_mem.cpu_addr; return (0); } static void gve_free_counter_array(struct gve_priv *priv) { if (priv->counters != NULL) gve_dma_free_coherent(&priv->counter_array_mem); priv->counter_array_mem = (struct gve_dma_handle){}; } static int gve_alloc_irq_db_array(struct gve_priv *priv) { int err; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_irq_db) * (priv->num_queues), PAGE_SIZE, &priv->irqs_db_mem); if (err != 0) return (err); priv->irq_db_indices = priv->irqs_db_mem.cpu_addr; return (0); } static void gve_free_irq_db_array(struct gve_priv *priv) { if (priv->irq_db_indices != NULL) gve_dma_free_coherent(&priv->irqs_db_mem); priv->irqs_db_mem = (struct gve_dma_handle){}; } static void gve_free_rings(struct gve_priv *priv) { gve_free_irqs(priv); gve_free_tx_rings(priv); gve_free_rx_rings(priv); - if (gve_is_qpl(priv)) - gve_free_qpls(priv); } static int gve_alloc_rings(struct gve_priv *priv) { int err; - if (gve_is_qpl(priv)) { - err = gve_alloc_qpls(priv); - if (err != 0) - goto abort; - } - err = gve_alloc_rx_rings(priv); if (err != 0) goto abort; err = gve_alloc_tx_rings(priv); if (err != 0) goto abort; err = gve_alloc_irqs(priv); if (err != 0) goto abort; return (0); abort: gve_free_rings(priv); return (err); } static void gve_deconfigure_and_free_device_resources(struct gve_priv *priv) { int err; if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) { err = gve_adminq_deconfigure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to deconfigure device resources: err=%d\n", err); return; } if (bootverbose) device_printf(priv->dev, "Deconfigured device resources\n"); gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); } gve_free_irq_db_array(priv); gve_free_counter_array(priv); if (priv->ptype_lut_dqo) { free(priv->ptype_lut_dqo, M_GVE); priv->ptype_lut_dqo = NULL; } } static int gve_alloc_and_configure_device_resources(struct gve_priv *priv) { int err; if 
(gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) return (0); err = gve_alloc_counter_array(priv); if (err != 0) return (err); err = gve_alloc_irq_db_array(priv); if (err != 0) goto abort; err = gve_adminq_configure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to configure device resources: err=%d\n", err); err = (ENXIO); goto abort; } if (!gve_is_gqi(priv)) { priv->ptype_lut_dqo = malloc(sizeof(*priv->ptype_lut_dqo), M_GVE, M_WAITOK | M_ZERO); err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); if (err != 0) { device_printf(priv->dev, "Failed to configure ptype lut: err=%d\n", err); goto abort; } } gve_set_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); if (bootverbose) device_printf(priv->dev, "Configured device resources\n"); return (0); abort: gve_deconfigure_and_free_device_resources(priv); return (err); } static void gve_set_queue_cnts(struct gve_priv *priv) { priv->tx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_TX_QUEUES); priv->rx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_RX_QUEUES); priv->tx_cfg.num_queues = priv->tx_cfg.max_queues; priv->rx_cfg.num_queues = priv->rx_cfg.max_queues; if (priv->default_num_queues > 0) { priv->tx_cfg.num_queues = MIN(priv->default_num_queues, priv->tx_cfg.num_queues); priv->rx_cfg.num_queues = MIN(priv->default_num_queues, priv->rx_cfg.num_queues); } priv->num_queues = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues; priv->mgmt_msix_idx = priv->num_queues; } static int gve_alloc_adminq_and_describe_device(struct gve_priv *priv) { int err; if ((err = gve_adminq_alloc(priv)) != 0) return (err); if ((err = gve_verify_driver_compatibility(priv)) != 0) { device_printf(priv->dev, "Failed to verify driver compatibility: err=%d\n", err); goto abort; } if ((err = gve_adminq_describe_device(priv)) != 0) goto abort; gve_set_queue_cnts(priv); priv->num_registered_pages = 0; return (0); abort: gve_release_adminq(priv); return (err); } void gve_schedule_reset(struct gve_priv *priv) { if (gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) return; device_printf(priv->dev, "Scheduling reset task!\n"); gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); taskqueue_enqueue(priv->service_tq, &priv->service_task); } static void gve_destroy(struct gve_priv *priv) { gve_down(priv); gve_deconfigure_and_free_device_resources(priv); gve_release_adminq(priv); } static void gve_restore(struct gve_priv *priv) { int err; err = gve_adminq_alloc(priv); if (err != 0) goto abort; err = gve_adminq_configure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to configure device resources: err=%d\n", err); err = (ENXIO); goto abort; } if (!gve_is_gqi(priv)) { err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); if (err != 0) { device_printf(priv->dev, "Failed to configure ptype lut: err=%d\n", err); goto abort; } } err = gve_up(priv); if (err != 0) goto abort; return; abort: device_printf(priv->dev, "Restore failed!\n"); return; } static void gve_clear_device_resources(struct gve_priv *priv) { int i; for (i = 0; i < priv->num_event_counters; i++) priv->counters[i] = 0; bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map, BUS_DMASYNC_PREWRITE); for (i = 0; i < priv->num_queues; i++) priv->irq_db_indices[i] = (struct gve_irq_db){}; bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, BUS_DMASYNC_PREWRITE); if (priv->ptype_lut_dqo) *priv->ptype_lut_dqo = (struct gve_ptype_lut){0}; } static void gve_handle_reset(struct gve_priv *priv) { if (!gve_get_state_flag(priv, 
GVE_STATE_FLAG_DO_RESET)) return; gve_clear_state_flag(priv, GVE_STATE_FLAG_DO_RESET); gve_set_state_flag(priv, GVE_STATE_FLAG_IN_RESET); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); /* * Releasing the adminq causes the NIC to destroy all resources * registered with it, so by clearing the flags beneath we cause * the subsequent gve_down call below to not attempt to tell the * NIC to destroy these resources again. * * The call to gve_down is needed in the first place to refresh * the state and the DMA-able memory within each driver ring. */ gve_release_adminq(priv); gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); gve_down(priv); gve_clear_device_resources(priv); gve_restore(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); priv->reset_cnt++; gve_clear_state_flag(priv, GVE_STATE_FLAG_IN_RESET); } static void gve_handle_link_status(struct gve_priv *priv) { uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); bool link_up = status & GVE_DEVICE_STATUS_LINK_STATUS; if (link_up == gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) return; if (link_up) { if (bootverbose) device_printf(priv->dev, "Device link is up.\n"); if_link_state_change(priv->ifp, LINK_STATE_UP); gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } else { device_printf(priv->dev, "Device link is down.\n"); if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } } static void gve_service_task(void *arg, int pending) { struct gve_priv *priv = (struct gve_priv *)arg; uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); if (((GVE_DEVICE_STATUS_RESET_MASK & status) != 0) && !gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) { device_printf(priv->dev, "Device requested reset\n"); gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); } gve_handle_reset(priv); gve_handle_link_status(priv); } static int gve_probe(device_t dev) { uint16_t deviceid, vendorid; int i; vendorid = pci_get_vendor(dev); deviceid = pci_get_device(dev); for (i = 0; i < nitems(gve_devs); i++) { if (vendorid == gve_devs[i].vendor_id && deviceid == gve_devs[i].device_id) { device_set_desc(dev, gve_devs[i].name); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static void gve_free_sys_res_mem(struct gve_priv *priv) { if (priv->msix_table != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->msix_table), priv->msix_table); if (priv->db_bar != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->db_bar), priv->db_bar); if (priv->reg_bar != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->reg_bar), priv->reg_bar); } static int gve_attach(device_t dev) { struct gve_priv *priv; int rid; int err; snprintf(gve_version, sizeof(gve_version), "%d.%d.%d", GVE_VERSION_MAJOR, GVE_VERSION_MINOR, GVE_VERSION_SUB); priv = device_get_softc(dev); priv->dev = dev; GVE_IFACE_LOCK_INIT(priv->gve_iface_lock); pci_enable_busmaster(dev); rid = PCIR_BAR(GVE_REGISTER_BAR); priv->reg_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->reg_bar == NULL) { device_printf(dev, "Failed to allocate BAR0\n"); err = ENXIO; goto abort; } rid = PCIR_BAR(GVE_DOORBELL_BAR); priv->db_bar = bus_alloc_resource_any(dev, 
SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->db_bar == NULL) { device_printf(dev, "Failed to allocate BAR2\n"); err = ENXIO; goto abort; } rid = pci_msix_table_bar(priv->dev); priv->msix_table = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->msix_table == NULL) { device_printf(dev, "Failed to allocate msix table\n"); err = ENXIO; goto abort; } err = gve_alloc_adminq_and_describe_device(priv); if (err != 0) goto abort; err = gve_alloc_and_configure_device_resources(priv); if (err != 0) goto abort; err = gve_alloc_rings(priv); if (err != 0) goto abort; gve_setup_ifnet(dev, priv); priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK; bus_write_multi_1(priv->reg_bar, DRIVER_VERSION, GVE_DRIVER_VERSION, sizeof(GVE_DRIVER_VERSION) - 1); TASK_INIT(&priv->service_task, 0, gve_service_task, priv); priv->service_tq = taskqueue_create("gve service", M_WAITOK | M_ZERO, taskqueue_thread_enqueue, &priv->service_tq); taskqueue_start_threads(&priv->service_tq, 1, PI_NET, "%s service tq", device_get_nameunit(priv->dev)); gve_setup_sysctl(priv); if (bootverbose) device_printf(priv->dev, "Successfully attached %s", GVE_DRIVER_VERSION); return (0); abort: gve_free_rings(priv); gve_deconfigure_and_free_device_resources(priv); gve_release_adminq(priv); gve_free_sys_res_mem(priv); GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); return (err); } static int gve_detach(device_t dev) { struct gve_priv *priv = device_get_softc(dev); if_t ifp = priv->ifp; int error; error = bus_generic_detach(dev); if (error != 0) return (error); ether_ifdetach(ifp); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_destroy(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); gve_free_rings(priv); gve_free_sys_res_mem(priv); GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); while (taskqueue_cancel(priv->service_tq, &priv->service_task, NULL)) taskqueue_drain(priv->service_tq, &priv->service_task); taskqueue_free(priv->service_tq); if_free(ifp); return (0); } static device_method_t gve_methods[] = { DEVMETHOD(device_probe, gve_probe), DEVMETHOD(device_attach, gve_attach), DEVMETHOD(device_detach, gve_detach), DEVMETHOD_END }; static driver_t gve_driver = { "gve", gve_methods, sizeof(struct gve_priv) }; #if __FreeBSD_version < 1301503 static devclass_t gve_devclass; DRIVER_MODULE(gve, pci, gve_driver, gve_devclass, 0, 0); #else DRIVER_MODULE(gve, pci, gve_driver, 0, 0); #endif MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, gve, gve_devs, nitems(gve_devs)); diff --git a/sys/dev/gve/gve_qpl.c b/sys/dev/gve/gve_qpl.c index 1fcc2b5365c9..0e7098dcd4a1 100644 --- a/sys/dev/gve/gve_qpl.c +++ b/sys/dev/gve/gve_qpl.c @@ -1,309 +1,259 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" static MALLOC_DEFINE(M_GVE_QPL, "gve qpl", "gve qpl allocations"); -static uint32_t -gve_num_tx_qpls(struct gve_priv *priv) -{ - if (!gve_is_qpl(priv)) - return (0); - - return (priv->tx_cfg.max_queues); -} - -static uint32_t -gve_num_rx_qpls(struct gve_priv *priv) -{ - if (!gve_is_qpl(priv)) - return (0); - - return (priv->rx_cfg.max_queues); -} - -static void -gve_free_qpl(struct gve_priv *priv, uint32_t id) +void +gve_free_qpl(struct gve_priv *priv, struct gve_queue_page_list *qpl) { - struct gve_queue_page_list *qpl = &priv->qpls[id]; int i; for (i = 0; i < qpl->num_dmas; i++) { gve_dmamap_destroy(&qpl->dmas[i]); } if (qpl->kva) { pmap_qremove(qpl->kva, qpl->num_pages); kva_free(qpl->kva, PAGE_SIZE * qpl->num_pages); } for (i = 0; i < qpl->num_pages; i++) { /* * Free the page only if this is the last ref. * Tx pages are known to have no other refs at * this point, but Rx pages might still be in * use by the networking stack, see gve_mextadd_free. 
*/ if (vm_page_unwire_noq(qpl->pages[i])) { if (!qpl->kva) { pmap_qremove((vm_offset_t)qpl->dmas[i].cpu_addr, 1); kva_free((vm_offset_t)qpl->dmas[i].cpu_addr, PAGE_SIZE); } vm_page_free(qpl->pages[i]); } priv->num_registered_pages--; } if (qpl->pages != NULL) free(qpl->pages, M_GVE_QPL); if (qpl->dmas != NULL) free(qpl->dmas, M_GVE_QPL); + + free(qpl, M_GVE_QPL); } -static int +struct gve_queue_page_list * gve_alloc_qpl(struct gve_priv *priv, uint32_t id, int npages, bool single_kva) { - struct gve_queue_page_list *qpl = &priv->qpls[id]; + struct gve_queue_page_list *qpl; int err; int i; if (npages + priv->num_registered_pages > priv->max_registered_pages) { device_printf(priv->dev, "Reached max number of registered pages %ju > %ju\n", (uintmax_t)npages + priv->num_registered_pages, (uintmax_t)priv->max_registered_pages); - return (EINVAL); + return (NULL); } + qpl = malloc(sizeof(struct gve_queue_page_list), M_GVE_QPL, + M_WAITOK | M_ZERO); + qpl->id = id; qpl->num_pages = 0; qpl->num_dmas = 0; qpl->dmas = malloc(npages * sizeof(*qpl->dmas), M_GVE_QPL, M_WAITOK | M_ZERO); qpl->pages = malloc(npages * sizeof(*qpl->pages), M_GVE_QPL, M_WAITOK | M_ZERO); qpl->kva = 0; if (single_kva) { qpl->kva = kva_alloc(PAGE_SIZE * npages); if (!qpl->kva) { device_printf(priv->dev, "Failed to create the single kva for QPL %d\n", id); err = ENOMEM; goto abort; } } for (i = 0; i < npages; i++) { qpl->pages[i] = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_WAITOK | VM_ALLOC_ZERO); if (!single_kva) { qpl->dmas[i].cpu_addr = (void *)kva_alloc(PAGE_SIZE); if (!qpl->dmas[i].cpu_addr) { device_printf(priv->dev, "Failed to create kva for page %d in QPL %d", i, id); err = ENOMEM; goto abort; } pmap_qenter((vm_offset_t)qpl->dmas[i].cpu_addr, &(qpl->pages[i]), 1); } else qpl->dmas[i].cpu_addr = (void *)(qpl->kva + (PAGE_SIZE * i)); qpl->num_pages++; } if (single_kva) pmap_qenter(qpl->kva, qpl->pages, npages); for (i = 0; i < npages; i++) { err = gve_dmamap_create(priv, /*size=*/PAGE_SIZE, /*align=*/PAGE_SIZE, &qpl->dmas[i]); if (err != 0) { device_printf(priv->dev, "Failed to dma-map page %d in QPL %d\n", i, id); goto abort; } qpl->num_dmas++; priv->num_registered_pages++; } - return (0); + return (qpl); abort: - gve_free_qpl(priv, id); - return (err); + gve_free_qpl(priv, qpl); + return (NULL); } -void -gve_free_qpls(struct gve_priv *priv) -{ - int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); - int i; - - if (num_qpls == 0) - return; - - if (priv->qpls != NULL) { - for (i = 0; i < num_qpls; i++) - gve_free_qpl(priv, i); - free(priv->qpls, M_GVE_QPL); - priv->qpls = NULL; - } -} - -int gve_alloc_qpls(struct gve_priv *priv) +int +gve_register_qpls(struct gve_priv *priv) { - int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); - int num_pages; + struct gve_ring_com *com; + struct gve_tx_ring *tx; + struct gve_rx_ring *rx; int err; int i; - if (num_qpls == 0) + if (gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK)) return (0); - priv->qpls = malloc(num_qpls * sizeof(*priv->qpls), M_GVE_QPL, - M_WAITOK | M_ZERO); - - num_pages = gve_is_gqi(priv) ? - priv->tx_desc_cnt / GVE_QPL_DIVISOR : - GVE_TX_NUM_QPL_PAGES_DQO; - for (i = 0; i < gve_num_tx_qpls(priv); i++) { - err = gve_alloc_qpl(priv, i, num_pages, - /*single_kva=*/true); - if (err != 0) - goto abort; - } - - num_pages = gve_is_gqi(priv) ? 
priv->rx_desc_cnt : GVE_RX_NUM_QPL_PAGES_DQO; - for (; i < num_qpls; i++) { - err = gve_alloc_qpl(priv, i, num_pages, /*single_kva=*/false); - if (err != 0) - goto abort; - } - - return (0); - -abort: - gve_free_qpls(priv); - return (err); -} - -static int -gve_unregister_n_qpls(struct gve_priv *priv, int n) -{ - int err; - int i; - - for (i = 0; i < n; i++) { - err = gve_adminq_unregister_page_list(priv, priv->qpls[i].id); + /* Register TX qpls */ + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + tx = &priv->tx[i]; + com = &tx->com; + err = gve_adminq_register_page_list(priv, com->qpl); if (err != 0) { device_printf(priv->dev, - "Failed to unregister qpl %d, err: %d\n", - priv->qpls[i].id, err); + "Failed to register qpl %d, err: %d\n", + com->qpl->id, err); + /* Caller schedules a reset when this fails */ + return (err); } } - if (err != 0) - return (err); - - return (0); -} - -int -gve_register_qpls(struct gve_priv *priv) -{ - int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); - int err; - int i; - - if (gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK)) - return (0); - - for (i = 0; i < num_qpls; i++) { - err = gve_adminq_register_page_list(priv, &priv->qpls[i]); + /* Register RX qpls */ + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + rx = &priv->rx[i]; + com = &rx->com; + err = gve_adminq_register_page_list(priv, com->qpl); if (err != 0) { device_printf(priv->dev, "Failed to register qpl %d, err: %d\n", - priv->qpls[i].id, err); - goto abort; + com->qpl->id, err); + /* Caller schedules a reset when this fails */ + return (err); } } - gve_set_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); return (0); - -abort: - gve_unregister_n_qpls(priv, i); - return (err); } int gve_unregister_qpls(struct gve_priv *priv) { - int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); int err; + int i; + struct gve_ring_com *com; + struct gve_tx_ring *tx; + struct gve_rx_ring *rx; if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK)) return (0); - err = gve_unregister_n_qpls(priv, num_qpls); + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + tx = &priv->tx[i]; + com = &tx->com; + err = gve_adminq_unregister_page_list(priv, com->qpl->id); + if (err != 0) { + device_printf(priv->dev, + "Failed to unregister qpl %d, err: %d\n", + com->qpl->id, err); + } + } + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + rx = &priv->rx[i]; + com = &rx->com; + err = gve_adminq_unregister_page_list(priv, com->qpl->id); + if (err != 0) { + device_printf(priv->dev, + "Failed to unregister qpl %d, err: %d\n", + com->qpl->id, err); + } + } + if (err != 0) return (err); gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); return (0); } void gve_mextadd_free(struct mbuf *mbuf) { vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1; vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2; /* * Free the page only if this is the last ref. * The interface might no longer exist by the time * this callback is called, see gve_free_qpl. */ if (__predict_false(vm_page_unwire_noq(page))) { pmap_qremove(va, 1); kva_free(va, PAGE_SIZE); vm_page_free(page); } } diff --git a/sys/dev/gve/gve_rx.c b/sys/dev/gve/gve_rx.c index e540ad6f4c11..e1a228c0e69c 100644 --- a/sys/dev/gve/gve_rx.c +++ b/sys/dev/gve/gve_rx.c @@ -1,715 +1,724 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" static void gve_rx_free_ring_gqi(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; if (rx->page_info != NULL) { free(rx->page_info, M_GVE); rx->page_info = NULL; } if (rx->data_ring != NULL) { gve_dma_free_coherent(&rx->data_ring_mem); rx->data_ring = NULL; } if (rx->desc_ring != NULL) { gve_dma_free_coherent(&rx->desc_ring_mem); rx->desc_ring = NULL; } + + if (com->qpl != NULL) { + gve_free_qpl(priv, com->qpl); + com->qpl = NULL; + } } static void gve_rx_free_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; /* Safe to call even if never allocated */ gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); if (gve_is_gqi(priv)) gve_rx_free_ring_gqi(priv, i); else gve_rx_free_ring_dqo(priv, i); if (com->q_resources != NULL) { gve_dma_free_coherent(&com->q_resources_mem); com->q_resources = NULL; } } static void gve_prefill_rx_slots(struct gve_rx_ring *rx) { struct gve_ring_com *com = &rx->com; struct gve_dma_handle *dma; int i; for (i = 0; i < com->priv->rx_desc_cnt; i++) { rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i); rx->page_info[i].page_offset = 0; rx->page_info[i].page_address = com->qpl->dmas[i].cpu_addr; rx->page_info[i].page = com->qpl->pages[i]; dma = &com->qpl->dmas[i]; bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREREAD); } bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, BUS_DMASYNC_PREWRITE); } static int gve_rx_alloc_ring_gqi(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; int err; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_rx_desc) * priv->rx_desc_cnt, CACHE_LINE_SIZE, &rx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for rx ring %d", i); goto abort; } rx->mask = priv->rx_pages_per_qpl - 1; rx->desc_ring = rx->desc_ring_mem.cpu_addr; - com->qpl = &priv->qpls[priv->tx_cfg.max_queues + i]; + com->qpl = gve_alloc_qpl(priv, i + priv->tx_cfg.max_queues, + priv->rx_desc_cnt, /*single_kva=*/false); if (com->qpl == NULL) { - device_printf(priv->dev, "No QPL left for rx 
ring %d", i); - return (ENOMEM); + device_printf(priv->dev, + "Failed to alloc QPL for rx ring %d", i); + err = ENOMEM; + goto abort; } rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info), M_GVE, M_WAITOK | M_ZERO); err = gve_dma_alloc_coherent(priv, sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt, CACHE_LINE_SIZE, &rx->data_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc data ring for rx ring %d", i); goto abort; } rx->data_ring = rx->data_ring_mem.cpu_addr; gve_prefill_rx_slots(rx); return (0); abort: gve_rx_free_ring_gqi(priv, i); return (err); } static int gve_rx_alloc_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; int err; com->priv = priv; com->id = i; gve_alloc_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), PAGE_SIZE, &com->q_resources_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc queue resources for rx ring %d", i); goto abort; } com->q_resources = com->q_resources_mem.cpu_addr; if (gve_is_gqi(priv)) err = gve_rx_alloc_ring_gqi(priv, i); else err = gve_rx_alloc_ring_dqo(priv, i); if (err != 0) goto abort; return (0); abort: gve_rx_free_ring(priv, i); return (err); } int gve_alloc_rx_rings(struct gve_priv *priv) { int err = 0; int i; priv->rx = malloc(sizeof(struct gve_rx_ring) * priv->rx_cfg.num_queues, M_GVE, M_WAITOK | M_ZERO); for (i = 0; i < priv->rx_cfg.num_queues; i++) { err = gve_rx_alloc_ring(priv, i); if (err != 0) goto free_rings; } return (0); free_rings: while (i--) gve_rx_free_ring(priv, i); free(priv->rx, M_GVE); return (err); } void gve_free_rx_rings(struct gve_priv *priv) { int i; for (i = 0; i < priv->rx_cfg.num_queues; i++) gve_rx_free_ring(priv, i); free(priv->rx, M_GVE); } static void gve_rx_clear_data_ring(struct gve_rx_ring *rx) { struct gve_priv *priv = rx->com.priv; int i; /* * The Rx data ring has this invariant: "the networking stack is not * using the buffer beginning at any page_offset". This invariant is * established initially by gve_prefill_rx_slots at alloc-time and is * maintained by the cleanup taskqueue. This invariant implies that the * ring can be considered to be fully posted with buffers at this point, * even if there are unfreed mbufs still being processed, which is why we * can fill the ring without waiting on can_flip at each slot to become true. 
*/ for (i = 0; i < priv->rx_desc_cnt; i++) { rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i + rx->page_info[i].page_offset); rx->fill_cnt++; } bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_rx_clear_desc_ring(struct gve_rx_ring *rx) { struct gve_priv *priv = rx->com.priv; int i; for (i = 0; i < priv->rx_desc_cnt; i++) rx->desc_ring[i] = (struct gve_rx_desc){}; bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_clear_rx_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; if (!gve_is_gqi(priv)) { gve_clear_rx_ring_dqo(priv, i); return; } rx->seq_no = 1; rx->cnt = 0; rx->fill_cnt = 0; rx->mask = priv->rx_desc_cnt - 1; gve_rx_clear_desc_ring(rx); gve_rx_clear_data_ring(rx); } static void gve_start_rx_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; if ((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) { if (tcp_lro_init(&rx->lro) != 0) device_printf(priv->dev, "Failed to init lro for rx ring %d", i); rx->lro.ifp = priv->ifp; } if (gve_is_gqi(priv)) NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq, rx); else NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq_dqo, rx); com->cleanup_tq = taskqueue_create_fast("gve rx", M_WAITOK, taskqueue_thread_enqueue, &com->cleanup_tq); taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s rxq %d", device_get_nameunit(priv->dev), i); if (gve_is_gqi(priv)) { /* GQ RX bufs are prefilled at ring alloc time */ gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt); } else gve_rx_prefill_buffers_dqo(rx); } int gve_create_rx_rings(struct gve_priv *priv) { struct gve_ring_com *com; struct gve_rx_ring *rx; int err; int i; if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK)) return (0); for (i = 0; i < priv->rx_cfg.num_queues; i++) gve_clear_rx_ring(priv, i); err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues); if (err != 0) return (err); bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, BUS_DMASYNC_POSTREAD); for (i = 0; i < priv->rx_cfg.num_queues; i++) { rx = &priv->rx[i]; com = &rx->com; com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index); bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map, BUS_DMASYNC_POSTREAD); com->db_offset = 4 * be32toh(com->q_resources->db_index); com->counter_idx = be32toh(com->q_resources->counter_index); gve_start_rx_ring(priv, i); } gve_set_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); return (0); } static void gve_stop_rx_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; if (com->cleanup_tq != NULL) { taskqueue_quiesce(com->cleanup_tq); taskqueue_free(com->cleanup_tq); com->cleanup_tq = NULL; } tcp_lro_free(&rx->lro); rx->ctx = (struct gve_rx_ctx){}; } int gve_destroy_rx_rings(struct gve_priv *priv) { int err; int i; for (i = 0; i < priv->rx_cfg.num_queues; i++) gve_stop_rx_ring(priv, i); if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK)) { err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues); if (err != 0) return (err); gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); } return (0); } int gve_rx_intr(void *arg) { struct gve_rx_ring *rx = arg; struct gve_priv *priv = rx->com.priv; struct gve_ring_com *com = &rx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK); 
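/* The interrupt is now masked; the cleanup taskqueue re-arms it by writing GVE_IRQ_ACK | GVE_IRQ_EVENT once it has drained the ring (see gve_rx_cleanup_tq). */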
taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); return (FILTER_HANDLED); } static inline void gve_set_rss_type(__be16 flag, struct mbuf *mbuf) { if ((flag & GVE_RXF_IPV4) != 0) { if ((flag & GVE_RXF_TCP) != 0) M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4); else if ((flag & GVE_RXF_UDP) != 0) M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4); else M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4); return; } if ((flag & GVE_RXF_IPV6) != 0) { if ((flag & GVE_RXF_TCP) != 0) M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6); else if ((flag & GVE_RXF_UDP) != 0) M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6); else M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6); return; } } static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr) { const __be64 offset = htobe64(GVE_DEFAULT_RX_BUFFER_OFFSET); page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET; *(slot_addr) ^= offset; } static struct mbuf * gve_rx_create_mbuf(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info, uint16_t len, union gve_rx_data_slot *data_slot, bool is_only_frag) { struct gve_rx_ctx *ctx = &rx->ctx; struct mbuf *mbuf; u_int ref_count; bool can_flip; uint32_t offset = page_info->page_offset + page_info->pad; void *va = (char *)page_info->page_address + offset; if (len <= priv->rx_copybreak && is_only_frag) { mbuf = m_get2(len, M_NOWAIT, MT_DATA, M_PKTHDR); if (__predict_false(mbuf == NULL)) return (NULL); m_copyback(mbuf, 0, len, va); counter_enter(); counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1); counter_exit(); ctx->mbuf_head = mbuf; ctx->mbuf_tail = mbuf; } else { struct mbuf *mbuf_tail = ctx->mbuf_tail; KASSERT(len <= MCLBYTES, ("gve rx fragment bigger than cluster mbuf")); /* * This page was created with VM_ALLOC_WIRED, thus the lowest * wire count experienced by the page until the interface is * destroyed is 1. * * We wire the page again before supplying an mbuf pointing to * it to the networking stack, so before the mbuf leaves the * driver, the wire count rises to 2. * * If it is 1 again, it necessarily means that the mbuf has been * consumed and it was gve_mextadd_free that brought down the wire * count back to 1. We only need to eventually observe the 1. */ ref_count = atomic_load_int(&page_info->page->ref_count); can_flip = VPRC_WIRE_COUNT(ref_count) == 1; if (mbuf_tail == NULL) { if (can_flip) mbuf = m_gethdr(M_NOWAIT, MT_DATA); else mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); ctx->mbuf_head = mbuf; ctx->mbuf_tail = mbuf; } else { if (can_flip) mbuf = m_get(M_NOWAIT, MT_DATA); else mbuf = m_getcl(M_NOWAIT, MT_DATA, 0); mbuf_tail->m_next = mbuf; ctx->mbuf_tail = mbuf; } if (__predict_false(mbuf == NULL)) return (NULL); if (can_flip) { MEXTADD(mbuf, va, len, gve_mextadd_free, page_info->page, page_info->page_address, 0, EXT_NET_DRV); counter_enter(); counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1); counter_exit(); /* * Grab an extra ref to the page so that gve_mextadd_free * does not end up freeing the page while the interface exists. 
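* gve_mextadd_free drops that wire when the stack finally frees the mbuf, which is what lets the wire count return to 1 so this half of the page can be flipped again.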
*/ vm_page_wire(page_info->page); gve_rx_flip_buff(page_info, &data_slot->qpl_offset); } else { m_copyback(mbuf, 0, len, va); counter_enter(); counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1); counter_exit(); } } mbuf->m_len = len; ctx->total_size += len; return (mbuf); } static inline bool gve_needs_rss(__be16 flag) { if ((flag & GVE_RXF_FRAG) != 0) return (false); if ((flag & (GVE_RXF_IPV4 | GVE_RXF_IPV6)) != 0) return (true); return (false); } static void gve_rx(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_desc *desc, uint32_t idx) { struct gve_rx_slot_page_info *page_info; struct gve_dma_handle *page_dma_handle; union gve_rx_data_slot *data_slot; struct gve_rx_ctx *ctx = &rx->ctx; struct mbuf *mbuf = NULL; if_t ifp = priv->ifp; bool do_if_input; uint16_t len; bool is_first_frag = ctx->frag_cnt == 0; bool is_last_frag = !(GVE_RXF_PKT_CONT & desc->flags_seq); bool is_only_frag = is_first_frag && is_last_frag; if (__predict_false(ctx->drop_pkt)) goto finish_frag; if ((desc->flags_seq & GVE_RXF_ERR) != 0) { ctx->drop_pkt = true; counter_enter(); counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); counter_exit(); m_freem(ctx->mbuf_head); goto finish_frag; } page_info = &rx->page_info[idx]; data_slot = &rx->data_ring[idx]; page_dma_handle = &(rx->com.qpl->dmas[idx]); page_info->pad = is_first_frag ? GVE_RX_PAD : 0; len = be16toh(desc->len) - page_info->pad; bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, BUS_DMASYNC_POSTREAD); mbuf = gve_rx_create_mbuf(priv, rx, page_info, len, data_slot, is_only_frag); if (mbuf == NULL) { ctx->drop_pkt = true; counter_enter(); counter_u64_add_protected(rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); counter_exit(); m_freem(ctx->mbuf_head); goto finish_frag; } if (is_first_frag) { mbuf->m_pkthdr.rcvif = priv->ifp; ctx->is_tcp = desc->flags_seq & GVE_RXF_TCP; if (gve_needs_rss(desc->flags_seq)) { gve_set_rss_type(desc->flags_seq, mbuf); mbuf->m_pkthdr.flowid = be32toh(desc->rss_hash); } if ((desc->csum != 0) && ((desc->flags_seq & GVE_RXF_FRAG) == 0)) { mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; mbuf->m_pkthdr.csum_data = 0xffff; } } if (is_last_frag) { mbuf = ctx->mbuf_head; mbuf->m_pkthdr.len = ctx->total_size; do_if_input = true; if (((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) && /* LRO is enabled */ (ctx->is_tcp) && /* pkt is a TCP pkt */ ((mbuf->m_pkthdr.csum_flags & CSUM_DATA_VALID) != 0) && /* NIC verified csum */ (rx->lro.lro_cnt != 0) && /* LRO resources exist */ (tcp_lro_rx(&rx->lro, mbuf, 0) == 0)) do_if_input = false; if (do_if_input) if_input(ifp, mbuf); counter_enter(); counter_u64_add_protected(rx->stats.rbytes, ctx->total_size); counter_u64_add_protected(rx->stats.rpackets, 1); counter_exit(); } finish_frag: ctx->frag_cnt++; if (is_last_frag) rx->ctx = (struct gve_rx_ctx){}; } static bool gve_rx_work_pending(struct gve_rx_ring *rx) { struct gve_rx_desc *desc; __be16 flags_seq; uint32_t next_idx; next_idx = rx->cnt & rx->mask; desc = rx->desc_ring + next_idx; flags_seq = desc->flags_seq; return (GVE_SEQNO(flags_seq) == rx->seq_no); } static inline uint8_t gve_next_seqno(uint8_t seq) { return ((seq + 1) == 8 ? 
1 : seq + 1); } static void gve_rx_cleanup(struct gve_priv *priv, struct gve_rx_ring *rx, int budget) { uint32_t idx = rx->cnt & rx->mask; struct gve_rx_desc *desc; struct gve_rx_ctx *ctx = &rx->ctx; uint32_t work_done = 0; NET_EPOCH_ASSERT(); bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, BUS_DMASYNC_POSTREAD); desc = &rx->desc_ring[idx]; while ((work_done < budget || ctx->frag_cnt) && (GVE_SEQNO(desc->flags_seq) == rx->seq_no)) { gve_rx(priv, rx, desc, idx); rx->cnt++; idx = rx->cnt & rx->mask; desc = &rx->desc_ring[idx]; rx->seq_no = gve_next_seqno(rx->seq_no); work_done++; } /* The device will only send whole packets. */ if (__predict_false(ctx->frag_cnt)) { m_freem(ctx->mbuf_head); rx->ctx = (struct gve_rx_ctx){}; device_printf(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset", GVE_SEQNO(desc->flags_seq), rx->seq_no); gve_schedule_reset(priv); } if (work_done != 0) tcp_lro_flush_all(&rx->lro); bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, BUS_DMASYNC_PREWRITE); /* Buffers are refilled as the descs are processed */ rx->fill_cnt += work_done; gve_db_bar_write_4(priv, rx->com.db_offset, rx->fill_cnt); } void gve_rx_cleanup_tq(void *arg, int pending) { struct gve_rx_ring *rx = arg; struct gve_priv *priv = rx->com.priv; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; gve_rx_cleanup(priv, rx, /*budget=*/128); gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_ACK | GVE_IRQ_EVENT); /* * Fragments received before this barrier MAY NOT cause the NIC to send an * interrupt but they will still be handled by the enqueue below. * Fragments received after the barrier WILL trigger an interrupt. */ atomic_thread_fence_seq_cst(); if (gve_rx_work_pending(rx)) { gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); } } diff --git a/sys/dev/gve/gve_rx_dqo.c b/sys/dev/gve/gve_rx_dqo.c index 6ce9ddd887d0..a499ac9d3c6a 100644 --- a/sys/dev/gve/gve_rx_dqo.c +++ b/sys/dev/gve/gve_rx_dqo.c @@ -1,1012 +1,1021 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" static void gve_free_rx_mbufs_dqo(struct gve_rx_ring *rx) { struct gve_rx_buf_dqo *buf; int i; if (gve_is_qpl(rx->com.priv)) return; for (i = 0; i < rx->dqo.buf_cnt; i++) { buf = &rx->dqo.bufs[i]; if (!buf->mbuf) continue; bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap, BUS_DMASYNC_POSTREAD); bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap); m_freem(buf->mbuf); buf->mbuf = NULL; } } void gve_rx_free_ring_dqo(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; int j; if (rx->dqo.compl_ring != NULL) { gve_dma_free_coherent(&rx->dqo.compl_ring_mem); rx->dqo.compl_ring = NULL; } if (rx->dqo.desc_ring != NULL) { gve_dma_free_coherent(&rx->desc_ring_mem); rx->dqo.desc_ring = NULL; } if (rx->dqo.bufs != NULL) { gve_free_rx_mbufs_dqo(rx); if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) { for (j = 0; j < rx->dqo.buf_cnt; j++) if (rx->dqo.bufs[j].mapped) bus_dmamap_destroy(rx->dqo.buf_dmatag, rx->dqo.bufs[j].dmamap); } free(rx->dqo.bufs, M_GVE); rx->dqo.bufs = NULL; } if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) bus_dma_tag_destroy(rx->dqo.buf_dmatag); + + if (com->qpl != NULL) { + gve_free_qpl(priv, com->qpl); + com->qpl = NULL; + } } int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; int err; int j; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_rx_desc_dqo) * priv->rx_desc_cnt, CACHE_LINE_SIZE, &rx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for rx ring %d", i); goto abort; } rx->dqo.desc_ring = rx->desc_ring_mem.cpu_addr; rx->dqo.mask = priv->rx_desc_cnt - 1; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt, CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc compl ring for rx ring %d", i); goto abort; } rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr; rx->dqo.mask = priv->rx_desc_cnt - 1; rx->dqo.buf_cnt = gve_is_qpl(priv) ? 
GVE_RX_NUM_QPL_PAGES_DQO : priv->rx_desc_cnt; rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo), M_GVE, M_WAITOK | M_ZERO); if (gve_is_qpl(priv)) { - rx->com.qpl = &priv->qpls[priv->tx_cfg.max_queues + i]; + rx->com.qpl = gve_alloc_qpl(priv, i + priv->tx_cfg.max_queues, + GVE_RX_NUM_QPL_PAGES_DQO, /*single_kva=*/false); if (rx->com.qpl == NULL) { - device_printf(priv->dev, "No QPL left for rx ring %d", i); - return (ENOMEM); + device_printf(priv->dev, + "Failed to alloc QPL for rx ring %d", i); + err = ENOMEM; + goto abort; } return (0); } err = bus_dma_tag_create( bus_get_dma_tag(priv->dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ MCLBYTES, /* maxsize */ 1, /* nsegments */ MCLBYTES, /* maxsegsize */ 0, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &rx->dqo.buf_dmatag); if (err != 0) { device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); goto abort; } for (j = 0; j < rx->dqo.buf_cnt; j++) { err = bus_dmamap_create(rx->dqo.buf_dmatag, 0, &rx->dqo.bufs[j].dmamap); if (err != 0) { device_printf(priv->dev, "err in creating rx buf dmamap %d: %d", j, err); goto abort; } rx->dqo.bufs[j].mapped = true; } return (0); abort: gve_rx_free_ring_dqo(priv, i); return (err); } static void gve_rx_clear_desc_ring_dqo(struct gve_rx_ring *rx) { struct gve_ring_com *com = &rx->com; int entries; int i; entries = com->priv->rx_desc_cnt; for (i = 0; i < entries; i++) rx->dqo.desc_ring[i] = (struct gve_rx_desc_dqo){}; bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_rx_clear_compl_ring_dqo(struct gve_rx_ring *rx) { struct gve_ring_com *com = &rx->com; int i; for (i = 0; i < com->priv->rx_desc_cnt; i++) rx->dqo.compl_ring[i] = (struct gve_rx_compl_desc_dqo){}; bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, rx->dqo.compl_ring_mem.map, BUS_DMASYNC_PREWRITE); } void gve_clear_rx_ring_dqo(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; int j; rx->fill_cnt = 0; rx->cnt = 0; rx->dqo.mask = priv->rx_desc_cnt - 1; rx->dqo.head = 0; rx->dqo.tail = 0; rx->dqo.cur_gen_bit = 0; gve_rx_clear_desc_ring_dqo(rx); gve_rx_clear_compl_ring_dqo(rx); gve_free_rx_mbufs_dqo(rx); if (gve_is_qpl(priv)) { SLIST_INIT(&rx->dqo.free_bufs); STAILQ_INIT(&rx->dqo.used_bufs); for (j = 0; j < rx->dqo.buf_cnt; j++) { struct gve_rx_buf_dqo *buf = &rx->dqo.bufs[j]; vm_page_t page = rx->com.qpl->pages[buf - rx->dqo.bufs]; u_int ref_count = atomic_load_int(&page->ref_count); /* * An ifconfig down+up might see pages still in flight * from the previous innings. 
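* A page whose wire count is still above 1 belongs to such an in-flight mbuf; it is parked on used_bufs and recycled only after gve_mextadd_free drops the extra wire.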
*/ if (VPRC_WIRE_COUNT(ref_count) == 1) SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry); else STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, buf, stailq_entry); buf->num_nic_frags = 0; buf->next_idx = 0; } } else { SLIST_INIT(&rx->dqo.free_bufs); for (j = 0; j < rx->dqo.buf_cnt; j++) SLIST_INSERT_HEAD(&rx->dqo.free_bufs, &rx->dqo.bufs[j], slist_entry); } } int gve_rx_intr_dqo(void *arg) { struct gve_rx_ring *rx = arg; struct gve_priv *priv = rx->com.priv; struct gve_ring_com *com = &rx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); /* Interrupts are automatically masked */ taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); return (FILTER_HANDLED); } static void gve_rx_advance_head_dqo(struct gve_rx_ring *rx) { rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask; rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */ if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) { bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset, rx->dqo.head); } } static void gve_rx_post_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf) { struct gve_rx_desc_dqo *desc; bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap, BUS_DMASYNC_PREREAD); desc = &rx->dqo.desc_ring[rx->dqo.head]; desc->buf_id = htole16(buf - rx->dqo.bufs); desc->buf_addr = htole64(buf->addr); gve_rx_advance_head_dqo(rx); } static int gve_rx_post_new_mbuf_dqo(struct gve_rx_ring *rx, int how) { struct gve_rx_buf_dqo *buf; bus_dma_segment_t segs[1]; int nsegs; int err; buf = SLIST_FIRST(&rx->dqo.free_bufs); if (__predict_false(!buf)) { device_printf(rx->com.priv->dev, "Unexpected empty free bufs list\n"); return (ENOBUFS); } SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry); buf->mbuf = m_getcl(how, MT_DATA, M_PKTHDR); if (__predict_false(!buf->mbuf)) { err = ENOMEM; counter_enter(); counter_u64_add_protected(rx->stats.rx_mbuf_mclget_null, 1); counter_exit(); goto abort_with_buf; } buf->mbuf->m_len = MCLBYTES; err = bus_dmamap_load_mbuf_sg(rx->dqo.buf_dmatag, buf->dmamap, buf->mbuf, segs, &nsegs, BUS_DMA_NOWAIT); KASSERT(nsegs == 1, ("dma segs for a cluster mbuf is not 1")); if (__predict_false(err != 0)) { counter_enter(); counter_u64_add_protected(rx->stats.rx_mbuf_dmamap_err, 1); counter_exit(); goto abort_with_mbuf; } buf->addr = segs[0].ds_addr; gve_rx_post_buf_dqo(rx, buf); return (0); abort_with_mbuf: m_freem(buf->mbuf); buf->mbuf = NULL; abort_with_buf: SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry); return (err); } static struct gve_dma_handle * gve_get_page_dma_handle(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf) { return (&(rx->com.qpl->dmas[buf - rx->dqo.bufs])); } static void gve_rx_post_qpl_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf, uint8_t frag_num) { struct gve_rx_desc_dqo *desc = &rx->dqo.desc_ring[rx->dqo.head]; union gve_rx_qpl_buf_id_dqo composed_id; struct gve_dma_handle *page_dma_handle; composed_id.buf_id = buf - rx->dqo.bufs; composed_id.frag_num = frag_num; desc->buf_id = htole16(composed_id.all); page_dma_handle = gve_get_page_dma_handle(rx, buf); bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, BUS_DMASYNC_PREREAD); desc->buf_addr = htole64(page_dma_handle->bus_addr + frag_num * GVE_DEFAULT_RX_BUFFER_SIZE); buf->num_nic_frags++; gve_rx_advance_head_dqo(rx); } static void gve_rx_maybe_extract_from_used_bufs(struct gve_rx_ring *rx, bool just_one) { struct gve_rx_buf_dqo *hol_blocker = NULL; struct gve_rx_buf_dqo *buf; u_int 
ref_count; vm_page_t page; while (true) { buf = STAILQ_FIRST(&rx->dqo.used_bufs); if (__predict_false(buf == NULL)) break; page = rx->com.qpl->pages[buf - rx->dqo.bufs]; ref_count = atomic_load_int(&page->ref_count); if (VPRC_WIRE_COUNT(ref_count) != 1) { /* Account for one head-of-line blocker */ if (hol_blocker != NULL) break; hol_blocker = buf; STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs, stailq_entry); continue; } STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs, stailq_entry); SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry); if (just_one) break; } if (hol_blocker != NULL) STAILQ_INSERT_HEAD(&rx->dqo.used_bufs, hol_blocker, stailq_entry); } static int gve_rx_post_new_dqo_qpl_buf(struct gve_rx_ring *rx) { struct gve_rx_buf_dqo *buf; buf = SLIST_FIRST(&rx->dqo.free_bufs); if (__predict_false(buf == NULL)) { gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/true); buf = SLIST_FIRST(&rx->dqo.free_bufs); if (__predict_false(buf == NULL)) return (ENOBUFS); } gve_rx_post_qpl_buf_dqo(rx, buf, buf->next_idx); if (buf->next_idx == GVE_DQ_NUM_FRAGS_IN_PAGE - 1) buf->next_idx = 0; else buf->next_idx++; /* * We have posted all the frags in this buf to the NIC. * - buf will enter used_bufs once the last completion arrives. * - It will renter free_bufs in gve_rx_maybe_extract_from_used_bufs * when its wire count drops back to 1. */ if (buf->next_idx == 0) SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry); return (0); } static void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx, int how) { uint32_t num_pending_bufs; uint32_t num_to_post; uint32_t i; int err; num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; num_to_post = rx->dqo.mask - num_pending_bufs; for (i = 0; i < num_to_post; i++) { if (gve_is_qpl(rx->com.priv)) err = gve_rx_post_new_dqo_qpl_buf(rx); else err = gve_rx_post_new_mbuf_dqo(rx, how); if (err) break; } } void gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx) { gve_rx_post_buffers_dqo(rx, M_WAITOK); } static void gve_rx_set_hashtype_dqo(struct mbuf *mbuf, struct gve_ptype *ptype, bool *is_tcp) { switch (ptype->l3_type) { case GVE_L3_TYPE_IPV4: switch (ptype->l4_type) { case GVE_L4_TYPE_TCP: *is_tcp = true; M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4); break; case GVE_L4_TYPE_UDP: M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4); break; default: M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4); } break; case GVE_L3_TYPE_IPV6: switch (ptype->l4_type) { case GVE_L4_TYPE_TCP: *is_tcp = true; M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6); break; case GVE_L4_TYPE_UDP: M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6); break; default: M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6); } break; default: M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH); } } static void gve_rx_set_csum_flags_dqo(struct mbuf *mbuf, struct gve_rx_compl_desc_dqo *desc, struct gve_ptype *ptype) { /* HW did not identify and process L3 and L4 headers. */ if (__predict_false(!desc->l3_l4_processed)) return; if (ptype->l3_type == GVE_L3_TYPE_IPV4) { if (__predict_false(desc->csum_ip_err || desc->csum_external_ip_err)) return; } else if (ptype->l3_type == GVE_L3_TYPE_IPV6) { /* Checksum should be skipped if this flag is set. 
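* (ipv6_ex_add presumably signals IPv6 extension headers, for which the NIC does not report a trustworthy L4 checksum.)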
*/ if (__predict_false(desc->ipv6_ex_add)) return; } if (__predict_false(desc->csum_l4_err)) return; switch (ptype->l4_type) { case GVE_L4_TYPE_TCP: case GVE_L4_TYPE_UDP: case GVE_L4_TYPE_ICMP: case GVE_L4_TYPE_SCTP: mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; mbuf->m_pkthdr.csum_data = 0xffff; break; default: break; } } static void gve_rx_input_mbuf_dqo(struct gve_rx_ring *rx, struct gve_rx_compl_desc_dqo *compl_desc) { struct mbuf *mbuf = rx->ctx.mbuf_head; if_t ifp = rx->com.priv->ifp; struct gve_ptype *ptype; bool do_if_input = true; bool is_tcp = false; ptype = &rx->com.priv->ptype_lut_dqo->ptypes[compl_desc->packet_type]; gve_rx_set_hashtype_dqo(mbuf, ptype, &is_tcp); mbuf->m_pkthdr.flowid = le32toh(compl_desc->hash); gve_rx_set_csum_flags_dqo(mbuf, compl_desc, ptype); mbuf->m_pkthdr.rcvif = ifp; mbuf->m_pkthdr.len = rx->ctx.total_size; if (((if_getcapenable(rx->com.priv->ifp) & IFCAP_LRO) != 0) && is_tcp && (rx->lro.lro_cnt != 0) && (tcp_lro_rx(&rx->lro, mbuf, 0) == 0)) do_if_input = false; if (do_if_input) if_input(ifp, mbuf); counter_enter(); counter_u64_add_protected(rx->stats.rbytes, rx->ctx.total_size); counter_u64_add_protected(rx->stats.rpackets, 1); counter_exit(); rx->ctx = (struct gve_rx_ctx){}; } static int gve_rx_copybreak_dqo(struct gve_rx_ring *rx, void *va, struct gve_rx_compl_desc_dqo *compl_desc, uint16_t frag_len) { struct mbuf *mbuf; mbuf = m_get2(frag_len, M_NOWAIT, MT_DATA, M_PKTHDR); if (__predict_false(mbuf == NULL)) return (ENOMEM); counter_enter(); counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1); counter_exit(); m_copyback(mbuf, 0, frag_len, va); mbuf->m_len = frag_len; rx->ctx.mbuf_head = mbuf; rx->ctx.mbuf_tail = mbuf; rx->ctx.total_size += frag_len; gve_rx_input_mbuf_dqo(rx, compl_desc); return (0); } static void gve_rx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_compl_desc_dqo *compl_desc, int *work_done) { bool is_last_frag = compl_desc->end_of_packet != 0; struct gve_rx_ctx *ctx = &rx->ctx; struct gve_rx_buf_dqo *buf; uint32_t num_pending_bufs; uint16_t frag_len; uint16_t buf_id; int err; buf_id = le16toh(compl_desc->buf_id); if (__predict_false(buf_id >= rx->dqo.buf_cnt)) { device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n", buf_id, rx->com.id); gve_schedule_reset(priv); goto drop_frag_clear_ctx; } buf = &rx->dqo.bufs[buf_id]; if (__predict_false(buf->mbuf == NULL)) { device_printf(priv->dev, "Spurious completion for buf id %d on rxq %d, issuing reset\n", buf_id, rx->com.id); gve_schedule_reset(priv); goto drop_frag_clear_ctx; } if (__predict_false(ctx->drop_pkt)) goto drop_frag; if (__predict_false(compl_desc->rx_error)) { counter_enter(); counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); counter_exit(); goto drop_frag; } bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap, BUS_DMASYNC_POSTREAD); frag_len = compl_desc->packet_len; if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) { err = gve_rx_copybreak_dqo(rx, mtod(buf->mbuf, char*), compl_desc, frag_len); if (__predict_false(err != 0)) goto drop_frag; (*work_done)++; gve_rx_post_buf_dqo(rx, buf); return; } /* * Although buffer completions may arrive out of order, buffer * descriptors are consumed by the NIC in order. That is, the * buffer at desc_ring[tail] might not be the buffer we got the * completion compl_ring[tail] for: but we know that desc_ring[tail] * has already been read by the NIC. 
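* (rx->dqo.head - rx->dqo.tail) therefore counts buffers that have been posted to the NIC but whose completions have not yet been processed.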
*/ num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; /* * For every fragment received, try to post a new buffer. * * Failures are okay but only so long as the number of outstanding * buffers is above a threshold. * * Beyond that we drop new packets to reuse their buffers. * Without ensuring a minimum number of buffers for the NIC to * put packets in, we run the risk of getting the queue stuck * for good. */ err = gve_rx_post_new_mbuf_dqo(rx, M_NOWAIT); if (__predict_false(err != 0 && num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) { counter_enter(); counter_u64_add_protected( rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); counter_exit(); goto drop_frag; } buf->mbuf->m_len = frag_len; ctx->total_size += frag_len; if (ctx->mbuf_tail == NULL) { ctx->mbuf_head = buf->mbuf; ctx->mbuf_tail = buf->mbuf; } else { buf->mbuf->m_flags &= ~M_PKTHDR; ctx->mbuf_tail->m_next = buf->mbuf; ctx->mbuf_tail = buf->mbuf; } /* * Disassociate the mbuf from buf and surrender buf to the free list to * be used by a future mbuf. */ bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap); buf->mbuf = NULL; buf->addr = 0; SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry); if (is_last_frag) { gve_rx_input_mbuf_dqo(rx, compl_desc); (*work_done)++; } return; drop_frag: /* Clear the earlier frags if there were any */ m_freem(ctx->mbuf_head); rx->ctx = (struct gve_rx_ctx){}; /* Drop the rest of the pkt if there are more frags */ ctx->drop_pkt = true; /* Reuse the dropped frag's buffer */ gve_rx_post_buf_dqo(rx, buf); if (is_last_frag) goto drop_frag_clear_ctx; return; drop_frag_clear_ctx: counter_enter(); counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); counter_exit(); m_freem(ctx->mbuf_head); rx->ctx = (struct gve_rx_ctx){}; } static void * gve_get_cpu_addr_for_qpl_buf(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num) { int page_idx = buf - rx->dqo.bufs; void *va = rx->com.qpl->dmas[page_idx].cpu_addr; va = (char *)va + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE); return (va); } static int gve_rx_add_clmbuf_to_ctx(struct gve_rx_ring *rx, struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num, uint16_t frag_len) { void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num); struct mbuf *mbuf; if (ctx->mbuf_tail == NULL) { mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); if (mbuf == NULL) return (ENOMEM); ctx->mbuf_head = mbuf; ctx->mbuf_tail = mbuf; } else { mbuf = m_getcl(M_NOWAIT, MT_DATA, 0); if (mbuf == NULL) return (ENOMEM); ctx->mbuf_tail->m_next = mbuf; ctx->mbuf_tail = mbuf; } mbuf->m_len = frag_len; ctx->total_size += frag_len; m_copyback(mbuf, 0, frag_len, va); counter_enter(); counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1); counter_exit(); return (0); } static int gve_rx_add_extmbuf_to_ctx(struct gve_rx_ring *rx, struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num, uint16_t frag_len) { struct mbuf *mbuf; void *page_addr; vm_page_t page; int page_idx; void *va; if (ctx->mbuf_tail == NULL) { mbuf = m_gethdr(M_NOWAIT, MT_DATA); if (mbuf == NULL) return (ENOMEM); ctx->mbuf_head = mbuf; ctx->mbuf_tail = mbuf; } else { mbuf = m_get(M_NOWAIT, MT_DATA); if (mbuf == NULL) return (ENOMEM); ctx->mbuf_tail->m_next = mbuf; ctx->mbuf_tail = mbuf; } mbuf->m_len = frag_len; ctx->total_size += frag_len; page_idx = buf - rx->dqo.bufs; page = rx->com.qpl->pages[page_idx]; page_addr = rx->com.qpl->dmas[page_idx].cpu_addr; va = (char *)page_addr + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE); /* * Grab an extra ref to the page so that 
gve_mextadd_free * does not end up freeing the page while the interface exists. */ vm_page_wire(page); counter_enter(); counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1); counter_exit(); MEXTADD(mbuf, va, frag_len, gve_mextadd_free, page, page_addr, 0, EXT_NET_DRV); return (0); } static void gve_rx_dqo_qpl(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_compl_desc_dqo *compl_desc, int *work_done) { bool is_last_frag = compl_desc->end_of_packet != 0; union gve_rx_qpl_buf_id_dqo composed_id; struct gve_dma_handle *page_dma_handle; struct gve_rx_ctx *ctx = &rx->ctx; struct gve_rx_buf_dqo *buf; uint32_t num_pending_bufs; uint8_t buf_frag_num; uint16_t frag_len; uint16_t buf_id; int err; composed_id.all = le16toh(compl_desc->buf_id); buf_id = composed_id.buf_id; buf_frag_num = composed_id.frag_num; if (__predict_false(buf_id >= rx->dqo.buf_cnt)) { device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n", buf_id, rx->com.id); gve_schedule_reset(priv); goto drop_frag_clear_ctx; } buf = &rx->dqo.bufs[buf_id]; if (__predict_false(buf->num_nic_frags == 0 || buf_frag_num > GVE_DQ_NUM_FRAGS_IN_PAGE - 1)) { device_printf(priv->dev, "Spurious compl for buf id %d on rxq %d " "with buf_frag_num %d and num_nic_frags %d, issuing reset\n", buf_id, rx->com.id, buf_frag_num, buf->num_nic_frags); gve_schedule_reset(priv); goto drop_frag_clear_ctx; } buf->num_nic_frags--; if (__predict_false(ctx->drop_pkt)) goto drop_frag; if (__predict_false(compl_desc->rx_error)) { counter_enter(); counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); counter_exit(); goto drop_frag; } page_dma_handle = gve_get_page_dma_handle(rx, buf); bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, BUS_DMASYNC_POSTREAD); frag_len = compl_desc->packet_len; if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) { void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num); err = gve_rx_copybreak_dqo(rx, va, compl_desc, frag_len); if (__predict_false(err != 0)) goto drop_frag; (*work_done)++; gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); return; } num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; err = gve_rx_post_new_dqo_qpl_buf(rx); if (__predict_false(err != 0 && num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) { /* * Resort to copying this fragment into a cluster mbuf * when the above threshold is breached and repost the * incoming buffer. If we cannot find cluster mbufs, * just drop the packet (to repost its buffer). */ err = gve_rx_add_clmbuf_to_ctx(rx, ctx, buf, buf_frag_num, frag_len); if (err != 0) { counter_enter(); counter_u64_add_protected( rx->stats.rx_dropped_pkt_buf_post_fail, 1); counter_exit(); goto drop_frag; } gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); } else { err = gve_rx_add_extmbuf_to_ctx(rx, ctx, buf, buf_frag_num, frag_len); if (__predict_false(err != 0)) { counter_enter(); counter_u64_add_protected( rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); counter_exit(); goto drop_frag; } } /* * Both the counts need to be checked. * * num_nic_frags == 0 implies no pending completions * but not all frags may have yet been posted. * * next_idx == 0 implies all frags have been posted * but there might be pending completions. 
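* Only when both are zero has every fragment of this page been posted and completed, at which point the buf moves to used_bufs to wait for its wire count to drop back to 1.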
*/ if (buf->num_nic_frags == 0 && buf->next_idx == 0) STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, buf, stailq_entry); if (is_last_frag) { gve_rx_input_mbuf_dqo(rx, compl_desc); (*work_done)++; } return; drop_frag: /* Clear the earlier frags if there were any */ m_freem(ctx->mbuf_head); rx->ctx = (struct gve_rx_ctx){}; /* Drop the rest of the pkt if there are more frags */ ctx->drop_pkt = true; /* Reuse the dropped frag's buffer */ gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); if (is_last_frag) goto drop_frag_clear_ctx; return; drop_frag_clear_ctx: counter_enter(); counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); counter_exit(); m_freem(ctx->mbuf_head); rx->ctx = (struct gve_rx_ctx){}; } static bool gve_rx_cleanup_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, int budget) { struct gve_rx_compl_desc_dqo *compl_desc; uint32_t work_done = 0; NET_EPOCH_ASSERT(); while (work_done < budget) { bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, rx->dqo.compl_ring_mem.map, BUS_DMASYNC_POSTREAD); compl_desc = &rx->dqo.compl_ring[rx->dqo.tail]; if (compl_desc->generation == rx->dqo.cur_gen_bit) break; /* * Prevent generation bit from being read after the rest of the * descriptor. */ atomic_thread_fence_acq(); rx->cnt++; rx->dqo.tail = (rx->dqo.tail + 1) & rx->dqo.mask; rx->dqo.cur_gen_bit ^= (rx->dqo.tail == 0); if (gve_is_qpl(priv)) gve_rx_dqo_qpl(priv, rx, compl_desc, &work_done); else gve_rx_dqo(priv, rx, compl_desc, &work_done); } if (work_done != 0) tcp_lro_flush_all(&rx->lro); gve_rx_post_buffers_dqo(rx, M_NOWAIT); if (gve_is_qpl(priv)) gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/false); return (work_done == budget); } void gve_rx_cleanup_tq_dqo(void *arg, int pending) { struct gve_rx_ring *rx = arg; struct gve_priv *priv = rx->com.priv; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; if (gve_rx_cleanup_dqo(priv, rx, /*budget=*/64)) { taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); return; } gve_db_bar_dqo_write_4(priv, rx->com.irq_db_offset, GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); } diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c index 04dde4f1a79b..e594c66149bc 100644 --- a/sys/dev/gve/gve_tx.c +++ b/sys/dev/gve/gve_tx.c @@ -1,925 +1,933 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" #define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182 static int gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx) { struct gve_queue_page_list *qpl = tx->com.qpl; struct gve_tx_fifo *fifo = &tx->fifo; fifo->size = qpl->num_pages * PAGE_SIZE; fifo->base = qpl->kva; atomic_store_int(&fifo->available, fifo->size); fifo->head = 0; return (0); } static void gve_tx_free_ring_gqi(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; if (tx->desc_ring != NULL) { gve_dma_free_coherent(&tx->desc_ring_mem); tx->desc_ring = NULL; } if (tx->info != NULL) { free(tx->info, M_GVE); tx->info = NULL; } + + if (com->qpl != NULL) { + gve_free_qpl(priv, com->qpl); + com->qpl = NULL; + } } static void gve_tx_free_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; /* Safe to call even if never alloced */ gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); if (mtx_initialized(&tx->ring_mtx)) mtx_destroy(&tx->ring_mtx); if (com->q_resources != NULL) { gve_dma_free_coherent(&com->q_resources_mem); com->q_resources = NULL; } if (tx->br != NULL) { buf_ring_free(tx->br, M_DEVBUF); tx->br = NULL; } if (gve_is_gqi(priv)) gve_tx_free_ring_gqi(priv, i); else gve_tx_free_ring_dqo(priv, i); } static int gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; int err; err = gve_dma_alloc_coherent(priv, sizeof(union gve_tx_desc) * priv->tx_desc_cnt, CACHE_LINE_SIZE, &tx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i); goto abort; } tx->desc_ring = tx->desc_ring_mem.cpu_addr; - com->qpl = &priv->qpls[i]; + com->qpl = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR, + /*single_kva=*/true); if (com->qpl == NULL) { - device_printf(priv->dev, "No QPL left for tx ring %d\n", i); + device_printf(priv->dev, + "Failed to alloc QPL for tx ring %d\n", i); err = ENOMEM; goto abort; } err = gve_tx_fifo_init(priv, tx); if (err != 0) goto abort; tx->info = malloc( sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt, M_GVE, M_WAITOK | M_ZERO); return (0); abort: gve_tx_free_ring_gqi(priv, i); return (err); } static int gve_tx_alloc_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; char mtx_name[16]; int err; com->priv = priv; com->id = i; if (gve_is_gqi(priv)) err = gve_tx_alloc_ring_gqi(priv, i); else err = gve_tx_alloc_ring_dqo(priv, i); if (err != 0) goto abort; sprintf(mtx_name, "gvetx%d", i); mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF); tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF, M_WAITOK, &tx->ring_mtx); gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), PAGE_SIZE, &com->q_resources_mem); if (err != 0) { 
device_printf(priv->dev, "Failed to alloc queue resources for tx ring %d", i); goto abort; } com->q_resources = com->q_resources_mem.cpu_addr; return (0); abort: gve_tx_free_ring(priv, i); return (err); } int gve_alloc_tx_rings(struct gve_priv *priv) { int err = 0; int i; priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues, M_GVE, M_WAITOK | M_ZERO); for (i = 0; i < priv->tx_cfg.num_queues; i++) { err = gve_tx_alloc_ring(priv, i); if (err != 0) goto free_rings; } return (0); free_rings: while (i--) gve_tx_free_ring(priv, i); free(priv->tx, M_GVE); return (err); } void gve_free_tx_rings(struct gve_priv *priv) { int i; for (i = 0; i < priv->tx_cfg.num_queues; i++) gve_tx_free_ring(priv, i); free(priv->tx, M_GVE); } static void gve_tx_clear_desc_ring(struct gve_tx_ring *tx) { struct gve_ring_com *com = &tx->com; int i; for (i = 0; i < com->priv->tx_desc_cnt; i++) { tx->desc_ring[i] = (union gve_tx_desc){}; tx->info[i] = (struct gve_tx_buffer_state){}; } bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_clear_tx_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_tx_fifo *fifo = &tx->fifo; tx->req = 0; tx->done = 0; tx->mask = priv->tx_desc_cnt - 1; atomic_store_int(&fifo->available, fifo->size); fifo->head = 0; gve_tx_clear_desc_ring(tx); } static void gve_start_tx_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; atomic_store_bool(&tx->stopped, false); if (gve_is_gqi(priv)) NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx); else NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx); com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK, taskqueue_thread_enqueue, &com->cleanup_tq); taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d", device_get_nameunit(priv->dev), i); TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx); tx->xmit_tq = taskqueue_create_fast("gve tx xmit", M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq); taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit", device_get_nameunit(priv->dev), i); } int gve_create_tx_rings(struct gve_priv *priv) { struct gve_ring_com *com; struct gve_tx_ring *tx; int err; int i; if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) return (0); for (i = 0; i < priv->tx_cfg.num_queues; i++) { if (gve_is_gqi(priv)) gve_clear_tx_ring(priv, i); else gve_clear_tx_ring_dqo(priv, i); } err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues); if (err != 0) return (err); bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, BUS_DMASYNC_POSTREAD); for (i = 0; i < priv->tx_cfg.num_queues; i++) { tx = &priv->tx[i]; com = &tx->com; com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index); bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map, BUS_DMASYNC_POSTREAD); com->db_offset = 4 * be32toh(com->q_resources->db_index); com->counter_idx = be32toh(com->q_resources->counter_index); gve_start_tx_ring(priv, i); } gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); return (0); } static void gve_stop_tx_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; if (com->cleanup_tq != NULL) { taskqueue_quiesce(com->cleanup_tq); taskqueue_free(com->cleanup_tq); com->cleanup_tq = NULL; } if (tx->xmit_tq != NULL) { taskqueue_quiesce(tx->xmit_tq); taskqueue_free(tx->xmit_tq); tx->xmit_tq = NULL; } } int gve_destroy_tx_rings(struct gve_priv *priv) { int 
err; int i; for (i = 0; i < priv->tx_cfg.num_queues; i++) gve_stop_tx_ring(priv, i); if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) { err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues); if (err != 0) return (err); gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); } return (0); } int gve_tx_intr(void *arg) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; struct gve_ring_com *com = &tx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); return (FILTER_HANDLED); } static uint32_t gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx) { bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map, BUS_DMASYNC_POSTREAD); uint32_t counter = priv->counters[tx->com.counter_idx]; return (be32toh(counter)); } static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes) { atomic_add_int(&fifo->available, bytes); } void gve_tx_cleanup_tq(void *arg, int pending) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; uint32_t nic_done = gve_tx_load_event_counter(priv, tx); uint32_t todo = nic_done - tx->done; size_t space_freed = 0; int i, j; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; for (j = 0; j < todo; j++) { uint32_t idx = tx->done & tx->mask; struct gve_tx_buffer_state *info = &tx->info[idx]; struct mbuf *mbuf = info->mbuf; tx->done++; if (mbuf == NULL) continue; info->mbuf = NULL; counter_enter(); counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len); counter_u64_add_protected(tx->stats.tpackets, 1); counter_exit(); m_freem(mbuf); for (i = 0; i < GVE_TX_MAX_DESCS; i++) { space_freed += info->iov[i].iov_len + info->iov[i].iov_padding; info->iov[i].iov_len = 0; info->iov[i].iov_padding = 0; } } gve_tx_free_fifo(&tx->fifo, space_freed); gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_ACK | GVE_IRQ_EVENT); /* * Completions born before this barrier MAY NOT cause the NIC to send an * interrupt but they will still be handled by the enqueue below. * Completions born after the barrier WILL trigger an interrupt. 
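* Re-reading the event counter after the fence therefore catches any completion that raced with the GVE_IRQ_ACK write; if one is found, the interrupt is masked again and the cleanup task re-enqueued below.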
*/ atomic_thread_fence_seq_cst(); nic_done = gve_tx_load_event_counter(priv, tx); todo = nic_done - tx->done; if (todo != 0) { gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); } if (atomic_load_bool(&tx->stopped) && space_freed) { atomic_store_bool(&tx->stopped, false); taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); } } static void gve_dma_sync_for_device(struct gve_queue_page_list *qpl, uint64_t iov_offset, uint64_t iov_len) { uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE; uint64_t first_page = iov_offset / PAGE_SIZE; struct gve_dma_handle *dma; uint64_t page; for (page = first_page; page <= last_page; page++) { dma = &(qpl->dmas[page]); bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); } } static void gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf) { mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH; mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4; mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid); mtd_desc->reserved0 = 0; mtd_desc->reserved1 = 0; } static void gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso, uint16_t l4_hdr_offset, uint32_t desc_cnt, uint16_t first_seg_len, uint64_t addr, bool has_csum_flag, int csum_offset, uint16_t pkt_len) { if (is_tso) { pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM; pkt_desc->l4_csum_offset = csum_offset >> 1; pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; } else if (has_csum_flag) { pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM; pkt_desc->l4_csum_offset = csum_offset >> 1; pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; } else { pkt_desc->type_flags = GVE_TXD_STD; pkt_desc->l4_csum_offset = 0; pkt_desc->l4_hdr_offset = 0; } pkt_desc->desc_cnt = desc_cnt; pkt_desc->len = htobe16(pkt_len); pkt_desc->seg_len = htobe16(first_seg_len); pkt_desc->seg_addr = htobe64(addr); } static void gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc, bool is_tso, uint16_t len, uint64_t addr, bool is_ipv6, uint8_t l3_off, uint16_t tso_mss) { seg_desc->type_flags = GVE_TXD_SEG; if (is_tso) { if (is_ipv6) seg_desc->type_flags |= GVE_TXSF_IPV6; seg_desc->l3_offset = l3_off >> 1; seg_desc->mss = htobe16(tso_mss); } seg_desc->seg_len = htobe16(len); seg_desc->seg_addr = htobe64(addr); } static inline uint32_t gve_tx_avail(struct gve_tx_ring *tx) { return (tx->mask + 1 - (tx->req - tx->done)); } static bool gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes) { return (atomic_load_int(&fifo->available) >= bytes); } static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required) { return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) && gve_tx_fifo_can_alloc(&tx->fifo, bytes_required)); } static int gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes) { return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head; } static inline int gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len, uint16_t pkt_len) { int pad_bytes, align_hdr_pad; int bytes; pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); /* We need to take into account the header alignment padding. 
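* gve_tx_alloc_fifo re-aligns the FIFO head to a cache line after every allocation, so up to CACHE_LINE_SIZE - 1 bytes beyond first_seg_len may be consumed before the payload starts.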
*/ align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len; bytes = align_hdr_pad + pad_bytes + pkt_len; return (bytes); } static int gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes, struct gve_tx_iovec iov[2]) { size_t overflow, padding; uint32_t aligned_head; int nfrags = 0; if (bytes == 0) return (0); /* * This check happens before we know how much padding is needed to * align to a cacheline boundary for the payload, but that is fine, * because the FIFO head always start aligned, and the FIFO's boundaries * are aligned, so if there is space for the data, there is space for * the padding to the next alignment. */ KASSERT(gve_tx_fifo_can_alloc(fifo, bytes), ("Allocating gve tx fifo when there is no room")); nfrags++; iov[0].iov_offset = fifo->head; iov[0].iov_len = bytes; fifo->head += bytes; if (fifo->head > fifo->size) { /* * If the allocation did not fit in the tail fragment of the * FIFO, also use the head fragment. */ nfrags++; overflow = fifo->head - fifo->size; iov[0].iov_len -= overflow; iov[1].iov_offset = 0; /* Start of fifo*/ iov[1].iov_len = overflow; fifo->head = overflow; } /* Re-align to a cacheline boundary */ aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE); padding = aligned_head - fifo->head; iov[nfrags - 1].iov_padding = padding; atomic_add_int(&fifo->available, -(bytes + padding)); fifo->head = aligned_head; if (fifo->head == fifo->size) fifo->head = 0; return (nfrags); } /* Only error this returns is ENOBUFS when the tx fifo is short of space */ static int gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf) { bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false; int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset; uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len; int pad_bytes, hdr_nfrags, payload_nfrags; struct gve_tx_pkt_desc *pkt_desc; struct gve_tx_seg_desc *seg_desc; struct gve_tx_mtd_desc *mtd_desc; struct gve_tx_buffer_state *info; uint32_t idx = tx->req & tx->mask; struct ether_header *eh; struct mbuf *mbuf_next; int payload_iov = 2; int bytes_required; struct ip6_hdr *ip6; struct tcphdr *th; uint32_t next_idx; uint8_t l3_off; struct ip *ip; int i; info = &tx->info[idx]; csum_flags = mbuf->m_pkthdr.csum_flags; pkt_len = mbuf->m_pkthdr.len; is_tso = csum_flags & CSUM_TSO; has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0; tso_mss = is_tso ? 
mbuf->m_pkthdr.tso_segsz : 0; eh = mtod(mbuf, struct ether_header *); KASSERT(eh->ether_type != ETHERTYPE_VLAN, ("VLAN-tagged packets not supported")); is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6; l3_off = ETHER_HDR_LEN; mbuf_next = m_getptr(mbuf, l3_off, &offset); if (is_ipv6) { ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset)); l4_off = l3_off + sizeof(struct ip6_hdr); is_tcp = (ip6->ip6_nxt == IPPROTO_TCP); is_udp = (ip6->ip6_nxt == IPPROTO_UDP); mbuf_next = m_getptr(mbuf, l4_off, &offset); } else if (ntohs(eh->ether_type) == ETHERTYPE_IP) { ip = (struct ip *)(mtodo(mbuf_next, offset)); l4_off = l3_off + (ip->ip_hl << 2); is_tcp = (ip->ip_p == IPPROTO_TCP); is_udp = (ip->ip_p == IPPROTO_UDP); mbuf_next = m_getptr(mbuf, l4_off, &offset); } l4_data_off = 0; if (is_tcp) { th = (struct tcphdr *)(mtodo(mbuf_next, offset)); l4_data_off = l4_off + (th->th_off << 2); } else if (is_udp) l4_data_off = l4_off + sizeof(struct udphdr); if (has_csum_flag) { if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0) csum_offset = offsetof(struct tcphdr, th_sum); else csum_offset = offsetof(struct udphdr, uh_sum); } /* * If this packet is neither a TCP nor a UDP packet, the first segment, * the one represented by the packet descriptor, will carry the * spec-stipulated minimum of 182B. */ if (l4_data_off != 0) first_seg_len = l4_data_off; else first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES); bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len); if (__predict_false(!gve_can_tx(tx, bytes_required))) { counter_enter(); counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1); counter_exit(); return (ENOBUFS); } /* So that the cleanup taskqueue can free the mbuf eventually. */ info->mbuf = mbuf; /* * We don't want to split the header, so if necessary, pad to the end * of the fifo and then put the header at the beginning of the fifo. 
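* Illustrative numbers: with a 16384-byte FIFO, head at 16300, and a 200-byte first segment, gve_tx_fifo_pad_alloc_one_frag returns 84; gve_tx_alloc_fifo then burns those 84 bytes at the end of the FIFO as iov[0] and places the 200-byte header contiguously at offset 0 as iov[1], which is where the packet descriptor ends up pointing.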
*/ pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes, &info->iov[0]); KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0")); payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len, &info->iov[payload_iov]); pkt_desc = &tx->desc_ring[idx].pkt; gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off, 1 + mtd_desc_nr + payload_nfrags, first_seg_len, info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset, pkt_len); m_copydata(mbuf, 0, first_seg_len, (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset); gve_dma_sync_for_device(tx->com.qpl, info->iov[hdr_nfrags - 1].iov_offset, info->iov[hdr_nfrags - 1].iov_len); copy_offset = first_seg_len; if (mtd_desc_nr == 1) { next_idx = (tx->req + 1) & tx->mask; mtd_desc = &tx->desc_ring[next_idx].mtd; gve_tx_fill_mtd_desc(mtd_desc, mbuf); } for (i = payload_iov; i < payload_nfrags + payload_iov; i++) { next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask; seg_desc = &tx->desc_ring[next_idx].seg; gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len, info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss); m_copydata(mbuf, copy_offset, info->iov[i].iov_len, (char *)tx->fifo.base + info->iov[i].iov_offset); gve_dma_sync_for_device(tx->com.qpl, info->iov[i].iov_offset, info->iov[i].iov_len); copy_offset += info->iov[i].iov_len; } tx->req += (1 + mtd_desc_nr + payload_nfrags); if (is_tso) { counter_enter(); counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); counter_exit(); } return (0); } static int gve_xmit_mbuf(struct gve_tx_ring *tx, struct mbuf **mbuf) { if (gve_is_gqi(tx->com.priv)) return (gve_xmit(tx, *mbuf)); if (gve_is_qpl(tx->com.priv)) return (gve_xmit_dqo_qpl(tx, *mbuf)); /* * gve_xmit_dqo might attempt to defrag the mbuf chain. * The reference is passed in so that in the case of * errors, the new mbuf chain is what's put back on the br. */ return (gve_xmit_dqo(tx, mbuf)); } /* * Has the side-effect of stopping the xmit queue by setting tx->stopped */ static int gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx, struct mbuf **mbuf) { int err; atomic_store_bool(&tx->stopped, true); /* * Room made in the queue BEFORE the barrier will be seen by the * gve_xmit_mbuf retry below. * * If room is made in the queue AFTER the barrier, the cleanup tq * iteration creating the room will either see a tx->stopped value * of 0 or the 1 we just wrote: * * If it sees a 1, then it would enqueue the xmit tq. Enqueue * implies a retry on the waiting pkt. * * If it sees a 0, then that implies a previous iteration overwrote * our 1, and that iteration would enqueue the xmit tq. Enqueue * implies a retry on the waiting pkt. */ atomic_thread_fence_seq_cst(); err = gve_xmit_mbuf(tx, mbuf); if (err == 0) atomic_store_bool(&tx->stopped, false); return (err); } static void gve_xmit_br(struct gve_tx_ring *tx) { struct gve_priv *priv = tx->com.priv; struct ifnet *ifp = priv->ifp; struct mbuf *mbuf; int err; while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 && (mbuf = drbr_peek(ifp, tx->br)) != NULL) { err = gve_xmit_mbuf(tx, &mbuf); /* * We need to stop this taskqueue when we can't xmit the pkt due * to lack of space in the NIC ring (ENOBUFS). The retry exists * to guard against a TOCTTOU bug that could end up freezing the * queue forever. 
*/ if (__predict_false(mbuf != NULL && err == ENOBUFS)) err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf); if (__predict_false(err != 0 && mbuf != NULL)) { if (err == EINVAL) { drbr_advance(ifp, tx->br); m_freem(mbuf); } else drbr_putback(ifp, tx->br, mbuf); break; } drbr_advance(ifp, tx->br); BPF_MTAP(ifp, mbuf); bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); if (gve_is_gqi(priv)) gve_db_bar_write_4(priv, tx->com.db_offset, tx->req); else gve_db_bar_dqo_write_4(priv, tx->com.db_offset, tx->dqo.desc_tail); } } void gve_xmit_tq(void *arg, int pending) { struct gve_tx_ring *tx = (struct gve_tx_ring *)arg; GVE_RING_LOCK(tx); gve_xmit_br(tx); GVE_RING_UNLOCK(tx); } static bool is_vlan_tagged_pkt(struct mbuf *mbuf) { struct ether_header *eh; eh = mtod(mbuf, struct ether_header *); return (ntohs(eh->ether_type) == ETHERTYPE_VLAN); } int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf) { struct gve_priv *priv = if_getsoftc(ifp); struct gve_tx_ring *tx; bool is_br_empty; int err; uint32_t i; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (ENODEV); if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE) i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues; else i = curcpu % priv->tx_cfg.num_queues; tx = &priv->tx[i]; if (__predict_false(is_vlan_tagged_pkt(mbuf))) { counter_enter(); counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1); counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); counter_exit(); m_freem(mbuf); return (ENODEV); } is_br_empty = drbr_empty(ifp, tx->br); err = drbr_enqueue(ifp, tx->br, mbuf); if (__predict_false(err != 0)) { if (!atomic_load_bool(&tx->stopped)) taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); counter_enter(); counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1); counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); counter_exit(); return (err); } /* * If the mbuf we just enqueued is the only one on the ring, then * transmit it right away in the interests of low latency. */ if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) { gve_xmit_br(tx); GVE_RING_UNLOCK(tx); } else if (!atomic_load_bool(&tx->stopped)) taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); return (0); } void gve_qflush(if_t ifp) { struct gve_priv *priv = if_getsoftc(ifp); struct gve_tx_ring *tx; int i; for (i = 0; i < priv->tx_cfg.num_queues; ++i) { tx = &priv->tx[i]; if (drbr_empty(ifp, tx->br) == 0) { GVE_RING_LOCK(tx); drbr_flush(ifp, tx->br); GVE_RING_UNLOCK(tx); } } if_qflush(ifp); } diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c index bf314ef95173..7361d47b8ce6 100644 --- a/sys/dev/gve/gve_tx_dqo.c +++ b/sys/dev/gve/gve_tx_dqo.c @@ -1,1097 +1,1111 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_inet6.h" #include "gve.h" #include "gve_dqo.h" static void gve_unmap_packet(struct gve_tx_ring *tx, struct gve_tx_pending_pkt_dqo *pending_pkt) { bus_dmamap_sync(tx->dqo.buf_dmatag, pending_pkt->dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(tx->dqo.buf_dmatag, pending_pkt->dmamap); } static void gve_clear_qpl_pending_pkt(struct gve_tx_pending_pkt_dqo *pending_pkt) { pending_pkt->qpl_buf_head = -1; pending_pkt->num_qpl_bufs = 0; } static void gve_free_tx_mbufs_dqo(struct gve_tx_ring *tx) { struct gve_tx_pending_pkt_dqo *pending_pkt; int i; for (i = 0; i < tx->dqo.num_pending_pkts; i++) { pending_pkt = &tx->dqo.pending_pkts[i]; if (!pending_pkt->mbuf) continue; if (gve_is_qpl(tx->com.priv)) gve_clear_qpl_pending_pkt(pending_pkt); else gve_unmap_packet(tx, pending_pkt); m_freem(pending_pkt->mbuf); pending_pkt->mbuf = NULL; } } void gve_tx_free_ring_dqo(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; int j; if (tx->dqo.desc_ring != NULL) { gve_dma_free_coherent(&tx->desc_ring_mem); tx->dqo.desc_ring = NULL; } if (tx->dqo.compl_ring != NULL) { gve_dma_free_coherent(&tx->dqo.compl_ring_mem); tx->dqo.compl_ring = NULL; } if (tx->dqo.pending_pkts != NULL) { gve_free_tx_mbufs_dqo(tx); if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) { for (j = 0; j < tx->dqo.num_pending_pkts; j++) if (tx->dqo.pending_pkts[j].state != GVE_PACKET_STATE_UNALLOCATED) bus_dmamap_destroy(tx->dqo.buf_dmatag, tx->dqo.pending_pkts[j].dmamap); } free(tx->dqo.pending_pkts, M_GVE); tx->dqo.pending_pkts = NULL; } if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) bus_dma_tag_destroy(tx->dqo.buf_dmatag); if (gve_is_qpl(priv) && tx->dqo.qpl_bufs != NULL) { free(tx->dqo.qpl_bufs, M_GVE); tx->dqo.qpl_bufs = NULL; } + + if (com->qpl != NULL) { + gve_free_qpl(priv, com->qpl); + com->qpl = NULL; + } } static int gve_tx_alloc_rda_fields_dqo(struct gve_tx_ring *tx) { struct gve_priv *priv = tx->com.priv; int err; int j; /* * DMA tag for mapping Tx mbufs * The maxsize, nsegments, and maxsegsize params should match * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c. 
*/ err = bus_dma_tag_create( bus_get_dma_tag(priv->dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ GVE_TSO_MAXSIZE_DQO, /* maxsize */ GVE_TX_MAX_DATA_DESCS_DQO, /* nsegments */ GVE_TX_MAX_BUF_SIZE_DQO, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &tx->dqo.buf_dmatag); if (err != 0) { device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); return (err); } for (j = 0; j < tx->dqo.num_pending_pkts; j++) { err = bus_dmamap_create(tx->dqo.buf_dmatag, 0, &tx->dqo.pending_pkts[j].dmamap); if (err != 0) { device_printf(priv->dev, "err in creating pending pkt dmamap %d: %d", j, err); return (err); } tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; } return (0); } int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; uint16_t num_pending_pkts; int err; /* Descriptor ring */ err = gve_dma_alloc_coherent(priv, sizeof(union gve_tx_desc_dqo) * priv->tx_desc_cnt, CACHE_LINE_SIZE, &tx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i); goto abort; } tx->dqo.desc_ring = tx->desc_ring_mem.cpu_addr; /* Completion ring */ err = gve_dma_alloc_coherent(priv, sizeof(struct gve_tx_compl_desc_dqo) * priv->tx_desc_cnt, CACHE_LINE_SIZE, &tx->dqo.compl_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc compl ring for tx ring %d", i); goto abort; } tx->dqo.compl_ring = tx->dqo.compl_ring_mem.cpu_addr; /* * pending_pkts array * * The max number of pending packets determines the maximum number of * descriptors which may be written to the completion queue. * * We must set the number small enough to make sure we never overrun the * completion queue. */ num_pending_pkts = priv->tx_desc_cnt; /* * Reserve space for descriptor completions, which will be reported at * most every GVE_TX_MIN_RE_INTERVAL packets.
*/ num_pending_pkts -= num_pending_pkts / GVE_TX_MIN_RE_INTERVAL; tx->dqo.num_pending_pkts = num_pending_pkts; tx->dqo.pending_pkts = malloc( sizeof(struct gve_tx_pending_pkt_dqo) * num_pending_pkts, M_GVE, M_WAITOK | M_ZERO); if (gve_is_qpl(priv)) { int qpl_buf_cnt; - tx->com.qpl = &priv->qpls[i]; + tx->com.qpl = gve_alloc_qpl(priv, i, GVE_TX_NUM_QPL_PAGES_DQO, + /*single_kva*/false); + if (tx->com.qpl == NULL) { + device_printf(priv->dev, + "Failed to alloc QPL for tx ring %d", i); + err = ENOMEM; + goto abort; + } + qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * tx->com.qpl->num_pages; tx->dqo.qpl_bufs = malloc( sizeof(*tx->dqo.qpl_bufs) * qpl_buf_cnt, M_GVE, M_WAITOK | M_ZERO); } else gve_tx_alloc_rda_fields_dqo(tx); return (0); abort: gve_tx_free_ring_dqo(priv, i); return (err); } static void gve_extract_tx_metadata_dqo(const struct mbuf *mbuf, struct gve_tx_metadata_dqo *metadata) { uint32_t hash = mbuf->m_pkthdr.flowid; uint16_t path_hash; metadata->version = GVE_TX_METADATA_VERSION_DQO; if (hash) { path_hash = hash ^ (hash >> 16); path_hash &= (1 << 15) - 1; if (__predict_false(path_hash == 0)) path_hash = ~path_hash; metadata->path_hash = path_hash; } } static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, uint32_t *desc_idx, uint32_t len, uint64_t addr, int16_t compl_tag, bool eop, bool csum_enabled) { while (len > 0) { struct gve_tx_pkt_desc_dqo *desc = &tx->dqo.desc_ring[*desc_idx].pkt; uint32_t cur_len = MIN(len, GVE_TX_MAX_BUF_SIZE_DQO); bool cur_eop = eop && cur_len == len; *desc = (struct gve_tx_pkt_desc_dqo){ .buf_addr = htole64(addr), .dtype = GVE_TX_PKT_DESC_DTYPE_DQO, .end_of_packet = cur_eop, .checksum_offload_enable = csum_enabled, .compl_tag = htole16(compl_tag), .buf_size = cur_len, }; addr += cur_len; len -= cur_len; *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; } } static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, const struct mbuf *mbuf, const struct gve_tx_metadata_dqo *metadata, int header_len) { *desc = (struct gve_tx_tso_context_desc_dqo){ .header_len = header_len, .cmd_dtype = { .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, .tso = 1, }, .flex0 = metadata->bytes[0], .flex5 = metadata->bytes[5], .flex6 = metadata->bytes[6], .flex7 = metadata->bytes[7], .flex8 = metadata->bytes[8], .flex9 = metadata->bytes[9], .flex10 = metadata->bytes[10], .flex11 = metadata->bytes[11], }; desc->tso_total_len = mbuf->m_pkthdr.len - header_len; desc->mss = mbuf->m_pkthdr.tso_segsz; } static void gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, const struct gve_tx_metadata_dqo *metadata) { *desc = (struct gve_tx_general_context_desc_dqo){ .flex0 = metadata->bytes[0], .flex1 = metadata->bytes[1], .flex2 = metadata->bytes[2], .flex3 = metadata->bytes[3], .flex4 = metadata->bytes[4], .flex5 = metadata->bytes[5], .flex6 = metadata->bytes[6], .flex7 = metadata->bytes[7], .flex8 = metadata->bytes[8], .flex9 = metadata->bytes[9], .flex10 = metadata->bytes[10], .flex11 = metadata->bytes[11], .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, }; } #define PULLUP_HDR(m, len) \ do { \ if (__predict_false((m)->m_len < (len))) { \ (m) = m_pullup((m), (len)); \ if ((m) == NULL) \ return (EINVAL); \ } \ } while (0) static int gve_prep_tso(struct mbuf *mbuf, int *header_len) { uint8_t l3_off, l4_off = 0; struct ether_header *eh; struct tcphdr *th; u_short csum; PULLUP_HDR(mbuf, sizeof(*eh)); eh = mtod(mbuf, struct ether_header *); KASSERT(eh->ether_type != ETHERTYPE_VLAN, ("VLAN-tagged packets not supported")); l3_off = ETHER_HDR_LEN; 
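/* * Locate the TCP header via the L3 header below and seed th_sum with a * pseudo-header checksum computed over a zero payload length; as noted * further down, the hardware requires th_sum to exclude the TCP payload. */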
#ifdef INET6 if (ntohs(eh->ether_type) == ETHERTYPE_IPV6) { struct ip6_hdr *ip6; PULLUP_HDR(mbuf, l3_off + sizeof(*ip6)); ip6 = (struct ip6_hdr *)(mtodo(mbuf, l3_off)); l4_off = l3_off + sizeof(struct ip6_hdr); csum = in6_cksum_pseudo(ip6, /*len=*/0, IPPROTO_TCP, /*csum=*/0); } else #endif if (ntohs(eh->ether_type) == ETHERTYPE_IP) { struct ip *ip; PULLUP_HDR(mbuf, l3_off + sizeof(*ip)); ip = (struct ip *)(mtodo(mbuf, l3_off)); l4_off = l3_off + (ip->ip_hl << 2); csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } PULLUP_HDR(mbuf, l4_off + sizeof(struct tcphdr *)); th = (struct tcphdr *)(mtodo(mbuf, l4_off)); *header_len = l4_off + (th->th_off << 2); /* * Hardware requires the th->th_sum to not include the TCP payload, * hence we recompute the csum with it excluded. */ th->th_sum = csum; return (0); } static int gve_tx_fill_ctx_descs(struct gve_tx_ring *tx, struct mbuf *mbuf, bool is_tso, uint32_t *desc_idx) { struct gve_tx_general_context_desc_dqo *gen_desc; struct gve_tx_tso_context_desc_dqo *tso_desc; struct gve_tx_metadata_dqo metadata; int header_len; int err; metadata = (struct gve_tx_metadata_dqo){0}; gve_extract_tx_metadata_dqo(mbuf, &metadata); if (is_tso) { err = gve_prep_tso(mbuf, &header_len); if (__predict_false(err)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_tsoerr, 1); counter_exit(); return (err); } tso_desc = &tx->dqo.desc_ring[*desc_idx].tso_ctx; gve_tx_fill_tso_ctx_desc(tso_desc, mbuf, &metadata, header_len); *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; counter_enter(); counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); counter_exit(); } gen_desc = &tx->dqo.desc_ring[*desc_idx].general_ctx; gve_tx_fill_general_ctx_desc(gen_desc, &metadata); *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; return (0); } static int gve_map_mbuf_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf, bus_dmamap_t dmamap, bus_dma_segment_t *segs, int *nsegs, int attempt) { struct mbuf *m_new = NULL; int err; err = bus_dmamap_load_mbuf_sg(tx->dqo.buf_dmatag, dmamap, *mbuf, segs, nsegs, BUS_DMA_NOWAIT); switch (err) { case __predict_true(0): break; case EFBIG: if (__predict_false(attempt > 0)) goto abort; counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_collapse, 1); counter_exit(); /* Try m_collapse before m_defrag */ m_new = m_collapse(*mbuf, M_NOWAIT, GVE_TX_MAX_DATA_DESCS_DQO); if (m_new == NULL) { counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_defrag, 1); counter_exit(); m_new = m_defrag(*mbuf, M_NOWAIT); } if (__predict_false(m_new == NULL)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_defrag_err, 1); counter_exit(); m_freem(*mbuf); *mbuf = NULL; err = ENOMEM; goto abort; } else { *mbuf = m_new; return (gve_map_mbuf_dqo(tx, mbuf, dmamap, segs, nsegs, ++attempt)); } case ENOMEM: counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_dmamap_enomem_err, 1); counter_exit(); goto abort; default: goto abort; } return (0); abort: counter_enter(); counter_u64_add_protected(tx->stats.tx_mbuf_dmamap_err, 1); counter_exit(); return (err); } static uint32_t num_avail_desc_ring_slots(const struct gve_tx_ring *tx) { uint32_t num_used = (tx->dqo.desc_tail - tx->dqo.desc_head) & tx->dqo.desc_mask; return (tx->dqo.desc_mask - num_used); } static struct gve_tx_pending_pkt_dqo * gve_alloc_pending_packet(struct gve_tx_ring *tx) { int32_t index = tx->dqo.free_pending_pkts_csm; struct gve_tx_pending_pkt_dqo *pending_pkt; /* * No pending packets available in the consumer list, * try to steal the 
producer list. */ if (__predict_false(index == -1)) { tx->dqo.free_pending_pkts_csm = atomic_swap_32( &tx->dqo.free_pending_pkts_prd, -1); index = tx->dqo.free_pending_pkts_csm; if (__predict_false(index == -1)) return (NULL); } pending_pkt = &tx->dqo.pending_pkts[index]; /* Remove pending_pkt from the consumer list */ tx->dqo.free_pending_pkts_csm = pending_pkt->next; pending_pkt->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; return (pending_pkt); } static void gve_free_pending_packet(struct gve_tx_ring *tx, struct gve_tx_pending_pkt_dqo *pending_pkt) { int index = pending_pkt - tx->dqo.pending_pkts; int32_t old_head; pending_pkt->state = GVE_PACKET_STATE_FREE; /* Add pending_pkt to the producer list */ while (true) { old_head = atomic_load_acq_32(&tx->dqo.free_pending_pkts_prd); pending_pkt->next = old_head; if (atomic_cmpset_32(&tx->dqo.free_pending_pkts_prd, old_head, index)) break; } } /* * Has the side-effect of retrieving the value of the last desc index * processed by the NIC. hw_tx_head is written to by the completions-processing * taskqueue upon receiving descriptor-completions. */ static bool gve_tx_has_desc_room_dqo(struct gve_tx_ring *tx, int needed_descs) { if (needed_descs <= num_avail_desc_ring_slots(tx)) return (true); tx->dqo.desc_head = atomic_load_acq_32(&tx->dqo.hw_tx_head); if (needed_descs > num_avail_desc_ring_slots(tx)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_descring, 1); counter_exit(); return (false); } return (true); } static void gve_tx_request_desc_compl(struct gve_tx_ring *tx, uint32_t desc_idx) { uint32_t last_report_event_interval; uint32_t last_desc_idx; last_desc_idx = (desc_idx - 1) & tx->dqo.desc_mask; last_report_event_interval = (last_desc_idx - tx->dqo.last_re_idx) & tx->dqo.desc_mask; if (__predict_false(last_report_event_interval >= GVE_TX_MIN_RE_INTERVAL)) { tx->dqo.desc_ring[last_desc_idx].pkt.report_event = true; tx->dqo.last_re_idx = last_desc_idx; } } static bool gve_tx_have_enough_qpl_bufs(struct gve_tx_ring *tx, int num_bufs) { uint32_t available = tx->dqo.qpl_bufs_produced_cached - tx->dqo.qpl_bufs_consumed; if (__predict_true(available >= num_bufs)) return (true); tx->dqo.qpl_bufs_produced_cached = atomic_load_acq_32( &tx->dqo.qpl_bufs_produced); available = tx->dqo.qpl_bufs_produced_cached - tx->dqo.qpl_bufs_consumed; if (__predict_true(available >= num_bufs)) return (true); return (false); } static int32_t gve_tx_alloc_qpl_buf(struct gve_tx_ring *tx) { int32_t buf = tx->dqo.free_qpl_bufs_csm; if (__predict_false(buf == -1)) { tx->dqo.free_qpl_bufs_csm = atomic_swap_32( &tx->dqo.free_qpl_bufs_prd, -1); buf = tx->dqo.free_qpl_bufs_csm; if (__predict_false(buf == -1)) return (-1); } tx->dqo.free_qpl_bufs_csm = tx->dqo.qpl_bufs[buf]; tx->dqo.qpl_bufs_consumed++; return (buf); } /* * Tx buffer i corresponds to * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO */ static void gve_tx_buf_get_addr_dqo(struct gve_tx_ring *tx, int32_t index, void **va, bus_addr_t *dma_addr) { int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) << GVE_TX_BUF_SHIFT_DQO; *va = (char *)tx->com.qpl->dmas[page_id].cpu_addr + offset; *dma_addr = tx->com.qpl->dmas[page_id].bus_addr + offset; } static struct gve_dma_handle * gve_get_page_dma_handle(struct gve_tx_ring *tx, int32_t index) { int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); return (&tx->com.qpl->dmas[page_id]); } static void
gve_tx_copy_mbuf_and_write_pkt_descs(struct gve_tx_ring *tx, struct mbuf *mbuf, struct gve_tx_pending_pkt_dqo *pkt, bool csum_enabled, int16_t completion_tag, uint32_t *desc_idx) { int32_t pkt_len = mbuf->m_pkthdr.len; struct gve_dma_handle *dma; uint32_t copy_offset = 0; int32_t prev_buf = -1; uint32_t copy_len; bus_addr_t addr; int32_t buf; void *va; MPASS(pkt->num_qpl_bufs == 0); MPASS(pkt->qpl_buf_head == -1); while (copy_offset < pkt_len) { buf = gve_tx_alloc_qpl_buf(tx); /* We already checked for availability */ MPASS(buf != -1); gve_tx_buf_get_addr_dqo(tx, buf, &va, &addr); copy_len = MIN(GVE_TX_BUF_SIZE_DQO, pkt_len - copy_offset); m_copydata(mbuf, copy_offset, copy_len, va); copy_offset += copy_len; dma = gve_get_page_dma_handle(tx, buf); bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); gve_tx_fill_pkt_desc_dqo(tx, desc_idx, copy_len, addr, completion_tag, /*eop=*/copy_offset == pkt_len, csum_enabled); /* Link all the qpl bufs for a packet */ if (prev_buf == -1) pkt->qpl_buf_head = buf; else tx->dqo.qpl_bufs[prev_buf] = buf; prev_buf = buf; pkt->num_qpl_bufs++; } tx->dqo.qpl_bufs[buf] = -1; } int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf) { uint32_t desc_idx = tx->dqo.desc_tail; struct gve_tx_pending_pkt_dqo *pkt; int total_descs_needed; int16_t completion_tag; bool has_csum_flag; int csum_flags; bool is_tso; int nsegs; int err; csum_flags = mbuf->m_pkthdr.csum_flags; has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); is_tso = csum_flags & CSUM_TSO; nsegs = howmany(mbuf->m_pkthdr.len, GVE_TX_BUF_SIZE_DQO); /* Check if we have enough room in the desc ring */ total_descs_needed = 1 + /* general_ctx_desc */ nsegs + /* pkt_desc */ (is_tso ? 1 : 0); /* tso_ctx_desc */ if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) return (ENOBUFS); if (!gve_tx_have_enough_qpl_bufs(tx, nsegs)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_qpl_bufs, 1); counter_exit(); return (ENOBUFS); } pkt = gve_alloc_pending_packet(tx); if (pkt == NULL) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_compring, 1); counter_exit(); return (ENOBUFS); } completion_tag = pkt - tx->dqo.pending_pkts; pkt->mbuf = mbuf; err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); if (err) goto abort; gve_tx_copy_mbuf_and_write_pkt_descs(tx, mbuf, pkt, has_csum_flag, completion_tag, &desc_idx); /* Remember the index of the last desc written */ tx->dqo.desc_tail = desc_idx; /* * Request a descriptor completion on the last descriptor of the * packet if we are allowed to by the HW enforced interval. */ gve_tx_request_desc_compl(tx, desc_idx); tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ return (0); abort: pkt->mbuf = NULL; gve_free_pending_packet(tx, pkt); return (err); } int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr) { bus_dma_segment_t segs[GVE_TX_MAX_DATA_DESCS_DQO]; uint32_t desc_idx = tx->dqo.desc_tail; struct gve_tx_pending_pkt_dqo *pkt; struct mbuf *mbuf = *mbuf_ptr; int total_descs_needed; int16_t completion_tag; bool has_csum_flag; int csum_flags; bool is_tso; int nsegs; int err; int i; csum_flags = mbuf->m_pkthdr.csum_flags; has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); is_tso = csum_flags & CSUM_TSO; /* * This mbuf might end up needing more than 1 pkt desc. * The actual number, `nsegs` is known only after the * expensive gve_map_mbuf_dqo call. 
This check beneath * exists to fail early when the desc ring is really full. */ total_descs_needed = 1 + /* general_ctx_desc */ 1 + /* pkt_desc */ (is_tso ? 1 : 0); /* tso_ctx_desc */ if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) return (ENOBUFS); pkt = gve_alloc_pending_packet(tx); if (pkt == NULL) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_compring, 1); counter_exit(); return (ENOBUFS); } completion_tag = pkt - tx->dqo.pending_pkts; err = gve_map_mbuf_dqo(tx, mbuf_ptr, pkt->dmamap, segs, &nsegs, /*attempt=*/0); if (err) goto abort; mbuf = *mbuf_ptr; /* gve_map_mbuf_dqo might replace the mbuf chain */ pkt->mbuf = mbuf; total_descs_needed = 1 + /* general_ctx_desc */ nsegs + /* pkt_desc */ (is_tso ? 1 : 0); /* tso_ctx_desc */ if (__predict_false( !gve_tx_has_desc_room_dqo(tx, total_descs_needed))) { err = ENOBUFS; goto abort_with_dma; } err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); if (err) goto abort_with_dma; bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE); for (i = 0; i < nsegs; i++) { gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, segs[i].ds_len, segs[i].ds_addr, completion_tag, /*eop=*/i == (nsegs - 1), has_csum_flag); } /* Remember the index of the last desc written */ tx->dqo.desc_tail = desc_idx; /* * Request a descriptor completion on the last descriptor of the * packet if we are allowed to by the HW enforced interval. */ gve_tx_request_desc_compl(tx, desc_idx); tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ return (0); abort_with_dma: gve_unmap_packet(tx, pkt); abort: pkt->mbuf = NULL; gve_free_pending_packet(tx, pkt); return (err); } static void gve_reap_qpl_bufs_dqo(struct gve_tx_ring *tx, struct gve_tx_pending_pkt_dqo *pkt) { int32_t buf = pkt->qpl_buf_head; struct gve_dma_handle *dma; int32_t qpl_buf_tail; int32_t old_head; int i; for (i = 0; i < pkt->num_qpl_bufs; i++) { dma = gve_get_page_dma_handle(tx, buf); bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTWRITE); qpl_buf_tail = buf; buf = tx->dqo.qpl_bufs[buf]; } MPASS(buf == -1); buf = qpl_buf_tail; while (true) { old_head = atomic_load_32(&tx->dqo.free_qpl_bufs_prd); tx->dqo.qpl_bufs[buf] = old_head; /* * The "rel" ensures that the update to dqo.free_qpl_bufs_prd * is visible only after the linked list from this pkt is * attached above to old_head. */ if (atomic_cmpset_rel_32(&tx->dqo.free_qpl_bufs_prd, old_head, pkt->qpl_buf_head)) break; } /* * The "rel" ensures that the update to dqo.qpl_bufs_produced is * visible only after the update to dqo.free_qpl_bufs_prd above. */ atomic_add_rel_32(&tx->dqo.qpl_bufs_produced, pkt->num_qpl_bufs); gve_clear_qpl_pending_pkt(pkt); } static uint64_t gve_handle_packet_completion(struct gve_priv *priv, struct gve_tx_ring *tx, uint16_t compl_tag) { struct gve_tx_pending_pkt_dqo *pending_pkt; int32_t pkt_len; if (__predict_false(compl_tag >= tx->dqo.num_pending_pkts)) { device_printf(priv->dev, "Invalid TX completion tag: %d\n", compl_tag); return (0); } pending_pkt = &tx->dqo.pending_pkts[compl_tag]; /* Packet is allocated but not pending data completion.
*/ if (__predict_false(pending_pkt->state != GVE_PACKET_STATE_PENDING_DATA_COMPL)) { device_printf(priv->dev, "No pending data completion: %d\n", compl_tag); return (0); } pkt_len = pending_pkt->mbuf->m_pkthdr.len; if (gve_is_qpl(priv)) gve_reap_qpl_bufs_dqo(tx, pending_pkt); else gve_unmap_packet(tx, pending_pkt); m_freem(pending_pkt->mbuf); pending_pkt->mbuf = NULL; gve_free_pending_packet(tx, pending_pkt); return (pkt_len); } int gve_tx_intr_dqo(void *arg) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; struct gve_ring_com *com = &tx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); /* Interrupts are automatically masked */ taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); return (FILTER_HANDLED); } static void gve_tx_clear_desc_ring_dqo(struct gve_tx_ring *tx) { struct gve_ring_com *com = &tx->com; int i; for (i = 0; i < com->priv->tx_desc_cnt; i++) tx->dqo.desc_ring[i] = (union gve_tx_desc_dqo){}; bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_tx_clear_compl_ring_dqo(struct gve_tx_ring *tx) { struct gve_ring_com *com = &tx->com; int entries; int i; entries = com->priv->tx_desc_cnt; for (i = 0; i < entries; i++) tx->dqo.compl_ring[i] = (struct gve_tx_compl_desc_dqo){}; bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map, BUS_DMASYNC_PREWRITE); } void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; int j; tx->dqo.desc_head = 0; tx->dqo.desc_tail = 0; tx->dqo.desc_mask = priv->tx_desc_cnt - 1; tx->dqo.last_re_idx = 0; tx->dqo.compl_head = 0; tx->dqo.compl_mask = priv->tx_desc_cnt - 1; atomic_store_32(&tx->dqo.hw_tx_head, 0); tx->dqo.cur_gen_bit = 0; gve_free_tx_mbufs_dqo(tx); for (j = 0; j < tx->dqo.num_pending_pkts; j++) { if (gve_is_qpl(tx->com.priv)) gve_clear_qpl_pending_pkt(&tx->dqo.pending_pkts[j]); tx->dqo.pending_pkts[j].next = (j == tx->dqo.num_pending_pkts - 1) ? -1 : j + 1; tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; } tx->dqo.free_pending_pkts_csm = 0; atomic_store_rel_32(&tx->dqo.free_pending_pkts_prd, -1); if (gve_is_qpl(priv)) { int qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * tx->com.qpl->num_pages; for (j = 0; j < qpl_buf_cnt - 1; j++) tx->dqo.qpl_bufs[j] = j + 1; tx->dqo.qpl_bufs[j] = -1; tx->dqo.free_qpl_bufs_csm = 0; atomic_store_32(&tx->dqo.free_qpl_bufs_prd, -1); atomic_store_32(&tx->dqo.qpl_bufs_produced, qpl_buf_cnt); tx->dqo.qpl_bufs_produced_cached = qpl_buf_cnt; tx->dqo.qpl_bufs_consumed = 0; } gve_tx_clear_desc_ring_dqo(tx); gve_tx_clear_compl_ring_dqo(tx); } static bool gve_tx_cleanup_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, int budget) { struct gve_tx_compl_desc_dqo *compl_desc; uint64_t bytes_done = 0; uint64_t pkts_done = 0; uint16_t compl_tag; int work_done = 0; uint16_t tx_head; uint16_t type; while (work_done < budget) { bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map, BUS_DMASYNC_POSTREAD); compl_desc = &tx->dqo.compl_ring[tx->dqo.compl_head]; if (compl_desc->generation == tx->dqo.cur_gen_bit) break; /* * Prevent generation bit from being read after the rest of the * descriptor. 
*/ atomic_thread_fence_acq(); type = compl_desc->type; if (type == GVE_COMPL_TYPE_DQO_DESC) { /* This is the last descriptor fetched by HW plus one */ tx_head = le16toh(compl_desc->tx_head); atomic_store_rel_32(&tx->dqo.hw_tx_head, tx_head); } else if (type == GVE_COMPL_TYPE_DQO_PKT) { compl_tag = le16toh(compl_desc->completion_tag); bytes_done += gve_handle_packet_completion(priv, tx, compl_tag); pkts_done++; } tx->dqo.compl_head = (tx->dqo.compl_head + 1) & tx->dqo.compl_mask; /* Flip the generation bit when we wrap around */ tx->dqo.cur_gen_bit ^= tx->dqo.compl_head == 0; work_done++; } /* * Waking the xmit taskqueue has to occur after room has been made in * the queue. */ atomic_thread_fence_seq_cst(); if (atomic_load_bool(&tx->stopped) && work_done) { atomic_store_bool(&tx->stopped, false); taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); } tx->done += work_done; /* tx->done is just a sysctl counter */ counter_enter(); counter_u64_add_protected(tx->stats.tbytes, bytes_done); counter_u64_add_protected(tx->stats.tpackets, pkts_done); counter_exit(); return (work_done == budget); } void gve_tx_cleanup_tq_dqo(void *arg, int pending) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; if (gve_tx_cleanup_dqo(priv, tx, /*budget=*/1024)) { taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); return; } gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset, GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); }
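For reference, the split free-list scheme used above for pending packets (free_pending_pkts_csm/free_pending_pkts_prd) and for QPL buffers (free_qpl_bufs_csm/free_qpl_bufs_prd) reduces to the minimal sketch below. It is illustrative only, not driver code: it assumes a single consumer (the transmit path, serialized by the ring lock), uses C11 atomics in place of the FreeBSD machine/atomic.h primitives, and the names freelist, freelist_pop, freelist_push, and ENTRY_END are invented for the example.

#include <stdatomic.h>
#include <stdint.h>

#define ENTRY_END (-1)

/*
 * next[i] holds the index of the entry following i, with ENTRY_END
 * terminating a list. consumer_head is touched only by the single
 * consumer; producer_head is shared and updated atomically.
 */
struct freelist {
	int32_t *next;
	int32_t consumer_head;
	_Atomic int32_t producer_head;
};

/* Consumer: pop one entry, stealing the whole producer list when empty. */
int32_t
freelist_pop(struct freelist *fl)
{
	int32_t idx = fl->consumer_head;

	if (idx == ENTRY_END) {
		/* One atomic swap takes everything the producer has freed. */
		idx = atomic_exchange(&fl->producer_head, ENTRY_END);
		if (idx == ENTRY_END)
			return (ENTRY_END);	/* genuinely empty right now */
	}
	fl->consumer_head = fl->next[idx];
	return (idx);
}

/* Producer: push a freed entry onto the shared list with a CAS loop. */
void
freelist_push(struct freelist *fl, int32_t idx)
{
	int32_t old_head = atomic_load_explicit(&fl->producer_head,
	    memory_order_acquire);

	do {
		/* Link before publishing so the consumer sees a valid list. */
		fl->next[idx] = old_head;
	} while (!atomic_compare_exchange_weak_explicit(&fl->producer_head,
	    &old_head, idx, memory_order_release, memory_order_relaxed));
}

As in gve_alloc_pending_packet and gve_reap_qpl_bufs_dqo, the consumer can observe an empty list even though the producer has just pushed entries; the driver treats that as a transient out-of-space condition and relies on the cleanup taskqueue waking the xmit taskqueue to retry.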