diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h index 43082d64ba95..92ab6838d5bb 100644 --- a/sys/dev/gve/gve.h +++ b/sys/dev/gve/gve.h @@ -1,700 +1,701 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _GVE_FBSD_H #define _GVE_FBSD_H #include "gve_desc.h" #include "gve_plat.h" #include "gve_register.h" #ifndef PCI_VENDOR_ID_GOOGLE #define PCI_VENDOR_ID_GOOGLE 0x1ae0 #endif #define PCI_DEV_ID_GVNIC 0x0042 #define GVE_REGISTER_BAR 0 #define GVE_DOORBELL_BAR 2 /* Driver can alloc up to 2 segments for the header and 2 for the payload. */ #define GVE_TX_MAX_DESCS 4 #define GVE_TX_BUFRING_ENTRIES 4096 #define ADMINQ_SIZE PAGE_SIZE #define GVE_DEFAULT_RX_BUFFER_SIZE 2048 /* Each RX bounce buffer page can fit two packet buffers. */ #define GVE_DEFAULT_RX_BUFFER_OFFSET (PAGE_SIZE / 2) /* PTYPEs are always 10 bits. */ #define GVE_NUM_PTYPES 1024 /* * Number of descriptors per queue page list. * Page count AKA QPL size can be derived by dividing the number of elements in * a page by the number of descriptors available. 
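For orientation (the sizing code itself lives in gve_qpl.c, which this diff does not touch): GVE_QPL_DIVISOR relates a queue's descriptor count to its bounce-buffer page count, and GVE_DEFAULT_RX_BUFFER_OFFSET encodes that each RX bounce page is split into two GVE_DEFAULT_RX_BUFFER_SIZE buffers. The helper below is an assumption about that arithmetic, not the driver's code.

/* Assumed sizing rule; the authoritative version is in gve_qpl.c, outside this diff. */
static uint32_t
example_tx_qpl_num_pages(uint16_t tx_desc_cnt)
{
	/* e.g. 512 descriptors / GVE_QPL_DIVISOR (16) => a 32-page TX bounce buffer */
	return (tx_desc_cnt / GVE_QPL_DIVISOR);
}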
*/ #define GVE_QPL_DIVISOR 16 static MALLOC_DEFINE(M_GVE, "gve", "gve allocations"); struct gve_dma_handle { bus_addr_t bus_addr; void *cpu_addr; bus_dma_tag_t tag; bus_dmamap_t map; }; union gve_tx_desc { struct gve_tx_pkt_desc pkt; /* first desc for a packet */ struct gve_tx_mtd_desc mtd; /* optional metadata descriptor */ struct gve_tx_seg_desc seg; /* subsequent descs for a packet */ }; /* Tracks the memory in the fifo occupied by a segment of a packet */ struct gve_tx_iovec { uint32_t iov_offset; /* offset into this segment */ uint32_t iov_len; /* length */ uint32_t iov_padding; /* padding associated with this segment */ }; /* Tracks allowed and current queue settings */ struct gve_queue_config { uint16_t max_queues; uint16_t num_queues; /* current */ }; struct gve_irq_db { __be32 index; } __aligned(CACHE_LINE_SIZE); /* * GVE_QUEUE_FORMAT_UNSPECIFIED must be zero since 0 is the default value * when the entire configure_device_resources command is zeroed out and the * queue_format is not specified. */ enum gve_queue_format { GVE_QUEUE_FORMAT_UNSPECIFIED = 0x0, GVE_GQI_RDA_FORMAT = 0x1, GVE_GQI_QPL_FORMAT = 0x2, GVE_DQO_RDA_FORMAT = 0x3, GVE_DQO_QPL_FORMAT = 0x4, }; enum gve_state_flags_bit { GVE_STATE_FLAG_ADMINQ_OK, GVE_STATE_FLAG_RESOURCES_OK, GVE_STATE_FLAG_QPLREG_OK, GVE_STATE_FLAG_RX_RINGS_OK, GVE_STATE_FLAG_TX_RINGS_OK, GVE_STATE_FLAG_QUEUES_UP, GVE_STATE_FLAG_LINK_UP, GVE_STATE_FLAG_DO_RESET, GVE_STATE_FLAG_IN_RESET, GVE_NUM_STATE_FLAGS /* Not part of the enum space */ }; BITSET_DEFINE(gve_state_flags, GVE_NUM_STATE_FLAGS); #define GVE_DEVICE_STATUS_RESET (0x1 << 1) #define GVE_DEVICE_STATUS_LINK_STATUS (0x1 << 2) #define GVE_RING_LOCK(ring) mtx_lock(&(ring)->ring_mtx) #define GVE_RING_TRYLOCK(ring) mtx_trylock(&(ring)->ring_mtx) #define GVE_RING_UNLOCK(ring) mtx_unlock(&(ring)->ring_mtx) #define GVE_RING_ASSERT(ring) mtx_assert(&(ring)->ring_mtx, MA_OWNED) #define GVE_IFACE_LOCK_INIT(lock) sx_init(&lock, "gve interface lock") #define GVE_IFACE_LOCK_DESTROY(lock) sx_destroy(&lock) #define GVE_IFACE_LOCK_LOCK(lock) sx_xlock(&lock) #define GVE_IFACE_LOCK_UNLOCK(lock) sx_unlock(&lock) #define GVE_IFACE_LOCK_ASSERT(lock) sx_assert(&lock, SA_XLOCKED) struct gve_queue_page_list { uint32_t id; uint32_t num_dmas; uint32_t num_pages; vm_offset_t kva; vm_page_t *pages; struct gve_dma_handle *dmas; }; struct gve_irq { struct resource *res; void *cookie; }; struct gve_rx_slot_page_info { void *page_address; vm_page_t page; uint32_t page_offset; uint16_t pad; }; /* * A single received packet split across multiple buffers may be * reconstructed using the information in this structure. */ struct gve_rx_ctx { /* head and tail of mbuf chain for the current packet */ struct mbuf *mbuf_head; struct mbuf *mbuf_tail; uint32_t total_size; uint8_t frag_cnt; bool is_tcp; bool drop_pkt; }; struct gve_ring_com { struct gve_priv *priv; uint32_t id; /* * BAR2 offset for this ring's doorbell and the * counter-array offset for this ring's counter. * Acquired from the device individually for each * queue in the queue_create adminq command. */ struct gve_queue_resources *q_resources; struct gve_dma_handle q_resources_mem; /* Byte offset into BAR2 where this ring's 4-byte irq doorbell lies. */ uint32_t irq_db_offset; /* Byte offset into BAR2 where this ring's 4-byte doorbell lies. */ uint32_t db_offset; /* * Index, not byte-offset, into the counter array where this ring's * 4-byte counter lies. */ uint32_t counter_idx; /* * The index of the MSIX vector that was assigned to * this ring in `gve_alloc_irqs`. 
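For orientation on the two BAR2 offsets described above: gve_create_tx_rings() later in this diff derives them from device-provided indices (4 bytes per doorbell, big-endian), after which the hot path only needs a single 4-byte write to kick the NIC. The helper below is a sketch; the real doorbell writes live in code outside this hunk.

/* Sketch of a GQI transmit doorbell kick; not the driver's exact call site. */
static void
example_kick_tx(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	/* Make the freshly written descriptors visible before ringing the doorbell. */
	bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map,
	    BUS_DMASYNC_PREWRITE);
	gve_db_bar_write_4(priv, tx->com.db_offset, htobe32(tx->req));
}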
* * It is passed to the device in the queue_create adminq * command. * * Additionally, this also serves as the index into * `priv->irq_db_indices` where this ring's irq doorbell's * BAR2 offset, `irq_db_idx`, can be found. */ int ntfy_id; /* * The fixed bounce buffer for this ring. * Once allocated, has to be offered to the device * over the register-page-list adminq command. */ struct gve_queue_page_list *qpl; struct task cleanup_task; struct taskqueue *cleanup_tq; } __aligned(CACHE_LINE_SIZE); struct gve_rxq_stats { counter_u64_t rbytes; counter_u64_t rpackets; counter_u64_t rx_dropped_pkt; counter_u64_t rx_copybreak_cnt; counter_u64_t rx_frag_flip_cnt; counter_u64_t rx_frag_copy_cnt; counter_u64_t rx_dropped_pkt_desc_err; counter_u64_t rx_dropped_pkt_buf_post_fail; counter_u64_t rx_dropped_pkt_mbuf_alloc_fail; counter_u64_t rx_mbuf_dmamap_err; counter_u64_t rx_mbuf_mclget_null; }; #define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t)) union gve_rx_qpl_buf_id_dqo { struct { uint16_t buf_id:11; /* Index into rx->dqo.bufs */ uint8_t frag_num:5; /* Which frag in the QPL page */ }; uint16_t all; } __packed; _Static_assert(sizeof(union gve_rx_qpl_buf_id_dqo) == 2, "gve: bad dqo qpl rx buf id length"); struct gve_rx_buf_dqo { union { /* RDA */ struct { struct mbuf *mbuf; bus_dmamap_t dmamap; uint64_t addr; bool mapped; }; /* QPL */ struct { uint8_t num_nic_frags; /* number of pending completions */ uint8_t next_idx; /* index of the next frag to post */ /* for chaining rx->dqo.used_bufs */ STAILQ_ENTRY(gve_rx_buf_dqo) stailq_entry; }; }; /* for chaining rx->dqo.free_bufs */ SLIST_ENTRY(gve_rx_buf_dqo) slist_entry; }; /* power-of-2 sized receive ring */ struct gve_rx_ring { struct gve_ring_com com; struct gve_dma_handle desc_ring_mem; uint32_t cnt; /* free-running total number of completed packets */ uint32_t fill_cnt; /* free-running total number of descs and buffs posted */ union { /* GQI-only fields */ struct { struct gve_dma_handle data_ring_mem; /* accessed in the GQ receive hot path */ struct gve_rx_desc *desc_ring; union gve_rx_data_slot *data_ring; struct gve_rx_slot_page_info *page_info; uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */ uint8_t seq_no; /* helps traverse the descriptor ring */ }; /* DQO-only fields */ struct { struct gve_dma_handle compl_ring_mem; struct gve_rx_compl_desc_dqo *compl_ring; struct gve_rx_desc_dqo *desc_ring; struct gve_rx_buf_dqo *bufs; /* Parking place for posted buffers */ bus_dma_tag_t buf_dmatag; /* To dmamap posted mbufs with */ uint32_t buf_cnt; /* Size of the bufs array */ uint32_t mask; /* One less than the sizes of the desc and compl rings */ uint32_t head; /* The index at which to post the next buffer at */ uint32_t tail; /* The index at which to receive the next compl at */ uint8_t cur_gen_bit; /* Gets flipped on every cycle of the compl ring */ SLIST_HEAD(, gve_rx_buf_dqo) free_bufs; /* * Only used in QPL mode. Pages refered to by if_input-ed mbufs * stay parked here till their wire count comes back to 1. * Pages are moved here after there aren't any pending completions. */ STAILQ_HEAD(, gve_rx_buf_dqo) used_bufs; } dqo; }; struct lro_ctrl lro; struct gve_rx_ctx ctx; struct gve_rxq_stats stats; } __aligned(CACHE_LINE_SIZE); /* * A contiguous representation of the pages composing the Tx bounce buffer. * The xmit taskqueue and the completion taskqueue both simultaneously use it. * Both operate on `available`: the xmit tq lowers it and the completion tq * raises it. 
`head` is the last location written at and so only the xmit tq * uses it. */ struct gve_tx_fifo { vm_offset_t base; /* address of base of FIFO */ uint32_t size; /* total size */ volatile int available; /* how much space is still available */ uint32_t head; /* offset to write at */ }; struct gve_tx_buffer_state { struct mbuf *mbuf; struct gve_tx_iovec iov[GVE_TX_MAX_DESCS]; }; struct gve_txq_stats { counter_u64_t tbytes; counter_u64_t tpackets; counter_u64_t tso_packet_cnt; counter_u64_t tx_dropped_pkt; - counter_u64_t tx_dropped_pkt_nospace_device; + counter_u64_t tx_delayed_pkt_nospace_device; counter_u64_t tx_dropped_pkt_nospace_bufring; counter_u64_t tx_delayed_pkt_nospace_descring; counter_u64_t tx_delayed_pkt_nospace_compring; counter_u64_t tx_delayed_pkt_nospace_qpl_bufs; counter_u64_t tx_delayed_pkt_tsoerr; counter_u64_t tx_dropped_pkt_vlan; counter_u64_t tx_mbuf_collapse; counter_u64_t tx_mbuf_defrag; counter_u64_t tx_mbuf_defrag_err; counter_u64_t tx_mbuf_dmamap_enomem_err; counter_u64_t tx_mbuf_dmamap_err; }; #define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t)) struct gve_tx_pending_pkt_dqo { struct mbuf *mbuf; union { /* RDA */ bus_dmamap_t dmamap; /* QPL */ struct { /* * A linked list of entries from qpl_bufs that served * as the bounce buffer for this packet. */ int32_t qpl_buf_head; uint32_t num_qpl_bufs; }; }; uint8_t state; /* the gve_packet_state enum */ int next; /* To chain the free_pending_pkts lists */ }; /* power-of-2 sized transmit ring */ struct gve_tx_ring { struct gve_ring_com com; struct gve_dma_handle desc_ring_mem; struct task xmit_task; struct taskqueue *xmit_tq; + bool stopped; /* Accessed when writing descriptors */ struct buf_ring *br; struct mtx ring_mtx; uint32_t req; /* free-running total number of packets written to the nic */ uint32_t done; /* free-running total number of completed packets */ union { /* GQI specific stuff */ struct { union gve_tx_desc *desc_ring; struct gve_tx_buffer_state *info; struct gve_tx_fifo fifo; uint32_t mask; /* masks the req and done to the size of the ring */ }; /* DQO specific stuff */ struct { struct gve_dma_handle compl_ring_mem; /* Accessed when writing descriptors */ struct { union gve_tx_desc_dqo *desc_ring; uint32_t desc_mask; /* masks head and tail to the size of desc_ring */ uint32_t desc_head; /* last desc read by NIC, cached value of hw_tx_head */ uint32_t desc_tail; /* last desc written by driver */ uint32_t last_re_idx; /* desc which last had "report event" set */ /* * The head index of a singly linked list containing pending packet objects * to park mbufs till the NIC sends completions. Once this list is depleted, * the "_prd" suffixed producer list, grown by the completion taskqueue, * is stolen. */ int32_t free_pending_pkts_csm; /* * The head index of a singly linked list representing QPL page fragments * to copy mbuf payload into for the NIC to see. Once this list is depleted, * the "_prd" suffixed producer list, grown by the completion taskqueue, * is stolen. * * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist. */ int32_t free_qpl_bufs_csm; uint32_t qpl_bufs_consumed; /* Allows quickly checking for buf availability */ uint32_t qpl_bufs_produced_cached; /* Cached value of qpl_bufs_produced */ /* DMA params for mapping Tx mbufs. Only used in RDA mode. 
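The gve_tx_fifo comment above describes split ownership: only the xmit taskqueue lowers `available` (and advances `head`), and only the completion taskqueue raises it, so a check-then-subtract is safe on the xmit side without a lock. A minimal sketch with invented helper names (the driver's real FIFO helpers are elsewhere in gve_tx.c):

/* Invented names; sketch of the lock-free `available` accounting described above. */
static bool
example_fifo_try_reserve(struct gve_tx_fifo *fifo, int bytes)
{
	/* Only the xmit taskqueue consumes space, so this cannot race another consumer. */
	if (atomic_load_int(&fifo->available) < bytes)
		return (false);
	atomic_add_int(&fifo->available, -bytes);
	return (true);
}

static void
example_fifo_release(struct gve_tx_fifo *fifo, int bytes)
{
	/* Completion taskqueue side: hand space back once the NIC is done with it. */
	atomic_add_int(&fifo->available, bytes);
}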
*/ bus_dma_tag_t buf_dmatag; } __aligned(CACHE_LINE_SIZE); /* Accessed when processing completions */ struct { struct gve_tx_compl_desc_dqo *compl_ring; uint32_t compl_mask; /* masks head to the size of compl_ring */ uint32_t compl_head; /* last completion read by driver */ uint8_t cur_gen_bit; /* NIC flips a bit on every pass */ uint32_t hw_tx_head; /* last desc read by NIC */ /* * The completion taskqueue moves pending-packet objects to this * list after freeing the mbuf. The "_prd" denotes that this is * a producer list. The trasnmit taskqueue steals this list once * its consumer list, with the "_csm" suffix, is depleted. */ int32_t free_pending_pkts_prd; /* * The completion taskqueue moves the QPL pages corresponding to a * completed packet into this list. It is only used in QPL mode. * The "_prd" denotes that this is a producer list. The trasnmit * taskqueue steals this list once its consumer list, with the "_csm" * suffix, is depleted. * * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist. */ int32_t free_qpl_bufs_prd; uint32_t qpl_bufs_produced; } __aligned(CACHE_LINE_SIZE); /* Accessed by both the completion and xmit loops */ struct { /* completion tags index into this array */ struct gve_tx_pending_pkt_dqo *pending_pkts; uint16_t num_pending_pkts; /* * Represents QPL page fragments. An index into this array * always represents the same QPL page fragment. The value * is also an index into this array and servers as a means * to chain buffers into linked lists whose heads are * either free_qpl_bufs_prd or free_qpl_bufs_csm or * qpl_bufs_head. */ int32_t *qpl_bufs; } __aligned(CACHE_LINE_SIZE); } dqo; }; struct gve_txq_stats stats; } __aligned(CACHE_LINE_SIZE); enum gve_packet_state { /* * Packet does not yet have a dmamap created. * This should always be zero since state is not explicitly initialized. */ GVE_PACKET_STATE_UNALLOCATED, /* Packet has a dmamap and is in free list, available to be allocated. 
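The "_csm"/"_prd" comments above describe a single-producer, single-consumer hand-off: the completion taskqueue grows a producer list, and the xmit taskqueue steals the whole list in one atomic swap when its consumer list runs dry. The real code is in gve_tx_dqo.c (outside this hunk); the helper below is an invented-name sketch that assumes -1 marks an empty list.

/* Invented helper; -1 as the empty-list sentinel is an assumption. */
static int32_t
example_refill_csm_from_prd(int32_t *csm_head, volatile int32_t *prd_head)
{
	if (*csm_head != -1)
		return (*csm_head);
	/*
	 * Only the xmit taskqueue touches the consumer head, so a single
	 * atomic swap against the completion taskqueue's producer head
	 * suffices. The heads are int32_t because atomic_swap_16() does
	 * not exist.
	 */
	*csm_head = (int32_t)atomic_swap_32(
	    (volatile uint32_t *)prd_head, (uint32_t)-1);
	return (*csm_head);
}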
*/ GVE_PACKET_STATE_FREE, /* Packet is expecting a regular data completion */ GVE_PACKET_STATE_PENDING_DATA_COMPL, }; struct gve_ptype { uint8_t l3_type; /* `gve_l3_type` in gve_adminq.h */ uint8_t l4_type; /* `gve_l4_type` in gve_adminq.h */ }; struct gve_ptype_lut { struct gve_ptype ptypes[GVE_NUM_PTYPES]; }; struct gve_priv { if_t ifp; device_t dev; struct ifmedia media; uint8_t mac[ETHER_ADDR_LEN]; struct gve_dma_handle aq_mem; struct resource *reg_bar; /* BAR0 */ struct resource *db_bar; /* BAR2 */ struct resource *msix_table; uint32_t mgmt_msix_idx; uint32_t rx_copybreak; uint16_t num_event_counters; uint16_t default_num_queues; uint16_t tx_desc_cnt; uint16_t rx_desc_cnt; uint16_t rx_pages_per_qpl; uint64_t max_registered_pages; uint64_t num_registered_pages; uint32_t supported_features; uint16_t max_mtu; struct gve_dma_handle counter_array_mem; __be32 *counters; struct gve_dma_handle irqs_db_mem; struct gve_irq_db *irq_db_indices; enum gve_queue_format queue_format; struct gve_queue_page_list *qpls; struct gve_queue_config tx_cfg; struct gve_queue_config rx_cfg; uint32_t num_queues; struct gve_irq *irq_tbl; struct gve_tx_ring *tx; struct gve_rx_ring *rx; struct gve_ptype_lut *ptype_lut_dqo; /* * Admin queue - see gve_adminq.h * Since AQ cmds do not run in steady state, 32 bit counters suffice */ struct gve_adminq_command *adminq; vm_paddr_t adminq_bus_addr; uint32_t adminq_mask; /* masks prod_cnt to adminq size */ uint32_t adminq_prod_cnt; /* free-running count of AQ cmds executed */ uint32_t adminq_cmd_fail; /* free-running count of AQ cmds failed */ uint32_t adminq_timeouts; /* free-running count of AQ cmds timeouts */ /* free-running count of each distinct AQ cmd executed */ uint32_t adminq_describe_device_cnt; uint32_t adminq_cfg_device_resources_cnt; uint32_t adminq_register_page_list_cnt; uint32_t adminq_unregister_page_list_cnt; uint32_t adminq_create_tx_queue_cnt; uint32_t adminq_create_rx_queue_cnt; uint32_t adminq_destroy_tx_queue_cnt; uint32_t adminq_destroy_rx_queue_cnt; uint32_t adminq_dcfg_device_resources_cnt; uint32_t adminq_set_driver_parameter_cnt; uint32_t adminq_verify_driver_compatibility_cnt; uint32_t adminq_get_ptype_map_cnt; uint32_t interface_up_cnt; uint32_t interface_down_cnt; uint32_t reset_cnt; struct task service_task; struct taskqueue *service_tq; struct gve_state_flags state_flags; struct sx gve_iface_lock; }; static inline bool gve_get_state_flag(struct gve_priv *priv, int pos) { return (BIT_ISSET(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags)); } static inline void gve_set_state_flag(struct gve_priv *priv, int pos) { BIT_SET_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); } static inline void gve_clear_state_flag(struct gve_priv *priv, int pos) { BIT_CLR_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); } static inline bool gve_is_gqi(struct gve_priv *priv) { return (priv->queue_format == GVE_GQI_QPL_FORMAT); } static inline bool gve_is_qpl(struct gve_priv *priv) { return (priv->queue_format == GVE_GQI_QPL_FORMAT || priv->queue_format == GVE_DQO_QPL_FORMAT); } /* Defined in gve_main.c */ void gve_schedule_reset(struct gve_priv *priv); /* Register access functions defined in gve_utils.c */ uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset); void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); void gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); /* QPL (Queue Page List) functions 
defined in gve_qpl.c */ int gve_alloc_qpls(struct gve_priv *priv); void gve_free_qpls(struct gve_priv *priv); int gve_register_qpls(struct gve_priv *priv); int gve_unregister_qpls(struct gve_priv *priv); void gve_mextadd_free(struct mbuf *mbuf); /* TX functions defined in gve_tx.c */ int gve_alloc_tx_rings(struct gve_priv *priv); void gve_free_tx_rings(struct gve_priv *priv); int gve_create_tx_rings(struct gve_priv *priv); int gve_destroy_tx_rings(struct gve_priv *priv); int gve_tx_intr(void *arg); int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf); void gve_qflush(if_t ifp); void gve_xmit_tq(void *arg, int pending); void gve_tx_cleanup_tq(void *arg, int pending); /* TX functions defined in gve_tx_dqo.c */ int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i); void gve_tx_free_ring_dqo(struct gve_priv *priv, int i); void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i); int gve_tx_intr_dqo(void *arg); int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr); int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf); void gve_tx_cleanup_tq_dqo(void *arg, int pending); /* RX functions defined in gve_rx.c */ int gve_alloc_rx_rings(struct gve_priv *priv); void gve_free_rx_rings(struct gve_priv *priv); int gve_create_rx_rings(struct gve_priv *priv); int gve_destroy_rx_rings(struct gve_priv *priv); int gve_rx_intr(void *arg); void gve_rx_cleanup_tq(void *arg, int pending); /* RX functions defined in gve_rx_dqo.c */ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i); void gve_rx_free_ring_dqo(struct gve_priv *priv, int i); void gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx); void gve_clear_rx_ring_dqo(struct gve_priv *priv, int i); int gve_rx_intr_dqo(void *arg); void gve_rx_cleanup_tq_dqo(void *arg, int pending); /* DMA functions defined in gve_utils.c */ int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma); void gve_dma_free_coherent(struct gve_dma_handle *dma); int gve_dmamap_create(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma); void gve_dmamap_destroy(struct gve_dma_handle *dma); /* IRQ functions defined in gve_utils.c */ void gve_free_irqs(struct gve_priv *priv); int gve_alloc_irqs(struct gve_priv *priv); void gve_unmask_all_queue_irqs(struct gve_priv *priv); void gve_mask_all_queue_irqs(struct gve_priv *priv); /* Systcl functions defined in gve_sysctl.c */ extern bool gve_disable_hw_lro; extern char gve_queue_format[8]; extern char gve_version[8]; void gve_setup_sysctl(struct gve_priv *priv); void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets, uint64_t *tbytes, uint64_t *tx_dropped_pkt); /* Stats functions defined in gve_utils.c */ void gve_alloc_counters(counter_u64_t *stat, int num_stats); void gve_free_counters(counter_u64_t *stat, int num_stats); #endif /* _GVE_FBSD_H_ */ diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c index b627c3f2b4ca..0e40656ca928 100644 --- a/sys/dev/gve/gve_main.c +++ b/sys/dev/gve/gve_main.c @@ -1,909 +1,909 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" -#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.0\n" +#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.1\n" #define GVE_VERSION_MAJOR 1 #define GVE_VERSION_MINOR 3 -#define GVE_VERSION_SUB 0 +#define GVE_VERSION_SUB 1 #define GVE_DEFAULT_RX_COPYBREAK 256 /* Devices supported by this driver. */ static struct gve_dev { uint16_t vendor_id; uint16_t device_id; const char *name; } gve_devs[] = { { PCI_VENDOR_ID_GOOGLE, PCI_DEV_ID_GVNIC, "gVNIC" } }; struct sx gve_global_lock; static int gve_verify_driver_compatibility(struct gve_priv *priv) { int err; struct gve_driver_info *driver_info; struct gve_dma_handle driver_info_mem; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_driver_info), PAGE_SIZE, &driver_info_mem); if (err != 0) return (ENOMEM); driver_info = driver_info_mem.cpu_addr; *driver_info = (struct gve_driver_info) { .os_type = 3, /* Freebsd */ .driver_major = GVE_VERSION_MAJOR, .driver_minor = GVE_VERSION_MINOR, .driver_sub = GVE_VERSION_SUB, .os_version_major = htobe32(FBSD_VERSION_MAJOR), .os_version_minor = htobe32(FBSD_VERSION_MINOR), .os_version_sub = htobe32(FBSD_VERSION_PATCH), .driver_capability_flags = { htobe64(GVE_DRIVER_CAPABILITY_FLAGS1), htobe64(GVE_DRIVER_CAPABILITY_FLAGS2), htobe64(GVE_DRIVER_CAPABILITY_FLAGS3), htobe64(GVE_DRIVER_CAPABILITY_FLAGS4), }, }; snprintf(driver_info->os_version_str1, sizeof(driver_info->os_version_str1), "FreeBSD %u", __FreeBSD_version); bus_dmamap_sync(driver_info_mem.tag, driver_info_mem.map, BUS_DMASYNC_PREREAD); err = gve_adminq_verify_driver_compatibility(priv, sizeof(struct gve_driver_info), driver_info_mem.bus_addr); /* It's ok if the device doesn't support this */ if (err == EOPNOTSUPP) err = 0; gve_dma_free_coherent(&driver_info_mem); return (err); } static int gve_up(struct gve_priv *priv) { if_t ifp = priv->ifp; int err; GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); if (device_is_attached(priv->dev) == 0) { device_printf(priv->dev, "Cannot bring the iface up when detached\n"); return (ENXIO); } if (gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return (0); if_clearhwassist(ifp); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, CSUM_TCP | CSUM_UDP, 0); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, CSUM_IP6_TCP | CSUM_IP6_UDP, 0); if (if_getcapenable(ifp) & IFCAP_TSO4) 
if_sethwassistbits(ifp, CSUM_IP_TSO, 0); if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); if (gve_is_qpl(priv)) { err = gve_register_qpls(priv); if (err != 0) goto reset; } err = gve_create_rx_rings(priv); if (err != 0) goto reset; err = gve_create_tx_rings(priv); if (err != 0) goto reset; if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); if (!gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(ifp, LINK_STATE_UP); gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } gve_unmask_all_queue_irqs(priv); gve_set_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_up_cnt++; return (0); reset: gve_schedule_reset(priv); return (err); } static void gve_down(struct gve_priv *priv) { GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return; if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); if (gve_destroy_rx_rings(priv) != 0) goto reset; if (gve_destroy_tx_rings(priv) != 0) goto reset; if (gve_is_qpl(priv)) { if (gve_unregister_qpls(priv) != 0) goto reset; } if (gve_is_gqi(priv)) gve_mask_all_queue_irqs(priv); gve_clear_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_down_cnt++; return; reset: gve_schedule_reset(priv); } static int gve_set_mtu(if_t ifp, uint32_t new_mtu) { struct gve_priv *priv = if_getsoftc(ifp); int err; if ((new_mtu > priv->max_mtu) || (new_mtu < ETHERMIN)) { device_printf(priv->dev, "Invalid new MTU setting. new mtu: %d max mtu: %d min mtu: %d\n", new_mtu, priv->max_mtu, ETHERMIN); return (EINVAL); } err = gve_adminq_set_mtu(priv, new_mtu); if (err == 0) { if (bootverbose) device_printf(priv->dev, "MTU set to %d\n", new_mtu); if_setmtu(ifp, new_mtu); } else { device_printf(priv->dev, "Failed to set MTU to %d\n", new_mtu); } return (err); } static void gve_init(void *arg) { struct gve_priv *priv = (struct gve_priv *)arg; if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } static int gve_ioctl(if_t ifp, u_long command, caddr_t data) { struct gve_priv *priv; struct ifreq *ifr; int rc = 0; priv = if_getsoftc(ifp); ifr = (struct ifreq *)data; switch (command) { case SIOCSIFMTU: if (if_getmtu(ifp) == ifr->ifr_mtu) break; GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); gve_set_mtu(ifp, ifr->ifr_mtu); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); break; case SIOCSIFFLAGS: if ((if_getflags(ifp) & IFF_UP) != 0) { if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } else { if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } break; case SIOCSIFCAP: if (ifr->ifr_reqcap == if_getcapenable(ifp)) break; GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); if_setcapenable(ifp, ifr->ifr_reqcap); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); break; case SIOCSIFMEDIA: /* FALLTHROUGH */ case SIOCGIFMEDIA: rc = ifmedia_ioctl(ifp, ifr, &priv->media, command); break; default: rc = ether_ioctl(ifp, command, data); break; } return (rc); } static int gve_media_change(if_t ifp) { struct gve_priv *priv = if_getsoftc(ifp); 
device_printf(priv->dev, "Media change not supported\n"); return (0); } static void gve_media_status(if_t ifp, struct ifmediareq *ifmr) { struct gve_priv *priv = if_getsoftc(ifp); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_AUTO; } else { ifmr->ifm_active |= IFM_NONE; } GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } static uint64_t gve_get_counter(if_t ifp, ift_counter cnt) { struct gve_priv *priv; uint64_t rpackets = 0; uint64_t tpackets = 0; uint64_t rbytes = 0; uint64_t tbytes = 0; uint64_t rx_dropped_pkt = 0; uint64_t tx_dropped_pkt = 0; priv = if_getsoftc(ifp); gve_accum_stats(priv, &rpackets, &rbytes, &rx_dropped_pkt, &tpackets, &tbytes, &tx_dropped_pkt); switch (cnt) { case IFCOUNTER_IPACKETS: return (rpackets); case IFCOUNTER_OPACKETS: return (tpackets); case IFCOUNTER_IBYTES: return (rbytes); case IFCOUNTER_OBYTES: return (tbytes); case IFCOUNTER_IQDROPS: return (rx_dropped_pkt); case IFCOUNTER_OQDROPS: return (tx_dropped_pkt); default: return (if_get_counter_default(ifp, cnt)); } } static void gve_setup_ifnet(device_t dev, struct gve_priv *priv) { int caps = 0; if_t ifp; ifp = priv->ifp = if_alloc(IFT_ETHER); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setsoftc(ifp, priv); if_setdev(ifp, dev); if_setinitfn(ifp, gve_init); if_setioctlfn(ifp, gve_ioctl); if_settransmitfn(ifp, gve_xmit_ifp); if_setqflushfn(ifp, gve_qflush); /* * Set TSO limits, must match the arguments to bus_dma_tag_create * when creating tx->dqo.buf_dmatag. Only applies to the RDA mode * because in QPL we copy the entire pakcet into the bounce buffer * and thus it does not matter how fragmented the mbuf is. 
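The comment above ties these TSO limits to the DMA tag used to map TX mbufs in RDA mode; that tag is created in gve_tx_dqo.c, which is not part of this diff. The sketch below only illustrates the stated constraint, namely that nsegments and maxsegsz passed to bus_dma_tag_create() must match the values advertised via if_sethwtsomaxsegcount() and if_sethwtsomaxsegsize(); the alignment, boundary, and flag choices here are placeholders.

/* Illustration of the constraint only; the real tag creation may differ. */
static int
example_create_tx_buf_dmatag(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	return (bus_dma_tag_create(
	    bus_get_dma_tag(priv->dev),		/* parent */
	    1, 0,				/* alignment, boundary */
	    BUS_SPACE_MAXADDR,			/* lowaddr */
	    BUS_SPACE_MAXADDR,			/* highaddr */
	    NULL, NULL,				/* filter, filterarg */
	    GVE_TSO_MAXSIZE_DQO,		/* maxsize */
	    GVE_TX_MAX_DATA_DESCS_DQO,		/* nsegments == hw tso maxsegcount */
	    GVE_TX_MAX_BUF_SIZE_DQO,		/* maxsegsz == hw tso maxsegsize */
	    0,					/* flags */
	    NULL, NULL,				/* lockfunc, lockfuncarg */
	    &tx->dqo.buf_dmatag));
}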
*/ if (!gve_is_gqi(priv) && !gve_is_qpl(priv)) { if_sethwtsomaxsegcount(ifp, GVE_TX_MAX_DATA_DESCS_DQO); if_sethwtsomaxsegsize(ifp, GVE_TX_MAX_BUF_SIZE_DQO); } if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO); #if __FreeBSD_version >= 1400086 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); #else if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | IFF_KNOWSEPOCH); #endif ifmedia_init(&priv->media, IFM_IMASK, gve_media_change, gve_media_status); if_setgetcounterfn(ifp, gve_get_counter); caps = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6 | IFCAP_TSO | IFCAP_LRO; if ((priv->supported_features & GVE_SUP_JUMBO_FRAMES_MASK) != 0) caps |= IFCAP_JUMBO_MTU; if_setcapabilities(ifp, caps); if_setcapenable(ifp, caps); if (bootverbose) device_printf(priv->dev, "Setting initial MTU to %d\n", priv->max_mtu); if_setmtu(ifp, priv->max_mtu); ether_ifattach(ifp, priv->mac); ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO); } static int gve_alloc_counter_array(struct gve_priv *priv) { int err; err = gve_dma_alloc_coherent(priv, sizeof(uint32_t) * priv->num_event_counters, PAGE_SIZE, &priv->counter_array_mem); if (err != 0) return (err); priv->counters = priv->counter_array_mem.cpu_addr; return (0); } static void gve_free_counter_array(struct gve_priv *priv) { if (priv->counters != NULL) gve_dma_free_coherent(&priv->counter_array_mem); priv->counter_array_mem = (struct gve_dma_handle){}; } static int gve_alloc_irq_db_array(struct gve_priv *priv) { int err; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_irq_db) * (priv->num_queues), PAGE_SIZE, &priv->irqs_db_mem); if (err != 0) return (err); priv->irq_db_indices = priv->irqs_db_mem.cpu_addr; return (0); } static void gve_free_irq_db_array(struct gve_priv *priv) { if (priv->irq_db_indices != NULL) gve_dma_free_coherent(&priv->irqs_db_mem); priv->irqs_db_mem = (struct gve_dma_handle){}; } static void gve_free_rings(struct gve_priv *priv) { gve_free_irqs(priv); gve_free_tx_rings(priv); gve_free_rx_rings(priv); if (gve_is_qpl(priv)) gve_free_qpls(priv); } static int gve_alloc_rings(struct gve_priv *priv) { int err; if (gve_is_qpl(priv)) { err = gve_alloc_qpls(priv); if (err != 0) goto abort; } err = gve_alloc_rx_rings(priv); if (err != 0) goto abort; err = gve_alloc_tx_rings(priv); if (err != 0) goto abort; err = gve_alloc_irqs(priv); if (err != 0) goto abort; return (0); abort: gve_free_rings(priv); return (err); } static void gve_deconfigure_resources(struct gve_priv *priv) { int err; if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) { err = gve_adminq_deconfigure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to deconfigure device resources: err=%d\n", err); return; } if (bootverbose) device_printf(priv->dev, "Deconfigured device resources\n"); gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); } gve_free_irq_db_array(priv); gve_free_counter_array(priv); if (priv->ptype_lut_dqo) { free(priv->ptype_lut_dqo, M_GVE); priv->ptype_lut_dqo = NULL; } } static int gve_configure_resources(struct gve_priv *priv) { int err; if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) return (0); err = gve_alloc_counter_array(priv); if (err != 0) return (err); err = gve_alloc_irq_db_array(priv); if (err != 0) goto abort; err = gve_adminq_configure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to configure device resources: err=%d\n", err); err = (ENXIO); goto abort; } if (!gve_is_gqi(priv)) { priv->ptype_lut_dqo = 
malloc(sizeof(*priv->ptype_lut_dqo), M_GVE, M_WAITOK | M_ZERO); err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); if (err != 0) { device_printf(priv->dev, "Failed to configure ptype lut: err=%d\n", err); goto abort; } } gve_set_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); if (bootverbose) device_printf(priv->dev, "Configured device resources\n"); return (0); abort: gve_deconfigure_resources(priv); return (err); } static void gve_set_queue_cnts(struct gve_priv *priv) { priv->tx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_TX_QUEUES); priv->rx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_RX_QUEUES); priv->tx_cfg.num_queues = priv->tx_cfg.max_queues; priv->rx_cfg.num_queues = priv->rx_cfg.max_queues; if (priv->default_num_queues > 0) { priv->tx_cfg.num_queues = MIN(priv->default_num_queues, priv->tx_cfg.num_queues); priv->rx_cfg.num_queues = MIN(priv->default_num_queues, priv->rx_cfg.num_queues); } priv->num_queues = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues; priv->mgmt_msix_idx = priv->num_queues; } static int gve_alloc_adminq_and_describe_device(struct gve_priv *priv) { int err; if ((err = gve_adminq_alloc(priv)) != 0) return (err); if ((err = gve_verify_driver_compatibility(priv)) != 0) { device_printf(priv->dev, "Failed to verify driver compatibility: err=%d\n", err); goto abort; } if ((err = gve_adminq_describe_device(priv)) != 0) goto abort; gve_set_queue_cnts(priv); priv->num_registered_pages = 0; return (0); abort: gve_release_adminq(priv); return (err); } void gve_schedule_reset(struct gve_priv *priv) { if (gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) return; device_printf(priv->dev, "Scheduling reset task!\n"); gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); taskqueue_enqueue(priv->service_tq, &priv->service_task); } static void gve_destroy(struct gve_priv *priv) { gve_down(priv); gve_deconfigure_resources(priv); gve_release_adminq(priv); } static void gve_restore(struct gve_priv *priv) { int err; err = gve_adminq_alloc(priv); if (err != 0) goto abort; err = gve_configure_resources(priv); if (err != 0) goto abort; err = gve_up(priv); if (err != 0) goto abort; return; abort: device_printf(priv->dev, "Restore failed!\n"); return; } static void gve_handle_reset(struct gve_priv *priv) { if (!gve_get_state_flag(priv, GVE_STATE_FLAG_DO_RESET)) return; gve_clear_state_flag(priv, GVE_STATE_FLAG_DO_RESET); gve_set_state_flag(priv, GVE_STATE_FLAG_IN_RESET); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); /* * Releasing the adminq causes the NIC to destroy all resources * registered with it, so by clearing the flags beneath we cause * the subsequent gve_down call below to not attempt to tell the * NIC to destroy these resources again. * * The call to gve_down is needed in the first place to refresh * the state and the DMA-able memory within each driver ring. 
*/ gve_release_adminq(priv); gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); gve_down(priv); gve_restore(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); priv->reset_cnt++; gve_clear_state_flag(priv, GVE_STATE_FLAG_IN_RESET); } static void gve_handle_link_status(struct gve_priv *priv) { uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); bool link_up = status & GVE_DEVICE_STATUS_LINK_STATUS; if (link_up == gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) return; if (link_up) { if (bootverbose) device_printf(priv->dev, "Device link is up.\n"); if_link_state_change(priv->ifp, LINK_STATE_UP); gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } else { device_printf(priv->dev, "Device link is down.\n"); if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } } static void gve_service_task(void *arg, int pending) { struct gve_priv *priv = (struct gve_priv *)arg; uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); if (((GVE_DEVICE_STATUS_RESET_MASK & status) != 0) && !gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) { device_printf(priv->dev, "Device requested reset\n"); gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); } gve_handle_reset(priv); gve_handle_link_status(priv); } static int gve_probe(device_t dev) { uint16_t deviceid, vendorid; int i; vendorid = pci_get_vendor(dev); deviceid = pci_get_device(dev); for (i = 0; i < nitems(gve_devs); i++) { if (vendorid == gve_devs[i].vendor_id && deviceid == gve_devs[i].device_id) { device_set_desc(dev, gve_devs[i].name); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static void gve_free_sys_res_mem(struct gve_priv *priv) { if (priv->msix_table != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->msix_table), priv->msix_table); if (priv->db_bar != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->db_bar), priv->db_bar); if (priv->reg_bar != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->reg_bar), priv->reg_bar); } static int gve_attach(device_t dev) { struct gve_priv *priv; int rid; int err; snprintf(gve_version, sizeof(gve_version), "%d.%d.%d", GVE_VERSION_MAJOR, GVE_VERSION_MINOR, GVE_VERSION_SUB); priv = device_get_softc(dev); priv->dev = dev; GVE_IFACE_LOCK_INIT(priv->gve_iface_lock); pci_enable_busmaster(dev); rid = PCIR_BAR(GVE_REGISTER_BAR); priv->reg_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->reg_bar == NULL) { device_printf(dev, "Failed to allocate BAR0\n"); err = ENXIO; goto abort; } rid = PCIR_BAR(GVE_DOORBELL_BAR); priv->db_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->db_bar == NULL) { device_printf(dev, "Failed to allocate BAR2\n"); err = ENXIO; goto abort; } rid = pci_msix_table_bar(priv->dev); priv->msix_table = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->msix_table == NULL) { device_printf(dev, "Failed to allocate msix table\n"); err = ENXIO; goto abort; } err = gve_alloc_adminq_and_describe_device(priv); if (err != 0) goto abort; err = gve_configure_resources(priv); if (err != 0) goto abort; err = gve_alloc_rings(priv); if (err != 0) goto abort; gve_setup_ifnet(dev, priv); priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK; bus_write_multi_1(priv->reg_bar, DRIVER_VERSION, GVE_DRIVER_VERSION, sizeof(GVE_DRIVER_VERSION) - 1); 
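One detail worth noting about the bus_write_multi_1() call above: sizeof(GVE_DRIVER_VERSION) - 1 drops only the string's NUL terminator, so the trailing '\n' baked into the version macro is the last byte the device sees. Since bus_write_multi_1() writes a buffer to a single register offset, the call is roughly equivalent to the explicit loop below (illustration only).

/* Equivalent spelled out: every byte lands on the same DRIVER_VERSION offset. */
static void
example_write_driver_version(struct gve_priv *priv)
{
	const char version[] = GVE_DRIVER_VERSION;
	size_t i;

	for (i = 0; i < sizeof(version) - 1; i++)
		bus_write_1(priv->reg_bar, DRIVER_VERSION, version[i]);
}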
TASK_INIT(&priv->service_task, 0, gve_service_task, priv); priv->service_tq = taskqueue_create("gve service", M_WAITOK | M_ZERO, taskqueue_thread_enqueue, &priv->service_tq); taskqueue_start_threads(&priv->service_tq, 1, PI_NET, "%s service tq", device_get_nameunit(priv->dev)); gve_setup_sysctl(priv); if (bootverbose) device_printf(priv->dev, "Successfully attached %s", GVE_DRIVER_VERSION); return (0); abort: gve_free_rings(priv); gve_deconfigure_resources(priv); gve_release_adminq(priv); gve_free_sys_res_mem(priv); GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); return (err); } static int gve_detach(device_t dev) { struct gve_priv *priv = device_get_softc(dev); if_t ifp = priv->ifp; int error; error = bus_generic_detach(dev); if (error != 0) return (error); ether_ifdetach(ifp); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_destroy(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); gve_free_rings(priv); gve_free_sys_res_mem(priv); GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); while (taskqueue_cancel(priv->service_tq, &priv->service_task, NULL)) taskqueue_drain(priv->service_tq, &priv->service_task); taskqueue_free(priv->service_tq); if_free(ifp); return (0); } static device_method_t gve_methods[] = { DEVMETHOD(device_probe, gve_probe), DEVMETHOD(device_attach, gve_attach), DEVMETHOD(device_detach, gve_detach), DEVMETHOD_END }; static driver_t gve_driver = { "gve", gve_methods, sizeof(struct gve_priv) }; #if __FreeBSD_version < 1301503 static devclass_t gve_devclass; DRIVER_MODULE(gve, pci, gve_driver, gve_devclass, 0, 0); #else DRIVER_MODULE(gve, pci, gve_driver, 0, 0); #endif MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, gve, gve_devs, nitems(gve_devs)); diff --git a/sys/dev/gve/gve_sysctl.c b/sys/dev/gve/gve_sysctl.c index b9f2eefa5bb0..7a091d9caa43 100644 --- a/sys/dev/gve/gve_sysctl.c +++ b/sys/dev/gve/gve_sysctl.c @@ -1,327 +1,327 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "gve.h" static SYSCTL_NODE(_hw, OID_AUTO, gve, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "GVE driver parameters"); bool gve_disable_hw_lro = false; SYSCTL_BOOL(_hw_gve, OID_AUTO, disable_hw_lro, CTLFLAG_RDTUN, &gve_disable_hw_lro, 0, "Controls if hardware LRO is used"); char gve_queue_format[8]; SYSCTL_STRING(_hw_gve, OID_AUTO, queue_format, CTLFLAG_RD, &gve_queue_format, 0, "Queue format being used by the iface"); char gve_version[8]; SYSCTL_STRING(_hw_gve, OID_AUTO, driver_version, CTLFLAG_RD, &gve_version, 0, "Driver version"); static void gve_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_rx_ring *rxq) { struct sysctl_oid *node; struct sysctl_oid_list *list; struct gve_rxq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->com.id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Receive Queue"); list = SYSCTL_CHILDREN(node); stats = &rxq->stats; SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_bytes", CTLFLAG_RD, &stats->rbytes, "Bytes received"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_packets", CTLFLAG_RD, &stats->rpackets, "Packets received"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_copybreak_cnt", CTLFLAG_RD, &stats->rx_copybreak_cnt, "Total frags with mbufs allocated for copybreak"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_frag_flip_cnt", CTLFLAG_RD, &stats->rx_frag_flip_cnt, "Total frags that allocated mbuf with page flip"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_frag_copy_cnt", CTLFLAG_RD, &stats->rx_frag_copy_cnt, "Total frags with mbuf that copied payload into mbuf"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt", CTLFLAG_RD, &stats->rx_dropped_pkt, "Total rx packets dropped"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt_desc_err", CTLFLAG_RD, &stats->rx_dropped_pkt_desc_err, "Packets dropped due to descriptor error"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt_buf_post_fail", CTLFLAG_RD, &stats->rx_dropped_pkt_buf_post_fail, "Packets dropped due to failure to post enough buffers"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt_mbuf_alloc_fail", CTLFLAG_RD, &stats->rx_dropped_pkt_mbuf_alloc_fail, "Packets dropped due to failed mbuf allocation"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_mbuf_dmamap_err", CTLFLAG_RD, &stats->rx_mbuf_dmamap_err, "Number of rx mbufs which couldnt be dma mapped"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_mbuf_mclget_null", CTLFLAG_RD, &stats->rx_mbuf_mclget_null, "Number of times when there were no cluster mbufs"); SYSCTL_ADD_U32(ctx, list, OID_AUTO, "rx_completed_desc", CTLFLAG_RD, &rxq->cnt, 0, "Number of descriptors completed"); SYSCTL_ADD_U32(ctx, list, OID_AUTO, "num_desc_posted", CTLFLAG_RD, &rxq->fill_cnt, rxq->fill_cnt, "Toal number of descriptors posted"); } static void gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_tx_ring *txq) { struct sysctl_oid *node; struct sysctl_oid_list *tx_list; struct gve_txq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "txq%d", txq->com.id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Transmit Queue"); tx_list = SYSCTL_CHILDREN(node); stats = &txq->stats; SYSCTL_ADD_U32(ctx, tx_list, OID_AUTO, "tx_posted_desc", CTLFLAG_RD, &txq->req, 0, "Number of descriptors posted by NIC"); SYSCTL_ADD_U32(ctx, tx_list, OID_AUTO, "tx_completed_desc", CTLFLAG_RD, &txq->done, 0, "Number of 
descriptors completed"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &stats->tpackets, "Packets transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_tso_packets", CTLFLAG_RD, &stats->tso_packet_cnt, "TSO Packets transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_bytes", CTLFLAG_RD, &stats->tbytes, "Bytes transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, - "tx_dropped_pkt_nospace_device", CTLFLAG_RD, - &stats->tx_dropped_pkt_nospace_device, - "Packets dropped due to no space in device"); + "tx_delayed_pkt_nospace_device", CTLFLAG_RD, + &stats->tx_delayed_pkt_nospace_device, + "Packets delayed due to no space in device"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_dropped_pkt_nospace_bufring", CTLFLAG_RD, &stats->tx_dropped_pkt_nospace_bufring, "Packets dropped due to no space in br ring"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_dropped_pkt_vlan", CTLFLAG_RD, &stats->tx_dropped_pkt_vlan, "Dropped VLAN packets"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_nospace_descring", CTLFLAG_RD, &stats->tx_delayed_pkt_nospace_descring, "Packets delayed due to no space in desc ring"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_nospace_compring", CTLFLAG_RD, &stats->tx_delayed_pkt_nospace_compring, "Packets delayed due to no space in comp ring"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_nospace_qpl_bufs", CTLFLAG_RD, &stats->tx_delayed_pkt_nospace_qpl_bufs, "Packets delayed due to not enough qpl bufs"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_tsoerr", CTLFLAG_RD, &stats->tx_delayed_pkt_tsoerr, "TSO packets delayed due to err in prep errors"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_collpase", CTLFLAG_RD, &stats->tx_mbuf_collapse, "tx mbufs that had to be collpased"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_defrag", CTLFLAG_RD, &stats->tx_mbuf_defrag, "tx mbufs that had to be defragged"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_defrag_err", CTLFLAG_RD, &stats->tx_mbuf_defrag_err, "tx mbufs that failed defrag"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_dmamap_enomem_err", CTLFLAG_RD, &stats->tx_mbuf_dmamap_enomem_err, "tx mbufs that could not be dma-mapped due to low mem"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_dmamap_err", CTLFLAG_RD, &stats->tx_mbuf_dmamap_err, "tx mbufs that could not be dma-mapped"); } static void gve_setup_queue_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_priv *priv) { int i; for (i = 0; i < priv->rx_cfg.num_queues; i++) { gve_setup_rxq_sysctl(ctx, child, &priv->rx[i]); } for (i = 0; i < priv->tx_cfg.num_queues; i++) { gve_setup_txq_sysctl(ctx, child, &priv->tx[i]); } } static void gve_setup_adminq_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_priv *priv) { struct sysctl_oid *admin_node; struct sysctl_oid_list *admin_list; /* Admin queue stats */ admin_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "adminq_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue statistics"); admin_list = SYSCTL_CHILDREN(admin_node); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_prod_cnt", CTLFLAG_RD, &priv->adminq_prod_cnt, 0, "Adminq Commands issued"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_cmd_fail", CTLFLAG_RD, &priv->adminq_cmd_fail, 0, "Aqminq Failed commands"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_timeouts", CTLFLAG_RD, &priv->adminq_timeouts, 
0, "Adminq Timedout commands"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_describe_device_cnt", CTLFLAG_RD, &priv->adminq_describe_device_cnt, 0, "adminq_describe_device_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_cfg_device_resources_cnt", CTLFLAG_RD, &priv->adminq_cfg_device_resources_cnt, 0, "adminq_cfg_device_resources_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_register_page_list_cnt", CTLFLAG_RD, &priv->adminq_register_page_list_cnt, 0, "adminq_register_page_list_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_unregister_page_list_cnt", CTLFLAG_RD, &priv->adminq_unregister_page_list_cnt, 0, "adminq_unregister_page_list_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_create_tx_queue_cnt", CTLFLAG_RD, &priv->adminq_create_tx_queue_cnt, 0, "adminq_create_tx_queue_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_create_rx_queue_cnt", CTLFLAG_RD, &priv->adminq_create_rx_queue_cnt, 0, "adminq_create_rx_queue_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_tx_queue_cnt", CTLFLAG_RD, &priv->adminq_destroy_tx_queue_cnt, 0, "adminq_destroy_tx_queue_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_rx_queue_cnt", CTLFLAG_RD, &priv->adminq_destroy_rx_queue_cnt, 0, "adminq_destroy_rx_queue_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_get_ptype_map_cnt", CTLFLAG_RD, &priv->adminq_get_ptype_map_cnt, 0, "adminq_get_ptype_map_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_dcfg_device_resources_cnt", CTLFLAG_RD, &priv->adminq_dcfg_device_resources_cnt, 0, "adminq_dcfg_device_resources_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_set_driver_parameter_cnt", CTLFLAG_RD, &priv->adminq_set_driver_parameter_cnt, 0, "adminq_set_driver_parameter_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_verify_driver_compatibility_cnt", CTLFLAG_RD, &priv->adminq_verify_driver_compatibility_cnt, 0, "adminq_verify_driver_compatibility_cnt"); } static void gve_setup_main_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_priv *priv) { struct sysctl_oid *main_node; struct sysctl_oid_list *main_list; /* Main stats */ main_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "main_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Main statistics"); main_list = SYSCTL_CHILDREN(main_node); SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "interface_up_cnt", CTLFLAG_RD, &priv->interface_up_cnt, 0, "Times interface was set to up"); SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "interface_down_cnt", CTLFLAG_RD, &priv->interface_down_cnt, 0, "Times interface was set to down"); SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "reset_cnt", CTLFLAG_RD, &priv->reset_cnt, 0, "Times reset"); } void gve_setup_sysctl(struct gve_priv *priv) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = priv->dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); gve_setup_queue_stat_sysctl(ctx, child, priv); gve_setup_adminq_stat_sysctl(ctx, child, priv); gve_setup_main_stat_sysctl(ctx, child, priv); } void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets, uint64_t *tbytes, uint64_t *tx_dropped_pkt) { struct gve_rxq_stats *rxqstats; struct gve_txq_stats *txqstats; int i; for (i = 0; i < priv->rx_cfg.num_queues; i++) { rxqstats = &priv->rx[i].stats; *rpackets += counter_u64_fetch(rxqstats->rpackets); *rbytes += counter_u64_fetch(rxqstats->rbytes); 
*rx_dropped_pkt += counter_u64_fetch(rxqstats->rx_dropped_pkt); } for (i = 0; i < priv->tx_cfg.num_queues; i++) { txqstats = &priv->tx[i].stats; *tpackets += counter_u64_fetch(txqstats->tpackets); *tbytes += counter_u64_fetch(txqstats->tbytes); *tx_dropped_pkt += counter_u64_fetch(txqstats->tx_dropped_pkt); } } diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c index 60a54878b685..e7e10e526cb9 100644 --- a/sys/dev/gve/gve_tx.c +++ b/sys/dev/gve/gve_tx.c @@ -1,869 +1,927 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" #define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182 static int gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx) { struct gve_queue_page_list *qpl = tx->com.qpl; struct gve_tx_fifo *fifo = &tx->fifo; fifo->size = qpl->num_pages * PAGE_SIZE; fifo->base = qpl->kva; atomic_store_int(&fifo->available, fifo->size); fifo->head = 0; return (0); } static void gve_tx_free_ring_gqi(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; if (tx->desc_ring != NULL) { gve_dma_free_coherent(&tx->desc_ring_mem); tx->desc_ring = NULL; } if (tx->info != NULL) { free(tx->info, M_GVE); tx->info = NULL; } } static void gve_tx_free_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; /* Safe to call even if never alloced */ gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); if (mtx_initialized(&tx->ring_mtx)) mtx_destroy(&tx->ring_mtx); if (com->q_resources != NULL) { gve_dma_free_coherent(&com->q_resources_mem); com->q_resources = NULL; } if (tx->br != NULL) { buf_ring_free(tx->br, M_DEVBUF); tx->br = NULL; } if (gve_is_gqi(priv)) gve_tx_free_ring_gqi(priv, i); else gve_tx_free_ring_dqo(priv, i); } static int gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; int err; err = gve_dma_alloc_coherent(priv, sizeof(union gve_tx_desc) * priv->tx_desc_cnt, CACHE_LINE_SIZE, &tx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i); goto abort; } tx->desc_ring = tx->desc_ring_mem.cpu_addr; com->qpl = &priv->qpls[i]; if (com->qpl == NULL) { device_printf(priv->dev, "No QPL left for tx ring %d\n", i); err = ENOMEM; goto abort; } err = gve_tx_fifo_init(priv, tx); if (err != 0) goto abort; tx->info = malloc( sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt, M_GVE, M_WAITOK | M_ZERO); return (0); abort: gve_tx_free_ring_gqi(priv, i); return (err); } static int gve_tx_alloc_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; char mtx_name[16]; int err; com->priv = priv; com->id = i; if (gve_is_gqi(priv)) err = gve_tx_alloc_ring_gqi(priv, i); else err = gve_tx_alloc_ring_dqo(priv, i); if (err != 0) goto abort; sprintf(mtx_name, "gvetx%d", i); mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF); tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF, M_WAITOK, &tx->ring_mtx); gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), PAGE_SIZE, &com->q_resources_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc queue resources for tx ring %d", i); goto abort; } com->q_resources = com->q_resources_mem.cpu_addr; return (0); abort: gve_tx_free_ring(priv, i); return (err); } int gve_alloc_tx_rings(struct gve_priv *priv) { int err = 0; int i; priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues, M_GVE, M_WAITOK | M_ZERO); for (i = 0; i < priv->tx_cfg.num_queues; i++) { err = gve_tx_alloc_ring(priv, i); if (err != 0) goto free_rings; } return (0); free_rings: while (i--) gve_tx_free_ring(priv, i); free(priv->tx, M_GVE); return (err); } void gve_free_tx_rings(struct gve_priv *priv) { int i; for (i = 0; i < priv->tx_cfg.num_queues; i++) gve_tx_free_ring(priv, i); free(priv->tx, M_GVE); } static void gve_tx_clear_desc_ring(struct gve_tx_ring *tx) { struct gve_ring_com *com = &tx->com; 
int i; for (i = 0; i < com->priv->tx_desc_cnt; i++) { tx->desc_ring[i] = (union gve_tx_desc){}; tx->info[i] = (struct gve_tx_buffer_state){}; } bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_clear_tx_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_tx_fifo *fifo = &tx->fifo; tx->req = 0; tx->done = 0; tx->mask = priv->tx_desc_cnt - 1; atomic_store_int(&fifo->available, fifo->size); fifo->head = 0; gve_tx_clear_desc_ring(tx); } static void gve_start_tx_ring(struct gve_priv *priv, int i, void (cleanup) (void *arg, int pending)) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; + atomic_store_bool(&tx->stopped, false); + NET_TASK_INIT(&com->cleanup_task, 0, cleanup, tx); com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK, taskqueue_thread_enqueue, &com->cleanup_tq); taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d", device_get_nameunit(priv->dev), i); TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx); tx->xmit_tq = taskqueue_create_fast("gve tx xmit", M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq); taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit", device_get_nameunit(priv->dev), i); } int gve_create_tx_rings(struct gve_priv *priv) { struct gve_ring_com *com; struct gve_tx_ring *tx; int err; int i; if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) return (0); for (i = 0; i < priv->tx_cfg.num_queues; i++) { if (gve_is_gqi(priv)) gve_clear_tx_ring(priv, i); else gve_clear_tx_ring_dqo(priv, i); } err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues); if (err != 0) return (err); bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, BUS_DMASYNC_POSTREAD); for (i = 0; i < priv->tx_cfg.num_queues; i++) { tx = &priv->tx[i]; com = &tx->com; com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index); bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map, BUS_DMASYNC_POSTREAD); com->db_offset = 4 * be32toh(com->q_resources->db_index); com->counter_idx = be32toh(com->q_resources->counter_index); if (gve_is_gqi(priv)) gve_start_tx_ring(priv, i, gve_tx_cleanup_tq); else gve_start_tx_ring(priv, i, gve_tx_cleanup_tq_dqo); } gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); return (0); } static void gve_stop_tx_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; if (com->cleanup_tq != NULL) { taskqueue_quiesce(com->cleanup_tq); taskqueue_free(com->cleanup_tq); com->cleanup_tq = NULL; } if (tx->xmit_tq != NULL) { taskqueue_quiesce(tx->xmit_tq); taskqueue_free(tx->xmit_tq); tx->xmit_tq = NULL; } } int gve_destroy_tx_rings(struct gve_priv *priv) { int err; int i; for (i = 0; i < priv->tx_cfg.num_queues; i++) gve_stop_tx_ring(priv, i); if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) { err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues); if (err != 0) return (err); gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); } return (0); } int gve_tx_intr(void *arg) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; struct gve_ring_com *com = &tx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); return (FILTER_HANDLED); } static uint32_t gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx) { 
bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map, BUS_DMASYNC_POSTREAD); uint32_t counter = priv->counters[tx->com.counter_idx]; return (be32toh(counter)); } static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes) { atomic_add_int(&fifo->available, bytes); } void gve_tx_cleanup_tq(void *arg, int pending) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; uint32_t nic_done = gve_tx_load_event_counter(priv, tx); uint32_t todo = nic_done - tx->done; size_t space_freed = 0; int i, j; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; for (j = 0; j < todo; j++) { uint32_t idx = tx->done & tx->mask; struct gve_tx_buffer_state *info = &tx->info[idx]; struct mbuf *mbuf = info->mbuf; tx->done++; if (mbuf == NULL) continue; info->mbuf = NULL; counter_enter(); counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len); counter_u64_add_protected(tx->stats.tpackets, 1); counter_exit(); m_freem(mbuf); for (i = 0; i < GVE_TX_MAX_DESCS; i++) { space_freed += info->iov[i].iov_len + info->iov[i].iov_padding; info->iov[i].iov_len = 0; info->iov[i].iov_padding = 0; } } gve_tx_free_fifo(&tx->fifo, space_freed); gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_ACK | GVE_IRQ_EVENT); /* * Completions born before this barrier MAY NOT cause the NIC to send an * interrupt but they will still be handled by the enqueue below. * Completions born after the barrier WILL trigger an interrupt. */ mb(); nic_done = gve_tx_load_event_counter(priv, tx); todo = nic_done - tx->done; if (todo != 0) { gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); } + + if (atomic_load_bool(&tx->stopped) && space_freed) { + atomic_store_bool(&tx->stopped, false); + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + } } static void gve_dma_sync_for_device(struct gve_queue_page_list *qpl, uint64_t iov_offset, uint64_t iov_len) { uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE; uint64_t first_page = iov_offset / PAGE_SIZE; struct gve_dma_handle *dma; uint64_t page; for (page = first_page; page <= last_page; page++) { dma = &(qpl->dmas[page]); bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); } } static void gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf) { mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH; mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4; mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid); mtd_desc->reserved0 = 0; mtd_desc->reserved1 = 0; } static void gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso, uint16_t l4_hdr_offset, uint32_t desc_cnt, uint16_t first_seg_len, uint64_t addr, bool has_csum_flag, int csum_offset, uint16_t pkt_len) { if (is_tso) { pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM; pkt_desc->l4_csum_offset = csum_offset >> 1; pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; } else if (has_csum_flag) { pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM; pkt_desc->l4_csum_offset = csum_offset >> 1; pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; } else { pkt_desc->type_flags = GVE_TXD_STD; pkt_desc->l4_csum_offset = 0; pkt_desc->l4_hdr_offset = 0; } pkt_desc->desc_cnt = desc_cnt; pkt_desc->len = htobe16(pkt_len); pkt_desc->seg_len = htobe16(first_seg_len); pkt_desc->seg_addr = htobe64(addr); } static void gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc, bool is_tso, uint16_t len, uint64_t addr, bool is_ipv6, uint8_t l3_off, uint16_t tso_mss) { 
seg_desc->type_flags = GVE_TXD_SEG; if (is_tso) { if (is_ipv6) seg_desc->type_flags |= GVE_TXSF_IPV6; seg_desc->l3_offset = l3_off >> 1; seg_desc->mss = htobe16(tso_mss); } seg_desc->seg_len = htobe16(len); seg_desc->seg_addr = htobe64(addr); } static inline uint32_t gve_tx_avail(struct gve_tx_ring *tx) { return (tx->mask + 1 - (tx->req - tx->done)); } static bool gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes) { return (atomic_load_int(&fifo->available) >= bytes); } static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required) { return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) && gve_tx_fifo_can_alloc(&tx->fifo, bytes_required)); } static int gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes) { return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head; } static inline int gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len, uint16_t pkt_len) { int pad_bytes, align_hdr_pad; int bytes; pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); /* We need to take into account the header alignment padding. */ align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len; bytes = align_hdr_pad + pad_bytes + pkt_len; return (bytes); } static int gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes, struct gve_tx_iovec iov[2]) { size_t overflow, padding; uint32_t aligned_head; int nfrags = 0; if (bytes == 0) return (0); /* * This check happens before we know how much padding is needed to * align to a cacheline boundary for the payload, but that is fine, * because the FIFO head always start aligned, and the FIFO's boundaries * are aligned, so if there is space for the data, there is space for * the padding to the next alignment. */ KASSERT(gve_tx_fifo_can_alloc(fifo, bytes), ("Allocating gve tx fifo when there is no room")); nfrags++; iov[0].iov_offset = fifo->head; iov[0].iov_len = bytes; fifo->head += bytes; if (fifo->head > fifo->size) { /* * If the allocation did not fit in the tail fragment of the * FIFO, also use the head fragment. 
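For illustration only (not driver code): gve_tx_avail() relies on tx->req and tx->done being free-running 32-bit counters over a power-of-two ring, so the in-flight count is the unsigned difference req - done, which stays correct across wraparound, and the ring slot is req & mask. A standalone sketch of that arithmetic, with made-up sizes:

/*
 * Standalone sketch (not driver code): free-running producer/consumer
 * counters over a power-of-two ring, mirroring the req/done arithmetic.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define RING_SIZE 256u			/* must be a power of two */
#define RING_MASK (RING_SIZE - 1)

static uint32_t
ring_avail(uint32_t req, uint32_t done)
{
	/* Unsigned subtraction yields the in-flight count even after wrap. */
	return (RING_SIZE - (req - done));
}

int
main(void)
{
	uint32_t done = UINT32_MAX - 10;	/* consumer nearing wrap */
	uint32_t req = done + 8;		/* 8 descriptors in flight */

	assert(ring_avail(req, done) == RING_SIZE - 8);
	req += 5;				/* producer wraps past zero */
	assert(ring_avail(req, done) == RING_SIZE - 13);
	printf("next slot index: %u\n", req & RING_MASK);
	return (0);
}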
*/ nfrags++; overflow = fifo->head - fifo->size; iov[0].iov_len -= overflow; iov[1].iov_offset = 0; /* Start of fifo*/ iov[1].iov_len = overflow; fifo->head = overflow; } /* Re-align to a cacheline boundary */ aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE); padding = aligned_head - fifo->head; iov[nfrags - 1].iov_padding = padding; atomic_add_int(&fifo->available, -(bytes + padding)); fifo->head = aligned_head; if (fifo->head == fifo->size) fifo->head = 0; return (nfrags); } /* Only error this returns is ENOBUFS when the tx fifo is short of space */ static int gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf) { bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false; int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset; uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len; int pad_bytes, hdr_nfrags, payload_nfrags; struct gve_tx_pkt_desc *pkt_desc; struct gve_tx_seg_desc *seg_desc; struct gve_tx_mtd_desc *mtd_desc; struct gve_tx_buffer_state *info; uint32_t idx = tx->req & tx->mask; struct ether_header *eh; struct mbuf *mbuf_next; int payload_iov = 2; int bytes_required; struct ip6_hdr *ip6; struct tcphdr *th; uint32_t next_idx; uint8_t l3_off; struct ip *ip; int i; info = &tx->info[idx]; csum_flags = mbuf->m_pkthdr.csum_flags; pkt_len = mbuf->m_pkthdr.len; is_tso = csum_flags & CSUM_TSO; has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0; tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0; eh = mtod(mbuf, struct ether_header *); KASSERT(eh->ether_type != ETHERTYPE_VLAN, ("VLAN-tagged packets not supported")); is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6; l3_off = ETHER_HDR_LEN; mbuf_next = m_getptr(mbuf, l3_off, &offset); if (is_ipv6) { ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset)); l4_off = l3_off + sizeof(struct ip6_hdr); is_tcp = (ip6->ip6_nxt == IPPROTO_TCP); is_udp = (ip6->ip6_nxt == IPPROTO_UDP); mbuf_next = m_getptr(mbuf, l4_off, &offset); } else if (ntohs(eh->ether_type) == ETHERTYPE_IP) { ip = (struct ip *)(mtodo(mbuf_next, offset)); l4_off = l3_off + (ip->ip_hl << 2); is_tcp = (ip->ip_p == IPPROTO_TCP); is_udp = (ip->ip_p == IPPROTO_UDP); mbuf_next = m_getptr(mbuf, l4_off, &offset); } l4_data_off = 0; if (is_tcp) { th = (struct tcphdr *)(mtodo(mbuf_next, offset)); l4_data_off = l4_off + (th->th_off << 2); } else if (is_udp) l4_data_off = l4_off + sizeof(struct udphdr); if (has_csum_flag) { if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0) csum_offset = offsetof(struct tcphdr, th_sum); else csum_offset = offsetof(struct udphdr, uh_sum); } /* * If this packet is neither a TCP nor a UDP packet, the first segment, * the one represented by the packet descriptor, will carry the * spec-stipulated minimum of 182B. */ if (l4_data_off != 0) first_seg_len = l4_data_off; else first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES); bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len); if (__predict_false(!gve_can_tx(tx, bytes_required))) { counter_enter(); - counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_device, 1); - counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); + counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1); counter_exit(); return (ENOBUFS); } /* So that the cleanup taskqueue can free the mbuf eventually. 
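For illustration only, the two-fragment FIFO allocation used by gve_xmit() above can be modeled in userspace: a request that runs past the end of the buffer is split into a tail fragment and a head fragment, and the head is then padded up to a cache-line boundary, with the padding charged to the last fragment. Names and sizes below are made up.

/* Standalone sketch (not driver code): two-fragment circular FIFO allocation. */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define FIFO_SIZE	4096u
#define ALIGN_BYTES	64u	/* stand-in for CACHE_LINE_SIZE */

struct sk_iovec { size_t off, len, pad; };
struct sk_fifo { size_t head, avail; };

static size_t
align_up(size_t x, size_t a)
{
	return ((x + a - 1) & ~(a - 1));
}

/* Returns the number of fragments used (1 or 2); the caller verified avail. */
static int
sk_fifo_alloc(struct sk_fifo *f, size_t bytes, struct sk_iovec iov[2])
{
	size_t overflow, padding, aligned_head;
	int nfrags = 1;

	assert(f->avail >= bytes);
	iov[0] = (struct sk_iovec){ .off = f->head, .len = bytes };
	iov[1] = (struct sk_iovec){ 0 };
	f->head += bytes;
	if (f->head > FIFO_SIZE) {	/* ran past the end: split the request */
		nfrags = 2;
		overflow = f->head - FIFO_SIZE;
		iov[0].len -= overflow;
		iov[1].off = 0;		/* start of the FIFO */
		iov[1].len = overflow;
		f->head = overflow;
	}
	aligned_head = align_up(f->head, ALIGN_BYTES);
	padding = aligned_head - f->head;	/* charged to the last fragment */
	iov[nfrags - 1].pad = padding;
	f->avail -= bytes + padding;
	f->head = (aligned_head == FIFO_SIZE) ? 0 : aligned_head;
	return (nfrags);
}

int
main(void)
{
	struct sk_fifo f = { .head = FIFO_SIZE - 100, .avail = 600 };
	struct sk_iovec iov[2];
	int n = sk_fifo_alloc(&f, 300, iov);	/* 100 bytes at the tail, 200 after the wrap */

	printf("frags=%d [%zu@%zu] [%zu@%zu] pad=%zu\n", n,
	    iov[0].len, iov[0].off, iov[1].len, iov[1].off, iov[n - 1].pad);
	return (0);
}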
*/ info->mbuf = mbuf; /* * We don't want to split the header, so if necessary, pad to the end * of the fifo and then put the header at the beginning of the fifo. */ pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes, &info->iov[0]); KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0")); payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len, &info->iov[payload_iov]); pkt_desc = &tx->desc_ring[idx].pkt; gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off, 1 + mtd_desc_nr + payload_nfrags, first_seg_len, info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset, pkt_len); m_copydata(mbuf, 0, first_seg_len, (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset); gve_dma_sync_for_device(tx->com.qpl, info->iov[hdr_nfrags - 1].iov_offset, info->iov[hdr_nfrags - 1].iov_len); copy_offset = first_seg_len; if (mtd_desc_nr == 1) { next_idx = (tx->req + 1) & tx->mask; mtd_desc = &tx->desc_ring[next_idx].mtd; gve_tx_fill_mtd_desc(mtd_desc, mbuf); } for (i = payload_iov; i < payload_nfrags + payload_iov; i++) { next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask; seg_desc = &tx->desc_ring[next_idx].seg; gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len, info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss); m_copydata(mbuf, copy_offset, info->iov[i].iov_len, (char *)tx->fifo.base + info->iov[i].iov_offset); gve_dma_sync_for_device(tx->com.qpl, info->iov[i].iov_offset, info->iov[i].iov_len); copy_offset += info->iov[i].iov_len; } tx->req += (1 + mtd_desc_nr + payload_nfrags); if (is_tso) { counter_enter(); counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); counter_exit(); } return (0); } +static int +gve_xmit_mbuf(struct gve_tx_ring *tx, + struct mbuf **mbuf) +{ + if (gve_is_gqi(tx->com.priv)) + return (gve_xmit(tx, *mbuf)); + + if (gve_is_qpl(tx->com.priv)) + return (gve_xmit_dqo_qpl(tx, *mbuf)); + + /* + * gve_xmit_dqo might attempt to defrag the mbuf chain. + * The reference is passed in so that in the case of + * errors, the new mbuf chain is what's put back on the br. + */ + return (gve_xmit_dqo(tx, mbuf)); +} + +/* + * Has the side-effect of stopping the xmit queue by setting tx->stopped + */ +static int +gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx, + struct mbuf **mbuf) +{ + int err; + + atomic_store_bool(&tx->stopped, true); + + /* + * Room made in the queue BEFORE the barrier will be seen by the + * gve_xmit_mbuf retry below. + * + * If room is made in the queue AFTER the barrier, the cleanup tq + * iteration creating the room will either see a tx->stopped value + * of 0 or the 1 we just wrote: + * + * If it sees a 1, then it would enqueue the xmit tq. Enqueue + * implies a retry on the waiting pkt. + * + * If it sees a 0, then that implies a previous iteration overwrote + * our 1, and that iteration would enqueue the xmit tq. Enqueue + * implies a retry on the waiting pkt. + */ + atomic_thread_fence_seq_cst(); + + err = gve_xmit_mbuf(tx, mbuf); + if (err == 0) + atomic_store_bool(&tx->stopped, false); + + return (err); +} + static void gve_xmit_br(struct gve_tx_ring *tx) { struct gve_priv *priv = tx->com.priv; struct ifnet *ifp = priv->ifp; struct mbuf *mbuf; int err; while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 && (mbuf = drbr_peek(ifp, tx->br)) != NULL) { + err = gve_xmit_mbuf(tx, &mbuf); - if (gve_is_gqi(priv)) - err = gve_xmit(tx, mbuf); - else { - /* - * gve_xmit_dqo might attempt to defrag the mbuf chain. 
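For illustration only, the stopped/wake protocol added here reduces to two code paths and a sequentially consistent fence on each side: the transmitter publishes stopped before re-checking for room, and the cleanup path makes room before checking stopped, so a wakeup cannot be lost. The sketch below is a single-threaded model with invented names and a counter standing in for ring space; the driver's acquire/release details are elided.

/* Standalone sketch (not driver code): the stop/retry vs. make-room/wake ordering. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool stopped;
static atomic_int free_slots;

/* Transmit path: the ring was full, so publish "stopped" and retry once. */
static bool
xmit_retry_when_full(void)
{
	atomic_store(&stopped, true);
	/*
	 * Room freed before this fence is observed by the re-check below.
	 * Room freed after it is freed by a cleanup pass that (itself fenced)
	 * must observe stopped == true and re-enqueue the transmit task.
	 */
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load(&free_slots) > 0) {
		atomic_store(&stopped, false);
		return (true);		/* retry succeeded */
	}
	return (false);			/* stay stopped; cleanup will wake us */
}

/* Cleanup path: return completed slots, then wake the transmitter if needed. */
static void
cleanup_pass(int completed)
{
	atomic_fetch_add(&free_slots, completed);
	atomic_thread_fence(memory_order_seq_cst);
	if (completed > 0 && atomic_load(&stopped)) {
		atomic_store(&stopped, false);
		printf("wake: re-enqueue the transmit task\n");
	}
}

int
main(void)
{
	if (!xmit_retry_when_full())	/* no room yet */
		cleanup_pass(4);	/* a later completion frees room and wakes */
	return (0);
}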
- * The reference is passed in so that in the case of - * errors, the new mbuf chain is what's put back on the br. - */ - if (gve_is_qpl(priv)) - err = gve_xmit_dqo_qpl(tx, mbuf); - else - err = gve_xmit_dqo(tx, &mbuf); - } + /* + * We need to stop this taskqueue when we can't xmit the pkt due + * to lack of space in the NIC ring (ENOBUFS). The retry exists + * to guard against a TOCTTOU bug that could end up freezing the + * queue forever. + */ + if (__predict_false(mbuf != NULL && err == ENOBUFS)) + err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf); if (__predict_false(err != 0 && mbuf != NULL)) { - drbr_putback(ifp, tx->br, mbuf); - taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + if (err == EINVAL) { + drbr_advance(ifp, tx->br); + m_freem(mbuf); + } else + drbr_putback(ifp, tx->br, mbuf); break; } drbr_advance(ifp, tx->br); BPF_MTAP(ifp, mbuf); bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); if (gve_is_gqi(priv)) gve_db_bar_write_4(priv, tx->com.db_offset, tx->req); else gve_db_bar_dqo_write_4(priv, tx->com.db_offset, tx->dqo.desc_tail); } } void gve_xmit_tq(void *arg, int pending) { struct gve_tx_ring *tx = (struct gve_tx_ring *)arg; GVE_RING_LOCK(tx); gve_xmit_br(tx); GVE_RING_UNLOCK(tx); } static bool is_vlan_tagged_pkt(struct mbuf *mbuf) { struct ether_header *eh; eh = mtod(mbuf, struct ether_header *); return (ntohs(eh->ether_type) == ETHERTYPE_VLAN); } int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf) { struct gve_priv *priv = if_getsoftc(ifp); struct gve_tx_ring *tx; bool is_br_empty; int err; uint32_t i; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (ENODEV); if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE) i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues; else i = curcpu % priv->tx_cfg.num_queues; tx = &priv->tx[i]; if (__predict_false(is_vlan_tagged_pkt(mbuf))) { counter_enter(); counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1); counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); counter_exit(); m_freem(mbuf); return (ENODEV); } is_br_empty = drbr_empty(ifp, tx->br); err = drbr_enqueue(ifp, tx->br, mbuf); if (__predict_false(err != 0)) { - taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + if (!atomic_load_bool(&tx->stopped)) + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); counter_enter(); counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1); counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); counter_exit(); return (err); } /* * If the mbuf we just enqueued is the only one on the ring, then * transmit it right away in the interests of low latency. */ if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) { gve_xmit_br(tx); GVE_RING_UNLOCK(tx); - } else { + } else if (!atomic_load_bool(&tx->stopped)) taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); - } return (0); } void gve_qflush(if_t ifp) { struct gve_priv *priv = if_getsoftc(ifp); struct gve_tx_ring *tx; int i; for (i = 0; i < priv->tx_cfg.num_queues; ++i) { tx = &priv->tx[i]; if (drbr_empty(ifp, tx->br) == 0) { GVE_RING_LOCK(tx); drbr_flush(ifp, tx->br); GVE_RING_UNLOCK(tx); } } if_qflush(ifp); } diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c index 323c032a3e65..fab2d6d0f613 100644 --- a/sys/dev/gve/gve_tx_dqo.c +++ b/sys/dev/gve/gve_tx_dqo.c @@ -1,1080 +1,1090 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_inet6.h" #include "gve.h" #include "gve_dqo.h" static void gve_unmap_packet(struct gve_tx_ring *tx, struct gve_tx_pending_pkt_dqo *pending_pkt) { bus_dmamap_sync(tx->dqo.buf_dmatag, pending_pkt->dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(tx->dqo.buf_dmatag, pending_pkt->dmamap); } static void gve_free_tx_mbufs_dqo(struct gve_tx_ring *tx) { struct gve_tx_pending_pkt_dqo *pending_pkt; int i; for (i = 0; i < tx->dqo.num_pending_pkts; i++) { pending_pkt = &tx->dqo.pending_pkts[i]; if (!pending_pkt->mbuf) continue; if (gve_is_qpl(tx->com.priv)) { pending_pkt->qpl_buf_head = -1; pending_pkt->num_qpl_bufs = 0; } else gve_unmap_packet(tx, pending_pkt); m_freem(pending_pkt->mbuf); pending_pkt->mbuf = NULL; } } void gve_tx_free_ring_dqo(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; int j; if (tx->dqo.desc_ring != NULL) { gve_dma_free_coherent(&tx->desc_ring_mem); tx->dqo.desc_ring = NULL; } if (tx->dqo.compl_ring != NULL) { gve_dma_free_coherent(&tx->dqo.compl_ring_mem); tx->dqo.compl_ring = NULL; } if (tx->dqo.pending_pkts != NULL) { gve_free_tx_mbufs_dqo(tx); if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) { for (j = 0; j < tx->dqo.num_pending_pkts; j++) if (tx->dqo.pending_pkts[j].state != GVE_PACKET_STATE_UNALLOCATED) bus_dmamap_destroy(tx->dqo.buf_dmatag, tx->dqo.pending_pkts[j].dmamap); } free(tx->dqo.pending_pkts, M_GVE); tx->dqo.pending_pkts = NULL; } if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) bus_dma_tag_destroy(tx->dqo.buf_dmatag); if (gve_is_qpl(priv) && tx->dqo.qpl_bufs != NULL) { free(tx->dqo.qpl_bufs, M_GVE); tx->dqo.qpl_bufs = NULL; } } static int gve_tx_alloc_rda_fields_dqo(struct gve_tx_ring *tx) { struct gve_priv *priv = tx->com.priv; int err; int j; /* * DMA tag for mapping Tx mbufs * The maxsize, nsegments, and maxsegsize params should match * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c. 
*/ err = bus_dma_tag_create( bus_get_dma_tag(priv->dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ GVE_TSO_MAXSIZE_DQO, /* maxsize */ GVE_TX_MAX_DATA_DESCS_DQO, /* nsegments */ GVE_TX_MAX_BUF_SIZE_DQO, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &tx->dqo.buf_dmatag); if (err != 0) { device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); return (err); } for (j = 0; j < tx->dqo.num_pending_pkts; j++) { err = bus_dmamap_create(tx->dqo.buf_dmatag, 0, &tx->dqo.pending_pkts[j].dmamap); if (err != 0) { device_printf(priv->dev, "err in creating pending pkt dmamap %d: %d", j, err); return (err); } tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; } return (0); } int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; uint16_t num_pending_pkts; int err; /* Descriptor ring */ err = gve_dma_alloc_coherent(priv, sizeof(union gve_tx_desc_dqo) * priv->tx_desc_cnt, CACHE_LINE_SIZE, &tx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i); goto abort; } tx->dqo.desc_ring = tx->desc_ring_mem.cpu_addr; /* Completion ring */ err = gve_dma_alloc_coherent(priv, sizeof(struct gve_tx_compl_desc_dqo) * priv->tx_desc_cnt, CACHE_LINE_SIZE, &tx->dqo.compl_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc compl ring for tx ring %d", i); goto abort; } tx->dqo.compl_ring = tx->dqo.compl_ring_mem.cpu_addr; /* * pending_pkts array * * The max number of pending packets determines the maximum number of * descriptors which maybe written to the completion queue. * * We must set the number small enough to make sure we never overrun the * completion queue. */ num_pending_pkts = priv->tx_desc_cnt; /* * Reserve space for descriptor completions, which will be reported at * most every GVE_TX_MIN_RE_INTERVAL packets. 
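For illustration, the completion-queue budgeting described in the comment above amounts to subtracting the worst-case number of descriptor completions (one per GVE_TX_MIN_RE_INTERVAL packets) from the ring size, which is what the subtraction just below does; the values in this sketch (1024 entries, an interval of 32) are assumed purely for the example.

/* Standalone sketch (not driver code): completion-queue budgeting with assumed values. */
#include <stdio.h>

int
main(void)
{
	unsigned tx_desc_cnt = 1024;	/* completion ring entries (assumed) */
	unsigned re_interval = 32;	/* stand-in for GVE_TX_MIN_RE_INTERVAL (assumed) */
	unsigned desc_compls = tx_desc_cnt / re_interval;	/* worst-case descriptor completions */
	unsigned max_pending = tx_desc_cnt - desc_compls;	/* packet completions that still fit */

	printf("pending packets capped at %u of %u entries\n", max_pending, tx_desc_cnt);
	return (0);
}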
*/ num_pending_pkts -= num_pending_pkts / GVE_TX_MIN_RE_INTERVAL; tx->dqo.num_pending_pkts = num_pending_pkts; tx->dqo.pending_pkts = malloc( sizeof(struct gve_tx_pending_pkt_dqo) * num_pending_pkts, M_GVE, M_WAITOK | M_ZERO); if (gve_is_qpl(priv)) { int qpl_buf_cnt; tx->com.qpl = &priv->qpls[i]; qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * tx->com.qpl->num_pages; tx->dqo.qpl_bufs = malloc( sizeof(*tx->dqo.qpl_bufs) * qpl_buf_cnt, M_GVE, M_WAITOK | M_ZERO); } else gve_tx_alloc_rda_fields_dqo(tx); return (0); abort: gve_tx_free_ring_dqo(priv, i); return (err); } static void gve_extract_tx_metadata_dqo(const struct mbuf *mbuf, struct gve_tx_metadata_dqo *metadata) { uint32_t hash = mbuf->m_pkthdr.flowid; uint16_t path_hash; metadata->version = GVE_TX_METADATA_VERSION_DQO; if (hash) { path_hash = hash ^ (hash >> 16); path_hash &= (1 << 15) - 1; if (__predict_false(path_hash == 0)) path_hash = ~path_hash; metadata->path_hash = path_hash; } } static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, uint32_t *desc_idx, uint32_t len, uint64_t addr, int16_t compl_tag, bool eop, bool csum_enabled) { while (len > 0) { struct gve_tx_pkt_desc_dqo *desc = &tx->dqo.desc_ring[*desc_idx].pkt; uint32_t cur_len = MIN(len, GVE_TX_MAX_BUF_SIZE_DQO); bool cur_eop = eop && cur_len == len; *desc = (struct gve_tx_pkt_desc_dqo){ .buf_addr = htole64(addr), .dtype = GVE_TX_PKT_DESC_DTYPE_DQO, .end_of_packet = cur_eop, .checksum_offload_enable = csum_enabled, .compl_tag = htole16(compl_tag), .buf_size = cur_len, }; addr += cur_len; len -= cur_len; *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; } } static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, const struct mbuf *mbuf, const struct gve_tx_metadata_dqo *metadata, int header_len) { *desc = (struct gve_tx_tso_context_desc_dqo){ .header_len = header_len, .cmd_dtype = { .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, .tso = 1, }, .flex0 = metadata->bytes[0], .flex5 = metadata->bytes[5], .flex6 = metadata->bytes[6], .flex7 = metadata->bytes[7], .flex8 = metadata->bytes[8], .flex9 = metadata->bytes[9], .flex10 = metadata->bytes[10], .flex11 = metadata->bytes[11], }; desc->tso_total_len = mbuf->m_pkthdr.len - header_len; desc->mss = mbuf->m_pkthdr.tso_segsz; } static void gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, const struct gve_tx_metadata_dqo *metadata) { *desc = (struct gve_tx_general_context_desc_dqo){ .flex0 = metadata->bytes[0], .flex1 = metadata->bytes[1], .flex2 = metadata->bytes[2], .flex3 = metadata->bytes[3], .flex4 = metadata->bytes[4], .flex5 = metadata->bytes[5], .flex6 = metadata->bytes[6], .flex7 = metadata->bytes[7], .flex8 = metadata->bytes[8], .flex9 = metadata->bytes[9], .flex10 = metadata->bytes[10], .flex11 = metadata->bytes[11], .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, }; } #define PULLUP_HDR(m, len) \ do { \ if (__predict_false((m)->m_len < (len))) { \ (m) = m_pullup((m), (len)); \ if ((m) == NULL) \ return (EINVAL); \ } \ } while (0) static int gve_prep_tso(struct mbuf *mbuf, int *header_len) { uint8_t l3_off, l4_off = 0; struct ether_header *eh; struct tcphdr *th; u_short csum; PULLUP_HDR(mbuf, sizeof(*eh)); eh = mtod(mbuf, struct ether_header *); KASSERT(eh->ether_type != ETHERTYPE_VLAN, ("VLAN-tagged packets not supported")); l3_off = ETHER_HDR_LEN; #ifdef INET6 if (ntohs(eh->ether_type) == ETHERTYPE_IPV6) { struct ip6_hdr *ip6; PULLUP_HDR(mbuf, l3_off + sizeof(*ip6)); ip6 = (struct ip6_hdr *)(mtodo(mbuf, l3_off)); l4_off = l3_off + sizeof(struct ip6_hdr); csum = 
in6_cksum_pseudo(ip6, /*len=*/0, IPPROTO_TCP, /*csum=*/0); } else #endif if (ntohs(eh->ether_type) == ETHERTYPE_IP) { struct ip *ip; PULLUP_HDR(mbuf, l3_off + sizeof(*ip)); ip = (struct ip *)(mtodo(mbuf, l3_off)); l4_off = l3_off + (ip->ip_hl << 2); csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } PULLUP_HDR(mbuf, l4_off + sizeof(struct tcphdr *)); th = (struct tcphdr *)(mtodo(mbuf, l4_off)); *header_len = l4_off + (th->th_off << 2); /* * Hardware requires the th->th_sum to not include the TCP payload, * hence we recompute the csum with it excluded. */ th->th_sum = csum; return (0); } static int gve_tx_fill_ctx_descs(struct gve_tx_ring *tx, struct mbuf *mbuf, bool is_tso, uint32_t *desc_idx) { struct gve_tx_general_context_desc_dqo *gen_desc; struct gve_tx_tso_context_desc_dqo *tso_desc; struct gve_tx_metadata_dqo metadata; int header_len; int err; metadata = (struct gve_tx_metadata_dqo){0}; gve_extract_tx_metadata_dqo(mbuf, &metadata); if (is_tso) { err = gve_prep_tso(mbuf, &header_len); if (__predict_false(err)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_tsoerr, 1); counter_exit(); return (err); } tso_desc = &tx->dqo.desc_ring[*desc_idx].tso_ctx; gve_tx_fill_tso_ctx_desc(tso_desc, mbuf, &metadata, header_len); *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; counter_enter(); counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); counter_exit(); } gen_desc = &tx->dqo.desc_ring[*desc_idx].general_ctx; gve_tx_fill_general_ctx_desc(gen_desc, &metadata); *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; return (0); } static int gve_map_mbuf_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf, bus_dmamap_t dmamap, bus_dma_segment_t *segs, int *nsegs, int attempt) { struct mbuf *m_new = NULL; int err; err = bus_dmamap_load_mbuf_sg(tx->dqo.buf_dmatag, dmamap, *mbuf, segs, nsegs, BUS_DMA_NOWAIT); switch (err) { case __predict_true(0): break; case EFBIG: if (__predict_false(attempt > 0)) goto abort; counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_collapse, 1); counter_exit(); /* Try m_collapse before m_defrag */ m_new = m_collapse(*mbuf, M_NOWAIT, GVE_TX_MAX_DATA_DESCS_DQO); if (m_new == NULL) { counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_defrag, 1); counter_exit(); m_new = m_defrag(*mbuf, M_NOWAIT); } if (__predict_false(m_new == NULL)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_defrag_err, 1); counter_exit(); m_freem(*mbuf); *mbuf = NULL; err = ENOMEM; goto abort; } else { *mbuf = m_new; return (gve_map_mbuf_dqo(tx, mbuf, dmamap, segs, nsegs, ++attempt)); } case ENOMEM: counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_dmamap_enomem_err, 1); counter_exit(); goto abort; default: goto abort; } return (0); abort: counter_enter(); counter_u64_add_protected(tx->stats.tx_mbuf_dmamap_err, 1); counter_exit(); return (err); } static uint32_t num_avail_desc_ring_slots(const struct gve_tx_ring *tx) { uint32_t num_used = (tx->dqo.desc_tail - tx->dqo.desc_head) & tx->dqo.desc_mask; return (tx->dqo.desc_mask - num_used); } static struct gve_tx_pending_pkt_dqo * gve_alloc_pending_packet(struct gve_tx_ring *tx) { int32_t index = tx->dqo.free_pending_pkts_csm; struct gve_tx_pending_pkt_dqo *pending_pkt; /* * No pending packets available in the consumer list, * try to steal the producer list. 
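For illustration, the th_sum seeding done by gve_prep_tso() stores only a pseudo-header checksum (addresses and protocol, no length or payload) so the hardware can finish the sum per TSO segment. The sketch below is a simplified IPv4-only, host-order model of that partial sum; the kernel's in_pseudo()/in6_cksum_pseudo() helpers handle byte order and IPv6.

/*
 * Standalone sketch (not driver code): a pseudo-header partial checksum,
 * i.e. the kind of value stored into th_sum with length and payload excluded.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

static uint16_t
csum_fold(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return ((uint16_t)sum);
}

/* One's-complement sum over source address, destination address and protocol. */
static uint16_t
pseudo_hdr_partial(uint32_t saddr, uint32_t daddr, uint8_t proto)
{
	uint32_t sum = 0;

	sum += saddr >> 16;
	sum += saddr & 0xffff;
	sum += daddr >> 16;
	sum += daddr & 0xffff;
	sum += proto;		/* the pad byte above the protocol is zero */
	return (csum_fold(sum));
}

int
main(void)
{
	uint32_t src = ntohl(inet_addr("192.0.2.1"));
	uint32_t dst = ntohl(inet_addr("198.51.100.2"));

	printf("partial csum: 0x%04x\n",
	    (unsigned)pseudo_hdr_partial(src, dst, 6 /* TCP */));
	return (0);
}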
*/ if (__predict_false(index == -1)) { tx->dqo.free_pending_pkts_csm = atomic_swap_32( &tx->dqo.free_pending_pkts_prd, -1); index = tx->dqo.free_pending_pkts_csm; if (__predict_false(index == -1)) return (NULL); } pending_pkt = &tx->dqo.pending_pkts[index]; /* Remove pending_pkt from the consumer list */ tx->dqo.free_pending_pkts_csm = pending_pkt->next; pending_pkt->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; return (pending_pkt); } static void gve_free_pending_packet(struct gve_tx_ring *tx, struct gve_tx_pending_pkt_dqo *pending_pkt) { int index = pending_pkt - tx->dqo.pending_pkts; int32_t old_head; pending_pkt->state = GVE_PACKET_STATE_FREE; /* Add pending_pkt to the producer list */ while (true) { old_head = atomic_load_acq_32(&tx->dqo.free_pending_pkts_prd); pending_pkt->next = old_head; if (atomic_cmpset_32(&tx->dqo.free_pending_pkts_prd, old_head, index)) break; } } /* * Has the side-effect of retrieving the value of the last desc index * processed by the NIC. hw_tx_head is written to by the completions-processing * taskqueue upon receiving descriptor-completions. */ static bool gve_tx_has_desc_room_dqo(struct gve_tx_ring *tx, int needed_descs) { if (needed_descs <= num_avail_desc_ring_slots(tx)) return (true); tx->dqo.desc_head = atomic_load_acq_32(&tx->dqo.hw_tx_head); if (needed_descs > num_avail_desc_ring_slots(tx)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_descring, 1); counter_exit(); return (false); } return (true); } static void gve_tx_request_desc_compl(struct gve_tx_ring *tx, uint32_t desc_idx) { uint32_t last_report_event_interval; uint32_t last_desc_idx; last_desc_idx = (desc_idx - 1) & tx->dqo.desc_mask; last_report_event_interval = (last_desc_idx - tx->dqo.last_re_idx) & tx->dqo.desc_mask; if (__predict_false(last_report_event_interval >= GVE_TX_MIN_RE_INTERVAL)) { tx->dqo.desc_ring[last_desc_idx].pkt.report_event = true; tx->dqo.last_re_idx = last_desc_idx; } } static bool gve_tx_have_enough_qpl_bufs(struct gve_tx_ring *tx, int num_bufs) { uint32_t available = tx->dqo.qpl_bufs_produced_cached - tx->dqo.qpl_bufs_consumed; if (__predict_true(available >= num_bufs)) return (true); tx->dqo.qpl_bufs_produced_cached = atomic_load_acq_32( &tx->dqo.qpl_bufs_produced); available = tx->dqo.qpl_bufs_produced_cached - tx->dqo.qpl_bufs_consumed; if (__predict_true(available >= num_bufs)) return (true); return (false); } static int32_t gve_tx_alloc_qpl_buf(struct gve_tx_ring *tx) { int32_t buf = tx->dqo.free_qpl_bufs_csm; if (__predict_false(buf == -1)) { tx->dqo.free_qpl_bufs_csm = atomic_swap_32( &tx->dqo.free_qpl_bufs_prd, -1); buf = tx->dqo.free_qpl_bufs_csm; if (__predict_false(buf == -1)) return (-1); } tx->dqo.free_qpl_bufs_csm = tx->dqo.qpl_bufs[buf]; tx->dqo.qpl_bufs_consumed++; return (buf); } /* * Tx buffer i corresponds to * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO */ static void gve_tx_buf_get_addr_dqo(struct gve_tx_ring *tx, int32_t index, void **va, bus_addr_t *dma_addr) { int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) << GVE_TX_BUF_SHIFT_DQO; *va = (char *)tx->com.qpl->dmas[page_id].cpu_addr + offset; *dma_addr = tx->com.qpl->dmas[page_id].bus_addr + offset; } static struct gve_dma_handle * gve_get_page_dma_handle(struct gve_tx_ring *tx, int32_t index) { int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); return (&tx->com.qpl->dmas[page_id]); } static void
gve_tx_copy_mbuf_and_write_pkt_descs(struct gve_tx_ring *tx, struct mbuf *mbuf, struct gve_tx_pending_pkt_dqo *pkt, bool csum_enabled, int16_t completion_tag, uint32_t *desc_idx) { int32_t pkt_len = mbuf->m_pkthdr.len; struct gve_dma_handle *dma; uint32_t copy_offset = 0; int32_t prev_buf = -1; uint32_t copy_len; bus_addr_t addr; int32_t buf; void *va; MPASS(pkt->num_qpl_bufs == 0); MPASS(pkt->qpl_buf_head == -1); while (copy_offset < pkt_len) { buf = gve_tx_alloc_qpl_buf(tx); /* We already checked for availability */ MPASS(buf != -1); gve_tx_buf_get_addr_dqo(tx, buf, &va, &addr); copy_len = MIN(GVE_TX_BUF_SIZE_DQO, pkt_len - copy_offset); m_copydata(mbuf, copy_offset, copy_len, va); copy_offset += copy_len; dma = gve_get_page_dma_handle(tx, buf); bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); gve_tx_fill_pkt_desc_dqo(tx, desc_idx, copy_len, addr, completion_tag, /*eop=*/copy_offset == pkt_len, csum_enabled); /* Link all the qpl bufs for a packet */ if (prev_buf == -1) pkt->qpl_buf_head = buf; else tx->dqo.qpl_bufs[prev_buf] = buf; prev_buf = buf; pkt->num_qpl_bufs++; } tx->dqo.qpl_bufs[buf] = -1; } int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf) { uint32_t desc_idx = tx->dqo.desc_tail; struct gve_tx_pending_pkt_dqo *pkt; int total_descs_needed; int16_t completion_tag; bool has_csum_flag; int csum_flags; bool is_tso; int nsegs; int err; csum_flags = mbuf->m_pkthdr.csum_flags; has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); is_tso = csum_flags & CSUM_TSO; nsegs = howmany(mbuf->m_pkthdr.len, GVE_TX_BUF_SIZE_DQO); /* Check if we have enough room in the desc ring */ total_descs_needed = 1 + /* general_ctx_desc */ nsegs + /* pkt_desc */ (is_tso ? 1 : 0); /* tso_ctx_desc */ if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) return (ENOBUFS); if (!gve_tx_have_enough_qpl_bufs(tx, nsegs)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_qpl_bufs, 1); counter_exit(); return (ENOBUFS); } pkt = gve_alloc_pending_packet(tx); if (pkt == NULL) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_compring, 1); counter_exit(); return (ENOBUFS); } completion_tag = pkt - tx->dqo.pending_pkts; pkt->mbuf = mbuf; err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); if (err) goto abort; gve_tx_copy_mbuf_and_write_pkt_descs(tx, mbuf, pkt, has_csum_flag, completion_tag, &desc_idx); /* Remember the index of the last desc written */ tx->dqo.desc_tail = desc_idx; /* * Request a descriptor completion on the last descriptor of the * packet if we are allowed to by the HW enforced interval. */ gve_tx_request_desc_compl(tx, desc_idx); tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ return (0); abort: pkt->mbuf = NULL; gve_free_pending_packet(tx, pkt); return (err); } int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr) { bus_dma_segment_t segs[GVE_TX_MAX_DATA_DESCS_DQO]; uint32_t desc_idx = tx->dqo.desc_tail; struct gve_tx_pending_pkt_dqo *pkt; struct mbuf *mbuf = *mbuf_ptr; int total_descs_needed; int16_t completion_tag; bool has_csum_flag; int csum_flags; bool is_tso; int nsegs; int err; int i; csum_flags = mbuf->m_pkthdr.csum_flags; has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); is_tso = csum_flags & CSUM_TSO; /* * This mbuf might end up needing more than 1 pkt desc. * The actual number, `nsegs` is known only after the * expensive gve_map_mbuf_dqo call. 
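For illustration, the pending-packet and QPL-buffer free lists used above share one pattern: a consumer-private list that, when empty, steals the entire producer-side stack with a single atomic swap, while producers push with a CAS loop. The standalone C11 sketch below uses invented names and default (seq_cst) ordering in place of the driver's acquire/release variants.

/*
 * Standalone sketch (not driver code): a single-consumer free list whose
 * consumer steals the producer-side stack wholesale with an atomic swap.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NITEMS 8

static int32_t next_idx[NITEMS];		/* list links threaded through an array */
static int32_t consumer_head = -1;		/* touched only by the consumer */
static _Atomic int32_t producer_head = -1;	/* pushed to by completion handlers */

static int32_t
alloc_item(void)
{
	int32_t idx = consumer_head;

	if (idx == -1) {
		/* Consumer list empty: steal everything the producers freed. */
		idx = atomic_exchange(&producer_head, -1);
		if (idx == -1)
			return (-1);
	}
	consumer_head = next_idx[idx];
	return (idx);
}

static void
free_item(int32_t idx)
{
	int32_t old_head = atomic_load(&producer_head);

	do {
		next_idx[idx] = old_head;	/* link before publishing */
	} while (!atomic_compare_exchange_weak(&producer_head, &old_head, idx));
}

int
main(void)
{
	int32_t a, b;

	for (int i = 0; i < NITEMS - 1; i++)
		next_idx[i] = i + 1;
	next_idx[NITEMS - 1] = -1;
	consumer_head = 0;

	a = alloc_item();
	b = alloc_item();
	free_item(a);
	free_item(b);
	printf("allocated %d and %d; producer head is now %d\n",
	    (int)a, (int)b, (int)atomic_load(&producer_head));
	return (0);
}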
This check beneath * exists to fail early when the desc ring is really full. */ total_descs_needed = 1 + /* general_ctx_desc */ 1 + /* pkt_desc */ (is_tso ? 1 : 0); /* tso_ctx_desc */ if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) return (ENOBUFS); pkt = gve_alloc_pending_packet(tx); if (pkt == NULL) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_compring, 1); counter_exit(); return (ENOBUFS); } completion_tag = pkt - tx->dqo.pending_pkts; err = gve_map_mbuf_dqo(tx, mbuf_ptr, pkt->dmamap, segs, &nsegs, /*attempt=*/0); if (err) goto abort; mbuf = *mbuf_ptr; /* gve_map_mbuf_dqo might replace the mbuf chain */ pkt->mbuf = mbuf; total_descs_needed = 1 + /* general_ctx_desc */ nsegs + /* pkt_desc */ (is_tso ? 1 : 0); /* tso_ctx_desc */ if (__predict_false( !gve_tx_has_desc_room_dqo(tx, total_descs_needed))) { err = ENOBUFS; goto abort_with_dma; } err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); if (err) goto abort_with_dma; bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE); for (i = 0; i < nsegs; i++) { gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, segs[i].ds_len, segs[i].ds_addr, completion_tag, /*eop=*/i == (nsegs - 1), has_csum_flag); } /* Remember the index of the last desc written */ tx->dqo.desc_tail = desc_idx; /* * Request a descriptor completion on the last descriptor of the * packet if we are allowed to by the HW enforced interval. */ gve_tx_request_desc_compl(tx, desc_idx); tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ return (0); abort_with_dma: gve_unmap_packet(tx, pkt); abort: pkt->mbuf = NULL; gve_free_pending_packet(tx, pkt); return (err); } static void gve_reap_qpl_bufs_dqo(struct gve_tx_ring *tx, struct gve_tx_pending_pkt_dqo *pkt) { int32_t buf = pkt->qpl_buf_head; struct gve_dma_handle *dma; int32_t qpl_buf_tail; int32_t old_head; int i; for (i = 0; i < pkt->num_qpl_bufs; i++) { dma = gve_get_page_dma_handle(tx, buf); bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTWRITE); qpl_buf_tail = buf; buf = tx->dqo.qpl_bufs[buf]; } MPASS(buf == -1); buf = qpl_buf_tail; while (true) { old_head = atomic_load_32(&tx->dqo.free_qpl_bufs_prd); tx->dqo.qpl_bufs[buf] = old_head; /* * The "rel" ensures that the update to dqo.free_qpl_bufs_prd * is visible only after the linked list from this pkt is * attached above to old_head. */ if (atomic_cmpset_rel_32(&tx->dqo.free_qpl_bufs_prd, old_head, pkt->qpl_buf_head)) break; } /* * The "rel" ensures that the update to dqo.qpl_bufs_produced is * visible only after the update to dqo.free_qpl_bufs_prd above. */ atomic_add_rel_32(&tx->dqo.qpl_bufs_produced, pkt->num_qpl_bufs); pkt->qpl_buf_head = -1; pkt->num_qpl_bufs = 0; } static uint64_t gve_handle_packet_completion(struct gve_priv *priv, struct gve_tx_ring *tx, uint16_t compl_tag) { struct gve_tx_pending_pkt_dqo *pending_pkt; int32_t pkt_len; if (__predict_false(compl_tag >= tx->dqo.num_pending_pkts)) { device_printf(priv->dev, "Invalid TX completion tag: %d\n", compl_tag); return (0); } pending_pkt = &tx->dqo.pending_pkts[compl_tag]; /* Packet is allocated but not pending data completion.
*/ if (__predict_false(pending_pkt->state != GVE_PACKET_STATE_PENDING_DATA_COMPL)) { device_printf(priv->dev, "No pending data completion: %d\n", compl_tag); return (0); } pkt_len = pending_pkt->mbuf->m_pkthdr.len; if (gve_is_qpl(priv)) gve_reap_qpl_bufs_dqo(tx, pending_pkt); else gve_unmap_packet(tx, pending_pkt); m_freem(pending_pkt->mbuf); pending_pkt->mbuf = NULL; gve_free_pending_packet(tx, pending_pkt); return (pkt_len); } int gve_tx_intr_dqo(void *arg) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; struct gve_ring_com *com = &tx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); /* Interrupts are automatically masked */ taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); return (FILTER_HANDLED); } static void gve_tx_clear_desc_ring_dqo(struct gve_tx_ring *tx) { struct gve_ring_com *com = &tx->com; int i; for (i = 0; i < com->priv->tx_desc_cnt; i++) tx->dqo.desc_ring[i] = (union gve_tx_desc_dqo){}; bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_tx_clear_compl_ring_dqo(struct gve_tx_ring *tx) { struct gve_ring_com *com = &tx->com; int entries; int i; entries = com->priv->tx_desc_cnt; for (i = 0; i < entries; i++) tx->dqo.compl_ring[i] = (struct gve_tx_compl_desc_dqo){}; bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map, BUS_DMASYNC_PREWRITE); } void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; int j; tx->dqo.desc_head = 0; tx->dqo.desc_tail = 0; tx->dqo.desc_mask = priv->tx_desc_cnt - 1; tx->dqo.last_re_idx = 0; tx->dqo.compl_head = 0; tx->dqo.compl_mask = priv->tx_desc_cnt - 1; atomic_store_32(&tx->dqo.hw_tx_head, 0); tx->dqo.cur_gen_bit = 0; gve_free_tx_mbufs_dqo(tx); for (j = 0; j < tx->dqo.num_pending_pkts - 1; j++) { tx->dqo.pending_pkts[j].next = j + 1; tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; } tx->dqo.pending_pkts[tx->dqo.num_pending_pkts - 1].next = -1; tx->dqo.free_pending_pkts_csm = 0; atomic_store_rel_32(&tx->dqo.free_pending_pkts_prd, -1); if (gve_is_qpl(priv)) { int qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * tx->com.qpl->num_pages; for (j = 0; j < qpl_buf_cnt - 1; j++) tx->dqo.qpl_bufs[j] = j + 1; tx->dqo.qpl_bufs[j] = -1; tx->dqo.free_qpl_bufs_csm = 0; atomic_store_32(&tx->dqo.free_qpl_bufs_prd, -1); atomic_store_32(&tx->dqo.qpl_bufs_produced, qpl_buf_cnt); tx->dqo.qpl_bufs_produced_cached = qpl_buf_cnt; tx->dqo.qpl_bufs_consumed = 0; } gve_tx_clear_desc_ring_dqo(tx); gve_tx_clear_compl_ring_dqo(tx); } static bool gve_tx_cleanup_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, int budget) { struct gve_tx_compl_desc_dqo *compl_desc; uint64_t bytes_done = 0; uint64_t pkts_done = 0; uint16_t compl_tag; int work_done = 0; uint16_t tx_head; uint16_t type; while (work_done < budget) { bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map, BUS_DMASYNC_POSTREAD); compl_desc = &tx->dqo.compl_ring[tx->dqo.compl_head]; if (compl_desc->generation == tx->dqo.cur_gen_bit) break; /* * Prevent generation bit from being read after the rest of the * descriptor. 
*/ rmb(); type = compl_desc->type; if (type == GVE_COMPL_TYPE_DQO_DESC) { /* This is the last descriptor fetched by HW plus one */ tx_head = le16toh(compl_desc->tx_head); atomic_store_rel_32(&tx->dqo.hw_tx_head, tx_head); } else if (type == GVE_COMPL_TYPE_DQO_PKT) { compl_tag = le16toh(compl_desc->completion_tag); bytes_done += gve_handle_packet_completion(priv, tx, compl_tag); pkts_done++; } tx->dqo.compl_head = (tx->dqo.compl_head + 1) & tx->dqo.compl_mask; /* Flip the generation bit when we wrap around */ tx->dqo.cur_gen_bit ^= tx->dqo.compl_head == 0; work_done++; } + /* + * Waking the xmit taskqueue has to occur after room has been made in + * the queue. + */ + atomic_thread_fence_seq_cst(); + if (atomic_load_bool(&tx->stopped) && work_done) { + atomic_store_bool(&tx->stopped, false); + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + } + tx->done += work_done; /* tx->done is just a sysctl counter */ counter_enter(); counter_u64_add_protected(tx->stats.tbytes, bytes_done); counter_u64_add_protected(tx->stats.tpackets, pkts_done); counter_exit(); return (work_done == budget); } void gve_tx_cleanup_tq_dqo(void *arg, int pending) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; if (gve_tx_cleanup_dqo(priv, tx, /*budget=*/1024)) { taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); return; } gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset, GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); }
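For illustration, the generation-bit scan in gve_tx_cleanup_dqo() can be modeled as follows: a slot is consumable only once the producer has rewritten it with the bit for the current lap, and each side flips its bit whenever its index wraps. Names and the ring size below are invented.

/*
 * Standalone sketch (not driver code): draining a completion ring guarded
 * by a generation bit that flips on every wrap of the head index.
 */
#include <stdint.h>
#include <stdio.h>

#define CQ_SIZE 8u
#define CQ_MASK (CQ_SIZE - 1)

struct compl_sk { uint8_t generation; uint16_t tag; };

static struct compl_sk ring[CQ_SIZE];	/* starts zeroed, like the cleared ring */
static uint32_t head;
static uint8_t cur_gen;			/* the bit a *stale* slot still carries */

static void
produce(uint16_t tag)
{
	static uint32_t prod_head;
	static uint8_t prod_gen = 1;	/* the first lap is written with gen == 1 */

	ring[prod_head] = (struct compl_sk){ prod_gen, tag };
	prod_head = (prod_head + 1) & CQ_MASK;
	prod_gen ^= (prod_head == 0);	/* producer flips its bit on wrap */
}

static void
consume(void)
{
	for (;;) {
		struct compl_sk *c = &ring[head];

		if (c->generation == cur_gen)	/* not rewritten on this lap yet */
			break;
		printf("completion tag %u\n", (unsigned)c->tag);
		head = (head + 1) & CQ_MASK;
		cur_gen ^= (head == 0);		/* consumer flips on wrap too */
	}
}

int
main(void)
{
	for (uint16_t t = 0; t < 5; t++)
		produce(t);
	consume();			/* tags 0..4 */
	for (uint16_t t = 5; t < 10; t++)
		produce(t);
	consume();			/* tags 5..9, across the wrap */
	return (0);
}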