diff --git a/share/man/man4/gve.4 b/share/man/man4/gve.4 index 5f58a4c1a503..2ae96c93e37d 100644 --- a/share/man/man4/gve.4 +++ b/share/man/man4/gve.4 @@ -1,260 +1,279 @@ .\" SPDX-License-Identifier: BSD-3-Clause .\" .\" Copyright (c) 2023-2024 Google LLC .\" .\" Redistribution and use in source and binary forms, with or without modification, .\" are permitted provided that the following conditions are met: .\" .\" 1. Redistributions of source code must retain the above copyright notice, this .\" list of conditions and the following disclaimer. .\" .\" 2. Redistributions in binary form must reproduce the above copyright notice, .\" this list of conditions and the following disclaimer in the documentation .\" and/or other materials provided with the distribution. .\" .\" 3. Neither the name of the copyright holder nor the names of its contributors .\" may be used to endorse or promote products derived from this software without .\" specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED .\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE .\" DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR .\" ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES .\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; .\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON .\" ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS .\" SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.Dd October 14, 2024 .Dt GVE 4 .Os .Sh NAME .Nm gve .Nd "Ethernet driver for Google Virtual NIC (gVNIC)" .Sh SYNOPSIS To compile this driver into the kernel, place the following lines in your kernel configuration file: .Bd -ragged -offset indent .Cd "device gve" .Ed .Pp Alternatively, to load the driver as a module at boot time, place the following line in .Xr loader.conf 5 : .Bd -literal -offset indent if_gve_load="YES" .Ed .Sh DESCRIPTION gVNIC is a virtual network interface designed specifically for Google Compute Engine (GCE). It is required to support per-VM Tier-1 networking performance, and for using certain VM shapes on GCE. .Pp .Nm is the driver for gVNIC. It supports the following features: .Pp .Bl -bullet -compact .It RX checksum offload .It TX checksum offload .It TCP Segmentation Offload (TSO) .It Large Receive Offload (LRO) in software .It Jumbo frames .It Receive Side Scaling (RSS) .El .Pp For more information on configuring this device, see .Xr ifconfig 8 . .Sh HARDWARE .Nm binds to a single PCI device ID presented by gVNIC: .Pp .Bl -bullet -compact .It 0x1AE0:0x0042 .El +.Sh EXAMPLES +.Pp +Change the TX queue count to 4 for the gve0 interface: +.D1 sysctl dev.gve.0.num_tx_queues=4 +.Pp +Change the RX queue count to 4 for the gve0 interface: +.D1 sysctl dev.gve.0.num_rx_queues=4 .Sh DIAGNOSTICS The following messages are recorded during driver initialization: .Bl -diag .It "Enabled MSIX with %d vectors" .It "Configured device resources" .It "Successfully attached %s" .It "Deconfigured device resources" .El .Pp These messages are seen if driver initialization fails. 
Global (across-queues) allocation failures: .Bl -diag .It "Failed to configure device resources: err=%d" .It "No compatible queue formats" .It "Failed to allocate ifnet struct" .It "Failed to allocate admin queue mem" .It "Failed to alloc DMA mem for DescribeDevice" .It "Failed to allocate QPL page" .El .Pp irq and BAR allocation failures: .Bl -diag .It "Failed to acquire any msix vectors" .It "Tried to acquire %d msix vectors, got only %d" .It "Failed to setup irq %d for Tx queue %d " .It "Failed to setup irq %d for Rx queue %d " .It "Failed to allocate irq %d for mgmnt queue" .It "Failed to setup irq %d for mgmnt queue, err: %d" .It "Failed to allocate BAR0" .It "Failed to allocate BAR2" .It "Failed to allocate msix table" .El .Pp Rx queue-specific allocation failures: .Bl -diag .It "No QPL left for rx ring %d" .It "Failed to alloc queue resources for rx ring %d" .It "Failed to alloc desc ring for rx ring %d" .It "Failed to alloc data ring for rx ring %d" .El .Pp Tx queue-specific allocation failures: .Bl -diag .It "No QPL left for tx ring %d" .It "Failed to alloc queue resources for tx ring %d" .It "Failed to alloc desc ring for tx ring %d" .It "Failed to vmap fifo, qpl_id = %d" .El .Pp The following messages are recorded when the interface detach fails: .Bl -diag .It "Failed to deconfigure device resources: err=%d" .El .Pp If bootverbose is on, the following messages are recorded when the interface is being brought up: .Bl -diag .It "Created %d rx queues" .It "Created %d tx queues" .It "MTU set to %d" .El .Pp The following messages are recorded when the interface is being brought down: .Bl -diag .It "Destroyed %d rx queues" .It "Destroyed %d tx queues" .El .Pp These messages are seen if errors are encountered when bringing the interface up or down: .Bl -diag .It "Failed to destroy rxq %d, err: %d" .It "Failed to destroy txq %d, err: %d" .It "Failed to create rxq %d, err: %d" .It "Failed to create txq %d, err: %d" .It "Failed to set MTU to %d" .It "Invalid new 
MTU setting. new mtu: %d max mtu: %d min mtu: %d" .It "Cannot bring the iface up when detached" .It "Reached max number of registered pages %lu > %lu" .It "Failed to init lro for rx ring %d" .El .Pp These messages are seen if any admin queue command fails: .Bl -diag .It "AQ command(%u): failed with status %d" .It "AQ command(%u): unknown status code %d" .It "AQ commands timed out, need to reset AQ" .It "Unknown AQ command opcode %d" .El .Pp These messages are recorded when the device is being reset due to an error: .Bl -diag .It "Scheduling reset task!" .It "Waiting until admin queue is released." .It "Admin queue released" .El .Pp If it was the NIC that requested the reset, this message is recorded: .Bl -diag .It "Device requested reset" .El .Pp If the reset fails during the reinitialization phase, this message is recorded: .Bl -diag .It "Restore failed!" .El .Pp These two messages correspond to the NIC alerting the driver to link state changes: .Bl -diag .It "Device link is up." .It "Device link is down." .El .Pp Apart from these messages, the driver exposes per-queue packet and error counters as sysctl nodes. Global (across queues) counters can be read using .Xr netstat 1 . .Sh SYSCTL VARIABLES .Nm exposes the following .Xr sysctl 8 variables: .Bl -tag -width indent .It Va hw.gve.driver_version The driver version. This is read-only. .It Va hw.gve.queue_format The queue format in use. This is read-only. .It Va hw.gve.disable_hw_lro Setting this boot-time tunable to 1 disables Large Receive Offload (LRO) in the NIC. The default value is 0, which means hardware LRO is enabled by default. The software LRO stack in the kernel is always used. This sysctl variable needs to be set before loading the driver, using .Xr loader.conf 5 . +.It Va dev.gve.X.num_rx_queues No and Va dev.gve.X.num_tx_queues +Run-time tunables that represent the number of currently used RX/TX queues. +The default value is the maximum number of RX/TX queues the device can support. 
+.Pp +Changing either value brings the interface down while the new queues are set up, +which may cause new packets to be dropped. +The change can fail if the system is not able to provide the driver with enough resources. +In that situation, the driver will revert to the previous number of RX/TX queues. +If this also fails, a device reset will be triggered. +.Pp +Note: sysctl nodes for queue stats remain available even if a queue is removed. +.Pp .El .Sh LIMITATIONS .Nm does not support the transmission of VLAN-tagged packets. All VLAN-tagged traffic is dropped. .Sh QUEUE FORMATS .Nm features different datapath modes called queue formats: .Pp .Bl -bullet -compact .It GQI_QPL: "QPL" stands for "Queue Page List" and refers to the fact that hardware expects a fixed bounce buffer and cannot access arbitrary memory. GQI is the older descriptor format. The G in "GQI" refers to an older generation of hardware, and the "QI" stands for "Queue In-order" referring to the fact that the NIC sends Tx and Rx completions in the same order as the one in which the corresponding descriptors were posted by the driver. .It DQO_RDA: DQO is the descriptor format required to take full advantage of next generation VM shapes. "RDA" stands for "Raw DMA Addressing" and refers to the fact that hardware can work with DMA-ed packets and does not expect them to be copied into or out of a fixed bounce buffer. The D in "DQO" refers to a newer generation of hardware, and the "QO" stands for "Queue Out-of-order" referring to the fact that the NIC might send Tx and Rx completions in an order different from the one in which the corresponding descriptors were posted by the driver. .It DQO_QPL: The next generation descriptor format in the "QPL" mode. .El .Sh SUPPORT Please email gvnic-drivers@google.com with the specifics of the issue encountered. .Sh SEE ALSO .Xr netstat 1 , .Xr loader.conf 5 , .Xr ifconfig 8 , .Xr sysctl 8 .Sh HISTORY The .Nm device driver first appeared in .Fx 13.3 . 
.Sh AUTHORS The .Nm driver was written by Google. diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h index bf15eb3ccabc..2b49ee5ad45a 100644 --- a/sys/dev/gve/gve.h +++ b/sys/dev/gve/gve.h @@ -1,701 +1,703 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef _GVE_FBSD_H #define _GVE_FBSD_H #include "gve_desc.h" #include "gve_plat.h" #include "gve_register.h" #ifndef PCI_VENDOR_ID_GOOGLE #define PCI_VENDOR_ID_GOOGLE 0x1ae0 #endif #define PCI_DEV_ID_GVNIC 0x0042 #define GVE_REGISTER_BAR 0 #define GVE_DOORBELL_BAR 2 /* Driver can alloc up to 2 segments for the header and 2 for the payload. */ #define GVE_TX_MAX_DESCS 4 #define GVE_TX_BUFRING_ENTRIES 4096 #define ADMINQ_SIZE PAGE_SIZE #define GVE_DEFAULT_RX_BUFFER_SIZE 2048 /* Each RX bounce buffer page can fit two packet buffers. */ #define GVE_DEFAULT_RX_BUFFER_OFFSET (PAGE_SIZE / 2) /* PTYPEs are always 10 bits. */ #define GVE_NUM_PTYPES 1024 /* * Number of descriptors per queue page list. * Page count AKA QPL size can be derived by dividing the number of elements in * a page by the number of descriptors available. */ #define GVE_QPL_DIVISOR 16 static MALLOC_DEFINE(M_GVE, "gve", "gve allocations"); struct gve_dma_handle { bus_addr_t bus_addr; void *cpu_addr; bus_dma_tag_t tag; bus_dmamap_t map; }; union gve_tx_desc { struct gve_tx_pkt_desc pkt; /* first desc for a packet */ struct gve_tx_mtd_desc mtd; /* optional metadata descriptor */ struct gve_tx_seg_desc seg; /* subsequent descs for a packet */ }; /* Tracks the memory in the fifo occupied by a segment of a packet */ struct gve_tx_iovec { uint32_t iov_offset; /* offset into this segment */ uint32_t iov_len; /* length */ uint32_t iov_padding; /* padding associated with this segment */ }; /* Tracks allowed and current queue settings */ struct gve_queue_config { uint16_t max_queues; uint16_t num_queues; /* current */ }; struct gve_irq_db { __be32 index; } __aligned(CACHE_LINE_SIZE); /* * GVE_QUEUE_FORMAT_UNSPECIFIED must be zero since 0 is the default value * when the entire configure_device_resources command is zeroed out and the * queue_format is not specified. 
*/ enum gve_queue_format { GVE_QUEUE_FORMAT_UNSPECIFIED = 0x0, GVE_GQI_RDA_FORMAT = 0x1, GVE_GQI_QPL_FORMAT = 0x2, GVE_DQO_RDA_FORMAT = 0x3, GVE_DQO_QPL_FORMAT = 0x4, }; enum gve_state_flags_bit { GVE_STATE_FLAG_ADMINQ_OK, GVE_STATE_FLAG_RESOURCES_OK, GVE_STATE_FLAG_QPLREG_OK, GVE_STATE_FLAG_RX_RINGS_OK, GVE_STATE_FLAG_TX_RINGS_OK, GVE_STATE_FLAG_QUEUES_UP, GVE_STATE_FLAG_LINK_UP, GVE_STATE_FLAG_DO_RESET, GVE_STATE_FLAG_IN_RESET, GVE_NUM_STATE_FLAGS /* Not part of the enum space */ }; BITSET_DEFINE(gve_state_flags, GVE_NUM_STATE_FLAGS); #define GVE_DEVICE_STATUS_RESET (0x1 << 1) #define GVE_DEVICE_STATUS_LINK_STATUS (0x1 << 2) #define GVE_RING_LOCK(ring) mtx_lock(&(ring)->ring_mtx) #define GVE_RING_TRYLOCK(ring) mtx_trylock(&(ring)->ring_mtx) #define GVE_RING_UNLOCK(ring) mtx_unlock(&(ring)->ring_mtx) #define GVE_RING_ASSERT(ring) mtx_assert(&(ring)->ring_mtx, MA_OWNED) #define GVE_IFACE_LOCK_INIT(lock) sx_init(&lock, "gve interface lock") #define GVE_IFACE_LOCK_DESTROY(lock) sx_destroy(&lock) #define GVE_IFACE_LOCK_LOCK(lock) sx_xlock(&lock) #define GVE_IFACE_LOCK_UNLOCK(lock) sx_unlock(&lock) #define GVE_IFACE_LOCK_ASSERT(lock) sx_assert(&lock, SA_XLOCKED) struct gve_queue_page_list { uint32_t id; uint32_t num_dmas; uint32_t num_pages; vm_offset_t kva; vm_page_t *pages; struct gve_dma_handle *dmas; }; struct gve_irq { struct resource *res; void *cookie; }; struct gve_rx_slot_page_info { void *page_address; vm_page_t page; uint32_t page_offset; uint16_t pad; }; /* * A single received packet split across multiple buffers may be * reconstructed using the information in this structure. */ struct gve_rx_ctx { /* head and tail of mbuf chain for the current packet */ struct mbuf *mbuf_head; struct mbuf *mbuf_tail; uint32_t total_size; uint8_t frag_cnt; bool is_tcp; bool drop_pkt; }; struct gve_ring_com { struct gve_priv *priv; uint32_t id; /* * BAR2 offset for this ring's doorbell and the * counter-array offset for this ring's counter. 
* Acquired from the device individually for each * queue in the queue_create adminq command. */ struct gve_queue_resources *q_resources; struct gve_dma_handle q_resources_mem; /* Byte offset into BAR2 where this ring's 4-byte irq doorbell lies. */ uint32_t irq_db_offset; /* Byte offset into BAR2 where this ring's 4-byte doorbell lies. */ uint32_t db_offset; /* * Index, not byte-offset, into the counter array where this ring's * 4-byte counter lies. */ uint32_t counter_idx; /* * The index of the MSIX vector that was assigned to * this ring in `gve_alloc_irqs`. * * It is passed to the device in the queue_create adminq * command. * * Additionally, this also serves as the index into * `priv->irq_db_indices` where this ring's irq doorbell's * BAR2 offset, `irq_db_idx`, can be found. */ int ntfy_id; /* * The fixed bounce buffer for this ring. * Once allocated, has to be offered to the device * over the register-page-list adminq command. */ struct gve_queue_page_list *qpl; struct task cleanup_task; struct taskqueue *cleanup_tq; } __aligned(CACHE_LINE_SIZE); struct gve_rxq_stats { counter_u64_t rbytes; counter_u64_t rpackets; counter_u64_t rx_dropped_pkt; counter_u64_t rx_copybreak_cnt; counter_u64_t rx_frag_flip_cnt; counter_u64_t rx_frag_copy_cnt; counter_u64_t rx_dropped_pkt_desc_err; counter_u64_t rx_dropped_pkt_buf_post_fail; counter_u64_t rx_dropped_pkt_mbuf_alloc_fail; counter_u64_t rx_mbuf_dmamap_err; counter_u64_t rx_mbuf_mclget_null; }; #define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t)) union gve_rx_qpl_buf_id_dqo { struct { uint16_t buf_id:11; /* Index into rx->dqo.bufs */ uint8_t frag_num:5; /* Which frag in the QPL page */ }; uint16_t all; } __packed; _Static_assert(sizeof(union gve_rx_qpl_buf_id_dqo) == 2, "gve: bad dqo qpl rx buf id length"); struct gve_rx_buf_dqo { union { /* RDA */ struct { struct mbuf *mbuf; bus_dmamap_t dmamap; uint64_t addr; bool mapped; }; /* QPL */ struct { uint8_t num_nic_frags; /* number of pending 
completions */ uint8_t next_idx; /* index of the next frag to post */ /* for chaining rx->dqo.used_bufs */ STAILQ_ENTRY(gve_rx_buf_dqo) stailq_entry; }; }; /* for chaining rx->dqo.free_bufs */ SLIST_ENTRY(gve_rx_buf_dqo) slist_entry; }; /* power-of-2 sized receive ring */ struct gve_rx_ring { struct gve_ring_com com; struct gve_dma_handle desc_ring_mem; uint32_t cnt; /* free-running total number of completed packets */ uint32_t fill_cnt; /* free-running total number of descs and buffs posted */ union { /* GQI-only fields */ struct { struct gve_dma_handle data_ring_mem; /* accessed in the GQ receive hot path */ struct gve_rx_desc *desc_ring; union gve_rx_data_slot *data_ring; struct gve_rx_slot_page_info *page_info; uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */ uint8_t seq_no; /* helps traverse the descriptor ring */ }; /* DQO-only fields */ struct { struct gve_dma_handle compl_ring_mem; struct gve_rx_compl_desc_dqo *compl_ring; struct gve_rx_desc_dqo *desc_ring; struct gve_rx_buf_dqo *bufs; /* Parking place for posted buffers */ bus_dma_tag_t buf_dmatag; /* To dmamap posted mbufs with */ uint32_t buf_cnt; /* Size of the bufs array */ uint32_t mask; /* One less than the sizes of the desc and compl rings */ uint32_t head; /* The index at which to post the next buffer at */ uint32_t tail; /* The index at which to receive the next compl at */ uint8_t cur_gen_bit; /* Gets flipped on every cycle of the compl ring */ SLIST_HEAD(, gve_rx_buf_dqo) free_bufs; /* * Only used in QPL mode. Pages referred to by if_input-ed mbufs * stay parked here till their wire count comes back to 1. * Pages are moved here after there aren't any pending completions. */ STAILQ_HEAD(, gve_rx_buf_dqo) used_bufs; } dqo; }; struct lro_ctrl lro; struct gve_rx_ctx ctx; struct gve_rxq_stats stats; } __aligned(CACHE_LINE_SIZE); /* * A contiguous representation of the pages composing the Tx bounce buffer. 
* The xmit taskqueue and the completion taskqueue both simultaneously use it. * Both operate on `available`: the xmit tq lowers it and the completion tq * raises it. `head` is the last location written at and so only the xmit tq * uses it. */ struct gve_tx_fifo { vm_offset_t base; /* address of base of FIFO */ uint32_t size; /* total size */ volatile int available; /* how much space is still available */ uint32_t head; /* offset to write at */ }; struct gve_tx_buffer_state { struct mbuf *mbuf; struct gve_tx_iovec iov[GVE_TX_MAX_DESCS]; }; struct gve_txq_stats { counter_u64_t tbytes; counter_u64_t tpackets; counter_u64_t tso_packet_cnt; counter_u64_t tx_dropped_pkt; counter_u64_t tx_delayed_pkt_nospace_device; counter_u64_t tx_dropped_pkt_nospace_bufring; counter_u64_t tx_delayed_pkt_nospace_descring; counter_u64_t tx_delayed_pkt_nospace_compring; counter_u64_t tx_delayed_pkt_nospace_qpl_bufs; counter_u64_t tx_delayed_pkt_tsoerr; counter_u64_t tx_dropped_pkt_vlan; counter_u64_t tx_mbuf_collapse; counter_u64_t tx_mbuf_defrag; counter_u64_t tx_mbuf_defrag_err; counter_u64_t tx_mbuf_dmamap_enomem_err; counter_u64_t tx_mbuf_dmamap_err; }; #define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t)) struct gve_tx_pending_pkt_dqo { struct mbuf *mbuf; union { /* RDA */ bus_dmamap_t dmamap; /* QPL */ struct { /* * A linked list of entries from qpl_bufs that served * as the bounce buffer for this packet. 
*/ int32_t qpl_buf_head; uint32_t num_qpl_bufs; }; }; uint8_t state; /* the gve_packet_state enum */ int next; /* To chain the free_pending_pkts lists */ }; /* power-of-2 sized transmit ring */ struct gve_tx_ring { struct gve_ring_com com; struct gve_dma_handle desc_ring_mem; struct task xmit_task; struct taskqueue *xmit_tq; bool stopped; /* Accessed when writing descriptors */ struct buf_ring *br; struct mtx ring_mtx; uint32_t req; /* free-running total number of packets written to the nic */ uint32_t done; /* free-running total number of completed packets */ union { /* GQI specific stuff */ struct { union gve_tx_desc *desc_ring; struct gve_tx_buffer_state *info; struct gve_tx_fifo fifo; uint32_t mask; /* masks the req and done to the size of the ring */ }; /* DQO specific stuff */ struct { struct gve_dma_handle compl_ring_mem; /* Accessed when writing descriptors */ struct { union gve_tx_desc_dqo *desc_ring; uint32_t desc_mask; /* masks head and tail to the size of desc_ring */ uint32_t desc_head; /* last desc read by NIC, cached value of hw_tx_head */ uint32_t desc_tail; /* last desc written by driver */ uint32_t last_re_idx; /* desc which last had "report event" set */ /* * The head index of a singly linked list containing pending packet objects * to park mbufs till the NIC sends completions. Once this list is depleted, * the "_prd" suffixed producer list, grown by the completion taskqueue, * is stolen. */ int32_t free_pending_pkts_csm; /* * The head index of a singly linked list representing QPL page fragments * to copy mbuf payload into for the NIC to see. Once this list is depleted, * the "_prd" suffixed producer list, grown by the completion taskqueue, * is stolen. * * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist. */ int32_t free_qpl_bufs_csm; uint32_t qpl_bufs_consumed; /* Allows quickly checking for buf availability */ uint32_t qpl_bufs_produced_cached; /* Cached value of qpl_bufs_produced */ /* DMA params for mapping Tx mbufs. 
Only used in RDA mode. */ bus_dma_tag_t buf_dmatag; } __aligned(CACHE_LINE_SIZE); /* Accessed when processing completions */ struct { struct gve_tx_compl_desc_dqo *compl_ring; uint32_t compl_mask; /* masks head to the size of compl_ring */ uint32_t compl_head; /* last completion read by driver */ uint8_t cur_gen_bit; /* NIC flips a bit on every pass */ uint32_t hw_tx_head; /* last desc read by NIC */ /* * The completion taskqueue moves pending-packet objects to this * list after freeing the mbuf. The "_prd" denotes that this is * a producer list. The transmit taskqueue steals this list once * its consumer list, with the "_csm" suffix, is depleted. */ int32_t free_pending_pkts_prd; /* * The completion taskqueue moves the QPL pages corresponding to a * completed packet into this list. It is only used in QPL mode. * The "_prd" denotes that this is a producer list. The transmit * taskqueue steals this list once its consumer list, with the "_csm" * suffix, is depleted. * * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist. */ int32_t free_qpl_bufs_prd; uint32_t qpl_bufs_produced; } __aligned(CACHE_LINE_SIZE); /* Accessed by both the completion and xmit loops */ struct { /* completion tags index into this array */ struct gve_tx_pending_pkt_dqo *pending_pkts; uint16_t num_pending_pkts; /* * Represents QPL page fragments. An index into this array * always represents the same QPL page fragment. The value * is also an index into this array and serves as a means * to chain buffers into linked lists whose heads are * either free_qpl_bufs_prd or free_qpl_bufs_csm or * qpl_buf_head. */ int32_t *qpl_bufs; } __aligned(CACHE_LINE_SIZE); } dqo; }; struct gve_txq_stats stats; } __aligned(CACHE_LINE_SIZE); enum gve_packet_state { /* * Packet does not yet have a dmamap created. * This should always be zero since state is not explicitly initialized. */ GVE_PACKET_STATE_UNALLOCATED, /* Packet has a dmamap and is in free list, available to be allocated. 
*/ GVE_PACKET_STATE_FREE, /* Packet is expecting a regular data completion */ GVE_PACKET_STATE_PENDING_DATA_COMPL, }; struct gve_ptype { uint8_t l3_type; /* `gve_l3_type` in gve_adminq.h */ uint8_t l4_type; /* `gve_l4_type` in gve_adminq.h */ }; struct gve_ptype_lut { struct gve_ptype ptypes[GVE_NUM_PTYPES]; }; struct gve_priv { if_t ifp; device_t dev; struct ifmedia media; uint8_t mac[ETHER_ADDR_LEN]; struct gve_dma_handle aq_mem; struct resource *reg_bar; /* BAR0 */ struct resource *db_bar; /* BAR2 */ struct resource *msix_table; uint32_t mgmt_msix_idx; uint32_t rx_copybreak; uint16_t num_event_counters; uint16_t default_num_queues; uint16_t tx_desc_cnt; uint16_t rx_desc_cnt; uint16_t rx_pages_per_qpl; uint64_t max_registered_pages; uint64_t num_registered_pages; uint32_t supported_features; uint16_t max_mtu; struct gve_dma_handle counter_array_mem; __be32 *counters; struct gve_dma_handle irqs_db_mem; struct gve_irq_db *irq_db_indices; enum gve_queue_format queue_format; struct gve_queue_config tx_cfg; struct gve_queue_config rx_cfg; uint32_t num_queues; struct gve_irq *irq_tbl; struct gve_tx_ring *tx; struct gve_rx_ring *rx; struct gve_ptype_lut *ptype_lut_dqo; /* * Admin queue - see gve_adminq.h * Since AQ cmds do not run in steady state, 32 bit counters suffice */ struct gve_adminq_command *adminq; vm_paddr_t adminq_bus_addr; uint32_t adminq_mask; /* masks prod_cnt to adminq size */ uint32_t adminq_prod_cnt; /* free-running count of AQ cmds executed */ uint32_t adminq_cmd_fail; /* free-running count of AQ cmds failed */ uint32_t adminq_timeouts; /* free-running count of AQ cmds timeouts */ /* free-running count of each distinct AQ cmd executed */ uint32_t adminq_describe_device_cnt; uint32_t adminq_cfg_device_resources_cnt; uint32_t adminq_register_page_list_cnt; uint32_t adminq_unregister_page_list_cnt; uint32_t adminq_create_tx_queue_cnt; uint32_t adminq_create_rx_queue_cnt; uint32_t adminq_destroy_tx_queue_cnt; uint32_t adminq_destroy_rx_queue_cnt; uint32_t 
adminq_dcfg_device_resources_cnt; uint32_t adminq_set_driver_parameter_cnt; uint32_t adminq_verify_driver_compatibility_cnt; uint32_t adminq_get_ptype_map_cnt; uint32_t interface_up_cnt; uint32_t interface_down_cnt; uint32_t reset_cnt; struct task service_task; struct taskqueue *service_tq; struct gve_state_flags state_flags; struct sx gve_iface_lock; }; /* Returns whether bit `pos` (a gve_state_flags_bit value) is set in priv->state_flags. */ static inline bool gve_get_state_flag(struct gve_priv *priv, int pos) { return (BIT_ISSET(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags)); } /* Sets bit `pos` in priv->state_flags using BIT_SET_ATOMIC. */ static inline void gve_set_state_flag(struct gve_priv *priv, int pos) { BIT_SET_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); } /* Clears bit `pos` in priv->state_flags using BIT_CLR_ATOMIC. */ static inline void gve_clear_state_flag(struct gve_priv *priv, int pos) { BIT_CLR_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); } /* True only when the queue format is GVE_GQI_QPL_FORMAT; GVE_GQI_RDA_FORMAT is not matched here. */ static inline bool gve_is_gqi(struct gve_priv *priv) { return (priv->queue_format == GVE_GQI_QPL_FORMAT); } /* True when the queue format is either of the QPL formats (GVE_GQI_QPL_FORMAT or GVE_DQO_QPL_FORMAT). */ static inline bool gve_is_qpl(struct gve_priv *priv) { return (priv->queue_format == GVE_GQI_QPL_FORMAT || priv->queue_format == GVE_DQO_QPL_FORMAT); } /* Defined in gve_main.c */ void gve_schedule_reset(struct gve_priv *priv); +int gve_adjust_tx_queues(struct gve_priv *priv, uint16_t new_queue_cnt); +int gve_adjust_rx_queues(struct gve_priv *priv, uint16_t new_queue_cnt); /* Register access functions defined in gve_utils.c */ uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset); void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); void gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); /* QPL (Queue Page List) functions defined in gve_qpl.c */ struct gve_queue_page_list *gve_alloc_qpl(struct gve_priv *priv, uint32_t id, int npages, bool single_kva); void gve_free_qpl(struct gve_priv *priv, struct gve_queue_page_list *qpl); int gve_register_qpls(struct gve_priv *priv); int gve_unregister_qpls(struct gve_priv *priv); void gve_mextadd_free(struct mbuf 
*mbuf); /* TX functions defined in gve_tx.c */ -int gve_alloc_tx_rings(struct gve_priv *priv); -void gve_free_tx_rings(struct gve_priv *priv); +int gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); +void gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); int gve_create_tx_rings(struct gve_priv *priv); int gve_destroy_tx_rings(struct gve_priv *priv); int gve_tx_intr(void *arg); int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf); void gve_qflush(if_t ifp); void gve_xmit_tq(void *arg, int pending); void gve_tx_cleanup_tq(void *arg, int pending); /* TX functions defined in gve_tx_dqo.c */ int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i); void gve_tx_free_ring_dqo(struct gve_priv *priv, int i); void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i); int gve_tx_intr_dqo(void *arg); int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr); int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf); void gve_tx_cleanup_tq_dqo(void *arg, int pending); /* RX functions defined in gve_rx.c */ -int gve_alloc_rx_rings(struct gve_priv *priv); -void gve_free_rx_rings(struct gve_priv *priv); +int gve_alloc_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); +void gve_free_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); int gve_create_rx_rings(struct gve_priv *priv); int gve_destroy_rx_rings(struct gve_priv *priv); int gve_rx_intr(void *arg); void gve_rx_cleanup_tq(void *arg, int pending); /* RX functions defined in gve_rx_dqo.c */ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i); void gve_rx_free_ring_dqo(struct gve_priv *priv, int i); void gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx); void gve_clear_rx_ring_dqo(struct gve_priv *priv, int i); int gve_rx_intr_dqo(void *arg); void gve_rx_cleanup_tq_dqo(void *arg, int pending); /* DMA functions defined in gve_utils.c */ int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, struct 
gve_dma_handle *dma); void gve_dma_free_coherent(struct gve_dma_handle *dma); int gve_dmamap_create(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma); void gve_dmamap_destroy(struct gve_dma_handle *dma); /* IRQ functions defined in gve_utils.c */ void gve_free_irqs(struct gve_priv *priv); int gve_alloc_irqs(struct gve_priv *priv); void gve_unmask_all_queue_irqs(struct gve_priv *priv); void gve_mask_all_queue_irqs(struct gve_priv *priv); /* Sysctl functions defined in gve_sysctl.c */ extern bool gve_disable_hw_lro; extern char gve_queue_format[8]; extern char gve_version[8]; void gve_setup_sysctl(struct gve_priv *priv); void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets, uint64_t *tbytes, uint64_t *tx_dropped_pkt); /* Stats functions defined in gve_utils.c */ void gve_alloc_counters(counter_u64_t *stat, int num_stats); void gve_free_counters(counter_u64_t *stat, int num_stats); #endif /* _GVE_FBSD_H */ diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c index 72e7fc2e3f89..39556b85f493 100644 --- a/sys/dev/gve/gve_main.c +++ b/sys/dev/gve/gve_main.c @@ -1,949 +1,1027 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" #define GVE_DRIVER_VERSION "GVE-FBSD-1.3.2\n" #define GVE_VERSION_MAJOR 1 #define GVE_VERSION_MINOR 3 #define GVE_VERSION_SUB 2 #define GVE_DEFAULT_RX_COPYBREAK 256 /* Devices supported by this driver. 
*/ static struct gve_dev { uint16_t vendor_id; uint16_t device_id; const char *name; } gve_devs[] = { { PCI_VENDOR_ID_GOOGLE, PCI_DEV_ID_GVNIC, "gVNIC" } }; struct sx gve_global_lock; static int gve_verify_driver_compatibility(struct gve_priv *priv) { int err; struct gve_driver_info *driver_info; struct gve_dma_handle driver_info_mem; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_driver_info), PAGE_SIZE, &driver_info_mem); if (err != 0) return (ENOMEM); driver_info = driver_info_mem.cpu_addr; *driver_info = (struct gve_driver_info) { .os_type = 3, /* Freebsd */ .driver_major = GVE_VERSION_MAJOR, .driver_minor = GVE_VERSION_MINOR, .driver_sub = GVE_VERSION_SUB, .os_version_major = htobe32(FBSD_VERSION_MAJOR), .os_version_minor = htobe32(FBSD_VERSION_MINOR), .os_version_sub = htobe32(FBSD_VERSION_PATCH), .driver_capability_flags = { htobe64(GVE_DRIVER_CAPABILITY_FLAGS1), htobe64(GVE_DRIVER_CAPABILITY_FLAGS2), htobe64(GVE_DRIVER_CAPABILITY_FLAGS3), htobe64(GVE_DRIVER_CAPABILITY_FLAGS4), }, }; snprintf(driver_info->os_version_str1, sizeof(driver_info->os_version_str1), "FreeBSD %u", __FreeBSD_version); bus_dmamap_sync(driver_info_mem.tag, driver_info_mem.map, BUS_DMASYNC_PREREAD); err = gve_adminq_verify_driver_compatibility(priv, sizeof(struct gve_driver_info), driver_info_mem.bus_addr); /* It's ok if the device doesn't support this */ if (err == EOPNOTSUPP) err = 0; gve_dma_free_coherent(&driver_info_mem); return (err); } static int gve_up(struct gve_priv *priv) { if_t ifp = priv->ifp; int err; GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); if (device_is_attached(priv->dev) == 0) { device_printf(priv->dev, "Cannot bring the iface up when detached\n"); return (ENXIO); } if (gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return (0); if_clearhwassist(ifp); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, CSUM_TCP | CSUM_UDP, 0); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, CSUM_IP6_TCP | CSUM_IP6_UDP, 0); if 
(if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, CSUM_IP_TSO, 0); if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); if (gve_is_qpl(priv)) { err = gve_register_qpls(priv); if (err != 0) goto reset; } err = gve_create_rx_rings(priv); if (err != 0) goto reset; err = gve_create_tx_rings(priv); if (err != 0) goto reset; if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); if (!gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(ifp, LINK_STATE_UP); gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } gve_unmask_all_queue_irqs(priv); gve_set_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_up_cnt++; return (0); reset: gve_schedule_reset(priv); return (err); } static void gve_down(struct gve_priv *priv) { GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return; if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); if (gve_destroy_rx_rings(priv) != 0) goto reset; if (gve_destroy_tx_rings(priv) != 0) goto reset; if (gve_is_qpl(priv)) { if (gve_unregister_qpls(priv) != 0) goto reset; } if (gve_is_gqi(priv)) gve_mask_all_queue_irqs(priv); gve_clear_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_down_cnt++; return; reset: gve_schedule_reset(priv); } +int +gve_adjust_rx_queues(struct gve_priv *priv, uint16_t new_queue_cnt) +{ + int err; + + GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); + + gve_down(priv); + + if (new_queue_cnt < priv->rx_cfg.num_queues) { + /* + * Freeing a ring still preserves its ntfy_id, + * which is needed if we create the ring again. 
+ */ + gve_free_rx_rings(priv, new_queue_cnt, priv->rx_cfg.num_queues); + } else { + err = gve_alloc_rx_rings(priv, priv->rx_cfg.num_queues, new_queue_cnt); + if (err != 0) { + device_printf(priv->dev, "Failed to allocate new queues"); + /* Failed to allocate rings, start back up with old ones */ + gve_up(priv); + return (err); + + } + } + priv->rx_cfg.num_queues = new_queue_cnt; + + err = gve_up(priv); + if (err != 0) + gve_schedule_reset(priv); + + return (err); +} + +int +gve_adjust_tx_queues(struct gve_priv *priv, uint16_t new_queue_cnt) +{ + int err; + + GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); + + gve_down(priv); + + if (new_queue_cnt < priv->tx_cfg.num_queues) { + /* + * Freeing a ring still preserves its ntfy_id, + * which is needed if we create the ring again. + */ + gve_free_tx_rings(priv, new_queue_cnt, priv->tx_cfg.num_queues); + } else { + err = gve_alloc_tx_rings(priv, priv->tx_cfg.num_queues, new_queue_cnt); + if (err != 0) { + device_printf(priv->dev, "Failed to allocate new queues"); + /* Failed to allocate rings, start back up with old ones */ + gve_up(priv); + return (err); + + } + } + priv->tx_cfg.num_queues = new_queue_cnt; + + err = gve_up(priv); + if (err != 0) + gve_schedule_reset(priv); + + return (err); +} + static int gve_set_mtu(if_t ifp, uint32_t new_mtu) { struct gve_priv *priv = if_getsoftc(ifp); const uint32_t max_problem_range = 8227; const uint32_t min_problem_range = 7822; int err; if ((new_mtu > priv->max_mtu) || (new_mtu < ETHERMIN)) { device_printf(priv->dev, "Invalid new MTU setting. new mtu: %d max mtu: %d min mtu: %d\n", new_mtu, priv->max_mtu, ETHERMIN); return (EINVAL); } /* * When hardware LRO is enabled in DQ mode, MTUs within the range * [7822, 8227] trigger hardware issues which cause a drastic drop * in throughput. 
*/ if (!gve_is_gqi(priv) && !gve_disable_hw_lro && new_mtu >= min_problem_range && new_mtu <= max_problem_range) { device_printf(priv->dev, "Cannot set to MTU to %d within the range [%d, %d] while hardware LRO is enabled\n", new_mtu, min_problem_range, max_problem_range); return (EINVAL); } err = gve_adminq_set_mtu(priv, new_mtu); if (err == 0) { if (bootverbose) device_printf(priv->dev, "MTU set to %d\n", new_mtu); if_setmtu(ifp, new_mtu); } else { device_printf(priv->dev, "Failed to set MTU to %d\n", new_mtu); } return (err); } static void gve_init(void *arg) { struct gve_priv *priv = (struct gve_priv *)arg; if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } static int gve_ioctl(if_t ifp, u_long command, caddr_t data) { struct gve_priv *priv; struct ifreq *ifr; int rc = 0; priv = if_getsoftc(ifp); ifr = (struct ifreq *)data; switch (command) { case SIOCSIFMTU: if (if_getmtu(ifp) == ifr->ifr_mtu) break; GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); gve_set_mtu(ifp, ifr->ifr_mtu); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); break; case SIOCSIFFLAGS: if ((if_getflags(ifp) & IFF_UP) != 0) { if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } else { if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } break; case SIOCSIFCAP: if (ifr->ifr_reqcap == if_getcapenable(ifp)) break; GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); if_setcapenable(ifp, ifr->ifr_reqcap); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); break; case SIOCSIFMEDIA: /* FALLTHROUGH */ case SIOCGIFMEDIA: rc = ifmedia_ioctl(ifp, ifr, &priv->media, command); break; default: rc = ether_ioctl(ifp, command, data); break; } 
return (rc); } static int gve_media_change(if_t ifp) { struct gve_priv *priv = if_getsoftc(ifp); device_printf(priv->dev, "Media change not supported\n"); return (0); } static void gve_media_status(if_t ifp, struct ifmediareq *ifmr) { struct gve_priv *priv = if_getsoftc(ifp); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_AUTO; } else { ifmr->ifm_active |= IFM_NONE; } GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } static uint64_t gve_get_counter(if_t ifp, ift_counter cnt) { struct gve_priv *priv; uint64_t rpackets = 0; uint64_t tpackets = 0; uint64_t rbytes = 0; uint64_t tbytes = 0; uint64_t rx_dropped_pkt = 0; uint64_t tx_dropped_pkt = 0; priv = if_getsoftc(ifp); gve_accum_stats(priv, &rpackets, &rbytes, &rx_dropped_pkt, &tpackets, &tbytes, &tx_dropped_pkt); switch (cnt) { case IFCOUNTER_IPACKETS: return (rpackets); case IFCOUNTER_OPACKETS: return (tpackets); case IFCOUNTER_IBYTES: return (rbytes); case IFCOUNTER_OBYTES: return (tbytes); case IFCOUNTER_IQDROPS: return (rx_dropped_pkt); case IFCOUNTER_OQDROPS: return (tx_dropped_pkt); default: return (if_get_counter_default(ifp, cnt)); } } static void gve_setup_ifnet(device_t dev, struct gve_priv *priv) { int caps = 0; if_t ifp; ifp = priv->ifp = if_alloc(IFT_ETHER); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setsoftc(ifp, priv); if_setdev(ifp, dev); if_setinitfn(ifp, gve_init); if_setioctlfn(ifp, gve_ioctl); if_settransmitfn(ifp, gve_xmit_ifp); if_setqflushfn(ifp, gve_qflush); /* * Set TSO limits, must match the arguments to bus_dma_tag_create * when creating tx->dqo.buf_dmatag. Only applies to the RDA mode * because in QPL we copy the entire packet into the bounce buffer * and thus it does not matter how fragmented the mbuf is. 
*/ if (!gve_is_gqi(priv) && !gve_is_qpl(priv)) { if_sethwtsomaxsegcount(ifp, GVE_TX_MAX_DATA_DESCS_DQO); if_sethwtsomaxsegsize(ifp, GVE_TX_MAX_BUF_SIZE_DQO); } if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO); #if __FreeBSD_version >= 1400086 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); #else if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | IFF_KNOWSEPOCH); #endif ifmedia_init(&priv->media, IFM_IMASK, gve_media_change, gve_media_status); if_setgetcounterfn(ifp, gve_get_counter); caps = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6 | IFCAP_TSO | IFCAP_LRO; if ((priv->supported_features & GVE_SUP_JUMBO_FRAMES_MASK) != 0) caps |= IFCAP_JUMBO_MTU; if_setcapabilities(ifp, caps); if_setcapenable(ifp, caps); if (bootverbose) device_printf(priv->dev, "Setting initial MTU to %d\n", priv->max_mtu); if_setmtu(ifp, priv->max_mtu); ether_ifattach(ifp, priv->mac); ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO); } static int gve_alloc_counter_array(struct gve_priv *priv) { int err; err = gve_dma_alloc_coherent(priv, sizeof(uint32_t) * priv->num_event_counters, PAGE_SIZE, &priv->counter_array_mem); if (err != 0) return (err); priv->counters = priv->counter_array_mem.cpu_addr; return (0); } static void gve_free_counter_array(struct gve_priv *priv) { if (priv->counters != NULL) gve_dma_free_coherent(&priv->counter_array_mem); priv->counter_array_mem = (struct gve_dma_handle){}; } static int gve_alloc_irq_db_array(struct gve_priv *priv) { int err; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_irq_db) * (priv->num_queues), PAGE_SIZE, &priv->irqs_db_mem); if (err != 0) return (err); priv->irq_db_indices = priv->irqs_db_mem.cpu_addr; return (0); } static void gve_free_irq_db_array(struct gve_priv *priv) { if (priv->irq_db_indices != NULL) gve_dma_free_coherent(&priv->irqs_db_mem); priv->irqs_db_mem = (struct gve_dma_handle){}; } static void gve_free_rings(struct gve_priv *priv) { 
gve_free_irqs(priv); - gve_free_tx_rings(priv); - gve_free_rx_rings(priv); + + gve_free_tx_rings(priv, 0, priv->tx_cfg.num_queues); + free(priv->tx, M_GVE); + priv->tx = NULL; + + gve_free_rx_rings(priv, 0, priv->rx_cfg.num_queues); + free(priv->rx, M_GVE); + priv->rx = NULL; } static int gve_alloc_rings(struct gve_priv *priv) { int err; - err = gve_alloc_rx_rings(priv); + priv->rx = malloc(sizeof(struct gve_rx_ring) * priv->rx_cfg.max_queues, + M_GVE, M_WAITOK | M_ZERO); + err = gve_alloc_rx_rings(priv, 0, priv->rx_cfg.num_queues); if (err != 0) goto abort; - err = gve_alloc_tx_rings(priv); + priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.max_queues, + M_GVE, M_WAITOK | M_ZERO); + err = gve_alloc_tx_rings(priv, 0, priv->tx_cfg.num_queues); if (err != 0) goto abort; err = gve_alloc_irqs(priv); if (err != 0) goto abort; return (0); abort: gve_free_rings(priv); return (err); } static void gve_deconfigure_and_free_device_resources(struct gve_priv *priv) { int err; if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) { err = gve_adminq_deconfigure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to deconfigure device resources: err=%d\n", err); return; } if (bootverbose) device_printf(priv->dev, "Deconfigured device resources\n"); gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); } gve_free_irq_db_array(priv); gve_free_counter_array(priv); if (priv->ptype_lut_dqo) { free(priv->ptype_lut_dqo, M_GVE); priv->ptype_lut_dqo = NULL; } } static int gve_alloc_and_configure_device_resources(struct gve_priv *priv) { int err; if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) return (0); err = gve_alloc_counter_array(priv); if (err != 0) return (err); err = gve_alloc_irq_db_array(priv); if (err != 0) goto abort; err = gve_adminq_configure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to configure device resources: err=%d\n", err); err = (ENXIO); goto abort; } if (!gve_is_gqi(priv)) { 
priv->ptype_lut_dqo = malloc(sizeof(*priv->ptype_lut_dqo), M_GVE, M_WAITOK | M_ZERO); err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); if (err != 0) { device_printf(priv->dev, "Failed to configure ptype lut: err=%d\n", err); goto abort; } } gve_set_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); if (bootverbose) device_printf(priv->dev, "Configured device resources\n"); return (0); abort: gve_deconfigure_and_free_device_resources(priv); return (err); } static void gve_set_queue_cnts(struct gve_priv *priv) { priv->tx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_TX_QUEUES); priv->rx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_RX_QUEUES); priv->tx_cfg.num_queues = priv->tx_cfg.max_queues; priv->rx_cfg.num_queues = priv->rx_cfg.max_queues; if (priv->default_num_queues > 0) { priv->tx_cfg.num_queues = MIN(priv->default_num_queues, priv->tx_cfg.num_queues); priv->rx_cfg.num_queues = MIN(priv->default_num_queues, priv->rx_cfg.num_queues); } - priv->num_queues = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues; + priv->num_queues = priv->tx_cfg.max_queues + priv->rx_cfg.max_queues; priv->mgmt_msix_idx = priv->num_queues; } static int gve_alloc_adminq_and_describe_device(struct gve_priv *priv) { int err; if ((err = gve_adminq_alloc(priv)) != 0) return (err); if ((err = gve_verify_driver_compatibility(priv)) != 0) { device_printf(priv->dev, "Failed to verify driver compatibility: err=%d\n", err); goto abort; } if ((err = gve_adminq_describe_device(priv)) != 0) goto abort; gve_set_queue_cnts(priv); priv->num_registered_pages = 0; return (0); abort: gve_release_adminq(priv); return (err); } void gve_schedule_reset(struct gve_priv *priv) { if (gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) return; device_printf(priv->dev, "Scheduling reset task!\n"); gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); taskqueue_enqueue(priv->service_tq, &priv->service_task); } static void gve_destroy(struct gve_priv *priv) { gve_down(priv); 
gve_deconfigure_and_free_device_resources(priv); gve_release_adminq(priv); } static void gve_restore(struct gve_priv *priv) { int err; err = gve_adminq_alloc(priv); if (err != 0) goto abort; err = gve_adminq_configure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to configure device resources: err=%d\n", err); err = (ENXIO); goto abort; } if (!gve_is_gqi(priv)) { err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); if (err != 0) { device_printf(priv->dev, "Failed to configure ptype lut: err=%d\n", err); goto abort; } } err = gve_up(priv); if (err != 0) goto abort; return; abort: device_printf(priv->dev, "Restore failed!\n"); return; } static void gve_clear_device_resources(struct gve_priv *priv) { int i; for (i = 0; i < priv->num_event_counters; i++) priv->counters[i] = 0; bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map, BUS_DMASYNC_PREWRITE); for (i = 0; i < priv->num_queues; i++) priv->irq_db_indices[i] = (struct gve_irq_db){}; bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, BUS_DMASYNC_PREWRITE); if (priv->ptype_lut_dqo) *priv->ptype_lut_dqo = (struct gve_ptype_lut){0}; } static void gve_handle_reset(struct gve_priv *priv) { if (!gve_get_state_flag(priv, GVE_STATE_FLAG_DO_RESET)) return; gve_clear_state_flag(priv, GVE_STATE_FLAG_DO_RESET); gve_set_state_flag(priv, GVE_STATE_FLAG_IN_RESET); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); /* * Releasing the adminq causes the NIC to destroy all resources * registered with it, so by clearing the flags beneath we cause * the subsequent gve_down call below to not attempt to tell the * NIC to destroy these resources again. * * The call to gve_down is needed in the first place to refresh * the state and the DMA-able memory within each driver ring. 
*/ gve_release_adminq(priv); gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); gve_down(priv); gve_clear_device_resources(priv); gve_restore(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); priv->reset_cnt++; gve_clear_state_flag(priv, GVE_STATE_FLAG_IN_RESET); } static void gve_handle_link_status(struct gve_priv *priv) { uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); bool link_up = status & GVE_DEVICE_STATUS_LINK_STATUS; if (link_up == gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) return; if (link_up) { if (bootverbose) device_printf(priv->dev, "Device link is up.\n"); if_link_state_change(priv->ifp, LINK_STATE_UP); gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } else { device_printf(priv->dev, "Device link is down.\n"); if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } } static void gve_service_task(void *arg, int pending) { struct gve_priv *priv = (struct gve_priv *)arg; uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); if (((GVE_DEVICE_STATUS_RESET_MASK & status) != 0) && !gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) { device_printf(priv->dev, "Device requested reset\n"); gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); } gve_handle_reset(priv); gve_handle_link_status(priv); } static int gve_probe(device_t dev) { uint16_t deviceid, vendorid; int i; vendorid = pci_get_vendor(dev); deviceid = pci_get_device(dev); for (i = 0; i < nitems(gve_devs); i++) { if (vendorid == gve_devs[i].vendor_id && deviceid == gve_devs[i].device_id) { device_set_desc(dev, gve_devs[i].name); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static void gve_free_sys_res_mem(struct gve_priv *priv) { if (priv->msix_table != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->msix_table), priv->msix_table); 
if (priv->db_bar != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->db_bar), priv->db_bar); if (priv->reg_bar != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->reg_bar), priv->reg_bar); } static int gve_attach(device_t dev) { struct gve_priv *priv; int rid; int err; snprintf(gve_version, sizeof(gve_version), "%d.%d.%d", GVE_VERSION_MAJOR, GVE_VERSION_MINOR, GVE_VERSION_SUB); priv = device_get_softc(dev); priv->dev = dev; GVE_IFACE_LOCK_INIT(priv->gve_iface_lock); pci_enable_busmaster(dev); rid = PCIR_BAR(GVE_REGISTER_BAR); priv->reg_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->reg_bar == NULL) { device_printf(dev, "Failed to allocate BAR0\n"); err = ENXIO; goto abort; } rid = PCIR_BAR(GVE_DOORBELL_BAR); priv->db_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->db_bar == NULL) { device_printf(dev, "Failed to allocate BAR2\n"); err = ENXIO; goto abort; } rid = pci_msix_table_bar(priv->dev); priv->msix_table = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->msix_table == NULL) { device_printf(dev, "Failed to allocate msix table\n"); err = ENXIO; goto abort; } err = gve_alloc_adminq_and_describe_device(priv); if (err != 0) goto abort; err = gve_alloc_and_configure_device_resources(priv); if (err != 0) goto abort; err = gve_alloc_rings(priv); if (err != 0) goto abort; gve_setup_ifnet(dev, priv); priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK; bus_write_multi_1(priv->reg_bar, DRIVER_VERSION, GVE_DRIVER_VERSION, sizeof(GVE_DRIVER_VERSION) - 1); TASK_INIT(&priv->service_task, 0, gve_service_task, priv); priv->service_tq = taskqueue_create("gve service", M_WAITOK | M_ZERO, taskqueue_thread_enqueue, &priv->service_tq); taskqueue_start_threads(&priv->service_tq, 1, PI_NET, "%s service tq", device_get_nameunit(priv->dev)); gve_setup_sysctl(priv); if (bootverbose) device_printf(priv->dev, "Successfully attached %s", GVE_DRIVER_VERSION); return 
(0); abort: gve_free_rings(priv); gve_deconfigure_and_free_device_resources(priv); gve_release_adminq(priv); gve_free_sys_res_mem(priv); GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); return (err); } static int gve_detach(device_t dev) { struct gve_priv *priv = device_get_softc(dev); if_t ifp = priv->ifp; int error; error = bus_generic_detach(dev); if (error != 0) return (error); ether_ifdetach(ifp); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_destroy(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); gve_free_rings(priv); gve_free_sys_res_mem(priv); GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); while (taskqueue_cancel(priv->service_tq, &priv->service_task, NULL)) taskqueue_drain(priv->service_tq, &priv->service_task); taskqueue_free(priv->service_tq); if_free(ifp); return (0); } static device_method_t gve_methods[] = { DEVMETHOD(device_probe, gve_probe), DEVMETHOD(device_attach, gve_attach), DEVMETHOD(device_detach, gve_detach), DEVMETHOD_END }; static driver_t gve_driver = { "gve", gve_methods, sizeof(struct gve_priv) }; #if __FreeBSD_version < 1301503 static devclass_t gve_devclass; DRIVER_MODULE(gve, pci, gve_driver, gve_devclass, 0, 0); #else DRIVER_MODULE(gve, pci, gve_driver, 0, 0); #endif MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, gve, gve_devs, nitems(gve_devs)); diff --git a/sys/dev/gve/gve_rx.c b/sys/dev/gve/gve_rx.c index e1a228c0e69c..de64375ac4f3 100644 --- a/sys/dev/gve/gve_rx.c +++ b/sys/dev/gve/gve_rx.c @@ -1,724 +1,718 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. 
Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" static void gve_rx_free_ring_gqi(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; if (rx->page_info != NULL) { free(rx->page_info, M_GVE); rx->page_info = NULL; } if (rx->data_ring != NULL) { gve_dma_free_coherent(&rx->data_ring_mem); rx->data_ring = NULL; } if (rx->desc_ring != NULL) { gve_dma_free_coherent(&rx->desc_ring_mem); rx->desc_ring = NULL; } if (com->qpl != NULL) { gve_free_qpl(priv, com->qpl); com->qpl = NULL; } } static void gve_rx_free_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; /* Safe to call even if never allocated */ gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); if (gve_is_gqi(priv)) gve_rx_free_ring_gqi(priv, i); else gve_rx_free_ring_dqo(priv, i); if (com->q_resources != NULL) { gve_dma_free_coherent(&com->q_resources_mem); com->q_resources = NULL; } } static void gve_prefill_rx_slots(struct gve_rx_ring *rx) { struct gve_ring_com *com = &rx->com; struct gve_dma_handle *dma; int i; for (i = 0; i < com->priv->rx_desc_cnt; i++) { rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i); rx->page_info[i].page_offset = 0; rx->page_info[i].page_address = com->qpl->dmas[i].cpu_addr; rx->page_info[i].page = com->qpl->pages[i]; dma = &com->qpl->dmas[i]; bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREREAD); } bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, BUS_DMASYNC_PREWRITE); } static int gve_rx_alloc_ring_gqi(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; int err; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_rx_desc) * priv->rx_desc_cnt, CACHE_LINE_SIZE, &rx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for rx ring %d", i); goto abort; } rx->mask = priv->rx_pages_per_qpl - 1; rx->desc_ring = rx->desc_ring_mem.cpu_addr; 
com->qpl = gve_alloc_qpl(priv, i + priv->tx_cfg.max_queues, priv->rx_desc_cnt, /*single_kva=*/false); if (com->qpl == NULL) { device_printf(priv->dev, "Failed to alloc QPL for rx ring %d", i); err = ENOMEM; goto abort; } rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info), M_GVE, M_WAITOK | M_ZERO); err = gve_dma_alloc_coherent(priv, sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt, CACHE_LINE_SIZE, &rx->data_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc data ring for rx ring %d", i); goto abort; } rx->data_ring = rx->data_ring_mem.cpu_addr; gve_prefill_rx_slots(rx); return (0); abort: gve_rx_free_ring_gqi(priv, i); return (err); } static int gve_rx_alloc_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; int err; com->priv = priv; com->id = i; gve_alloc_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), PAGE_SIZE, &com->q_resources_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc queue resources for rx ring %d", i); goto abort; } com->q_resources = com->q_resources_mem.cpu_addr; if (gve_is_gqi(priv)) err = gve_rx_alloc_ring_gqi(priv, i); else err = gve_rx_alloc_ring_dqo(priv, i); if (err != 0) goto abort; return (0); abort: gve_rx_free_ring(priv, i); return (err); } int -gve_alloc_rx_rings(struct gve_priv *priv) +gve_alloc_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx) { - int err = 0; int i; + int err; - priv->rx = malloc(sizeof(struct gve_rx_ring) * priv->rx_cfg.num_queues, - M_GVE, M_WAITOK | M_ZERO); + KASSERT(priv->rx != NULL, ("priv->rx is NULL!")); - for (i = 0; i < priv->rx_cfg.num_queues; i++) { + for (i = start_idx; i < stop_idx; i++) { err = gve_rx_alloc_ring(priv, i); if (err != 0) goto free_rings; } return (0); - free_rings: - while (i--) - gve_rx_free_ring(priv, i); - free(priv->rx, M_GVE); + gve_free_rx_rings(priv, start_idx, i); return 
(err); } void -gve_free_rx_rings(struct gve_priv *priv) +gve_free_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx) { int i; - for (i = 0; i < priv->rx_cfg.num_queues; i++) + for (i = start_idx; i < stop_idx; i++) gve_rx_free_ring(priv, i); - - free(priv->rx, M_GVE); } static void gve_rx_clear_data_ring(struct gve_rx_ring *rx) { struct gve_priv *priv = rx->com.priv; int i; /* * The Rx data ring has this invariant: "the networking stack is not * using the buffer beginning at any page_offset". This invariant is * established initially by gve_prefill_rx_slots at alloc-time and is * maintained by the cleanup taskqueue. This invariant implies that the * ring can be considered to be fully posted with buffers at this point, * even if there are unfreed mbufs still being processed, which is why we * can fill the ring without waiting on can_flip at each slot to become true. */ for (i = 0; i < priv->rx_desc_cnt; i++) { rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i + rx->page_info[i].page_offset); rx->fill_cnt++; } bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_rx_clear_desc_ring(struct gve_rx_ring *rx) { struct gve_priv *priv = rx->com.priv; int i; for (i = 0; i < priv->rx_desc_cnt; i++) rx->desc_ring[i] = (struct gve_rx_desc){}; bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_clear_rx_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; if (!gve_is_gqi(priv)) { gve_clear_rx_ring_dqo(priv, i); return; } rx->seq_no = 1; rx->cnt = 0; rx->fill_cnt = 0; rx->mask = priv->rx_desc_cnt - 1; gve_rx_clear_desc_ring(rx); gve_rx_clear_data_ring(rx); } static void gve_start_rx_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; if ((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) { if (tcp_lro_init(&rx->lro) != 0) device_printf(priv->dev, "Failed to init lro for rx ring 
%d", i); rx->lro.ifp = priv->ifp; } if (gve_is_gqi(priv)) NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq, rx); else NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq_dqo, rx); com->cleanup_tq = taskqueue_create_fast("gve rx", M_WAITOK, taskqueue_thread_enqueue, &com->cleanup_tq); taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s rxq %d", device_get_nameunit(priv->dev), i); if (gve_is_gqi(priv)) { /* GQ RX bufs are prefilled at ring alloc time */ gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt); } else gve_rx_prefill_buffers_dqo(rx); } int gve_create_rx_rings(struct gve_priv *priv) { struct gve_ring_com *com; struct gve_rx_ring *rx; int err; int i; if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK)) return (0); for (i = 0; i < priv->rx_cfg.num_queues; i++) gve_clear_rx_ring(priv, i); err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues); if (err != 0) return (err); bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, BUS_DMASYNC_POSTREAD); for (i = 0; i < priv->rx_cfg.num_queues; i++) { rx = &priv->rx[i]; com = &rx->com; com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index); bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map, BUS_DMASYNC_POSTREAD); com->db_offset = 4 * be32toh(com->q_resources->db_index); com->counter_idx = be32toh(com->q_resources->counter_index); gve_start_rx_ring(priv, i); } gve_set_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); return (0); } static void gve_stop_rx_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; if (com->cleanup_tq != NULL) { taskqueue_quiesce(com->cleanup_tq); taskqueue_free(com->cleanup_tq); com->cleanup_tq = NULL; } tcp_lro_free(&rx->lro); rx->ctx = (struct gve_rx_ctx){}; } int gve_destroy_rx_rings(struct gve_priv *priv) { int err; int i; for (i = 0; i < priv->rx_cfg.num_queues; i++) gve_stop_rx_ring(priv, i); if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK)) { err = 
gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues); if (err != 0) return (err); gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); } return (0); } int gve_rx_intr(void *arg) { struct gve_rx_ring *rx = arg; struct gve_priv *priv = rx->com.priv; struct gve_ring_com *com = &rx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); return (FILTER_HANDLED); } static inline void gve_set_rss_type(__be16 flag, struct mbuf *mbuf) { if ((flag & GVE_RXF_IPV4) != 0) { if ((flag & GVE_RXF_TCP) != 0) M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4); else if ((flag & GVE_RXF_UDP) != 0) M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4); else M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4); return; } if ((flag & GVE_RXF_IPV6) != 0) { if ((flag & GVE_RXF_TCP) != 0) M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6); else if ((flag & GVE_RXF_UDP) != 0) M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6); else M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6); return; } } static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr) { const __be64 offset = htobe64(GVE_DEFAULT_RX_BUFFER_OFFSET); page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET; *(slot_addr) ^= offset; } static struct mbuf * gve_rx_create_mbuf(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info, uint16_t len, union gve_rx_data_slot *data_slot, bool is_only_frag) { struct gve_rx_ctx *ctx = &rx->ctx; struct mbuf *mbuf; u_int ref_count; bool can_flip; uint32_t offset = page_info->page_offset + page_info->pad; void *va = (char *)page_info->page_address + offset; if (len <= priv->rx_copybreak && is_only_frag) { mbuf = m_get2(len, M_NOWAIT, MT_DATA, M_PKTHDR); if (__predict_false(mbuf == NULL)) return (NULL); m_copyback(mbuf, 0, len, va); counter_enter(); counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 
1); counter_exit(); ctx->mbuf_head = mbuf; ctx->mbuf_tail = mbuf; } else { struct mbuf *mbuf_tail = ctx->mbuf_tail; KASSERT(len <= MCLBYTES, ("gve rx fragment bigger than cluster mbuf")); /* * This page was created with VM_ALLOC_WIRED, thus the lowest * wire count experienced by the page until the interface is * destroyed is 1. * * We wire the page again before supplying an mbuf pointing to * it to the networking stack, so before the mbuf leaves the * driver, the wire count rises to 2. * * If it is 1 again, it necessarily means that the mbuf has been * consumed and it was gve_mextadd_free that brought down the wire * count back to 1. We only need to eventually observe the 1. */ ref_count = atomic_load_int(&page_info->page->ref_count); can_flip = VPRC_WIRE_COUNT(ref_count) == 1; if (mbuf_tail == NULL) { if (can_flip) mbuf = m_gethdr(M_NOWAIT, MT_DATA); else mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); ctx->mbuf_head = mbuf; ctx->mbuf_tail = mbuf; } else { if (can_flip) mbuf = m_get(M_NOWAIT, MT_DATA); else mbuf = m_getcl(M_NOWAIT, MT_DATA, 0); mbuf_tail->m_next = mbuf; ctx->mbuf_tail = mbuf; } if (__predict_false(mbuf == NULL)) return (NULL); if (can_flip) { MEXTADD(mbuf, va, len, gve_mextadd_free, page_info->page, page_info->page_address, 0, EXT_NET_DRV); counter_enter(); counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1); counter_exit(); /* * Grab an extra ref to the page so that gve_mextadd_free * does not end up freeing the page while the interface exists. 
*/ vm_page_wire(page_info->page); gve_rx_flip_buff(page_info, &data_slot->qpl_offset); } else { m_copyback(mbuf, 0, len, va); counter_enter(); counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1); counter_exit(); } } mbuf->m_len = len; ctx->total_size += len; return (mbuf); } static inline bool gve_needs_rss(__be16 flag) { if ((flag & GVE_RXF_FRAG) != 0) return (false); if ((flag & (GVE_RXF_IPV4 | GVE_RXF_IPV6)) != 0) return (true); return (false); } static void gve_rx(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_desc *desc, uint32_t idx) { struct gve_rx_slot_page_info *page_info; struct gve_dma_handle *page_dma_handle; union gve_rx_data_slot *data_slot; struct gve_rx_ctx *ctx = &rx->ctx; struct mbuf *mbuf = NULL; if_t ifp = priv->ifp; bool do_if_input; uint16_t len; bool is_first_frag = ctx->frag_cnt == 0; bool is_last_frag = !(GVE_RXF_PKT_CONT & desc->flags_seq); bool is_only_frag = is_first_frag && is_last_frag; if (__predict_false(ctx->drop_pkt)) goto finish_frag; if ((desc->flags_seq & GVE_RXF_ERR) != 0) { ctx->drop_pkt = true; counter_enter(); counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); counter_exit(); m_freem(ctx->mbuf_head); goto finish_frag; } page_info = &rx->page_info[idx]; data_slot = &rx->data_ring[idx]; page_dma_handle = &(rx->com.qpl->dmas[idx]); page_info->pad = is_first_frag ? 
GVE_RX_PAD : 0; len = be16toh(desc->len) - page_info->pad; bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, BUS_DMASYNC_POSTREAD); mbuf = gve_rx_create_mbuf(priv, rx, page_info, len, data_slot, is_only_frag); if (mbuf == NULL) { ctx->drop_pkt = true; counter_enter(); counter_u64_add_protected(rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); counter_exit(); m_freem(ctx->mbuf_head); goto finish_frag; } if (is_first_frag) { mbuf->m_pkthdr.rcvif = priv->ifp; ctx->is_tcp = desc->flags_seq & GVE_RXF_TCP; if (gve_needs_rss(desc->flags_seq)) { gve_set_rss_type(desc->flags_seq, mbuf); mbuf->m_pkthdr.flowid = be32toh(desc->rss_hash); } if ((desc->csum != 0) && ((desc->flags_seq & GVE_RXF_FRAG) == 0)) { mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR; mbuf->m_pkthdr.csum_data = 0xffff; } } if (is_last_frag) { mbuf = ctx->mbuf_head; mbuf->m_pkthdr.len = ctx->total_size; do_if_input = true; if (((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) && /* LRO is enabled */ (ctx->is_tcp) && /* pkt is a TCP pkt */ ((mbuf->m_pkthdr.csum_flags & CSUM_DATA_VALID) != 0) && /* NIC verified csum */ (rx->lro.lro_cnt != 0) && /* LRO resources exist */ (tcp_lro_rx(&rx->lro, mbuf, 0) == 0)) do_if_input = false; if (do_if_input) if_input(ifp, mbuf); counter_enter(); counter_u64_add_protected(rx->stats.rbytes, ctx->total_size); counter_u64_add_protected(rx->stats.rpackets, 1); counter_exit(); } finish_frag: ctx->frag_cnt++; if (is_last_frag) rx->ctx = (struct gve_rx_ctx){}; } static bool gve_rx_work_pending(struct gve_rx_ring *rx) { struct gve_rx_desc *desc; __be16 flags_seq; uint32_t next_idx; next_idx = rx->cnt & rx->mask; desc = rx->desc_ring + next_idx; flags_seq = desc->flags_seq; return (GVE_SEQNO(flags_seq) == rx->seq_no); } static inline uint8_t gve_next_seqno(uint8_t seq) { return ((seq + 1) == 8 ? 
1 : seq + 1); } /* * Process completed RX descriptors whose sequence number matches rx->seq_no. * Up to `budget` descriptors are handled, but the loop keeps going past the * budget while a multi-fragment packet is still in progress. Afterwards any * LRO state is flushed and the doorbell is written with the new fill count. */ static void gve_rx_cleanup(struct gve_priv *priv, struct gve_rx_ring *rx, int budget) { uint32_t idx = rx->cnt & rx->mask; struct gve_rx_desc *desc; struct gve_rx_ctx *ctx = &rx->ctx; uint32_t work_done = 0; NET_EPOCH_ASSERT(); bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, BUS_DMASYNC_POSTREAD); desc = &rx->desc_ring[idx]; while ((work_done < budget || ctx->frag_cnt) && (GVE_SEQNO(desc->flags_seq) == rx->seq_no)) { gve_rx(priv, rx, desc, idx); rx->cnt++; idx = rx->cnt & rx->mask; desc = &rx->desc_ring[idx]; rx->seq_no = gve_next_seqno(rx->seq_no); work_done++; } /* The device will only send whole packets. */ if (__predict_false(ctx->frag_cnt)) { /* * gve_rx() frees the mbuf chain at the moment it sets drop_pkt and * leaves mbuf_head pointing at it, so only free here when the packet * was not already dropped; otherwise this would be a double free. */ if (!ctx->drop_pkt) m_freem(ctx->mbuf_head); rx->ctx = (struct gve_rx_ctx){}; device_printf(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset\n", GVE_SEQNO(desc->flags_seq), rx->seq_no); gve_schedule_reset(priv); } if (work_done != 0) tcp_lro_flush_all(&rx->lro); bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, BUS_DMASYNC_PREWRITE); /* Buffers are refilled as the descs are processed */ rx->fill_cnt += work_done; gve_db_bar_write_4(priv, rx->com.db_offset, rx->fill_cnt); } void gve_rx_cleanup_tq(void *arg, int pending) { struct gve_rx_ring *rx = arg; struct gve_priv *priv = rx->com.priv; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; gve_rx_cleanup(priv, rx, /*budget=*/128); gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_ACK | GVE_IRQ_EVENT); /* * Fragments received before this barrier MAY NOT cause the NIC to send an * interrupt but they will still be handled by the enqueue below. * Fragments received after the barrier WILL trigger an interrupt. 
*/ atomic_thread_fence_seq_cst(); if (gve_rx_work_pending(rx)) { gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); } } diff --git a/sys/dev/gve/gve_sysctl.c b/sys/dev/gve/gve_sysctl.c index c96d082837a4..8f52ffad6f3e 100644 --- a/sys/dev/gve/gve_sysctl.c +++ b/sys/dev/gve/gve_sysctl.c @@ -1,327 +1,410 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "gve.h" static SYSCTL_NODE(_hw, OID_AUTO, gve, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "GVE driver parameters"); bool gve_disable_hw_lro = false; SYSCTL_BOOL(_hw_gve, OID_AUTO, disable_hw_lro, CTLFLAG_RDTUN, &gve_disable_hw_lro, 0, "Controls if hardware LRO is used"); char gve_queue_format[8]; SYSCTL_STRING(_hw_gve, OID_AUTO, queue_format, CTLFLAG_RD, &gve_queue_format, 0, "Queue format being used by the iface"); char gve_version[8]; SYSCTL_STRING(_hw_gve, OID_AUTO, driver_version, CTLFLAG_RD, &gve_version, 0, "Driver version"); static void gve_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_rx_ring *rxq) { struct sysctl_oid *node; struct sysctl_oid_list *list; struct gve_rxq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->com.id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Receive Queue"); list = SYSCTL_CHILDREN(node); stats = &rxq->stats; SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_bytes", CTLFLAG_RD, &stats->rbytes, "Bytes received"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_packets", CTLFLAG_RD, &stats->rpackets, "Packets received"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_copybreak_cnt", CTLFLAG_RD, &stats->rx_copybreak_cnt, "Total frags with mbufs allocated for copybreak"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_frag_flip_cnt", CTLFLAG_RD, &stats->rx_frag_flip_cnt, "Total frags that allocated mbuf with page flip"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_frag_copy_cnt", CTLFLAG_RD, &stats->rx_frag_copy_cnt, "Total frags with mbuf that copied payload into mbuf"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt", CTLFLAG_RD, &stats->rx_dropped_pkt, "Total rx packets dropped"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt_desc_err", CTLFLAG_RD, &stats->rx_dropped_pkt_desc_err, "Packets dropped due to descriptor error"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, 
"rx_dropped_pkt_buf_post_fail", CTLFLAG_RD, &stats->rx_dropped_pkt_buf_post_fail, "Packets dropped due to failure to post enough buffers"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt_mbuf_alloc_fail", CTLFLAG_RD, &stats->rx_dropped_pkt_mbuf_alloc_fail, "Packets dropped due to failed mbuf allocation"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_mbuf_dmamap_err", CTLFLAG_RD, &stats->rx_mbuf_dmamap_err, "Number of rx mbufs which could not be dma mapped"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_mbuf_mclget_null", CTLFLAG_RD, &stats->rx_mbuf_mclget_null, "Number of times when there were no cluster mbufs"); SYSCTL_ADD_U32(ctx, list, OID_AUTO, "rx_completed_desc", CTLFLAG_RD, &rxq->cnt, 0, "Number of descriptors completed"); SYSCTL_ADD_U32(ctx, list, OID_AUTO, "num_desc_posted", CTLFLAG_RD, &rxq->fill_cnt, rxq->fill_cnt, "Toal number of descriptors posted"); } static void gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_tx_ring *txq) { struct sysctl_oid *node; struct sysctl_oid_list *tx_list; struct gve_txq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "txq%d", txq->com.id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Transmit Queue"); tx_list = SYSCTL_CHILDREN(node); stats = &txq->stats; SYSCTL_ADD_U32(ctx, tx_list, OID_AUTO, "tx_posted_desc", CTLFLAG_RD, &txq->req, 0, "Number of descriptors posted by NIC"); SYSCTL_ADD_U32(ctx, tx_list, OID_AUTO, "tx_completed_desc", CTLFLAG_RD, &txq->done, 0, "Number of descriptors completed"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &stats->tpackets, "Packets transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_tso_packets", CTLFLAG_RD, &stats->tso_packet_cnt, "TSO Packets transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_bytes", CTLFLAG_RD, &stats->tbytes, "Bytes transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, 
"tx_delayed_pkt_nospace_device", CTLFLAG_RD, &stats->tx_delayed_pkt_nospace_device, "Packets delayed due to no space in device"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_dropped_pkt_nospace_bufring", CTLFLAG_RD, &stats->tx_dropped_pkt_nospace_bufring, "Packets dropped due to no space in br ring"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_dropped_pkt_vlan", CTLFLAG_RD, &stats->tx_dropped_pkt_vlan, "Dropped VLAN packets"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_nospace_descring", CTLFLAG_RD, &stats->tx_delayed_pkt_nospace_descring, "Packets delayed due to no space in desc ring"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_nospace_compring", CTLFLAG_RD, &stats->tx_delayed_pkt_nospace_compring, "Packets delayed due to no space in comp ring"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_nospace_qpl_bufs", CTLFLAG_RD, &stats->tx_delayed_pkt_nospace_qpl_bufs, "Packets delayed due to not enough qpl bufs"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_tsoerr", CTLFLAG_RD, &stats->tx_delayed_pkt_tsoerr, "TSO packets delayed due to err in prep errors"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_collpase", CTLFLAG_RD, &stats->tx_mbuf_collapse, "tx mbufs that had to be collapsed"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_defrag", CTLFLAG_RD, &stats->tx_mbuf_defrag, "tx mbufs that had to be defragged"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_defrag_err", CTLFLAG_RD, &stats->tx_mbuf_defrag_err, "tx mbufs that failed defrag"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_dmamap_enomem_err", CTLFLAG_RD, &stats->tx_mbuf_dmamap_enomem_err, "tx mbufs that could not be dma-mapped due to low mem"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_dmamap_err", CTLFLAG_RD, &stats->tx_mbuf_dmamap_err, "tx mbufs that could not be dma-mapped"); } static void gve_setup_queue_stat_sysctl(struct sysctl_ctx_list *ctx, struct 
sysctl_oid_list *child, struct gve_priv *priv) { int i; for (i = 0; i < priv->rx_cfg.num_queues; i++) { gve_setup_rxq_sysctl(ctx, child, &priv->rx[i]); } for (i = 0; i < priv->tx_cfg.num_queues; i++) { gve_setup_txq_sysctl(ctx, child, &priv->tx[i]); } } static void gve_setup_adminq_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_priv *priv) { struct sysctl_oid *admin_node; struct sysctl_oid_list *admin_list; /* Admin queue stats */ admin_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "adminq_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue statistics"); admin_list = SYSCTL_CHILDREN(admin_node); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_prod_cnt", CTLFLAG_RD, &priv->adminq_prod_cnt, 0, "Adminq Commands issued"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_cmd_fail", CTLFLAG_RD, &priv->adminq_cmd_fail, 0, "Aqminq Failed commands"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_timeouts", CTLFLAG_RD, &priv->adminq_timeouts, 0, "Adminq Timedout commands"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_describe_device_cnt", CTLFLAG_RD, &priv->adminq_describe_device_cnt, 0, "adminq_describe_device_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_cfg_device_resources_cnt", CTLFLAG_RD, &priv->adminq_cfg_device_resources_cnt, 0, "adminq_cfg_device_resources_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_register_page_list_cnt", CTLFLAG_RD, &priv->adminq_register_page_list_cnt, 0, "adminq_register_page_list_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_unregister_page_list_cnt", CTLFLAG_RD, &priv->adminq_unregister_page_list_cnt, 0, "adminq_unregister_page_list_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_create_tx_queue_cnt", CTLFLAG_RD, &priv->adminq_create_tx_queue_cnt, 0, "adminq_create_tx_queue_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_create_rx_queue_cnt", CTLFLAG_RD, &priv->adminq_create_rx_queue_cnt, 0, "adminq_create_rx_queue_cnt"); 
SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_tx_queue_cnt", CTLFLAG_RD, &priv->adminq_destroy_tx_queue_cnt, 0, "adminq_destroy_tx_queue_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_rx_queue_cnt", CTLFLAG_RD, &priv->adminq_destroy_rx_queue_cnt, 0, "adminq_destroy_rx_queue_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_get_ptype_map_cnt", CTLFLAG_RD, &priv->adminq_get_ptype_map_cnt, 0, "adminq_get_ptype_map_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_dcfg_device_resources_cnt", CTLFLAG_RD, &priv->adminq_dcfg_device_resources_cnt, 0, "adminq_dcfg_device_resources_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_set_driver_parameter_cnt", CTLFLAG_RD, &priv->adminq_set_driver_parameter_cnt, 0, "adminq_set_driver_parameter_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_verify_driver_compatibility_cnt", CTLFLAG_RD, &priv->adminq_verify_driver_compatibility_cnt, 0, "adminq_verify_driver_compatibility_cnt"); } static void gve_setup_main_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_priv *priv) { struct sysctl_oid *main_node; struct sysctl_oid_list *main_list; /* Main stats */ main_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "main_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Main statistics"); main_list = SYSCTL_CHILDREN(main_node); SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "interface_up_cnt", CTLFLAG_RD, &priv->interface_up_cnt, 0, "Times interface was set to up"); SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "interface_down_cnt", CTLFLAG_RD, &priv->interface_down_cnt, 0, "Times interface was set to down"); SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "reset_cnt", CTLFLAG_RD, &priv->reset_cnt, 0, "Times reset"); } +/* + * Validate a requested queue count: it must be at least 1 and no larger than + * the max_queues of the RX (is_rx) or TX config. Returns 0 or EINVAL. + */ +static int +gve_check_num_queues(struct gve_priv *priv, int val, bool is_rx) +{ + if (val < 1) { + /* val is a signed int; print it as such so negatives are legible. */ + device_printf(priv->dev, + "Requested num queues (%d) must be a positive integer\n", val); + return (EINVAL); + } + + if (val > (is_rx ? 
priv->rx_cfg.max_queues : priv->tx_cfg.max_queues)) { + device_printf(priv->dev, + "Requested num queues (%d) is too large\n", val); + return (EINVAL); + } + + return (0); +} + +/* + * sysctl handler for num_tx_queues: reports the current TX queue count and, + * on a write, validates the new count and applies it via gve_adjust_tx_queues(). + */ +static int +gve_sysctl_num_tx_queues(SYSCTL_HANDLER_ARGS) +{ + struct gve_priv *priv = arg1; + int val; + int err; + + val = priv->tx_cfg.num_queues; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + err = gve_check_num_queues(priv, val, /*is_rx=*/false); + if (err != 0) + return (err); + + /* + * Compare against the current count while holding the iface lock, so a + * concurrent writer cannot change it between the check and the resize. + */ + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + if (val != priv->tx_cfg.num_queues) + err = gve_adjust_tx_queues(priv, val); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + + return (err); +} + +/* + * sysctl handler for num_rx_queues: reports the current RX queue count and, + * on a write, validates the new count and applies it via gve_adjust_rx_queues(). + */ +static int +gve_sysctl_num_rx_queues(SYSCTL_HANDLER_ARGS) +{ + struct gve_priv *priv = arg1; + int val; + int err; + + val = priv->rx_cfg.num_queues; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + err = gve_check_num_queues(priv, val, /*is_rx=*/true); + if (err != 0) + return (err); + + /* + * Compare against the current count while holding the iface lock, so a + * concurrent writer cannot change it between the check and the resize. + */ + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + if (val != priv->rx_cfg.num_queues) + err = gve_adjust_rx_queues(priv, val); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + + return (err); +} + +static void +gve_setup_sysctl_writables(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, struct gve_priv *priv) +{ + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "num_tx_queues", + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, + gve_sysctl_num_tx_queues, "I", "Number of TX queues"); + + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "num_rx_queues", + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, + gve_sysctl_num_rx_queues, "I", "Number of RX queues"); +} + void gve_setup_sysctl(struct gve_priv *priv) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = priv->dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = 
SYSCTL_CHILDREN(tree); gve_setup_queue_stat_sysctl(ctx, child, priv); gve_setup_adminq_stat_sysctl(ctx, child, priv); gve_setup_main_stat_sysctl(ctx, child, priv); + gve_setup_sysctl_writables(ctx, child, priv); } void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets, uint64_t *tbytes, uint64_t *tx_dropped_pkt) { struct gve_rxq_stats *rxqstats; struct gve_txq_stats *txqstats; int i; for (i = 0; i < priv->rx_cfg.num_queues; i++) { rxqstats = &priv->rx[i].stats; *rpackets += counter_u64_fetch(rxqstats->rpackets); *rbytes += counter_u64_fetch(rxqstats->rbytes); *rx_dropped_pkt += counter_u64_fetch(rxqstats->rx_dropped_pkt); } for (i = 0; i < priv->tx_cfg.num_queues; i++) { txqstats = &priv->tx[i].stats; *tpackets += counter_u64_fetch(txqstats->tpackets); *tbytes += counter_u64_fetch(txqstats->tbytes); *tx_dropped_pkt += counter_u64_fetch(txqstats->tx_dropped_pkt); } } diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c index e594c66149bc..b667df4ca06e 100644 --- a/sys/dev/gve/gve_tx.c +++ b/sys/dev/gve/gve_tx.c @@ -1,933 +1,926 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" #define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182 static int gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx) { struct gve_queue_page_list *qpl = tx->com.qpl; struct gve_tx_fifo *fifo = &tx->fifo; fifo->size = qpl->num_pages * PAGE_SIZE; fifo->base = qpl->kva; atomic_store_int(&fifo->available, fifo->size); fifo->head = 0; return (0); } static void gve_tx_free_ring_gqi(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; if (tx->desc_ring != NULL) { gve_dma_free_coherent(&tx->desc_ring_mem); tx->desc_ring = NULL; } if (tx->info != NULL) { free(tx->info, M_GVE); tx->info = NULL; } if (com->qpl != NULL) { gve_free_qpl(priv, com->qpl); com->qpl = NULL; } } static void gve_tx_free_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; /* Safe to call even if never alloced */ gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); if (mtx_initialized(&tx->ring_mtx)) mtx_destroy(&tx->ring_mtx); if (com->q_resources != NULL) { gve_dma_free_coherent(&com->q_resources_mem); com->q_resources = NULL; } if (tx->br != NULL) { 
buf_ring_free(tx->br, M_DEVBUF); tx->br = NULL; } if (gve_is_gqi(priv)) gve_tx_free_ring_gqi(priv, i); else gve_tx_free_ring_dqo(priv, i); } static int gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; int err; err = gve_dma_alloc_coherent(priv, sizeof(union gve_tx_desc) * priv->tx_desc_cnt, CACHE_LINE_SIZE, &tx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i); goto abort; } tx->desc_ring = tx->desc_ring_mem.cpu_addr; com->qpl = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR, /*single_kva=*/true); if (com->qpl == NULL) { device_printf(priv->dev, "Failed to alloc QPL for tx ring %d\n", i); err = ENOMEM; goto abort; } err = gve_tx_fifo_init(priv, tx); if (err != 0) goto abort; tx->info = malloc( sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt, M_GVE, M_WAITOK | M_ZERO); return (0); abort: gve_tx_free_ring_gqi(priv, i); return (err); } static int gve_tx_alloc_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; char mtx_name[16]; int err; com->priv = priv; com->id = i; if (gve_is_gqi(priv)) err = gve_tx_alloc_ring_gqi(priv, i); else err = gve_tx_alloc_ring_dqo(priv, i); if (err != 0) goto abort; sprintf(mtx_name, "gvetx%d", i); mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF); tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF, M_WAITOK, &tx->ring_mtx); gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), PAGE_SIZE, &com->q_resources_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc queue resources for tx ring %d", i); goto abort; } com->q_resources = com->q_resources_mem.cpu_addr; return (0); abort: gve_tx_free_ring(priv, i); return (err); } int -gve_alloc_tx_rings(struct gve_priv *priv) +gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, 
uint16_t stop_idx) { - int err = 0; int i; + int err; - priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues, - M_GVE, M_WAITOK | M_ZERO); + KASSERT(priv->tx != NULL, ("priv->tx is NULL!")); - for (i = 0; i < priv->tx_cfg.num_queues; i++) { + for (i = start_idx; i < stop_idx; i++) { err = gve_tx_alloc_ring(priv, i); if (err != 0) goto free_rings; - } return (0); - free_rings: - while (i--) - gve_tx_free_ring(priv, i); - free(priv->tx, M_GVE); + gve_free_tx_rings(priv, start_idx, i); return (err); } void -gve_free_tx_rings(struct gve_priv *priv) +gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx) { int i; - for (i = 0; i < priv->tx_cfg.num_queues; i++) + for (i = start_idx; i < stop_idx; i++) gve_tx_free_ring(priv, i); - - free(priv->tx, M_GVE); } static void gve_tx_clear_desc_ring(struct gve_tx_ring *tx) { struct gve_ring_com *com = &tx->com; int i; for (i = 0; i < com->priv->tx_desc_cnt; i++) { tx->desc_ring[i] = (union gve_tx_desc){}; tx->info[i] = (struct gve_tx_buffer_state){}; } bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_clear_tx_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_tx_fifo *fifo = &tx->fifo; tx->req = 0; tx->done = 0; tx->mask = priv->tx_desc_cnt - 1; atomic_store_int(&fifo->available, fifo->size); fifo->head = 0; gve_tx_clear_desc_ring(tx); } static void gve_start_tx_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; atomic_store_bool(&tx->stopped, false); if (gve_is_gqi(priv)) NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx); else NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx); com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK, taskqueue_thread_enqueue, &com->cleanup_tq); taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d", device_get_nameunit(priv->dev), i); TASK_INIT(&tx->xmit_task, 0, 
gve_xmit_tq, tx); tx->xmit_tq = taskqueue_create_fast("gve tx xmit", M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq); taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit", device_get_nameunit(priv->dev), i); } int gve_create_tx_rings(struct gve_priv *priv) { struct gve_ring_com *com; struct gve_tx_ring *tx; int err; int i; if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) return (0); for (i = 0; i < priv->tx_cfg.num_queues; i++) { if (gve_is_gqi(priv)) gve_clear_tx_ring(priv, i); else gve_clear_tx_ring_dqo(priv, i); } err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues); if (err != 0) return (err); bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, BUS_DMASYNC_POSTREAD); for (i = 0; i < priv->tx_cfg.num_queues; i++) { tx = &priv->tx[i]; com = &tx->com; com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index); bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map, BUS_DMASYNC_POSTREAD); com->db_offset = 4 * be32toh(com->q_resources->db_index); com->counter_idx = be32toh(com->q_resources->counter_index); gve_start_tx_ring(priv, i); } gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); return (0); } static void gve_stop_tx_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; if (com->cleanup_tq != NULL) { taskqueue_quiesce(com->cleanup_tq); taskqueue_free(com->cleanup_tq); com->cleanup_tq = NULL; } if (tx->xmit_tq != NULL) { taskqueue_quiesce(tx->xmit_tq); taskqueue_free(tx->xmit_tq); tx->xmit_tq = NULL; } } int gve_destroy_tx_rings(struct gve_priv *priv) { int err; int i; for (i = 0; i < priv->tx_cfg.num_queues; i++) gve_stop_tx_ring(priv, i); if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) { err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues); if (err != 0) return (err); gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); } return (0); } int gve_tx_intr(void *arg) { struct gve_tx_ring *tx = arg; struct 
gve_priv *priv = tx->com.priv; struct gve_ring_com *com = &tx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); return (FILTER_HANDLED); } static uint32_t gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx) { bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map, BUS_DMASYNC_POSTREAD); uint32_t counter = priv->counters[tx->com.counter_idx]; return (be32toh(counter)); } static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes) { atomic_add_int(&fifo->available, bytes); } void gve_tx_cleanup_tq(void *arg, int pending) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; uint32_t nic_done = gve_tx_load_event_counter(priv, tx); uint32_t todo = nic_done - tx->done; size_t space_freed = 0; int i, j; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; for (j = 0; j < todo; j++) { uint32_t idx = tx->done & tx->mask; struct gve_tx_buffer_state *info = &tx->info[idx]; struct mbuf *mbuf = info->mbuf; tx->done++; if (mbuf == NULL) continue; info->mbuf = NULL; counter_enter(); counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len); counter_u64_add_protected(tx->stats.tpackets, 1); counter_exit(); m_freem(mbuf); for (i = 0; i < GVE_TX_MAX_DESCS; i++) { space_freed += info->iov[i].iov_len + info->iov[i].iov_padding; info->iov[i].iov_len = 0; info->iov[i].iov_padding = 0; } } gve_tx_free_fifo(&tx->fifo, space_freed); gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_ACK | GVE_IRQ_EVENT); /* * Completions born before this barrier MAY NOT cause the NIC to send an * interrupt but they will still be handled by the enqueue below. * Completions born after the barrier WILL trigger an interrupt. 
*/ atomic_thread_fence_seq_cst(); nic_done = gve_tx_load_event_counter(priv, tx); todo = nic_done - tx->done; if (todo != 0) { gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); } if (atomic_load_bool(&tx->stopped) && space_freed) { atomic_store_bool(&tx->stopped, false); taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); } } static void gve_dma_sync_for_device(struct gve_queue_page_list *qpl, uint64_t iov_offset, uint64_t iov_len) { uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE; uint64_t first_page = iov_offset / PAGE_SIZE; struct gve_dma_handle *dma; uint64_t page; for (page = first_page; page <= last_page; page++) { dma = &(qpl->dmas[page]); bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); } } static void gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf) { mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH; mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4; mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid); mtd_desc->reserved0 = 0; mtd_desc->reserved1 = 0; } static void gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso, uint16_t l4_hdr_offset, uint32_t desc_cnt, uint16_t first_seg_len, uint64_t addr, bool has_csum_flag, int csum_offset, uint16_t pkt_len) { if (is_tso) { pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM; pkt_desc->l4_csum_offset = csum_offset >> 1; pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; } else if (has_csum_flag) { pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM; pkt_desc->l4_csum_offset = csum_offset >> 1; pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; } else { pkt_desc->type_flags = GVE_TXD_STD; pkt_desc->l4_csum_offset = 0; pkt_desc->l4_hdr_offset = 0; } pkt_desc->desc_cnt = desc_cnt; pkt_desc->len = htobe16(pkt_len); pkt_desc->seg_len = htobe16(first_seg_len); pkt_desc->seg_addr = htobe64(addr); } static void gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc, 
bool is_tso, uint16_t len, uint64_t addr, bool is_ipv6, uint8_t l3_off, uint16_t tso_mss) { seg_desc->type_flags = GVE_TXD_SEG; if (is_tso) { if (is_ipv6) seg_desc->type_flags |= GVE_TXSF_IPV6; seg_desc->l3_offset = l3_off >> 1; seg_desc->mss = htobe16(tso_mss); } seg_desc->seg_len = htobe16(len); seg_desc->seg_addr = htobe64(addr); } static inline uint32_t gve_tx_avail(struct gve_tx_ring *tx) { return (tx->mask + 1 - (tx->req - tx->done)); } static bool gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes) { return (atomic_load_int(&fifo->available) >= bytes); } static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required) { return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) && gve_tx_fifo_can_alloc(&tx->fifo, bytes_required)); } static int gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes) { return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head; } static inline int gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len, uint16_t pkt_len) { int pad_bytes, align_hdr_pad; int bytes; pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); /* We need to take into account the header alignment padding. */ align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len; bytes = align_hdr_pad + pad_bytes + pkt_len; return (bytes); } static int gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes, struct gve_tx_iovec iov[2]) { size_t overflow, padding; uint32_t aligned_head; int nfrags = 0; if (bytes == 0) return (0); /* * This check happens before we know how much padding is needed to * align to a cacheline boundary for the payload, but that is fine, * because the FIFO head always start aligned, and the FIFO's boundaries * are aligned, so if there is space for the data, there is space for * the padding to the next alignment. 
*/ KASSERT(gve_tx_fifo_can_alloc(fifo, bytes), ("Allocating gve tx fifo when there is no room")); nfrags++; iov[0].iov_offset = fifo->head; iov[0].iov_len = bytes; fifo->head += bytes; if (fifo->head > fifo->size) { /* * If the allocation did not fit in the tail fragment of the * FIFO, also use the head fragment. */ nfrags++; overflow = fifo->head - fifo->size; iov[0].iov_len -= overflow; iov[1].iov_offset = 0; /* Start of fifo*/ iov[1].iov_len = overflow; fifo->head = overflow; } /* Re-align to a cacheline boundary */ aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE); padding = aligned_head - fifo->head; iov[nfrags - 1].iov_padding = padding; atomic_add_int(&fifo->available, -(bytes + padding)); fifo->head = aligned_head; if (fifo->head == fifo->size) fifo->head = 0; return (nfrags); } /* Only error this returns is ENOBUFS when the tx fifo is short of space */ static int gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf) { bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false; int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset; uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len; int pad_bytes, hdr_nfrags, payload_nfrags; struct gve_tx_pkt_desc *pkt_desc; struct gve_tx_seg_desc *seg_desc; struct gve_tx_mtd_desc *mtd_desc; struct gve_tx_buffer_state *info; uint32_t idx = tx->req & tx->mask; struct ether_header *eh; struct mbuf *mbuf_next; int payload_iov = 2; int bytes_required; struct ip6_hdr *ip6; struct tcphdr *th; uint32_t next_idx; uint8_t l3_off; struct ip *ip; int i; info = &tx->info[idx]; csum_flags = mbuf->m_pkthdr.csum_flags; pkt_len = mbuf->m_pkthdr.len; is_tso = csum_flags & CSUM_TSO; has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0; tso_mss = is_tso ? 
mbuf->m_pkthdr.tso_segsz : 0; eh = mtod(mbuf, struct ether_header *); KASSERT(eh->ether_type != ETHERTYPE_VLAN, ("VLAN-tagged packets not supported")); is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6; l3_off = ETHER_HDR_LEN; mbuf_next = m_getptr(mbuf, l3_off, &offset); if (is_ipv6) { ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset)); l4_off = l3_off + sizeof(struct ip6_hdr); is_tcp = (ip6->ip6_nxt == IPPROTO_TCP); is_udp = (ip6->ip6_nxt == IPPROTO_UDP); mbuf_next = m_getptr(mbuf, l4_off, &offset); } else if (ntohs(eh->ether_type) == ETHERTYPE_IP) { ip = (struct ip *)(mtodo(mbuf_next, offset)); l4_off = l3_off + (ip->ip_hl << 2); is_tcp = (ip->ip_p == IPPROTO_TCP); is_udp = (ip->ip_p == IPPROTO_UDP); mbuf_next = m_getptr(mbuf, l4_off, &offset); } l4_data_off = 0; if (is_tcp) { th = (struct tcphdr *)(mtodo(mbuf_next, offset)); l4_data_off = l4_off + (th->th_off << 2); } else if (is_udp) l4_data_off = l4_off + sizeof(struct udphdr); if (has_csum_flag) { if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0) csum_offset = offsetof(struct tcphdr, th_sum); else csum_offset = offsetof(struct udphdr, uh_sum); } /* * If this packet is neither a TCP nor a UDP packet, the first segment, * the one represented by the packet descriptor, will carry the * spec-stipulated minimum of 182B. */ if (l4_data_off != 0) first_seg_len = l4_data_off; else first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES); bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len); if (__predict_false(!gve_can_tx(tx, bytes_required))) { counter_enter(); counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1); counter_exit(); return (ENOBUFS); } /* So that the cleanup taskqueue can free the mbuf eventually. */ info->mbuf = mbuf; /* * We don't want to split the header, so if necessary, pad to the end * of the fifo and then put the header at the beginning of the fifo. 
*/ pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes, &info->iov[0]); KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0")); payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len, &info->iov[payload_iov]); pkt_desc = &tx->desc_ring[idx].pkt; gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off, 1 + mtd_desc_nr + payload_nfrags, first_seg_len, info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset, pkt_len); m_copydata(mbuf, 0, first_seg_len, (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset); gve_dma_sync_for_device(tx->com.qpl, info->iov[hdr_nfrags - 1].iov_offset, info->iov[hdr_nfrags - 1].iov_len); copy_offset = first_seg_len; if (mtd_desc_nr == 1) { next_idx = (tx->req + 1) & tx->mask; mtd_desc = &tx->desc_ring[next_idx].mtd; gve_tx_fill_mtd_desc(mtd_desc, mbuf); } for (i = payload_iov; i < payload_nfrags + payload_iov; i++) { next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask; seg_desc = &tx->desc_ring[next_idx].seg; gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len, info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss); m_copydata(mbuf, copy_offset, info->iov[i].iov_len, (char *)tx->fifo.base + info->iov[i].iov_offset); gve_dma_sync_for_device(tx->com.qpl, info->iov[i].iov_offset, info->iov[i].iov_len); copy_offset += info->iov[i].iov_len; } tx->req += (1 + mtd_desc_nr + payload_nfrags); if (is_tso) { counter_enter(); counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); counter_exit(); } return (0); } static int gve_xmit_mbuf(struct gve_tx_ring *tx, struct mbuf **mbuf) { if (gve_is_gqi(tx->com.priv)) return (gve_xmit(tx, *mbuf)); if (gve_is_qpl(tx->com.priv)) return (gve_xmit_dqo_qpl(tx, *mbuf)); /* * gve_xmit_dqo might attempt to defrag the mbuf chain. * The reference is passed in so that in the case of * errors, the new mbuf chain is what's put back on the br. 
*/ return (gve_xmit_dqo(tx, mbuf)); } /* * Has the side-effect of stopping the xmit queue by setting tx->stopped */ static int gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx, struct mbuf **mbuf) { int err; atomic_store_bool(&tx->stopped, true); /* * Room made in the queue BEFORE the barrier will be seen by the * gve_xmit_mbuf retry below. * * If room is made in the queue AFTER the barrier, the cleanup tq * iteration creating the room will either see a tx->stopped value * of 0 or the 1 we just wrote: * * If it sees a 1, then it would enqueue the xmit tq. Enqueue * implies a retry on the waiting pkt. * * If it sees a 0, then that implies a previous iteration overwrote * our 1, and that iteration would enqueue the xmit tq. Enqueue * implies a retry on the waiting pkt. */ atomic_thread_fence_seq_cst(); err = gve_xmit_mbuf(tx, mbuf); if (err == 0) atomic_store_bool(&tx->stopped, false); return (err); } static void gve_xmit_br(struct gve_tx_ring *tx) { struct gve_priv *priv = tx->com.priv; struct ifnet *ifp = priv->ifp; struct mbuf *mbuf; int err; while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 && (mbuf = drbr_peek(ifp, tx->br)) != NULL) { err = gve_xmit_mbuf(tx, &mbuf); /* * We need to stop this taskqueue when we can't xmit the pkt due * to lack of space in the NIC ring (ENOBUFS). The retry exists * to guard against a TOCTTOU bug that could end up freezing the * queue forever. 
*/ if (__predict_false(mbuf != NULL && err == ENOBUFS)) err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf); if (__predict_false(err != 0 && mbuf != NULL)) { if (err == EINVAL) { drbr_advance(ifp, tx->br); m_freem(mbuf); } else drbr_putback(ifp, tx->br, mbuf); break; } drbr_advance(ifp, tx->br); BPF_MTAP(ifp, mbuf); bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); if (gve_is_gqi(priv)) gve_db_bar_write_4(priv, tx->com.db_offset, tx->req); else gve_db_bar_dqo_write_4(priv, tx->com.db_offset, tx->dqo.desc_tail); } } void gve_xmit_tq(void *arg, int pending) { struct gve_tx_ring *tx = (struct gve_tx_ring *)arg; GVE_RING_LOCK(tx); gve_xmit_br(tx); GVE_RING_UNLOCK(tx); } static bool is_vlan_tagged_pkt(struct mbuf *mbuf) { struct ether_header *eh; eh = mtod(mbuf, struct ether_header *); return (ntohs(eh->ether_type) == ETHERTYPE_VLAN); } int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf) { struct gve_priv *priv = if_getsoftc(ifp); struct gve_tx_ring *tx; bool is_br_empty; int err; uint32_t i; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (ENODEV); if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE) i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues; else i = curcpu % priv->tx_cfg.num_queues; tx = &priv->tx[i]; if (__predict_false(is_vlan_tagged_pkt(mbuf))) { counter_enter(); counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1); counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); counter_exit(); m_freem(mbuf); return (ENODEV); } is_br_empty = drbr_empty(ifp, tx->br); err = drbr_enqueue(ifp, tx->br, mbuf); if (__predict_false(err != 0)) { if (!atomic_load_bool(&tx->stopped)) taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); counter_enter(); counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1); counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); counter_exit(); return (err); } /* * If the mbuf we just enqueued is the only one on the ring, then * transmit it right away in the interests of low 
latency. */ if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) { gve_xmit_br(tx); GVE_RING_UNLOCK(tx); } else if (!atomic_load_bool(&tx->stopped)) taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); return (0); } void gve_qflush(if_t ifp) { struct gve_priv *priv = if_getsoftc(ifp); struct gve_tx_ring *tx; int i; for (i = 0; i < priv->tx_cfg.num_queues; ++i) { tx = &priv->tx[i]; if (drbr_empty(ifp, tx->br) == 0) { GVE_RING_LOCK(tx); drbr_flush(ifp, tx->br); GVE_RING_UNLOCK(tx); } } if_qflush(ifp); } diff --git a/sys/dev/gve/gve_utils.c b/sys/dev/gve/gve_utils.c index 080343d3f651..4e9dd4625e2f 100644 --- a/sys/dev/gve/gve_utils.c +++ b/sys/dev/gve/gve_utils.c @@ -1,441 +1,441 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" #include "gve_dqo.h" uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset) { return (be32toh(bus_read_4(priv->reg_bar, offset))); } void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) { bus_write_4(priv->reg_bar, offset, htobe32(val)); } void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) { bus_write_4(priv->db_bar, offset, htobe32(val)); } void gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) { bus_write_4(priv->db_bar, offset, val); } void gve_alloc_counters(counter_u64_t *stat, int num_stats) { int i; for (i = 0; i < num_stats; i++) stat[i] = counter_u64_alloc(M_WAITOK); } void gve_free_counters(counter_u64_t *stat, int num_stats) { int i; for (i = 0; i < num_stats; i++) counter_u64_free(stat[i]); } /* Currently assumes a single segment. 
*/ static void gve_dmamap_load_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error) { if (error == 0) *(bus_addr_t *) arg = segs[0].ds_addr; } int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma) { int err; device_t dev = priv->dev; err = bus_dma_tag_create( bus_get_dma_tag(dev), /* parent */ align, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->tag); if (err != 0) { device_printf(dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); goto clear_tag; } err = bus_dmamem_alloc(dma->tag, (void **) &dma->cpu_addr, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->map); if (err != 0) { device_printf(dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", __func__, (uintmax_t)size, err); goto destroy_tag; } /* An address set by the callback will never be -1 */ dma->bus_addr = (bus_addr_t)-1; err = bus_dmamap_load(dma->tag, dma->map, dma->cpu_addr, size, gve_dmamap_load_callback, &dma->bus_addr, BUS_DMA_NOWAIT); if (err != 0 || dma->bus_addr == (bus_addr_t)-1) { device_printf(dev, "%s: bus_dmamap_load failed: %d\n", __func__, err); goto free_mem; } return (0); free_mem: bus_dmamem_free(dma->tag, dma->cpu_addr, dma->map); destroy_tag: bus_dma_tag_destroy(dma->tag); clear_tag: dma->tag = NULL; return (err); } void gve_dma_free_coherent(struct gve_dma_handle *dma) { bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->tag, dma->map); bus_dmamem_free(dma->tag, dma->cpu_addr, dma->map); bus_dma_tag_destroy(dma->tag); } int gve_dmamap_create(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma) { int err; device_t dev = priv->dev; err = bus_dma_tag_create( bus_get_dma_tag(dev), /* parent */ align, 0, /* alignment, bounds */ 
BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->tag); if (err != 0) { device_printf(dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); goto clear_tag; } err = bus_dmamap_create(dma->tag, BUS_DMA_COHERENT, &dma->map); if (err != 0) { device_printf(dev, "%s: bus_dmamap_create failed: %d\n", __func__, err); goto destroy_tag; } /* An address set by the callback will never be -1 */ dma->bus_addr = (bus_addr_t)-1; err = bus_dmamap_load(dma->tag, dma->map, dma->cpu_addr, size, gve_dmamap_load_callback, &dma->bus_addr, BUS_DMA_WAITOK); if (err != 0 || dma->bus_addr == (bus_addr_t)-1) { device_printf(dev, "%s: bus_dmamap_load failed: %d\n", __func__, err); goto destroy_map; } return (0); destroy_map: bus_dmamap_destroy(dma->tag, dma->map); destroy_tag: bus_dma_tag_destroy(dma->tag); clear_tag: dma->tag = NULL; return (err); } void gve_dmamap_destroy(struct gve_dma_handle *dma) { bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->tag, dma->map); bus_dmamap_destroy(dma->tag, dma->map); bus_dma_tag_destroy(dma->tag); } static int gve_mgmnt_intr(void *arg) { struct gve_priv *priv = arg; taskqueue_enqueue(priv->service_tq, &priv->service_task); return (FILTER_HANDLED); } void gve_free_irqs(struct gve_priv *priv) { struct gve_irq *irq; int num_irqs; int rid; int rc; int i; if (priv->irq_tbl == NULL) { device_printf(priv->dev, "No irq table, nothing to free\n"); return; } - num_irqs = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues + 1; + num_irqs = priv->tx_cfg.max_queues + priv->rx_cfg.max_queues + 1; for (i = 0; i < num_irqs; i++) { irq = &priv->irq_tbl[i]; if (irq->res == NULL) continue; rid = rman_get_rid(irq->res); rc = bus_teardown_intr(priv->dev, irq->res, irq->cookie); if (rc != 0) device_printf(priv->dev, "Failed to 
teardown irq num %d\n", rid); rc = bus_release_resource(priv->dev, SYS_RES_IRQ, rid, irq->res); if (rc != 0) device_printf(priv->dev, "Failed to release irq num %d\n", rid); irq->res = NULL; irq->cookie = NULL; } free(priv->irq_tbl, M_GVE); priv->irq_tbl = NULL; /* Safe to call even if msix was never alloced */ pci_release_msi(priv->dev); } int gve_alloc_irqs(struct gve_priv *priv) { - int num_tx = priv->tx_cfg.num_queues; - int num_rx = priv->rx_cfg.num_queues; + int num_tx = priv->tx_cfg.max_queues; + int num_rx = priv->rx_cfg.max_queues; int req_nvecs = num_tx + num_rx + 1; int got_nvecs = req_nvecs; struct gve_irq *irq; int i, j, m; int rid; int err; struct gve_ring_com *com; struct gve_rx_ring *rx; struct gve_tx_ring *tx; if (pci_alloc_msix(priv->dev, &got_nvecs) != 0) { device_printf(priv->dev, "Failed to acquire any msix vectors\n"); err = ENXIO; goto abort; } else if (got_nvecs != req_nvecs) { device_printf(priv->dev, "Tried to acquire %d msix vectors, got only %d\n", req_nvecs, got_nvecs); err = ENOSPC; goto abort; } if (bootverbose) device_printf(priv->dev, "Enabled MSIX with %d vectors\n", got_nvecs); priv->irq_tbl = malloc(sizeof(struct gve_irq) * req_nvecs, M_GVE, M_WAITOK | M_ZERO); for (i = 0; i < num_tx; i++) { irq = &priv->irq_tbl[i]; tx = &priv->tx[i]; com = &tx->com; rid = i + 1; irq->res = bus_alloc_resource_any(priv->dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (irq->res == NULL) { device_printf(priv->dev, "Failed to alloc irq %d for Tx queue %d\n", rid, i); err = ENOMEM; goto abort; } err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, gve_is_gqi(priv) ? 
gve_tx_intr : gve_tx_intr_dqo, NULL, &priv->tx[i], &irq->cookie); if (err != 0) { device_printf(priv->dev, "Failed to setup irq %d for Tx queue %d, " "err: %d\n", rid, i, err); goto abort; } bus_describe_intr(priv->dev, irq->res, irq->cookie, "tx%d", i); com->ntfy_id = i; } for (j = 0; j < num_rx; j++) { irq = &priv->irq_tbl[i + j]; rx = &priv->rx[j]; com = &rx->com; rid = i + j + 1; irq->res = bus_alloc_resource_any(priv->dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (irq->res == NULL) { device_printf(priv->dev, "Failed to alloc irq %d for Rx queue %d", rid, j); err = ENOMEM; goto abort; } err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, gve_is_gqi(priv) ? gve_rx_intr : gve_rx_intr_dqo, NULL, &priv->rx[j], &irq->cookie); if (err != 0) { device_printf(priv->dev, "Failed to setup irq %d for Rx queue %d, " "err: %d\n", rid, j, err); goto abort; } bus_describe_intr(priv->dev, irq->res, irq->cookie, "rx%d", j); com->ntfy_id = i + j; } m = i + j; rid = m + 1; irq = &priv->irq_tbl[m]; irq->res = bus_alloc_resource_any(priv->dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (irq->res == NULL) { device_printf(priv->dev, "Failed to allocate irq %d for mgmnt queue\n", rid); err = ENOMEM; goto abort; } err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, gve_mgmnt_intr, NULL, priv, &irq->cookie); if (err != 0) { device_printf(priv->dev, "Failed to setup irq %d for mgmnt queue, err: %d\n", rid, err); goto abort; } bus_describe_intr(priv->dev, irq->res, irq->cookie, "mgmnt"); return (0); abort: gve_free_irqs(priv); return (err); } /* * Builds register value to write to DQO IRQ doorbell to enable with specified * ITR interval. */ static uint32_t gve_setup_itr_interval_dqo(uint32_t interval_us) { uint32_t result = GVE_ITR_ENABLE_BIT_DQO; /* Interval has 2us granularity. 
*/ interval_us >>= 1; interval_us &= GVE_ITR_INTERVAL_DQO_MASK; result |= (interval_us << GVE_ITR_INTERVAL_DQO_SHIFT); return (result); } void gve_unmask_all_queue_irqs(struct gve_priv *priv) { struct gve_tx_ring *tx; struct gve_rx_ring *rx; int idx; for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { tx = &priv->tx[idx]; if (gve_is_gqi(priv)) gve_db_bar_write_4(priv, tx->com.irq_db_offset, 0); else gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset, gve_setup_itr_interval_dqo(GVE_TX_IRQ_RATELIMIT_US_DQO)); } for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { rx = &priv->rx[idx]; if (gve_is_gqi(priv)) gve_db_bar_write_4(priv, rx->com.irq_db_offset, 0); else gve_db_bar_dqo_write_4(priv, rx->com.irq_db_offset, gve_setup_itr_interval_dqo(GVE_RX_IRQ_RATELIMIT_US_DQO)); } } void gve_mask_all_queue_irqs(struct gve_priv *priv) { for (int idx = 0; idx < priv->tx_cfg.num_queues; idx++) { struct gve_tx_ring *tx = &priv->tx[idx]; gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK); } for (int idx = 0; idx < priv->rx_cfg.num_queues; idx++) { struct gve_rx_ring *rx = &priv->rx[idx]; gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); } }