diff --git a/share/man/man4/gve.4 b/share/man/man4/gve.4 index 754071e2fad8..924a01a06d08 100644 --- a/share/man/man4/gve.4 +++ b/share/man/man4/gve.4 @@ -1,296 +1,302 @@ .\" SPDX-License-Identifier: BSD-3-Clause .\" .\" Copyright (c) 2023-2024 Google LLC .\" .\" Redistribution and use in source and binary forms, with or without modification, .\" are permitted provided that the following conditions are met: .\" .\" 1. Redistributions of source code must retain the above copyright notice, this .\" list of conditions and the following disclaimer. .\" .\" 2. Redistributions in binary form must reproduce the above copyright notice, .\" this list of conditions and the following disclaimer in the documentation .\" and/or other materials provided with the distribution. .\" .\" 3. Neither the name of the copyright holder nor the names of its contributors .\" may be used to endorse or promote products derived from this software without .\" specific prior written permission. .\" .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND .\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED .\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE .\" DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR .\" ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES .\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; .\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON .\" ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS .\" SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -.Dd October 14, 2024 +.Dd May 20, 2025 .Dt GVE 4 .Os .Sh NAME .Nm gve .Nd "Ethernet driver for Google Virtual NIC (gVNIC)" .Sh SYNOPSIS To compile this driver into the kernel, place the following lines in your kernel configuration file: .Bd -ragged -offset indent .Cd "device gve" .Ed .Pp Alternatively, to load the driver as a module at boot time, place the following line in .Xr loader.conf 5 : .Bd -literal -offset indent if_gve_load="YES" .Ed .Sh DESCRIPTION gVNIC is a virtual network interface designed specifically for Google Compute Engine (GCE). It is required to support per-VM Tier-1 networking performance, and for using certain VM shapes on GCE. .Pp .Nm is the driver for gVNIC. It supports the following features: .Pp .Bl -bullet -compact .It RX checksum offload .It TX checksum offload .It TCP Segmentation Offload (TSO) .It Large Receive Offload (LRO) in software .It Jumbo frames .It Receive Side Scaling (RSS) .El .Pp For more information on configuring this device, see .Xr ifconfig 8 .
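The driver-wide and per-device sysctl nodes documented in the EXAMPLES and SYSCTL VARIABLES sections below can also be queried from a program. A minimal userland sketch (not part of the driver; it only assumes the node names shown on this page) using sysctlbyname(3):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	char version[32], format[32];
	size_t len;

	/* hw.gve.driver_version and hw.gve.queue_format are read-only strings. */
	len = sizeof(version);
	if (sysctlbyname("hw.gve.driver_version", version, &len, NULL, 0) == 0)
		printf("gve driver version: %s\n", version);

	len = sizeof(format);
	if (sysctlbyname("hw.gve.queue_format", format, &len, NULL, 0) == 0)
		printf("gve queue format: %s\n", format);

	/*
	 * Numeric per-device nodes such as dev.gve.0.num_tx_queues can be
	 * read and written the same way; writing them is equivalent to the
	 * sysctl(8) examples shown below.
	 */
	return (0);
}
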
.Sh HARDWARE .Nm binds to a single PCI device ID presented by gVNIC: .Pp .Bl -bullet -compact .It 0x1AE0:0x0042 .El .Sh EXAMPLES .Pp Change the TX queue count to 4 for the gve0 interface: .D1 sysctl dev.gve.0.num_tx_queues=4 .Pp Change the RX queue count to 4 for the gve0 interface: .D1 sysctl dev.gve.0.num_rx_queues=4 .Pp Change the TX ring size to 512 for the gve0 interface: .D1 sysctl dev.gve.0.tx_ring_size=512 .Pp Change the RX ring size to 512 for the gve0 interface: .D1 sysctl dev.gve.0.rx_ring_size=512 .Sh DIAGNOSTICS The following messages are recorded during driver initialization: .Bl -diag .It "Enabled MSIX with %d vectors" .It "Configured device resources" .It "Successfully attached %s" .It "Deconfigured device resources" .El .Pp These messages are seen if driver initialization fails. Global (across-queues) allocation failures: .Bl -diag .It "Failed to configure device resources: err=%d" .It "No compatible queue formats" .It "Failed to allocate ifnet struct" .It "Failed to allocate admin queue mem" .It "Failed to alloc DMA mem for DescribeDevice" .It "Failed to allocate QPL page" .El .Pp irq and BAR allocation failures: .Bl -diag .It "Failed to acquire any msix vectors" .It "Tried to acquire %d msix vectors, got only %d" .It "Failed to setup irq %d for Tx queue %d " .It "Failed to setup irq %d for Rx queue %d " .It "Failed to allocate irq %d for mgmnt queue" .It "Failed to setup irq %d for mgmnt queue, err: %d" .It "Failed to allocate BAR0" .It "Failed to allocate BAR2" .It "Failed to allocate msix table" .El .Pp Rx queue-specific allocation failures: .Bl -diag .It "No QPL left for rx ring %d" .It "Failed to alloc queue resources for rx ring %d" .It "Failed to alloc desc ring for rx ring %d" .It "Failed to alloc data ring for rx ring %d" .El .Pp Tx queue-specific allocation failures: .Bl -diag .It "No QPL left for tx ring %d" .It "Failed to alloc queue resources for tx ring %d" .It "Failed to alloc desc ring for tx ring %d" .It "Failed to vmap fifo, qpl_id = %d" .El .Pp The following messages are recorded when the interface detach fails: .Bl -diag .It "Failed to deconfigure device resources: err=%d" .El .Pp If bootverbose is on, the following messages are recorded when the interface is being brought up: .Bl -diag .It "Created %d rx queues" .It "Created %d tx queues" .It "MTU set to %d" .El .Pp The following messages are recorded when the interface is being brought down: .Bl -diag .It "Destroyed %d rx queues" .It "Destroyed %d tx queues" .El .Pp These messages are seen if errors are encountered when bringing the interface up or down: .Bl -diag .It "Failed to destroy rxq %d, err: %d" .It "Failed to destroy txq %d, err: %d" .It "Failed to create rxq %d, err: %d" .It "Failed to create txq %d, err: %d" .It "Failed to set MTU to %d" .It "Invalid new MTU setting. new mtu: %d max mtu: %d min mtu: %d" .It "Cannot bring the iface up when detached" .It "Reached max number of registered pages %lu > %lu" .It "Failed to init lro for rx ring %d" .El .Pp These messages are seen if any admin queue command fails: .Bl -diag .It "AQ command(%u): failed with status %d" .It "AQ command(%u): unknown status code %d" .It "AQ commands timed out, need to reset AQ" .It "Unknown AQ command opcode %d" .El .Pp +These messages appear if a TX timeout is detected: +.Bl -diag +.It "Found %d timed out packet(s) on txq%d, kicking it for completions" +.It "Found %d timed out packet(s) on txq%d with its last kick %ld sec ago which is less than the cooldown period %d. 
Resetting device" +.El +.Pp These messages are recorded when the device is being reset due to an error: .Bl -diag .It "Scheduling reset task!" .It "Waiting until admin queue is released." .It "Admin queue released" .El .Pp If it was the NIC that requested the reset, this message is recorded: .Bl -diag .It "Device requested reset" .El .Pp If the reset fails during the reinitialization phase, this message is recorded: .Bl -diag .It "Restore failed!" .El .Pp These two messages correspond to the NIC alerting the driver to link state changes: .Bl -diag .It "Device link is up." .It "Device link is down." .El .Pp Apart from these messages, the driver exposes per-queue packet and error counters as sysctl nodes. Global (across queues) counters can be read using .Xr netstat 1 . .Sh SYSCTL VARIABLES .Nm exposes the following .Xr sysctl 8 variables: .Bl -tag -width indent .It Va hw.gve.driver_version The driver version. This is read-only. .It Va hw.gve.queue_format The queue format in use. This is read-only. .It Va hw.gve.disable_hw_lro Setting this boot-time tunable to 1 disables Large Receive Offload (LRO) in the NIC. The default value is 0, which means hardware LRO is enabled by default. The software LRO stack in the kernel is always used. This sysctl variable needs to be set before loading the driver, using .Xr loader.conf 5 . .It Va dev.gve.X.num_rx_queues and dev.gve.X.num_tx_queues Run-time tunables that represent the number of currently used RX/TX queues. The default value is the max number of RX/TX queues the device can support. .Pp This call turns down the interface while setting up the new queues, which may potentially cause any new packets to be dropped. This call can fail if the system is not able to provide the driver with enough resources. In that situation, the driver will revert to the previous number of RX/TX queues. If this also fails, a device reset will be triggered. .Pp Note: sysctl nodes for queue stats remain available even if a queue is removed. .Pp .It Va dev.gve.X.rx_ring_size and dev.gve.X.tx_ring_size Run-time tunables that represent the current ring size for RX/TX queues. The default value is set to device defaults for ring size. .Pp This call turns down the interface while setting up the queues with the new ring size, which may potentially cause any new packets to be dropped. This call can fail if the system is not able to provide the driver with enough resources. In that situation, the driver will try to revert to the previous ring size for RX/TX queues. If this also fails, the device will be in an unhealthy state and will need to be reloaded. This value must be a power of 2 and within the defined range. .Pp .El .Sh LIMITATIONS .Nm does not support the transmission of VLAN-tagged packets. All VLAN-tagged traffic is dropped. .Sh QUEUE FORMATS .Nm features different datapath modes called queue formats: .Pp .Bl -bullet -compact .It GQI_QPL: "QPL" stands for "Queue Page List" and refers to the fact that hardware expects a fixed bounce buffer and cannot access arbitrary memory. GQI is the older descriptor format. The G in "GQI" refers to an older generation of hardware, and the "QI" stands for "Queue In-order" referring to the fact that the NIC sends Tx and Rx completions in the same order as the one in which the corresponding descriptors were posted by the driver. .It DQO_RDA: DQO is the descriptor format required to take full advantage of next generation VM shapes. 
"RDA" stands for "Raw DMA Addressing" and refers to the fact that hardware can work with DMA-ed packets and does not expect them to be copied into or out of a fixed bounce buffer. The D in "DQO" refers to a newer generation of hardware, and the "QO" stands for "Queue Out-of-order" referring to the fact that the NIC might send Tx and Rx completions in an order different from the one in which the corresponding descriptors were posted by the driver. .It DQO_QPL: The next generation descriptor format in the "QPL" mode. .El .Sh SUPPORT Please email gvnic-drivers@google.com with the specifics of the issue encountered. .Sh SEE ALSO .Xr netstat 1 , .Xr loader.conf 5 , .Xr ifconfig 8 , .Xr sysctl 8 .Sh HISTORY The .Nm device driver first appeared in .Fx 13.3 . .Sh AUTHORS The .Nm driver was written by Google. diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h index 5b298b889ed6..48e9a371df21 100644 --- a/sys/dev/gve/gve.h +++ b/sys/dev/gve/gve.h @@ -1,713 +1,760 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _GVE_FBSD_H #define _GVE_FBSD_H #include "gve_desc.h" #include "gve_plat.h" #include "gve_register.h" #ifndef PCI_VENDOR_ID_GOOGLE #define PCI_VENDOR_ID_GOOGLE 0x1ae0 #endif #define PCI_DEV_ID_GVNIC 0x0042 #define GVE_REGISTER_BAR 0 #define GVE_DOORBELL_BAR 2 /* Driver can alloc up to 2 segments for the header and 2 for the payload. */ #define GVE_TX_MAX_DESCS 4 #define GVE_TX_BUFRING_ENTRIES 4096 +#define GVE_TX_TIMEOUT_PKT_SEC 5 +#define GVE_TX_TIMEOUT_CHECK_CADENCE_SEC 5 +/* + * If the driver finds timed out packets on a tx queue it first kicks it and + * records the time. If the driver again finds a timeout on the same queue + * before the end of the cooldown period, only then will it reset. Thus, for a + * reset to be able to occur at all, the cooldown must be at least as long + * as the tx timeout checking cadence multiplied by the number of queues. 
+ */ +#define GVE_TX_TIMEOUT_MAX_TX_QUEUES 16 +#define GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC \ + (2 * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC * GVE_TX_TIMEOUT_MAX_TX_QUEUES) + +#define GVE_TIMESTAMP_INVALID -1 + #define ADMINQ_SIZE PAGE_SIZE #define GVE_DEFAULT_RX_BUFFER_SIZE 2048 /* Each RX bounce buffer page can fit two packet buffers. */ #define GVE_DEFAULT_RX_BUFFER_OFFSET (PAGE_SIZE / 2) /* PTYPEs are always 10 bits. */ #define GVE_NUM_PTYPES 1024 /* * Number of descriptors per queue page list. * Page count AKA QPL size can be derived by dividing the number of elements in * a page by the number of descriptors available. */ #define GVE_QPL_DIVISOR 16 /* Ring Size Limits */ #define GVE_DEFAULT_MIN_RX_RING_SIZE 512 #define GVE_DEFAULT_MIN_TX_RING_SIZE 256 static MALLOC_DEFINE(M_GVE, "gve", "gve allocations"); struct gve_dma_handle { bus_addr_t bus_addr; void *cpu_addr; bus_dma_tag_t tag; bus_dmamap_t map; }; union gve_tx_desc { struct gve_tx_pkt_desc pkt; /* first desc for a packet */ struct gve_tx_mtd_desc mtd; /* optional metadata descriptor */ struct gve_tx_seg_desc seg; /* subsequent descs for a packet */ }; /* Tracks the memory in the fifo occupied by a segment of a packet */ struct gve_tx_iovec { uint32_t iov_offset; /* offset into this segment */ uint32_t iov_len; /* length */ uint32_t iov_padding; /* padding associated with this segment */ }; /* Tracks allowed and current queue settings */ struct gve_queue_config { uint16_t max_queues; uint16_t num_queues; /* current */ }; struct gve_irq_db { __be32 index; } __aligned(CACHE_LINE_SIZE); /* * GVE_QUEUE_FORMAT_UNSPECIFIED must be zero since 0 is the default value * when the entire configure_device_resources command is zeroed out and the * queue_format is not specified. */ enum gve_queue_format { GVE_QUEUE_FORMAT_UNSPECIFIED = 0x0, GVE_GQI_RDA_FORMAT = 0x1, GVE_GQI_QPL_FORMAT = 0x2, GVE_DQO_RDA_FORMAT = 0x3, GVE_DQO_QPL_FORMAT = 0x4, }; enum gve_state_flags_bit { GVE_STATE_FLAG_ADMINQ_OK, GVE_STATE_FLAG_RESOURCES_OK, GVE_STATE_FLAG_QPLREG_OK, GVE_STATE_FLAG_RX_RINGS_OK, GVE_STATE_FLAG_TX_RINGS_OK, GVE_STATE_FLAG_QUEUES_UP, GVE_STATE_FLAG_LINK_UP, GVE_STATE_FLAG_DO_RESET, GVE_STATE_FLAG_IN_RESET, GVE_NUM_STATE_FLAGS /* Not part of the enum space */ }; BITSET_DEFINE(gve_state_flags, GVE_NUM_STATE_FLAGS); #define GVE_DEVICE_STATUS_RESET (0x1 << 1) #define GVE_DEVICE_STATUS_LINK_STATUS (0x1 << 2) #define GVE_RING_LOCK(ring) mtx_lock(&(ring)->ring_mtx) #define GVE_RING_TRYLOCK(ring) mtx_trylock(&(ring)->ring_mtx) #define GVE_RING_UNLOCK(ring) mtx_unlock(&(ring)->ring_mtx) #define GVE_RING_ASSERT(ring) mtx_assert(&(ring)->ring_mtx, MA_OWNED) #define GVE_IFACE_LOCK_INIT(lock) sx_init(&lock, "gve interface lock") #define GVE_IFACE_LOCK_DESTROY(lock) sx_destroy(&lock) #define GVE_IFACE_LOCK_LOCK(lock) sx_xlock(&lock) #define GVE_IFACE_LOCK_UNLOCK(lock) sx_unlock(&lock) #define GVE_IFACE_LOCK_ASSERT(lock) sx_assert(&lock, SA_XLOCKED) struct gve_queue_page_list { uint32_t id; uint32_t num_dmas; uint32_t num_pages; vm_offset_t kva; vm_page_t *pages; struct gve_dma_handle *dmas; }; struct gve_irq { struct resource *res; void *cookie; }; struct gve_rx_slot_page_info { void *page_address; vm_page_t page; uint32_t page_offset; uint16_t pad; }; /* * A single received packet split across multiple buffers may be * reconstructed using the information in this structure. 
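As the comment above notes, a packet split across several buffers is stitched back together using struct gve_rx_ctx (defined just below). A hypothetical sketch of how such a chain is typically grown as fragments arrive; this is illustrative only and not the driver's actual receive path:

#include <sys/param.h>
#include <sys/mbuf.h>

#include "gve.h"

/* Hypothetical helper: append one received fragment to the per-packet chain. */
static void
gve_rx_ctx_append(struct gve_rx_ctx *ctx, struct mbuf *frag)
{
	if (ctx->mbuf_head == NULL) {
		/* First fragment of the packet starts the chain. */
		ctx->mbuf_head = frag;
		ctx->mbuf_tail = frag;
	} else {
		ctx->mbuf_tail->m_next = frag;
		ctx->mbuf_tail = frag;
	}
	ctx->total_size += frag->m_len;
	ctx->frag_cnt++;
}
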
*/ struct gve_rx_ctx { /* head and tail of mbuf chain for the current packet */ struct mbuf *mbuf_head; struct mbuf *mbuf_tail; uint32_t total_size; uint8_t frag_cnt; bool is_tcp; bool drop_pkt; }; struct gve_ring_com { struct gve_priv *priv; uint32_t id; /* * BAR2 offset for this ring's doorbell and the * counter-array offset for this ring's counter. * Acquired from the device individually for each * queue in the queue_create adminq command. */ struct gve_queue_resources *q_resources; struct gve_dma_handle q_resources_mem; /* Byte offset into BAR2 where this ring's 4-byte irq doorbell lies. */ uint32_t irq_db_offset; /* Byte offset into BAR2 where this ring's 4-byte doorbell lies. */ uint32_t db_offset; /* * Index, not byte-offset, into the counter array where this ring's * 4-byte counter lies. */ uint32_t counter_idx; /* * The index of the MSIX vector that was assigned to * this ring in `gve_alloc_irqs`. * * It is passed to the device in the queue_create adminq * command. * * Additionally, this also serves as the index into * `priv->irq_db_indices` where this ring's irq doorbell's * BAR2 offset, `irq_db_idx`, can be found. */ int ntfy_id; /* * The fixed bounce buffer for this ring. * Once allocated, has to be offered to the device * over the register-page-list adminq command. */ struct gve_queue_page_list *qpl; struct task cleanup_task; struct taskqueue *cleanup_tq; } __aligned(CACHE_LINE_SIZE); struct gve_rxq_stats { counter_u64_t rbytes; counter_u64_t rpackets; counter_u64_t rx_dropped_pkt; counter_u64_t rx_copybreak_cnt; counter_u64_t rx_frag_flip_cnt; counter_u64_t rx_frag_copy_cnt; counter_u64_t rx_dropped_pkt_desc_err; counter_u64_t rx_dropped_pkt_buf_post_fail; counter_u64_t rx_dropped_pkt_mbuf_alloc_fail; counter_u64_t rx_mbuf_dmamap_err; counter_u64_t rx_mbuf_mclget_null; }; #define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t)) union gve_rx_qpl_buf_id_dqo { struct { uint16_t buf_id:11; /* Index into rx->dqo.bufs */ uint8_t frag_num:5; /* Which frag in the QPL page */ }; uint16_t all; } __packed; _Static_assert(sizeof(union gve_rx_qpl_buf_id_dqo) == 2, "gve: bad dqo qpl rx buf id length"); struct gve_rx_buf_dqo { union { /* RDA */ struct { struct mbuf *mbuf; bus_dmamap_t dmamap; uint64_t addr; bool mapped; }; /* QPL */ struct { uint8_t num_nic_frags; /* number of pending completions */ uint8_t next_idx; /* index of the next frag to post */ /* for chaining rx->dqo.used_bufs */ STAILQ_ENTRY(gve_rx_buf_dqo) stailq_entry; }; }; /* for chaining rx->dqo.free_bufs */ SLIST_ENTRY(gve_rx_buf_dqo) slist_entry; }; /* power-of-2 sized receive ring */ struct gve_rx_ring { struct gve_ring_com com; struct gve_dma_handle desc_ring_mem; uint32_t cnt; /* free-running total number of completed packets */ uint32_t fill_cnt; /* free-running total number of descs and buffs posted */ union { /* GQI-only fields */ struct { struct gve_dma_handle data_ring_mem; /* accessed in the GQ receive hot path */ struct gve_rx_desc *desc_ring; union gve_rx_data_slot *data_ring; struct gve_rx_slot_page_info *page_info; uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */ uint8_t seq_no; /* helps traverse the descriptor ring */ }; /* DQO-only fields */ struct { struct gve_dma_handle compl_ring_mem; struct gve_rx_compl_desc_dqo *compl_ring; struct gve_rx_desc_dqo *desc_ring; struct gve_rx_buf_dqo *bufs; /* Parking place for posted buffers */ bus_dma_tag_t buf_dmatag; /* To dmamap posted mbufs with */ uint32_t buf_cnt; /* Size of the bufs array */ uint32_t mask; /* One less than 
the sizes of the desc and compl rings */ uint32_t head; /* The index at which to post the next buffer at */ uint32_t tail; /* The index at which to receive the next compl at */ uint8_t cur_gen_bit; /* Gets flipped on every cycle of the compl ring */ SLIST_HEAD(, gve_rx_buf_dqo) free_bufs; /* * Only used in QPL mode. Pages referred to by if_input-ed mbufs * stay parked here till their wire count comes back to 1. * Pages are moved here after there aren't any pending completions. */ STAILQ_HEAD(, gve_rx_buf_dqo) used_bufs; } dqo; }; struct lro_ctrl lro; struct gve_rx_ctx ctx; struct gve_rxq_stats stats; } __aligned(CACHE_LINE_SIZE); /* * A contiguous representation of the pages composing the Tx bounce buffer. * The xmit taskqueue and the completion taskqueue both simultaneously use it. * Both operate on `available`: the xmit tq lowers it and the completion tq * raises it. `head` is the last location written at and so only the xmit tq * uses it. */ struct gve_tx_fifo { vm_offset_t base; /* address of base of FIFO */ uint32_t size; /* total size */ volatile int available; /* how much space is still available */ uint32_t head; /* offset to write at */ }; struct gve_tx_buffer_state { struct mbuf *mbuf; + + /* + * Time at which the xmit tq places descriptors for mbuf's payload on a + * tx queue. This timestamp is invalidated when the mbuf is freed and + * must be checked for validity when read. + */ + int64_t enqueue_time_sec; + struct gve_tx_iovec iov[GVE_TX_MAX_DESCS]; }; struct gve_txq_stats { counter_u64_t tbytes; counter_u64_t tpackets; counter_u64_t tso_packet_cnt; counter_u64_t tx_dropped_pkt; counter_u64_t tx_delayed_pkt_nospace_device; counter_u64_t tx_dropped_pkt_nospace_bufring; counter_u64_t tx_delayed_pkt_nospace_descring; counter_u64_t tx_delayed_pkt_nospace_compring; counter_u64_t tx_delayed_pkt_nospace_qpl_bufs; counter_u64_t tx_delayed_pkt_tsoerr; counter_u64_t tx_dropped_pkt_vlan; counter_u64_t tx_mbuf_collapse; counter_u64_t tx_mbuf_defrag; counter_u64_t tx_mbuf_defrag_err; counter_u64_t tx_mbuf_dmamap_enomem_err; counter_u64_t tx_mbuf_dmamap_err; + counter_u64_t tx_timeout; }; #define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t)) struct gve_tx_pending_pkt_dqo { struct mbuf *mbuf; + + /* + * Time at which the xmit tq places descriptors for mbuf's payload on a + * tx queue. This timestamp is invalidated when the mbuf is freed and + * must be checked for validity when read. + */ + int64_t enqueue_time_sec; + union { /* RDA */ bus_dmamap_t dmamap; /* QPL */ struct { /* * A linked list of entries from qpl_bufs that served * as the bounce buffer for this packet. 
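The enqueue_time_sec bookkeeping above relies on four small timestamp helpers declared further down in this header; their bodies live in gve_utils.c, which is not part of this diff. A plausible sketch, assuming GVE_TIMESTAMP_INVALID (-1) as the sentinel and the kernel's time_uptime (seconds since boot) as the clock source; the in-tree implementations may differ:

#include <sys/param.h>
#include <sys/time.h>	/* time_uptime */

#include "gve.h"

void
gve_set_timestamp(int64_t *timestamp_sec)
{
	*timestamp_sec = (int64_t)time_uptime;
}

void
gve_invalidate_timestamp(int64_t *timestamp_sec)
{
	*timestamp_sec = GVE_TIMESTAMP_INVALID;
}

bool
gve_timestamp_valid(int64_t *timestamp_sec)
{
	return (*timestamp_sec != GVE_TIMESTAMP_INVALID);
}

int64_t
gve_seconds_since(int64_t *timestamp_sec)
{
	return ((int64_t)time_uptime - *timestamp_sec);
}
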
*/ int32_t qpl_buf_head; uint32_t num_qpl_bufs; }; }; uint8_t state; /* the gve_packet_state enum */ int next; /* To chain the free_pending_pkts lists */ }; /* power-of-2 sized transmit ring */ struct gve_tx_ring { struct gve_ring_com com; struct gve_dma_handle desc_ring_mem; struct task xmit_task; struct taskqueue *xmit_tq; bool stopped; /* Accessed when writing descriptors */ struct buf_ring *br; struct mtx ring_mtx; uint32_t req; /* free-running total number of packets written to the nic */ uint32_t done; /* free-running total number of completed packets */ + int64_t last_kicked; /* always-valid timestamp in seconds for the last queue kick */ + union { /* GQI specific stuff */ struct { union gve_tx_desc *desc_ring; struct gve_tx_buffer_state *info; struct gve_tx_fifo fifo; uint32_t mask; /* masks the req and done to the size of the ring */ }; /* DQO specific stuff */ struct { struct gve_dma_handle compl_ring_mem; /* Accessed when writing descriptors */ struct { union gve_tx_desc_dqo *desc_ring; uint32_t desc_mask; /* masks head and tail to the size of desc_ring */ uint32_t desc_head; /* last desc read by NIC, cached value of hw_tx_head */ uint32_t desc_tail; /* last desc written by driver */ uint32_t last_re_idx; /* desc which last had "report event" set */ /* * The head index of a singly linked list containing pending packet objects * to park mbufs till the NIC sends completions. Once this list is depleted, * the "_prd" suffixed producer list, grown by the completion taskqueue, * is stolen. */ int32_t free_pending_pkts_csm; /* * The head index of a singly linked list representing QPL page fragments * to copy mbuf payload into for the NIC to see. Once this list is depleted, * the "_prd" suffixed producer list, grown by the completion taskqueue, * is stolen. * * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist. */ int32_t free_qpl_bufs_csm; uint32_t qpl_bufs_consumed; /* Allows quickly checking for buf availability */ uint32_t qpl_bufs_produced_cached; /* Cached value of qpl_bufs_produced */ /* DMA params for mapping Tx mbufs. Only used in RDA mode. */ bus_dma_tag_t buf_dmatag; } __aligned(CACHE_LINE_SIZE); /* Accessed when processing completions */ struct { struct gve_tx_compl_desc_dqo *compl_ring; uint32_t compl_mask; /* masks head to the size of compl_ring */ uint32_t compl_head; /* last completion read by driver */ uint8_t cur_gen_bit; /* NIC flips a bit on every pass */ uint32_t hw_tx_head; /* last desc read by NIC */ /* * The completion taskqueue moves pending-packet objects to this * list after freeing the mbuf. The "_prd" denotes that this is * a producer list. The transmit taskqueue steals this list once * its consumer list, with the "_csm" suffix, is depleted. */ int32_t free_pending_pkts_prd; /* * The completion taskqueue moves the QPL pages corresponding to a * completed packet into this list. It is only used in QPL mode. * The "_prd" denotes that this is a producer list. The transmit * taskqueue steals this list once its consumer list, with the "_csm" * suffix, is depleted. * * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist. */ int32_t free_qpl_bufs_prd; uint32_t qpl_bufs_produced; } __aligned(CACHE_LINE_SIZE); /* Accessed by both the completion and xmit loops */ struct { /* completion tags index into this array */ struct gve_tx_pending_pkt_dqo *pending_pkts; uint16_t num_pending_pkts; /* * Represents QPL page fragments. An index into this array * always represents the same QPL page fragment. 
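The "_csm"/"_prd" split described in these comments amounts to a lock-free hand-off: the completion taskqueue grows a producer list, and the transmit taskqueue adopts the whole thing with one atomic exchange of the int32_t head once its own consumer list runs dry. A hypothetical sketch of that steal (the helper name and the -1 "empty" sentinel are illustrative, not taken from the driver):

#include <sys/types.h>
#include <machine/atomic.h>

/*
 * Hypothetical helper, not driver code: if the consumer list is empty,
 * atomically swap out the producer head and adopt it.  The heads are
 * int32_t because, as the comments above note, atomic_swap_16() does not
 * exist; -1 is assumed here to mean "empty list".
 */
static int32_t
gve_steal_free_list(int32_t *csm_head, int32_t *prd_head)
{
	if (*csm_head == -1)
		*csm_head = (int32_t)atomic_swap_32(
		    (volatile uint32_t *)prd_head, (uint32_t)-1);
	return (*csm_head);
}
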
The value * is also an index into this array and servers as a means * to chain buffers into linked lists whose heads are * either free_qpl_bufs_prd or free_qpl_bufs_csm or * qpl_bufs_head. */ int32_t *qpl_bufs; } __aligned(CACHE_LINE_SIZE); } dqo; }; struct gve_txq_stats stats; } __aligned(CACHE_LINE_SIZE); enum gve_packet_state { /* * Packet does not yet have a dmamap created. * This should always be zero since state is not explicitly initialized. */ GVE_PACKET_STATE_UNALLOCATED, /* Packet has a dmamap and is in free list, available to be allocated. */ GVE_PACKET_STATE_FREE, /* Packet is expecting a regular data completion */ GVE_PACKET_STATE_PENDING_DATA_COMPL, }; struct gve_ptype { uint8_t l3_type; /* `gve_l3_type` in gve_adminq.h */ uint8_t l4_type; /* `gve_l4_type` in gve_adminq.h */ }; struct gve_ptype_lut { struct gve_ptype ptypes[GVE_NUM_PTYPES]; }; struct gve_priv { if_t ifp; device_t dev; struct ifmedia media; uint8_t mac[ETHER_ADDR_LEN]; struct gve_dma_handle aq_mem; struct resource *reg_bar; /* BAR0 */ struct resource *db_bar; /* BAR2 */ struct resource *msix_table; uint32_t mgmt_msix_idx; uint32_t rx_copybreak; uint16_t num_event_counters; uint16_t default_num_queues; uint16_t tx_desc_cnt; uint16_t max_tx_desc_cnt; uint16_t min_tx_desc_cnt; uint16_t rx_desc_cnt; uint16_t max_rx_desc_cnt; uint16_t min_rx_desc_cnt; uint16_t rx_pages_per_qpl; uint64_t max_registered_pages; uint64_t num_registered_pages; uint32_t supported_features; uint16_t max_mtu; bool modify_ringsize_enabled; struct gve_dma_handle counter_array_mem; __be32 *counters; struct gve_dma_handle irqs_db_mem; struct gve_irq_db *irq_db_indices; enum gve_queue_format queue_format; struct gve_queue_config tx_cfg; struct gve_queue_config rx_cfg; uint32_t num_queues; struct gve_irq *irq_tbl; struct gve_tx_ring *tx; struct gve_rx_ring *rx; struct gve_ptype_lut *ptype_lut_dqo; /* * Admin queue - see gve_adminq.h * Since AQ cmds do not run in steady state, 32 bit counters suffice */ struct gve_adminq_command *adminq; vm_paddr_t adminq_bus_addr; uint32_t adminq_mask; /* masks prod_cnt to adminq size */ uint32_t adminq_prod_cnt; /* free-running count of AQ cmds executed */ uint32_t adminq_cmd_fail; /* free-running count of AQ cmds failed */ uint32_t adminq_timeouts; /* free-running count of AQ cmds timeouts */ /* free-running count of each distinct AQ cmd executed */ uint32_t adminq_describe_device_cnt; uint32_t adminq_cfg_device_resources_cnt; uint32_t adminq_register_page_list_cnt; uint32_t adminq_unregister_page_list_cnt; uint32_t adminq_create_tx_queue_cnt; uint32_t adminq_create_rx_queue_cnt; uint32_t adminq_destroy_tx_queue_cnt; uint32_t adminq_destroy_rx_queue_cnt; uint32_t adminq_dcfg_device_resources_cnt; uint32_t adminq_set_driver_parameter_cnt; uint32_t adminq_verify_driver_compatibility_cnt; uint32_t adminq_get_ptype_map_cnt; uint32_t interface_up_cnt; uint32_t interface_down_cnt; uint32_t reset_cnt; struct task service_task; struct taskqueue *service_tq; struct gve_state_flags state_flags; struct sx gve_iface_lock; + + struct callout tx_timeout_service; + /* The index of tx queue that the timer service will check on its next invocation */ + uint16_t check_tx_queue_idx; + }; static inline bool gve_get_state_flag(struct gve_priv *priv, int pos) { return (BIT_ISSET(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags)); } static inline void gve_set_state_flag(struct gve_priv *priv, int pos) { BIT_SET_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); } static inline void gve_clear_state_flag(struct gve_priv *priv, int pos) 
{ BIT_CLR_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); } static inline bool gve_is_gqi(struct gve_priv *priv) { return (priv->queue_format == GVE_GQI_QPL_FORMAT); } static inline bool gve_is_qpl(struct gve_priv *priv) { return (priv->queue_format == GVE_GQI_QPL_FORMAT || priv->queue_format == GVE_DQO_QPL_FORMAT); } /* Defined in gve_main.c */ void gve_schedule_reset(struct gve_priv *priv); int gve_adjust_tx_queues(struct gve_priv *priv, uint16_t new_queue_cnt); int gve_adjust_rx_queues(struct gve_priv *priv, uint16_t new_queue_cnt); int gve_adjust_ring_sizes(struct gve_priv *priv, uint16_t new_desc_cnt, bool is_rx); /* Register access functions defined in gve_utils.c */ uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset); void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); void gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); /* QPL (Queue Page List) functions defined in gve_qpl.c */ struct gve_queue_page_list *gve_alloc_qpl(struct gve_priv *priv, uint32_t id, int npages, bool single_kva); void gve_free_qpl(struct gve_priv *priv, struct gve_queue_page_list *qpl); int gve_register_qpls(struct gve_priv *priv); int gve_unregister_qpls(struct gve_priv *priv); void gve_mextadd_free(struct mbuf *mbuf); /* TX functions defined in gve_tx.c */ int gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); void gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); int gve_create_tx_rings(struct gve_priv *priv); int gve_destroy_tx_rings(struct gve_priv *priv); +int gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx); int gve_tx_intr(void *arg); int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf); void gve_qflush(if_t ifp); void gve_xmit_tq(void *arg, int pending); void gve_tx_cleanup_tq(void *arg, int pending); /* TX functions defined in gve_tx_dqo.c */ int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i); void gve_tx_free_ring_dqo(struct gve_priv *priv, int i); void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i); +int gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx); int gve_tx_intr_dqo(void *arg); int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr); int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf); void gve_tx_cleanup_tq_dqo(void *arg, int pending); /* RX functions defined in gve_rx.c */ int gve_alloc_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); void gve_free_rx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx); int gve_create_rx_rings(struct gve_priv *priv); int gve_destroy_rx_rings(struct gve_priv *priv); int gve_rx_intr(void *arg); void gve_rx_cleanup_tq(void *arg, int pending); /* RX functions defined in gve_rx_dqo.c */ int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i); void gve_rx_free_ring_dqo(struct gve_priv *priv, int i); void gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx); void gve_clear_rx_ring_dqo(struct gve_priv *priv, int i); int gve_rx_intr_dqo(void *arg); void gve_rx_cleanup_tq_dqo(void *arg, int pending); /* DMA functions defined in gve_utils.c */ int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma); void gve_dma_free_coherent(struct gve_dma_handle *dma); int gve_dmamap_create(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma); void gve_dmamap_destroy(struct gve_dma_handle 
*dma); /* IRQ functions defined in gve_utils.c */ void gve_free_irqs(struct gve_priv *priv); int gve_alloc_irqs(struct gve_priv *priv); void gve_unmask_all_queue_irqs(struct gve_priv *priv); void gve_mask_all_queue_irqs(struct gve_priv *priv); +/* Miscellaneous functions defined in gve_utils.c */ +void gve_invalidate_timestamp(int64_t *timestamp_sec); +int64_t gve_seconds_since(int64_t *timestamp_sec); +void gve_set_timestamp(int64_t *timestamp_sec); +bool gve_timestamp_valid(int64_t *timestamp_sec); + /* Systcl functions defined in gve_sysctl.c */ extern bool gve_disable_hw_lro; extern char gve_queue_format[8]; extern char gve_version[8]; void gve_setup_sysctl(struct gve_priv *priv); void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets, uint64_t *tbytes, uint64_t *tx_dropped_pkt); /* Stats functions defined in gve_utils.c */ void gve_alloc_counters(counter_u64_t *stat, int num_stats); void gve_free_counters(counter_u64_t *stat, int num_stats); #endif /* _GVE_FBSD_H_ */ diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c index 8a00deedef36..2abd9d1aa698 100644 --- a/sys/dev/gve/gve_main.c +++ b/sys/dev/gve/gve_main.c @@ -1,1075 +1,1149 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" -#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.3\n" +#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.4\n" #define GVE_VERSION_MAJOR 1 #define GVE_VERSION_MINOR 3 -#define GVE_VERSION_SUB 3 +#define GVE_VERSION_SUB 4 #define GVE_DEFAULT_RX_COPYBREAK 256 /* Devices supported by this driver. 
*/ static struct gve_dev { uint16_t vendor_id; uint16_t device_id; const char *name; } gve_devs[] = { { PCI_VENDOR_ID_GOOGLE, PCI_DEV_ID_GVNIC, "gVNIC" } }; struct sx gve_global_lock; +static void gve_start_tx_timeout_service(struct gve_priv *priv); +static void gve_stop_tx_timeout_service(struct gve_priv *priv); + static int gve_verify_driver_compatibility(struct gve_priv *priv) { int err; struct gve_driver_info *driver_info; struct gve_dma_handle driver_info_mem; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_driver_info), PAGE_SIZE, &driver_info_mem); if (err != 0) return (ENOMEM); driver_info = driver_info_mem.cpu_addr; *driver_info = (struct gve_driver_info) { .os_type = 3, /* Freebsd */ .driver_major = GVE_VERSION_MAJOR, .driver_minor = GVE_VERSION_MINOR, .driver_sub = GVE_VERSION_SUB, .os_version_major = htobe32(FBSD_VERSION_MAJOR), .os_version_minor = htobe32(FBSD_VERSION_MINOR), .os_version_sub = htobe32(FBSD_VERSION_PATCH), .driver_capability_flags = { htobe64(GVE_DRIVER_CAPABILITY_FLAGS1), htobe64(GVE_DRIVER_CAPABILITY_FLAGS2), htobe64(GVE_DRIVER_CAPABILITY_FLAGS3), htobe64(GVE_DRIVER_CAPABILITY_FLAGS4), }, }; snprintf(driver_info->os_version_str1, sizeof(driver_info->os_version_str1), "FreeBSD %u", __FreeBSD_version); bus_dmamap_sync(driver_info_mem.tag, driver_info_mem.map, BUS_DMASYNC_PREREAD); err = gve_adminq_verify_driver_compatibility(priv, sizeof(struct gve_driver_info), driver_info_mem.bus_addr); /* It's ok if the device doesn't support this */ if (err == EOPNOTSUPP) err = 0; gve_dma_free_coherent(&driver_info_mem); return (err); } +static void +gve_handle_tx_timeout(struct gve_priv *priv, struct gve_tx_ring *tx, + int num_timeout_pkts) +{ + int64_t time_since_last_kick; + + counter_u64_add_protected(tx->stats.tx_timeout, 1); + + /* last_kicked is never GVE_TIMESTAMP_INVALID so we can skip checking */ + time_since_last_kick = gve_seconds_since(&tx->last_kicked); + + /* Try kicking first in case the timeout is due to a missed interrupt */ + if (time_since_last_kick > GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC) { + device_printf(priv->dev, + "Found %d timed out packet(s) on txq%d, kicking it for completions\n", + num_timeout_pkts, tx->com.id); + gve_set_timestamp(&tx->last_kicked); + taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); + } else { + device_printf(priv->dev, + "Found %d timed out packet(s) on txq%d with its last kick %jd sec ago which is less than the cooldown period %d. Resetting device\n", + num_timeout_pkts, tx->com.id, + (intmax_t)time_since_last_kick, + GVE_TX_TIMEOUT_KICK_COOLDOWN_SEC); + gve_schedule_reset(priv); + } +} + +static void +gve_tx_timeout_service_callback(void *data) +{ + struct gve_priv *priv = (struct gve_priv *)data; + struct gve_tx_ring *tx; + uint16_t num_timeout_pkts; + + tx = &priv->tx[priv->check_tx_queue_idx]; + + num_timeout_pkts = gve_is_gqi(priv) ? 
+ gve_check_tx_timeout_gqi(priv, tx) : + gve_check_tx_timeout_dqo(priv, tx); + if (num_timeout_pkts) + gve_handle_tx_timeout(priv, tx, num_timeout_pkts); + + priv->check_tx_queue_idx = (priv->check_tx_queue_idx + 1) % + priv->tx_cfg.num_queues; + callout_reset_sbt(&priv->tx_timeout_service, + SBT_1S * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC, 0, + gve_tx_timeout_service_callback, (void *)priv, 0); +} + +static void +gve_start_tx_timeout_service(struct gve_priv *priv) +{ + priv->check_tx_queue_idx = 0; + callout_init(&priv->tx_timeout_service, true); + callout_reset_sbt(&priv->tx_timeout_service, + SBT_1S * GVE_TX_TIMEOUT_CHECK_CADENCE_SEC, 0, + gve_tx_timeout_service_callback, (void *)priv, 0); +} + +static void +gve_stop_tx_timeout_service(struct gve_priv *priv) +{ + callout_drain(&priv->tx_timeout_service); +} + static int gve_up(struct gve_priv *priv) { if_t ifp = priv->ifp; int err; GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); if (device_is_attached(priv->dev) == 0) { device_printf(priv->dev, "Cannot bring the iface up when detached\n"); return (ENXIO); } if (gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return (0); if_clearhwassist(ifp); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, CSUM_TCP | CSUM_UDP, 0); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, CSUM_IP6_TCP | CSUM_IP6_UDP, 0); if (if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, CSUM_IP_TSO, 0); if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); if (gve_is_qpl(priv)) { err = gve_register_qpls(priv); if (err != 0) goto reset; } err = gve_create_rx_rings(priv); if (err != 0) goto reset; err = gve_create_tx_rings(priv); if (err != 0) goto reset; if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); if (!gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(ifp, LINK_STATE_UP); gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } gve_unmask_all_queue_irqs(priv); gve_set_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_up_cnt++; + + gve_start_tx_timeout_service(priv); + return (0); reset: gve_schedule_reset(priv); return (err); } static void gve_down(struct gve_priv *priv) { GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return; + gve_stop_tx_timeout_service(priv); + if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); if (gve_destroy_rx_rings(priv) != 0) goto reset; if (gve_destroy_tx_rings(priv) != 0) goto reset; if (gve_is_qpl(priv)) { if (gve_unregister_qpls(priv) != 0) goto reset; } if (gve_is_gqi(priv)) gve_mask_all_queue_irqs(priv); gve_clear_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_down_cnt++; return; reset: gve_schedule_reset(priv); } int gve_adjust_rx_queues(struct gve_priv *priv, uint16_t new_queue_cnt) { int err; GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); gve_down(priv); if (new_queue_cnt < priv->rx_cfg.num_queues) { /* * Freeing a ring still preserves its ntfy_id, * which is needed if we create the ring again. 
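The per-queue scanners gve_check_tx_timeout_gqi() and gve_check_tx_timeout_dqo() called from the service callback above live in gve_tx.c and gve_tx_dqo.c, which this excerpt does not show. A rough sketch of what the GQI-side scan plausibly looks like given the fields added in this patch; the loop bound and exact shape are assumptions:

#include "gve.h"

int
gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx)
{
	struct gve_tx_buffer_state *info;
	uint32_t pkt_idx;
	int num_timeouts;

	num_timeouts = 0;

	/* Slots with an invalidated timestamp have no mbuf in flight. */
	for (pkt_idx = 0; pkt_idx < priv->tx_desc_cnt; pkt_idx++) {
		info = &tx->info[pkt_idx];
		if (!gve_timestamp_valid(&info->enqueue_time_sec))
			continue;
		if (gve_seconds_since(&info->enqueue_time_sec) >
		    GVE_TX_TIMEOUT_PKT_SEC)
			num_timeouts++;
	}
	return (num_timeouts);
}

For scale: with the values defined in gve.h (a 5 second check cadence and a 16 queue ceiling), one full sweep of the round-robin service takes at most 80 seconds, so a queue that was kicked is revisited well inside the 160 second kick cooldown; it is that second sighting within the cooldown, not the first, that escalates to a device reset instead of another kick.
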
*/ gve_free_rx_rings(priv, new_queue_cnt, priv->rx_cfg.num_queues); } else { err = gve_alloc_rx_rings(priv, priv->rx_cfg.num_queues, new_queue_cnt); if (err != 0) { device_printf(priv->dev, "Failed to allocate new queues"); /* Failed to allocate rings, start back up with old ones */ gve_up(priv); return (err); } } priv->rx_cfg.num_queues = new_queue_cnt; err = gve_up(priv); if (err != 0) gve_schedule_reset(priv); return (err); } int gve_adjust_tx_queues(struct gve_priv *priv, uint16_t new_queue_cnt) { int err; GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); gve_down(priv); if (new_queue_cnt < priv->tx_cfg.num_queues) { /* * Freeing a ring still preserves its ntfy_id, * which is needed if we create the ring again. */ gve_free_tx_rings(priv, new_queue_cnt, priv->tx_cfg.num_queues); } else { err = gve_alloc_tx_rings(priv, priv->tx_cfg.num_queues, new_queue_cnt); if (err != 0) { device_printf(priv->dev, "Failed to allocate new queues"); /* Failed to allocate rings, start back up with old ones */ gve_up(priv); return (err); } } priv->tx_cfg.num_queues = new_queue_cnt; err = gve_up(priv); if (err != 0) gve_schedule_reset(priv); return (err); } int gve_adjust_ring_sizes(struct gve_priv *priv, uint16_t new_desc_cnt, bool is_rx) { int err; uint16_t prev_desc_cnt; GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); gve_down(priv); if (is_rx) { gve_free_rx_rings(priv, 0, priv->rx_cfg.num_queues); prev_desc_cnt = priv->rx_desc_cnt; priv->rx_desc_cnt = new_desc_cnt; err = gve_alloc_rx_rings(priv, 0, priv->rx_cfg.num_queues); if (err != 0) { device_printf(priv->dev, "Failed to allocate rings. Trying to start back up with previous ring size."); priv->rx_desc_cnt = prev_desc_cnt; err = gve_alloc_rx_rings(priv, 0, priv->rx_cfg.num_queues); } } else { gve_free_tx_rings(priv, 0, priv->tx_cfg.num_queues); prev_desc_cnt = priv->tx_desc_cnt; priv->tx_desc_cnt = new_desc_cnt; err = gve_alloc_tx_rings(priv, 0, priv->tx_cfg.num_queues); if (err != 0) { device_printf(priv->dev, "Failed to allocate rings. Trying to start back up with previous ring size."); priv->tx_desc_cnt = prev_desc_cnt; err = gve_alloc_tx_rings(priv, 0, priv->tx_cfg.num_queues); } } if (err != 0) { device_printf(priv->dev, "Failed to allocate rings! Cannot start device back up!"); return (err); } err = gve_up(priv); if (err != 0) { gve_schedule_reset(priv); return (err); } return (0); } static int gve_set_mtu(if_t ifp, uint32_t new_mtu) { struct gve_priv *priv = if_getsoftc(ifp); const uint32_t max_problem_range = 8227; const uint32_t min_problem_range = 7822; int err; if ((new_mtu > priv->max_mtu) || (new_mtu < ETHERMIN)) { device_printf(priv->dev, "Invalid new MTU setting. new mtu: %d max mtu: %d min mtu: %d\n", new_mtu, priv->max_mtu, ETHERMIN); return (EINVAL); } /* * When hardware LRO is enabled in DQ mode, MTUs within the range * [7822, 8227] trigger hardware issues which cause a drastic drop * in throughput. 
*/ if (!gve_is_gqi(priv) && !gve_disable_hw_lro && new_mtu >= min_problem_range && new_mtu <= max_problem_range) { device_printf(priv->dev, "Cannot set to MTU to %d within the range [%d, %d] while hardware LRO is enabled\n", new_mtu, min_problem_range, max_problem_range); return (EINVAL); } err = gve_adminq_set_mtu(priv, new_mtu); if (err == 0) { if (bootverbose) device_printf(priv->dev, "MTU set to %d\n", new_mtu); if_setmtu(ifp, new_mtu); } else { device_printf(priv->dev, "Failed to set MTU to %d\n", new_mtu); } return (err); } static void gve_init(void *arg) { struct gve_priv *priv = (struct gve_priv *)arg; if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } static int gve_ioctl(if_t ifp, u_long command, caddr_t data) { struct gve_priv *priv; struct ifreq *ifr; int rc = 0; priv = if_getsoftc(ifp); ifr = (struct ifreq *)data; switch (command) { case SIOCSIFMTU: if (if_getmtu(ifp) == ifr->ifr_mtu) break; GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); gve_set_mtu(ifp, ifr->ifr_mtu); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); break; case SIOCSIFFLAGS: if ((if_getflags(ifp) & IFF_UP) != 0) { if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } else { if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } break; case SIOCSIFCAP: if (ifr->ifr_reqcap == if_getcapenable(ifp)) break; GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); if_setcapenable(ifp, ifr->ifr_reqcap); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); break; case SIOCSIFMEDIA: /* FALLTHROUGH */ case SIOCGIFMEDIA: rc = ifmedia_ioctl(ifp, ifr, &priv->media, command); break; default: rc = ether_ioctl(ifp, command, data); break; } return (rc); } static int gve_media_change(if_t ifp) { struct gve_priv *priv = if_getsoftc(ifp); device_printf(priv->dev, "Media change not supported\n"); return (0); } static void gve_media_status(if_t ifp, struct ifmediareq *ifmr) { struct gve_priv *priv = if_getsoftc(ifp); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_AUTO; } else { ifmr->ifm_active |= IFM_NONE; } GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } static uint64_t gve_get_counter(if_t ifp, ift_counter cnt) { struct gve_priv *priv; uint64_t rpackets = 0; uint64_t tpackets = 0; uint64_t rbytes = 0; uint64_t tbytes = 0; uint64_t rx_dropped_pkt = 0; uint64_t tx_dropped_pkt = 0; priv = if_getsoftc(ifp); gve_accum_stats(priv, &rpackets, &rbytes, &rx_dropped_pkt, &tpackets, &tbytes, &tx_dropped_pkt); switch (cnt) { case IFCOUNTER_IPACKETS: return (rpackets); case IFCOUNTER_OPACKETS: return (tpackets); case IFCOUNTER_IBYTES: return (rbytes); case IFCOUNTER_OBYTES: return (tbytes); case IFCOUNTER_IQDROPS: return (rx_dropped_pkt); case IFCOUNTER_OQDROPS: return (tx_dropped_pkt); default: return (if_get_counter_default(ifp, cnt)); } } static void gve_setup_ifnet(device_t dev, struct gve_priv *priv) { int caps = 0; if_t ifp; ifp = priv->ifp = if_alloc(IFT_ETHER); if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setsoftc(ifp, priv); if_setdev(ifp, dev); if_setinitfn(ifp, gve_init); 
if_setioctlfn(ifp, gve_ioctl); if_settransmitfn(ifp, gve_xmit_ifp); if_setqflushfn(ifp, gve_qflush); /* * Set TSO limits, must match the arguments to bus_dma_tag_create * when creating tx->dqo.buf_dmatag. Only applies to the RDA mode * because in QPL we copy the entire packet into the bounce buffer * and thus it does not matter how fragmented the mbuf is. */ if (!gve_is_gqi(priv) && !gve_is_qpl(priv)) { if_sethwtsomaxsegcount(ifp, GVE_TX_MAX_DATA_DESCS_DQO); if_sethwtsomaxsegsize(ifp, GVE_TX_MAX_BUF_SIZE_DQO); } if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO); #if __FreeBSD_version >= 1400086 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); #else if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | IFF_KNOWSEPOCH); #endif ifmedia_init(&priv->media, IFM_IMASK, gve_media_change, gve_media_status); if_setgetcounterfn(ifp, gve_get_counter); caps = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6 | IFCAP_TSO | IFCAP_LRO; if ((priv->supported_features & GVE_SUP_JUMBO_FRAMES_MASK) != 0) caps |= IFCAP_JUMBO_MTU; if_setcapabilities(ifp, caps); if_setcapenable(ifp, caps); if (bootverbose) device_printf(priv->dev, "Setting initial MTU to %d\n", priv->max_mtu); if_setmtu(ifp, priv->max_mtu); ether_ifattach(ifp, priv->mac); ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO); } static int gve_alloc_counter_array(struct gve_priv *priv) { int err; err = gve_dma_alloc_coherent(priv, sizeof(uint32_t) * priv->num_event_counters, PAGE_SIZE, &priv->counter_array_mem); if (err != 0) return (err); priv->counters = priv->counter_array_mem.cpu_addr; return (0); } static void gve_free_counter_array(struct gve_priv *priv) { if (priv->counters != NULL) gve_dma_free_coherent(&priv->counter_array_mem); priv->counter_array_mem = (struct gve_dma_handle){}; } static int gve_alloc_irq_db_array(struct gve_priv *priv) { int err; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_irq_db) * (priv->num_queues), PAGE_SIZE, &priv->irqs_db_mem); if (err != 0) return (err); priv->irq_db_indices = priv->irqs_db_mem.cpu_addr; return (0); } static void gve_free_irq_db_array(struct gve_priv *priv) { if (priv->irq_db_indices != NULL) gve_dma_free_coherent(&priv->irqs_db_mem); priv->irqs_db_mem = (struct gve_dma_handle){}; } static void gve_free_rings(struct gve_priv *priv) { gve_free_irqs(priv); gve_free_tx_rings(priv, 0, priv->tx_cfg.num_queues); free(priv->tx, M_GVE); priv->tx = NULL; gve_free_rx_rings(priv, 0, priv->rx_cfg.num_queues); free(priv->rx, M_GVE); priv->rx = NULL; } static int gve_alloc_rings(struct gve_priv *priv) { int err; priv->rx = malloc(sizeof(struct gve_rx_ring) * priv->rx_cfg.max_queues, M_GVE, M_WAITOK | M_ZERO); err = gve_alloc_rx_rings(priv, 0, priv->rx_cfg.num_queues); if (err != 0) goto abort; priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.max_queues, M_GVE, M_WAITOK | M_ZERO); err = gve_alloc_tx_rings(priv, 0, priv->tx_cfg.num_queues); if (err != 0) goto abort; err = gve_alloc_irqs(priv); if (err != 0) goto abort; return (0); abort: gve_free_rings(priv); return (err); } static void gve_deconfigure_and_free_device_resources(struct gve_priv *priv) { int err; if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) { err = gve_adminq_deconfigure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to deconfigure device resources: err=%d\n", err); return; } if (bootverbose) device_printf(priv->dev, "Deconfigured device resources\n"); gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); } 
gve_free_irq_db_array(priv); gve_free_counter_array(priv); if (priv->ptype_lut_dqo) { free(priv->ptype_lut_dqo, M_GVE); priv->ptype_lut_dqo = NULL; } } static int gve_alloc_and_configure_device_resources(struct gve_priv *priv) { int err; if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) return (0); err = gve_alloc_counter_array(priv); if (err != 0) return (err); err = gve_alloc_irq_db_array(priv); if (err != 0) goto abort; err = gve_adminq_configure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to configure device resources: err=%d\n", err); err = (ENXIO); goto abort; } if (!gve_is_gqi(priv)) { priv->ptype_lut_dqo = malloc(sizeof(*priv->ptype_lut_dqo), M_GVE, M_WAITOK | M_ZERO); err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); if (err != 0) { device_printf(priv->dev, "Failed to configure ptype lut: err=%d\n", err); goto abort; } } gve_set_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); if (bootverbose) device_printf(priv->dev, "Configured device resources\n"); return (0); abort: gve_deconfigure_and_free_device_resources(priv); return (err); } static void gve_set_queue_cnts(struct gve_priv *priv) { priv->tx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_TX_QUEUES); priv->rx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_RX_QUEUES); priv->tx_cfg.num_queues = priv->tx_cfg.max_queues; priv->rx_cfg.num_queues = priv->rx_cfg.max_queues; if (priv->default_num_queues > 0) { priv->tx_cfg.num_queues = MIN(priv->default_num_queues, priv->tx_cfg.num_queues); priv->rx_cfg.num_queues = MIN(priv->default_num_queues, priv->rx_cfg.num_queues); } priv->num_queues = priv->tx_cfg.max_queues + priv->rx_cfg.max_queues; priv->mgmt_msix_idx = priv->num_queues; } static int gve_alloc_adminq_and_describe_device(struct gve_priv *priv) { int err; if ((err = gve_adminq_alloc(priv)) != 0) return (err); if ((err = gve_verify_driver_compatibility(priv)) != 0) { device_printf(priv->dev, "Failed to verify driver compatibility: err=%d\n", err); goto abort; } if ((err = gve_adminq_describe_device(priv)) != 0) goto abort; gve_set_queue_cnts(priv); priv->num_registered_pages = 0; return (0); abort: gve_release_adminq(priv); return (err); } void gve_schedule_reset(struct gve_priv *priv) { if (gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) return; device_printf(priv->dev, "Scheduling reset task!\n"); gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); taskqueue_enqueue(priv->service_tq, &priv->service_task); } static void gve_destroy(struct gve_priv *priv) { gve_down(priv); gve_deconfigure_and_free_device_resources(priv); gve_release_adminq(priv); } static void gve_restore(struct gve_priv *priv) { int err; err = gve_adminq_alloc(priv); if (err != 0) goto abort; err = gve_adminq_configure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to configure device resources: err=%d\n", err); err = (ENXIO); goto abort; } if (!gve_is_gqi(priv)) { err = gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); if (err != 0) { device_printf(priv->dev, "Failed to configure ptype lut: err=%d\n", err); goto abort; } } err = gve_up(priv); if (err != 0) goto abort; return; abort: device_printf(priv->dev, "Restore failed!\n"); return; } static void gve_clear_device_resources(struct gve_priv *priv) { int i; for (i = 0; i < priv->num_event_counters; i++) priv->counters[i] = 0; bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map, BUS_DMASYNC_PREWRITE); for (i = 0; i < priv->num_queues; i++) priv->irq_db_indices[i] = (struct gve_irq_db){}; 
bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, BUS_DMASYNC_PREWRITE); if (priv->ptype_lut_dqo) *priv->ptype_lut_dqo = (struct gve_ptype_lut){0}; } static void gve_handle_reset(struct gve_priv *priv) { if (!gve_get_state_flag(priv, GVE_STATE_FLAG_DO_RESET)) return; gve_clear_state_flag(priv, GVE_STATE_FLAG_DO_RESET); gve_set_state_flag(priv, GVE_STATE_FLAG_IN_RESET); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); /* * Releasing the adminq causes the NIC to destroy all resources * registered with it, so by clearing the flags beneath we cause * the subsequent gve_down call below to not attempt to tell the * NIC to destroy these resources again. * * The call to gve_down is needed in the first place to refresh * the state and the DMA-able memory within each driver ring. */ gve_release_adminq(priv); gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); gve_down(priv); gve_clear_device_resources(priv); gve_restore(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); priv->reset_cnt++; gve_clear_state_flag(priv, GVE_STATE_FLAG_IN_RESET); } static void gve_handle_link_status(struct gve_priv *priv) { uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); bool link_up = status & GVE_DEVICE_STATUS_LINK_STATUS; if (link_up == gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) return; if (link_up) { if (bootverbose) device_printf(priv->dev, "Device link is up.\n"); if_link_state_change(priv->ifp, LINK_STATE_UP); gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } else { device_printf(priv->dev, "Device link is down.\n"); if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } } static void gve_service_task(void *arg, int pending) { struct gve_priv *priv = (struct gve_priv *)arg; uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); if (((GVE_DEVICE_STATUS_RESET_MASK & status) != 0) && !gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) { device_printf(priv->dev, "Device requested reset\n"); gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); } gve_handle_reset(priv); gve_handle_link_status(priv); } static int gve_probe(device_t dev) { uint16_t deviceid, vendorid; int i; vendorid = pci_get_vendor(dev); deviceid = pci_get_device(dev); for (i = 0; i < nitems(gve_devs); i++) { if (vendorid == gve_devs[i].vendor_id && deviceid == gve_devs[i].device_id) { device_set_desc(dev, gve_devs[i].name); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static void gve_free_sys_res_mem(struct gve_priv *priv) { if (priv->msix_table != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->msix_table), priv->msix_table); if (priv->db_bar != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->db_bar), priv->db_bar); if (priv->reg_bar != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->reg_bar), priv->reg_bar); } static int gve_attach(device_t dev) { struct gve_priv *priv; int rid; int err; snprintf(gve_version, sizeof(gve_version), "%d.%d.%d", GVE_VERSION_MAJOR, GVE_VERSION_MINOR, GVE_VERSION_SUB); priv = device_get_softc(dev); priv->dev = dev; GVE_IFACE_LOCK_INIT(priv->gve_iface_lock); pci_enable_busmaster(dev); rid = PCIR_BAR(GVE_REGISTER_BAR); priv->reg_bar = 
bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->reg_bar == NULL) { device_printf(dev, "Failed to allocate BAR0\n"); err = ENXIO; goto abort; } rid = PCIR_BAR(GVE_DOORBELL_BAR); priv->db_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->db_bar == NULL) { device_printf(dev, "Failed to allocate BAR2\n"); err = ENXIO; goto abort; } rid = pci_msix_table_bar(priv->dev); priv->msix_table = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->msix_table == NULL) { device_printf(dev, "Failed to allocate msix table\n"); err = ENXIO; goto abort; } err = gve_alloc_adminq_and_describe_device(priv); if (err != 0) goto abort; err = gve_alloc_and_configure_device_resources(priv); if (err != 0) goto abort; err = gve_alloc_rings(priv); if (err != 0) goto abort; gve_setup_ifnet(dev, priv); priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK; bus_write_multi_1(priv->reg_bar, DRIVER_VERSION, GVE_DRIVER_VERSION, sizeof(GVE_DRIVER_VERSION) - 1); TASK_INIT(&priv->service_task, 0, gve_service_task, priv); priv->service_tq = taskqueue_create("gve service", M_WAITOK | M_ZERO, taskqueue_thread_enqueue, &priv->service_tq); taskqueue_start_threads(&priv->service_tq, 1, PI_NET, "%s service tq", device_get_nameunit(priv->dev)); gve_setup_sysctl(priv); if (bootverbose) device_printf(priv->dev, "Successfully attached %s", GVE_DRIVER_VERSION); return (0); abort: gve_free_rings(priv); gve_deconfigure_and_free_device_resources(priv); gve_release_adminq(priv); gve_free_sys_res_mem(priv); GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); return (err); } static int gve_detach(device_t dev) { struct gve_priv *priv = device_get_softc(dev); if_t ifp = priv->ifp; int error; error = bus_generic_detach(dev); if (error != 0) return (error); ether_ifdetach(ifp); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_destroy(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); gve_free_rings(priv); gve_free_sys_res_mem(priv); GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); while (taskqueue_cancel(priv->service_tq, &priv->service_task, NULL)) taskqueue_drain(priv->service_tq, &priv->service_task); taskqueue_free(priv->service_tq); if_free(ifp); return (0); } static device_method_t gve_methods[] = { DEVMETHOD(device_probe, gve_probe), DEVMETHOD(device_attach, gve_attach), DEVMETHOD(device_detach, gve_detach), DEVMETHOD_END }; static driver_t gve_driver = { "gve", gve_methods, sizeof(struct gve_priv) }; #if __FreeBSD_version < 1301503 static devclass_t gve_devclass; DRIVER_MODULE(gve, pci, gve_driver, gve_devclass, 0, 0); #else DRIVER_MODULE(gve, pci, gve_driver, 0, 0); #endif MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, gve, gve_devs, nitems(gve_devs)); diff --git a/sys/dev/gve/gve_sysctl.c b/sys/dev/gve/gve_sysctl.c index f7c7b5803865..661f61918853 100644 --- a/sys/dev/gve/gve_sysctl.c +++ b/sys/dev/gve/gve_sysctl.c @@ -1,497 +1,501 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. 
Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" static SYSCTL_NODE(_hw, OID_AUTO, gve, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "GVE driver parameters"); bool gve_disable_hw_lro = false; SYSCTL_BOOL(_hw_gve, OID_AUTO, disable_hw_lro, CTLFLAG_RDTUN, &gve_disable_hw_lro, 0, "Controls if hardware LRO is used"); char gve_queue_format[8]; SYSCTL_STRING(_hw_gve, OID_AUTO, queue_format, CTLFLAG_RD, &gve_queue_format, 0, "Queue format being used by the iface"); char gve_version[8]; SYSCTL_STRING(_hw_gve, OID_AUTO, driver_version, CTLFLAG_RD, &gve_version, 0, "Driver version"); static void gve_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_rx_ring *rxq) { struct sysctl_oid *node; struct sysctl_oid_list *list; struct gve_rxq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->com.id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Receive Queue"); list = SYSCTL_CHILDREN(node); stats = &rxq->stats; SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_bytes", CTLFLAG_RD, &stats->rbytes, "Bytes received"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_packets", CTLFLAG_RD, &stats->rpackets, "Packets received"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_copybreak_cnt", CTLFLAG_RD, &stats->rx_copybreak_cnt, "Total frags with mbufs allocated for copybreak"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_frag_flip_cnt", CTLFLAG_RD, &stats->rx_frag_flip_cnt, "Total frags that allocated mbuf with page flip"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_frag_copy_cnt", CTLFLAG_RD, &stats->rx_frag_copy_cnt, "Total frags with mbuf that copied payload into mbuf"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt", CTLFLAG_RD, &stats->rx_dropped_pkt, "Total rx packets dropped"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt_desc_err", CTLFLAG_RD, &stats->rx_dropped_pkt_desc_err, "Packets dropped due to descriptor error"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt_buf_post_fail", CTLFLAG_RD, &stats->rx_dropped_pkt_buf_post_fail, "Packets dropped due to failure to post enough buffers"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt_mbuf_alloc_fail", CTLFLAG_RD, &stats->rx_dropped_pkt_mbuf_alloc_fail, "Packets dropped due to failed mbuf allocation"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_mbuf_dmamap_err", CTLFLAG_RD, &stats->rx_mbuf_dmamap_err, "Number of rx mbufs which could not be dma mapped"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_mbuf_mclget_null", CTLFLAG_RD, &stats->rx_mbuf_mclget_null, "Number of 
times when there were no cluster mbufs"); SYSCTL_ADD_U32(ctx, list, OID_AUTO, "rx_completed_desc", CTLFLAG_RD, &rxq->cnt, 0, "Number of descriptors completed"); SYSCTL_ADD_U32(ctx, list, OID_AUTO, "num_desc_posted", CTLFLAG_RD, &rxq->fill_cnt, rxq->fill_cnt, "Total number of descriptors posted"); } static void gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_tx_ring *txq) { struct sysctl_oid *node; struct sysctl_oid_list *tx_list; struct gve_txq_stats *stats; char namebuf[16]; snprintf(namebuf, sizeof(namebuf), "txq%d", txq->com.id); node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Transmit Queue"); tx_list = SYSCTL_CHILDREN(node); stats = &txq->stats; SYSCTL_ADD_U32(ctx, tx_list, OID_AUTO, "tx_posted_desc", CTLFLAG_RD, &txq->req, 0, "Number of descriptors posted to the NIC"); SYSCTL_ADD_U32(ctx, tx_list, OID_AUTO, "tx_completed_desc", CTLFLAG_RD, &txq->done, 0, "Number of descriptors completed"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_packets", CTLFLAG_RD, &stats->tpackets, "Packets transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_tso_packets", CTLFLAG_RD, &stats->tso_packet_cnt, "TSO Packets transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_bytes", CTLFLAG_RD, &stats->tbytes, "Bytes transmitted"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_nospace_device", CTLFLAG_RD, &stats->tx_delayed_pkt_nospace_device, "Packets delayed due to no space in device"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_dropped_pkt_nospace_bufring", CTLFLAG_RD, &stats->tx_dropped_pkt_nospace_bufring, "Packets dropped due to no space in br ring"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_dropped_pkt_vlan", CTLFLAG_RD, &stats->tx_dropped_pkt_vlan, "Dropped VLAN packets"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_nospace_descring", CTLFLAG_RD, &stats->tx_delayed_pkt_nospace_descring, "Packets delayed due to no space in desc ring"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_nospace_compring", CTLFLAG_RD, &stats->tx_delayed_pkt_nospace_compring, "Packets delayed due to no space in comp ring"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_nospace_qpl_bufs", CTLFLAG_RD, &stats->tx_delayed_pkt_nospace_qpl_bufs, "Packets delayed due to not enough qpl bufs"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_tsoerr", CTLFLAG_RD, &stats->tx_delayed_pkt_tsoerr, "TSO packets delayed due to errors in TSO prep"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, - "tx_mbuf_collpase", CTLFLAG_RD, + "tx_mbuf_collapse", CTLFLAG_RD, &stats->tx_mbuf_collapse, "tx mbufs that had to be collapsed"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_defrag", CTLFLAG_RD, &stats->tx_mbuf_defrag, "tx mbufs that had to be defragged"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_defrag_err", CTLFLAG_RD, &stats->tx_mbuf_defrag_err, "tx mbufs that failed defrag"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_dmamap_enomem_err", CTLFLAG_RD, &stats->tx_mbuf_dmamap_enomem_err, "tx mbufs that could not be dma-mapped due to low mem"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_mbuf_dmamap_err", CTLFLAG_RD, &stats->tx_mbuf_dmamap_err, "tx mbufs that could not be dma-mapped"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_timeout", CTLFLAG_RD, + &stats->tx_timeout, + "detections of timed out packets on tx queues"); } static void gve_setup_queue_stat_sysctl(struct
sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_priv *priv) { int i; for (i = 0; i < priv->rx_cfg.num_queues; i++) { gve_setup_rxq_sysctl(ctx, child, &priv->rx[i]); } for (i = 0; i < priv->tx_cfg.num_queues; i++) { gve_setup_txq_sysctl(ctx, child, &priv->tx[i]); } } static void gve_setup_adminq_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_priv *priv) { struct sysctl_oid *admin_node; struct sysctl_oid_list *admin_list; /* Admin queue stats */ admin_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "adminq_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue statistics"); admin_list = SYSCTL_CHILDREN(admin_node); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_prod_cnt", CTLFLAG_RD, &priv->adminq_prod_cnt, 0, "Adminq commands issued"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_cmd_fail", CTLFLAG_RD, &priv->adminq_cmd_fail, 0, "Adminq failed commands"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_timeouts", CTLFLAG_RD, &priv->adminq_timeouts, 0, "Adminq timed out commands"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_describe_device_cnt", CTLFLAG_RD, &priv->adminq_describe_device_cnt, 0, "adminq_describe_device_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_cfg_device_resources_cnt", CTLFLAG_RD, &priv->adminq_cfg_device_resources_cnt, 0, "adminq_cfg_device_resources_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_register_page_list_cnt", CTLFLAG_RD, &priv->adminq_register_page_list_cnt, 0, "adminq_register_page_list_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_unregister_page_list_cnt", CTLFLAG_RD, &priv->adminq_unregister_page_list_cnt, 0, "adminq_unregister_page_list_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_create_tx_queue_cnt", CTLFLAG_RD, &priv->adminq_create_tx_queue_cnt, 0, "adminq_create_tx_queue_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_create_rx_queue_cnt", CTLFLAG_RD, &priv->adminq_create_rx_queue_cnt, 0, "adminq_create_rx_queue_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_tx_queue_cnt", CTLFLAG_RD, &priv->adminq_destroy_tx_queue_cnt, 0, "adminq_destroy_tx_queue_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_rx_queue_cnt", CTLFLAG_RD, &priv->adminq_destroy_rx_queue_cnt, 0, "adminq_destroy_rx_queue_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_get_ptype_map_cnt", CTLFLAG_RD, &priv->adminq_get_ptype_map_cnt, 0, "adminq_get_ptype_map_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_dcfg_device_resources_cnt", CTLFLAG_RD, &priv->adminq_dcfg_device_resources_cnt, 0, "adminq_dcfg_device_resources_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_set_driver_parameter_cnt", CTLFLAG_RD, &priv->adminq_set_driver_parameter_cnt, 0, "adminq_set_driver_parameter_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_verify_driver_compatibility_cnt", CTLFLAG_RD, &priv->adminq_verify_driver_compatibility_cnt, 0, "adminq_verify_driver_compatibility_cnt"); } static void gve_setup_main_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_priv *priv) { struct sysctl_oid *main_node; struct sysctl_oid_list *main_list; /* Main stats */ main_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "main_stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Main statistics"); main_list = SYSCTL_CHILDREN(main_node); SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "interface_up_cnt", CTLFLAG_RD, &priv->interface_up_cnt, 0, "Times interface was set to up"); SYSCTL_ADD_U32(ctx, main_list, OID_AUTO,
"interface_down_cnt", CTLFLAG_RD, &priv->interface_down_cnt, 0, "Times interface was set to down"); SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "reset_cnt", CTLFLAG_RD, &priv->reset_cnt, 0, "Times reset"); } static int gve_check_num_queues(struct gve_priv *priv, int val, bool is_rx) { if (val < 1) { device_printf(priv->dev, "Requested num queues (%u) must be a positive integer\n", val); return (EINVAL); } if (val > (is_rx ? priv->rx_cfg.max_queues : priv->tx_cfg.max_queues)) { device_printf(priv->dev, "Requested num queues (%u) is too large\n", val); return (EINVAL); } return (0); } static int gve_sysctl_num_tx_queues(SYSCTL_HANDLER_ARGS) { struct gve_priv *priv = arg1; int val; int err; val = priv->tx_cfg.num_queues; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); err = gve_check_num_queues(priv, val, /*is_rx=*/false); if (err != 0) return (err); if (val != priv->tx_cfg.num_queues) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); err = gve_adjust_tx_queues(priv, val); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } return (err); } static int gve_sysctl_num_rx_queues(SYSCTL_HANDLER_ARGS) { struct gve_priv *priv = arg1; int val; int err; val = priv->rx_cfg.num_queues; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); err = gve_check_num_queues(priv, val, /*is_rx=*/true); if (err != 0) return (err); if (val != priv->rx_cfg.num_queues) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); err = gve_adjust_rx_queues(priv, val); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } return (err); } static int gve_check_ring_size(struct gve_priv *priv, int val, bool is_rx) { if (!powerof2(val) || val == 0) { device_printf(priv->dev, "Requested ring size (%u) must be a power of 2\n", val); return (EINVAL); } if (val < (is_rx ? priv->min_rx_desc_cnt : priv->min_tx_desc_cnt)) { device_printf(priv->dev, "Requested ring size (%u) cannot be less than %d\n", val, (is_rx ? priv->min_rx_desc_cnt : priv->min_tx_desc_cnt)); return (EINVAL); } if (val > (is_rx ? priv->max_rx_desc_cnt : priv->max_tx_desc_cnt)) { device_printf(priv->dev, "Requested ring size (%u) cannot be greater than %d\n", val, (is_rx ? 
priv->max_rx_desc_cnt : priv->max_tx_desc_cnt)); return (EINVAL); } return (0); } static int gve_sysctl_tx_ring_size(SYSCTL_HANDLER_ARGS) { struct gve_priv *priv = arg1; int val; int err; val = priv->tx_desc_cnt; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); err = gve_check_ring_size(priv, val, /*is_rx=*/false); if (err != 0) return (err); if (val != priv->tx_desc_cnt) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); err = gve_adjust_ring_sizes(priv, val, /*is_rx=*/false); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } return (err); } static int gve_sysctl_rx_ring_size(SYSCTL_HANDLER_ARGS) { struct gve_priv *priv = arg1; int val; int err; val = priv->rx_desc_cnt; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); err = gve_check_ring_size(priv, val, /*is_rx=*/true); if (err != 0) return (err); if (val != priv->rx_desc_cnt) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); err = gve_adjust_ring_sizes(priv, val, /*is_rx=*/true); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } return (err); } static void gve_setup_sysctl_writables(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_priv *priv) { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "num_tx_queues", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, gve_sysctl_num_tx_queues, "I", "Number of TX queues"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "num_rx_queues", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, gve_sysctl_num_rx_queues, "I", "Number of RX queues"); if (priv->modify_ringsize_enabled) { SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_ring_size", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, gve_sysctl_tx_ring_size, "I", "TX ring size"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rx_ring_size", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, priv, 0, gve_sysctl_rx_ring_size, "I", "RX ring size"); } } void gve_setup_sysctl(struct gve_priv *priv) { device_t dev; struct sysctl_ctx_list *ctx; struct sysctl_oid *tree; struct sysctl_oid_list *child; dev = priv->dev; ctx = device_get_sysctl_ctx(dev); tree = device_get_sysctl_tree(dev); child = SYSCTL_CHILDREN(tree); gve_setup_queue_stat_sysctl(ctx, child, priv); gve_setup_adminq_stat_sysctl(ctx, child, priv); gve_setup_main_stat_sysctl(ctx, child, priv); gve_setup_sysctl_writables(ctx, child, priv); } void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets, uint64_t *tbytes, uint64_t *tx_dropped_pkt) { struct gve_rxq_stats *rxqstats; struct gve_txq_stats *txqstats; int i; for (i = 0; i < priv->rx_cfg.num_queues; i++) { rxqstats = &priv->rx[i].stats; *rpackets += counter_u64_fetch(rxqstats->rpackets); *rbytes += counter_u64_fetch(rxqstats->rbytes); *rx_dropped_pkt += counter_u64_fetch(rxqstats->rx_dropped_pkt); } for (i = 0; i < priv->tx_cfg.num_queues; i++) { txqstats = &priv->tx[i].stats; *tpackets += counter_u64_fetch(txqstats->tpackets); *tbytes += counter_u64_fetch(txqstats->tbytes); *tx_dropped_pkt += counter_u64_fetch(txqstats->tx_dropped_pkt); } } diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c index b667df4ca06e..84e3a4c4eb9f 100644 --- a/sys/dev/gve/gve_tx.c +++ b/sys/dev/gve/gve_tx.c @@ -1,926 +1,958 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. 
Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" #include "gve_adminq.h" #include "gve_dqo.h" #define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182 static int gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx) { struct gve_queue_page_list *qpl = tx->com.qpl; struct gve_tx_fifo *fifo = &tx->fifo; fifo->size = qpl->num_pages * PAGE_SIZE; fifo->base = qpl->kva; atomic_store_int(&fifo->available, fifo->size); fifo->head = 0; return (0); } static void gve_tx_free_ring_gqi(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; if (tx->desc_ring != NULL) { gve_dma_free_coherent(&tx->desc_ring_mem); tx->desc_ring = NULL; } if (tx->info != NULL) { free(tx->info, M_GVE); tx->info = NULL; } if (com->qpl != NULL) { gve_free_qpl(priv, com->qpl); com->qpl = NULL; } } static void gve_tx_free_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; /* Safe to call even if never alloced */ gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); if (mtx_initialized(&tx->ring_mtx)) mtx_destroy(&tx->ring_mtx); if (com->q_resources != NULL) { gve_dma_free_coherent(&com->q_resources_mem); com->q_resources = NULL; } if (tx->br != NULL) { buf_ring_free(tx->br, M_DEVBUF); tx->br = NULL; } if (gve_is_gqi(priv)) gve_tx_free_ring_gqi(priv, i); else gve_tx_free_ring_dqo(priv, i); } static int gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; int err; err = gve_dma_alloc_coherent(priv, sizeof(union gve_tx_desc) * priv->tx_desc_cnt, CACHE_LINE_SIZE, &tx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i); goto abort; } tx->desc_ring = tx->desc_ring_mem.cpu_addr; com->qpl = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR, /*single_kva=*/true); if (com->qpl == NULL) { device_printf(priv->dev, "Failed to alloc QPL for tx ring %d\n", i); err = ENOMEM; goto abort; } err = gve_tx_fifo_init(priv, tx); if (err != 0) goto abort; tx->info = malloc( sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt, M_GVE, M_WAITOK | M_ZERO); return (0); abort: gve_tx_free_ring_gqi(priv, i); return (err); } static int 
gve_tx_alloc_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; char mtx_name[16]; int err; com->priv = priv; com->id = i; if (gve_is_gqi(priv)) err = gve_tx_alloc_ring_gqi(priv, i); else err = gve_tx_alloc_ring_dqo(priv, i); if (err != 0) goto abort; sprintf(mtx_name, "gvetx%d", i); mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF); tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF, M_WAITOK, &tx->ring_mtx); gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), PAGE_SIZE, &com->q_resources_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc queue resources for tx ring %d", i); goto abort; } com->q_resources = com->q_resources_mem.cpu_addr; + tx->last_kicked = 0; + return (0); abort: gve_tx_free_ring(priv, i); return (err); } int gve_alloc_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx) { int i; int err; KASSERT(priv->tx != NULL, ("priv->tx is NULL!")); for (i = start_idx; i < stop_idx; i++) { err = gve_tx_alloc_ring(priv, i); if (err != 0) goto free_rings; } return (0); free_rings: gve_free_tx_rings(priv, start_idx, i); return (err); } void gve_free_tx_rings(struct gve_priv *priv, uint16_t start_idx, uint16_t stop_idx) { int i; for (i = start_idx; i < stop_idx; i++) gve_tx_free_ring(priv, i); } static void gve_tx_clear_desc_ring(struct gve_tx_ring *tx) { struct gve_ring_com *com = &tx->com; int i; for (i = 0; i < com->priv->tx_desc_cnt; i++) { tx->desc_ring[i] = (union gve_tx_desc){}; tx->info[i] = (struct gve_tx_buffer_state){}; + gve_invalidate_timestamp(&tx->info[i].enqueue_time_sec); } bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_clear_tx_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_tx_fifo *fifo = &tx->fifo; tx->req = 0; tx->done = 0; tx->mask = priv->tx_desc_cnt - 1; atomic_store_int(&fifo->available, fifo->size); fifo->head = 0; gve_tx_clear_desc_ring(tx); } static void gve_start_tx_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; atomic_store_bool(&tx->stopped, false); if (gve_is_gqi(priv)) NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx); else NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq_dqo, tx); com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK, taskqueue_thread_enqueue, &com->cleanup_tq); taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d", device_get_nameunit(priv->dev), i); TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx); tx->xmit_tq = taskqueue_create_fast("gve tx xmit", M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq); taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit", device_get_nameunit(priv->dev), i); } int gve_create_tx_rings(struct gve_priv *priv) { struct gve_ring_com *com; struct gve_tx_ring *tx; int err; int i; if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) return (0); for (i = 0; i < priv->tx_cfg.num_queues; i++) { if (gve_is_gqi(priv)) gve_clear_tx_ring(priv, i); else gve_clear_tx_ring_dqo(priv, i); } err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues); if (err != 0) return (err); bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, BUS_DMASYNC_POSTREAD); for (i = 0; i < priv->tx_cfg.num_queues; i++) { tx = &priv->tx[i]; com = &tx->com; com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index); 
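/*
 * The device wrote q_resources when the queue was created above; sync
 * for the CPU before reading back the doorbell and counter indices.
 */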
bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map, BUS_DMASYNC_POSTREAD); com->db_offset = 4 * be32toh(com->q_resources->db_index); com->counter_idx = be32toh(com->q_resources->counter_index); gve_start_tx_ring(priv, i); } gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); return (0); } static void gve_stop_tx_ring(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; if (com->cleanup_tq != NULL) { taskqueue_quiesce(com->cleanup_tq); taskqueue_free(com->cleanup_tq); com->cleanup_tq = NULL; } if (tx->xmit_tq != NULL) { taskqueue_quiesce(tx->xmit_tq); taskqueue_free(tx->xmit_tq); tx->xmit_tq = NULL; } } int gve_destroy_tx_rings(struct gve_priv *priv) { int err; int i; for (i = 0; i < priv->tx_cfg.num_queues; i++) gve_stop_tx_ring(priv, i); if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) { err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues); if (err != 0) return (err); gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); } return (0); } +int +gve_check_tx_timeout_gqi(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + struct gve_tx_buffer_state *info; + uint32_t pkt_idx; + int num_timeouts; + + num_timeouts = 0; + + for (pkt_idx = 0; pkt_idx < priv->tx_desc_cnt; pkt_idx++) { + info = &tx->info[pkt_idx]; + + if (!gve_timestamp_valid(&info->enqueue_time_sec)) + continue; + + if (__predict_false( + gve_seconds_since(&info->enqueue_time_sec) > + GVE_TX_TIMEOUT_PKT_SEC)) + num_timeouts += 1; + } + + return (num_timeouts); +} + int gve_tx_intr(void *arg) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; struct gve_ring_com *com = &tx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); return (FILTER_HANDLED); } static uint32_t gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx) { bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map, BUS_DMASYNC_POSTREAD); uint32_t counter = priv->counters[tx->com.counter_idx]; return (be32toh(counter)); } static void gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes) { atomic_add_int(&fifo->available, bytes); } void gve_tx_cleanup_tq(void *arg, int pending) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; uint32_t nic_done = gve_tx_load_event_counter(priv, tx); uint32_t todo = nic_done - tx->done; size_t space_freed = 0; int i, j; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; for (j = 0; j < todo; j++) { uint32_t idx = tx->done & tx->mask; struct gve_tx_buffer_state *info = &tx->info[idx]; struct mbuf *mbuf = info->mbuf; tx->done++; if (mbuf == NULL) continue; + gve_invalidate_timestamp(&info->enqueue_time_sec); + info->mbuf = NULL; + counter_enter(); counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len); counter_u64_add_protected(tx->stats.tpackets, 1); counter_exit(); m_freem(mbuf); for (i = 0; i < GVE_TX_MAX_DESCS; i++) { space_freed += info->iov[i].iov_len + info->iov[i].iov_padding; info->iov[i].iov_len = 0; info->iov[i].iov_padding = 0; } } gve_tx_free_fifo(&tx->fifo, space_freed); gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_ACK | GVE_IRQ_EVENT); /* * Completions born before this barrier MAY NOT cause the NIC to send an * interrupt but they will still be handled by the enqueue below. * Completions born after the barrier WILL trigger an interrupt. 
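 *
 * Re-reading the event counter after the fence, and re-enqueueing this
 * task if it has moved, closes the window in which a completion lands
 * after the final pass above without producing an interrupt.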
*/ atomic_thread_fence_seq_cst(); nic_done = gve_tx_load_event_counter(priv, tx); todo = nic_done - tx->done; if (todo != 0) { gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); } if (atomic_load_bool(&tx->stopped) && space_freed) { atomic_store_bool(&tx->stopped, false); taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); } } static void gve_dma_sync_for_device(struct gve_queue_page_list *qpl, uint64_t iov_offset, uint64_t iov_len) { uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE; uint64_t first_page = iov_offset / PAGE_SIZE; struct gve_dma_handle *dma; uint64_t page; for (page = first_page; page <= last_page; page++) { dma = &(qpl->dmas[page]); bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); } } static void gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf) { mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH; mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4; mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid); mtd_desc->reserved0 = 0; mtd_desc->reserved1 = 0; } static void gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso, uint16_t l4_hdr_offset, uint32_t desc_cnt, uint16_t first_seg_len, uint64_t addr, bool has_csum_flag, int csum_offset, uint16_t pkt_len) { if (is_tso) { pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM; pkt_desc->l4_csum_offset = csum_offset >> 1; pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; } else if (has_csum_flag) { pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM; pkt_desc->l4_csum_offset = csum_offset >> 1; pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; } else { pkt_desc->type_flags = GVE_TXD_STD; pkt_desc->l4_csum_offset = 0; pkt_desc->l4_hdr_offset = 0; } pkt_desc->desc_cnt = desc_cnt; pkt_desc->len = htobe16(pkt_len); pkt_desc->seg_len = htobe16(first_seg_len); pkt_desc->seg_addr = htobe64(addr); } static void gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc, bool is_tso, uint16_t len, uint64_t addr, bool is_ipv6, uint8_t l3_off, uint16_t tso_mss) { seg_desc->type_flags = GVE_TXD_SEG; if (is_tso) { if (is_ipv6) seg_desc->type_flags |= GVE_TXSF_IPV6; seg_desc->l3_offset = l3_off >> 1; seg_desc->mss = htobe16(tso_mss); } seg_desc->seg_len = htobe16(len); seg_desc->seg_addr = htobe64(addr); } static inline uint32_t gve_tx_avail(struct gve_tx_ring *tx) { return (tx->mask + 1 - (tx->req - tx->done)); } static bool gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes) { return (atomic_load_int(&fifo->available) >= bytes); } static inline bool gve_can_tx(struct gve_tx_ring *tx, int bytes_required) { return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) && gve_tx_fifo_can_alloc(&tx->fifo, bytes_required)); } static int gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes) { return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head; } static inline int gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len, uint16_t pkt_len) { int pad_bytes, align_hdr_pad; int bytes; pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); /* We need to take into account the header alignment padding. 
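 * For instance (illustrative values): a 54-byte first segment rounds up
 * to 64 when CACHE_LINE_SIZE is 64, contributing 10 bytes of
 * align_hdr_pad on top of any pad_bytes needed to avoid splitting the
 * header across the FIFO boundary.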
*/ align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len; bytes = align_hdr_pad + pad_bytes + pkt_len; return (bytes); } static int gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes, struct gve_tx_iovec iov[2]) { size_t overflow, padding; uint32_t aligned_head; int nfrags = 0; if (bytes == 0) return (0); /* * This check happens before we know how much padding is needed to * align to a cacheline boundary for the payload, but that is fine, * because the FIFO head always start aligned, and the FIFO's boundaries * are aligned, so if there is space for the data, there is space for * the padding to the next alignment. */ KASSERT(gve_tx_fifo_can_alloc(fifo, bytes), ("Allocating gve tx fifo when there is no room")); nfrags++; iov[0].iov_offset = fifo->head; iov[0].iov_len = bytes; fifo->head += bytes; if (fifo->head > fifo->size) { /* * If the allocation did not fit in the tail fragment of the * FIFO, also use the head fragment. */ nfrags++; overflow = fifo->head - fifo->size; iov[0].iov_len -= overflow; iov[1].iov_offset = 0; /* Start of fifo*/ iov[1].iov_len = overflow; fifo->head = overflow; } /* Re-align to a cacheline boundary */ aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE); padding = aligned_head - fifo->head; iov[nfrags - 1].iov_padding = padding; atomic_add_int(&fifo->available, -(bytes + padding)); fifo->head = aligned_head; if (fifo->head == fifo->size) fifo->head = 0; return (nfrags); } /* Only error this returns is ENOBUFS when the tx fifo is short of space */ static int gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf) { bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false; int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset; uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len; int pad_bytes, hdr_nfrags, payload_nfrags; struct gve_tx_pkt_desc *pkt_desc; struct gve_tx_seg_desc *seg_desc; struct gve_tx_mtd_desc *mtd_desc; struct gve_tx_buffer_state *info; uint32_t idx = tx->req & tx->mask; struct ether_header *eh; struct mbuf *mbuf_next; int payload_iov = 2; int bytes_required; struct ip6_hdr *ip6; struct tcphdr *th; uint32_t next_idx; uint8_t l3_off; struct ip *ip; int i; info = &tx->info[idx]; csum_flags = mbuf->m_pkthdr.csum_flags; pkt_len = mbuf->m_pkthdr.len; is_tso = csum_flags & CSUM_TSO; has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0; tso_mss = is_tso ? 
mbuf->m_pkthdr.tso_segsz : 0; eh = mtod(mbuf, struct ether_header *); KASSERT(eh->ether_type != ETHERTYPE_VLAN, ("VLAN-tagged packets not supported")); is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6; l3_off = ETHER_HDR_LEN; mbuf_next = m_getptr(mbuf, l3_off, &offset); if (is_ipv6) { ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset)); l4_off = l3_off + sizeof(struct ip6_hdr); is_tcp = (ip6->ip6_nxt == IPPROTO_TCP); is_udp = (ip6->ip6_nxt == IPPROTO_UDP); mbuf_next = m_getptr(mbuf, l4_off, &offset); } else if (ntohs(eh->ether_type) == ETHERTYPE_IP) { ip = (struct ip *)(mtodo(mbuf_next, offset)); l4_off = l3_off + (ip->ip_hl << 2); is_tcp = (ip->ip_p == IPPROTO_TCP); is_udp = (ip->ip_p == IPPROTO_UDP); mbuf_next = m_getptr(mbuf, l4_off, &offset); } l4_data_off = 0; if (is_tcp) { th = (struct tcphdr *)(mtodo(mbuf_next, offset)); l4_data_off = l4_off + (th->th_off << 2); } else if (is_udp) l4_data_off = l4_off + sizeof(struct udphdr); if (has_csum_flag) { if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0) csum_offset = offsetof(struct tcphdr, th_sum); else csum_offset = offsetof(struct udphdr, uh_sum); } /* * If this packet is neither a TCP nor a UDP packet, the first segment, * the one represented by the packet descriptor, will carry the * spec-stipulated minimum of 182B. */ if (l4_data_off != 0) first_seg_len = l4_data_off; else first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES); bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len); if (__predict_false(!gve_can_tx(tx, bytes_required))) { counter_enter(); counter_u64_add_protected(tx->stats.tx_delayed_pkt_nospace_device, 1); counter_exit(); return (ENOBUFS); } /* So that the cleanup taskqueue can free the mbuf eventually. */ info->mbuf = mbuf; + gve_set_timestamp(&info->enqueue_time_sec); + /* * We don't want to split the header, so if necessary, pad to the end * of the fifo and then put the header at the beginning of the fifo. 
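 *
 * Illustrative example (hypothetical sizes): with a 16384-byte FIFO,
 * head at 16300 and a 100-byte header, the 84-byte tail cannot hold the
 * header, so it is consumed as padding and the header lands at offset 0;
 * the last iovec returned by gve_tx_alloc_fifo then describes the header.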
*/ pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes, &info->iov[0]); KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0")); payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len, &info->iov[payload_iov]); pkt_desc = &tx->desc_ring[idx].pkt; gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off, 1 + mtd_desc_nr + payload_nfrags, first_seg_len, info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset, pkt_len); m_copydata(mbuf, 0, first_seg_len, (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset); gve_dma_sync_for_device(tx->com.qpl, info->iov[hdr_nfrags - 1].iov_offset, info->iov[hdr_nfrags - 1].iov_len); copy_offset = first_seg_len; if (mtd_desc_nr == 1) { next_idx = (tx->req + 1) & tx->mask; mtd_desc = &tx->desc_ring[next_idx].mtd; gve_tx_fill_mtd_desc(mtd_desc, mbuf); } for (i = payload_iov; i < payload_nfrags + payload_iov; i++) { next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask; seg_desc = &tx->desc_ring[next_idx].seg; gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len, info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss); m_copydata(mbuf, copy_offset, info->iov[i].iov_len, (char *)tx->fifo.base + info->iov[i].iov_offset); gve_dma_sync_for_device(tx->com.qpl, info->iov[i].iov_offset, info->iov[i].iov_len); copy_offset += info->iov[i].iov_len; } tx->req += (1 + mtd_desc_nr + payload_nfrags); if (is_tso) { counter_enter(); counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); counter_exit(); } return (0); } static int gve_xmit_mbuf(struct gve_tx_ring *tx, struct mbuf **mbuf) { if (gve_is_gqi(tx->com.priv)) return (gve_xmit(tx, *mbuf)); if (gve_is_qpl(tx->com.priv)) return (gve_xmit_dqo_qpl(tx, *mbuf)); /* * gve_xmit_dqo might attempt to defrag the mbuf chain. * The reference is passed in so that in the case of * errors, the new mbuf chain is what's put back on the br. */ return (gve_xmit_dqo(tx, mbuf)); } /* * Has the side-effect of stopping the xmit queue by setting tx->stopped */ static int gve_xmit_retry_enobuf_mbuf(struct gve_tx_ring *tx, struct mbuf **mbuf) { int err; atomic_store_bool(&tx->stopped, true); /* * Room made in the queue BEFORE the barrier will be seen by the * gve_xmit_mbuf retry below. * * If room is made in the queue AFTER the barrier, the cleanup tq * iteration creating the room will either see a tx->stopped value * of 0 or the 1 we just wrote: * * If it sees a 1, then it would enqueue the xmit tq. Enqueue * implies a retry on the waiting pkt. * * If it sees a 0, then that implies a previous iteration overwrote * our 1, and that iteration would enqueue the xmit tq. Enqueue * implies a retry on the waiting pkt. */ atomic_thread_fence_seq_cst(); err = gve_xmit_mbuf(tx, mbuf); if (err == 0) atomic_store_bool(&tx->stopped, false); return (err); } static void gve_xmit_br(struct gve_tx_ring *tx) { struct gve_priv *priv = tx->com.priv; struct ifnet *ifp = priv->ifp; struct mbuf *mbuf; int err; while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 && (mbuf = drbr_peek(ifp, tx->br)) != NULL) { err = gve_xmit_mbuf(tx, &mbuf); /* * We need to stop this taskqueue when we can't xmit the pkt due * to lack of space in the NIC ring (ENOBUFS). The retry exists * to guard against a TOCTTOU bug that could end up freezing the * queue forever. 
*/ if (__predict_false(mbuf != NULL && err == ENOBUFS)) err = gve_xmit_retry_enobuf_mbuf(tx, &mbuf); if (__predict_false(err != 0 && mbuf != NULL)) { if (err == EINVAL) { drbr_advance(ifp, tx->br); m_freem(mbuf); } else drbr_putback(ifp, tx->br, mbuf); break; } drbr_advance(ifp, tx->br); BPF_MTAP(ifp, mbuf); bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); if (gve_is_gqi(priv)) gve_db_bar_write_4(priv, tx->com.db_offset, tx->req); else gve_db_bar_dqo_write_4(priv, tx->com.db_offset, tx->dqo.desc_tail); } } void gve_xmit_tq(void *arg, int pending) { struct gve_tx_ring *tx = (struct gve_tx_ring *)arg; GVE_RING_LOCK(tx); gve_xmit_br(tx); GVE_RING_UNLOCK(tx); } static bool is_vlan_tagged_pkt(struct mbuf *mbuf) { struct ether_header *eh; eh = mtod(mbuf, struct ether_header *); return (ntohs(eh->ether_type) == ETHERTYPE_VLAN); } int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf) { struct gve_priv *priv = if_getsoftc(ifp); struct gve_tx_ring *tx; bool is_br_empty; int err; uint32_t i; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (ENODEV); if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE) i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues; else i = curcpu % priv->tx_cfg.num_queues; tx = &priv->tx[i]; if (__predict_false(is_vlan_tagged_pkt(mbuf))) { counter_enter(); counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1); counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); counter_exit(); m_freem(mbuf); return (ENODEV); } is_br_empty = drbr_empty(ifp, tx->br); err = drbr_enqueue(ifp, tx->br, mbuf); if (__predict_false(err != 0)) { if (!atomic_load_bool(&tx->stopped)) taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); counter_enter(); counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1); counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); counter_exit(); return (err); } /* * If the mbuf we just enqueued is the only one on the ring, then * transmit it right away in the interests of low latency. */ if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) { gve_xmit_br(tx); GVE_RING_UNLOCK(tx); } else if (!atomic_load_bool(&tx->stopped)) taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); return (0); } void gve_qflush(if_t ifp) { struct gve_priv *priv = if_getsoftc(ifp); struct gve_tx_ring *tx; int i; for (i = 0; i < priv->tx_cfg.num_queues; ++i) { tx = &priv->tx[i]; if (drbr_empty(ifp, tx->br) == 0) { GVE_RING_LOCK(tx); drbr_flush(ifp, tx->br); GVE_RING_UNLOCK(tx); } } if_qflush(ifp); } diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c index 8a1993c3e712..bbf2ee1b0324 100644 --- a/sys/dev/gve/gve_tx_dqo.c +++ b/sys/dev/gve/gve_tx_dqo.c @@ -1,1120 +1,1150 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "opt_inet6.h" #include "gve.h" #include "gve_dqo.h" static void gve_unmap_packet(struct gve_tx_ring *tx, struct gve_tx_pending_pkt_dqo *pending_pkt) { bus_dmamap_sync(tx->dqo.buf_dmatag, pending_pkt->dmamap, BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(tx->dqo.buf_dmatag, pending_pkt->dmamap); } static void gve_clear_qpl_pending_pkt(struct gve_tx_pending_pkt_dqo *pending_pkt) { pending_pkt->qpl_buf_head = -1; pending_pkt->num_qpl_bufs = 0; } static void gve_free_tx_mbufs_dqo(struct gve_tx_ring *tx) { struct gve_tx_pending_pkt_dqo *pending_pkt; int i; for (i = 0; i < tx->dqo.num_pending_pkts; i++) { pending_pkt = &tx->dqo.pending_pkts[i]; if (!pending_pkt->mbuf) continue; if (gve_is_qpl(tx->com.priv)) gve_clear_qpl_pending_pkt(pending_pkt); else gve_unmap_packet(tx, pending_pkt); m_freem(pending_pkt->mbuf); pending_pkt->mbuf = NULL; } } void gve_tx_free_ring_dqo(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; int j; if (tx->dqo.desc_ring != NULL) { gve_dma_free_coherent(&tx->desc_ring_mem); tx->dqo.desc_ring = NULL; } if (tx->dqo.compl_ring != NULL) { gve_dma_free_coherent(&tx->dqo.compl_ring_mem); tx->dqo.compl_ring = NULL; } if (tx->dqo.pending_pkts != NULL) { gve_free_tx_mbufs_dqo(tx); if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) { for (j = 0; j < tx->dqo.num_pending_pkts; j++) if (tx->dqo.pending_pkts[j].state != GVE_PACKET_STATE_UNALLOCATED) bus_dmamap_destroy(tx->dqo.buf_dmatag, tx->dqo.pending_pkts[j].dmamap); } free(tx->dqo.pending_pkts, M_GVE); tx->dqo.pending_pkts = NULL; } if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) bus_dma_tag_destroy(tx->dqo.buf_dmatag); if (gve_is_qpl(priv) && tx->dqo.qpl_bufs != NULL) { free(tx->dqo.qpl_bufs, M_GVE); tx->dqo.qpl_bufs = NULL; } if (com->qpl != NULL) { gve_free_qpl(priv, com->qpl); com->qpl = NULL; } } static int gve_tx_alloc_rda_fields_dqo(struct gve_tx_ring *tx) { struct gve_priv *priv = tx->com.priv; int err; int j; /* * DMA tag for mapping Tx mbufs * The maxsize, nsegments, and maxsegsize params should match * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c. 
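 * maxsize bounds an entire TSO burst, nsegments bounds the scatter/gather
 * list for one mbuf chain, and maxsegsize bounds the buffer described by
 * a single data descriptor.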
*/ err = bus_dma_tag_create( bus_get_dma_tag(priv->dev), /* parent */ 1, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ GVE_TSO_MAXSIZE_DQO, /* maxsize */ GVE_TX_MAX_DATA_DESCS_DQO, /* nsegments */ GVE_TX_MAX_BUF_SIZE_DQO, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &tx->dqo.buf_dmatag); if (err != 0) { device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); return (err); } for (j = 0; j < tx->dqo.num_pending_pkts; j++) { err = bus_dmamap_create(tx->dqo.buf_dmatag, 0, &tx->dqo.pending_pkts[j].dmamap); if (err != 0) { device_printf(priv->dev, "err in creating pending pkt dmamap %d: %d", j, err); return (err); } tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; } return (0); } int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; uint16_t num_pending_pkts; int err; /* Descriptor ring */ err = gve_dma_alloc_coherent(priv, sizeof(union gve_tx_desc_dqo) * priv->tx_desc_cnt, CACHE_LINE_SIZE, &tx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i); goto abort; } tx->dqo.desc_ring = tx->desc_ring_mem.cpu_addr; /* Completion ring */ err = gve_dma_alloc_coherent(priv, sizeof(struct gve_tx_compl_desc_dqo) * priv->tx_desc_cnt, CACHE_LINE_SIZE, &tx->dqo.compl_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc compl ring for tx ring %d", i); goto abort; } tx->dqo.compl_ring = tx->dqo.compl_ring_mem.cpu_addr; /* * pending_pkts array * * The max number of pending packets determines the maximum number of * descriptors which maybe written to the completion queue. * * We must set the number small enough to make sure we never overrun the * completion queue. */ num_pending_pkts = priv->tx_desc_cnt; /* * Reserve space for descriptor completions, which will be reported at * most every GVE_TX_MIN_RE_INTERVAL packets. 
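 *
 * Illustrative arithmetic (hypothetical values): with 1024 descriptors
 * and a reporting interval of 32, 1024 / 32 = 32 slots are reserved,
 * leaving 992 usable pending-packet completion tags.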
*/ num_pending_pkts -= num_pending_pkts / GVE_TX_MIN_RE_INTERVAL; tx->dqo.num_pending_pkts = num_pending_pkts; tx->dqo.pending_pkts = malloc( sizeof(struct gve_tx_pending_pkt_dqo) * num_pending_pkts, M_GVE, M_WAITOK | M_ZERO); if (gve_is_qpl(priv)) { int qpl_buf_cnt; tx->com.qpl = gve_alloc_qpl(priv, i, GVE_TX_NUM_QPL_PAGES_DQO, /*single_kva*/false); if (tx->com.qpl == NULL) { device_printf(priv->dev, "Failed to alloc QPL for tx ring %d", i); err = ENOMEM; goto abort; } qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * tx->com.qpl->num_pages; tx->dqo.qpl_bufs = malloc( sizeof(*tx->dqo.qpl_bufs) * qpl_buf_cnt, M_GVE, M_WAITOK | M_ZERO); } else gve_tx_alloc_rda_fields_dqo(tx); return (0); abort: gve_tx_free_ring_dqo(priv, i); return (err); } static void gve_extract_tx_metadata_dqo(const struct mbuf *mbuf, struct gve_tx_metadata_dqo *metadata) { uint32_t hash = mbuf->m_pkthdr.flowid; uint16_t path_hash; metadata->version = GVE_TX_METADATA_VERSION_DQO; if (hash) { path_hash = hash ^ (hash >> 16); path_hash &= (1 << 15) - 1; if (__predict_false(path_hash == 0)) path_hash = ~path_hash; metadata->path_hash = path_hash; } } static void gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, uint32_t *desc_idx, uint32_t len, uint64_t addr, int16_t compl_tag, bool eop, bool csum_enabled) { while (len > 0) { struct gve_tx_pkt_desc_dqo *desc = &tx->dqo.desc_ring[*desc_idx].pkt; uint32_t cur_len = MIN(len, GVE_TX_MAX_BUF_SIZE_DQO); bool cur_eop = eop && cur_len == len; *desc = (struct gve_tx_pkt_desc_dqo){ .buf_addr = htole64(addr), .dtype = GVE_TX_PKT_DESC_DTYPE_DQO, .end_of_packet = cur_eop, .checksum_offload_enable = csum_enabled, .compl_tag = htole16(compl_tag), .buf_size = cur_len, }; addr += cur_len; len -= cur_len; *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; } } static void gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, const struct mbuf *mbuf, const struct gve_tx_metadata_dqo *metadata, int header_len) { *desc = (struct gve_tx_tso_context_desc_dqo){ .header_len = header_len, .cmd_dtype = { .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, .tso = 1, }, .flex0 = metadata->bytes[0], .flex5 = metadata->bytes[5], .flex6 = metadata->bytes[6], .flex7 = metadata->bytes[7], .flex8 = metadata->bytes[8], .flex9 = metadata->bytes[9], .flex10 = metadata->bytes[10], .flex11 = metadata->bytes[11], }; desc->tso_total_len = mbuf->m_pkthdr.len - header_len; desc->mss = mbuf->m_pkthdr.tso_segsz; } static void gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, const struct gve_tx_metadata_dqo *metadata) { *desc = (struct gve_tx_general_context_desc_dqo){ .flex0 = metadata->bytes[0], .flex1 = metadata->bytes[1], .flex2 = metadata->bytes[2], .flex3 = metadata->bytes[3], .flex4 = metadata->bytes[4], .flex5 = metadata->bytes[5], .flex6 = metadata->bytes[6], .flex7 = metadata->bytes[7], .flex8 = metadata->bytes[8], .flex9 = metadata->bytes[9], .flex10 = metadata->bytes[10], .flex11 = metadata->bytes[11], .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, }; } #define PULLUP_HDR(m, len) \ do { \ if (__predict_false((m)->m_len < (len))) { \ (m) = m_pullup((m), (len)); \ if ((m) == NULL) \ return (EINVAL); \ } \ } while (0) static int gve_prep_tso(struct mbuf *mbuf, int *header_len) { uint8_t l3_off, l4_off = 0; struct ether_header *eh; struct tcphdr *th; u_short csum; PULLUP_HDR(mbuf, sizeof(*eh)); eh = mtod(mbuf, struct ether_header *); KASSERT(eh->ether_type != ETHERTYPE_VLAN, ("VLAN-tagged packets not supported")); l3_off = ETHER_HDR_LEN; #ifdef INET6 if (ntohs(eh->ether_type) == 
ETHERTYPE_IPV6) { struct ip6_hdr *ip6; PULLUP_HDR(mbuf, l3_off + sizeof(*ip6)); ip6 = (struct ip6_hdr *)(mtodo(mbuf, l3_off)); l4_off = l3_off + sizeof(struct ip6_hdr); csum = in6_cksum_pseudo(ip6, /*len=*/0, IPPROTO_TCP, /*csum=*/0); } else #endif if (ntohs(eh->ether_type) == ETHERTYPE_IP) { struct ip *ip; PULLUP_HDR(mbuf, l3_off + sizeof(*ip)); ip = (struct ip *)(mtodo(mbuf, l3_off)); l4_off = l3_off + (ip->ip_hl << 2); csum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(IPPROTO_TCP)); } PULLUP_HDR(mbuf, l4_off + sizeof(struct tcphdr *)); th = (struct tcphdr *)(mtodo(mbuf, l4_off)); *header_len = l4_off + (th->th_off << 2); /* * Hardware requires the th->th_sum to not include the TCP payload, * hence we recompute the csum with it excluded. */ th->th_sum = csum; return (0); } static int gve_tx_fill_ctx_descs(struct gve_tx_ring *tx, struct mbuf *mbuf, bool is_tso, uint32_t *desc_idx) { struct gve_tx_general_context_desc_dqo *gen_desc; struct gve_tx_tso_context_desc_dqo *tso_desc; struct gve_tx_metadata_dqo metadata; int header_len; int err; metadata = (struct gve_tx_metadata_dqo){0}; gve_extract_tx_metadata_dqo(mbuf, &metadata); if (is_tso) { err = gve_prep_tso(mbuf, &header_len); if (__predict_false(err)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_tsoerr, 1); counter_exit(); return (err); } tso_desc = &tx->dqo.desc_ring[*desc_idx].tso_ctx; gve_tx_fill_tso_ctx_desc(tso_desc, mbuf, &metadata, header_len); *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; counter_enter(); counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); counter_exit(); } gen_desc = &tx->dqo.desc_ring[*desc_idx].general_ctx; gve_tx_fill_general_ctx_desc(gen_desc, &metadata); *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; return (0); } static int gve_map_mbuf_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf, bus_dmamap_t dmamap, bus_dma_segment_t *segs, int *nsegs, int attempt) { struct mbuf *m_new = NULL; int err; err = bus_dmamap_load_mbuf_sg(tx->dqo.buf_dmatag, dmamap, *mbuf, segs, nsegs, BUS_DMA_NOWAIT); switch (err) { case __predict_true(0): break; case EFBIG: if (__predict_false(attempt > 0)) goto abort; counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_collapse, 1); counter_exit(); /* Try m_collapse before m_defrag */ m_new = m_collapse(*mbuf, M_NOWAIT, GVE_TX_MAX_DATA_DESCS_DQO); if (m_new == NULL) { counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_defrag, 1); counter_exit(); m_new = m_defrag(*mbuf, M_NOWAIT); } if (__predict_false(m_new == NULL)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_defrag_err, 1); counter_exit(); m_freem(*mbuf); *mbuf = NULL; err = ENOMEM; goto abort; } else { *mbuf = m_new; return (gve_map_mbuf_dqo(tx, mbuf, dmamap, segs, nsegs, ++attempt)); } case ENOMEM: counter_enter(); counter_u64_add_protected( tx->stats.tx_mbuf_dmamap_enomem_err, 1); counter_exit(); goto abort; default: goto abort; } return (0); abort: counter_enter(); counter_u64_add_protected(tx->stats.tx_mbuf_dmamap_err, 1); counter_exit(); return (err); } static uint32_t num_avail_desc_ring_slots(const struct gve_tx_ring *tx) { uint32_t num_used = (tx->dqo.desc_tail - tx->dqo.desc_head) & tx->dqo.desc_mask; return (tx->dqo.desc_mask - num_used); } static struct gve_tx_pending_pkt_dqo * gve_alloc_pending_packet(struct gve_tx_ring *tx) { int32_t index = tx->dqo.free_pending_pkts_csm; struct gve_tx_pending_pkt_dqo *pending_pkt; /* * No pending packets available in the consumer list, * try to steal the producer list. 
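 *
 * The consumer list is only ever touched from this transmit path, while
 * freed entries are pushed onto the producer list with atomics; swapping
 * the producer head with -1 claims that whole chain in one step without
 * taking a lock.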
*/ if (__predict_false(index == -1)) { tx->dqo.free_pending_pkts_csm = atomic_swap_32( &tx->dqo.free_pending_pkts_prd, -1); index = tx->dqo.free_pending_pkts_csm; if (__predict_false(index == -1)) return (NULL); } pending_pkt = &tx->dqo.pending_pkts[index]; /* Remove pending_pkt from the consumer list */ tx->dqo.free_pending_pkts_csm = pending_pkt->next; pending_pkt->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; + gve_set_timestamp(&pending_pkt->enqueue_time_sec); + return (pending_pkt); } static void gve_free_pending_packet(struct gve_tx_ring *tx, struct gve_tx_pending_pkt_dqo *pending_pkt) { int index = pending_pkt - tx->dqo.pending_pkts; int32_t old_head; pending_pkt->state = GVE_PACKET_STATE_FREE; + gve_invalidate_timestamp(&pending_pkt->enqueue_time_sec); + /* Add pending_pkt to the producer list */ while (true) { old_head = atomic_load_acq_32(&tx->dqo.free_pending_pkts_prd); pending_pkt->next = old_head; if (atomic_cmpset_32(&tx->dqo.free_pending_pkts_prd, old_head, index)) break; } } /* * Has the side-effect of retrieving the value of the last desc index * processed by the NIC. hw_tx_head is written to by the completions-processing * taskqueue upon receiving descriptor-completions. */ static bool gve_tx_has_desc_room_dqo(struct gve_tx_ring *tx, int needed_descs) { if (needed_descs <= num_avail_desc_ring_slots(tx)) return (true); tx->dqo.desc_head = atomic_load_acq_32(&tx->dqo.hw_tx_head); if (needed_descs > num_avail_desc_ring_slots(tx)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_descring, 1); counter_exit(); return (false); } return (0); } static void gve_tx_request_desc_compl(struct gve_tx_ring *tx, uint32_t desc_idx) { uint32_t last_report_event_interval; uint32_t last_desc_idx; last_desc_idx = (desc_idx - 1) & tx->dqo.desc_mask; last_report_event_interval = (last_desc_idx - tx->dqo.last_re_idx) & tx->dqo.desc_mask; if (__predict_false(last_report_event_interval >= GVE_TX_MIN_RE_INTERVAL)) { tx->dqo.desc_ring[last_desc_idx].pkt.report_event = true; tx->dqo.last_re_idx = last_desc_idx; } } static bool gve_tx_have_enough_qpl_bufs(struct gve_tx_ring *tx, int num_bufs) { uint32_t available = tx->dqo.qpl_bufs_produced_cached - tx->dqo.qpl_bufs_consumed; if (__predict_true(available >= num_bufs)) return (true); tx->dqo.qpl_bufs_produced_cached = atomic_load_acq_32( &tx->dqo.qpl_bufs_produced); available = tx->dqo.qpl_bufs_produced_cached - tx->dqo.qpl_bufs_consumed; if (__predict_true(available >= num_bufs)) return (true); return (false); } static int32_t gve_tx_alloc_qpl_buf(struct gve_tx_ring *tx) { int32_t buf = tx->dqo.free_qpl_bufs_csm; if (__predict_false(buf == -1)) { tx->dqo.free_qpl_bufs_csm = atomic_swap_32( &tx->dqo.free_qpl_bufs_prd, -1); buf = tx->dqo.free_qpl_bufs_csm; if (__predict_false(buf == -1)) return (-1); } tx->dqo.free_qpl_bufs_csm = tx->dqo.qpl_bufs[buf]; tx->dqo.qpl_bufs_consumed++; return (buf); } /* * Tx buffer i corresponds to * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO */ static void gve_tx_buf_get_addr_dqo(struct gve_tx_ring *tx, int32_t index, void **va, bus_addr_t *dma_addr) { int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) << GVE_TX_BUF_SHIFT_DQO; *va = (char *)tx->com.qpl->dmas[page_id].cpu_addr + offset; *dma_addr = tx->com.qpl->dmas[page_id].bus_addr + offset; } static struct gve_dma_handle * gve_get_page_dma_handle(struct gve_tx_ring *tx, int32_t index) { int page_id = index >> 
(PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); return (&tx->com.qpl->dmas[page_id]); } static void gve_tx_copy_mbuf_and_write_pkt_descs(struct gve_tx_ring *tx, struct mbuf *mbuf, struct gve_tx_pending_pkt_dqo *pkt, bool csum_enabled, int16_t completion_tag, uint32_t *desc_idx) { int32_t pkt_len = mbuf->m_pkthdr.len; struct gve_dma_handle *dma; uint32_t copy_offset = 0; int32_t prev_buf = -1; uint32_t copy_len; bus_addr_t addr; int32_t buf; void *va; MPASS(pkt->num_qpl_bufs == 0); MPASS(pkt->qpl_buf_head == -1); while (copy_offset < pkt_len) { buf = gve_tx_alloc_qpl_buf(tx); /* We already checked for availability */ MPASS(buf != -1); gve_tx_buf_get_addr_dqo(tx, buf, &va, &addr); copy_len = MIN(GVE_TX_BUF_SIZE_DQO, pkt_len - copy_offset); m_copydata(mbuf, copy_offset, copy_len, va); copy_offset += copy_len; dma = gve_get_page_dma_handle(tx, buf); bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); gve_tx_fill_pkt_desc_dqo(tx, desc_idx, copy_len, addr, completion_tag, /*eop=*/copy_offset == pkt_len, csum_enabled); /* Link all the qpl bufs for a packet */ if (prev_buf == -1) pkt->qpl_buf_head = buf; else tx->dqo.qpl_bufs[prev_buf] = buf; prev_buf = buf; pkt->num_qpl_bufs++; } tx->dqo.qpl_bufs[buf] = -1; } int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf) { uint32_t desc_idx = tx->dqo.desc_tail; struct gve_tx_pending_pkt_dqo *pkt; int total_descs_needed; int16_t completion_tag; bool has_csum_flag; int csum_flags; bool is_tso; int nsegs; int err; csum_flags = mbuf->m_pkthdr.csum_flags; has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); is_tso = csum_flags & CSUM_TSO; nsegs = howmany(mbuf->m_pkthdr.len, GVE_TX_BUF_SIZE_DQO); /* Check if we have enough room in the desc ring */ total_descs_needed = 1 + /* general_ctx_desc */ nsegs + /* pkt_desc */ (is_tso ? 1 : 0); /* tso_ctx_desc */ if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) return (ENOBUFS); if (!gve_tx_have_enough_qpl_bufs(tx, nsegs)) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_qpl_bufs, 1); counter_exit(); return (ENOBUFS); } pkt = gve_alloc_pending_packet(tx); if (pkt == NULL) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_compring, 1); counter_exit(); return (ENOBUFS); } completion_tag = pkt - tx->dqo.pending_pkts; pkt->mbuf = mbuf; err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); if (err) goto abort; gve_tx_copy_mbuf_and_write_pkt_descs(tx, mbuf, pkt, has_csum_flag, completion_tag, &desc_idx); /* Remember the index of the last desc written */ tx->dqo.desc_tail = desc_idx; /* * Request a descriptor completion on the last descriptor of the * packet if we are allowed to by the HW enforced interval. */ gve_tx_request_desc_compl(tx, desc_idx); tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ return (0); abort: pkt->mbuf = NULL; gve_free_pending_packet(tx, pkt); return (err); } int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr) { bus_dma_segment_t segs[GVE_TX_MAX_DATA_DESCS_DQO]; uint32_t desc_idx = tx->dqo.desc_tail; struct gve_tx_pending_pkt_dqo *pkt; struct mbuf *mbuf = *mbuf_ptr; int total_descs_needed; int16_t completion_tag; bool has_csum_flag; int csum_flags; bool is_tso; int nsegs; int err; int i; csum_flags = mbuf->m_pkthdr.csum_flags; has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); is_tso = csum_flags & CSUM_TSO; /* * This mbuf might end up needing more than 1 pkt desc. 
* The actual number, `nsegs` is known only after the * expensive gve_map_mbuf_dqo call. This check beneath * exists to fail early when the desc ring is really full. */ total_descs_needed = 1 + /* general_ctx_desc */ 1 + /* pkt_desc */ (is_tso ? 1 : 0); /* tso_ctx_desc */ if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) return (ENOBUFS); pkt = gve_alloc_pending_packet(tx); if (pkt == NULL) { counter_enter(); counter_u64_add_protected( tx->stats.tx_delayed_pkt_nospace_compring, 1); counter_exit(); return (ENOBUFS); } completion_tag = pkt - tx->dqo.pending_pkts; err = gve_map_mbuf_dqo(tx, mbuf_ptr, pkt->dmamap, segs, &nsegs, /*attempt=*/0); if (err) goto abort; mbuf = *mbuf_ptr; /* gve_map_mbuf_dqo might replace the mbuf chain */ pkt->mbuf = mbuf; total_descs_needed = 1 + /* general_ctx_desc */ nsegs + /* pkt_desc */ (is_tso ? 1 : 0); /* tso_ctx_desc */ if (__predict_false( !gve_tx_has_desc_room_dqo(tx, total_descs_needed))) { err = ENOBUFS; goto abort_with_dma; } err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); if (err) goto abort_with_dma; bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE); for (i = 0; i < nsegs; i++) { gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, segs[i].ds_len, segs[i].ds_addr, completion_tag, /*eop=*/i == (nsegs - 1), has_csum_flag); } /* Remember the index of the last desc written */ tx->dqo.desc_tail = desc_idx; /* * Request a descriptor completion on the last descriptor of the * packet if we are allowed to by the HW enforced interval. */ gve_tx_request_desc_compl(tx, desc_idx); tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ return (0); abort_with_dma: gve_unmap_packet(tx, pkt); abort: pkt->mbuf = NULL; gve_free_pending_packet(tx, pkt); return (err); } static void gve_reap_qpl_bufs_dqo(struct gve_tx_ring *tx, struct gve_tx_pending_pkt_dqo *pkt) { int32_t buf = pkt->qpl_buf_head; struct gve_dma_handle *dma; int32_t qpl_buf_tail; int32_t old_head; int i; for (i = 0; i < pkt->num_qpl_bufs; i++) { dma = gve_get_page_dma_handle(tx, buf); bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTWRITE); qpl_buf_tail = buf; buf = tx->dqo.qpl_bufs[buf]; } MPASS(buf == -1); buf = qpl_buf_tail; while (true) { old_head = atomic_load_32(&tx->dqo.free_qpl_bufs_prd); tx->dqo.qpl_bufs[buf] = old_head; /* * The "rel" ensures that the update to dqo.free_qpl_bufs_prd * is visible only after the linked list from this pkt is * attached above to old_head. */ if (atomic_cmpset_rel_32(&tx->dqo.free_qpl_bufs_prd, old_head, pkt->qpl_buf_head)) break; } /* * The "rel" ensures that the update to dqo.qpl_bufs_produced is * visible only after the update to dqo.free_qpl_bufs_prd above. */ atomic_add_rel_32(&tx->dqo.qpl_bufs_produced, pkt->num_qpl_bufs); gve_clear_qpl_pending_pkt(pkt); } static uint64_t gve_handle_packet_completion(struct gve_priv *priv, struct gve_tx_ring *tx, uint16_t compl_tag) { struct gve_tx_pending_pkt_dqo *pending_pkt; int32_t pkt_len; if (__predict_false(compl_tag >= tx->dqo.num_pending_pkts)) { device_printf(priv->dev, "Invalid TX completion tag: %d\n", compl_tag); return (0); } pending_pkt = &tx->dqo.pending_pkts[compl_tag]; /* Packet is allocated but not pending data completion.
*/ if (__predict_false(pending_pkt->state != GVE_PACKET_STATE_PENDING_DATA_COMPL)) { device_printf(priv->dev, "No pending data completion: %d\n", compl_tag); return (0); } pkt_len = pending_pkt->mbuf->m_pkthdr.len; if (gve_is_qpl(priv)) gve_reap_qpl_bufs_dqo(tx, pending_pkt); else gve_unmap_packet(tx, pending_pkt); m_freem(pending_pkt->mbuf); pending_pkt->mbuf = NULL; gve_free_pending_packet(tx, pending_pkt); return (pkt_len); } +int +gve_check_tx_timeout_dqo(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + struct gve_tx_pending_pkt_dqo *pending_pkt; + int num_timeouts; + uint16_t pkt_idx; + + num_timeouts = 0; + for (pkt_idx = 0; pkt_idx < tx->dqo.num_pending_pkts; pkt_idx++) { + pending_pkt = &tx->dqo.pending_pkts[pkt_idx]; + + if (!gve_timestamp_valid(&pending_pkt->enqueue_time_sec)) + continue; + + if (__predict_false( + gve_seconds_since(&pending_pkt->enqueue_time_sec) > + GVE_TX_TIMEOUT_PKT_SEC)) + num_timeouts += 1; + } + + return (num_timeouts); +} + int gve_tx_intr_dqo(void *arg) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; struct gve_ring_com *com = &tx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); /* Interrupts are automatically masked */ taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); return (FILTER_HANDLED); } static void gve_tx_clear_desc_ring_dqo(struct gve_tx_ring *tx) { struct gve_ring_com *com = &tx->com; int i; - for (i = 0; i < com->priv->tx_desc_cnt; i++) + for (i = 0; i < com->priv->tx_desc_cnt; i++) { tx->dqo.desc_ring[i] = (union gve_tx_desc_dqo){}; + gve_invalidate_timestamp( + &tx->dqo.pending_pkts[i].enqueue_time_sec); + } bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_tx_clear_compl_ring_dqo(struct gve_tx_ring *tx) { struct gve_ring_com *com = &tx->com; int entries; int i; entries = com->priv->tx_desc_cnt; for (i = 0; i < entries; i++) tx->dqo.compl_ring[i] = (struct gve_tx_compl_desc_dqo){}; bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map, BUS_DMASYNC_PREWRITE); } void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; int j; tx->dqo.desc_head = 0; tx->dqo.desc_tail = 0; tx->dqo.desc_mask = priv->tx_desc_cnt - 1; tx->dqo.last_re_idx = 0; tx->dqo.compl_head = 0; tx->dqo.compl_mask = priv->tx_desc_cnt - 1; atomic_store_32(&tx->dqo.hw_tx_head, 0); tx->dqo.cur_gen_bit = 0; gve_free_tx_mbufs_dqo(tx); for (j = 0; j < tx->dqo.num_pending_pkts; j++) { if (gve_is_qpl(tx->com.priv)) gve_clear_qpl_pending_pkt(&tx->dqo.pending_pkts[j]); tx->dqo.pending_pkts[j].next = (j == tx->dqo.num_pending_pkts - 1) ? -1 : j + 1; tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; } tx->dqo.free_pending_pkts_csm = 0; atomic_store_rel_32(&tx->dqo.free_pending_pkts_prd, -1); if (gve_is_qpl(priv)) { int qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * tx->com.qpl->num_pages; for (j = 0; j < qpl_buf_cnt - 1; j++) tx->dqo.qpl_bufs[j] = j + 1; tx->dqo.qpl_bufs[j] = -1; tx->dqo.free_qpl_bufs_csm = 0; atomic_store_32(&tx->dqo.free_qpl_bufs_prd, -1); atomic_store_32(&tx->dqo.qpl_bufs_produced, qpl_buf_cnt); tx->dqo.qpl_bufs_produced_cached = qpl_buf_cnt; tx->dqo.qpl_bufs_consumed = 0; } gve_tx_clear_desc_ring_dqo(tx); gve_tx_clear_compl_ring_dqo(tx); } static uint8_t gve_tx_get_gen_bit(uint8_t *desc) { uint8_t byte; /* * Prevent generation bit from being read after the rest of the * descriptor. 
*/ byte = atomic_load_acq_8(desc + GVE_TX_DESC_DQO_GEN_BYTE_OFFSET); return ((byte & GVE_TX_DESC_DQO_GEN_BIT_MASK) != 0); } static bool gve_tx_cleanup_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, int budget) { struct gve_tx_compl_desc_dqo *compl_desc; uint64_t bytes_done = 0; uint64_t pkts_done = 0; uint16_t compl_tag; int work_done = 0; uint16_t tx_head; uint16_t type; while (work_done < budget) { bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map, BUS_DMASYNC_POSTREAD); compl_desc = &tx->dqo.compl_ring[tx->dqo.compl_head]; if (gve_tx_get_gen_bit((uint8_t *)compl_desc) == tx->dqo.cur_gen_bit) break; type = compl_desc->type; if (type == GVE_COMPL_TYPE_DQO_DESC) { /* This is the last descriptor fetched by HW plus one */ tx_head = le16toh(compl_desc->tx_head); atomic_store_rel_32(&tx->dqo.hw_tx_head, tx_head); } else if (type == GVE_COMPL_TYPE_DQO_PKT) { compl_tag = le16toh(compl_desc->completion_tag); bytes_done += gve_handle_packet_completion(priv, tx, compl_tag); pkts_done++; } tx->dqo.compl_head = (tx->dqo.compl_head + 1) & tx->dqo.compl_mask; /* Flip the generation bit when we wrap around */ tx->dqo.cur_gen_bit ^= tx->dqo.compl_head == 0; work_done++; } /* * Waking the xmit taskqueue has to occur after room has been made in * the queue. */ atomic_thread_fence_seq_cst(); if (atomic_load_bool(&tx->stopped) && work_done) { atomic_store_bool(&tx->stopped, false); taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); } tx->done += work_done; /* tx->done is just a sysctl counter */ counter_enter(); counter_u64_add_protected(tx->stats.tbytes, bytes_done); counter_u64_add_protected(tx->stats.tpackets, pkts_done); counter_exit(); return (work_done == budget); } void gve_tx_cleanup_tq_dqo(void *arg, int pending) { struct gve_tx_ring *tx = arg; struct gve_priv *priv = tx->com.priv; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; if (gve_tx_cleanup_dqo(priv, tx, /*budget=*/1024)) { taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); return; } gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset, GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); } diff --git a/sys/dev/gve/gve_utils.c b/sys/dev/gve/gve_utils.c index 4e9dd4625e2f..707b8f039d88 100644 --- a/sys/dev/gve/gve_utils.c +++ b/sys/dev/gve/gve_utils.c @@ -1,441 +1,484 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include "gve.h" #include "gve_dqo.h" uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset) { return (be32toh(bus_read_4(priv->reg_bar, offset))); } void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) { bus_write_4(priv->reg_bar, offset, htobe32(val)); } void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) { bus_write_4(priv->db_bar, offset, htobe32(val)); } void gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) { bus_write_4(priv->db_bar, offset, val); } void gve_alloc_counters(counter_u64_t *stat, int num_stats) { int i; for (i = 0; i < num_stats; i++) stat[i] = counter_u64_alloc(M_WAITOK); } void gve_free_counters(counter_u64_t *stat, int num_stats) { int i; for (i = 0; i < num_stats; i++) counter_u64_free(stat[i]); } /* Currently assumes a single segment. */ static void gve_dmamap_load_callback(void *arg, bus_dma_segment_t *segs, int nseg, int error) { if (error == 0) *(bus_addr_t *) arg = segs[0].ds_addr; } int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma) { int err; device_t dev = priv->dev; err = bus_dma_tag_create( bus_get_dma_tag(dev), /* parent */ align, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->tag); if (err != 0) { device_printf(dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); goto clear_tag; } err = bus_dmamem_alloc(dma->tag, (void **) &dma->cpu_addr, BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, &dma->map); if (err != 0) { device_printf(dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", __func__, (uintmax_t)size, err); goto destroy_tag; } /* An address set by the callback will never be -1 */ dma->bus_addr = (bus_addr_t)-1; err = bus_dmamap_load(dma->tag, dma->map, dma->cpu_addr, size, gve_dmamap_load_callback, &dma->bus_addr, BUS_DMA_NOWAIT); if (err != 0 || dma->bus_addr == (bus_addr_t)-1) { device_printf(dev, "%s: bus_dmamap_load failed: %d\n", __func__, err); goto free_mem; } return (0); free_mem: bus_dmamem_free(dma->tag, dma->cpu_addr, dma->map); destroy_tag: bus_dma_tag_destroy(dma->tag); clear_tag: dma->tag = NULL; return (err); } void gve_dma_free_coherent(struct gve_dma_handle *dma) { bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->tag, dma->map); bus_dmamem_free(dma->tag, dma->cpu_addr, dma->map); bus_dma_tag_destroy(dma->tag); } int gve_dmamap_create(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma) { int err; device_t dev = priv->dev; err = bus_dma_tag_create( bus_get_dma_tag(dev), /* parent */ align, 0, /* alignment, bounds */ BUS_SPACE_MAXADDR, /* lowaddr */ BUS_SPACE_MAXADDR, /* highaddr */ NULL, NULL, /* filter, filterarg */ size, /* maxsize */ 1, /* nsegments */ size, /* 
maxsegsize */ BUS_DMA_ALLOCNOW, /* flags */ NULL, /* lockfunc */ NULL, /* lockarg */ &dma->tag); if (err != 0) { device_printf(dev, "%s: bus_dma_tag_create failed: %d\n", __func__, err); goto clear_tag; } err = bus_dmamap_create(dma->tag, BUS_DMA_COHERENT, &dma->map); if (err != 0) { device_printf(dev, "%s: bus_dmamap_create failed: %d\n", __func__, err); goto destroy_tag; } /* An address set by the callback will never be -1 */ dma->bus_addr = (bus_addr_t)-1; err = bus_dmamap_load(dma->tag, dma->map, dma->cpu_addr, size, gve_dmamap_load_callback, &dma->bus_addr, BUS_DMA_WAITOK); if (err != 0 || dma->bus_addr == (bus_addr_t)-1) { device_printf(dev, "%s: bus_dmamap_load failed: %d\n", __func__, err); goto destroy_map; } return (0); destroy_map: bus_dmamap_destroy(dma->tag, dma->map); destroy_tag: bus_dma_tag_destroy(dma->tag); clear_tag: dma->tag = NULL; return (err); } void gve_dmamap_destroy(struct gve_dma_handle *dma) { bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(dma->tag, dma->map); bus_dmamap_destroy(dma->tag, dma->map); bus_dma_tag_destroy(dma->tag); } static int gve_mgmnt_intr(void *arg) { struct gve_priv *priv = arg; taskqueue_enqueue(priv->service_tq, &priv->service_task); return (FILTER_HANDLED); } void gve_free_irqs(struct gve_priv *priv) { struct gve_irq *irq; int num_irqs; int rid; int rc; int i; if (priv->irq_tbl == NULL) { device_printf(priv->dev, "No irq table, nothing to free\n"); return; } num_irqs = priv->tx_cfg.max_queues + priv->rx_cfg.max_queues + 1; for (i = 0; i < num_irqs; i++) { irq = &priv->irq_tbl[i]; if (irq->res == NULL) continue; rid = rman_get_rid(irq->res); rc = bus_teardown_intr(priv->dev, irq->res, irq->cookie); if (rc != 0) device_printf(priv->dev, "Failed to teardown irq num %d\n", rid); rc = bus_release_resource(priv->dev, SYS_RES_IRQ, rid, irq->res); if (rc != 0) device_printf(priv->dev, "Failed to release irq num %d\n", rid); irq->res = NULL; irq->cookie = NULL; } free(priv->irq_tbl, M_GVE); priv->irq_tbl = NULL; /* Safe to call even if msix was never alloced */ pci_release_msi(priv->dev); } int gve_alloc_irqs(struct gve_priv *priv) { int num_tx = priv->tx_cfg.max_queues; int num_rx = priv->rx_cfg.max_queues; int req_nvecs = num_tx + num_rx + 1; int got_nvecs = req_nvecs; struct gve_irq *irq; int i, j, m; int rid; int err; struct gve_ring_com *com; struct gve_rx_ring *rx; struct gve_tx_ring *tx; if (pci_alloc_msix(priv->dev, &got_nvecs) != 0) { device_printf(priv->dev, "Failed to acquire any msix vectors\n"); err = ENXIO; goto abort; } else if (got_nvecs != req_nvecs) { device_printf(priv->dev, "Tried to acquire %d msix vectors, got only %d\n", req_nvecs, got_nvecs); err = ENOSPC; goto abort; } if (bootverbose) device_printf(priv->dev, "Enabled MSIX with %d vectors\n", got_nvecs); priv->irq_tbl = malloc(sizeof(struct gve_irq) * req_nvecs, M_GVE, M_WAITOK | M_ZERO); for (i = 0; i < num_tx; i++) { irq = &priv->irq_tbl[i]; tx = &priv->tx[i]; com = &tx->com; rid = i + 1; irq->res = bus_alloc_resource_any(priv->dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (irq->res == NULL) { device_printf(priv->dev, "Failed to alloc irq %d for Tx queue %d\n", rid, i); err = ENOMEM; goto abort; } err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, gve_is_gqi(priv) ? 
gve_tx_intr : gve_tx_intr_dqo, NULL, &priv->tx[i], &irq->cookie); if (err != 0) { device_printf(priv->dev, "Failed to setup irq %d for Tx queue %d, " "err: %d\n", rid, i, err); goto abort; } bus_describe_intr(priv->dev, irq->res, irq->cookie, "tx%d", i); com->ntfy_id = i; } for (j = 0; j < num_rx; j++) { irq = &priv->irq_tbl[i + j]; rx = &priv->rx[j]; com = &rx->com; rid = i + j + 1; irq->res = bus_alloc_resource_any(priv->dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (irq->res == NULL) { device_printf(priv->dev, "Failed to alloc irq %d for Rx queue %d", rid, j); err = ENOMEM; goto abort; } err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, gve_is_gqi(priv) ? gve_rx_intr : gve_rx_intr_dqo, NULL, &priv->rx[j], &irq->cookie); if (err != 0) { device_printf(priv->dev, "Failed to setup irq %d for Rx queue %d, " "err: %d\n", rid, j, err); goto abort; } bus_describe_intr(priv->dev, irq->res, irq->cookie, "rx%d", j); com->ntfy_id = i + j; } m = i + j; rid = m + 1; irq = &priv->irq_tbl[m]; irq->res = bus_alloc_resource_any(priv->dev, SYS_RES_IRQ, &rid, RF_ACTIVE); if (irq->res == NULL) { device_printf(priv->dev, "Failed to allocate irq %d for mgmnt queue\n", rid); err = ENOMEM; goto abort; } err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, gve_mgmnt_intr, NULL, priv, &irq->cookie); if (err != 0) { device_printf(priv->dev, "Failed to setup irq %d for mgmnt queue, err: %d\n", rid, err); goto abort; } bus_describe_intr(priv->dev, irq->res, irq->cookie, "mgmnt"); return (0); abort: gve_free_irqs(priv); return (err); } /* * Builds register value to write to DQO IRQ doorbell to enable with specified * ITR interval. */ static uint32_t gve_setup_itr_interval_dqo(uint32_t interval_us) { uint32_t result = GVE_ITR_ENABLE_BIT_DQO; /* Interval has 2us granularity. */ interval_us >>= 1; interval_us &= GVE_ITR_INTERVAL_DQO_MASK; result |= (interval_us << GVE_ITR_INTERVAL_DQO_SHIFT); return (result); } void gve_unmask_all_queue_irqs(struct gve_priv *priv) { struct gve_tx_ring *tx; struct gve_rx_ring *rx; int idx; for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { tx = &priv->tx[idx]; if (gve_is_gqi(priv)) gve_db_bar_write_4(priv, tx->com.irq_db_offset, 0); else gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset, gve_setup_itr_interval_dqo(GVE_TX_IRQ_RATELIMIT_US_DQO)); } for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { rx = &priv->rx[idx]; if (gve_is_gqi(priv)) gve_db_bar_write_4(priv, rx->com.irq_db_offset, 0); else gve_db_bar_dqo_write_4(priv, rx->com.irq_db_offset, gve_setup_itr_interval_dqo(GVE_RX_IRQ_RATELIMIT_US_DQO)); } } void gve_mask_all_queue_irqs(struct gve_priv *priv) { for (int idx = 0; idx < priv->tx_cfg.num_queues; idx++) { struct gve_tx_ring *tx = &priv->tx[idx]; gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK); } for (int idx = 0; idx < priv->rx_cfg.num_queues; idx++) { struct gve_rx_ring *rx = &priv->rx[idx]; gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); } } + +/* + * In some cases, such as tracking timeout events, we must mark a timestamp as + * invalid when we do not want to consider its value. Such timestamps must be + * checked for validity before reading them. + */ +void +gve_invalidate_timestamp(int64_t *timestamp_sec) +{ + atomic_store_64(timestamp_sec, GVE_TIMESTAMP_INVALID); +} + +/* + * Returns 0 if the timestamp is invalid, otherwise returns the elapsed seconds + * since the timestamp was set. 
+ */ +int64_t +gve_seconds_since(int64_t *timestamp_sec) +{ + struct bintime curr_time; + int64_t enqueued_time; + + getbintime(&curr_time); + enqueued_time = atomic_load_64(timestamp_sec); + if (enqueued_time == GVE_TIMESTAMP_INVALID) + return (0); + return ((int64_t)(curr_time.sec - enqueued_time)); +} + +void +gve_set_timestamp(int64_t *timestamp_sec) +{ + struct bintime curr_time; + + getbintime(&curr_time); + atomic_store_64(timestamp_sec, curr_time.sec); +} + +bool +gve_timestamp_valid(int64_t *timestamp_sec) +{ + return (atomic_load_64(timestamp_sec) != GVE_TIMESTAMP_INVALID); +}
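Editor's note on the timestamp helpers added above: the patch stamps a pending packet's enqueue time with gve_set_timestamp(), invalidates it with gve_invalidate_timestamp() when the slot is freed, and ages it with gve_seconds_since()/gve_timestamp_valid(). Because the field is only ever touched through 64-bit atomics, a periodic timeout scan can presumably read it from another context without taking the TX queue lock, with GVE_TIMESTAMP_INVALID acting as a sentinel for slots that hold no in-flight packet. The sketch below is a minimal, self-contained userspace illustration of that sentinel-plus-atomics pattern; the names (pkt_stamp_*, STAMP_INVALID, TIMEOUT_SEC) are illustrative stand-ins and are not part of the driver.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define STAMP_INVALID	INT64_MIN	/* stand-in for GVE_TIMESTAMP_INVALID */
#define TIMEOUT_SEC	5		/* stand-in for GVE_TX_TIMEOUT_PKT_SEC */

/* Enqueue path: record the wall-clock second a packet entered the ring. */
static void
pkt_stamp_set(_Atomic int64_t *stamp)
{
	atomic_store(stamp, (int64_t)time(NULL));
}

/* Completion path: mark the slot as holding no in-flight packet. */
static void
pkt_stamp_invalidate(_Atomic int64_t *stamp)
{
	atomic_store(stamp, STAMP_INVALID);
}

/* Monitor path: check validity before trusting the value. */
static bool
pkt_stamp_valid(_Atomic int64_t *stamp)
{
	return (atomic_load(stamp) != STAMP_INVALID);
}

/* Monitor path: age a slot without any lock; 0 means "nothing pending". */
static int64_t
pkt_stamp_seconds_since(_Atomic int64_t *stamp)
{
	int64_t then = atomic_load(stamp);

	if (then == STAMP_INVALID)
		return (0);
	return ((int64_t)time(NULL) - then);
}

int
main(void)
{
	_Atomic int64_t stamp = STAMP_INVALID;

	pkt_stamp_set(&stamp);
	printf("valid: %d, age: %lld sec, timed out: %d\n",
	    pkt_stamp_valid(&stamp),
	    (long long)pkt_stamp_seconds_since(&stamp),
	    pkt_stamp_seconds_since(&stamp) > TIMEOUT_SEC);
	pkt_stamp_invalidate(&stamp);
	return (0);
}

The appeal of this shape is that the hot TX path pays only a single 64-bit atomic store, and a concurrent reader racing a free sees either the sentinel or a complete timestamp, never a torn value.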