diff --git a/share/man/man4/Makefile b/share/man/man4/Makefile
--- a/share/man/man4/Makefile
+++ b/share/man/man4/Makefile
@@ -176,6 +176,7 @@
 	geom_map.4 \
 	geom_uzip.4 \
 	gif.4 \
+	${_gve.4} \
 	gpio.4 \
 	gpioiic.4 \
 	gpiokeys.4 \
@@ -895,6 +896,10 @@
 _ossl.4= ossl.4
 .endif
 
+.if ${MACHINE_CPUARCH} == "amd64" || ${MACHINE_CPUARCH} == "aarch64"
+_gve.4= gve.4
+.endif
+
 .if ${MACHINE_CPUARCH} == "arm" || ${MACHINE_CPUARCH} == "aarch64" || \
     ${MACHINE_CPUARCH} == "riscv"
 _cgem.4= cgem.4
diff --git a/share/man/man4/gve.4 b/share/man/man4/gve.4
new file mode 100644
--- /dev/null
+++ b/share/man/man4/gve.4
@@ -0,0 +1,215 @@
+.\" SPDX-License-Identifier: BSD-3-Clause
+.\"
+.\" Copyright (c) 2023 Google LLC
+.\"
+.\" Redistribution and use in source and binary forms, with or without modification,
+.\" are permitted provided that the following conditions are met:
+.\"
+.\" 1. Redistributions of source code must retain the above copyright notice, this
+.\" list of conditions and the following disclaimer.
+.\"
+.\" 2. Redistributions in binary form must reproduce the above copyright notice,
+.\" this list of conditions and the following disclaimer in the documentation
+.\" and/or other materials provided with the distribution.
+.\"
+.\" 3. Neither the name of the copyright holder nor the names of its contributors
+.\" may be used to endorse or promote products derived from this software without
+.\" specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+.\" WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+.\" DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+.\" ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+.\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+.\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+.\" ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+.\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+.\" SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+.Dd April 26, 2023
+.Dt GVE 4
+.Os
+.Sh NAME
+.Nm gve
+.Nd "Ethernet driver for Google Virtual NIC (gVNIC)"
+.Sh SYNOPSIS
+To compile this driver into the kernel,
+place the following lines in your
+kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "device gve"
+.Ed
+.Pp
+Alternatively, to load the driver as a
+module at boot time, place the following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+if_gve_load="YES"
+.Ed
+.Sh DESCRIPTION
+gVNIC is a virtual network interface designed specifically for Google Compute Engine (GCE).
+It is required to support per-VM Tier-1 networking performance, and for using
+certain VM shapes on GCE.
+.Pp
+.Nm
+is the driver for gVNIC.
+It supports the following features:
+.Pp
+.Bl -bullet -compact
+.It
+RX checksum offload
+.It
+TX checksum offload
+.It
+TCP Segmentation Offload (TSO)
+.It
+Large Receive Offload (LRO) in software
+.It
+Jumbo frames
+.It
+Receive Side Scaling (RSS)
+.El
+.Pp
+For more information on configuring this device, see
+.Xr ifconfig 8 .
+.Sh HARDWARE
+.Nm
+binds to a single PCI device ID presented by gVNIC:
+.Pp
+.Bl -bullet -compact
+.It
+0x1AE0:0x0042
+.El
+.Sh DIAGNOSTICS
+The following messages are recorded during driver initialization:
+.Bl -diag
+.It "Enabled MSIX with %d vectors"
+.It "Configured device resources"
+.It "Successfully attached %s"
+.It "Deconfigured device resources"
+.El
+.Pp
+These messages are seen if driver initialization fails.
+Global (across-queues) allocation failures:
+.Bl -diag
+.It "Failed to configure device resources: err=%d"
+.It "No compatible queue formats"
+.It "Failed to allocate ifnet struct"
+.It "Failed to allocate admin queue mem"
+.It "Failed to alloc DMA mem for DescribeDevice"
+.It "Failed to allocate QPL page"
+.El
+.Pp
+irq and BAR allocation failures:
+.Bl -diag
+.It "Failed to acquire any msix vectors"
+.It "Tried to acquire %d msix vectors, got only %d"
+.It "Failed to setup irq %d for Tx queue %d "
+.It "Failed to setup irq %d for Rx queue %d "
+.It "Failed to allocate irq %d for mgmnt queue"
+.It "Failed to setup irq %d for mgmnt queue, err: %d"
+.It "Failed to allocate BAR0"
+.It "Failed to allocate BAR2"
+.It "Failed to allocate msix table"
+.El
+.Pp
+Rx queue-specific allocation failures:
+.Bl -diag
+.It "No QPL left for rx ring %d"
+.It "Failed to alloc queue resources for rx ring %d"
+.It "Failed to alloc desc ring for rx ring %d"
+.It "Failed to alloc data ring for rx ring %d"
+.El
+.Pp
+Tx queue-specific allocation failures:
+.Bl -diag
+.It "No QPL left for tx ring %d"
+.It "Failed to alloc queue resources for tx ring %d"
+.It "Failed to alloc desc ring for tx ring %d"
+.It "Failed to vmap fifo, qpl_id = %d"
+.El
+.Pp
+The following messages are recorded when the interface detach fails:
+.Bl -diag
+.It "Failed to deconfigure device resources: err=%d"
+.El
+.Pp
+If bootverbose is on, the following messages are recorded when the interface is being brought up:
+.Bl -diag
+.It "Created %d rx queues"
+.It "Created %d tx queues"
+.It "MTU set to %d"
+.El
+.Pp
+The following messages are recorded when the interface is being brought down:
+.Bl -diag
+.It "Destroyed %d rx queues"
+.It "Destroyed %d tx queues"
+.El
+.Pp
+These messages are seen if errors are encountered when bringing the interface up or down:
+.Bl -diag
+.It "Failed to destroy rxq %d, err: %d"
+.It "Failed to destroy txq %d, err: %d"
+.It "Failed to create rxq %d, err: %d"
+.It "Failed to create txq %d, err: %d"
+.It "Failed to set MTU to %d"
+.It "Invalid new MTU setting. new mtu: %d max mtu: %d min mtu: %d"
+.It "Cannot bring the iface up when detached"
+.It "Reached max number of registered pages %lu > %lu"
+.It "Failed to init lro for rx ring %d"
+.El
+.Pp
+These messages are seen if any admin queue command fails:
+.Bl -diag
+.It "AQ command(%u): failed with status %d"
+.It "AQ command(%u): unknown status code %d"
+.It "AQ commands timed out, need to reset AQ"
+.It "Unknown AQ command opcode %d"
+.El
+.Pp
+These messages are recorded when the device is being reset due to an error:
+.Bl -diag
+.It "Scheduling reset task!"
+.It "Waiting until admin queue is released."
+.It "Admin queue released"
+.El
+.Pp
+If it was the NIC that requested the reset, this message is recorded:
+.Bl -diag
+.It "Device requested reset"
+.El
+.Pp
+If the reset fails during the reinitialization phase, this message is recorded:
+.Bl -diag
+.It "Restore failed!"
+.El
+.Pp
+These two messages correspond to the NIC alerting the driver to link state changes:
+.Bl -diag
+.It "Device link is up."
+.It "Device link is down."
+.El
+.Pp
+Apart from these messages, the driver exposes per-queue packet and error counters as sysctl nodes.
+Global (across queues) counters can be read using
+.Xr netstat 8 .
+.Sh LIMITATIONS
+.Nm
+does not support the transmission of VLAN-tagged packets.
+All VLAN-tagged traffic is dropped.
+.Sh SUPPORT
+Please email gvnic-drivers@google.com with the specifics of the issue encountered.
+.Sh SEE ALSO
+.Xr ifconfig 8 ,
+.Xr netstat 8
+.Sh HISTORY
+The
+.Nm
+device driver first appeared in
+.Fx 14.0 .
+.Sh AUTHORS
+The
+.Nm
+driver was written by Google.
diff --git a/sys/conf/files b/sys/conf/files
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1748,6 +1748,13 @@
 dev/fxp/inphy.c optional fxp
 dev/gem/if_gem.c optional gem
 dev/gem/if_gem_pci.c optional gem pci
+dev/gve/gve_adminq.c optional gve
+dev/gve/gve_main.c optional gve
+dev/gve/gve_qpl.c optional gve
+dev/gve/gve_rx.c optional gve
+dev/gve/gve_sysctl.c optional gve
+dev/gve/gve_tx.c optional gve
+dev/gve/gve_utils.c optional gve
 dev/goldfish/goldfish_rtc.c optional goldfish_rtc fdt
 dev/gpio/dwgpio/dwgpio.c optional gpio dwgpio fdt
 dev/gpio/dwgpio/dwgpio_bus.c optional gpio dwgpio fdt
diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h
new file mode 100644
--- /dev/null
+++ b/sys/dev/gve/gve.h
@@ -0,0 +1,459 @@
+/*-
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2023 Google LLC
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef _GVE_FBSD_H
+#define _GVE_FBSD_H
+
+#include "gve_desc.h"
+#include "gve_plat.h"
+#include "gve_register.h"
+
+#ifndef PCI_VENDOR_ID_GOOGLE
+#define PCI_VENDOR_ID_GOOGLE 0x1ae0
+#endif
+
+#define PCI_DEV_ID_GVNIC 0x0042
+#define GVE_REGISTER_BAR 0
+#define GVE_DOORBELL_BAR 2
+
+/* Driver can alloc up to 2 segments for the header and 2 for the payload. */
+#define GVE_TX_MAX_DESCS 4
+#define GVE_TX_BUFRING_ENTRIES 4096
+
+#define ADMINQ_SIZE PAGE_SIZE
+
+#define GVE_DEFAULT_RX_BUFFER_SIZE 2048
+/* Each RX bounce buffer page can fit two packet buffers.
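+ * As an illustrative example (assuming the common 4 KiB PAGE_SIZE, which is
+ * machine-dependent), GVE_DEFAULT_RX_BUFFER_OFFSET below works out to 2048,
+ * so one buffer starts at byte 0 of the page and the second at byte 2048,
+ * each large enough for a GVE_DEFAULT_RX_BUFFER_SIZE (2048-byte) packet buffer.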
*/ +#define GVE_DEFAULT_RX_BUFFER_OFFSET (PAGE_SIZE / 2) + +/* + * Number of descriptors per queue page list. + * Page count AKA QPL size can be derived by dividing the number of elements in + * a page by the number of descriptors available. + */ +#define GVE_QPL_DIVISOR 16 + +static MALLOC_DEFINE(M_GVE, "gve", "gve allocations"); + +struct gve_dma_handle { + bus_addr_t bus_addr; + void *cpu_addr; + bus_dma_tag_t tag; + bus_dmamap_t map; +}; + +union gve_tx_desc { + struct gve_tx_pkt_desc pkt; /* first desc for a packet */ + struct gve_tx_mtd_desc mtd; /* optional metadata descriptor */ + struct gve_tx_seg_desc seg; /* subsequent descs for a packet */ +}; + +/* Tracks the memory in the fifo occupied by a segment of a packet */ +struct gve_tx_iovec { + uint32_t iov_offset; /* offset into this segment */ + uint32_t iov_len; /* length */ + uint32_t iov_padding; /* padding associated with this segment */ +}; + +/* Tracks allowed and current queue settings */ +struct gve_queue_config { + uint16_t max_queues; + uint16_t num_queues; /* current */ +}; + +struct gve_irq_db { + __be32 index; +} __aligned(CACHE_LINE_SIZE); + +/* + * GVE_QUEUE_FORMAT_UNSPECIFIED must be zero since 0 is the default value + * when the entire configure_device_resources command is zeroed out and the + * queue_format is not specified. + */ +enum gve_queue_format { + GVE_QUEUE_FORMAT_UNSPECIFIED = 0x0, + GVE_GQI_RDA_FORMAT = 0x1, + GVE_GQI_QPL_FORMAT = 0x2, + GVE_DQO_RDA_FORMAT = 0x3, +}; + +enum gve_state_flags_bit { + GVE_STATE_FLAG_ADMINQ_OK, + GVE_STATE_FLAG_RESOURCES_OK, + GVE_STATE_FLAG_QPLREG_OK, + GVE_STATE_FLAG_RX_RINGS_OK, + GVE_STATE_FLAG_TX_RINGS_OK, + GVE_STATE_FLAG_QUEUES_UP, + GVE_STATE_FLAG_LINK_UP, + GVE_STATE_FLAG_DO_RESET, + GVE_STATE_FLAG_IN_RESET, + GVE_NUM_STATE_FLAGS /* Not part of the enum space */ +}; + +BITSET_DEFINE(gve_state_flags, GVE_NUM_STATE_FLAGS); + +#define GVE_DEVICE_STATUS_RESET (0x1 << 1) +#define GVE_DEVICE_STATUS_LINK_STATUS (0x1 << 2) + +#define GVE_RING_LOCK(ring) mtx_lock(&(ring)->ring_mtx) +#define GVE_RING_TRYLOCK(ring) mtx_trylock(&(ring)->ring_mtx) +#define GVE_RING_UNLOCK(ring) mtx_unlock(&(ring)->ring_mtx) +#define GVE_RING_ASSERT(ring) mtx_assert(&(ring)->ring_mtx, MA_OWNED) + +#define GVE_IFACE_LOCK_INIT(lock) sx_init(&lock, "gve interface lock") +#define GVE_IFACE_LOCK_DESTROY(lock) sx_destroy(&lock) +#define GVE_IFACE_LOCK_LOCK(lock) sx_xlock(&lock) +#define GVE_IFACE_LOCK_UNLOCK(lock) sx_unlock(&lock) +#define GVE_IFACE_LOCK_ASSERT(lock) sx_assert(&lock, SA_XLOCKED) + +struct gve_queue_page_list { + uint32_t id; + uint32_t num_dmas; + uint32_t num_pages; + vm_offset_t kva; + vm_page_t *pages; + struct gve_dma_handle *dmas; +}; + +struct gve_irq { + struct resource *res; + void *cookie; +}; + +struct gve_rx_slot_page_info { + void *page_address; + vm_page_t page; + uint32_t page_offset; + uint16_t pad; +}; + +/* + * A single received packet split across multiple buffers may be + * reconstructed using the information in this structure. + */ +struct gve_rx_ctx { + /* head and tail of mbuf chain for the current packet */ + struct mbuf *mbuf_head; + struct mbuf *mbuf_tail; + uint32_t total_size; + uint8_t frag_cnt; + bool drop_pkt; +}; + +struct gve_ring_com { + struct gve_priv *priv; + uint32_t id; + + /* + * BAR2 offset for this ring's doorbell and the + * counter-array offset for this ring's counter. + * Acquired from the device individually for each + * queue in the queue_create adminq command. 
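+ *
+ * An illustrative sketch of how these are meant to be consumed, using the
+ * helpers declared later in this header: the driver rings the queue doorbell
+ * with gve_db_bar_write_4(priv, com->db_offset, value) and reads the event
+ * counter as be32toh(priv->counters[com->counter_idx]); db_offset and
+ * counter_idx below are presumably derived from the db_index and
+ * counter_index the device returns in *q_resources.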
+ */ + struct gve_queue_resources *q_resources; + struct gve_dma_handle q_resources_mem; + + /* Byte offset into BAR2 where this ring's 4-byte irq doorbell lies. */ + uint32_t irq_db_offset; + /* Byte offset into BAR2 where this ring's 4-byte doorbell lies. */ + uint32_t db_offset; + /* + * Index, not byte-offset, into the counter array where this ring's + * 4-byte counter lies. + */ + uint32_t counter_idx; + + /* + * The index of the MSIX vector that was assigned to + * this ring in `gve_alloc_irqs`. + * + * It is passed to the device in the queue_create adminq + * command. + * + * Additionally, this also serves as the index into + * `priv->irq_db_indices` where this ring's irq doorbell's + * BAR2 offset, `irq_db_idx`, can be found. + */ + int ntfy_id; + + /* + * The fixed bounce buffer for this ring. + * Once allocated, has to be offered to the device + * over the register-page-list adminq command. + */ + struct gve_queue_page_list *qpl; + + struct task cleanup_task; + struct taskqueue *cleanup_tq; +} __aligned(CACHE_LINE_SIZE); + +struct gve_rxq_stats { + counter_u64_t rbytes; + counter_u64_t rpackets; + counter_u64_t rx_dropped_pkt; + counter_u64_t rx_copybreak_cnt; + counter_u64_t rx_frag_flip_cnt; + counter_u64_t rx_frag_copy_cnt; + counter_u64_t rx_dropped_pkt_desc_err; + counter_u64_t rx_dropped_pkt_mbuf_alloc_fail; +}; + +#define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t)) + +/* power-of-2 sized receive ring */ +struct gve_rx_ring { + struct gve_ring_com com; + struct gve_dma_handle desc_ring_mem; + struct gve_dma_handle data_ring_mem; + + /* accessed in the receive hot path */ + struct { + struct gve_rx_desc *desc_ring; + union gve_rx_data_slot *data_ring; + struct gve_rx_slot_page_info *page_info; + + struct gve_rx_ctx ctx; + struct lro_ctrl lro; + uint8_t seq_no; /* helps traverse the descriptor ring */ + uint32_t cnt; /* free-running total number of completed packets */ + uint32_t fill_cnt; /* free-running total number of descs and buffs posted */ + uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */ + struct gve_rxq_stats stats; + } __aligned(CACHE_LINE_SIZE); + +} __aligned(CACHE_LINE_SIZE); + +/* + * A contiguous representation of the pages composing the Tx bounce buffer. + * The xmit taskqueue and the completion taskqueue both simultaneously use it. + * Both operate on `available`: the xmit tq lowers it and the completion tq + * raises it. `head` is the last location written at and so only the xmit tq + * uses it. 
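+ *
+ * A rough sketch of the intended accounting (one plausible reading of the
+ * fields below): to post an N-byte segment the xmit path checks that
+ * available >= N, copies the data to base + head, advances head (wrapping
+ * at size), and subtracts N from available; once the device reports the
+ * packet complete, the completion path adds those N bytes back to available.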
+ */ +struct gve_tx_fifo { + vm_offset_t base; /* address of base of FIFO */ + uint32_t size; /* total size */ + volatile int available; /* how much space is still available */ + uint32_t head; /* offset to write at */ +}; + +struct gve_tx_buffer_state { + struct mbuf *mbuf; + struct gve_tx_iovec iov[GVE_TX_MAX_DESCS]; +}; + +struct gve_txq_stats { + counter_u64_t tbytes; + counter_u64_t tpackets; + counter_u64_t tso_packet_cnt; + counter_u64_t tx_dropped_pkt; + counter_u64_t tx_dropped_pkt_nospace_device; + counter_u64_t tx_dropped_pkt_nospace_bufring; + counter_u64_t tx_dropped_pkt_vlan; +}; + +#define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t)) + +/* power-of-2 sized transmit ring */ +struct gve_tx_ring { + struct gve_ring_com com; + struct gve_dma_handle desc_ring_mem; + + struct task xmit_task; + struct taskqueue *xmit_tq; + + /* accessed in the transmit hot path */ + struct { + union gve_tx_desc *desc_ring; + struct gve_tx_buffer_state *info; + struct buf_ring *br; + + struct gve_tx_fifo fifo; + struct mtx ring_mtx; + + uint32_t req; /* free-running total number of packets written to the nic */ + uint32_t done; /* free-running total number of completed packets */ + uint32_t mask; /* masks the req and done to the size of the ring */ + struct gve_txq_stats stats; + } __aligned(CACHE_LINE_SIZE); + +} __aligned(CACHE_LINE_SIZE); + +struct gve_priv { + if_t ifp; + device_t dev; + struct ifmedia media; + + uint8_t mac[ETHER_ADDR_LEN]; + + struct gve_dma_handle aq_mem; + + struct resource *reg_bar; /* BAR0 */ + struct resource *db_bar; /* BAR2 */ + struct resource *msix_table; + + uint32_t mgmt_msix_idx; + uint32_t rx_copybreak; + + uint16_t num_event_counters; + uint16_t default_num_queues; + uint16_t tx_desc_cnt; + uint16_t rx_desc_cnt; + uint16_t rx_pages_per_qpl; + uint64_t max_registered_pages; + uint64_t num_registered_pages; + uint32_t supported_features; + uint16_t max_mtu; + + struct gve_dma_handle counter_array_mem; + __be32 *counters; + struct gve_dma_handle irqs_db_mem; + struct gve_irq_db *irq_db_indices; + + enum gve_queue_format queue_format; + struct gve_queue_page_list *qpls; + struct gve_queue_config tx_cfg; + struct gve_queue_config rx_cfg; + uint32_t num_queues; + + struct gve_irq *irq_tbl; + struct gve_tx_ring *tx; + struct gve_rx_ring *rx; + + /* + * Admin queue - see gve_adminq.h + * Since AQ cmds do not run in steady state, 32 bit counters suffice + */ + struct gve_adminq_command *adminq; + vm_paddr_t adminq_bus_addr; + uint32_t adminq_mask; /* masks prod_cnt to adminq size */ + uint32_t adminq_prod_cnt; /* free-running count of AQ cmds executed */ + uint32_t adminq_cmd_fail; /* free-running count of AQ cmds failed */ + uint32_t adminq_timeouts; /* free-running count of AQ cmds timeouts */ + /* free-running count of each distinct AQ cmd executed */ + uint32_t adminq_describe_device_cnt; + uint32_t adminq_cfg_device_resources_cnt; + uint32_t adminq_register_page_list_cnt; + uint32_t adminq_unregister_page_list_cnt; + uint32_t adminq_create_tx_queue_cnt; + uint32_t adminq_create_rx_queue_cnt; + uint32_t adminq_destroy_tx_queue_cnt; + uint32_t adminq_destroy_rx_queue_cnt; + uint32_t adminq_dcfg_device_resources_cnt; + uint32_t adminq_set_driver_parameter_cnt; + uint32_t adminq_verify_driver_compatibility_cnt; + + uint32_t interface_up_cnt; + uint32_t interface_down_cnt; + uint32_t reset_cnt; + + struct task service_task; + struct taskqueue *service_tq; + + struct gve_state_flags state_flags; + struct sx gve_iface_lock; +}; + +static inline bool 
+gve_get_state_flag(struct gve_priv *priv, int pos) +{ + return (BIT_ISSET(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags)); +} + +static inline void +gve_set_state_flag(struct gve_priv *priv, int pos) +{ + BIT_SET_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); +} + +static inline void +gve_clear_state_flag(struct gve_priv *priv, int pos) +{ + BIT_CLR_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); +} + +/* Defined in gve_main.c */ +void gve_schedule_reset(struct gve_priv *priv); + +/* Register access functions defined in gve_utils.c */ +uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset); +void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); +void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); + +/* QPL (Queue Page List) functions defined in gve_qpl.c */ +int gve_alloc_qpls(struct gve_priv *priv); +void gve_free_qpls(struct gve_priv *priv); +int gve_register_qpls(struct gve_priv *priv); +int gve_unregister_qpls(struct gve_priv *priv); + +/* TX functions defined in gve_tx.c */ +int gve_alloc_tx_rings(struct gve_priv *priv); +void gve_free_tx_rings(struct gve_priv *priv); +int gve_create_tx_rings(struct gve_priv *priv); +int gve_destroy_tx_rings(struct gve_priv *priv); +int gve_tx_intr(void *arg); +int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf); +void gve_qflush(if_t ifp); +void gve_xmit_tq(void *arg, int pending); +void gve_tx_cleanup_tq(void *arg, int pending); + +/* RX functions defined in gve_rx.c */ +int gve_alloc_rx_rings(struct gve_priv *priv); +void gve_free_rx_rings(struct gve_priv *priv); +int gve_create_rx_rings(struct gve_priv *priv); +int gve_destroy_rx_rings(struct gve_priv *priv); +int gve_rx_intr(void *arg); +void gve_rx_cleanup_tq(void *arg, int pending); + +/* DMA functions defined in gve_utils.c */ +int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, + struct gve_dma_handle *dma); +void gve_dma_free_coherent(struct gve_dma_handle *dma); +int gve_dmamap_create(struct gve_priv *priv, int size, int align, + struct gve_dma_handle *dma); +void gve_dmamap_destroy(struct gve_dma_handle *dma); + +/* IRQ functions defined in gve_utils.c */ +void gve_free_irqs(struct gve_priv *priv); +int gve_alloc_irqs(struct gve_priv *priv); +void gve_unmask_all_queue_irqs(struct gve_priv *priv); +void gve_mask_all_queue_irqs(struct gve_priv *priv); + +/* Systcl functions defined in gve_sysctl.c*/ +void gve_setup_sysctl(struct gve_priv *priv); +void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, + uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets, + uint64_t *tbytes, uint64_t *tx_dropped_pkt); + +/* Stats functions defined in gve_utils.c */ +void gve_alloc_counters(counter_u64_t *stat, int num_stats); +void gve_free_counters(counter_u64_t *stat, int num_stats); + +#endif /* _GVE_FBSD_H_ */ diff --git a/sys/dev/gve/gve_adminq.h b/sys/dev/gve/gve_adminq.h new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_adminq.h @@ -0,0 +1,394 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _GVE_AQ_H_ +#define _GVE_AQ_H_ 1 + +#include +#include +#include +#include +#include + +/* Admin queue opcodes */ +enum gve_adminq_opcodes { + GVE_ADMINQ_DESCRIBE_DEVICE = 0x1, + GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES = 0x2, + GVE_ADMINQ_REGISTER_PAGE_LIST = 0x3, + GVE_ADMINQ_UNREGISTER_PAGE_LIST = 0x4, + GVE_ADMINQ_CREATE_TX_QUEUE = 0x5, + GVE_ADMINQ_CREATE_RX_QUEUE = 0x6, + GVE_ADMINQ_DESTROY_TX_QUEUE = 0x7, + GVE_ADMINQ_DESTROY_RX_QUEUE = 0x8, + GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES = 0x9, + GVE_ADMINQ_SET_DRIVER_PARAMETER = 0xB, + GVE_ADMINQ_REPORT_STATS = 0xC, + GVE_ADMINQ_REPORT_LINK_SPEED = 0xD, + GVE_ADMINQ_GET_PTYPE_MAP = 0xE, + GVE_ADMINQ_VERIFY_DRIVER_COMPATIBILITY = 0xF, +}; + +/* Admin queue status codes */ +enum gve_adminq_statuses { + GVE_ADMINQ_COMMAND_UNSET = 0x0, + GVE_ADMINQ_COMMAND_PASSED = 0x1, + GVE_ADMINQ_COMMAND_ERROR_ABORTED = 0xFFFFFFF0, + GVE_ADMINQ_COMMAND_ERROR_ALREADY_EXISTS = 0xFFFFFFF1, + GVE_ADMINQ_COMMAND_ERROR_CANCELLED = 0xFFFFFFF2, + GVE_ADMINQ_COMMAND_ERROR_DATALOSS = 0xFFFFFFF3, + GVE_ADMINQ_COMMAND_ERROR_DEADLINE_EXCEEDED = 0xFFFFFFF4, + GVE_ADMINQ_COMMAND_ERROR_FAILED_PRECONDITION = 0xFFFFFFF5, + GVE_ADMINQ_COMMAND_ERROR_INTERNAL_ERROR = 0xFFFFFFF6, + GVE_ADMINQ_COMMAND_ERROR_INVALID_ARGUMENT = 0xFFFFFFF7, + GVE_ADMINQ_COMMAND_ERROR_NOT_FOUND = 0xFFFFFFF8, + GVE_ADMINQ_COMMAND_ERROR_OUT_OF_RANGE = 0xFFFFFFF9, + GVE_ADMINQ_COMMAND_ERROR_PERMISSION_DENIED = 0xFFFFFFFA, + GVE_ADMINQ_COMMAND_ERROR_UNAUTHENTICATED = 0xFFFFFFFB, + GVE_ADMINQ_COMMAND_ERROR_RESOURCE_EXHAUSTED = 0xFFFFFFFC, + GVE_ADMINQ_COMMAND_ERROR_UNAVAILABLE = 0xFFFFFFFD, + GVE_ADMINQ_COMMAND_ERROR_UNIMPLEMENTED = 0xFFFFFFFE, + GVE_ADMINQ_COMMAND_ERROR_UNKNOWN_ERROR = 0xFFFFFFFF, +}; + +#define GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION 1 + +/* + * All AdminQ command structs should be naturally packed. The static_assert + * calls make sure this is the case at compile time. 
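+ * For example, struct gve_adminq_describe_device below is one __be64
+ * followed by two __be32 fields, so its natural size is 16 bytes with no
+ * compiler-inserted padding, which is exactly what its static_assert checks.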
+ */ + +struct gve_adminq_describe_device { + __be64 device_descriptor_addr; + __be32 device_descriptor_version; + __be32 available_length; +}; + +_Static_assert(sizeof(struct gve_adminq_describe_device) == 16, + "gve: bad admin queue struct length"); + +struct gve_device_descriptor { + __be64 max_registered_pages; + __be16 reserved1; + __be16 tx_queue_entries; + __be16 rx_queue_entries; + __be16 default_num_queues; + __be16 mtu; + __be16 counters; + __be16 reserved2; + __be16 rx_pages_per_qpl; + uint8_t mac[ETHER_ADDR_LEN]; + __be16 num_device_options; + __be16 total_length; + uint8_t reserved3[6]; +}; + +_Static_assert(sizeof(struct gve_device_descriptor) == 40, + "gve: bad admin queue struct length"); + +struct gve_device_option { + __be16 option_id; + __be16 option_length; + __be32 required_features_mask; +}; + +_Static_assert(sizeof(struct gve_device_option) == 8, + "gve: bad admin queue struct length"); + +struct gve_device_option_gqi_rda { + __be32 supported_features_mask; +}; + +_Static_assert(sizeof(struct gve_device_option_gqi_rda) == 4, + "gve: bad admin queue struct length"); + +struct gve_device_option_gqi_qpl { + __be32 supported_features_mask; +}; + +_Static_assert(sizeof(struct gve_device_option_gqi_qpl) == 4, + "gve: bad admin queue struct length"); + +struct gve_device_option_dqo_rda { + __be32 supported_features_mask; +}; + +_Static_assert(sizeof(struct gve_device_option_dqo_rda) == 4, + "gve: bad admin queue struct length"); + +struct gve_device_option_modify_ring { + __be32 supported_features_mask; + __be16 max_rx_ring_size; + __be16 max_tx_ring_size; +}; + +_Static_assert(sizeof(struct gve_device_option_modify_ring) == 8, + "gve: bad admin queue struct length"); + +struct gve_device_option_jumbo_frames { + __be32 supported_features_mask; + __be16 max_mtu; + uint8_t padding[2]; +}; + +_Static_assert(sizeof(struct gve_device_option_jumbo_frames) == 8, + "gve: bad admin queue struct length"); + +enum gve_dev_opt_id { + GVE_DEV_OPT_ID_GQI_RAW_ADDRESSING = 0x1, + GVE_DEV_OPT_ID_GQI_RDA = 0x2, + GVE_DEV_OPT_ID_GQI_QPL = 0x3, + GVE_DEV_OPT_ID_DQO_RDA = 0x4, + GVE_DEV_OPT_ID_MODIFY_RING = 0x6, + GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8, +}; + +/* + * These masks are way to predicate the use of a particular option on the driver + * having particular bug fixes represented by each bit position in the mask. + * Currently they are all zero because there are no known bugs preventing the + * use of any option. + */ +enum gve_dev_opt_req_feat_mask { + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RAW_ADDRESSING = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0, +}; + +enum gve_sup_feature_mask { + GVE_SUP_MODIFY_RING_MASK = 1 << 0, + GVE_SUP_JUMBO_FRAMES_MASK = 1 << 2, +}; + +#define GVE_VERSION_STR_LEN 128 + +enum gve_driver_capability { + gve_driver_capability_gqi_qpl = 0, + gve_driver_capability_gqi_rda = 1, + gve_driver_capability_dqo_qpl = 2, /* reserved for future use */ + gve_driver_capability_dqo_rda = 3, + gve_driver_capability_alt_miss_compl = 4, +}; + +#define GVE_CAP1(a) BIT((int) a) +#define GVE_CAP2(a) BIT(((int) a) - 64) +#define GVE_CAP3(a) BIT(((int) a) - 128) +#define GVE_CAP4(a) BIT(((int) a) - 192) + +/* + * The following four defines describe 256 compatibility bits. + * Only a few bits (as shown in `gve_driver_compatibility`) are currently + * defined. The rest are reserved for future use. 
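+ * As a worked example, gve_driver_capability_gqi_qpl is enumerator 0, so
+ * GVE_CAP1() maps it to BIT(0) within GVE_DRIVER_CAPABILITY_FLAGS1; a
+ * hypothetical capability numbered 64 through 127 would instead be reported
+ * through GVE_CAP2() in GVE_DRIVER_CAPABILITY_FLAGS2.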
+ */ +#define GVE_DRIVER_CAPABILITY_FLAGS1 (GVE_CAP1(gve_driver_capability_gqi_qpl)) +#define GVE_DRIVER_CAPABILITY_FLAGS2 0x0 +#define GVE_DRIVER_CAPABILITY_FLAGS3 0x0 +#define GVE_DRIVER_CAPABILITY_FLAGS4 0x0 + +struct gve_driver_info { + uint8_t os_type; + uint8_t driver_major; + uint8_t driver_minor; + uint8_t driver_sub; + __be32 os_version_major; + __be32 os_version_minor; + __be32 os_version_sub; + __be64 driver_capability_flags[4]; + uint8_t os_version_str1[GVE_VERSION_STR_LEN]; + uint8_t os_version_str2[GVE_VERSION_STR_LEN]; +}; + +struct gve_adminq_verify_driver_compatibility { + __be64 driver_info_len; + __be64 driver_info_addr; +}; + +_Static_assert(sizeof(struct gve_adminq_verify_driver_compatibility) == 16, + "gve: bad admin queue struct length"); + +struct gve_adminq_configure_device_resources { + __be64 counter_array; + __be64 irq_db_addr; + __be32 num_counters; + __be32 num_irq_dbs; + __be32 irq_db_stride; + __be32 ntfy_blk_msix_base_idx; + uint8_t queue_format; + uint8_t padding[7]; +}; + +_Static_assert(sizeof(struct gve_adminq_configure_device_resources) == 40, + "gve: bad admin queue struct length"); + +struct gve_adminq_register_page_list { + __be32 page_list_id; + __be32 num_pages; + __be64 page_address_list_addr; + __be64 page_size; +}; + +_Static_assert(sizeof(struct gve_adminq_register_page_list) == 24, + "gve: bad admin queue struct length"); + +struct gve_adminq_unregister_page_list { + __be32 page_list_id; +}; + +_Static_assert(sizeof(struct gve_adminq_unregister_page_list) == 4, + "gve: bad admin queue struct length"); + +struct gve_adminq_create_tx_queue { + __be32 queue_id; + __be32 reserved; + __be64 queue_resources_addr; + __be64 tx_ring_addr; + __be32 queue_page_list_id; + __be32 ntfy_id; + __be64 tx_comp_ring_addr; + __be16 tx_ring_size; + __be16 tx_comp_ring_size; + uint8_t padding[4]; +}; + +_Static_assert(sizeof(struct gve_adminq_create_tx_queue) == 48, + "gve: bad admin queue struct length"); + +struct gve_adminq_create_rx_queue { + __be32 queue_id; + __be32 index; + __be32 reserved; + __be32 ntfy_id; + __be64 queue_resources_addr; + __be64 rx_desc_ring_addr; + __be64 rx_data_ring_addr; + __be32 queue_page_list_id; + __be16 rx_ring_size; + __be16 packet_buffer_size; + __be16 rx_buff_ring_size; + uint8_t enable_rsc; + uint8_t padding[5]; +}; + +_Static_assert(sizeof(struct gve_adminq_create_rx_queue) == 56, + "gve: bad admin queue struct length"); + +/* Queue resources that are shared with the device */ +struct gve_queue_resources { + union { + struct { + __be32 db_index; /* Device -> Guest */ + __be32 counter_index; /* Device -> Guest */ + }; + uint8_t reserved[64]; + }; +}; + +_Static_assert(sizeof(struct gve_queue_resources) == 64, + "gve: bad admin queue struct length"); + +struct gve_adminq_destroy_tx_queue { + __be32 queue_id; +}; + +_Static_assert(sizeof(struct gve_adminq_destroy_tx_queue) == 4, + "gve: bad admin queue struct length"); + +struct gve_adminq_destroy_rx_queue { + __be32 queue_id; +}; + +_Static_assert(sizeof(struct gve_adminq_destroy_rx_queue) == 4, + "gve: bad admin queue struct length"); + +/* GVE Set Driver Parameter Types */ +enum gve_set_driver_param_types { + GVE_SET_PARAM_MTU = 0x1, +}; + +struct gve_adminq_set_driver_parameter { + __be32 parameter_type; + uint8_t reserved[4]; + __be64 parameter_value; +}; + +_Static_assert(sizeof(struct gve_adminq_set_driver_parameter) == 16, + "gve: bad admin queue struct length"); + +struct stats { + __be32 stat_name; + __be32 queue_id; + __be64 value; +}; + +_Static_assert(sizeof(struct 
stats) == 16, + "gve: bad admin queue struct length"); + +struct gve_adminq_command { + __be32 opcode; + __be32 status; + union { + struct gve_adminq_configure_device_resources + configure_device_resources; + struct gve_adminq_create_tx_queue create_tx_queue; + struct gve_adminq_create_rx_queue create_rx_queue; + struct gve_adminq_destroy_tx_queue destroy_tx_queue; + struct gve_adminq_destroy_rx_queue destroy_rx_queue; + struct gve_adminq_describe_device describe_device; + struct gve_adminq_register_page_list reg_page_list; + struct gve_adminq_unregister_page_list unreg_page_list; + struct gve_adminq_set_driver_parameter set_driver_param; + struct gve_adminq_verify_driver_compatibility + verify_driver_compatibility; + uint8_t reserved[56]; + }; +}; + +_Static_assert(sizeof(struct gve_adminq_command) == 64, + "gve: bad admin queue struct length"); + +int gve_adminq_create_rx_queues(struct gve_priv *priv, uint32_t num_queues); +int gve_adminq_create_tx_queues(struct gve_priv *priv, uint32_t num_queues); +int gve_adminq_destroy_tx_queues(struct gve_priv *priv, uint32_t num_queues); +int gve_adminq_destroy_rx_queues(struct gve_priv *priv, uint32_t num_queues); +int gve_adminq_set_mtu(struct gve_priv *priv, uint32_t mtu); +int gve_adminq_alloc(struct gve_priv *priv); +void gve_reset_adminq(struct gve_priv *priv); +int gve_adminq_describe_device(struct gve_priv *priv); +int gve_adminq_configure_device_resources(struct gve_priv *priv); +int gve_adminq_deconfigure_device_resources(struct gve_priv *priv); +void gve_release_adminq(struct gve_priv *priv); +int gve_adminq_register_page_list(struct gve_priv *priv, + struct gve_queue_page_list *qpl); +int gve_adminq_unregister_page_list(struct gve_priv *priv, uint32_t page_list_id); +int gve_adminq_verify_driver_compatibility(struct gve_priv *priv, + uint64_t driver_info_len, vm_paddr_t driver_info_addr); +#endif /* _GVE_AQ_H_ */ diff --git a/sys/dev/gve/gve_adminq.c b/sys/dev/gve/gve_adminq.c new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_adminq.c @@ -0,0 +1,803 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include +#include +#include + +#include +#include +#include + +#include "gve.h" +#include "gve_adminq.h" + +#define GVE_ADMINQ_SLEEP_LEN_MS 20 +#define GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK 10 +#define GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION 1 +#define GVE_REG_ADMINQ_ADDR 16 +#define ADMINQ_SLOTS (ADMINQ_SIZE / sizeof(struct gve_adminq_command)) + +#define GVE_DEVICE_OPTION_ERROR_FMT "%s option error:\n" \ + "Expected: length=%d, feature_mask=%x.\n" \ + "Actual: length=%d, feature_mask=%x.\n" + +#define GVE_DEVICE_OPTION_TOO_BIG_FMT "Length of %s option larger than expected." \ + " Possible older version of guest driver.\n" + +static +void gve_parse_device_option(struct gve_priv *priv, + struct gve_device_descriptor *device_descriptor, + struct gve_device_option *option, + struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, + struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) +{ + uint32_t req_feat_mask = be32toh(option->required_features_mask); + uint16_t option_length = be16toh(option->option_length); + uint16_t option_id = be16toh(option->option_id); + + /* + * If the length or feature mask doesn't match, continue without + * enabling the feature. + */ + switch (option_id) { + case GVE_DEV_OPT_ID_GQI_QPL: + if (option_length < sizeof(**dev_op_gqi_qpl) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL) { + device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "GQI QPL", (int)sizeof(**dev_op_gqi_qpl), + GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_gqi_qpl)) { + device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT, + "GQI QPL"); + } + *dev_op_gqi_qpl = (void *)(option + 1); + break; + + case GVE_DEV_OPT_ID_JUMBO_FRAMES: + if (option_length < sizeof(**dev_op_jumbo_frames) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES) { + device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "Jumbo Frames", (int)sizeof(**dev_op_jumbo_frames), + GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_jumbo_frames)) { + device_printf(priv->dev, + GVE_DEVICE_OPTION_TOO_BIG_FMT, "Jumbo Frames"); + } + *dev_op_jumbo_frames = (void *)(option + 1); + break; + + default: + /* + * If we don't recognize the option just continue + * without doing anything. + */ + device_printf(priv->dev, "Unrecognized device option 0x%hx not enabled.\n", + option_id); + } +} + +/* Process all device options for a given describe device call. */ +static int +gve_process_device_options(struct gve_priv *priv, + struct gve_device_descriptor *descriptor, + struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, + struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) +{ + char *desc_end = (char *)descriptor + be16toh(descriptor->total_length); + const int num_options = be16toh(descriptor->num_device_options); + struct gve_device_option *dev_opt; + int i; + + /* The options struct directly follows the device descriptor. 
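+ * The layout assumed by the pointer arithmetic below is roughly:
+ *
+ *   struct gve_device_descriptor
+ *   struct gve_device_option   (header for option 0)
+ *   option 0 body               (option_length bytes)
+ *   struct gve_device_option   (header for option 1)
+ *   option 1 body
+ *   ...
+ *
+ * with everything contained within total_length bytes of the descriptor.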
*/ + dev_opt = (void *)(descriptor + 1); + for (i = 0; i < num_options; i++) { + if ((char *)(dev_opt + 1) > desc_end || + (char *)(dev_opt + 1) + be16toh(dev_opt->option_length) > desc_end) { + device_printf(priv->dev, + "options exceed device_descriptor's total length.\n"); + return (EINVAL); + } + + gve_parse_device_option(priv, descriptor, dev_opt, + dev_op_gqi_qpl, dev_op_jumbo_frames); + dev_opt = (void *)((char *)(dev_opt + 1) + be16toh(dev_opt->option_length)); + } + + return (0); +} + +static int gve_adminq_execute_cmd(struct gve_priv *priv, + struct gve_adminq_command *cmd); + +static int +gve_adminq_destroy_tx_queue(struct gve_priv *priv, uint32_t id) +{ + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + + cmd.opcode = htobe32(GVE_ADMINQ_DESTROY_TX_QUEUE); + cmd.destroy_tx_queue.queue_id = htobe32(id); + + return (gve_adminq_execute_cmd(priv, &cmd)); +} + +static int +gve_adminq_destroy_rx_queue(struct gve_priv *priv, uint32_t id) +{ + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + + cmd.opcode = htobe32(GVE_ADMINQ_DESTROY_RX_QUEUE); + cmd.destroy_rx_queue.queue_id = htobe32(id); + + return (gve_adminq_execute_cmd(priv, &cmd)); +} + +int +gve_adminq_destroy_rx_queues(struct gve_priv *priv, uint32_t num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_destroy_rx_queue(priv, i); + if (err != 0) { + device_printf(priv->dev, "Failed to destroy rxq %d, err: %d\n", + i, err); + } + } + + if (err != 0) + return (err); + + device_printf(priv->dev, "Destroyed %d rx queues\n", num_queues); + return (0); +} + +int +gve_adminq_destroy_tx_queues(struct gve_priv *priv, uint32_t num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_destroy_tx_queue(priv, i); + if (err != 0) { + device_printf(priv->dev, "Failed to destroy txq %d, err: %d\n", + i, err); + } + } + + if (err != 0) + return (err); + + device_printf(priv->dev, "Destroyed %d tx queues\n", num_queues); + return (0); +} + +static int +gve_adminq_create_rx_queue(struct gve_priv *priv, uint32_t queue_index) +{ + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + struct gve_rx_ring *rx = &priv->rx[queue_index]; + struct gve_dma_handle *qres_dma = &rx->com.q_resources_mem; + + bus_dmamap_sync(qres_dma->tag, qres_dma->map, BUS_DMASYNC_PREREAD); + + cmd.opcode = htobe32(GVE_ADMINQ_CREATE_RX_QUEUE); + cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) { + .queue_id = htobe32(queue_index), + .index = htobe32(queue_index), + .ntfy_id = htobe32(rx->com.ntfy_id), + .queue_resources_addr = htobe64(qres_dma->bus_addr), + .rx_desc_ring_addr = htobe64(rx->desc_ring_mem.bus_addr), + .rx_data_ring_addr = htobe64(rx->data_ring_mem.bus_addr), + .queue_page_list_id = htobe32((rx->com.qpl)->id), + .rx_ring_size = htobe16(priv->rx_desc_cnt), + .packet_buffer_size = htobe16(GVE_DEFAULT_RX_BUFFER_SIZE), + }; + + return (gve_adminq_execute_cmd(priv, &cmd)); +} + +int +gve_adminq_create_rx_queues(struct gve_priv *priv, uint32_t num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_create_rx_queue(priv, i); + if (err != 0) { + device_printf(priv->dev, "Failed to create rxq %d, err: %d\n", + i, err); + goto abort; + } + } + + if (bootverbose) + device_printf(priv->dev, "Created %d rx queues\n", num_queues); + return (0); + +abort: + gve_adminq_destroy_rx_queues(priv, i); + return (err); +} + +static int +gve_adminq_create_tx_queue(struct gve_priv *priv, uint32_t queue_index) +{ + struct 
gve_adminq_command cmd = (struct gve_adminq_command){}; + struct gve_tx_ring *tx = &priv->tx[queue_index]; + struct gve_dma_handle *qres_dma = &tx->com.q_resources_mem; + + bus_dmamap_sync(qres_dma->tag, qres_dma->map, BUS_DMASYNC_PREREAD); + + cmd.opcode = htobe32(GVE_ADMINQ_CREATE_TX_QUEUE); + cmd.create_tx_queue = (struct gve_adminq_create_tx_queue) { + .queue_id = htobe32(queue_index), + .queue_resources_addr = htobe64(qres_dma->bus_addr), + .tx_ring_addr = htobe64(tx->desc_ring_mem.bus_addr), + .queue_page_list_id = htobe32((tx->com.qpl)->id), + .ntfy_id = htobe32(tx->com.ntfy_id), + .tx_ring_size = htobe16(priv->tx_desc_cnt), + }; + + return (gve_adminq_execute_cmd(priv, &cmd)); +} + +int +gve_adminq_create_tx_queues(struct gve_priv *priv, uint32_t num_queues) +{ + int err; + int i; + + for (i = 0; i < num_queues; i++) { + err = gve_adminq_create_tx_queue(priv, i); + if (err != 0) { + device_printf(priv->dev, "Failed to create txq %d, err: %d\n", + i, err); + goto abort; + } + } + + if (bootverbose) + device_printf(priv->dev, "Created %d tx queues\n", num_queues); + return (0); + +abort: + gve_adminq_destroy_tx_queues(priv, i); + return (err); +} + +int +gve_adminq_set_mtu(struct gve_priv *priv, uint32_t mtu) { + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + + cmd.opcode = htobe32(GVE_ADMINQ_SET_DRIVER_PARAMETER); + cmd.set_driver_param = (struct gve_adminq_set_driver_parameter) { + .parameter_type = htobe32(GVE_SET_PARAM_MTU), + .parameter_value = htobe64(mtu), + }; + + return (gve_adminq_execute_cmd(priv, &cmd)); +} + +static void +gve_enable_supported_features(struct gve_priv *priv, + uint32_t supported_features_mask, + const struct gve_device_option_jumbo_frames *dev_op_jumbo_frames) +{ + if (dev_op_jumbo_frames && + (supported_features_mask & GVE_SUP_JUMBO_FRAMES_MASK)) { + if (bootverbose) + device_printf(priv->dev, "JUMBO FRAMES device option enabled: %u.\n", + be16toh(dev_op_jumbo_frames->max_mtu)); + priv->max_mtu = be16toh(dev_op_jumbo_frames->max_mtu); + } +} + +int +gve_adminq_describe_device(struct gve_priv *priv) +{ + struct gve_adminq_command aq_cmd = (struct gve_adminq_command){}; + struct gve_device_descriptor *desc; + struct gve_dma_handle desc_mem; + struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL; + struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL; + uint32_t supported_features_mask = 0; + int rc; + int i; + + rc = gve_dma_alloc_coherent(priv, ADMINQ_SIZE, ADMINQ_SIZE, &desc_mem); + if (rc != 0) { + device_printf(priv->dev, "Failed to alloc DMA mem for DescribeDevice.\n"); + return (rc); + } + + desc = desc_mem.cpu_addr; + + aq_cmd.opcode = htobe32(GVE_ADMINQ_DESCRIBE_DEVICE); + aq_cmd.describe_device.device_descriptor_addr = htobe64( + desc_mem.bus_addr); + aq_cmd.describe_device.device_descriptor_version = htobe32( + GVE_ADMINQ_DEVICE_DESCRIPTOR_VERSION); + aq_cmd.describe_device.available_length = htobe32(ADMINQ_SIZE); + + bus_dmamap_sync(desc_mem.tag, desc_mem.map, BUS_DMASYNC_PREWRITE); + + rc = gve_adminq_execute_cmd(priv, &aq_cmd); + if (rc != 0) + goto free_device_descriptor; + + bus_dmamap_sync(desc_mem.tag, desc_mem.map, BUS_DMASYNC_POSTREAD); + + rc = gve_process_device_options(priv, desc, &dev_op_gqi_qpl, + &dev_op_jumbo_frames); + if (rc != 0) + goto free_device_descriptor; + + if (dev_op_gqi_qpl != NULL) { + priv->queue_format = GVE_GQI_QPL_FORMAT; + supported_features_mask = be32toh( + dev_op_gqi_qpl->supported_features_mask); + if (bootverbose) + device_printf(priv->dev, + "Driver is running with GQI QPL 
queue format.\n"); + } else { + device_printf(priv->dev, "No compatible queue formats\n"); + rc = (EINVAL); + goto free_device_descriptor; + } + + priv->num_event_counters = be16toh(desc->counters); + priv->default_num_queues = be16toh(desc->default_num_queues); + priv->tx_desc_cnt = be16toh(desc->tx_queue_entries); + priv->rx_desc_cnt = be16toh(desc->rx_queue_entries); + priv->rx_pages_per_qpl = be16toh(desc->rx_pages_per_qpl); + priv->max_registered_pages = be64toh(desc->max_registered_pages); + priv->max_mtu = be16toh(desc->mtu); + priv->default_num_queues = be16toh(desc->default_num_queues); + priv->supported_features = supported_features_mask; + + gve_enable_supported_features(priv, supported_features_mask, + dev_op_jumbo_frames); + + for (i = 0; i < ETHER_ADDR_LEN; i++) + priv->mac[i] = desc->mac[i]; + +free_device_descriptor: + gve_dma_free_coherent(&desc_mem); + + return (rc); +} + +int +gve_adminq_register_page_list(struct gve_priv *priv, + struct gve_queue_page_list *qpl) +{ + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + uint32_t num_entries = qpl->num_pages; + uint32_t size = num_entries * sizeof(qpl->dmas[0].bus_addr); + __be64 *page_list; + struct gve_dma_handle dma; + int err; + int i; + + err = gve_dma_alloc_coherent(priv, size, PAGE_SIZE, &dma); + if (err != 0) + return (ENOMEM); + + page_list = dma.cpu_addr; + + for (i = 0; i < num_entries; i++) + page_list[i] = htobe64(qpl->dmas[i].bus_addr); + + bus_dmamap_sync(dma.tag, dma.map, BUS_DMASYNC_PREWRITE); + + cmd.opcode = htobe32(GVE_ADMINQ_REGISTER_PAGE_LIST); + cmd.reg_page_list = (struct gve_adminq_register_page_list) { + .page_list_id = htobe32(qpl->id), + .num_pages = htobe32(num_entries), + .page_address_list_addr = htobe64(dma.bus_addr), + .page_size = htobe64(PAGE_SIZE), + }; + + err = gve_adminq_execute_cmd(priv, &cmd); + gve_dma_free_coherent(&dma); + return (err); +} + +int +gve_adminq_unregister_page_list(struct gve_priv *priv, uint32_t page_list_id) +{ + struct gve_adminq_command cmd = (struct gve_adminq_command){}; + + cmd.opcode = htobe32(GVE_ADMINQ_UNREGISTER_PAGE_LIST); + cmd.unreg_page_list = (struct gve_adminq_unregister_page_list) { + .page_list_id = htobe32(page_list_id), + }; + + return (gve_adminq_execute_cmd(priv, &cmd)); +} + +#define GVE_NTFY_BLK_BASE_MSIX_IDX 0 +int +gve_adminq_configure_device_resources(struct gve_priv *priv) +{ + struct gve_adminq_command aq_cmd = (struct gve_adminq_command){}; + + bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, + BUS_DMASYNC_PREREAD); + bus_dmamap_sync(priv->counter_array_mem.tag, + priv->counter_array_mem.map, BUS_DMASYNC_PREREAD); + + aq_cmd.opcode = htobe32(GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES); + aq_cmd.configure_device_resources = + (struct gve_adminq_configure_device_resources) { + .counter_array = htobe64(priv->counter_array_mem.bus_addr), + .irq_db_addr = htobe64(priv->irqs_db_mem.bus_addr), + .num_counters = htobe32(priv->num_event_counters), + .num_irq_dbs = htobe32(priv->num_queues), + .irq_db_stride = htobe32(sizeof(struct gve_irq_db)), + .ntfy_blk_msix_base_idx = htobe32(GVE_NTFY_BLK_BASE_MSIX_IDX), + .queue_format = priv->queue_format, + }; + + return (gve_adminq_execute_cmd(priv, &aq_cmd)); +} + +int +gve_adminq_deconfigure_device_resources(struct gve_priv *priv) +{ + struct gve_adminq_command aq_cmd = (struct gve_adminq_command){}; + + aq_cmd.opcode = htobe32(GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES); + return (gve_adminq_execute_cmd(priv, &aq_cmd)); +} + +int +gve_adminq_verify_driver_compatibility(struct 
gve_priv *priv, + uint64_t driver_info_len, + vm_paddr_t driver_info_addr) +{ + struct gve_adminq_command aq_cmd = (struct gve_adminq_command){}; + + aq_cmd.opcode = htobe32(GVE_ADMINQ_VERIFY_DRIVER_COMPATIBILITY); + aq_cmd.verify_driver_compatibility = (struct gve_adminq_verify_driver_compatibility) { + .driver_info_len = htobe64(driver_info_len), + .driver_info_addr = htobe64(driver_info_addr), + }; + + return (gve_adminq_execute_cmd(priv, &aq_cmd)); +} + +int +gve_adminq_alloc(struct gve_priv *priv) +{ + int rc; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_ADMINQ_OK)) + return (0); + + if (priv->aq_mem.cpu_addr == NULL) { + rc = gve_dma_alloc_coherent(priv, ADMINQ_SIZE, ADMINQ_SIZE, + &priv->aq_mem); + if (rc != 0) { + device_printf(priv->dev, "Failed to allocate admin queue mem\n"); + return (rc); + } + } + + priv->adminq = priv->aq_mem.cpu_addr; + priv->adminq_bus_addr = priv->aq_mem.bus_addr; + + if (priv->adminq == NULL) + return (ENOMEM); + + priv->adminq_mask = ADMINQ_SLOTS - 1; + priv->adminq_prod_cnt = 0; + priv->adminq_cmd_fail = 0; + priv->adminq_timeouts = 0; + priv->adminq_describe_device_cnt = 0; + priv->adminq_cfg_device_resources_cnt = 0; + priv->adminq_register_page_list_cnt = 0; + priv->adminq_unregister_page_list_cnt = 0; + priv->adminq_create_tx_queue_cnt = 0; + priv->adminq_create_rx_queue_cnt = 0; + priv->adminq_destroy_tx_queue_cnt = 0; + priv->adminq_destroy_rx_queue_cnt = 0; + priv->adminq_dcfg_device_resources_cnt = 0; + priv->adminq_set_driver_parameter_cnt = 0; + + gve_reg_bar_write_4(priv, GVE_REG_ADMINQ_ADDR, + priv->adminq_bus_addr / ADMINQ_SIZE); + + gve_set_state_flag(priv, GVE_STATE_FLAG_ADMINQ_OK); + return (0); +} + +void +gve_release_adminq(struct gve_priv *priv) +{ + if (!gve_get_state_flag(priv, GVE_STATE_FLAG_ADMINQ_OK)) + return; + + gve_reg_bar_write_4(priv, GVE_REG_ADMINQ_ADDR, 0); + while (gve_reg_bar_read_4(priv, GVE_REG_ADMINQ_ADDR)) { + device_printf(priv->dev, "Waiting until admin queue is released.\n"); + pause("gve release adminq", GVE_ADMINQ_SLEEP_LEN_MS); + } + + gve_dma_free_coherent(&priv->aq_mem); + priv->aq_mem = (struct gve_dma_handle){}; + priv->adminq = 0; + priv->adminq_bus_addr = 0; + + gve_clear_state_flag(priv, GVE_STATE_FLAG_ADMINQ_OK); + + if (bootverbose) + device_printf(priv->dev, "Admin queue released\n"); +} + +static int +gve_adminq_parse_err(struct gve_priv *priv, uint32_t opcode, uint32_t status) +{ + if (status != GVE_ADMINQ_COMMAND_PASSED && + status != GVE_ADMINQ_COMMAND_UNSET) { + device_printf(priv->dev, "AQ command(%u): failed with status %d\n", opcode, status); + priv->adminq_cmd_fail++; + } + switch (status) { + case GVE_ADMINQ_COMMAND_PASSED: + return (0); + + case GVE_ADMINQ_COMMAND_UNSET: + device_printf(priv->dev, + "AQ command(%u): err and status both unset, this should not be possible.\n", + opcode); + return (EINVAL); + + case GVE_ADMINQ_COMMAND_ERROR_ABORTED: + case GVE_ADMINQ_COMMAND_ERROR_CANCELLED: + case GVE_ADMINQ_COMMAND_ERROR_DATALOSS: + case GVE_ADMINQ_COMMAND_ERROR_FAILED_PRECONDITION: + case GVE_ADMINQ_COMMAND_ERROR_UNAVAILABLE: + return (EAGAIN); + + case GVE_ADMINQ_COMMAND_ERROR_ALREADY_EXISTS: + case GVE_ADMINQ_COMMAND_ERROR_INTERNAL_ERROR: + case GVE_ADMINQ_COMMAND_ERROR_INVALID_ARGUMENT: + case GVE_ADMINQ_COMMAND_ERROR_NOT_FOUND: + case GVE_ADMINQ_COMMAND_ERROR_OUT_OF_RANGE: + case GVE_ADMINQ_COMMAND_ERROR_UNKNOWN_ERROR: + return (EINVAL); + + case GVE_ADMINQ_COMMAND_ERROR_DEADLINE_EXCEEDED: + return (ETIMEDOUT); + + case GVE_ADMINQ_COMMAND_ERROR_PERMISSION_DENIED: + case 
GVE_ADMINQ_COMMAND_ERROR_UNAUTHENTICATED: + return (EACCES); + + case GVE_ADMINQ_COMMAND_ERROR_RESOURCE_EXHAUSTED: + return (ENOMEM); + + case GVE_ADMINQ_COMMAND_ERROR_UNIMPLEMENTED: + return (EOPNOTSUPP); + + default: + device_printf(priv->dev, "AQ command(%u): unknown status code %d\n", + opcode, status); + return (EINVAL); + } +} + +static void +gve_adminq_kick_cmd(struct gve_priv *priv, uint32_t prod_cnt) +{ + gve_reg_bar_write_4(priv, ADMINQ_DOORBELL, prod_cnt); + +} + +static bool +gve_adminq_wait_for_cmd(struct gve_priv *priv, uint32_t prod_cnt) +{ + int i; + + for (i = 0; i < GVE_MAX_ADMINQ_EVENT_COUNTER_CHECK; i++) { + if (gve_reg_bar_read_4(priv, ADMINQ_EVENT_COUNTER) == prod_cnt) + return (true); + pause("gve adminq cmd", GVE_ADMINQ_SLEEP_LEN_MS); + } + + return (false); +} + +/* + * Flushes all AQ commands currently queued and waits for them to complete. + * If there are failures, it will return the first error. + */ +static int +gve_adminq_kick_and_wait(struct gve_priv *priv) +{ + struct gve_adminq_command *cmd; + uint32_t status, err; + uint32_t tail, head; + uint32_t opcode; + int i; + + tail = gve_reg_bar_read_4(priv, ADMINQ_EVENT_COUNTER); + head = priv->adminq_prod_cnt; + + gve_adminq_kick_cmd(priv, head); + if (!gve_adminq_wait_for_cmd(priv, head)) { + device_printf(priv->dev, "AQ commands timed out, need to reset AQ\n"); + priv->adminq_timeouts++; + return (ENOTRECOVERABLE); + } + bus_dmamap_sync( + priv->aq_mem.tag, priv->aq_mem.map, BUS_DMASYNC_POSTREAD); + + for (i = tail; i < head; i++) { + cmd = &priv->adminq[i & priv->adminq_mask]; + status = be32toh(cmd->status); + opcode = be32toh(cmd->opcode); + err = gve_adminq_parse_err(priv, opcode, status); + if (err != 0) + return (err); + } + + return (0); +} + +/* + * This function is not threadsafe - the caller is responsible for any + * necessary locks. + */ +static int +gve_adminq_issue_cmd(struct gve_priv *priv, struct gve_adminq_command *cmd_orig) +{ + struct gve_adminq_command *cmd; + uint32_t opcode; + uint32_t tail; + int err; + + tail = gve_reg_bar_read_4(priv, ADMINQ_EVENT_COUNTER); + + /* Check if next command will overflow the buffer. */ + if ((priv->adminq_prod_cnt - tail) > priv->adminq_mask) { + /* Flush existing commands to make room. */ + err = gve_adminq_kick_and_wait(priv); + if (err != 0) + return (err); + + /* Retry. */ + tail = gve_reg_bar_read_4(priv, ADMINQ_EVENT_COUNTER); + if ((priv->adminq_prod_cnt - tail) > priv->adminq_mask) { + /* + * This should never happen. We just flushed the + * command queue so there should be enough space. 
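+ * (Sizing note, assuming 4 KiB pages: ADMINQ_SIZE /
+ * sizeof(struct gve_adminq_command) = 4096 / 64 = 64 slots, so
+ * adminq_prod_cnt - tail can still exceed adminq_mask here only if
+ * all 64 slots somehow remain outstanding after the flush above.)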
+ */ + return (ENOMEM); + } + } + + cmd = &priv->adminq[priv->adminq_prod_cnt & priv->adminq_mask]; + priv->adminq_prod_cnt++; + + memcpy(cmd, cmd_orig, sizeof(*cmd_orig)); + + bus_dmamap_sync( + priv->aq_mem.tag, priv->aq_mem.map, BUS_DMASYNC_PREWRITE); + + opcode = be32toh(cmd->opcode); + + switch (opcode) { + case GVE_ADMINQ_DESCRIBE_DEVICE: + priv->adminq_describe_device_cnt++; + break; + + case GVE_ADMINQ_CONFIGURE_DEVICE_RESOURCES: + priv->adminq_cfg_device_resources_cnt++; + break; + + case GVE_ADMINQ_REGISTER_PAGE_LIST: + priv->adminq_register_page_list_cnt++; + break; + + case GVE_ADMINQ_UNREGISTER_PAGE_LIST: + priv->adminq_unregister_page_list_cnt++; + break; + + case GVE_ADMINQ_CREATE_TX_QUEUE: + priv->adminq_create_tx_queue_cnt++; + break; + + case GVE_ADMINQ_CREATE_RX_QUEUE: + priv->adminq_create_rx_queue_cnt++; + break; + + case GVE_ADMINQ_DESTROY_TX_QUEUE: + priv->adminq_destroy_tx_queue_cnt++; + break; + + case GVE_ADMINQ_DESTROY_RX_QUEUE: + priv->adminq_destroy_rx_queue_cnt++; + break; + + case GVE_ADMINQ_DECONFIGURE_DEVICE_RESOURCES: + priv->adminq_dcfg_device_resources_cnt++; + break; + + case GVE_ADMINQ_SET_DRIVER_PARAMETER: + priv->adminq_set_driver_parameter_cnt++; + break; + + case GVE_ADMINQ_VERIFY_DRIVER_COMPATIBILITY: + priv->adminq_verify_driver_compatibility_cnt++; + break; + + default: + device_printf(priv->dev, "Unknown AQ command opcode %d\n", opcode); + } + + return (0); +} + +/* + * This function is not threadsafe - the caller is responsible for any + * necessary locks. + * The caller is also responsible for making sure there are no commands + * waiting to be executed. + */ +static int +gve_adminq_execute_cmd(struct gve_priv *priv, struct gve_adminq_command *cmd_orig) +{ + uint32_t tail, head; + int err; + + tail = gve_reg_bar_read_4(priv, ADMINQ_EVENT_COUNTER); + head = priv->adminq_prod_cnt; + + if (tail != head) + return (EINVAL); + err = gve_adminq_issue_cmd(priv, cmd_orig); + if (err != 0) + return (err); + return (gve_adminq_kick_and_wait(priv)); +} diff --git a/sys/dev/gve/gve_desc.h b/sys/dev/gve/gve_desc.h new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_desc.h @@ -0,0 +1,151 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
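
[Illustrative sketch, not part of the patch] The admin queue code above relies on free-running 32-bit counters: adminq_prod_cnt only ever increments, the device's event counter chases it, and adminq_mask (ADMINQ_SLOTS - 1) turns either counter into a slot index. The fullness check "(priv->adminq_prod_cnt - tail) > priv->adminq_mask" stays correct across wraparound because the subtraction is unsigned. A minimal userspace sketch of that arithmetic; the slot count below is arbitrary (the real ADMINQ_SLOTS/ADMINQ_SIZE come from gve_adminq.h, which is not in this hunk), and outstanding() is a name invented for the demo:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SLOTS 64u              /* must be a power of two, like ADMINQ_SLOTS */
#define MASK  (SLOTS - 1u)

/* Number of commands issued but not yet completed by the device. */
static uint32_t
outstanding(uint32_t prod_cnt, uint32_t event_cnt)
{
	return (prod_cnt - event_cnt);  /* unsigned math survives wraparound */
}

int
main(void)
{
	uint32_t prod = UINT32_MAX - 2;  /* producer about to wrap */
	uint32_t tail = UINT32_MAX - 2;  /* device has caught up */
	int i;

	for (i = 0; i < 10; i++) {
		/* Mirrors the "would the next command overflow?" test. */
		assert(outstanding(prod, tail) <= MASK);
		printf("cmd goes in slot %u\n", prod & MASK);
		prod++;          /* gve_adminq_issue_cmd: priv->adminq_prod_cnt++ */
	}
	tail = prod;             /* gve_adminq_kick_and_wait: device consumed all */
	printf("outstanding after flush: %u\n", outstanding(prod, tail));
	return (0);
}
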
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _GVE_DESC_H_ +#define _GVE_DESC_H_ + +#include "gve_plat.h" + +/* + * A note on seg_addrs + * + * Base addresses encoded in seg_addr are not assumed to be physical + * addresses. The ring format assumes these come from some linear address + * space. This could be physical memory, kernel virtual memory, user virtual + * memory. + * + * Each queue is assumed to be associated with a single such linear + * address space to ensure a consistent meaning for seg_addrs posted to its + * rings. + */ +struct gve_tx_pkt_desc { + uint8_t type_flags; /* desc type is lower 4 bits, flags upper */ + uint8_t l4_csum_offset; /* relative offset of L4 csum word */ + uint8_t l4_hdr_offset; /* Offset of start of L4 headers in packet */ + uint8_t desc_cnt; /* Total descriptors for this packet */ + __be16 len; /* Total length of this packet (in bytes) */ + __be16 seg_len; /* Length of this descriptor's segment */ + __be64 seg_addr; /* Base address (see note) of this segment */ +} __packed; + +struct gve_tx_mtd_desc { + uint8_t type_flags; /* type is lower 4 bits, subtype upper */ + uint8_t path_state; /* state is lower 4 bits, hash type upper */ + __be16 reserved0; + __be32 path_hash; + __be64 reserved1; +} __packed; + +struct gve_tx_seg_desc { + uint8_t type_flags; /* type is lower 4 bits, flags upper */ + uint8_t l3_offset; /* TSO: 2 byte units to start of IPH */ + __be16 reserved; + __be16 mss; /* TSO MSS */ + __be16 seg_len; + __be64 seg_addr; +} __packed; + +/* GVE Transmit Descriptor Types */ +#define GVE_TXD_STD (0x0 << 4) /* Std with Host Address */ +#define GVE_TXD_TSO (0x1 << 4) /* TSO with Host Address */ +#define GVE_TXD_SEG (0x2 << 4) /* Seg with Host Address */ +#define GVE_TXD_MTD (0x3 << 4) /* Metadata */ + +/* GVE Transmit Descriptor Flags for Std Pkts */ +#define GVE_TXF_L4CSUM BIT(0) /* Need csum offload */ +#define GVE_TXF_TSTAMP BIT(2) /* Timestamp required */ + +/* GVE Transmit Descriptor Flags for TSO Segs */ +#define GVE_TXSF_IPV6 BIT(1) /* IPv6 TSO */ + +/* GVE Transmit Descriptor Options for MTD Segs */ +#define GVE_MTD_SUBTYPE_PATH 0 + +#define GVE_MTD_PATH_STATE_DEFAULT 0 +#define GVE_MTD_PATH_STATE_TIMEOUT 1 +#define GVE_MTD_PATH_STATE_CONGESTION 2 +#define GVE_MTD_PATH_STATE_RETRANSMIT 3 + +#define GVE_MTD_PATH_HASH_NONE (0x0 << 4) +#define GVE_MTD_PATH_HASH_L4 (0x1 << 4) + +/* + * GVE Receive Packet Descriptor + * + * The start of an ethernet packet comes 2 bytes into the rx buffer. + * gVNIC adds this padding so that both the DMA and the L3/4 protocol header + * access is aligned. 
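
[Illustrative sketch, not part of the patch] The transmit descriptors in gve_desc.h are plain packed structs whose multi-byte fields are big-endian; the driver composes type_flags from the GVE_TXD_* and GVE_TXF_* macros and converts lengths and seg_addr with htobe16/htobe64. A standalone example that fills one standard descriptor; the tx_pkt_desc struct here is a local mirror of gve_tx_pkt_desc for illustration only, the field values are made up, and <sys/endian.h> is the FreeBSD header providing the byte-order helpers:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/endian.h>   /* htobe16/htobe64/be16toh on FreeBSD */

#define BIT(nr)        (1UL << (nr))
#define GVE_TXD_STD    (0x0 << 4)
#define GVE_TXF_L4CSUM BIT(0)

/* Local mirror of struct gve_tx_pkt_desc, for illustration only. */
struct tx_pkt_desc {
	uint8_t  type_flags;
	uint8_t  l4_csum_offset;
	uint8_t  l4_hdr_offset;
	uint8_t  desc_cnt;
	uint16_t len;       /* __be16 in the real struct */
	uint16_t seg_len;   /* __be16 */
	uint64_t seg_addr;  /* __be64 */
} __attribute__((__packed__));

int
main(void)
{
	struct tx_pkt_desc d;

	memset(&d, 0, sizeof(d));
	d.type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM;  /* std desc, csum offload */
	d.desc_cnt = 1;                /* single-fragment packet */
	d.len = htobe16(1514);         /* whole packet length */
	d.seg_len = htobe16(1514);     /* this fragment's length */
	d.seg_addr = htobe64(0x1000);  /* linear-space address, see the seg_addr note */
	printf("type_flags=0x%02x len=%u\n", d.type_flags, be16toh(d.len));
	return (0);
}
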
+ */ +#define GVE_RX_PAD 2 + +struct gve_rx_desc { + uint8_t padding[48]; + __be32 rss_hash; /* Receive-side scaling hash (Toeplitz for gVNIC) */ + __be16 mss; + __be16 reserved; /* Reserved to zero */ + uint8_t hdr_len; /* Header length (L2-L4) including padding */ + uint8_t hdr_off; /* 64-byte-scaled offset into RX_DATA entry */ + uint16_t csum; /* 1's-complement partial checksum of L3+ bytes */ + __be16 len; /* Length of the received packet */ + __be16 flags_seq; /* Flags [15:3] and sequence number [2:0] (1-7) */ +} __packed; +_Static_assert(sizeof(struct gve_rx_desc) == 64, "gve: bad desc struct length"); + +/* + * If the device supports raw dma addressing then the addr in data slot is + * the dma address of the buffer. + * If the device only supports registered segments then the addr is a byte + * offset into the registered segment (an ordered list of pages) where the + * buffer is. + */ +union gve_rx_data_slot { + __be64 qpl_offset; + __be64 addr; +}; + +/* GVE Recive Packet Descriptor Seq No */ +#define GVE_SEQNO(x) (be16toh(x) & 0x7) + +/* GVE Recive Packet Descriptor Flags */ +#define GVE_RXFLG(x) htobe16(1 << (3 + (x))) +#define GVE_RXF_FRAG GVE_RXFLG(3) /* IP Fragment */ +#define GVE_RXF_IPV4 GVE_RXFLG(4) /* IPv4 */ +#define GVE_RXF_IPV6 GVE_RXFLG(5) /* IPv6 */ +#define GVE_RXF_TCP GVE_RXFLG(6) /* TCP Packet */ +#define GVE_RXF_UDP GVE_RXFLG(7) /* UDP Packet */ +#define GVE_RXF_ERR GVE_RXFLG(8) /* Packet Error Detected */ +#define GVE_RXF_PKT_CONT GVE_RXFLG(10) /* Multi Fragment RX packet */ + +/* GVE IRQ */ +#define GVE_IRQ_ACK BIT(31) +#define GVE_IRQ_MASK BIT(30) +#define GVE_IRQ_EVENT BIT(29) + +#endif /* _GVE_DESC_H_ */ diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_main.c @@ -0,0 +1,853 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
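
[Illustrative sketch, not part of the patch] The receive path tests flags directly against the big-endian flags_seq word: GVE_RXFLG() pre-swaps each mask with htobe16, so no per-packet conversion is needed, while GVE_SEQNO() converts once to pull the 3-bit sequence number out of the low bits. A standalone demo of that bit layout; the flags_seq value is made up and the macros are copied from the header above:

#include <stdint.h>
#include <stdio.h>
#include <sys/endian.h>   /* htobe16/be16toh on FreeBSD */

#define GVE_SEQNO(x)  (be16toh(x) & 0x7)
#define GVE_RXFLG(x)  htobe16(1 << (3 + (x)))
#define GVE_RXF_IPV4  GVE_RXFLG(4)
#define GVE_RXF_TCP   GVE_RXFLG(6)

int
main(void)
{
	/* A made-up flags_seq value: TCP over IPv4, sequence number 5. */
	uint16_t flags_seq = (GVE_RXF_IPV4 | GVE_RXF_TCP) | htobe16(5);

	/* Both operands are big-endian, so the masks apply directly. */
	printf("tcp:  %s\n", (flags_seq & GVE_RXF_TCP) ? "yes" : "no");
	printf("ipv4: %s\n", (flags_seq & GVE_RXF_IPV4) ? "yes" : "no");
	printf("seq:  %u\n", GVE_SEQNO(flags_seq));  /* sequence lives in bits 0-2 */
	return (0);
}
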
+ */ +#include "gve.h" +#include "gve_adminq.h" + +#define GVE_DRIVER_VERSION "GVE-FBSD-1.0.0\n" +#define GVE_VERSION_MAJOR 0 +#define GVE_VERSION_MINOR 9 +#define GVE_VERSION_SUB 0 + +#define GVE_DEFAULT_RX_COPYBREAK 256 + +struct sx gve_global_lock; + +static int +gve_verify_driver_compatibility(struct gve_priv *priv) +{ + int err; + struct gve_driver_info *driver_info; + struct gve_dma_handle driver_info_mem; + + err = gve_dma_alloc_coherent(priv, sizeof(struct gve_driver_info), + PAGE_SIZE, &driver_info_mem); + + if (err != 0) + return (ENOMEM); + + driver_info = driver_info_mem.cpu_addr; + + *driver_info = (struct gve_driver_info) { + .os_type = 3, /* Freebsd */ + .driver_major = GVE_VERSION_MAJOR, + .driver_minor = GVE_VERSION_MINOR, + .driver_sub = GVE_VERSION_SUB, + .os_version_major = htobe32(FBSD_VERSION_MAJOR), + .os_version_minor = htobe32(FBSD_VERSION_MINOR), + .os_version_sub = htobe32(FBSD_VERSION_PATCH), + .driver_capability_flags = { + htobe64(GVE_DRIVER_CAPABILITY_FLAGS1), + htobe64(GVE_DRIVER_CAPABILITY_FLAGS2), + htobe64(GVE_DRIVER_CAPABILITY_FLAGS3), + htobe64(GVE_DRIVER_CAPABILITY_FLAGS4), + }, + }; + + snprintf(driver_info->os_version_str1, sizeof(driver_info->os_version_str1), + "FreeBSD %u", __FreeBSD_version); + + bus_dmamap_sync(driver_info_mem.tag, driver_info_mem.map, + BUS_DMASYNC_PREREAD); + + err = gve_adminq_verify_driver_compatibility(priv, + sizeof(struct gve_driver_info), driver_info_mem.bus_addr); + + /* It's ok if the device doesn't support this */ + if (err == EOPNOTSUPP) + err = 0; + + gve_dma_free_coherent(&driver_info_mem); + + return (err); +} + +static int +gve_up(struct gve_priv *priv) +{ + if_t ifp = priv->ifp; + int err; + + GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); + + if (device_is_attached(priv->dev) == 0) { + device_printf(priv->dev, "Cannot bring the iface up when detached\n"); + return (ENXIO); + } + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) + return (0); + + if_clearhwassist(ifp); + if (if_getcapenable(ifp) & IFCAP_TXCSUM) + if_sethwassistbits(ifp, CSUM_TCP | CSUM_UDP, 0); + if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) + if_sethwassistbits(ifp, CSUM_IP6_TCP | CSUM_IP6_UDP, 0); + if (if_getcapenable(ifp) & IFCAP_TSO4) + if_sethwassistbits(ifp, CSUM_IP_TSO, 0); + if (if_getcapenable(ifp) & IFCAP_TSO6) + if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); + + err = gve_register_qpls(priv); + if (err != 0) + goto reset; + + err = gve_create_rx_rings(priv); + if (err != 0) + goto reset; + + err = gve_create_tx_rings(priv); + if (err != 0) + goto reset; + + if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); + + if (!gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { + if_link_state_change(ifp, LINK_STATE_UP); + gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); + } + + gve_unmask_all_queue_irqs(priv); + gve_set_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); + priv->interface_up_cnt++; + return (0); + +reset: + gve_schedule_reset(priv); + return (err); +} + +static void +gve_down(struct gve_priv *priv) +{ + GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); + + if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) + return; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { + if_link_state_change(priv->ifp, LINK_STATE_DOWN); + gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); + } + + if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); + + if (gve_destroy_rx_rings(priv) != 0) + goto reset; + + if (gve_destroy_tx_rings(priv) != 0) + goto reset; + + if (gve_unregister_qpls(priv) != 0) + goto reset; + + 
gve_mask_all_queue_irqs(priv); + gve_clear_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); + priv->interface_down_cnt++; + return; + +reset: + gve_schedule_reset(priv); +} + +static int +gve_set_mtu(if_t ifp, uint32_t new_mtu) +{ + struct gve_priv *priv = if_getsoftc(ifp); + int err; + + if ((new_mtu > priv->max_mtu) || (new_mtu < ETHERMIN)) { + device_printf(priv->dev, "Invalid new MTU setting. new mtu: %d max mtu: %d min mtu: %d\n", + new_mtu, priv->max_mtu, ETHERMIN); + return (EINVAL); + } + + err = gve_adminq_set_mtu(priv, new_mtu); + if (err == 0) { + if (bootverbose) + device_printf(priv->dev, "MTU set to %d\n", new_mtu); + if_setmtu(ifp, new_mtu); + } else { + device_printf(priv->dev, "Failed to set MTU to %d\n", new_mtu); + } + + return (err); +} + +static void +gve_init(void *arg) +{ + struct gve_priv *priv = (struct gve_priv *)arg; + + if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) { + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + gve_up(priv); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + } +} + +static int +gve_ioctl(if_t ifp, u_long command, caddr_t data) +{ + struct gve_priv *priv; + struct ifreq *ifr; + int rc = 0; + + priv = if_getsoftc(ifp); + ifr = (struct ifreq *)data; + + switch (command) { + case SIOCSIFMTU: + if (if_getmtu(ifp) == ifr->ifr_mtu) + break; + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + gve_down(priv); + gve_set_mtu(ifp, ifr->ifr_mtu); + rc = gve_up(priv); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + break; + + case SIOCSIFFLAGS: + if ((if_getflags(ifp) & IFF_UP) != 0) { + if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + rc = gve_up(priv); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + } + } else { + if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) { + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + gve_down(priv); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + } + } + break; + + case SIOCSIFCAP: + if (ifr->ifr_reqcap == if_getcapenable(ifp)) + break; + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + gve_down(priv); + if_setcapenable(ifp, ifr->ifr_reqcap); + rc = gve_up(priv); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + break; + + case SIOCSIFMEDIA: + /* FALLTHROUGH */ + case SIOCGIFMEDIA: + rc = ifmedia_ioctl(ifp, ifr, &priv->media, command); + break; + + default: + rc = ether_ioctl(ifp, command, data); + break; + } + + return (rc); +} + +static int +gve_media_change(if_t ifp) +{ + struct gve_priv *priv = if_getsoftc(ifp); + + device_printf(priv->dev, "Media change not supported\n"); + return (0); +} + +static void +gve_media_status(if_t ifp, struct ifmediareq *ifmr) +{ + struct gve_priv *priv = if_getsoftc(ifp); + + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + + ifmr->ifm_status = IFM_AVALID; + ifmr->ifm_active = IFM_ETHER; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { + ifmr->ifm_status |= IFM_ACTIVE; + ifmr->ifm_active |= IFM_AUTO; + } else { + ifmr->ifm_active |= IFM_NONE; + } + + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); +} + +static uint64_t +gve_get_counter(if_t ifp, ift_counter cnt) +{ + struct gve_priv *priv; + uint64_t rpackets = 0; + uint64_t tpackets = 0; + uint64_t rbytes = 0; + uint64_t tbytes = 0; + uint64_t rx_dropped_pkt = 0; + uint64_t tx_dropped_pkt = 0; + + priv = if_getsoftc(ifp); + + gve_accum_stats(priv, &rpackets, &rbytes, &rx_dropped_pkt, &tpackets, + &tbytes, &tx_dropped_pkt); + + switch (cnt) { + case IFCOUNTER_IPACKETS: + return (rpackets); + + case IFCOUNTER_OPACKETS: + return (tpackets); + + case IFCOUNTER_IBYTES: + return (rbytes); + + case 
IFCOUNTER_OBYTES: + return (tbytes); + + case IFCOUNTER_IQDROPS: + return (rx_dropped_pkt); + + case IFCOUNTER_OQDROPS: + return (tx_dropped_pkt); + + default: + return (if_get_counter_default(ifp, cnt)); + } +} + +static int +gve_setup_ifnet(device_t dev, struct gve_priv *priv) +{ + int caps = 0; + if_t ifp; + + ifp = priv->ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) { + device_printf(priv->dev, "Failed to allocate ifnet struct\n"); + return (ENXIO); + } + + if_initname(ifp, device_get_name(dev), device_get_unit(dev)); + if_setsoftc(ifp, priv); + if_setdev(ifp, dev); + if_setinitfn(ifp, gve_init); + if_setioctlfn(ifp, gve_ioctl); + if_settransmitfn(ifp, gve_xmit_ifp); + if_setqflushfn(ifp, gve_qflush); + +#if __FreeBSD_version >= 1400086 + if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); +#else + if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | IFF_KNOWSEPOCH); +#endif + + ifmedia_init(&priv->media, IFM_IMASK, gve_media_change, gve_media_status); + if_setgetcounterfn(ifp, gve_get_counter); + + caps = IFCAP_RXCSUM | + IFCAP_TXCSUM | + IFCAP_TXCSUM_IPV6 | + IFCAP_TSO | + IFCAP_LRO; + + if ((priv->supported_features & GVE_SUP_JUMBO_FRAMES_MASK) != 0) + caps |= IFCAP_JUMBO_MTU; + + if_setcapabilities(ifp, caps); + if_setcapenable(ifp, caps); + + if (bootverbose) + device_printf(priv->dev, "Setting initial MTU to %d\n", priv->max_mtu); + if_setmtu(ifp, priv->max_mtu); + + ether_ifattach(ifp, priv->mac); + + ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); + ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO); + + return (0); +} + +static int +gve_alloc_counter_array(struct gve_priv *priv) +{ + int err; + + err = gve_dma_alloc_coherent(priv, sizeof(uint32_t) * priv->num_event_counters, + PAGE_SIZE, &priv->counter_array_mem); + if (err != 0) + return (err); + + priv->counters = priv->counter_array_mem.cpu_addr; + return (0); +} + +static void +gve_free_counter_array(struct gve_priv *priv) +{ + if (priv->counters != NULL) + gve_dma_free_coherent(&priv->counter_array_mem); + priv->counter_array_mem = (struct gve_dma_handle){}; +} + +static int +gve_alloc_irq_db_array(struct gve_priv *priv) +{ + int err; + + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_irq_db) * (priv->num_queues), PAGE_SIZE, + &priv->irqs_db_mem); + if (err != 0) + return (err); + + priv->irq_db_indices = priv->irqs_db_mem.cpu_addr; + return (0); +} + +static void +gve_free_irq_db_array(struct gve_priv *priv) +{ + if (priv->irq_db_indices != NULL) + gve_dma_free_coherent(&priv->irqs_db_mem); + priv->irqs_db_mem = (struct gve_dma_handle){}; +} + +static void +gve_free_rings(struct gve_priv *priv) +{ + gve_free_irqs(priv); + gve_free_tx_rings(priv); + gve_free_rx_rings(priv); + gve_free_qpls(priv); +} + +static int +gve_alloc_rings(struct gve_priv *priv) +{ + int err; + + err = gve_alloc_qpls(priv); + if (err != 0) + goto abort; + + err = gve_alloc_rx_rings(priv); + if (err != 0) + goto abort; + + err = gve_alloc_tx_rings(priv); + if (err != 0) + goto abort; + + err = gve_alloc_irqs(priv); + if (err != 0) + goto abort; + + return (0); + +abort: + gve_free_rings(priv); + return (err); +} + +static void +gve_deconfigure_resources(struct gve_priv *priv) +{ + int err; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) { + err = gve_adminq_deconfigure_device_resources(priv); + if (err != 0) { + device_printf(priv->dev, "Failed to deconfigure device resources: err=%d\n", + err); + return; + } + if (bootverbose) + device_printf(priv->dev, "Deconfigured device resources\n"); + 
gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); + } + + gve_free_irq_db_array(priv); + gve_free_counter_array(priv); +} + +static int +gve_configure_resources(struct gve_priv *priv) +{ + int err; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) + return (0); + + err = gve_alloc_counter_array(priv); + if (err != 0) + return (err); + + err = gve_alloc_irq_db_array(priv); + if (err != 0) + goto abort; + + err = gve_adminq_configure_device_resources(priv); + if (err != 0) { + device_printf(priv->dev, "Failed to configure device resources: err=%d\n", + err); + err = (ENXIO); + goto abort; + } + + gve_set_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); + if (bootverbose) + device_printf(priv->dev, "Configured device resources\n"); + return (0); + +abort: + gve_deconfigure_resources(priv); + return (err); +} + +static void +gve_set_queue_cnts(struct gve_priv *priv) +{ + priv->tx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_TX_QUEUES); + priv->rx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_RX_QUEUES); + priv->tx_cfg.num_queues = priv->tx_cfg.max_queues; + priv->rx_cfg.num_queues = priv->rx_cfg.max_queues; + + if (priv->default_num_queues > 0) { + priv->tx_cfg.num_queues = MIN(priv->default_num_queues, + priv->tx_cfg.num_queues); + priv->rx_cfg.num_queues = MIN(priv->default_num_queues, + priv->rx_cfg.num_queues); + } + + priv->num_queues = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues; + priv->mgmt_msix_idx = priv->num_queues; +} + +static int +gve_alloc_adminq_and_describe_device(struct gve_priv *priv) +{ + int err; + + if ((err = gve_adminq_alloc(priv)) != 0) + return (err); + + if ((err = gve_verify_driver_compatibility(priv)) != 0) { + device_printf(priv->dev, + "Failed to verify driver compatibility: err=%d\n", err); + goto abort; + } + + if ((err = gve_adminq_describe_device(priv)) != 0) + goto abort; + + gve_set_queue_cnts(priv); + + priv->num_registered_pages = 0; + return (0); + +abort: + gve_release_adminq(priv); + return (err); +} + +void +gve_schedule_reset(struct gve_priv *priv) +{ + if (gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) + return; + + device_printf(priv->dev, "Scheduling reset task!\n"); + gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); + taskqueue_enqueue(priv->service_tq, &priv->service_task); +} + +static void +gve_destroy(struct gve_priv *priv) +{ + gve_down(priv); + gve_deconfigure_resources(priv); + gve_release_adminq(priv); +} + +static void +gve_restore(struct gve_priv *priv) +{ + int err; + + err = gve_adminq_alloc(priv); + if (err != 0) + goto abort; + + err = gve_configure_resources(priv); + if (err != 0) + goto abort; + + err = gve_up(priv); + if (err != 0) + goto abort; + + return; + +abort: + device_printf(priv->dev, "Restore failed!\n"); + return; +} + +static void +gve_handle_reset(struct gve_priv *priv) +{ + if (!gve_get_state_flag(priv, GVE_STATE_FLAG_DO_RESET)) + return; + + gve_clear_state_flag(priv, GVE_STATE_FLAG_DO_RESET); + gve_set_state_flag(priv, GVE_STATE_FLAG_IN_RESET); + + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + + if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); + if_link_state_change(priv->ifp, LINK_STATE_DOWN); + gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); + + /* + * Releasing the adminq causes the NIC to destroy all resources + * registered with it, so by clearing the flags beneath we cause + * the subsequent gve_down call below to not attempt to tell the + * NIC to destroy these resources again. 
+ * + * The call to gve_down is needed in the first place to refresh + * the state and the DMA-able memory within each driver ring. + */ + gve_release_adminq(priv); + gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); + gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); + gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); + gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); + + gve_down(priv); + gve_restore(priv); + + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + + priv->reset_cnt++; + gve_clear_state_flag(priv, GVE_STATE_FLAG_IN_RESET); +} + +static void +gve_handle_link_status(struct gve_priv *priv) +{ + uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); + bool link_up = status & GVE_DEVICE_STATUS_LINK_STATUS; + + if (link_up == gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) + return; + + if (link_up) { + if (bootverbose) + device_printf(priv->dev, "Device link is up.\n"); + if_link_state_change(priv->ifp, LINK_STATE_UP); + gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); + } else { + device_printf(priv->dev, "Device link is down.\n"); + if_link_state_change(priv->ifp, LINK_STATE_DOWN); + gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); + } +} + +static void +gve_service_task(void *arg, int pending) +{ + struct gve_priv *priv = (struct gve_priv *)arg; + uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); + + if (((GVE_DEVICE_STATUS_RESET_MASK & status) != 0) && + !gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) { + device_printf(priv->dev, "Device requested reset\n"); + gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); + } + + gve_handle_reset(priv); + gve_handle_link_status(priv); +} + +static int +gve_probe(device_t dev) +{ + if (pci_get_vendor(dev) == PCI_VENDOR_ID_GOOGLE && + pci_get_device(dev) == PCI_DEV_ID_GVNIC) { + device_set_desc(dev, "gVNIC"); + return (BUS_PROBE_DEFAULT); + } + return (ENXIO); +} + +static void +gve_free_sys_res_mem(struct gve_priv *priv) +{ + if (priv->msix_table != NULL) + bus_release_resource(priv->dev, SYS_RES_MEMORY, + rman_get_rid(priv->msix_table), priv->msix_table); + + if (priv->db_bar != NULL) + bus_release_resource(priv->dev, SYS_RES_MEMORY, + rman_get_rid(priv->db_bar), priv->db_bar); + + if (priv->reg_bar != NULL) + bus_release_resource(priv->dev, SYS_RES_MEMORY, + rman_get_rid(priv->reg_bar), priv->reg_bar); +} + +static int +gve_attach(device_t dev) +{ + struct gve_priv *priv; + int rid; + int err; + + priv = device_get_softc(dev); + priv->dev = dev; + GVE_IFACE_LOCK_INIT(priv->gve_iface_lock); + + pci_enable_busmaster(dev); + + rid = PCIR_BAR(GVE_REGISTER_BAR); + priv->reg_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, + &rid, RF_ACTIVE); + if (priv->reg_bar == NULL) { + device_printf(dev, "Failed to allocate BAR0\n"); + err = ENXIO; + goto abort; + } + + rid = PCIR_BAR(GVE_DOORBELL_BAR); + priv->db_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, + &rid, RF_ACTIVE); + if (priv->db_bar == NULL) { + device_printf(dev, "Failed to allocate BAR2\n"); + err = ENXIO; + goto abort; + } + + rid = pci_msix_table_bar(priv->dev); + priv->msix_table = bus_alloc_resource_any(dev, SYS_RES_MEMORY, + &rid, RF_ACTIVE); + if (priv->msix_table == NULL) { + device_printf(dev, "Failed to allocate msix table\n"); + err = ENXIO; + goto abort; + } + + err = gve_alloc_adminq_and_describe_device(priv); + if (err != 0) + goto abort; + + err = gve_configure_resources(priv); + if (err != 0) + goto abort; + + err = gve_alloc_rings(priv); + if (err != 0) + goto abort; + + err = gve_setup_ifnet(dev, priv); + if (err != 0) + 
goto abort; + + priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK; + + bus_write_multi_1(priv->reg_bar, DRIVER_VERSION, GVE_DRIVER_VERSION, + sizeof(GVE_DRIVER_VERSION) - 1); + + TASK_INIT(&priv->service_task, 0, gve_service_task, priv); + priv->service_tq = taskqueue_create("gve service", M_WAITOK | M_ZERO, + taskqueue_thread_enqueue, &priv->service_tq); + taskqueue_start_threads(&priv->service_tq, 1, PI_NET, "%s service tq", + device_get_nameunit(priv->dev)); + + gve_setup_sysctl(priv); + + if (bootverbose) + device_printf(priv->dev, "Successfully attached %s", GVE_DRIVER_VERSION); + return (0); + +abort: + gve_free_rings(priv); + gve_deconfigure_resources(priv); + gve_release_adminq(priv); + gve_free_sys_res_mem(priv); + GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); + return (err); +} + +static int +gve_detach(device_t dev) +{ + struct gve_priv *priv = device_get_softc(dev); + if_t ifp = priv->ifp; + + ether_ifdetach(ifp); + + GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); + gve_destroy(priv); + GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); + + gve_free_rings(priv); + gve_free_sys_res_mem(priv); + GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); + + while (taskqueue_cancel(priv->service_tq, &priv->service_task, NULL)) + taskqueue_drain(priv->service_tq, &priv->service_task); + taskqueue_free(priv->service_tq); + + if_free(ifp); + return (bus_generic_detach(dev)); +} + +static device_method_t gve_methods[] = { + DEVMETHOD(device_probe, gve_probe), + DEVMETHOD(device_attach, gve_attach), + DEVMETHOD(device_detach, gve_detach), + DEVMETHOD_END +}; + +static driver_t gve_driver = { + "gve", + gve_methods, + sizeof(struct gve_priv) +}; + +#if __FreeBSD_version < 1301503 +static devclass_t gve_devclass; + +DRIVER_MODULE(gve, pci, gve_driver, gve_devclass, 0, 0); +#else +DRIVER_MODULE(gve, pci, gve_driver, 0, 0); +#endif diff --git a/sys/dev/gve/gve_plat.h b/sys/dev/gve/gve_plat.h new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_plat.h @@ -0,0 +1,94 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
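
[Illustrative sketch, not part of the patch] In gve_attach() the driver advertises its version by passing GVE_DRIVER_VERSION to bus_write_multi_1(), which pushes the buffer one byte at a time to the same register offset (DRIVER_VERSION is the byte-wide register in gve_register.h); sizeof(GVE_DRIVER_VERSION) - 1 drops the terminating NUL but keeps the trailing '\n'. A standalone sketch of the equivalent byte loop; write_byte() is a stand-in for the bus-space write, not a real API:

#include <stdio.h>
#include <string.h>

#define GVE_DRIVER_VERSION "GVE-FBSD-1.0.0\n"
#define DRIVER_VERSION 31        /* byte-wide register offset, from gve_register.h */

/* Stand-in for the bus-space byte write; in the driver this lands in BAR0. */
static void
write_byte(unsigned int offset, unsigned char byte)
{
	printf("reg[%u] <- 0x%02x\n", offset, byte);
}

int
main(void)
{
	const char *version = GVE_DRIVER_VERSION;
	size_t len = sizeof(GVE_DRIVER_VERSION) - 1;  /* drop the NUL, keep the '\n' */
	size_t i;

	/* bus_write_multi_1() sends every byte to the same offset. */
	for (i = 0; i < len; i++)
		write_byte(DRIVER_VERSION, (unsigned char)version[i]);
	return (0);
}
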
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _GVE_PLAT_FBSD_H +#define _GVE_PLAT_FBSD_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +typedef uint16_t __be16; +typedef uint32_t __be32; +typedef uint64_t __be64; +#define BIT(nr) (1UL << (nr)) + +#define FBSD_VERSION_MAJOR (__FreeBSD_version / 100000) +#define FBSD_VERSION_MINOR ((__FreeBSD_version / 1000) - FBSD_VERSION_MAJOR * 100) +#define FBSD_VERSION_PATCH (__FreeBSD_version - ((FBSD_VERSION_MAJOR * 100 + FBSD_VERSION_MINOR) * 1000)) + +#endif // _GVE_PLAT_FBSD_H diff --git a/sys/dev/gve/gve_qpl.c b/sys/dev/gve/gve_qpl.c new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_qpl.c @@ -0,0 +1,284 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
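
[Illustrative sketch, not part of the patch] gve_plat.h derives the OS version reported through gve_verify_driver_compatibility() by splitting __FreeBSD_version with integer division. A worked standalone example of that arithmetic; the macros are re-expressed here to take the version as a parameter, and the sample value is only an illustration:

#include <stdio.h>

/* Same arithmetic as the FBSD_VERSION_* macros, parameterized for the demo. */
#define VERSION_MAJOR(v) ((v) / 100000)
#define VERSION_MINOR(v) (((v) / 1000) - VERSION_MAJOR(v) * 100)
#define VERSION_PATCH(v) ((v) - ((VERSION_MAJOR(v) * 100 + VERSION_MINOR(v)) * 1000))

int
main(void)
{
	unsigned int v = 1400094;   /* example __FreeBSD_version-style value */

	/* 1400094 -> major 14, minor 0, patch 94 */
	printf("%u -> %u.%u.%u\n", v, VERSION_MAJOR(v), VERSION_MINOR(v),
	    VERSION_PATCH(v));
	return (0);
}
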
+ */ +#include + +#include "gve.h" +#include "gve_adminq.h" + +static MALLOC_DEFINE(M_GVE_QPL, "gve qpl", "gve qpl allocations"); + +static uint32_t +gve_num_tx_qpls(struct gve_priv *priv) +{ + if (priv->queue_format != GVE_GQI_QPL_FORMAT) + return (0); + + return (priv->tx_cfg.max_queues); +} + +static uint32_t +gve_num_rx_qpls(struct gve_priv *priv) +{ + if (priv->queue_format != GVE_GQI_QPL_FORMAT) + return (0); + + return (priv->rx_cfg.max_queues); +} + +static void +gve_free_qpl(struct gve_priv *priv, uint32_t id) +{ + struct gve_queue_page_list *qpl = &priv->qpls[id]; + int i; + + for (i = 0; i < qpl->num_dmas; i++) { + gve_dmamap_destroy(&qpl->dmas[i]); + } + + if (qpl->kva) { + pmap_qremove(qpl->kva, qpl->num_pages); + kva_free(qpl->kva, PAGE_SIZE * qpl->num_pages); + } + + for (i = 0; i < qpl->num_pages; i++) { + /* + * Free the page only if this is the last ref. + * Tx pages are known to have no other refs at + * this point, but Rx pages might still be in + * use by the networking stack, see gve_mextadd_free. + */ + if (vm_page_unwire_noq(qpl->pages[i])) { + if (!qpl->kva) { + pmap_qremove((vm_offset_t)qpl->dmas[i].cpu_addr, 1); + kva_free((vm_offset_t)qpl->dmas[i].cpu_addr, PAGE_SIZE); + } + vm_page_free(qpl->pages[i]); + } + + priv->num_registered_pages--; + } + + if (qpl->pages != NULL) + free(qpl->pages, M_GVE_QPL); + + if (qpl->dmas != NULL) + free(qpl->dmas, M_GVE_QPL); +} + +static int +gve_alloc_qpl(struct gve_priv *priv, uint32_t id, int npages, bool single_kva) +{ + struct gve_queue_page_list *qpl = &priv->qpls[id]; + int err; + int i; + + if (npages + priv->num_registered_pages > priv->max_registered_pages) { + device_printf(priv->dev, "Reached max number of registered pages %lu > %lu\n", + npages + priv->num_registered_pages, + priv->max_registered_pages); + return (EINVAL); + } + + qpl->id = id; + qpl->num_pages = 0; + qpl->num_dmas = 0; + + qpl->dmas = malloc(npages * sizeof(*qpl->dmas), M_GVE_QPL, + M_WAITOK | M_ZERO); + + qpl->pages = malloc(npages * sizeof(*qpl->pages), M_GVE_QPL, + M_WAITOK | M_ZERO); + + qpl->kva = 0; + if (single_kva) { + qpl->kva = kva_alloc(PAGE_SIZE * npages); + if (!qpl->kva) { + device_printf(priv->dev, "Failed to create the single kva for QPL %d\n", id); + err = ENOMEM; + goto abort; + } + } + + for (i = 0; i < npages; i++) { + qpl->pages[i] = vm_page_alloc_noobj(VM_ALLOC_WIRED | + VM_ALLOC_WAITOK | + VM_ALLOC_ZERO); + + if (!single_kva) { + qpl->dmas[i].cpu_addr = (void *)kva_alloc(PAGE_SIZE); + if (!qpl->dmas[i].cpu_addr) { + device_printf(priv->dev, "Failed to create kva for page %d in QPL %d", i, id); + err = ENOMEM; + goto abort; + } + pmap_qenter((vm_offset_t)qpl->dmas[i].cpu_addr, &(qpl->pages[i]), 1); + } else + qpl->dmas[i].cpu_addr = (void *)(qpl->kva + (PAGE_SIZE * i)); + + + qpl->num_pages++; + } + + if (single_kva) + pmap_qenter(qpl->kva, qpl->pages, npages); + + for (i = 0; i < npages; i++) { + err = gve_dmamap_create(priv, /*size=*/PAGE_SIZE, /*align=*/PAGE_SIZE, + &qpl->dmas[i]); + if (err != 0) { + device_printf(priv->dev, "Failed to dma-map page %d in QPL %d\n", i, id); + goto abort; + } + + qpl->num_dmas++; + priv->num_registered_pages++; + } + + return (0); + +abort: + gve_free_qpl(priv, id); + return (err); +} + +void +gve_free_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int i; + + if (num_qpls == 0) + return; + + if (priv->qpls != NULL) { + for (i = 0; i < num_qpls; i++) + gve_free_qpl(priv, i); + free(priv->qpls, M_GVE_QPL); + } +} + +int gve_alloc_qpls(struct 
gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int err; + int i; + + if (num_qpls == 0) + return (0); + + priv->qpls = malloc(num_qpls * sizeof(*priv->qpls), M_GVE_QPL, + M_WAITOK | M_ZERO); + + for (i = 0; i < gve_num_tx_qpls(priv); i++) { + err = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR, + /*single_kva=*/true); + if (err != 0) + goto abort; + } + + for (; i < num_qpls; i++) { + err = gve_alloc_qpl(priv, i, priv->rx_desc_cnt, /*single_kva=*/false); + if (err != 0) + goto abort; + } + + return (0); + +abort: + gve_free_qpls(priv); + return (err); +} + +static int +gve_unregister_n_qpls(struct gve_priv *priv, int n) +{ + int err; + int i; + + for (i = 0; i < n; i++) { + err = gve_adminq_unregister_page_list(priv, priv->qpls[i].id); + if (err != 0) { + device_printf(priv->dev, + "Failed to unregister qpl %d, err: %d\n", + priv->qpls[i].id, err); + } + } + + if (err != 0) + return (err); + + return (0); +} + +int +gve_register_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int err; + int i; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK)) + return (0); + + for (i = 0; i < num_qpls; i++) { + err = gve_adminq_register_page_list(priv, &priv->qpls[i]); + if (err != 0) { + device_printf(priv->dev, + "Failed to register qpl %d, err: %d\n", + priv->qpls[i].id, err); + goto abort; + } + } + + gve_set_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); + return (0); + +abort: + gve_unregister_n_qpls(priv, i); + return (err); +} + +int +gve_unregister_qpls(struct gve_priv *priv) +{ + int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int err; + + if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK)) + return (0); + + err = gve_unregister_n_qpls(priv, num_qpls); + if (err != 0) + return (err); + + gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); + return (0); +} diff --git a/sys/dev/gve/gve_register.h b/sys/dev/gve/gve_register.h new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_register.h @@ -0,0 +1,54 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
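
[Illustrative sketch, not part of the patch] gve_alloc_qpls() gives each tx queue a QPL of tx_desc_cnt / GVE_QPL_DIVISOR pages mapped under one contiguous KVA (the tx FIFO), and each rx queue a QPL of rx_desc_cnt pages, one page behind every rx slot; the page total is what gve_alloc_qpl() checks against max_registered_pages. A sketch of that budget; the descriptor counts and divisor below are placeholders, since the real GVE_QPL_DIVISOR lives in gve.h and the counts come from the DescribeDevice response, neither of which is in this hunk:

#include <stdio.h>

#define QPL_DIVISOR   2      /* placeholder; the driver's GVE_QPL_DIVISOR is in gve.h */
#define TX_DESC_CNT   256u   /* placeholder descriptor counts */
#define RX_DESC_CNT   256u
#define PAGE_BYTES    4096u

int
main(void)
{
	unsigned int tx_queues = 4, rx_queues = 4;
	unsigned int tx_pages = tx_queues * (TX_DESC_CNT / QPL_DIVISOR);
	unsigned int rx_pages = rx_queues * RX_DESC_CNT;  /* one page per rx slot */
	unsigned int total = tx_pages + rx_pages;

	/* This total must stay under max_registered_pages. */
	printf("tx pages: %u, rx pages: %u, total registered: %u (%u bytes)\n",
	    tx_pages, rx_pages, total, total * PAGE_BYTES);
	return (0);
}
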
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _GVE_REGISTER_H_ +#define _GVE_REGISTER_H_ + +/* Fixed Configuration Registers */ +#define DEVICE_STATUS 0 +#define DRIVER_STATUS 4 +#define MAX_TX_QUEUES 8 +#define MAX_RX_QUEUES 12 +#define ADMINQ_PFN 16 +#define ADMINQ_DOORBELL 20 +#define ADMINQ_EVENT_COUNTER 24 +#define RESERVED 28 +#define DRIVER_VERSION 31 +#define ADMINQ_BASE_ADDRESS_HI 32 +#define ADMINQ_BASE_ADDRESS_LO 36 +#define ADMINQ_LENGTH 40 + + +enum gve_device_status_flags { + GVE_DEVICE_STATUS_RESET_MASK = BIT(1), + GVE_DEVICE_STATUS_LINK_STATUS_MASK = BIT(2), +}; + +#endif /* _GVE_REGISTER_H_ */ diff --git a/sys/dev/gve/gve_rx.c b/sys/dev/gve/gve_rx.c new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_rx.c @@ -0,0 +1,684 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
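
[Illustrative sketch, not part of the patch] gve_register.h describes BAR0 as a fixed layout of mostly 32-bit registers with a single byte-wide DRIVER_VERSION register tucked in at offset 31, and DEVICE_STATUS carries the bits polled by gve_service_task() and gve_handle_link_status(). A standalone overlay of that layout with the offsets checked at compile time; the struct is purely illustrative, since the driver itself only uses the offset constants:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Offsets from gve_register.h. */
#define DEVICE_STATUS          0
#define ADMINQ_EVENT_COUNTER   24
#define DRIVER_VERSION         31
#define ADMINQ_BASE_ADDRESS_HI 32
#define ADMINQ_LENGTH          40

/* Illustrative overlay of the fixed configuration registers. */
struct gve_regs {
	uint32_t device_status;
	uint32_t driver_status;
	uint32_t max_tx_queues;
	uint32_t max_rx_queues;
	uint32_t adminq_pfn;
	uint32_t adminq_doorbell;
	uint32_t adminq_event_counter;
	uint8_t  reserved[3];
	uint8_t  driver_version;
	uint32_t adminq_base_address_hi;
	uint32_t adminq_base_address_lo;
	uint32_t adminq_length;
};

_Static_assert(offsetof(struct gve_regs, device_status) == DEVICE_STATUS, "layout");
_Static_assert(offsetof(struct gve_regs, adminq_event_counter) == ADMINQ_EVENT_COUNTER, "layout");
_Static_assert(offsetof(struct gve_regs, driver_version) == DRIVER_VERSION, "layout");
_Static_assert(offsetof(struct gve_regs, adminq_base_address_hi) == ADMINQ_BASE_ADDRESS_HI, "layout");
_Static_assert(offsetof(struct gve_regs, adminq_length) == ADMINQ_LENGTH, "layout");

#define BIT(nr) (1UL << (nr))

int
main(void)
{
	/* DEVICE_STATUS bits from enum gve_device_status_flags. */
	uint32_t status = BIT(2);   /* pretend only the link-status bit is set */

	printf("reset requested: %s\n", (status & BIT(1)) ? "yes" : "no");
	printf("link up:         %s\n", (status & BIT(2)) ? "yes" : "no");
	return (0);
}
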
+ */ +#include "gve.h" +#include "gve_adminq.h" + +static void +gve_rx_free_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; + + /* Safe to call even if never allocated */ + gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); + + if (rx->page_info != NULL) { + free(rx->page_info, M_GVE); + rx->page_info = NULL; + } + + if (rx->data_ring != NULL) { + gve_dma_free_coherent(&rx->data_ring_mem); + rx->data_ring = NULL; + } + + if (rx->desc_ring != NULL) { + gve_dma_free_coherent(&rx->desc_ring_mem); + rx->desc_ring = NULL; + } + + if (com->q_resources != NULL) { + gve_dma_free_coherent(&com->q_resources_mem); + com->q_resources = NULL; + } +} + +static void +gve_prefill_rx_slots(struct gve_rx_ring *rx) +{ + struct gve_ring_com *com = &rx->com; + struct gve_dma_handle *dma; + int i; + + for (i = 0; i < com->priv->rx_desc_cnt; i++) { + rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i); + rx->page_info[i].page_offset = 0; + rx->page_info[i].page_address = com->qpl->dmas[i].cpu_addr; + rx->page_info[i].page = com->qpl->pages[i]; + + dma = &com->qpl->dmas[i]; + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREREAD); + } + + bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static int +gve_rx_alloc_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; + int err; + + com->priv = priv; + com->id = i; + + rx->mask = priv->rx_pages_per_qpl - 1; + + com->qpl = &priv->qpls[priv->tx_cfg.max_queues + i]; + if (com->qpl == NULL) { + device_printf(priv->dev, "No QPL left for rx ring %d", i); + return (ENOMEM); + } + + rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info), M_GVE, + M_WAITOK | M_ZERO); + + gve_alloc_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); + + err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), + PAGE_SIZE, &com->q_resources_mem); + if (err != 0) { + device_printf(priv->dev, "Failed to alloc queue resources for rx ring %d", i); + goto abort; + } + com->q_resources = com->q_resources_mem.cpu_addr; + + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_rx_desc) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->desc_ring_mem); + if (err != 0) { + device_printf(priv->dev, "Failed to alloc desc ring for rx ring %d", i); + goto abort; + } + rx->desc_ring = rx->desc_ring_mem.cpu_addr; + + err = gve_dma_alloc_coherent(priv, + sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->data_ring_mem); + if (err != 0) { + device_printf(priv->dev, "Failed to alloc data ring for rx ring %d", i); + goto abort; + } + rx->data_ring = rx->data_ring_mem.cpu_addr; + + gve_prefill_rx_slots(rx); + return (0); + +abort: + gve_rx_free_ring(priv, i); + return (err); +} + +int +gve_alloc_rx_rings(struct gve_priv *priv) +{ + int err = 0; + int i; + + priv->rx = malloc(sizeof(struct gve_rx_ring) * priv->rx_cfg.num_queues, + M_GVE, M_WAITOK | M_ZERO); + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + err = gve_rx_alloc_ring(priv, i); + if (err != 0) + goto free_rings; + } + + return (0); + +free_rings: + while (i--) + gve_rx_free_ring(priv, i); + free(priv->rx, M_GVE); + return (err); +} + +void +gve_free_rx_rings(struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) + gve_rx_free_ring(priv, i); + + free(priv->rx, M_GVE); +} + +static void +gve_rx_clear_data_ring(struct gve_rx_ring *rx) +{ + struct gve_priv *priv = rx->com.priv; + int i; + + 
/* + * The Rx data ring has this invariant: "the networking stack is not + * using the buffer beginning at any page_offset". This invariant is + * established initially by gve_prefill_rx_slots at alloc-time and is + * maintained by the cleanup taskqueue. This invariant implies that the + * ring can be considered to be fully posted with buffers at this point, + * even if there are unfreed mbufs still being processed, which is why we + * can fill the ring without waiting on can_flip at each slot to become true. + */ + for (i = 0; i < priv->rx_desc_cnt; i++) { + rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i + + rx->page_info[i].page_offset); + rx->fill_cnt++; + } + + bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static void +gve_rx_clear_desc_ring(struct gve_rx_ring *rx) +{ + struct gve_priv *priv = rx->com.priv; + int i; + + for (i = 0; i < priv->rx_desc_cnt; i++) + rx->desc_ring[i] = (struct gve_rx_desc){}; + + bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static void +gve_clear_rx_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + + rx->seq_no = 1; + rx->cnt = 0; + rx->fill_cnt = 0; + rx->mask = priv->rx_desc_cnt - 1; + + gve_rx_clear_desc_ring(rx); + gve_rx_clear_data_ring(rx); +} + +static void +gve_start_rx_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; + + if ((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) { + if (tcp_lro_init(&rx->lro) != 0) + device_printf(priv->dev, "Failed to init lro for rx ring %d", i); + rx->lro.ifp = priv->ifp; + } + + NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq, rx); + com->cleanup_tq = taskqueue_create_fast("gve rx", M_WAITOK, + taskqueue_thread_enqueue, &com->cleanup_tq); + + taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, + "%s rxq %d", device_get_nameunit(priv->dev), i); + + gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt); +} + +int +gve_create_rx_rings(struct gve_priv *priv) +{ + struct gve_ring_com *com; + struct gve_rx_ring *rx; + int err; + int i; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK)) + return (0); + + for (i = 0; i < priv->rx_cfg.num_queues; i++) + gve_clear_rx_ring(priv, i); + + err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues); + if (err != 0) + return (err); + + bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, + BUS_DMASYNC_POSTREAD); + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + rx = &priv->rx[i]; + com = &rx->com; + + com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index); + + bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map, + BUS_DMASYNC_POSTREAD); + com->db_offset = 4 * be32toh(com->q_resources->db_index); + com->counter_idx = be32toh(com->q_resources->counter_index); + + gve_start_rx_ring(priv, i); + } + + gve_set_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); + return (0); +} + +static void +gve_stop_rx_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; + + if (com->cleanup_tq != NULL) { + taskqueue_quiesce(com->cleanup_tq); + taskqueue_free(com->cleanup_tq); + com->cleanup_tq = NULL; + } + + tcp_lro_free(&rx->lro); + rx->ctx = (struct gve_rx_ctx){}; +} + +int +gve_destroy_rx_rings(struct gve_priv *priv) +{ + int err; + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) + gve_stop_rx_ring(priv, i); + + if (gve_get_state_flag(priv, 
GVE_STATE_FLAG_RX_RINGS_OK)) { + err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues); + if (err != 0) + return (err); + gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); + } + + return (0); +} + +int +gve_rx_intr(void *arg) +{ + struct gve_rx_ring *rx = arg; + struct gve_priv *priv = rx->com.priv; + struct gve_ring_com *com = &rx->com; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return (FILTER_STRAY); + + gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK); + taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); + return (FILTER_HANDLED); +} + +static inline void +gve_set_rss_type(__be16 flag, struct mbuf *mbuf) +{ + if ((flag & GVE_RXF_IPV4) != 0) { + if ((flag & GVE_RXF_TCP) != 0) + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4); + else if ((flag & GVE_RXF_UDP) != 0) + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4); + else + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4); + return; + } + + if ((flag & GVE_RXF_IPV6) != 0) { + if ((flag & GVE_RXF_TCP) != 0) + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6); + else if ((flag & GVE_RXF_UDP) != 0) + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6); + else + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6); + return; + } +} + +static void +gve_mextadd_free(struct mbuf *mbuf) +{ + vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1; + vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2; + + /* + * Free the page only if this is the last ref. + * The interface might no longer exist by the time + * this callback is called, see gve_free_qpl. + */ + if (__predict_false(vm_page_unwire_noq(page))) { + pmap_qremove(va, 1); + kva_free(va, PAGE_SIZE); + vm_page_free(page); + } +} + +static void +gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr) +{ + const __be64 offset = htobe64(GVE_DEFAULT_RX_BUFFER_OFFSET); + page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET; + *(slot_addr) ^= offset; +} + +static struct mbuf * +gve_rx_create_mbuf(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_slot_page_info *page_info, uint16_t len, + union gve_rx_data_slot *data_slot, bool is_only_frag) +{ + struct gve_rx_ctx *ctx = &rx->ctx; + struct mbuf *mbuf; + u_int ref_count; + bool can_flip; + + uint32_t offset = page_info->page_offset + page_info->pad; + void *va = (char *)page_info->page_address + offset; + + if (len <= priv->rx_copybreak && is_only_frag) { + mbuf = m_get2(len, M_NOWAIT, MT_DATA, M_PKTHDR); + if (__predict_false(mbuf == NULL)) + return (NULL); + + m_copyback(mbuf, 0, len, va); + counter_enter(); + counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1); + counter_exit(); + ctx->mbuf_head = mbuf; + ctx->mbuf_tail = mbuf; + } else { + struct mbuf *mbuf_tail = ctx->mbuf_tail; + KASSERT(len <= MCLBYTES, ("gve rx fragment bigger than cluster mbuf")); + + /* + * This page was created with VM_ALLOC_WIRED, thus the lowest + * wire count experienced by the page until the interface is + * destroyed is 1. + * + * We wire the page again before supplying an mbuf pointing to + * it to the networking stack, so before the mbuf leaves the + * driver, the wire count rises to 2. + * + * If it is 1 again, it necessarily means that the mbuf has been + * consumed and it was gve_mextadd_free that brought down the wire + * count back to 1. We only need to eventually observe the 1. 
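
[Illustrative sketch, not part of the patch] When the wire count allows it, gve_rx_create_mbuf() lends the current half of the QPL page to the stack and "flips" to the other half instead of copying; gve_rx_flip_buff() does this with a pair of XORs, one on the host-side page_offset and one on the big-endian slot address (XOR commutes with the byte swap, so the swapped constant can be applied directly). A minimal standalone sketch of the toggle; the buffer offset is assumed to be half a 4 KiB page, consistent with the two-buffers-per-page scheme, since the real GVE_DEFAULT_RX_BUFFER_OFFSET is defined in gve.h and not shown in this hunk:

#include <stdint.h>
#include <stdio.h>
#include <sys/endian.h>   /* htobe64/be64toh on FreeBSD */

/* Assumed value; the driver's GVE_DEFAULT_RX_BUFFER_OFFSET lives in gve.h. */
#define RX_BUFFER_OFFSET 2048u

struct slot_page_info {
	uint32_t page_offset;   /* which half of the page the driver owns */
};

/* Same toggle as gve_rx_flip_buff(): flip the host copy and the slot together. */
static void
flip_buff(struct slot_page_info *pi, uint64_t *slot_qpl_offset)
{
	pi->page_offset ^= RX_BUFFER_OFFSET;
	*slot_qpl_offset ^= htobe64(RX_BUFFER_OFFSET);
}

int
main(void)
{
	struct slot_page_info pi = { .page_offset = 0 };
	uint64_t slot = htobe64(3 * 4096);  /* slot 3 points at the 4th QPL page */
	int i;

	for (i = 0; i < 4; i++) {
		printf("driver half at offset %u, device writes at qpl offset %llu\n",
		    pi.page_offset, (unsigned long long)be64toh(slot));
		flip_buff(&pi, &slot);  /* lend the used half, post the other half */
	}
	return (0);
}
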
+ */ + ref_count = atomic_load_int(&page_info->page->ref_count); + can_flip = VPRC_WIRE_COUNT(ref_count) == 1; + + if (mbuf_tail == NULL) { + if (can_flip) + mbuf = m_gethdr(M_NOWAIT, MT_DATA); + else + mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); + + ctx->mbuf_head = mbuf; + ctx->mbuf_tail = mbuf; + } else { + if (can_flip) + mbuf = m_get(M_NOWAIT, MT_DATA); + else + mbuf = m_getcl(M_NOWAIT, MT_DATA, 0); + + mbuf_tail->m_next = mbuf; + ctx->mbuf_tail = mbuf; + } + + if (__predict_false(mbuf == NULL)) + return (NULL); + + if (can_flip) { + MEXTADD(mbuf, va, len, gve_mextadd_free, + page_info->page, page_info->page_address, + 0, EXT_NET_DRV); + + counter_enter(); + counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1); + counter_exit(); + + /* + * Grab an extra ref to the page so that gve_mextadd_free + * does not end up freeing the page while the interface exists. + */ + vm_page_wire(page_info->page); + + gve_rx_flip_buff(page_info, &data_slot->qpl_offset); + } else { + m_copyback(mbuf, 0, len, va); + counter_enter(); + counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1); + counter_exit(); + } + } + + mbuf->m_len = len; + ctx->total_size += len; + + return (mbuf); +} + +static inline bool +gve_needs_rss(__be16 flag) +{ + if ((flag & GVE_RXF_FRAG) != 0) + return (false); + if ((flag & (GVE_RXF_IPV4 | GVE_RXF_IPV6)) != 0) + return (true); + return (false); +} + +static void +gve_rx(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_desc *desc, + uint32_t idx) +{ + struct gve_rx_slot_page_info *page_info; + struct gve_dma_handle *page_dma_handle; + union gve_rx_data_slot *data_slot; + struct gve_rx_ctx *ctx = &rx->ctx; + struct mbuf *mbuf = NULL; + if_t ifp = priv->ifp; + bool do_if_input; + uint16_t len; + + bool is_first_frag = ctx->frag_cnt == 0; + bool is_last_frag = !(GVE_RXF_PKT_CONT & desc->flags_seq); + bool is_only_frag = is_first_frag && is_last_frag; + + if (__predict_false(ctx->drop_pkt)) + goto finish_frag; + + if ((desc->flags_seq & GVE_RXF_ERR) != 0) { + ctx->drop_pkt = true; + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); + counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); + counter_exit(); + m_freem(ctx->mbuf_head); + goto finish_frag; + } + + page_info = &rx->page_info[idx]; + data_slot = &rx->data_ring[idx]; + page_dma_handle = &(rx->com.qpl->dmas[idx]); + + page_info->pad = is_first_frag ? 
GVE_RX_PAD : 0; + len = be16toh(desc->len) - page_info->pad; + + bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, + BUS_DMASYNC_POSTREAD); + + mbuf = gve_rx_create_mbuf(priv, rx, page_info, len, data_slot, + is_only_frag); + if (mbuf == NULL) { + ctx->drop_pkt = true; + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); + counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); + counter_exit(); + m_freem(ctx->mbuf_head); + goto finish_frag; + } + + if (is_first_frag) { + mbuf->m_pkthdr.rcvif = priv->ifp; + + if (gve_needs_rss(desc->flags_seq)) { + gve_set_rss_type(desc->flags_seq, mbuf); + mbuf->m_pkthdr.flowid = be32toh(desc->rss_hash); + } + + if ((desc->csum != 0) && ((desc->flags_seq & GVE_RXF_FRAG) == 0)) { + mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED | + CSUM_IP_VALID | + CSUM_DATA_VALID | + CSUM_PSEUDO_HDR; + mbuf->m_pkthdr.csum_data = 0xffff; + } + } + + if (is_last_frag) { + mbuf = ctx->mbuf_head; + mbuf->m_pkthdr.len = ctx->total_size; + do_if_input = true; + + if (((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) && /* LRO is enabled */ + (desc->flags_seq & GVE_RXF_TCP) && /* pkt is a TCP pkt */ + ((mbuf->m_pkthdr.csum_flags & CSUM_DATA_VALID) != 0) && /* NIC verified csum */ + (rx->lro.lro_cnt != 0) && /* LRO resources exist */ + (tcp_lro_rx(&rx->lro, mbuf, 0) == 0)) + do_if_input = false; + + if (do_if_input) + if_input(ifp, mbuf); + + counter_enter(); + counter_u64_add_protected(rx->stats.rbytes, ctx->total_size); + counter_u64_add_protected(rx->stats.rpackets, 1); + counter_exit(); + } + +finish_frag: + ctx->frag_cnt++; + if (is_last_frag) + rx->ctx = (struct gve_rx_ctx){}; +} + +static bool +gve_rx_work_pending(struct gve_rx_ring *rx) +{ + struct gve_rx_desc *desc; + __be16 flags_seq; + uint32_t next_idx; + + next_idx = rx->cnt & rx->mask; + desc = rx->desc_ring + next_idx; + + flags_seq = desc->flags_seq; + + return (GVE_SEQNO(flags_seq) == rx->seq_no); +} + +static inline uint8_t +gve_next_seqno(uint8_t seq) +{ + return ((seq + 1) == 8 ? 1 : seq + 1); +} + +static void +gve_rx_cleanup(struct gve_priv *priv, struct gve_rx_ring *rx, int budget) +{ + uint32_t idx = rx->cnt & rx->mask; + struct gve_rx_desc *desc; + struct gve_rx_ctx *ctx = &rx->ctx; + uint32_t work_done = 0; + + NET_EPOCH_ASSERT(); + + bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, + BUS_DMASYNC_POSTREAD); + desc = &rx->desc_ring[idx]; + + while ((work_done < budget || ctx->frag_cnt) && + (GVE_SEQNO(desc->flags_seq) == rx->seq_no)) { + + gve_rx(priv, rx, desc, idx); + + rx->cnt++; + idx = rx->cnt & rx->mask; + desc = &rx->desc_ring[idx]; + rx->seq_no = gve_next_seqno(rx->seq_no); + work_done++; + } + + /* The device will only send whole packets. 
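
[Illustrative sketch, not part of the patch] Descriptor completion is detected purely by the 3-bit sequence number: gve_clear_rx_ring() starts rx->seq_no at 1, a zeroed descriptor reads back sequence 0 and therefore never matches, and gve_next_seqno() cycles the expected value through 1..7, skipping 0. A standalone demo of that wraparound:

#include <stdint.h>
#include <stdio.h>

/* Same wraparound as gve_next_seqno(): the sequence runs 1..7, never 0. */
static uint8_t
next_seqno(uint8_t seq)
{
	return ((seq + 1) == 8 ? 1 : seq + 1);
}

int
main(void)
{
	uint8_t seq = 1;   /* gve_clear_rx_ring() starts rx->seq_no at 1 */
	int i;

	/* Ten completions: the expected sequence cycles 1,2,...,7,1,2,3. */
	for (i = 0; i < 10; i++) {
		printf("%u ", seq);
		seq = next_seqno(seq);
	}
	printf("\n");

	/*
	 * A descriptor whose GVE_SEQNO() does not match the expected value has
	 * not been written by the device yet; gve_rx_cleanup() stops there and
	 * waits for the next interrupt.
	 */
	return (0);
}
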
*/ + if (__predict_false(ctx->frag_cnt)) { + m_freem(ctx->mbuf_head); + rx->ctx = (struct gve_rx_ctx){}; + device_printf(priv->dev, + "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset", + GVE_SEQNO(desc->flags_seq), rx->seq_no); + gve_schedule_reset(priv); + } + + if (work_done != 0) + tcp_lro_flush_all(&rx->lro); + + bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, + BUS_DMASYNC_PREWRITE); + + /* Buffers are refilled as the descs are processed */ + rx->fill_cnt += work_done; + gve_db_bar_write_4(priv, rx->com.db_offset, rx->fill_cnt); +} + +void +gve_rx_cleanup_tq(void *arg, int pending) +{ + struct gve_rx_ring *rx = arg; + struct gve_priv *priv = rx->com.priv; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return; + + gve_rx_cleanup(priv, rx, /*budget=*/128); + + gve_db_bar_write_4(priv, rx->com.irq_db_offset, + GVE_IRQ_ACK | GVE_IRQ_EVENT); + + /* + * Fragments received before this barrier MAY NOT cause the NIC to send an + * interrupt but they will still be handled by the enqueue below. + * Fragments received after the barrier WILL trigger an interrupt. + */ + mb(); + + if (gve_rx_work_pending(rx)) { + gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); + taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); + } +} diff --git a/sys/dev/gve/gve_sysctl.c b/sys/dev/gve/gve_sysctl.c new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_sysctl.c @@ -0,0 +1,261 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "gve.h" + +static void +gve_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, struct gve_rx_ring *rxq) +{ + struct sysctl_oid *node; + struct sysctl_oid_list *list; + struct gve_rxq_stats *stats; + char namebuf[16]; + + snprintf(namebuf, sizeof(namebuf), "rxq%d", rxq->com.id); + node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, + CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Receive Queue"); + list = SYSCTL_CHILDREN(node); + + stats = &rxq->stats; + + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, + "rx_bytes", CTLFLAG_RD, + &stats->rbytes, "Bytes received"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, + "rx_packets", CTLFLAG_RD, + &stats->rpackets, "Packets received"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_copybreak_cnt", + CTLFLAG_RD, &stats->rx_copybreak_cnt, + "Total frags with mbufs allocated for copybreak"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_frag_flip_cnt", + CTLFLAG_RD, &stats->rx_frag_flip_cnt, + "Total frags that allocated mbuf with page flip"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_frag_copy_cnt", + CTLFLAG_RD, &stats->rx_frag_copy_cnt, + "Total frags with mbuf that copied payload into mbuf"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt", + CTLFLAG_RD, &stats->rx_dropped_pkt, + "Total rx packets dropped"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, + "rx_dropped_pkt_desc_err", CTLFLAG_RD, + &stats->rx_dropped_pkt_desc_err, + "Packets dropped due to descriptor error"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, + "rx_dropped_pkt_mbuf_alloc_fail", CTLFLAG_RD, + &stats->rx_dropped_pkt_mbuf_alloc_fail, + "Packets dropped due to failed mbuf allocation"); + SYSCTL_ADD_U32(ctx, list, OID_AUTO, + "rx_completed_desc", CTLFLAG_RD, + &rxq->cnt, 0, "Number of descriptors completed"); + SYSCTL_ADD_U32(ctx, list, OID_AUTO, + "num_desc_posted", CTLFLAG_RD, + &rxq->fill_cnt, rxq->fill_cnt, + "Toal number of descriptors posted"); +} + +static void +gve_setup_txq_sysctl(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, struct gve_tx_ring *txq) +{ + struct sysctl_oid *node; + struct sysctl_oid_list *tx_list; + struct gve_txq_stats *stats; + char namebuf[16]; + + snprintf(namebuf, sizeof(namebuf), "txq%d", txq->com.id); + node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, namebuf, + CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Transmit Queue"); + tx_list = SYSCTL_CHILDREN(node); + + stats = &txq->stats; + + SYSCTL_ADD_U32(ctx, tx_list, OID_AUTO, + "tx_posted_desc", CTLFLAG_RD, + &txq->req, 0, "Number of descriptors posted by NIC"); + SYSCTL_ADD_U32(ctx, tx_list, OID_AUTO, + "tx_completed_desc", CTLFLAG_RD, + &txq->done, 0, "Number of descriptors completed"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_packets", CTLFLAG_RD, + &stats->tpackets, "Packets transmitted"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_tso_packets", CTLFLAG_RD, + &stats->tso_packet_cnt, "TSO Packets transmitted"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_bytes", CTLFLAG_RD, + &stats->tbytes, "Bytes transmitted"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_dropped_pkt_nospace_device", CTLFLAG_RD, + &stats->tx_dropped_pkt_nospace_device, + "Packets dropped due to no space in device"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_dropped_pkt_nospace_bufring", CTLFLAG_RD, + &stats->tx_dropped_pkt_nospace_bufring, + "Packets dropped due to no space in br ring"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_dropped_pkt_vlan", CTLFLAG_RD, + &stats->tx_dropped_pkt_vlan, 
+ "Dropped VLAN packets"); +} + +static void +gve_setup_queue_stat_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, + struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + gve_setup_rxq_sysctl(ctx, child, &priv->rx[i]); + } + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + gve_setup_txq_sysctl(ctx, child, &priv->tx[i]); + } +} + +static void +gve_setup_adminq_stat_sysctl(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, struct gve_priv *priv) +{ + struct sysctl_oid *admin_node; + struct sysctl_oid_list *admin_list; + + /* Admin queue stats */ + admin_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "adminq_stats", + CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Admin Queue statistics"); + admin_list = SYSCTL_CHILDREN(admin_node); + + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_prod_cnt", CTLFLAG_RD, + &priv->adminq_prod_cnt, 0, "Adminq Commands issued"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_cmd_fail", CTLFLAG_RD, + &priv->adminq_cmd_fail, 0, "Aqminq Failed commands"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_timeouts", CTLFLAG_RD, + &priv->adminq_timeouts, 0, "Adminq Timedout commands"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_describe_device_cnt", + CTLFLAG_RD, &priv->adminq_describe_device_cnt, 0, + "adminq_describe_device_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, + "adminq_cfg_device_resources_cnt", CTLFLAG_RD, + &priv->adminq_cfg_device_resources_cnt, 0, + "adminq_cfg_device_resources_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, + "adminq_register_page_list_cnt", CTLFLAG_RD, + &priv->adminq_register_page_list_cnt, 0, + "adminq_register_page_list_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, + "adminq_unregister_page_list_cnt", CTLFLAG_RD, + &priv->adminq_unregister_page_list_cnt, 0, + "adminq_unregister_page_list_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_create_tx_queue_cnt", + CTLFLAG_RD, &priv->adminq_create_tx_queue_cnt, 0, + "adminq_create_tx_queue_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_create_rx_queue_cnt", + CTLFLAG_RD, &priv->adminq_create_rx_queue_cnt, 0, + "adminq_create_rx_queue_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_tx_queue_cnt", + CTLFLAG_RD, &priv->adminq_destroy_tx_queue_cnt, 0, + "adminq_destroy_tx_queue_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_rx_queue_cnt", + CTLFLAG_RD, &priv->adminq_destroy_rx_queue_cnt, 0, + "adminq_destroy_rx_queue_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, + "adminq_dcfg_device_resources_cnt", CTLFLAG_RD, + &priv->adminq_dcfg_device_resources_cnt, 0, + "adminq_dcfg_device_resources_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, + "adminq_set_driver_parameter_cnt", CTLFLAG_RD, + &priv->adminq_set_driver_parameter_cnt, 0, + "adminq_set_driver_parameter_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, + "adminq_verify_driver_compatibility_cnt", CTLFLAG_RD, + &priv->adminq_verify_driver_compatibility_cnt, 0, + "adminq_verify_driver_compatibility_cnt"); +} + +static void +gve_setup_main_stat_sysctl(struct sysctl_ctx_list *ctx, + struct sysctl_oid_list *child, struct gve_priv *priv) +{ + struct sysctl_oid *main_node; + struct sysctl_oid_list *main_list; + + /* Main stats */ + main_node = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "main_stats", + CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "Main statistics"); + main_list = SYSCTL_CHILDREN(main_node); + + SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "interface_up_cnt", CTLFLAG_RD, + 
&priv->interface_up_cnt, 0, "Times interface was set to up"); + SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "interface_down_cnt", CTLFLAG_RD, + &priv->interface_down_cnt, 0, "Times interface was set to down"); + SYSCTL_ADD_U32(ctx, main_list, OID_AUTO, "reset_cnt", CTLFLAG_RD, + &priv->reset_cnt, 0, "Times reset"); +} + +void gve_setup_sysctl(struct gve_priv *priv) +{ + device_t dev; + struct sysctl_ctx_list *ctx; + struct sysctl_oid *tree; + struct sysctl_oid_list *child; + + dev = priv->dev; + ctx = device_get_sysctl_ctx(dev); + tree = device_get_sysctl_tree(dev); + child = SYSCTL_CHILDREN(tree); + + gve_setup_queue_stat_sysctl(ctx, child, priv); + gve_setup_adminq_stat_sysctl(ctx, child, priv); + gve_setup_main_stat_sysctl(ctx, child, priv); +} + +void +gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, + uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets, + uint64_t *tbytes, uint64_t *tx_dropped_pkt) +{ + struct gve_rxq_stats *rxqstats; + struct gve_txq_stats *txqstats; + int i; + + for (i = 0; i < priv->rx_cfg.num_queues; i++) { + rxqstats = &priv->rx[i].stats; + *rpackets += counter_u64_fetch(rxqstats->rpackets); + *rbytes += counter_u64_fetch(rxqstats->rbytes); + *rx_dropped_pkt += counter_u64_fetch(rxqstats->rx_dropped_pkt); + } + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + txqstats = &priv->tx[i].stats; + *tpackets += counter_u64_fetch(txqstats->tpackets); + *tbytes += counter_u64_fetch(txqstats->tbytes); + *tx_dropped_pkt += counter_u64_fetch(txqstats->tx_dropped_pkt); + } +} diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_tx.c @@ -0,0 +1,806 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#include "gve.h" +#include "gve_adminq.h" + +#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182 + +static int +gve_tx_fifo_init(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + struct gve_queue_page_list *qpl = tx->com.qpl; + struct gve_tx_fifo *fifo = &tx->fifo; + + fifo->size = qpl->num_pages * PAGE_SIZE; + fifo->base = qpl->kva; + atomic_store_int(&fifo->available, fifo->size); + fifo->head = 0; + + return (0); +} + +static void +gve_tx_free_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + + /* Safe to call even if never alloced */ + gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); + + if (tx->br != NULL) { + buf_ring_free(tx->br, M_DEVBUF); + tx->br = NULL; + } + + if (mtx_initialized(&tx->ring_mtx)) + mtx_destroy(&tx->ring_mtx); + + if (tx->info != NULL) { + free(tx->info, M_GVE); + tx->info = NULL; + } + + if (tx->desc_ring != NULL) { + gve_dma_free_coherent(&tx->desc_ring_mem); + tx->desc_ring = NULL; + } + + if (com->q_resources != NULL) { + gve_dma_free_coherent(&com->q_resources_mem); + com->q_resources = NULL; + } +} + +static int +gve_tx_alloc_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + char mtx_name[16]; + int err; + + com->priv = priv; + com->id = i; + + com->qpl = &priv->qpls[i]; + if (com->qpl == NULL) { + device_printf(priv->dev, "No QPL left for tx ring %d\n", i); + return (ENOMEM); + } + + err = gve_tx_fifo_init(priv, tx); + if (err != 0) + goto abort; + + tx->info = malloc(sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt, + M_GVE, M_WAITOK | M_ZERO); + + sprintf(mtx_name, "gvetx%d", i); + mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF); + + tx->br = buf_ring_alloc(GVE_TX_BUFRING_ENTRIES, M_DEVBUF, + M_WAITOK, &tx->ring_mtx); + + gve_alloc_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); + + err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), + PAGE_SIZE, &com->q_resources_mem); + if (err != 0) { + device_printf(priv->dev, "Failed to alloc queue resources for tx ring %d", i); + goto abort; + } + com->q_resources = com->q_resources_mem.cpu_addr; + + err = gve_dma_alloc_coherent(priv, + sizeof(union gve_tx_desc) * priv->tx_desc_cnt, + CACHE_LINE_SIZE, &tx->desc_ring_mem); + if (err != 0) { + device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i); + goto abort; + } + tx->desc_ring = tx->desc_ring_mem.cpu_addr; + + return (0); + +abort: + gve_tx_free_ring(priv, i); + return (err); +} + +int +gve_alloc_tx_rings(struct gve_priv *priv) +{ + int err = 0; + int i; + + priv->tx = malloc(sizeof(struct gve_tx_ring) * priv->tx_cfg.num_queues, + M_GVE, M_WAITOK | M_ZERO); + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + err = gve_tx_alloc_ring(priv, i); + if (err != 0) + goto free_rings; + + } + + return (0); + +free_rings: + while (i--) + gve_tx_free_ring(priv, i); + free(priv->tx, M_GVE); + return (err); +} + +void +gve_free_tx_rings(struct gve_priv *priv) +{ + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; i++) + gve_tx_free_ring(priv, i); + + free(priv->tx, M_GVE); +} + +static void +gve_tx_clear_desc_ring(struct gve_tx_ring *tx) +{ + struct gve_ring_com *com = &tx->com; + int i; + + for (i = 0; i < com->priv->tx_desc_cnt; i++) { + tx->desc_ring[i] = (union gve_tx_desc){}; + tx->info[i] = (struct gve_tx_buffer_state){}; + } + + bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static void +gve_clear_tx_ring(struct gve_priv 
*priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_tx_fifo *fifo = &tx->fifo; + + tx->req = 0; + tx->done = 0; + tx->mask = priv->tx_desc_cnt - 1; + + atomic_store_int(&fifo->available, fifo->size); + fifo->head = 0; + + gve_tx_clear_desc_ring(tx); +} + +static void +gve_start_tx_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + + NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx); + com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK, + taskqueue_thread_enqueue, &com->cleanup_tq); + taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d", + device_get_nameunit(priv->dev), i); + + TASK_INIT(&tx->xmit_task, 0, gve_xmit_tq, tx); + tx->xmit_tq = taskqueue_create_fast("gve tx xmit", + M_WAITOK, taskqueue_thread_enqueue, &tx->xmit_tq); + taskqueue_start_threads(&tx->xmit_tq, 1, PI_NET, "%s txq %d xmit", + device_get_nameunit(priv->dev), i); +} + +int +gve_create_tx_rings(struct gve_priv *priv) +{ + struct gve_ring_com *com; + struct gve_tx_ring *tx; + int err; + int i; + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) + return (0); + + for (i = 0; i < priv->tx_cfg.num_queues; i++) + gve_clear_tx_ring(priv, i); + + err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues); + if (err != 0) + return (err); + + bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, + BUS_DMASYNC_POSTREAD); + + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + tx = &priv->tx[i]; + com = &tx->com; + + com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index); + + bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map, + BUS_DMASYNC_POSTREAD); + com->db_offset = 4 * be32toh(com->q_resources->db_index); + com->counter_idx = be32toh(com->q_resources->counter_index); + + gve_start_tx_ring(priv, i); + } + + gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); + return (0); +} + +static void +gve_stop_tx_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + + if (com->cleanup_tq != NULL) { + taskqueue_quiesce(com->cleanup_tq); + taskqueue_free(com->cleanup_tq); + com->cleanup_tq = NULL; + } + + if (tx->xmit_tq != NULL) { + taskqueue_quiesce(tx->xmit_tq); + taskqueue_free(tx->xmit_tq); + tx->xmit_tq = NULL; + } +} + +int +gve_destroy_tx_rings(struct gve_priv *priv) +{ + int err; + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; i++) + gve_stop_tx_ring(priv, i); + + if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) { + err = gve_adminq_destroy_tx_queues(priv, priv->tx_cfg.num_queues); + if (err != 0) + return (err); + gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); + } + + return (0); +} + +int +gve_tx_intr(void *arg) +{ + struct gve_tx_ring *tx = arg; + struct gve_priv *priv = tx->com.priv; + struct gve_ring_com *com = &tx->com; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return (FILTER_STRAY); + + gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK); + taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); + return (FILTER_HANDLED); +} + +static uint32_t +gve_tx_load_event_counter(struct gve_priv *priv, struct gve_tx_ring *tx) +{ + bus_dmamap_sync(priv->counter_array_mem.tag, priv->counter_array_mem.map, + BUS_DMASYNC_POSTREAD); + uint32_t counter = priv->counters[tx->com.counter_idx]; + return (be32toh(counter)); +} + +static void +gve_tx_free_fifo(struct gve_tx_fifo *fifo, size_t bytes) +{ + atomic_add_int(&fifo->available, 
bytes); +} + +void +gve_tx_cleanup_tq(void *arg, int pending) +{ + struct gve_tx_ring *tx = arg; + struct gve_priv *priv = tx->com.priv; + uint32_t nic_done = gve_tx_load_event_counter(priv, tx); + uint32_t todo = nic_done - tx->done; + size_t space_freed = 0; + int i, j; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return; + + for (j = 0; j < todo; j++) { + uint32_t idx = tx->done & tx->mask; + struct gve_tx_buffer_state *info = &tx->info[idx]; + struct mbuf *mbuf = info->mbuf; + + tx->done++; + if (mbuf == NULL) + continue; + + info->mbuf = NULL; + counter_enter(); + counter_u64_add_protected(tx->stats.tbytes, mbuf->m_pkthdr.len); + counter_u64_add_protected(tx->stats.tpackets, 1); + counter_exit(); + m_freem(mbuf); + + for (i = 0; i < GVE_TX_MAX_DESCS; i++) { + space_freed += info->iov[i].iov_len + info->iov[i].iov_padding; + info->iov[i].iov_len = 0; + info->iov[i].iov_padding = 0; + } + } + + gve_tx_free_fifo(&tx->fifo, space_freed); + + gve_db_bar_write_4(priv, tx->com.irq_db_offset, + GVE_IRQ_ACK | GVE_IRQ_EVENT); + + /* + * Completions born before this barrier MAY NOT cause the NIC to send an + * interrupt but they will still be handled by the enqueue below. + * Completions born after the barrier WILL trigger an interrupt. + */ + mb(); + + nic_done = gve_tx_load_event_counter(priv, tx); + todo = nic_done - tx->done; + if (todo != 0) { + gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK); + taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); + } +} + +static void +gve_dma_sync_for_device(struct gve_queue_page_list *qpl, + uint64_t iov_offset, uint64_t iov_len) +{ + uint64_t last_page = (iov_offset + iov_len - 1) / PAGE_SIZE; + uint64_t first_page = iov_offset / PAGE_SIZE; + struct gve_dma_handle *dma; + uint64_t page; + + for (page = first_page; page <= last_page; page++) { + dma = &(qpl->dmas[page]); + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); + } +} + +static void +gve_tx_fill_mtd_desc(struct gve_tx_mtd_desc *mtd_desc, struct mbuf *mbuf) +{ + mtd_desc->type_flags = GVE_TXD_MTD | GVE_MTD_SUBTYPE_PATH; + mtd_desc->path_state = GVE_MTD_PATH_STATE_DEFAULT | GVE_MTD_PATH_HASH_L4; + mtd_desc->path_hash = htobe32(mbuf->m_pkthdr.flowid); + mtd_desc->reserved0 = 0; + mtd_desc->reserved1 = 0; +} + +static void +gve_tx_fill_pkt_desc(struct gve_tx_pkt_desc *pkt_desc, bool is_tso, + uint16_t l4_hdr_offset, uint32_t desc_cnt, + uint16_t first_seg_len, uint64_t addr, bool has_csum_flag, + int csum_offset, uint16_t pkt_len) +{ + if (is_tso) { + pkt_desc->type_flags = GVE_TXD_TSO | GVE_TXF_L4CSUM; + pkt_desc->l4_csum_offset = csum_offset >> 1; + pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; + } else if (has_csum_flag) { + pkt_desc->type_flags = GVE_TXD_STD | GVE_TXF_L4CSUM; + pkt_desc->l4_csum_offset = csum_offset >> 1; + pkt_desc->l4_hdr_offset = l4_hdr_offset >> 1; + } else { + pkt_desc->type_flags = GVE_TXD_STD; + pkt_desc->l4_csum_offset = 0; + pkt_desc->l4_hdr_offset = 0; + } + pkt_desc->desc_cnt = desc_cnt; + pkt_desc->len = htobe16(pkt_len); + pkt_desc->seg_len = htobe16(first_seg_len); + pkt_desc->seg_addr = htobe64(addr); +} + +static void +gve_tx_fill_seg_desc(struct gve_tx_seg_desc *seg_desc, + bool is_tso, uint16_t len, uint64_t addr, + bool is_ipv6, uint8_t l3_off, uint16_t tso_mss) +{ + seg_desc->type_flags = GVE_TXD_SEG; + if (is_tso) { + if (is_ipv6) + seg_desc->type_flags |= GVE_TXSF_IPV6; + seg_desc->l3_offset = l3_off >> 1; + seg_desc->mss = htobe16(tso_mss); + } + seg_desc->seg_len = htobe16(len); + 
seg_desc->seg_addr = htobe64(addr); +} + +static inline uint32_t +gve_tx_avail(struct gve_tx_ring *tx) +{ + return (tx->mask + 1 - (tx->req - tx->done)); +} + +static bool +gve_tx_fifo_can_alloc(struct gve_tx_fifo *fifo, size_t bytes) +{ + return (atomic_load_int(&fifo->available) >= bytes); +} + +static inline bool +gve_can_tx(struct gve_tx_ring *tx, int bytes_required) +{ + return (gve_tx_avail(tx) >= (GVE_TX_MAX_DESCS + 1) && + gve_tx_fifo_can_alloc(&tx->fifo, bytes_required)); +} + +static int +gve_tx_fifo_pad_alloc_one_frag(struct gve_tx_fifo *fifo, size_t bytes) +{ + return (fifo->head + bytes < fifo->size) ? 0 : fifo->size - fifo->head; +} + +static inline int +gve_fifo_bytes_required(struct gve_tx_ring *tx, uint16_t first_seg_len, + uint16_t pkt_len) +{ + int pad_bytes, align_hdr_pad; + int bytes; + + pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); + /* We need to take into account the header alignment padding. */ + align_hdr_pad = roundup2(first_seg_len, CACHE_LINE_SIZE) - first_seg_len; + bytes = align_hdr_pad + pad_bytes + pkt_len; + + return (bytes); +} + +static int +gve_tx_alloc_fifo(struct gve_tx_fifo *fifo, size_t bytes, + struct gve_tx_iovec iov[2]) +{ + size_t overflow, padding; + uint32_t aligned_head; + int nfrags = 0; + + if (bytes == 0) + return (0); + + /* + * This check happens before we know how much padding is needed to + * align to a cacheline boundary for the payload, but that is fine, + * because the FIFO head always start aligned, and the FIFO's boundaries + * are aligned, so if there is space for the data, there is space for + * the padding to the next alignment. + */ + KASSERT(gve_tx_fifo_can_alloc(fifo, bytes), + ("Allocating gve tx fifo when there is no room")); + + nfrags++; + + iov[0].iov_offset = fifo->head; + iov[0].iov_len = bytes; + fifo->head += bytes; + + if (fifo->head > fifo->size) { + /* + * If the allocation did not fit in the tail fragment of the + * FIFO, also use the head fragment. 
+ */ + nfrags++; + overflow = fifo->head - fifo->size; + iov[0].iov_len -= overflow; + iov[1].iov_offset = 0; /* Start of fifo*/ + iov[1].iov_len = overflow; + + fifo->head = overflow; + } + + /* Re-align to a cacheline boundary */ + aligned_head = roundup2(fifo->head, CACHE_LINE_SIZE); + padding = aligned_head - fifo->head; + iov[nfrags - 1].iov_padding = padding; + atomic_add_int(&fifo->available, -(bytes + padding)); + fifo->head = aligned_head; + + if (fifo->head == fifo->size) + fifo->head = 0; + + return (nfrags); +} + +/* Only error this returns is ENOBUFS when the tx fifo is short of space */ +static int +gve_xmit(struct gve_tx_ring *tx, struct mbuf *mbuf) +{ + bool is_tso, has_csum_flag, is_ipv6 = false, is_tcp = false, is_udp = false; + int csum_flags, csum_offset, mtd_desc_nr, offset, copy_offset; + uint16_t tso_mss, l4_off, l4_data_off, pkt_len, first_seg_len; + int pad_bytes, hdr_nfrags, payload_nfrags; + struct gve_tx_pkt_desc *pkt_desc; + struct gve_tx_seg_desc *seg_desc; + struct gve_tx_mtd_desc *mtd_desc; + struct gve_tx_buffer_state *info; + uint32_t idx = tx->req & tx->mask; + struct ether_header *eh; + struct mbuf *mbuf_next; + int payload_iov = 2; + int bytes_required; + struct ip6_hdr *ip6; + struct tcphdr *th; + uint32_t next_idx; + uint8_t l3_off; + struct ip *ip; + int i; + + info = &tx->info[idx]; + csum_flags = mbuf->m_pkthdr.csum_flags; + pkt_len = mbuf->m_pkthdr.len; + is_tso = csum_flags & CSUM_TSO; + has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | + CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); + mtd_desc_nr = M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE ? 1 : 0; + tso_mss = is_tso ? mbuf->m_pkthdr.tso_segsz : 0; + + eh = mtod(mbuf, struct ether_header *); + KASSERT(eh->ether_type != ETHERTYPE_VLAN, + ("VLAN-tagged packets not supported")); + + is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6; + l3_off = ETHER_HDR_LEN; + mbuf_next = m_getptr(mbuf, l3_off, &offset); + + if (is_ipv6) { + ip6 = (struct ip6_hdr *)(mtodo(mbuf_next, offset)); + l4_off = l3_off + sizeof(struct ip6_hdr); + is_tcp = (ip6->ip6_nxt == IPPROTO_TCP); + is_udp = (ip6->ip6_nxt == IPPROTO_UDP); + mbuf_next = m_getptr(mbuf, l4_off, &offset); + } else if (ntohs(eh->ether_type) == ETHERTYPE_IP) { + ip = (struct ip *)(mtodo(mbuf_next, offset)); + l4_off = l3_off + (ip->ip_hl << 2); + is_tcp = (ip->ip_p == IPPROTO_TCP); + is_udp = (ip->ip_p == IPPROTO_UDP); + mbuf_next = m_getptr(mbuf, l4_off, &offset); + } + + l4_data_off = 0; + if (is_tcp) { + th = (struct tcphdr *)(mtodo(mbuf_next, offset)); + l4_data_off = l4_off + (th->th_off << 2); + } else if (is_udp) + l4_data_off = l4_off + sizeof(struct udphdr); + + if (has_csum_flag) { + if ((csum_flags & (CSUM_TSO | CSUM_TCP | CSUM_IP6_TCP)) != 0) + csum_offset = offsetof(struct tcphdr, th_sum); + else + csum_offset = offsetof(struct udphdr, uh_sum); + } + + /* + * If this packet is neither a TCP nor a UDP packet, the first segment, + * the one represented by the packet descriptor, will carry the + * spec-stipulated minimum of 182B. + */ + if (l4_data_off != 0) + first_seg_len = l4_data_off; + else + first_seg_len = MIN(pkt_len, GVE_GQ_TX_MIN_PKT_DESC_BYTES); + + bytes_required = gve_fifo_bytes_required(tx, first_seg_len, pkt_len); + if (__predict_false(!gve_can_tx(tx, bytes_required))) { + counter_enter(); + counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_device, 1); + counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); + counter_exit(); + return (ENOBUFS); + } + + /* So that the cleanup taskqueue can free the mbuf eventually. 
*/ + info->mbuf = mbuf; + + /* + * We don't want to split the header, so if necessary, pad to the end + * of the fifo and then put the header at the beginning of the fifo. + */ + pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->fifo, first_seg_len); + hdr_nfrags = gve_tx_alloc_fifo(&tx->fifo, first_seg_len + pad_bytes, + &info->iov[0]); + KASSERT(hdr_nfrags > 0, ("Number of header fragments for gve tx is 0")); + payload_nfrags = gve_tx_alloc_fifo(&tx->fifo, pkt_len - first_seg_len, + &info->iov[payload_iov]); + + pkt_desc = &tx->desc_ring[idx].pkt; + gve_tx_fill_pkt_desc(pkt_desc, is_tso, l4_off, + 1 + mtd_desc_nr + payload_nfrags, first_seg_len, + info->iov[hdr_nfrags - 1].iov_offset, has_csum_flag, csum_offset, + pkt_len); + + m_copydata(mbuf, 0, first_seg_len, + (char *)tx->fifo.base + info->iov[hdr_nfrags - 1].iov_offset); + gve_dma_sync_for_device(tx->com.qpl, + info->iov[hdr_nfrags - 1].iov_offset, + info->iov[hdr_nfrags - 1].iov_len); + copy_offset = first_seg_len; + + if (mtd_desc_nr == 1) { + next_idx = (tx->req + 1) & tx->mask; + mtd_desc = &tx->desc_ring[next_idx].mtd; + gve_tx_fill_mtd_desc(mtd_desc, mbuf); + } + + for (i = payload_iov; i < payload_nfrags + payload_iov; i++) { + next_idx = (tx->req + 1 + mtd_desc_nr + i - payload_iov) & tx->mask; + seg_desc = &tx->desc_ring[next_idx].seg; + + gve_tx_fill_seg_desc(seg_desc, is_tso, info->iov[i].iov_len, + info->iov[i].iov_offset, is_ipv6, l3_off, tso_mss); + + m_copydata(mbuf, copy_offset, info->iov[i].iov_len, + (char *)tx->fifo.base + info->iov[i].iov_offset); + gve_dma_sync_for_device(tx->com.qpl, + info->iov[i].iov_offset, info->iov[i].iov_len); + copy_offset += info->iov[i].iov_len; + } + + tx->req += (1 + mtd_desc_nr + payload_nfrags); + if (is_tso) { + counter_enter(); + counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); + counter_exit(); + } + return (0); +} + +static void +gve_xmit_br(struct gve_tx_ring *tx) +{ + struct gve_priv *priv = tx->com.priv; + struct ifnet *ifp = priv->ifp; + struct mbuf *mbuf; + + while (!drbr_empty(ifp, tx->br) && + (if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) { + + mbuf = drbr_peek(ifp, tx->br); + if (__predict_false(gve_xmit(tx, mbuf) != 0)) { + drbr_putback(ifp, tx->br, mbuf); + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + break; + } + + bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); + gve_db_bar_write_4(priv, tx->com.db_offset, tx->req); + + drbr_advance(ifp, tx->br); + BPF_MTAP(ifp, mbuf); + } +} + +void +gve_xmit_tq(void *arg, int pending) +{ + struct gve_tx_ring *tx = (struct gve_tx_ring *)arg; + + GVE_RING_LOCK(tx); + gve_xmit_br(tx); + GVE_RING_UNLOCK(tx); +} + +static bool +is_vlan_tagged_pkt(struct mbuf *mbuf) +{ + struct ether_header *eh; + + eh = mtod(mbuf, struct ether_header *); + return (ntohs(eh->ether_type) == ETHERTYPE_VLAN); +} + +int +gve_xmit_ifp(if_t ifp, struct mbuf *mbuf) +{ + struct gve_priv *priv = if_getsoftc(ifp); + struct gve_tx_ring *tx; + bool is_br_empty; + int err; + uint32_t i; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return (ENODEV); + + if (M_HASHTYPE_GET(mbuf) != M_HASHTYPE_NONE) + i = mbuf->m_pkthdr.flowid % priv->tx_cfg.num_queues; + else + i = curcpu % priv->tx_cfg.num_queues; + tx = &priv->tx[i]; + + if (__predict_false(is_vlan_tagged_pkt(mbuf))) { + counter_enter(); + counter_u64_add_protected(tx->stats.tx_dropped_pkt_vlan, 1); + counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); + counter_exit(); + m_freem(mbuf); + return (ENODEV); + } + + is_br_empty = 
drbr_empty(ifp, tx->br); + err = drbr_enqueue(ifp, tx->br, mbuf); + if (__predict_false(err != 0)) { + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + counter_enter(); + counter_u64_add_protected(tx->stats.tx_dropped_pkt_nospace_bufring, 1); + counter_u64_add_protected(tx->stats.tx_dropped_pkt, 1); + counter_exit(); + return (err); + } + + /* + * If the mbuf we just enqueued is the only one on the ring, then + * transmit it right away in the interests of low latency. + */ + if (is_br_empty && (GVE_RING_TRYLOCK(tx) != 0)) { + gve_xmit_br(tx); + GVE_RING_UNLOCK(tx); + } else { + taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); + } + + return (0); +} + +void +gve_qflush(if_t ifp) +{ + struct gve_priv *priv = if_getsoftc(ifp); + struct gve_tx_ring *tx; + int i; + + for (i = 0; i < priv->tx_cfg.num_queues; ++i) { + tx = &priv->tx[i]; + if (drbr_empty(ifp, tx->br) == 0) { + GVE_RING_LOCK(tx); + drbr_flush(ifp, tx->br); + GVE_RING_UNLOCK(tx); + } + } + + if_qflush(ifp); +} diff --git a/sys/dev/gve/gve_utils.c b/sys/dev/gve/gve_utils.c new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_utils.c @@ -0,0 +1,405 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2023 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "gve.h" + +uint32_t +gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset) +{ + return (be32toh(bus_read_4(priv->reg_bar, offset))); +} + +void +gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) +{ + bus_write_4(priv->reg_bar, offset, htobe32(val)); +} + +void +gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) +{ + bus_write_4(priv->db_bar, offset, htobe32(val)); +} + +void +gve_alloc_counters(counter_u64_t *stat, int num_stats) +{ + int i; + + for (i = 0; i < num_stats; i++) + stat[i] = counter_u64_alloc(M_WAITOK); +} + +void +gve_free_counters(counter_u64_t *stat, int num_stats) +{ + int i; + + for (i = 0; i < num_stats; i++) + counter_u64_free(stat[i]); +} + +/* Currently assumes a single segment. 
*/ +static void +gve_dmamap_load_callback(void *arg, bus_dma_segment_t *segs, int nseg, + int error) +{ + if (error == 0) + *(bus_addr_t *) arg = segs[0].ds_addr; +} + +int +gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, + struct gve_dma_handle *dma) +{ + int err; + device_t dev = priv->dev; + + err = bus_dma_tag_create( + bus_get_dma_tag(dev), /* parent */ + align, 0, /* alignment, bounds */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + size, /* maxsize */ + 1, /* nsegments */ + size, /* maxsegsize */ + BUS_DMA_ALLOCNOW, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockarg */ + &dma->tag); + if (err != 0) { + device_printf(dev, "%s: bus_dma_tag_create failed: %d\n", + __func__, err); + goto clear_tag; + } + + err = bus_dmamem_alloc(dma->tag, (void **) &dma->cpu_addr, + BUS_DMA_WAITOK | BUS_DMA_COHERENT | BUS_DMA_ZERO, + &dma->map); + if (err != 0) { + device_printf(dev, "%s: bus_dmamem_alloc(%ju) failed: %d\n", + __func__, (uintmax_t)size, err); + goto destroy_tag; + } + + /* An address set by the callback will never be -1 */ + dma->bus_addr = (bus_addr_t)-1; + err = bus_dmamap_load(dma->tag, dma->map, dma->cpu_addr, size, + gve_dmamap_load_callback, &dma->bus_addr, BUS_DMA_NOWAIT); + if (err != 0 || dma->bus_addr == (bus_addr_t)-1) { + device_printf(dev, "%s: bus_dmamap_load failed: %d\n", __func__, err); + goto free_mem; + } + + return (0); + +free_mem: + bus_dmamem_free(dma->tag, dma->cpu_addr, dma->map); +destroy_tag: + bus_dma_tag_destroy(dma->tag); +clear_tag: + dma->tag = NULL; + + return (err); +} + +void +gve_dma_free_coherent(struct gve_dma_handle *dma) +{ + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(dma->tag, dma->map); + bus_dmamem_free(dma->tag, dma->cpu_addr, dma->map); + bus_dma_tag_destroy(dma->tag); +} + +int +gve_dmamap_create(struct gve_priv *priv, int size, int align, + struct gve_dma_handle *dma) +{ + int err; + device_t dev = priv->dev; + + err = bus_dma_tag_create( + bus_get_dma_tag(dev), /* parent */ + align, 0, /* alignment, bounds */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + size, /* maxsize */ + 1, /* nsegments */ + size, /* maxsegsize */ + BUS_DMA_ALLOCNOW, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockarg */ + &dma->tag); + if (err != 0) { + device_printf(dev, "%s: bus_dma_tag_create failed: %d\n", + __func__, err); + goto clear_tag; + } + + err = bus_dmamap_create(dma->tag, BUS_DMA_COHERENT, &dma->map); + if (err != 0) { + device_printf(dev, "%s: bus_dmamap_create failed: %d\n", + __func__, err); + goto destroy_tag; + } + + /* An address set by the callback will never be -1 */ + dma->bus_addr = (bus_addr_t)-1; + err = bus_dmamap_load(dma->tag, dma->map, dma->cpu_addr, size, + gve_dmamap_load_callback, &dma->bus_addr, BUS_DMA_WAITOK); + if (err != 0 || dma->bus_addr == (bus_addr_t)-1) { + device_printf(dev, "%s: bus_dmamap_load failed: %d\n", + __func__, err); + goto destroy_map; + } + + return (0); + +destroy_map: + bus_dmamap_destroy(dma->tag, dma->map); +destroy_tag: + bus_dma_tag_destroy(dma->tag); +clear_tag: + dma->tag = NULL; + + return (err); +} + +void +gve_dmamap_destroy(struct gve_dma_handle *dma) +{ + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(dma->tag, dma->map); + bus_dmamap_destroy(dma->tag, dma->map); + bus_dma_tag_destroy(dma->tag); +} + +static int +gve_mgmnt_intr(void 
*arg) +{ + struct gve_priv *priv = arg; + + taskqueue_enqueue(priv->service_tq, &priv->service_task); + return (FILTER_HANDLED); +} + +void +gve_free_irqs(struct gve_priv *priv) +{ + struct gve_irq *irq; + int num_irqs; + int rid; + int rc; + int i; + + if (priv->irq_tbl == NULL) { + device_printf(priv->dev, "No irq table, nothing to free\n"); + return; + } + + num_irqs = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues + 1; + + for (i = 0; i < num_irqs; i++) { + irq = &priv->irq_tbl[i]; + if (irq->res == NULL) + continue; + + rid = rman_get_rid(irq->res); + + rc = bus_teardown_intr(priv->dev, irq->res, irq->cookie); + if (rc != 0) + device_printf(priv->dev, "Failed to teardown irq num %d\n", + rid); + + rc = bus_release_resource(priv->dev, SYS_RES_IRQ, + rid, irq->res); + if (rc != 0) + device_printf(priv->dev, "Failed to release irq num %d\n", + rid); + + irq->res = NULL; + irq->cookie = NULL; + } + + free(priv->irq_tbl, M_GVE); + priv->irq_tbl = NULL; + + /* Safe to call even if msix was never alloced */ + pci_release_msi(priv->dev); +} + +int +gve_alloc_irqs(struct gve_priv *priv) +{ + int num_tx = priv->tx_cfg.num_queues; + int num_rx = priv->rx_cfg.num_queues; + int req_nvecs = num_tx + num_rx + 1; + int got_nvecs = req_nvecs; + struct gve_irq *irq; + int i, j, m; + int rid; + int err; + + struct gve_ring_com *com; + struct gve_rx_ring *rx; + struct gve_tx_ring *tx; + + if (pci_alloc_msix(priv->dev, &got_nvecs) != 0) { + device_printf(priv->dev, "Failed to acquire any msix vectors\n"); + err = ENXIO; + goto abort; + } else if (got_nvecs != req_nvecs) { + device_printf(priv->dev, "Tried to acquire %d msix vectors, got only %d\n", + req_nvecs, got_nvecs); + err = ENOSPC; + goto abort; + } + + if (bootverbose) + device_printf(priv->dev, "Enabled MSIX with %d vectors\n", got_nvecs); + + priv->irq_tbl = malloc(sizeof(struct gve_irq) * req_nvecs, M_GVE, + M_WAITOK | M_ZERO); + + for (i = 0; i < num_tx; i++) { + irq = &priv->irq_tbl[i]; + tx = &priv->tx[i]; + com = &tx->com; + rid = i + 1; + + irq->res = bus_alloc_resource_any(priv->dev, SYS_RES_IRQ, + &rid, RF_ACTIVE); + if (irq->res == NULL) { + device_printf(priv->dev, "Failed to alloc irq %d for Tx queue %d\n", + rid, i); + err = ENOMEM; + goto abort; + } + + err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, + gve_tx_intr, NULL, &priv->tx[i], &irq->cookie); + if (err != 0) { + device_printf(priv->dev, "Failed to setup irq %d for Tx queue %d, " + "err: %d\n", rid, i, err); + goto abort; + } + + bus_describe_intr(priv->dev, irq->res, irq->cookie, "tx%d", i); + com->ntfy_id = i; + } + + for (j = 0; j < num_rx; j++) { + irq = &priv->irq_tbl[i + j]; + rx = &priv->rx[j]; + com = &rx->com; + rid = i + j + 1; + + irq->res = bus_alloc_resource_any(priv->dev, SYS_RES_IRQ, + &rid, RF_ACTIVE); + if (irq->res == NULL) { + device_printf(priv->dev, + "Failed to alloc irq %d for Rx queue %d", rid, j); + err = ENOMEM; + goto abort; + } + + err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, + gve_rx_intr, NULL, &priv->rx[j], &irq->cookie); + if (err != 0) { + device_printf(priv->dev, "Failed to setup irq %d for Rx queue %d, " + "err: %d\n", rid, j, err); + goto abort; + } + + bus_describe_intr(priv->dev, irq->res, irq->cookie, "rx%d", j); + com->ntfy_id = i + j; + } + + m = i + j; + rid = m + 1; + irq = &priv->irq_tbl[m]; + + irq->res = bus_alloc_resource_any(priv->dev, SYS_RES_IRQ, + &rid, RF_ACTIVE); + if (irq->res == NULL) { + device_printf(priv->dev, "Failed to allocate irq %d for mgmnt queue\n", rid); + err 
= ENOMEM; + goto abort; + } + + err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, + gve_mgmnt_intr, NULL, priv, &irq->cookie); + if (err != 0) { + device_printf(priv->dev, "Failed to setup irq %d for mgmnt queue, err: %d\n", + rid, err); + goto abort; + } + + bus_describe_intr(priv->dev, irq->res, irq->cookie, "mgmnt"); + + return (0); + +abort: + gve_free_irqs(priv); + return (err); +} + +void +gve_unmask_all_queue_irqs(struct gve_priv *priv) +{ + struct gve_tx_ring *tx; + struct gve_rx_ring *rx; + int idx; + + for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { + tx = &priv->tx[idx]; + gve_db_bar_write_4(priv, tx->com.irq_db_offset, 0); + } + for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { + rx = &priv->rx[idx]; + gve_db_bar_write_4(priv, rx->com.irq_db_offset, 0); + } +} + +void +gve_mask_all_queue_irqs(struct gve_priv *priv) +{ + for (int idx = 0; idx < priv->tx_cfg.num_queues; idx++) { + struct gve_tx_ring *tx = &priv->tx[idx]; + gve_db_bar_write_4(priv, tx->com.irq_db_offset, GVE_IRQ_MASK); + } + for (int idx = 0; idx < priv->rx_cfg.num_queues; idx++) { + struct gve_rx_ring *rx = &priv->rx[idx]; + gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); + } +} diff --git a/sys/modules/Makefile b/sys/modules/Makefile --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -131,6 +131,7 @@ ${_glxiic} \ ${_glxsb} \ gpio \ + ${_gve} \ hid \ hifn \ ${_hpt27xx} \ @@ -554,6 +555,10 @@ .endif .endif +.if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" +_gve= gve +.endif + .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "i386" _ena= ena diff --git a/sys/modules/gve/Makefile b/sys/modules/gve/Makefile new file mode 100644 --- /dev/null +++ b/sys/modules/gve/Makefile @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: BSD-3-Clause +# +# Copyright (c) 2023 Google LLC +# +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its contributors +# may be used to endorse or promote products derived from this software without +# specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+.PATH: ${SRCTOP}/sys/dev/gve
+
+KMOD= if_gve
+SRCS= gve_main.c gve_adminq.c gve_utils.c gve_qpl.c gve_rx.c gve_tx.c gve_sysctl.c
+SRCS+= device_if.h bus_if.h pci_if.h
+
+.include <bsd.kmod.mk>
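
The RX path in gve_rx.c above relies on a small sequence-number protocol: gve_rx_cleanup() only consumes a descriptor when GVE_SEQNO(desc->flags_seq) equals the ring's expected value, and gve_next_seqno() advances that value through 1..7, never 0. The standalone sketch below models just that counter so the wrap-around is easy to see; it is an illustration with invented names (next_seqno, expected), not code from the patch.

    /*
     * Toy model of the 3-bit RX sequence counter: valid values cycle
     * through 1..7 and 0 never appears, which is how gve_rx_cleanup()
     * tells a NIC-written descriptor from a stale one.
     */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static inline uint8_t
    next_seqno(uint8_t seq)
    {
        return ((seq + 1) == 8 ? 1 : seq + 1);
    }

    int
    main(void)
    {
        uint8_t expected = 1;   /* a freshly reset ring expects seqno 1 */
        int i;

        for (i = 0; i < 16; i++) {
            assert(expected >= 1 && expected <= 7);
            printf("slot %2d should carry seqno %u\n", i, expected);
            expected = next_seqno(expected);
        }

        /*
         * When GVE_SEQNO(desc->flags_seq) != expected, the descriptor has
         * not been produced by the NIC yet and the cleanup loop stops.
         */
        return (0);
    }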
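
The statistics registered in gve_sysctl.c land under the device's sysctl tree, so they can be read from userland once gve attaches. A minimal consumer using sysctlbyname(3) might look like the sketch below; the OID string assumes unit 0 and the node and leaf names registered above ("main_stats", "reset_cnt"), so treat it as an example rather than a guaranteed interface.

    /*
     * Hedged example: read one of the 32-bit counters this patch exposes.
     * The OID assumes device unit 0; adjust for your system.
     */
    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        uint32_t reset_cnt;
        size_t len = sizeof(reset_cnt);

        if (sysctlbyname("dev.gve.0.main_stats.reset_cnt", &reset_cnt, &len,
            NULL, 0) == -1) {
            fprintf(stderr, "sysctlbyname: %s\n", strerror(errno));
            return (1);
        }
        printf("gve0 has been reset %u times\n", reset_cnt);
        return (0);
    }

The counter_u64-backed leaves, such as rxq0.rx_packets, read back as 64-bit values, so a uint64_t buffer is needed for those.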
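
The most intricate bookkeeping in gve_tx.c is gve_tx_alloc_fifo(): an allocation that would run past the end of the circular, QPL-backed FIFO is split into a tail fragment plus a head fragment, and the head pointer is then rounded up to the next cache line so the following packet header starts aligned. The userspace model below reproduces only that arithmetic; the struct, macro, and sizes are simplified stand-ins, not the driver's definitions.

    /*
     * Userspace model of the split-on-wrap plus cache-line padding done
     * by gve_tx_alloc_fifo().  Illustration only.
     */
    #include <stdint.h>
    #include <stdio.h>

    #define MODEL_CACHE_LINE 64
    #define model_roundup2(x, y) (((x) + ((y) - 1)) & ~((y) - 1)) /* y: power of 2 */

    struct model_iovec {
        uint32_t off;   /* offset into the FIFO */
        uint32_t len;   /* bytes of payload */
        uint32_t pad;   /* alignment padding charged to this fragment */
    };

    static int
    model_alloc(uint32_t fifo_size, uint32_t *head, uint32_t bytes,
        struct model_iovec iov[2])
    {
        uint32_t overflow, aligned;
        int nfrags = 1;

        iov[0].off = *head;
        iov[0].len = bytes;
        iov[0].pad = 0;
        *head += bytes;

        if (*head > fifo_size) {
            /* Did not fit before the end: also use the FIFO's head. */
            nfrags = 2;
            overflow = *head - fifo_size;
            iov[0].len -= overflow;
            iov[1].off = 0;
            iov[1].len = overflow;
            iov[1].pad = 0;
            *head = overflow;
        }

        /* Pad so the next allocation starts on a cache-line boundary. */
        aligned = model_roundup2(*head, MODEL_CACHE_LINE);
        iov[nfrags - 1].pad = aligned - *head;
        *head = (aligned == fifo_size) ? 0 : aligned;

        return (nfrags);
    }

    int
    main(void)
    {
        struct model_iovec iov[2];
        uint32_t head = 4000;   /* near the end of a 4 KiB FIFO */
        int i, nfrags = model_alloc(4096, &head, 200, iov);    /* wraps */

        for (i = 0; i < nfrags; i++)
            printf("frag %d: off %u len %u pad %u\n", i,
                iov[i].off, iov[i].len, iov[i].pad);
        printf("new head: %u\n", head);
        return (0);
    }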