diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h index 61781cddee94..c446199dff2d 100644 --- a/sys/dev/gve/gve.h +++ b/sys/dev/gve/gve.h @@ -1,459 +1,460 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef _GVE_FBSD_H #define _GVE_FBSD_H #include "gve_desc.h" #include "gve_plat.h" #include "gve_register.h" #ifndef PCI_VENDOR_ID_GOOGLE #define PCI_VENDOR_ID_GOOGLE 0x1ae0 #endif #define PCI_DEV_ID_GVNIC 0x0042 #define GVE_REGISTER_BAR 0 #define GVE_DOORBELL_BAR 2 /* Driver can alloc up to 2 segments for the header and 2 for the payload. */ #define GVE_TX_MAX_DESCS 4 #define GVE_TX_BUFRING_ENTRIES 4096 #define ADMINQ_SIZE PAGE_SIZE #define GVE_DEFAULT_RX_BUFFER_SIZE 2048 /* Each RX bounce buffer page can fit two packet buffers. */ #define GVE_DEFAULT_RX_BUFFER_OFFSET (PAGE_SIZE / 2) /* * Number of descriptors per queue page list. * Page count AKA QPL size can be derived by dividing the number of elements in * a page by the number of descriptors available. */ #define GVE_QPL_DIVISOR 16 static MALLOC_DEFINE(M_GVE, "gve", "gve allocations"); struct gve_dma_handle { bus_addr_t bus_addr; void *cpu_addr; bus_dma_tag_t tag; bus_dmamap_t map; }; union gve_tx_desc { struct gve_tx_pkt_desc pkt; /* first desc for a packet */ struct gve_tx_mtd_desc mtd; /* optional metadata descriptor */ struct gve_tx_seg_desc seg; /* subsequent descs for a packet */ }; /* Tracks the memory in the fifo occupied by a segment of a packet */ struct gve_tx_iovec { uint32_t iov_offset; /* offset into this segment */ uint32_t iov_len; /* length */ uint32_t iov_padding; /* padding associated with this segment */ }; /* Tracks allowed and current queue settings */ struct gve_queue_config { uint16_t max_queues; uint16_t num_queues; /* current */ }; struct gve_irq_db { __be32 index; } __aligned(CACHE_LINE_SIZE); /* * GVE_QUEUE_FORMAT_UNSPECIFIED must be zero since 0 is the default value * when the entire configure_device_resources command is zeroed out and the * queue_format is not specified. 
 */
enum gve_queue_format {
	GVE_QUEUE_FORMAT_UNSPECIFIED = 0x0,
	GVE_GQI_RDA_FORMAT = 0x1,
	GVE_GQI_QPL_FORMAT = 0x2,
	GVE_DQO_RDA_FORMAT = 0x3,
};

enum gve_state_flags_bit {
	GVE_STATE_FLAG_ADMINQ_OK,
	GVE_STATE_FLAG_RESOURCES_OK,
	GVE_STATE_FLAG_QPLREG_OK,
	GVE_STATE_FLAG_RX_RINGS_OK,
	GVE_STATE_FLAG_TX_RINGS_OK,
	GVE_STATE_FLAG_QUEUES_UP,
	GVE_STATE_FLAG_LINK_UP,
	GVE_STATE_FLAG_DO_RESET,
	GVE_STATE_FLAG_IN_RESET,
	GVE_NUM_STATE_FLAGS /* Not part of the enum space */
};

BITSET_DEFINE(gve_state_flags, GVE_NUM_STATE_FLAGS);

#define GVE_DEVICE_STATUS_RESET (0x1 << 1)
#define GVE_DEVICE_STATUS_LINK_STATUS (0x1 << 2)

#define GVE_RING_LOCK(ring)	mtx_lock(&(ring)->ring_mtx)
#define GVE_RING_TRYLOCK(ring)	mtx_trylock(&(ring)->ring_mtx)
#define GVE_RING_UNLOCK(ring)	mtx_unlock(&(ring)->ring_mtx)
#define GVE_RING_ASSERT(ring)	mtx_assert(&(ring)->ring_mtx, MA_OWNED)

#define GVE_IFACE_LOCK_INIT(lock)	sx_init(&lock, "gve interface lock")
#define GVE_IFACE_LOCK_DESTROY(lock)	sx_destroy(&lock)
#define GVE_IFACE_LOCK_LOCK(lock)	sx_xlock(&lock)
#define GVE_IFACE_LOCK_UNLOCK(lock)	sx_unlock(&lock)
#define GVE_IFACE_LOCK_ASSERT(lock)	sx_assert(&lock, SA_XLOCKED)

struct gve_queue_page_list {
	uint32_t id;
	uint32_t num_dmas;
	uint32_t num_pages;
	vm_offset_t kva;
	vm_page_t *pages;
	struct gve_dma_handle *dmas;
};

struct gve_irq {
	struct resource *res;
	void *cookie;
};

struct gve_rx_slot_page_info {
	void *page_address;
	vm_page_t page;
	uint32_t page_offset;
	uint16_t pad;
};

/*
 * A single received packet split across multiple buffers may be
 * reconstructed using the information in this structure.
 */
struct gve_rx_ctx {
	/* head and tail of mbuf chain for the current packet */
	struct mbuf *mbuf_head;
	struct mbuf *mbuf_tail;
	uint32_t total_size;
	uint8_t frag_cnt;
+	bool is_tcp;
	bool drop_pkt;
};

struct gve_ring_com {
	struct gve_priv *priv;
	uint32_t id;

	/*
	 * BAR2 offset for this ring's doorbell and the
	 * counter-array offset for this ring's counter.
	 * Acquired from the device individually for each
	 * queue in the queue_create adminq command.
	 */
	struct gve_queue_resources *q_resources;
	struct gve_dma_handle q_resources_mem;

	/* Byte offset into BAR2 where this ring's 4-byte irq doorbell lies. */
	uint32_t irq_db_offset;

	/* Byte offset into BAR2 where this ring's 4-byte doorbell lies. */
	uint32_t db_offset;

	/*
	 * Index, not byte-offset, into the counter array where this ring's
	 * 4-byte counter lies.
	 */
	uint32_t counter_idx;

	/*
	 * The index of the MSIX vector that was assigned to
	 * this ring in `gve_alloc_irqs`.
	 *
	 * It is passed to the device in the queue_create adminq
	 * command.
	 *
	 * Additionally, this also serves as the index into
	 * `priv->irq_db_indices` where this ring's irq doorbell's
	 * BAR2 offset, `irq_db_idx`, can be found.
	 */
	int ntfy_id;

	/*
	 * The fixed bounce buffer for this ring.
	 * Once allocated, has to be offered to the device
	 * over the register-page-list adminq command.
*/ struct gve_queue_page_list *qpl; struct task cleanup_task; struct taskqueue *cleanup_tq; } __aligned(CACHE_LINE_SIZE); struct gve_rxq_stats { counter_u64_t rbytes; counter_u64_t rpackets; counter_u64_t rx_dropped_pkt; counter_u64_t rx_copybreak_cnt; counter_u64_t rx_frag_flip_cnt; counter_u64_t rx_frag_copy_cnt; counter_u64_t rx_dropped_pkt_desc_err; counter_u64_t rx_dropped_pkt_mbuf_alloc_fail; }; #define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t)) /* power-of-2 sized receive ring */ struct gve_rx_ring { struct gve_ring_com com; struct gve_dma_handle desc_ring_mem; struct gve_dma_handle data_ring_mem; /* accessed in the receive hot path */ struct { struct gve_rx_desc *desc_ring; union gve_rx_data_slot *data_ring; struct gve_rx_slot_page_info *page_info; struct gve_rx_ctx ctx; struct lro_ctrl lro; uint8_t seq_no; /* helps traverse the descriptor ring */ uint32_t cnt; /* free-running total number of completed packets */ uint32_t fill_cnt; /* free-running total number of descs and buffs posted */ uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */ struct gve_rxq_stats stats; } __aligned(CACHE_LINE_SIZE); } __aligned(CACHE_LINE_SIZE); /* * A contiguous representation of the pages composing the Tx bounce buffer. * The xmit taskqueue and the completion taskqueue both simultaneously use it. * Both operate on `available`: the xmit tq lowers it and the completion tq * raises it. `head` is the last location written at and so only the xmit tq * uses it. */ struct gve_tx_fifo { vm_offset_t base; /* address of base of FIFO */ uint32_t size; /* total size */ volatile int available; /* how much space is still available */ uint32_t head; /* offset to write at */ }; struct gve_tx_buffer_state { struct mbuf *mbuf; struct gve_tx_iovec iov[GVE_TX_MAX_DESCS]; }; struct gve_txq_stats { counter_u64_t tbytes; counter_u64_t tpackets; counter_u64_t tso_packet_cnt; counter_u64_t tx_dropped_pkt; counter_u64_t tx_dropped_pkt_nospace_device; counter_u64_t tx_dropped_pkt_nospace_bufring; counter_u64_t tx_dropped_pkt_vlan; }; #define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t)) /* power-of-2 sized transmit ring */ struct gve_tx_ring { struct gve_ring_com com; struct gve_dma_handle desc_ring_mem; struct task xmit_task; struct taskqueue *xmit_tq; /* accessed in the transmit hot path */ struct { union gve_tx_desc *desc_ring; struct gve_tx_buffer_state *info; struct buf_ring *br; struct gve_tx_fifo fifo; struct mtx ring_mtx; uint32_t req; /* free-running total number of packets written to the nic */ uint32_t done; /* free-running total number of completed packets */ uint32_t mask; /* masks the req and done to the size of the ring */ struct gve_txq_stats stats; } __aligned(CACHE_LINE_SIZE); } __aligned(CACHE_LINE_SIZE); struct gve_priv { if_t ifp; device_t dev; struct ifmedia media; uint8_t mac[ETHER_ADDR_LEN]; struct gve_dma_handle aq_mem; struct resource *reg_bar; /* BAR0 */ struct resource *db_bar; /* BAR2 */ struct resource *msix_table; uint32_t mgmt_msix_idx; uint32_t rx_copybreak; uint16_t num_event_counters; uint16_t default_num_queues; uint16_t tx_desc_cnt; uint16_t rx_desc_cnt; uint16_t rx_pages_per_qpl; uint64_t max_registered_pages; uint64_t num_registered_pages; uint32_t supported_features; uint16_t max_mtu; struct gve_dma_handle counter_array_mem; __be32 *counters; struct gve_dma_handle irqs_db_mem; struct gve_irq_db *irq_db_indices; enum gve_queue_format queue_format; struct gve_queue_page_list *qpls; struct gve_queue_config tx_cfg; struct 
gve_queue_config rx_cfg; uint32_t num_queues; struct gve_irq *irq_tbl; struct gve_tx_ring *tx; struct gve_rx_ring *rx; /* * Admin queue - see gve_adminq.h * Since AQ cmds do not run in steady state, 32 bit counters suffice */ struct gve_adminq_command *adminq; vm_paddr_t adminq_bus_addr; uint32_t adminq_mask; /* masks prod_cnt to adminq size */ uint32_t adminq_prod_cnt; /* free-running count of AQ cmds executed */ uint32_t adminq_cmd_fail; /* free-running count of AQ cmds failed */ uint32_t adminq_timeouts; /* free-running count of AQ cmds timeouts */ /* free-running count of each distinct AQ cmd executed */ uint32_t adminq_describe_device_cnt; uint32_t adminq_cfg_device_resources_cnt; uint32_t adminq_register_page_list_cnt; uint32_t adminq_unregister_page_list_cnt; uint32_t adminq_create_tx_queue_cnt; uint32_t adminq_create_rx_queue_cnt; uint32_t adminq_destroy_tx_queue_cnt; uint32_t adminq_destroy_rx_queue_cnt; uint32_t adminq_dcfg_device_resources_cnt; uint32_t adminq_set_driver_parameter_cnt; uint32_t adminq_verify_driver_compatibility_cnt; uint32_t interface_up_cnt; uint32_t interface_down_cnt; uint32_t reset_cnt; struct task service_task; struct taskqueue *service_tq; struct gve_state_flags state_flags; struct sx gve_iface_lock; }; static inline bool gve_get_state_flag(struct gve_priv *priv, int pos) { return (BIT_ISSET(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags)); } static inline void gve_set_state_flag(struct gve_priv *priv, int pos) { BIT_SET_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); } static inline void gve_clear_state_flag(struct gve_priv *priv, int pos) { BIT_CLR_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); } /* Defined in gve_main.c */ void gve_schedule_reset(struct gve_priv *priv); /* Register access functions defined in gve_utils.c */ uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset); void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); /* QPL (Queue Page List) functions defined in gve_qpl.c */ int gve_alloc_qpls(struct gve_priv *priv); void gve_free_qpls(struct gve_priv *priv); int gve_register_qpls(struct gve_priv *priv); int gve_unregister_qpls(struct gve_priv *priv); /* TX functions defined in gve_tx.c */ int gve_alloc_tx_rings(struct gve_priv *priv); void gve_free_tx_rings(struct gve_priv *priv); int gve_create_tx_rings(struct gve_priv *priv); int gve_destroy_tx_rings(struct gve_priv *priv); int gve_tx_intr(void *arg); int gve_xmit_ifp(if_t ifp, struct mbuf *mbuf); void gve_qflush(if_t ifp); void gve_xmit_tq(void *arg, int pending); void gve_tx_cleanup_tq(void *arg, int pending); /* RX functions defined in gve_rx.c */ int gve_alloc_rx_rings(struct gve_priv *priv); void gve_free_rx_rings(struct gve_priv *priv); int gve_create_rx_rings(struct gve_priv *priv); int gve_destroy_rx_rings(struct gve_priv *priv); int gve_rx_intr(void *arg); void gve_rx_cleanup_tq(void *arg, int pending); /* DMA functions defined in gve_utils.c */ int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma); void gve_dma_free_coherent(struct gve_dma_handle *dma); int gve_dmamap_create(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma); void gve_dmamap_destroy(struct gve_dma_handle *dma); /* IRQ functions defined in gve_utils.c */ void gve_free_irqs(struct gve_priv *priv); int gve_alloc_irqs(struct gve_priv *priv); void gve_unmask_all_queue_irqs(struct gve_priv *priv); void 
gve_mask_all_queue_irqs(struct gve_priv *priv);

/* Systcl functions defined in gve_sysctl.c*/
void gve_setup_sysctl(struct gve_priv *priv);
void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets,
    uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets,
    uint64_t *tbytes, uint64_t *tx_dropped_pkt);

/* Stats functions defined in gve_utils.c */
void gve_alloc_counters(counter_u64_t *stat, int num_stats);
void gve_free_counters(counter_u64_t *stat, int num_stats);

#endif /* _GVE_FBSD_H_ */
diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c
index c3ee6d5c3433..cd7849778bce 100644
--- a/sys/dev/gve/gve_main.c
+++ b/sys/dev/gve/gve_main.c
@@ -1,872 +1,872 @@
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2023 Google LLC
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its contributors
 *    may be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "gve.h"
#include "gve_adminq.h"

-#define GVE_DRIVER_VERSION "GVE-FBSD-1.0.0\n"
-#define GVE_VERSION_MAJOR 0
-#define GVE_VERSION_MINOR 9
-#define GVE_VERSION_SUB 0
+#define GVE_DRIVER_VERSION "GVE-FBSD-1.0.1\n"
+#define GVE_VERSION_MAJOR 1
+#define GVE_VERSION_MINOR 0
+#define GVE_VERSION_SUB 1

#define GVE_DEFAULT_RX_COPYBREAK 256

/* Devices supported by this driver.
*/ static struct gve_dev { uint16_t vendor_id; uint16_t device_id; const char *name; } gve_devs[] = { { PCI_VENDOR_ID_GOOGLE, PCI_DEV_ID_GVNIC, "gVNIC" } }; struct sx gve_global_lock; static int gve_verify_driver_compatibility(struct gve_priv *priv) { int err; struct gve_driver_info *driver_info; struct gve_dma_handle driver_info_mem; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_driver_info), PAGE_SIZE, &driver_info_mem); if (err != 0) return (ENOMEM); driver_info = driver_info_mem.cpu_addr; *driver_info = (struct gve_driver_info) { .os_type = 3, /* Freebsd */ .driver_major = GVE_VERSION_MAJOR, .driver_minor = GVE_VERSION_MINOR, .driver_sub = GVE_VERSION_SUB, .os_version_major = htobe32(FBSD_VERSION_MAJOR), .os_version_minor = htobe32(FBSD_VERSION_MINOR), .os_version_sub = htobe32(FBSD_VERSION_PATCH), .driver_capability_flags = { htobe64(GVE_DRIVER_CAPABILITY_FLAGS1), htobe64(GVE_DRIVER_CAPABILITY_FLAGS2), htobe64(GVE_DRIVER_CAPABILITY_FLAGS3), htobe64(GVE_DRIVER_CAPABILITY_FLAGS4), }, }; snprintf(driver_info->os_version_str1, sizeof(driver_info->os_version_str1), "FreeBSD %u", __FreeBSD_version); bus_dmamap_sync(driver_info_mem.tag, driver_info_mem.map, BUS_DMASYNC_PREREAD); err = gve_adminq_verify_driver_compatibility(priv, sizeof(struct gve_driver_info), driver_info_mem.bus_addr); /* It's ok if the device doesn't support this */ if (err == EOPNOTSUPP) err = 0; gve_dma_free_coherent(&driver_info_mem); return (err); } static int gve_up(struct gve_priv *priv) { if_t ifp = priv->ifp; int err; GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); if (device_is_attached(priv->dev) == 0) { device_printf(priv->dev, "Cannot bring the iface up when detached\n"); return (ENXIO); } if (gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return (0); if_clearhwassist(ifp); if (if_getcapenable(ifp) & IFCAP_TXCSUM) if_sethwassistbits(ifp, CSUM_TCP | CSUM_UDP, 0); if (if_getcapenable(ifp) & IFCAP_TXCSUM_IPV6) if_sethwassistbits(ifp, CSUM_IP6_TCP | CSUM_IP6_UDP, 0); if (if_getcapenable(ifp) & IFCAP_TSO4) if_sethwassistbits(ifp, CSUM_IP_TSO, 0); if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); err = gve_register_qpls(priv); if (err != 0) goto reset; err = gve_create_rx_rings(priv); if (err != 0) goto reset; err = gve_create_tx_rings(priv); if (err != 0) goto reset; if_setdrvflagbits(ifp, IFF_DRV_RUNNING, IFF_DRV_OACTIVE); if (!gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(ifp, LINK_STATE_UP); gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } gve_unmask_all_queue_irqs(priv); gve_set_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_up_cnt++; return (0); reset: gve_schedule_reset(priv); return (err); } static void gve_down(struct gve_priv *priv) { GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) return; if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); if (gve_destroy_rx_rings(priv) != 0) goto reset; if (gve_destroy_tx_rings(priv) != 0) goto reset; if (gve_unregister_qpls(priv) != 0) goto reset; gve_mask_all_queue_irqs(priv); gve_clear_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_down_cnt++; return; reset: gve_schedule_reset(priv); } static int gve_set_mtu(if_t ifp, uint32_t new_mtu) { struct gve_priv *priv = if_getsoftc(ifp); int err; if ((new_mtu > priv->max_mtu) || (new_mtu < ETHERMIN)) { 
device_printf(priv->dev, "Invalid new MTU setting. new mtu: %d max mtu: %d min mtu: %d\n", new_mtu, priv->max_mtu, ETHERMIN); return (EINVAL); } err = gve_adminq_set_mtu(priv, new_mtu); if (err == 0) { if (bootverbose) device_printf(priv->dev, "MTU set to %d\n", new_mtu); if_setmtu(ifp, new_mtu); } else { device_printf(priv->dev, "Failed to set MTU to %d\n", new_mtu); } return (err); } static void gve_init(void *arg) { struct gve_priv *priv = (struct gve_priv *)arg; if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } static int gve_ioctl(if_t ifp, u_long command, caddr_t data) { struct gve_priv *priv; struct ifreq *ifr; int rc = 0; priv = if_getsoftc(ifp); ifr = (struct ifreq *)data; switch (command) { case SIOCSIFMTU: if (if_getmtu(ifp) == ifr->ifr_mtu) break; GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); gve_set_mtu(ifp, ifr->ifr_mtu); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); break; case SIOCSIFFLAGS: if ((if_getflags(ifp) & IFF_UP) != 0) { if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } else { if ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0) { GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } } break; case SIOCSIFCAP: if (ifr->ifr_reqcap == if_getcapenable(ifp)) break; GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_down(priv); if_setcapenable(ifp, ifr->ifr_reqcap); rc = gve_up(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); break; case SIOCSIFMEDIA: /* FALLTHROUGH */ case SIOCGIFMEDIA: rc = ifmedia_ioctl(ifp, ifr, &priv->media, command); break; default: rc = ether_ioctl(ifp, command, data); break; } return (rc); } static int gve_media_change(if_t ifp) { struct gve_priv *priv = if_getsoftc(ifp); device_printf(priv->dev, "Media change not supported\n"); return (0); } static void gve_media_status(if_t ifp, struct ifmediareq *ifmr) { struct gve_priv *priv = if_getsoftc(ifp); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); ifmr->ifm_status = IFM_AVALID; ifmr->ifm_active = IFM_ETHER; if (gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) { ifmr->ifm_status |= IFM_ACTIVE; ifmr->ifm_active |= IFM_AUTO; } else { ifmr->ifm_active |= IFM_NONE; } GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); } static uint64_t gve_get_counter(if_t ifp, ift_counter cnt) { struct gve_priv *priv; uint64_t rpackets = 0; uint64_t tpackets = 0; uint64_t rbytes = 0; uint64_t tbytes = 0; uint64_t rx_dropped_pkt = 0; uint64_t tx_dropped_pkt = 0; priv = if_getsoftc(ifp); gve_accum_stats(priv, &rpackets, &rbytes, &rx_dropped_pkt, &tpackets, &tbytes, &tx_dropped_pkt); switch (cnt) { case IFCOUNTER_IPACKETS: return (rpackets); case IFCOUNTER_OPACKETS: return (tpackets); case IFCOUNTER_IBYTES: return (rbytes); case IFCOUNTER_OBYTES: return (tbytes); case IFCOUNTER_IQDROPS: return (rx_dropped_pkt); case IFCOUNTER_OQDROPS: return (tx_dropped_pkt); default: return (if_get_counter_default(ifp, cnt)); } } static int gve_setup_ifnet(device_t dev, struct gve_priv *priv) { int caps = 0; if_t ifp; ifp = priv->ifp = if_alloc(IFT_ETHER); if (ifp == NULL) { device_printf(priv->dev, "Failed to allocate ifnet struct\n"); return (ENXIO); } if_initname(ifp, device_get_name(dev), device_get_unit(dev)); if_setsoftc(ifp, priv); if_setdev(ifp, dev); if_setinitfn(ifp, gve_init); if_setioctlfn(ifp, gve_ioctl); if_settransmitfn(ifp, 
gve_xmit_ifp); if_setqflushfn(ifp, gve_qflush); #if __FreeBSD_version >= 1400086 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); #else if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST | IFF_KNOWSEPOCH); #endif ifmedia_init(&priv->media, IFM_IMASK, gve_media_change, gve_media_status); if_setgetcounterfn(ifp, gve_get_counter); caps = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6 | IFCAP_TSO | IFCAP_LRO; if ((priv->supported_features & GVE_SUP_JUMBO_FRAMES_MASK) != 0) caps |= IFCAP_JUMBO_MTU; if_setcapabilities(ifp, caps); if_setcapenable(ifp, caps); if (bootverbose) device_printf(priv->dev, "Setting initial MTU to %d\n", priv->max_mtu); if_setmtu(ifp, priv->max_mtu); ether_ifattach(ifp, priv->mac); ifmedia_add(&priv->media, IFM_ETHER | IFM_AUTO, 0, NULL); ifmedia_set(&priv->media, IFM_ETHER | IFM_AUTO); return (0); } static int gve_alloc_counter_array(struct gve_priv *priv) { int err; err = gve_dma_alloc_coherent(priv, sizeof(uint32_t) * priv->num_event_counters, PAGE_SIZE, &priv->counter_array_mem); if (err != 0) return (err); priv->counters = priv->counter_array_mem.cpu_addr; return (0); } static void gve_free_counter_array(struct gve_priv *priv) { if (priv->counters != NULL) gve_dma_free_coherent(&priv->counter_array_mem); priv->counter_array_mem = (struct gve_dma_handle){}; } static int gve_alloc_irq_db_array(struct gve_priv *priv) { int err; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_irq_db) * (priv->num_queues), PAGE_SIZE, &priv->irqs_db_mem); if (err != 0) return (err); priv->irq_db_indices = priv->irqs_db_mem.cpu_addr; return (0); } static void gve_free_irq_db_array(struct gve_priv *priv) { if (priv->irq_db_indices != NULL) gve_dma_free_coherent(&priv->irqs_db_mem); priv->irqs_db_mem = (struct gve_dma_handle){}; } static void gve_free_rings(struct gve_priv *priv) { gve_free_irqs(priv); gve_free_tx_rings(priv); gve_free_rx_rings(priv); gve_free_qpls(priv); } static int gve_alloc_rings(struct gve_priv *priv) { int err; err = gve_alloc_qpls(priv); if (err != 0) goto abort; err = gve_alloc_rx_rings(priv); if (err != 0) goto abort; err = gve_alloc_tx_rings(priv); if (err != 0) goto abort; err = gve_alloc_irqs(priv); if (err != 0) goto abort; return (0); abort: gve_free_rings(priv); return (err); } static void gve_deconfigure_resources(struct gve_priv *priv) { int err; if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) { err = gve_adminq_deconfigure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to deconfigure device resources: err=%d\n", err); return; } if (bootverbose) device_printf(priv->dev, "Deconfigured device resources\n"); gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); } gve_free_irq_db_array(priv); gve_free_counter_array(priv); } static int gve_configure_resources(struct gve_priv *priv) { int err; if (gve_get_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK)) return (0); err = gve_alloc_counter_array(priv); if (err != 0) return (err); err = gve_alloc_irq_db_array(priv); if (err != 0) goto abort; err = gve_adminq_configure_device_resources(priv); if (err != 0) { device_printf(priv->dev, "Failed to configure device resources: err=%d\n", err); err = (ENXIO); goto abort; } gve_set_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); if (bootverbose) device_printf(priv->dev, "Configured device resources\n"); return (0); abort: gve_deconfigure_resources(priv); return (err); } static void gve_set_queue_cnts(struct gve_priv *priv) { priv->tx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_TX_QUEUES); 
priv->rx_cfg.max_queues = gve_reg_bar_read_4(priv, MAX_RX_QUEUES); priv->tx_cfg.num_queues = priv->tx_cfg.max_queues; priv->rx_cfg.num_queues = priv->rx_cfg.max_queues; if (priv->default_num_queues > 0) { priv->tx_cfg.num_queues = MIN(priv->default_num_queues, priv->tx_cfg.num_queues); priv->rx_cfg.num_queues = MIN(priv->default_num_queues, priv->rx_cfg.num_queues); } priv->num_queues = priv->tx_cfg.num_queues + priv->rx_cfg.num_queues; priv->mgmt_msix_idx = priv->num_queues; } static int gve_alloc_adminq_and_describe_device(struct gve_priv *priv) { int err; if ((err = gve_adminq_alloc(priv)) != 0) return (err); if ((err = gve_verify_driver_compatibility(priv)) != 0) { device_printf(priv->dev, "Failed to verify driver compatibility: err=%d\n", err); goto abort; } if ((err = gve_adminq_describe_device(priv)) != 0) goto abort; gve_set_queue_cnts(priv); priv->num_registered_pages = 0; return (0); abort: gve_release_adminq(priv); return (err); } void gve_schedule_reset(struct gve_priv *priv) { if (gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) return; device_printf(priv->dev, "Scheduling reset task!\n"); gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); taskqueue_enqueue(priv->service_tq, &priv->service_task); } static void gve_destroy(struct gve_priv *priv) { gve_down(priv); gve_deconfigure_resources(priv); gve_release_adminq(priv); } static void gve_restore(struct gve_priv *priv) { int err; err = gve_adminq_alloc(priv); if (err != 0) goto abort; err = gve_configure_resources(priv); if (err != 0) goto abort; err = gve_up(priv); if (err != 0) goto abort; return; abort: device_printf(priv->dev, "Restore failed!\n"); return; } static void gve_handle_reset(struct gve_priv *priv) { if (!gve_get_state_flag(priv, GVE_STATE_FLAG_DO_RESET)) return; gve_clear_state_flag(priv, GVE_STATE_FLAG_DO_RESET); gve_set_state_flag(priv, GVE_STATE_FLAG_IN_RESET); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); /* * Releasing the adminq causes the NIC to destroy all resources * registered with it, so by clearing the flags beneath we cause * the subsequent gve_down call below to not attempt to tell the * NIC to destroy these resources again. * * The call to gve_down is needed in the first place to refresh * the state and the DMA-able memory within each driver ring. 
*/ gve_release_adminq(priv); gve_clear_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); gve_clear_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); gve_down(priv); gve_restore(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); priv->reset_cnt++; gve_clear_state_flag(priv, GVE_STATE_FLAG_IN_RESET); } static void gve_handle_link_status(struct gve_priv *priv) { uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); bool link_up = status & GVE_DEVICE_STATUS_LINK_STATUS; if (link_up == gve_get_state_flag(priv, GVE_STATE_FLAG_LINK_UP)) return; if (link_up) { if (bootverbose) device_printf(priv->dev, "Device link is up.\n"); if_link_state_change(priv->ifp, LINK_STATE_UP); gve_set_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } else { device_printf(priv->dev, "Device link is down.\n"); if_link_state_change(priv->ifp, LINK_STATE_DOWN); gve_clear_state_flag(priv, GVE_STATE_FLAG_LINK_UP); } } static void gve_service_task(void *arg, int pending) { struct gve_priv *priv = (struct gve_priv *)arg; uint32_t status = gve_reg_bar_read_4(priv, DEVICE_STATUS); if (((GVE_DEVICE_STATUS_RESET_MASK & status) != 0) && !gve_get_state_flag(priv, GVE_STATE_FLAG_IN_RESET)) { device_printf(priv->dev, "Device requested reset\n"); gve_set_state_flag(priv, GVE_STATE_FLAG_DO_RESET); } gve_handle_reset(priv); gve_handle_link_status(priv); } static int gve_probe(device_t dev) { uint16_t deviceid, vendorid; int i; vendorid = pci_get_vendor(dev); deviceid = pci_get_device(dev); for (i = 0; i < nitems(gve_devs); i++) { if (vendorid == gve_devs[i].vendor_id && deviceid == gve_devs[i].device_id) { device_set_desc(dev, gve_devs[i].name); return (BUS_PROBE_DEFAULT); } } return (ENXIO); } static void gve_free_sys_res_mem(struct gve_priv *priv) { if (priv->msix_table != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->msix_table), priv->msix_table); if (priv->db_bar != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->db_bar), priv->db_bar); if (priv->reg_bar != NULL) bus_release_resource(priv->dev, SYS_RES_MEMORY, rman_get_rid(priv->reg_bar), priv->reg_bar); } static int gve_attach(device_t dev) { struct gve_priv *priv; int rid; int err; priv = device_get_softc(dev); priv->dev = dev; GVE_IFACE_LOCK_INIT(priv->gve_iface_lock); pci_enable_busmaster(dev); rid = PCIR_BAR(GVE_REGISTER_BAR); priv->reg_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->reg_bar == NULL) { device_printf(dev, "Failed to allocate BAR0\n"); err = ENXIO; goto abort; } rid = PCIR_BAR(GVE_DOORBELL_BAR); priv->db_bar = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->db_bar == NULL) { device_printf(dev, "Failed to allocate BAR2\n"); err = ENXIO; goto abort; } rid = pci_msix_table_bar(priv->dev); priv->msix_table = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &rid, RF_ACTIVE); if (priv->msix_table == NULL) { device_printf(dev, "Failed to allocate msix table\n"); err = ENXIO; goto abort; } err = gve_alloc_adminq_and_describe_device(priv); if (err != 0) goto abort; err = gve_configure_resources(priv); if (err != 0) goto abort; err = gve_alloc_rings(priv); if (err != 0) goto abort; err = gve_setup_ifnet(dev, priv); if (err != 0) goto abort; priv->rx_copybreak = GVE_DEFAULT_RX_COPYBREAK; bus_write_multi_1(priv->reg_bar, DRIVER_VERSION, GVE_DRIVER_VERSION, sizeof(GVE_DRIVER_VERSION) - 1); TASK_INIT(&priv->service_task, 0, gve_service_task, priv); priv->service_tq = 
taskqueue_create("gve service", M_WAITOK | M_ZERO, taskqueue_thread_enqueue, &priv->service_tq); taskqueue_start_threads(&priv->service_tq, 1, PI_NET, "%s service tq", device_get_nameunit(priv->dev)); gve_setup_sysctl(priv); if (bootverbose) device_printf(priv->dev, "Successfully attached %s", GVE_DRIVER_VERSION); return (0); abort: gve_free_rings(priv); gve_deconfigure_resources(priv); gve_release_adminq(priv); gve_free_sys_res_mem(priv); GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); return (err); } static int gve_detach(device_t dev) { struct gve_priv *priv = device_get_softc(dev); if_t ifp = priv->ifp; ether_ifdetach(ifp); GVE_IFACE_LOCK_LOCK(priv->gve_iface_lock); gve_destroy(priv); GVE_IFACE_LOCK_UNLOCK(priv->gve_iface_lock); gve_free_rings(priv); gve_free_sys_res_mem(priv); GVE_IFACE_LOCK_DESTROY(priv->gve_iface_lock); while (taskqueue_cancel(priv->service_tq, &priv->service_task, NULL)) taskqueue_drain(priv->service_tq, &priv->service_task); taskqueue_free(priv->service_tq); if_free(ifp); return (bus_generic_detach(dev)); } static device_method_t gve_methods[] = { DEVMETHOD(device_probe, gve_probe), DEVMETHOD(device_attach, gve_attach), DEVMETHOD(device_detach, gve_detach), DEVMETHOD_END }; static driver_t gve_driver = { "gve", gve_methods, sizeof(struct gve_priv) }; #if __FreeBSD_version < 1301503 static devclass_t gve_devclass; DRIVER_MODULE(gve, pci, gve_driver, gve_devclass, 0, 0); #else DRIVER_MODULE(gve, pci, gve_driver, 0, 0); #endif MODULE_PNP_INFO("U16:vendor;U16:device;D:#", pci, gve, gve_devs, nitems(gve_devs)); diff --git a/sys/dev/gve/gve_rx.c b/sys/dev/gve/gve_rx.c index f5d8fd7a7b75..9be96cf1ee3a 100644 --- a/sys/dev/gve/gve_rx.c +++ b/sys/dev/gve/gve_rx.c @@ -1,684 +1,685 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 2023 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * 3. Neither the name of the copyright holder nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "gve.h" #include "gve_adminq.h" static void gve_rx_free_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; /* Safe to call even if never allocated */ gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); if (rx->page_info != NULL) { free(rx->page_info, M_GVE); rx->page_info = NULL; } if (rx->data_ring != NULL) { gve_dma_free_coherent(&rx->data_ring_mem); rx->data_ring = NULL; } if (rx->desc_ring != NULL) { gve_dma_free_coherent(&rx->desc_ring_mem); rx->desc_ring = NULL; } if (com->q_resources != NULL) { gve_dma_free_coherent(&com->q_resources_mem); com->q_resources = NULL; } } static void gve_prefill_rx_slots(struct gve_rx_ring *rx) { struct gve_ring_com *com = &rx->com; struct gve_dma_handle *dma; int i; for (i = 0; i < com->priv->rx_desc_cnt; i++) { rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i); rx->page_info[i].page_offset = 0; rx->page_info[i].page_address = com->qpl->dmas[i].cpu_addr; rx->page_info[i].page = com->qpl->pages[i]; dma = &com->qpl->dmas[i]; bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREREAD); } bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, BUS_DMASYNC_PREWRITE); } static int gve_rx_alloc_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; int err; com->priv = priv; com->id = i; rx->mask = priv->rx_pages_per_qpl - 1; com->qpl = &priv->qpls[priv->tx_cfg.max_queues + i]; if (com->qpl == NULL) { device_printf(priv->dev, "No QPL left for rx ring %d", i); return (ENOMEM); } rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info), M_GVE, M_WAITOK | M_ZERO); gve_alloc_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), PAGE_SIZE, &com->q_resources_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc queue resources for rx ring %d", i); goto abort; } com->q_resources = com->q_resources_mem.cpu_addr; err = gve_dma_alloc_coherent(priv, sizeof(struct gve_rx_desc) * priv->rx_desc_cnt, CACHE_LINE_SIZE, &rx->desc_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc desc ring for rx ring %d", i); goto abort; } rx->desc_ring = rx->desc_ring_mem.cpu_addr; err = gve_dma_alloc_coherent(priv, sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt, CACHE_LINE_SIZE, &rx->data_ring_mem); if (err != 0) { device_printf(priv->dev, "Failed to alloc data ring for rx ring %d", i); goto abort; } rx->data_ring = rx->data_ring_mem.cpu_addr; gve_prefill_rx_slots(rx); return (0); abort: gve_rx_free_ring(priv, i); return (err); } int gve_alloc_rx_rings(struct gve_priv *priv) { int err = 0; int i; priv->rx = malloc(sizeof(struct gve_rx_ring) * priv->rx_cfg.num_queues, M_GVE, M_WAITOK | M_ZERO); for (i = 0; i < priv->rx_cfg.num_queues; i++) { err = gve_rx_alloc_ring(priv, i); if (err != 0) goto free_rings; } return (0); free_rings: while (i--) gve_rx_free_ring(priv, i); free(priv->rx, M_GVE); return (err); } void gve_free_rx_rings(struct gve_priv *priv) { int i; for (i = 0; i < priv->rx_cfg.num_queues; i++) gve_rx_free_ring(priv, i); free(priv->rx, M_GVE); } static void gve_rx_clear_data_ring(struct gve_rx_ring *rx) { struct gve_priv *priv = rx->com.priv; int i; /* * The Rx data ring has this invariant: "the networking stack is not * using the buffer beginning at any page_offset". This invariant is * established initially by gve_prefill_rx_slots at alloc-time and is * maintained by the cleanup taskqueue. 
This invariant implies that the * ring can be considered to be fully posted with buffers at this point, * even if there are unfreed mbufs still being processed, which is why we * can fill the ring without waiting on can_flip at each slot to become true. */ for (i = 0; i < priv->rx_desc_cnt; i++) { rx->data_ring[i].qpl_offset = htobe64(PAGE_SIZE * i + rx->page_info[i].page_offset); rx->fill_cnt++; } bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_rx_clear_desc_ring(struct gve_rx_ring *rx) { struct gve_priv *priv = rx->com.priv; int i; for (i = 0; i < priv->rx_desc_cnt; i++) rx->desc_ring[i] = (struct gve_rx_desc){}; bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); } static void gve_clear_rx_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; rx->seq_no = 1; rx->cnt = 0; rx->fill_cnt = 0; rx->mask = priv->rx_desc_cnt - 1; gve_rx_clear_desc_ring(rx); gve_rx_clear_data_ring(rx); } static void gve_start_rx_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; if ((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) { if (tcp_lro_init(&rx->lro) != 0) device_printf(priv->dev, "Failed to init lro for rx ring %d", i); rx->lro.ifp = priv->ifp; } NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq, rx); com->cleanup_tq = taskqueue_create_fast("gve rx", M_WAITOK, taskqueue_thread_enqueue, &com->cleanup_tq); taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s rxq %d", device_get_nameunit(priv->dev), i); gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt); } int gve_create_rx_rings(struct gve_priv *priv) { struct gve_ring_com *com; struct gve_rx_ring *rx; int err; int i; if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK)) return (0); for (i = 0; i < priv->rx_cfg.num_queues; i++) gve_clear_rx_ring(priv, i); err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues); if (err != 0) return (err); bus_dmamap_sync(priv->irqs_db_mem.tag, priv->irqs_db_mem.map, BUS_DMASYNC_POSTREAD); for (i = 0; i < priv->rx_cfg.num_queues; i++) { rx = &priv->rx[i]; com = &rx->com; com->irq_db_offset = 4 * be32toh(priv->irq_db_indices[com->ntfy_id].index); bus_dmamap_sync(com->q_resources_mem.tag, com->q_resources_mem.map, BUS_DMASYNC_POSTREAD); com->db_offset = 4 * be32toh(com->q_resources->db_index); com->counter_idx = be32toh(com->q_resources->counter_index); gve_start_rx_ring(priv, i); } gve_set_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); return (0); } static void gve_stop_rx_ring(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; if (com->cleanup_tq != NULL) { taskqueue_quiesce(com->cleanup_tq); taskqueue_free(com->cleanup_tq); com->cleanup_tq = NULL; } tcp_lro_free(&rx->lro); rx->ctx = (struct gve_rx_ctx){}; } int gve_destroy_rx_rings(struct gve_priv *priv) { int err; int i; for (i = 0; i < priv->rx_cfg.num_queues; i++) gve_stop_rx_ring(priv, i); if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK)) { err = gve_adminq_destroy_rx_queues(priv, priv->rx_cfg.num_queues); if (err != 0) return (err); gve_clear_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); } return (0); } int gve_rx_intr(void *arg) { struct gve_rx_ring *rx = arg; struct gve_priv *priv = rx->com.priv; struct gve_ring_com *com = &rx->com; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return (FILTER_STRAY); gve_db_bar_write_4(priv, com->irq_db_offset, GVE_IRQ_MASK); 
taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); return (FILTER_HANDLED); } static inline void gve_set_rss_type(__be16 flag, struct mbuf *mbuf) { if ((flag & GVE_RXF_IPV4) != 0) { if ((flag & GVE_RXF_TCP) != 0) M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4); else if ((flag & GVE_RXF_UDP) != 0) M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4); else M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4); return; } if ((flag & GVE_RXF_IPV6) != 0) { if ((flag & GVE_RXF_TCP) != 0) M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6); else if ((flag & GVE_RXF_UDP) != 0) M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6); else M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6); return; } } static void gve_mextadd_free(struct mbuf *mbuf) { vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1; vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2; /* * Free the page only if this is the last ref. * The interface might no longer exist by the time * this callback is called, see gve_free_qpl. */ if (__predict_false(vm_page_unwire_noq(page))) { pmap_qremove(va, 1); kva_free(va, PAGE_SIZE); vm_page_free(page); } } static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr) { const __be64 offset = htobe64(GVE_DEFAULT_RX_BUFFER_OFFSET); page_info->page_offset ^= GVE_DEFAULT_RX_BUFFER_OFFSET; *(slot_addr) ^= offset; } static struct mbuf * gve_rx_create_mbuf(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info, uint16_t len, union gve_rx_data_slot *data_slot, bool is_only_frag) { struct gve_rx_ctx *ctx = &rx->ctx; struct mbuf *mbuf; u_int ref_count; bool can_flip; uint32_t offset = page_info->page_offset + page_info->pad; void *va = (char *)page_info->page_address + offset; if (len <= priv->rx_copybreak && is_only_frag) { mbuf = m_get2(len, M_NOWAIT, MT_DATA, M_PKTHDR); if (__predict_false(mbuf == NULL)) return (NULL); m_copyback(mbuf, 0, len, va); counter_enter(); counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1); counter_exit(); ctx->mbuf_head = mbuf; ctx->mbuf_tail = mbuf; } else { struct mbuf *mbuf_tail = ctx->mbuf_tail; KASSERT(len <= MCLBYTES, ("gve rx fragment bigger than cluster mbuf")); /* * This page was created with VM_ALLOC_WIRED, thus the lowest * wire count experienced by the page until the interface is * destroyed is 1. * * We wire the page again before supplying an mbuf pointing to * it to the networking stack, so before the mbuf leaves the * driver, the wire count rises to 2. * * If it is 1 again, it necessarily means that the mbuf has been * consumed and it was gve_mextadd_free that brought down the wire * count back to 1. We only need to eventually observe the 1. */ ref_count = atomic_load_int(&page_info->page->ref_count); can_flip = VPRC_WIRE_COUNT(ref_count) == 1; if (mbuf_tail == NULL) { if (can_flip) mbuf = m_gethdr(M_NOWAIT, MT_DATA); else mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); ctx->mbuf_head = mbuf; ctx->mbuf_tail = mbuf; } else { if (can_flip) mbuf = m_get(M_NOWAIT, MT_DATA); else mbuf = m_getcl(M_NOWAIT, MT_DATA, 0); mbuf_tail->m_next = mbuf; ctx->mbuf_tail = mbuf; } if (__predict_false(mbuf == NULL)) return (NULL); if (can_flip) { MEXTADD(mbuf, va, len, gve_mextadd_free, page_info->page, page_info->page_address, 0, EXT_NET_DRV); counter_enter(); counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1); counter_exit(); /* * Grab an extra ref to the page so that gve_mextadd_free * does not end up freeing the page while the interface exists. 
			 */
			vm_page_wire(page_info->page);
			gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
		} else {
			m_copyback(mbuf, 0, len, va);
			counter_enter();
			counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1);
			counter_exit();
		}
	}

	mbuf->m_len = len;
	ctx->total_size += len;

	return (mbuf);
}

static inline bool
gve_needs_rss(__be16 flag)
{
	if ((flag & GVE_RXF_FRAG) != 0)
		return (false);

	if ((flag & (GVE_RXF_IPV4 | GVE_RXF_IPV6)) != 0)
		return (true);

	return (false);
}

static void
gve_rx(struct gve_priv *priv, struct gve_rx_ring *rx, struct gve_rx_desc *desc,
    uint32_t idx)
{
	struct gve_rx_slot_page_info *page_info;
	struct gve_dma_handle *page_dma_handle;
	union gve_rx_data_slot *data_slot;
	struct gve_rx_ctx *ctx = &rx->ctx;
	struct mbuf *mbuf = NULL;
	if_t ifp = priv->ifp;
	bool do_if_input;
	uint16_t len;

	bool is_first_frag = ctx->frag_cnt == 0;
	bool is_last_frag = !(GVE_RXF_PKT_CONT & desc->flags_seq);
	bool is_only_frag = is_first_frag && is_last_frag;

	if (__predict_false(ctx->drop_pkt))
		goto finish_frag;

	if ((desc->flags_seq & GVE_RXF_ERR) != 0) {
		ctx->drop_pkt = true;
		counter_enter();
		counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1);
		counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
		counter_exit();
		m_freem(ctx->mbuf_head);
		goto finish_frag;
	}

	page_info = &rx->page_info[idx];
	data_slot = &rx->data_ring[idx];
	page_dma_handle = &(rx->com.qpl->dmas[idx]);

	page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
	len = be16toh(desc->len) - page_info->pad;

	bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map,
	    BUS_DMASYNC_POSTREAD);

	mbuf = gve_rx_create_mbuf(priv, rx, page_info, len, data_slot,
	    is_only_frag);
	if (mbuf == NULL) {
		ctx->drop_pkt = true;
		counter_enter();
		counter_u64_add_protected(rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1);
		counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
		counter_exit();
		m_freem(ctx->mbuf_head);
		goto finish_frag;
	}

	if (is_first_frag) {
		mbuf->m_pkthdr.rcvif = priv->ifp;
+		ctx->is_tcp = desc->flags_seq & GVE_RXF_TCP;

		if (gve_needs_rss(desc->flags_seq)) {
			gve_set_rss_type(desc->flags_seq, mbuf);
			mbuf->m_pkthdr.flowid = be32toh(desc->rss_hash);
		}

		if ((desc->csum != 0) && ((desc->flags_seq & GVE_RXF_FRAG) == 0)) {
			mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED |
			    CSUM_IP_VALID |
			    CSUM_DATA_VALID |
			    CSUM_PSEUDO_HDR;
			mbuf->m_pkthdr.csum_data = 0xffff;
		}
	}

	if (is_last_frag) {
		mbuf = ctx->mbuf_head;
		mbuf->m_pkthdr.len = ctx->total_size;
		do_if_input = true;

		if (((if_getcapenable(priv->ifp) & IFCAP_LRO) != 0) &&	/* LRO is enabled */
-		    (desc->flags_seq & GVE_RXF_TCP) &&			/* pkt is a TCP pkt */
+		    (ctx->is_tcp) &&					/* pkt is a TCP pkt */
		    ((mbuf->m_pkthdr.csum_flags & CSUM_DATA_VALID) != 0) && /* NIC verified csum */
		    (rx->lro.lro_cnt != 0) &&				/* LRO resources exist */
		    (tcp_lro_rx(&rx->lro, mbuf, 0) == 0))
			do_if_input = false;

		if (do_if_input)
			if_input(ifp, mbuf);

		counter_enter();
		counter_u64_add_protected(rx->stats.rbytes, ctx->total_size);
		counter_u64_add_protected(rx->stats.rpackets, 1);
		counter_exit();
	}

finish_frag:
	ctx->frag_cnt++;
	if (is_last_frag)
		rx->ctx = (struct gve_rx_ctx){};
}

static bool
gve_rx_work_pending(struct gve_rx_ring *rx)
{
	struct gve_rx_desc *desc;
	__be16 flags_seq;
	uint32_t next_idx;

	next_idx = rx->cnt & rx->mask;
	desc = rx->desc_ring + next_idx;
	flags_seq = desc->flags_seq;

	return (GVE_SEQNO(flags_seq) == rx->seq_no);
}

static inline uint8_t
gve_next_seqno(uint8_t seq)
{
	return ((seq + 1) == 8 ?
1 : seq + 1); } static void gve_rx_cleanup(struct gve_priv *priv, struct gve_rx_ring *rx, int budget) { uint32_t idx = rx->cnt & rx->mask; struct gve_rx_desc *desc; struct gve_rx_ctx *ctx = &rx->ctx; uint32_t work_done = 0; NET_EPOCH_ASSERT(); bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, BUS_DMASYNC_POSTREAD); desc = &rx->desc_ring[idx]; while ((work_done < budget || ctx->frag_cnt) && (GVE_SEQNO(desc->flags_seq) == rx->seq_no)) { gve_rx(priv, rx, desc, idx); rx->cnt++; idx = rx->cnt & rx->mask; desc = &rx->desc_ring[idx]; rx->seq_no = gve_next_seqno(rx->seq_no); work_done++; } /* The device will only send whole packets. */ if (__predict_false(ctx->frag_cnt)) { m_freem(ctx->mbuf_head); rx->ctx = (struct gve_rx_ctx){}; device_printf(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset", GVE_SEQNO(desc->flags_seq), rx->seq_no); gve_schedule_reset(priv); } if (work_done != 0) tcp_lro_flush_all(&rx->lro); bus_dmamap_sync(rx->data_ring_mem.tag, rx->data_ring_mem.map, BUS_DMASYNC_PREWRITE); /* Buffers are refilled as the descs are processed */ rx->fill_cnt += work_done; gve_db_bar_write_4(priv, rx->com.db_offset, rx->fill_cnt); } void gve_rx_cleanup_tq(void *arg, int pending) { struct gve_rx_ring *rx = arg; struct gve_priv *priv = rx->com.priv; if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) return; gve_rx_cleanup(priv, rx, /*budget=*/128); gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_ACK | GVE_IRQ_EVENT); /* * Fragments received before this barrier MAY NOT cause the NIC to send an * interrupt but they will still be handled by the enqueue below. * Fragments received after the barrier WILL trigger an interrupt. */ mb(); if (gve_rx_work_pending(rx)) { gve_db_bar_write_4(priv, rx->com.irq_db_offset, GVE_IRQ_MASK); taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); } }
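
The functional change in this patch is small: gve_rx() now latches the descriptor's TCP flag into the per-packet context (ctx->is_tcp) while handling the first fragment, and the end-of-packet LRO check reads that cached value instead of re-reading flags_seq on the last fragment, whose descriptor may not repeat the bit. The following is a minimal standalone sketch of that "cache per-packet metadata on the first fragment" pattern; the flag bits, struct names, and rx_frag() helper are hypothetical stand-ins, not driver code.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define RXF_TCP      (1u << 0)   /* hypothetical flag bits */
#define RXF_PKT_CONT (1u << 1)   /* more fragments follow */

struct frag_desc {               /* stand-in for the device rx descriptor */
	uint16_t flags;
};

struct pkt_ctx {                 /* stand-in for struct gve_rx_ctx */
	bool is_tcp;
	int frag_cnt;
};

/* Consume one fragment; returns true when the completed packet is LRO-eligible. */
static bool
rx_frag(struct pkt_ctx *ctx, const struct frag_desc *desc)
{
	bool is_first = (ctx->frag_cnt == 0);
	bool is_last = (desc->flags & RXF_PKT_CONT) == 0;

	if (is_first)
		ctx->is_tcp = (desc->flags & RXF_TCP) != 0; /* cache it here */
	ctx->frag_cnt++;

	if (!is_last)
		return (false);
	/*
	 * Deciding off desc->flags at this point would miss the TCP bit when
	 * the last fragment's descriptor does not repeat it; the cached value
	 * reflects what the first fragment reported.
	 */
	return (ctx->is_tcp);
}

int
main(void)
{
	struct frag_desc frags[] = {
		{ RXF_TCP | RXF_PKT_CONT },  /* first frag: TCP bit set */
		{ RXF_PKT_CONT },            /* middle frag */
		{ 0 },                       /* last frag: no TCP bit */
	};
	struct pkt_ctx ctx = { 0 };
	bool lro_ok = false;

	for (int i = 0; i < 3; i++)
		lro_ok = rx_frag(&ctx, &frags[i]);

	printf("LRO eligible: %s\n", lro_ok ? "yes" : "no"); /* prints "yes" */
	return (0);
}

Compiled as a plain C program, the sketch reports the multi-fragment packet as LRO-eligible even though only the first descriptor carried the TCP bit, which is the behavior the patched driver restores.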
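
Separately, gve_rx_cleanup_tq() above shows a common interrupt re-arm ordering: drain up to a budget, write GVE_IRQ_ACK | GVE_IRQ_EVENT to re-arm, issue mb(), and only then re-check the ring, masking and rescheduling the task if descriptors are still pending. The sketch below illustrates that ordering with a toy ring and hypothetical helper names (toy_ring, irq_unmask, reschedule) rather than the driver's doorbell writes; it is an illustration of the pattern, not the driver's implementation.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_ring {                /* "work" is just a pending counter */
	atomic_int pending;
	bool masked;
	bool rescheduled;
};

static bool ring_has_work(struct toy_ring *r) { return atomic_load(&r->pending) > 0; }
static void irq_unmask(struct toy_ring *r) { r->masked = false; }     /* ~ GVE_IRQ_ACK | GVE_IRQ_EVENT */
static void irq_mask(struct toy_ring *r) { r->masked = true; }        /* ~ GVE_IRQ_MASK */
static void reschedule(struct toy_ring *r) { r->rescheduled = true; } /* ~ taskqueue_enqueue() */

static void
process_ring(struct toy_ring *r, int budget)
{
	while (budget-- > 0 && ring_has_work(r))
		atomic_fetch_sub(&r->pending, 1);
}

/* Mirrors the shape of gve_rx_cleanup_tq: drain, re-arm, fence, re-check. */
static void
rx_cleanup_task(struct toy_ring *r)
{
	process_ring(r, 128);

	irq_unmask(r);

	/*
	 * The driver issues mb() at this point: the re-arm must be visible
	 * before the ring is inspected again, so a descriptor that arrived
	 * in between is either seen below or raises a fresh interrupt.
	 */
	atomic_thread_fence(memory_order_seq_cst);

	if (ring_has_work(r)) {
		irq_mask(r);
		reschedule(r);
	}
}

int
main(void)
{
	struct toy_ring r = { .masked = true, .rescheduled = false };

	atomic_store(&r.pending, 200);
	rx_cleanup_task(&r);
	printf("left over: %d, rescheduled: %d\n",
	    atomic_load(&r.pending), r.rescheduled); /* 72 left, rescheduled: 1 */
	return (0);
}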