diff --git a/share/man/man4/gve.4 b/share/man/man4/gve.4 --- a/share/man/man4/gve.4 +++ b/share/man/man4/gve.4 @@ -1,6 +1,6 @@ .\" SPDX-License-Identifier: BSD-3-Clause .\" -.\" Copyright (c) 2023 Google LLC +.\" Copyright (c) 2023-2024 Google LLC .\" .\" Redistribution and use in source and binary forms, with or without modification, .\" are permitted provided that the following conditions are met: @@ -26,7 +26,7 @@ .\" ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT .\" (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS .\" SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -.Dd April 26, 2023 +.Dd October 14, 2024 .Dt GVE 4 .Os .Sh NAME @@ -192,16 +192,53 @@ .Pp Apart from these messages, the driver exposes per-queue packet and error counters as sysctl nodes. Global (across queues) counters can be read using -.Xr netstat 8 . +.Xr netstat 1 . +.Sh SYSCTL VARIABLES +.Nm +exposes the following +.Xr sysctl 8 +variables: +.Bl -tag -width indent +.It Va hw.gve.driver_version +The driver version. +This is read-only. +.It Va hw.gve.queue_format +The queue format in use. +This is read-only. +.It Va hw.gve.disable_hw_lro +Setting this boot-time tunable to 1 disables Large Receive Offload in the NIC. +The default value is 0, which means hardware LRO is enabled by default. +The software LRO stack in the kernel is always used. +This sysctl variable needs to be set before loading the driver with +.Xr loader.conf 5 . +.El .Sh LIMITATIONS .Nm does not support the transmission of VLAN-tagged packets. All VLAN-tagged traffic is dropped. +.Sh QUEUE FORMATS +.Nm +features different datapath modes called queue formats: +.Pp +.Bl -bullet -compact +.It +GQI_QPL: "QPL" stands for "Queue Page List" and refers to the fact that +hardware expects a fixed bounce buffer and cannot access arbitrary memory. +GQI is the older descriptor format. 
+.It +DQO_RDA: DQO is the descriptor format required to take full advantage of +next generation VM shapes. +"RDA" stands for "Raw DMA Addressing" and refers to the fact that hardware +can work with DMA-ed packets and does not expect them to be copied into or +out of a fixed bounce buffer. +.El .Sh SUPPORT Please email gvnic-drivers@google.com with the specifics of the issue encountered. .Sh SEE ALSO +.Xr netstat 1 , +.Xr loader.conf 5 , .Xr ifconfig 8 , -.Xr netstat 8 +.Xr sysctl 8 .Sh HISTORY The .Nm diff --git a/sys/conf/files b/sys/conf/files --- a/sys/conf/files +++ b/sys/conf/files @@ -1732,8 +1732,10 @@ dev/gve/gve_main.c optional gve dev/gve/gve_qpl.c optional gve dev/gve/gve_rx.c optional gve +dev/gve/gve_rx_dqo.c optional gve dev/gve/gve_sysctl.c optional gve dev/gve/gve_tx.c optional gve +dev/gve/gve_tx_dqo.c optional gve dev/gve/gve_utils.c optional gve dev/goldfish/goldfish_rtc.c optional goldfish_rtc fdt dev/gpio/acpi_gpiobus.c optional acpi gpio diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h --- a/sys/dev/gve/gve.h +++ b/sys/dev/gve/gve.h @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -53,6 +53,9 @@ /* Each RX bounce buffer page can fit two packet buffers. */ #define GVE_DEFAULT_RX_BUFFER_OFFSET (PAGE_SIZE / 2) +/* PTYPEs are always 10 bits. */ +#define GVE_NUM_PTYPES 1024 + /* * Number of descriptors per queue page list. 
* Page count AKA QPL size can be derived by dividing the number of elements in @@ -224,30 +227,61 @@ counter_u64_t rx_frag_copy_cnt; counter_u64_t rx_dropped_pkt_desc_err; counter_u64_t rx_dropped_pkt_mbuf_alloc_fail; + counter_u64_t rx_mbuf_dmamap_err; + counter_u64_t rx_mbuf_mclget_null; }; #define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t)) +struct gve_rx_buf_dqo { + struct mbuf *mbuf; + bus_dmamap_t dmamap; + uint64_t addr; + bool mapped; + SLIST_ENTRY(gve_rx_buf_dqo) slist_entry; +}; + /* power-of-2 sized receive ring */ struct gve_rx_ring { struct gve_ring_com com; struct gve_dma_handle desc_ring_mem; - struct gve_dma_handle data_ring_mem; - - /* accessed in the receive hot path */ - struct { - struct gve_rx_desc *desc_ring; - union gve_rx_data_slot *data_ring; - struct gve_rx_slot_page_info *page_info; - - struct gve_rx_ctx ctx; - struct lro_ctrl lro; - uint8_t seq_no; /* helps traverse the descriptor ring */ - uint32_t cnt; /* free-running total number of completed packets */ - uint32_t fill_cnt; /* free-running total number of descs and buffs posted */ - uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */ - struct gve_rxq_stats stats; - } __aligned(CACHE_LINE_SIZE); + uint32_t cnt; /* free-running total number of completed packets */ + uint32_t fill_cnt; /* free-running total number of descs and buffs posted */ + + union { + /* GQI-only fields */ + struct { + struct gve_dma_handle data_ring_mem; + + /* accessed in the GQ receive hot path */ + struct gve_rx_desc *desc_ring; + union gve_rx_data_slot *data_ring; + struct gve_rx_slot_page_info *page_info; + uint32_t mask; /* masks the cnt and fill_cnt to the size of the ring */ + uint8_t seq_no; /* helps traverse the descriptor ring */ + }; + + /* DQO-only fields */ + struct { + struct gve_dma_handle compl_ring_mem; + + struct gve_rx_compl_desc_dqo *compl_ring; + struct gve_rx_desc_dqo *desc_ring; + struct gve_rx_buf_dqo *bufs; /* Parking place for posted buffers */ 
+ bus_dma_tag_t buf_dmatag; /* To dmamap posted mbufs with */ + + uint32_t buf_cnt; /* Size of the bufs array */ + uint32_t mask; /* One less than the sizes of the desc and compl rings */ + uint32_t head; /* The index at which to post the next buffer at */ + uint32_t tail; /* The index at which to receive the next compl at */ + uint8_t cur_gen_bit; /* Gets flipped on every cycle of the compl ring */ + SLIST_HEAD(, gve_rx_buf_dqo) free_bufs; + } dqo; + }; + + struct lro_ctrl lro; + struct gve_rx_ctx ctx; + struct gve_rxq_stats stats; } __aligned(CACHE_LINE_SIZE); @@ -277,11 +311,26 @@ counter_u64_t tx_dropped_pkt; counter_u64_t tx_dropped_pkt_nospace_device; counter_u64_t tx_dropped_pkt_nospace_bufring; + counter_u64_t tx_delayed_pkt_nospace_descring; + counter_u64_t tx_delayed_pkt_nospace_compring; + counter_u64_t tx_delayed_pkt_tsoerr; counter_u64_t tx_dropped_pkt_vlan; + counter_u64_t tx_mbuf_collapse; + counter_u64_t tx_mbuf_defrag; + counter_u64_t tx_mbuf_defrag_err; + counter_u64_t tx_mbuf_dmamap_enomem_err; + counter_u64_t tx_mbuf_dmamap_err; }; #define NUM_TX_STATS (sizeof(struct gve_txq_stats) / sizeof(counter_u64_t)) +struct gve_tx_pending_pkt_dqo { + struct mbuf *mbuf; + bus_dmamap_t dmamap; + uint8_t state; /* the gve_packet_state enum */ + int next; /* To chain the free_pending_pkts lists */ +}; + /* power-of-2 sized transmit ring */ struct gve_tx_ring { struct gve_ring_com com; @@ -290,22 +339,95 @@ struct task xmit_task; struct taskqueue *xmit_tq; - /* accessed in the transmit hot path */ - struct { - union gve_tx_desc *desc_ring; - struct gve_tx_buffer_state *info; - struct buf_ring *br; + /* Accessed when writing descriptors */ + struct buf_ring *br; + struct mtx ring_mtx; + + uint32_t req; /* free-running total number of packets written to the nic */ + uint32_t done; /* free-running total number of completed packets */ + + union { + /* GQI specific stuff */ + struct { + union gve_tx_desc *desc_ring; + struct gve_tx_buffer_state *info; + + struct 
gve_tx_fifo fifo; + + uint32_t mask; /* masks the req and done to the size of the ring */ + }; + + /* DQO specific stuff */ + struct { + struct gve_dma_handle compl_ring_mem; + + /* Accessed when writing descriptors */ + struct { + union gve_tx_desc_dqo *desc_ring; + uint32_t desc_mask; /* masks head and tail to the size of desc_ring */ + uint32_t desc_head; /* last desc read by NIC, cached value of hw_tx_head */ + uint32_t desc_tail; /* last desc written by driver */ + uint32_t last_re_idx; /* desc which last had "report event" set */ + + /* + * The head index of a singly linked list containing pending packet objects + * to park mbufs till the NIC sends completions. Once this list is depleted, + * the "_prd" suffixed producer list, grown by the completion taskqueue, + * is stolen. + */ + int32_t free_pending_pkts_csm; + + bus_dma_tag_t buf_dmatag; /* DMA params for mapping Tx mbufs */ + } __aligned(CACHE_LINE_SIZE); + + /* Accessed when processing completions */ + struct { + struct gve_tx_compl_desc_dqo *compl_ring; + uint32_t compl_mask; /* masks head to the size of compl_ring */ + uint32_t compl_head; /* last completion read by driver */ + uint8_t cur_gen_bit; /* NIC flips a bit on every pass */ + volatile uint32_t hw_tx_head; /* last desc read by NIC */ + + /* + * The completion taskqueue moves pending-packet objects to this + * list after freeing the mbuf. The "_prd" denotes that this is + * a producer list. The trasnmit taskqueue steals this list once + * its consumer list, with the "_csm" suffix, is depleted. 
+ */ + volatile int32_t free_pending_pkts_prd; + } __aligned(CACHE_LINE_SIZE); + + /* Accessed by both the completion and xmit loops */ + struct { + /* completion tags index into this array */ + struct gve_tx_pending_pkt_dqo *pending_pkts; + uint16_t num_pending_pkts; + } __aligned(CACHE_LINE_SIZE); + } dqo; + }; + struct gve_txq_stats stats; +} __aligned(CACHE_LINE_SIZE); - struct gve_tx_fifo fifo; - struct mtx ring_mtx; +enum gve_packet_state { + /* + * Packet does not yet have a dmamap created. + * This should always be zero since state is not explicitly initialized. + */ + GVE_PACKET_STATE_UNALLOCATED, + /* Packet has a dmamap and is in free list, available to be allocated. */ + GVE_PACKET_STATE_FREE, + /* Packet is expecting a regular data completion */ + GVE_PACKET_STATE_PENDING_DATA_COMPL, +}; - uint32_t req; /* free-running total number of packets written to the nic */ - uint32_t done; /* free-running total number of completed packets */ - uint32_t mask; /* masks the req and done to the size of the ring */ - struct gve_txq_stats stats; - } __aligned(CACHE_LINE_SIZE); +struct gve_ptype { + uint8_t l3_type; /* `gve_l3_type` in gve_adminq.h */ + uint8_t l4_type; /* `gve_l4_type` in gve_adminq.h */ +}; -} __aligned(CACHE_LINE_SIZE); +struct gve_ptype_lut { + struct gve_ptype ptypes[GVE_NUM_PTYPES]; +}; struct gve_priv { if_t ifp; @@ -348,6 +470,8 @@ struct gve_tx_ring *tx; struct gve_rx_ring *rx; + struct gve_ptype_lut *ptype_lut_dqo; + /* * Admin queue - see gve_adminq.h * Since AQ cmds do not run in steady state, 32 bit counters suffice @@ -370,6 +494,7 @@ uint32_t adminq_dcfg_device_resources_cnt; uint32_t adminq_set_driver_parameter_cnt; uint32_t adminq_verify_driver_compatibility_cnt; + uint32_t adminq_get_ptype_map_cnt; uint32_t interface_up_cnt; uint32_t interface_down_cnt; @@ -400,6 +525,12 @@ BIT_CLR_ATOMIC(GVE_NUM_STATE_FLAGS, pos, &priv->state_flags); } +static inline bool +gve_is_gqi(struct gve_priv *priv) +{ + return (priv->queue_format == 
GVE_GQI_QPL_FORMAT); +} + /* Defined in gve_main.c */ void gve_schedule_reset(struct gve_priv *priv); @@ -407,6 +538,7 @@ uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset); void gve_reg_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); void gve_db_bar_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); +void gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val); /* QPL (Queue Page List) functions defined in gve_qpl.c */ int gve_alloc_qpls(struct gve_priv *priv); @@ -425,6 +557,14 @@ void gve_xmit_tq(void *arg, int pending); void gve_tx_cleanup_tq(void *arg, int pending); +/* TX functions defined in gve_tx_dqo.c */ +int gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i); +void gve_tx_free_ring_dqo(struct gve_priv *priv, int i); +void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i); +int gve_tx_intr_dqo(void *arg); +int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr); +void gve_tx_cleanup_tq_dqo(void *arg, int pending); + /* RX functions defined in gve_rx.c */ int gve_alloc_rx_rings(struct gve_priv *priv); void gve_free_rx_rings(struct gve_priv *priv); @@ -433,6 +573,14 @@ int gve_rx_intr(void *arg); void gve_rx_cleanup_tq(void *arg, int pending); +/* RX functions defined in gve_rx_dqo.c */ +int gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i); +void gve_rx_free_ring_dqo(struct gve_priv *priv, int i); +void gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx); +void gve_clear_rx_ring_dqo(struct gve_priv *priv, int i); +int gve_rx_intr_dqo(void *arg); +void gve_rx_cleanup_tq_dqo(void *arg, int pending); + /* DMA functions defined in gve_utils.c */ int gve_dma_alloc_coherent(struct gve_priv *priv, int size, int align, struct gve_dma_handle *dma); @@ -447,7 +595,10 @@ void gve_unmask_all_queue_irqs(struct gve_priv *priv); void gve_mask_all_queue_irqs(struct gve_priv *priv); -/* Systcl functions defined in gve_sysctl.c*/ +/* Systcl functions defined in gve_sysctl.c */ +extern 
bool gve_disable_hw_lro; +extern char gve_queue_format[8]; +extern char gve_version[8]; void gve_setup_sysctl(struct gve_priv *priv); void gve_accum_stats(struct gve_priv *priv, uint64_t *rpackets, uint64_t *rbytes, uint64_t *rx_dropped_pkt, uint64_t *tpackets, diff --git a/sys/dev/gve/gve_adminq.h b/sys/dev/gve/gve_adminq.h --- a/sys/dev/gve/gve_adminq.h +++ b/sys/dev/gve/gve_adminq.h @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -137,9 +137,11 @@ struct gve_device_option_dqo_rda { __be32 supported_features_mask; + __be16 tx_comp_ring_entries; + __be16 rx_buff_ring_entries; }; -_Static_assert(sizeof(struct gve_device_option_dqo_rda) == 4, +_Static_assert(sizeof(struct gve_device_option_dqo_rda) == 8, "gve: bad admin queue struct length"); struct gve_device_option_modify_ring { @@ -196,7 +198,6 @@ gve_driver_capability_gqi_rda = 1, gve_driver_capability_dqo_qpl = 2, /* reserved for future use */ gve_driver_capability_dqo_rda = 3, - gve_driver_capability_alt_miss_compl = 4, }; #define GVE_CAP1(a) BIT((int) a) @@ -209,7 +210,9 @@ * Only a few bits (as shown in `gve_driver_compatibility`) are currently * defined. The rest are reserved for future use. 
*/ -#define GVE_DRIVER_CAPABILITY_FLAGS1 (GVE_CAP1(gve_driver_capability_gqi_qpl)) +#define GVE_DRIVER_CAPABILITY_FLAGS1 \ + (GVE_CAP1(gve_driver_capability_gqi_qpl) | \ + GVE_CAP1(gve_driver_capability_dqo_rda)) #define GVE_DRIVER_CAPABILITY_FLAGS2 0x0 #define GVE_DRIVER_CAPABILITY_FLAGS3 0x0 #define GVE_DRIVER_CAPABILITY_FLAGS4 0x0 @@ -282,6 +285,8 @@ _Static_assert(sizeof(struct gve_adminq_create_tx_queue) == 48, "gve: bad admin queue struct length"); +#define GVE_RAW_ADDRESSING_QPL_ID 0xFFFFFFFF + struct gve_adminq_create_rx_queue { __be32 queue_id; __be32 index; @@ -352,6 +357,23 @@ _Static_assert(sizeof(struct stats) == 16, "gve: bad admin queue struct length"); +/* These are control path types for PTYPE which are the same as the data path + * types. + */ +struct gve_ptype_entry { + uint8_t l3_type; + uint8_t l4_type; +}; + +struct gve_ptype_map { + struct gve_ptype_entry ptypes[1 << 10]; /* PTYPES are always 10 bits. */ +}; + +struct gve_adminq_get_ptype_map { + __be64 ptype_map_len; + __be64 ptype_map_addr; +}; + struct gve_adminq_command { __be32 opcode; __be32 status; @@ -368,6 +390,7 @@ struct gve_adminq_set_driver_parameter set_driver_param; struct gve_adminq_verify_driver_compatibility verify_driver_compatibility; + struct gve_adminq_get_ptype_map get_ptype_map; uint8_t reserved[56]; }; }; @@ -375,6 +398,24 @@ _Static_assert(sizeof(struct gve_adminq_command) == 64, "gve: bad admin queue struct length"); +enum gve_l3_type { + /* Must be zero so zero initialized LUT is unknown. */ + GVE_L3_TYPE_UNKNOWN = 0, + GVE_L3_TYPE_OTHER, + GVE_L3_TYPE_IPV4, + GVE_L3_TYPE_IPV6, +}; + +enum gve_l4_type { + /* Must be zero so zero initialized LUT is unknown. 
*/ + GVE_L4_TYPE_UNKNOWN = 0, + GVE_L4_TYPE_OTHER, + GVE_L4_TYPE_TCP, + GVE_L4_TYPE_UDP, + GVE_L4_TYPE_ICMP, + GVE_L4_TYPE_SCTP, +}; + int gve_adminq_create_rx_queues(struct gve_priv *priv, uint32_t num_queues); int gve_adminq_create_tx_queues(struct gve_priv *priv, uint32_t num_queues); int gve_adminq_destroy_tx_queues(struct gve_priv *priv, uint32_t num_queues); @@ -387,8 +428,10 @@ int gve_adminq_deconfigure_device_resources(struct gve_priv *priv); void gve_release_adminq(struct gve_priv *priv); int gve_adminq_register_page_list(struct gve_priv *priv, - struct gve_queue_page_list *qpl); + struct gve_queue_page_list *qpl); int gve_adminq_unregister_page_list(struct gve_priv *priv, uint32_t page_list_id); int gve_adminq_verify_driver_compatibility(struct gve_priv *priv, - uint64_t driver_info_len, vm_paddr_t driver_info_addr); + uint64_t driver_info_len, vm_paddr_t driver_info_addr); +int gve_adminq_get_ptype_map_dqo(struct gve_priv *priv, + struct gve_ptype_lut *ptype_lut); #endif /* _GVE_AQ_H_ */ diff --git a/sys/dev/gve/gve_adminq.c b/sys/dev/gve/gve_adminq.c --- a/sys/dev/gve/gve_adminq.c +++ b/sys/dev/gve/gve_adminq.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -57,6 +57,7 @@ struct gve_device_descriptor *device_descriptor, struct gve_device_option *option, struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, + struct gve_device_option_dqo_rda **dev_op_dqo_rda, struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) { uint32_t req_feat_mask = be32toh(option->required_features_mask); @@ -85,6 +86,23 @@ *dev_op_gqi_qpl = (void *)(option + 1); break; + case GVE_DEV_OPT_ID_DQO_RDA: + if (option_length < sizeof(**dev_op_dqo_rda) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA) { + device_printf(priv->dev, 
GVE_DEVICE_OPTION_ERROR_FMT, + "DQO RDA", (int)sizeof(**dev_op_dqo_rda), + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_dqo_rda)) { + device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT, + "DQO RDA"); + } + *dev_op_dqo_rda = (void *)(option + 1); + break; + case GVE_DEV_OPT_ID_JUMBO_FRAMES: if (option_length < sizeof(**dev_op_jumbo_frames) || req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES) { @@ -117,6 +135,7 @@ gve_process_device_options(struct gve_priv *priv, struct gve_device_descriptor *descriptor, struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, + struct gve_device_option_dqo_rda **dev_op_dqo_rda, struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) { char *desc_end = (char *)descriptor + be16toh(descriptor->total_length); @@ -130,12 +149,12 @@ if ((char *)(dev_opt + 1) > desc_end || (char *)(dev_opt + 1) + be16toh(dev_opt->option_length) > desc_end) { device_printf(priv->dev, - "options exceed device_descriptor's total length.\n"); + "options exceed device descriptor's total length.\n"); return (EINVAL); } gve_parse_device_option(priv, descriptor, dev_opt, - dev_op_gqi_qpl, dev_op_jumbo_frames); + dev_op_gqi_qpl, dev_op_dqo_rda, dev_op_jumbo_frames); dev_opt = (void *)((char *)(dev_opt + 1) + be16toh(dev_opt->option_length)); } @@ -221,16 +240,35 @@ cmd.opcode = htobe32(GVE_ADMINQ_CREATE_RX_QUEUE); cmd.create_rx_queue = (struct gve_adminq_create_rx_queue) { .queue_id = htobe32(queue_index), - .index = htobe32(queue_index), .ntfy_id = htobe32(rx->com.ntfy_id), .queue_resources_addr = htobe64(qres_dma->bus_addr), - .rx_desc_ring_addr = htobe64(rx->desc_ring_mem.bus_addr), - .rx_data_ring_addr = htobe64(rx->data_ring_mem.bus_addr), - .queue_page_list_id = htobe32((rx->com.qpl)->id), .rx_ring_size = htobe16(priv->rx_desc_cnt), .packet_buffer_size = htobe16(GVE_DEFAULT_RX_BUFFER_SIZE), }; + if (gve_is_gqi(priv)) { + cmd.create_rx_queue.rx_desc_ring_addr = + 
htobe64(rx->desc_ring_mem.bus_addr); + cmd.create_rx_queue.rx_data_ring_addr = + htobe64(rx->data_ring_mem.bus_addr); + cmd.create_rx_queue.index = + htobe32(queue_index); + cmd.create_rx_queue.queue_page_list_id = + htobe32((rx->com.qpl)->id); + } else { + cmd.create_rx_queue.queue_page_list_id = + htobe32(GVE_RAW_ADDRESSING_QPL_ID); + cmd.create_rx_queue.rx_desc_ring_addr = + htobe64(rx->dqo.compl_ring_mem.bus_addr); + cmd.create_rx_queue.rx_data_ring_addr = + htobe64(rx->desc_ring_mem.bus_addr); + cmd.create_rx_queue.rx_buff_ring_size = + htobe16(priv->rx_desc_cnt); + cmd.create_rx_queue.enable_rsc = + !!((if_getcapenable(priv->ifp) & IFCAP_LRO) && + !gve_disable_hw_lro); + } + return (gve_adminq_execute_cmd(priv, &cmd)); } @@ -272,11 +310,21 @@ .queue_id = htobe32(queue_index), .queue_resources_addr = htobe64(qres_dma->bus_addr), .tx_ring_addr = htobe64(tx->desc_ring_mem.bus_addr), - .queue_page_list_id = htobe32((tx->com.qpl)->id), .ntfy_id = htobe32(tx->com.ntfy_id), .tx_ring_size = htobe16(priv->tx_desc_cnt), }; + if (gve_is_gqi(priv)) { + cmd.create_tx_queue.queue_page_list_id = + htobe32((tx->com.qpl)->id); + } else { + cmd.create_tx_queue.queue_page_list_id = + htobe32(GVE_RAW_ADDRESSING_QPL_ID); + cmd.create_tx_queue.tx_comp_ring_addr = + htobe64(tx->dqo.compl_ring_mem.bus_addr); + cmd.create_tx_queue.tx_comp_ring_size = + htobe16(priv->tx_desc_cnt); + } return (gve_adminq_execute_cmd(priv, &cmd)); } @@ -338,6 +386,7 @@ struct gve_device_descriptor *desc; struct gve_dma_handle desc_mem; struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL; + struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL; struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL; uint32_t supported_features_mask = 0; int rc; @@ -366,12 +415,24 @@ bus_dmamap_sync(desc_mem.tag, desc_mem.map, BUS_DMASYNC_POSTREAD); - rc = gve_process_device_options(priv, desc, &dev_op_gqi_qpl, + rc = gve_process_device_options(priv, desc, + &dev_op_gqi_qpl, &dev_op_dqo_rda, 
&dev_op_jumbo_frames); if (rc != 0) goto free_device_descriptor; - if (dev_op_gqi_qpl != NULL) { + if (dev_op_dqo_rda != NULL) { + snprintf(gve_queue_format, sizeof(gve_queue_format), + "%s", "DQO RDA"); + priv->queue_format = GVE_DQO_RDA_FORMAT; + supported_features_mask = be32toh( + dev_op_dqo_rda->supported_features_mask); + if (bootverbose) + device_printf(priv->dev, + "Driver is running with DQO RDA queue format.\n"); + } else if (dev_op_gqi_qpl != NULL) { + snprintf(gve_queue_format, sizeof(gve_queue_format), + "%s", "GQI QPL"); priv->queue_format = GVE_GQI_QPL_FORMAT; supported_features_mask = be32toh( dev_op_gqi_qpl->supported_features_mask); @@ -380,7 +441,7 @@ "Driver is running with GQI QPL queue format.\n"); } else { device_printf(priv->dev, "No compatible queue formats\n"); - rc = (EINVAL); + rc = EINVAL; goto free_device_descriptor; } @@ -506,6 +567,41 @@ return (gve_adminq_execute_cmd(priv, &aq_cmd)); } +int +gve_adminq_get_ptype_map_dqo(struct gve_priv *priv, + struct gve_ptype_lut *ptype_lut_dqo) +{ + struct gve_adminq_command aq_cmd = (struct gve_adminq_command){}; + struct gve_ptype_map *ptype_map; + struct gve_dma_handle dma; + int err = 0; + int i; + + err = gve_dma_alloc_coherent(priv, sizeof(*ptype_map), PAGE_SIZE, &dma); + if (err) + return (err); + ptype_map = dma.cpu_addr; + + aq_cmd.opcode = htobe32(GVE_ADMINQ_GET_PTYPE_MAP); + aq_cmd.get_ptype_map = (struct gve_adminq_get_ptype_map) { + .ptype_map_len = htobe64(sizeof(*ptype_map)), + .ptype_map_addr = htobe64(dma.bus_addr), + }; + + err = gve_adminq_execute_cmd(priv, &aq_cmd); + if (err) + goto err; + + /* Populate ptype_lut_dqo. 
*/ + for (i = 0; i < GVE_NUM_PTYPES; i++) { + ptype_lut_dqo->ptypes[i].l3_type = ptype_map->ptypes[i].l3_type; + ptype_lut_dqo->ptypes[i].l4_type = ptype_map->ptypes[i].l4_type; + } +err: + gve_dma_free_coherent(&dma); + return (err); +} + int gve_adminq_alloc(struct gve_priv *priv) { @@ -543,6 +639,7 @@ priv->adminq_destroy_rx_queue_cnt = 0; priv->adminq_dcfg_device_resources_cnt = 0; priv->adminq_set_driver_parameter_cnt = 0; + priv->adminq_get_ptype_map_cnt = 0; gve_reg_bar_write_4(priv, GVE_REG_ADMINQ_ADDR, priv->adminq_bus_addr / ADMINQ_SIZE); @@ -772,6 +869,10 @@ priv->adminq_verify_driver_compatibility_cnt++; break; + case GVE_ADMINQ_GET_PTYPE_MAP: + priv->adminq_get_ptype_map_cnt++; + break; + default: device_printf(priv->dev, "Unknown AQ command opcode %d\n", opcode); } diff --git a/sys/dev/gve/gve_dqo.h b/sys/dev/gve/gve_dqo.h new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_dqo.h @@ -0,0 +1,306 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2024 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* GVE DQO Descriptor formats */ + +#ifndef _GVE_DESC_DQO_H_ +#define _GVE_DESC_DQO_H_ + +#include "gve_plat.h" + +#define GVE_ITR_ENABLE_BIT_DQO BIT(0) +#define GVE_ITR_NO_UPDATE_DQO (3 << 3) +#define GVE_ITR_INTERVAL_DQO_SHIFT 5 +#define GVE_ITR_INTERVAL_DQO_MASK ((1 << 12) - 1) +#define GVE_TX_IRQ_RATELIMIT_US_DQO 50 +#define GVE_RX_IRQ_RATELIMIT_US_DQO 20 + +#define GVE_TX_MAX_HDR_SIZE_DQO 255 +#define GVE_TX_MIN_TSO_MSS_DQO 88 + +/* + * Ringing the doorbell too often can hurt performance. + * + * HW requires this value to be at least 8. + */ +#define GVE_RX_BUF_THRESH_DQO 32 + +/* + * Start dropping RX fragments if at least these many + * buffers cannot be posted to the NIC. + */ +#define GVE_RX_DQO_MIN_PENDING_BUFS 32 + +/* Basic TX descriptor (DTYPE 0x0C) */ +struct gve_tx_pkt_desc_dqo { + __le64 buf_addr; + + /* Must be GVE_TX_PKT_DESC_DTYPE_DQO (0xc) */ + uint8_t dtype:5; + + /* Denotes the last descriptor of a packet. */ + uint8_t end_of_packet:1; + uint8_t checksum_offload_enable:1; + + /* If set, will generate a descriptor completion for this descriptor. */ + uint8_t report_event:1; + uint8_t reserved0; + __le16 reserved1; + + /* The TX completion for this packet will contain this tag. 
*/ + __le16 compl_tag; + uint16_t buf_size:14; + uint16_t reserved2:2; +} __packed; +_Static_assert(sizeof(struct gve_tx_pkt_desc_dqo) == 16, + "gve: bad dqo desc struct length"); + +#define GVE_TX_PKT_DESC_DTYPE_DQO 0xc + +/* + * Maximum number of data descriptors allowed per packet, or per-TSO segment. + */ +#define GVE_TX_MAX_DATA_DESCS_DQO 10 +#define GVE_TX_MAX_BUF_SIZE_DQO ((16 * 1024) - 1) +#define GVE_TSO_MAXSIZE_DQO IP_MAXPACKET + +_Static_assert(GVE_TX_MAX_BUF_SIZE_DQO * GVE_TX_MAX_DATA_DESCS_DQO >= + GVE_TSO_MAXSIZE_DQO, + "gve: bad tso parameters"); + +/* + * "report_event" on TX packet descriptors may only be reported on the last + * descriptor of a TX packet, and they must be spaced apart with at least this + * value. + */ +#define GVE_TX_MIN_RE_INTERVAL 32 + +struct gve_tx_context_cmd_dtype { + uint8_t dtype:5; + uint8_t tso:1; + uint8_t reserved1:2; + uint8_t reserved2; +}; + +_Static_assert(sizeof(struct gve_tx_context_cmd_dtype) == 2, + "gve: bad dqo desc struct length"); + +/* + * TX Native TSO Context DTYPE (0x05) + * + * "flex" fields allow the driver to send additional packet context to HW. + */ +struct gve_tx_tso_context_desc_dqo { + /* The L4 payload bytes that should be segmented. */ + uint32_t tso_total_len:24; + uint32_t flex10:8; + + /* Max segment size in TSO excluding headers. */ + uint16_t mss:14; + uint16_t reserved:2; + + uint8_t header_len; /* Header length to use for TSO offload */ + uint8_t flex11; + struct gve_tx_context_cmd_dtype cmd_dtype; + uint8_t flex0; + uint8_t flex5; + uint8_t flex6; + uint8_t flex7; + uint8_t flex8; + uint8_t flex9; +} __packed; +_Static_assert(sizeof(struct gve_tx_tso_context_desc_dqo) == 16, + "gve: bad dqo desc struct length"); + +#define GVE_TX_TSO_CTX_DESC_DTYPE_DQO 0x5 + +/* General context descriptor for sending metadata. 
*/ +struct gve_tx_general_context_desc_dqo { + uint8_t flex4; + uint8_t flex5; + uint8_t flex6; + uint8_t flex7; + uint8_t flex8; + uint8_t flex9; + uint8_t flex10; + uint8_t flex11; + struct gve_tx_context_cmd_dtype cmd_dtype; + uint16_t reserved; + uint8_t flex0; + uint8_t flex1; + uint8_t flex2; + uint8_t flex3; +} __packed; +_Static_assert(sizeof(struct gve_tx_general_context_desc_dqo) == 16, + "gve: bad dqo desc struct length"); + +#define GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO 0x4 + +/* + * Logical structure of metadata which is packed into context descriptor flex + * fields. + */ +struct gve_tx_metadata_dqo { + union { + struct { + uint8_t version; + + /* + * A zero value means no l4_hash was associated with the + * mbuf. + */ + uint16_t path_hash:15; + + /* + * Should be set to 1 if the flow associated with the + * mbuf had a rehash from the TCP stack. + */ + uint16_t rehash_event:1; + } __packed; + uint8_t bytes[12]; + }; +} __packed; +_Static_assert(sizeof(struct gve_tx_metadata_dqo) == 12, + "gve: bad dqo desc struct length"); + +#define GVE_TX_METADATA_VERSION_DQO 0 + +/* TX completion descriptor */ +struct gve_tx_compl_desc_dqo { + /* For types 0-4 this is the TX queue ID associated with this + * completion. + */ + uint16_t id:11; + + /* See: GVE_COMPL_TYPE_DQO* */ + uint16_t type:3; + uint16_t reserved0:1; + + /* Flipped by HW to notify the descriptor is populated. */ + uint16_t generation:1; + union { + /* For descriptor completions, this is the last index fetched + * by HW + 1. + */ + __le16 tx_head; + + /* For packet completions, this is the completion tag set on the + * TX packet descriptors. 
+ */ + __le16 completion_tag; + }; + __le32 reserved1; +} __packed; +_Static_assert(sizeof(struct gve_tx_compl_desc_dqo) == 8, + "gve: bad dqo desc struct length"); + +union gve_tx_desc_dqo { + struct gve_tx_pkt_desc_dqo pkt; + struct gve_tx_tso_context_desc_dqo tso_ctx; + struct gve_tx_general_context_desc_dqo general_ctx; +}; + +#define GVE_COMPL_TYPE_DQO_PKT 0x2 /* Packet completion */ +#define GVE_COMPL_TYPE_DQO_DESC 0x4 /* Descriptor completion */ + +/* Descriptor to post buffers to HW on buffer queue. */ +struct gve_rx_desc_dqo { + __le16 buf_id; /* ID returned in Rx completion descriptor */ + __le16 reserved0; + __le32 reserved1; + __le64 buf_addr; /* DMA address of the buffer */ + __le64 header_buf_addr; + __le64 reserved2; +} __packed; +_Static_assert(sizeof(struct gve_rx_desc_dqo) == 32, + "gve: bad dqo desc struct length"); + +/* Descriptor for HW to notify SW of new packets received on RX queue. */ +struct gve_rx_compl_desc_dqo { + /* Must be 1 */ + uint8_t rxdid:4; + uint8_t reserved0:4; + + /* Packet originated from this system rather than the network. */ + uint8_t loopback:1; + /* Set when IPv6 packet contains a destination options header or routing + * header. + */ + uint8_t ipv6_ex_add:1; + /* Invalid packet was received. */ + uint8_t rx_error:1; + uint8_t reserved1:5; + + uint16_t packet_type:10; + uint16_t ip_hdr_err:1; + uint16_t udp_len_err:1; + uint16_t raw_cs_invalid:1; + uint16_t reserved2:3; + + uint16_t packet_len:14; + /* Flipped by HW to notify the descriptor is populated. */ + uint16_t generation:1; + /* Should be zero. 
*/ + uint16_t buffer_queue_id:1; + + uint16_t header_len:10; + uint16_t rsc:1; + uint16_t split_header:1; + uint16_t reserved3:4; + + uint8_t descriptor_done:1; + uint8_t end_of_packet:1; + uint8_t header_buffer_overflow:1; + uint8_t l3_l4_processed:1; + uint8_t csum_ip_err:1; + uint8_t csum_l4_err:1; + uint8_t csum_external_ip_err:1; + uint8_t csum_external_udp_err:1; + + uint8_t status_error1; + + __le16 reserved5; + __le16 buf_id; /* Buffer ID which was sent on the buffer queue. */ + + union { + /* Packet checksum. */ + __le16 raw_cs; + /* Segment length for RSC packets. */ + __le16 rsc_seg_len; + }; + __le32 hash; + __le32 reserved6; + __le64 reserved7; +} __packed; + +_Static_assert(sizeof(struct gve_rx_compl_desc_dqo) == 32, + "gve: bad dqo desc struct length"); +#endif /* _GVE_DESC_DQO_H_ */ diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c --- a/sys/dev/gve/gve_main.c +++ b/sys/dev/gve/gve_main.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -30,11 +30,12 @@ */ #include "gve.h" #include "gve_adminq.h" +#include "gve_dqo.h" -#define GVE_DRIVER_VERSION "GVE-FBSD-1.0.1\n" +#define GVE_DRIVER_VERSION "GVE-FBSD-1.2.0\n" #define GVE_VERSION_MAJOR 1 -#define GVE_VERSION_MINOR 0 -#define GVE_VERSION_SUB 1 +#define GVE_VERSION_MINOR 2 +#define GVE_VERSION_SUB 0 #define GVE_DEFAULT_RX_COPYBREAK 256 @@ -124,9 +125,11 @@ if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); - err = gve_register_qpls(priv); - if (err != 0) - goto reset; + if (gve_is_gqi(priv)) { + err = gve_register_qpls(priv); + if (err != 0) + goto reset; + } err = gve_create_rx_rings(priv); if (err != 0) @@ -156,6 +159,8 @@ static void gve_down(struct gve_priv *priv) { + int err; + GVE_IFACE_LOCK_ASSERT(priv->gve_iface_lock); 
if (!gve_get_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP)) @@ -168,16 +173,21 @@ if_setdrvflagbits(priv->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING); - if (gve_destroy_rx_rings(priv) != 0) + err = gve_destroy_rx_rings(priv); + if (err != 0) goto reset; - if (gve_destroy_tx_rings(priv) != 0) + err = gve_destroy_tx_rings(priv); + if (err != 0) goto reset; - if (gve_unregister_qpls(priv) != 0) - goto reset; + if (gve_is_gqi(priv)) { + if (gve_unregister_qpls(priv) != 0) + goto reset; + } - gve_mask_all_queue_irqs(priv); + if (gve_is_gqi(priv)) + gve_mask_all_queue_irqs(priv); gve_clear_state_flag(priv, GVE_STATE_FLAG_QUEUES_UP); priv->interface_down_cnt++; return; @@ -367,6 +377,16 @@ if_settransmitfn(ifp, gve_xmit_ifp); if_setqflushfn(ifp, gve_qflush); + /* + * Set TSO limits, must match the arguments to bus_dma_tag_create + * when creating tx->dqo.buf_dmatag + */ + if (!gve_is_gqi(priv)) { + if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO); + if_sethwtsomaxsegcount(ifp, GVE_TX_MAX_DATA_DESCS_DQO); + if_sethwtsomaxsegsize(ifp, GVE_TX_MAX_BUF_SIZE_DQO); + } + #if __FreeBSD_version >= 1400086 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); #else @@ -449,7 +469,8 @@ gve_free_irqs(priv); gve_free_tx_rings(priv); gve_free_rx_rings(priv); - gve_free_qpls(priv); + if (gve_is_gqi(priv)) + gve_free_qpls(priv); } static int @@ -457,9 +478,11 @@ { int err; - err = gve_alloc_qpls(priv); - if (err != 0) - goto abort; + if (gve_is_gqi(priv)) { + err = gve_alloc_qpls(priv); + if (err != 0) + goto abort; + } err = gve_alloc_rx_rings(priv); if (err != 0) @@ -499,6 +522,11 @@ gve_free_irq_db_array(priv); gve_free_counter_array(priv); + + if (priv->ptype_lut_dqo) { + free(priv->ptype_lut_dqo, M_GVE); + priv->ptype_lut_dqo = NULL; + } } static int @@ -525,6 +553,22 @@ goto abort; } + if (!gve_is_gqi(priv)) { + priv->ptype_lut_dqo = malloc(sizeof(*priv->ptype_lut_dqo), M_GVE, + M_WAITOK | M_ZERO); + if (!priv->ptype_lut_dqo) { + err = (ENOMEM); + goto abort; + } + + err = 
gve_adminq_get_ptype_map_dqo(priv, priv->ptype_lut_dqo); + if (err != 0) { + device_printf(priv->dev, "Failed to configure ptype lut: err=%d\n", + err); + goto abort; + } + } + gve_set_state_flag(priv, GVE_STATE_FLAG_RESOURCES_OK); if (bootverbose) device_printf(priv->dev, "Configured device resources\n"); @@ -742,6 +786,9 @@ int rid; int err; + snprintf(gve_version, sizeof(gve_version), "%d.%d.%d", + GVE_VERSION_MAJOR, GVE_VERSION_MINOR, GVE_VERSION_SUB); + priv = device_get_softc(dev); priv->dev = dev; GVE_IFACE_LOCK_INIT(priv->gve_iface_lock); diff --git a/sys/dev/gve/gve_plat.h b/sys/dev/gve/gve_plat.h --- a/sys/dev/gve/gve_plat.h +++ b/sys/dev/gve/gve_plat.h @@ -85,6 +85,9 @@ typedef uint16_t __be16; typedef uint32_t __be32; typedef uint64_t __be64; +typedef uint16_t __le16; +typedef uint32_t __le32; +typedef uint64_t __le64; #define BIT(nr) (1UL << (nr)) #define FBSD_VERSION_MAJOR (__FreeBSD_version / 100000) diff --git a/sys/dev/gve/gve_rx.c b/sys/dev/gve/gve_rx.c --- a/sys/dev/gve/gve_rx.c +++ b/sys/dev/gve/gve_rx.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -30,15 +30,12 @@ */ #include "gve.h" #include "gve_adminq.h" +#include "gve_dqo.h" static void -gve_rx_free_ring(struct gve_priv *priv, int i) +gve_rx_free_ring_gqi(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; - struct gve_ring_com *com = &rx->com; - - /* Safe to call even if never allocated */ - gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); if (rx->page_info != NULL) { free(rx->page_info, M_GVE); @@ -54,6 +51,21 @@ gve_dma_free_coherent(&rx->desc_ring_mem); rx->desc_ring = NULL; } +} + +static void +gve_rx_free_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = 
&rx->com; + + /* Safe to call even if never allocated */ + gve_free_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); + + if (gve_is_gqi(priv)) + gve_rx_free_ring_gqi(priv, i); + else + gve_rx_free_ring_dqo(priv, i); if (com->q_resources != NULL) { gve_dma_free_coherent(&com->q_resources_mem); @@ -83,16 +95,23 @@ } static int -gve_rx_alloc_ring(struct gve_priv *priv, int i) +gve_rx_alloc_ring_gqi(struct gve_priv *priv, int i) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; int err; - com->priv = priv; - com->id = i; + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_rx_desc) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->desc_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc desc ring for rx ring %d", i); + goto abort; + } rx->mask = priv->rx_pages_per_qpl - 1; + rx->desc_ring = rx->desc_ring_mem.cpu_addr; com->qpl = &priv->qpls[priv->tx_cfg.max_queues + i]; if (com->qpl == NULL) { @@ -100,38 +119,55 @@ return (ENOMEM); } - rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info), M_GVE, - M_WAITOK | M_ZERO); + rx->page_info = malloc(priv->rx_desc_cnt * sizeof(*rx->page_info), + M_GVE, M_WAITOK | M_ZERO); + + err = gve_dma_alloc_coherent(priv, + sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->data_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc data ring for rx ring %d", i); + goto abort; + } + rx->data_ring = rx->data_ring_mem.cpu_addr; + + gve_prefill_rx_slots(rx); + return (0); + +abort: + gve_rx_free_ring_gqi(priv, i); + return (err); +} + +static int +gve_rx_alloc_ring(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + struct gve_ring_com *com = &rx->com; + int err; + + com->priv = priv; + com->id = i; gve_alloc_counters((counter_u64_t *)&rx->stats, NUM_RX_STATS); err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), PAGE_SIZE, &com->q_resources_mem); if (err != 0) { - device_printf(priv->dev, 
"Failed to alloc queue resources for rx ring %d", i); + device_printf(priv->dev, + "Failed to alloc queue resources for rx ring %d", i); goto abort; } com->q_resources = com->q_resources_mem.cpu_addr; - err = gve_dma_alloc_coherent(priv, - sizeof(struct gve_rx_desc) * priv->rx_desc_cnt, - CACHE_LINE_SIZE, &rx->desc_ring_mem); - if (err != 0) { - device_printf(priv->dev, "Failed to alloc desc ring for rx ring %d", i); + if (gve_is_gqi(priv)) + err = gve_rx_alloc_ring_gqi(priv, i); + else + err = gve_rx_alloc_ring_dqo(priv, i); + if (err != 0) goto abort; - } - rx->desc_ring = rx->desc_ring_mem.cpu_addr; - err = gve_dma_alloc_coherent(priv, - sizeof(union gve_rx_data_slot) * priv->rx_desc_cnt, - CACHE_LINE_SIZE, &rx->data_ring_mem); - if (err != 0) { - device_printf(priv->dev, "Failed to alloc data ring for rx ring %d", i); - goto abort; - } - rx->data_ring = rx->data_ring_mem.cpu_addr; - - gve_prefill_rx_slots(rx); return (0); abort: @@ -227,7 +263,8 @@ } static void -gve_start_rx_ring(struct gve_priv *priv, int i) +gve_start_rx_ring(struct gve_priv *priv, int i, + void (cleanup) (void *arg, int pending)) { struct gve_rx_ring *rx = &priv->rx[i]; struct gve_ring_com *com = &rx->com; @@ -238,14 +275,19 @@ rx->lro.ifp = priv->ifp; } - NET_TASK_INIT(&com->cleanup_task, 0, gve_rx_cleanup_tq, rx); + NET_TASK_INIT(&com->cleanup_task, 0, cleanup, rx); com->cleanup_tq = taskqueue_create_fast("gve rx", M_WAITOK, taskqueue_thread_enqueue, &com->cleanup_tq); taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s rxq %d", device_get_nameunit(priv->dev), i); - gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt); + if (gve_is_gqi(priv)) { + /* GQ RX bufs are prefilled at ring alloc time */ + gve_db_bar_write_4(priv, com->db_offset, rx->fill_cnt); + } else { + gve_rx_prefill_buffers_dqo(rx); + } } int @@ -259,8 +301,12 @@ if (gve_get_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK)) return (0); - for (i = 0; i < priv->rx_cfg.num_queues; i++) - gve_clear_rx_ring(priv, i); + for (i 
= 0; i < priv->rx_cfg.num_queues; i++) { + if (gve_is_gqi(priv)) + gve_clear_rx_ring(priv, i); + else + gve_clear_rx_ring_dqo(priv, i); + } err = gve_adminq_create_rx_queues(priv, priv->rx_cfg.num_queues); if (err != 0) @@ -280,7 +326,10 @@ com->db_offset = 4 * be32toh(com->q_resources->db_index); com->counter_idx = be32toh(com->q_resources->counter_index); - gve_start_rx_ring(priv, i); + if (gve_is_gqi(priv)) + gve_start_rx_ring(priv, i, gve_rx_cleanup_tq); + else + gve_start_rx_ring(priv, i, gve_rx_cleanup_tq_dqo); } gve_set_state_flag(priv, GVE_STATE_FLAG_RX_RINGS_OK); diff --git a/sys/dev/gve/gve_rx_dqo.c b/sys/dev/gve/gve_rx_dqo.c new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_rx_dqo.c @@ -0,0 +1,632 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2024 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "gve.h" +#include "gve_adminq.h" +#include "gve_dqo.h" + +static void +gve_free_rx_mbufs_dqo(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_dqo *buf; + int i; + + for (i = 0; i < rx->dqo.buf_cnt; i++) { + buf = &rx->dqo.bufs[i]; + if (!buf->mbuf) + continue; + + bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap, + BUS_DMASYNC_POSTREAD); + bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap); + m_freem(buf->mbuf); + buf->mbuf = NULL; + } +} + +void +gve_rx_free_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + int j; + + if (rx->dqo.compl_ring != NULL) { + gve_dma_free_coherent(&rx->dqo.compl_ring_mem); + rx->dqo.compl_ring = NULL; + } + + if (rx->dqo.desc_ring != NULL) { + gve_dma_free_coherent(&rx->desc_ring_mem); + rx->dqo.desc_ring = NULL; + } + + if (rx->dqo.bufs != NULL) { + gve_free_rx_mbufs_dqo(rx); + + if (rx->dqo.buf_dmatag) { + for (j = 0; j < rx->dqo.buf_cnt; j++) + if (rx->dqo.bufs[j].mapped) + bus_dmamap_destroy(rx->dqo.buf_dmatag, + rx->dqo.bufs[j].dmamap); + } + + free(rx->dqo.bufs, M_GVE); + rx->dqo.bufs = NULL; + } + + if (rx->dqo.buf_dmatag) + bus_dma_tag_destroy(rx->dqo.buf_dmatag); +} + +int +gve_rx_alloc_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + int err; + int j; + + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_rx_desc_dqo) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->desc_ring_mem); + if (err != 0) { + 
device_printf(priv->dev, + "Failed to alloc desc ring for rx ring %d", i); + goto abort; + } + rx->dqo.desc_ring = rx->desc_ring_mem.cpu_addr; + rx->dqo.mask = priv->rx_desc_cnt - 1; + + err = bus_dma_tag_create( + bus_get_dma_tag(priv->dev), /* parent */ + 1, 0, /* alignment, bounds */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + MCLBYTES, /* maxsize */ + 1, /* nsegments */ + MCLBYTES, /* maxsegsize */ + 0, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockarg */ + &rx->dqo.buf_dmatag); + if (err != 0) { + device_printf(priv->dev, + "%s: bus_dma_tag_create failed: %d\n", + __func__, err); + goto abort; + } + + rx->dqo.buf_cnt = priv->rx_desc_cnt; + rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo), + M_GVE, M_WAITOK | M_ZERO); + for (j = 0; j < rx->dqo.buf_cnt; j++) { + err = bus_dmamap_create(rx->dqo.buf_dmatag, 0, + &rx->dqo.bufs[j].dmamap); + if (err != 0) { + device_printf(priv->dev, + "err in creating rx buf dmamap %d: %d", + j, err); + goto abort; + } + rx->dqo.bufs[j].mapped = true; + } + + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc compl ring for rx ring %d", i); + goto abort; + } + rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr; + rx->dqo.mask = priv->rx_desc_cnt - 1; + + return (0); + +abort: + gve_rx_free_ring_dqo(priv, i); + return (err); +} + +static void +gve_rx_clear_desc_ring_dqo(struct gve_rx_ring *rx) +{ + struct gve_ring_com *com = &rx->com; + int entries; + int i; + + entries = com->priv->rx_desc_cnt; + for (i = 0; i < entries; i++) + rx->dqo.desc_ring[i] = (struct gve_rx_desc_dqo){}; + + bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static void +gve_rx_clear_compl_ring_dqo(struct gve_rx_ring *rx) +{ + struct gve_ring_com *com = &rx->com; + int i; + 
+ for (i = 0; i < com->priv->rx_desc_cnt; i++) + rx->dqo.compl_ring[i] = (struct gve_rx_compl_desc_dqo){}; + + bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, rx->dqo.compl_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +void +gve_clear_rx_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_rx_ring *rx = &priv->rx[i]; + int j; + + rx->fill_cnt = 0; + rx->cnt = 0; + rx->dqo.mask = priv->rx_desc_cnt - 1; + rx->dqo.head = 0; + rx->dqo.tail = 0; + rx->dqo.cur_gen_bit = 0; + + gve_rx_clear_desc_ring_dqo(rx); + gve_rx_clear_compl_ring_dqo(rx); + + gve_free_rx_mbufs_dqo(rx); + + SLIST_INIT(&rx->dqo.free_bufs); + for (j = 0; j < rx->dqo.buf_cnt; j++) + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, + &rx->dqo.bufs[j], slist_entry); +} + +int +gve_rx_intr_dqo(void *arg) +{ + struct gve_rx_ring *rx = arg; + struct gve_priv *priv = rx->com.priv; + struct gve_ring_com *com = &rx->com; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return (FILTER_STRAY); + + /* Interrupts are automatically masked */ + taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); + return (FILTER_HANDLED); +} + +static void +gve_rx_post_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf) +{ + struct gve_rx_desc_dqo *desc; + + bus_dmamap_sync(rx->dqo.buf_dmatag, buf->dmamap, + BUS_DMASYNC_PREREAD); + + desc = &rx->dqo.desc_ring[rx->dqo.head]; + desc->buf_id = htole16(buf - rx->dqo.bufs); + desc->buf_addr = htole64(buf->addr); + + rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask; + rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */ + + if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) { + bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); + gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset, + rx->dqo.head); + } +} + +static int +gve_rx_post_new_mbuf_dqo(struct gve_rx_ring *rx, int how) +{ + struct gve_rx_buf_dqo *buf; + bus_dma_segment_t segs[1]; + int nsegs; + int err; + + buf = SLIST_FIRST(&rx->dqo.free_bufs); + if 
(__predict_false(!buf)) { + device_printf(rx->com.priv->dev, + "Unexpected empty free bufs list\n"); + return (ENOBUFS); + } + SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry); + + buf->mbuf = m_getcl(how, MT_DATA, M_PKTHDR); + if (__predict_false(!buf->mbuf)) { + err = ENOMEM; + counter_enter(); + counter_u64_add_protected(rx->stats.rx_mbuf_mclget_null, 1); + counter_exit(); + goto abort_with_buf; + } + buf->mbuf->m_len = MCLBYTES; + + err = bus_dmamap_load_mbuf_sg(rx->dqo.buf_dmatag, buf->dmamap, + buf->mbuf, segs, &nsegs, BUS_DMA_NOWAIT); + if (__predict_false(err != 0 || nsegs != 1)) { + counter_enter(); + counter_u64_add_protected(rx->stats.rx_mbuf_dmamap_err, 1); + counter_exit(); + goto abort_with_mbuf; + } + buf->addr = segs[0].ds_addr; + + gve_rx_post_buf_dqo(rx, buf); + return (0); + +abort_with_mbuf: + m_freem(buf->mbuf); + buf->mbuf = NULL; +abort_with_buf: + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry); + return (err); +} + +static void +gve_rx_post_buffers_dqo(struct gve_rx_ring *rx, int how) +{ + uint32_t num_pending_bufs; + uint32_t num_to_post; + uint32_t i; + int err; + + num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; + num_to_post = rx->dqo.mask - num_pending_bufs; + + for (i = 0; i < num_to_post; i++) { + err = gve_rx_post_new_mbuf_dqo(rx, how); + if (err) + break; + } +} + +void +gve_rx_prefill_buffers_dqo(struct gve_rx_ring *rx) +{ + gve_rx_post_buffers_dqo(rx, M_WAITOK); +} + +static void +gve_rx_set_hashtype_dqo(struct mbuf *mbuf, struct gve_ptype *ptype, bool *is_tcp) +{ + switch (ptype->l3_type) { + case GVE_L3_TYPE_IPV4: + switch (ptype->l4_type) { + case GVE_L4_TYPE_TCP: + *is_tcp = true; + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV4); + break; + case GVE_L4_TYPE_UDP: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV4); + break; + default: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV4); + } + break; + case GVE_L3_TYPE_IPV6: + switch (ptype->l4_type) { + case GVE_L4_TYPE_TCP: + *is_tcp = true; + 
M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_TCP_IPV6); + break; + case GVE_L4_TYPE_UDP: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_UDP_IPV6); + break; + default: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_RSS_IPV6); + } + break; + default: + M_HASHTYPE_SET(mbuf, M_HASHTYPE_OPAQUE_HASH); + } +} + +static void +gve_rx_set_csum_flags_dqo(struct mbuf *mbuf, + struct gve_rx_compl_desc_dqo *desc, + struct gve_ptype *ptype) +{ + /* HW did not identify and process L3 and L4 headers. */ + if (__predict_false(!desc->l3_l4_processed)) + return; + + if (ptype->l3_type == GVE_L3_TYPE_IPV4) { + if (__predict_false(desc->csum_ip_err || + desc->csum_external_ip_err)) + return; + } else if (ptype->l3_type == GVE_L3_TYPE_IPV6) { + /* Checksum should be skipped if this flag is set. */ + if (__predict_false(desc->ipv6_ex_add)) + return; + } + + if (__predict_false(desc->csum_l4_err)) + return; + + switch (ptype->l4_type) { + case GVE_L4_TYPE_TCP: + case GVE_L4_TYPE_UDP: + case GVE_L4_TYPE_ICMP: + case GVE_L4_TYPE_SCTP: + mbuf->m_pkthdr.csum_flags = CSUM_IP_CHECKED | + CSUM_IP_VALID | + CSUM_DATA_VALID | + CSUM_PSEUDO_HDR; + mbuf->m_pkthdr.csum_data = 0xffff; + break; + default: + break; + } +} + +static void +gve_rx_input_mbuf_dqo(struct gve_rx_ring *rx, + struct gve_rx_compl_desc_dqo *compl_desc) +{ + struct mbuf *mbuf = rx->ctx.mbuf_head; + if_t ifp = rx->com.priv->ifp; + struct gve_ptype *ptype; + bool do_if_input = true; + bool is_tcp = false; + + ptype = &rx->com.priv->ptype_lut_dqo->ptypes[compl_desc->packet_type]; + gve_rx_set_hashtype_dqo(mbuf, ptype, &is_tcp); + mbuf->m_pkthdr.flowid = le32toh(compl_desc->hash); + gve_rx_set_csum_flags_dqo(mbuf, compl_desc, ptype); + + mbuf->m_pkthdr.rcvif = ifp; + mbuf->m_pkthdr.len = rx->ctx.total_size; + + if (((if_getcapenable(rx->com.priv->ifp) & IFCAP_LRO) != 0) && /* LRO is enabled */ + is_tcp && /* pkt is a TCP pkt */ + (rx->lro.lro_cnt != 0) && /* LRO resources exist */ + (tcp_lro_rx(&rx->lro, mbuf, 0) == 0)) + do_if_input = false; + + if (do_if_input) + 
if_input(ifp, mbuf); + + counter_enter(); + counter_u64_add_protected(rx->stats.rbytes, rx->ctx.total_size); + counter_u64_add_protected(rx->stats.rpackets, 1); + counter_exit(); + + rx->ctx = (struct gve_rx_ctx){}; +} + +static int +gve_rx_copybreak_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf, + struct gve_rx_compl_desc_dqo *compl_desc, uint16_t frag_len) +{ + struct mbuf *mbuf; + + mbuf = m_get2(frag_len, M_NOWAIT, MT_DATA, M_PKTHDR); + if (__predict_false(mbuf == NULL)) + return (ENOMEM); + + counter_enter(); + counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1); + counter_exit(); + + m_copyback(mbuf, 0, frag_len, mtod(buf->mbuf, char*)); + mbuf->m_len = frag_len; + + rx->ctx.mbuf_head = mbuf; + rx->ctx.mbuf_tail = mbuf; + rx->ctx.total_size += frag_len; + + gve_rx_post_buf_dqo(rx, buf); + gve_rx_input_mbuf_dqo(rx, compl_desc); + return (0); +} + +static void +gve_rx_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_compl_desc_dqo *compl_desc, + int *work_done) +{ + bool is_last_frag = compl_desc->end_of_packet != 0; + struct gve_rx_ctx *ctx = &rx->ctx; + struct gve_rx_buf_dqo *buf; + uint32_t num_pending_bufs; + uint16_t frag_len; + uint16_t buf_id; + int err; + + buf_id = le16toh(compl_desc->buf_id); + if (__predict_false(buf_id >= rx->dqo.buf_cnt)) { + device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n", + buf_id, rx->com.id); + gve_schedule_reset(priv); + goto drop_frag_clear_ctx; + } + buf = &rx->dqo.bufs[buf_id]; + if (__predict_false(buf->mbuf == NULL)) { + device_printf(priv->dev, "Spurious completion for buf id %d on rxq %d, issuing reset\n", + buf_id, rx->com.id); + gve_schedule_reset(priv); + goto drop_frag_clear_ctx; + } + + if (__predict_false(ctx->drop_pkt)) + goto drop_frag; + + if (__predict_false(compl_desc->rx_error)) { + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); + counter_exit(); + goto drop_frag; + } + + bus_dmamap_sync(rx->dqo.buf_dmatag, 
buf->dmamap, + BUS_DMASYNC_POSTREAD); + + frag_len = compl_desc->packet_len; + if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) { + err = gve_rx_copybreak_dqo(rx, buf, compl_desc, frag_len); + if (__predict_false(err != 0)) + goto drop_frag; + (*work_done)++; + return; + } + + /* + * Although buffer completions may arrive out of order, buffer + * descriptors are consumed by the NIC in order. That is, the + * buffer at desc_ring[tail] might not be the buffer we got the + * completion compl_ring[tail] for: but we know that desc_ring[tail] + * has already been read by the NIC. + */ + num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; + + /* + * For every fragment received, try to post a new buffer. + * + * Failures are okay but only so long as the number of outstanding + * buffers is above a threshold. + * + * Beyond that we drop new packets to reuse their buffers. + * Without ensuring a minimum number of buffers for the NIC to + * put packets in, we run the risk of getting the queue stuck + * for good. + */ + err = gve_rx_post_new_mbuf_dqo(rx, M_NOWAIT); + if (__predict_false(err != 0 && + num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) { + counter_enter(); + counter_u64_add_protected( + rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); + counter_exit(); + goto drop_frag; + } + + buf->mbuf->m_len = frag_len; + ctx->total_size += frag_len; + if (ctx->mbuf_tail == NULL) { + ctx->mbuf_head = buf->mbuf; + ctx->mbuf_tail = buf->mbuf; + } else { + buf->mbuf->m_flags &= ~M_PKTHDR; + ctx->mbuf_tail->m_next = buf->mbuf; + ctx->mbuf_tail = buf->mbuf; + } + + /* + * Disassociate the mbuf from buf and surrender buf to the free list to + * be used by a future mbuf. 
+ */ + bus_dmamap_unload(rx->dqo.buf_dmatag, buf->dmamap); + buf->mbuf = NULL; + buf->addr = 0; + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, buf, slist_entry); + + if (is_last_frag) { + gve_rx_input_mbuf_dqo(rx, compl_desc); + (*work_done)++; + } + return; + +drop_frag: + /* Clear the earlier frags if there were any */ + m_freem(ctx->mbuf_head); + rx->ctx = (struct gve_rx_ctx){}; + /* Drop the rest of the pkt if there are more frags */ + ctx->drop_pkt = true; + /* Reuse the dropped frag's buffer */ + gve_rx_post_buf_dqo(rx, buf); + + if (is_last_frag) + goto drop_frag_clear_ctx; + return; + +drop_frag_clear_ctx: + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); + counter_exit(); + m_freem(ctx->mbuf_head); + rx->ctx = (struct gve_rx_ctx){}; +} + +static bool +gve_rx_cleanup_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, int budget) +{ + struct gve_rx_compl_desc_dqo *compl_desc; + uint32_t work_done = 0; + + NET_EPOCH_ASSERT(); + + while (work_done < budget) { + bus_dmamap_sync(rx->dqo.compl_ring_mem.tag, rx->dqo.compl_ring_mem.map, + BUS_DMASYNC_POSTREAD); + + compl_desc = &rx->dqo.compl_ring[rx->dqo.tail]; + if (compl_desc->generation == rx->dqo.cur_gen_bit) + break; + /* + * Prevent generation bit from being read after the rest of the + * descriptor. 
+ */ + rmb(); + + rx->cnt++; + rx->dqo.tail = (rx->dqo.tail + 1) & rx->dqo.mask; + rx->dqo.cur_gen_bit ^= (rx->dqo.tail == 0); + + gve_rx_dqo(priv, rx, compl_desc, &work_done); + } + + if (work_done != 0) + tcp_lro_flush_all(&rx->lro); + + gve_rx_post_buffers_dqo(rx, M_NOWAIT); + return (work_done == budget); +} + +void +gve_rx_cleanup_tq_dqo(void *arg, int pending) +{ + struct gve_rx_ring *rx = arg; + struct gve_priv *priv = rx->com.priv; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return; + + if (gve_rx_cleanup_dqo(priv, rx, /*budget=*/64)) { + taskqueue_enqueue(rx->com.cleanup_tq, &rx->com.cleanup_task); + return; + } + + gve_db_bar_dqo_write_4(priv, rx->com.irq_db_offset, + GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); +} diff --git a/sys/dev/gve/gve_sysctl.c b/sys/dev/gve/gve_sysctl.c --- a/sys/dev/gve/gve_sysctl.c +++ b/sys/dev/gve/gve_sysctl.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -30,6 +30,21 @@ */ #include "gve.h" +static SYSCTL_NODE(_hw, OID_AUTO, gve, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, + "GVE driver parameters"); + +bool gve_disable_hw_lro = false; +SYSCTL_BOOL(_hw_gve, OID_AUTO, disable_hw_lro, CTLFLAG_RDTUN, + &gve_disable_hw_lro, 0, "Controls if hardware LRO is used.\n"); + +char gve_queue_format[8]; +SYSCTL_STRING(_hw_gve, OID_AUTO, queue_format, CTLFLAG_RD, + &gve_queue_format, 0, "Queue format being used by the iface.\n"); + +char gve_version[8]; +SYSCTL_STRING(_hw_gve, OID_AUTO, driver_version, CTLFLAG_RD, + &gve_version, 0, "Driver version.\n"); + static void gve_setup_rxq_sysctl(struct sysctl_ctx_list *ctx, struct sysctl_oid_list *child, struct gve_rx_ring *rxq) @@ -72,6 +87,14 @@ "rx_dropped_pkt_mbuf_alloc_fail", CTLFLAG_RD, &stats->rx_dropped_pkt_mbuf_alloc_fail, 
"Packets dropped due to failed mbuf allocation");
+	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
+	    "rx_mbuf_dmamap_err", CTLFLAG_RD,
+	    &stats->rx_mbuf_dmamap_err,
+	    "Number of rx mbufs which couldn't be dma mapped");
+	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
+	    "rx_mbuf_mclget_null", CTLFLAG_RD,
+	    &stats->rx_mbuf_mclget_null,
+	    "Number of times when there were no cluster mbufs");
 	SYSCTL_ADD_U32(ctx, list, OID_AUTO,
 	    "rx_completed_desc", CTLFLAG_RD,
 	    &rxq->cnt, 0, "Number of descriptors completed");
@@ -124,6 +147,38 @@
 	    "tx_dropped_pkt_vlan", CTLFLAG_RD,
 	    &stats->tx_dropped_pkt_vlan,
 	    "Dropped VLAN packets");
+	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+	    "tx_delayed_pkt_nospace_descring", CTLFLAG_RD,
+	    &stats->tx_delayed_pkt_nospace_descring,
+	    "Packets delayed due to no space in desc ring");
+	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+	    "tx_delayed_pkt_nospace_compring", CTLFLAG_RD,
+	    &stats->tx_delayed_pkt_nospace_compring,
+	    "Packets delayed due to no space in comp ring");
+	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+	    "tx_delayed_pkt_tsoerr", CTLFLAG_RD,
+	    &stats->tx_delayed_pkt_tsoerr,
+	    "TSO packets delayed due to errors in TSO prep");
+	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+	    "tx_mbuf_collapse", CTLFLAG_RD,
+	    &stats->tx_mbuf_collapse,
+	    "tx mbufs that had to be collapsed");
+	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+	    "tx_mbuf_defrag", CTLFLAG_RD,
+	    &stats->tx_mbuf_defrag,
+	    "tx mbufs that had to be defragged");
+	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+	    "tx_mbuf_defrag_err", CTLFLAG_RD,
+	    &stats->tx_mbuf_defrag_err,
+	    "tx mbufs that failed defrag");
+	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+	    "tx_mbuf_dmamap_enomem_err", CTLFLAG_RD,
+	    &stats->tx_mbuf_dmamap_enomem_err,
+	    "tx mbufs that could not be dma-mapped due to low mem");
+	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+	    "tx_mbuf_dmamap_err", CTLFLAG_RD,
+	    &stats->tx_mbuf_dmamap_err,
+	    "tx mbufs that could not be dma-mapped");
 }
 
 static void
@@ -185,6 +240,9 @@
SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_destroy_rx_queue_cnt", CTLFLAG_RD, &priv->adminq_destroy_rx_queue_cnt, 0, "adminq_destroy_rx_queue_cnt"); + SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_get_ptype_map_cnt", + CTLFLAG_RD, &priv->adminq_get_ptype_map_cnt, 0, + "adminq_get_ptype_map_cnt"); SYSCTL_ADD_U32(ctx, admin_list, OID_AUTO, "adminq_dcfg_device_resources_cnt", CTLFLAG_RD, &priv->adminq_dcfg_device_resources_cnt, 0, diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c --- a/sys/dev/gve/gve_tx.c +++ b/sys/dev/gve/gve_tx.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -30,6 +30,7 @@ */ #include "gve.h" #include "gve_adminq.h" +#include "gve_dqo.h" #define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182 @@ -48,61 +49,104 @@ } static void -gve_tx_free_ring(struct gve_priv *priv, int i) +gve_tx_free_ring_gqi(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; - struct gve_ring_com *com = &tx->com; - - /* Safe to call even if never alloced */ - gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); - if (tx->br != NULL) { - buf_ring_free(tx->br, M_DEVBUF); - tx->br = NULL; + if (tx->desc_ring != NULL) { + gve_dma_free_coherent(&tx->desc_ring_mem); + tx->desc_ring = NULL; } - if (mtx_initialized(&tx->ring_mtx)) - mtx_destroy(&tx->ring_mtx); - if (tx->info != NULL) { free(tx->info, M_GVE); tx->info = NULL; } +} - if (tx->desc_ring != NULL) { - gve_dma_free_coherent(&tx->desc_ring_mem); - tx->desc_ring = NULL; - } +static void +gve_tx_free_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + + /* Safe to call even if never alloced */ + gve_free_counters((counter_u64_t *)&tx->stats, NUM_TX_STATS); + + if 
(mtx_initialized(&tx->ring_mtx)) + mtx_destroy(&tx->ring_mtx); if (com->q_resources != NULL) { gve_dma_free_coherent(&com->q_resources_mem); com->q_resources = NULL; } + + if (tx->br != NULL) { + buf_ring_free(tx->br, M_DEVBUF); + tx->br = NULL; + } + + if (gve_is_gqi(priv)) + gve_tx_free_ring_gqi(priv, i); + else + gve_tx_free_ring_dqo(priv, i); } static int -gve_tx_alloc_ring(struct gve_priv *priv, int i) +gve_tx_alloc_ring_gqi(struct gve_priv *priv, int i) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; - char mtx_name[16]; int err; - com->priv = priv; - com->id = i; + err = gve_dma_alloc_coherent(priv, + sizeof(union gve_tx_desc) * priv->tx_desc_cnt, + CACHE_LINE_SIZE, &tx->desc_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc desc ring for tx ring %d", i); + goto abort; + } + tx->desc_ring = tx->desc_ring_mem.cpu_addr; com->qpl = &priv->qpls[i]; if (com->qpl == NULL) { device_printf(priv->dev, "No QPL left for tx ring %d\n", i); - return (ENOMEM); + err = ENOMEM; + goto abort; } err = gve_tx_fifo_init(priv, tx); if (err != 0) goto abort; - tx->info = malloc(sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt, + tx->info = malloc( + sizeof(struct gve_tx_buffer_state) * priv->tx_desc_cnt, M_GVE, M_WAITOK | M_ZERO); + return (0); + +abort: + gve_tx_free_ring_gqi(priv, i); + return (err); +} + +static int +gve_tx_alloc_ring(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + struct gve_ring_com *com = &tx->com; + char mtx_name[16]; + int err; + + com->priv = priv; + com->id = i; + + if (gve_is_gqi(priv)) + err = gve_tx_alloc_ring_gqi(priv, i); + else + err = gve_tx_alloc_ring_dqo(priv, i); + if (err != 0) + goto abort; sprintf(mtx_name, "gvetx%d", i); mtx_init(&tx->ring_mtx, mtx_name, NULL, MTX_DEF); @@ -115,20 +159,12 @@ err = gve_dma_alloc_coherent(priv, sizeof(struct gve_queue_resources), PAGE_SIZE, &com->q_resources_mem); if (err != 0) { - device_printf(priv->dev, "Failed to 
alloc queue resources for tx ring %d", i); + device_printf(priv->dev, + "Failed to alloc queue resources for tx ring %d", i); goto abort; } com->q_resources = com->q_resources_mem.cpu_addr; - err = gve_dma_alloc_coherent(priv, - sizeof(union gve_tx_desc) * priv->tx_desc_cnt, - CACHE_LINE_SIZE, &tx->desc_ring_mem); - if (err != 0) { - device_printf(priv->dev, "Failed to alloc desc ring for tx ring %d", i); - goto abort; - } - tx->desc_ring = tx->desc_ring_mem.cpu_addr; - return (0); abort: @@ -147,6 +183,7 @@ for (i = 0; i < priv->tx_cfg.num_queues; i++) { err = gve_tx_alloc_ring(priv, i); + if (err != 0) goto free_rings; @@ -204,12 +241,13 @@ } static void -gve_start_tx_ring(struct gve_priv *priv, int i) +gve_start_tx_ring(struct gve_priv *priv, int i, + void (cleanup) (void *arg, int pending)) { struct gve_tx_ring *tx = &priv->tx[i]; struct gve_ring_com *com = &tx->com; - NET_TASK_INIT(&com->cleanup_task, 0, gve_tx_cleanup_tq, tx); + NET_TASK_INIT(&com->cleanup_task, 0, cleanup, tx); com->cleanup_tq = taskqueue_create_fast("gve tx", M_WAITOK, taskqueue_thread_enqueue, &com->cleanup_tq); taskqueue_start_threads(&com->cleanup_tq, 1, PI_NET, "%s txq %d", @@ -233,8 +271,12 @@ if (gve_get_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK)) return (0); - for (i = 0; i < priv->tx_cfg.num_queues; i++) - gve_clear_tx_ring(priv, i); + for (i = 0; i < priv->tx_cfg.num_queues; i++) { + if (gve_is_gqi(priv)) + gve_clear_tx_ring(priv, i); + else + gve_clear_tx_ring_dqo(priv, i); + } err = gve_adminq_create_tx_queues(priv, priv->tx_cfg.num_queues); if (err != 0) @@ -254,7 +296,10 @@ com->db_offset = 4 * be32toh(com->q_resources->db_index); com->counter_idx = be32toh(com->q_resources->counter_index); - gve_start_tx_ring(priv, i); + if (gve_is_gqi(priv)) + gve_start_tx_ring(priv, i, gve_tx_cleanup_tq); + else + gve_start_tx_ring(priv, i, gve_tx_cleanup_tq_dqo); } gve_set_state_flag(priv, GVE_STATE_FLAG_TX_RINGS_OK); @@ -695,11 +740,23 @@ struct gve_priv *priv = tx->com.priv; struct 
ifnet *ifp = priv->ifp; struct mbuf *mbuf; + int err; while ((if_getdrvflags(ifp) & IFF_DRV_RUNNING) != 0 && (mbuf = drbr_peek(ifp, tx->br)) != NULL) { - if (__predict_false(gve_xmit(tx, mbuf) != 0)) { + if (gve_is_gqi(priv)) + err = gve_xmit(tx, mbuf); + else { + /* + * gve_xmit_dqo might attempt to defrag the mbuf chain. + * The reference is passed in so that in the case of + * errors, the new mbuf chain is what's put back on the br. + */ + err = gve_xmit_dqo(tx, &mbuf); + } + + if (__predict_false(err != 0 && mbuf != NULL)) { drbr_putback(ifp, tx->br, mbuf); taskqueue_enqueue(tx->xmit_tq, &tx->xmit_task); break; @@ -710,7 +767,12 @@ bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, BUS_DMASYNC_PREWRITE); - gve_db_bar_write_4(priv, tx->com.db_offset, tx->req); + + if (gve_is_gqi(priv)) + gve_db_bar_write_4(priv, tx->com.db_offset, tx->req); + else + gve_db_bar_dqo_write_4(priv, tx->com.db_offset, + tx->dqo.desc_tail); } } diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c new file mode 100644 --- /dev/null +++ b/sys/dev/gve/gve_tx_dqo.c @@ -0,0 +1,792 @@ +/*- + * SPDX-License-Identifier: BSD-3-Clause + * + * Copyright (c) 2024 Google LLC + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "gve.h" +#include "gve_dqo.h" + +static void +gve_unmap_packet(struct gve_tx_ring *tx, + struct gve_tx_pending_pkt_dqo *pending_pkt) +{ + bus_dmamap_sync(tx->dqo.buf_dmatag, pending_pkt->dmamap, + BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(tx->dqo.buf_dmatag, pending_pkt->dmamap); +} + +static void +gve_free_tx_mbufs_dqo(struct gve_tx_ring *tx) +{ + struct gve_tx_pending_pkt_dqo *pending_pkt; + int i; + + for (i = 0; i < tx->dqo.num_pending_pkts; i++) { + pending_pkt = &tx->dqo.pending_pkts[i]; + if (!pending_pkt->mbuf) + continue; + + gve_unmap_packet(tx, pending_pkt); + m_freem(pending_pkt->mbuf); + pending_pkt->mbuf = NULL; + } +} + +void +gve_tx_free_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + int j; + + if (tx->dqo.desc_ring != NULL) { + gve_dma_free_coherent(&tx->desc_ring_mem); + tx->dqo.desc_ring = NULL; + } + + if (tx->dqo.compl_ring != NULL) { + gve_dma_free_coherent(&tx->dqo.compl_ring_mem); + tx->dqo.compl_ring = NULL; + } + + if (tx->dqo.pending_pkts != NULL) { + gve_free_tx_mbufs_dqo(tx); + + if (tx->dqo.buf_dmatag) { + for (j = 0; j < tx->dqo.num_pending_pkts; j++) + if (tx->dqo.pending_pkts[j].state != + 
GVE_PACKET_STATE_UNALLOCATED) + bus_dmamap_destroy(tx->dqo.buf_dmatag, + tx->dqo.pending_pkts[j].dmamap); + } + + free(tx->dqo.pending_pkts, M_GVE); + tx->dqo.pending_pkts = NULL; + } + + if (tx->dqo.buf_dmatag) + bus_dma_tag_destroy(tx->dqo.buf_dmatag); +} + +int +gve_tx_alloc_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + uint16_t num_pending_pkts; + int err; + int j; + + /* Descriptor ring */ + err = gve_dma_alloc_coherent(priv, + sizeof(union gve_tx_desc_dqo) * priv->tx_desc_cnt, + CACHE_LINE_SIZE, &tx->desc_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc desc ring for tx ring %d", i); + goto abort; + } + tx->dqo.desc_ring = tx->desc_ring_mem.cpu_addr; + + /* Completion ring */ + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_tx_compl_desc_dqo) * priv->tx_desc_cnt, + CACHE_LINE_SIZE, &tx->dqo.compl_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc compl ring for tx ring %d", i); + goto abort; + } + tx->dqo.compl_ring = tx->dqo.compl_ring_mem.cpu_addr; + + /* + * DMA tag for mapping Tx mbufs + * The maxsize, nsegments, and maxsegsize params should match + * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c. + */ + err = bus_dma_tag_create( + bus_get_dma_tag(priv->dev), /* parent */ + 1, 0, /* alignment, bounds */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + GVE_TSO_MAXSIZE_DQO, /* maxsize */ + GVE_TX_MAX_DATA_DESCS_DQO, /* nsegments */ + GVE_TX_MAX_BUF_SIZE_DQO, /* maxsegsize */ + BUS_DMA_ALLOCNOW, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockarg */ + &tx->dqo.buf_dmatag); + if (err != 0) { + device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n", + __func__, err); + goto abort; + } + + /* + * pending_pkts array + * + * The max number of pending packets determines the maximum number of + * descriptors which maybe written to the completion queue. 
+ * + * We must set the number small enough to make sure we never overrun the + * completion queue. + */ + num_pending_pkts = priv->tx_desc_cnt; + /* + * Reserve space for descriptor completions, which will be reported at + * most every GVE_TX_MIN_RE_INTERVAL packets. + */ + num_pending_pkts -= num_pending_pkts / GVE_TX_MIN_RE_INTERVAL; + + tx->dqo.num_pending_pkts = num_pending_pkts; + tx->dqo.pending_pkts = malloc( + sizeof(struct gve_tx_pending_pkt_dqo) * num_pending_pkts, + M_GVE, M_WAITOK | M_ZERO); + if (!tx->dqo.pending_pkts) + goto abort; + + for (j = 0; j < tx->dqo.num_pending_pkts; j++) { + err = bus_dmamap_create(tx->dqo.buf_dmatag, 0, + &tx->dqo.pending_pkts[j].dmamap); + if (err != 0) { + device_printf(priv->dev, + "err in creating pending pkt dmamap %d: %d", + j, err); + goto abort; + } + tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; + } + + return (0); + +abort: + gve_tx_free_ring_dqo(priv, i); + return (err); +} + +static void +gve_extract_tx_metadata_dqo(const struct mbuf *mbuf, + struct gve_tx_metadata_dqo *metadata) +{ + uint32_t hash = mbuf->m_pkthdr.flowid; + uint16_t path_hash; + + metadata->version = GVE_TX_METADATA_VERSION_DQO; + if (hash) { + path_hash = hash ^ (hash >> 16); + + path_hash &= (1 << 15) - 1; + if (__predict_false(path_hash == 0)) + path_hash = ~path_hash; + + metadata->path_hash = path_hash; + } +} + +static void +gve_tx_fill_pkt_desc_dqo(struct gve_tx_ring *tx, + uint32_t *desc_idx, uint32_t len, uint64_t addr, + int16_t compl_tag, bool eop, bool csum_enabled) +{ + while (len > 0) { + struct gve_tx_pkt_desc_dqo *desc = + &tx->dqo.desc_ring[*desc_idx].pkt; + uint32_t cur_len = MIN(len, GVE_TX_MAX_BUF_SIZE_DQO); + bool cur_eop = eop && cur_len == len; + + *desc = (struct gve_tx_pkt_desc_dqo){ + .buf_addr = htole64(addr), + .dtype = GVE_TX_PKT_DESC_DTYPE_DQO, + .end_of_packet = cur_eop, + .checksum_offload_enable = csum_enabled, + .compl_tag = htole16(compl_tag), + .buf_size = cur_len, + }; + + addr += cur_len; + len 
-= cur_len; + *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; + } +} + +static void +gve_tx_fill_tso_ctx_desc(struct gve_tx_tso_context_desc_dqo *desc, + const struct mbuf *mbuf, const struct gve_tx_metadata_dqo *metadata, + int header_len) +{ + *desc = (struct gve_tx_tso_context_desc_dqo){ + .header_len = header_len, + .cmd_dtype = { + .dtype = GVE_TX_TSO_CTX_DESC_DTYPE_DQO, + .tso = 1, + }, + .flex0 = metadata->bytes[0], + .flex5 = metadata->bytes[5], + .flex6 = metadata->bytes[6], + .flex7 = metadata->bytes[7], + .flex8 = metadata->bytes[8], + .flex9 = metadata->bytes[9], + .flex10 = metadata->bytes[10], + .flex11 = metadata->bytes[11], + }; + desc->tso_total_len = mbuf->m_pkthdr.len - header_len; + desc->mss = mbuf->m_pkthdr.tso_segsz; +} + +static void +gve_tx_fill_general_ctx_desc(struct gve_tx_general_context_desc_dqo *desc, + const struct gve_tx_metadata_dqo *metadata) +{ + *desc = (struct gve_tx_general_context_desc_dqo){ + .flex0 = metadata->bytes[0], + .flex1 = metadata->bytes[1], + .flex2 = metadata->bytes[2], + .flex3 = metadata->bytes[3], + .flex4 = metadata->bytes[4], + .flex5 = metadata->bytes[5], + .flex6 = metadata->bytes[6], + .flex7 = metadata->bytes[7], + .flex8 = metadata->bytes[8], + .flex9 = metadata->bytes[9], + .flex10 = metadata->bytes[10], + .flex11 = metadata->bytes[11], + .cmd_dtype = {.dtype = GVE_TX_GENERAL_CTX_DESC_DTYPE_DQO}, + }; +} + +#define PULLUP_HDR(m, len) \ +do { \ + if (__predict_false((m)->m_len < (len))) { \ + (m) = m_pullup((m), (len)); \ + if ((m) == NULL) \ + return (EINVAL); \ + } \ +} while (0) + +static int +gve_prep_tso(struct mbuf *mbuf, int *header_len) +{ + uint8_t l3_off, l4_off = 0; + struct ether_header *eh; + struct ip6_hdr *ip6; + struct tcphdr *th; + struct ip *ip; + bool is_ipv6; + + PULLUP_HDR(mbuf, sizeof(*eh)); + eh = mtod(mbuf, struct ether_header *); + KASSERT(eh->ether_type != ETHERTYPE_VLAN, + ("VLAN-tagged packets not supported")); + is_ipv6 = ntohs(eh->ether_type) == ETHERTYPE_IPV6; + l3_off = 
ETHER_HDR_LEN; + + if (is_ipv6) { + PULLUP_HDR(mbuf, l3_off + sizeof(*ip6)); + ip6 = (struct ip6_hdr *)(mtodo(mbuf, l3_off)); + l4_off = l3_off + sizeof(struct ip6_hdr); + } else if (ntohs(eh->ether_type) == ETHERTYPE_IP) { + PULLUP_HDR(mbuf, l3_off + sizeof(*ip)); + ip = (struct ip *)(mtodo(mbuf, l3_off)); + l4_off = l3_off + (ip->ip_hl << 2); + } + + if (__predict_false(l4_off == 0)) + return (EINVAL); + + PULLUP_HDR(mbuf, l4_off + sizeof(struct tcphdr)); + th = (struct tcphdr *)(mtodo(mbuf, l4_off)); + *header_len = l4_off + (th->th_off << 2); + + /* + * Hardware requires the th->th_sum to not include the TCP payload, + * hence we recompute the csum with it excluded. + */ + if (is_ipv6) + th->th_sum = in6_cksum_pseudo(ip6, /*len=*/0, + IPPROTO_TCP, /*csum=*/0); + else + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(IPPROTO_TCP)); + + return (0); +} + +static int +gve_map_mbuf_dqo(struct gve_tx_ring *tx, + struct mbuf **mbuf, bus_dmamap_t dmamap, + bus_dma_segment_t *segs, int *nsegs, int attempt) +{ + struct mbuf *m_new = NULL; + int err; + + err = bus_dmamap_load_mbuf_sg(tx->dqo.buf_dmatag, dmamap, + *mbuf, segs, nsegs, BUS_DMA_NOWAIT); + + if (__predict_false(err)) { + switch (err) { + case EFBIG: + if (__predict_false(attempt > 0)) + goto abort; + + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_mbuf_collapse, 1); + counter_exit(); + + /* Try m_collapse before m_defrag */ + m_new = m_collapse(*mbuf, M_NOWAIT, + GVE_TX_MAX_DATA_DESCS_DQO); + if (m_new == NULL) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_mbuf_defrag, 1); + counter_exit(); + m_new = m_defrag(*mbuf, M_NOWAIT); + } + + if (__predict_false(m_new == NULL)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_mbuf_defrag_err, 1); + counter_exit(); + + m_freem(*mbuf); + *mbuf = NULL; + err = ENOMEM; + goto abort; + } else { + *mbuf = m_new; + return (gve_map_mbuf_dqo(tx, mbuf, dmamap, + segs, nsegs, ++attempt)); + } + case ENOMEM:
+ counter_enter(); + counter_u64_add_protected( + tx->stats.tx_mbuf_dmamap_enomem_err, 1); + counter_exit(); + goto abort; + default: + goto abort; + } + } + + return (0); + +abort: + counter_enter(); + counter_u64_add_protected(tx->stats.tx_mbuf_dmamap_err, 1); + counter_exit(); + return (err); +} + +static uint32_t +num_avail_desc_ring_slots(const struct gve_tx_ring *tx) +{ + uint32_t num_used = (tx->dqo.desc_tail - tx->dqo.desc_head) & + tx->dqo.desc_mask; + + return (tx->dqo.desc_mask - num_used); +} + +static struct gve_tx_pending_pkt_dqo * +gve_alloc_pending_packet(struct gve_tx_ring *tx) +{ + int32_t index = tx->dqo.free_pending_pkts_csm; + struct gve_tx_pending_pkt_dqo *pending_pkt; + + /* + * No pending packets available in the consumer list, + * try to steal the producer list. + */ + if (__predict_false(index == -1)) { + tx->dqo.free_pending_pkts_csm = atomic_swap_32( + &tx->dqo.free_pending_pkts_prd, -1); + + index = tx->dqo.free_pending_pkts_csm; + if (__predict_false(index == -1)) + return (NULL); + } + + pending_pkt = &tx->dqo.pending_pkts[index]; + + /* Remove pending_pkt from the consumer list */ + tx->dqo.free_pending_pkts_csm = pending_pkt->next; + pending_pkt->state = GVE_PACKET_STATE_PENDING_DATA_COMPL; + + return (pending_pkt); +} + +static void +gve_free_pending_packet(struct gve_tx_ring *tx, + struct gve_tx_pending_pkt_dqo *pending_pkt) +{ + int index = pending_pkt - tx->dqo.pending_pkts; + int32_t old_head; + + pending_pkt->state = GVE_PACKET_STATE_FREE; + + /* Add pending_pkt to the producer list */ + while (true) { + old_head = atomic_load_acq_32(&tx->dqo.free_pending_pkts_prd); + + pending_pkt->next = old_head; + if (atomic_cmpset_32(&tx->dqo.free_pending_pkts_prd, + old_head, index)) + break; + } +} + +/* + * Has the side-effect of retrieving the value of the last desc index + * processed by the NIC. hw_tx_head is written to by the completions-processing + * taskqueue upon receiving descriptor-completions. 
+ */ +static bool +gve_tx_has_desc_room_dqo(struct gve_tx_ring *tx, int needed_descs) +{ + if (needed_descs <= num_avail_desc_ring_slots(tx)) + return (true); + + tx->dqo.desc_head = atomic_load_acq_32(&tx->dqo.hw_tx_head); + if (needed_descs > num_avail_desc_ring_slots(tx)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_nospace_descring, 1); + counter_exit(); + return (false); + } + + return (true); +} + +static void +gve_tx_request_desc_compl(struct gve_tx_ring *tx, uint32_t desc_idx) +{ + uint32_t last_report_event_interval; + uint32_t last_desc_idx; + + last_desc_idx = (desc_idx - 1) & tx->dqo.desc_mask; + last_report_event_interval = + (last_desc_idx - tx->dqo.last_re_idx) & tx->dqo.desc_mask; + + if (__predict_false(last_report_event_interval >= + GVE_TX_MIN_RE_INTERVAL)) { + tx->dqo.desc_ring[last_desc_idx].pkt.report_event = true; + tx->dqo.last_re_idx = last_desc_idx; + } +} + +int +gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr) +{ + bus_dma_segment_t segs[GVE_TX_MAX_DATA_DESCS_DQO]; + uint32_t desc_idx = tx->dqo.desc_tail; + struct gve_tx_metadata_dqo metadata; + struct gve_tx_pending_pkt_dqo *pkt; + struct mbuf *mbuf = *mbuf_ptr; + int total_descs_needed; + int16_t completion_tag; + bool has_csum_flag; + int header_len; + int csum_flags; + bool is_tso; + int nsegs; + int err; + int i; + + csum_flags = mbuf->m_pkthdr.csum_flags; + has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | + CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); + is_tso = csum_flags & CSUM_TSO; + + /* + * This mbuf might end up needing more than 1 pkt desc. + * The actual number, `nsegs` is known only after the + * expensive gve_map_mbuf_dqo call. This check beneath + * exists to fail early when the desc ring is really full. + */ + total_descs_needed = 1 + /* general_ctx_desc */ + 1 + /* pkt_desc */ + (is_tso ?
1 : 0); /* tso_ctx_desc */ + if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) + return (ENOBUFS); + + pkt = gve_alloc_pending_packet(tx); + if (pkt == NULL) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_nospace_compring, 1); + counter_exit(); + return (ENOBUFS); + } + completion_tag = pkt - tx->dqo.pending_pkts; + + err = gve_map_mbuf_dqo(tx, mbuf_ptr, pkt->dmamap, + segs, &nsegs, /*attempt=*/0); + if (err) + goto abort; + mbuf = *mbuf_ptr; /* gve_map_mbuf_dqo might replace the mbuf chain */ + pkt->mbuf = mbuf; + + total_descs_needed = 1 + /* general_ctx_desc */ + nsegs + /* pkt_desc */ + (is_tso ? 1 : 0); /* tso_ctx_desc */ + if (__predict_false( + !gve_tx_has_desc_room_dqo(tx, total_descs_needed))) { + err = ENOBUFS; + goto abort_with_dma; + } + + bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE); + + metadata = (struct gve_tx_metadata_dqo){0}; + gve_extract_tx_metadata_dqo(mbuf, &metadata); + + if (is_tso) { + err = gve_prep_tso(mbuf, &header_len); + if (__predict_false(err)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_tsoerr, 1); + counter_exit(); + goto abort_with_dma; + } + + gve_tx_fill_tso_ctx_desc(&tx->dqo.desc_ring[desc_idx].tso_ctx, + mbuf, &metadata, header_len); + desc_idx = (desc_idx + 1) & tx->dqo.desc_mask; + + counter_enter(); + counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); + counter_exit(); + } + + gve_tx_fill_general_ctx_desc(&tx->dqo.desc_ring[desc_idx].general_ctx, + &metadata); + desc_idx = (desc_idx + 1) & tx->dqo.desc_mask; + + for (i = 0; i < nsegs; i++) { + gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, + segs[i].ds_len, segs[i].ds_addr, + completion_tag, /*eop=*/i == (nsegs - 1), + has_csum_flag); + } + + /* Remember the index of the last desc written */ + tx->dqo.desc_tail = desc_idx; + + /* + * Request a descriptor completion on the last descriptor of the + * packet if we are allowed to by the HW enforced interval. 
+ */ + gve_tx_request_desc_compl(tx, desc_idx); + + tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ + return (0); + +abort_with_dma: + gve_unmap_packet(tx, pkt); +abort: + pkt->mbuf = NULL; + gve_free_pending_packet(tx, pkt); + return (err); +} + +static uint64_t +gve_handle_packet_completion(struct gve_priv *priv, + struct gve_tx_ring *tx, uint16_t compl_tag) +{ + struct gve_tx_pending_pkt_dqo *pending_pkt; + int32_t pkt_len; + + if (__predict_false(compl_tag >= tx->dqo.num_pending_pkts)) { + device_printf(priv->dev, "Invalid TX completion tag: %d\n", + compl_tag); + return (0); + } + + pending_pkt = &tx->dqo.pending_pkts[compl_tag]; + + /* Packet is allocated but not pending data completion. */ + if (__predict_false(pending_pkt->state != + GVE_PACKET_STATE_PENDING_DATA_COMPL)) { + device_printf(priv->dev, + "No pending data completion: %d\n", compl_tag); + return (0); + } + + pkt_len = pending_pkt->mbuf->m_pkthdr.len; + gve_unmap_packet(tx, pending_pkt); + m_freem(pending_pkt->mbuf); + pending_pkt->mbuf = NULL; + gve_free_pending_packet(tx, pending_pkt); + return (pkt_len); +} + +int +gve_tx_intr_dqo(void *arg) +{ + struct gve_tx_ring *tx = arg; + struct gve_priv *priv = tx->com.priv; + struct gve_ring_com *com = &tx->com; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return (FILTER_STRAY); + + /* Interrupts are automatically masked */ + taskqueue_enqueue(com->cleanup_tq, &com->cleanup_task); + return (FILTER_HANDLED); +} + +static void +gve_tx_clear_desc_ring_dqo(struct gve_tx_ring *tx) +{ + struct gve_ring_com *com = &tx->com; + int i; + + for (i = 0; i < com->priv->tx_desc_cnt; i++) + tx->dqo.desc_ring[i] = (union gve_tx_desc_dqo){}; + + bus_dmamap_sync(tx->desc_ring_mem.tag, tx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +static void +gve_tx_clear_compl_ring_dqo(struct gve_tx_ring *tx) +{ + struct gve_ring_com *com = &tx->com; + int entries; + int i; + + entries = com->priv->tx_desc_cnt; + for (i = 
0; i < entries; i++) + tx->dqo.compl_ring[i] = (struct gve_tx_compl_desc_dqo){}; + + bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map, + BUS_DMASYNC_PREWRITE); +} + +void +gve_clear_tx_ring_dqo(struct gve_priv *priv, int i) +{ + struct gve_tx_ring *tx = &priv->tx[i]; + int j; + + tx->dqo.desc_head = 0; + tx->dqo.desc_tail = 0; + tx->dqo.desc_mask = priv->tx_desc_cnt - 1; + tx->dqo.last_re_idx = 0; + + tx->dqo.compl_head = 0; + tx->dqo.compl_mask = priv->tx_desc_cnt - 1; + tx->dqo.hw_tx_head = 0; + tx->dqo.cur_gen_bit = 0; + + gve_free_tx_mbufs_dqo(tx); + + for (j = 0; j < tx->dqo.num_pending_pkts - 1; j++) { + tx->dqo.pending_pkts[j].next = j + 1; + tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; + } + tx->dqo.pending_pkts[tx->dqo.num_pending_pkts - 1].next = -1; + tx->dqo.free_pending_pkts_csm = 0; + atomic_store_rel_32(&tx->dqo.free_pending_pkts_prd, -1); + + gve_tx_clear_desc_ring_dqo(tx); + gve_tx_clear_compl_ring_dqo(tx); +} + +static bool +gve_tx_cleanup_dqo(struct gve_priv *priv, struct gve_tx_ring *tx, int budget) +{ + struct gve_tx_compl_desc_dqo *compl_desc; + uint64_t bytes_done = 0; + uint64_t pkts_done = 0; + uint16_t compl_tag; + int work_done = 0; + uint16_t tx_head; + uint16_t type; + + while (work_done < budget) { + bus_dmamap_sync(tx->dqo.compl_ring_mem.tag, tx->dqo.compl_ring_mem.map, + BUS_DMASYNC_POSTREAD); + + compl_desc = &tx->dqo.compl_ring[tx->dqo.compl_head]; + if (compl_desc->generation == tx->dqo.cur_gen_bit) + break; + + /* + * Prevent generation bit from being read after the rest of the + * descriptor. 
+ */ + rmb(); + type = compl_desc->type; + + if (type == GVE_COMPL_TYPE_DQO_DESC) { + /* This is the last descriptor fetched by HW plus one */ + tx_head = le16toh(compl_desc->tx_head); + atomic_store_rel_32(&tx->dqo.hw_tx_head, tx_head); + } else if (type == GVE_COMPL_TYPE_DQO_PKT) { + compl_tag = le16toh(compl_desc->completion_tag); + bytes_done += gve_handle_packet_completion(priv, + tx, compl_tag); + pkts_done++; + } + + tx->dqo.compl_head = (tx->dqo.compl_head + 1) & + tx->dqo.compl_mask; + /* Flip the generation bit when we wrap around */ + tx->dqo.cur_gen_bit ^= tx->dqo.compl_head == 0; + work_done++; + } + + tx->done += work_done; /* tx->done is just a sysctl counter */ + counter_enter(); + counter_u64_add_protected(tx->stats.tbytes, bytes_done); + counter_u64_add_protected(tx->stats.tpackets, pkts_done); + counter_exit(); + + return (work_done == budget); +} + +void +gve_tx_cleanup_tq_dqo(void *arg, int pending) +{ + struct gve_tx_ring *tx = arg; + struct gve_priv *priv = tx->com.priv; + + if (__predict_false((if_getdrvflags(priv->ifp) & IFF_DRV_RUNNING) == 0)) + return; + + if (gve_tx_cleanup_dqo(priv, tx, /*budget=*/1024)) { + taskqueue_enqueue(tx->com.cleanup_tq, &tx->com.cleanup_task); + return; + } + + gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset, + GVE_ITR_NO_UPDATE_DQO | GVE_ITR_ENABLE_BIT_DQO); +} diff --git a/sys/dev/gve/gve_utils.c b/sys/dev/gve/gve_utils.c --- a/sys/dev/gve/gve_utils.c +++ b/sys/dev/gve/gve_utils.c @@ -1,7 +1,7 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * - * Copyright (c) 2023 Google LLC + * Copyright (c) 2023-2024 Google LLC * * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -29,6 +29,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include "gve.h" +#include "gve_dqo.h" uint32_t gve_reg_bar_read_4(struct gve_priv *priv, bus_size_t offset) @@ -48,6 +49,12 @@ bus_write_4(priv->db_bar, offset, htobe32(val)); } +void +gve_db_bar_dqo_write_4(struct gve_priv *priv, bus_size_t offset, uint32_t val) +{ + bus_write_4(priv->db_bar, offset, val); +} + void gve_alloc_counters(counter_u64_t *stat, int num_stats) { @@ -307,7 +314,8 @@ } err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, - gve_tx_intr, NULL, &priv->tx[i], &irq->cookie); + gve_is_gqi(priv) ? gve_tx_intr : gve_tx_intr_dqo, NULL, + &priv->tx[i], &irq->cookie); if (err != 0) { device_printf(priv->dev, "Failed to setup irq %d for Tx queue %d, " "err: %d\n", rid, i, err); @@ -334,7 +342,8 @@ } err = bus_setup_intr(priv->dev, irq->res, INTR_TYPE_NET | INTR_MPSAFE, - gve_rx_intr, NULL, &priv->rx[j], &irq->cookie); + gve_is_gqi(priv) ? gve_rx_intr : gve_rx_intr_dqo, NULL, + &priv->rx[j], &irq->cookie); if (err != 0) { device_printf(priv->dev, "Failed to setup irq %d for Rx queue %d, " "err: %d\n", rid, j, err); @@ -374,6 +383,24 @@ return (err); } +/* + * Builds register value to write to DQO IRQ doorbell to enable with specified + * ITR interval. + */ +static uint32_t +gve_setup_itr_interval_dqo(uint32_t interval_us) +{ + uint32_t result = GVE_ITR_ENABLE_BIT_DQO; + + /* Interval has 2us granularity. 
*/ + interval_us >>= 1; + + interval_us &= GVE_ITR_INTERVAL_DQO_MASK; + result |= (interval_us << GVE_ITR_INTERVAL_DQO_SHIFT); + + return (result); +} + void gve_unmask_all_queue_irqs(struct gve_priv *priv) { @@ -383,11 +410,20 @@ for (idx = 0; idx < priv->tx_cfg.num_queues; idx++) { tx = &priv->tx[idx]; - gve_db_bar_write_4(priv, tx->com.irq_db_offset, 0); + if (gve_is_gqi(priv)) + gve_db_bar_write_4(priv, tx->com.irq_db_offset, 0); + else + gve_db_bar_dqo_write_4(priv, tx->com.irq_db_offset, + gve_setup_itr_interval_dqo(GVE_TX_IRQ_RATELIMIT_US_DQO)); } + for (idx = 0; idx < priv->rx_cfg.num_queues; idx++) { rx = &priv->rx[idx]; - gve_db_bar_write_4(priv, rx->com.irq_db_offset, 0); + if (gve_is_gqi(priv)) + gve_db_bar_write_4(priv, rx->com.irq_db_offset, 0); + else + gve_db_bar_dqo_write_4(priv, rx->com.irq_db_offset, + gve_setup_itr_interval_dqo(GVE_RX_IRQ_RATELIMIT_US_DQO)); } } diff --git a/sys/modules/gve/Makefile b/sys/modules/gve/Makefile --- a/sys/modules/gve/Makefile +++ b/sys/modules/gve/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: BSD-3-Clause # -# Copyright (c) 2023 Google LLC +# Copyright (c) 2023-2024 Google LLC # # Redistribution and use in source and binary forms, with or without modification, # are permitted provided that the following conditions are met: @@ -30,7 +30,7 @@ .PATH: ${SRCTOP}/sys/dev/gve KMOD= if_gve -SRCS= gve_main.c gve_adminq.c gve_utils.c gve_qpl.c gve_rx.c gve_tx.c gve_sysctl.c +SRCS= gve_main.c gve_adminq.c gve_utils.c gve_qpl.c gve_rx.c gve_rx_dqo.c gve_tx.c gve_tx_dqo.c gve_sysctl.c SRCS+= device_if.h bus_if.h pci_if.h .include