diff --git a/share/man/man4/gve.4 b/share/man/man4/gve.4 --- a/share/man/man4/gve.4 +++ b/share/man/man4/gve.4 @@ -239,6 +239,8 @@ stands for "Queue Out-of-order" referring to the fact that the NIC might send Tx and Rx completions in an order different from the one in which the corresponding descriptors were posted by the driver. +.It +DQO_QPL: The next generation descriptor format in the "QPL" mode. .El .Sh SUPPORT Please email gvnic-drivers@google.com with the specifics of the issue encountered. diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h --- a/sys/dev/gve/gve.h +++ b/sys/dev/gve/gve.h @@ -105,6 +105,7 @@ GVE_GQI_RDA_FORMAT = 0x1, GVE_GQI_QPL_FORMAT = 0x2, GVE_DQO_RDA_FORMAT = 0x3, + GVE_DQO_QPL_FORMAT = 0x4, }; enum gve_state_flags_bit { @@ -226,6 +227,7 @@ counter_u64_t rx_frag_flip_cnt; counter_u64_t rx_frag_copy_cnt; counter_u64_t rx_dropped_pkt_desc_err; + counter_u64_t rx_dropped_pkt_buf_post_fail; counter_u64_t rx_dropped_pkt_mbuf_alloc_fail; counter_u64_t rx_mbuf_dmamap_err; counter_u64_t rx_mbuf_mclget_null; @@ -233,11 +235,34 @@ #define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t)) +union gve_rx_qpl_buf_id_dqo { + struct { + uint16_t buf_id:11; /* Index into rx->dqo.bufs */ + uint8_t frag_num:5; /* Which frag in the QPL page */ + }; + uint16_t all; +} __packed; +_Static_assert(sizeof(union gve_rx_qpl_buf_id_dqo) == 2, + "gve: bad dqo qpl rx buf id length"); + struct gve_rx_buf_dqo { - struct mbuf *mbuf; - bus_dmamap_t dmamap; - uint64_t addr; - bool mapped; + union { + /* RDA */ + struct { + struct mbuf *mbuf; + bus_dmamap_t dmamap; + uint64_t addr; + bool mapped; + }; + /* QPL */ + struct { + uint8_t num_nic_frags; /* number of pending completions */ + uint8_t next_idx; /* index of the next frag to post */ + /* for chaining rx->dqo.used_bufs */ + STAILQ_ENTRY(gve_rx_buf_dqo) stailq_entry; + }; + }; + /* for chaining rx->dqo.free_bufs */ SLIST_ENTRY(gve_rx_buf_dqo) slist_entry; }; @@ -276,6 +301,13 @@ uint32_t tail; /* The 
index at which to receive the next compl at */ uint8_t cur_gen_bit; /* Gets flipped on every cycle of the compl ring */ SLIST_HEAD(, gve_rx_buf_dqo) free_bufs; + + /* + * Only used in QPL mode. Pages referred to by if_input-ed mbufs + * stay parked here till their wire count comes back to 1. + * Pages are moved here after there aren't any pending completions. + */ + STAILQ_HEAD(, gve_rx_buf_dqo) used_bufs; } dqo; }; @@ -313,6 +345,7 @@ counter_u64_t tx_dropped_pkt_nospace_bufring; counter_u64_t tx_delayed_pkt_nospace_descring; counter_u64_t tx_delayed_pkt_nospace_compring; + counter_u64_t tx_delayed_pkt_nospace_qpl_bufs; counter_u64_t tx_delayed_pkt_tsoerr; counter_u64_t tx_dropped_pkt_vlan; counter_u64_t tx_mbuf_collapse; @@ -326,7 +359,19 @@ struct gve_tx_pending_pkt_dqo { struct mbuf *mbuf; - bus_dmamap_t dmamap; + union { + /* RDA */ + bus_dmamap_t dmamap; + /* QPL */ + struct { + /* + * A linked list of entries from qpl_bufs that served + * as the bounce buffer for this packet. + */ + int32_t qpl_buf_head; + uint32_t num_qpl_bufs; + }; + }; uint8_t state; /* the gve_packet_state enum */ int next; /* To chain the free_pending_pkts lists */ }; @@ -377,7 +422,20 @@ */ int32_t free_pending_pkts_csm; - bus_dma_tag_t buf_dmatag; /* DMA params for mapping Tx mbufs */ + /* + * The head index of a singly linked list representing QPL page fragments + * to copy mbuf payload into for the NIC to see. Once this list is depleted, + * the "_prd" suffixed producer list, grown by the completion taskqueue, + * is stolen. + * + * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist. + */ + int32_t free_qpl_bufs_csm; + uint32_t qpl_bufs_consumed; /* Allows quickly checking for buf availability */ + uint32_t qpl_bufs_produced_cached; /* Cached value of qpl_bufs_produced */ + + /* DMA params for mapping Tx mbufs. Only used in RDA mode. 
*/ + bus_dma_tag_t buf_dmatag; } __aligned(CACHE_LINE_SIZE); /* Accessed when processing completions */ @@ -395,6 +453,18 @@ * its consumer list, with the "_csm" suffix, is depleted. */ int32_t free_pending_pkts_prd; + + /* + * The completion taskqueue moves the QPL pages corresponding to a + * completed packet into this list. It is only used in QPL mode. + * The "_prd" denotes that this is a producer list. The transmit + * taskqueue steals this list once its consumer list, with the "_csm" + * suffix, is depleted. + * + * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist. + */ + int32_t free_qpl_bufs_prd; + uint32_t qpl_bufs_produced; } __aligned(CACHE_LINE_SIZE); /* Accessed by both the completion and xmit loops */ @@ -402,6 +472,16 @@ /* completion tags index into this array */ struct gve_tx_pending_pkt_dqo *pending_pkts; uint16_t num_pending_pkts; + + /* + * Represents QPL page fragments. An index into this array + * always represents the same QPL page fragment. The value + * is also an index into this array and serves as a means + * to chain buffers into linked lists whose heads are + * either free_qpl_bufs_prd or free_qpl_bufs_csm or + * qpl_bufs_head. 
+ */ + int32_t *qpl_bufs; } __aligned(CACHE_LINE_SIZE); } dqo; }; @@ -531,6 +611,13 @@ return (priv->queue_format == GVE_GQI_QPL_FORMAT); } +static inline bool +gve_is_qpl(struct gve_priv *priv) +{ + return (priv->queue_format == GVE_GQI_QPL_FORMAT || + priv->queue_format == GVE_DQO_QPL_FORMAT); +} + /* Defined in gve_main.c */ void gve_schedule_reset(struct gve_priv *priv); @@ -545,6 +632,7 @@ void gve_free_qpls(struct gve_priv *priv); int gve_register_qpls(struct gve_priv *priv); int gve_unregister_qpls(struct gve_priv *priv); +void gve_mextadd_free(struct mbuf *mbuf); /* TX functions defined in gve_tx.c */ int gve_alloc_tx_rings(struct gve_priv *priv); @@ -563,6 +651,7 @@ void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i); int gve_tx_intr_dqo(void *arg); int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr); +int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf); void gve_tx_cleanup_tq_dqo(void *arg, int pending); /* RX functions defined in gve_rx.c */ diff --git a/sys/dev/gve/gve_adminq.h b/sys/dev/gve/gve_adminq.h --- a/sys/dev/gve/gve_adminq.h +++ b/sys/dev/gve/gve_adminq.h @@ -144,6 +144,15 @@ _Static_assert(sizeof(struct gve_device_option_dqo_rda) == 8, "gve: bad admin queue struct length"); +struct gve_device_option_dqo_qpl { + __be32 supported_features_mask; + __be16 tx_comp_ring_entries; + __be16 rx_buff_ring_entries; +}; + +_Static_assert(sizeof(struct gve_device_option_dqo_qpl) == 8, + "gve: bad admin queue struct length"); + struct gve_device_option_modify_ring { __be32 supported_features_mask; __be16 max_rx_ring_size; @@ -168,6 +177,7 @@ GVE_DEV_OPT_ID_GQI_QPL = 0x3, GVE_DEV_OPT_ID_DQO_RDA = 0x4, GVE_DEV_OPT_ID_MODIFY_RING = 0x6, + GVE_DEV_OPT_ID_DQO_QPL = 0x7, GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8, }; @@ -182,6 +192,7 @@ GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 
0x0, GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0, }; @@ -196,7 +207,7 @@ enum gve_driver_capability { gve_driver_capability_gqi_qpl = 0, gve_driver_capability_gqi_rda = 1, - gve_driver_capability_dqo_qpl = 2, /* reserved for future use */ + gve_driver_capability_dqo_qpl = 2, gve_driver_capability_dqo_rda = 3, }; @@ -212,6 +223,7 @@ */ #define GVE_DRIVER_CAPABILITY_FLAGS1 \ (GVE_CAP1(gve_driver_capability_gqi_qpl) | \ + GVE_CAP1(gve_driver_capability_dqo_qpl) | \ GVE_CAP1(gve_driver_capability_dqo_rda)) #define GVE_DRIVER_CAPABILITY_FLAGS2 0x0 #define GVE_DRIVER_CAPABILITY_FLAGS3 0x0 diff --git a/sys/dev/gve/gve_adminq.c b/sys/dev/gve/gve_adminq.c --- a/sys/dev/gve/gve_adminq.c +++ b/sys/dev/gve/gve_adminq.c @@ -58,6 +58,7 @@ struct gve_device_option *option, struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, struct gve_device_option_dqo_rda **dev_op_dqo_rda, + struct gve_device_option_dqo_qpl **dev_op_dqo_qpl, struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) { uint32_t req_feat_mask = be32toh(option->required_features_mask); @@ -103,6 +104,23 @@ *dev_op_dqo_rda = (void *)(option + 1); break; + case GVE_DEV_OPT_ID_DQO_QPL: + if (option_length < sizeof(**dev_op_dqo_qpl) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL) { + device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "DQO QPL", (int)sizeof(**dev_op_dqo_qpl), + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_dqo_qpl)) { + device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT, + "DQO QPL"); + } + *dev_op_dqo_qpl = (void *)(option + 1); + break; + case GVE_DEV_OPT_ID_JUMBO_FRAMES: if (option_length < sizeof(**dev_op_jumbo_frames) || req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES) { @@ -136,6 +154,7 @@ struct gve_device_descriptor *descriptor, struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, struct gve_device_option_dqo_rda **dev_op_dqo_rda, + struct gve_device_option_dqo_qpl **dev_op_dqo_qpl, struct 
gve_device_option_jumbo_frames **dev_op_jumbo_frames) { char *desc_end = (char *)descriptor + be16toh(descriptor->total_length); @@ -154,7 +173,10 @@ } gve_parse_device_option(priv, descriptor, dev_opt, - dev_op_gqi_qpl, dev_op_dqo_rda, dev_op_jumbo_frames); + dev_op_gqi_qpl, + dev_op_dqo_rda, + dev_op_dqo_qpl, + dev_op_jumbo_frames); dev_opt = (void *)((char *)(dev_opt + 1) + be16toh(dev_opt->option_length)); } @@ -387,6 +409,7 @@ struct gve_dma_handle desc_mem; struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL; struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL; + struct gve_device_option_dqo_qpl *dev_op_dqo_qpl = NULL; struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL; uint32_t supported_features_mask = 0; int rc; @@ -416,7 +439,9 @@ bus_dmamap_sync(desc_mem.tag, desc_mem.map, BUS_DMASYNC_POSTREAD); rc = gve_process_device_options(priv, desc, - &dev_op_gqi_qpl, &dev_op_dqo_rda, + &dev_op_gqi_qpl, + &dev_op_dqo_rda, + &dev_op_dqo_qpl, &dev_op_jumbo_frames); if (rc != 0) goto free_device_descriptor; @@ -430,6 +455,15 @@ if (bootverbose) device_printf(priv->dev, "Driver is running with DQO RDA queue format.\n"); + } else if (dev_op_dqo_qpl != NULL) { + snprintf(gve_queue_format, sizeof(gve_queue_format), + "%s", "DQO QPL"); + priv->queue_format = GVE_DQO_QPL_FORMAT; + supported_features_mask = be32toh( + dev_op_dqo_qpl->supported_features_mask); + if (bootverbose) + device_printf(priv->dev, + "Driver is running with DQO QPL queue format.\n"); } else if (dev_op_gqi_qpl != NULL) { snprintf(gve_queue_format, sizeof(gve_queue_format), "%s", "GQI QPL"); diff --git a/sys/dev/gve/gve_dqo.h b/sys/dev/gve/gve_dqo.h --- a/sys/dev/gve/gve_dqo.h +++ b/sys/dev/gve/gve_dqo.h @@ -57,7 +57,22 @@ * Start dropping RX fragments if at least these many * buffers cannot be posted to the NIC. 
*/ -#define GVE_RX_DQO_MIN_PENDING_BUFS 32 +#define GVE_RX_DQO_MIN_PENDING_BUFS 128 + +#define GVE_DQ_NUM_FRAGS_IN_PAGE (PAGE_SIZE / GVE_DEFAULT_RX_BUFFER_SIZE) + +/* + * gve_rx_qpl_buf_id_dqo's 11 bit wide buf_id field limits the total + * number of pages per QPL to 2048. + */ +#define GVE_RX_NUM_QPL_PAGES_DQO 2048 + +/* 2K TX buffers for DQO-QPL */ +#define GVE_TX_BUF_SHIFT_DQO 11 +#define GVE_TX_BUF_SIZE_DQO BIT(GVE_TX_BUF_SHIFT_DQO) +#define GVE_TX_BUFS_PER_PAGE_DQO (PAGE_SIZE >> GVE_TX_BUF_SHIFT_DQO) + +#define GVE_TX_NUM_QPL_PAGES_DQO 512 /* Basic TX descriptor (DTYPE 0x0C) */ struct gve_tx_pkt_desc_dqo { diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c --- a/sys/dev/gve/gve_main.c +++ b/sys/dev/gve/gve_main.c @@ -32,9 +32,9 @@ #include "gve_adminq.h" #include "gve_dqo.h" -#define GVE_DRIVER_VERSION "GVE-FBSD-1.2.0\n" +#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.0\n" #define GVE_VERSION_MAJOR 1 -#define GVE_VERSION_MINOR 2 +#define GVE_VERSION_MINOR 3 #define GVE_VERSION_SUB 0 #define GVE_DEFAULT_RX_COPYBREAK 256 @@ -125,7 +125,7 @@ if (if_getcapenable(ifp) & IFCAP_TSO6) if_sethwassistbits(ifp, CSUM_IP6_TSO, 0); - if (gve_is_gqi(priv)) { + if (gve_is_qpl(priv)) { err = gve_register_qpls(priv); if (err != 0) goto reset; @@ -177,7 +177,7 @@ if (gve_destroy_tx_rings(priv) != 0) goto reset; - if (gve_is_gqi(priv)) { + if (gve_is_qpl(priv)) { if (gve_unregister_qpls(priv) != 0) goto reset; } @@ -375,13 +375,15 @@ /* * Set TSO limits, must match the arguments to bus_dma_tag_create - * when creating tx->dqo.buf_dmatag + * when creating tx->dqo.buf_dmatag. Only applies to the RDA mode + * because in QPL we copy the entire packet into the bounce buffer + * and thus it does not matter how fragmented the mbuf is. 
*/ - if (!gve_is_gqi(priv)) { - if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO); + if (!gve_is_gqi(priv) && !gve_is_qpl(priv)) { if_sethwtsomaxsegcount(ifp, GVE_TX_MAX_DATA_DESCS_DQO); if_sethwtsomaxsegsize(ifp, GVE_TX_MAX_BUF_SIZE_DQO); } + if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO); #if __FreeBSD_version >= 1400086 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); @@ -465,7 +467,7 @@ gve_free_irqs(priv); gve_free_tx_rings(priv); gve_free_rx_rings(priv); - if (gve_is_gqi(priv)) + if (gve_is_qpl(priv)) gve_free_qpls(priv); } @@ -474,7 +476,7 @@ { int err; - if (gve_is_gqi(priv)) { + if (gve_is_qpl(priv)) { err = gve_alloc_qpls(priv); if (err != 0) goto abort; diff --git a/sys/dev/gve/gve_qpl.c b/sys/dev/gve/gve_qpl.c --- a/sys/dev/gve/gve_qpl.c +++ b/sys/dev/gve/gve_qpl.c @@ -32,13 +32,14 @@ #include "gve.h" #include "gve_adminq.h" +#include "gve_dqo.h" static MALLOC_DEFINE(M_GVE_QPL, "gve qpl", "gve qpl allocations"); static uint32_t gve_num_tx_qpls(struct gve_priv *priv) { - if (priv->queue_format != GVE_GQI_QPL_FORMAT) + if (!gve_is_qpl(priv)) return (0); return (priv->tx_cfg.max_queues); @@ -47,7 +48,7 @@ static uint32_t gve_num_rx_qpls(struct gve_priv *priv) { - if (priv->queue_format != GVE_GQI_QPL_FORMAT) + if (!gve_is_qpl(priv)) return (0); return (priv->rx_cfg.max_queues); @@ -189,6 +190,7 @@ int gve_alloc_qpls(struct gve_priv *priv) { int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int num_pages; int err; int i; @@ -198,15 +200,19 @@ priv->qpls = malloc(num_qpls * sizeof(*priv->qpls), M_GVE_QPL, M_WAITOK | M_ZERO); + num_pages = gve_is_gqi(priv) ? + priv->tx_desc_cnt / GVE_QPL_DIVISOR : + GVE_TX_NUM_QPL_PAGES_DQO; for (i = 0; i < gve_num_tx_qpls(priv); i++) { - err = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR, + err = gve_alloc_qpl(priv, i, num_pages, /*single_kva=*/true); if (err != 0) goto abort; } + num_pages = gve_is_gqi(priv) ? 
priv->rx_desc_cnt : GVE_RX_NUM_QPL_PAGES_DQO; for (; i < num_qpls; i++) { - err = gve_alloc_qpl(priv, i, priv->rx_desc_cnt, /*single_kva=*/false); + err = gve_alloc_qpl(priv, i, num_pages, /*single_kva=*/false); if (err != 0) goto abort; } @@ -283,3 +289,21 @@ gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); return (0); } + +void +gve_mextadd_free(struct mbuf *mbuf) +{ + vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1; + vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2; + + /* + * Free the page only if this is the last ref. + * The interface might no longer exist by the time + * this callback is called, see gve_free_qpl. + */ + if (__predict_false(vm_page_unwire_noq(page))) { + pmap_qremove(va, 1); + kva_free(va, PAGE_SIZE); + vm_page_free(page); + } +} diff --git a/sys/dev/gve/gve_rx.c b/sys/dev/gve/gve_rx.c --- a/sys/dev/gve/gve_rx.c +++ b/sys/dev/gve/gve_rx.c @@ -409,24 +409,6 @@ } } -static void -gve_mextadd_free(struct mbuf *mbuf) -{ - vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1; - vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2; - - /* - * Free the page only if this is the last ref. - * The interface might no longer exist by the time - * this callback is called, see gve_free_qpl. 
- */ - if (__predict_false(vm_page_unwire_noq(page))) { - pmap_qremove(va, 1); - kva_free(va, PAGE_SIZE); - vm_page_free(page); - } -} - static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr) { diff --git a/sys/dev/gve/gve_rx_dqo.c b/sys/dev/gve/gve_rx_dqo.c --- a/sys/dev/gve/gve_rx_dqo.c +++ b/sys/dev/gve/gve_rx_dqo.c @@ -38,6 +38,9 @@ struct gve_rx_buf_dqo *buf; int i; + if (gve_is_qpl(rx->com.priv)) + return; + for (i = 0; i < rx->dqo.buf_cnt; i++) { buf = &rx->dqo.bufs[i]; if (!buf->mbuf) @@ -70,7 +73,7 @@ if (rx->dqo.bufs != NULL) { gve_free_rx_mbufs_dqo(rx); - if (rx->dqo.buf_dmatag) { + if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) { for (j = 0; j < rx->dqo.buf_cnt; j++) if (rx->dqo.bufs[j].mapped) bus_dmamap_destroy(rx->dqo.buf_dmatag, @@ -81,7 +84,7 @@ rx->dqo.bufs = NULL; } - if (rx->dqo.buf_dmatag) + if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) bus_dma_tag_destroy(rx->dqo.buf_dmatag); } @@ -103,6 +106,31 @@ rx->dqo.desc_ring = rx->desc_ring_mem.cpu_addr; rx->dqo.mask = priv->rx_desc_cnt - 1; + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc compl ring for rx ring %d", i); + goto abort; + } + rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr; + rx->dqo.mask = priv->rx_desc_cnt - 1; + + rx->dqo.buf_cnt = gve_is_qpl(priv) ? 
GVE_RX_NUM_QPL_PAGES_DQO : + priv->rx_desc_cnt; + rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo), + M_GVE, M_WAITOK | M_ZERO); + + if (gve_is_qpl(priv)) { + rx->com.qpl = &priv->qpls[priv->tx_cfg.max_queues + i]; + if (rx->com.qpl == NULL) { + device_printf(priv->dev, "No QPL left for rx ring %d", i); + return (ENOMEM); + } + return (0); + } + err = bus_dma_tag_create( bus_get_dma_tag(priv->dev), /* parent */ 1, 0, /* alignment, bounds */ @@ -123,9 +151,6 @@ goto abort; } - rx->dqo.buf_cnt = priv->rx_desc_cnt; - rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo), - M_GVE, M_WAITOK | M_ZERO); for (j = 0; j < rx->dqo.buf_cnt; j++) { err = bus_dmamap_create(rx->dqo.buf_dmatag, 0, &rx->dqo.bufs[j].dmamap); @@ -138,17 +163,6 @@ rx->dqo.bufs[j].mapped = true; } - err = gve_dma_alloc_coherent(priv, - sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt, - CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem); - if (err != 0) { - device_printf(priv->dev, - "Failed to alloc compl ring for rx ring %d", i); - goto abort; - } - rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr; - rx->dqo.mask = priv->rx_desc_cnt - 1; - return (0); abort: @@ -202,10 +216,36 @@ gve_free_rx_mbufs_dqo(rx); - SLIST_INIT(&rx->dqo.free_bufs); - for (j = 0; j < rx->dqo.buf_cnt; j++) - SLIST_INSERT_HEAD(&rx->dqo.free_bufs, - &rx->dqo.bufs[j], slist_entry); + if (gve_is_qpl(priv)) { + SLIST_INIT(&rx->dqo.free_bufs); + STAILQ_INIT(&rx->dqo.used_bufs); + + for (j = 0; j < rx->dqo.buf_cnt; j++) { + struct gve_rx_buf_dqo *buf = &rx->dqo.bufs[j]; + + vm_page_t page = rx->com.qpl->pages[buf - rx->dqo.bufs]; + u_int ref_count = atomic_load_int(&page->ref_count); + + /* + * An ifconfig down+up might see pages still in flight + * from the previous innings. 
+ */ + if (VPRC_WIRE_COUNT(ref_count) == 1) + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, + buf, slist_entry); + else + STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, + buf, stailq_entry); + + buf->num_nic_frags = 0; + buf->next_idx = 0; + } + } else { + SLIST_INIT(&rx->dqo.free_bufs); + for (j = 0; j < rx->dqo.buf_cnt; j++) + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, + &rx->dqo.bufs[j], slist_entry); + } } int @@ -223,6 +263,20 @@ return (FILTER_HANDLED); } +static void +gve_rx_advance_head_dqo(struct gve_rx_ring *rx) +{ + rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask; + rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */ + + if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) { + bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); + gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset, + rx->dqo.head); + } +} + static void gve_rx_post_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf) { @@ -235,15 +289,7 @@ desc->buf_id = htole16(buf - rx->dqo.bufs); desc->buf_addr = htole64(buf->addr); - rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask; - rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */ - - if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) { - bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, - BUS_DMASYNC_PREWRITE); - gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset, - rx->dqo.head); - } + gve_rx_advance_head_dqo(rx); } static int @@ -294,6 +340,103 @@ return (err); } +static struct gve_dma_handle * +gve_get_page_dma_handle(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf) +{ + return (&(rx->com.qpl->dmas[buf - rx->dqo.bufs])); +} + +static void +gve_rx_post_qpl_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf, + uint8_t frag_num) +{ + struct gve_rx_desc_dqo *desc = &rx->dqo.desc_ring[rx->dqo.head]; + union gve_rx_qpl_buf_id_dqo composed_id; + struct gve_dma_handle *page_dma_handle; + + composed_id.buf_id = buf - rx->dqo.bufs; + composed_id.frag_num = frag_num; + 
desc->buf_id = htole16(composed_id.all); + + page_dma_handle = gve_get_page_dma_handle(rx, buf); + bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, + BUS_DMASYNC_PREREAD); + desc->buf_addr = htole64(page_dma_handle->bus_addr + + frag_num * GVE_DEFAULT_RX_BUFFER_SIZE); + + buf->num_nic_frags++; + gve_rx_advance_head_dqo(rx); +} + +static void +gve_rx_maybe_extract_from_used_bufs(struct gve_rx_ring *rx, bool just_one) +{ + struct gve_rx_buf_dqo *hol_blocker = NULL; + struct gve_rx_buf_dqo *buf; + u_int ref_count; + vm_page_t page; + + while (true) { + buf = STAILQ_FIRST(&rx->dqo.used_bufs); + if (__predict_false(buf == NULL)) + break; + + page = rx->com.qpl->pages[buf - rx->dqo.bufs]; + ref_count = atomic_load_int(&page->ref_count); + + if (VPRC_WIRE_COUNT(ref_count) != 1) { + /* Account for one head-of-line blocker */ + if (hol_blocker != NULL) + break; + hol_blocker = buf; + STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs, + stailq_entry); + continue; + } + + STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs, + stailq_entry); + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, + buf, slist_entry); + if (just_one) + break; + } + + if (hol_blocker != NULL) + STAILQ_INSERT_HEAD(&rx->dqo.used_bufs, + hol_blocker, stailq_entry); +} + +static int +gve_rx_post_new_dqo_qpl_buf(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_dqo *buf; + + buf = SLIST_FIRST(&rx->dqo.free_bufs); + if (__predict_false(buf == NULL)) { + gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/true); + buf = SLIST_FIRST(&rx->dqo.free_bufs); + if (__predict_false(buf == NULL)) + return (ENOBUFS); + } + + gve_rx_post_qpl_buf_dqo(rx, buf, buf->next_idx); + if (buf->next_idx == GVE_DQ_NUM_FRAGS_IN_PAGE - 1) + buf->next_idx = 0; + else + buf->next_idx++; + + /* + * We have posted all the frags in this buf to the NIC. + * - buf will enter used_bufs once the last completion arrives. + * - It will re-enter free_bufs in gve_rx_maybe_extract_from_used_bufs + * when its wire count drops back to 1. 
+ */ + if (buf->next_idx == 0) + SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry); + return (0); +} + static void gve_rx_post_buffers_dqo(struct gve_rx_ring *rx, int how) { @@ -306,7 +449,10 @@ num_to_post = rx->dqo.mask - num_pending_bufs; for (i = 0; i < num_to_post; i++) { - err = gve_rx_post_new_mbuf_dqo(rx, how); + if (gve_is_qpl(rx->com.priv)) + err = gve_rx_post_new_dqo_qpl_buf(rx); + else + err = gve_rx_post_new_mbuf_dqo(rx, how); if (err) break; } @@ -427,7 +573,7 @@ } static int -gve_rx_copybreak_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf, +gve_rx_copybreak_dqo(struct gve_rx_ring *rx, void *va, struct gve_rx_compl_desc_dqo *compl_desc, uint16_t frag_len) { struct mbuf *mbuf; @@ -440,14 +586,13 @@ counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1); counter_exit(); - m_copyback(mbuf, 0, frag_len, mtod(buf->mbuf, char*)); + m_copyback(mbuf, 0, frag_len, va); mbuf->m_len = frag_len; rx->ctx.mbuf_head = mbuf; rx->ctx.mbuf_tail = mbuf; rx->ctx.total_size += frag_len; - gve_rx_post_buf_dqo(rx, buf); gve_rx_input_mbuf_dqo(rx, compl_desc); return (0); } @@ -495,10 +640,12 @@ frag_len = compl_desc->packet_len; if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) { - err = gve_rx_copybreak_dqo(rx, buf, compl_desc, frag_len); + err = gve_rx_copybreak_dqo(rx, mtod(buf->mbuf, char*), + compl_desc, frag_len); if (__predict_false(err != 0)) goto drop_frag; (*work_done)++; + gve_rx_post_buf_dqo(rx, buf); return; } @@ -579,6 +726,233 @@ rx->ctx = (struct gve_rx_ctx){}; } +static void * +gve_get_cpu_addr_for_qpl_buf(struct gve_rx_ring *rx, + struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num) +{ + int page_idx = buf - rx->dqo.bufs; + void *va = rx->com.qpl->dmas[page_idx].cpu_addr; + + va = (char *)va + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE); + return (va); +} + +static int +gve_rx_add_clmbuf_to_ctx(struct gve_rx_ring *rx, + struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf, + uint8_t buf_frag_num, uint16_t frag_len) +{ + void *va = 
gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num); + struct mbuf *mbuf; + + if (ctx->mbuf_tail == NULL) { + mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); + if (mbuf == NULL) + return (ENOMEM); + ctx->mbuf_head = mbuf; + ctx->mbuf_tail = mbuf; + } else { + mbuf = m_getcl(M_NOWAIT, MT_DATA, 0); + if (mbuf == NULL) + return (ENOMEM); + ctx->mbuf_tail->m_next = mbuf; + ctx->mbuf_tail = mbuf; + } + + mbuf->m_len = frag_len; + ctx->total_size += frag_len; + + m_copyback(mbuf, 0, frag_len, va); + counter_enter(); + counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1); + counter_exit(); + return (0); +} + +static int +gve_rx_add_extmbuf_to_ctx(struct gve_rx_ring *rx, + struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf, + uint8_t buf_frag_num, uint16_t frag_len) +{ + struct mbuf *mbuf; + void *page_addr; + vm_page_t page; + int page_idx; + void *va; + + if (ctx->mbuf_tail == NULL) { + mbuf = m_gethdr(M_NOWAIT, MT_DATA); + if (mbuf == NULL) + return (ENOMEM); + ctx->mbuf_head = mbuf; + ctx->mbuf_tail = mbuf; + } else { + mbuf = m_get(M_NOWAIT, MT_DATA); + if (mbuf == NULL) + return (ENOMEM); + ctx->mbuf_tail->m_next = mbuf; + ctx->mbuf_tail = mbuf; + } + + mbuf->m_len = frag_len; + ctx->total_size += frag_len; + + page_idx = buf - rx->dqo.bufs; + page = rx->com.qpl->pages[page_idx]; + page_addr = rx->com.qpl->dmas[page_idx].cpu_addr; + va = (char *)page_addr + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE); + + /* + * Grab an extra ref to the page so that gve_mextadd_free + * does not end up freeing the page while the interface exists. 
+ */ + vm_page_wire(page); + + counter_enter(); + counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1); + counter_exit(); + + MEXTADD(mbuf, va, frag_len, + gve_mextadd_free, page, page_addr, + 0, EXT_NET_DRV); + return (0); +} + +static void +gve_rx_dqo_qpl(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_compl_desc_dqo *compl_desc, + int *work_done) +{ + bool is_last_frag = compl_desc->end_of_packet != 0; + union gve_rx_qpl_buf_id_dqo composed_id; + struct gve_dma_handle *page_dma_handle; + struct gve_rx_ctx *ctx = &rx->ctx; + struct gve_rx_buf_dqo *buf; + uint32_t num_pending_bufs; + uint8_t buf_frag_num; + uint16_t frag_len; + uint16_t buf_id; + int err; + + composed_id.all = le16toh(compl_desc->buf_id); + buf_id = composed_id.buf_id; + buf_frag_num = composed_id.frag_num; + + if (__predict_false(buf_id >= rx->dqo.buf_cnt)) { + device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n", + buf_id, rx->com.id); + gve_schedule_reset(priv); + goto drop_frag_clear_ctx; + } + buf = &rx->dqo.bufs[buf_id]; + if (__predict_false(buf->num_nic_frags == 0 || + buf_frag_num > GVE_DQ_NUM_FRAGS_IN_PAGE - 1)) { + device_printf(priv->dev, "Spurious compl for buf id %d on rxq %d " + "with buf_frag_num %d and num_nic_frags %d, issuing reset\n", + buf_id, rx->com.id, buf_frag_num, buf->num_nic_frags); + gve_schedule_reset(priv); + goto drop_frag_clear_ctx; + } + + buf->num_nic_frags--; + + if (__predict_false(ctx->drop_pkt)) + goto drop_frag; + + if (__predict_false(compl_desc->rx_error)) { + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); + counter_exit(); + goto drop_frag; + } + + page_dma_handle = gve_get_page_dma_handle(rx, buf); + bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, + BUS_DMASYNC_POSTREAD); + + frag_len = compl_desc->packet_len; + if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) { + void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num); + + err = 
gve_rx_copybreak_dqo(rx, va, compl_desc, frag_len); + if (__predict_false(err != 0)) + goto drop_frag; + (*work_done)++; + gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); + return; + } + + num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; + err = gve_rx_post_new_dqo_qpl_buf(rx); + if (__predict_false(err != 0 && + num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) { + /* + * Resort to copying this fragment into a cluster mbuf + * when the above threshold is breached and repost the + * incoming buffer. If we cannot find cluster mbufs, + * just drop the packet (to repost its buffer). + */ + err = gve_rx_add_clmbuf_to_ctx(rx, ctx, buf, + buf_frag_num, frag_len); + if (err != 0) { + counter_enter(); + counter_u64_add_protected( + rx->stats.rx_dropped_pkt_buf_post_fail, 1); + counter_exit(); + goto drop_frag; + } + gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); + } else { + err = gve_rx_add_extmbuf_to_ctx(rx, ctx, buf, + buf_frag_num, frag_len); + if (__predict_false(err != 0)) { + counter_enter(); + counter_u64_add_protected( + rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); + counter_exit(); + goto drop_frag; + } + } + + /* + * Both the counts need to be checked. + * + * num_nic_frags == 0 implies no pending completions + * but not all frags may have yet been posted. + * + * next_idx == 0 implies all frags have been posted + * but there might be pending completions. 
+	 */
+	if (buf->num_nic_frags == 0 && buf->next_idx == 0)
+		STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, buf, stailq_entry);
+
+	if (is_last_frag) {
+		gve_rx_input_mbuf_dqo(rx, compl_desc);
+		(*work_done)++;
+	}
+	return;
+
+drop_frag:
+	/* Clear the earlier frags if there were any */
+	m_freem(ctx->mbuf_head);
+	rx->ctx = (struct gve_rx_ctx){};
+	/* Drop the rest of the pkt if there are more frags */
+	ctx->drop_pkt = true;
+	/* Reuse the dropped frag's buffer */
+	gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num);
+
+	if (is_last_frag)
+		goto drop_frag_clear_ctx;
+	return;
+
+drop_frag_clear_ctx:
+	counter_enter();
+	counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1);
+	counter_exit();
+	m_freem(ctx->mbuf_head);
+	rx->ctx = (struct gve_rx_ctx){};
+}
+
 static bool
 gve_rx_cleanup_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, int budget)
 {
@@ -604,13 +978,18 @@
 		rx->dqo.tail = (rx->dqo.tail + 1) & rx->dqo.mask;
 		rx->dqo.cur_gen_bit ^= (rx->dqo.tail == 0);
 
-		gve_rx_dqo(priv, rx, compl_desc, &work_done);
+		if (gve_is_qpl(priv))
+			gve_rx_dqo_qpl(priv, rx, compl_desc, &work_done);
+		else
+			gve_rx_dqo(priv, rx, compl_desc, &work_done);
 	}
 
 	if (work_done != 0)
 		tcp_lro_flush_all(&rx->lro);
 
 	gve_rx_post_buffers_dqo(rx, M_NOWAIT);
+	if (gve_is_qpl(priv))
+		gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/false);
 	return (work_done == budget);
 }
 
diff --git a/sys/dev/gve/gve_sysctl.c b/sys/dev/gve/gve_sysctl.c
--- a/sys/dev/gve/gve_sysctl.c
+++ b/sys/dev/gve/gve_sysctl.c
@@ -83,6 +83,10 @@
 	    "rx_dropped_pkt_desc_err", CTLFLAG_RD,
 	    &stats->rx_dropped_pkt_desc_err,
 	    "Packets dropped due to descriptor error");
+	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
+	    "rx_dropped_pkt_buf_post_fail", CTLFLAG_RD,
+	    &stats->rx_dropped_pkt_buf_post_fail,
+	    "Packets dropped due to failure to post enough buffers");
 	SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO,
 	    "rx_dropped_pkt_mbuf_alloc_fail", CTLFLAG_RD,
 	    &stats->rx_dropped_pkt_mbuf_alloc_fail,
@@ -155,6 +159,10 @@
 	    "tx_delayed_pkt_nospace_compring", CTLFLAG_RD,
 	    &stats->tx_delayed_pkt_nospace_compring,
 	    "Packets delayed due to no space in comp ring");
+	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
+	    "tx_delayed_pkt_nospace_qpl_bufs", CTLFLAG_RD,
+	    &stats->tx_delayed_pkt_nospace_qpl_bufs,
+	    "Packets delayed due to not enough qpl bufs");
 	SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO,
 	    "tx_delayed_pkt_tsoerr", CTLFLAG_RD,
 	    &stats->tx_delayed_pkt_tsoerr,
diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c
--- a/sys/dev/gve/gve_tx.c
+++ b/sys/dev/gve/gve_tx.c
@@ -752,7 +752,10 @@
 			 * The reference is passed in so that in the case of
 			 * errors, the new mbuf chain is what's put back on the br.
 			 */
-			err = gve_xmit_dqo(tx, &mbuf);
+			if (gve_is_qpl(priv))
+				err = gve_xmit_dqo_qpl(tx, mbuf);
+			else
+				err = gve_xmit_dqo(tx, &mbuf);
 		}
 
 		if (__predict_false(err != 0 && mbuf != NULL)) {
diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c
--- a/sys/dev/gve/gve_tx_dqo.c
+++ b/sys/dev/gve/gve_tx_dqo.c
@@ -51,7 +51,12 @@
 		if (!pending_pkt->mbuf)
 			continue;
 
-		gve_unmap_packet(tx, pending_pkt);
+		if (gve_is_qpl(tx->com.priv)) {
+			pending_pkt->qpl_buf_head = -1;
+			pending_pkt->num_qpl_bufs = 0;
+		} else
+			gve_unmap_packet(tx, pending_pkt);
+
 		m_freem(pending_pkt->mbuf);
 		pending_pkt->mbuf = NULL;
 	}
@@ -76,7 +81,7 @@
 	if (tx->dqo.pending_pkts != NULL) {
 		gve_free_tx_mbufs_dqo(tx);
 
-		if (tx->dqo.buf_dmatag) {
+		if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) {
 			for (j = 0; j < tx->dqo.num_pending_pkts; j++)
 				if (tx->dqo.pending_pkts[j].state !=
 				    GVE_PACKET_STATE_UNALLOCATED)
@@ -88,8 +93,59 @@
 		tx->dqo.pending_pkts = NULL;
 	}
 
-	if (tx->dqo.buf_dmatag)
+	if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag)
 		bus_dma_tag_destroy(tx->dqo.buf_dmatag);
+
+	if (gve_is_qpl(priv) && tx->dqo.qpl_bufs != NULL) {
+		free(tx->dqo.qpl_bufs, M_GVE);
+		tx->dqo.qpl_bufs = NULL;
+	}
+}
+
+static int
+gve_tx_alloc_rda_fields_dqo(struct gve_tx_ring *tx)
+{
+	struct gve_priv *priv = tx->com.priv;
+	int err;
+	int j;
+
+	/*
+	 * DMA tag for mapping Tx mbufs
+	 * The maxsize, nsegments, and maxsegsize params should match
+	 * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c.
+	 */
+	err = bus_dma_tag_create(
+	    bus_get_dma_tag(priv->dev),	/* parent */
+	    1, 0,			/* alignment, bounds */
+	    BUS_SPACE_MAXADDR,		/* lowaddr */
+	    BUS_SPACE_MAXADDR,		/* highaddr */
+	    NULL, NULL,			/* filter, filterarg */
+	    GVE_TSO_MAXSIZE_DQO,	/* maxsize */
+	    GVE_TX_MAX_DATA_DESCS_DQO,	/* nsegments */
+	    GVE_TX_MAX_BUF_SIZE_DQO,	/* maxsegsize */
+	    BUS_DMA_ALLOCNOW,		/* flags */
+	    NULL,			/* lockfunc */
+	    NULL,			/* lockarg */
+	    &tx->dqo.buf_dmatag);
+	if (err != 0) {
+		device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n",
+		    __func__, err);
+		return (err);
+	}
+
+	for (j = 0; j < tx->dqo.num_pending_pkts; j++) {
+		err = bus_dmamap_create(tx->dqo.buf_dmatag, 0,
+		    &tx->dqo.pending_pkts[j].dmamap);
+		if (err != 0) {
+			device_printf(priv->dev,
+			    "err in creating pending pkt dmamap %d: %d\n",
+			    j, err);
+			return (err);
+		}
+		tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE;
+	}
+
+	return (0);
 }
 
 int
@@ -98,7 +154,6 @@
 	struct gve_tx_ring *tx = &priv->tx[i];
 	uint16_t num_pending_pkts;
 	int err;
-	int j;
 
 	/* Descriptor ring */
 	err = gve_dma_alloc_coherent(priv,
@@ -122,30 +177,6 @@
 	}
 	tx->dqo.compl_ring = tx->dqo.compl_ring_mem.cpu_addr;
 
-	/*
-	 * DMA tag for mapping Tx mbufs
-	 * The maxsize, nsegments, and maxsegsize params should match
-	 * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c.
-	 */
-	err = bus_dma_tag_create(
-	    bus_get_dma_tag(priv->dev),	/* parent */
-	    1, 0,			/* alignment, bounds */
-	    BUS_SPACE_MAXADDR,		/* lowaddr */
-	    BUS_SPACE_MAXADDR,		/* highaddr */
-	    NULL, NULL,			/* filter, filterarg */
-	    GVE_TSO_MAXSIZE_DQO,	/* maxsize */
-	    GVE_TX_MAX_DATA_DESCS_DQO,	/* nsegments */
-	    GVE_TX_MAX_BUF_SIZE_DQO,	/* maxsegsize */
-	    BUS_DMA_ALLOCNOW,		/* flags */
-	    NULL,			/* lockfunc */
-	    NULL,			/* lockarg */
-	    &tx->dqo.buf_dmatag);
-	if (err != 0) {
-		device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n",
-		    __func__, err);
-		goto abort;
-	}
-
 	/*
 	 * pending_pkts array
 	 *
@@ -167,18 +198,22 @@
 	    sizeof(struct gve_tx_pending_pkt_dqo) * num_pending_pkts,
 	    M_GVE, M_WAITOK | M_ZERO);
 
-	for (j = 0; j < tx->dqo.num_pending_pkts; j++) {
-		err = bus_dmamap_create(tx->dqo.buf_dmatag, 0,
-		    &tx->dqo.pending_pkts[j].dmamap);
-		if (err != 0) {
-			device_printf(priv->dev,
-			    "err in creating pending pkt dmamap %d: %d",
-			    j, err);
-			goto abort;
-		}
-		tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE;
-	}
+	if (gve_is_qpl(priv)) {
+		int qpl_buf_cnt;
 
+		tx->com.qpl = &priv->qpls[i];
+		qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO *
+		    tx->com.qpl->num_pages;
+
+		tx->dqo.qpl_bufs = malloc(
+		    sizeof(*tx->dqo.qpl_bufs) * qpl_buf_cnt,
+		    M_GVE, M_WAITOK | M_ZERO);
+	} else {
+		/* Propagate failure so the abort path tears the ring down. */
+		err = gve_tx_alloc_rda_fields_dqo(tx);
+		if (err != 0)
+			goto abort;
+	}
 	return (0);
 
 abort:
@@ -330,6 +365,44 @@
 	return (0);
 }
 
+static int
+gve_tx_fill_ctx_descs(struct gve_tx_ring *tx, struct mbuf *mbuf,
+    bool is_tso, uint32_t *desc_idx)
+{
+	struct gve_tx_general_context_desc_dqo *gen_desc;
+	struct gve_tx_tso_context_desc_dqo *tso_desc;
+	struct gve_tx_metadata_dqo metadata;
+	int header_len;
+	int err;
+
+	metadata = (struct gve_tx_metadata_dqo){0};
+	gve_extract_tx_metadata_dqo(mbuf, &metadata);
+
+	if (is_tso) {
+		err = gve_prep_tso(mbuf, &header_len);
+		if (__predict_false(err)) {
+			counter_enter();
+			counter_u64_add_protected(
+			    tx->stats.tx_delayed_pkt_tsoerr, 1);
+			counter_exit();
+			return (err);
+		}
+
+		tso_desc = &tx->dqo.desc_ring[*desc_idx].tso_ctx;
+		gve_tx_fill_tso_ctx_desc(tso_desc, mbuf, &metadata, header_len);
+
+		*desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask;
+		counter_enter();
+		counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
+		counter_exit();
+	}
+
+	gen_desc = &tx->dqo.desc_ring[*desc_idx].general_ctx;
+	gve_tx_fill_general_ctx_desc(gen_desc, &metadata);
+	*desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask;
+	return (0);
+}
+
 static int
 gve_map_mbuf_dqo(struct gve_tx_ring *tx,
     struct mbuf **mbuf, bus_dmamap_t dmamap,
@@ -495,18 +568,197 @@
 	}
 }
 
+static bool
+gve_tx_have_enough_qpl_bufs(struct gve_tx_ring *tx, int num_bufs)
+{
+	uint32_t available = tx->dqo.qpl_bufs_produced_cached -
+	    tx->dqo.qpl_bufs_consumed;
+
+	if (__predict_true(available >= num_bufs))
+		return (true);
+
+	tx->dqo.qpl_bufs_produced_cached = atomic_load_acq_32(
+	    &tx->dqo.qpl_bufs_produced);
+	available = tx->dqo.qpl_bufs_produced_cached -
+	    tx->dqo.qpl_bufs_consumed;
+
+	if (__predict_true(available >= num_bufs))
+		return (true);
+	return (false);
+}
+
+static int32_t
+gve_tx_alloc_qpl_buf(struct gve_tx_ring *tx)
+{
+	int32_t buf = tx->dqo.free_qpl_bufs_csm;
+
+	if (__predict_false(buf == -1)) {
+		tx->dqo.free_qpl_bufs_csm = atomic_swap_32(
+		    &tx->dqo.free_qpl_bufs_prd, -1);
+		buf = tx->dqo.free_qpl_bufs_csm;
+		if (__predict_false(buf == -1))
+			return (-1);
+	}
+
+	tx->dqo.free_qpl_bufs_csm = tx->dqo.qpl_bufs[buf];
+	tx->dqo.qpl_bufs_consumed++;
+	return (buf);
+}
+
+/*
+ * Tx buffer i corresponds to
+ * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO
+ * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO
+ */
+static void
+gve_tx_buf_get_addr_dqo(struct gve_tx_ring *tx,
+    int32_t index, void **va, bus_addr_t *dma_addr)
+{
+	int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO);
+	int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) <<
+	    GVE_TX_BUF_SHIFT_DQO;
+
+	*va = (char *)tx->com.qpl->dmas[page_id].cpu_addr + offset;
+	*dma_addr = tx->com.qpl->dmas[page_id].bus_addr + offset;
+}
+
+static struct gve_dma_handle *
+gve_get_page_dma_handle(struct gve_tx_ring *tx, int32_t index)
+{
+	int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO);
+
+	return (&tx->com.qpl->dmas[page_id]);
+}
+
+static void
+gve_tx_copy_mbuf_and_write_pkt_descs(struct gve_tx_ring *tx,
+    struct mbuf *mbuf, struct gve_tx_pending_pkt_dqo *pkt,
+    bool csum_enabled, int16_t completion_tag,
+    uint32_t *desc_idx)
+{
+	int32_t pkt_len = mbuf->m_pkthdr.len;
+	struct gve_dma_handle *dma;
+	uint32_t copy_offset = 0;
+	int32_t prev_buf = -1;
+	uint32_t copy_len;
+	bus_addr_t addr;
+	int32_t buf;
+	void *va;
+
+	MPASS(pkt->num_qpl_bufs == 0);
+	MPASS(pkt->qpl_buf_head == -1);
+
+	while (copy_offset < pkt_len) {
+		buf = gve_tx_alloc_qpl_buf(tx);
+		/* We already checked for availability */
+		MPASS(buf != -1);
+
+		gve_tx_buf_get_addr_dqo(tx, buf, &va, &addr);
+		copy_len = MIN(GVE_TX_BUF_SIZE_DQO, pkt_len - copy_offset);
+		m_copydata(mbuf, copy_offset, copy_len, va);
+		copy_offset += copy_len;
+
+		dma = gve_get_page_dma_handle(tx, buf);
+		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE);
+
+		gve_tx_fill_pkt_desc_dqo(tx, desc_idx,
+		    copy_len, addr, completion_tag,
+		    /*eop=*/copy_offset == pkt_len,
+		    csum_enabled);
+
+		/* Link all the qpl bufs for a packet */
+		if (prev_buf == -1)
+			pkt->qpl_buf_head = buf;
+		else
+			tx->dqo.qpl_bufs[prev_buf] = buf;
+
+		prev_buf = buf;
+		pkt->num_qpl_bufs++;
+	}
+
+	tx->dqo.qpl_bufs[buf] = -1;
+}
+
+int
+gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf)
+{
+	uint32_t desc_idx = tx->dqo.desc_tail;
+	struct gve_tx_pending_pkt_dqo *pkt;
+	int total_descs_needed;
+	int16_t completion_tag;
+	bool has_csum_flag;
+	int csum_flags;
+	bool is_tso;
+	int nsegs;
+	int err;
+
+	csum_flags = mbuf->m_pkthdr.csum_flags;
+	has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP |
+	    CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO);
+	is_tso = csum_flags & CSUM_TSO;
+
+	nsegs = howmany(mbuf->m_pkthdr.len, GVE_TX_BUF_SIZE_DQO);
+	/* Check if we have enough room in the desc ring */
+	total_descs_needed = 1 +     /* general_ctx_desc */
+	    nsegs +		     /* pkt_desc */
+	    (is_tso ? 1 : 0);        /* tso_ctx_desc */
+	if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed)))
+		return (ENOBUFS);
+
+	if (!gve_tx_have_enough_qpl_bufs(tx, nsegs)) {
+		counter_enter();
+		counter_u64_add_protected(
+		    tx->stats.tx_delayed_pkt_nospace_qpl_bufs, 1);
+		counter_exit();
+		return (ENOBUFS);
+	}
+
+	pkt = gve_alloc_pending_packet(tx);
+	if (pkt == NULL) {
+		counter_enter();
+		counter_u64_add_protected(
+		    tx->stats.tx_delayed_pkt_nospace_compring, 1);
+		counter_exit();
+		return (ENOBUFS);
+	}
+	completion_tag = pkt - tx->dqo.pending_pkts;
+	pkt->mbuf = mbuf;
+
+	err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx);
+	if (err)
+		goto abort;
+
+	gve_tx_copy_mbuf_and_write_pkt_descs(tx, mbuf, pkt,
+	    has_csum_flag, completion_tag, &desc_idx);
+
+	/* Remember the index of the last desc written */
+	tx->dqo.desc_tail = desc_idx;
+
+	/*
+	 * Request a descriptor completion on the last descriptor of the
+	 * packet if we are allowed to by the HW enforced interval.
+	 */
+	gve_tx_request_desc_compl(tx, desc_idx);
+
+	tx->req += total_descs_needed; /* tx->req is just a sysctl counter */
+	return (0);
+
+abort:
+	pkt->mbuf = NULL;
+	gve_free_pending_packet(tx, pkt);
+	return (err);
+}
+
 int
 gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr)
 {
 	bus_dma_segment_t segs[GVE_TX_MAX_DATA_DESCS_DQO];
 	uint32_t desc_idx = tx->dqo.desc_tail;
-	struct gve_tx_metadata_dqo metadata;
 	struct gve_tx_pending_pkt_dqo *pkt;
 	struct mbuf *mbuf = *mbuf_ptr;
 	int total_descs_needed;
 	int16_t completion_tag;
 	bool has_csum_flag;
-	int header_len;
 	int csum_flags;
 	bool is_tso;
 	int nsegs;
@@ -556,34 +808,11 @@
 		goto abort_with_dma;
 	}
 
-	bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE);
-
-	metadata = (struct gve_tx_metadata_dqo){0};
-	gve_extract_tx_metadata_dqo(mbuf, &metadata);
-
-	if (is_tso) {
-		err = gve_prep_tso(mbuf, &header_len);
-		if (__predict_false(err)) {
-			counter_enter();
-			counter_u64_add_protected(
-			    tx->stats.tx_delayed_pkt_tsoerr, 1);
-			counter_exit();
-			goto abort_with_dma;
-		}
-
-		gve_tx_fill_tso_ctx_desc(&tx->dqo.desc_ring[desc_idx].tso_ctx,
-		    mbuf, &metadata, header_len);
-		desc_idx = (desc_idx + 1) & tx->dqo.desc_mask;
-
-		counter_enter();
-		counter_u64_add_protected(tx->stats.tso_packet_cnt, 1);
-		counter_exit();
-	}
-
-	gve_tx_fill_general_ctx_desc(&tx->dqo.desc_ring[desc_idx].general_ctx,
-	    &metadata);
-	desc_idx = (desc_idx + 1) & tx->dqo.desc_mask;
+	err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx);
+	if (err)
+		goto abort_with_dma;
 
+	bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE);
 	for (i = 0; i < nsegs; i++) {
 		gve_tx_fill_pkt_desc_dqo(tx, &desc_idx,
 		    segs[i].ds_len, segs[i].ds_addr,
@@ -611,6 +840,39 @@
 	return (err);
 }
 
+static void
+gve_reap_qpl_bufs_dqo(struct gve_tx_ring *tx,
+    struct gve_tx_pending_pkt_dqo *pkt)
+{
+	int32_t buf = pkt->qpl_buf_head;
+	struct gve_dma_handle *dma;
+	int32_t qpl_buf_tail;
+	int32_t old_head;
+	int i;
+
+	for (i = 0; i < pkt->num_qpl_bufs; i++) {
+		dma = gve_get_page_dma_handle(tx, buf);
+		bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTWRITE);
+		qpl_buf_tail = buf;
+		buf = tx->dqo.qpl_bufs[buf];
+	}
+	MPASS(buf == -1);
+	buf = qpl_buf_tail;
+
+	while (true) {
+		old_head = atomic_load_32(&tx->dqo.free_qpl_bufs_prd);
+		tx->dqo.qpl_bufs[buf] = old_head;
+
+		if (atomic_cmpset_rel_32(&tx->dqo.free_qpl_bufs_prd,
+		    old_head, pkt->qpl_buf_head))
+			break;
+	}
+	atomic_add_rel_32(&tx->dqo.qpl_bufs_produced, pkt->num_qpl_bufs);
+
+	pkt->qpl_buf_head = -1;
+	pkt->num_qpl_bufs = 0;
+}
+
 static uint64_t
 gve_handle_packet_completion(struct gve_priv *priv,
     struct gve_tx_ring *tx, uint16_t compl_tag)
@@ -635,7 +897,12 @@
 	}
 
 	pkt_len = pending_pkt->mbuf->m_pkthdr.len;
-	gve_unmap_packet(tx, pending_pkt);
+
+	if (gve_is_qpl(priv))
+		gve_reap_qpl_bufs_dqo(tx, pending_pkt);
+	else
+		gve_unmap_packet(tx, pending_pkt);
+
 	m_freem(pending_pkt->mbuf);
 	pending_pkt->mbuf = NULL;
 	gve_free_pending_packet(tx, pending_pkt);
@@ -711,6 +978,21 @@
 	tx->dqo.free_pending_pkts_csm = 0;
 	atomic_store_rel_32(&tx->dqo.free_pending_pkts_prd, -1);
 
+	if (gve_is_qpl(priv)) {
+		int qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO *
+		    tx->com.qpl->num_pages;
+
+		for (j = 0; j < qpl_buf_cnt - 1; j++)
+			tx->dqo.qpl_bufs[j] = j + 1;
+		tx->dqo.qpl_bufs[j] = -1;
+
+		tx->dqo.free_qpl_bufs_csm = 0;
+		atomic_store_32(&tx->dqo.free_qpl_bufs_prd, -1);
+		atomic_store_32(&tx->dqo.qpl_bufs_produced, qpl_buf_cnt);
+		tx->dqo.qpl_bufs_produced_cached = qpl_buf_cnt;
+		tx->dqo.qpl_bufs_consumed = 0;
+	}
+
 	gve_tx_clear_desc_ring_dqo(tx);
 	gve_tx_clear_compl_ring_dqo(tx);
 }