diff --git a/share/man/man4/gve.4 b/share/man/man4/gve.4
--- a/share/man/man4/gve.4
+++ b/share/man/man4/gve.4
@@ -209,6 +209,8 @@
 DQO_RDA: DQO is the descriptor format required to take full advantage of next generation VM shapes. "RDA" stands for "Raw DMA Addressing" and refers to the fact that hardware can work with DMA-ed packets and does not expect them to be copied into or out of a fixed bounce buffer.
+.It
+DQO_QPL: The next generation descriptor format, DQO, operating in the "QPL" mode, in which the driver copies packets into and out of pre-registered bounce pages rather than using raw DMA addressing.
 .El
 .Sh SUPPORT
 Please email gvnic-drivers@google.com with the specifics of the issue encountered.
diff --git a/sys/dev/gve/gve.h b/sys/dev/gve/gve.h
--- a/sys/dev/gve/gve.h
+++ b/sys/dev/gve/gve.h
@@ -105,6 +105,7 @@
 	GVE_GQI_RDA_FORMAT = 0x1,
 	GVE_GQI_QPL_FORMAT = 0x2,
 	GVE_DQO_RDA_FORMAT = 0x3,
+	GVE_DQO_QPL_FORMAT = 0x4,
 };
 
 enum gve_state_flags_bit {
@@ -226,6 +227,7 @@
 	counter_u64_t rx_frag_flip_cnt;
 	counter_u64_t rx_frag_copy_cnt;
 	counter_u64_t rx_dropped_pkt_desc_err;
+	counter_u64_t rx_dropped_pkt_buf_post_fail;
 	counter_u64_t rx_dropped_pkt_mbuf_alloc_fail;
 	counter_u64_t rx_mbuf_dmamap_err;
 	counter_u64_t rx_mbuf_mclget_null;
@@ -233,11 +235,34 @@
 
 #define NUM_RX_STATS (sizeof(struct gve_rxq_stats) / sizeof(counter_u64_t))
 
+union gve_rx_qpl_buf_id_dqo {
+	struct {
+		uint16_t buf_id:11; /* Index into rx->dqo.bufs */
+		uint8_t frag_num:5; /* Which frag in the QPL page */
+	};
+	uint16_t all;
+} __packed;
+_Static_assert(sizeof(union gve_rx_qpl_buf_id_dqo) == 2,
+    "gve: bad dqo qpl rx buf id length");
+
 struct gve_rx_buf_dqo {
-	struct mbuf *mbuf;
-	bus_dmamap_t dmamap;
-	uint64_t addr;
-	bool mapped;
+	union {
+		/* RDA */
+		struct {
+			struct mbuf *mbuf;
+			bus_dmamap_t dmamap;
+			uint64_t addr;
+			bool mapped;
+		};
+		/* QPL */
+		struct {
+			uint8_t num_nic_frags; /* number of pending completions */
+			uint8_t next_idx; /* index of the next frag to post */
+			/* for chaining rx->dqo.used_bufs */
+			STAILQ_ENTRY(gve_rx_buf_dqo) stailq_entry;
+		};
+	};
+	/* for chaining rx->dqo.free_bufs */
 	SLIST_ENTRY(gve_rx_buf_dqo) slist_entry;
 };
 
@@ -276,6 +301,13 @@
 		uint32_t tail; /* The index at which to receive the next compl at */
 		uint8_t cur_gen_bit; /* Gets flipped on every cycle of the compl ring */
 		SLIST_HEAD(, gve_rx_buf_dqo) free_bufs;
+
+		/*
+		 * Only used in QPL mode. Pages referred to by if_input-ed mbufs
+		 * stay parked here till their wire count comes back to 1.
+		 * Pages are moved here after there aren't any pending completions.
+		 */
+		STAILQ_HEAD(, gve_rx_buf_dqo) used_bufs;
 	} dqo;
 };
 
@@ -313,6 +345,7 @@
 	counter_u64_t tx_dropped_pkt_nospace_bufring;
 	counter_u64_t tx_delayed_pkt_nospace_descring;
 	counter_u64_t tx_delayed_pkt_nospace_compring;
+	counter_u64_t tx_delayed_pkt_nospace_qpl_bufs;
 	counter_u64_t tx_delayed_pkt_tsoerr;
 	counter_u64_t tx_dropped_pkt_vlan;
 	counter_u64_t tx_mbuf_collapse;
@@ -326,7 +359,19 @@
 
 struct gve_tx_pending_pkt_dqo {
 	struct mbuf *mbuf;
-	bus_dmamap_t dmamap;
+	union {
+		/* RDA */
+		bus_dmamap_t dmamap;
+		/* QPL */
+		struct {
+			/*
+			 * A linked list of entries from qpl_bufs that served
+			 * as the bounce buffer for this packet.
+			 */
+			int32_t qpl_buf_head;
+			uint32_t num_qpl_bufs;
+		};
+	};
 	uint8_t state; /* the gve_packet_state enum */
 	int next; /* To chain the free_pending_pkts lists */
 };
 
@@ -377,7 +422,20 @@
 		 */
 		int32_t free_pending_pkts_csm;
 
-		bus_dma_tag_t buf_dmatag; /* DMA params for mapping Tx mbufs */
+		/*
+		 * The head index of a singly linked list representing QPL page fragments
+		 * to copy mbuf payload into for the NIC to see.
+		 * Once this list is depleted, the "_prd" suffixed producer list,
+		 * grown by the completion taskqueue, is stolen.
+		 *
+		 * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist.
+		 */
+		int32_t free_qpl_bufs_csm;
+		uint32_t qpl_bufs_consumed; /* Allows quickly checking for buf availability */
+		uint32_t qpl_bufs_produced_cached; /* Cached value of qpl_bufs_produced */
+
+		/* DMA params for mapping Tx mbufs. Only used in RDA mode. */
+		bus_dma_tag_t buf_dmatag;
 	} __aligned(CACHE_LINE_SIZE);
 
 	/* Accessed when processing completions */
@@ -395,6 +453,18 @@
 		 * its consumer list, with the "_csm" suffix, is depleted.
 		 */
 		volatile int32_t free_pending_pkts_prd;
+
+		/*
+		 * The completion taskqueue moves the QPL pages corresponding to a
+		 * completed packet into this list. It is only used in QPL mode.
+		 * The "_prd" denotes that this is a producer list. The transmit
+		 * taskqueue steals this list once its consumer list, with the "_csm"
+		 * suffix, is depleted.
+		 *
+		 * Only used in QPL mode. int32_t because atomic_swap_16 doesn't exist.
+		 */
+		volatile int32_t free_qpl_bufs_prd;
+		volatile uint32_t qpl_bufs_produced;
 	} __aligned(CACHE_LINE_SIZE);
 
 	/* Accessed by both the completion and xmit loops */
@@ -402,6 +472,16 @@
 		/* completion tags index into this array */
 		struct gve_tx_pending_pkt_dqo *pending_pkts;
 		uint16_t num_pending_pkts;
+
+		/*
+		 * Represents QPL page fragments. An index into this array
+		 * always represents the same QPL page fragment. The value
+		 * is also an index into this array and serves as a means
+		 * to chain buffers into linked lists whose heads are
+		 * either free_qpl_bufs_prd or free_qpl_bufs_csm or a
+		 * pending packet's qpl_buf_head.
+		 */
+		int32_t *qpl_bufs;
 	} __aligned(CACHE_LINE_SIZE);
 	} dqo;
 };
 
@@ -531,6 +611,13 @@
 	return (priv->queue_format == GVE_GQI_QPL_FORMAT);
 }
 
+static inline bool
+gve_is_qpl(struct gve_priv *priv)
+{
+	return (priv->queue_format == GVE_GQI_QPL_FORMAT ||
+	    priv->queue_format == GVE_DQO_QPL_FORMAT);
+}
+
 /* Defined in gve_main.c */
 void gve_schedule_reset(struct gve_priv *priv);
@@ -545,6 +632,7 @@
 void gve_free_qpls(struct gve_priv *priv);
 int gve_register_qpls(struct gve_priv *priv);
 int gve_unregister_qpls(struct gve_priv *priv);
+void gve_mextadd_free(struct mbuf *mbuf);
 
 /* TX functions defined in gve_tx.c */
 int gve_alloc_tx_rings(struct gve_priv *priv);
@@ -563,6 +651,7 @@
 void gve_clear_tx_ring_dqo(struct gve_priv *priv, int i);
 int gve_tx_intr_dqo(void *arg);
 int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr);
+int gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf);
 void gve_tx_cleanup_tq_dqo(void *arg, int pending);
 
 /* RX functions defined in gve_rx.c */
diff --git a/sys/dev/gve/gve_adminq.h b/sys/dev/gve/gve_adminq.h
--- a/sys/dev/gve/gve_adminq.h
+++ b/sys/dev/gve/gve_adminq.h
@@ -144,6 +144,15 @@
 _Static_assert(sizeof(struct gve_device_option_dqo_rda) == 8,
     "gve: bad admin queue struct length");
 
+struct gve_device_option_dqo_qpl {
+	__be32 supported_features_mask;
+	__be16 tx_comp_ring_entries;
+	__be16 rx_buff_ring_entries;
+};
+
+_Static_assert(sizeof(struct gve_device_option_dqo_qpl) == 8,
+    "gve: bad admin queue struct length");
+
 struct gve_device_option_modify_ring {
 	__be32 supported_features_mask;
 	__be16 max_rx_ring_size;
@@ -168,6 +177,7 @@
 	GVE_DEV_OPT_ID_GQI_QPL = 0x3,
 	GVE_DEV_OPT_ID_DQO_RDA = 0x4,
 	GVE_DEV_OPT_ID_MODIFY_RING = 0x6,
+	GVE_DEV_OPT_ID_DQO_QPL = 0x7,
 	GVE_DEV_OPT_ID_JUMBO_FRAMES = 0x8,
 };
 
@@ -182,6 +192,7 @@
 	GVE_DEV_OPT_REQ_FEAT_MASK_GQI_RDA = 0x0,
 	GVE_DEV_OPT_REQ_FEAT_MASK_GQI_QPL = 0x0,
GVE_DEV_OPT_REQ_FEAT_MASK_DQO_RDA = 0x0, + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_MODIFY_RING = 0x0, GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES = 0x0, }; @@ -196,7 +207,7 @@ enum gve_driver_capability { gve_driver_capability_gqi_qpl = 0, gve_driver_capability_gqi_rda = 1, - gve_driver_capability_dqo_qpl = 2, /* reserved for future use */ + gve_driver_capability_dqo_qpl = 2, gve_driver_capability_dqo_rda = 3, }; @@ -212,6 +223,7 @@ */ #define GVE_DRIVER_CAPABILITY_FLAGS1 \ (GVE_CAP1(gve_driver_capability_gqi_qpl) | \ + GVE_CAP1(gve_driver_capability_dqo_qpl) | \ GVE_CAP1(gve_driver_capability_dqo_rda)) #define GVE_DRIVER_CAPABILITY_FLAGS2 0x0 #define GVE_DRIVER_CAPABILITY_FLAGS3 0x0 diff --git a/sys/dev/gve/gve_adminq.c b/sys/dev/gve/gve_adminq.c --- a/sys/dev/gve/gve_adminq.c +++ b/sys/dev/gve/gve_adminq.c @@ -58,6 +58,7 @@ struct gve_device_option *option, struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, struct gve_device_option_dqo_rda **dev_op_dqo_rda, + struct gve_device_option_dqo_qpl **dev_op_dqo_qpl, struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) { uint32_t req_feat_mask = be32toh(option->required_features_mask); @@ -103,6 +104,23 @@ *dev_op_dqo_rda = (void *)(option + 1); break; + case GVE_DEV_OPT_ID_DQO_QPL: + if (option_length < sizeof(**dev_op_dqo_qpl) || + req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL) { + device_printf(priv->dev, GVE_DEVICE_OPTION_ERROR_FMT, + "DQO QPL", (int)sizeof(**dev_op_dqo_qpl), + GVE_DEV_OPT_REQ_FEAT_MASK_DQO_QPL, + option_length, req_feat_mask); + break; + } + + if (option_length > sizeof(**dev_op_dqo_qpl)) { + device_printf(priv->dev, GVE_DEVICE_OPTION_TOO_BIG_FMT, + "DQO QPL"); + } + *dev_op_dqo_qpl = (void *)(option + 1); + break; + case GVE_DEV_OPT_ID_JUMBO_FRAMES: if (option_length < sizeof(**dev_op_jumbo_frames) || req_feat_mask != GVE_DEV_OPT_REQ_FEAT_MASK_JUMBO_FRAMES) { @@ -136,6 +154,7 @@ struct gve_device_descriptor *descriptor, struct gve_device_option_gqi_qpl **dev_op_gqi_qpl, struct gve_device_option_dqo_rda **dev_op_dqo_rda, + struct gve_device_option_dqo_qpl **dev_op_dqo_qpl, struct gve_device_option_jumbo_frames **dev_op_jumbo_frames) { char *desc_end = (char *)descriptor + be16toh(descriptor->total_length); @@ -154,7 +173,10 @@ } gve_parse_device_option(priv, descriptor, dev_opt, - dev_op_gqi_qpl, dev_op_dqo_rda, dev_op_jumbo_frames); + dev_op_gqi_qpl, + dev_op_dqo_rda, + dev_op_dqo_qpl, + dev_op_jumbo_frames); dev_opt = (void *)((char *)(dev_opt + 1) + be16toh(dev_opt->option_length)); } @@ -387,6 +409,7 @@ struct gve_dma_handle desc_mem; struct gve_device_option_gqi_qpl *dev_op_gqi_qpl = NULL; struct gve_device_option_dqo_rda *dev_op_dqo_rda = NULL; + struct gve_device_option_dqo_qpl *dev_op_dqo_qpl = NULL; struct gve_device_option_jumbo_frames *dev_op_jumbo_frames = NULL; uint32_t supported_features_mask = 0; int rc; @@ -416,7 +439,9 @@ bus_dmamap_sync(desc_mem.tag, desc_mem.map, BUS_DMASYNC_POSTREAD); rc = gve_process_device_options(priv, desc, - &dev_op_gqi_qpl, &dev_op_dqo_rda, + &dev_op_gqi_qpl, + &dev_op_dqo_rda, + &dev_op_dqo_qpl, &dev_op_jumbo_frames); if (rc != 0) goto free_device_descriptor; @@ -428,6 +453,13 @@ if (bootverbose) device_printf(priv->dev, "Driver is running with DQO RDA queue format.\n"); + } else if (dev_op_dqo_qpl != NULL) { + priv->queue_format = GVE_DQO_QPL_FORMAT; + supported_features_mask = be32toh( + dev_op_dqo_qpl->supported_features_mask); + if (bootverbose) + device_printf(priv->dev, + "Driver is running with DQO QPL queue format.\n"); } else if 
(dev_op_gqi_qpl != NULL) {
 		priv->queue_format = GVE_GQI_QPL_FORMAT;
 		supported_features_mask = be32toh(
diff --git a/sys/dev/gve/gve_dqo.h b/sys/dev/gve/gve_dqo.h
--- a/sys/dev/gve/gve_dqo.h
+++ b/sys/dev/gve/gve_dqo.h
@@ -57,7 +57,22 @@
  * Start dropping RX fragments if at least these many
  * buffers cannot be posted to the NIC.
  */
-#define GVE_RX_DQO_MIN_PENDING_BUFS 32
+#define GVE_RX_DQO_MIN_PENDING_BUFS 128
+
+#define GVE_DQ_NUM_FRAGS_IN_PAGE (PAGE_SIZE / GVE_DEFAULT_RX_BUFFER_SIZE)
+
+/*
+ * gve_rx_qpl_buf_id_dqo's 11-bit wide buf_id field limits the total
+ * number of pages per QPL to 2048.
+ */
+#define GVE_RX_NUM_QPL_PAGES_DQO 2048
+
+/* 2K TX buffers for DQO-QPL */
+#define GVE_TX_BUF_SHIFT_DQO 11
+#define GVE_TX_BUF_SIZE_DQO BIT(GVE_TX_BUF_SHIFT_DQO)
+#define GVE_TX_BUFS_PER_PAGE_DQO (PAGE_SIZE >> GVE_TX_BUF_SHIFT_DQO)
+
+#define GVE_TX_NUM_QPL_PAGES_DQO 512
 
 /* Basic TX descriptor (DTYPE 0x0C) */
 struct gve_tx_pkt_desc_dqo {
diff --git a/sys/dev/gve/gve_main.c b/sys/dev/gve/gve_main.c
--- a/sys/dev/gve/gve_main.c
+++ b/sys/dev/gve/gve_main.c
@@ -32,9 +32,9 @@
 #include "gve_adminq.h"
 #include "gve_dqo.h"
 
-#define GVE_DRIVER_VERSION "GVE-FBSD-1.2.0\n"
+#define GVE_DRIVER_VERSION "GVE-FBSD-1.3.0\n"
 #define GVE_VERSION_MAJOR 1
-#define GVE_VERSION_MINOR 2
+#define GVE_VERSION_MINOR 3
 #define GVE_VERSION_SUB 0
 
 #define GVE_DEFAULT_RX_COPYBREAK 256
@@ -125,7 +125,7 @@
 	if (if_getcapenable(ifp) & IFCAP_TSO6)
 		if_sethwassistbits(ifp, CSUM_IP6_TSO, 0);
 
-	if (gve_is_gqi(priv)) {
+	if (gve_is_qpl(priv)) {
 		err = gve_register_qpls(priv);
 		if (err != 0)
 			goto reset;
@@ -181,7 +181,7 @@
 	if (err != 0)
 		goto reset;
 
-	if (gve_is_gqi(priv)) {
+	if (gve_is_qpl(priv)) {
 		if (gve_unregister_qpls(priv) != 0)
 			goto reset;
 	}
@@ -379,13 +379,15 @@
 	/*
 	 * Set TSO limits, must match the arguments to bus_dma_tag_create
-	 * when creating tx->dqo.buf_dmatag
+	 * when creating tx->dqo.buf_dmatag. Only applies to the RDA mode
+	 * because in QPL we copy the entire packet into the bounce buffer
+	 * and thus it does not matter how fragmented the mbuf is.
*/ - if (!gve_is_gqi(priv)) { - if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO); + if (!gve_is_gqi(priv) && !gve_is_qpl(priv)) { if_sethwtsomaxsegcount(ifp, GVE_TX_MAX_DATA_DESCS_DQO); if_sethwtsomaxsegsize(ifp, GVE_TX_MAX_BUF_SIZE_DQO); } + if_sethwtsomax(ifp, GVE_TSO_MAXSIZE_DQO); #if __FreeBSD_version >= 1400086 if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST); @@ -469,7 +471,7 @@ gve_free_irqs(priv); gve_free_tx_rings(priv); gve_free_rx_rings(priv); - if (gve_is_gqi(priv)) + if (gve_is_qpl(priv)) gve_free_qpls(priv); } @@ -478,7 +480,7 @@ { int err; - if (gve_is_gqi(priv)) { + if (gve_is_qpl(priv)) { err = gve_alloc_qpls(priv); if (err != 0) goto abort; diff --git a/sys/dev/gve/gve_qpl.c b/sys/dev/gve/gve_qpl.c --- a/sys/dev/gve/gve_qpl.c +++ b/sys/dev/gve/gve_qpl.c @@ -32,13 +32,14 @@ #include "gve.h" #include "gve_adminq.h" +#include "gve_dqo.h" static MALLOC_DEFINE(M_GVE_QPL, "gve qpl", "gve qpl allocations"); static uint32_t gve_num_tx_qpls(struct gve_priv *priv) { - if (priv->queue_format != GVE_GQI_QPL_FORMAT) + if (!gve_is_qpl(priv)) return (0); return (priv->tx_cfg.max_queues); @@ -47,7 +48,7 @@ static uint32_t gve_num_rx_qpls(struct gve_priv *priv) { - if (priv->queue_format != GVE_GQI_QPL_FORMAT) + if (!gve_is_qpl(priv)) return (0); return (priv->rx_cfg.max_queues); @@ -189,6 +190,7 @@ int gve_alloc_qpls(struct gve_priv *priv) { int num_qpls = gve_num_tx_qpls(priv) + gve_num_rx_qpls(priv); + int num_pages; int err; int i; @@ -198,15 +200,19 @@ priv->qpls = malloc(num_qpls * sizeof(*priv->qpls), M_GVE_QPL, M_WAITOK | M_ZERO); + num_pages = gve_is_gqi(priv) ? + priv->tx_desc_cnt / GVE_QPL_DIVISOR : + GVE_TX_NUM_QPL_PAGES_DQO; for (i = 0; i < gve_num_tx_qpls(priv); i++) { - err = gve_alloc_qpl(priv, i, priv->tx_desc_cnt / GVE_QPL_DIVISOR, + err = gve_alloc_qpl(priv, i, num_pages, /*single_kva=*/true); if (err != 0) goto abort; } + num_pages = gve_is_gqi(priv) ? priv->rx_desc_cnt : GVE_RX_NUM_QPL_PAGES_DQO; for (; i < num_qpls; i++) { - err = gve_alloc_qpl(priv, i, priv->rx_desc_cnt, /*single_kva=*/false); + err = gve_alloc_qpl(priv, i, num_pages, /*single_kva=*/false); if (err != 0) goto abort; } @@ -283,3 +289,21 @@ gve_clear_state_flag(priv, GVE_STATE_FLAG_QPLREG_OK); return (0); } + +void +gve_mextadd_free(struct mbuf *mbuf) +{ + vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1; + vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2; + + /* + * Free the page only if this is the last ref. + * The interface might no longer exist by the time + * this callback is called, see gve_free_qpl. + */ + if (__predict_false(vm_page_unwire_noq(page))) { + pmap_qremove(va, 1); + kva_free(va, PAGE_SIZE); + vm_page_free(page); + } +} diff --git a/sys/dev/gve/gve_rx.c b/sys/dev/gve/gve_rx.c --- a/sys/dev/gve/gve_rx.c +++ b/sys/dev/gve/gve_rx.c @@ -410,24 +410,6 @@ } } -static void -gve_mextadd_free(struct mbuf *mbuf) -{ - vm_page_t page = (vm_page_t)mbuf->m_ext.ext_arg1; - vm_offset_t va = (vm_offset_t)mbuf->m_ext.ext_arg2; - - /* - * Free the page only if this is the last ref. - * The interface might no longer exist by the time - * this callback is called, see gve_free_qpl. 
- */ - if (__predict_false(vm_page_unwire_noq(page))) { - pmap_qremove(va, 1); - kva_free(va, PAGE_SIZE); - vm_page_free(page); - } -} - static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr) { diff --git a/sys/dev/gve/gve_rx_dqo.c b/sys/dev/gve/gve_rx_dqo.c --- a/sys/dev/gve/gve_rx_dqo.c +++ b/sys/dev/gve/gve_rx_dqo.c @@ -38,6 +38,9 @@ struct gve_rx_buf_dqo *buf; int i; + if (gve_is_qpl(rx->com.priv)) + return; + for (i = 0; i < rx->dqo.buf_cnt; i++) { buf = &rx->dqo.bufs[i]; if (!buf->mbuf) @@ -70,7 +73,7 @@ if (rx->dqo.bufs != NULL) { gve_free_rx_mbufs_dqo(rx); - if (rx->dqo.buf_dmatag) { + if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) { for (j = 0; j < rx->dqo.buf_cnt; j++) if (rx->dqo.bufs[j].mapped) bus_dmamap_destroy(rx->dqo.buf_dmatag, @@ -81,7 +84,7 @@ rx->dqo.bufs = NULL; } - if (rx->dqo.buf_dmatag) + if (!gve_is_qpl(priv) && rx->dqo.buf_dmatag) bus_dma_tag_destroy(rx->dqo.buf_dmatag); } @@ -103,6 +106,31 @@ rx->dqo.desc_ring = rx->desc_ring_mem.cpu_addr; rx->dqo.mask = priv->rx_desc_cnt - 1; + err = gve_dma_alloc_coherent(priv, + sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt, + CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem); + if (err != 0) { + device_printf(priv->dev, + "Failed to alloc compl ring for rx ring %d", i); + goto abort; + } + rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr; + rx->dqo.mask = priv->rx_desc_cnt - 1; + + rx->dqo.buf_cnt = gve_is_qpl(priv) ? GVE_RX_NUM_QPL_PAGES_DQO : + priv->rx_desc_cnt; + rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo), + M_GVE, M_WAITOK | M_ZERO); + + if (gve_is_qpl(priv)) { + rx->com.qpl = &priv->qpls[priv->tx_cfg.max_queues + i]; + if (rx->com.qpl == NULL) { + device_printf(priv->dev, "No QPL left for rx ring %d", i); + return (ENOMEM); + } + return (0); + } + err = bus_dma_tag_create( bus_get_dma_tag(priv->dev), /* parent */ 1, 0, /* alignment, bounds */ @@ -123,9 +151,6 @@ goto abort; } - rx->dqo.buf_cnt = priv->rx_desc_cnt; - rx->dqo.bufs = malloc(rx->dqo.buf_cnt * sizeof(struct gve_rx_buf_dqo), - M_GVE, M_WAITOK | M_ZERO); for (j = 0; j < rx->dqo.buf_cnt; j++) { err = bus_dmamap_create(rx->dqo.buf_dmatag, 0, &rx->dqo.bufs[j].dmamap); @@ -138,17 +163,6 @@ rx->dqo.bufs[j].mapped = true; } - err = gve_dma_alloc_coherent(priv, - sizeof(struct gve_rx_compl_desc_dqo) * priv->rx_desc_cnt, - CACHE_LINE_SIZE, &rx->dqo.compl_ring_mem); - if (err != 0) { - device_printf(priv->dev, - "Failed to alloc compl ring for rx ring %d", i); - goto abort; - } - rx->dqo.compl_ring = rx->dqo.compl_ring_mem.cpu_addr; - rx->dqo.mask = priv->rx_desc_cnt - 1; - return (0); abort: @@ -202,10 +216,35 @@ gve_free_rx_mbufs_dqo(rx); - SLIST_INIT(&rx->dqo.free_bufs); - for (j = 0; j < rx->dqo.buf_cnt; j++) - SLIST_INSERT_HEAD(&rx->dqo.free_bufs, - &rx->dqo.bufs[j], slist_entry); + if (gve_is_qpl(priv)) { + SLIST_INIT(&rx->dqo.free_bufs); + STAILQ_INIT(&rx->dqo.used_bufs); + + for (j = 0; j < rx->dqo.buf_cnt; j++) { + struct gve_rx_buf_dqo *buf = &rx->dqo.bufs[j]; + vm_page_t page = rx->com.qpl->pages[buf - rx->dqo.bufs]; + u_int ref_count = atomic_load_int(&page->ref_count); + + /* + * An ifconfig down+up might see pages still in flight + * from the previous innings. 
+ */ + if (VPRC_WIRE_COUNT(ref_count) == 1) + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, + buf, slist_entry); + else + STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, + buf, stailq_entry); + + buf->num_nic_frags = 0; + buf->next_idx = 0; + } + } else { + SLIST_INIT(&rx->dqo.free_bufs); + for (j = 0; j < rx->dqo.buf_cnt; j++) + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, + &rx->dqo.bufs[j], slist_entry); + } } int @@ -223,6 +262,20 @@ return (FILTER_HANDLED); } +static void +gve_rx_advance_head_dqo(struct gve_rx_ring *rx) +{ + rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask; + rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */ + + if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) { + bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, + BUS_DMASYNC_PREWRITE); + gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset, + rx->dqo.head); + } +} + static void gve_rx_post_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf) { @@ -235,15 +288,7 @@ desc->buf_id = htole16(buf - rx->dqo.bufs); desc->buf_addr = htole64(buf->addr); - rx->dqo.head = (rx->dqo.head + 1) & rx->dqo.mask; - rx->fill_cnt++; /* rx->fill_cnt is just a sysctl counter */ - - if ((rx->dqo.head & (GVE_RX_BUF_THRESH_DQO - 1)) == 0) { - bus_dmamap_sync(rx->desc_ring_mem.tag, rx->desc_ring_mem.map, - BUS_DMASYNC_PREWRITE); - gve_db_bar_dqo_write_4(rx->com.priv, rx->com.db_offset, - rx->dqo.head); - } + gve_rx_advance_head_dqo(rx); } static int @@ -293,6 +338,103 @@ return (err); } +static struct gve_dma_handle * +gve_get_page_dma_handle(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf) +{ + return (&(rx->com.qpl->dmas[buf - rx->dqo.bufs])); +} + +static void +gve_rx_post_qpl_buf_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf, + uint8_t frag_num) +{ + struct gve_rx_desc_dqo *desc = &rx->dqo.desc_ring[rx->dqo.head]; + union gve_rx_qpl_buf_id_dqo composed_id; + struct gve_dma_handle *page_dma_handle; + + composed_id.buf_id = buf - rx->dqo.bufs; + composed_id.frag_num = frag_num; + desc->buf_id = htole16(composed_id.all); + + page_dma_handle = gve_get_page_dma_handle(rx, buf); + bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, + BUS_DMASYNC_PREREAD); + desc->buf_addr = htole64(page_dma_handle->bus_addr + + frag_num * GVE_DEFAULT_RX_BUFFER_SIZE); + + buf->num_nic_frags++; + gve_rx_advance_head_dqo(rx); +} + +static void +gve_rx_maybe_extract_from_used_bufs(struct gve_rx_ring *rx, bool just_one) +{ + struct gve_rx_buf_dqo *hol_blocker = NULL; + struct gve_rx_buf_dqo *buf; + u_int ref_count; + vm_page_t page; + + while (true) { + buf = STAILQ_FIRST(&rx->dqo.used_bufs); + if (__predict_false(buf == NULL)) + break; + + page = rx->com.qpl->pages[buf - rx->dqo.bufs]; + ref_count = atomic_load_int(&page->ref_count); + + if (VPRC_WIRE_COUNT(ref_count) != 1) { + /* Account for one head-of-line blocker */ + if (hol_blocker != NULL) + break; + hol_blocker = buf; + STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs, + stailq_entry); + continue; + } + + STAILQ_REMOVE_HEAD(&rx->dqo.used_bufs, + stailq_entry); + SLIST_INSERT_HEAD(&rx->dqo.free_bufs, + buf, slist_entry); + if (just_one) + break; + } + + if (hol_blocker != NULL) + STAILQ_INSERT_HEAD(&rx->dqo.used_bufs, + hol_blocker, stailq_entry); +} + +static int +gve_rx_post_new_dqo_qpl_buf(struct gve_rx_ring *rx) +{ + struct gve_rx_buf_dqo *buf; + + buf = SLIST_FIRST(&rx->dqo.free_bufs); + if (__predict_false(buf == NULL)) { + gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/true); + buf = SLIST_FIRST(&rx->dqo.free_bufs); + if (__predict_false(buf == NULL)) + return (ENOBUFS); + 
+	}
+
+	gve_rx_post_qpl_buf_dqo(rx, buf, buf->next_idx);
+	if (buf->next_idx == GVE_DQ_NUM_FRAGS_IN_PAGE - 1)
+		buf->next_idx = 0;
+	else
+		buf->next_idx++;
+
+	/*
+	 * We have posted all the frags in this buf to the NIC.
+	 * - buf will enter used_bufs once the last completion arrives.
+	 * - It will re-enter free_bufs in gve_rx_maybe_extract_from_used_bufs
+	 *   when its wire count drops back to 1.
+	 */
+	if (buf->next_idx == 0)
+		SLIST_REMOVE_HEAD(&rx->dqo.free_bufs, slist_entry);
+	return (0);
+}
+
 static void
 gve_rx_post_buffers_dqo(struct gve_rx_ring *rx, int how)
 {
@@ -305,7 +447,10 @@
 	num_to_post = rx->dqo.mask - num_pending_bufs;
 
 	for (i = 0; i < num_to_post; i++) {
-		err = gve_rx_post_new_mbuf_dqo(rx, how);
+		if (gve_is_qpl(rx->com.priv))
+			err = gve_rx_post_new_dqo_qpl_buf(rx);
+		else
+			err = gve_rx_post_new_mbuf_dqo(rx, how);
 		if (err)
 			break;
 	}
@@ -426,7 +571,7 @@
 }
 
 static int
-gve_rx_copybreak_dqo(struct gve_rx_ring *rx, struct gve_rx_buf_dqo *buf,
+gve_rx_copybreak_dqo(struct gve_rx_ring *rx, void *va,
     struct gve_rx_compl_desc_dqo *compl_desc, uint16_t frag_len)
 {
 	struct mbuf *mbuf;
@@ -439,14 +584,13 @@
 	counter_u64_add_protected(rx->stats.rx_copybreak_cnt, 1);
 	counter_exit();
 
-	m_copyback(mbuf, 0, frag_len, mtod(buf->mbuf, char*));
+	m_copyback(mbuf, 0, frag_len, va);
 	mbuf->m_len = frag_len;
 
 	rx->ctx.mbuf_head = mbuf;
 	rx->ctx.mbuf_tail = mbuf;
 	rx->ctx.total_size += frag_len;
 
-	gve_rx_post_buf_dqo(rx, buf);
 	gve_rx_input_mbuf_dqo(rx, compl_desc);
 	return (0);
 }
@@ -494,10 +638,12 @@
 	frag_len = compl_desc->packet_len;
 	if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) {
-		err = gve_rx_copybreak_dqo(rx, buf, compl_desc, frag_len);
+		err = gve_rx_copybreak_dqo(rx, mtod(buf->mbuf, char*),
+		    compl_desc, frag_len);
 		if (__predict_false(err != 0))
 			goto drop_frag;
 		(*work_done)++;
+		gve_rx_post_buf_dqo(rx, buf);
 		return;
 	}
 
@@ -578,6 +724,233 @@
 	rx->ctx = (struct gve_rx_ctx){};
 }
 
+static void *
+gve_get_cpu_addr_for_qpl_buf(struct gve_rx_ring *rx,
+    struct gve_rx_buf_dqo *buf, uint8_t buf_frag_num)
+{
+	int page_idx = buf - rx->dqo.bufs;
+	void *va = rx->com.qpl->dmas[page_idx].cpu_addr;
+
+	va = (char *)va + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE);
+	return (va);
+}
+
+static int
+gve_rx_add_clmbuf_to_ctx(struct gve_rx_ring *rx,
+    struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf,
+    uint8_t buf_frag_num, uint16_t frag_len)
+{
+	void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num);
+	struct mbuf *mbuf;
+
+	if (ctx->mbuf_tail == NULL) {
+		mbuf = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
+		if (mbuf == NULL)
+			return (ENOMEM);
+		ctx->mbuf_head = mbuf;
+		ctx->mbuf_tail = mbuf;
+	} else {
+		mbuf = m_getcl(M_NOWAIT, MT_DATA, 0);
+		if (mbuf == NULL)
+			return (ENOMEM);
+		ctx->mbuf_tail->m_next = mbuf;
+		ctx->mbuf_tail = mbuf;
+	}
+
+	mbuf->m_len = frag_len;
+	ctx->total_size += frag_len;
+
+	m_copyback(mbuf, 0, frag_len, va);
+	counter_enter();
+	counter_u64_add_protected(rx->stats.rx_frag_copy_cnt, 1);
+	counter_exit();
+	return (0);
+}
+
+static int
+gve_rx_add_extmbuf_to_ctx(struct gve_rx_ring *rx,
+    struct gve_rx_ctx *ctx, struct gve_rx_buf_dqo *buf,
+    uint8_t buf_frag_num, uint16_t frag_len)
+{
+	struct mbuf *mbuf;
+	void *page_addr;
+	vm_page_t page;
+	int page_idx;
+	void *va;
+
+	if (ctx->mbuf_tail == NULL) {
+		mbuf = m_gethdr(M_NOWAIT, MT_DATA);
+		if (mbuf == NULL)
+			return (ENOMEM);
+		ctx->mbuf_head = mbuf;
+		ctx->mbuf_tail = mbuf;
+	} else {
+		mbuf = m_get(M_NOWAIT, MT_DATA);
+		if (mbuf == NULL)
+			return (ENOMEM);
+		ctx->mbuf_tail->m_next = mbuf;
+		ctx->mbuf_tail =
mbuf; + } + + mbuf->m_len = frag_len; + ctx->total_size += frag_len; + + page_idx = buf - rx->dqo.bufs; + page = rx->com.qpl->pages[page_idx]; + page_addr = rx->com.qpl->dmas[page_idx].cpu_addr; + va = (char *)page_addr + (buf_frag_num * GVE_DEFAULT_RX_BUFFER_SIZE); + + /* + * Grab an extra ref to the page so that gve_mextadd_free + * does not end up freeing the page while the interface exists. + */ + vm_page_wire(page); + + counter_enter(); + counter_u64_add_protected(rx->stats.rx_frag_flip_cnt, 1); + counter_exit(); + + MEXTADD(mbuf, va, frag_len, + gve_mextadd_free, page, page_addr, + 0, EXT_NET_DRV); + return (0); +} + +static void +gve_rx_dqo_qpl(struct gve_priv *priv, struct gve_rx_ring *rx, + struct gve_rx_compl_desc_dqo *compl_desc, + int *work_done) +{ + bool is_last_frag = compl_desc->end_of_packet != 0; + union gve_rx_qpl_buf_id_dqo composed_id; + struct gve_dma_handle *page_dma_handle; + struct gve_rx_ctx *ctx = &rx->ctx; + struct gve_rx_buf_dqo *buf; + uint32_t num_pending_bufs; + uint8_t buf_frag_num; + uint16_t frag_len; + uint16_t buf_id; + int err; + + composed_id.all = le16toh(compl_desc->buf_id); + buf_id = composed_id.buf_id; + buf_frag_num = composed_id.frag_num; + + if (__predict_false(buf_id >= rx->dqo.buf_cnt)) { + device_printf(priv->dev, "Invalid rx buf id %d on rxq %d, issuing reset\n", + buf_id, rx->com.id); + gve_schedule_reset(priv); + goto drop_frag_clear_ctx; + } + buf = &rx->dqo.bufs[buf_id]; + if (__predict_false(buf->num_nic_frags == 0 || + buf_frag_num > GVE_DQ_NUM_FRAGS_IN_PAGE - 1)) { + device_printf(priv->dev, "Spurious compl for buf id %d on rxq %d " + "with buf_frag_num %d and num_nic_frags %d, issuing reset\n", + buf_id, rx->com.id, buf_frag_num, buf->num_nic_frags); + gve_schedule_reset(priv); + goto drop_frag_clear_ctx; + } + + buf->num_nic_frags--; + + if (__predict_false(ctx->drop_pkt)) + goto drop_frag; + + if (__predict_false(compl_desc->rx_error)) { + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt_desc_err, 1); + counter_exit(); + goto drop_frag; + } + + page_dma_handle = gve_get_page_dma_handle(rx, buf); + bus_dmamap_sync(page_dma_handle->tag, page_dma_handle->map, + BUS_DMASYNC_POSTREAD); + + frag_len = compl_desc->packet_len; + if (frag_len <= priv->rx_copybreak && !ctx->mbuf_head && is_last_frag) { + void *va = gve_get_cpu_addr_for_qpl_buf(rx, buf, buf_frag_num); + + err = gve_rx_copybreak_dqo(rx, va, compl_desc, frag_len); + if (__predict_false(err != 0)) + goto drop_frag; + (*work_done)++; + gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); + return; + } + + num_pending_bufs = (rx->dqo.head - rx->dqo.tail) & rx->dqo.mask; + err = gve_rx_post_new_dqo_qpl_buf(rx); + if (__predict_false(err != 0 && + num_pending_bufs <= GVE_RX_DQO_MIN_PENDING_BUFS)) { + /* + * Resort to copying this fragment into a cluster mbuf + * when the above threshold is breached and repost the + * incoming buffer. If we cannot find cluster mbufs, + * just drop the packet (to repost its buffer). 
+ */ + err = gve_rx_add_clmbuf_to_ctx(rx, ctx, buf, + buf_frag_num, frag_len); + if (err != 0) { + counter_enter(); + counter_u64_add_protected( + rx->stats.rx_dropped_pkt_buf_post_fail, 1); + counter_exit(); + goto drop_frag; + } + gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); + } else { + err = gve_rx_add_extmbuf_to_ctx(rx, ctx, buf, + buf_frag_num, frag_len); + if (__predict_false(err != 0)) { + counter_enter(); + counter_u64_add_protected( + rx->stats.rx_dropped_pkt_mbuf_alloc_fail, 1); + counter_exit(); + goto drop_frag; + } + } + + /* + * Both the counts need to be checked. + * + * num_nic_frags == 0 implies no pending completions + * but not all frags may have yet been posted. + * + * next_idx == 0 implies all frags have been posted + * but there might be pending completions. + */ + if (buf->num_nic_frags == 0 && buf->next_idx == 0) + STAILQ_INSERT_TAIL(&rx->dqo.used_bufs, buf, stailq_entry); + + if (is_last_frag) { + gve_rx_input_mbuf_dqo(rx, compl_desc); + (*work_done)++; + } + return; + +drop_frag: + /* Clear the earlier frags if there were any */ + m_freem(ctx->mbuf_head); + rx->ctx = (struct gve_rx_ctx){}; + /* Drop the rest of the pkt if there are more frags */ + ctx->drop_pkt = true; + /* Reuse the dropped frag's buffer */ + gve_rx_post_qpl_buf_dqo(rx, buf, buf_frag_num); + + if (is_last_frag) + goto drop_frag_clear_ctx; + return; + +drop_frag_clear_ctx: + counter_enter(); + counter_u64_add_protected(rx->stats.rx_dropped_pkt, 1); + counter_exit(); + m_freem(ctx->mbuf_head); + rx->ctx = (struct gve_rx_ctx){}; +} + static bool gve_rx_cleanup_dqo(struct gve_priv *priv, struct gve_rx_ring *rx, int budget) { @@ -603,13 +976,18 @@ rx->dqo.tail = (rx->dqo.tail + 1) & rx->dqo.mask; rx->dqo.cur_gen_bit ^= (rx->dqo.tail == 0); - gve_rx_dqo(priv, rx, compl_desc, &work_done); + if (gve_is_qpl(priv)) + gve_rx_dqo_qpl(priv, rx, compl_desc, &work_done); + else + gve_rx_dqo(priv, rx, compl_desc, &work_done); } if (work_done != 0) tcp_lro_flush_all(&rx->lro); gve_rx_post_buffers_dqo(rx, M_NOWAIT); + if (gve_is_qpl(priv)) + gve_rx_maybe_extract_from_used_bufs(rx, /*just_one=*/false); return (work_done == budget); } diff --git a/sys/dev/gve/gve_sysctl.c b/sys/dev/gve/gve_sysctl.c --- a/sys/dev/gve/gve_sysctl.c +++ b/sys/dev/gve/gve_sysctl.c @@ -75,6 +75,10 @@ "rx_dropped_pkt_desc_err", CTLFLAG_RD, &stats->rx_dropped_pkt_desc_err, "Packets dropped due to descriptor error"); + SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, + "rx_dropped_pkt_buf_post_fail", CTLFLAG_RD, + &stats->rx_dropped_pkt_buf_post_fail, + "Packets dropped due to failure to post enough buffers"); SYSCTL_ADD_COUNTER_U64(ctx, list, OID_AUTO, "rx_dropped_pkt_mbuf_alloc_fail", CTLFLAG_RD, &stats->rx_dropped_pkt_mbuf_alloc_fail, @@ -147,6 +151,10 @@ "tx_delayed_pkt_nospace_compring", CTLFLAG_RD, &stats->tx_delayed_pkt_nospace_compring, "Packets delayed due to no space in comp ring"); + SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, + "tx_delayed_pkt_nospace_qpl_bufs", CTLFLAG_RD, + &stats->tx_delayed_pkt_nospace_qpl_bufs, + "Packets delayed due to not enough qpl bufs"); SYSCTL_ADD_COUNTER_U64(ctx, tx_list, OID_AUTO, "tx_delayed_pkt_tsoerr", CTLFLAG_RD, &stats->tx_delayed_pkt_tsoerr, diff --git a/sys/dev/gve/gve_tx.c b/sys/dev/gve/gve_tx.c --- a/sys/dev/gve/gve_tx.c +++ b/sys/dev/gve/gve_tx.c @@ -753,7 +753,10 @@ * The reference is passed in so that in the case of * errors, the new mbuf chain is what's put back on the br. 
*/ - err = gve_xmit_dqo(tx, &mbuf); + if (gve_is_qpl(priv)) + err = gve_xmit_dqo_qpl(tx, mbuf); + else + err = gve_xmit_dqo(tx, &mbuf); } if (__predict_false(err != 0 && mbuf != NULL)) { diff --git a/sys/dev/gve/gve_tx_dqo.c b/sys/dev/gve/gve_tx_dqo.c --- a/sys/dev/gve/gve_tx_dqo.c +++ b/sys/dev/gve/gve_tx_dqo.c @@ -51,7 +51,12 @@ if (!pending_pkt->mbuf) continue; - gve_unmap_packet(tx, pending_pkt); + if (gve_is_qpl(tx->com.priv)) { + pending_pkt->qpl_buf_head = -1; + pending_pkt->num_qpl_bufs = 0; + } else + gve_unmap_packet(tx, pending_pkt); + m_freem(pending_pkt->mbuf); pending_pkt->mbuf = NULL; } @@ -76,7 +81,7 @@ if (tx->dqo.pending_pkts != NULL) { gve_free_tx_mbufs_dqo(tx); - if (tx->dqo.buf_dmatag) { + if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) { for (j = 0; j < tx->dqo.num_pending_pkts; j++) if (tx->dqo.pending_pkts[j].state != GVE_PACKET_STATE_UNALLOCATED) @@ -88,8 +93,59 @@ tx->dqo.pending_pkts = NULL; } - if (tx->dqo.buf_dmatag) + if (!gve_is_qpl(priv) && tx->dqo.buf_dmatag) bus_dma_tag_destroy(tx->dqo.buf_dmatag); + + if (gve_is_qpl(priv) && tx->dqo.qpl_bufs != NULL) { + free(tx->dqo.qpl_bufs, M_GVE); + tx->dqo.qpl_bufs = NULL; + } +} + +static int +gve_tx_alloc_rda_fields_dqo(struct gve_tx_ring *tx) +{ + struct gve_priv *priv = tx->com.priv; + int err; + int j; + + /* + * DMA tag for mapping Tx mbufs + * The maxsize, nsegments, and maxsegsize params should match + * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c. + */ + err = bus_dma_tag_create( + bus_get_dma_tag(priv->dev), /* parent */ + 1, 0, /* alignment, bounds */ + BUS_SPACE_MAXADDR, /* lowaddr */ + BUS_SPACE_MAXADDR, /* highaddr */ + NULL, NULL, /* filter, filterarg */ + GVE_TSO_MAXSIZE_DQO, /* maxsize */ + GVE_TX_MAX_DATA_DESCS_DQO, /* nsegments */ + GVE_TX_MAX_BUF_SIZE_DQO, /* maxsegsize */ + BUS_DMA_ALLOCNOW, /* flags */ + NULL, /* lockfunc */ + NULL, /* lockarg */ + &tx->dqo.buf_dmatag); + if (err != 0) { + device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n", + __func__, err); + return (err); + } + + for (j = 0; j < tx->dqo.num_pending_pkts; j++) { + err = bus_dmamap_create(tx->dqo.buf_dmatag, 0, + &tx->dqo.pending_pkts[j].dmamap); + if (err != 0) { + device_printf(priv->dev, + "err in creating pending pkt dmamap %d: %d", + j, err); + return (err); + } + tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; + } + + return (0); } int @@ -98,7 +154,6 @@ struct gve_tx_ring *tx = &priv->tx[i]; uint16_t num_pending_pkts; int err; - int j; /* Descriptor ring */ err = gve_dma_alloc_coherent(priv, @@ -122,30 +177,6 @@ } tx->dqo.compl_ring = tx->dqo.compl_ring_mem.cpu_addr; - /* - * DMA tag for mapping Tx mbufs - * The maxsize, nsegments, and maxsegsize params should match - * the if_sethwtso* arguments in gve_setup_ifnet in gve_main.c. 
- */ - err = bus_dma_tag_create( - bus_get_dma_tag(priv->dev), /* parent */ - 1, 0, /* alignment, bounds */ - BUS_SPACE_MAXADDR, /* lowaddr */ - BUS_SPACE_MAXADDR, /* highaddr */ - NULL, NULL, /* filter, filterarg */ - GVE_TSO_MAXSIZE_DQO, /* maxsize */ - GVE_TX_MAX_DATA_DESCS_DQO, /* nsegments */ - GVE_TX_MAX_BUF_SIZE_DQO, /* maxsegsize */ - BUS_DMA_ALLOCNOW, /* flags */ - NULL, /* lockfunc */ - NULL, /* lockarg */ - &tx->dqo.buf_dmatag); - if (err != 0) { - device_printf(priv->dev, "%s: bus_dma_tag_create failed: %d\n", - __func__, err); - goto abort; - } - /* * pending_pkts array * @@ -169,18 +200,18 @@ if (!tx->dqo.pending_pkts) goto abort; - for (j = 0; j < tx->dqo.num_pending_pkts; j++) { - err = bus_dmamap_create(tx->dqo.buf_dmatag, 0, - &tx->dqo.pending_pkts[j].dmamap); - if (err != 0) { - device_printf(priv->dev, - "err in creating pending pkt dmamap %d: %d", - j, err); - goto abort; - } - tx->dqo.pending_pkts[j].state = GVE_PACKET_STATE_FREE; - } + if (gve_is_qpl(priv)) { + int qpl_buf_cnt; + + tx->com.qpl = &priv->qpls[i]; + qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * + tx->com.qpl->num_pages; + tx->dqo.qpl_bufs = malloc( + sizeof(*tx->dqo.qpl_bufs) * qpl_buf_cnt, + M_GVE, M_WAITOK | M_ZERO); + } else + gve_tx_alloc_rda_fields_dqo(tx); return (0); abort: @@ -335,6 +366,44 @@ return (0); } +static int +gve_tx_fill_ctx_descs(struct gve_tx_ring *tx, struct mbuf *mbuf, + bool is_tso, uint32_t *desc_idx) +{ + struct gve_tx_general_context_desc_dqo *gen_desc; + struct gve_tx_tso_context_desc_dqo *tso_desc; + struct gve_tx_metadata_dqo metadata; + int header_len; + int err; + + metadata = (struct gve_tx_metadata_dqo){0}; + gve_extract_tx_metadata_dqo(mbuf, &metadata); + + if (is_tso) { + err = gve_prep_tso(mbuf, &header_len); + if (__predict_false(err)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_tsoerr, 1); + counter_exit(); + return (err); + } + + tso_desc = &tx->dqo.desc_ring[*desc_idx].tso_ctx; + gve_tx_fill_tso_ctx_desc(tso_desc, mbuf, &metadata, header_len); + + *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; + counter_enter(); + counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); + counter_exit(); + } + + gen_desc = &tx->dqo.desc_ring[*desc_idx].general_ctx; + gve_tx_fill_general_ctx_desc(gen_desc, &metadata); + *desc_idx = (*desc_idx + 1) & tx->dqo.desc_mask; + return (0); +} + static int gve_map_mbuf_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf, bus_dmamap_t dmamap, @@ -500,18 +569,197 @@ } } +static bool +gve_tx_have_enough_qpl_bufs(struct gve_tx_ring *tx, int num_bufs) +{ + uint32_t available = tx->dqo.qpl_bufs_produced_cached - + tx->dqo.qpl_bufs_consumed; + + if (__predict_true(available >= num_bufs)) + return (true); + + tx->dqo.qpl_bufs_produced_cached = atomic_load_acq_32( + &tx->dqo.qpl_bufs_produced); + available = tx->dqo.qpl_bufs_produced_cached - + tx->dqo.qpl_bufs_consumed; + + if (__predict_true(available >= num_bufs)) + return (true); + return (false); +} + +static int32_t +gve_tx_alloc_qpl_buf(struct gve_tx_ring *tx) +{ + int32_t buf = tx->dqo.free_qpl_bufs_csm; + + if (__predict_false(buf == -1)) { + tx->dqo.free_qpl_bufs_csm = atomic_swap_32( + &tx->dqo.free_qpl_bufs_prd, -1); + buf = tx->dqo.free_qpl_bufs_csm; + if (__predict_false(buf == -1)) + return (-1); + } + + tx->dqo.free_qpl_bufs_csm = tx->dqo.qpl_bufs[buf]; + tx->dqo.qpl_bufs_consumed++; + return (buf); +} + +/* + * Tx buffer i corresponds to + * qpl_page_id = i / GVE_TX_BUFS_PER_PAGE_DQO + * qpl_page_offset = (i % GVE_TX_BUFS_PER_PAGE_DQO) * GVE_TX_BUF_SIZE_DQO 
+ */ +static void +gve_tx_buf_get_addr_dqo(struct gve_tx_ring *tx, + int32_t index, void **va, bus_addr_t *dma_addr) +{ + int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); + int offset = (index & (GVE_TX_BUFS_PER_PAGE_DQO - 1)) << + GVE_TX_BUF_SHIFT_DQO; + + *va = (char *)tx->com.qpl->dmas[page_id].cpu_addr + offset; + *dma_addr = tx->com.qpl->dmas[page_id].bus_addr + offset; +} + +static struct gve_dma_handle * +gve_get_page_dma_handle(struct gve_tx_ring *tx, int32_t index) +{ + int page_id = index >> (PAGE_SHIFT - GVE_TX_BUF_SHIFT_DQO); + + return (&tx->com.qpl->dmas[page_id]); +} + +static void +gve_tx_copy_mbuf_and_write_pkt_descs(struct gve_tx_ring *tx, + struct mbuf *mbuf, struct gve_tx_pending_pkt_dqo *pkt, + bool csum_enabled, int16_t completion_tag, + uint32_t *desc_idx) +{ + int32_t pkt_len = mbuf->m_pkthdr.len; + struct gve_dma_handle *dma; + uint32_t copy_offset = 0; + int32_t prev_buf = -1; + uint32_t copy_len; + bus_addr_t addr; + int32_t buf; + void *va; + + MPASS(pkt->num_qpl_bufs == 0); + MPASS(pkt->qpl_buf_head == -1); + + while (copy_offset < pkt_len) { + buf = gve_tx_alloc_qpl_buf(tx); + /* We already checked for availability */ + MPASS(buf != -1); + + gve_tx_buf_get_addr_dqo(tx, buf, &va, &addr); + copy_len = MIN(GVE_TX_BUF_SIZE_DQO, pkt_len - copy_offset); + m_copydata(mbuf, copy_offset, copy_len, va); + copy_offset += copy_len; + + dma = gve_get_page_dma_handle(tx, buf); + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_PREWRITE); + + gve_tx_fill_pkt_desc_dqo(tx, desc_idx, + copy_len, addr, completion_tag, + /*eop=*/copy_offset == pkt_len, + csum_enabled); + + /* Link all the qpl bufs for a packet */ + if (prev_buf == -1) + pkt->qpl_buf_head = buf; + else + tx->dqo.qpl_bufs[prev_buf] = buf; + + prev_buf = buf; + pkt->num_qpl_bufs++; + } + + tx->dqo.qpl_bufs[buf] = -1; +} + +int +gve_xmit_dqo_qpl(struct gve_tx_ring *tx, struct mbuf *mbuf) +{ + uint32_t desc_idx = tx->dqo.desc_tail; + struct gve_tx_pending_pkt_dqo *pkt; + int total_descs_needed; + int16_t completion_tag; + bool has_csum_flag; + int csum_flags; + bool is_tso; + int nsegs; + int err; + + csum_flags = mbuf->m_pkthdr.csum_flags; + has_csum_flag = csum_flags & (CSUM_TCP | CSUM_UDP | + CSUM_IP6_TCP | CSUM_IP6_UDP | CSUM_TSO); + is_tso = csum_flags & CSUM_TSO; + + nsegs = howmany(mbuf->m_pkthdr.len, GVE_TX_BUF_SIZE_DQO); + /* Check if we have enough room in the desc ring */ + total_descs_needed = 1 + /* general_ctx_desc */ + nsegs + /* pkt_desc */ + (is_tso ? 1 : 0); /* tso_ctx_desc */ + if (__predict_false(!gve_tx_has_desc_room_dqo(tx, total_descs_needed))) + return (ENOBUFS); + + if (!gve_tx_have_enough_qpl_bufs(tx, nsegs)) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_nospace_qpl_bufs, 1); + counter_exit(); + return (ENOBUFS); + } + + pkt = gve_alloc_pending_packet(tx); + if (pkt == NULL) { + counter_enter(); + counter_u64_add_protected( + tx->stats.tx_delayed_pkt_nospace_compring, 1); + counter_exit(); + return (ENOBUFS); + } + completion_tag = pkt - tx->dqo.pending_pkts; + pkt->mbuf = mbuf; + + err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); + if (err) + goto abort; + + gve_tx_copy_mbuf_and_write_pkt_descs(tx, mbuf, pkt, + has_csum_flag, completion_tag, &desc_idx); + + /* Remember the index of the last desc written */ + tx->dqo.desc_tail = desc_idx; + + /* + * Request a descriptor completion on the last descriptor of the + * packet if we are allowed to by the HW enforced interval. 
+ */ + gve_tx_request_desc_compl(tx, desc_idx); + + tx->req += total_descs_needed; /* tx->req is just a sysctl counter */ + return (0); + +abort: + pkt->mbuf = NULL; + gve_free_pending_packet(tx, pkt); + return (err); +} + int gve_xmit_dqo(struct gve_tx_ring *tx, struct mbuf **mbuf_ptr) { bus_dma_segment_t segs[GVE_TX_MAX_DATA_DESCS_DQO]; uint32_t desc_idx = tx->dqo.desc_tail; - struct gve_tx_metadata_dqo metadata; struct gve_tx_pending_pkt_dqo *pkt; struct mbuf *mbuf = *mbuf_ptr; int total_descs_needed; int16_t completion_tag; bool has_csum_flag; - int header_len; int csum_flags; bool is_tso; int nsegs; @@ -561,34 +809,11 @@ goto abort_with_dma; } - bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE); - - metadata = (struct gve_tx_metadata_dqo){0}; - gve_extract_tx_metadata_dqo(mbuf, &metadata); - - if (is_tso) { - err = gve_prep_tso(mbuf, &header_len); - if (__predict_false(err)) { - counter_enter(); - counter_u64_add_protected( - tx->stats.tx_delayed_pkt_tsoerr, 1); - counter_exit(); - goto abort_with_dma; - } - - gve_tx_fill_tso_ctx_desc(&tx->dqo.desc_ring[desc_idx].tso_ctx, - mbuf, &metadata, header_len); - desc_idx = (desc_idx + 1) & tx->dqo.desc_mask; - - counter_enter(); - counter_u64_add_protected(tx->stats.tso_packet_cnt, 1); - counter_exit(); - } - - gve_tx_fill_general_ctx_desc(&tx->dqo.desc_ring[desc_idx].general_ctx, - &metadata); - desc_idx = (desc_idx + 1) & tx->dqo.desc_mask; + err = gve_tx_fill_ctx_descs(tx, mbuf, is_tso, &desc_idx); + if (err) + goto abort_with_dma; + bus_dmamap_sync(tx->dqo.buf_dmatag, pkt->dmamap, BUS_DMASYNC_PREWRITE); for (i = 0; i < nsegs; i++) { gve_tx_fill_pkt_desc_dqo(tx, &desc_idx, segs[i].ds_len, segs[i].ds_addr, @@ -616,6 +841,45 @@ return (err); } +static void +gve_reap_qpl_bufs_dqo(struct gve_tx_ring *tx, + struct gve_tx_pending_pkt_dqo *pkt) +{ + int32_t buf = pkt->qpl_buf_head; + struct gve_dma_handle *dma; + int32_t qpl_buf_tail; + int32_t old_head; + int i; + + for (i = 0; i < pkt->num_qpl_bufs; i++) { + dma = gve_get_page_dma_handle(tx, buf); + bus_dmamap_sync(dma->tag, dma->map, BUS_DMASYNC_POSTWRITE); + qpl_buf_tail = buf; + buf = tx->dqo.qpl_bufs[buf]; + } + MPASS(buf == -1); + buf = qpl_buf_tail; + + while (true) { + old_head = atomic_load_32(&tx->dqo.free_qpl_bufs_prd); + tx->dqo.qpl_bufs[buf] = old_head; + + /* + * Prior loads and stores will be completed before subsequent + * stores. 
+ */ + atomic_thread_fence_rel(); + + if (atomic_cmpset_32(&tx->dqo.free_qpl_bufs_prd, + old_head, pkt->qpl_buf_head)) + break; + } + atomic_add_rel_32(&tx->dqo.qpl_bufs_produced, pkt->num_qpl_bufs); + + pkt->qpl_buf_head = -1; + pkt->num_qpl_bufs = 0; +} + static uint64_t gve_handle_packet_completion(struct gve_priv *priv, struct gve_tx_ring *tx, uint16_t compl_tag) @@ -640,7 +904,12 @@ } pkt_len = pending_pkt->mbuf->m_pkthdr.len; - gve_unmap_packet(tx, pending_pkt); + + if (gve_is_qpl(priv)) + gve_reap_qpl_bufs_dqo(tx, pending_pkt); + else + gve_unmap_packet(tx, pending_pkt); + m_freem(pending_pkt->mbuf); pending_pkt->mbuf = NULL; gve_free_pending_packet(tx, pending_pkt); @@ -716,6 +985,21 @@ tx->dqo.free_pending_pkts_csm = 0; atomic_store_rel_32(&tx->dqo.free_pending_pkts_prd, -1); + if (gve_is_qpl(priv)) { + int qpl_buf_cnt = GVE_TX_BUFS_PER_PAGE_DQO * + tx->com.qpl->num_pages; + + for (j = 0; j < qpl_buf_cnt - 1; j++) + tx->dqo.qpl_bufs[j] = j + 1; + tx->dqo.qpl_bufs[j] = -1; + + tx->dqo.free_qpl_bufs_csm = 0; + tx->dqo.free_qpl_bufs_prd = -1; + tx->dqo.qpl_bufs_produced = qpl_buf_cnt; + tx->dqo.qpl_bufs_produced_cached = qpl_buf_cnt; + tx->dqo.qpl_bufs_consumed = 0; + } + gve_tx_clear_desc_ring_dqo(tx); gve_tx_clear_compl_ring_dqo(tx); }