Index: sys/dev/mlx4/mlx4_en/en.h
===================================================================
--- sys/dev/mlx4/mlx4_en/en.h
+++ sys/dev/mlx4/mlx4_en/en.h
@@ -108,6 +108,25 @@
 	MLX4_EN_ALLOC_REPLACEMENT = 1,
 };
 
+/* Receive fragment sizes; we use at most 3 fragments (for 9600 byte MTU
+ * and 4K allocations) */
+#if MJUMPAGESIZE == 4096
+enum {
+	FRAG_SZ0 = MCLBYTES,
+	FRAG_SZ1 = MJUMPAGESIZE,
+	FRAG_SZ2 = MJUMPAGESIZE,
+};
+#define	MLX4_EN_MAX_RX_FRAGS	3
+#elif MJUMPAGESIZE == 8192
+enum {
+	FRAG_SZ0 = MCLBYTES,
+	FRAG_SZ1 = MJUMPAGESIZE,
+};
+#define	MLX4_EN_MAX_RX_FRAGS	2
+#else
+#error "Unknown PAGE_SIZE"
+#endif
+
 /* Maximum ring sizes */
 #define	MLX4_EN_DEF_TX_QUEUE_SIZE	4096
 
@@ -307,6 +326,7 @@
 	bus_dma_tag_t dma_tag;
 	struct mlx4_en_rx_spare spare;
 	u32 size ;	/* number of Rx descs*/
+	u32 num_mbufs;
 	u32 actual_size;
 	u32 size_mask;
 	u16 stride;
@@ -504,6 +524,10 @@
 	u8 vport_num;
 };
 
+struct mlx4_en_frag_info {
+	u16 frag_size;
+};
+
 struct mlx4_en_priv {
 	struct mlx4_en_dev *mdev;
 	struct mlx4_en_port_profile *prof;
@@ -554,6 +578,9 @@
 	u32 tx_ring_num;
 	u32 rx_ring_num;
 	u32 rx_mb_size;
+	struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS];
+	u16 num_frags;
+	u16 log_mbuf;
 	struct mlx4_en_tx_ring **tx_ring;
 	struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS];
 
Index: sys/dev/mlx4/mlx4_en/mlx4_en_rx.c
===================================================================
--- sys/dev/mlx4/mlx4_en/mlx4_en_rx.c
+++ sys/dev/mlx4/mlx4_en/mlx4_en_rx.c
@@ -53,10 +53,18 @@
 	    (ring->buf + (ring->stride * index));
 	int possible_frags;
 	int i;
+	int ip_align;
 
+	ip_align = MLX4_NET_IP_ALIGN;
 	/* Set size and memtype fields */
-	rx_desc->data[0].byte_count = cpu_to_be32(priv->rx_mb_size - MLX4_NET_IP_ALIGN);
-	rx_desc->data[0].lkey = cpu_to_be32(priv->mdev->mr.key);
+	for (i = 0; i < priv->num_frags; i++) {
+		rx_desc->data[i].byte_count =
+		    cpu_to_be32(priv->frag_info[i].frag_size - ip_align);
+		rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
+
+		/* Adjust only the first fragment for IP header alignment. */
+		ip_align = 0;
+	}
 
 	/*
	 * If the number of used fragments does not fill up the ring
@@ -64,50 +72,23 @@
 	 * null address/size and a special memory key:
 	 */
 	possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE;
-	for (i = 1; i < possible_frags; i++) {
+	for (i = priv->num_frags; i < possible_frags; i++) {
 		rx_desc->data[i].byte_count = 0;
 		rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD);
 		rx_desc->data[i].addr = 0;
 	}
 }
 
 static int
-mlx4_en_alloc_buf(struct mlx4_en_rx_ring *ring,
-    __be64 *pdma, struct mlx4_en_rx_mbuf *mb_list)
+mlx4_en_alloc_buf(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
+    __be64 *pdma, struct mlx4_en_rx_mbuf *mb_list, int flags, int frag_size)
 {
 	bus_dma_segment_t segs[1];
-	bus_dmamap_t map;
 	struct mbuf *mb;
 	int nsegs;
 	int err;
 
-	/* try to allocate a new spare mbuf */
-	if (unlikely(ring->spare.mbuf == NULL)) {
-		mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, ring->rx_mb_size);
-		if (unlikely(mb == NULL))
-			return (-ENOMEM);
-		/* setup correct length */
-		mb->m_pkthdr.len = mb->m_len = ring->rx_mb_size;
-
-		/* make sure IP header gets aligned */
-		m_adj(mb, MLX4_NET_IP_ALIGN);
-
-		/* load spare mbuf into BUSDMA */
-		err = -bus_dmamap_load_mbuf_sg(ring->dma_tag, ring->spare.dma_map,
-		    mb, segs, &nsegs, BUS_DMA_NOWAIT);
-		if (unlikely(err != 0)) {
-			m_freem(mb);
-			return (err);
-		}
-
-		/* store spare info */
-		ring->spare.mbuf = mb;
-		ring->spare.paddr_be = cpu_to_be64(segs[0].ds_addr);
-
-		bus_dmamap_sync(ring->dma_tag, ring->spare.dma_map,
-		    BUS_DMASYNC_PREREAD);
-	}
-
 	/* synchronize and unload the current mbuf, if any */
 	if (likely(mb_list->mbuf != NULL)) {
 		bus_dmamap_sync(ring->dma_tag, mb_list->dma_map,
@@ -115,21 +96,26 @@
 		bus_dmamap_unload(ring->dma_tag, mb_list->dma_map);
 	}
 
-	mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, ring->rx_mb_size);
-	if (unlikely(mb == NULL))
-		goto use_spare;
+	mb = m_getjcl(M_NOWAIT, MT_DATA, flags, frag_size);
+	if (unlikely(mb == NULL)) {
+		priv->port_stats.rx_alloc_failed++;
+		return (-ENOMEM);
+	}
 
 	/* setup correct length */
-	mb->m_pkthdr.len = mb->m_len = ring->rx_mb_size;
+	mb->m_len = frag_size;
 
 	/* make sure IP header gets aligned */
-	m_adj(mb, MLX4_NET_IP_ALIGN);
+	if (flags & M_PKTHDR) {
+		mb->m_pkthdr.len = frag_size;
+		m_adj(mb, MLX4_NET_IP_ALIGN);
+	}
 
 	err = -bus_dmamap_load_mbuf_sg(ring->dma_tag, mb_list->dma_map,
 	    mb, segs, &nsegs, BUS_DMA_NOWAIT);
 	if (unlikely(err != 0)) {
 		m_freem(mb);
-		goto use_spare;
+		return (-err);
 	}
 
 	*pdma = cpu_to_be64(segs[0].ds_addr);
@@ -137,30 +123,40 @@
 	bus_dmamap_sync(ring->dma_tag, mb_list->dma_map,
 	    BUS_DMASYNC_PREREAD);
 	return (0);
+}
+
+static void
+mlx4_en_free_buf(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
+    struct mlx4_en_rx_mbuf *mb_list)
+{
+	bus_dmamap_t map;
+	bus_dma_tag_t tag;
+	int nr;
 
-use_spare:
-	/* swap DMA maps */
-	map = mb_list->dma_map;
-	mb_list->dma_map = ring->spare.dma_map;
-	ring->spare.dma_map = map;
+	for (nr = 0; nr < priv->num_frags; nr++) {
+		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
 
-	/* swap MBUFs */
-	mb_list->mbuf = ring->spare.mbuf;
-	ring->spare.mbuf = NULL;
+		if (mb_list->mbuf != NULL) {
+			map = mb_list->dma_map;
+			tag = ring->dma_tag;
 
-	/* store physical address */
-	*pdma = ring->spare.paddr_be;
-	return (0);
+			bus_dmamap_sync(tag, map, BUS_DMASYNC_POSTREAD);
+			bus_dmamap_unload(tag, map);
+			m_freem(mb_list->mbuf);
+			mb_list->mbuf = NULL;	/* safety clearing */
+		}
+		mb_list++;
+	}
 }
 
 static void
-mlx4_en_free_buf(struct mlx4_en_rx_ring *ring, struct mlx4_en_rx_mbuf *mb_list)
+mlx4_en_free_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
+    int index)
 {
-	bus_dmamap_t map = mb_list->dma_map;
-	bus_dmamap_sync(ring->dma_tag, map, BUS_DMASYNC_POSTREAD);
-	bus_dmamap_unload(ring->dma_tag, map);
-	m_freem(mb_list->mbuf);
-	mb_list->mbuf = NULL;	/* safety clearing */
+	struct mlx4_en_rx_mbuf *mb_list;
+
+	mb_list = ring->mbuf + (index << priv->log_mbuf);
+	mlx4_en_free_buf(priv, ring, mb_list);
 }
 
 static int
@@ -169,15 +165,24 @@
 {
 	struct mlx4_en_rx_desc *rx_desc = (struct mlx4_en_rx_desc *)
 	    (ring->buf + (index * ring->stride));
-	struct mlx4_en_rx_mbuf *mb_list = ring->mbuf + index;
+	struct mlx4_en_rx_mbuf *mb_list = ring->mbuf + (index << priv->log_mbuf);
+	int i;
+	int flags;
 
-	mb_list->mbuf = NULL;
+	mlx4_en_free_buf(priv, ring, mb_list);
 
-	if (mlx4_en_alloc_buf(ring, &rx_desc->data[0].addr, mb_list)) {
-		priv->port_stats.rx_alloc_failed++;
-		return (-ENOMEM);
+	flags = M_PKTHDR;
+	for (i = 0; i < priv->num_frags; i++) {
+		if (mlx4_en_alloc_buf(priv, ring, &rx_desc->data[i].addr, &mb_list[i],
+		    flags, priv->frag_info[i].frag_size))
+			goto err;
+		flags = 0;
 	}
 	return (0);
+
+err:
+	mlx4_en_free_buf(priv, ring, mb_list);
+	return (-ENOMEM);
 }
 
 static inline void
@@ -226,8 +231,7 @@
 		while (ring->actual_size > new_size) {
 			ring->actual_size--;
 			ring->prod--;
-			mlx4_en_free_buf(ring,
-			    ring->mbuf + ring->actual_size);
+			mlx4_en_free_rx_desc(priv, ring, ring->actual_size);
 		}
 	}
 
@@ -247,33 +251,69 @@
 	while (ring->cons != ring->prod) {
 		index = ring->cons & ring->size_mask;
 		en_dbg(DRV, priv, "Processing descriptor:%d\n", index);
-		mlx4_en_free_buf(ring, ring->mbuf + index);
+		mlx4_en_free_rx_desc(priv, ring, index);
 		++ring->cons;
 	}
 }
 
+#if MLX4_EN_MAX_RX_FRAGS == 3
+static int frag_sizes[] = {
+	FRAG_SZ0,
+	FRAG_SZ1,
+	FRAG_SZ2,
+};
+#elif MLX4_EN_MAX_RX_FRAGS == 2
+static int frag_sizes[] = {
+	FRAG_SZ0,
+	FRAG_SZ1,
+};
+#else
+#error "Unknown MAX_RX_FRAGS"
+#endif
+
 void mlx4_en_calc_rx_buf(struct net_device *dev)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	int eff_mtu = dev->if_mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN +
 	    MLX4_NET_IP_ALIGN;
+	int buf_size = 0;
+	int i, frag;
 
-	if (eff_mtu > MJUM16BYTES) {
-		en_err(priv, "MTU(%d) is too big\n", dev->if_mtu);
-		eff_mtu = MJUM16BYTES;
-	} else if (eff_mtu > MJUM9BYTES) {
-		eff_mtu = MJUM16BYTES;
-	} else if (eff_mtu > MJUMPAGESIZE) {
-		eff_mtu = MJUM9BYTES;
-	} else if (eff_mtu > MCLBYTES) {
-		eff_mtu = MJUMPAGESIZE;
-	} else {
-		eff_mtu = MCLBYTES;
-	}
+	/*
+	 * Try to fit packets into a single mbuf+cluster, but we have to split
+	 * frames across multiple mbufs if the MTU is greater than the page
+	 * size so that we don't trigger the (very expensive) contiguous
+	 * memory allocator during normal rx operation.
+	 */
+	if (eff_mtu <= MCLBYTES) {
+		priv->frag_info[0].frag_size = MCLBYTES;
+		priv->num_frags = 1;
+	} else if (eff_mtu <= MJUMPAGESIZE) {
+		priv->frag_info[0].frag_size = MJUMPAGESIZE;
+		priv->num_frags = 1;
+	} else {
+		for (i = 0, frag = 0; buf_size < eff_mtu; frag++, i++) {
+			/*
+			 * Allocate small to large but only as much as is
+			 * needed for the tail.
+			 */
+			while (i > 0 && eff_mtu - buf_size <= frag_sizes[i - 1])
+				i--;
+			priv->frag_info[frag].frag_size = frag_sizes[i];
+			buf_size += priv->frag_info[frag].frag_size;
+		}
+		priv->num_frags = frag;
+	}
 
 	priv->rx_mb_size = eff_mtu;
+	priv->log_mbuf = ROUNDUP_LOG2(priv->num_frags);
 
-	en_dbg(DRV, priv, "Effective RX MTU: %d bytes\n", eff_mtu);
+	en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d "
+	    "num_frags:%d):\n", eff_mtu, priv->num_frags);
+	for (i = 0; i < priv->num_frags; i++) {
+		en_dbg(DRV, priv, "  frag:%d - size:%d\n", i,
+		    priv->frag_info[i].frag_size);
+	}
 }
 
 int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
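The tail-shrinking loop in mlx4_en_calc_rx_buf() above is the least obvious part of this hunk, so here is a small standalone sketch of the same selection logic (illustration only, not part of the patch; the SKETCH_* constants and the sample effective MTUs are assumptions for the 4 KB MJUMPAGESIZE case). It shows, for example, that an effective MTU of 8024 bytes ends up as a 2048+4096+2048 scatter list, while 9624 bytes becomes 2048+4096+4096.

/*
 * Standalone sketch of the fragment-size selection in mlx4_en_calc_rx_buf();
 * the sizes below are hard-coded assumptions for a 4 KB MJUMPAGESIZE system.
 */
#include <stdio.h>

#define	SKETCH_MCLBYTES		2048
#define	SKETCH_MJUMPAGESIZE	4096

static const int frag_sizes[] = {
	SKETCH_MCLBYTES, SKETCH_MJUMPAGESIZE, SKETCH_MJUMPAGESIZE
};

int
main(void)
{
	const int eff_mtus[] = { 1524, 4024, 8024, 9624 };
	int frag_info[3];
	unsigned int n;
	int i, frag, eff_mtu, buf_size;

	for (n = 0; n < sizeof(eff_mtus) / sizeof(eff_mtus[0]); n++) {
		eff_mtu = eff_mtus[n];
		buf_size = 0;
		if (eff_mtu <= SKETCH_MCLBYTES) {
			frag_info[0] = SKETCH_MCLBYTES;
			frag = 1;
		} else if (eff_mtu <= SKETCH_MJUMPAGESIZE) {
			frag_info[0] = SKETCH_MJUMPAGESIZE;
			frag = 1;
		} else {
			for (i = 0, frag = 0; buf_size < eff_mtu; frag++, i++) {
				/* Step back down when the tail fits in a smaller size. */
				while (i > 0 && eff_mtu - buf_size <= frag_sizes[i - 1])
					i--;
				frag_info[frag] = frag_sizes[i];
				buf_size += frag_info[frag];
			}
		}
		printf("eff_mtu %4d ->", eff_mtu);
		for (i = 0; i < frag; i++)
			printf(" %d", frag_info[i]);
		printf("\n");
	}
	return (0);
}

The 9624-byte case corresponds to the 9600-byte MTU mentioned in the en.h comment: 9600 plus 24 bytes of Ethernet/VLAN/FCS/alignment overhead (assuming the usual 14+4+4+2) still fits in three fragments totalling 10240 bytes, which is why MLX4_EN_MAX_RX_FRAGS is 3 for 4 KB pages.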
@@ -283,7 +323,7 @@
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_rx_ring *ring;
 	int err;
-	int tmp;
+	size_t ring_size_bytes;
 	uint32_t x;
 
 	ring = kzalloc(sizeof(struct mlx4_en_rx_ring), GFP_KERNEL);
@@ -300,9 +340,9 @@
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
-	    MJUM16BYTES,		/* maxsize */
+	    MJUMPAGESIZE,		/* maxsize */
 	    1,				/* nsegments */
-	    MJUM16BYTES,		/* maxsegsize */
+	    MJUMPAGESIZE,		/* maxsegsize */
 	    0,				/* flags */
 	    NULL, NULL,			/* lockfunc, lockfuncarg */
 	    &ring->dma_tag))) {
@@ -314,24 +354,21 @@
 	ring->cons = 0;
 	ring->size = size;
 	ring->size_mask = size - 1;
-	ring->stride = roundup_pow_of_two(
-	    sizeof(struct mlx4_en_rx_desc) + DS_SIZE);
+	ring->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+	    DS_SIZE * MLX4_EN_MAX_RX_FRAGS);
 	ring->log_stride = ffs(ring->stride) - 1;
 	ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
 
-	tmp = size * sizeof(struct mlx4_en_rx_mbuf);
+	ring->num_mbufs = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS);
+	ring_size_bytes = ring->num_mbufs * sizeof(struct mlx4_en_rx_mbuf);
 
-	ring->mbuf = kzalloc(tmp, GFP_KERNEL);
+	ring->mbuf = kzalloc(ring_size_bytes, GFP_KERNEL);
 	if (ring->mbuf == NULL) {
 		err = -ENOMEM;
 		goto err_dma_tag;
 	}
 
-	err = -bus_dmamap_create(ring->dma_tag, 0, &ring->spare.dma_map);
-	if (err != 0)
-		goto err_info;
-
-	for (x = 0; x != size; x++) {
+	for (x = 0; x != ring->num_mbufs; x++) {
 		err = -bus_dmamap_create(ring->dma_tag, 0,
 		    &ring->mbuf[x].dma_map);
 		if (err != 0) {
@@ -341,8 +378,8 @@
 			goto err_info;
 		}
 	}
-	en_dbg(DRV, priv, "Allocated MBUF ring at addr:%p size:%d\n",
-	    ring->mbuf, tmp);
+	en_dbg(DRV, priv, "Allocated MBUF ring at addr:%p size:%zu\n",
+	    ring->mbuf, ring_size_bytes);
 
 	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres,
 	    ring->buf_size, 2 * PAGE_SIZE);
@@ -381,8 +418,8 @@
 	int i;
 	int ring_ind;
 	int err;
-	int stride = roundup_pow_of_two(
-	    sizeof(struct mlx4_en_rx_desc) + DS_SIZE);
+	int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+	    DS_SIZE * priv->num_frags);
 
 	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
 		ring = priv->rx_ring[ring_ind];
@@ -465,16 +502,8 @@
 	mlx4_en_unmap_buffer(&ring->wqres.buf);
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
-	for (x = 0; x != size; x++)
+	for (x = 0; x != ring->num_mbufs; x++)
 		bus_dmamap_destroy(ring->dma_tag, ring->mbuf[x].dma_map);
-	/* free spare mbuf, if any */
-	if (ring->spare.mbuf != NULL) {
-		bus_dmamap_sync(ring->dma_tag, ring->spare.dma_map,
-		    BUS_DMASYNC_POSTREAD);
-		bus_dmamap_unload(ring->dma_tag, ring->spare.dma_map);
-		m_freem(ring->spare.mbuf);
-	}
-	bus_dmamap_destroy(ring->dma_tag, ring->spare.dma_map);
 	vfree(ring->mbuf);
 	bus_dma_tag_destroy(ring->dma_tag);
 	kfree(ring);
 }
@@ -532,6 +561,68 @@
 	return 0;
 }
 
+/*
+ * Collect up the packet fragments, represented by individual mbufs, into a
+ * single mbuf chain ready to be passed up the stack. As mbufs are removed from
+ * the ring replace them with newly allocated ones; if we fail to allocate an
+ * mbuf then drop the current packet and return an error. This ensures that the
+ * ring is always in a state where it is ready to receive packets.
+ */
+static int
+mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
+    struct mlx4_en_rx_ring *ring, struct mlx4_en_rx_desc *rx_desc,
+    struct mlx4_en_rx_mbuf *mb_list, int length)
+{
+	struct mlx4_en_frag_info *frag_info;
+	struct mbuf *mb, *first_mb, *prev_mb;
+	int flags, nr, align_len, mb_len;
+
+	first_mb = mb_list[0].mbuf;
+	prev_mb = NULL;
+	first_mb->m_pkthdr.len = length;
+	flags = M_PKTHDR;
+	align_len = MLX4_NET_IP_ALIGN;
+
+	/* Collect used fragments while replacing them in the HW descriptors */
+	for (nr = 0; nr < priv->num_frags; nr++) {
+		frag_info = &priv->frag_info[nr];
+
+		mb = mb_list[nr].mbuf;
+
+		/* Allocate a replacement mbuf */
+		if (mlx4_en_alloc_buf(priv, ring, &rx_desc->data[nr].addr,
+		    &mb_list[nr], flags, priv->frag_info[nr].frag_size))
+			goto fail;
+
+		if (prev_mb != NULL)
+			prev_mb->m_next = mb;
+		mb_len = frag_info->frag_size - align_len;
+		prev_mb = mb;
+
+		if (length <= mb_len)
+			break;
+
+		mb->m_len = mb_len;
+		length -= mb_len;
+		flags = 0;
+		align_len = 0;
+	}
+	/* Adjust size of last fragment to match actual length */
+	prev_mb->m_len = min(length, prev_mb->m_len);
+	prev_mb->m_next = NULL;
+	return (0);
+
+fail:
+	/*
+	 * At this point the fragments have been partially extracted and
+	 * replaced. Free the mbufs that are no longer referenced by the ring.
+	 */
+	if (first_mb != mb_list[0].mbuf)
+		m_freem(first_mb);
+	return (-ENOMEM);
+}
+
 static struct mbuf *
 mlx4_en_rx_mb(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
     struct mlx4_en_rx_desc *rx_desc, struct mlx4_en_rx_mbuf *mb_list,
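To make the length bookkeeping in mlx4_en_complete_rx_desc() above concrete, the snippet below is an illustration only (not part of the patch); it replays the same arithmetic for an assumed 2048/4096/4096 scatter list, an assumed 2-byte MLX4_NET_IP_ALIGN and a 6000-byte frame. The head mbuf carries 2046 bytes (its cluster was already trimmed by m_adj() at allocation time) and the second mbuf is trimmed to the remaining 3954 bytes.

/*
 * Illustration only: how a completed frame's length is split across the
 * mbuf chain; the sizes, alignment and frame length are assumed values.
 */
#include <stdio.h>

int
main(void)
{
	const int frag_size[] = { 2048, 4096, 4096 };
	const int ip_align = 2;		/* assumed MLX4_NET_IP_ALIGN */
	int length = 6000;		/* byte count reported by the CQE */
	int nr, mb_len, align = ip_align;

	for (nr = 0; nr < 3; nr++) {
		mb_len = frag_size[nr] - align;
		if (length <= mb_len) {
			/* Last fragment: m_len is trimmed to what is left. */
			printf("mbuf %d: m_len %d (tail)\n", nr, length);
			break;
		}
		printf("mbuf %d: m_len %d\n", nr, mb_len);
		length -= mb_len;
		align = 0;	/* only the head fragment is IP-aligned */
	}
	return (0);
}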
@@ -539,19 +630,12 @@
 {
 	struct mbuf *mb;
 
-	/* get mbuf */
 	mb = mb_list->mbuf;
 
-	/* collect used fragment while atomically replacing it */
-	if (mlx4_en_alloc_buf(ring, &rx_desc->data[0].addr, mb_list))
-		return (NULL);
-
-	/* range check hardware computed value */
-	if (unlikely(length > mb->m_len))
-		length = mb->m_len;
+	/* Move relevant fragments to mb */
+	if (unlikely(mlx4_en_complete_rx_desc(priv, ring, rx_desc, mb_list, length)))
+		return (NULL);
 
-	/* update total packet length in packet header */
-	mb->m_len = mb->m_pkthdr.len = length;
 	return (mb);
 }
 
@@ -591,7 +675,7 @@
 	/* Process all completed CQEs */
 	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
 		    cons_index & size)) {
-		mb_list = ring->mbuf + index;
+		mb_list = ring->mbuf + (index << priv->log_mbuf);
 		rx_desc = (struct mlx4_en_rx_desc *)
 		    (ring->buf + (index << ring->log_stride));