D11560.diff

Index: sys/dev/mlx4/mlx4_en/en.h
===================================================================
--- sys/dev/mlx4/mlx4_en/en.h
+++ sys/dev/mlx4/mlx4_en/en.h
@@ -108,6 +108,25 @@
MLX4_EN_ALLOC_REPLACEMENT = 1,
};
+/* Receive fragment sizes; we use at most 3 fragments (for 9600 byte MTU
+ * and 4K allocations) */
+#if MJUMPAGESIZE == 4096
+enum {
+ FRAG_SZ0 = MCLBYTES,
+ FRAG_SZ1 = MJUMPAGESIZE,
+ FRAG_SZ2 = MJUMPAGESIZE,
+};
+#define MLX4_EN_MAX_RX_FRAGS 3
+#elif MJUMPAGESIZE == 8192
+enum {
+ FRAG_SZ0 = MCLBYTES,
+ FRAG_SZ1 = MJUMPAGESIZE,
+};
+#define MLX4_EN_MAX_RX_FRAGS 2
+#else
+#error "Unknown PAGE_SIZE"
+#endif
+
/* Maximum ring sizes */
#define MLX4_EN_DEF_TX_QUEUE_SIZE 4096
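
As a sanity check on the fragment-size comment above, here is a minimal standalone sketch, not part of the patch. It assumes MCLBYTES is 2048 and MJUMPAGESIZE is 4096 on 4K-page systems, with the usual 14-byte Ethernet header, 4-byte VLAN tag, 4-byte FCS, and the driver's 2-byte IP-alignment pad; none of these constants are taken from this diff.

/*
 * Standalone sketch: check that three fragments (MCLBYTES plus two
 * MJUMPAGESIZE clusters) cover a 9600-byte MTU frame with 4K
 * allocations.  Header sizes are assumptions, not from this diff.
 */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	int mclbytes = 2048;			/* assumed MCLBYTES */
	int mjumpagesize = 4096;		/* assumed MJUMPAGESIZE (4K pages) */
	int eff_mtu = 9600 + 14 + 4 + 4 + 2;	/* MTU + ETH + VLAN + FCS + align */

	assert(mclbytes + 2 * mjumpagesize >= eff_mtu);
	printf("coverage: %d >= %d\n", mclbytes + 2 * mjumpagesize, eff_mtu);
	return (0);
}
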
@@ -307,6 +326,7 @@
bus_dma_tag_t dma_tag;
struct mlx4_en_rx_spare spare;
u32 size; /* number of Rx descs */
+ u32 num_mbufs;
u32 actual_size;
u32 size_mask;
u16 stride;
@@ -504,6 +524,10 @@
u8 vport_num;
};
+struct mlx4_en_frag_info {
+ u16 frag_size;
+};
+
struct mlx4_en_priv {
struct mlx4_en_dev *mdev;
struct mlx4_en_port_profile *prof;
@@ -554,6 +578,9 @@
u32 tx_ring_num;
u32 rx_ring_num;
u32 rx_mb_size;
+ struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS];
+ u16 num_frags;
+ u16 log_mbuf;
struct mlx4_en_tx_ring **tx_ring;
struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS];
Index: sys/dev/mlx4/mlx4_en/mlx4_en_rx.c
===================================================================
--- sys/dev/mlx4/mlx4_en/mlx4_en_rx.c
+++ sys/dev/mlx4/mlx4_en/mlx4_en_rx.c
@@ -53,10 +53,18 @@
(ring->buf + (ring->stride * index));
int possible_frags;
int i;
+ int ip_align;
+ ip_align = MLX4_NET_IP_ALIGN;
/* Set size and memtype fields */
- rx_desc->data[0].byte_count = cpu_to_be32(priv->rx_mb_size - MLX4_NET_IP_ALIGN);
- rx_desc->data[0].lkey = cpu_to_be32(priv->mdev->mr.key);
+ for (i = 0; i < priv->num_frags; i++) {
+ rx_desc->data[i].byte_count =
+ cpu_to_be32(priv->frag_info[i].frag_size - ip_align);
+ rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
+
+ /* Adjust only the first fragment for IP header alignment. */
+ ip_align = 0;
+ }
/*
* If the number of used fragments does not fill up the ring
@@ -64,50 +72,23 @@
* null address/size and a special memory key:
*/
possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE;
- for (i = 1; i < possible_frags; i++) {
+ for (i = priv->num_frags; i < possible_frags; i++) {
rx_desc->data[i].byte_count = 0;
rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD);
rx_desc->data[i].addr = 0;
}
}
static int
-mlx4_en_alloc_buf(struct mlx4_en_rx_ring *ring,
- __be64 *pdma, struct mlx4_en_rx_mbuf *mb_list)
+mlx4_en_alloc_buf(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
+ __be64 *pdma, struct mlx4_en_rx_mbuf *mb_list, int flags, int frag_size)
{
bus_dma_segment_t segs[1];
- bus_dmamap_t map;
struct mbuf *mb;
int nsegs;
int err;
- /* try to allocate a new spare mbuf */
- if (unlikely(ring->spare.mbuf == NULL)) {
- mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, ring->rx_mb_size);
- if (unlikely(mb == NULL))
- return (-ENOMEM);
- /* setup correct length */
- mb->m_pkthdr.len = mb->m_len = ring->rx_mb_size;
-
- /* make sure IP header gets aligned */
- m_adj(mb, MLX4_NET_IP_ALIGN);
-
- /* load spare mbuf into BUSDMA */
- err = -bus_dmamap_load_mbuf_sg(ring->dma_tag, ring->spare.dma_map,
- mb, segs, &nsegs, BUS_DMA_NOWAIT);
- if (unlikely(err != 0)) {
- m_freem(mb);
- return (err);
- }
-
- /* store spare info */
- ring->spare.mbuf = mb;
- ring->spare.paddr_be = cpu_to_be64(segs[0].ds_addr);
-
- bus_dmamap_sync(ring->dma_tag, ring->spare.dma_map,
- BUS_DMASYNC_PREREAD);
- }
-
/* synchronize and unload the current mbuf, if any */
if (likely(mb_list->mbuf != NULL)) {
bus_dmamap_sync(ring->dma_tag, mb_list->dma_map,
@@ -115,21 +96,26 @@
bus_dmamap_unload(ring->dma_tag, mb_list->dma_map);
}
- mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, ring->rx_mb_size);
- if (unlikely(mb == NULL))
- goto use_spare;
+ mb = m_getjcl(M_NOWAIT, MT_DATA, flags, frag_size);
+ if (unlikely(mb == NULL)) {
+ priv->port_stats.rx_alloc_failed++;
+ return (-ENOMEM);
+ }
/* setup correct length */
- mb->m_pkthdr.len = mb->m_len = ring->rx_mb_size;
+ mb->m_len = frag_size;
/* make sure IP header gets aligned */
- m_adj(mb, MLX4_NET_IP_ALIGN);
+ if (flags & M_PKTHDR) {
+ mb->m_pkthdr.len = frag_size;
+ m_adj(mb, MLX4_NET_IP_ALIGN);
+ }
err = -bus_dmamap_load_mbuf_sg(ring->dma_tag, mb_list->dma_map,
mb, segs, &nsegs, BUS_DMA_NOWAIT);
if (unlikely(err != 0)) {
m_freem(mb);
- goto use_spare;
+ return (-err);
}
*pdma = cpu_to_be64(segs[0].ds_addr);
@@ -137,30 +123,40 @@
bus_dmamap_sync(ring->dma_tag, mb_list->dma_map, BUS_DMASYNC_PREREAD);
return (0);
+}
+
+static void
+mlx4_en_free_buf(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
+ struct mlx4_en_rx_mbuf *mb_list)
+{
+ bus_dmamap_t map;
+ bus_dma_tag_t tag;
+ int nr;
-use_spare:
- /* swap DMA maps */
- map = mb_list->dma_map;
- mb_list->dma_map = ring->spare.dma_map;
- ring->spare.dma_map = map;
+ for (nr = 0; nr < priv->num_frags; nr++) {
+ en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
- /* swap MBUFs */
- mb_list->mbuf = ring->spare.mbuf;
- ring->spare.mbuf = NULL;
+ if (mb_list->mbuf != NULL) {
+ map = mb_list->dma_map;
+ tag = ring->dma_tag;
- /* store physical address */
- *pdma = ring->spare.paddr_be;
- return (0);
+ bus_dmamap_sync(tag, map, BUS_DMASYNC_POSTREAD);
+ bus_dmamap_unload(tag, map);
+ m_freem(mb_list->mbuf);
+ mb_list->mbuf = NULL; /* safety clearing */
+ }
+ mb_list++;
+ }
}
static void
-mlx4_en_free_buf(struct mlx4_en_rx_ring *ring, struct mlx4_en_rx_mbuf *mb_list)
+mlx4_en_free_rx_desc(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
+ int index)
{
- bus_dmamap_t map = mb_list->dma_map;
- bus_dmamap_sync(ring->dma_tag, map, BUS_DMASYNC_POSTREAD);
- bus_dmamap_unload(ring->dma_tag, map);
- m_freem(mb_list->mbuf);
- mb_list->mbuf = NULL; /* safety clearing */
+ struct mlx4_en_rx_mbuf *mb_list;
+
+ mb_list = ring->mbuf + (index << priv->log_mbuf);
+ mlx4_en_free_buf(priv, ring, mb_list);
}
static int
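
The hunk above indexes the per-ring mbuf array with "index << priv->log_mbuf" rather than "index". A minimal sketch of that layout follows; the ROUNDUP_LOG2() behavior is assumed (round the fragment count up to the next power of two and take its log), as is a fragment count of 3.

/*
 * Standalone sketch of the ring->mbuf layout assumed by the patch:
 * each RX descriptor owns a power-of-two block of mb_list slots, so
 * "index << log_mbuf" lands on the first fragment of descriptor
 * "index".  With num_frags == 3, log_mbuf == 2: slots 0..3 belong to
 * descriptor 0, slots 4..7 to descriptor 1, and so on.
 */
#include <stdio.h>

static int
roundup_log2(int x)	/* stand-in for the driver's ROUNDUP_LOG2() */
{
	int log = 0;

	while ((1 << log) < x)
		log++;
	return (log);
}

int main(void)
{
	int num_frags = 3;	/* assumed, matches MLX4_EN_MAX_RX_FRAGS */
	int log_mbuf = roundup_log2(num_frags);

	for (int index = 0; index < 3; index++)
		printf("desc %d -> mb_list[%d]\n", index, index << log_mbuf);
	return (0);
}
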
@@ -169,15 +165,24 @@
{
struct mlx4_en_rx_desc *rx_desc = (struct mlx4_en_rx_desc *)
(ring->buf + (index * ring->stride));
- struct mlx4_en_rx_mbuf *mb_list = ring->mbuf + index;
+ struct mlx4_en_rx_mbuf *mb_list = ring->mbuf + (index << priv->log_mbuf);
+ int i;
+ int flags;
- mb_list->mbuf = NULL;
+ mlx4_en_free_buf(priv, ring, mb_list);
- if (mlx4_en_alloc_buf(ring, &rx_desc->data[0].addr, mb_list)) {
- priv->port_stats.rx_alloc_failed++;
- return (-ENOMEM);
+ flags = M_PKTHDR;
+ for (i = 0; i < priv->num_frags; i++) {
+ if (mlx4_en_alloc_buf(priv, ring, &rx_desc->data[i].addr, &mb_list[i],
+ flags, priv->frag_info[i].frag_size))
+ goto err;
+ flags = 0;
}
return (0);
+
+err:
+ mlx4_en_free_buf(priv, ring, mb_list);
+ return (-ENOMEM);
}
static inline void
@@ -226,8 +231,7 @@
while (ring->actual_size > new_size) {
ring->actual_size--;
ring->prod--;
- mlx4_en_free_buf(ring,
- ring->mbuf + ring->actual_size);
+ mlx4_en_free_rx_desc(priv, ring, ring->actual_size);
}
}
@@ -247,33 +251,69 @@
while (ring->cons != ring->prod) {
index = ring->cons & ring->size_mask;
en_dbg(DRV, priv, "Processing descriptor:%d\n", index);
- mlx4_en_free_buf(ring, ring->mbuf + index);
+ mlx4_en_free_rx_desc(priv, ring, index);
++ring->cons;
}
}
+#if MLX4_EN_MAX_RX_FRAGS == 3
+static int frag_sizes[] = {
+ FRAG_SZ0,
+ FRAG_SZ1,
+ FRAG_SZ2,
+};
+#elif MLX4_EN_MAX_RX_FRAGS == 2
+static int frag_sizes[] = {
+ FRAG_SZ0,
+ FRAG_SZ1,
+};
+#else
+#error "Unknown MAX_RX_FRAGS"
+#endif
+
void mlx4_en_calc_rx_buf(struct net_device *dev)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
int eff_mtu = dev->if_mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN +
MLX4_NET_IP_ALIGN;
+ int buf_size = 0;
+ int i, frag;
- if (eff_mtu > MJUM16BYTES) {
- en_err(priv, "MTU(%d) is too big\n", dev->if_mtu);
- eff_mtu = MJUM16BYTES;
- } else if (eff_mtu > MJUM9BYTES) {
- eff_mtu = MJUM16BYTES;
- } else if (eff_mtu > MJUMPAGESIZE) {
- eff_mtu = MJUM9BYTES;
- } else if (eff_mtu > MCLBYTES) {
- eff_mtu = MJUMPAGESIZE;
- } else {
- eff_mtu = MCLBYTES;
- }
+ /*
+ * Try to fit packets into a single mbuf+cluster, but we have to split
+ * frames across multiple mbufs if the MTU is greater than the page
+ * size so that we don't trigger the (very expensive) contiguous
+ * memory allocator during normal rx operation.
+ */
+ if (eff_mtu <= MCLBYTES) {
+ priv->frag_info[0].frag_size = MCLBYTES;
+ priv->num_frags = 1;
+ } else if (eff_mtu <= MJUMPAGESIZE) {
+ priv->frag_info[0].frag_size = MJUMPAGESIZE;
+ priv->num_frags = 1;
+ } else {
+ for (i = 0, frag = 0; buf_size < eff_mtu; frag++, i++) {
+ /*
+ * Allocate small to large but only as much as is needed for
+ * the tail.
+ */
+ while (i > 0 && eff_mtu - buf_size <= frag_sizes[i - 1])
+ i--;
+ priv->frag_info[frag].frag_size = frag_sizes[i];
+ buf_size += priv->frag_info[frag].frag_size;
+ }
+ priv->num_frags = frag;
+ }
priv->rx_mb_size = eff_mtu;
+ priv->log_mbuf = ROUNDUP_LOG2(priv->num_frags);
- en_dbg(DRV, priv, "Effective RX MTU: %d bytes\n", eff_mtu);
+ en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d "
+ "num_frags:%d):\n", eff_mtu, priv->num_frags);
+ for (i = 0; i < priv->num_frags; i++) {
+ en_dbg(DRV, priv, " frag:%d - size:%d\n", i,
+ priv->frag_info[i].frag_size);
+ }
}
int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
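
To see what the small-to-large tail fit in mlx4_en_calc_rx_buf() above produces, here is a standalone sketch of the same loop for a 9600-byte MTU on a 4K-page system. The fragment table {2048, 4096, 4096} and the header overheads are assumptions matching the earlier en.h hunk, not values computed by this diff.

/*
 * Standalone sketch of the fragment-selection loop above.  Expected
 * output: fragments of 2048, 4096, and 4096 bytes, num_frags = 3.
 * The last fragment is shrunk to the smallest table entry that still
 * fits the remaining bytes; the table is assumed to cover eff_mtu.
 */
#include <stdio.h>

int main(void)
{
	int frag_sizes[] = { 2048, 4096, 4096 };	/* assumed FRAG_SZ0..2 */
	int eff_mtu = 9600 + 14 + 4 + 4 + 2;		/* MTU + headers + align */
	int buf_size = 0, i, frag;

	for (i = 0, frag = 0; buf_size < eff_mtu; frag++, i++) {
		/* Shrink the tail to the smallest size that still fits. */
		while (i > 0 && eff_mtu - buf_size <= frag_sizes[i - 1])
			i--;
		printf("frag %d: %d bytes\n", frag, frag_sizes[i]);
		buf_size += frag_sizes[i];
	}
	printf("num_frags = %d, total = %d\n", frag, buf_size);
	return (0);
}
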
@@ -283,7 +323,7 @@
struct mlx4_en_dev *mdev = priv->mdev;
struct mlx4_en_rx_ring *ring;
int err;
- int tmp;
+ size_t ring_size_bytes;
uint32_t x;
ring = kzalloc(sizeof(struct mlx4_en_rx_ring), GFP_KERNEL);
@@ -300,9 +340,9 @@
BUS_SPACE_MAXADDR, /* lowaddr */
BUS_SPACE_MAXADDR, /* highaddr */
NULL, NULL, /* filter, filterarg */
- MJUM16BYTES, /* maxsize */
+ MJUMPAGESIZE, /* maxsize */
1, /* nsegments */
- MJUM16BYTES, /* maxsegsize */
+ MJUMPAGESIZE, /* maxsegsize */
0, /* flags */
NULL, NULL, /* lockfunc, lockfuncarg */
&ring->dma_tag))) {
@@ -314,24 +354,21 @@
ring->cons = 0;
ring->size = size;
ring->size_mask = size - 1;
- ring->stride = roundup_pow_of_two(
- sizeof(struct mlx4_en_rx_desc) + DS_SIZE);
+ ring->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+ DS_SIZE * MLX4_EN_MAX_RX_FRAGS);
ring->log_stride = ffs(ring->stride) - 1;
ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
- tmp = size * sizeof(struct mlx4_en_rx_mbuf);
+ ring->num_mbufs = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS);
+ ring_size_bytes = ring->num_mbufs * sizeof(struct mlx4_en_rx_mbuf);
- ring->mbuf = kzalloc(tmp, GFP_KERNEL);
+ ring->mbuf = kzalloc(ring_size_bytes, GFP_KERNEL);
if (ring->mbuf == NULL) {
err = -ENOMEM;
goto err_dma_tag;
}
- err = -bus_dmamap_create(ring->dma_tag, 0, &ring->spare.dma_map);
- if (err != 0)
- goto err_info;
-
- for (x = 0; x != size; x++) {
+ for (x = 0; x != ring->num_mbufs; x++) {
err = -bus_dmamap_create(ring->dma_tag, 0,
&ring->mbuf[x].dma_map);
if (err != 0) {
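
The sizing arithmetic in the hunk above (stride and num_mbufs) can be checked with a short sketch. DS_SIZE of 16 bytes per scatter entry and a zero-length mlx4_en_rx_desc header (a flexible data[] array) are assumptions about the driver headers, not facts established by this diff.

/*
 * Standalone sketch of the ring sizing above: the descriptor stride is
 * the scatter list rounded up to a power of two, and the mbuf array is
 * over-allocated to a power-of-two block per descriptor so that the
 * "index << log_mbuf" addressing works.
 */
#include <stdio.h>

static unsigned
roundup_pow_of_two(unsigned x)
{
	unsigned r = 1;

	while (r < x)
		r <<= 1;
	return (r);
}

int main(void)
{
	unsigned ds_size = 16;		/* assumed scatter-entry size */
	unsigned max_frags = 3;		/* assumed MLX4_EN_MAX_RX_FRAGS */
	unsigned size = 4096;		/* example ring size */
	unsigned stride = roundup_pow_of_two(0 + ds_size * max_frags);
	unsigned num_mbufs = size * roundup_pow_of_two(max_frags);

	printf("stride=%u num_mbufs=%u\n", stride, num_mbufs);	/* 64, 16384 */
	return (0);
}
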
@@ -341,8 +378,8 @@
goto err_info;
}
}
- en_dbg(DRV, priv, "Allocated MBUF ring at addr:%p size:%d\n",
- ring->mbuf, tmp);
+ en_dbg(DRV, priv, "Allocated MBUF ring at addr:%p size:%zd\n",
+ ring->mbuf, ring_size_bytes);
err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres,
ring->buf_size, 2 * PAGE_SIZE);
@@ -381,8 +418,8 @@
int i;
int ring_ind;
int err;
- int stride = roundup_pow_of_two(
- sizeof(struct mlx4_en_rx_desc) + DS_SIZE);
+ int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
+ DS_SIZE * priv->num_frags);
for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
ring = priv->rx_ring[ring_ind];
@@ -465,16 +502,8 @@
mlx4_en_unmap_buffer(&ring->wqres.buf);
mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
- for (x = 0; x != size; x++)
+ for (x = 0; x != ring->num_mbufs; x++)
bus_dmamap_destroy(ring->dma_tag, ring->mbuf[x].dma_map);
- /* free spare mbuf, if any */
- if (ring->spare.mbuf != NULL) {
- bus_dmamap_sync(ring->dma_tag, ring->spare.dma_map,
- BUS_DMASYNC_POSTREAD);
- bus_dmamap_unload(ring->dma_tag, ring->spare.dma_map);
- m_freem(ring->spare.mbuf);
- }
- bus_dmamap_destroy(ring->dma_tag, ring->spare.dma_map);
vfree(ring->mbuf);
bus_dma_tag_destroy(ring->dma_tag);
kfree(ring);
@@ -532,6 +561,68 @@
return 0;
}
+/*
+ * Collect up the packet fragments, represented by individual mbufs, into a
+ * single mbuf chain ready to be passed up the stack. As mbufs are removed from
+ * the ring, replace them with newly allocated ones; if we fail to allocate an
+ * mbuf then drop the current packet and return an error. This ensures that the
+ * ring is always in a state where it is ready to receive packets.
+ */
+static int
+mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
+ struct mlx4_en_rx_ring *ring, struct mlx4_en_rx_desc *rx_desc,
+ struct mlx4_en_rx_mbuf *mb_list, int length)
+{
+ struct mlx4_en_frag_info *frag_info;
+ struct mbuf *mb, *first_mb, *prev_mb;
+ int flags, nr, align_len, mb_len;
+
+ first_mb = mb_list[0].mbuf;
+ prev_mb = NULL;
+ first_mb->m_pkthdr.len = length;
+ flags = M_PKTHDR;
+ align_len = MLX4_NET_IP_ALIGN;
+
+ /* Collect used fragments while replacing them in the HW descriptors */
+ for (nr = 0; nr < priv->num_frags; nr++) {
+ frag_info = &priv->frag_info[nr];
+
+ mb = mb_list[nr].mbuf;
+
+ /* Allocate a replacement page */
+ if (mlx4_en_alloc_buf(priv, ring, &rx_desc->data[nr].addr,
+ &mb_list[nr], flags, priv->frag_info[nr].frag_size))
+ goto fail;
+
+ if (prev_mb != NULL)
+ prev_mb->m_next = mb;
+ mb_len = frag_info->frag_size - align_len;
+ prev_mb = mb;
+
+ if (length <= mb_len)
+ break;
+
+ mb->m_len = mb_len;
+ length -= mb_len;
+ flags = 0;
+ align_len = 0;
+ }
+ /* Adjust size of last fragment to match actual length */
+ prev_mb->m_len = min(length, prev_mb->m_len);
+ prev_mb->m_next = NULL;
+ return (0);
+
+fail:
+ /*
+ * At this point the fragments have been partially extracted and
+ * replaced. Free the mbufs that are no longer referenced by the ring.
+ */
+ if (first_mb != mb_list[0].mbuf)
+ m_freem(first_mb);
+ return (-ENOMEM);
+}
+
static struct mbuf *
mlx4_en_rx_mb(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
struct mlx4_en_rx_desc *rx_desc, struct mlx4_en_rx_mbuf *mb_list,
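
The m_len accounting in mlx4_en_complete_rx_desc() above trims the 2-byte IP-alignment pad from the first fragment only and clips the last fragment to the remaining packet bytes. A standalone sketch of that accounting for a 9000-byte packet, assuming the {2048, 4096, 4096} fragment layout from earlier in the diff:

/*
 * Standalone sketch: per-fragment m_len for a packet of "length"
 * bytes.  Expected output: 2046, 4096, then 2858 for the last
 * fragment.  Fragment sizes and the 2-byte pad are assumptions.
 */
#include <stdio.h>

int main(void)
{
	int frag_size[] = { 2048, 4096, 4096 };
	int length = 9000;
	int align = 2;	/* assumed MLX4_NET_IP_ALIGN, first fragment only */

	for (int nr = 0; nr < 3; nr++) {
		int mb_len = frag_size[nr] - align;

		align = 0;
		if (length <= mb_len) {
			printf("frag %d: m_len=%d (last)\n", nr, length);
			break;
		}
		printf("frag %d: m_len=%d\n", nr, mb_len);
		length -= mb_len;
	}
	return (0);
}
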
@@ -539,19 +630,12 @@
{
struct mbuf *mb;
- /* get mbuf */
mb = mb_list->mbuf;
- /* collect used fragment while atomically replacing it */
- if (mlx4_en_alloc_buf(ring, &rx_desc->data[0].addr, mb_list))
- return (NULL);
-
- /* range check hardware computed value */
- if (unlikely(length > mb->m_len))
- length = mb->m_len;
+ /* Move relevant fragments to mb */
+ if (unlikely(mlx4_en_complete_rx_desc(priv, ring, rx_desc, mb_list, length)))
+ return (NULL);
- /* update total packet length in packet header */
- mb->m_len = mb->m_pkthdr.len = length;
return (mb);
}
@@ -591,7 +675,7 @@
/* Process all completed CQEs */
while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
cons_index & size)) {
- mb_list = ring->mbuf + index;
+ mb_list = ring->mbuf + (index << priv->log_mbuf);
rx_desc = (struct mlx4_en_rx_desc *)
(ring->buf + (index << ring->log_stride));
