Index: head/sys/modules/mlxen/Makefile
===================================================================
--- head/sys/modules/mlxen/Makefile
+++ head/sys/modules/mlxen/Makefile
@@ -11,5 +11,3 @@
 CFLAGS+= -I${.CURDIR}/../../compat/linuxkpi/common/include
 
 .include <bsd.kmod.mk>
-
-CFLAGS+= -Wno-cast-qual -Wno-pointer-arith
Index: head/sys/ofed/drivers/net/mlx4/en_netdev.c
===================================================================
--- head/sys/ofed/drivers/net/mlx4/en_netdev.c
+++ head/sys/ofed/drivers/net/mlx4/en_netdev.c
@@ -1247,7 +1247,6 @@
 	    PAGE_SIZE);
 	priv->rx_alloc_order = get_order(priv->rx_alloc_size);
 	priv->rx_buf_size = roundup_pow_of_two(priv->rx_mb_size);
-	priv->log_rx_info = ROUNDUP_LOG2(sizeof(struct mlx4_en_rx_buf));
 	en_dbg(DRV, priv, "Rx buf size:%d\n", priv->rx_mb_size);
 
 	/* Configure rx cq's and rings */
@@ -2091,8 +2090,6 @@
 	priv->port = port;
 	priv->port_up = false;
 	priv->flags = prof->flags;
-	priv->ctrl_flags = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
-	    MLX4_WQE_CTRL_SOLICITED);
 	priv->num_tx_rings_p_up = mdev->profile.num_tx_rings_p_up;
 	priv->tx_ring_num = prof->tx_ring_num;
 
@@ -2108,7 +2105,7 @@
 		err = -ENOMEM;
 		goto out;
 	}
-
+
 	priv->rx_ring_num = prof->rx_ring_num;
 	priv->cqe_factor = (mdev->dev->caps.cqe_size == 64) ? 1 : 0;
 	priv->mac_index = -1;
Index: head/sys/ofed/drivers/net/mlx4/en_rx.c
===================================================================
--- head/sys/ofed/drivers/net/mlx4/en_rx.c
+++ head/sys/ofed/drivers/net/mlx4/en_rx.c
@@ -54,104 +54,133 @@
 	int possible_frags;
 	int i;
 
-	/* Set size and memtype fields */
-	for (i = 0; i < priv->num_frags; i++) {
-		rx_desc->data[i].byte_count =
-			cpu_to_be32(priv->frag_info[i].frag_size);
-		rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
-	}
+	rx_desc->data[0].byte_count = cpu_to_be32(priv->rx_mb_size);
+	rx_desc->data[0].lkey = cpu_to_be32(priv->mdev->mr.key);
 
-	/* If the number of used fragments does not fill up the ring stride,
-	 * * remaining (unused) fragments must be padded with null address/size
-	 * * and a special memory key */
+	/*
+	 * If the number of used fragments does not fill up the ring
+	 * stride, remaining (unused) fragments must be padded with
+	 * null address/size and a special memory key:
+	 */
 	possible_frags = (ring->stride - sizeof(struct mlx4_en_rx_desc)) / DS_SIZE;
-	for (i = priv->num_frags; i < possible_frags; i++) {
+	for (i = 1; i < possible_frags; i++) {
 		rx_desc->data[i].byte_count = 0;
 		rx_desc->data[i].lkey = cpu_to_be32(MLX4_EN_MEMTYPE_PAD);
 		rx_desc->data[i].addr = 0;
 	}
-
 }
 
-static int mlx4_en_alloc_buf(struct mlx4_en_priv *priv,
-			     struct mlx4_en_rx_desc *rx_desc,
-			     struct mbuf **mb_list,
-			     int i)
+static int
+mlx4_en_alloc_buf(struct mlx4_en_rx_ring *ring,
+    __be64 *pdma, struct mlx4_en_rx_mbuf *mb_list)
 {
-	struct mlx4_en_dev *mdev = priv->mdev;
-	struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
+	bus_dma_segment_t segs[1];
+	bus_dmamap_t map;
 	struct mbuf *mb;
-	dma_addr_t dma;
+	int nsegs;
+	int err;
 
-	if (i == 0)
-		mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, frag_info->frag_size);
-	else
-		mb = m_getjcl(M_NOWAIT, MT_DATA, 0, frag_info->frag_size);
-	if (mb == NULL) {
-		priv->port_stats.rx_alloc_failed++;
-		return -ENOMEM;
+	/* try to allocate a new spare mbuf */
+	if (unlikely(ring->spare.mbuf == NULL)) {
+		mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, ring->rx_mb_size);
+		if (unlikely(mb == NULL))
+			return (-ENOMEM);
+		/* setup correct length */
+		mb->m_len = ring->rx_mb_size;
+
+		/* load spare mbuf into BUSDMA */
+		err = -bus_dmamap_load_mbuf_sg(ring->dma_tag, ring->spare.dma_map,
+		    mb, segs, &nsegs, BUS_DMA_NOWAIT);
+		if (unlikely(err != 0)) {
+			m_freem(mb);
+			return (err);
+		}
+		KASSERT(nsegs == 1,
+		    ("Number of segments is expected to be one"));
+
+		/* store spare info */
+		ring->spare.mbuf = mb;
+		ring->spare.paddr_be = cpu_to_be64(segs[0].ds_addr);
+
+		bus_dmamap_sync(ring->dma_tag, ring->spare.dma_map,
+		    BUS_DMASYNC_PREREAD);
 	}
-	dma = pci_map_single(mdev->pdev, mb->m_data, frag_info->frag_size,
-			     PCI_DMA_FROMDEVICE);
-	rx_desc->data[i].addr = cpu_to_be64(dma);
-	mb_list[i] = mb;
-	return 0;
-}
 
+	/* synchronize and unload the current mbuf, if any */
+	if (likely(mb_list->mbuf != NULL)) {
+		bus_dmamap_sync(ring->dma_tag, mb_list->dma_map,
+		    BUS_DMASYNC_POSTREAD);
+		bus_dmamap_unload(ring->dma_tag, mb_list->dma_map);
+	}
 
-static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
-				   struct mlx4_en_rx_ring *ring, int index)
-{
-	struct mlx4_en_rx_desc *rx_desc = (struct mlx4_en_rx_desc *)
-	    (ring->buf + (index * ring->stride));
-	struct mbuf **mb_list = ring->rx_info + (index << priv->log_rx_info);
-	int i;
+	mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, ring->rx_mb_size);
+	if (unlikely(mb == NULL))
+		goto use_spare;
 
-	for (i = 0; i < priv->num_frags; i++)
-		if (mlx4_en_alloc_buf(priv, rx_desc, mb_list, i))
-			goto err;
+	/* setup correct length */
+	mb->m_len = ring->rx_mb_size;
+
+	err = -bus_dmamap_load_mbuf_sg(ring->dma_tag, mb_list->dma_map,
+	    mb, segs, &nsegs, BUS_DMA_NOWAIT);
+	if (unlikely(err != 0)) {
+		m_freem(mb);
+		goto use_spare;
+	}
+	KASSERT(nsegs == 1, ("Number of segments is expected to be one"));
 
-	return 0;
+	*pdma = cpu_to_be64(segs[0].ds_addr);
+	mb_list->mbuf = mb;
 
-err:
-	while (i--)
-		m_free(mb_list[i]);
-	return -ENOMEM;
+	bus_dmamap_sync(ring->dma_tag, mb_list->dma_map, BUS_DMASYNC_PREREAD);
+	return (0);
+
+use_spare:
+	/* swap DMA maps */
+	map = mb_list->dma_map;
+	mb_list->dma_map = ring->spare.dma_map;
+	ring->spare.dma_map = map;
+
+	/* swap MBUFs */
+	mb_list->mbuf = ring->spare.mbuf;
+	ring->spare.mbuf = NULL;
+
+	/* store physical address */
+	*pdma = ring->spare.paddr_be;
+	return (0);
 }
 
-static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
+static void
+mlx4_en_free_buf(struct mlx4_en_rx_ring *ring, struct mlx4_en_rx_mbuf *mb_list)
 {
-	*ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
+	bus_dmamap_t map = mb_list->dma_map;
+	bus_dmamap_sync(ring->dma_tag, map, BUS_DMASYNC_POSTREAD);
+	bus_dmamap_unload(ring->dma_tag, map);
+	m_freem(mb_list->mbuf);
+	mb_list->mbuf = NULL;	/* safety clearing */
 }
 
-static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
-				 struct mlx4_en_rx_ring *ring,
-				 int index)
+static int
+mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
+    struct mlx4_en_rx_ring *ring, int index)
 {
-	struct mlx4_en_frag_info *frag_info;
-	struct mlx4_en_dev *mdev = priv->mdev;
-	struct mbuf **mb_list;
 	struct mlx4_en_rx_desc *rx_desc = (struct mlx4_en_rx_desc *)
-	    (ring->buf + (index << ring->log_stride));
-	dma_addr_t dma;
-	int nr;
-
-	mb_list = ring->rx_info + (index << priv->log_rx_info);
-	for (nr = 0; nr < priv->num_frags; nr++) {
-		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
-		frag_info = &priv->frag_info[nr];
-		dma = be64_to_cpu(rx_desc->data[nr].addr);
-
-#if BITS_PER_LONG == 64
-		en_dbg(DRV, priv, "Unmaping buffer at dma:0x%lx\n", (u64) dma);
-#elif BITS_PER_LONG == 32
-		en_dbg(DRV, priv, "Unmaping buffer at dma:0x%llx\n", (u64) dma);
-#endif
-		pci_unmap_single(mdev->pdev, dma, frag_info->frag_size,
-				 PCI_DMA_FROMDEVICE);
-		m_free(mb_list[nr]);
+	    (ring->buf + (index * ring->stride));
+	struct mlx4_en_rx_mbuf *mb_list = ring->mbuf + index;
+
+	mb_list->mbuf = NULL;
+
+	if (mlx4_en_alloc_buf(ring, &rx_desc->data[0].addr, mb_list)) {
+		priv->port_stats.rx_alloc_failed++;
+		return (-ENOMEM);
 	}
+	return (0);
+}
+
+static inline void
+mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
+{
+	*ring->wqres.db.db = cpu_to_be32(ring->prod & 0xffff);
 }
 
 static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
@@ -194,7 +223,8 @@
 		while (ring->actual_size > new_size) {
 			ring->actual_size--;
 			ring->prod--;
-			mlx4_en_free_rx_desc(priv, ring, ring->actual_size);
+			mlx4_en_free_buf(ring,
+			    ring->mbuf + ring->actual_size);
 		}
 	}
 
@@ -214,100 +244,106 @@
 	while (ring->cons != ring->prod) {
 		index = ring->cons & ring->size_mask;
 		en_dbg(DRV, priv, "Processing descriptor:%d\n", index);
-		mlx4_en_free_rx_desc(priv, ring, index);
+		mlx4_en_free_buf(ring, ring->mbuf + index);
 		++ring->cons;
 	}
 }
 
-#if MLX4_EN_MAX_RX_FRAGS == 3
-static int frag_sizes[] = {
-	FRAG_SZ0,
-	FRAG_SZ1,
-	FRAG_SZ2,
-};
-#elif MLX4_EN_MAX_RX_FRAGS == 2
-static int frag_sizes[] = {
-	FRAG_SZ0,
-	FRAG_SZ1,
-};
-#else
-#error "Unknown MAX_RX_FRAGS"
-#endif
-
 void mlx4_en_calc_rx_buf(struct net_device *dev)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	int eff_mtu = dev->if_mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN;
-	int buf_size = 0;
-	int i, frag;
 
-	for (i = 0, frag = 0; buf_size < eff_mtu; frag++, i++) {
-		/*
-		 * Allocate small to large but only as much as is needed for
-		 * the tail.
-		 */
-		while (i > 0 && eff_mtu - buf_size <= frag_sizes[i - 1])
-			i--;
-		priv->frag_info[frag].frag_size = frag_sizes[i];
-		priv->frag_info[frag].frag_prefix_size = buf_size;
-		buf_size += priv->frag_info[frag].frag_size;
-	}
+	if (eff_mtu > MJUM16BYTES) {
+		en_err(priv, "MTU(%d) is too big\n", dev->if_mtu);
+		eff_mtu = MJUM16BYTES;
+	} else if (eff_mtu > MJUM9BYTES) {
+		eff_mtu = MJUM16BYTES;
+	} else if (eff_mtu > MJUMPAGESIZE) {
+		eff_mtu = MJUM9BYTES;
+	} else if (eff_mtu > MCLBYTES) {
+		eff_mtu = MJUMPAGESIZE;
+	} else {
+		eff_mtu = MCLBYTES;
+	}
 
-	priv->num_frags = frag;
 	priv->rx_mb_size = eff_mtu;
-	priv->log_rx_info =
-	    ROUNDUP_LOG2(priv->num_frags * sizeof(struct mbuf *));
 
-	en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d "
-	    "num_frags:%d):\n", eff_mtu, priv->num_frags);
-	for (i = 0; i < priv->num_frags; i++) {
-		en_dbg(DRV, priv, "  frag:%d - size:%d prefix:%d\n", i,
-		    priv->frag_info[i].frag_size,
-		    priv->frag_info[i].frag_prefix_size);
-	}
+	en_dbg(DRV, priv, "Effective RX MTU: %d bytes\n", eff_mtu);
 }
 
-
 int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 			   struct mlx4_en_rx_ring **pring,
 			   u32 size, int node)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_rx_ring *ring;
-	int err = -ENOMEM;
+	int err;
 	int tmp;
+	uint32_t x;
 
 	ring = kzalloc(sizeof(struct mlx4_en_rx_ring), GFP_KERNEL);
 	if (!ring) {
		en_err(priv, "Failed to allocate RX ring structure\n");
 		return -ENOMEM;
 	}
-
+
+	/* Create DMA descriptor TAG */
+	if ((err = -bus_dma_tag_create(
+	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
+	    1,				/* any alignment */
+	    0,				/* no boundary */
+	    BUS_SPACE_MAXADDR,		/* lowaddr */
+	    BUS_SPACE_MAXADDR,		/* highaddr */
+	    NULL, NULL,			/* filter, filterarg */
+	    MJUM16BYTES,		/* maxsize */
+	    1,				/* nsegments */
+	    MJUM16BYTES,		/* maxsegsize */
+	    0,				/* flags */
+	    NULL, NULL,			/* lockfunc, lockfuncarg */
+	    &ring->dma_tag))) {
+		en_err(priv, "Failed to create DMA tag\n");
+		goto err_ring;
+	}
+
 	ring->prod = 0;
 	ring->cons = 0;
 	ring->size = size;
 	ring->size_mask = size - 1;
-	ring->stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
-					  DS_SIZE * MLX4_EN_MAX_RX_FRAGS);
+	ring->stride = roundup_pow_of_two(
+	    sizeof(struct mlx4_en_rx_desc) + DS_SIZE);
 	ring->log_stride = ffs(ring->stride) - 1;
 	ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
 
-	tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
-					sizeof(struct mbuf *));
+	tmp = size * sizeof(struct mlx4_en_rx_mbuf);
 
-	ring->rx_info = kmalloc(tmp, GFP_KERNEL);
-	if (!ring->rx_info) {
+	ring->mbuf = kzalloc(tmp, GFP_KERNEL);
+	if (ring->mbuf == NULL) {
 		err = -ENOMEM;
-		goto err_ring;
+		goto err_dma_tag;
 	}
 
-	en_dbg(DRV, priv, "Allocated rx_info ring at addr:%p size:%d\n",
-	    ring->rx_info, tmp);
+	err = -bus_dmamap_create(ring->dma_tag, 0, &ring->spare.dma_map);
+	if (err != 0)
+		goto err_info;
+
+	for (x = 0; x != size; x++) {
+		err = -bus_dmamap_create(ring->dma_tag, 0,
+		    &ring->mbuf[x].dma_map);
+		if (err != 0) {
+			while (x--)
+				bus_dmamap_destroy(ring->dma_tag,
+				    ring->mbuf[x].dma_map);
+			goto err_info;
+		}
+	}
+	en_dbg(DRV, priv, "Allocated MBUF ring at addr:%p size:%d\n",
+	    ring->mbuf, tmp);
 
 	err = mlx4_alloc_hwq_res(mdev->dev, &ring->wqres,
 				 ring->buf_size, 2 * PAGE_SIZE);
 	if (err)
-		goto err_info;
+		goto err_dma_map;
 
 	err = mlx4_en_map_buffer(&ring->wqres.buf);
 	if (err) {
@@ -320,23 +356,29 @@
 
 err_hwq:
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
+err_dma_map:
+	for (x = 0; x != size; x++) {
+		bus_dmamap_destroy(ring->dma_tag,
+		    ring->mbuf[x].dma_map);
+	}
+	bus_dmamap_destroy(ring->dma_tag, ring->spare.dma_map);
 err_info:
-	vfree(ring->rx_info);
+	vfree(ring->mbuf);
+err_dma_tag:
+	bus_dma_tag_destroy(ring->dma_tag);
 err_ring:
 	kfree(ring);
-
-	return err;
+	return (err);
 }
 
-
 int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
 {
 	struct mlx4_en_rx_ring *ring;
 	int i;
 	int ring_ind;
 	int err;
-	int stride = roundup_pow_of_two(sizeof(struct mlx4_en_rx_desc) +
-					DS_SIZE * priv->num_frags);
+	int stride = roundup_pow_of_two(
+	    sizeof(struct mlx4_en_rx_desc) + DS_SIZE);
 
 	for (ring_ind = 0; ring_ind < priv->rx_ring_num; ring_ind++) {
 		ring = priv->rx_ring[ring_ind];
@@ -412,10 +454,22 @@
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_rx_ring *ring = *pring;
+	uint32_t x;
 
 	mlx4_en_unmap_buffer(&ring->wqres.buf);
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, size * stride + TXBB_SIZE);
-	vfree(ring->rx_info);
+	for (x = 0; x != size; x++)
+		bus_dmamap_destroy(ring->dma_tag, ring->mbuf[x].dma_map);
+	/* free spare mbuf, if any */
+	if (ring->spare.mbuf != NULL) {
+		bus_dmamap_sync(ring->dma_tag, ring->spare.dma_map,
+		    BUS_DMASYNC_POSTREAD);
+		bus_dmamap_unload(ring->dma_tag, ring->spare.dma_map);
+		m_freem(ring->spare.mbuf);
+	}
+	bus_dmamap_destroy(ring->dma_tag, ring->spare.dma_map);
+	vfree(ring->mbuf);
+	bus_dma_tag_destroy(ring->dma_tag);
 	kfree(ring);
 	*pring = NULL;
 #ifdef CONFIG_RFS_ACCEL
@@ -423,7 +477,6 @@
 #endif
 }
 
-
 void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
 				struct mlx4_en_rx_ring *ring)
 {
@@ -472,69 +525,27 @@
 	return 0;
 }
 
-
-/* Unmap a completed descriptor and free unused pages */
-static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
-				    struct mlx4_en_rx_desc *rx_desc,
-				    struct mbuf **mb_list,
-				    int length)
-{
-	struct mlx4_en_dev *mdev = priv->mdev;
-	struct mlx4_en_frag_info *frag_info;
-	dma_addr_t dma;
-	struct mbuf *mb;
-	int nr;
-
-	mb = mb_list[0];
-	mb->m_pkthdr.len = length;
-	/* Collect used fragments while replacing them in the HW descirptors */
-	for (nr = 0; nr < priv->num_frags; nr++) {
-		frag_info = &priv->frag_info[nr];
-		if (length <= frag_info->frag_prefix_size)
-			break;
-		if (nr)
-			mb->m_next = mb_list[nr];
-		mb = mb_list[nr];
-		mb->m_len = frag_info->frag_size;
-		dma = be64_to_cpu(rx_desc->data[nr].addr);
-
-		/* Allocate a replacement page */
-		if (mlx4_en_alloc_buf(priv, rx_desc, mb_list, nr))
-			goto fail;
-
-		/* Unmap buffer */
-		pci_unmap_single(mdev->pdev, dma, frag_info->frag_size,
-				 PCI_DMA_FROMDEVICE);
-	}
-	/* Adjust size of last fragment to match actual length */
-	mb->m_len = length - priv->frag_info[nr - 1].frag_prefix_size;
-	mb->m_next = NULL;
-	return 0;
-
-fail:
-	/* Drop all accumulated fragments (which have already been replaced in
-	 * the descriptor) of this packet; remaining fragments are reused... */
-	while (nr > 0) {
-		nr--;
-		m_free(mb_list[nr]);
-	}
-	return -ENOMEM;
-
-}
-
-static struct mbuf *mlx4_en_rx_mb(struct mlx4_en_priv *priv,
-				  struct mlx4_en_rx_desc *rx_desc,
-				  struct mbuf **mb_list,
-				  unsigned int length)
+static struct mbuf *
+mlx4_en_rx_mb(struct mlx4_en_priv *priv, struct mlx4_en_rx_ring *ring,
+    struct mlx4_en_rx_desc *rx_desc, struct mlx4_en_rx_mbuf *mb_list,
+    int length)
 {
 	struct mbuf *mb;
 
-	mb = mb_list[0];
-	/* Move relevant fragments to mb */
-	if (unlikely(mlx4_en_complete_rx_desc(priv, rx_desc, mb_list, length)))
-		return NULL;
+	/* get mbuf */
+	mb = mb_list->mbuf;
 
-	return mb;
+	/* collect used fragment while atomically replacing it */
+	if (mlx4_en_alloc_buf(ring, &rx_desc->data[0].addr, mb_list))
+		return (NULL);
+
+	/* range check hardware computed value */
+	if (unlikely(length > mb->m_len))
+		length = mb->m_len;
+
+	/* update total packet length in packet header */
+	mb->m_len = mb->m_pkthdr.len = length;
+	return (mb);
 }
 
 /* For cpu arch with cache line of 64B the performance is better when cqe size==64B
@@ -548,7 +559,7 @@
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_cqe *cqe;
 	struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring];
-	struct mbuf **mb_list;
+	struct mlx4_en_rx_mbuf *mb_list;
 	struct mlx4_en_rx_desc *rx_desc;
 	struct mbuf *mb;
 	struct mlx4_cq *mcq = &cq->mcq;
@@ -576,7 +587,7 @@
 	/* Process all completed CQEs */
 	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
 		    cons_index & size)) {
-		mb_list = ring->rx_info + (index << priv->log_rx_info);
+		mb_list = ring->mbuf + index;
 		rx_desc = (struct mlx4_en_rx_desc *)
 		    (ring->buf + (index << ring->log_stride));
 
@@ -593,8 +604,9 @@
 		 */
 		length = be32_to_cpu(cqe->byte_cnt);
 		length -= ring->fcs_del;
-		mb = mlx4_en_rx_mb(priv, rx_desc, mb_list, length);
-		if (!mb) {
+
+		mb = mlx4_en_rx_mb(priv, ring, rx_desc, mb_list, length);
+		if (unlikely(!mb)) {
 			ring->errors++;
 			goto next;
 		}
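The rewritten mlx4_en_calc_rx_buf() above drops the old fragment scatter list and simply rounds the effective MTU up to the smallest standard mbuf cluster that can hold a whole frame. A minimal stand-alone sketch of just that selection step follows; the cluster constants are given their typical amd64 values here as an assumption, while the kernel takes them from sys/param.h.

#include <stdio.h>

/* Typical amd64 values; the kernel uses the definitions from <sys/param.h>. */
#define MCLBYTES	2048
#define MJUMPAGESIZE	4096		/* usually PAGE_SIZE */
#define MJUM9BYTES	(9 * 1024)
#define MJUM16BYTES	(16 * 1024)

/* Mirror of the cluster-size selection in the new mlx4_en_calc_rx_buf(). */
static int
rx_cluster_size(int eff_mtu)
{
	if (eff_mtu > MJUM16BYTES)
		return (MJUM16BYTES);	/* the driver also warns: MTU too big */
	else if (eff_mtu > MJUM9BYTES)
		return (MJUM16BYTES);
	else if (eff_mtu > MJUMPAGESIZE)
		return (MJUM9BYTES);
	else if (eff_mtu > MCLBYTES)
		return (MJUMPAGESIZE);
	else
		return (MCLBYTES);
}

int
main(void)
{
	int mtus[] = { 1500, 4000, 9000, 16000 };

	for (unsigned i = 0; i < sizeof(mtus) / sizeof(mtus[0]); i++) {
		/* effective MTU = MTU + Ethernet + VLAN + FCS overhead (22 bytes) */
		int eff = mtus[i] + 22;
		printf("MTU %5d -> eff %5d -> cluster %5d\n",
		    mtus[i], eff, rx_cluster_size(eff));
	}
	return (0);
}

One consequence of this design, visible in the en_rx.c hunks above, is that every receive descriptor now carries exactly one data segment, which is what lets the BUSDMA conversion use a single map per ring slot.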
Index: head/sys/ofed/drivers/net/mlx4/en_tx.c
===================================================================
--- head/sys/ofed/drivers/net/mlx4/en_tx.c
+++ head/sys/ofed/drivers/net/mlx4/en_tx.c
@@ -67,6 +67,7 @@
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_tx_ring *ring;
+	uint32_t x;
 	int tmp;
 	int err;
 
@@ -79,11 +80,26 @@
 		}
 	}
 
+	/* Create DMA descriptor TAG */
+	if ((err = -bus_dma_tag_create(
+	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
+	    1,					/* any alignment */
+	    0,					/* no boundary */
+	    BUS_SPACE_MAXADDR,			/* lowaddr */
+	    BUS_SPACE_MAXADDR,			/* highaddr */
+	    NULL, NULL,				/* filter, filterarg */
+	    MLX4_EN_TX_MAX_PAYLOAD_SIZE,	/* maxsize */
+	    MLX4_EN_TX_MAX_MBUF_FRAGS,		/* nsegments */
+	    MLX4_EN_TX_MAX_MBUF_SIZE,		/* maxsegsize */
+	    0,					/* flags */
+	    NULL, NULL,				/* lockfunc, lockfuncarg */
+	    &ring->dma_tag)))
+		goto done;
+
 	ring->size = size;
 	ring->size_mask = size - 1;
 	ring->stride = stride;
-	ring->full_size = ring->size - HEADROOM - MAX_DESC_TXBBS;
-	ring->inline_thold = min(inline_thold, MAX_INLINE);
+	ring->inline_thold = MAX(MIN_PKT_LEN, MIN(inline_thold, MAX_INLINE));
 	mtx_init(&ring->tx_lock.m, "mlx4 tx", NULL, MTX_DEF);
 	mtx_init(&ring->comp_lock.m, "mlx4 comp", NULL, MTX_DEF);
 
@@ -92,30 +108,36 @@
 	    M_WAITOK, &ring->tx_lock.m);
 	if (ring->br == NULL) {
 		en_err(priv, "Failed allocating tx_info ring\n");
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto err_free_dma_tag;
 	}
 
 	tmp = size * sizeof(struct mlx4_en_tx_info);
-	ring->tx_info = vmalloc_node(tmp, node);
+	ring->tx_info = kzalloc_node(tmp, GFP_KERNEL, node);
 	if (!ring->tx_info) {
-		ring->tx_info = vmalloc(tmp);
+		ring->tx_info = kzalloc(tmp, GFP_KERNEL);
 		if (!ring->tx_info) {
 			err = -ENOMEM;
 			goto err_ring;
 		}
 	}
 
-	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
-	    ring->tx_info, tmp);
-
-	ring->bounce_buf = kmalloc_node(MAX_DESC_SIZE, GFP_KERNEL, node);
-	if (!ring->bounce_buf) {
-		ring->bounce_buf = kmalloc(MAX_DESC_SIZE, GFP_KERNEL);
-		if (!ring->bounce_buf) {
-			err = -ENOMEM;
+	/* Create DMA descriptor MAPs */
+	for (x = 0; x != size; x++) {
+		err = -bus_dmamap_create(ring->dma_tag, 0,
+		    &ring->tx_info[x].dma_map);
+		if (err != 0) {
+			while (x--) {
+				bus_dmamap_destroy(ring->dma_tag,
+				    ring->tx_info[x].dma_map);
+			}
 			goto err_info;
 		}
 	}
+
+	en_dbg(DRV, priv, "Allocated tx_info ring at addr:%p size:%d\n",
+	    ring->tx_info, tmp);
+
 	ring->buf_size = ALIGN(size * ring->stride, MLX4_EN_PAGE_SIZE);
 
 	/* Allocate HW buffers on provided NUMA node */
@@ -123,7 +145,7 @@
 				 2 * PAGE_SIZE);
 	if (err) {
 		en_err(priv, "Failed allocating hwq resources\n");
-		goto err_bounce;
+		goto err_dma_map;
 	}
 
 	err = mlx4_en_map_buffer(&ring->wqres.buf);
@@ -173,12 +195,16 @@
 	mlx4_en_unmap_buffer(&ring->wqres.buf);
 err_hwq_res:
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
-err_bounce:
-	kfree(ring->bounce_buf);
+err_dma_map:
+	for (x = 0; x != size; x++)
+		bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map);
 err_info:
 	vfree(ring->tx_info);
 err_ring:
 	buf_ring_free(ring->br, M_DEVBUF);
+err_free_dma_tag:
+	bus_dma_tag_destroy(ring->dma_tag);
+done:
 	kfree(ring);
 	return err;
 }
@@ -188,6 +214,7 @@
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_tx_ring *ring = *pring;
+	uint32_t x;
 
 	en_dbg(DRV, priv, "Destroying tx ring, qpn: %d\n", ring->qpn);
 	buf_ring_free(ring->br, M_DEVBUF);
@@ -198,10 +225,12 @@
 		mlx4_qp_release_range(priv->mdev->dev, ring->qpn, 1);
 	mlx4_en_unmap_buffer(&ring->wqres.buf);
 	mlx4_free_hwq_res(mdev->dev, &ring->wqres, ring->buf_size);
-	kfree(ring->bounce_buf);
+	for (x = 0; x != ring->size; x++)
+		bus_dmamap_destroy(ring->dma_tag, ring->tx_info[x].dma_map);
 	vfree(ring->tx_info);
 	mtx_destroy(&ring->tx_lock.m);
 	mtx_destroy(&ring->comp_lock.m);
+	bus_dma_tag_destroy(ring->dma_tag);
 	kfree(ring);
 	*pring = NULL;
 }
@@ -219,7 +248,6 @@
 	ring->last_nr_txbb = 1;
 	ring->poll_cnt = 0;
 	ring->blocked = 0;
-	memset(ring->tx_info, 0, ring->size * sizeof(struct mlx4_en_tx_info));
 	memset(ring->buf, 0, ring->buf_size);
 	ring->qp_state = MLX4_QP_STATE_RST;
 
@@ -244,99 +272,63 @@
 			       MLX4_QP_STATE_RST, NULL, 0, 0, &ring->qp);
 }
 
-static void mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
-			      struct mlx4_en_tx_ring *ring,
-			      int index, u8 owner)
+static volatile struct mlx4_wqe_data_seg *
+mlx4_en_store_inline_lso_data(volatile struct mlx4_wqe_data_seg *dseg,
+    struct mbuf *mb, int len, __be32 owner_bit)
 {
-	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
-	struct mlx4_en_tx_desc *tx_desc = (struct mlx4_en_tx_desc *)
-	    (ring->buf + index * TXBB_SIZE);
-	void *end = ring->buf + ring->buf_size;
-	__be32 *ptr = (__be32 *)tx_desc;
-	__be32 stamp = cpu_to_be32(STAMP_VAL | (!!owner << STAMP_SHIFT));
-	int i;
-
-	/* Optimize the common case when there are no wraparounds */
-	if (likely((void *)tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end))
-		/* Stamp the freed descriptor */
-		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
-			*ptr = stamp;
-			ptr += STAMP_DWORDS;
-		}
-	else
-		/* Stamp the freed descriptor */
-		for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
-			*ptr = stamp;
-			ptr += STAMP_DWORDS;
-			if ((void *)ptr >= end) {
-				ptr = (__be32 *)ring->buf;
-				stamp ^= cpu_to_be32(0x80000000);
-			}
-		}
+	uint8_t *inl = __DEVOLATILE(uint8_t *, dseg);
+
+	/* copy data into place */
+	m_copydata(mb, 0, len, inl + 4);
+	dseg += DIV_ROUND_UP(4 + len, DS_SIZE_ALIGNMENT);
+	return (dseg);
 }
 
-static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
-				struct mlx4_en_tx_ring *ring,
-				int index, u8 owner, u64 timestamp)
+static void
+mlx4_en_store_inline_lso_header(volatile struct mlx4_wqe_data_seg *dseg,
+    int len, __be32 owner_bit)
+{
+}
+
+static void
+mlx4_en_stamp_wqe(struct mlx4_en_priv *priv,
+    struct mlx4_en_tx_ring *ring, u32 index, u8 owner)
 {
-	struct mlx4_en_dev *mdev = priv->mdev;
 	struct mlx4_en_tx_info *tx_info = &ring->tx_info[index];
 	struct mlx4_en_tx_desc *tx_desc = (struct mlx4_en_tx_desc *)
-	    (ring->buf + index * TXBB_SIZE);
-	struct mlx4_wqe_data_seg *data = (void *) tx_desc + tx_info->data_offset;
-	struct mbuf *mb = tx_info->mb;
-	void *end = ring->buf + ring->buf_size;
-	int frags = tx_info->nr_segs;;
-	int i;
-
-	/* Optimize the common case when there are no wraparounds */
-	if (likely((void *) tx_desc + tx_info->nr_txbb * TXBB_SIZE <= end)) {
-		if (!tx_info->inl) {
-			if (tx_info->linear) {
-				dma_unmap_single(priv->ddev,
-				    (dma_addr_t) be64_to_cpu(data->addr),
-				    be32_to_cpu(data->byte_count),
-				    PCI_DMA_TODEVICE);
-				++data;
-			}
+	    (ring->buf + (index * TXBB_SIZE));
+	volatile __be32 *ptr = (__be32 *)tx_desc;
+	const __be32 stamp = cpu_to_be32(STAMP_VAL |
+	    ((u32)owner << STAMP_SHIFT));
+	u32 i;
+
+	/* Stamp the freed descriptor */
+	for (i = 0; i < tx_info->nr_txbb * TXBB_SIZE; i += STAMP_STRIDE) {
+		*ptr = stamp;
+		ptr += STAMP_DWORDS;
+	}
+}
 
-			for (i = 0; i < frags; i++) {
-				pci_unmap_single(mdev->pdev,
-				    (dma_addr_t) be64_to_cpu(data[i].addr),
-				    data[i].byte_count, PCI_DMA_TODEVICE);
-			}
-		}
-	} else {
-		if (!tx_info->inl) {
-			if ((void *) data >= end) {
-				data = (struct mlx4_wqe_data_seg *)
-				    (ring->buf + ((void *)data - end));
-			}
+static u32
+mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
+    struct mlx4_en_tx_ring *ring, u32 index)
+{
+	struct mlx4_en_tx_info *tx_info;
+	struct mbuf *mb;
 
-			if (tx_info->linear) {
-				dma_unmap_single(priv->ddev,
-				    (dma_addr_t) be64_to_cpu(data->addr),
-				    be32_to_cpu(data->byte_count),
-				    PCI_DMA_TODEVICE);
-				++data;
-			}
+	tx_info = &ring->tx_info[index];
+	mb = tx_info->mb;
+
+	if (mb == NULL)
+		goto done;
+
+	bus_dmamap_sync(ring->dma_tag, tx_info->dma_map,
+	    BUS_DMASYNC_POSTWRITE);
+	bus_dmamap_unload(ring->dma_tag, tx_info->dma_map);
 
-			for (i = 0; i < frags; i++) {
-				/* Check for wraparound before unmapping */
-				if ((void *) data >= end)
-					data = (struct mlx4_wqe_data_seg *)ring->buf;
-				pci_unmap_single(mdev->pdev,
-				    (dma_addr_t) be64_to_cpu(data->addr),
-				    data->byte_count, PCI_DMA_TODEVICE);
-				++data;
-			}
-		}
-	}
-	/* Send a copy of the frame to the BPF listener */
-	if (priv->dev && priv->dev->if_bpf)
-		ETHER_BPF_MTAP(priv->dev, mb);
 	m_freem(mb);
-	return tx_info->nr_txbb;
+done:
+	return (tx_info->nr_txbb);
 }
 
 int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring)
@@ -356,8 +348,7 @@
 	while (ring->cons != ring->prod) {
 		ring->last_nr_txbb = mlx4_en_free_tx_desc(priv, ring,
-		    ring->cons & ring->size_mask,
-		    !!(ring->cons & ring->size), 0);
+		    ring->cons & ring->size_mask);
 		ring->cons += ring->last_nr_txbb;
 		cnt++;
 	}
@@ -368,6 +359,14 @@
 	return cnt;
 }
 
+static bool
+mlx4_en_tx_ring_is_full(struct mlx4_en_tx_ring *ring)
+{
+	int wqs;
+	wqs = ring->size - (ring->prod - ring->cons);
+	return (wqs < (HEADROOM + (2 * MLX4_EN_TX_WQE_MAX_WQEBBS)));
+}
+
 static int mlx4_en_process_tx_cq(struct net_device *dev,
 				 struct mlx4_en_cq *cq)
 {
@@ -383,12 +382,7 @@
 	int size = cq->size;
 	u32 size_mask = ring->size_mask;
 	struct mlx4_cqe *buf = cq->buf;
-	u32 packets = 0;
-	u32 bytes = 0;
 	int factor = priv->cqe_factor;
-	u64 timestamp = 0;
-	int done = 0;
-
 	if (!priv->port_up)
 		return 0;
 
@@ -423,16 +417,12 @@
 			ring_index = (ring_index + ring->last_nr_txbb) & size_mask;
 			/* free next descriptor */
 			ring->last_nr_txbb = mlx4_en_free_tx_desc(
-			    priv, ring, ring_index,
-			    !!((ring->cons + txbbs_skipped) &
-			    ring->size), timestamp);
+			    priv, ring, ring_index);
 			mlx4_en_stamp_wqe(priv, ring, stamp_index,
 			    !!((ring->cons + txbbs_stamp) &
 			    ring->size));
 			stamp_index = ring_index;
 			txbbs_stamp = txbbs_skipped;
-			packets++;
-			bytes += ring->tx_info[ring_index].nr_bytes;
 		} while (ring_index != new_index);
 
 		++cons_index;
@@ -451,15 +441,14 @@
 	ring->cons += txbbs_skipped;
 
 	/* Wakeup Tx queue if it was stopped and ring is not full */
-	if (unlikely(ring->blocked) &&
-	    (ring->prod - ring->cons) <= ring->full_size) {
+	if (unlikely(ring->blocked) && !mlx4_en_tx_ring_is_full(ring)) {
 		ring->blocked = 0;
 		if (atomic_fetchadd_int(&priv->blocked, -1) == 1)
 			atomic_clear_int(&dev->if_drv_flags ,IFF_DRV_OACTIVE);
 		ring->wake_queue++;
 		priv->port_stats.wake_queue++;
 	}
-	return done;
+	return (0);
 }
 
 void mlx4_en_tx_irq(struct mlx4_cq *mcq)
@@ -500,34 +489,6 @@
 	spin_unlock(&ring->comp_lock);
 }
 
-static struct mlx4_en_tx_desc *mlx4_en_bounce_to_desc(struct mlx4_en_priv *priv,
-						      struct mlx4_en_tx_ring *ring,
-						      u32 index,
-						      unsigned int desc_size)
-{
-	u32 copy = (ring->size - index) * TXBB_SIZE;
-	int i;
-
-	for (i = desc_size - copy - 4; i >= 0; i -= 4) {
-		if ((i & (TXBB_SIZE - 1)) == 0)
-			wmb();
-
-		*((u32 *) (ring->buf + i)) =
-			*((u32 *) (ring->bounce_buf + copy + i));
-	}
-
-	for (i = copy - 4; i >= 4 ; i -= 4) {
-		if ((i & (TXBB_SIZE - 1)) == 0)
-			wmb();
-
-		*((u32 *) (ring->buf + index * TXBB_SIZE + i)) =
-			*((u32 *) (ring->bounce_buf + i));
-	}
-
-	/* Return real descriptor location */
-	return (struct mlx4_en_tx_desc *)(ring->buf + index * TXBB_SIZE);
-}
-
 static inline void mlx4_en_xmit_poll(struct mlx4_en_priv *priv, int tx_ind)
 {
 	struct mlx4_en_cq *cq = priv->tx_cq[tx_ind];
@@ -546,30 +507,22 @@
 	}
 }
 
-static int is_inline(struct mbuf *mb, int thold)
+static u16
+mlx4_en_get_inline_hdr_size(struct mlx4_en_tx_ring *ring, struct mbuf *mb)
 {
-	if (thold && mb->m_pkthdr.len <= thold &&
-	    (mb->m_pkthdr.csum_flags & CSUM_TSO) == 0)
-		return 1;
+	u16 retval;
 
-	return 0;
-}
+	/* only copy from first fragment, if possible */
+	retval = MIN(ring->inline_thold, mb->m_len);
 
-static int inline_size(struct mbuf *mb)
-{
-	int len;
-
-	len = mb->m_pkthdr.len;
-	if (len + CTRL_SIZE + sizeof(struct mlx4_wqe_inline_seg)
-	    <= MLX4_INLINE_ALIGN)
-		return ALIGN(len + CTRL_SIZE +
-		    sizeof(struct mlx4_wqe_inline_seg), 16);
-	else
-		return ALIGN(len + CTRL_SIZE + 2 *
-		    sizeof(struct mlx4_wqe_inline_seg), 16);
+	/* check for too little data */
+	if (unlikely(retval < MIN_PKT_LEN))
+		retval = MIN(ring->inline_thold, mb->m_pkthdr.len);
+	return (retval);
 }
 
-static int get_head_size(struct mbuf *mb)
+static int
+mlx4_en_get_header_size(struct mbuf *mb)
 {
 	struct ether_vlan_header *eh;
 	struct tcphdr *th;
@@ -622,83 +575,48 @@
 	return (eth_hdr_len);
 }
 
-static int get_real_size(struct mbuf *mb, struct net_device *dev, int *p_n_segs,
-    int *lso_header_size, int inl)
-{
-	struct mbuf *m;
-	int nr_segs = 0;
-
-	for (m = mb; m != NULL; m = m->m_next)
-		if (m->m_len)
-			nr_segs++;
-
-	if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
-		*lso_header_size = get_head_size(mb);
-		if (*lso_header_size) {
-			if (mb->m_len == *lso_header_size)
-				nr_segs--;
-			*p_n_segs = nr_segs;
-			return CTRL_SIZE + nr_segs * DS_SIZE +
-			    ALIGN(*lso_header_size + 4, DS_SIZE);
-		}
-	} else
-		*lso_header_size = 0;
-	*p_n_segs = nr_segs;
-	if (inl)
-		return inline_size(mb);
-	return (CTRL_SIZE + nr_segs * DS_SIZE);
-}
-
-static struct mbuf *mb_copy(struct mbuf *mb, int *offp, char *data, int len)
-{
-	int bytes;
-	int off;
-
-	off = *offp;
-	while (len) {
-		bytes = min(mb->m_len - off, len);
-		if (bytes)
-			memcpy(data, mb->m_data + off, bytes);
-		len -= bytes;
-		data += bytes;
-		off += bytes;
-		if (off == mb->m_len) {
-			off = 0;
-			mb = mb->m_next;
-		}
-	}
-	*offp = off;
-	return (mb);
-}
-
-static void build_inline_wqe(struct mlx4_en_tx_desc *tx_desc, struct mbuf *mb,
-    int real_size, u16 *vlan_tag, int tx_ind)
-{
-	struct mlx4_wqe_inline_seg *inl = &tx_desc->inl;
-	int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - sizeof *inl;
-	int len;
-	int off;
-
-	off = 0;
-	len = mb->m_pkthdr.len;
-	if (len <= spc) {
-		inl->byte_count = cpu_to_be32(1 << 31 |
-		    (max_t(typeof(len), len, MIN_PKT_LEN)));
-		mb_copy(mb, &off, (void *)(inl + 1), len);
-		if (len < MIN_PKT_LEN)
-			memset(((void *)(inl + 1)) + len, 0,
-			    MIN_PKT_LEN - len);
+static volatile struct mlx4_wqe_data_seg *
+mlx4_en_store_inline_data(volatile struct mlx4_wqe_data_seg *dseg,
+    struct mbuf *mb, int len, __be32 owner_bit)
+{
+	uint8_t *inl = __DEVOLATILE(uint8_t *, dseg);
+	const int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - 4;
+
+	if (unlikely(len < MIN_PKT_LEN)) {
+		m_copydata(mb, 0, len, inl + 4);
+		memset(inl + 4 + len, 0, MIN_PKT_LEN - len);
+		dseg += DIV_ROUND_UP(4 + MIN_PKT_LEN, DS_SIZE_ALIGNMENT);
+	} else if (len <= spc) {
+		m_copydata(mb, 0, len, inl + 4);
+		dseg += DIV_ROUND_UP(4 + len, DS_SIZE_ALIGNMENT);
+	} else {
+		m_copydata(mb, 0, spc, inl + 4);
+		m_copydata(mb, spc, len - spc, inl + 8 + spc);
+		dseg += DIV_ROUND_UP(8 + len, DS_SIZE_ALIGNMENT);
+	}
+	return (dseg);
+}
+
+static void
+mlx4_en_store_inline_header(volatile struct mlx4_wqe_data_seg *dseg,
+    int len, __be32 owner_bit)
+{
+	uint8_t *inl = __DEVOLATILE(uint8_t *, dseg);
+	const int spc = MLX4_INLINE_ALIGN - CTRL_SIZE - 4;
+
+	if (unlikely(len < MIN_PKT_LEN)) {
+		*(volatile uint32_t *)inl =
+		    SET_BYTE_COUNT((1 << 31) | MIN_PKT_LEN);
+	} else if (len <= spc) {
+		*(volatile uint32_t *)inl =
+		    SET_BYTE_COUNT((1 << 31) | len);
 	} else {
-		inl->byte_count = cpu_to_be32(1 << 31 | spc);
-		mb = mb_copy(mb, &off, (void *)(inl + 1), spc);
-		inl = (void *) (inl + 1) + spc;
-		mb_copy(mb, &off, (void *)(inl + 1), len - spc);
+		*(volatile uint32_t *)(inl + 4 + spc) =
+		    SET_BYTE_COUNT((1 << 31) | (len - spc));
 		wmb();
-		inl->byte_count = cpu_to_be32(1 << 31 | (len - spc));
+		*(volatile uint32_t *)inl =
+		    SET_BYTE_COUNT((1 << 31) | spc);
 	}
-	tx_desc->ctrl.vlan_tag = cpu_to_be16(*vlan_tag);
-	tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * !!(*vlan_tag);
-	tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
 }
 
 static uint32_t hashrandom;
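The new mlx4_en_tx_ring_is_full() above replaces the old precomputed ring->full_size threshold: it counts the free slots (prod minus cons on free-running counters) and keeps a reserve of headroom plus two maximum-sized WQEs. A small stand-alone sketch of the same arithmetic follows; the two constants are given illustrative values here as assumptions, since the real definitions live in mlx4_en.h.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative values only; the driver takes these from mlx4_en.h. */
#define HEADROOM			4
#define MLX4_EN_TX_WQE_MAX_WQEBBS	16

/* Same test as mlx4_en_tx_ring_is_full(): stop posting when fewer than
 * HEADROOM + two worst-case WQEs worth of TXBB slots remain free. */
static bool
tx_ring_is_full(uint32_t size, uint32_t prod, uint32_t cons)
{
	int wqs = (int)(size - (prod - cons));

	return (wqs < (HEADROOM + (2 * MLX4_EN_TX_WQE_MAX_WQEBBS)));
}

int
main(void)
{
	uint32_t size = 1024, cons = 100;

	/* prod and cons are free-running; unsigned wrap keeps the math right */
	printf("%d\n", tx_ring_is_full(size, cons + 500, cons));	/* 0: room left */
	printf("%d\n", tx_ring_is_full(size, cons + 1000, cons));	/* 1: nearly full */
	return (0);
}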
@@ -748,168 +666,263 @@
 	return mac;
 }
 
-static int mlx4_en_xmit(struct net_device *dev, int tx_ind, struct mbuf **mbp)
+static int mlx4_en_xmit(struct mlx4_en_priv *priv, int tx_ind, struct mbuf **mbp)
 {
-	struct mlx4_en_priv *priv = netdev_priv(dev);
-	struct mlx4_en_dev *mdev = priv->mdev;
-	struct mlx4_en_tx_ring *ring;
-	struct mlx4_en_cq *cq;
-	struct mlx4_en_tx_desc *tx_desc;
-	struct mlx4_wqe_data_seg *data;
+	enum {
+		DS_FACT = TXBB_SIZE / DS_SIZE_ALIGNMENT,
+		CTRL_FLAGS = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE |
+		    MLX4_WQE_CTRL_SOLICITED),
+	};
+	bus_dma_segment_t segs[MLX4_EN_TX_MAX_MBUF_FRAGS];
+	volatile struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_data_seg *dseg_inline;
+	volatile struct mlx4_en_tx_desc *tx_desc;
+	struct mlx4_en_tx_ring *ring = priv->tx_ring[tx_ind];
+	struct ifnet *ifp = priv->dev;
 	struct mlx4_en_tx_info *tx_info;
+	struct mbuf *mb = *mbp;
 	struct mbuf *m;
-	int nr_txbb;
+	__be32 owner_bit;
 	int nr_segs;
-	int desc_size;
-	int real_size;
-	dma_addr_t dma;
-	u32 index, bf_index, ring_size;
-	__be32 op_own;
-	u16 vlan_tag = 0;
-	int i;
-	int lso_header_size;
-	bool bounce = false;
-	bool inl = false;
-	struct mbuf *mb;
-	mb = *mbp;
-	int defrag = 1;
-
-	if (!priv->port_up)
-		goto tx_drop;
-
-	ring = priv->tx_ring[tx_ind];
-	ring_size = ring->size;
-	inl = is_inline(mb, ring->inline_thold);
-
-retry:
-	real_size = get_real_size(mb, dev, &nr_segs, &lso_header_size, inl);
-	if (unlikely(!real_size))
-		goto tx_drop;
+	int pad;
+	int err;
+	u32 bf_size;
+	u32 bf_prod;
+	u32 opcode;
+	u16 index;
+	u16 ds_cnt;
+	u16 ihs;
 
-	/* Align descriptor to TXBB size */
-	desc_size = ALIGN(real_size, TXBB_SIZE);
-	nr_txbb = desc_size / TXBB_SIZE;
-	if (unlikely(nr_txbb > MAX_DESC_TXBBS)) {
-		if (defrag) {
-			mb = m_defrag(*mbp, M_NOWAIT);
-			if (mb == NULL) {
-				mb = *mbp;
-				goto tx_drop;
-			}
-			*mbp = mb;
-			defrag = 0;
-			goto retry;
-		}
-		en_warn(priv, "Oversized header or SG list\n");
+	if (unlikely(!priv->port_up)) {
+		err = EINVAL;
 		goto tx_drop;
 	}
 
-	/* Obtain VLAN information if present */
-	if (mb->m_flags & M_VLANTAG) {
-		vlan_tag = mb->m_pkthdr.ether_vtag;
-	}
-
-	/* Check available TXBBs and 2K spare for prefetch
-	 * Even if netif_tx_stop_queue() will be called
-	 * driver will send current packet to ensure
-	 * that at least one completion will be issued after
-	 * stopping the queue
-	 */
-	if (unlikely((int)(ring->prod - ring->cons) > ring->full_size)) {
-		/* every full Tx ring stops queue */
-		if (ring->blocked == 0)
-			atomic_add_int(&priv->blocked, 1);
-		/* Set HW-queue-is-full flag */
-		atomic_set_int(&dev->if_drv_flags, IFF_DRV_OACTIVE);
+	/* check if TX ring is full */
+	if (unlikely(mlx4_en_tx_ring_is_full(ring))) {
+		/* every full native Tx ring stops queue */
+		if (ring->blocked == 0)
+			atomic_add_int(&priv->blocked, 1);
+		/* Set HW-queue-is-full flag */
+		atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE);
+		priv->port_stats.queue_stopped++;
 		ring->blocked = 1;
 		priv->port_stats.queue_stopped++;
 		ring->queue_stopped++;
 
 		/* Use interrupts to find out when queue opened */
-		cq = priv->tx_cq[tx_ind];
-		mlx4_en_arm_cq(priv, cq);
-		return EBUSY;
+		mlx4_en_arm_cq(priv, priv->tx_cq[tx_ind]);
+		return (ENOBUFS);
 	}
 
+	/* sanity check we are not wrapping around */
+	KASSERT(((~ring->prod) & ring->size_mask) >=
+	    (MLX4_EN_TX_WQE_MAX_WQEBBS - 1), ("Wrapping around TX ring"));
+
 	/* Track current inflight packets for performance analysis */
 	AVG_PERF_COUNTER(priv->pstats.inflight_avg,
 	    (u32) (ring->prod - ring->cons - 1));
 
-	/* Packet is good - grab an index and transmit it */
+	/* Track current mbuf packet header length */
+	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len);
+
+	/* Grab an index and try to transmit packet */
+	owner_bit = (ring->prod & ring->size) ?
+	    cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0;
 	index = ring->prod & ring->size_mask;
-	bf_index = ring->prod;
+	tx_desc = (volatile struct mlx4_en_tx_desc *)
+	    (ring->buf + index * TXBB_SIZE);
+	tx_info = &ring->tx_info[index];
+	dseg = &tx_desc->data;
 
-	/* See if we have enough space for whole descriptor TXBB for setting
-	 * SW ownership on next descriptor; if not, use a bounce buffer. */
-	if (likely(index + nr_txbb <= ring_size))
-		tx_desc = (struct mlx4_en_tx_desc *)(ring->buf + index * TXBB_SIZE);
-	else {
-		tx_desc = (struct mlx4_en_tx_desc *) ring->bounce_buf;
-		bounce = true;
+	/* send a copy of the frame to the BPF listener, if any */
+	if (ifp != NULL && ifp->if_bpf != NULL)
+		ETHER_BPF_MTAP(ifp, mb);
+
+	/* get default flags */
+	tx_desc->ctrl.srcrb_flags = CTRL_FLAGS;
+
+	if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))
+		tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
+
+	if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP |
+	    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
+		tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
+
+	/* do statistics */
+	if (likely(tx_desc->ctrl.srcrb_flags != CTRL_FLAGS)) {
+		priv->port_stats.tx_chksum_offload++;
+		ring->tx_csum++;
 	}
 
-	/* Save mb in tx_info ring */
-	tx_info = &ring->tx_info[index];
-	tx_info->mb = mb;
-	tx_info->nr_txbb = nr_txbb;
-	tx_info->nr_segs = nr_segs;
+	/* check for VLAN tag */
+	if (mb->m_flags & M_VLANTAG) {
+		tx_desc->ctrl.vlan_tag = cpu_to_be16(mb->m_pkthdr.ether_vtag);
+		tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN;
+	} else {
+		tx_desc->ctrl.vlan_tag = 0;
+		tx_desc->ctrl.ins_vlan = 0;
+	}
+
+	/* clear immediate field */
+	tx_desc->ctrl.imm = 0;
+
+	/* Handle LSO (TSO) packets */
+	if (mb->m_pkthdr.csum_flags & CSUM_TSO) {
+		u32 payload_len;
+		u32 mss = mb->m_pkthdr.tso_segsz;
+		u32 num_pkts;
+
+		opcode = cpu_to_be32(MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR) |
+		    owner_bit;
+		ihs = mlx4_en_get_header_size(mb);
+		if (unlikely(ihs > MAX_INLINE)) {
+			ring->oversized_packets++;
+			err = EINVAL;
+			goto tx_drop;
+		}
+		tx_desc->lso.mss_hdr_size = cpu_to_be32((mss << 16) | ihs);
+		payload_len = mb->m_pkthdr.len - ihs;
+		if (unlikely(payload_len == 0))
+			num_pkts = 1;
+		else
+			num_pkts = DIV_ROUND_UP(payload_len, mss);
+		ring->bytes += payload_len + (num_pkts * ihs);
+		ring->packets += num_pkts;
+		priv->port_stats.tso_packets++;
+		/* store pointer to inline header */
+		dseg_inline = dseg;
+		/* copy data inline */
+		dseg = mlx4_en_store_inline_lso_data(dseg,
+		    mb, ihs, owner_bit);
+	} else {
+		opcode = cpu_to_be32(MLX4_OPCODE_SEND) |
+		    owner_bit;
+		ihs = mlx4_en_get_inline_hdr_size(ring, mb);
+		ring->bytes += max_t (unsigned int,
+		    mb->m_pkthdr.len, ETHER_MIN_LEN - ETHER_CRC_LEN);
+		ring->packets++;
+		/* store pointer to inline header */
+		dseg_inline = dseg;
+		/* copy data inline */
+		dseg = mlx4_en_store_inline_data(dseg,
+		    mb, ihs, owner_bit);
+	}
+	m_adj(mb, ihs);
+
+	/* trim off empty mbufs */
+	while (mb->m_len == 0) {
+		mb = m_free(mb);
+		/* check if all data has been inlined */
+		if (mb == NULL) {
+			nr_segs = 0;
+			goto skip_dma;
+		}
+	}
 
-	if (lso_header_size) {
-		memcpy(tx_desc->lso.header, mb->m_data, lso_header_size);
-		data = ((void *)&tx_desc->lso + ALIGN(lso_header_size + 4,
-		    DS_SIZE));
-		/* lso header is part of m_data.
-		 * need to omit when mapping DMA */
-		mb->m_data += lso_header_size;
-		mb->m_len -= lso_header_size;
+	err = bus_dmamap_load_mbuf_sg(ring->dma_tag, tx_info->dma_map,
+	    mb, segs, &nr_segs, BUS_DMA_NOWAIT);
+	if (unlikely(err == EFBIG)) {
+		/* Too many mbuf fragments */
+		m = m_defrag(mb, M_NOWAIT);
+		if (m == NULL) {
+			ring->oversized_packets++;
+			goto tx_drop;
+		}
+		mb = m;
+		/* Try again */
+		err = bus_dmamap_load_mbuf_sg(ring->dma_tag, tx_info->dma_map,
+		    mb, segs, &nr_segs, BUS_DMA_NOWAIT);
+	}
+	/* catch errors */
+	if (unlikely(err != 0)) {
+		ring->oversized_packets++;
+		goto tx_drop;
 	}
-	else
-		data = &tx_desc->data;
+	/* make sure all mbuf data is written to RAM */
+	bus_dmamap_sync(ring->dma_tag, tx_info->dma_map,
+	    BUS_DMASYNC_PREWRITE);
+
+skip_dma:
+	/* compute number of DS needed */
+	ds_cnt = (dseg - ((volatile struct mlx4_wqe_data_seg *)tx_desc)) + nr_segs;
 
-	/* valid only for none inline segments */
-	tx_info->data_offset = (void *)data - (void *)tx_desc;
+	/*
+	 * Check if the next request can wrap around and fill the end
+	 * of the current request with zero immediate data:
	 */
+	pad = DIV_ROUND_UP(ds_cnt, DS_FACT);
+	pad = (~(ring->prod + pad)) & ring->size_mask;
 
-	if (inl) {
-		tx_info->inl = 1;
+	if (unlikely(pad < (MLX4_EN_TX_WQE_MAX_WQEBBS - 1))) {
+		/*
+		 * Compute the least number of DS blocks we need to
+		 * pad in order to achieve a TX ring wraparound:
+		 */
+		pad = (DS_FACT * (pad + 1));
 	} else {
-		for (i = 0, m = mb; i < nr_segs; i++, m = m->m_next) {
-			if (m->m_len == 0) {
-				i--;
-				continue;
-			}
-			dma = pci_map_single(mdev->dev->pdev, m->m_data,
-			    m->m_len, PCI_DMA_TODEVICE);
-			data->addr = cpu_to_be64(dma);
-			data->lkey = cpu_to_be32(mdev->mr.key);
-			wmb();
-			data->byte_count = cpu_to_be32(m->m_len);
-			data++;
-		}
-		if (lso_header_size) {
-			mb->m_data -= lso_header_size;
-			mb->m_len += lso_header_size;
-		}
-		tx_info->inl = 0;
+		/*
+		 * The hardware will automatically jump to the next
+		 * TXBB. No need for padding.
+		 */
+		pad = 0;
 	}
 
+	/* compute total number of DS blocks */
+	ds_cnt += pad;
+
+	/*
+	 * When modifying this code, please ensure that the following
+	 * computation is always less than or equal to 0x3F:
+	 *
+	 * ((MLX4_EN_TX_WQE_MAX_WQEBBS - 1) * DS_FACT) +
+	 * (MLX4_EN_TX_WQE_MAX_WQEBBS * DS_FACT)
+	 *
+	 * Else the "ds_cnt" variable can become too big.
+	 */
+	tx_desc->ctrl.fence_size = (ds_cnt & 0x3f);
 
-	/* Prepare ctrl segement apart opcode+ownership, which depends on
-	 * whether LSO is used */
-	tx_desc->ctrl.vlan_tag = cpu_to_be16(vlan_tag);
-	tx_desc->ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
-	    !!vlan_tag;
-	tx_desc->ctrl.fence_size = (real_size / 16) & 0x3f;
-	tx_desc->ctrl.srcrb_flags = priv->ctrl_flags;
-	if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO |
-	    CSUM_TCP | CSUM_UDP | CSUM_TCP_IPV6 | CSUM_UDP_IPV6)) {
-		if (mb->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO))
-			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
-		if (mb->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP |
-		    CSUM_UDP_IPV6 | CSUM_TCP_IPV6 | CSUM_TSO))
-			tx_desc->ctrl.srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
-		priv->port_stats.tx_chksum_offload++;
-		ring->tx_csum++;
-	}
+	/* store pointer to mbuf */
+	tx_info->mb = mb;
+	tx_info->nr_txbb = DIV_ROUND_UP(ds_cnt, DS_FACT);
+	bf_size = ds_cnt * DS_SIZE_ALIGNMENT;
+	bf_prod = ring->prod;
+
+	/* compute end of "dseg" array */
+	dseg += nr_segs + pad;
+
+	/* pad using zero immediate dseg */
+	while (pad--) {
+		dseg--;
+		dseg->addr = 0;
+		dseg->lkey = 0;
+		wmb();
+		dseg->byte_count = SET_BYTE_COUNT((1 << 31)|0);
+	}
+
+	/* fill segment list */
+	while (nr_segs--) {
+		if (unlikely(segs[nr_segs].ds_len == 0)) {
+			dseg--;
+			dseg->addr = 0;
+			dseg->lkey = 0;
+			wmb();
+			dseg->byte_count = SET_BYTE_COUNT((1 << 31)|0);
+		} else {
+			dseg--;
+			dseg->addr = cpu_to_be64((uint64_t)segs[nr_segs].ds_addr);
+			dseg->lkey = cpu_to_be32(priv->mdev->mr.key);
+			wmb();
+			dseg->byte_count = SET_BYTE_COUNT((uint32_t)segs[nr_segs].ds_len);
+		}
+	}
+
+	wmb();
+
+	/* write owner bits in reverse order */
+	if ((opcode & cpu_to_be32(0x1F)) == cpu_to_be32(MLX4_OPCODE_LSO))
+		mlx4_en_store_inline_lso_header(dseg_inline, ihs, owner_bit);
+	else
+		mlx4_en_store_inline_header(dseg_inline, ihs, owner_bit);
 
 	if (unlikely(priv->validate_loopback)) {
 		/* Copy dst mac address to wqe */
@@ -927,77 +940,46 @@
 		}
 	}
 
-	/* Handle LSO (TSO) packets */
-	if (lso_header_size) {
-		int segsz;
-		/* Mark opcode as LSO */
-		op_own = cpu_to_be32(MLX4_OPCODE_LSO | (1 << 6)) |
-		    ((ring->prod & ring_size) ?
-		    cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
-
-		/* Fill in the LSO prefix */
-		tx_desc->lso.mss_hdr_size = cpu_to_be32(
-		    mb->m_pkthdr.tso_segsz << 16 | lso_header_size);
-
-		priv->port_stats.tso_packets++;
-		segsz = mb->m_pkthdr.tso_segsz;
-		i = ((mb->m_pkthdr.len - lso_header_size + segsz - 1) / segsz);
-		tx_info->nr_bytes= mb->m_pkthdr.len + (i - 1) * lso_header_size;
-		ring->packets += i;
-	} else {
-		/* Normal (Non LSO) packet */
-		op_own = cpu_to_be32(MLX4_OPCODE_SEND) |
-		    ((ring->prod & ring_size) ?
-		    cpu_to_be32(MLX4_EN_BIT_DESC_OWN) : 0);
-		tx_info->nr_bytes = max(mb->m_pkthdr.len,
-		    (unsigned int)ETHER_MIN_LEN - ETHER_CRC_LEN);
-		ring->packets++;
-
-	}
-	ring->bytes += tx_info->nr_bytes;
-	AVG_PERF_COUNTER(priv->pstats.tx_pktsz_avg, mb->m_pkthdr.len);
+	/* update producer counter */
+	ring->prod += tx_info->nr_txbb;
 
-	if (tx_info->inl) {
-		build_inline_wqe(tx_desc, mb, real_size, &vlan_tag, tx_ind);
-		tx_info->inl = 1;
-	}
-
-	ring->prod += nr_txbb;
-
-
-	/* If we used a bounce buffer then copy descriptor back into place */
-	if (unlikely(bounce))
-		tx_desc = mlx4_en_bounce_to_desc(priv, ring, index, desc_size);
-
-	if (ring->bf_enabled && desc_size <= MAX_BF && !bounce && !vlan_tag) {
-		*(__be32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn);
-		op_own |= htonl((bf_index & 0xffff) << 8);
-		/* Ensure new descirptor hits memory
-		 * before setting ownership of this descriptor to HW */
-		wmb();
-		tx_desc->ctrl.owner_opcode = op_own;
+	if (ring->bf_enabled && bf_size <= MAX_BF &&
+	    (tx_desc->ctrl.ins_vlan != MLX4_WQE_CTRL_INS_VLAN)) {
 
-		wmb();
+		/* store doorbell number */
+		*(volatile __be32 *) (&tx_desc->ctrl.vlan_tag) |= cpu_to_be32(ring->doorbell_qpn);
 
-		mlx4_bf_copy(ring->bf.reg + ring->bf.offset, (unsigned long *) &tx_desc->ctrl,
-		    desc_size);
+		/* or in producer number for this WQE */
+		opcode |= cpu_to_be32((bf_prod & 0xffff) << 8);
 
+		/*
+		 * Ensure the new descriptor hits memory before
+		 * setting ownership of this descriptor to HW:
+		 */
+		wmb();
+		tx_desc->ctrl.owner_opcode = opcode;
+		wmb();
+		mlx4_bf_copy(((u8 *)ring->bf.reg) + ring->bf.offset,
+		    (volatile unsigned long *) &tx_desc->ctrl, bf_size);
 		wmb();
-
 		ring->bf.offset ^= ring->bf.buf_size;
 	} else {
-		/* Ensure new descirptor hits memory
-		 * before setting ownership of this descriptor to HW */
+		/*
+		 * Ensure the new descriptor hits memory before
		 * setting ownership of this descriptor to HW:
+		 */
 		wmb();
-		tx_desc->ctrl.owner_opcode = op_own;
+		tx_desc->ctrl.owner_opcode = opcode;
 		wmb();
-		writel(cpu_to_be32(ring->doorbell_qpn), ring->bf.uar->map + MLX4_SEND_DOORBELL);
+		writel(cpu_to_be32(ring->doorbell_qpn),
+		    ((u8 *)ring->bf.uar->map) + MLX4_SEND_DOORBELL);
 	}
 
-	return 0;
+	return (0);
 
 tx_drop:
 	*mbp = NULL;
 	m_freem(mb);
-	return EINVAL;
+	return (err);
 }
 
 static int
@@ -1026,7 +1008,7 @@
 
 	/* Process the queue */
 	while ((next = drbr_peek(dev, ring->br)) != NULL) {
-		if ((err = mlx4_en_xmit(dev, tx_ind, &next)) != 0) {
+		if (mlx4_en_xmit(priv, tx_ind, &next) != 0) {
 			if (next == NULL) {
 				drbr_advance(dev, ring->br);
 			} else {
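The reworked mlx4_en_xmit() above no longer copies wrapping descriptors through a bounce buffer; instead, when the next maximum-sized WQE could cross the end of the ring, it extends the current WQE with zero-length data segments so the next one starts at the ring beginning. A stand-alone sketch of just that padding computation follows; TXBB_SIZE and DS_SIZE_ALIGNMENT are assumed to be 64 and 16 bytes (the usual mlx4 WQE geometry), and MLX4_EN_TX_WQE_MAX_WQEBBS is given an illustrative value since its real definition is in mlx4_en.h.

#include <stdint.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Assumed WQE geometry: 64-byte basic blocks, 16-byte data segments. */
#define TXBB_SIZE			64
#define DS_SIZE_ALIGNMENT		16
#define DS_FACT				(TXBB_SIZE / DS_SIZE_ALIGNMENT)
/* Illustrative maximum WQE size in basic blocks. */
#define MLX4_EN_TX_WQE_MAX_WQEBBS	16

/*
 * Mirror of the tail-padding computation in mlx4_en_xmit(): if the next
 * maximum-sized WQE could wrap past the ring end, return how many extra
 * zero-length DS blocks to append so this WQE ends exactly at the boundary.
 */
static uint16_t
tx_pad_ds(uint32_t prod, uint32_t size_mask, uint16_t ds_cnt)
{
	uint32_t pad = DIV_ROUND_UP(ds_cnt, DS_FACT);

	pad = (~(prod + pad)) & size_mask;
	if (pad < (MLX4_EN_TX_WQE_MAX_WQEBBS - 1))
		return (DS_FACT * (pad + 1));
	return (0);
}

int
main(void)
{
	uint32_t size_mask = 1024 - 1;

	/* far from the wrap point: no padding needed */
	printf("pad=%u\n", tx_pad_ds(100, size_mask, 6));
	/* a few TXBBs before the wrap point: pad out to the ring end */
	printf("pad=%u\n", tx_pad_ds(1020, size_mask, 6));
	return (0);
}

Keeping ds_cnt bounded this way is also what allows the 6-bit fence_size field to be written directly, as the comment in the hunk above explains.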
Index: head/sys/ofed/drivers/net/mlx4/mlx4_en.h
===================================================================
--- head/sys/ofed/drivers/net/mlx4/mlx4_en.h
+++ head/sys/ofed/drivers/net/mlx4/mlx4_en.h
@@ -93,10 +93,6 @@
 #define VLAN_MIN_VALUE		1
 #define VLAN_MAX_VALUE		4094
 
-/* Typical TSO descriptor with 16 gather entries is 352 bytes... */
-#define MAX_DESC_SIZE		512
-#define MAX_DESC_TXBBS		(MAX_DESC_SIZE / TXBB_SIZE)
-
 /*
  * OS related constants and tunables
  */
@@ -111,26 +107,6 @@
 	MLX4_EN_ALLOC_REPLACEMENT = 1,
 };
 
-/* Receive fragment sizes; we use at most 3 fragments (for 9600 byte MTU
- * and 4K allocations) */
-#if MJUMPAGESIZE == 4096
-enum {
-	FRAG_SZ0 = MCLBYTES,
-	FRAG_SZ1 = MJUMPAGESIZE,
-	FRAG_SZ2 = MJUMPAGESIZE,
-};
-#define MLX4_EN_MAX_RX_FRAGS	3
-#elif MJUMPAGESIZE == 8192
-enum {
-	FRAG_SZ0 = MCLBYTES,
-	FRAG_SZ1 = MJUMPAGESIZE,
-};
-#define MLX4_EN_MAX_RX_FRAGS	2
-#elif MJUMPAGESIZE == 8192
-#else
-#error	"Unknown PAGE_SIZE"
-#endif
-
 /* Maximum ring sizes */
 #define MLX4_EN_DEF_TX_QUEUE_SIZE	4096
 
@@ -233,16 +209,10 @@
 #define ILLEGAL_MAC(addr)	(addr == 0xffffffffffffULL || addr == 0x0)
 
 struct mlx4_en_tx_info {
+	bus_dmamap_t dma_map;
 	struct mbuf *mb;
 	u32 nr_txbb;
 	u32 nr_bytes;
-	u8 linear;
-	u8 nr_segs;
-	u8 data_offset;
-	u8 inl;
-#if 0
-	u8 ts_requested;
-#endif
 };
 
 
@@ -278,6 +248,7 @@
 
 struct mlx4_en_tx_ring {
 	spinlock_t tx_lock;
+	bus_dma_tag_t dma_tag;
 	struct mlx4_hwq_resources wqres;
 	u32 size ; /* number of TXBBs */
 	u32 size_mask;
@@ -291,7 +262,6 @@
 	u16 poll_cnt;
 	int blocked;
 	struct mlx4_en_tx_info *tx_info;
-	u8 *bounce_buf;
 	u8 queue_index;
 	cpuset_t affinity_mask;
 	struct buf_ring *br;
@@ -309,10 +279,8 @@
 	unsigned long wake_queue;
 	struct mlx4_bf bf;
 	bool bf_enabled;
-	struct netdev_queue *tx_queue;
 	int hwtstamp_tx_type;
 	spinlock_t comp_lock;
-	int full_size;
 	int inline_thold;
 	u64 watchdog_time;
 };
@@ -322,14 +290,21 @@
 	struct mlx4_wqe_data_seg data[0];
 };
 
-struct mlx4_en_rx_buf {
-	dma_addr_t dma;
-	struct page *page;
-	unsigned int page_offset;
+struct mlx4_en_rx_mbuf {
+	bus_dmamap_t dma_map;
+	struct mbuf *mbuf;
+};
+
+struct mlx4_en_rx_spare {
+	bus_dmamap_t dma_map;
+	struct mbuf *mbuf;
+	u64 paddr_be;
 };
 
 struct mlx4_en_rx_ring {
 	struct mlx4_hwq_resources wqres;
+	bus_dma_tag_t dma_tag;
+	struct mlx4_en_rx_spare spare;
 	u32 size ; /* number of Rx descs*/
 	u32 actual_size;
 	u32 size_mask;
@@ -346,7 +321,7 @@
 	u32 rx_mb_size;
 	int qpn;
 	u8 *buf;
-	void *rx_info;
+	struct mlx4_en_rx_mbuf *mbuf;
 	unsigned long errors;
 	unsigned long bytes;
 	unsigned long packets;
@@ -531,12 +506,6 @@
 	u8 vport_num;
 };
 
-struct mlx4_en_frag_info {
-	u16 frag_size;
-	u16 frag_prefix_size;
-};
-
-
 struct mlx4_en_priv {
 	struct mlx4_en_dev *mdev;
 	struct mlx4_en_port_profile *prof;
@@ -582,18 +551,14 @@
 	int cqe_factor;
 
 	struct mlx4_en_rss_map rss_map;
-	__be32 ctrl_flags;
 	u32 flags;
 	u8 num_tx_rings_p_up;
 	u32 tx_ring_num;
 	u32 rx_ring_num;
 	u32 rx_mb_size;
-	struct mlx4_en_frag_info frag_info[MLX4_EN_MAX_RX_FRAGS];
 	u16 rx_alloc_order;
 	u32 rx_alloc_size;
 	u32 rx_buf_size;
-	u16 num_frags;
-	u16 log_rx_info;
 
 	struct mlx4_en_tx_ring **tx_ring;
 	struct mlx4_en_rx_ring *rx_ring[MAX_RX_RINGS];
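The header changes above add one bus_dma_tag_t per ring and one bus_dmamap_t per slot (plus a spare map on the RX side), which is the standard busdma(9) ownership pattern that the new en_rx.c and en_tx.c code follows. The condensed kernel-context sketch below shows one slot's life cycle; it is only an illustration of the call sequence (the function name and parent device parameter are placeholders, and it will not build outside a kernel module), not the driver's actual code.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/mbuf.h>
#include <machine/bus.h>

static int
rx_slot_cycle(device_t parent_dev)
{
	bus_dma_tag_t tag;
	bus_dmamap_t map;
	bus_dma_segment_t segs[1];
	struct mbuf *mb;
	int nsegs, error;

	/* One tag per ring, sized for the largest receive buffer. */
	error = bus_dma_tag_create(bus_get_dma_tag(parent_dev), 1, 0,
	    BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL,
	    MJUM16BYTES, 1, MJUM16BYTES, 0, NULL, NULL, &tag);
	if (error != 0)
		return (error);

	/* One map per ring slot (the RX ring also keeps one spare map). */
	error = bus_dmamap_create(tag, 0, &map);
	if (error != 0)
		goto drop_tag;

	/* Fill the slot: wire an mbuf to the map and publish segs[0].ds_addr. */
	mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, MJUM16BYTES);
	if (mb == NULL) {
		error = ENOMEM;
		goto drop_map;
	}
	mb->m_len = MJUM16BYTES;
	error = bus_dmamap_load_mbuf_sg(tag, map, mb, segs, &nsegs,
	    BUS_DMA_NOWAIT);
	if (error == 0) {
		bus_dmamap_sync(tag, map, BUS_DMASYNC_PREREAD);
		/* ... hardware DMA writes into the buffer here ... */
		bus_dmamap_sync(tag, map, BUS_DMASYNC_POSTREAD);
		bus_dmamap_unload(tag, map);
	}
	m_freem(mb);
drop_map:
	bus_dmamap_destroy(tag, map);
drop_tag:
	bus_dma_tag_destroy(tag);
	return (error);
}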