Changeset View
Standalone View
sys/dev/ice/ice_common_txrx.h
- This file was added.
/* SPDX-License-Identifier: BSD-3-Clause */
/* Copyright (c) 2020, Intel Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*$FreeBSD$*/
/**
 * @file ice_common_txrx.h
 * @brief common Tx/Rx utility functions
 *
 * Contains common utility functions for the Tx/Rx hot path.
 *
 * The functions do depend on the if_pkt_info_t structure. A suitable
 * implementation of this structure must be provided if these functions are to
 * be used without the iflib networking stack.
 */
#ifndef _ICE_COMMON_TXRX_H_
#define _ICE_COMMON_TXRX_H_

#include <netinet/udp.h>
#include <netinet/sctp.h>
/** | |||||
* ice_tso_detect_sparse - detect TSO packets with too many segments | |||||
* @pi: packet information | |||||
* | |||||
* Hardware only transmits packets with a maximum of 8 descriptors. For TSO | |||||
* packets, hardware needs to be able to build the split packets using 8 or | |||||
* fewer descriptors. Additionally, the header must be contained within at | |||||
* most 3 descriptors. | |||||
* | |||||
* To verify this, we walk the headers to find out how many descriptors the | |||||
* headers require (usually 1). Then we ensure that, for each TSO segment, its | |||||
* data plus the headers are contained within 8 or fewer descriptors. | |||||
*/ | |||||
static inline int | |||||
ice_tso_detect_sparse(if_pkt_info_t pi) | |||||
{ | |||||
int count, curseg, i, hlen, segsz, seglen, tsolen, hdrs, maxsegs; | |||||
bus_dma_segment_t *segs = pi->ipi_segs; | |||||
int nsegs = pi->ipi_nsegs; | |||||
curseg = hdrs = 0; | |||||
hlen = pi->ipi_ehdrlen + pi->ipi_ip_hlen + pi->ipi_tcp_hlen; | |||||
tsolen = pi->ipi_len - hlen; | |||||
/* First, count the number of descriptors for the header. | |||||
* Additionally, make sure it does not span more than 3 segments. | |||||
*/ | |||||
i = 0; | |||||
curseg = segs[0].ds_len; | |||||
while (hlen > 0) { | |||||
hdrs++; | |||||
if (hdrs > ICE_MAX_TSO_HDR_SEGS) | |||||
return (1); | |||||
if (curseg == 0) { | |||||
i++; | |||||
if (__predict_false(i == nsegs)) | |||||
return (1); | |||||
curseg = segs[i].ds_len; | |||||
} | |||||
seglen = min(curseg, hlen); | |||||
curseg -= seglen; | |||||
hlen -= seglen; | |||||
} | |||||
gallatin: Can you remove or properly comment these lines (eg, // is frowned upon). When I sent you this… | |||||
Done Inline ActionsYea, I had forgotten these were still here. I really don't like this either, but there's unfortunately nothing we can do about it now :( jacob.e.keller_intel.com: Yea, I had forgotten these were still here.
I really don't like this either, but there's… | |||||
maxsegs = ICE_MAX_TX_SEGS - hdrs; | |||||
/* We must count the headers, in order to verify that they take up | |||||
* 3 or fewer descriptors. However, we don't need to check the data | |||||
* if the total segments is small. | |||||
*/ | |||||
if (nsegs <= maxsegs) | |||||
return (0); | |||||
count = 0; | |||||
/* Now check the data to make sure that each TSO segment is made up of | |||||
* no more than maxsegs descriptors. This ensures that hardware will | |||||
* be capable of performing TSO offload. | |||||
*/ | |||||
while (tsolen > 0) { | |||||
segsz = pi->ipi_tso_segsz; | |||||
while (segsz > 0 && tsolen != 0) { | |||||
count++; | |||||
if (count > maxsegs) { | |||||
return (1); | |||||
} | |||||
if (curseg == 0) { | |||||
i++; | |||||
if (__predict_false(i == nsegs)) { | |||||
return (1); | |||||
} | |||||
curseg = segs[i].ds_len; | |||||
} | |||||
seglen = min(curseg, segsz); | |||||
segsz -= seglen; | |||||
curseg -= seglen; | |||||
tsolen -= seglen; | |||||
} | |||||
count = 0; | |||||
} | |||||
return (0); | |||||
} | |||||
/** | |||||
* ice_tso_setup - Setup a context descriptor to prepare for a TSO packet | |||||
* @txq: the Tx queue to use | |||||
* @pi: the packet info to prepare for | |||||
* | |||||
* Setup a context descriptor in preparation for sending a Tx packet that | |||||
* requires the TSO offload. Returns the index of the descriptor to use when | |||||
* encapsulating the Tx packet data into descriptors. | |||||
*/ | |||||
static inline int | |||||
ice_tso_setup(struct ice_tx_queue *txq, if_pkt_info_t pi) | |||||
{ | |||||
struct ice_tx_ctx_desc *txd; | |||||
u32 cmd, mss, type, tsolen; | |||||
int idx; | |||||
u64 type_cmd_tso_mss; | |||||
idx = pi->ipi_pidx; | |||||
txd = (struct ice_tx_ctx_desc *)&txq->tx_base[idx]; | |||||
tsolen = pi->ipi_len - (pi->ipi_ehdrlen + pi->ipi_ip_hlen + pi->ipi_tcp_hlen); | |||||
type = ICE_TX_DESC_DTYPE_CTX; | |||||
cmd = ICE_TX_CTX_DESC_TSO; | |||||
/* TSO MSS must not be less than 64 */ | |||||
if (pi->ipi_tso_segsz < ICE_MIN_TSO_MSS) { | |||||
txq->stats.mss_too_small++; | |||||
pi->ipi_tso_segsz = ICE_MIN_TSO_MSS; | |||||
} | |||||
mss = pi->ipi_tso_segsz; | |||||
type_cmd_tso_mss = ((u64)type << ICE_TXD_CTX_QW1_DTYPE_S) | | |||||
((u64)cmd << ICE_TXD_CTX_QW1_CMD_S) | | |||||
((u64)tsolen << ICE_TXD_CTX_QW1_TSO_LEN_S) | | |||||
((u64)mss << ICE_TXD_CTX_QW1_MSS_S); | |||||
txd->qw1 = htole64(type_cmd_tso_mss); | |||||
txd->tunneling_params = htole32(0); | |||||
txq->tso++; | |||||
return ((idx + 1) & (txq->desc_count-1)); | |||||
} | |||||
Not Done Inline ActionsCool. Can you talk about what can be tunneled? gallatin: Cool. Can you talk about what can be tunneled? | |||||
Not Done Inline ActionsI'm not entirely sure, but the description for this field in the datasheet is almost exactly the same as it is in Fortville. The only difference in the field definition is that it looks like you can make Columbiaville calculate the tunneling UDP checksum, too. erj: I'm not entirely sure, but the description for this field in the datasheet is almost exactly… | |||||
/** | |||||
* ice_tx_setup_offload - Setup register values for performing a Tx offload | |||||
* @txq: The Tx queue, used to track checksum offload stats | |||||
* @pi: the packet info to program for | |||||
* @cmd: the cmd register value to update | |||||
* @off: the off register value to update | |||||
* | |||||
* Based on the packet info provided, update the cmd and off values for | |||||
* enabling Tx offloads. This depends on the packet type and which offloads | |||||
* have been requested. | |||||
* | |||||
* We also track the total number of times that we've requested hardware | |||||
* offload a particular type of checksum for debugging purposes. | |||||
*/ | |||||
static inline void | |||||
ice_tx_setup_offload(struct ice_tx_queue *txq, if_pkt_info_t pi, u32 *cmd, u32 *off) | |||||
{ | |||||
u32 remaining_csum_flags = pi->ipi_csum_flags; | |||||
switch (pi->ipi_etype) { | |||||
Not Done Inline ActionsWhy a counter and not just a uint64_t? Note that access to the tx/rx rings should be serialized at the point your touching counters. Counters burn 16 bytes per element per cpu, due to how UMA works. gallatin: Why a counter and not just a uint64_t?
Note that access to the tx/rx rings should be… | |||||
Done Inline ActionsEric, can you recall why these counters are used instead of just using a u64? I don't remember offhand. jacob.e.keller_intel.com: Eric, can you recall why these counters are used instead of just using a u64? I don't remember… | |||||
Not Done Inline ActionsFrom what I can remember, it was motivated by how we were storing these per-queue stats in the VSI struct instead of storing them in each queue struct. We're incrementing the stats in the hot path, so we didn't want to take some sort of per-VSI lock to protect the counters while they're being incremented. The counter(9) functionality looked like it would get around that requirement. We could've stored each counter in each queue, but I think we were concerned about how much memory that could use and how it could be annoying to see what's incremented across many queues. Though, thinking about the per CPU memory usage, it might not be that bad to keep them in a per-queue struct. We also did have an implementation that actually periodically collected the stats from each queue (in the update_admin_status() function I think) and it added each queue counter to the VSI (so that we could just stick with uint64_t's), but I think that ran into some other problem. It might have been that we would've had to take a lock on each Tx/Rx ring to get the counter values, and that didn't seem like a good idea. All that said, these were intended to help debug problems and validate that the hardware checksums were working; if it seems like there might be more harm than good or if they need to be reimplemented, it might be easier to just leave it out or have it compiled out upstream. erj: From what I can remember, it was motivated by how we were storing these per-queue stats in the… | |||||
Not Done Inline ActionsI really believe the counters should be stored per-ring, and summed without a lock. gallatin: I really believe the counters should be stored per-ring, and summed without a lock. | |||||
#ifdef INET | |||||
case ETHERTYPE_IP: | |||||
if (pi->ipi_csum_flags & ICE_CSUM_IP) { | |||||
*cmd |= ICE_TX_DESC_CMD_IIPT_IPV4_CSUM; | |||||
txq->stats.cso[ICE_CSO_STAT_TX_IP4]++; | |||||
remaining_csum_flags &= ~CSUM_IP; | |||||
} else | |||||
*cmd |= ICE_TX_DESC_CMD_IIPT_IPV4; | |||||
break; | |||||
#endif | |||||
#ifdef INET6 | |||||
case ETHERTYPE_IPV6: | |||||
*cmd |= ICE_TX_DESC_CMD_IIPT_IPV6; | |||||
/* | |||||
* This indicates that the IIPT flag was set to the IPV6 value; | |||||
* there's no checksum for IPv6 packets. | |||||
*/ | |||||
txq->stats.cso[ICE_CSO_STAT_TX_IP6]++; | |||||
break; | |||||
#endif | |||||
default: | |||||
txq->stats.cso[ICE_CSO_STAT_TX_L3_ERR]++; | |||||
break; | |||||
} | |||||
*off |= (pi->ipi_ehdrlen >> 1) << ICE_TX_DESC_LEN_MACLEN_S; | |||||
*off |= (pi->ipi_ip_hlen >> 2) << ICE_TX_DESC_LEN_IPLEN_S; | |||||
if (!(remaining_csum_flags & ~ICE_RX_CSUM_FLAGS)) | |||||
return; | |||||
switch (pi->ipi_ipproto) { | |||||
case IPPROTO_TCP: | |||||
if (pi->ipi_csum_flags & ICE_CSUM_TCP) { | |||||
*cmd |= ICE_TX_DESC_CMD_L4T_EOFT_TCP; | |||||
*off |= (pi->ipi_tcp_hlen >> 2) << | |||||
ICE_TX_DESC_LEN_L4_LEN_S; | |||||
txq->stats.cso[ICE_CSO_STAT_TX_TCP]++; | |||||
} | |||||
break; | |||||
case IPPROTO_UDP: | |||||
if (pi->ipi_csum_flags & ICE_CSUM_UDP) { | |||||
*cmd |= ICE_TX_DESC_CMD_L4T_EOFT_UDP; | |||||
*off |= (sizeof(struct udphdr) >> 2) << | |||||
ICE_TX_DESC_LEN_L4_LEN_S; | |||||
txq->stats.cso[ICE_CSO_STAT_TX_UDP]++; | |||||
} | |||||
break; | |||||
case IPPROTO_SCTP: | |||||
if (pi->ipi_csum_flags & ICE_CSUM_SCTP) { | |||||
*cmd |= ICE_TX_DESC_CMD_L4T_EOFT_SCTP; | |||||
*off |= (sizeof(struct sctphdr) >> 2) << | |||||
ICE_TX_DESC_LEN_L4_LEN_S; | |||||
txq->stats.cso[ICE_CSO_STAT_TX_SCTP]++; | |||||
} | |||||
break; | |||||
default: | |||||
txq->stats.cso[ICE_CSO_STAT_TX_L4_ERR]++; | |||||
break; | |||||
} | |||||
} | |||||
/** | |||||
* ice_rx_checksum - verify hardware checksum is valid or not | |||||
* @rxq: the Rx queue structure | |||||
* @flags: checksum flags to update | |||||
* @data: checksum data to update | |||||
* @status0: descriptor status data | |||||
* @ptype: packet type | |||||
* | |||||
* Determine whether the hardware indicated that the Rx checksum is valid. If | |||||
* so, update the checksum flags and data, informing the stack of the status | |||||
* of the checksum so that it does not spend time verifying it manually. | |||||
*/ | |||||
static void | |||||
ice_rx_checksum(struct ice_rx_queue *rxq, uint32_t *flags, uint32_t *data, | |||||
u16 status0, u16 ptype) | |||||
{ | |||||
const u16 l3_error = (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S) | | |||||
BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S)); | |||||
const u16 l4_error = (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_L4E_S) | | |||||
BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EUDPE_S)); | |||||
const u16 xsum_errors = (l3_error | l4_error | | |||||
BIT(ICE_RX_FLEX_DESC_STATUS0_IPV6EXADD_S)); | |||||
struct ice_rx_ptype_decoded decoded; | |||||
bool is_ipv4, is_ipv6; | |||||
/* No L3 or L4 checksum was calculated */ | |||||
if (!(status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S))) { | |||||
return; | |||||
} | |||||
decoded = ice_decode_rx_desc_ptype(ptype); | |||||
*flags = 0; | |||||
if (!(decoded.known && decoded.outer_ip)) | |||||
return; | |||||
is_ipv4 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && | |||||
(decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV4); | |||||
is_ipv6 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && | |||||
(decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV6); | |||||
/* No checksum errors were reported */ | |||||
if (!(status0 & xsum_errors)) { | |||||
if (is_ipv4) | |||||
*flags |= CSUM_L3_CALC | CSUM_L3_VALID; | |||||
switch (decoded.inner_prot) { | |||||
case ICE_RX_PTYPE_INNER_PROT_TCP: | |||||
case ICE_RX_PTYPE_INNER_PROT_UDP: | |||||
case ICE_RX_PTYPE_INNER_PROT_SCTP: | |||||
*flags |= CSUM_L4_CALC | CSUM_L4_VALID; | |||||
*data |= htons(0xffff); | |||||
break; | |||||
default: | |||||
break; | |||||
} | |||||
return; | |||||
} | |||||
/* | |||||
* Certain IPv6 extension headers impact the validity of L4 checksums. | |||||
* If one of these headers exist, hardware will set the IPV6EXADD bit | |||||
* in the descriptor. If the bit is set then pretend like hardware | |||||
* didn't checksum this packet. | |||||
*/ | |||||
if (is_ipv6 && (status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_IPV6EXADD_S))) { | |||||
rxq->stats.cso[ICE_CSO_STAT_RX_IP6_ERR]++; | |||||
return; | |||||
} | |||||
/* | |||||
* At this point, status0 must have at least one of the l3_error or | |||||
* l4_error bits set. | |||||
*/ | |||||
if (status0 & l3_error) { | |||||
if (is_ipv4) { | |||||
Not Done Inline ActionsI hate this. Maybe I'm just old fashioned, but I'd check for the 2 most common cases first (good checksum, or no checksum at all), first and likely not even increment counters for them. Eg, find all the status0 bits that would be set for a good packet, if they are all set, set the flags in one go and the data to 0xffff and be done. No counters. Then I'd decode all the fiddly stuff for the error cases and increment counters there. gallatin: I hate this.
Maybe I'm just old fashioned, but I'd check for the 2 most common cases first… | |||||
Done Inline ActionsMakes some sense. jacob.e.keller_intel.com: Makes some sense. | |||||
Not Done Inline ActionsIt does seem like a good change to make, but I wonder if it'll have a positive performance impact. erj: It does seem like a good change to make, but I wonder if it'll have a positive performance… | |||||
rxq->stats.cso[ICE_CSO_STAT_RX_IP4_ERR]++; | |||||
*flags |= CSUM_L3_CALC; | |||||
} else { | |||||
/* Hardware indicated L3 error but this isn't IPv4? */ | |||||
rxq->stats.cso[ICE_CSO_STAT_RX_L3_ERR]++; | |||||
} | |||||
/* don't bother reporting L4 errors if we got an L3 error */ | |||||
return; | |||||
} else if (is_ipv4) { | |||||
*flags |= CSUM_L3_CALC | CSUM_L3_VALID; | |||||
} | |||||
if (status0 & l4_error) { | |||||
switch (decoded.inner_prot) { | |||||
case ICE_RX_PTYPE_INNER_PROT_TCP: | |||||
rxq->stats.cso[ICE_CSO_STAT_RX_TCP_ERR]++; | |||||
*flags |= CSUM_L4_CALC; | |||||
break; | |||||
case ICE_RX_PTYPE_INNER_PROT_UDP: | |||||
rxq->stats.cso[ICE_CSO_STAT_RX_UDP_ERR]++; | |||||
*flags |= CSUM_L4_CALC; | |||||
break; | |||||
case ICE_RX_PTYPE_INNER_PROT_SCTP: | |||||
rxq->stats.cso[ICE_CSO_STAT_RX_SCTP_ERR]++; | |||||
*flags |= CSUM_L4_CALC; | |||||
break; | |||||
default: | |||||
/* | |||||
* Hardware indicated L4 error, but this isn't one of | |||||
* the expected protocols. | |||||
*/ | |||||
rxq->stats.cso[ICE_CSO_STAT_RX_L4_ERR]++; | |||||
} | |||||
} | |||||
} | |||||
/** | |||||
* ice_ptype_to_hash - Convert packet type to a hash value | |||||
* @ptype: the packet type to convert | |||||
* | |||||
* Given the packet type, convert to a suitable hashtype to report to the | |||||
* upper stack via the iri_rsstype value of the if_rxd_info_t structure. | |||||
* | |||||
* If the hash type is unknown we'll report M_HASHTYPE_OPAQUE. | |||||
*/ | |||||
static inline int | |||||
ice_ptype_to_hash(u16 ptype) | |||||
{ | |||||
struct ice_rx_ptype_decoded decoded; | |||||
if (ptype >= ARRAY_SIZE(ice_ptype_lkup)) | |||||
return M_HASHTYPE_OPAQUE; | |||||
decoded = ice_decode_rx_desc_ptype(ptype); | |||||
if (!decoded.known) | |||||
return M_HASHTYPE_OPAQUE; | |||||
if (decoded.outer_ip == ICE_RX_PTYPE_OUTER_L2) | |||||
return M_HASHTYPE_OPAQUE; | |||||
/* Note: anything that gets to this point is IP */ | |||||
if (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV6) { | |||||
switch (decoded.inner_prot) { | |||||
case ICE_RX_PTYPE_INNER_PROT_TCP: | |||||
return M_HASHTYPE_RSS_TCP_IPV6; | |||||
case ICE_RX_PTYPE_INNER_PROT_UDP: | |||||
return M_HASHTYPE_RSS_UDP_IPV6; | |||||
default: | |||||
return M_HASHTYPE_RSS_IPV6; | |||||
} | |||||
} | |||||
if (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV4) { | |||||
switch (decoded.inner_prot) { | |||||
case ICE_RX_PTYPE_INNER_PROT_TCP: | |||||
return M_HASHTYPE_RSS_TCP_IPV4; | |||||
case ICE_RX_PTYPE_INNER_PROT_UDP: | |||||
return M_HASHTYPE_RSS_UDP_IPV4; | |||||
default: | |||||
return M_HASHTYPE_RSS_IPV4; | |||||
} | |||||
} | |||||
/* We should never get here!! */ | |||||
return M_HASHTYPE_OPAQUE; | |||||
} | |||||
#endif /* _ICE_COMMON_TXRX_H_ */
Can you remove or properly comment these lines (eg, // is frowned upon). When I sent you this for ixl, I never intended those lines be kept.
Also, very sad/disappointed that ice has this horrible restriction that makes iflib's transmit routine more complex.