diff --git a/share/man/man9/mbuf.9 b/share/man/man9/mbuf.9 --- a/share/man/man9/mbuf.9 +++ b/share/man/man9/mbuf.9 @@ -22,7 +22,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd December 28, 2023 +.Dd July 25, 2025 .Dt MBUF 9 .Os .\" @@ -1091,7 +1091,7 @@ altered prior to transmission. .El .Sh HARDWARE-ASSISTED CHECKSUM CALCULATION -This section currently applies to TCP/IP only. +This section currently applies to TCP/IP and SCTP only. In order to save the host CPU resources, computing checksums is offloaded to the network interface hardware if possible. The @@ -1135,34 +1135,29 @@ .Va csum_flags . .Pp The flags demanding a particular action from an interface are as follows: -.Bl -tag -width ".Dv CSUM_TCP" -offset indent +.Bl -ohang -offset indent .It Dv CSUM_IP The IP header checksum is to be computed and stored in the corresponding field of the packet. The hardware is expected to know the format of an IP header to determine the offset of the IP checksum field. -.It Dv CSUM_TCP -The TCP checksum is to be computed. -(See below.) -.It Dv CSUM_UDP -The UDP checksum is to be computed. -(See below.) -.El -.Pp -Should a TCP or UDP checksum be offloaded to the hardware, -the field +.It Dv CSUM_IP_TCP CSUM_IP_UDP CSUM_IP_SCTP +The TCP, UDP, or SCTP checksum is to be computed and stored in the +corresponding field of the packet. +To assist the hardware, the field .Va csum_data will contain the byte offset of the checksum field relative to the end of the IP header. In this case, the checksum field will be initially set by the TCP/IP module to the checksum of the pseudo header -defined by the TCP and UDP specifications. +defined by the TCP and UDP specifications or by the SCTP module to zero. 
+.El .Pp On input, an interface indicates the actions it has performed on a packet by setting one or more of the following flags in .Va csum_flags associated with the packet: -.Bl -tag -width ".Dv CSUM_IP_CHECKED" -offset indent +.Bl -ohang -offset indent .It Dv CSUM_IP_CHECKED The IP header checksum has been computed. .It Dv CSUM_IP_VALID @@ -1203,6 +1198,17 @@ calculated over any valid packet will be .Li 0xFFFF as long as the original checksum field is included. +.Pp +Note that the flag +.Dv CSUM_IP_TCP , +.Dv CSUM_IP_UDP , +or +.Dv CSUM_IP_SCTP +can appear on input if a packet sent by the local host with checksum +offloading is switched to the input path (e.g., due to a virtual interface +such as tap or epair). +The TCP, UDP, or SCTP checksum is still incorrect but will be ignored because +the packet has not been on the wire. .Sh STRESS TESTING When running a kernel compiled with the option .Dv MBUF_STRESS_TEST , diff --git a/sys/net/if_epair.c b/sys/net/if_epair.c --- a/sys/net/if_epair.c +++ b/sys/net/if_epair.c @@ -66,9 +66,9 @@ #include #include #include -#include #include #include +#include #include #ifdef RSS #include @@ -96,6 +96,7 @@ #define EPAIR_LOCK_DESTROY() mtx_destroy(&epair_n_index_mtx) #define EPAIR_LOCK() mtx_lock(&epair_n_index_mtx) #define EPAIR_UNLOCK() mtx_unlock(&epair_n_index_mtx) +#define EPAIR_LOCK_ASSERT() mtx_assert(&epair_n_index_mtx, MA_OWNED) struct epair_softc; struct epair_queue { @@ -425,6 +426,23 @@ imr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX; } +/* + * To be called under EPAIR_LOCK. Update ifp->if_hwassist according to the + * current value of ifp->if_capenable.
+ */ +static void +epair_caps_changed(struct ifnet *ifp) +{ + uint64_t hwassist = 0; + + EPAIR_LOCK_ASSERT(); + if (ifp->if_capenable & IFCAP_TXCSUM) + hwassist |= CSUM_IP_TCP | CSUM_IP_UDP; + if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) + hwassist |= CSUM_IP6_TCP | CSUM_IP6_UDP; + ifp->if_hwassist = hwassist; +} + static int epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { @@ -452,6 +470,33 @@ error = 0; break; + case SIOCGIFCAP: + ifr->ifr_reqcap = ifp->if_capabilities; + ifr->ifr_curcap = ifp->if_capenable; + error = 0; + break; + case SIOCSIFCAP: + sc = ifp->if_softc; + EPAIR_LOCK(); + ifp->if_capenable = ifr->ifr_reqcap & ifp->if_capabilities; + epair_caps_changed(ifp); + /* + * If IFCAP_TXCSUM(_IPV6) has been changed, change it on the + * other epair interface as well. + */ + if ((ifp->if_capenable ^ sc->oifp->if_capenable) & + (IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6)) { + sc->oifp->if_capenable &= + ~(IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6); + sc->oifp->if_capenable |= ifp->if_capenable & + (IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6); + epair_caps_changed(sc->oifp); + } + EPAIR_UNLOCK(); + VLAN_CAPABILITIES(ifp); + error = 0; + break; + default: /* Let the common ethernet handler process this. 
*/ error = ether_ioctl(ifp, cmd, data); @@ -549,8 +594,12 @@ ifp->if_dname = epairname; ifp->if_dunit = unit; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; - ifp->if_capabilities = IFCAP_VLAN_MTU; - ifp->if_capenable = IFCAP_VLAN_MTU; + EPAIR_LOCK(); + ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_TXCSUM | + IFCAP_TXCSUM_IPV6; + ifp->if_capenable = IFCAP_VLAN_MTU | IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6; + epair_caps_changed(ifp); + EPAIR_UNLOCK(); ifp->if_transmit = epair_transmit; ifp->if_qflush = epair_qflush; ifp->if_start = epair_start; diff --git a/sys/netinet/ip_fastfwd.c b/sys/netinet/ip_fastfwd.c --- a/sys/netinet/ip_fastfwd.c +++ b/sys/netinet/ip_fastfwd.c @@ -69,6 +69,7 @@ #include #include "opt_ipstealth.h" +#include "opt_sctp.h" #include #include @@ -102,6 +103,10 @@ #include +#if defined(SCTP) || defined(SCTP_SUPPORT) +#include +#endif + #define V_ipsendredirects VNET(ipsendredirects) static struct mbuf * @@ -460,6 +465,23 @@ } else gw = (const struct sockaddr *)dst; + /* + * If TCP/UDP header still needs a valid checksum and interface will not + * calculate it for us, do it here. + */ + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & + ~nh->nh_ifp->if_hwassist)) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } +#if defined(SCTP) || defined(SCTP_SUPPORT) + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_IP_SCTP & + ~nh->nh_ifp->if_hwassist)) { + sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); + m->m_pkthdr.csum_flags &= ~CSUM_IP_SCTP; + } +#endif + /* Handle redirect case. */ redest.s_addr = 0; if (V_ipsendredirects && osrc.s_addr == ip->ip_src.s_addr && diff --git a/sys/netinet/sctp_input.c b/sys/netinet/sctp_input.c --- a/sys/netinet/sctp_input.c +++ b/sys/netinet/sctp_input.c @@ -5783,6 +5783,13 @@ if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) { SCTP_STAT_INCR(sctps_recvhwcrc); compute_crc = 0; + } else if (m->m_pkthdr.csum_flags & CSUM_IP_SCTP) { + /* + * Packet from local host (maybe from a VM).
+ * Checksum not required. + */ + SCTP_STAT_INCR(sctps_recvzerocrc); + compute_crc = 0; } else { SCTP_STAT_INCR(sctps_recvswcrc); compute_crc = 1; diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -649,6 +649,12 @@ th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; + } else if (m->m_pkthdr.csum_flags & CSUM_IP6_TCP) { + /* + * Packet from local host (maybe from a VM). + * Checksum not required. + */ + th->th_sum = 0; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); if (th->th_sum) { @@ -709,6 +715,12 @@ htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; + } else if (m->m_pkthdr.csum_flags & CSUM_IP_TCP) { + /* + * Packet from local host (maybe from a VM). + * Checksum not required. + */ + th->th_sum = 0; } else { struct ipovly *ipov = (struct ipovly *)ip; diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -555,6 +555,12 @@ ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + proto)); uh_sum ^= 0xffff; + } else if (m->m_pkthdr.csum_flags & CSUM_IP_UDP) { + /* + * Packet from local host (maybe from a VM). + * Checksum not required. 
+ */ + uh_sum = 0; } else { char b[offsetof(struct ipovly, ih_src)]; struct ipovly *ipov = (struct ipovly *)ip; diff --git a/sys/netinet6/ip6_fastfwd.c b/sys/netinet6/ip6_fastfwd.c --- a/sys/netinet6/ip6_fastfwd.c +++ b/sys/netinet6/ip6_fastfwd.c @@ -27,6 +27,7 @@ #include #include "opt_inet6.h" #include "opt_ipstealth.h" +#include "opt_sctp.h" #include #include @@ -54,6 +55,10 @@ #include #include +#if defined(SCTP) || defined(SCTP_SUPPORT) +#include +#endif + static int ip6_findroute(struct nhop_object **pnh, const struct sockaddr_in6 *dst, struct mbuf *m) @@ -277,6 +282,27 @@ ip6->ip6_hlim -= IPV6_HLIMDEC; } + /* + * If TCP/UDP header still needs a valid checksum and interface will not + * calculate it for us, do it here. + */ + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 & + ~nh->nh_ifp->if_hwassist)) { + u_short offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, NULL); + if (offset < sizeof(struct ip6_hdr) || offset > m->m_pkthdr.len) + goto drop; + in6_delayed_cksum(m, m->m_pkthdr.len - offset, offset); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; + } +#if defined(SCTP) || defined(SCTP_SUPPORT) + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_IP6_SCTP & + ~nh->nh_ifp->if_hwassist)) { + uint32_t offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, NULL); + sctp_delayed_cksum(m, offset); + m->m_pkthdr.csum_flags &= ~CSUM_IP6_SCTP; + } +#endif + m_clrprotoflags(m); /* Avoid confusing lower layers. */ IP_PROBE(send, NULL, NULL, ip6, nh->nh_ifp, NULL, ip6); diff --git a/sys/netinet6/ip6_forward.c b/sys/netinet6/ip6_forward.c --- a/sys/netinet6/ip6_forward.c +++ b/sys/netinet6/ip6_forward.c @@ -75,6 +75,10 @@ #include +#if defined(SCTP) || defined(SCTP_SUPPORT) +#include +#endif + /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful @@ -389,6 +393,27 @@ goto bad; } + /* + * If TCP/UDP header still needs a valid checksum and interface will not + * calculate it for us, do it here. 
+ */ + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 & + ~nh->nh_ifp->if_hwassist)) { + u_short offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, NULL); + if (offset < sizeof(struct ip6_hdr) || offset > m->m_pkthdr.len) + goto bad; + in6_delayed_cksum(m, m->m_pkthdr.len - offset, offset); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; + } +#if defined(SCTP) || defined(SCTP_SUPPORT) + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_IP6_SCTP & + ~nh->nh_ifp->if_hwassist)) { + uint32_t offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, NULL); + sctp_delayed_cksum(m, offset); + m->m_pkthdr.csum_flags &= ~CSUM_IP6_SCTP; + } +#endif + /* Currently LLE layer stores embedded IPv6 addresses */ if (IN6_IS_SCOPE_LINKLOCAL(&dst.sin6_addr)) { in6_set_unicast_scopeid(&dst.sin6_addr, dst.sin6_scope_id); diff --git a/sys/netinet6/sctp6_usrreq.c b/sys/netinet6/sctp6_usrreq.c --- a/sys/netinet6/sctp6_usrreq.c +++ b/sys/netinet6/sctp6_usrreq.c @@ -142,6 +142,13 @@ if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) { SCTP_STAT_INCR(sctps_recvhwcrc); compute_crc = 0; + } else if (m->m_pkthdr.csum_flags & CSUM_IP6_SCTP) { + /* + * Packet from local host (maybe from a VM). + * Checksum not required. + */ + SCTP_STAT_INCR(sctps_recvzerocrc); + compute_crc = 0; } else { SCTP_STAT_INCR(sctps_recvswcrc); compute_crc = 1; diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -429,6 +429,12 @@ uh_sum = in6_cksum_pseudo(ip6, ulen, nxt, m->m_pkthdr.csum_data); uh_sum ^= 0xffff; + } else if (m->m_pkthdr.csum_flags & CSUM_IP6_UDP) { + /* + * Packet from local host (maybe from a VM). + * Checksum not required.
+ */ + uh_sum = 0; } else uh_sum = in6_cksum_partial(m, nxt, off, plen, ulen); diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -647,10 +647,10 @@ * Outbound flags that are set by upper protocol layers requesting lower * layers, or ideally the hardware, to perform these offloading tasks. * For outbound packets this field and its flags can be directly tested - * against ifnet if_hwassist. Note that the outbound and the inbound flags do - * not collide right now but they could be allowed to (as long as the flags are - * scrubbed appropriately when the direction of an mbuf changes). CSUM_BITS - * would also have to split into CSUM_BITS_TX and CSUM_BITS_RX. + * against ifnet if_hwassist. Note that outbound flags CSUM_IP_UDP, + * CSUM_IP_TCP, and CSUM_IP_SCTP can appear on an inbound packet if the mbuf + * changed the direction. In such a case the checksum is still incorrect but + * TCP, UDP, or SCTP ignores that since the packet has not been on the wire. * * CSUM_INNER_ is the same as CSUM_ but it applies to the inner frame. * The CSUM_ENCAP_ bits identify the outer encapsulation.