diff --git a/share/man/man4/epair.4 b/share/man/man4/epair.4 --- a/share/man/man4/epair.4 +++ b/share/man/man4/epair.4 @@ -25,7 +25,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd March 18, 2015 +.Dd July 28, 2025 .Dt EPAIR 4 .Os .Sh NAME @@ -96,6 +96,24 @@ can have a .Xr vlan 4 configured on top of it. +.Pp +The +.Nm +supports +TXCSUM and TXCSUM6 (for TCP and UDP) as well as RXCSUM and RXCSUM6, +but only by forwarding the order to compute the checksum or the information +that the checksum has been validated. +Thus, when using an +.Nm +interface, a sender can offload checksum computation +to a physical interface, and a receiver can skip validating the checksum if a +physical interface did it. +Note that, in case the packet does not leave the host, the checksum is +unnecessary and will be ignored if offloaded. +The TXCSUM and TXCSUM6 capabilities are synchronized between the +.Nm +interface pair (i.e., enabling/disabling the capability on one end +enables/disables it on the other end). .Sh SEE ALSO .Xr ioctl 2 , .Xr altq 4 , diff --git a/share/man/man9/mbuf.9 b/share/man/man9/mbuf.9 --- a/share/man/man9/mbuf.9 +++ b/share/man/man9/mbuf.9 @@ -22,7 +22,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd December 28, 2023 +.Dd July 28, 2025 .Dt MBUF 9 .Os .\" @@ -1203,6 +1203,17 @@ calculated over any valid packet will be .Li 0xFFFF as long as the original checksum field is included. +.Pp +Note that the flag +.Dv CSUM_TCP , +.Dv CSUM_UDP , +or +.Dv CSUM_SCTP +can appear on input if a packet sent by the local host with checksum +offloading switched to the input path (e.g., due to a virtual interface +such as tap or epair). +The TCP, UDP, or SCTP checksum is still incorrect but will be ignored because +the packet has not been on the wire. .Sh STRESS TESTING When running a kernel compiled with the option .Dv MBUF_STRESS_TEST , diff --git a/sys/net/if_epair.c b/sys/net/if_epair.c --- a/sys/net/if_epair.c +++ b/sys/net/if_epair.c @@ -66,9 +66,9 @@ #include #include #include -#include #include #include +#include #include #ifdef RSS #include @@ -96,6 +96,7 @@ #define EPAIR_LOCK_DESTROY() mtx_destroy(&epair_n_index_mtx) #define EPAIR_LOCK() mtx_lock(&epair_n_index_mtx) #define EPAIR_UNLOCK() mtx_unlock(&epair_n_index_mtx) +#define EPAIR_LOCK_ASSERT() mtx_assert(&epair_n_index_mtx, MA_OWNED); struct epair_softc; struct epair_queue { @@ -425,6 +426,23 @@ imr->ifm_active = IFM_ETHER | IFM_10G_T | IFM_FDX; } +/* + * To be called under EPAIR_LOCK. Update ifp->if_hwassist according to the + * current value of ifp->if_capenable. + */ +static void +epair_caps_changed(struct ifnet *ifp) +{ + uint64_t hwassist = 0; + + EPAIR_LOCK_ASSERT(); + if (ifp->if_capenable & IFCAP_TXCSUM) + hwassist |= CSUM_IP_TCP | CSUM_IP_UDP; + if (ifp->if_capenable & IFCAP_TXCSUM_IPV6) + hwassist |= CSUM_IP6_TCP | CSUM_IP6_UDP; + ifp->if_hwassist = hwassist; +} + static int epair_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) { @@ -452,6 +470,33 @@ error = 0; break; + case SIOCGIFCAP: + ifr->ifr_reqcap = ifp->if_capabilities; + ifr->ifr_curcap = ifp->if_capenable; + error = 0; + break; + case SIOCSIFCAP: + sc = ifp->if_softc; + EPAIR_LOCK(); + ifp->if_capenable = ifr->ifr_reqcap & ifp->if_capabilities; + epair_caps_changed(ifp); + /* + * If IFCAP_TXCSUM(_IPV6) has been changed, change it on the + * other epair interface as well. + */ + if ((ifp->if_capenable ^ sc->oifp->if_capenable) & + (IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6)) { + sc->oifp->if_capenable &= + ~(IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6); + sc->oifp->if_capenable |= ifp->if_capenable & + (IFCAP_TXCSUM | IFCAP_TXCSUM_IPV6); + epair_caps_changed(sc->oifp); + } + EPAIR_UNLOCK(); + VLAN_CAPABILITIES(ifp); + error = 0; + break; + default: /* Let the common ethernet handler process this. */ error = ether_ioctl(ifp, cmd, data); @@ -549,8 +594,13 @@ ifp->if_dname = epairname; ifp->if_dunit = unit; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; - ifp->if_capabilities = IFCAP_VLAN_MTU; - ifp->if_capenable = IFCAP_VLAN_MTU; + EPAIR_LOCK(); + ifp->if_capabilities = IFCAP_VLAN_MTU | IFCAP_TXCSUM | + IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6; + ifp->if_capenable = IFCAP_VLAN_MTU | IFCAP_TXCSUM | + IFCAP_TXCSUM_IPV6 | IFCAP_RXCSUM | IFCAP_RXCSUM_IPV6; + epair_caps_changed(ifp); + EPAIR_UNLOCK(); ifp->if_transmit = epair_transmit; ifp->if_qflush = epair_qflush; ifp->if_start = epair_start; diff --git a/sys/netinet/ip_fastfwd.c b/sys/netinet/ip_fastfwd.c --- a/sys/netinet/ip_fastfwd.c +++ b/sys/netinet/ip_fastfwd.c @@ -69,6 +69,7 @@ #include #include "opt_ipstealth.h" +#include "opt_sctp.h" #include #include @@ -102,6 +103,10 @@ #include +#if defined(SCTP) || defined(SCTP_SUPPORT) +#include +#endif + #define V_ipsendredirects VNET(ipsendredirects) static struct mbuf * @@ -460,6 +465,23 @@ } else gw = (const struct sockaddr *)dst; + /* + * If TCP/UDP header still needs a valid checksum and interface will not + * calculate it for us, do it here. + */ + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & + ~nh->nh_ifp->if_hwassist)) { + in_delayed_cksum(m); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; + } +#if defined(SCTP) || defined(SCTP_SUPPORT) + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_IP_SCTP & + ~nh->nh_ifp->if_hwassist)) { + sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); + m->m_pkthdr.csum_flags &= ~CSUM_IP_SCTP; + } +#endif + /* Handle redirect case. */ redest.s_addr = 0; if (V_ipsendredirects && osrc.s_addr == ip->ip_src.s_addr && diff --git a/sys/netinet/sctp_input.c b/sys/netinet/sctp_input.c --- a/sys/netinet/sctp_input.c +++ b/sys/netinet/sctp_input.c @@ -5780,7 +5780,11 @@ goto out; } ecn_bits = ip->ip_tos; - if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) { + if (m->m_pkthdr.csum_flags & (CSUM_SCTP_VALID | CSUM_IP_SCTP)) { + /* + * Packet with CSUM_IP_SCTP were sent from local host using + * checksum offloading. Checksum not required. + */ SCTP_STAT_INCR(sctps_recvhwcrc); compute_crc = 0; } else { diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -649,6 +649,12 @@ th->th_sum = in6_cksum_pseudo(ip6, tlen, IPPROTO_TCP, m->m_pkthdr.csum_data); th->th_sum ^= 0xffff; + } else if (m->m_pkthdr.csum_flags & CSUM_IP6_TCP) { + /* + * Packet from local host (maybe from a VM). + * Checksum not required. + */ + th->th_sum = 0; } else th->th_sum = in6_cksum(m, IPPROTO_TCP, off0, tlen); if (th->th_sum) { @@ -709,6 +715,12 @@ htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); th->th_sum ^= 0xffff; + } else if (m->m_pkthdr.csum_flags & CSUM_IP_TCP) { + /* + * Packet from local host (maybe from a VM). + * Checksum not required. + */ + th->th_sum = 0; } else { struct ipovly *ipov = (struct ipovly *)ip; diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -555,6 +555,12 @@ ip->ip_dst.s_addr, htonl((u_short)len + m->m_pkthdr.csum_data + proto)); uh_sum ^= 0xffff; + } else if (m->m_pkthdr.csum_flags & CSUM_IP_UDP) { + /* + * Packet from local host (maybe from a VM). + * Checksum not required. + */ + uh_sum = 0; } else { char b[offsetof(struct ipovly, ih_src)]; struct ipovly *ipov = (struct ipovly *)ip; diff --git a/sys/netinet6/ip6_fastfwd.c b/sys/netinet6/ip6_fastfwd.c --- a/sys/netinet6/ip6_fastfwd.c +++ b/sys/netinet6/ip6_fastfwd.c @@ -27,6 +27,7 @@ #include #include "opt_inet6.h" #include "opt_ipstealth.h" +#include "opt_sctp.h" #include #include @@ -54,6 +55,10 @@ #include #include +#if defined(SCTP) || defined(SCTP_SUPPORT) +#include +#endif + static int ip6_findroute(struct nhop_object **pnh, const struct sockaddr_in6 *dst, struct mbuf *m) @@ -277,6 +282,29 @@ ip6->ip6_hlim -= IPV6_HLIMDEC; } + /* + * If TCP/UDP header still needs a valid checksum and interface will not + * calculate it for us, do it here. + */ + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 & + ~nh->nh_ifp->if_hwassist)) { + int offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, NULL); + + if (offset < sizeof(struct ip6_hdr) || offset > m->m_pkthdr.len) + goto drop; + in6_delayed_cksum(m, m->m_pkthdr.len - offset, offset); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; + } +#if defined(SCTP) || defined(SCTP_SUPPORT) + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_IP6_SCTP & + ~nh->nh_ifp->if_hwassist)) { + int offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, NULL); + + sctp_delayed_cksum(m, offset); + m->m_pkthdr.csum_flags &= ~CSUM_IP6_SCTP; + } +#endif + m_clrprotoflags(m); /* Avoid confusing lower layers. */ IP_PROBE(send, NULL, NULL, ip6, nh->nh_ifp, NULL, ip6); diff --git a/sys/netinet6/ip6_forward.c b/sys/netinet6/ip6_forward.c --- a/sys/netinet6/ip6_forward.c +++ b/sys/netinet6/ip6_forward.c @@ -75,6 +75,10 @@ #include +#if defined(SCTP) || defined(SCTP_SUPPORT) +#include +#endif + /* * Forward a packet. If some error occurs return the sender * an icmp packet. Note we can't always generate a meaningful @@ -389,6 +393,29 @@ goto bad; } + /* + * If TCP/UDP header still needs a valid checksum and interface will not + * calculate it for us, do it here. + */ + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6 & + ~nh->nh_ifp->if_hwassist)) { + int offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, NULL); + + if (offset < sizeof(struct ip6_hdr) || offset > m->m_pkthdr.len) + goto bad; + in6_delayed_cksum(m, m->m_pkthdr.len - offset, offset); + m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; + } +#if defined(SCTP) || defined(SCTP_SUPPORT) + if (__predict_false(m->m_pkthdr.csum_flags & CSUM_IP6_SCTP & + ~nh->nh_ifp->if_hwassist)) { + int offset = ip6_lasthdr(m, 0, IPPROTO_IPV6, NULL); + + sctp_delayed_cksum(m, offset); + m->m_pkthdr.csum_flags &= ~CSUM_IP6_SCTP; + } +#endif + /* Currently LLE layer stores embedded IPv6 addresses */ if (IN6_IS_SCOPE_LINKLOCAL(&dst.sin6_addr)) { in6_set_unicast_scopeid(&dst.sin6_addr, dst.sin6_scope_id); diff --git a/sys/netinet6/sctp6_usrreq.c b/sys/netinet6/sctp6_usrreq.c --- a/sys/netinet6/sctp6_usrreq.c +++ b/sys/netinet6/sctp6_usrreq.c @@ -139,7 +139,11 @@ goto out; } ecn_bits = IPV6_TRAFFIC_CLASS(ip6); - if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) { + if (m->m_pkthdr.csum_flags & (CSUM_SCTP_VALID | CSUM_IP6_SCTP)) { + /* + * Packet with CSUM_IP6_SCTP were sent from local host using + * checksum offloading. Checksum not required. + */ SCTP_STAT_INCR(sctps_recvhwcrc); compute_crc = 0; } else { diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c --- a/sys/netinet6/udp6_usrreq.c +++ b/sys/netinet6/udp6_usrreq.c @@ -429,6 +429,12 @@ uh_sum = in6_cksum_pseudo(ip6, ulen, nxt, m->m_pkthdr.csum_data); uh_sum ^= 0xffff; + } else if (m->m_pkthdr.csum_flags & CSUM_IP6_UDP) { + /* + * Packet from local host (maybe from a VM). + * Checksum not required. + */ + uh_sum = 0; } else uh_sum = in6_cksum_partial(m, nxt, off, plen, ulen); diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -647,10 +647,10 @@ * Outbound flags that are set by upper protocol layers requesting lower * layers, or ideally the hardware, to perform these offloading tasks. * For outbound packets this field and its flags can be directly tested - * against ifnet if_hwassist. Note that the outbound and the inbound flags do - * not collide right now but they could be allowed to (as long as the flags are - * scrubbed appropriately when the direction of an mbuf changes). CSUM_BITS - * would also have to split into CSUM_BITS_TX and CSUM_BITS_RX. + * against ifnet if_hwassist. Note that outbound flags CSUM_IP_UDP, + * CSUM_IP_TCP, and CSUM_IP_SCTP can appear on an inbound packet if the mbuf + * changed the direction. In such a case the checksum is still incorrect but + * TCP, UDP, or SCTP ignores that since the packet has not been on the wire. * * CSUM_INNER_ is the same as CSUM_ but it applies to the inner frame. * The CSUM_ENCAP_ bits identify the outer encapsulation.