diff --git a/sys/net/if_infiniband.c b/sys/net/if_infiniband.c index 19f7cdf7ffea..b644f91f2cda 100644 --- a/sys/net/if_infiniband.c +++ b/sys/net/if_infiniband.c @@ -1,551 +1,648 @@ /*- * Copyright (c) 2020 Mellanox Technologies. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_inet.h" #include "opt_inet6.h" #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* if_lagg(4) support */ struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *); #ifdef INET /* * Build the link-layer multicast address for an IPv4 group in buf[0..19]. The scope nibble comes from broadcast[5], bytes 8-9 are copied from the broadcast address, byte 6 is 0x40 and the IPv4 address (converted to host order) is stored big-endian in buf[16..19]. NOTE(review): the whole top byte of the address is written to buf[16]; confirm against the IPoIB multicast mapping specification whether only the low 28 bits should be kept. */ static inline void infiniband_ipv4_multicast_map(uint32_t addr, const uint8_t *broadcast, uint8_t *buf) { uint8_t scope; addr = ntohl(addr); scope = broadcast[5] & 0xF; buf[0] = 0; buf[1] = 0xff; buf[2] = 0xff; buf[3] = 0xff; buf[4] = 0xff; buf[5] = 0x10 | scope; buf[6] = 0x40; buf[7] = 0x1b; buf[8] = broadcast[8]; buf[9] = broadcast[9]; buf[10] = 0; buf[11] = 0; buf[12] = 0; buf[13] = 0; buf[14] = 0; buf[15] = 0; buf[16] = (addr >> 24) & 0xff; buf[17] = (addr >> 16) & 0xff; buf[18] = (addr >> 8) & 0xff; buf[19] = addr & 0xff; } #endif #ifdef INET6 /* * IPv6 counterpart of the mapping above: same first ten bytes except that byte 6 is 0x60, followed by the last 10 bytes of the IPv6 address (s6_addr[6..15]) copied into buf[10..19]. */ static inline void infiniband_ipv6_multicast_map(const struct in6_addr *addr, const uint8_t *broadcast, uint8_t *buf) { uint8_t scope; scope = broadcast[5] & 0xF; buf[0] = 0; buf[1] = 0xff; buf[2] = 0xff; buf[3] = 0xff; buf[4] = 0xff; buf[5] = 0x10 | scope; buf[6] = 0x60; buf[7] = 0x1b; buf[8] = broadcast[8]; buf[9] = broadcast[9]; memcpy(&buf[10], &addr->s6_addr[6], 10); } #endif /* * This is for clients that have an infiniband_header in the mbuf. 
*/ void infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb) { struct infiniband_header *ibh; struct ether_header eh; /* Nothing is tapped when the first mbuf does not hold a whole Infiniband header. */ if (mb->m_len < sizeof(*ibh)) return; ibh = mtod(mb, struct infiniband_header *); /* Build a stand-in Ethernet header for BPF: protocol taken from the IB header, zeroed source, destination copied from ib_hwaddr starting at byte 4. */ eh.ether_type = ibh->ib_protocol; memset(eh.ether_shost, 0, ETHER_ADDR_LEN); memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN); /* Step over the IB header for the duration of the tap, then put the mbuf back exactly as it was. */ mb->m_data += sizeof(*ibh); mb->m_len -= sizeof(*ibh); mb->m_pkthdr.len -= sizeof(*ibh); bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); mb->m_data -= sizeof(*ibh); mb->m_len += sizeof(*ibh); mb->m_pkthdr.len += sizeof(*ibh); } /* * Translate checksum flags found on src into flags OR-ed into dst: CSUM_IP gives CSUM_IP_CHECKED|CSUM_IP_VALID, CSUM_DELAY_DATA gives CSUM_DATA_VALID|CSUM_PSEUDO_HDR (csum_data is then set to 0xffff), and CSUM_SCTP gives CSUM_SCTP_VALID. Presumably this lets a locally looped-back packet pass checksum verification even though no checksum was ever computed - confirm. */ +static void +update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) +{ + int csum_flags = 0; + + if (src->m_pkthdr.csum_flags & CSUM_IP) + csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); + if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) + csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); + if (src->m_pkthdr.csum_flags & CSUM_SCTP) + csum_flags |= CSUM_SCTP_VALID; + dst->m_pkthdr.csum_flags |= csum_flags; + if (csum_flags & CSUM_DATA_VALID) + dst->m_pkthdr.csum_data = 0xffff; +} + /* - * Infiniband output routine. + * Handle link-layer encapsulation requests. 
*/ static int -infiniband_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, - struct route *ro) +infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req) { - uint8_t edst[INFINIBAND_ADDR_LEN]; -#if defined(INET) || defined(INET6) - struct llentry *lle = NULL; -#endif - struct infiniband_header *ibh; - int error = 0; - uint16_t type; - bool is_gw; + struct infiniband_header *ih; + struct arphdr *ah; + uint16_t etype; + const uint8_t *lladdr; - is_gw = ((ro != NULL) && (ro->ro_flags & RT_HAS_GW) != 0); + if (req->rtype != IFENCAP_LL) + return (EOPNOTSUPP); -#ifdef MAC - error = mac_ifnet_check_transmit(ifp, m); - if (error) - goto bad; -#endif + if (req->bufsize < INFINIBAND_HDR_LEN) + return (ENOMEM); - M_PROFILE(m); - if (ifp->if_flags & IFF_MONITOR) { - error = ENETDOWN; - goto bad; - } - if (!((ifp->if_flags & IFF_UP) && - (ifp->if_drv_flags & IFF_DRV_RUNNING))) { - error = ENETDOWN; - goto bad; - } + ih = (struct infiniband_header *)req->buf; + lladdr = req->lladdr; + req->lladdr_off = 0; - switch (dst->sa_family) { - case AF_LINK: - goto output; -#ifdef INET + switch (req->family) { case AF_INET: - if (lle != NULL && (lle->la_flags & LLE_VALID)) { - memcpy(edst, lle->ll_addr, sizeof(edst)); - } else if (m->m_flags & M_MCAST) { - infiniband_ipv4_multicast_map( - ((const struct sockaddr_in *)dst)->sin_addr.s_addr, - ifp->if_broadcastaddr, edst); - } else { - error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); - if (error) { - if (error == EWOULDBLOCK) - error = 0; - m = NULL; /* mbuf is consumed by resolver */ - goto bad; - } - } - type = htons(ETHERTYPE_IP); + etype = htons(ETHERTYPE_IP); break; - case AF_ARP: { - struct arphdr *ah; - - if (m->m_len < sizeof(*ah)) { - error = EINVAL; - goto bad; - } - - ah = mtod(m, struct arphdr *); - - if (m->m_len < arphdr_len(ah)) { - error = EINVAL; - goto bad; - } + case AF_INET6: + etype = htons(ETHERTYPE_IPV6); + break; + case AF_ARP: + ah = (struct arphdr *)req->hdata; ah->ar_hrd = 
htons(ARPHRD_INFINIBAND); switch (ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: - type = htons(ETHERTYPE_REVARP); + etype = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: - type = htons(ETHERTYPE_ARP); + etype = htons(ETHERTYPE_ARP); break; } - if (m->m_flags & M_BCAST) { - memcpy(edst, ifp->if_broadcastaddr, INFINIBAND_ADDR_LEN); + if (req->flags & IFENCAP_FLAG_BROADCAST) + lladdr = ifp->if_broadcastaddr; + break; + default: + return (EAFNOSUPPORT); + } + + ih->ib_protocol = etype; + ih->ib_reserved = 0; + memcpy(ih->ib_hwaddr, lladdr, INFINIBAND_ADDR_LEN); + req->bufsize = sizeof(struct infiniband_header); + + return (0); +} + /* * Fill the link-layer header at phdr for destination dst. Unicast packets are handed to arpresolve() (IPv4) or nd6_resolve() (IPv6); IPv4 broadcast uses the interface broadcast address and multicast uses the per-family multicast map helper, with the protocol field written here. On success *pflags is set to RT_MAY_LOOP, plus RT_L2_ME when the resolver reported LLE_IFADDR. An unsupported address family frees m and returns EAFNOSUPPORT; EHOSTDOWN becomes EHOSTUNREACH when ro carries RT_HAS_GW. */ +static int +infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m, + const struct sockaddr *dst, struct route *ro, uint8_t *phdr, + uint32_t *pflags, struct llentry **plle) +{ + struct infiniband_header *ih; + uint32_t lleflags = 0; + int error = 0; + + if (plle) + *plle = NULL; + ih = (struct infiniband_header *)phdr; + + switch (dst->sa_family) { +#ifdef INET + case AF_INET: + if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) { + error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle); } else { - if (ah->ar_hln != INFINIBAND_ADDR_LEN) { - error = EINVAL; - goto bad; + if (m->m_flags & M_BCAST) { + memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr, + INFINIBAND_ADDR_LEN); + } else { + infiniband_ipv4_multicast_map( + ((const struct sockaddr_in *)dst)->sin_addr.s_addr, + ifp->if_broadcastaddr, ih->ib_hwaddr); } - memcpy(edst, ar_tha(ah), INFINIBAND_ADDR_LEN); + ih->ib_protocol = htons(ETHERTYPE_IP); + ih->ib_reserved = 0; } break; - } #endif #ifdef INET6 - case AF_INET6: { - const struct ip6_hdr *ip6; - - ip6 = mtod(m, const struct ip6_hdr *); - if (m->m_len < sizeof(*ip6)) { - error = EINVAL; - goto bad; - } else if (lle != NULL && (lle->la_flags & LLE_VALID)) { - memcpy(edst, lle->ll_addr, sizeof(edst)); - } else if (m->m_flags & M_MCAST) { + case AF_INET6: + if ((m->m_flags & M_MCAST) == 0) { 
+ error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags, plle); + } else { infiniband_ipv6_multicast_map( &((const struct sockaddr_in6 *)dst)->sin6_addr, - ifp->if_broadcastaddr, edst); - } else if (ip6->ip6_nxt == IPPROTO_ICMPV6) { - memcpy(edst, ifp->if_broadcastaddr, INFINIBAND_ADDR_LEN); - } else { - error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); - if (error) { - if (error == EWOULDBLOCK) - error = 0; - m = NULL; /* mbuf is consumed by resolver */ - goto bad; - } + ifp->if_broadcastaddr, ih->ib_hwaddr); + ih->ib_protocol = htons(ETHERTYPE_IPV6); + ih->ib_reserved = 0; } - type = htons(ETHERTYPE_IPV6); break; - } #endif default: - error = EAFNOSUPPORT; + if_printf(ifp, "can't handle af%d\n", dst->sa_family); + if (m != NULL) + m_freem(m); + return (EAFNOSUPPORT); + } + + if (error == EHOSTDOWN) { + if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) + error = EHOSTUNREACH; + } + + if (error != 0) + return (error); + + *pflags = RT_MAY_LOOP; + if (lleflags & LLE_IFADDR) + *pflags |= RT_L2_ME; + + return (0); +} + +/* + * Infiniband output routine. + */ +static int +infiniband_output(struct ifnet *ifp, struct mbuf *m, + const struct sockaddr *dst, struct route *ro) +{ + uint8_t linkhdr[INFINIBAND_HDR_LEN]; + uint8_t *phdr; + struct llentry *lle = NULL; + struct infiniband_header *ih; + int error = 0; + int hlen; /* link layer header length */ + uint32_t pflags; + bool addref; + + addref = false; + phdr = NULL; + pflags = 0; + if (ro != NULL) { + /* XXX BPF uses ro_prepend */ + if (ro->ro_prepend != NULL) { + phdr = ro->ro_prepend; + hlen = ro->ro_plen; + } else if (!(m->m_flags & (M_BCAST | M_MCAST))) { + if ((ro->ro_flags & RT_LLE_CACHE) != 0) { + lle = ro->ro_lle; + if (lle != NULL && + (lle->la_flags & LLE_VALID) == 0) { + LLE_FREE(lle); + lle = NULL; /* redundant */ + ro->ro_lle = NULL; + } + if (lle == NULL) { + /* No valid cached entry: request the llentry back from the resolver (via plle) so it can be stored in ro->ro_lle below. NOTE(review): addref is a bool that was initialised with false; "true" would be the matching spelling here. */ + addref = 1; + } else + /* + * Notify LLE code that + * the entry was used + * by datapath. 
+ */ + llentry_mark_used(lle); + } + if (lle != NULL) { + phdr = lle->r_linkdata; + hlen = lle->r_hdrlen; + pflags = lle->r_flags; + } + } + } + +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error) + goto bad; +#endif + + M_PROFILE(m); + if (ifp->if_flags & IFF_MONITOR) { + error = ENETDOWN; goto bad; } + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) { + error = ENETDOWN; + goto bad; + } + + if (phdr == NULL) { + /* No prepend data supplied. Try to calculate ourselves. */ + phdr = linkhdr; + hlen = INFINIBAND_HDR_LEN; + error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags, + addref ? &lle : NULL); + if (addref && lle != NULL) + ro->ro_lle = lle; + if (error != 0) + /* EWOULDBLOCK is reported to the caller as success. m is not freed on this path; presumably the resolver has taken ownership of it, as the replaced code noted - confirm. */ return (error == EWOULDBLOCK ? 0 : error); + } + + if ((pflags & RT_L2_ME) != 0) { + update_mbuf_csumflags(m, m); + return (if_simloop(ifp, m, dst->sa_family, 0)); + } /* - * Add local net header. If no space in first mbuf, + * Add local infiniband header. If no space in first mbuf, * allocate another. */ M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } - ibh = mtod(m, struct infiniband_header *); - - ibh->ib_protocol = type; - memcpy(ibh->ib_hwaddr, edst, sizeof(edst)); + if ((pflags & RT_HAS_HEADER) == 0) { + ih = mtod(m, struct infiniband_header *); + /* NOTE(review): only INFINIBAND_HDR_LEN bytes were prepended above, while hlen may come from ro->ro_plen or lle->r_hdrlen; this copy assumes hlen <= INFINIBAND_HDR_LEN - confirm. */ memcpy(ih, phdr, hlen); + } /* * Queue message on interface, update output statistics if * successful, and start output if interface not yet active. */ -output: return (ifp->if_transmit(ifp, m)); bad: if (m != NULL) m_freem(m); return (error); } /* * Process a received Infiniband packet. 
*/ static void infiniband_input(struct ifnet *ifp, struct mbuf *m) { struct infiniband_header *ibh; struct epoch_tracker et; int isr; CURVNET_SET_QUIET(ifp->if_vnet); if ((ifp->if_flags & IFF_UP) == 0) { if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); goto done; } /* NOTE(review): no check that m->m_len covers the Infiniband header is visible before this pointer is dereferenced - confirm callers guarantee it. */ ibh = mtod(m, struct infiniband_header *); /* * Reset layer specific mbuf flags to avoid confusing upper * layers: */ m->m_flags &= ~M_VLANTAG; m_clrprotoflags(m); if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) { if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr, ifp->if_addrlen) == 0) m->m_flags |= M_BCAST; else m->m_flags |= M_MCAST; if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); } /* Let BPF have it before we strip the header. */ INFINIBAND_BPF_MTAP(ifp, m); /* Allow monitor mode to claim this frame, after stats are updated. */ if (ifp->if_flags & IFF_MONITOR) { m_freem(m); goto done; } /* Direct packet to correct FIB based on interface config. */ M_SETFIB(m, ifp->if_fib); /* Handle input from a lagg port */ if (ifp->if_type == IFT_INFINIBANDLAG) { KASSERT(lagg_input_infiniband_p != NULL, ("%s: if_lagg not loaded!", __func__)); m = (*lagg_input_infiniband_p)(ifp, m); if (__predict_false(m == NULL)) goto done; ifp = m->m_pkthdr.rcvif; } /* NOTE(review): ibh was taken from the mbuf passed in; if the lagg hook can return a different mbuf, the switch below still reads the old pointer - confirm. */ /* * Dispatch frame to upper layer. */ switch (ibh->ib_protocol) { #ifdef INET case htons(ETHERTYPE_IP): isr = NETISR_IP; break; case htons(ETHERTYPE_ARP): if (ifp->if_flags & IFF_NOARP) { /* Discard packet if ARP is disabled on interface */ m_freem(m); goto done; } isr = NETISR_ARP; break; #endif #ifdef INET6 case htons(ETHERTYPE_IPV6): isr = NETISR_IPV6; break; #endif default: if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); m_freem(m); goto done; } /* Strip off the Infiniband header. */ m_adj(m, INFINIBAND_HDR_LEN); #ifdef MAC /* * Tag the mbuf with an appropriate MAC label before any other * consumers can get to it. */ mac_ifnet_create_mbuf(ifp, m); #endif /* Hand the packet to the netisr protocol selected above, inside a network epoch section. 
*/ NET_EPOCH_ENTER_ET(et); netisr_dispatch(isr, m); NET_EPOCH_EXIT_ET(et); done: CURVNET_RESTORE(); } /* * Resolve the multicast address in sa to a link-layer multicast address. AF_LINK input is only validated and *llsa is set to NULL. AF_INET and AF_INET6 groups are converted with the multicast map helpers into a sockaddr_dl prepared by link_init_sdl() and returned through *llsa. Returns EADDRNOTAVAIL for non-multicast (or unspecified IPv6) input and EAFNOSUPPORT for any other family. */ static int infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, struct sockaddr *sa) { struct sockaddr_dl *sdl; #ifdef INET struct sockaddr_in *sin; #endif #ifdef INET6 struct sockaddr_in6 *sin6; #endif uint8_t *e_addr; switch (sa->sa_family) { case AF_LINK: /* * No mapping needed. Just check that it's a valid MC address. */ sdl = (struct sockaddr_dl *)sa; e_addr = LLADDR(sdl); if (!INFINIBAND_IS_MULTICAST(e_addr)) return (EADDRNOTAVAIL); *llsa = NULL; return 0; #ifdef INET case AF_INET: sin = (struct sockaddr_in *)sa; if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) return (EADDRNOTAVAIL); sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ADDR_LEN; e_addr = LLADDR(sdl); infiniband_ipv4_multicast_map( sin->sin_addr.s_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif #ifdef INET6 case AF_INET6: sin6 = (struct sockaddr_in6 *)sa; /* * An IP6 address of 0 means listen to all of the * multicast address used for IP6. This has no meaning * in infiniband. 
*/ if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) return (EADDRNOTAVAIL); if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) return (EADDRNOTAVAIL); sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); sdl->sdl_alen = INFINIBAND_ADDR_LEN; e_addr = LLADDR(sdl); infiniband_ipv6_multicast_map( &sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); *llsa = (struct sockaddr *)sdl; return (0); #endif default: return (EAFNOSUPPORT); } } /* * Attach an Infiniband interface: set the address length, header length and MTU, call if_attach(), install the output, input, resolvemulti and requestencap handlers, record the broadcast address (llb) and link-level address (lla) when they are supplied, attach BPF and announce the new interface. */ void infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb) { struct sockaddr_dl *sdl; struct ifaddr *ifa; int i; ifp->if_addrlen = INFINIBAND_ADDR_LEN; ifp->if_hdrlen = INFINIBAND_HDR_LEN; ifp->if_mtu = INFINIBAND_MTU; if_attach(ifp); ifp->if_output = infiniband_output; ifp->if_input = infiniband_input; ifp->if_resolvemulti = infiniband_resolvemulti; + ifp->if_requestencap = infiniband_requestencap; if (ifp->if_baudrate == 0) ifp->if_baudrate = IF_Gbps(10); /* default value */ if (llb != NULL) ifp->if_broadcastaddr = llb; ifa = ifp->if_addr; KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); sdl = (struct sockaddr_dl *)ifa->ifa_addr; sdl->sdl_type = IFT_INFINIBAND; sdl->sdl_alen = ifp->if_addrlen; if (lla != NULL) { memcpy(LLADDR(sdl), lla, ifp->if_addrlen); if (ifp->if_hw_addr != NULL) memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen); } else { lla = LLADDR(sdl); } /* Attach ethernet compatible network device */ bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); /* Announce Infiniband MAC address if non-zero. */ for (i = 0; i < ifp->if_addrlen; i++) if (lla[i] != 0) break; if (i != ifp->if_addrlen) if_printf(ifp, "Infiniband address: %20D\n", lla, ":"); /* All necessary bits are set up; announce it now. 
*/ EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp); if (IS_DEFAULT_VNET(curvnet)) devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL); } /* * Perform common duties while detaching an Infiniband interface */ void infiniband_ifdetach(struct ifnet *ifp) { bpfdetach(ifp); if_detach(ifp); } /* Module event handler: load and unload succeed without doing any work; every other event type is rejected with EOPNOTSUPP. */ static int infiniband_modevent(module_t mod, int type, void *data) { switch (type) { case MOD_LOAD: case MOD_UNLOAD: return (0); default: return (EOPNOTSUPP); } } static moduledata_t infiniband_mod = { .name = "if_infiniband", .evhand = &infiniband_modevent, }; DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); MODULE_VERSION(if_infiniband, 1);