Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -4552,6 +4552,7 @@ compile-with "${LINUXKPI_C}" # OpenFabrics Enterprise Distribution (Infiniband) +net/if_infiniband.c optional ofed ofed/drivers/infiniband/core/ib_addr.c optional ofed \ compile-with "${OFED_C}" ofed/drivers/infiniband/core/ib_agent.c optional ofed \ Index: sys/kern/uipc_mbufhash.c =================================================================== --- sys/kern/uipc_mbufhash.c +++ sys/kern/uipc_mbufhash.c @@ -28,6 +28,7 @@ #include #include +#include #if defined(INET) || defined(INET6) #include @@ -42,7 +43,7 @@ #endif static const void * -m_ether_tcpip_hash_gethdr(const struct mbuf *m, const u_int off, +m_common_hash_gethdr(const struct mbuf *m, const u_int off, const u_int len, void *buf) { @@ -65,6 +66,15 @@ } uint32_t +m_infiniband_tcpip_hash_init(void) +{ + uint32_t seed; + + seed = arc4random(); + return (fnv_32_buf(&seed, sizeof(seed), FNV1_32_INIT)); +} + +uint32_t m_ether_tcpip_hash(const uint32_t flags, const struct mbuf *m, const uint32_t key) { @@ -105,7 +115,7 @@ p = fnv_32_buf(&m->m_pkthdr.ether_vtag, sizeof(m->m_pkthdr.ether_vtag), p); } else if (etype == ETHERTYPE_VLAN) { - vlan = m_ether_tcpip_hash_gethdr(m, off, sizeof(*vlan), &buf); + vlan = m_common_hash_gethdr(m, off, sizeof(*vlan), &buf); if (vlan == NULL) goto done; @@ -117,7 +127,7 @@ switch (etype) { #ifdef INET case ETHERTYPE_IP: - ip = m_ether_tcpip_hash_gethdr(m, off, sizeof(*ip), &buf); + ip = m_common_hash_gethdr(m, off, sizeof(*ip), &buf); if (ip == NULL) break; if (flags & MBUF_HASHFLAG_L3) { @@ -136,7 +146,7 @@ if (iphlen < sizeof(*ip)) break; off += iphlen; - ports = m_ether_tcpip_hash_gethdr(m, + ports = m_common_hash_gethdr(m, off, sizeof(*ports), &buf); if (ports == NULL) break; @@ -150,7 +160,7 @@ #endif #ifdef INET6 case ETHERTYPE_IPV6: - ip6 = m_ether_tcpip_hash_gethdr(m, off, sizeof(*ip6), &buf); + ip6 = m_common_hash_gethdr(m, 
off, sizeof(*ip6), &buf); if (ip6 == NULL) break; if (flags & MBUF_HASHFLAG_L3) { @@ -172,3 +182,96 @@ done: return (p); } + +uint32_t +m_infiniband_tcpip_hash(const uint32_t flags, const struct mbuf *m, + const uint32_t key) +{ + union { +#ifdef INET + struct ip ip; +#endif +#ifdef INET6 + struct ip6_hdr ip6; +#endif + struct infiniband_header hdr; + uint32_t port; + } buf; + const struct infiniband_header *ibh; +#ifdef INET + const struct ip *ip; +#endif +#ifdef INET6 + const struct ip6_hdr *ip6; +#endif + uint32_t p; + int off; + uint16_t etype; + + p = key; + off = sizeof(*ibh); + if (m->m_len < off) + goto done; + ibh = mtod(m, struct infiniband_header *); + etype = ntohs(ibh->ib_protocol); + if (flags & MBUF_HASHFLAG_L2) + p = fnv_32_buf(&ibh->ib_hwaddr, INFINIBAND_ADDR_LEN, p); + + switch (etype) { +#ifdef INET + case ETHERTYPE_IP: + ip = m_common_hash_gethdr(m, off, sizeof(*ip), &buf); + if (ip == NULL) + break; + if (flags & MBUF_HASHFLAG_L3) { + p = fnv_32_buf(&ip->ip_src, sizeof(struct in_addr), p); + p = fnv_32_buf(&ip->ip_dst, sizeof(struct in_addr), p); + } + if (flags & MBUF_HASHFLAG_L4) { + const uint32_t *ports; + int iphlen; + + switch (ip->ip_p) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + iphlen = ip->ip_hl << 2; + if (iphlen < sizeof(*ip)) + break; + off += iphlen; + ports = m_common_hash_gethdr(m, + off, sizeof(*ports), &buf); + if (ports == NULL) + break; + p = fnv_32_buf(ports, sizeof(*ports), p); + break; + default: + break; + } + } + break; +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + ip6 = m_common_hash_gethdr(m, off, sizeof(*ip6), &buf); + if (ip6 == NULL) + break; + if (flags & MBUF_HASHFLAG_L3) { + p = fnv_32_buf(&ip6->ip6_src, sizeof(struct in6_addr), p); + p = fnv_32_buf(&ip6->ip6_dst, sizeof(struct in6_addr), p); + } + if (flags & MBUF_HASHFLAG_L4) { + uint32_t flow; + + /* IPv6 flow label */ + flow = ip6->ip6_flow & IPV6_FLOWLABEL_MASK; + p = fnv_32_buf(&flow, sizeof(flow), p); + } + break; +#endif + default: 
+ break; + } +done: + return (p); +} Index: sys/modules/Makefile =================================================================== --- sys/modules/Makefile +++ sys/modules/Makefile @@ -154,6 +154,7 @@ ${_if_gif} \ ${_if_gre} \ ${_if_me} \ + if_infiniband \ if_lagg \ ${_if_ndis} \ ${_if_stf} \ Index: sys/modules/if_infiniband/Makefile =================================================================== --- sys/modules/if_infiniband/Makefile +++ sys/modules/if_infiniband/Makefile @@ -0,0 +1,10 @@ +# $FreeBSD$ + +.PATH: ${SRCTOP}/sys/net + +KMOD= if_infiniband +SRCS= if_infiniband.c \ + opt_inet.h \ + opt_inet6.h + +.include Index: sys/net/ieee8023ad_lacp.c =================================================================== --- sys/net/ieee8023ad_lacp.c +++ sys/net/ieee8023ad_lacp.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include Index: sys/net/if_ethersubr.c =================================================================== --- sys/net/if_ethersubr.c +++ sys/net/if_ethersubr.c @@ -110,7 +110,7 @@ void (*bridge_dn_p)(struct mbuf *, struct ifnet *); /* if_lagg(4) support */ -struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); +struct mbuf *(*lagg_input_ethernet_p)(struct ifnet *, struct mbuf *); static const u_char etherbroadcastaddr[ETHER_ADDR_LEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; @@ -601,9 +601,9 @@ /* Handle input from a lagg(4) port */ if (ifp->if_type == IFT_IEEE8023ADLAG) { - KASSERT(lagg_input_p != NULL, + KASSERT(lagg_input_ethernet_p != NULL, ("%s: if_lagg not loaded!", __func__)); - m = (*lagg_input_p)(ifp, m); + m = (*lagg_input_ethernet_p)(ifp, m); if (m != NULL) ifp = m->m_pkthdr.rcvif; else { Index: sys/net/if_infiniband.c =================================================================== --- sys/net/if_infiniband.c +++ sys/net/if_infiniband.c @@ -0,0 +1,534 @@ +/*- + * Copyright (c) 2020 Mellanox Technologies. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "opt_inet.h" +#include "opt_inet6.h" + +#include +__FBSDID("$FreeBSD:"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +/* if_lagg(4) support */ +struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *); + +#ifdef INET +static inline void +infiniband_ipv4_multicast_map(uint32_t addr, + const uint8_t *broadcast, uint8_t *buf) +{ + uint8_t scope; + + addr = ntohl(addr); + scope = broadcast[5] & 0xF; + + buf[0] = 0; + buf[1] = 0xff; + buf[2] = 0xff; + buf[3] = 0xff; + buf[4] = 0xff; + buf[5] = 0x10 | scope; + buf[6] = 0x40; + buf[7] = 0x1b; + buf[8] = broadcast[8]; + buf[9] = broadcast[9]; + buf[10] = 0; + buf[11] = 0; + buf[12] = 0; + buf[13] = 0; + buf[14] = 0; + buf[15] = 0; + buf[16] = (addr >> 24) & 0xff; + buf[17] = (addr >> 16) & 0xff; + buf[18] = (addr >> 8) & 0xff; + buf[19] = addr & 0xff; +} +#endif + +#ifdef INET6 +static inline void +infiniband_ipv6_multicast_map(const struct in6_addr *addr, + const uint8_t *broadcast, uint8_t *buf) +{ + uint8_t scope; + + scope = broadcast[5] & 0xF; + + buf[0] = 0; + buf[1] = 0xff; + buf[2] = 0xff; + buf[3] = 0xff; + buf[4] = 0xff; + buf[5] = 0x10 | scope; + buf[6] = 0x60; + buf[7] = 0x1b; + buf[8] = broadcast[8]; + buf[9] = broadcast[9]; + memcpy(&buf[10], &addr->s6_addr[6], 10); +} +#endif + +/* + * This is for clients that have an infiniband_header in the mbuf. 
+ */ +void +infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb) +{ + struct infiniband_header *ibh; + struct ether_header eh; + + if (mb->m_len < sizeof(*ibh)) + return; + + ibh = mtod(mb, struct infiniband_header *); + eh.ether_type = ibh->ib_protocol; + memset(eh.ether_shost, 0, ETHER_ADDR_LEN); + memcpy(eh.ether_dhost, ibh->ib_hwaddr + 4, ETHER_ADDR_LEN); + mb->m_data += sizeof(*ibh); + mb->m_len -= sizeof(*ibh); + mb->m_pkthdr.len -= sizeof(*ibh); + bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); + mb->m_data -= sizeof(*ibh); + mb->m_len += sizeof(*ibh); + mb->m_pkthdr.len += sizeof(*ibh); +} + +/* + * Infiniband output routine. + */ +static int +infiniband_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, + struct route *ro) +{ + uint8_t edst[INFINIBAND_ADDR_LEN]; +#if defined(INET) || defined(INET6) + struct llentry *lle = NULL; +#endif + struct infiniband_header *ibh; + int error = 0; + uint16_t type; + bool is_gw; + + NET_EPOCH_ASSERT(); + + is_gw = ((ro != NULL) && (ro->ro_flags & RT_HAS_GW) != 0); + +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error) + goto bad; +#endif + + M_PROFILE(m); + if (ifp->if_flags & IFF_MONITOR) { + error = ENETDOWN; + goto bad; + } + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) { + error = ENETDOWN; + goto bad; + } + + switch (dst->sa_family) { + case AF_LINK: + goto output; +#ifdef INET + case AF_INET: + if (lle != NULL && (lle->la_flags & LLE_VALID)) { + memcpy(edst, lle->ll_addr, sizeof(edst)); + } else if (m->m_flags & M_MCAST) { + infiniband_ipv4_multicast_map( + ((const struct sockaddr_in *)dst)->sin_addr.s_addr, + ifp->if_broadcastaddr, edst); + } else { + error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); + if (error) { + if (error == EWOULDBLOCK) + error = 0; + goto bad; + } + } + type = htons(ETHERTYPE_IP); + break; + case AF_ARP: { + struct arphdr *ah; + + if (m->m_len < sizeof(*ah)) { + error = EINVAL; + goto bad; + } + + ah = mtod(m, struct 
arphdr *); + + if (m->m_len < arphdr_len(ah)) { + error = EINVAL; + goto bad; + } + ah->ar_hrd = htons(ARPHRD_INFINIBAND); + + switch (ntohs(ah->ar_op)) { + case ARPOP_REVREQUEST: + case ARPOP_REVREPLY: + type = htons(ETHERTYPE_REVARP); + break; + case ARPOP_REQUEST: + case ARPOP_REPLY: + default: + type = htons(ETHERTYPE_ARP); + break; + } + + if (m->m_flags & M_BCAST) { + memcpy(edst, ifp->if_broadcastaddr, INFINIBAND_ADDR_LEN); + } else { + if (ah->ar_hln != INFINIBAND_ADDR_LEN) { + error = EINVAL; + goto bad; + } + memcpy(edst, ar_tha(ah), INFINIBAND_ADDR_LEN); + } + break; + } +#endif +#ifdef INET6 + case AF_INET6: { + const struct ip6_hdr *ip6; + + ip6 = mtod(m, const struct ip6_hdr *); + if (m->m_len < sizeof(*ip6)) { + error = EINVAL; + goto bad; + } else if (lle != NULL && (lle->la_flags & LLE_VALID)) { + memcpy(edst, lle->ll_addr, sizeof(edst)); + } else if (m->m_flags & M_MCAST) { + infiniband_ipv6_multicast_map( + &((const struct sockaddr_in6 *)dst)->sin6_addr, + ifp->if_broadcastaddr, edst); + } else if (ip6->ip6_nxt == IPPROTO_ICMPV6) { + memcpy(edst, ifp->if_broadcastaddr, INFINIBAND_ADDR_LEN); + } else { + error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); + if (error) { + if (error == EWOULDBLOCK) + error = 0; + goto bad; + } + } + type = htons(ETHERTYPE_IPV6); + break; + } +#endif + default: + error = EAFNOSUPPORT; + goto bad; + } + + /* + * Add local net header. If no space in first mbuf, + * allocate another. + */ + M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT); + if (m == NULL) { + error = ENOBUFS; + goto bad; + } + ibh = mtod(m, struct infiniband_header *); + + ibh->ib_protocol = type; + memcpy(ibh->ib_hwaddr, edst, sizeof(edst)); + + /* + * Queue message on interface, update output statistics if + * successful, and start output if interface not yet active. + */ +output: + return (ifp->if_transmit(ifp, m)); +bad: + if (m != NULL) + m_freem(m); + return (error); +} + +/* + * Process a received Infiniband packet. 
+ */ +static void +infiniband_input(struct ifnet *ifp, struct mbuf *m) +{ + struct infiniband_header *ibh; + struct epoch_tracker et; + int isr; + + CURVNET_SET_QUIET(ifp->if_vnet); + + if ((ifp->if_flags & IFF_UP) == 0) { + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); + m_freem(m); + goto done; + } + + ibh = mtod(m, struct infiniband_header *); + + /* + * Reset layer specific mbuf flags to avoid confusing upper + * layers: + */ + m->m_flags &= ~M_VLANTAG; + m_clrprotoflags(m); + + if (INFINIBAND_IS_MULTICAST(ibh->ib_hwaddr)) { + if (memcmp(ibh->ib_hwaddr, ifp->if_broadcastaddr, + ifp->if_addrlen) == 0) + m->m_flags |= M_BCAST; + else + m->m_flags |= M_MCAST; + if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); + } + + /* Let BPF have it before we strip the header. */ + INFINIBAND_BPF_MTAP(ifp, m); + + /* Allow monitor mode to claim this frame, after stats are updated. */ + if (ifp->if_flags & IFF_MONITOR) { + m_freem(m); + goto done; + } + + /* Handle input from a bond port */ + if (ifp->if_type == IFT_INFINIBANDLAG) { + KASSERT(lagg_input_infiniband_p != NULL, + ("%s: if_lagg not loaded!", __func__)); + m = (*lagg_input_infiniband_p)(ifp, m); + if (__predict_false(m == NULL)) + goto done; + ifp = m->m_pkthdr.rcvif; + } + + /* + * Dispatch frame to upper layer. + */ + switch (ibh->ib_protocol) { +#ifdef INET + case htons(ETHERTYPE_IP): + isr = NETISR_IP; + break; + + case htons(ETHERTYPE_ARP): + if (ifp->if_flags & IFF_NOARP) { + /* Discard packet if ARP is disabled on interface */ + m_freem(m); + goto done; + } + isr = NETISR_ARP; + break; +#endif +#ifdef INET6 + case htons(ETHERTYPE_IPV6): + isr = NETISR_IPV6; + break; +#endif + default: + if_inc_counter(ifp, IFCOUNTER_IERRORS, 1); + m_freem(m); + goto done; + } + + /* Strip off the Infiniband header. */ + m_adj(m, INFINIBAND_HDR_LEN); + +#ifdef MAC + /* + * Tag the mbuf with an appropriate MAC label before any other + * consumers can get to it. 
+ */ + mac_ifnet_create_mbuf(ifp, m); +#endif + /* Hand the packet to the selected netisr inside the network epoch. */ + NET_EPOCH_ENTER(et); + netisr_dispatch(isr, m); + NET_EPOCH_EXIT(et); +done: + CURVNET_RESTORE(); +} + +static int +infiniband_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, + struct sockaddr *sa) +{ + struct sockaddr_dl *sdl; +#ifdef INET + struct sockaddr_in *sin; +#endif +#ifdef INET6 + struct sockaddr_in6 *sin6; +#endif + uint8_t *e_addr; + + switch (sa->sa_family) { + case AF_LINK: + /* + * No mapping needed. Just check that it is a valid multicast address. + */ + sdl = (struct sockaddr_dl *)sa; + e_addr = LLADDR(sdl); + if (!INFINIBAND_IS_MULTICAST(e_addr)) + return (EADDRNOTAVAIL); + *llsa = NULL; + return 0; + +#ifdef INET + case AF_INET: + sin = (struct sockaddr_in *)sa; + if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) + return (EADDRNOTAVAIL); + sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); + sdl->sdl_alen = INFINIBAND_ADDR_LEN; + e_addr = LLADDR(sdl); + infiniband_ipv4_multicast_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr, + e_addr); + *llsa = (struct sockaddr *)sdl; + return (0); +#endif +#ifdef INET6 + case AF_INET6: + sin6 = (struct sockaddr_in6 *)sa; + /* + * An IP6 address of 0 means listen to all of the + * multicast address used for IP6. This has no meaning + * in infiniband. 
+ */ + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + return (EADDRNOTAVAIL); + if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) + return (EADDRNOTAVAIL); + sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); + sdl->sdl_alen = INFINIBAND_ADDR_LEN; + e_addr = LLADDR(sdl); + infiniband_ipv6_multicast_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); + *llsa = (struct sockaddr *)sdl; + return (0); +#endif + default: + return (EAFNOSUPPORT); + } +} + +void +infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb) +{ + struct sockaddr_dl *sdl; + struct ifaddr *ifa; + int i; + + ifp->if_addrlen = INFINIBAND_ADDR_LEN; + ifp->if_hdrlen = INFINIBAND_HDR_LEN; + ifp->if_mtu = INFINIBAND_MTU; + if_attach(ifp); + ifp->if_output = infiniband_output; + ifp->if_input = infiniband_input; + ifp->if_resolvemulti = infiniband_resolvemulti; + + if (ifp->if_baudrate == 0) + ifp->if_baudrate = IF_Gbps(10); /* default value */ + if (llb != NULL) + ifp->if_broadcastaddr = llb; + + ifa = ifp->if_addr; + KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); + sdl = (struct sockaddr_dl *)ifa->ifa_addr; + sdl->sdl_type = IFT_INFINIBAND; + sdl->sdl_alen = ifp->if_addrlen; + + if (lla != NULL) { + memcpy(LLADDR(sdl), lla, ifp->if_addrlen); + + if (ifp->if_hw_addr != NULL) + memcpy(ifp->if_hw_addr, lla, ifp->if_addrlen); + } else { + lla = LLADDR(sdl); + } + + /* Attach to bpf as an Ethernet-compatible device (DLT_EN10MB) */ + bpfattach(ifp, DLT_EN10MB, ETHER_HDR_LEN); + + /* Announce the Infiniband link-level address if it is non-zero. */ + for (i = 0; i < ifp->if_addrlen; i++) + if (lla[i] != 0) + break; + if (i != ifp->if_addrlen) + if_printf(ifp, "Infiniband address: %20D\n", lla, ":"); + + /* All necessary bits are set up; announce it now. 
*/ + EVENTHANDLER_INVOKE(infiniband_ifattach_event, ifp); + + if (IS_DEFAULT_VNET(curvnet)) + devctl_notify("INFINIBAND", ifp->if_xname, "IFATTACH", NULL); +} + +/* + * Perform common duties while detaching an Infiniband interface + */ +void +infiniband_ifdetach(struct ifnet *ifp) +{ + bpfdetach(ifp); + if_detach(ifp); +} + +static moduledata_t infiniband_mod = { + .name = "if_infiniband", +}; + +DECLARE_MODULE(if_infiniband, infiniband_mod, SI_SUB_INIT_IF, SI_ORDER_ANY); +MODULE_VERSION(if_infiniband, 1); Index: sys/net/if_lagg.h =================================================================== --- sys/net/if_lagg.h +++ sys/net/if_lagg.h @@ -206,7 +206,7 @@ struct lagg_softc { struct ifnet *sc_ifp; /* virtual interface */ - struct rmlock sc_mtx; + struct mtx sc_mtx; struct sx sc_sx; int sc_proto; /* lagg protocol */ u_int sc_count; /* number of ports */ @@ -230,12 +230,15 @@ u_int sc_opts; int flowid_shift; /* shift the flowid */ struct lagg_counters detached_counters; /* detached ports sum */ + struct callout sc_watchdog; /* watchdog timer */ }; struct lagg_port { struct ifnet *lp_ifp; /* physical interface */ struct lagg_softc *lp_softc; /* parent lagg */ - uint8_t lp_lladdr[ETHER_ADDR_LEN]; +#define LAGG_ADDR_LEN \ + MAX(INFINIBAND_ADDR_LEN, ETHER_ADDR_LEN) + uint8_t lp_lladdr[LAGG_ADDR_LEN]; u_char lp_iftype; /* interface type */ uint32_t lp_prio; /* port priority */ @@ -257,7 +260,8 @@ struct epoch_context lp_epoch_ctx; }; -extern struct mbuf *(*lagg_input_p)(struct ifnet *, struct mbuf *); +extern struct mbuf *(*lagg_input_ethernet_p)(struct ifnet *, struct mbuf *); +extern struct mbuf *(*lagg_input_infiniband_p)(struct ifnet *, struct mbuf *); extern void (*lagg_linkstate_p)(struct ifnet *, int ); int lagg_enqueue(struct ifnet *, struct mbuf *); Index: sys/net/if_lagg.c =================================================================== --- sys/net/if_lagg.c +++ sys/net/if_lagg.c @@ -55,6 +55,7 @@ #include #include #include +#include #if defined(INET) || 
defined(INET6) #include @@ -121,17 +122,23 @@ #define LAGG_LIST_UNLOCK(x) mtx_unlock(&V_lagg_list_mtx) eventhandler_tag lagg_detach_cookie = NULL; -static int lagg_clone_create(struct if_clone *, int, caddr_t); -static void lagg_clone_destroy(struct ifnet *); +static int lagg_clone_create_ethernet(struct if_clone *, int, caddr_t); +static void lagg_clone_destroy_ethernet(struct ifnet *); +static int lagg_clone_create_infiniband(struct if_clone *, int, caddr_t); +static void lagg_clone_destroy_infiniband(struct ifnet *); VNET_DEFINE_STATIC(struct if_clone *, lagg_cloner); +VNET_DEFINE_STATIC(struct if_clone *, bond_cloner); #define V_lagg_cloner VNET(lagg_cloner) +#define V_bond_cloner VNET(bond_cloner) static const char laggname[] = "lagg"; +static const char bondname[] = "bond"; static MALLOC_DEFINE(M_LAGG, laggname, "802.3AD Link Aggregation Interface"); static void lagg_capabilities(struct lagg_softc *); static int lagg_port_create(struct lagg_softc *, struct ifnet *); static int lagg_port_destroy(struct lagg_port *, int); -static struct mbuf *lagg_input(struct ifnet *, struct mbuf *); +static struct mbuf *lagg_input_ethernet(struct ifnet *, struct mbuf *); +static struct mbuf *lagg_input_infiniband(struct ifnet *, struct mbuf *); static void lagg_linkstate(struct lagg_softc *); static void lagg_port_state(struct ifnet *, int); static int lagg_port_ioctl(struct ifnet *, u_long, caddr_t); @@ -164,7 +171,8 @@ int (*func)(struct ifnet *, int)); static int lagg_setflags(struct lagg_port *, int status); static uint64_t lagg_get_counter(struct ifnet *ifp, ift_counter cnt); -static int lagg_transmit(struct ifnet *, struct mbuf *); +static int lagg_transmit_ethernet(struct ifnet *, struct mbuf *); +static int lagg_transmit_infiniband(struct ifnet *, struct mbuf *); static void lagg_qflush(struct ifnet *); static int lagg_media_change(struct ifnet *); static void lagg_media_status(struct ifnet *, struct ifmediareq *); @@ -305,8 +313,10 @@ LAGG_LIST_LOCK_INIT(); 
SLIST_INIT(&V_lagg_list); - V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create, - lagg_clone_destroy, 0); + V_lagg_cloner = if_clone_simple(laggname, lagg_clone_create_ethernet, + lagg_clone_destroy_ethernet, 0); + V_bond_cloner = if_clone_simple(bondname, lagg_clone_create_infiniband, + lagg_clone_destroy_infiniband, 0); } VNET_SYSINIT(vnet_lagg_init, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY, vnet_lagg_init, NULL); @@ -315,6 +325,7 @@ vnet_lagg_uninit(const void *unused __unused) { + if_clone_detach(V_bond_cloner); if_clone_detach(V_lagg_cloner); LAGG_LIST_LOCK_DESTROY(); } @@ -327,7 +338,8 @@ switch (type) { case MOD_LOAD: - lagg_input_p = lagg_input; + lagg_input_ethernet_p = lagg_input_ethernet; + lagg_input_infiniband_p = lagg_input_infiniband; lagg_linkstate_p = lagg_port_state; lagg_detach_cookie = EVENTHANDLER_REGISTER( ifnet_departure_event, lagg_port_ifdetach, NULL, @@ -336,7 +348,8 @@ case MOD_UNLOAD: EVENTHANDLER_DEREGISTER(ifnet_departure_event, lagg_detach_cookie); - lagg_input_p = NULL; + lagg_input_ethernet_p = NULL; + lagg_input_infiniband_p = NULL; lagg_linkstate_p = NULL; break; default: @@ -353,6 +366,7 @@ DECLARE_MODULE(if_lagg, lagg_mod, SI_SUB_PSEUDO, SI_ORDER_ANY); MODULE_VERSION(if_lagg, 1); +MODULE_DEPEND(if_lagg, if_infiniband, 1, 1, 1); static void lagg_proto_attach(struct lagg_softc *sc, lagg_proto pr) @@ -502,20 +516,28 @@ } static int -lagg_clone_create(struct if_clone *ifc, int unit, caddr_t params) +lagg_clone_create_common(struct if_clone *ifc, int unit, caddr_t params, int if_type) { struct lagg_softc *sc; struct ifnet *ifp; - static const u_char eaddr[6]; /* 00:00:00:00:00:00 */ + static const uint8_t eaddr[LAGG_ADDR_LEN]; + static const uint8_t ib_bcast_addr[INFINIBAND_ADDR_LEN] = { + 0x00, 0xff, 0xff, 0xff, + 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff + }; sc = malloc(sizeof(*sc), M_LAGG, M_WAITOK|M_ZERO); - ifp = sc->sc_ifp = if_alloc(IFT_ETHER); + ifp = 
sc->sc_ifp = if_alloc(if_type); if (ifp == NULL) { free(sc, M_LAGG); return (ENOSPC); } LAGG_SX_INIT(sc); + mtx_init(&sc->sc_mtx, "lagg-mtx", NULL, MTX_DEF); + callout_init_mtx(&sc->sc_watchdog, &sc->sc_mtx, 0); + LAGG_XLOCK(sc); if (V_def_use_flowid) sc->sc_opts |= LAGG_OPT_USE_FLOWID; @@ -530,15 +552,25 @@ CK_SLIST_INIT(&sc->sc_ports); - /* Initialise pseudo media types */ - ifmedia_init(&sc->sc_media, 0, lagg_media_change, - lagg_media_status); - ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); - ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); + switch (if_type) { + case IFT_ETHER: + /* Initialise pseudo media types */ + ifmedia_init(&sc->sc_media, 0, lagg_media_change, + lagg_media_status); + ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL); + ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO); - if_initname(ifp, laggname, unit); + if_initname(ifp, laggname, unit); + ifp->if_transmit = lagg_transmit_ethernet; + break; + case IFT_INFINIBAND: + if_initname(ifp, bondname, unit); + ifp->if_transmit = lagg_transmit_infiniband; + break; + default: + break; + } ifp->if_softc = sc; - ifp->if_transmit = lagg_transmit; ifp->if_qflush = lagg_qflush; ifp->if_init = lagg_init; ifp->if_ioctl = lagg_ioctl; @@ -555,9 +587,18 @@ /* * Attach as an ordinary ethernet device, children will be attached - * as special device IFT_IEEE8023ADLAG. + * as special device IFT_IEEE8023ADLAG or IFT_INFINIBANDLAG. 
*/ - ether_ifattach(ifp, eaddr); + switch (if_type) { + case IFT_ETHER: + ether_ifattach(ifp, eaddr); + break; + case IFT_INFINIBAND: + infiniband_ifattach(ifp, eaddr, ib_bcast_addr); + break; + default: + break; + } sc->vlan_attach = EVENTHANDLER_REGISTER(vlan_config, lagg_register_vlan, sc, EVENTHANDLER_PRI_FIRST); @@ -573,8 +614,20 @@ return (0); } +static int +lagg_clone_create_ethernet(struct if_clone *ifc, int unit, caddr_t params) +{ + return (lagg_clone_create_common(ifc, unit, params, IFT_ETHER)); +} + +static int +lagg_clone_create_infiniband(struct if_clone *ifc, int unit, caddr_t params) +{ + return (lagg_clone_create_common(ifc, unit, params, IFT_INFINIBAND)); +} + static void -lagg_clone_destroy(struct ifnet *ifp) +lagg_clone_destroy_common(struct ifnet *ifp, int if_type) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; struct lagg_port *lp; @@ -595,19 +648,41 @@ lagg_proto_detach(sc); LAGG_XUNLOCK(sc); - ifmedia_removeall(&sc->sc_media); - ether_ifdetach(ifp); + switch (if_type) { + case IFT_ETHER: + ifmedia_removeall(&sc->sc_media); + ether_ifdetach(ifp); + break; + case IFT_INFINIBAND: + infiniband_ifdetach(ifp); + break; + default: + break; + } if_free(ifp); LAGG_LIST_LOCK(); SLIST_REMOVE(&V_lagg_list, sc, lagg_softc, sc_entries); LAGG_LIST_UNLOCK(); + mtx_destroy(&sc->sc_mtx); LAGG_SX_DESTROY(sc); free(sc, M_LAGG); } static void +lagg_clone_destroy_ethernet(struct ifnet *ifp) +{ + lagg_clone_destroy_common(ifp, IFT_ETHER); +} + +static void +lagg_clone_destroy_infiniband(struct ifnet *ifp) +{ + lagg_clone_destroy_common(ifp, IFT_INFINIBAND); +} + +static void lagg_capabilities(struct lagg_softc *sc) { struct lagg_port *lp; @@ -669,6 +744,7 @@ struct lagg_port *lp, *tlp; struct ifreq ifr; int error, i, oldmtu; + int if_type; uint64_t *pval; LAGG_XLOCK_ASSERT(sc); @@ -695,9 +771,22 @@ return (EBUSY); } - /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ - if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) 
- return (EPROTONOSUPPORT); + switch (sc->sc_ifp->if_type) { + case IFT_ETHER: + /* XXX Disallow non-ethernet interfaces (this should be any of 802) */ + if (ifp->if_type != IFT_ETHER && ifp->if_type != IFT_L2VLAN) + return (EPROTONOSUPPORT); + if_type = IFT_IEEE8023ADLAG; + break; + case IFT_INFINIBAND: + /* XXX Disallow non-infiniband interfaces */ + if (ifp->if_type != IFT_INFINIBAND) + return (EPROTONOSUPPORT); + if_type = IFT_INFINIBANDLAG; + break; + default: + break; + } /* Allow the first Ethernet member to define the MTU */ oldmtu = -1; @@ -754,14 +843,14 @@ if_ref(ifp); lp->lp_ifp = ifp; - bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ETHER_ADDR_LEN); + bcopy(IF_LLADDR(ifp), lp->lp_lladdr, ifp->if_addrlen); lp->lp_ifcapenable = ifp->if_capenable; if (CK_SLIST_EMPTY(&sc->sc_ports)) { - bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); + bcopy(IF_LLADDR(ifp), IF_LLADDR(sc->sc_ifp), ifp->if_addrlen); lagg_proto_lladdr(sc); EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } else { - if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); + if_setlladdr(ifp, IF_LLADDR(sc->sc_ifp), ifp->if_addrlen); } lagg_setflags(lp, 1); @@ -770,7 +859,7 @@ /* Change the interface type */ lp->lp_iftype = ifp->if_type; - ifp->if_type = IFT_IEEE8023ADLAG; + ifp->if_type = if_type; ifp->if_lagg = lp; lp->lp_ioctl = ifp->if_ioctl; ifp->if_ioctl = lagg_port_ioctl; @@ -887,15 +976,15 @@ /* Update the primary interface */ if (lp == sc->sc_primary) { - uint8_t lladdr[ETHER_ADDR_LEN]; + uint8_t lladdr[LAGG_ADDR_LEN]; if ((lp0 = CK_SLIST_FIRST(&sc->sc_ports)) == NULL) - bzero(&lladdr, ETHER_ADDR_LEN); + bzero(&lladdr, LAGG_ADDR_LEN); else - bcopy(lp0->lp_lladdr, lladdr, ETHER_ADDR_LEN); + bcopy(lp0->lp_lladdr, lladdr, LAGG_ADDR_LEN); sc->sc_primary = lp0; if (sc->sc_destroying == 0) { - bcopy(lladdr, IF_LLADDR(sc->sc_ifp), ETHER_ADDR_LEN); + bcopy(lladdr, IF_LLADDR(sc->sc_ifp), sc->sc_ifp->if_addrlen); lagg_proto_lladdr(sc); EVENTHANDLER_INVOKE(iflladdr_event, sc->sc_ifp); } @@ 
-905,7 +994,7 @@ * as well, to switch from old lladdr to its 'real' one) */ CK_SLIST_FOREACH(lp_ptr, &sc->sc_ports, lp_entries) - if_setlladdr(lp_ptr->lp_ifp, lladdr, ETHER_ADDR_LEN); + if_setlladdr(lp_ptr->lp_ifp, lladdr, lp_ptr->lp_ifp->if_addrlen); } if (lp->lp_ifflags) @@ -914,7 +1003,7 @@ if (lp->lp_detaching == 0) { lagg_setflags(lp, 0); lagg_setcaps(lp, lp->lp_ifcapenable); - if_setlladdr(ifp, lp->lp_lladdr, ETHER_ADDR_LEN); + if_setlladdr(ifp, lp->lp_lladdr, ifp->if_addrlen); } /* @@ -938,9 +1027,15 @@ int error = 0; /* Should be checked by the caller */ - if (ifp->if_type != IFT_IEEE8023ADLAG || - (lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL) + switch (ifp->if_type) { + case IFT_IEEE8023ADLAG: + case IFT_INFINIBANDLAG: + if ((lp = ifp->if_lagg) == NULL || (sc = lp->lp_softc) == NULL) + goto fallback; + break; + default: goto fallback; + } switch (cmd) { case SIOCGLAGGPORT: @@ -1130,6 +1225,29 @@ } static void +lagg_watchdog_infiniband(void *arg) +{ + struct lagg_softc *sc; + struct lagg_port *lp; + + sc = arg; + + /* + * Because infiniband nodes have a fixed mac address, we need + * to regularly update the link level address of the parent + * bond device instead. This operation does not have to be + * atomic. 
+ */ + LAGG_RLOCK(); + lp = lagg_link_active(sc, sc->sc_primary); + if (lp != NULL) + bcopy(IF_LLADDR(lp->lp_ifp), IF_LLADDR(sc->sc_ifp), lp->lp_ifp->if_addrlen); + LAGG_RUNLOCK(); + + callout_reset(&sc->sc_watchdog, hz, &lagg_watchdog_infiniband, arg); +} + +static void lagg_init(void *xsc) { struct lagg_softc *sc = (struct lagg_softc *)xsc; @@ -1151,12 +1269,18 @@ */ CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) { if (memcmp(IF_LLADDR(ifp), IF_LLADDR(lp->lp_ifp), - ETHER_ADDR_LEN) != 0) - if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ETHER_ADDR_LEN); + ifp->if_addrlen) != 0) + if_setlladdr(lp->lp_ifp, IF_LLADDR(ifp), ifp->if_addrlen); } lagg_proto_init(sc); + if (ifp->if_type == IFT_INFINIBAND) { + mtx_lock(&sc->sc_mtx); + lagg_watchdog_infiniband(sc); + mtx_unlock(&sc->sc_mtx); + } + LAGG_XUNLOCK(sc); } @@ -1173,6 +1297,12 @@ ifp->if_drv_flags &= ~IFF_DRV_RUNNING; lagg_proto_stop(sc); + + mtx_lock(&sc->sc_mtx); + callout_stop(&sc->sc_watchdog); + mtx_unlock(&sc->sc_mtx); + + callout_drain(&sc->sc_watchdog); } static int @@ -1228,7 +1358,12 @@ error = EPROTONOSUPPORT; break; } - + /* Infiniband only supports the failover protocol. 
*/ + if (ra->ra_proto != LAGG_PROTO_FAILOVER && + ifp->if_type == IFT_INFINIBAND) { + error = EPROTONOSUPPORT; + break; + } LAGG_XLOCK(sc); lagg_proto_detach(sc); LAGG_UNLOCK_ASSERT(); @@ -1546,7 +1681,10 @@ break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: - error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); + if (ifp->if_type == IFT_INFINIBAND) + error = EINVAL; + else + error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, cmd); break; case SIOCSIFCAP: @@ -1855,7 +1993,7 @@ } static int -lagg_transmit(struct ifnet *ifp, struct mbuf *m) +lagg_transmit_ethernet(struct ifnet *ifp, struct mbuf *m) { struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; int error; @@ -1880,6 +2018,32 @@ return (error); } +static int +lagg_transmit_infiniband(struct ifnet *ifp, struct mbuf *m) +{ + struct lagg_softc *sc = (struct lagg_softc *)ifp->if_softc; + int error; + +#if defined(KERN_TLS) || defined(RATELIMIT) + if (m->m_pkthdr.csum_flags & CSUM_SND_TAG) + MPASS(m->m_pkthdr.snd_tag->ifp == ifp); +#endif + LAGG_RLOCK(); + /* We need a Tx algorithm and at least one port */ + if (sc->sc_proto == LAGG_PROTO_NONE || sc->sc_count == 0) { + LAGG_RUNLOCK(); + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (ENXIO); + } + + INFINIBAND_BPF_MTAP(ifp, m); + + error = lagg_proto_start(sc, m); + LAGG_RUNLOCK(); + return (error); +} + /* * The ifp->if_qflush entry point for lagg(4) is no-op. 
*/ @@ -1889,7 +2053,7 @@ } static struct mbuf * -lagg_input(struct ifnet *ifp, struct mbuf *m) +lagg_input_ethernet(struct ifnet *ifp, struct mbuf *m) { struct lagg_port *lp = ifp->if_lagg; struct lagg_softc *sc = lp->lp_softc; @@ -1916,6 +2080,34 @@ return (m); } +static struct mbuf * +lagg_input_infiniband(struct ifnet *ifp, struct mbuf *m) +{ + struct lagg_port *lp = ifp->if_lagg; + struct lagg_softc *sc = lp->lp_softc; + struct ifnet *scifp = sc->sc_ifp; + + LAGG_RLOCK(); + if ((scifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || + lp->lp_detaching != 0 || + sc->sc_proto == LAGG_PROTO_NONE) { + LAGG_RUNLOCK(); + m_freem(m); + return (NULL); + } + + INFINIBAND_BPF_MTAP(scifp, m); + + m = lagg_proto_input(sc, lp, m); + if (m != NULL && (scifp->if_flags & IFF_MONITOR) != 0) { + m_freem(m); + m = NULL; + } + + LAGG_RUNLOCK(); + return (m); +} + static int lagg_media_change(struct ifnet *ifp) { @@ -2236,7 +2428,10 @@ LAGG_XLOCK_ASSERT(sc); lb = malloc(sizeof(struct lagg_lb), M_LAGG, M_WAITOK | M_ZERO); - lb->lb_key = m_ether_tcpip_hash_init(); + if (sc->sc_ifp->if_type == IFT_INFINIBAND) + lb->lb_key = m_infiniband_tcpip_hash_init(); + else + lb->lb_key = m_ether_tcpip_hash_init(); sc->sc_psc = lb; CK_SLIST_FOREACH(lp, &sc->sc_ports, lp_entries) @@ -2303,6 +2498,8 @@ if ((sc->sc_opts & LAGG_OPT_USE_FLOWID) && M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) p = m->m_pkthdr.flowid >> sc->flowid_shift; + else if (sc->sc_ifp->if_type == IFT_INFINIBAND) + p = m_infiniband_tcpip_hash(sc->sc_flags, m, lb->lb_key); else p = m_ether_tcpip_hash(sc->sc_flags, m, lb->lb_key); p %= sc->sc_count; Index: sys/net/if_types.h =================================================================== --- sys/net/if_types.h +++ sys/net/if_types.h @@ -242,6 +242,7 @@ IFT_OPTICALCHANNEL = 0xc3, /* Optical Channel */ IFT_OPTICALTRANSPORT = 0xc4, /* Optical Transport */ IFT_INFINIBAND = 0xc7, /* Infiniband */ + IFT_INFINIBANDLAG = 0xc8, /* Infiniband Link Aggregate */ IFT_BRIDGE = 0xd1, /* Transparent bridge 
interface */ IFT_STF = 0xd7, /* 6to4 interface */ Index: sys/net/infiniband.h =================================================================== --- sys/net/infiniband.h +++ sys/net/infiniband.h @@ -0,0 +1,80 @@ +/*- + * Copyright (c) 2020 Mellanox Technologies. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef __INFINIBAND_H__ +#define __INFINIBAND_H__ + +#include +#include + +#define INFINIBAND_ADDR_LEN 20 /* bytes */ +#define INFINIBAND_MTU 1500 /* bytes - default value */ + +#define INFINIBAND_ENC_LEN 4 /* bytes */ +#define INFINIBAND_HDR_LEN \ + (INFINIBAND_ADDR_LEN + INFINIBAND_ENC_LEN) + +#define INFINIBAND_IS_MULTICAST(addr) \ + ((addr)[4] == 0xff) + +#define INFINIBAND_BPF_MTAP(_ifp, _m) \ +do { \ + if (bpf_peers_present((_ifp)->if_bpf)) { \ + M_ASSERTVALID(_m); \ + infiniband_bpf_mtap(_ifp, _m); \ + } \ +} while (0) + +struct infiniband_header { + uint8_t ib_hwaddr[INFINIBAND_ADDR_LEN]; + uint16_t ib_protocol; /* big endian */ + uint16_t ib_reserved; /* zero */ +} __packed; + +struct infiniband_address { + uint8_t octet[INFINIBAND_ADDR_LEN]; +} __packed; + +#ifdef _KERNEL + +#include + +struct ifnet; +struct mbuf; + +extern void infiniband_ifattach(struct ifnet *, const uint8_t *hwaddr, const uint8_t *bcaddr); +extern void infiniband_ifdetach(struct ifnet *); +extern void infiniband_bpf_mtap(struct ifnet *, struct mbuf *); + +/* new infiniband interface attached event */ +typedef void (*infiniband_ifattach_event_handler_t)(void *, struct ifnet *); + +EVENTHANDLER_DECLARE(infiniband_ifattach_event, infiniband_ifattach_event_handler_t); + +#endif + +#endif /* __INFINIBAND_H__ */ Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h @@ -438,16 +438,7 @@ extern struct workqueue_struct *ipoib_workqueue; -#define IPOIB_MTAP_PROTO(_ifp, _m, _proto) \ -do { \ - if (bpf_peers_present((_ifp)->if_bpf)) { \ - M_ASSERTVALID(_m); \ - ipoib_mtap_proto((_ifp), (_m), (_proto)); \ - } \ -} while (0) - /* functions */ -void ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto); void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr); void 
ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr); @@ -463,8 +454,6 @@ int ipoib_add_pkey_attr(struct ipoib_dev_priv *priv); int ipoib_add_umcast_attr(struct ipoib_dev_priv *priv); -void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto); - void ipoib_send(struct ipoib_dev_priv *priv, struct mbuf *mb, struct ipoib_ah *address, u32 qpn); void ipoib_reap_ah(struct work_struct *work); @@ -540,7 +529,7 @@ void ipoib_dma_unmap_rx(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req); void ipoib_dma_mb(struct ipoib_dev_priv *priv, struct mbuf *mb, unsigned int length); -struct mbuf *ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, int size); +struct mbuf *ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, int align, int size); void ipoib_set_ethtool_ops(struct ifnet *dev); Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -153,7 +153,7 @@ ipoib_cm_alloc_rx_mb(struct ipoib_dev_priv *priv, struct ipoib_cm_rx_buf *rx_req) { return ipoib_alloc_map_mb(priv, (struct ipoib_rx_buf *)rx_req, - priv->cm.max_cm_mtu); + sizeof(struct ipoib_pseudoheader), priv->cm.max_cm_mtu); } static void ipoib_cm_free_rx_ring(struct ipoib_dev_priv *priv, @@ -484,10 +484,7 @@ struct mbuf *mb, *newmb; struct ipoib_cm_rx *p; int has_srq; - u_short proto; - CURVNET_SET_QUIET(dev->if_vnet); - ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n", wr_id, wc->status); @@ -561,16 +558,24 @@ ipoib_dma_mb(priv, mb, wc->byte_len); - if_inc_counter(dev, IFCOUNTER_IPACKETS, 1); - if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len); - mb->m_pkthdr.rcvif = dev; - proto = *mtod(mb, uint16_t *); - m_adj(mb, IPOIB_ENCAP_LEN); - IPOIB_MTAP_PROTO(dev, mb, proto); - ipoib_demux(dev, mb, ntohs(proto)); + M_PREPEND(mb, sizeof(struct ipoib_pseudoheader), 
M_NOWAIT); + if (likely(mb != NULL)) { + struct ipoib_header *ibh; + if_inc_counter(dev, IFCOUNTER_IPACKETS, 1); + if_inc_counter(dev, IFCOUNTER_IBYTES, mb->m_pkthdr.len); + + /* fixup destination infiniband address */ + ibh = mtod(mb, struct ipoib_header *); + memset(ibh->hwaddr, 0, 4); + memcpy(ibh->hwaddr + 4, priv->local_gid.raw, sizeof(union ib_gid)); + + dev->if_input(dev, mb); + } else { + if_inc_counter(dev, IFCOUNTER_IERRORS, 1); + } repost: if (has_srq) { if (unlikely(ipoib_cm_post_receive_srq(priv, wr_id))) @@ -587,7 +592,6 @@ } } done: - CURVNET_RESTORE(); return; } Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -112,17 +112,19 @@ struct mbuf * ipoib_alloc_map_mb(struct ipoib_dev_priv *priv, struct ipoib_rx_buf *rx_req, - int size) + int align, int size) { struct mbuf *mb, *m; int i, j; rx_req->mb = NULL; - mb = m_getm2(NULL, size, M_NOWAIT, MT_DATA, M_PKTHDR); + mb = m_getm2(NULL, align + size, M_NOWAIT, MT_DATA, M_PKTHDR); if (mb == NULL) return (NULL); for (i = 0, m = mb; m != NULL; m = m->m_next, i++) { - m->m_len = M_SIZE(m); + m->m_len = M_SIZE(m) - align; + m->m_data += align; + align = 0; mb->m_pkthdr.len += m->m_len; rx_req->mapping[i] = ib_dma_map_single(priv->ca, mtod(m, void *), m->m_len, DMA_FROM_DEVICE); @@ -174,7 +176,7 @@ { return ipoib_alloc_map_mb(priv, &priv->rx_ring[id], - priv->max_ib_mtu + IB_GRH_BYTES); + 0, priv->max_ib_mtu + IB_GRH_BYTES); } static int ipoib_ib_post_receives(struct ipoib_dev_priv *priv) Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -40,21 +40,16 @@ #include "ipoib.h" #include -static int ipoib_resolvemulti(struct ifnet *, struct sockaddr 
**, - struct sockaddr *); - - #include #include #include #include -#include /* For ARPHRD_xxx */ #include -#include -#include +#include + #include MODULE_AUTHOR("Roland Dreier"); @@ -98,19 +93,8 @@ const union ib_gid *gid, const struct sockaddr *addr, void *client_data); static void ipoib_start(struct ifnet *dev); -static int ipoib_output(struct ifnet *ifp, struct mbuf *m, - const struct sockaddr *dst, struct route *ro); static int ipoib_ioctl(struct ifnet *ifp, u_long command, caddr_t data); -static void ipoib_input(struct ifnet *ifp, struct mbuf *m); -#define IPOIB_MTAP(_ifp, _m) \ -do { \ - if (bpf_peers_present((_ifp)->if_bpf)) { \ - M_ASSERTVALID(_m); \ - ipoib_mtap_mb((_ifp), (_m)); \ - } \ -} while (0) - static struct unrhdr *ipoib_unrhdr; static void @@ -136,37 +120,6 @@ } SYSUNINIT(ipoib_unrhdr_uninit, SI_SUB_KLD - 1, SI_ORDER_ANY, ipoib_unrhdr_uninit, NULL); -/* - * This is for clients that have an ipoib_header in the mbuf. - */ -static void -ipoib_mtap_mb(struct ifnet *ifp, struct mbuf *mb) -{ - struct ipoib_header *ih; - struct ether_header eh; - - ih = mtod(mb, struct ipoib_header *); - eh.ether_type = ih->proto; - bcopy(ih->hwaddr, &eh.ether_dhost, ETHER_ADDR_LEN); - bzero(&eh.ether_shost, ETHER_ADDR_LEN); - mb->m_data += sizeof(struct ipoib_header); - mb->m_len -= sizeof(struct ipoib_header); - bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); - mb->m_data -= sizeof(struct ipoib_header); - mb->m_len += sizeof(struct ipoib_header); -} - -void -ipoib_mtap_proto(struct ifnet *ifp, struct mbuf *mb, uint16_t proto) -{ - struct ether_header eh; - - eh.ether_type = proto; - bzero(&eh.ether_shost, ETHER_ADDR_LEN); - bzero(&eh.ether_dhost, ETHER_ADDR_LEN); - bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); -} - static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, @@ -787,7 +740,7 @@ IFQ_DRV_DEQUEUE(&dev->if_snd, mb); if (mb == NULL) break; - IPOIB_MTAP(dev, mb); + INFINIBAND_BPF_MTAP(dev, mb); ipoib_send_one(priv, mb); } } @@ -875,8 +828,7 
@@ dev = priv->dev; if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { priv->gone = 1; - bpfdetach(dev); - if_detach(dev); + infiniband_ifdetach(dev); if_free(dev); free_unr(ipoib_unrhdr, priv->unit); } else @@ -935,7 +887,6 @@ ipoib_intf_alloc(const char *name) { struct ipoib_dev_priv *priv; - struct sockaddr_dl *sdl; struct ifnet *dev; priv = ipoib_priv_alloc(); @@ -953,24 +904,17 @@ } if_initname(dev, name, priv->unit); dev->if_flags = IFF_BROADCAST | IFF_MULTICAST; - dev->if_addrlen = INFINIBAND_ALEN; - dev->if_hdrlen = IPOIB_HEADER_LEN; - if_attach(dev); + + infiniband_ifattach(dev, NULL, priv->broadcastaddr); + dev->if_init = ipoib_init; dev->if_ioctl = ipoib_ioctl; dev->if_start = ipoib_start; - dev->if_output = ipoib_output; - dev->if_input = ipoib_input; - dev->if_resolvemulti = ipoib_resolvemulti; - dev->if_baudrate = IF_Gbps(10); - dev->if_broadcastaddr = priv->broadcastaddr; + dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2; - sdl = (struct sockaddr_dl *)dev->if_addr->ifa_addr; - sdl->sdl_type = IFT_INFINIBAND; - sdl->sdl_alen = dev->if_addrlen; + priv->dev = dev; if_link_state_change(dev, LINK_STATE_DOWN); - bpfattach(dev, DLT_EN10MB, ETHER_HDR_LEN); return dev->if_softc; } @@ -1165,7 +1109,6 @@ struct ifaddr *ifa; int retval = 0; - CURVNET_SET(dev->if_vnet); NET_EPOCH_ENTER(et); CK_STAILQ_FOREACH(ifa, &dev->if_addrhead, ifa_link) { if (ifa->ifa_addr == NULL || @@ -1179,7 +1122,6 @@ } } NET_EPOCH_EXIT(et); - CURVNET_RESTORE(); return (retval); } @@ -1475,284 +1417,6 @@ ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); } - -/* - * Infiniband output routine. 
- */ -static int -ipoib_output(struct ifnet *ifp, struct mbuf *m, - const struct sockaddr *dst, struct route *ro) -{ - u_char edst[INFINIBAND_ALEN]; -#if defined(INET) || defined(INET6) - struct llentry *lle = NULL; -#endif - struct ipoib_header *eh; - int error = 0, is_gw = 0; - short type; - - NET_EPOCH_ASSERT(); - - if (ro != NULL) - is_gw = (ro->ro_flags & RT_HAS_GW) != 0; -#ifdef MAC - error = mac_ifnet_check_transmit(ifp, m); - if (error) - goto bad; -#endif - - M_PROFILE(m); - if (ifp->if_flags & IFF_MONITOR) { - error = ENETDOWN; - goto bad; - } - if (!((ifp->if_flags & IFF_UP) && - (ifp->if_drv_flags & IFF_DRV_RUNNING))) { - error = ENETDOWN; - goto bad; - } - - switch (dst->sa_family) { -#ifdef INET - case AF_INET: - if (lle != NULL && (lle->la_flags & LLE_VALID)) - memcpy(edst, lle->ll_addr, sizeof(edst)); - else if (m->m_flags & M_MCAST) - ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst); - else - error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); - if (error) - return (error == EWOULDBLOCK ? 
0 : error); - type = htons(ETHERTYPE_IP); - break; - case AF_ARP: - { - struct arphdr *ah; - ah = mtod(m, struct arphdr *); - ah->ar_hrd = htons(ARPHRD_INFINIBAND); - - switch(ntohs(ah->ar_op)) { - case ARPOP_REVREQUEST: - case ARPOP_REVREPLY: - type = htons(ETHERTYPE_REVARP); - break; - case ARPOP_REQUEST: - case ARPOP_REPLY: - default: - type = htons(ETHERTYPE_ARP); - break; - } - - if (m->m_flags & M_BCAST) - bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN); - else - bcopy(ar_tha(ah), edst, INFINIBAND_ALEN); - - } - break; -#endif -#ifdef INET6 - case AF_INET6: - if (lle != NULL && (lle->la_flags & LLE_VALID)) - memcpy(edst, lle->ll_addr, sizeof(edst)); - else if (m->m_flags & M_MCAST) - ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst); - else - error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); - if (error) - return error; - type = htons(ETHERTYPE_IPV6); - break; -#endif - - default: - if_printf(ifp, "can't handle af%d\n", dst->sa_family); - error = EAFNOSUPPORT; - goto bad; - } - - /* - * Add local net header. If no space in first mbuf, - * allocate another. - */ - M_PREPEND(m, IPOIB_HEADER_LEN, M_NOWAIT); - if (m == NULL) { - error = ENOBUFS; - goto bad; - } - eh = mtod(m, struct ipoib_header *); - (void)memcpy(&eh->proto, &type, sizeof(eh->proto)); - (void)memcpy(&eh->hwaddr, edst, sizeof (edst)); - - /* - * Queue message on interface, update output statistics if - * successful, and start output if interface not yet active. - */ - return ((ifp->if_transmit)(ifp, m)); -bad: - if (m != NULL) - m_freem(m); - return (error); -} - -/* - * Upper layer processing for a received Infiniband packet. - */ -void -ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto) -{ - struct epoch_tracker et; - int isr; - -#ifdef MAC - /* - * Tag the mbuf with an appropriate MAC label before any other - * consumers can get to it. 
- */ - mac_ifnet_create_mbuf(ifp, m); -#endif - /* Allow monitor mode to claim this frame, after stats are updated. */ - if (ifp->if_flags & IFF_MONITOR) { - if_printf(ifp, "discard frame at IFF_MONITOR\n"); - m_freem(m); - return; - } - /* - * Dispatch frame to upper layer. - */ - switch (proto) { -#ifdef INET - case ETHERTYPE_IP: - isr = NETISR_IP; - break; - - case ETHERTYPE_ARP: - if (ifp->if_flags & IFF_NOARP) { - /* Discard packet if ARP is disabled on interface */ - m_freem(m); - return; - } - isr = NETISR_ARP; - break; -#endif -#ifdef INET6 - case ETHERTYPE_IPV6: - isr = NETISR_IPV6; - break; -#endif - default: - goto discard; - } - NET_EPOCH_ENTER(et); - netisr_dispatch(isr, m); - NET_EPOCH_EXIT(et); - return; - -discard: - m_freem(m); -} - -/* - * Process a received Infiniband packet. - */ -static void -ipoib_input(struct ifnet *ifp, struct mbuf *m) -{ - struct ipoib_header *eh; - - if ((ifp->if_flags & IFF_UP) == 0) { - m_freem(m); - return; - } - CURVNET_SET_QUIET(ifp->if_vnet); - - /* Let BPF have it before we strip the header. */ - IPOIB_MTAP(ifp, m); - eh = mtod(m, struct ipoib_header *); - /* - * Reset layer specific mbuf flags to avoid confusing upper layers. - * Strip off Infiniband header. - */ - m->m_flags &= ~M_VLANTAG; - m_clrprotoflags(m); - m_adj(m, IPOIB_HEADER_LEN); - - if (IPOIB_IS_MULTICAST(eh->hwaddr)) { - if (memcmp(eh->hwaddr, ifp->if_broadcastaddr, - ifp->if_addrlen) == 0) - m->m_flags |= M_BCAST; - else - m->m_flags |= M_MCAST; - if_inc_counter(ifp, IFCOUNTER_IMCASTS, 1); - } - - ipoib_demux(ifp, m, ntohs(eh->proto)); - CURVNET_RESTORE(); -} - -static int -ipoib_resolvemulti(struct ifnet *ifp, struct sockaddr **llsa, - struct sockaddr *sa) -{ - struct sockaddr_dl *sdl; -#ifdef INET - struct sockaddr_in *sin; -#endif -#ifdef INET6 - struct sockaddr_in6 *sin6; -#endif - u_char *e_addr; - - switch(sa->sa_family) { - case AF_LINK: - /* - * No mapping needed. Just check that it's a valid MC address. 
- */ - sdl = (struct sockaddr_dl *)sa; - e_addr = LLADDR(sdl); - if (!IPOIB_IS_MULTICAST(e_addr)) - return EADDRNOTAVAIL; - *llsa = NULL; - return 0; - -#ifdef INET - case AF_INET: - sin = (struct sockaddr_in *)sa; - if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) - return EADDRNOTAVAIL; - sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); - sdl->sdl_alen = INFINIBAND_ALEN; - e_addr = LLADDR(sdl); - ip_ib_mc_map(sin->sin_addr.s_addr, ifp->if_broadcastaddr, - e_addr); - *llsa = (struct sockaddr *)sdl; - return 0; -#endif -#ifdef INET6 - case AF_INET6: - sin6 = (struct sockaddr_in6 *)sa; - /* - * An IP6 address of 0 means listen to all - * of the multicast address used for IP6. - * This has no meaning in ipoib. - */ - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) - return EADDRNOTAVAIL; - if (!IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) - return EADDRNOTAVAIL; - sdl = link_init_sdl(ifp, *llsa, IFT_INFINIBAND); - sdl->sdl_alen = INFINIBAND_ALEN; - e_addr = LLADDR(sdl); - ipv6_ib_mc_map(&sin6->sin6_addr, ifp->if_broadcastaddr, e_addr); - *llsa = (struct sockaddr *)sdl; - return 0; -#endif - - default: - return EAFNOSUPPORT; - } -} - module_init_order(ipoib_init_module, SI_ORDER_FIFTH); module_exit_order(ipoib_cleanup_module, SI_ORDER_FIFTH); @@ -1769,4 +1433,5 @@ DECLARE_MODULE(ipoib, ipoib_mod, SI_SUB_LAST, SI_ORDER_ANY); MODULE_DEPEND(ipoib, ibcore, 1, 1, 1); +MODULE_DEPEND(ipoib, if_infiniband, 1, 1, 1); MODULE_DEPEND(ipoib, linuxkpi, 1, 1, 1); Index: sys/sys/mbuf.h =================================================================== --- sys/sys/mbuf.h +++ sys/sys/mbuf.h @@ -1455,7 +1455,7 @@ ((_m)->m_pkthdr.fibnum) = (_fib); \ } while (0) -/* flags passed as first argument for "m_ether_tcpip_hash()" */ +/* flags passed as first argument for "m_xxx_tcpip_hash()" */ #define MBUF_HASHFLAG_L2 (1 << 2) #define MBUF_HASHFLAG_L3 (1 << 3) #define MBUF_HASHFLAG_L4 (1 << 4) @@ -1463,6 +1463,8 @@ /* mbuf hashing helper routines */ uint32_t m_ether_tcpip_hash_init(void); uint32_t 
m_ether_tcpip_hash(const uint32_t, const struct mbuf *, const uint32_t); +uint32_t m_infiniband_tcpip_hash_init(void); +uint32_t m_infiniband_tcpip_hash(const uint32_t, const struct mbuf *, const uint32_t); #ifdef MBUF_PROFILING void m_profile(struct mbuf *m);