Changeset View
Standalone View
sys/net/if_infiniband.c
Show First 20 Lines • Show All 137 Lines • ▼ Show 20 Lines | infiniband_bpf_mtap(struct ifnet *ifp, struct mbuf *mb) | ||||
mb->m_len -= sizeof(*ibh); | mb->m_len -= sizeof(*ibh); | ||||
mb->m_pkthdr.len -= sizeof(*ibh); | mb->m_pkthdr.len -= sizeof(*ibh); | ||||
bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); | bpf_mtap2(ifp->if_bpf, &eh, sizeof(eh), mb); | ||||
mb->m_data -= sizeof(*ibh); | mb->m_data -= sizeof(*ibh); | ||||
mb->m_len += sizeof(*ibh); | mb->m_len += sizeof(*ibh); | ||||
mb->m_pkthdr.len += sizeof(*ibh); | mb->m_pkthdr.len += sizeof(*ibh); | ||||
} | } | ||||
/*
 * Derive checksum status flags for dst from the checksum flags set on src.
 *
 * Mapping applied (each only if the left-hand flag is set on src):
 *   CSUM_IP         -> CSUM_IP_CHECKED | CSUM_IP_VALID
 *   CSUM_DELAY_DATA -> CSUM_DATA_VALID | CSUM_PSEUDO_HDR
 *   CSUM_SCTP       -> CSUM_SCTP_VALID
 *
 * The resulting bits are OR-ed into dst's csum_flags; existing bits on dst
 * are left in place.  infiniband_output() calls this with src == dst right
 * before passing the mbuf to if_simloop().
 * NOTE(review): presumably this keeps a looped-back packet from being
 * dropped for a checksum that was never computed -- confirm.
 */
static void | |||||
update_mbuf_csumflags(struct mbuf *src, struct mbuf *dst) | |||||
{ | |||||
int csum_flags = 0; | |||||
if (src->m_pkthdr.csum_flags & CSUM_IP) | |||||
csum_flags |= (CSUM_IP_CHECKED|CSUM_IP_VALID); | |||||
if (src->m_pkthdr.csum_flags & CSUM_DELAY_DATA) | |||||
csum_flags |= (CSUM_DATA_VALID|CSUM_PSEUDO_HDR); | |||||
if (src->m_pkthdr.csum_flags & CSUM_SCTP) | |||||
csum_flags |= CSUM_SCTP_VALID; | |||||
/* Merge the derived bits into dst without clearing anything. */
dst->m_pkthdr.csum_flags |= csum_flags; | |||||
/* When the data checksum is marked valid, csum_data is set to 0xffff. */
if (csum_flags & CSUM_DATA_VALID) | |||||
dst->m_pkthdr.csum_data = 0xffff; | |||||
} | |||||
/* | /* | ||||
* Infiniband output routine. | * Handle link-layer encapsulation requests. | ||||
*/ | */ | ||||
static int | static int | ||||
infiniband_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst, | infiniband_requestencap(struct ifnet *ifp, struct if_encap_req *req) | ||||
struct route *ro) | |||||
{ | { | ||||
uint8_t edst[INFINIBAND_ADDR_LEN]; | struct infiniband_header *ih; | ||||
#if defined(INET) || defined(INET6) | struct arphdr *ah; | ||||
struct llentry *lle = NULL; | uint16_t etype; | ||||
#endif | const uint8_t *lladdr; | ||||
struct infiniband_header *ibh; | |||||
int error = 0; | |||||
uint16_t type; | |||||
bool is_gw; | |||||
NET_EPOCH_ASSERT(); | if (req->rtype != IFENCAP_LL) | ||||
return (EOPNOTSUPP); | |||||
is_gw = ((ro != NULL) && (ro->ro_flags & RT_HAS_GW) != 0); | if (req->bufsize < INFINIBAND_HDR_LEN) | ||||
return (ENOMEM); | |||||
#ifdef MAC | ih = (struct infiniband_header *)req->buf; | ||||
error = mac_ifnet_check_transmit(ifp, m); | lladdr = req->lladdr; | ||||
if (error) | req->lladdr_off = 0; | ||||
goto bad; | |||||
#endif | |||||
M_PROFILE(m); | switch (req->family) { | ||||
if (ifp->if_flags & IFF_MONITOR) { | |||||
error = ENETDOWN; | |||||
goto bad; | |||||
} | |||||
if (!((ifp->if_flags & IFF_UP) && | |||||
(ifp->if_drv_flags & IFF_DRV_RUNNING))) { | |||||
error = ENETDOWN; | |||||
goto bad; | |||||
} | |||||
switch (dst->sa_family) { | |||||
case AF_LINK: | |||||
goto output; | |||||
#ifdef INET | |||||
case AF_INET: | case AF_INET: | ||||
if (lle != NULL && (lle->la_flags & LLE_VALID)) { | etype = htons(ETHERTYPE_IP); | ||||
melifaro: Sorry, side note: would it be possible to move arp/nd encap logic to if_requestencap callback… | |||||
Done Inline ActionsYes. Weren't you supposed to do that? ;-) hselasky: Yes. Weren't you supposed to do that? ;-) | |||||
Not Done Inline Actions
Well, I guess we misread our dialogue in D26254 :-( Also, unfortunately I don't have IB-capable HW handy. melifaro: > Yes. Weren't you supposed to do that? ;-)
Well, I guess we misread our dialogue in D26254… | |||||
memcpy(edst, lle->ll_addr, sizeof(edst)); | |||||
} else if (m->m_flags & M_MCAST) { | |||||
infiniband_ipv4_multicast_map( | |||||
((const struct sockaddr_in *)dst)->sin_addr.s_addr, | |||||
ifp->if_broadcastaddr, edst); | |||||
} else { | |||||
error = arpresolve(ifp, is_gw, m, dst, edst, NULL, NULL); | |||||
if (error) { | |||||
if (error == EWOULDBLOCK) | |||||
error = 0; | |||||
m = NULL; /* mbuf is consumed by resolver */ | |||||
goto bad; | |||||
} | |||||
} | |||||
type = htons(ETHERTYPE_IP); | |||||
break; | break; | ||||
case AF_ARP: { | case AF_INET6: | ||||
Done Inline Actions@bz: Have a look here, this is what we do for IPv4. I'll have a closer look at it. hselasky: @bz: Have a look here, this is what we do for IPv4.
I'll have a closer look at it. | |||||
struct arphdr *ah; | etype = htons(ETHERTYPE_IPV6); | ||||
break; | |||||
if (m->m_len < sizeof(*ah)) { | case AF_ARP: | ||||
error = EINVAL; | ah = (struct arphdr *)req->hdata; | ||||
goto bad; | |||||
} | |||||
ah = mtod(m, struct arphdr *); | |||||
if (m->m_len < arphdr_len(ah)) { | |||||
error = EINVAL; | |||||
goto bad; | |||||
} | |||||
ah->ar_hrd = htons(ARPHRD_INFINIBAND); | ah->ar_hrd = htons(ARPHRD_INFINIBAND); | ||||
switch (ntohs(ah->ar_op)) { | switch (ntohs(ah->ar_op)) { | ||||
case ARPOP_REVREQUEST: | case ARPOP_REVREQUEST: | ||||
case ARPOP_REVREPLY: | case ARPOP_REVREPLY: | ||||
type = htons(ETHERTYPE_REVARP); | etype = htons(ETHERTYPE_REVARP); | ||||
break; | break; | ||||
case ARPOP_REQUEST: | case ARPOP_REQUEST: | ||||
case ARPOP_REPLY: | case ARPOP_REPLY: | ||||
default: | default: | ||||
type = htons(ETHERTYPE_ARP); | etype = htons(ETHERTYPE_ARP); | ||||
break; | break; | ||||
} | } | ||||
if (req->flags & IFENCAP_FLAG_BROADCAST) | |||||
lladdr = ifp->if_broadcastaddr; | |||||
break; | |||||
default: | |||||
return (EAFNOSUPPORT); | |||||
} | |||||
ih->ib_protocol = etype; | |||||
ih->ib_reserved = 0; | |||||
memcpy(ih->ib_hwaddr, lladdr, INFINIBAND_ADDR_LEN); | |||||
req->bufsize = sizeof(struct infiniband_header); | |||||
return (0); | |||||
} | |||||
static int | |||||
infiniband_resolve_addr(struct ifnet *ifp, struct mbuf *m, | |||||
const struct sockaddr *dst, struct route *ro, uint8_t *phdr, | |||||
uint32_t *pflags, struct llentry **plle) | |||||
{ | |||||
struct infiniband_header *ih; | |||||
uint32_t lleflags = 0; | |||||
int error = 0; | |||||
if (plle) | |||||
*plle = NULL; | |||||
ih = (struct infiniband_header *)phdr; | |||||
switch (dst->sa_family) { | |||||
#ifdef INET | |||||
case AF_INET: | |||||
if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) { | |||||
error = arpresolve(ifp, 0, m, dst, phdr, &lleflags, plle); | |||||
} else { | |||||
if (m->m_flags & M_BCAST) { | if (m->m_flags & M_BCAST) { | ||||
memcpy(edst, ifp->if_broadcastaddr, INFINIBAND_ADDR_LEN); | memcpy(ih->ib_hwaddr, ifp->if_broadcastaddr, | ||||
INFINIBAND_ADDR_LEN); | |||||
} else { | } else { | ||||
if (ah->ar_hln != INFINIBAND_ADDR_LEN) { | infiniband_ipv4_multicast_map( | ||||
error = EINVAL; | ((const struct sockaddr_in *)dst)->sin_addr.s_addr, | ||||
goto bad; | ifp->if_broadcastaddr, ih->ib_hwaddr); | ||||
} | } | ||||
memcpy(edst, ar_tha(ah), INFINIBAND_ADDR_LEN); | ih->ib_protocol = htons(ETHERTYPE_IP); | ||||
ih->ib_reserved = 0; | |||||
} | } | ||||
break; | break; | ||||
} | |||||
#endif | #endif | ||||
#ifdef INET6 | #ifdef INET6 | ||||
case AF_INET6: { | case AF_INET6: | ||||
Done Inline ActionsGiven there is no BCAST in IPv6, I wonder what this means? bz: Given there is no BCAST in IPv6, I wonder what this means? | |||||
Done Inline ActionsI just wanted the cases to be symmetric, but as you say there is no BCAST in IPv6, so now the packet is simply dropped. hselasky: I just wanted the cases to be symmetric, but as you say there is no BCAST in IPv6, so now the… | |||||
const struct ip6_hdr *ip6; | if ((m->m_flags & M_MCAST) == 0) { | ||||
Done Inline ActionsThis seems weird as well to me; I would assume that non-directed ICMPv6 traffic has M_MCAST set, so the clause below would handle that? Why is ICMPv6 special at all compared to TCP or UDP? While link-layer address resolution works at the ICMPv6 level, it is not a separate protocol like ARP for IPv4; it sits above IPv6. bz: This seems weird as well to me; I would assume that non-directed ICMPv6 traffic has M_MCAST… | |||||
Done Inline ActionsIt might be the answer is that multicast handling in IPoIB is broken. I need to investigate this. hselasky: It might be the answer is that multicast handling in IPoIB is broken. I need to investigate… | |||||
error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags, plle); | |||||
ip6 = mtod(m, const struct ip6_hdr *); | } else { | ||||
if (m->m_len < sizeof(*ip6)) { | |||||
error = EINVAL; | |||||
goto bad; | |||||
} else if (lle != NULL && (lle->la_flags & LLE_VALID)) { | |||||
memcpy(edst, lle->ll_addr, sizeof(edst)); | |||||
} else if (m->m_flags & M_MCAST) { | |||||
infiniband_ipv6_multicast_map( | infiniband_ipv6_multicast_map( | ||||
&((const struct sockaddr_in6 *)dst)->sin6_addr, | &((const struct sockaddr_in6 *)dst)->sin6_addr, | ||||
ifp->if_broadcastaddr, edst); | ifp->if_broadcastaddr, ih->ib_hwaddr); | ||||
} else if (ip6->ip6_nxt == IPPROTO_ICMPV6) { | ih->ib_protocol = htons(ETHERTYPE_IPV6); | ||||
memcpy(edst, ifp->if_broadcastaddr, INFINIBAND_ADDR_LEN); | ih->ib_reserved = 0; | ||||
} else { | |||||
error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL, NULL); | |||||
if (error) { | |||||
if (error == EWOULDBLOCK) | |||||
error = 0; | |||||
m = NULL; /* mbuf is consumed by resolver */ | |||||
goto bad; | |||||
} | } | ||||
} | |||||
type = htons(ETHERTYPE_IPV6); | |||||
break; | break; | ||||
} | |||||
#endif | #endif | ||||
default: | default: | ||||
error = EAFNOSUPPORT; | if_printf(ifp, "can't handle af%d\n", dst->sa_family); | ||||
if (m != NULL) | |||||
m_freem(m); | |||||
return (EAFNOSUPPORT); | |||||
} | |||||
if (error == EHOSTDOWN) { | |||||
if (ro != NULL && (ro->ro_flags & RT_HAS_GW) != 0) | |||||
error = EHOSTUNREACH; | |||||
} | |||||
if (error != 0) | |||||
return (error); | |||||
*pflags = RT_MAY_LOOP; | |||||
if (lleflags & LLE_IFADDR) | |||||
*pflags |= RT_L2_ME; | |||||
return (0); | |||||
} | |||||
/* | |||||
* Infiniband output routine. | |||||
*/ | |||||
static int | |||||
infiniband_output(struct ifnet *ifp, struct mbuf *m, | |||||
const struct sockaddr *dst, struct route *ro) | |||||
{ | |||||
uint8_t linkhdr[INFINIBAND_HDR_LEN]; | |||||
uint8_t *phdr; | |||||
#if defined(INET) || defined(INET6) | |||||
struct llentry *lle = NULL; | |||||
#endif | |||||
struct infiniband_header *ih; | |||||
int error = 0; | |||||
int hlen; /* link layer header length */ | |||||
uint32_t pflags; | |||||
bool addref; | |||||
NET_EPOCH_ASSERT(); | |||||
addref = false; | |||||
phdr = NULL; | |||||
pflags = 0; | |||||
if (ro != NULL) { | |||||
/* XXX BPF uses ro_prepend */ | |||||
if (ro->ro_prepend != NULL) { | |||||
phdr = ro->ro_prepend; | |||||
hlen = ro->ro_plen; | |||||
} else if (!(m->m_flags & (M_BCAST | M_MCAST))) { | |||||
if ((ro->ro_flags & RT_LLE_CACHE) != 0) { | |||||
lle = ro->ro_lle; | |||||
if (lle != NULL && | |||||
(lle->la_flags & LLE_VALID) == 0) { | |||||
LLE_FREE(lle); | |||||
lle = NULL; /* redundant */ | |||||
ro->ro_lle = NULL; | |||||
} | |||||
if (lle == NULL) { | |||||
/* if we lookup, keep cache */ | |||||
addref = 1; | |||||
} else | |||||
/* | |||||
* Notify LLE code that | |||||
* the entry was used | |||||
* by datapath. | |||||
*/ | |||||
llentry_mark_used(lle); | |||||
} | |||||
if (lle != NULL) { | |||||
phdr = lle->r_linkdata; | |||||
hlen = lle->r_hdrlen; | |||||
pflags = lle->r_flags; | |||||
} | |||||
} | |||||
} | |||||
#ifdef MAC | |||||
error = mac_ifnet_check_transmit(ifp, m); | |||||
if (error) | |||||
goto bad; | goto bad; | ||||
#endif | |||||
M_PROFILE(m); | |||||
if (ifp->if_flags & IFF_MONITOR) { | |||||
error = ENETDOWN; | |||||
goto bad; | |||||
} | } | ||||
if (!((ifp->if_flags & IFF_UP) && | |||||
(ifp->if_drv_flags & IFF_DRV_RUNNING))) { | |||||
error = ENETDOWN; | |||||
goto bad; | |||||
} | |||||
if (phdr == NULL) { | |||||
/* No prepend data supplied. Try to calculate ourselves. */ | |||||
phdr = linkhdr; | |||||
hlen = INFINIBAND_HDR_LEN; | |||||
error = infiniband_resolve_addr(ifp, m, dst, ro, phdr, &pflags, | |||||
addref ? &lle : NULL); | |||||
if (addref && lle != NULL) | |||||
ro->ro_lle = lle; | |||||
if (error != 0) | |||||
return (error == EWOULDBLOCK ? 0 : error); | |||||
} | |||||
if ((pflags & RT_L2_ME) != 0) { | |||||
update_mbuf_csumflags(m, m); | |||||
return (if_simloop(ifp, m, dst->sa_family, 0)); | |||||
} | |||||
/* | /* | ||||
* Add local net header. If no space in first mbuf, | * Add local infiniband header. If no space in first mbuf, | ||||
* allocate another. | * allocate another. | ||||
*/ | */ | ||||
M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT); | M_PREPEND(m, INFINIBAND_HDR_LEN, M_NOWAIT); | ||||
if (m == NULL) { | if (m == NULL) { | ||||
error = ENOBUFS; | error = ENOBUFS; | ||||
goto bad; | goto bad; | ||||
} | } | ||||
ibh = mtod(m, struct infiniband_header *); | if ((pflags & RT_HAS_HEADER) == 0) { | ||||
ih = mtod(m, struct infiniband_header *); | |||||
memcpy(ih, phdr, hlen); | |||||
} | |||||
ibh->ib_protocol = type; | |||||
memcpy(ibh->ib_hwaddr, edst, sizeof(edst)); | |||||
/* | /* | ||||
* Queue message on interface, update output statistics if | * Queue message on interface, update output statistics if | ||||
* successful, and start output if interface not yet active. | * successful, and start output if interface not yet active. | ||||
*/ | */ | ||||
output: | |||||
return (ifp->if_transmit(ifp, m)); | return (ifp->if_transmit(ifp, m)); | ||||
bad: | bad: | ||||
if (m != NULL) | if (m != NULL) | ||||
m_freem(m); | m_freem(m); | ||||
return (error); | return (error); | ||||
} | } | ||||
/* | /* | ||||
▲ Show 20 Lines • Show All 173 Lines • ▼ Show 20 Lines | infiniband_ifattach(struct ifnet *ifp, const uint8_t *lla, const uint8_t *llb) | ||||
ifp->if_addrlen = INFINIBAND_ADDR_LEN; | ifp->if_addrlen = INFINIBAND_ADDR_LEN; | ||||
ifp->if_hdrlen = INFINIBAND_HDR_LEN; | ifp->if_hdrlen = INFINIBAND_HDR_LEN; | ||||
ifp->if_mtu = INFINIBAND_MTU; | ifp->if_mtu = INFINIBAND_MTU; | ||||
if_attach(ifp); | if_attach(ifp); | ||||
ifp->if_output = infiniband_output; | ifp->if_output = infiniband_output; | ||||
ifp->if_input = infiniband_input; | ifp->if_input = infiniband_input; | ||||
ifp->if_resolvemulti = infiniband_resolvemulti; | ifp->if_resolvemulti = infiniband_resolvemulti; | ||||
ifp->if_requestencap = infiniband_requestencap; | |||||
if (ifp->if_baudrate == 0) | if (ifp->if_baudrate == 0) | ||||
ifp->if_baudrate = IF_Gbps(10); /* default value */ | ifp->if_baudrate = IF_Gbps(10); /* default value */ | ||||
if (llb != NULL) | if (llb != NULL) | ||||
ifp->if_broadcastaddr = llb; | ifp->if_broadcastaddr = llb; | ||||
ifa = ifp->if_addr; | ifa = ifp->if_addr; | ||||
KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); | KASSERT(ifa != NULL, ("%s: no lladdr!\n", __func__)); | ||||
▲ Show 20 Lines • Show All 59 Lines • Show Last 20 Lines |
Sorry, side note: would it be possible to move the arp/nd encap logic to the if_requestencap callback, so that it is consistent with ether_output() and the datapath code is simpler?