Index: sys/dev/cxgb/ulp/tom/cxgb_l2t.c =================================================================== --- sys/dev/cxgb/ulp/tom/cxgb_l2t.c +++ sys/dev/cxgb/ulp/tom/cxgb_l2t.c @@ -215,7 +215,7 @@ struct tom_data *td = sc->tom_softc; struct toedev *tod = &td->tod; struct sockaddr_in sin = {0}; - uint8_t dmac[ETHER_ADDR_LEN]; + uint8_t dmac[ETHER_HDR_LEN]; uint16_t vtag = EVL_VLID_MASK; int rc; Index: sys/dev/cxgbe/tom/t4_tom_l2t.c =================================================================== --- sys/dev/cxgbe/tom/t4_tom_l2t.c +++ sys/dev/cxgbe/tom/t4_tom_l2t.c @@ -233,7 +233,7 @@ struct sockaddr_in sin = {0}; struct sockaddr_in6 sin6 = {0}; struct sockaddr *sa; - uint8_t dmac[ETHER_ADDR_LEN]; + uint8_t dmac[ETHER_HDR_LEN]; uint16_t vtag = VLAN_NONE; int rc; Index: sys/net/bpf.c =================================================================== --- sys/net/bpf.c +++ sys/net/bpf.c @@ -69,6 +69,7 @@ #include #include +#include #include #include #ifdef BPF_JITTER @@ -76,6 +77,7 @@ #endif #include #include +#include #include #include @@ -164,7 +166,7 @@ static void bpf_detachd_locked(struct bpf_d *); static void bpf_freed(struct bpf_d *); static int bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **, - struct sockaddr *, int *, struct bpf_insn *); + struct sockaddr *, int *, struct bpf_d *); static int bpf_setif(struct bpf_d *, struct ifreq *); static void bpf_timed_out(void *); static __inline void @@ -454,7 +456,7 @@ */ static int bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp, - struct sockaddr *sockp, int *hdrlen, struct bpf_insn *wfilter) + struct sockaddr *sockp, int *hdrlen, struct bpf_d *d) { const struct ieee80211_bpf_params *p; struct ether_header *eh; @@ -549,7 +551,7 @@ if (error) goto bad; - slen = bpf_filter(wfilter, mtod(m, u_char *), len, len); + slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len); if (slen == 0) { error = EPERM; goto bad; @@ -566,6 +568,10 @@ else m->m_flags |= M_MCAST; } + if 
(d->bd_hdrcmplt == 0) { + memcpy(eh->ether_shost, IF_LLADDR(ifp), + sizeof(eh->ether_shost)); + } break; } @@ -1088,6 +1094,7 @@ struct ifnet *ifp; struct mbuf *m, *mc; struct sockaddr dst; + struct route ro; int error, hlen; error = devfs_get_cdevpriv((void **)&d); @@ -1119,7 +1126,7 @@ hlen = 0; /* XXX: bpf_movein() can sleep */ error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp, - &m, &dst, &hlen, d->bd_wfilter); + &m, &dst, &hlen, d); if (error) { d->bd_wdcount++; return (error); @@ -1151,7 +1158,14 @@ BPFD_UNLOCK(d); #endif - error = (*ifp->if_output)(ifp, m, &dst, NULL); + bzero(&ro, sizeof(ro)); + if (hlen != 0) { + ro.ro_prepend = (u_char *)&dst.sa_data; + ro.ro_plen = hlen; + ro.ro_flags = RT_HAS_HEADER; + } + + error = (*ifp->if_output)(ifp, m, &dst, &ro); if (error) d->bd_wdcount++; Index: sys/net/ethernet.h =================================================================== --- sys/net/ethernet.h +++ sys/net/ethernet.h @@ -387,6 +387,7 @@ struct route; struct sockaddr; struct bpf_if; +struct if_encap_req; extern uint32_t ether_crc32_le(const uint8_t *, size_t); extern uint32_t ether_crc32_be(const uint8_t *, size_t); @@ -397,6 +398,7 @@ extern int ether_output(struct ifnet *, struct mbuf *, const struct sockaddr *, struct route *); extern int ether_output_frame(struct ifnet *, struct mbuf *); +extern int ether_requestencap(struct ifnet *, struct if_encap_req *); extern char *ether_sprintf(const u_int8_t *); void ether_vlan_mtap(struct bpf_if *, struct mbuf *, void *, u_int); Index: sys/net/flowtable.c =================================================================== --- sys/net/flowtable.c +++ sys/net/flowtable.c @@ -665,6 +665,7 @@ flowtable_lookup(sa_family_t sa, struct mbuf *m, struct route *ro) { struct flentry *fle; + struct llentry *lle; if (V_flowtable_enable == 0) return (ENXIO); @@ -693,8 +694,14 @@ } ro->ro_rt = fle->f_rt; - ro->ro_lle = fle->f_lle; ro->ro_flags |= RT_NORTREF; + lle = fle->f_lle; + if (lle != NULL && (lle->la_flags & 
LLE_VALID)) { + ro->ro_prepend = lle->r_linkdata; + ro->ro_plen = lle->r_hdrlen; + ro->ro_flags |= RT_MAY_LOOP; + ro->ro_flags |= (!!(lle->la_flags & LLE_IFADDR)) << RT_L2_ME_BIT; + } return (0); } Index: sys/net/if.c =================================================================== --- sys/net/if.c +++ sys/net/if.c @@ -669,6 +669,9 @@ if (ifp->if_input == NULL) ifp->if_input = if_input_default; + if (ifp->if_requestencap == NULL) + ifp->if_requestencap = if_requestencap_default; + if (!vmove) { #ifdef MAC mac_ifnet_create(ifp); Index: sys/net/if_ethersubr.c =================================================================== --- sys/net/if_ethersubr.c +++ sys/net/if_ethersubr.c @@ -136,138 +136,196 @@ } /* - * Ethernet output routine. - * Encapsulate a packet of type family for the local net. - * Use trailer local net encapsulation if enough data in first - * packet leaves a multiple of 512 bytes of data in remainder. + * Handle link-layer encapsulation requests. */ int -ether_output(struct ifnet *ifp, struct mbuf *m, - const struct sockaddr *dst, struct route *ro) +ether_requestencap(struct ifnet *ifp, struct if_encap_req *req) { - short type; - int error = 0, hdrcmplt = 0; - u_char edst[ETHER_ADDR_LEN]; - struct llentry *lle = NULL; - struct rtentry *rt0 = NULL; struct ether_header *eh; - struct pf_mtag *t; - int loop_copy = 1; - int hlen; /* link layer header length */ - int is_gw = 0; - uint32_t pflags = 0; + struct arphdr *ah; + uint16_t etype; + const u_char *lladdr; - if (ro != NULL) { - if (!(m->m_flags & (M_BCAST | M_MCAST))) { - lle = ro->ro_lle; - if (lle != NULL) - pflags = lle->la_flags; - } - rt0 = ro->ro_rt; - if (rt0 != NULL && (rt0->rt_flags & RTF_GATEWAY) != 0) - is_gw = 1; - } -#ifdef MAC - error = mac_ifnet_check_transmit(ifp, m); - if (error) - senderr(error); -#endif + if (req->rtype != IFENCAP_LL) + return (EOPNOTSUPP); - M_PROFILE(m); - if (ifp->if_flags & IFF_MONITOR) - senderr(ENETDOWN); - if (!((ifp->if_flags & IFF_UP) && - 
(ifp->if_drv_flags & IFF_DRV_RUNNING))) - senderr(ENETDOWN); + if (req->bufsize < ETHER_HDR_LEN) + return (ENOMEM); - hlen = ETHER_HDR_LEN; - switch (dst->sa_family) { -#ifdef INET + eh = (struct ether_header *)req->buf; + lladdr = req->lladdr; + req->lladdr_off = 0; + + switch (req->family) { case AF_INET: - if (lle != NULL && (pflags & LLE_VALID) != 0) - memcpy(edst, &lle->ll_addr.mac16, sizeof(edst)); - else - error = arpresolve(ifp, is_gw, m, dst, edst, &pflags); - if (error) - return (error == EWOULDBLOCK ? 0 : error); - type = htons(ETHERTYPE_IP); + etype = htons(ETHERTYPE_IP); + break; + case AF_INET6: + etype = htons(ETHERTYPE_IPV6); break; case AF_ARP: - { - struct arphdr *ah; - ah = mtod(m, struct arphdr *); + ah = (struct arphdr *)req->hdata; ah->ar_hrd = htons(ARPHRD_ETHER); - loop_copy = 0; /* if this is for us, don't do it */ - switch(ntohs(ah->ar_op)) { case ARPOP_REVREQUEST: case ARPOP_REVREPLY: - type = htons(ETHERTYPE_REVARP); + etype = htons(ETHERTYPE_REVARP); break; case ARPOP_REQUEST: case ARPOP_REPLY: default: - type = htons(ETHERTYPE_ARP); + etype = htons(ETHERTYPE_ARP); break; } - if (m->m_flags & M_BCAST) - bcopy(ifp->if_broadcastaddr, edst, ETHER_ADDR_LEN); - else - bcopy(ar_tha(ah), edst, ETHER_ADDR_LEN); - + if (req->flags & IFENCAP_FLAG_BROADCAST) + lladdr = ifp->if_broadcastaddr; + break; + default: + return (EAFNOSUPPORT); } - break; + + memcpy(&eh->ether_type, &etype, sizeof(eh->ether_type)); + memcpy(eh->ether_dhost, lladdr, ETHER_ADDR_LEN); + memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); + req->bufsize = sizeof(struct ether_header); + + return (0); +} + + +static inline int +ether_resolve_addr(struct ifnet *ifp, struct mbuf *m, + const struct sockaddr *dst, struct route *ro, u_char *phdr, + uint32_t *pflags) +{ + struct ether_header *eh; + struct rtentry *rt; + uint32_t lleflags = 0; + uint16_t etype; + int error = 0; + + eh = (struct ether_header *)phdr; + + switch (dst->sa_family) { +#ifdef INET + case AF_INET: + if 
((m->m_flags & (M_BCAST | M_MCAST)) == 0) + error = arpresolve(ifp, 0, m, dst, phdr, &lleflags); + else { + if (m->m_flags & M_BCAST) + memcpy(eh->ether_dhost, ifp->if_broadcastaddr, + ETHER_ADDR_LEN); + else { + const struct in_addr *a; + a = &(((const struct sockaddr_in *)dst)->sin_addr); + ETHER_MAP_IP_MULTICAST(a, eh->ether_dhost); + } + etype = htons(ETHERTYPE_IP); + memcpy(&eh->ether_type, &etype, sizeof(etype)); + memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); + } + break; #endif #ifdef INET6 case AF_INET6: - if (lle != NULL && (pflags & LLE_VALID)) - memcpy(edst, &lle->ll_addr.mac16, sizeof(edst)); - else - error = nd6_resolve(ifp, is_gw, m, dst, (u_char *)edst, - &pflags); - if (error) - return (error == EWOULDBLOCK ? 0 : error); - type = htons(ETHERTYPE_IPV6); + if ((m->m_flags & M_MCAST) == 0) + error = nd6_resolve(ifp, 0, m, dst, phdr, &lleflags); + else { + const struct in6_addr *a6; + a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr); + ETHER_MAP_IPV6_MULTICAST(a6, eh->ether_dhost); + etype = htons(ETHERTYPE_IPV6); + memcpy(&eh->ether_type, &etype, sizeof(etype)); + memcpy(eh->ether_shost, IF_LLADDR(ifp), ETHER_ADDR_LEN); + } break; #endif - case pseudo_AF_HDRCMPLT: - { - const struct ether_header *eh; - - hdrcmplt = 1; - /* FALLTHROUGH */ - - case AF_UNSPEC: - loop_copy = 0; /* if this is for us, don't do it */ - eh = (const struct ether_header *)dst->sa_data; - (void)memcpy(edst, eh->ether_dhost, sizeof (edst)); - type = eh->ether_type; - break; - } default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); - senderr(EAFNOSUPPORT); + if (m != NULL) + m_freem(m); + return (EAFNOSUPPORT); + } + + if (error == EHOSTDOWN) { + rt = (ro != NULL) ? 
ro->ro_rt : NULL; + if (rt != NULL && (rt->rt_flags & RTF_GATEWAY) != 0) + error = EHOSTUNREACH; } - if ((pflags & LLE_IFADDR) != 0) { + if (error != 0) + return (error); + + *pflags = ((!!(lleflags & LLE_IFADDR)) << RT_L2_ME_BIT) | RT_MAY_LOOP; + + return (0); +} + +/* + * Ethernet output routine. + * Encapsulate a packet of type family for the local net. + * Use trailer local net encapsulation if enough data in first + * packet leaves a multiple of 512 bytes of data in remainder. + */ +int +ether_output(struct ifnet *ifp, struct mbuf *m, + const struct sockaddr *dst, struct route *ro) +{ + int error = 0; + char linkhdr[ETHER_HDR_LEN], *phdr; + struct ether_header *eh; + struct pf_mtag *t; + int loop_copy = 1; + int hlen; /* link layer header length */ + uint32_t pflags; + + phdr = NULL; + pflags = 0; + if (ro != NULL) { + phdr = ro->ro_prepend; + hlen = ro->ro_plen; + pflags = ro->ro_flags; + } +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error) + senderr(error); +#endif + + M_PROFILE(m); + if (ifp->if_flags & IFF_MONITOR) + senderr(ENETDOWN); + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) + senderr(ENETDOWN); + + if (phdr == NULL) { + /* No prepend data supplied. Try to calculate ourselves. */ + phdr = linkhdr; + hlen = ETHER_HDR_LEN; + error = ether_resolve_addr(ifp, m, dst, ro, phdr, &pflags); + if (error != 0) + return (error == EWOULDBLOCK ? 0 : error); + } + + if ((pflags & RT_L2_ME) != 0) { update_mbuf_csumflags(m, m); return (if_simloop(ifp, m, dst->sa_family, 0)); } + loop_copy = pflags & RT_MAY_LOOP; /* * Add local net header. If no space in first mbuf, * allocate another. 
*/ - M_PREPEND(m, ETHER_HDR_LEN, M_NOWAIT); + M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) senderr(ENOBUFS); - eh = mtod(m, struct ether_header *); - if (hdrcmplt == 0) { - memcpy(&eh->ether_type, &type, sizeof(eh->ether_type)); - memcpy(eh->ether_dhost, edst, sizeof (edst)); - memcpy(eh->ether_shost, IF_LLADDR(ifp),sizeof(eh->ether_shost)); + if ((pflags & RT_HAS_HEADER) == 0) { + eh = mtod(m, struct ether_header *); + memcpy(eh, phdr, hlen); } /* @@ -279,34 +337,27 @@ * on the wire). However, we don't do that here for security * reasons and compatibility with the original behavior. */ - if ((ifp->if_flags & IFF_SIMPLEX) && loop_copy && + if ((m->m_flags & M_BCAST) && loop_copy && (ifp->if_flags & IFF_SIMPLEX) && ((t = pf_find_mtag(m)) == NULL || !t->routed)) { - if (m->m_flags & M_BCAST) { - struct mbuf *n; + struct mbuf *n; - /* - * Because if_simloop() modifies the packet, we need a - * writable copy through m_dup() instead of a readonly - * one as m_copy[m] would give us. The alternative would - * be to modify if_simloop() to handle the readonly mbuf, - * but performancewise it is mostly equivalent (trading - * extra data copying vs. extra locking). - * - * XXX This is a local workaround. A number of less - * often used kernel parts suffer from the same bug. - * See PR kern/105943 for a proposed general solution. - */ - if ((n = m_dup(m, M_NOWAIT)) != NULL) { - update_mbuf_csumflags(m, n); - (void)if_simloop(ifp, n, dst->sa_family, hlen); - } else - if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); - } else if (bcmp(eh->ether_dhost, eh->ether_shost, - ETHER_ADDR_LEN) == 0) { - update_mbuf_csumflags(m, m); - (void) if_simloop(ifp, m, dst->sa_family, hlen); - return (0); /* XXX */ - } + /* + * Because if_simloop() modifies the packet, we need a + * writable copy through m_dup() instead of a readonly + * one as m_copy[m] would give us. 
The alternative would + * be to modify if_simloop() to handle the readonly mbuf, + * but performancewise it is mostly equivalent (trading + * extra data copying vs. extra locking). + * + * XXX This is a local workaround. A number of less + * often used kernel parts suffer from the same bug. + * See PR kern/105943 for a proposed general solution. + */ + if ((n = m_dup(m, M_NOWAIT)) != NULL) { + update_mbuf_csumflags(m, n); + (void)if_simloop(ifp, n, dst->sa_family, hlen); + } else + if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1); } /* @@ -798,6 +849,7 @@ ifp->if_output = ether_output; ifp->if_input = ether_input; ifp->if_resolvemulti = ether_resolvemulti; + ifp->if_requestencap = ether_requestencap; #ifdef VIMAGE ifp->if_reassign = ether_reassign; #endif Index: sys/net/if_llatbl.h =================================================================== --- sys/net/if_llatbl.h +++ sys/net/if_llatbl.h @@ -48,6 +48,7 @@ #define LLTABLE_WUNLOCK() rw_wunlock(&lltable_rwlock) #define LLTABLE_LOCK_ASSERT() rw_assert(&lltable_rwlock, RA_LOCKED) +#define LLE_MAX_LINKHDR 24 /* Full IB header */ /* * Code referencing llentry must at least hold * a shared lock @@ -58,12 +59,9 @@ struct in_addr addr4; struct in6_addr addr6; } r_l3addr; - union { - uint64_t mac_aligned; - uint16_t mac16[3]; - uint8_t mac8[20]; /* IB needs 20 bytes. 
*/ - } ll_addr; - uint32_t spare0; + char r_linkdata[LLE_MAX_LINKHDR]; /* L2 data */ + uint8_t r_hdrlen; /* length for LL header */ + uint8_t spare0[3]; uint64_t spare1; struct lltable *lle_tbl; @@ -79,6 +77,7 @@ uint16_t ln_router; time_t ln_ntick; int lle_refcnt; + char *ll_addr; /* link-layer address */ LIST_ENTRY(llentry) lle_chain; /* chain of deleted items */ struct callout lle_timer; @@ -187,6 +186,8 @@ #define LLE_LINKED 0x0040 /* linked to lookup structure */ /* LLE request flags */ #define LLE_EXCLUSIVE 0x2000 /* return lle xlocked */ +#define LLE_ADDRONLY 0x4000 /* return lladdr instead of full header */ +#define LLE_CREATE 0x8000 /* hint to avoid lle lookup */ #define LLATBL_HASH(key, mask) \ (((((((key >> 8) ^ key) >> 8) ^ key) >> 8) ^ key) & mask) @@ -208,8 +209,11 @@ /* helper functions */ size_t lltable_drop_entry_queue(struct llentry *); void lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle, - const char *lladdr); + const char *linkhdr, size_t linkhdrsize, int lladdr_off); +int lltable_calc_llheader(struct ifnet *ifp, int family, char *lladdr, + char *buf, size_t *bufsize, int *lladdr_off); +int lltable_update_ifaddr(struct lltable *llt); struct llentry *lltable_alloc_entry(struct lltable *llt, u_int flags, const struct sockaddr *l4addr); void lltable_free_entry(struct lltable *llt, struct llentry *lle); Index: sys/net/if_llatbl.c =================================================================== --- sys/net/if_llatbl.c +++ sys/net/if_llatbl.c @@ -279,14 +279,98 @@ void lltable_set_entry_addr(struct ifnet *ifp, struct llentry *lle, - const char *lladdr) + const char *linkhdr, size_t linkhdrsize, int lladdr_off) { - bcopy(lladdr, &lle->ll_addr, ifp->if_addrlen); + memcpy(lle->r_linkdata, linkhdr, linkhdrsize); + lle->r_hdrlen = linkhdrsize; + lle->ll_addr = &lle->r_linkdata[lladdr_off]; lle->la_flags |= LLE_VALID; } /* + * Helper function used to pre-compute full/partial link-layer + * header data suitable for feeding into if_output(). 
+ */ +int +lltable_calc_llheader(struct ifnet *ifp, int family, char *lladdr, + char *buf, size_t *bufsize, int *lladdr_off) +{ + struct if_encap_req ereq; + int error; + + bzero(buf, *bufsize); + bzero(&ereq, sizeof(ereq)); + ereq.buf = buf; + ereq.bufsize = *bufsize; + ereq.rtype = IFENCAP_LL; + ereq.family = family; + ereq.lladdr = lladdr; + ereq.lladdr_len = ifp->if_addrlen; + error = ifp->if_requestencap(ifp, &ereq); + if (error == 0) { + *bufsize = ereq.bufsize; + *lladdr_off = ereq.lladdr_off; + } + + return (error); +} + +/* + * Update link-layer header for given @lle after + * interface lladdr was changed. + */ +static int +llentry_update_ifaddr(struct lltable *llt, struct llentry *lle, void *farg) +{ + struct ifnet *ifp; + u_char linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + u_char *lladdr; + int lladdr_off; + + ifp = (struct ifnet *)farg; + + lladdr = lle->ll_addr; + + LLE_WLOCK(lle); + if ((lle->la_flags & LLE_VALID) == 0) { + LLE_WUNLOCK(lle); + return (0); + } + + if ((lle->la_flags & LLE_IFADDR) != 0) + lladdr = IF_LLADDR(ifp); + + linkhdrsize = sizeof(linkhdr); + lltable_calc_llheader(ifp, llt->llt_af, lladdr, linkhdr, &linkhdrsize, + &lladdr_off); + memcpy(lle->r_linkdata, linkhdr, linkhdrsize); + LLE_WUNLOCK(lle); + + return (0); +} + +/* + * Update all calculated headers for given @llt + */ +int +lltable_update_ifaddr(struct lltable *llt) +{ + int error; + + if (llt->llt_ifp->if_flags & IFF_LOOPBACK) + return (0); + error = 0; + + IF_AFDATA_WLOCK(llt->llt_ifp); + lltable_foreach_lle(llt, llentry_update_ifaddr, llt->llt_ifp); + IF_AFDATA_WUNLOCK(llt->llt_ifp); + + return (error); +} + +/* * * Performes generic cleanup routines and frees lle. 
* @@ -601,6 +685,9 @@ struct ifnet *ifp; struct lltable *llt; struct llentry *lle, *lle_tmp; + uint8_t linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + int lladdr_off; u_int laflags = 0; int error; @@ -636,10 +723,14 @@ if (lle == NULL) return (ENOMEM); - bcopy(LLADDR(dl), &lle->ll_addr, ifp->if_addrlen); + linkhdrsize = sizeof(linkhdr); + if (lltable_calc_llheader(ifp, dst->sa_family, LLADDR(dl), + linkhdr, &linkhdrsize, &lladdr_off) != 0) + return (EINVAL); + lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, + lladdr_off); if ((rtm->rtm_flags & RTF_ANNOUNCE)) lle->la_flags |= LLE_PUB; - lle->la_flags |= LLE_VALID; lle->la_expire = rtm->rtm_rmx.rmx_expire; laflags = lle->la_flags; @@ -734,7 +825,7 @@ db_printf(" ln_router=%u\n", lle->ln_router); db_printf(" ln_ntick=%ju\n", (uintmax_t)lle->ln_ntick); db_printf(" lle_refcnt=%d\n", lle->lle_refcnt); - bcopy(&lle->ll_addr.mac16, octet, sizeof(octet)); + bcopy(lle->ll_addr, octet, sizeof(octet)); db_printf(" ll_addr=%02x:%02x:%02x:%02x:%02x:%02x\n", octet[0], octet[1], octet[2], octet[3], octet[4], octet[5]); db_printf(" lle_timer=%p\n", &lle->lle_timer); Index: sys/net/if_var.h =================================================================== --- sys/net/if_var.h +++ sys/net/if_var.h @@ -126,6 +126,8 @@ u_int tsomaxsegsize; /* TSO maximum segment size in bytes */ }; +struct if_encap_req; + /* * Structure defining a network interface. * @@ -227,6 +229,8 @@ void (*if_reassign) /* reassign to vnet routine */ (struct ifnet *, struct vnet *, char *); if_get_counter_t if_get_counter; /* get counter values */ + int (*if_requestencap) /* make link header from request */ + (struct ifnet *, struct if_encap_req *); /* Statistics. 
*/ counter_u64_t if_counters[IFCOUNTERS]; Index: sys/net/route.h =================================================================== --- sys/net/route.h +++ sys/net/route.h @@ -51,14 +51,21 @@ */ struct route { struct rtentry *ro_rt; - struct llentry *ro_lle; - struct in_ifaddr *ro_ia; - int ro_flags; + char *ro_prepend; + uint16_t ro_plen; + uint16_t ro_flags; struct sockaddr ro_dst; }; +#define RT_L2_ME_BIT 2 /* dst L2 addr is our address */ +#define RT_MAY_LOOP_BIT 3 /* dst may require loop copy */ +#define RT_HAS_HEADER_BIT 4 /* mbuf already has its header prepended */ + #define RT_CACHING_CONTEXT 0x1 /* XXX: not used anywhere */ #define RT_NORTREF 0x2 /* doesn't hold reference on ro_rt */ +#define RT_L2_ME (1 << RT_L2_ME_BIT) +#define RT_MAY_LOOP (1 << RT_MAY_LOOP_BIT) +#define RT_HAS_HEADER (1 << RT_HAS_HEADER_BIT) struct rt_metrics { u_long rmx_locks; /* Kernel must leave these values alone */ @@ -343,6 +350,27 @@ } \ } while (0) + +/* Encap request types */ +typedef enum { + IFENCAP_LL = 1 /* pre-calculate link-layer header */ +} ife_type; + +struct if_encap_req { + u_char *buf; /* Destination buffer */ + size_t bufsize; /* size of provided buffer */ + ife_type rtype; /* request type */ + uint32_t flags; /* Request flags */ + int family; /* Address family */ + int lladdr_off; /* offset from header start (w) */ + int lladdr_len; /* lladdr length */ + char *lladdr; /* link-level address pointer */ + char *hdata; /* Upper layer header data */ +}; + +#define IFENCAP_FLAG_BROADCAST 0x02 /* Destination is broadcast */ +int if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req); + struct radix_node_head *rt_tables_get_rnh(int, int); struct ifmultiaddr; Index: sys/net/route.c =================================================================== --- sys/net/route.c +++ sys/net/route.c @@ -1119,6 +1119,25 @@ } } +/* + * Blank function for default encapsulation requests.
+ */ +int +if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req) +{ + + if (req->rtype != IFENCAP_LL) + return (EOPNOTSUPP); + + if (req->bufsize < req->lladdr_len) + return (ENOMEM); + + /* Copy lladdr to storage as is */ + memmove(req->buf, req->lladdr, req->lladdr_len); + req->lladdr_off = 0; + + return (0); +} #if 0 int p_sockaddr(char *buf, int buflen, struct sockaddr *s); Index: sys/netinet/if_ether.h =================================================================== --- sys/netinet/if_ether.h +++ sys/netinet/if_ether.h @@ -114,6 +114,8 @@ struct ifaddr; +int arpresolve_addr(struct ifnet *ifp, int flags, + const struct sockaddr *dst, char *desten, uint32_t *pflags); int arpresolve(struct ifnet *ifp, int is_gw, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags); void arprequest(struct ifnet *, const struct in_addr *, Index: sys/netinet/if_ether.c =================================================================== --- sys/netinet/if_ether.c +++ sys/netinet/if_ether.c @@ -142,7 +142,9 @@ static void arp_check_update_lle(struct arphdr *ah, struct in_addr isaddr, struct ifnet *ifp, int bridged, struct llentry *la); static void arp_mark_lle_reachable(struct llentry *la); +static void arp_iflladdr(void *arg __unused, struct ifnet *ifp); +static eventhandler_tag iflladdr_tag; static const struct netisr_handler arp_nh = { .nh_name = "arp", @@ -218,6 +220,31 @@ CURVNET_RESTORE(); } +static int +arp_fillheader(struct ifnet *ifp, struct arphdr *ah, int bcast, u_char *buf, + size_t *bufsize) +{ + struct if_encap_req ereq; + int error; + + bzero(buf, *bufsize); + bzero(&ereq, sizeof(ereq)); + ereq.buf = buf; + ereq.bufsize = *bufsize; + ereq.rtype = IFENCAP_LL; + ereq.family = AF_ARP; + ereq.lladdr = ar_tha(ah); + ereq.hdata = (u_char *)ah; + if (bcast) + ereq.flags = IFENCAP_FLAG_BROADCAST; + error = ifp->if_requestencap(ifp, &ereq); + if (error == 0) + *bufsize = ereq.bufsize; + + return (error); +} + + /* * Broadcast an ARP 
request. Caller specifies: * - arp header source ip address @@ -232,6 +259,10 @@ struct arphdr *ah; struct sockaddr sa; u_char *carpaddr = NULL; + uint8_t linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + struct route ro; + int error; if (sip == NULL) { /* @@ -287,12 +318,27 @@ bcopy(tip, ar_tpa(ah), ah->ar_pln); sa.sa_family = AF_ARP; sa.sa_len = 2; + + /* Calculate link header for sending frame */ + bzero(&ro, sizeof(ro)); + linkhdrsize = sizeof(linkhdr); + error = arp_fillheader(ifp, ah, 1, linkhdr, &linkhdrsize); + if (error != 0) { + if_printf(ifp, "Failed to calculate ARP header: %d\n", error); + return; + } + + ro.ro_prepend = linkhdr; + ro.ro_plen = linkhdrsize; + ro.ro_flags = 0; + m->m_flags |= M_BCAST; m_clrprotoflags(m); /* Avoid confusing lower layers. */ - (*ifp->if_output)(ifp, m, &sa, NULL); + (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txrequests); } + /* * Resolve an IP address into an ethernet address - heavy version. * Used internally by arpresolve(). @@ -305,18 +351,20 @@ * Note that m_freem() handles NULL. 
*/ static int -arpresolve_full(struct ifnet *ifp, int is_gw, int create, struct mbuf *m, +arpresolve_full(struct ifnet *ifp, int is_gw, int flags, struct mbuf *m, const struct sockaddr *dst, u_char *desten, uint32_t *pflags) { struct llentry *la = NULL, *la_tmp; struct mbuf *curr = NULL; struct mbuf *next = NULL; int error, renew; + char *lladdr; + int ll_len; if (pflags != NULL) *pflags = 0; - if (create == 0) { + if ((flags & LLE_CREATE) == 0) { IF_AFDATA_RLOCK(ifp); la = lla_lookup(LLTABLE(ifp), LLE_EXCLUSIVE, dst); IF_AFDATA_RUNLOCK(ifp); @@ -350,7 +398,14 @@ if ((la->la_flags & LLE_VALID) && ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { - bcopy(&la->ll_addr, desten, ifp->if_addrlen); + if (flags & LLE_ADDRONLY) { + lladdr = la->ll_addr; + ll_len = ifp->if_addrlen; + } else { + lladdr = la->r_linkdata; + ll_len = la->r_hdrlen; + } + bcopy(lladdr, desten, ll_len); renew = 0; /* * If entry has an expiry time and it is approaching, @@ -364,7 +419,7 @@ } if (pflags != NULL) - *pflags = la->la_flags; + *pflags = la->la_flags & LLE_IFADDR; LLE_WUNLOCK(la); @@ -432,15 +487,30 @@ /* * Resolve an IP address into an ethernet address. + */ +int +arpresolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, + char *desten, uint32_t *pflags) +{ + int error; + + flags |= LLE_ADDRONLY; + error = arpresolve_full(ifp, 0, flags, NULL, dst, desten, pflags); + return (error); +} + +/* + * Lookups link header based on an IP address. * On input: * ifp is the interface we use * is_gw != 0 if @dst represents gateway to some destination * m is the mbuf. May be NULL if we don't have a packet. * dst is the next hop, - * desten is the storage to put LL address. + * desten is the storage to put LL header. * flags returns lle entry flags. * - * On success, desten and flags are filled in and the function returns 0; + * On success, full/partial link header and flags are filled in and + * the function returns 0. 
* If the packet must be held pending resolution, we return EWOULDBLOCK * On other errors, we return the corresponding error code. * Note that m_freem() handles NULL. @@ -474,11 +544,12 @@ IF_AFDATA_RUNLOCK(ifp); if (la == NULL) - return (arpresolve_full(ifp, is_gw, 1, m, dst, desten, pflags)); + return (arpresolve_full(ifp, is_gw, LLE_CREATE, m, dst, desten, + pflags)); if ((la->la_flags & LLE_VALID) && ((la->la_flags & LLE_STATIC) || la->la_expire > time_uptime)) { - bcopy(&la->ll_addr, desten, ifp->if_addrlen); + bcopy(la->r_linkdata, desten, la->r_hdrlen); renew = 0; /* * If entry has an expiry time and it is approaching, @@ -492,7 +563,7 @@ } if (pflags != NULL) - *pflags = la->la_flags; + *pflags = la->la_flags & LLE_IFADDR; LLE_RUNLOCK(la); @@ -503,7 +574,7 @@ } LLE_RUNLOCK(la); - return (arpresolve_full(ifp, is_gw, 0, m, dst, desten, pflags)); + return (arpresolve_full(ifp, is_gw, LLE_CREATE, m, dst, desten,pflags)); } /* @@ -647,6 +718,13 @@ int carped; struct sockaddr_in sin; struct sockaddr *dst; + uint8_t linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + int lladdr_off; + struct route ro; + int error; + + sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_addr.s_addr = 0; @@ -811,14 +889,19 @@ if (la != NULL) arp_check_update_lle(ah, isaddr, ifp, bridged, la); else if (itaddr.s_addr == myaddr.s_addr) { - /* - * Reply to our address, but no lle exists yet. - * do we really have to create an entry? - */ + /* Reply to our address, but no lle exists yet. 
*/ + /* Calculate full link prepend to use in lle */ + linkhdrsize = sizeof(linkhdr); + if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, + &linkhdrsize, &lladdr_off) != 0) + goto drop; + + /* Allocate new entry */ la = lltable_alloc_entry(LLTABLE(ifp), 0, dst); if (la == NULL) goto drop; - lltable_set_entry_addr(ifp, la, ar_sha(ah)); + lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize, + lladdr_off); IF_AFDATA_WLOCK(ifp); LLE_WLOCK(la); @@ -876,7 +959,7 @@ if ((lle != NULL) && (lle->la_flags & LLE_PUB)) { (void)memcpy(ar_tha(ah), ar_sha(ah), ah->ar_hln); - (void)memcpy(ar_sha(ah), &lle->ll_addr, ah->ar_hln); + (void)memcpy(ar_sha(ah), lle->ll_addr, ah->ar_hln); LLE_RUNLOCK(lle); } else { @@ -956,8 +1039,31 @@ m->m_pkthdr.rcvif = NULL; sa.sa_family = AF_ARP; sa.sa_len = 2; + + /* Calculate link header for sending frame */ + bzero(&ro, sizeof(ro)); + linkhdrsize = sizeof(linkhdr); + error = arp_fillheader(ifp, ah, 0, linkhdr, &linkhdrsize); + + /* + * arp_fillheader() may fail due to lack of support inside encap request + * routine. This is not necessarily an error: AF_ARP can/should be handled + * by if_output(). + */ + if (error != 0 && error != EAFNOSUPPORT) { + printf("Failed to calculate ARP header: %d\n", error); + goto drop; + } + + if (error == 0) { + ro.ro_prepend = linkhdr; + ro.ro_plen = linkhdrsize; + ro.ro_flags = 0; + + } + m_clrprotoflags(m); /* Avoid confusing lower layers.
*/ - (*ifp->if_output)(ifp, m, &sa, NULL); + (*ifp->if_output)(ifp, m, &sa, &ro); ARPSTAT_INC(txreplies); return; @@ -976,6 +1082,9 @@ { struct sockaddr sa; struct mbuf *m_hold, *m_hold_next; + uint8_t linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + int lladdr_off; LLE_WLOCK_ASSERT(la); @@ -992,7 +1101,7 @@ return; } if ((la->la_flags & LLE_VALID) && - bcmp(ar_sha(ah), &la->ll_addr, ifp->if_addrlen)) { + bcmp(ar_sha(ah), la->ll_addr, ifp->if_addrlen)) { if (la->la_flags & LLE_STATIC) { LLE_WUNLOCK(la); if (log_arp_permanent_modify) @@ -1015,8 +1124,14 @@ } } + /* Calculate full link prepend to use in lle */ + linkhdrsize = sizeof(linkhdr); + if (lltable_calc_llheader(ifp, AF_INET, ar_sha(ah), linkhdr, + &linkhdrsize, &lladdr_off) != 0) + return; + /* Check if something has changed */ - if (memcmp(&la->ll_addr, ar_sha(ah), ifp->if_addrlen) != 0 || + if (memcmp(la->r_linkdata, linkhdr, linkhdrsize) != 0 || (la->la_flags & LLE_VALID) == 0) { /* Perform real LLE update */ /* use afdata WLOCK to update fields */ @@ -1036,7 +1151,8 @@ } /* Update data */ - lltable_set_entry_addr(ifp, la, ar_sha(ah)); + lltable_set_entry_addr(ifp, la, linkhdr, linkhdrsize, + lladdr_off); IF_AFDATA_WUNLOCK(ifp); LLE_REMREF(la); @@ -1150,10 +1266,23 @@ ifa->ifa_rtrequest = NULL; } +/* + * A handler for interface link layer address change event. 
+ */ +static __noinline void +arp_iflladdr(void *arg __unused, struct ifnet *ifp) +{ + + lltable_update_ifaddr(LLTABLE(ifp)); +} + static void arp_init(void) { netisr_register(&arp_nh); + if (IS_DEFAULT_VNET(curvnet)) + iflladdr_tag = EVENTHANDLER_REGISTER(iflladdr_event, + arp_iflladdr, NULL, EVENTHANDLER_PRI_ANY); } SYSINIT(arp, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, arp_init, 0); Index: sys/netinet/in.c =================================================================== --- sys/netinet/in.c +++ sys/netinet/in.c @@ -1238,6 +1238,9 @@ const struct sockaddr_in *sin = (const struct sockaddr_in *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; + char linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + int lladdr_off; KASSERT(l3addr->sa_family == AF_INET, ("sin_family %d", l3addr->sa_family)); @@ -1258,7 +1261,12 @@ } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { - lltable_set_entry_addr(ifp, lle, IF_LLADDR(ifp)); + linkhdrsize = LLE_MAX_LINKHDR; + if (lltable_calc_llheader(ifp, AF_INET, IF_LLADDR(ifp), + linkhdr, &linkhdrsize, &lladdr_off) != 0) + return (NULL); + lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, + lladdr_off); lle->la_flags |= LLE_STATIC; } @@ -1337,7 +1345,7 @@ sdl->sdl_type = ifp->if_type; if ((lle->la_flags & LLE_VALID) == LLE_VALID) { sdl->sdl_alen = ifp->if_addrlen; - bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); + bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); } else { sdl->sdl_alen = 0; bzero(LLADDR(sdl), ifp->if_addrlen); Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -567,7 +567,7 @@ RO_RTFREE(ro); if (have_ia_ref) ifa_free(&ia->ia_ifa); - ro->ro_lle = NULL; + ro->ro_prepend = NULL; rte = NULL; gw = dst; ip = mtod(m, struct ip *); Index: sys/netinet/toecore.c =================================================================== --- sys/netinet/toecore.c +++ sys/netinet/toecore.c @@ -428,7 
+428,7 @@ KASSERT(lle->la_flags & LLE_VALID, ("%s: %p resolved but not valid?", __func__, lle)); - lladdr = (uint8_t *)&lle->ll_addr; + lladdr = (uint8_t *)lle->ll_addr; #ifdef VLAN_TAG VLAN_TAG(ifp, &vtag); #endif Index: sys/netinet6/icmp6.c =================================================================== --- sys/netinet6/icmp6.c +++ sys/netinet6/icmp6.c @@ -2641,7 +2641,7 @@ nd_opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; nd_opt->nd_opt_len = len >> 3; lladdr = (char *)(nd_opt + 1); - bcopy(&ln->ll_addr, lladdr, ifp->if_addrlen); + bcopy(ln->ll_addr, lladdr, ifp->if_addrlen); p += len; } } Index: sys/netinet6/in6.h =================================================================== --- sys/netinet6/in6.h +++ sys/netinet6/in6.h @@ -375,9 +375,9 @@ #if __BSD_VISIBLE struct route_in6 { struct rtentry *ro_rt; - struct llentry *ro_lle; - struct in6_addr *ro_ia6; - int ro_flags; + char *ro_prepend; + uint16_t ro_plen; + uint16_t ro_flags; struct sockaddr_in6 ro_dst; }; #endif Index: sys/netinet6/in6.c =================================================================== --- sys/netinet6/in6.c +++ sys/netinet6/in6.c @@ -2241,6 +2241,9 @@ const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr; struct ifnet *ifp = llt->llt_ifp; struct llentry *lle; + char linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + int lladdr_off; KASSERT(l3addr->sa_family == AF_INET6, ("sin_family %d", l3addr->sa_family)); @@ -2261,7 +2264,12 @@ } lle->la_flags = flags; if ((flags & LLE_IFADDR) == LLE_IFADDR) { - lltable_set_entry_addr(ifp, lle, IF_LLADDR(ifp)); + linkhdrsize = LLE_MAX_LINKHDR; + if (lltable_calc_llheader(ifp, AF_INET6, IF_LLADDR(ifp), + linkhdr, &linkhdrsize, &lladdr_off) != 0) + return (NULL); + lltable_set_entry_addr(ifp, lle, linkhdr, linkhdrsize, + lladdr_off); lle->la_flags |= LLE_STATIC; } @@ -2348,7 +2356,7 @@ sdl->sdl_alen = ifp->if_addrlen; sdl->sdl_index = ifp->if_index; sdl->sdl_type = ifp->if_type; - bcopy(&lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); + 
bcopy(lle->ll_addr, LLADDR(sdl), ifp->if_addrlen); ndpc.rtm.rtm_rmx.rmx_expire = lle->la_flags & LLE_STATIC ? 0 : lle->la_expire; ndpc.rtm.rtm_flags |= (RTF_HOST | RTF_LLDATA); Index: sys/netinet6/nd6.h =================================================================== --- sys/netinet6/nd6.h +++ sys/netinet6/nd6.h @@ -410,6 +410,8 @@ void nd6_llinfo_setstate(struct llentry *lle, int newstate); void nd6_timer(void *); void nd6_purge(struct ifnet *); +int nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, + char *desten, uint32_t *pflags); int nd6_resolve(struct ifnet *, int, struct mbuf *, const struct sockaddr *, u_char *, uint32_t *); int nd6_ioctl(u_long, caddr_t, struct ifnet *); Index: sys/netinet6/nd6.c =================================================================== --- sys/netinet6/nd6.c +++ sys/netinet6/nd6.c @@ -111,7 +111,7 @@ VNET_DEFINE(int, nd6_debug) = 0; #endif -static eventhandler_tag lle_event_eh; +static eventhandler_tag lle_event_eh, iflladdr_event_eh; /* for debugging? */ #if 0 @@ -137,7 +137,7 @@ static void nd6_llinfo_settimer_locked(struct llentry *, long); static void clear_llinfo_pqueue(struct llentry *); static void nd6_rtrequest(int, struct rtentry *, struct rt_addrinfo *); -static int nd6_resolve_slow(struct ifnet *, struct mbuf *, +static int nd6_resolve_slow(struct ifnet *, int, struct mbuf *, const struct sockaddr_in6 *, u_char *, uint32_t *); static int nd6_need_cache(struct ifnet *); @@ -188,7 +188,7 @@ gw.sdl_index = ifp->if_index; gw.sdl_type = ifp->if_type; if (evt == LLENTRY_RESOLVED) - bcopy(&lle->ll_addr, gw.sdl_data, ifp->if_addrlen); + bcopy(lle->ll_addr, gw.sdl_data, ifp->if_addrlen); rtinfo.rti_info[RTAX_DST] = (struct sockaddr *)&dst; rtinfo.rti_info[RTAX_GATEWAY] = (struct sockaddr *)&gw; rtinfo.rti_addrs = RTA_DST | RTA_GATEWAY; @@ -196,6 +196,16 @@ type == RTM_ADD ? RTF_UP: 0), 0, RT_DEFAULT_FIB); } +/* + * A handler for interface link layer address change event. 
+ */ +static __noinline void +nd6_iflladdr(void *arg __unused, struct ifnet *ifp) +{ + + lltable_update_ifaddr(LLTABLE6(ifp)); +} + void nd6_init(void) { @@ -211,9 +221,12 @@ nd6_slowtimo, curvnet); nd6_dad_init(); - if (IS_DEFAULT_VNET(curvnet)) + if (IS_DEFAULT_VNET(curvnet)) { lle_event_eh = EVENTHANDLER_REGISTER(lle_event, nd6_lle_event, NULL, EVENTHANDLER_PRI_ANY); + iflladdr_event_eh = EVENTHANDLER_REGISTER(iflladdr_event, + nd6_iflladdr, NULL, EVENTHANDLER_PRI_ANY); + } } #ifdef VIMAGE @@ -223,8 +236,10 @@ callout_drain(&V_nd6_slowtimo_ch); callout_drain(&V_nd6_timer_ch); - if (IS_DEFAULT_VNET(curvnet)) + if (IS_DEFAULT_VNET(curvnet)) { EVENTHANDLER_DEREGISTER(lle_event, lle_event_eh); + EVENTHANDLER_DEREGISTER(iflladdr_event, iflladdr_event_eh); + } } #endif @@ -1704,6 +1719,9 @@ uint16_t router = 0; struct sockaddr_in6 sin6; struct mbuf *chain = NULL; + u_char linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + int lladdr_off; IF_AFDATA_UNLOCK_ASSERT(ifp); @@ -1738,8 +1756,15 @@ * Since we already know all the data for the new entry, * fill it before insertion. */ - if (lladdr != NULL) - lltable_set_entry_addr(ifp, ln, lladdr); + if (lladdr != NULL) { + linkhdrsize = sizeof(linkhdr); + if (lltable_calc_llheader(ifp, AF_INET6, lladdr, + linkhdr, &linkhdrsize, &lladdr_off) != 0) + return; + lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, + lladdr_off); + } + IF_AFDATA_WLOCK(ifp); LLE_WLOCK(ln); /* Prefer any existing lle over newly-created one */ @@ -1771,7 +1796,7 @@ olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0; if (olladdr && lladdr) { - llchange = bcmp(lladdr, &ln->ll_addr, + llchange = bcmp(lladdr, ln->ll_addr, ifp->if_addrlen); } else if (!olladdr && lladdr) llchange = 1; @@ -1797,7 +1822,13 @@ * Record source link-layer address * XXX is it dependent to ifp->if_type? 
*/ - lltable_set_entry_addr(ifp, ln, lladdr); + linkhdrsize = sizeof(linkhdr); + if (lltable_calc_llheader(ifp, AF_INET6, lladdr, + linkhdr, &linkhdrsize, &lladdr_off) != 0) + return; + lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, + lladdr_off); + nd6_llinfo_setstate(ln, ND6_LLINFO_STALE); EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); @@ -1949,8 +1980,8 @@ } /* - * Do L2 address resolution for @sa_dst address. Stores found - * address in @desten buffer. Copy of lle ln_flags can be also + * Lookup link header for @sa_dst address. Stores found + * data in @desten buffer. Copy of lle ln_flags can be also * saved in @pflags if @pflags is non-NULL. * * If destination LLE does not exists or lle state modification @@ -2013,13 +2044,13 @@ /* Fall back to slow processing path */ if (ln != NULL) LLE_RUNLOCK(ln); - return (nd6_resolve_slow(ifp, m, dst6, desten, pflags)); + return (nd6_resolve_slow(ifp, 0, m, dst6, desten, pflags)); } - bcopy(&ln->ll_addr, desten, ifp->if_addrlen); + bcopy(ln->r_linkdata, desten, ln->r_hdrlen); if (pflags != NULL) - *pflags = ln->la_flags; + *pflags = ln->la_flags & LLE_IFADDR; LLE_RUNLOCK(ln); return (0); } @@ -2037,12 +2068,13 @@ * Set noinline to be dtrace-friendly */ static __noinline int -nd6_resolve_slow(struct ifnet *ifp, struct mbuf *m, +nd6_resolve_slow(struct ifnet *ifp, int flags, struct mbuf *m, const struct sockaddr_in6 *dst, u_char *desten, uint32_t *pflags) { struct llentry *lle = NULL, *lle_tmp; struct in6_addr *psrc, src; - int send_ns; + int send_ns, ll_len; + char *lladdr; /* * Address resolution or Neighbor Unreachability Detection @@ -2114,7 +2146,14 @@ * send the packet. 
*/ if (lle->ln_state > ND6_LLINFO_INCOMPLETE) { - bcopy(&lle->ll_addr, desten, ifp->if_addrlen); + if (flags & LLE_ADDRONLY) { + lladdr = lle->ll_addr; + ll_len = ifp->if_addrlen; + } else { + lladdr = lle->r_linkdata; + ll_len = lle->r_hdrlen; + } + bcopy(lladdr, desten, ll_len); if (pflags != NULL) *pflags = lle->la_flags; LLE_WUNLOCK(lle); @@ -2174,6 +2213,27 @@ return (EWOULDBLOCK); } +/* + * Do L2 address resolution for @dst address. Stores found + * address in @desten buffer. Copy of lle la_flags can also be + * saved in @pflags if @pflags is non-NULL. + * + * Return values: + * - 0 on success (address copied to buffer). + * - EWOULDBLOCK (no local error, but address is still unresolved) + * - other errors (alloc failure, etc) + */ +int +nd6_resolve_addr(struct ifnet *ifp, int flags, const struct sockaddr *dst, + char *desten, uint32_t *pflags) +{ + int error; + + flags |= LLE_ADDRONLY; + error = nd6_resolve_slow(ifp, flags, NULL, + (const struct sockaddr_in6 *)dst, desten, pflags); + return (error); +} int nd6_flush_holdchain(struct ifnet *ifp, struct ifnet *origifp, struct mbuf *chain, Index: sys/netinet6/nd6_nbr.c =================================================================== --- sys/netinet6/nd6_nbr.c +++ sys/netinet6/nd6_nbr.c @@ -643,6 +643,9 @@ union nd_opts ndopts; struct mbuf *chain = NULL; struct sockaddr_in6 sin6; + u_char linkhdr[LLE_MAX_LINKHDR]; + size_t linkhdrsize; + int lladdr_off; char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN]; if (ip6->ip6_hlim != 255) { @@ -765,7 +768,13 @@ /* * Record link-layer address, and update the state. 
*/ - lltable_set_entry_addr(ifp, ln, lladdr); + linkhdrsize = sizeof(linkhdr); + if (lltable_calc_llheader(ifp, AF_INET6, lladdr, + linkhdr, &linkhdrsize, &lladdr_off) != 0) + return; + lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, + lladdr_off); + EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); if (is_solicited) nd6_llinfo_setstate(ln, ND6_LLINFO_REACHABLE); @@ -789,7 +798,7 @@ llchange = 0; else { if (ln->la_flags & LLE_VALID) { - if (bcmp(lladdr, &ln->ll_addr, ifp->if_addrlen)) + if (bcmp(lladdr, ln->ll_addr, ifp->if_addrlen)) llchange = 1; else llchange = 0; @@ -831,7 +840,13 @@ * Update link-local address, if any. */ if (lladdr != NULL) { - lltable_set_entry_addr(ifp, ln, lladdr); + linkhdrsize = sizeof(linkhdr); + if (lltable_calc_llheader(ifp, AF_INET6, lladdr, + linkhdr, &linkhdrsize, &lladdr_off) != 0) + return; + lltable_set_entry_addr(ifp, ln, linkhdr, linkhdrsize, + lladdr_off); + EVENTHANDLER_INVOKE(lle_event, ln, LLENTRY_RESOLVED); } Index: sys/ofed/drivers/infiniband/core/addr.c =================================================================== --- sys/ofed/drivers/infiniband/core/addr.c +++ sys/ofed/drivers/infiniband/core/addr.c @@ -281,8 +281,6 @@ RTFREE_LOCKED(rte); return -EHOSTUNREACH; } - if (rte->rt_flags & RTF_GATEWAY) - is_gw = 1; /* * If it's not multicast or broadcast and the route doesn't match the * requested interface return unreachable. Otherwise fetch the @@ -325,20 +323,18 @@ * Resolve the link local address. */ switch (dst_in->sa_family) { -#ifdef INET case AF_INET: - error = arpresolve(ifp, is_gw, NULL, dst_in, edst, NULL); + error = arpresolve_addr(ifp, 0, dst_in, edst, NULL); break; -#endif -#ifdef INET6 case AF_INET6: - error = nd6_resolve(ifp, is_gw, NULL, dst_in, edst, NULL); + error = nd6_resolve_addr(ifp, 0, dst_in, edst, NULL); break; -#endif default: /* XXX: Shouldn't happen. 
*/ error = -EINVAL; } + if (error == EHOSTDOWN && (rte->rt_flags & RTF_GATEWAY)) + error = EHOSTUNREACH; RTFREE(rte); if (error == 0) { memcpy(src_in, ifa->ifa_addr, ip_addr_size(ifa->ifa_addr)); Index: sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c =================================================================== --- sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -36,6 +36,7 @@ static int ipoib_resolvemulti(struct ifnet *, struct sockaddr **, struct sockaddr *); +static int ipoib_requestencap(struct ifnet *, struct if_encap_req *); #include @@ -876,6 +877,7 @@ dev->if_output = ipoib_output; dev->if_input = ipoib_input; dev->if_resolvemulti = ipoib_resolvemulti; + dev->if_requestencap = ipoib_requestencap; dev->if_baudrate = IF_Gbps(10); dev->if_broadcastaddr = priv->broadcastaddr; dev->if_snd.ifq_maxlen = ipoib_sendq_size * 2; @@ -1249,61 +1251,33 @@ destroy_workqueue(ipoib_workqueue); } -/* - * Infiniband output routine. 
- */ static int -ipoib_output(struct ifnet *ifp, struct mbuf *m, - const struct sockaddr *dst, struct route *ro) +ipoib_requestencap(struct ifnet *ifp, struct if_encap_req *req) { - u_char edst[INFINIBAND_ALEN]; - struct llentry *lle = NULL; - struct rtentry *rt0 = NULL; - struct ipoib_header *eh; - int error = 0, is_gw = 0; + struct ipoib_header *ih; + struct arphdr *ah; short type; + const char *lladdr; - if (ro != NULL) { - if (!(m->m_flags & (M_BCAST | M_MCAST))) - lle = ro->ro_lle; - rt0 = ro->ro_rt; - if (rt0 != NULL && (rt0->rt_flags & RTF_GATEWAY) != 0) - is_gw = 1; - } -#ifdef MAC - error = mac_ifnet_check_transmit(ifp, m); - if (error) - goto bad; -#endif + if (req->rtype != IFENCAP_LL) + return (EOPNOTSUPP); - M_PROFILE(m); - if (ifp->if_flags & IFF_MONITOR) { - error = ENETDOWN; - goto bad; - } - if (!((ifp->if_flags & IFF_UP) && - (ifp->if_drv_flags & IFF_DRV_RUNNING))) { - error = ENETDOWN; - goto bad; - } + if (req->bufsize < sizeof(struct ipoib_header)) + return (ENOMEM); - switch (dst->sa_family) { -#ifdef INET + ih = (struct ipoib_header *)req->buf; + lladdr = req->lladdr; + req->lladdr_off = offsetof(struct ipoib_header, hwaddr); + + switch (req->family) { case AF_INET: - if (lle != NULL && (lle->la_flags & LLE_VALID)) - memcpy(edst, &lle->ll_addr.mac8, sizeof(edst)); - else if (m->m_flags & M_MCAST) - ip_ib_mc_map(((struct sockaddr_in *)dst)->sin_addr.s_addr, ifp->if_broadcastaddr, edst); - else - error = arpresolve(ifp, is_gw, m, dst, edst, NULL); - if (error) - return (error == EWOULDBLOCK ? 
0 : error); type = htons(ETHERTYPE_IP); break; + case AF_INET6: + type = htons(ETHERTYPE_IPV6); + break; case AF_ARP: - { - struct arphdr *ah; - ah = mtod(m, struct arphdr *); + ah = (struct arphdr *)req->hdata; ah->ar_hrd = htons(ARPHRD_INFINIBAND); switch(ntohs(ah->ar_op)) { @@ -1318,46 +1292,147 @@ break; } - if (m->m_flags & M_BCAST) - bcopy(ifp->if_broadcastaddr, edst, INFINIBAND_ALEN); - else - bcopy(ar_tha(ah), edst, INFINIBAND_ALEN); - + if (req->flags & IFENCAP_FLAG_BROADCAST) + lladdr = ifp->if_broadcastaddr; + break; + default: + return (EAFNOSUPPORT); } - break; + + memcpy(&ih->proto , &type, sizeof(ih->proto)); + memcpy(ih->hwaddr, lladdr, INFINIBAND_ALEN); + req->bufsize = sizeof(struct ipoib_header); + + return (0); +} + +static inline int +ipoib_resolve_addr(struct ifnet *ifp, struct mbuf *m, + const struct sockaddr *dst, struct route *ro, char *phdr, + uint32_t *pflags) +{ + struct ipoib_header *ih; + uint32_t lleflags = 0; + struct rtentry *rt; + short type; + int error = 0; + + ih = (struct ipoib_header *)phdr; + + switch (dst->sa_family) { +#ifdef INET + case AF_INET: + if ((m->m_flags & (M_BCAST | M_MCAST)) == 0) + error = arpresolve(ifp, 0, m, dst, phdr, &lleflags); + else { + if (m->m_flags & M_BCAST) + memcpy(&ih->hwaddr, ifp->if_broadcastaddr, + INFINIBAND_ALEN); + else { + const struct in_addr *a; + a = &(((const struct sockaddr_in *)dst)->sin_addr); + ip_ib_mc_map(a->s_addr, ifp->if_broadcastaddr, + (char *)&ih->hwaddr); + } + type = htons(ETHERTYPE_IP); + memcpy(&ih->proto, &type, sizeof(ih->proto)); + } + break; #endif #ifdef INET6 case AF_INET6: - if (lle != NULL && (lle->la_flags & LLE_VALID)) - memcpy(edst, &lle->ll_addr.mac8, sizeof(edst)); - else if (m->m_flags & M_MCAST) - ipv6_ib_mc_map(&((struct sockaddr_in6 *)dst)->sin6_addr, ifp->if_broadcastaddr, edst); - else - error = nd6_resolve(ifp, is_gw, m, dst, edst, NULL); - if (error) - return error; - type = htons(ETHERTYPE_IPV6); + if ((m->m_flags & M_MCAST) == 0) + error = 
nd6_resolve(ifp, 0, m, dst, phdr, &lleflags); + else { + const struct in6_addr *a6; + a6 = &(((const struct sockaddr_in6 *)dst)->sin6_addr); + ipv6_ib_mc_map(a6, ifp->if_broadcastaddr, + (char *)&ih->hwaddr); + type = htons(ETHERTYPE_IPV6); + memcpy(&ih->proto, &type, sizeof(ih->proto)); + } break; #endif - default: if_printf(ifp, "can't handle af%d\n", dst->sa_family); - error = EAFNOSUPPORT; + if (m != NULL) + m_freem(m); + return (EAFNOSUPPORT); + } + + if (error == EHOSTDOWN) { + rt = (ro != NULL) ? ro->ro_rt : NULL; + if (rt != NULL && (rt->rt_flags & RTF_GATEWAY) != 0) + error = EHOSTUNREACH; + } + + if (error != 0) + return (error); + + *pflags = ((!!(lleflags & LLE_IFADDR)) << RT_L2_ME_BIT) | RT_MAY_LOOP; + + return (0); +} + +/* + * Infiniband output routine. + */ +static int +ipoib_output(struct ifnet *ifp, struct mbuf *m, + const struct sockaddr *dst, struct route *ro) +{ + char linkhdr[IPOIB_HEADER_LEN], *phdr; + struct ipoib_header *ih; + int hlen; /* link layer header length */ + int error = 0; + uint32_t pflags; + + phdr = NULL; + pflags = 0; + if (ro != NULL) { + phdr = ro->ro_prepend; + hlen = ro->ro_plen; + pflags = ro->ro_flags; + } +#ifdef MAC + error = mac_ifnet_check_transmit(ifp, m); + if (error) + goto bad; +#endif + + M_PROFILE(m); + if (ifp->if_flags & IFF_MONITOR) { + error = ENETDOWN; + goto bad; + } + if (!((ifp->if_flags & IFF_UP) && + (ifp->if_drv_flags & IFF_DRV_RUNNING))) { + error = ENETDOWN; goto bad; } + if (phdr == NULL) { + /* No prepend data supplied. Try to calculate ourselves. */ + phdr = linkhdr; + hlen = IPOIB_HEADER_LEN; + error = ipoib_resolve_addr(ifp, m, dst, ro, phdr, &pflags); + if (error != 0) + return (error == EWOULDBLOCK ? 0 : error); + } + /* * Add local net header. If no space in first mbuf, * allocate another. 
*/ - M_PREPEND(m, IPOIB_HEADER_LEN, M_NOWAIT); + M_PREPEND(m, hlen, M_NOWAIT); if (m == NULL) { error = ENOBUFS; goto bad; } - eh = mtod(m, struct ipoib_header *); - (void)memcpy(&eh->proto, &type, sizeof(eh->proto)); - (void)memcpy(&eh->hwaddr, edst, sizeof (edst)); + if ((pflags & RT_HAS_HEADER) == 0) { + ih = mtod(m, struct ipoib_header *); + memcpy(ih, phdr, hlen); + } /* * Queue message on interface, update output statistics if